Remove unnecessary <s> and </s> tokens. And add a test.

TensorFlow Datasets Team · copybara-github · commit 242938110e9c · 2019-04-30T22:04:14.000-07:00
PiperOrigin-RevId: 246080224
diff --git a/tensorflow_datasets/text/cnn_dailymail.py b/tensorflow_datasets/text/cnn_dailymail.py
@@ -145,8 +145,6 @@ def _subset_filenames(dl_paths, split):
 
 DM_SINGLE_CLOSE_QUOTE = u'\u2019'  # unicode
 DM_DOUBLE_CLOSE_QUOTE = u'\u201d'
-SENTENCE_START = '<s>'
-SENTENCE_END = '</s>'
 # acceptable ways to end a sentence
 END_TOKENS = ['.', '!', '?', '...', "'", '`', '"',
               DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ')']
@@ -201,31 +199,31 @@ def fix_missing_period(line):
 
   # Make abstract into a single string, putting <s> and </s> tags around
   # the sentences.
-  abstract = ' '.join(['%s %s %s' % (SENTENCE_START, sent,
-                                     SENTENCE_END) for sent in highlights])
+  abstract = ' '.join(highlights)
 
   return article, abstract
 
 
 class CnnDailymail(tfds.core.GeneratorBasedBuilder):
   """CNN/DailyMail non-anonymized summarization dataset."""
+  # 0.0.2 is like 0.0.1 but without special tokens <s> and </s>.
   BUILDER_CONFIGS = [
       CnnDailymailConfig(
           name='plain_text',
-          version='0.0.1',
+          version='0.0.2',
           description='Plain text',
       ),
       CnnDailymailConfig(
           name='bytes',
-          version='0.0.1',
+          version='0.0.2',
           description=('Uses byte-level text encoding with '
                        '`tfds.features.text.ByteTextEncoder`'),
           text_encoder_config=tfds.features.text.TextEncoderConfig(
               encoder=tfds.features.text.ByteTextEncoder()),
       ),
       CnnDailymailConfig(
           name='subwords32k',
-          version='0.0.1',
+          version='0.0.2',
           description=('Uses `tfds.features.text.SubwordTextEncoder` with '
                        '32k vocab size'),
           text_encoder_config=tfds.features.text.TextEncoderConfig(
@@ -260,8 +258,7 @@ def _split_generators(self, dl_manager):
     # Generate shared vocabulary
     # maybe_build_from_corpus uses SubwordTextEncoder if that's configured
     self.info.features[_ARTICLE].maybe_build_from_corpus(
-        self._vocab_text_gen(train_files),
-        reserved_tokens=[SENTENCE_START, SENTENCE_END])
+        self._vocab_text_gen(train_files))
     encoder = self.info.features[_ARTICLE].encoder
     # Use maybe_set_encoder because the encoder may have been restored from
     # package data.
diff --git a/tensorflow_datasets/text/cnn_dailymail_test.py b/tensorflow_datasets/text/cnn_dailymail_test.py
@@ -18,10 +18,29 @@
 from __future__ import division
 from __future__ import print_function
 
+import tempfile
+
 import tensorflow_datasets.testing as tfds_test
 from tensorflow_datasets.text import cnn_dailymail
 
 
+_STORY_FILE = b"""Some article.
+This is some article text.
+
+@highlight
+
+highlight text
+
+@highlight
+
+Highlight two
+
+@highlight
+
+highlight Three
+"""
+
+
 class CnnDailymailTest(tfds_test.DatasetBuilderTestCase):
   DATASET_CLASS = cnn_dailymail.CnnDailymail
   SPLITS = {
@@ -35,6 +54,19 @@ class CnnDailymailTest(tfds_test.DatasetBuilderTestCase):
                        'train_urls': 'all_train.txt',
                        'val_urls': 'all_val.txt'}
 
+  def test_get_art_abs(self):
+    with tempfile.NamedTemporaryFile(delete=True) as f:
+      f.write(_STORY_FILE)
+      f.flush()
+      article, abstract = cnn_dailymail._get_art_abs(f.name)
+      self.assertEqual('some article. this is some article text.',
+                       article)
+      # This is a bit weird, but the original code at
+      # https://github.com/abisee/cnn-dailymail/ adds space before period
+      # for abstracts and we retain this behavior.
+      self.assertEqual('highlight text . highlight two . highlight three .',
+                       abstract)
+
 
 if __name__ == '__main__':
   tfds_test.test_main()