@@ -145,8 +145,6 @@ def _subset_filenames(dl_paths, split):
145
145
146
146
DM_SINGLE_CLOSE_QUOTE = u'\u2019 ' # unicode
147
147
DM_DOUBLE_CLOSE_QUOTE = u'\u201d '
148
- SENTENCE_START = '<s>'
149
- SENTENCE_END = '</s>'
150
148
# acceptable ways to end a sentence
151
149
END_TOKENS = ['.' , '!' , '?' , '...' , "'" , '`' , '"' ,
152
150
DM_SINGLE_CLOSE_QUOTE , DM_DOUBLE_CLOSE_QUOTE , ')' ]
@@ -201,31 +199,31 @@ def fix_missing_period(line):
201
199
202
200
# Make abstract into a single string by joining the highlight sentences
203
201
# with single spaces; no <s> and </s> tags are added around the sentences.
204
- abstract = ' ' .join (['%s %s %s' % (SENTENCE_START , sent ,
205
- SENTENCE_END ) for sent in highlights ])
202
+ abstract = ' ' .join (highlights )
206
203
207
204
return article , abstract
208
205
209
206
210
207
class CnnDailymail (tfds .core .GeneratorBasedBuilder ):
211
208
"""CNN/DailyMail non-anonymized summarization dataset."""
209
+ # 0.0.2 is like 0.0.1 but without special tokens <s> and </s>.
212
210
BUILDER_CONFIGS = [
213
211
CnnDailymailConfig (
214
212
name = 'plain_text' ,
215
- version = '0.0.1 ' ,
213
+ version = '0.0.2 ' ,
216
214
description = 'Plain text' ,
217
215
),
218
216
CnnDailymailConfig (
219
217
name = 'bytes' ,
220
- version = '0.0.1 ' ,
218
+ version = '0.0.2 ' ,
221
219
description = ('Uses byte-level text encoding with '
222
220
'`tfds.features.text.ByteTextEncoder`' ),
223
221
text_encoder_config = tfds .features .text .TextEncoderConfig (
224
222
encoder = tfds .features .text .ByteTextEncoder ()),
225
223
),
226
224
CnnDailymailConfig (
227
225
name = 'subwords32k' ,
228
- version = '0.0.1 ' ,
226
+ version = '0.0.2 ' ,
229
227
description = ('Uses `tfds.features.text.SubwordTextEncoder` with '
230
228
'32k vocab size' ),
231
229
text_encoder_config = tfds .features .text .TextEncoderConfig (
@@ -260,8 +258,7 @@ def _split_generators(self, dl_manager):
260
258
# Generate shared vocabulary
261
259
# maybe_build_from_corpus uses SubwordTextEncoder if that's configured
262
260
self .info .features [_ARTICLE ].maybe_build_from_corpus (
263
- self ._vocab_text_gen (train_files ),
264
- reserved_tokens = [SENTENCE_START , SENTENCE_END ])
261
+ self ._vocab_text_gen (train_files ))
265
262
encoder = self .info .features [_ARTICLE ].encoder
266
263
# Use maybe_set_encoder because the encoder may have been restored from
267
264
# package data.
0 commit comments