# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""LibriTTS dataset."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import tensorflow.compat.v2 as tf

import tensorflow_datasets.public_api as tfds

_CITATION = """\
@inproceedings{zen2019libritts,
  title = {LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech},
  author = {H. Zen and V. Dang and R. Clark and Y. Zhang and R. J. Weiss and Y. Jia and Z. Chen and Y. Wu},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2019},
  doi = {10.21437/Interspeech.2019-2441},
}
"""

_DESCRIPTION = """\
LibriTTS is a multi-speaker English corpus of approximately 585 hours of read
English speech at a 24kHz sampling rate, prepared by Heiga Zen with the
assistance of Google Speech and Google Brain team members. The LibriTTS corpus
is designed for TTS research. It is derived from the original materials (mp3
audio files from LibriVox and text files from Project Gutenberg) of the
LibriSpeech corpus. The main differences from the LibriSpeech corpus are
listed below:

1. The audio files are at a 24kHz sampling rate.
2. The speech is split at sentence breaks.
3. Both original and normalized texts are included.
4. Contextual information (e.g., neighbouring sentences) can be extracted.
5. Utterances with significant background noise are excluded.
"""

_URL = "http://www.openslr.org/60"
_DL_URL = "http://www.openslr.org/resources/60/"
_DL_URLS = {
    "dev_clean": _DL_URL + "dev-clean.tar.gz",
    "dev_other": _DL_URL + "dev-other.tar.gz",
    "test_clean": _DL_URL + "test-clean.tar.gz",
    "test_other": _DL_URL + "test-other.tar.gz",
    "train_clean100": _DL_URL + "train-clean-100.tar.gz",
    "train_clean360": _DL_URL + "train-clean-360.tar.gz",
    "train_other500": _DL_URL + "train-other-500.tar.gz",
}


class Libritts(tfds.core.BeamBasedBuilder):
  """LibriTTS dataset."""

  VERSION = tfds.core.Version("1.0.0")
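
  # A minimal usage sketch for the prepared dataset (assumes the standard TFDS
  # snake_case name derived from this class, "libritts", and that
  # `download_and_prepare` has already been run):
  #
  #   import tensorflow_datasets as tfds
  #   ds = tfds.load("libritts", split="train_clean100")
  #   for example in ds.take(1):
  #     audio = example["speech"]          # 1-D tensor of audio samples.
  #     text = example["text_normalized"]  # Normalized transcript string.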

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "speech": tfds.features.Audio(),
            "text_original": tfds.features.Text(),
            "text_normalized": tfds.features.Text(),
            "speaker_id": tf.int64,
            "chapter_id": tf.int64,
            "id": tf.string,
        }),
        supervised_keys=("text_normalized", "speech"),
        homepage=_URL,
        citation=_CITATION,
        metadata=tfds.core.MetadataDict(sample_rate=24000,),
    )

  def _populate_metadata(self, dirs):
    # All dirs contain the same metadata.
    directory = list(dirs.values())[0]

    speaker_info = {}
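    # speakers.tsv is expected to be tab-separated, with a single header row
    # followed by rows of the form (layout inferred from the parsing below):
    #   <reader id>\t<gender>\t<subset>\t<name>
    # where the trailing <name> field may be missing.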
    path = os.path.join(directory, "LibriTTS/speakers.tsv")
    with tf.io.gfile.GFile(path) as f:
      for n, line in enumerate(f):
        # Skip the first line which is just a header.
        if n == 0:
          continue
        fields = line.strip().split("\t")
        if len(fields) == 3:
          # Some lines are missing the final field, so leave it blank.
          fields.append("")
        id_str, gender, subset, name = fields
        speaker_info[int(id_str)] = {
            "gender": gender,
            "subset": subset,
            "name": name,
        }
    self.info.metadata["speakers"] = speaker_info

  def _split_generators(self, dl_manager):
    extracted_dirs = dl_manager.download_and_extract(_DL_URLS)
    self._populate_metadata(extracted_dirs)
    splits = [tfds.core.SplitGenerator(name=k, gen_kwargs={"directory": v})
              for k, v in extracted_dirs.items()]
    return splits

  def _build_pcollection(self, pipeline, directory):
    """Generates examples as dicts."""
    beam = tfds.core.lazy_imports.apache_beam
    return (pipeline
            | beam.Create([directory])
            | beam.FlatMap(_generate_libritts_examples)
            | beam.Reshuffle())
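
  # NOTE: generating this dataset requires Apache Beam. A rough local-run
  # sketch, assuming `tfds.download.DownloadConfig` accepts `beam_options`
  # (adjust to your TFDS version):
  #
  #   import apache_beam as beam
  #   import tensorflow_datasets as tfds
  #   dl_config = tfds.download.DownloadConfig(
  #       beam_options=beam.options.pipeline_options.PipelineOptions())
  #   tfds.builder("libritts").download_and_prepare(download_config=dl_config)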


def _generate_libritts_examples(directory):
  """Generate examples from a LibriTTS directory."""
  transcripts_glob = os.path.join(directory, "LibriTTS", "*/*/*/*.trans.tsv")
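  # Each *.trans.tsv row is expected to hold three tab-separated fields
  # (layout inferred from the parsing below):
  #   <key>\t<original text>\t<normalized text>
  # where <key> is "<speaker>_<chapter>_..." and also names the "<key>.wav"
  # audio file in the same directory.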
  for transcript_file in tf.io.gfile.glob(transcripts_glob):
    path = os.path.dirname(transcript_file)
    # `transcript_file` is already a full path returned by glob, so open it
    # directly rather than re-joining it with its own directory.
    with tf.io.gfile.GFile(transcript_file) as f:
      for line in f:
        key, text_original, text_normalized = line.strip().split("\t")
        audio_file = "%s.wav" % key
        speaker_id, chapter_id = [int(el) for el in key.split("_")[:2]]
        example = {
            "speech": os.path.join(path, audio_file),
            "text_normalized": text_normalized,
            "text_original": text_original,
            "speaker_id": speaker_id,
            "chapter_id": chapter_id,
            "id": key,
        }
        yield key, example