Skip to content

Commit 375fa36

Browse files
cyfra and copybara-github
authored and committed
Fixing utf-8 issues in translate datasets.
PiperOrigin-RevId: 275898852
1 parent 3cadb7f commit 375fa36

File tree

1 file changed

+8
-2
lines changed
  • tensorflow_datasets/translate

1 file changed

+8
-2
lines changed

tensorflow_datasets/translate/wmt.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
from __future__ import division
2020
from __future__ import print_function
2121

22+
import codecs
2223
import functools
2324
import gzip
2425
import itertools
@@ -870,8 +871,13 @@ def _get_tuv_seg(tuv):
870871
assert len(segs) == 1, "Invalid number of segments: %d" % len(segs)
871872
return segs[0].text
872873

873-
with tf.io.gfile.GFile(path) as f:
874-
for _, elem in ElementTree.iterparse(f):
874+
with tf.io.gfile.GFile(path, "rb") as f:
875+
if six.PY3:
876+
# Workaround due to: https://github.com/tensorflow/tensorflow/issues/33563
877+
utf_f = codecs.getreader("utf-8")(f)
878+
else:
879+
utf_f = f
880+
for _, elem in ElementTree.iterparse(utf_f):
875881
if elem.tag == "tu":
876882
yield {
877883
_get_tuv_lang(tuv):

0 commit comments

Comments
 (0)