Skip to content
This repository was archived by the owner on Jul 4, 2023. It is now read-only.

Commit 49bb1d7

Browse files
committed
Lazy load six
1 parent 38619d9 · commit 49bb1d7

File tree

1 file changed

+24 -19 lines changed

1 file changed

+24 -19 lines changed

torchnlp/encoders/text/subword_text_tokenizer.py

Lines changed: 24 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15+
from functools import lru_cache
1516

1617
import collections
1718
import logging
@@ -26,12 +27,16 @@
2627

2728
logger = logging.getLogger(__name__)
2829

29-
# This set contains all letter and number characters.
30-
_ALPHANUMERIC_CHAR_SET = set(
31-
six.unichr(i)
32-
for i in six.moves.xrange(sys.maxunicode)
33-
if (unicodedata.category(six.unichr(i)).startswith("L") or
34-
unicodedata.category(six.unichr(i)).startswith("N")))
30+
31+
@lru_cache()
32+
def get_alphanumeric_char_set():
33+
""" This set contains all letter and number characters. """
34+
return set(
35+
six.unichr(i)
36+
for i in six.moves.xrange(sys.maxunicode)
37+
if (unicodedata.category(six.unichr(i)).startswith("L") or
38+
unicodedata.category(six.unichr(i)).startswith("N")))
39+
3540

3641
# Regular expression for unescaping token strings.
3742
# '\u' is converted to '_'
@@ -41,19 +46,19 @@
4146
_ESCAPE_CHARS = set(u"\\_u;0123456789")
4247

4348

44-
def native_to_unicode_py2(s):
45-
"""Python 2: transform native string to Unicode."""
46-
return s if isinstance(s, unicode) else s.decode("utf8") # noqa: F821
49+
# Conversion between Unicode and UTF-8, if required (on Python2)
50+
def native_to_unicode(s):
51+
if six.PY2:
52+
return s if isinstance(s, unicode) else s.decode("utf8") # noqa: F821
53+
else:
54+
return s
4755

4856

49-
# Conversion between Unicode and UTF-8, if required (on Python2)
50-
if six.PY2:
51-
native_to_unicode = native_to_unicode_py2
52-
unicode_to_native = lambda s: s.encode("utf-8")
53-
else:
54-
# No conversion required on Python3
55-
native_to_unicode = lambda s: s
56-
unicode_to_native = lambda s: s
57+
def unicode_to_native(s):
58+
if six.PY2:
59+
return s.encode("utf-8")
60+
else:
61+
return s
5762

5863

5964
def encode(text):
@@ -69,7 +74,7 @@ def encode(text):
6974
ret = []
7075
token_start = 0
7176
# Classify each character in the input string
72-
is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
77+
is_alnum = [c in get_alphanumeric_char_set() for c in text]
7378
for pos in six.moves.xrange(1, len(text)):
7479
if is_alnum[pos] != is_alnum[pos - 1]:
7580
token = text[token_start:pos]
@@ -89,7 +94,7 @@ def decode(tokens):
8994
Returns:
9095
a unicode string
9196
"""
92-
token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
97+
token_is_alnum = [t[0] in get_alphanumeric_char_set() for t in tokens]
9398
ret = []
9499
for i, token in enumerate(tokens):
95100
if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:

0 commit comments

Comments
 (0)