12
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
13
# See the License for the specific language governing permissions and
14
14
# limitations under the License.
15
+ from functools import lru_cache
15
16
16
17
import collections
17
18
import logging
26
27
27
28
logger = logging .getLogger (__name__ )
28
29
29
- # This set contains all letter and number characters.
30
- _ALPHANUMERIC_CHAR_SET = set (
31
- six .unichr (i )
32
- for i in six .moves .xrange (sys .maxunicode )
33
- if (unicodedata .category (six .unichr (i )).startswith ("L" ) or
34
- unicodedata .category (six .unichr (i )).startswith ("N" )))
30
+
31
@lru_cache()
def get_alphanumeric_char_set():
    """Return the set of all Unicode letter and number characters.

    A character is included when its Unicode general category starts with
    "L" (letter) or "N" (number). Every code point below sys.maxunicode is
    scanned, which is slow, so the result is memoized with lru_cache: the set
    is built on the first call and that same object is handed back on every
    later call. Because the object is shared, callers must not mutate it.

    Returns:
        a set of single-character unicode strings
    """
    # Look up each character's category only once; startswith() accepts a
    # tuple, so both major classes are tested in a single call.
    return set(
        char
        for char in (six.unichr(i) for i in six.moves.xrange(sys.maxunicode))
        if unicodedata.category(char).startswith(("L", "N")))
39
+
35
40
36
41
# Regular expression for unescaping token strings.
37
42
# '\u' is converted to '_'
41
46
_ESCAPE_CHARS = set (u"\\ _u;0123456789" )
42
47
43
48
44
- def native_to_unicode_py2 (s ):
45
- """Python 2: transform native string to Unicode."""
46
- return s if isinstance (s , unicode ) else s .decode ("utf8" ) # noqa: F821
49
+ # Conversion between Unicode and UTF-8, if required (on Python2)
50
def native_to_unicode(s):
    """Convert a native string to a unicode string.

    Args:
        s: a native string

    Returns:
        On Python 2, `s` itself when it is already a `unicode` object and
        otherwise `s` decoded as UTF-8. On Python 3, `s` unchanged.
    """
    if not six.PY2:
        # No conversion required on Python 3.
        return s
    if isinstance(s, unicode):  # noqa: F821
        return s
    return s.decode("utf8")
47
55
48
56
49
- # Conversion between Unicode and UTF-8, if required (on Python2)
50
- if six .PY2 :
51
- native_to_unicode = native_to_unicode_py2
52
- unicode_to_native = lambda s : s .encode ("utf-8" )
53
- else :
54
- # No conversion required on Python3
55
- native_to_unicode = lambda s : s
56
- unicode_to_native = lambda s : s
57
def unicode_to_native(s):
    """Convert a unicode string to a native string.

    Args:
        s: a unicode string

    Returns:
        On Python 2, `s` encoded as UTF-8. On Python 3, `s` unchanged.
    """
    # Only Python 2 needs the encode step; Python 3 passes the value through.
    return s.encode("utf-8") if six.PY2 else s
57
62
58
63
59
64
def encode (text ):
@@ -69,7 +74,7 @@ def encode(text):
69
74
ret = []
70
75
token_start = 0
71
76
# Classify each character in the input string
72
- is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text ]
77
+ is_alnum = [c in get_alphanumeric_char_set () for c in text ]
73
78
for pos in six .moves .xrange (1 , len (text )):
74
79
if is_alnum [pos ] != is_alnum [pos - 1 ]:
75
80
token = text [token_start :pos ]
@@ -89,7 +94,7 @@ def decode(tokens):
89
94
Returns:
90
95
a unicode string
91
96
"""
92
- token_is_alnum = [t [0 ] in _ALPHANUMERIC_CHAR_SET for t in tokens ]
97
+ token_is_alnum = [t [0 ] in get_alphanumeric_char_set () for t in tokens ]
93
98
ret = []
94
99
for i , token in enumerate (tokens ):
95
100
if i > 0 and token_is_alnum [i - 1 ] and token_is_alnum [i ]:
0 commit comments