Skip to content

Commit 0f3fa18

Browse files
committed
[feat] merge
2 parents 3322cd9 + 822cc88 commit 0f3fa18

File tree

7 files changed

+396
-265
lines changed

7 files changed

+396
-265
lines changed

EduNLP/Pretrain/gensim_vec.py

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,31 +7,60 @@
77
from gensim.models.doc2vec import TaggedDocument
88
from gensim.models.callbacks import CallbackAny2Vec
99
from EduNLP.SIF.sif import sif4sci
10-
from EduNLP.Vector import D2V
10+
from EduNLP.Vector import D2V, BowLoader
1111
from copy import deepcopy
1212
import itertools as it
1313

1414
__all__ = ["GensimWordTokenizer", "train_vector", "GensimSegTokenizer"]
1515

1616

1717
class GensimWordTokenizer(object):
18-
def __init__(self, symbol="gm"):
18+
def __init__(self, symbol="gm", general=False):
1919
"""
2020
2121
Parameters
2222
----------
2323
symbol:
2424
gm
2525
fgm
26+
gmas
27+
fgmas
28+
general:
29+
True when the item is not in standard format and formulas (except formulas in figures) should be tokenized linearly.
30+
False to tokenize formulas with the 'ast' method instead of 'linear'.
31+
32+
Returns
33+
----------
34+
35+
Examples
36+
----------
37+
>>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
38+
>>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
39+
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
40+
>>> print(token_item.tokens[:10])
41+
['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
42+
>>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False)
43+
>>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
44+
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
45+
>>> print(token_item.tokens[:10])
46+
['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']
2647
"""
2748
self.symbol = symbol
28-
self.tokenization_params = {
29-
"formula_params": {
30-
"method": "ast",
31-
"return_type": "list",
32-
"ord2token": True
49+
if general is True:
50+
self.tokenization_params = {
51+
"formula_params": {
52+
"method": "linear",
53+
"symbolize_figure_formula": True
54+
}
55+
}
56+
else:
57+
self.tokenization_params = {
58+
"formula_params": {
59+
"method": "ast",
60+
"return_type": "list",
61+
"ord2token": True
62+
}
3363
}
34-
}
3564

3665
def batch_process(self, *items):
3766
pass
@@ -135,16 +164,16 @@ def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None
135164
binary = binary if binary is not None else True
136165
elif method == "tfidf":
137166
dictionary_path = train_vector(items, w2v_prefix, method="bow")
138-
dictionary = D2V(dictionary_path, method="bow")
139-
corpus = [dictionary(item) for item in items]
167+
dictionary = BowLoader(dictionary_path)
168+
corpus = [dictionary.infer_vector(item) for item in items]
140169
model = gensim.models.TfidfModel(corpus)
141170
binary = binary if binary is not None else True
142171
else:
143172
raise ValueError("Unknown method: %s" % method)
144173

145174
filepath = w2v_prefix + method
146175
if embedding_dim is not None:
147-
filepath = w2v_prefix + "_" + str(embedding_dim)
176+
filepath = filepath + "_" + str(embedding_dim)
148177

149178
if binary is True:
150179
filepath += ".bin"

EduNLP/SIF/tokenization/tokenization.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ def __init__(self, segment_list: SegmentList, text_params=None, formula_params=N
3333
"s": []
3434
}
3535
self.text_params = text_params if text_params is not None else {}
36+
if formula_params is not None and "symbolize_figure_formula" in formula_params:
37+
self.symbolize_figure_formula = formula_params.pop("symbolize_figure_formula")
38+
else:
39+
self.symbolize_figure_formula = False
3640
self.formula_params = formula_params if formula_params is not None else {"method": "linear"}
3741
self.formula_tokenize_method = self.formula_params.get("method")
3842
self.figure_params = figure_params if figure_params is not None else {}
@@ -166,6 +170,9 @@ def append_formula(self, segment, symbol=False, init=True):
166170
if symbol is True:
167171
self._formula_tokens.append(len(self._tokens))
168172
self._tokens.append(segment)
173+
elif self.symbolize_figure_formula and isinstance(segment, FigureFormulaSegment):
174+
self._formula_tokens.append(len(self._tokens))
175+
self._tokens.append(Symbol(FORMULA_SYMBOL))
169176
elif isinstance(segment, FigureFormulaSegment):
170177
self._formula_tokens.append(len(self._tokens))
171178
self._tokens.append(segment)

EduNLP/Vector/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# coding: utf-8
22
# 2021/5/29 @ tongshiwei
33

4-
from .gensim_vec import W2V, D2V
4+
from .gensim_vec import W2V, D2V, BowLoader, TfidfLoader
55
from .const import *
66
from .rnn import RNNModel
77
from .t2v import T2V, get_pretrained_t2v

EduNLP/Vector/gensim_vec.py

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
class W2V(Vector):
14-
def __init__(self, filepath, method, binary=None):
14+
def __init__(self, filepath, method=None, binary=None):
1515
"""
1616
1717
Parameters
@@ -69,16 +69,41 @@ def infer_tokens(self, items, *args, **kwargs) -> list:
6969
return [list(self(*item)) for item in items]
7070

7171

72+
class BowLoader(object):
    """Bag-of-words vectorizer backed by a saved ``corpora.Dictionary``."""

    def __init__(self, filepath):
        """Load the dictionary stored at ``filepath``.

        Parameters
        ----------
        filepath:
            Path handed to ``corpora.Dictionary.load``.
        """
        self.dictionary = corpora.Dictionary.load(filepath)

    def infer_vector(self, item, return_vec=False):
        """Convert ``item`` to its bag-of-words representation.

        Parameters
        ----------
        item:
            The document passed to ``dictionary.doc2bow``.
        return_vec:
            When False (default), return the ``doc2bow`` result unchanged.
            When True, return a dense list with one slot per dictionary key.

        Returns
        ----------
        The ``doc2bow`` output, or a dense list of length ``vector_size``
        in which every position not present in that output is 0.
        """
        sparse = self.dictionary.doc2bow(item)
        if return_vec:
            # Scatter the (index, value) pairs into a zero-filled list.
            dense = [0] * len(self.dictionary.keys())
            for index, value in sparse:
                dense[index] = value
            return dense
        # Default: hand back the sparse pairs exactly as doc2bow produced them.
        return sparse

    @property
    def vector_size(self):
        """Number of keys in the dictionary, i.e. the dense vector length."""
        keys = self.dictionary.keys()
        return len(keys)
88+
89+
7290
class TfidfLoader(object):
7391
def __init__(self, filepath):
7492
self.tfidf_model = TfidfModel.load(filepath)
7593
# The 'tfidf' model must be used on top of a 'bow' model
7694
dictionary_path = re.sub(r"(.*)tfidf", r"\1bow", filepath)
7795
self.dictionary = corpora.Dictionary.load(dictionary_path)
7896

79-
def infer_vector(self, item):
80-
item = self.dictionary.doc2bow(item)
81-
return self.tfidf_model[item]
97+
def infer_vector(self, item, return_vec=False):
98+
dic_item = self.dictionary.doc2bow(item)
99+
tfidf_item = self.tfidf_model[dic_item]
100+
# by default, return the sparse (index, value) pairs rather than a dense vector
101+
if not return_vec:
102+
return tfidf_item # pragma: no cover
103+
vec = [0 for i in range(len(self.dictionary.keys()))]
104+
for i, v in tfidf_item:
105+
vec[i] = v
106+
return vec
82107

83108
@property
84109
def vector_size(self):
@@ -92,24 +117,26 @@ def __init__(self, filepath, method="d2v"):
92117
if self._method == "d2v":
93118
self.d2v = Doc2Vec.load(filepath)
94119
elif self._method == "bow":
95-
self.d2v = corpora.Dictionary.load(filepath)
120+
self.d2v = BowLoader(filepath)
96121
elif self._method == "tfidf":
97122
self.d2v = TfidfLoader(filepath)
98123
else:
99124
raise ValueError("Unknown method: %s" % method)
100125

101126
def __call__(self, item):
102-
if self._method == "bow":
103-
return self.d2v.doc2bow(item)
104-
else:
127+
if self._method == "d2v":
105128
return self.d2v.infer_vector(item)
129+
else:
130+
return self.d2v.infer_vector(item, return_vec=True)
106131

107132
@property
108133
def vector_size(self):
109-
if self._method in {"d2v", "tfidf"}:
134+
if self._method == "d2v":
110135
return self.d2v.vector_size
111136
elif self._method == "bow":
112-
return len(self.d2v.token2id)
137+
return self.d2v.vector_size
138+
elif self._method == "tfidf":
139+
return self.d2v.vector_size
113140

114141
def infer_vector(self, items, *args, **kwargs) -> list:
115142
return [self(item) for item in items]

0 commit comments

Comments
 (0)