Skip to content

Commit 3ac8f52

Browse files
authored
Merge pull request #14 from bigdata-ustc/parser
[FEATURE] add formula check in parser
2 parents e0f46ed + d2eb30f commit 3ac8f52

File tree

12 files changed

+584
-88
lines changed

12 files changed

+584
-88
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,4 +109,5 @@ venv.bak/
109109
# Pyre type checker
110110
.pyre/
111111

112-
# User Definition
112+
# User Definition
113+
data/

EduNLP/Formula/ast/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# coding: utf-8
22
# 2021/5/20 @ tongshiwei
33

4-
from .ast import str2ast, get_edges, ast, link_variable
4+
from .ast import str2ast, get_edges, ast, link_variable, katex_parse

EduNLP/Formula/ast/ast.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
# coding: utf-8
22
# 2021/5/20 @ tongshiwei
33
from typing import List, Dict
4-
54
from .katex import katex
65

7-
__all__ = ["str2ast", "get_edges", "ast", "link_variable"]
6+
7+
__all__ = ["str2ast", "get_edges", "ast", "link_variable", "katex_parse"]
8+
9+
10+
def katex_parse(formula):
    """Parse ``formula`` with the bundled KaTeX binding and return its list form.

    The parser is invoked with ``displayMode`` and ``trust`` both enabled and
    the parse result is converted with ``to_list()``.

    Parameters
    ----------
    formula
        The formula source handed to KaTeX (presumably a LaTeX string without
        the surrounding ``$`` -- confirm against callers).

    Returns
    -------
    Whatever ``to_list()`` yields for the parsed formula.
    """
    options = {'displayMode': True, 'trust': True}
    parsed = katex.katex.__parse(formula, options)
    return parsed.to_list()
812

913

1014
def str2ast(formula: str, *args, **kwargs):
@@ -34,6 +38,18 @@ def ast(formula: (str, List[Dict]), index=0, forest_begin=0, father_tree=None, i
3438
重新解析形成的特征树
3539
3640
todo: finish all types
41+
42+
Notes
43+
----------
44+
Some functions are not supported in katex
45+
eg :
46+
1. tag
47+
'\\begin{equation} \\tag{tagName} F=ma \\end{equation}'
48+
'\\begin{align} \\tag{1} y=x+z \\end{align}'
49+
'\\tag*{hi} x+y^{2x}'
50+
2. dddot
51+
'\\frac{ \\dddot y }{ x }'
52+
3. see other: https://github.com/KaTeX/KaTeX/blob/master/docs/support_table.md
3753
"""
3854
tree = []
3955
index += forest_begin
File renamed without changes.

EduNLP/SIF/parser/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: utf-8
2+
# 2021/6/02 @ fannazya
3+
4+
from .parser import (Parser)

EduNLP/SIF/parser/parser.py

Lines changed: 337 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,337 @@
1+
from EduNLP.Formula.ast import str2ast, katex_parse
2+
3+
4+
class Parser:
    r"""Single-pass scanner that checks item text and normalises it in place.

    The text is consumed left to right by :meth:`get_token`.  While scanning,
    bare runs of letters or digits that sit between Chinese characters or
    punctuation are wrapped in ``$...$``, an empty pair of parentheses is
    replaced by ``$\SIFChoice$`` and a run of underscores by ``$\SIFBlank$``.
    ``$...$`` formulas are validated with :meth:`_is_formula_legal`.

    Results are exposed through attributes (names are kept as originally
    spelled for backward compatibility):

    * ``text`` -- the (possibly rewritten) text
    * ``error_flag`` / ``error_postion`` / ``error_message`` -- set by
      :meth:`call_error` when scanning stops on an error
    * ``modify_flag`` -- 1 once the text has been rewritten
    * ``warnning`` -- 1 if a Chinese character was seen inside a formula
    * ``fomula_illegal_flag`` / ``fomula_illegal_message`` -- set when a
      formula is rejected by the KaTeX parser
    """

    def __init__(self, data):
        """Create a parser positioned at the start of ``data``."""
        self.lookahead = 0  # most recent token returned by get_token
        self.head = 0  # index of the next unread character in self.text
        self.text = data
        self.error_message = ''
        self.error_postion = 0
        self.error_flag = 0
        self.modify_flag = 0
        self.warnning = 0
        self.fomula_illegal_flag = 0
        self.fomula_illegal_message = ''

        # Lengths of the placeholder formulas inserted into the text.
        self.len_bracket = len('$\\SIFChoice$')
        self.len_underline = len('$\\SIFBlank$')

        # Token codes returned by get_token.
        self.error = -1
        self.character = 1
        self.en_pun = 2
        self.ch_pun = 3
        self.latex = 4
        self.end = 5
        self.empty = 6
        self.modify = 7
        self.blank = 8

        # NOTE(review): ch_pun_list is tested before en_pun_list in get_token
        # and the two lists share several entries as written here; presumably
        # the Chinese list was meant to hold full-width forms -- verify.
        self.en_pun_list = [',', '.', '?', '!',
                            ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '\\', '<', '>', '[', ']',
                            '-']  # add some other chars
        self.ch_pun_list = [',', '。', '!', '?', ':',
                            ';', '‘', '’', '“', '”', '(', ')', ' ', '、', '《', '》', '—', '.']
        # Characters allowed inside a bare letter/digit run.
        self.in_list = [',', '_', '-', '%']
        # Characters that may delimit a bare letter/digit run on either side.
        self.flag_list = [',', '。', '!', '?', ':',
                          ';', '‘', '’', '“', '”', '(', ')', ' ', '、', '《', '》',
                          '$', ',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '<', '>', '-',
                          '[', ']', '—']

    def is_number(self, uchar):
        """Return True if ``uchar`` is an ASCII digit (U+0030..U+0039)."""
        return u'\u0030' <= uchar <= u'\u0039'

    def is_alphabet(self, uchar):
        """Return True if ``uchar`` is an ASCII letter (A-Z or a-z)."""
        return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')

    def is_chinese(self, uchar):
        """Return True if ``uchar`` lies in U+4E00..U+9FA5."""
        return u'\u4e00' <= uchar <= u'\u9fa5'

    def _is_formula_legal(self, formula_str):
        r"""
        Judge whether the current formula meets our specification or not.

        A formula containing one of the SIF/figure tags is accepted without
        further checks.  Anything else must be parseable by ``katex_parse``;
        on a KaTeX ``ParseError`` the message is stored in
        ``fomula_illegal_message`` and ``fomula_illegal_flag`` is set.

        Parameters
        ----------
        formula_str
            The formula body, i.e. the text between the two ``$``.

        Returns
        -------
        True or False

        Raises
        ------
        Exception
            Any exception from ``katex_parse`` whose message does not mention
            ``ParseError`` is re-raised unchanged (it is not a formula error).
        """
        legal_tags = ['FormFigureID', 'FormFigureBase64', 'FigureID', 'FigureBase64',
                      'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline']
        if any(tag in formula_str for tag in legal_tags):
            return True
        try:
            katex_parse(formula_str)
        except Exception as e:
            # Explicit check instead of ``assert`` so the classification also
            # holds when Python runs with -O.
            if 'ParseError' not in str(e):
                raise
            self.fomula_illegal_message = "[FormulaError] " + str(e)
            self.fomula_illegal_flag = 1
            return False
        return True

    def call_error(self):
        """Record the current position and the text up to it as an error."""
        self.error_postion = self.head
        self.error_message = self.text[:self.head + 1]
        self.error_flag = 1

    def _wrap_bare_run(self, is_member):
        """Wrap the letter/digit run starting at ``head`` in ``$...$``.

        ``is_member`` is the character predicate of the run (``is_alphabet``
        or ``is_number``).  The run is only wrapped when it starts the text or
        follows a Chinese character / ``flag_list`` entry, and ends the text
        or is followed by one.  Any other placement is reported through
        ``call_error``: leaving ``head`` untouched without an error would make
        the caller rescan the same character forever.
        """
        text = self.text
        left = self.head
        if left > 0:
            forward = text[left - 1]
            if not (self.is_chinese(forward) or forward in self.flag_list):
                self.call_error()
                return self.error

        right = left
        while right < len(text) and (is_member(text[right]) or text[right] in self.in_list):
            right += 1
        if right < len(text) and not (self.is_chinese(text[right]) or text[right] in self.flag_list):
            self.call_error()
            return self.error

        self.text = text[:left] + "$" + text[left:right] + "$" + text[right:]
        self.head = right + 2  # skip the run plus the two inserted '$'
        self.modify_flag = 1
        return self.modify

    def _replace_choice(self, left):
        r"""Replace ``(`` + spaces + ``)`` starting at ``left`` by ``$\SIFChoice$``.

        ``head`` must already be just past the opening bracket.  Blanks
        (space or ``\xa0``) after the bracket are always skipped; the text is
        rewritten only if a closing bracket follows.  Returns True when the
        replacement happened.
        """
        text = self.text
        pos = self.head
        while pos < len(text) and text[pos] in (' ', '\xa0'):
            pos += 1
        self.head = pos
        if pos >= len(text) or text[pos] != ')':
            return False
        self.text = text[:left] + '$\\SIFChoice$' + text[pos + 1:]
        # Continue right after the inserted placeholder.
        self.head = left + self.len_bracket
        self.modify_flag = 1
        return True

    def _replace_blank(self, left):
        r"""Replace the run of ``_`` / spaces starting at ``left`` by ``$\SIFBlank$``."""
        text = self.text
        pos = self.head
        while pos < len(text) and text[pos] in ('_', ' '):
            pos += 1
        self.text = text[:left] + '$\\SIFBlank$' + text[pos:]
        # Continue right after the inserted placeholder.
        self.head = left + self.len_underline
        self.modify_flag = 1

    def _scan_formula(self):
        """Consume a ``$...$`` formula starting at ``head`` and validate it."""
        self.head += 1
        formula_start = self.head
        warned = False
        while self.head < len(self.text) and self.text[self.head] != '$':
            if not warned and self.is_chinese(self.text[self.head]):
                # Chinese inside a formula: warn, but only once per formula.
                print("Warning: there is some chinese characters in formula!")
                self.warnning = 1
                warned = True
            self.head += 1
        if self.head >= len(self.text):
            # No closing '$'.
            self.call_error()
            return self.error
        # Check that the formula is complete and parseable.
        if not self._is_formula_legal(self.text[formula_start:self.head]):
            self.call_error()
            return self.error
        self.head += 1
        return self.latex

    def get_token(self):
        """Consume the next token at ``head`` and return its token code.

        Returns ``empty`` at the end of the text, ``modify`` when the text was
        rewritten, ``error`` (after ``call_error``) for anything that cannot
        be handled, and otherwise the code of the token class that matched.
        """
        if self.head >= len(self.text):
            return self.empty
        ch = self.text[self.head]

        if self.is_chinese(ch):
            self.head += 1
            return self.character

        # Letters / digits outside a formula: only runs delimited by Chinese
        # characters or punctuation are repaired, everything else is treated
        # as a formula that was not entered in LaTeX syntax.
        if self.is_alphabet(ch):
            return self._wrap_bare_run(self.is_alphabet)
        if self.is_number(ch):
            return self._wrap_bare_run(self.is_number)

        if ch == '\n':
            self.head += 1
            return self.end

        if ch in self.ch_pun_list:
            left = self.head
            self.head += 1
            if ch == '(' and self._replace_choice(left):
                return self.modify
            return self.ch_pun

        if ch in self.en_pun_list:
            left = self.head
            self.head += 1
            if ch == '(' and self._replace_choice(left):
                return self.modify
            if ch == '_':
                self._replace_blank(left)
                return self.modify
            return self.en_pun

        if ch == '$':
            return self._scan_formula()

        self.call_error()
        return self.error

    def next_token(self):
        """Advance ``lookahead`` to the next token."""
        self.lookahead = self.get_token()

    def match(self, terminal):
        """Advance to the next token if ``lookahead`` equals ``terminal``."""
        if self.lookahead == terminal:
            self.next_token()

    def txt(self):
        """Read one token and, if it is a plain text/formula token, one more."""
        self.lookahead = self.get_token()
        if self.error_flag:
            return
        if self.lookahead in (self.character, self.en_pun, self.ch_pun, self.latex):
            self.match(self.lookahead)

    def txt_list(self):
        """Repeat :meth:`txt` until the text is exhausted or an error occurs.

        Implemented as a loop so that long texts cannot exhaust the
        interpreter's recursion limit.
        """
        while True:
            self.txt()
            if self.error_flag or self.lookahead == self.empty:
                return

    def description(self):
        """Scan a whole description (a token list followed by end of text)."""
        self.txt_list()
        if self.error_flag:
            return
        if self.lookahead == self.empty:
            self.match(self.lookahead)

    def description_list(self):
        r"""
        use Parser to process and describe the txt

        Scans ``self.text`` completely.  The outcome is reported through the
        instance attributes (``text``, ``error_flag``, ``modify_flag``,
        ``fomula_illegal_flag`` ...); nothing is returned.

        Parameters
        ----------

        Returns
        ----------

        Examples
        --------
        >>> text = '生产某种零件的A工厂25名工人的日加工零件数_ _'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.text
        '生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$'
        >>> text = 'X的分布列为( )'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.text
        '$X$的分布列为$\\SIFChoice$'
        >>> text = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.error_flag
        1
        >>> text = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.fomula_illegal_flag
        1
        """
        self.description()
        if self.error_flag:
            return
        if self.lookahead != self.empty:
            self.description_list()  # pragma: no cover
        else:
            self.error_flag = 0

0 commit comments

Comments
 (0)