Skip to content

Commit 4e875c5

Browse files
authored
upload addmwt.py
1 parent f7dc798 commit 4e875c5

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

udapi/block/ud/la/addmwt.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
""" Block ud.la.AddMwt for heuristic detection of multi-word (PRON + cum, nonne) and abbreviations-dots tokens. """
2+
import udapi.block.ud.addmwt
3+
4+
# Latin "pronoun + enclitic cum" tokens, keyed by the lowercased surface form.
# Each space-separated value has two parts: the first belongs to the pronoun,
# the second to the postposed "cum".
# NOTE(review): 'nobiscum' is the only entry with Gender=Neut (all others use
# Gender=Masc) -- confirm whether this is intentional before changing it.
MWTS = {
    'mecum': {'lemma': 'ego cum', 'form': 'me cum', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes'},
    'tecum': {'lemma': 'tu cum', 'form': 'te cum', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes'},
    'nobiscum': {'lemma': 'nos cum', 'form': 'nobis cum', 'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes'},
    'vobiscum': {'lemma': 'vos cum', 'form': 'vobis cum', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes'},
    'uobiscum': {'lemma': 'uos cum', 'form': 'uobis cum', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes'},
    'secum': {'lemma': 'sui cum', 'form': 'se cum', 'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes'},  # can be singular or plural
}

# Shared values for all entries in MWTS: every entry is a PRON followed by an
# ADP, with the pronoun (index 0) as the main word.
for v in MWTS.values():
    v['upos'] = 'PRON ADP'
    # v['xpos'] = ''  # treebank-specific, deliberately left unset
    v['deprel'] = 'obl case'
    v['main'] = 0
    v['shape'] = 'subtree'
20+
21+
22+
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token.

        Three cases are recognized, in this order:

        1. The lowercased form is a key of the module-level ``MWTS`` table
           (pronoun + enclitic "cum"); the shared table entry is returned.
        2. The form ends with a dot and contains at least one non-dot
           character (e.g. an abbreviation); the final dot is split off as a
           separate PUNCT word.
        3. The lemma is "nonne"; it is split into "non" + "ne".

        The returned dict uses space-separated strings with one item per
        resulting word. The '*' and '_' placeholders are presumably resolved
        by the base class -- verify against udapi.block.ud.addmwt.
        """
        analysis = MWTS.get(node.form.lower(), None)
        if analysis is not None:
            return analysis

        # Split a trailing dot off the token. `strip('.')` is empty for tokens
        # made up only of dots ('.', '..', '...', '....'), so bare punctuation
        # and ellipses of any length are left untouched.
        if node.form.endswith('.') and node.form.strip('.'):
            return {
                'form': node.form[:-1] + ' .',
                'lemma': '* .',
                'upos': '* PUNCT',
                'xpos': '_ _',
                'feats': '* _',
                'deprel': '* punct',
                'main': 0,
                'shape': 'subtree',
            }

        # The interrogative "nonne" is identified by lemma, not by form.
        if node.lemma == 'nonne':
            return {
                'form': 'non ne',
                'lemma': 'non ne',
                'upos': 'PART PART',
                # 'xpos': '_ _',  # treebank-specific
                'feats': 'Polarity=Neg Clitic=Yes|PartType=Int',
                'deprel': 'advmod:neg discourse',
                'main': 0,
                'shape': 'sibling',
            }

        return None
58+

0 commit comments

Comments
 (0)