Skip to content

Commit a566653

Browse files
authored
Merge pull request #118 from fjambe/master
upload addmwt.py for Latin
2 parents f7dc798 + 6fe35b1 commit a566653

File tree

1 file changed

+41
-0
lines changed

1 file changed

+41
-0
lines changed

udapi/block/ud/la/addmwt.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
""" Block ud.la.AddMwt for heuristic detection of multi-word (PRON + cum, nonne) and abbreviations-dots tokens. """
2+
import udapi.block.ud.addmwt
3+
4+
# Lexicon of known Latin multi-word tokens.
#
# Keys are lowercased surface forms (the lookup in AddMwt.multiword_analysis
# lowercases node.form before consulting this table). Each value describes how
# the token is split: 'form', 'lemma', 'upos', 'feats' and 'deprel' each hold
# space-separated values, one per resulting word, in surface order.
MWTS = {
    'mecum': {
        'lemma': 'ego cum',
        'form': 'me cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    'tecum': {
        'lemma': 'tu cum',
        'form': 'te cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    # NOTE(review): this is the only pronoun entry with Gender=Neut; the others
    # use Gender=Masc. Confirm that the difference is intended.
    'nobiscum': {
        'lemma': 'nos cum',
        'form': 'nobis cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    'vobiscum': {
        'lemma': 'vos cum',
        'form': 'vobis cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    # Same token as 'vobiscum' in the u-for-v spelling.
    'uobiscum': {
        'lemma': 'uos cum',
        'form': 'uobis cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    # No Number feature here: the form can be singular or plural.
    'secum': {
        'lemma': 'sui cum',
        'form': 'se cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    # The only entry that sets its own 'shape' instead of taking the default.
    'nonne': {
        'lemma': 'non ne',
        'form': 'non ne',
        'upos': 'PART PART',
        'feats': 'Polarity=Neg Clitic=Yes|PartType=Int',
        'deprel': 'advmod:neg discourse',
        'shape': 'sibling',
    },
}

# Shared values for all entries in MWTS.
for v in MWTS.values():
    # v['xpos'] = ''  # treebank-specific
    # Entries that do not specify a shape default to 'subtree'.
    v.setdefault('shape', 'subtree')
    # Presumably the index of the head word among the split words -- confirm
    # against the base udapi.block.ud.addmwt.AddMwt block.
    v['main'] = 0
20+
21+
22+
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token.

        Two heuristics are applied, in this order:

        1. The lowercased form is looked up in the module-level MWTS table.
           On a hit the table entry itself (not a copy) is returned.
        2. A form longer than one character that ends with a dot, other than
           the literal '...', is split into the part before the dot and a
           separate '.' punctuation word.

        Args:
            node: the token to analyse; only its `form` attribute is read.

        Returns:
            A dict with the keys 'form', 'lemma', 'upos', 'feats', 'deprel',
            'main' and 'shape' (plus 'xpos' for the trailing-dot case), or
            None when neither heuristic applies.
        """
        analysis = MWTS.get(node.form.lower(), None)
        if analysis is not None:
            return analysis

        # NOTE(review): only the exact three-dot ellipsis is exempted; other
        # dot-only forms such as '..' or '....' are still split here.
        if node.form.endswith('.') and len(node.form) > 1 and node.form != '...':
            # currently under discussion
            # '*' presumably tells the base block to keep the original value of
            # that attribute for the first word -- confirm in
            # udapi.block.ud.addmwt.
            return {'form': node.form[:-1] + ' .',
                    'lemma': '* .',
                    'upos': '* PUNCT',
                    'xpos': '_ _',
                    'feats': '* _',
                    'deprel': '* punct',
                    'main': 0,
                    'shape': 'subtree'}

        # Not a multiword token.
        return None
41+

0 commit comments

Comments
 (0)