|
"""Block ud.la.AddMwt for heuristic detection of multiword tokens (PRON + cum, nonne) and of abbreviation tokens ending in a dot."""
| 2 | +import udapi.block.ud.addmwt |
| 3 | + |
# Latin tokens made of a personal/reflexive pronoun fused with "cum".
# The table is looked up by the lower-cased token form.  Each string value
# holds two space-separated items: the first belongs to the pronoun, the
# second to "cum".
MWTS = {
    'mecum': {
        'lemma': 'ego cum',
        'form': 'me cum',
        'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes',
    },
    'tecum': {
        'lemma': 'tu cum',
        'form': 'te cum',
        'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes',
    },
    # NOTE(review): Gender=Neut here, while every other entry uses
    # Gender=Masc -- possibly a typo; confirm against the treebank guidelines.
    'nobiscum': {
        'lemma': 'nos cum',
        'form': 'nobis cum',
        'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes',
    },
    'vobiscum': {
        'lemma': 'vos cum',
        'form': 'vobis cum',
        'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes',
    },
    # Spelling variant of "vobiscum" with "u" for "v".
    'uobiscum': {
        'lemma': 'uos cum',
        'form': 'uobis cum',
        'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes',
    },
    # No Number feature: "se" can be singular or plural.
    'secum': {
        'lemma': 'sui cum',
        'form': 'se cum',
        'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes',
    },
}

# Attributes that are identical for every entry in MWTS.
# 'xpos' is deliberately not set here because it is treebank-specific.
_SHARED_ATTRS = (
    ('upos', 'PRON ADP'),
    ('deprel', 'obl case'),
    ('main', 0),
    ('shape', 'subtree'),
)

for v in MWTS.values():
    v.update(_SHARED_ATTRS)
| 20 | + |
| 21 | + |
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` is not a multiword token.

        Three heuristics are tried in this order:

        1. the lower-cased form is a key of the module-level ``MWTS`` table
           (pronoun + "cum"); the table entry itself is returned;
        2. the form ends with a dot, is longer than one character and is not
           the three-dot ellipsis ``'...'``; the final dot is split off as
           punctuation;
        3. the lemma is ``'nonne'``; it is split into "non" + "ne".
        """
        form = node.form

        known = MWTS.get(form.lower())
        if known is not None:
            return known

        # Token with a trailing dot (e.g. an abbreviation).
        # NOTE(review): all-dot tokens other than '...' (e.g. '..' or '....')
        # also match this test -- confirm whether that is intended.
        if len(form) > 1 and form != '...' and form.endswith('.'):
            # '*' and '_' are presumably placeholders interpreted by the base
            # class -- confirm against udapi.block.ud.addmwt.
            return {
                'form': form[:-1] + ' .',
                'lemma': '* .',
                'upos': '* PUNCT',
                'xpos': '_ _',
                'feats': '* _',
                'deprel': '* punct',
                'main': 0,
                'shape': 'subtree',
            }

        if node.lemma != 'nonne':
            return None

        # 'xpos' is left out on purpose: it is treebank-specific.
        return {
            'form': 'non ne',
            'lemma': 'non ne',
            'upos': 'PART PART',
            'feats': 'Polarity=Neg Clitic=Yes|PartType=Int',
            'deprel': 'advmod:neg discourse',
            'main': 0,
            'shape': 'sibling',
        }
| 58 | + |
0 commit comments