|
| 1 | +""" Block ud.la.AddMwt for heuristic detection of multi-word (PRON + cum, nonne) and abbreviations-dots tokens. """ |
| 2 | +import udapi.block.ud.addmwt |
| 3 | + |
# Analyses for the known Latin multiword tokens, keyed by lower-case surface
# form. Each attribute is a space-separated string; there are two values per
# attribute, presumably one per word the token is split into (e.g. the form
# 'me cum' for 'mecum') -- confirm against the base AddMwt block.
MWTS = {
    'mecum': {
        'lemma': 'ego cum',
        'form': 'me cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    'tecum': {
        'lemma': 'tu cum',
        'form': 'te cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    # NOTE(review): this entry uses Gender=Neut while every other pronoun
    # entry uses Gender=Masc -- confirm that the difference is intended.
    'nobiscum': {
        'lemma': 'nos cum',
        'form': 'nobis cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    'vobiscum': {
        'lemma': 'vos cum',
        'form': 'vobis cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    # Orthographic variant of 'vobiscum' written with 'u' instead of 'v'.
    'uobiscum': {
        'lemma': 'uos cum',
        'form': 'uobis cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    # No Number feature here: 'se' can be singular or plural.
    'secum': {
        'lemma': 'sui cum',
        'form': 'se cum',
        'upos': 'PRON ADP',
        'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes',
        'deprel': 'obl case',
    },
    # The only entry that declares its own 'shape'.
    'nonne': {
        'lemma': 'non ne',
        'form': 'non ne',
        'upos': 'PART PART',
        'feats': 'Polarity=Neg Clitic=Yes|PartType=Int',
        'deprel': 'advmod:neg discourse',
        'shape': 'sibling',
    },
}
| 13 | + |
# Shared values for all entries in MWTS (applied once, at import time).
for v in MWTS.values():
    # 'xpos' is deliberately not set here because it is treebank-specific.
    # Entries that do not declare their own shape default to 'subtree';
    # an explicit shape (e.g. 'sibling' for 'nonne') is left untouched.
    v.setdefault('shape', 'subtree')
    # 'main' is set for every entry, including those with an explicit shape.
    # NOTE(review): presumably the index of the head word among the split
    # words -- confirm against the base AddMwt block.
    v['main'] = 0
| 20 | + |
| 21 | + |
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token.

        Two cases are recognised, in this order:

        1. The lower-cased form of `node` is a key of `MWTS`. The matching
           entry is returned as-is (the shared module-level dict, not a copy).
        2. The form ends with a dot and contains at least one non-dot
           character. The final dot is split off as a separate PUNCT word.
           Tokens made only of dots ('.', '..', '...', ...) are punctuation,
           not abbreviations, and are left alone.

        Args:
            node: the node to analyse; only its `form` attribute is read.

        Returns:
            A dict with the keys 'form', 'lemma', 'upos', 'feats', 'deprel',
            'main' and 'shape' (plus 'xpos' in the abbreviation case), or
            None when neither case applies.
        """
        form = node.form

        analysis = MWTS.get(form.lower())
        if analysis is not None:
            return analysis

        # `form.strip('.')` is empty exactly when the token consists only of
        # dots, so this skips '.', '...' and any other run of dots.
        if form.endswith('.') and form.strip('.'):
            # currently under discussion
            # NOTE(review): '*' presumably tells the base block to keep the
            # original node's value for that word -- confirm against
            # udapi.block.ud.addmwt.AddMwt.
            return {
                'form': form[:-1] + ' .',
                'lemma': '* .',
                'upos': '* PUNCT',
                'xpos': '_ _',
                'feats': '* _',
                'deprel': '* punct',
                'main': 0,
                'shape': 'subtree',
            }

        return None
| 41 | + |
0 commit comments