Skip to content

Commit 0269841

Browse files
committed
Improved block ud.FixCompoundName.
1 parent ac75629 commit 0269841

File tree

1 file changed

+23
-14
lines changed

1 file changed

+23
-14
lines changed

udapi/block/ud/fixcompoundname.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
for person names. This block can be used to fix them.
88
"""
99
from udapi.core.block import Block
10+
import regex as re
1011
import logging
1112

1213

@@ -19,18 +20,26 @@ class FixCompoundName(Block):
1920

2021
def process_node(self, node):
    """Turn a PROPN ``compound`` name into a head-first ``flat:name`` structure.

    The method acts only when ``node`` is a PROPN attached as ``compound``
    to a PROPN parent. The name words are that parent together with all of
    its PROPN ``compound`` children, taken in word order (``ord``). The
    first name word becomes the technical head: it takes over the original
    parent's own parent and deprel, and every other name word is attached
    to it as ``flat:name``.

    Names in which any word form contains a digit are left untouched.

    Args:
        node: the node to examine. The tree around it is modified in place.

    Returns:
        None.
    """
    if not (node.upos == 'PROPN' and node.udeprel == 'compound'
            and node.parent.upos == 'PROPN'):
        return

    origparent = node.parent
    # Remember how the name is attached to the rest of the tree before any
    # re-attachment below changes it.
    grandparent = origparent.parent
    outdeprel = origparent.deprel

    # The original parent plus its PROPN compound children, in word order.
    namewords = sorted(
        (x for x in origparent.children(add_self=True)
         if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)),
        key=lambda x: x.ord)

    # The Hindi treebank tags dates (['30', 'navaṁbara'], ['disaṁbara', '1993'])
    # as PROPN compounds. This is wrong but it is also different from the
    # personal names we are targeting here. Hence, we skip "names" that
    # contain numbers.
    if any(re.search(r"\d", x.form) for x in namewords):
        return

    ###!!! We currently cannot transform enhanced dependencies.
    ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies.
    # Every name word is re-attached below, so all of them are checked
    # rather than only the node that triggered the fix.
    # NOTE(review): whether logging.fatal stops processing depends on how the
    # application configured logging; this method itself continues.
    if any(x.deps for x in namewords):
        logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.')

    # The first name word will be the technical head. If it is the current parent, fine.
    head, *rest = namewords
    if head != origparent:
        head.parent = grandparent
        head.deprel = outdeprel
    for other in rest:
        other.parent = head
        other.deprel = 'flat:name'

0 commit comments

Comments
 (0)