Skip to content

Commit

Permalink
conll2cgel.py: incorporate subtokens
Browse files Browse the repository at this point in the history
  • Loading branch information
nschneid committed Jan 8, 2025
1 parent d8627f7 commit f6035b7
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 1 deletion.
6 changes: 5 additions & 1 deletion convertor/conll2cgel.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ def infer_cgel_pos(dtree: DependencyGraph) -> None:
cpos = 'V_aux'
case 'JJ'|'JJR'|'JJS':
cpos = 'Adj'
case 'RB' if node['word'].lower()=="n't": # negative clitic
cpos = ':subt'
case 'RB'|'RBR'|'RBS'|'WRB':
cpos = 'Adv' # TODO: some actually P
case 'TO':
Expand All @@ -81,7 +83,7 @@ def attach_subtokens(dtree: DependencyGraph) -> None:
if i==0: continue
if node['cpos']==':subt': # 's clitic
prevnode = dtree.nodes[i-1]
prevnode['cfeats'] = [(':subt', prevnode['word']), (':subt', node['word'])]
prevnode['subtoks'] = [(':subt', prevnode['word']), (':subt', node['word'])]
prevnode.setdefault('extra',[]).append('poss')
prevnode['word'] += node['word']

Expand Down Expand Up @@ -391,6 +393,8 @@ def build_cgel_tree(targettree: CGELTree, subtree: T, parent: int, dtree: Depend
if dnode['lemma'] != dnode['word']:
targettree.tokens[i].lemma = dnode['lemma']
targettree.tokens[i].xpos = dnode['tag']
if 'subtoks' in dnode:
targettree.tokens[i].substrings = dnode['subtoks']

def attach_punct(tree: CGELTree, sent: list[tuple[str,int]], dtree: dict) -> None:
"""Add punctuation terminals to the CGEL tree given the CGEL-tokenized
Expand Down
22 changes: 22 additions & 0 deletions convertor/wsj-dev-sample-conll2008.conll
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,25 @@
12 market market NN NN market market NN 13 NMOD _ _ _ _ A0
13 jitters jitters NNS NNS jitters jitters NNS 11 OBJ jitters.01 _ _ A1 _
14 . . . . . . . 4 P _ _ _ _ _

1 `` `` `` `` `` `` `` 3 P _ _ _ _ _
2 It it PRP PRP It it PRP 3 SBJ _ _ _ _ _
3 's be VBZ VBZ 's be VBZ 0 ROOT _ _ _ _ _
4 the the DT DT the the DT 7 NMOD _ _ _ _ _
5 most most RBS RBS most most RBS 6 AMOD _ AM-EXT _ _ _
6 distracting distracting VBG JJ distracting distract VBG 7 NMOD distract.01 _ _ _ _
7 thing thing NN NN thing thing NN 3 PRD _ A0 _ _ _
8 in in IN IN in in IN 7 LOC _ AM-LOC _ _ _
9 my my PRP$ PRP$ my my PRP$ 10 NMOD _ _ A0 _ _
10 life life NN NN life life NN 8 PMOD life.01 _ _ _ _
11 -- -- : : -- -- : 3 P _ _ _ _ _
12 I i PRP PRP I i PRP 13 SBJ _ _ _ A0 _
13 ca ca MD MD ca ca MD 3 COORD _ _ _ AM-MOD _
14 n't not RB RB n't not RB 13 ADV _ _ _ AM-NEG _
15 even even RB RB even even RB 13 ADV _ _ _ AM-DIS _
16 attend attend VB VB attend attend VB 13 VC attend.01 _ _ _ _
17 to to TO TO to to TO 16 ADV _ _ _ A1 _
18 my my PRP$ PRP$ my my PRP$ 19 NMOD _ _ _ _ A0
19 business business NN NN business business NN 17 PMOD business.01 _ _ _ A1
20 . . . . . . . 3 P _ _ _ _ _
21 '' '' '' '' '' '' '' 3 P _ _ _ _ _
22 changes: 22 additions & 0 deletions convertor/wsj-dev-sample-conll2008.conll10
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,25 @@
12 market market NN NN _ 13 NMOD _ _
13 jitters jitters NNS NNS _ 11 OBJ _ _
14 . . . . _ 4 P _ _

1 `` `` `` `` _ 3 P _ _
2 It it PRP PRP _ 3 SBJ _ _
3 's be VBZ VBZ _ 0 ROOT _ _
4 the the DT DT _ 7 NMOD _ _
5 most most RBS RBS _ 6 AMOD _ _
6 distracting distracting VBG JJ _ 7 NMOD _ _
7 thing thing NN NN _ 3 PRD _ _
8 in in IN IN _ 7 LOC _ _
9 my my PRP$ PRP$ _ 10 NMOD _ _
10 life life NN NN _ 8 PMOD _ _
11 -- -- : : _ 3 P _ _
12 I i PRP PRP _ 13 SBJ _ _
13 ca ca MD MD _ 3 COORD _ _
14 n't not RB RB _ 13 ADV _ _
15 even even RB RB _ 13 ADV _ _
16 attend attend VB VB _ 13 VC _ _
17 to to TO TO _ 16 ADV _ _
18 my my PRP$ PRP$ _ 19 NMOD _ _
19 business business NN NN _ 17 PMOD _ _
20 . . . . _ 3 P _ _
21 '' '' '' '' _ 3 P _ _

0 comments on commit f6035b7

Please sign in to comment.