-
Notifications
You must be signed in to change notification settings - Fork 46
/
conllu_align_tokens.pl
executable file
·176 lines (170 loc) · 6.3 KB
/
conllu_align_tokens.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env perl
# Compares tokenization and word segmentation of two CoNLL-U files. Assumes
# that no normalization was performed, that is, the sequence of non-whitespace
# characters is identical on both sides.
# Copyright © 2018 Dan Zeman <[email protected]>
# License: GNU GPL
# Usage: Merge gold file with lemmas predicted by UDPipe in the shared task:
# tools/conllu_align_tokens.pl UD_Turkish-PUD/tr_pud-ud-test.conllu media/conll17-ud-test-2017-05-09/UFAL-UDPipe-1-2/2017-05-15-02-00-38/output/tr_pud.conllu
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# One file is considered gold standard, the other is a system's output.
if(scalar(@ARGV) != 2)
{
die("Usage: $0 goldfile systemfile");
}
my $goldpath = $ARGV[0];
my $syspath = $ARGV[1];
open(GOLD, $goldpath) or die("Cannot read $goldpath: $!");
open(SYS, $syspath) or die("Cannot read $syspath: $!");
my $gli = 0; # gold line number
my $sli = 0; # system line number
my $gboff = 0;
my $gbuffer = '';
my $sbuffer = '';
while(my $goldline = <GOLD>)
{
$gli++;
my @goldlines = ($goldline);
my $new_gold_token_read = 0;
# Sentence-level comments start with '#'. Pass through gold comments, ignore system comments.
# Empty nodes of the enhanced representation start with decimal numbers. Pass through gold, ignore system.
# Empty line terminates every sentence. Pass through gold, ignore system.
# Multi-word token.
if($goldline =~ m/^(\d+)-(\d+)\t/)
{
my $from = $1;
my $to = $2;
my @gf = split(/\t/, $goldline);
my $gform = $gf[1];
# Word forms may contain spaces but we are interested in non-whitespace characters only.
$gform =~ s/\s//g;
$gbuffer .= $gform;
# Read the syntactic words that belong to this multi-word token.
for(my $i = $from; $i <= $to; $i++)
{
$goldline = <GOLD>;
$gli++;
push(@goldlines, $goldline);
}
$new_gold_token_read = 1;
}
# Single-word token.
elsif($goldline =~ m/^\d+\t/)
{
my @gf = split(/\t/, $goldline);
my $gform = $gf[1];
# Word forms may contain spaces but we are interested in non-whitespace characters only.
$gform =~ s/\s//g;
$gbuffer .= $gform;
$new_gold_token_read = 1;
}
if($new_gold_token_read)
{
my @syslines = ();
while(length($gbuffer) > length($sbuffer))
{
my $nr = read_token_to_buffer(*SYS, \$sli, \$sbuffer, \@syslines);
if($nr == 0)
{
die("The system output ended prematurely. Gold line no. $gli, offset $gboff, buffer '$gbuffer'. System line no. $sli, buffer '$sbuffer'.");
}
}
# If the system buffer equals to the gold buffer, we are synchronized and may go on.
if($sbuffer eq $gbuffer)
{
###!!! The fact that the two buffers match does not entail that the last words match.
###!!! Nevertheless, we will now assume that it is the case, and copy the system lemma to the gold line.
if(scalar(@syslines) == scalar(@goldlines))
{
for(my $i = 0; $i <= $#syslines; $i++)
{
my @gf = split(/\t/, $goldlines[$i]);
my @sf = split(/\t/, $syslines[$i]);
# LEMMA is the column [2].
if(defined($sf[2]) && $sf[2] ne '')
{
$gf[2] = $sf[2];
$goldlines[$i] = join("\t", @gf);
}
}
}
$gboff += length($gbuffer);
$gbuffer = '';
$sbuffer = '';
}
# If the gold buffer is a prefix of the system buffer, eat the prefix and go to the next gold token.
elsif(substr($sbuffer, 0, length($gbuffer)) eq $gbuffer)
{
my $gbl = length($gbuffer);
$gboff += $gbl;
$gbuffer = '';
$sbuffer = substr($sbuffer, $gbl);
}
# Otherwise there must be a mismatch in the non-whitespace characters.
else
{
die("Non-whitespace character mismatch. Gold line no. $gli, offset $gboff, buffer '$gbuffer'. System line no. $sli, buffer '$sbuffer'.");
}
}
foreach my $gl (@goldlines)
{
print($gl);
}
}
close(GOLD);
close(SYS);
#------------------------------------------------------------------------------
# Reads next token from a CoNLL-U file. Adds it to a buffer. Returns the number
# of non-whitespace characters read. (Returns 0 if there are no more tokens in
# the file. The same would happen if there were an empty string instead of the
# word form, i.e., not even the underscore character, but such file would not
# be valid CoNLL-U.)
#------------------------------------------------------------------------------
sub read_token_to_buffer
{
my $fh = shift; # the handle of the open file
my $li = shift; # reference to the current line number
my $buffer = shift; # reference to the buffer
my $tokenlines = shift; # reference to array where token and word lines should be stored (other lines are thrown away)
# Read the next system token.
my $form;
while(my $line = <$fh>)
{
${$li}++;
# Multi-word token.
if($line =~ m/^(\d+)-(\d+)\t/)
{
my $from = $1;
my $to = $2;
push(@{$tokenlines}, $line);
my @f = split(/\t/, $line);
$form = $f[1];
# Word forms may contain spaces but we are interested in non-whitespace characters only.
$form =~ s/\s//g;
# Read the syntactic words that belong to this multi-word token.
for(my $i = $from; $i <= $to; $i++)
{
$line = <$fh>;
${$li}++;
push(@{$tokenlines}, $line);
}
last;
}
# Single-word token.
elsif($line =~ m/^\d+\t/)
{
push(@{$tokenlines}, $line);
my @f = split(/\t/, $line);
$form = $f[1];
# Word forms may contain spaces but we are interested in non-whitespace characters only.
$form =~ s/\s//g;
last;
}
}
${$buffer} .= $form;
return length($form);
}