-
Notifications
You must be signed in to change notification settings - Fork 46
/
fix-space-after-paragraph.pl
executable file
·130 lines (128 loc) · 4.31 KB
/
fix-space-after-paragraph.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env perl
# Checks that SpaceAfter=No does not occur at the end of a paragraph (i.e., on the last token before a "# newdoc" or "# newpar" line).
# If it finds such an error, it fixes the error in-place (unlike check-space-after-paragraph.pl, which only reports the error).
# Note that this script does not read STDIN. It requires one or more arguments = paths to CoNLL-U files.
# Copyright © 2020 Dan Zeman <[email protected]>
# License: GNU GPL
use strict;
use warnings;
use utf8;
use open ':utf8';
use File::Basename qw(dirname);
use File::Copy qw(copy move);
use File::Temp qw(tempfile);
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Main program: every argument is a path to a CoNLL-U file. Each file is
# scanned for SpaceAfter=No on the last token before a "# newdoc" / "# newpar"
# line; if any is found, the file is rewritten in place without the attribute
# and the original is kept as "<file>.bak".
if(scalar(@ARGV)==0)
{
    die("One or more paths to files needed. No arguments found");
}
# Iterate with foreach rather than while(shift(@ARGV)): a path that evaluates
# to false (e.g. a file called '0') must not end the loop prematurely.
foreach my $filename (@ARGV)
{
    my $in;
    # Three-argument open: the path is never interpreted as a mode or a pipe.
    if(!open($in, '<:utf8', $filename))
    {
        print STDERR ("Cannot open '$filename' ($!), skipping.\n");
        next;
    }
    my @errors = find_spaceafterno_before_paragraph($in);
    close($in);
    my $n = scalar(@errors);
    if($n > 0)
    {
        print STDERR ("$filename ... $n errors\n");
        remove_spaceafterno($filename, @errors);
    }
    else
    {
        print STDERR ("$filename ... no errors\n");
    }
}



#------------------------------------------------------------------------------
# Reads CoNLL-U lines from an open filehandle and returns the list of 1-based
# line numbers (in ascending order) of tokens that carry SpaceAfter=No in MISC
# and are the last surface token before a "# newdoc" or "# newpar" line.
# Note: SpaceAfter=No on the very last token of the file is not reported,
# because no newdoc/newpar line follows it.
#------------------------------------------------------------------------------
sub find_spaceafterno_before_paragraph
{
    my $fh = shift;
    my @errors = ();
    my $iline = 0;
    # Last word id covered by the current multi-word token, if we are inside one.
    my $ignore_until;
    # Line number of the most recent surface token, if it has SpaceAfter=No.
    my $spaceafternoline;
    while(my $line = <$fh>)
    {
        $iline++;
        # Strip LF as well as CRLF so that a trailing \r does not hide
        # SpaceAfter=No at the end of the MISC column.
        $line =~ s/\r?\n$//;
        if($line =~ m/^\d/)
        {
            my @f = split(/\t/, $line);
            my $id = $f[0];
            # Empty nodes (ids like 5.1) are not surface tokens. They must not
            # overwrite what we remember about the preceding real token.
            next if($id =~ m/^\d+\.\d+$/);
            # Multi-word tokens need a special treatment: the surface token is
            # the range line; the words it covers are ignored.
            if($id =~ m/^\d+-(\d+)$/)
            {
                $ignore_until = $1;
            }
            elsif($id =~ m/^\d+$/ && defined($ignore_until) && $id > $ignore_until)
            {
                $ignore_until = undef;
            }
            if($id =~ m/^\d+-\d+$/ || !defined($ignore_until))
            {
                # Tolerate truncated lines that lack the MISC column.
                my $misc = defined($f[9]) ? $f[9] : '';
                my $has_spaceafterno = grep {$_ eq 'SpaceAfter=No'} (split(/\|/, $misc));
                $spaceafternoline = $has_spaceafterno ? $iline : undef;
            }
        }
        elsif($line =~ m/^\s*$/)
        {
            # Reset $ignore_until at the end of the sentence if we did not reset it earlier.
            $ignore_until = undef;
        }
        elsif($line =~ m/^\#\s*new(doc|par)(\s|$)/)
        {
            # It is possible that there is no space between two sentences.
            # But it is not possible between two paragraphs or documents.
            if(defined($spaceafternoline))
            {
                push(@errors, $spaceafternoline);
                $spaceafternoline = undef;
            }
        }
    }
    return @errors;
}



#------------------------------------------------------------------------------
# Rewrites the file so that SpaceAfter=No is removed from the MISC column on
# the given lines. Takes the path and an ascending list of 1-based line
# numbers. The new contents are first written to a temporary file in the same
# folder; then the original is copied to "<file>.bak" and the temporary file
# is moved over the original. Dies if any step fails, in which case the
# original file has not been replaced.
#------------------------------------------------------------------------------
sub remove_spaceafterno
{
    my $filename = shift;
    my @errors = @_;
    open(my $in, '<:utf8', $filename) or die("Cannot read '$filename': $!");
    # An unpredictable temporary name next to the target avoids clashes and
    # symlink attacks in a shared /tmp. UNLINK removes the file at exit if we
    # die before it has been moved into place.
    my ($out, $tmpfile) = tempfile('fix-space-after-paragraph-XXXXXX', DIR => dirname($filename), UNLINK => 1);
    # The 'open' pragma does not reach handles opened inside File::Temp.
    binmode($out, ':utf8');
    my $next_error = shift(@errors);
    my $iline = 0;
    while(my $line = <$in>)
    {
        $iline++;
        if(defined($next_error) && $iline == $next_error)
        {
            # Keep the line terminator that the line originally had.
            my $eol = ($line =~ s/(\r?\n)$//) ? $1 : "\n";
            my @f = split(/\t/, $line);
            my @misc = grep {$_ ne 'SpaceAfter=No'} (split(/\|/, $f[9]));
            $f[9] = scalar(@misc)==0 ? '_' : join('|', @misc);
            $line = join("\t", @f).$eol;
            $next_error = shift(@errors);
        }
        print {$out} ($line) or die("Cannot write '$tmpfile': $!");
    }
    close($in);
    # Buffered write errors surface at close; do not replace the original with a truncated file.
    close($out) or die("Cannot write '$tmpfile': $!");
    # File::Temp creates the file readable by the owner only. Give it the permissions of the original.
    my @stat = stat($filename);
    if(scalar(@stat) > 0)
    {
        chmod($stat[2] & 07777, $tmpfile);
    }
    # No shell is involved, so paths with spaces or metacharacters are safe.
    copy($filename, "$filename.bak") or die("Cannot back up '$filename' to '$filename.bak': $!");
    move($tmpfile, $filename) or die("Cannot move '$tmpfile' to '$filename': $!");
}