-
Notifications
You must be signed in to change notification settings - Fork 1
/
prediction_format_checker.pl
110 lines (96 loc) · 3.4 KB
/
prediction_format_checker.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/perl -w
#
#
# Author: Preslav Nakov
# National University of Singapore
#
# WHAT: This is an official output file format checker for SemEval-2010 Task #8.
#
# Use:
# semeval2010_task8_format_checker.pl <PROPOSED_ANSWERS>
#
# Examples:
# semeval2010_task8_format_checker.pl proposed_answer1.txt
# semeval2010_task8_format_checker.pl proposed_answer2.txt
# semeval2010_task8_format_checker.pl proposed_answer3.txt
# semeval2010_task8_format_checker.pl proposed_answer4.txt
#
# In the examples above, the first three files are OK, while the last one contains four errors.
# And answer_key2.txt contains the true labels for the *training* dataset.
#
# Description:
# The checker takes as input a proposed classification file,
# which should contain one prediction per line in the format "<SENT_ID> <RELATION>"
# with a TAB as a separator, e.g.,
# 1 Component-Whole(e2,e1)
# 2 Other
# 3 Instrument-Agency(e2,e1)
# ...
# The file does not have to be sorted in any way.
# Repetitions of IDs are not allowed.
#
# In case of problems, the checker outputs the problematic line and its number.
# Finally, the total number of problems found is reported
# or a message is output saying that the file format is OK.
#
# Participants are expected to check their output using this checker before submission.
#
# Last modified: March 10, 2010
#
#
use strict;
###############
### I/O ###
###############
# Exactly one command-line argument is expected: the proposed-answers file.
if (@ARGV != 1) {
    die "Usage:\nsemeval2010_task8_format_checker.pl <PROPOSED_ANSWERS>\n";
}
my $INPUT_FILE_NAME = $ARGV[0];

################
###   MAIN   ###
################

my %ids = ();      # sentence IDs seen so far (used to detect repetitions)
my $errCnt = 0;    # number of problematic lines reported

# Three-argument open with an explicit read mode: the file name comes straight
# from the command line, so it must never be interpreted as a mode or a pipe
# (e.g. a name starting with '>' or ending with '|').
open(my $input, '<', $INPUT_FILE_NAME)
    or die "Failed to open $INPUT_FILE_NAME for text reading.\n";

my $lineNo = 0;
while (my $line = <$input>) {
    $lineNo++;

    # Called with '&' so the call does not depend on how (or whether) the sub,
    # which is defined later in the file, declares a prototype.
    # Only the ID is needed here; a negative ID signals a malformed line.
    my ($id) = &getIDandLabel($line);

    # Copy of the line without its trailing line terminator, for messages.
    (my $shown = $line) =~ s/[\n\r]*$//;

    if ($id < 0) {
        print "Bad file format on line $lineNo: '$shown'\n";
        $errCnt++;
        next;    # do not record the error sentinel as if it were an ID
    }

    if (exists $ids{$id}) {
        print "Bad file format on line $lineNo (ID $id is already defined): '$shown'\n";
        $errCnt++;
    }
    $ids{$id}++;
}
close($input) or die "Failed to close $INPUT_FILE_NAME.\n";

# Final verdict: either the file is fine, or report how many lines were bad.
if (0 == $errCnt) {
    print "\n<<< The file format is OK.\n";
}
else {
    print "\n<<< The format is INCORRECT: $errCnt problematic line(s) found!\n";
}
################
### SUBS ###
################
# Parses one line of a proposed-answers file and validates its relation label.
#
# Argument:
#   the raw line including its line terminator; it must have the form
#   "<digits><TAB><label>" followed by "\n" or "\r\n".
#
# Returns (in list context):
#   ($id, $label)    when the label is one of the nine relations with a
#                    direction, e.g. 'Cause-Effect(e1,e2)';
#   ($id, '_Other')  when the label is exactly 'Other';
#   (-1)             when the line is malformed or the label is not recognised.
#
# Note: a line without a terminating newline does not match the line pattern
# and is therefore reported as malformed.
#
# No prototype is declared, because the sub takes one argument; an empty "()"
# prototype would turn a plain getIDandLabel($line) call placed after the
# definition into a compile-time error.
sub getIDandLabel {
    my ($line) = @_;

    return (-1) unless defined $line;
    return (-1) unless $line =~ /^([0-9]+)\t([^\r]+)\r?\n$/;
    my ($id, $label) = ($1, $2);

    # 'Other' has no direction and is reported under an internal name.
    return ($id, '_Other') if $label eq 'Other';

    # Every remaining valid label is <Relation>(e1,e2) or <Relation>(e2,e1).
    # \A and \z make this an exact, whole-string comparison.
    return ($id, $label)
        if $label =~ m{
            \A
            (?: Cause-Effect
              | Component-Whole
              | Content-Container
              | Entity-Destination
              | Entity-Origin
              | Instrument-Agency
              | Member-Collection
              | Message-Topic
              | Product-Producer
            )
            \( (?: e1,e2 | e2,e1 ) \)
            \z
        }x;

    return (-1);
}