-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwilliams-sig.R
195 lines (149 loc) · 7.28 KB
/
williams-sig.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# Copyright 2014 Yvette Graham
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
# Command-line interface:
#   <lang pair> <human scores filename> <metric scores filename> <write directory>
args <- commandArgs(trailingOnly = TRUE)
library(psych)

LP <- args[1]
HUMAN.FN <- args[2]
METRICS.FN <- args[3]
WRITE.DIR <- args[4]

# Check the argument count first: a missing argument is NA, and comparing NA
# with "" would otherwise make the condition NA and abort with an R error
# before the usage line is ever printed.
if (length(args) < 4 ||
    LP == "" ||
    !file.exists(HUMAN.FN) ||
    !file.exists(METRICS.FN) ||
    WRITE.DIR == "") {
  cat("usage: <lang pair> <human scores filename> <metric scores filename> <write directory>\n")
  quit("no")
}

# recursive = TRUE lets a nested write directory be created in one go;
# an already existing directory is silently accepted.
dir.create(WRITE.DIR, showWarnings = FALSE, recursive = TRUE)

# Output files, both suffixed with the language pair.
CORR.FN <- paste0(WRITE.DIR, "/pearson-corr.", LP)
MATRIX.FN <- paste0(WRITE.DIR, "/williams-results.", LP)
# Load the human scores (five columns: four factors and one numeric) and keep
# only the rows of the requested language pair whose METRIC is "HUMAN".
h <- read.table(
  HUMAN.FN,
  header = TRUE,
  colClasses = c("factor", "factor", "factor", "factor", "numeric")
)
h <- h[which((h$METRIC == "HUMAN") & (h$LP == LP)), ]

# Number of human score rows found for this language pair.
N <- length(h$SCORE)

if (nrow(h) < 2) {
  cat("Error, too few human scores found for systems\n")
  cat("You should provide more MT systems scored by humans, did you specify the correct language pair? (see README file)\n")
  quit("no")
}

# Load the metric scores (same column layout) for the requested language pair.
a <- read.table(
  METRICS.FN,
  header = TRUE,
  colClasses = c("factor", "factor", "factor", "factor", "numeric")
)
a <- a[which(a$LP == LP), ]

# This guard must inspect the metric table `a`, not the human table `h`.
# NOTE(review): the threshold of 4 rows presumably stands for at least two
# systems scored by at least two metrics -- confirm against the README.
if (nrow(a) < 4) {
  cat("Error, too few metric scores found for systems\n")
  cat("You should provide more MT systems scored by at least two metrics\n")
  quit("no")
}
# Human score of every MT system that appears in the metric table, in sorted
# system order; the metric score vectors built later in this script use the
# same ordering, so positions correspond system by system.
# Aborts unless each such system has exactly one human score.
h.scrs <- vapply(
  as.character(sort(unique(unlist(a$SYSTEM)))),
  function(s) {
    sys <- h[which(h$SYSTEM == s), ]
    n.found <- length(sys$SCORE)
    if (n.found != 1) {
      # Covers both a missing score (0) and duplicated scores (> 1).
      cat(paste0(
        "Error with human MT system scores, MT system ", s,
        " must have exactly one human score, found: ", n.found, "\n"
      ))
      quit("no")
    }
    sys$SCORE
  },
  numeric(1),
  USE.NAMES = FALSE
)
# Not read anywhere else in this script; kept so the global state is unchanged.
sig.c <- 0

# Write the absolute Pearson correlation of every metric with the human
# scores to CORR.FN, one "<metric> <correlation>" line per metric.
sink(CORR.FN)
corr.rule <- "# --------------------------------------------------------------------\n"
cat(
  corr.rule,
  "# Pearson Correlation with Human Scores\n",
  paste("# Language Pair:", LP, "\n"),
  corr.rule,
  sep = ""
)

corr.metrics <- as.character(sort(unique(unlist(a$METRIC))))
corr.systems <- as.character(sort(unique(unlist(a$SYSTEM))))

for (m1 in corr.metrics) {
  cat(m1, "")
  # Scores of metric m1 for each system, in sorted system order.
  m1.scrs <- vapply(
    corr.systems,
    function(s) {
      sys <- a[which(a$METRIC == m1 & a$SYSTEM == s), ]
      if (length(sys$SCORE) != 1) {
        # Close the sink first so the error goes to the console, not the file.
        sink()
        cat("Error with metric scores, each MT system must be scored exactly once by each metric.\n")
        cat(paste("Number of scores for MT system", s, " by metric ", m1, ":", length(sys$SCORE), "\n"))
        quit("no")
      }
      sys$SCORE
    },
    numeric(1),
    USE.NAMES = FALSE
  )
  # paste() (not cat() alone) so the number keeps its full printed precision.
  m1.abs.r <- abs(cor(h.scrs, m1.scrs, method = "pearson"))
  cat(paste(m1.abs.r, "\n"))
}
sink()
# Open the Williams results file and write its explanatory comment header.
# The sink is left open: the matrix rows that follow go to the same file.
sink(MATRIX.FN)
matrix.rule <- "# -------------------------------------------------------------------------------------------\n"
cat(
  matrix.rule,
  paste0("# William's Test Results (", LP, ")\n"),
  matrix.rule,
  "# \n",
  "# A one-tailed test was carried out exactly once for each pair of metrics with a \n",
  "# non-zero difference in absolute Pearson correlation with human scores.\n",
  "# The resulting p-value of each test, for a given pair of metrics, is printed in the row \n",
  "# belonging to the metric whose absolute Pearson correlation with human scores is higher \n",
  "# than that of the other metric in the pair. \n",
  "# '-' is printed in the opposite cell in the matrix (for that pair) or in both cells for \n",
  "# pairs of metrics with no difference in absolute Pearson correlation with human scores. \n",
  "# \n",
  "# Results read as follows: \n",
  "# \n",
  "# If the p-value in a given cell is lower than a specified threshold, for example 0.05, \n",
  "# the absolute Pearson correlation with human scores of the metric named in that ROW \n",
  "# is considered SIGNIFICANTLY higher than \n",
  "# the absolute Pearson correlation with human scores of the metric named in that COLUMN. \n",
  "# \n",
  matrix.rule,
  sep = ""
)
# Print the matrix of Williams test results: a header row of metric names,
# then one row per metric holding one tab-separated cell per metric.
wt.metrics <- as.character(sort(unique(unlist(a$METRIC))))
wt.systems <- as.character(sort(unique(unlist(a$SYSTEM))))

# Scores given by one metric to every system, in the same sorted system order
# used to build h.scrs. Aborts the script unless each system has exactly one
# score from that metric.
wt.metric.scores <- function(metric) {
  vapply(
    wt.systems,
    function(s) {
      sys <- a[which(a$METRIC == metric & a$SYSTEM == s), ]
      if (length(sys$SCORE) != 1) {
        # Close the sink first so the error goes to the console, not the file.
        sink()
        cat("Error with metric scores, each MT system must be scored exactly once by each metric.\n")
        cat(paste("Number of scores for MT system", s, " by metric ", metric, ":", length(sys$SCORE), "\n"))
        quit("no")
      }
      sys$SCORE
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

# Compute each metric's score vector and its absolute Pearson correlation with
# the human scores once, instead of once per matrix cell.
wt.scores <- lapply(wt.metrics, wt.metric.scores)
wt.human.r <- vapply(
  wt.scores,
  function(scrs) abs(cor(h.scrs, scrs, method = "pearson")),
  numeric(1)
)

# Header row.
for (m in wt.metrics) {
  cat("\t", m, sep = "")
}
cat("\n")

for (i in seq_along(wt.metrics)) {
  cat(wt.metrics[i])
  for (j in seq_along(wt.metrics)) {
    cat("\t")
    # One-tailed test, run only in the cell whose ROW metric has the strictly
    # higher absolute correlation with human scores. isTRUE() turns an NA
    # comparison (e.g. a metric with constant scores) into "-" rather than
    # aborting the script while the sink is still open.
    if (i != j && isTRUE(wt.human.r[i] > wt.human.r[j])) {
      p <- r.test(
        # Sample size is the number of systems actually correlated, i.e. the
        # length of the score vectors, not the count of all human score rows.
        n = length(h.scrs),
        r12 = wt.human.r[i],
        r13 = wt.human.r[j],
        r23 = abs(cor(wt.scores[[i]], wt.scores[[j]], method = "pearson")),
        twotailed = FALSE
      )$p
      # paste() keeps the full printed precision of the p-value.
      cat(paste(p))
    } else {
      cat("-")
    }
  }
  cat("\n")
}
sink()
# Tell the user where each result file was written.
cat("Pearson Correlations of Metrics with Human Scores written to: ", CORR.FN, "\n", sep = " ")
cat("Significance Test Results written to: ", MATRIX.FN, "\n", sep = " ")