-
Notifications
You must be signed in to change notification settings - Fork 1
/
swapwords
executable file
·111 lines (99 loc) · 4.24 KB
/
swapwords
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/local/bin/perl5 -w
# Author: Jason Eisner, University of Pennsylvania
# Usage: swapwords n trainfile testfile
#
# trainfile and testfile are in the format produced by rules2frames.
# (Actually, testfile should be piped through "nobadnonterm trainfile" first.)
# Thus, it is a bunch of (word, frame) pairs. We output a list of up to n DISTINCT
# forced-choice paradigms, each consisting of four (word, frame) pairs as
# follows:
# (w1, f1) - in testfile
# (w2, f2) - in testfile
# (w1, f2) - not in testfile
# (w2, f1) - not in testfile
# where f1 and f2 do not appear at all in trainfile. (We also require at least
# one of w1, w2 to appear in trainfile, so we have some basis for making the decision -
# this avoids unnecessary ties.) In addition, we require f1 != f2 and w1 != w2.
# Indeed we require w1 < w2 so we don't get symmetric duplicates.
# We strip ~ from the LHS of all frames, to prevent duplicate frames from sneaking in.
#
# The n distinct paradigms are found by sampling WITHOUT REPLACEMENT from the SET of
# possible paradigms meeting these conditions. Note that the sampling is by type, with
# no preference given to frequent wordframes.
#
# If n is 0, we just use all the possible paradigms.
#
# The idea is that we'll score these four pairs with conditional probabilities
# p(frame | word) of p1 ... p4, and correctly guess which frame goes with which
# word if p1*p2 > p3*p4.
#
# This format is convenient, but will contain a lot of repeated
# wordframes. The first two wordframes in each quadruple will especially
# get repeated a lot. If scoring is slow, then it would be wise to score
# a uniqified version of the output and then merge the scores back into
# this file.
require("stamp.inc"); &stamp; # modify $0 and @INC, and print timestamp
&citestamps(@ARGV[1..2]);
die "$0: bad command line args\n" unless @ARGV==3;
($n,$trainfile,$testfile) = @ARGV;
$trainwf = 0;
open(TRAIN, $trainfile) || die "$0: can't open $trainfile";
while (<TRAIN>) {
chop;
s/^(\S+:[0-9]+:\t)?//; # kill loc
next if /^\#/; # skip comment
@a=split("\t"); $a[1] =~ s/~//g; $_=join("\t",@a); # kill ~ on LHS
$trainwf++;
/\t/ || die;
$trainword{$`} = 1;
$trainframe{$'} = 1;
}
close(TRAIN);
$testwf = 0;
open(TEST, $testfile) || die "$0: can't open $testfile";
while (<TEST>) {
chop;
s/^(\S+:[0-9]+:\t)?//; # kill loc
next if /^\#/; # skip comment
@a=split("\t"); $a[1] =~ s/~//g; $_=join("\t",@a); # kill ~ on LHS
$testwf++;
/\t/ || die;
next if $trainframe{$'}; # novel frames only
$testwf{$_} = 1; # remember we've seen this wordframe in testing but didn't see the frame in training
$novelwords++ unless $trainword{$`};
}
close(TEST);
# Create a list of ALL possible quadruples from test data.
foreach (keys %testwf) {
print STDERR ".";
/\t/ || die; $w1 = $`; $f1 = $';
foreach (keys %testwf) {
/\t/ || die; $w2 = $`; $f2 = $';
next unless $w1 lt $w2;
$ok=1;
$ok=0, print STDERR "w" unless $trainword{$w1} || $trainword{$w2};
$ok=0, print STDERR "f" if $f1 eq $f2;
$ok=0, print STDERR "t" if $testwf{"$w1\t$f2"} || $testwf{"$w2\t$f1"};
if (!$ok) { print STDERR ("/"); next; }
$quads{"# QUAD\n$w1\t$f1\n$w2\t$f2\n$w1\t$f2\n$w2\t$f1\n"} = 1; # use hash table to avoid duplicates
}
}
# Print up to $n randomly selected quadruples, by sorting their
# indices into a random order. (We could have used random hash values
# in %quads and used that to sort the hash keys into a random order,
# but that's slow because it means hashing a long string on every
# comparison.)
@quads = keys %quads;
if ($n==0) {
printf STDERR "(using all quads)\n";
@printindices = 0..$#quads;
} else {
printf STDERR "(shuffling)\n";
srand(123); # replicable
@rands = 0 x scalar @quads;
foreach $i (0..$#quads) { $rands[$i] = rand; }
@shuffledindices = sort { $rands[$a] <=> $rands[$b] } 0..$#quads;
@printindices = @shuffledindices[0..$n-1];
}
print @quads[@printindices];
print STDERR "$0: $trainwf training and $testwf test wordframe tokens, ", scalar keys %testwf, " test tokens with novel frames, ", $novelwords, " of these had novel words and couldn't be paired with one another, ", scalar @quads, " possible quadruples, ", scalar @printindices, " selected\n";