inspect_freqs.py
# compare the two prepositional phrases in each utterance: compute each phrase's
# unigram probability (the product of the per-word frequencies appended to the
# CoNLL-formatted input) and its total morpheme count, then report which phrase
# is more probable and which contains more morphemes
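#
# usage (the file name is only an illustration):
#     python inspect_freqs.py utterances.conll
#
# columns of each tab-separated line as the script indexes them below:
# column 1 = word form, column 3 = part of speech, column 6 = head index,
# column 7 = dependency relation, column 8 = appended word frequency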
import re
import sys
from collections import defaultdict
infile = sys.argv[1] # file containing CoNLL-formatted text with word frequencies appended
# dictionary to contain lines of infile
lines = defaultdict( list )
phrase_probs = [] # holds the unigram probability of each pp in the utterance
morph_counts = [] # holds the total morpheme count of each pp in the utterance
phrase_prob = 1 # unigram probability of prepositional phrase
pp_count = 0 # counter to keep track of prepositional phrases contained in utterance
morph_count = 0 # counter to keep track of number of morphemes in prepositional phrase
i = 0
# dictionary to contain word / word-freq pairs for prepositional phrases in utterance
phrases = defaultdict( dict )
# regular expression used to check whether a morpheme segment begins with lowercase letters
regex = re.compile( '[a-z]+' )
"""
TODO: create function to get morphemes
"""
with open( infile, 'r' ) as file1, open( 'morph_inspect.txt', 'a+' ) as file2:
    # read every line of infile into the lines dictionary
    for line in file1:
        lines[i] = line.split( '\t' ) # divide line into columns
        i = i + 1
    # iterate over each line of the file
    for j in range( 0, len( lines ) ):
        match = re.match( '-----', lines[j][0] )     # '-----' marks the end of an utterance
        sentence = re.match( '# text', lines[j][0] ) # '# text' lines hold the utterance itself
        # write the utterance text to the output file
        if( sentence ):
            file2.write( lines[j][0] + '\n' )
        # not yet at the end of the utterance
        if( not match ):
            if( len( lines[j] ) > 1 ):
                # if the current word is a preposition
                if( lines[j][3] == 'prep' ):
                    pp_count = pp_count + 1
                    phrase_prob = 1
                    # if immediately preceded by an adverb with the same head as the preposition
                    if( len( lines[j - 1] ) > 1 and ( re.match( 'adv', lines[j - 1][3] ) and lines[j - 1][6] == lines[j][6] ) ):
                        phrases[pp_count][lines[j - 1][1]] = lines[j - 1][8]
                        phrase_prob = phrase_prob * float( lines[j - 1][8] )
                        file2.write( lines[j - 1][1] + ':\t' + phrases[pp_count][lines[j - 1][1]] + '\n' )
                        # inspect the morphemes of the adverb
                        temp = filter( None, re.split( r'[&-]', lines[j - 1][1] ) ) # split the word on morpheme delimiters
                        morph_count = morph_count + 1 # the word itself counts as one morpheme
                        # inspect each segment for lowercase letters
                        for w in temp:
                            print(w)
                            # a segment beginning with lowercase letters is not a separate morpheme
                            if( regex.match( w ) ):
                                print( 'current morph count is: ' + str( morph_count ) )
                            # a segment with no lowercase letters is an additional morpheme
                            else:
                                morph_count = morph_count + 1
                                print( 'updated morph count is: ' + str( morph_count ) )
                        """
                        a morpheme boundary is marked by '&' or '-':
                        a segment beginning with capital letters and/or numbers counts as a morpheme;
                        a segment beginning with lowercase letters does not
                        """
                    # add the word frequency of each word in the pp
                    for k in range( j, len( lines ) ):
                        if( lines[k][7] != 'POBJ' ):
                            phrases[pp_count][lines[k][1]] = lines[k][8]
                            phrase_prob = phrase_prob * float( lines[k][8] )
                            file2.write( lines[k][1] + ':\t' + phrases[pp_count][lines[k][1]] + '\n' )
                            # inspect morphemes
                            temp = filter( None, re.split( r'[&-]', lines[k][1] ) ) # split the word on morpheme delimiters
                            morph_count = morph_count + 1 # the word itself counts as one morpheme
                            # inspect each segment for lowercase letters
                            for w in temp:
                                print(w)
                                # a segment beginning with lowercase letters is not a separate morpheme
                                if( regex.match( w ) ):
                                    print( 'current morph count is: ' + str( morph_count ) )
                                # a segment with no lowercase letters is an additional morpheme
                                else:
                                    morph_count = morph_count + 1
                                    print( 'updated morph count is: ' + str( morph_count ) )
                        # end of the prepositional phrase
                        if( lines[k][7] == 'POBJ' ):
                            phrases[pp_count][lines[k][1]] = lines[k][8]
                            phrase_prob = phrase_prob * float( lines[k][8] )
                            phrase_probs.append( phrase_prob )
                            file2.write( lines[k][1] + ':\t' + phrases[pp_count][lines[k][1]] + '\n\n' )
                            file2.write( 'phrase unigram probability:\t' + str( phrase_prob ) + '\n\n' )
                            # inspect morphemes
                            temp = filter( None, re.split( r'[&-]', lines[k][1] ) ) # split the word on morpheme delimiters
                            morph_count = morph_count + 1 # the word itself counts as one morpheme
                            # inspect each segment for lowercase letters
                            for w in temp:
                                print(w)
                                # a segment beginning with lowercase letters is not a separate morpheme
                                if( regex.match( w ) ):
                                    print( 'current morph count is: ' + str( morph_count ) )
                                # a segment with no lowercase letters is an additional morpheme
                                else:
                                    morph_count = morph_count + 1
                                    print( 'updated morph count is: ' + str( morph_count ) )
                            morph_counts.append( morph_count )
                            print( morph_counts )
                            morph_count = 0 # reset for the next prepositional phrase
                            break # end of the pp
        # '-----' marks the end of the utterance: compare the two prepositional phrases
        else:
            if( phrase_probs[0] > phrase_probs[1] ):
                file2.write( 'pp1 is more probable than pp2 by:\t' + str( phrase_probs[0] - phrase_probs[1] ) + '\n\n' )
            elif( phrase_probs[0] < phrase_probs[1] ):
                file2.write( 'pp2 is more probable than pp1 by:\t' + str( phrase_probs[1] - phrase_probs[0] ) + '\n\n' )
            else:
                file2.write( 'pp1 and pp2 are equally probable \n\n' )
            file2.write( 'pp1 morpheme count: ' + str( morph_counts[0] ) + '\n' )
            file2.write( 'pp2 morpheme count: ' + str( morph_counts[1] ) + '\n' )
            if( morph_counts[0] > morph_counts[1] ):
                file2.write( 'pp1 contains ' + str( morph_counts[0] - morph_counts[1] ) + ' more morphemes than pp2 \n\n' )
            elif( morph_counts[0] < morph_counts[1] ):
                file2.write( 'pp2 contains ' + str( morph_counts[1] - morph_counts[0] ) + ' more morphemes than pp1 \n\n' )
            else:
                file2.write( 'pp1 and pp2 have the same number of morphemes \n\n' )
            file2.write( '-----\n\n' )
            pp_count = 0    # reset for the next utterance
            morph_count = 0 # reset for the next utterance
            phrase_probs = []
            morph_counts = []
            continue