-
Notifications
You must be signed in to change notification settings - Fork 0
/
lookup_gen.py
115 lines (97 loc) · 3.89 KB
/
lookup_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 1 16:45:38 2016
"""
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import project_config
def lookup_gen( corpus_per_q ):
    """Build a lookup table of named entities, keyed by question word.

    Runs the Stanford NER tagger over each document in ``corpus_per_q``
    and files every recognized entity under the question word its NER
    class can answer ("who", "where", "when").

    Parameters
    ----------
    corpus_per_q : list
        A list of documents; each document is a list of sentence strings.
        The first document is assumed to be an empty placeholder and is
        skipped (doc ids therefore start at 1).

    Returns
    -------
    dict
        ``{"who": [...], "where": [...], "when": [...]}`` where each list
        holds ``(doc_id, sent_id, entity_text)`` tuples. Consecutive
        tokens sharing the same NER class are merged into one entity
        string.
    """
    max_rank_check = project_config.max_rank_check
    stanford_path = "./stanford-ner-2015-12-09/"
    stanford_model_path = stanford_path + '/classifiers/english.muc.7class.distsim.crf.ser.gz'
    stanford_jar_path = stanford_path + 'stanford-ner.jar'
    st = StanfordNERTagger( stanford_model_path, stanford_jar_path)
    # Map each MUC-7 NER class to the question word it answers;
    # "none" marks classes that are not useful for any question type.
    ner_config = { "location": "where",
                   "person" : "who",
                   "organization": "where",
                   "date": "when",
                   "time": "when",
                   "money": "none",
                   "percent": "none",
                   'o': "none",
                 }
    lookup_list = { "who": [] , "where": [], "when":[] }
    doc_id = 0
    for doc in corpus_per_q :
        if doc_id == 0:  # skip the empty placeholder doc at the beginning
            doc_id += 1
            continue
        sent_id = 0
        sent_len_list = []
        doc_content = []
        # Build cumulative token counts per sentence and concatenate all
        # tokens so the whole doc can be tagged in a single NER call.
        for sentence in doc:
            tokenized_sent = word_tokenize( sentence )
            # was: try/except IndexError on sent_len_list[-1]; an explicit
            # emptiness check is clearer and cannot mask other errors.
            previous_len = sent_len_list[-1] if sent_len_list else 0
            sent_len_list += [ previous_len + len(tokenized_sent) ]
            doc_content += tokenized_sent
        # NER the whole doc at once (one JVM round-trip instead of many).
        doc_arr = st.tag(doc_content)
        loop_var = 0     # iterator over sent_len_list
        start_index = 0  # token index where the current sentence starts
        for sentence in doc:
            last_index = sent_len_list[ loop_var ]
            classified_unicode = doc_arr[ start_index:last_index ]
            # Convert unicode (word, tag) pairs to plain strings, merging
            # consecutive tokens that carry the same tag into one entity.
            classified_sent = []
            len_classified_unicode = len( classified_unicode )
            uni_iter = 0
            prev_tag = ""
            while( uni_iter < len_classified_unicode ):
                cu = classified_unicode[uni_iter]
                try:
                    current_word = str(cu[0])
                    current_tag = str(cu[1]).lower()
                    if( prev_tag == current_tag):
                        classified_sent[-1] = (classified_sent[-1][0] + " " + current_word, current_tag)
                    else:
                        classified_sent += [ (current_word, current_tag) ]
                    prev_tag = current_tag
                except UnicodeEncodeError:
                    # was a bare except that swallowed every error; only the
                    # Python-2 unicode->str conversion is expected to fail here.
                    print( cu[0], cu[1], "conversion to raw string failed")
                uni_iter += 1
            for tup in classified_sent:
                # .get guards against NER classes missing from ner_config
                # (treated the same as "none": ignored).
                target = ner_config.get(tup[1], "none")
                if target != "none":
                    lookup_list[ target ] += [(doc_id, sent_id, tup[0])]
            sent_id += 1
            loop_var += 1
            start_index = last_index
        doc_id += 1
        if doc_id > max_rank_check:
            break
    return( lookup_list )
# NOTE(review): the block below is a sample/test harness deliberately
# disabled by wrapping it in a module-level string literal: the string
# opens with the ''' on the next line and closes with the final '''.
# The trailing '' on the opening line is part of the string's text, not
# a closing delimiter. To enable the sample run, remove both ''' fences.
''' sample run for the function in this file ''
def lookup_gen_test():
    corpus_per_q = [
            [
            "Gandhi is father of India.",
            "Newton made laws of motion",
            ],
            [
            "Apple makes Iphone",
            "Hololens was made by Microsoft",
            "Facebook has caputerd VR space"
            ],
            [
            "New Macbooks are not upto the mark",
            "Surface desktops are the new thing to look at"
            ]
        ]
    print( lookup_gen( corpus_per_q ) )
##sample call below
lookup_gen_test()
'''