'''
Kristen McGarry
Data Analyst Intern
Summer 2017
'''
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import codecs
import string

# Strip non-ASCII characters and blank lines from a text file
def onlyAscii(inFile, outFile):
    ip = open(inFile, 'r', encoding='utf-8', errors='ignore')
    op = open(outFile, 'w')
    for line in ip:
        # drop anything that cannot be encoded as ASCII
        line = line.strip().encode('ascii', 'ignore').decode('ascii')
        if line == "":
            continue
        op.write(line + "\n")
    ip.close()
    op.close()

# Read an entire file into one string
def getText(inFile):
    i_file = open(inFile, 'r')
    text = i_file.read()
    i_file.close()
    return text

# Lowercase and tokenize text, dropping English stop words and punctuation
def removeStopWords(text):
    stop = stopwords.words('english') + list(string.punctuation)
    clean_text = []
    tokens = word_tokenize(text.lower())
    for word in tokens:
        if word not in stop:
            clean_text.append(word)
    return clean_text
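# Illustrative example: removeStopWords("The quick, brown fox!")
# is expected to return ['quick', 'brown', 'fox'].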

# Build a word -> count dictionary from a list of tokens
def wordFrequency(clean_text):
    freq_dict = {}
    for word in clean_text:
        if word in freq_dict:
            freq_dict[word] = freq_dict[word] + 1
        else:
            freq_dict[word] = 1
    return freq_dict
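# Equivalent one-liner using the standard library (a sketch, not the approach above):
# from collections import Counter
# freq_dict = dict(Counter(clean_text))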

# Sentence Tokenization
def tokenizer(text):
    sent_tokens = sent_tokenize(text)
    return sent_tokens

# Part of speech tagger
def posTagger(sent_tokens):
    pos_text = []
    for sent in sent_tokens:  # tag every sentence, including the last
        word_tokens = word_tokenize(sent)
        pos_temp = nltk.pos_tag(word_tokens)
        pos_text.append(pos_temp)
    return pos_text
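# Illustrative output (exact tags depend on the tagger model):
# posTagger(["Apple filed a proxy statement."])
# -> [[('Apple', 'NNP'), ('filed', 'VBD'), ('a', 'DT'), ('proxy', 'NN'), ('statement', 'NN'), ('.', '.')]]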

# Count frequency of all part of speech tags
def countPos(pos_text):
    return "incomplete."

# ****** COMMANDS ******
#onlyAscii("C:/Users/mcgark9/Documents/main-project/data/sec-edgar-txt-files/APPLE DEF 14A.txt","C:/Users/mcgark9/Documents/main-project/data/sec-edgar-txt-files/APPLE DEF 14A ASCII.txt")
text = getText("C:/Users/mcgark9/Documents/main-project/data/sec-edgar-txt-files/APPLE DEF 14A ASCII.txt")
#clean_text = removeStopWords(text)
#out_file = open("C:/Users/mcgark9/Documents/main-project/data/sec-edgar-txt-files/APPLE DEF 14A ASCII WORD FREQ.txt","w")
#out_file.write(str(wordFrequency(clean_text)))
#print("written to out file")
sent_tokens = tokenizer(text)
#print(sent_tokens[100])
#print(len(sent_tokens))
out_file = open("C:/Users/mcgark9/Documents/main-project/data/sec-edgar-txt-files/APPLE DEF 14A ASCII POS.txt", "w")
out_file.write(str(posTagger(sent_tokens)))
out_file.close()
print("written to file")
#print(posTagger(tokenizer(text)))

# IF NEED TO TAKE IN FROM HTML FILE: INCOMPLETE
def cleanHtml(htmlFile, outFile):
    i_file = codecs.open(htmlFile, 'r')
    text = i_file.read()
    i_file.close()
    # remove all tables: cut each span from "<TABLE " through its closing "</TABLE>"
    table_occurences = text.count("<TABLE ")
    print(table_occurences)
    for i in range(table_occurences):
        begin = text.find("<TABLE ")
        end = text.find("</TABLE>")
        if begin == -1 or end == -1:
            break  # no matching pair left; avoid slicing with -1
        text = text[:begin] + text[end + len("</TABLE>"):]
    o_file = codecs.open(outFile, 'w')
    o_file.write(text)
    o_file.close()
#cleanHtml("C:\Users\mcgark9\Desktop\practice.html")
#cleanHtml("C:/Users/mcgark9/Desktop/apple.html","C:/Users/mcgark9/Desktop/apple_v1.html")