-
Notifications
You must be signed in to change notification settings - Fork 1
/
books.py
127 lines (83 loc) · 3.09 KB
/
books.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import re
import nltk
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import numpy as np
from sklearn.neighbors import DistanceMetric
def de_gutenberger(filename):
    """Read a Project Gutenberg text file and return its metadata and body.

    The file is decoded as ISO-8859-1. Author and title are looked up in the
    full text (header included) via ``get_author_and_title``. The body is the
    text between the "START OF ... PROJECT GUTENBERG EBOOK" line and the
    "END OF ... PROJECT GUTENBERG EBOOK" line; when a marker is missing the
    corresponding end of the file is used instead.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(author, title, word_string)`` where ``word_string`` is the
        tokenized body after ``stem_and_stop`` has been applied.
    """
    # Plain read mode: the file is never written, so 'r+' only added a
    # needless requirement for write permission.
    with open(filename, 'r', encoding='ISO-8859-1') as f:
        txt = f.read()
    author, title = get_author_and_title(txt)

    # Strip the header and footer. Both "THIS" and "THE" spellings of the
    # marker lines are accepted.
    start_pattern = r'START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK[^\n]*\n'
    end_pattern = r'END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK[^\n]*\n'

    start = re.search(start_pattern, txt)
    start_ind = start.end() if start is not None else 0
    # Look for the footer only after the header so the slice is never inverted.
    end = re.compile(end_pattern).search(txt, start_ind)
    end_ind = end.start() if end is not None else len(txt)

    word_string = stem_and_stop(nltk.word_tokenize(txt[start_ind:end_ind]))
    return author, title, word_string
def get_author_and_title(txt):
    """Extract the author and title from "Author: " / "Title: " header lines.

    For each label the first occurrence in ``txt`` is used, and the value is
    the rest of that line with trailing whitespace (including a stray
    carriage return) removed. A header on the very last line is recognised
    even when the text does not end with a newline.

    Args:
        txt: Full text of the book, header included.

    Returns:
        A tuple ``(author, title)``; each element is ``'Unknown'`` when the
        corresponding label does not appear in ``txt``.
    """
    def _header_value(label):
        # [^\n]* (rather than .*?\n) also matches a final line that has no
        # terminating newline.
        found = re.search(r'(?<=' + label + r': )[^\n]*', txt)
        if found is None:
            return 'Unknown'
        return found.group().rstrip()

    return _header_value('Author'), _header_value('Title')
def stem_and_stop(words):
    """Drop stop words and punctuation tokens, then stem what remains.

    Tokens are lowercased before being compared with the stop list, so
    capitalised stop words such as "The" are filtered as well (the NLTK
    English stop-word list is lowercase).

    Args:
        words: Iterable of string tokens.

    Returns:
        A single string in which every kept, stemmed token is preceded by one
        space (so a non-empty result starts with a space); the empty string
        when nothing is kept.
    """
    stemmer = PorterStemmer()

    # A set gives O(1) membership tests; the original list was scanned once
    # per token.
    stop_words = set(stopwords.words('english'))
    stop_words.update(string.punctuation)
    # Compound punctuation tokens that show up in tokenized text.
    stop_words.update([",--", ',"--', ',--', '?"--', ".--", "--", '"',
                       '."--', '?--', '!--', "?-", ";--",
                       '--"', '.--"', ',"', '?"', '."', '!"', '!"--',
                       ';"', '--`', ".'--", ".'", ";'", ",'",
                       "!'", ".)", "),", ":--", "\n", '``', "'d", "''", "'s"])

    kept = []
    for word in words:
        lowered = word.lower()
        if lowered not in stop_words:
            kept.append(" " + stemmer.stem(lowered))
    # Join once instead of growing a string with += inside the loop.
    return "".join(kept)
def get_books(folder, files):
    """Load every listed book and collect authors, titles and cleaned texts.

    Progress is printed to stdout as "Number i of n" before each file is
    processed.

    Args:
        folder: Directory containing the book files; a trailing path
            separator is optional.
        files: Sequence of file names inside ``folder``.

    Returns:
        A tuple ``(authors, titles, texts)`` of three lists, each in the same
        order as ``files``.
    """
    authors = []
    texts = []
    titles = []
    total = len(files)
    for position, name in enumerate(files, 1):
        print("Number {} of {}".format(position, total))
        # os.path.join works whether or not `folder` ends with a separator;
        # plain concatenation silently built a wrong path when it did not.
        author, title, txt = de_gutenberger(os.path.join(folder, name))
        authors.append(author)
        titles.append(title)
        texts.append(txt)
    return authors, titles, texts
def get_matrix(texts):
    """Turn documents into a dense, row-normalized LSA representation.

    The texts are vectorized with TF-IDF (terms appearing in more than half
    of the documents or in fewer than two are ignored, sublinear term
    frequency), reduced with a 100-component TruncatedSVD seeded with
    ``random_state=123``, and passed through a ``Normalizer``.

    Args:
        texts: Iterable of document strings.

    Returns:
        The array produced by the SVD + Normalizer pipeline, one row per
        document.
    """
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2,
                                 use_idf=True, sublinear_tf=True)
    tfidf = vectorizer.fit_transform(texts)

    # The former `vari = (tfidf * tfidf.T).A` was never used: it only built a
    # dense documents-by-documents matrix and depended on the sparse `.A`
    # attribute, so it has been dropped.
    svd = TruncatedSVD(100, random_state=123)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    return lsa.fit_transform(tfidf)
def get_clusters_and_dists(texts, n_clusters=5, random_state=None):
    """Cluster the documents and compute their pairwise Manhattan distances.

    Both results are computed on the matrix returned by ``get_matrix``.

    Args:
        texts: Iterable of document strings, passed to ``get_matrix``.
        n_clusters: Number of KMeans clusters. Defaults to 5, the value that
            was previously hard-coded.
        random_state: Seed forwarded to KMeans. The default ``None`` keeps
            the previous unseeded behaviour; pass an int for repeatable
            cluster labels.

    Returns:
        A tuple ``(clusters, dist_mat)``: the fitted KMeans ``labels_`` and
        the pairwise 'manhattan' distance matrix between document rows.
    """
    dense_matrix = get_matrix(texts)

    km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100,
                n_init=1, random_state=random_state)
    km.fit(dense_matrix)
    clusters = km.labels_

    dist = DistanceMetric.get_metric('manhattan')
    dist_mat = dist.pairwise(dense_matrix)
    return clusters, dist_mat
if __name__ == '__main__':
    # Script entry point: load every book in the sample folder, then cluster
    # them and compute pairwise distances.
    folder = "Bookssample/"
    # os.listdir returns entries in arbitrary order and includes
    # sub-directories; sort and keep regular files only so the rows of the
    # results have a stable order and open() is never called on a directory.
    files = sorted(
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )
    authors, titles, texts = get_books(folder, files)
    clusters, dist_mat = get_clusters_and_dists(texts)