-
Notifications
You must be signed in to change notification settings - Fork 3
/
minibatchKM.py
98 lines (77 loc) · 2.93 KB
/
minibatchKM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#
# kmeans using sci-kit
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
import csv
import random
from scipy.stats import mode
import matplotlib.pyplot as plt
def read_txt(filename):
"""
Reads txt file and returns tuple with
list of top 5,000 words and
the index : frequency for each track
"""
content = [] # list with word index : word count for each track
string = '%'
find = False
words = []
track_id = [] # list with track ID's from the MSD
mxm_tid = [] # track ID's from musiXmatch
str_data = []
read_file = open(filename, "r")
for line in read_file:
if find:
line = line.strip() # converting line into list
index1 = line.find(',') # finds index of 1st comma
index2 = line.find(',', index1+1) # finds index of 2nd comma
track_id.append(line[:index1]) # appends track id to list
mxm_tid.append(line[:index2]) # appends track id to list
res = '{' + line[index2+1:] + '}' # simulates dictionary with string
d = eval(res) # converts string to actual dictionary
content.append(d) # appends track data to content list
else:
# obtaining line with 5,000 words
if line.startswith(string):
line = line[1:] # getting rid of %
words = [word.strip() for word in line.split(',')]
find = True # already found list of words
read_file.close()
return (words, content, track_id, mxm_tid)
def create_vectors(list_dict, num_words):
"""
Returns a list x for all the data points.
Each element of x is a NumPy vector with 5,000 elements,
one for each word.
"""
x = [] # list that will hold data
# maxval = 0
# minval = 0
# res = 0
for d in list_dict:
# initializing numpy vector
# it contains 5,000 (number of words) zeros
temp = np.zeros(num_words, dtype=np.float64)
for key, val in d.items():
if key < num_words:
key -= 1 # indexing in data starts at 1
temp[key] = 1 # adding word and its frequency to vector
# temp[key] = val
x.append(temp) # appends vector to x
return np.array(x)
if __name__ == "__main__":
(words, test_list, track_id, mxm_tid) = read_txt('mxm_dataset_test.txt')
print('== Finished test txt')
(words, train_list, track_id, mxm_tid) = read_txt('mxm_dataset_train.txt')
print('== Finished train txt')
#print(words)
x_test = create_vectors(test_list, 5000)
x_train = create_vectors(train_list, 5000)
X = np.concatenate((x_test, x_train), axis=0)
print('== Beginning clustering')
kmeans = MiniBatchKMeans(n_clusters=3, random_state=0).fit(X)
cost = (kmeans.inertia_)/ len(X)
mu = kmeans.cluster_centers_
m = len(X)
c = kmeans.labels_
print('J = ' + str(kmeans.inertia_ / m))