-
Notifications
You must be signed in to change notification settings - Fork 6
/
fuzzy_c.py
215 lines (192 loc) · 5.77 KB
/
fuzzy_c.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
###############################################################################
##
## Ananya Kirti @ June 9 2015
## Fuzzy C means
##
###############################################################################
## Ananya Kirti
# importing all the required components, you may also use scikit for a direct implementation.
import copy
import math
import random
import time
import sys
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import decimal
# Upper bound (inclusive, after int()) of the random integers that
# initialise_U() draws before scaling each row of U to sum to 1.
MAX = 10000.0

# Convergence tolerance read by end_conditon(): iteration stops once no
# entry of U differs from its previous value by more than this amount.
Epsilon = 1e-8
def import_data(file):
    """
    Read a whitespace-delimited file of integers into a list of rows.

    Every line of the file becomes one list of ints, one entry per
    whitespace-separated token (change the split() call to use another
    delimiter such as ","). A blank line yields an empty row.

    file -- path of the file to read (passed through str()).
    Returns the list of rows in file order.
    Raises ValueError if a token is not a valid integer literal.
    """
    # The context manager closes the file even when a token fails to parse.
    with open(str(file), 'r') as f:
        data = [[int(token) for token in line.split()] for line in f]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("finished importing data")
    return data
def import_data_format_iris(file):
    """
    Read a comma-separated iris-style file into features and class indices.
    The data set is at http://archive.ics.uci.edu/ml/machine-learning-databases/iris/

    Each non-blank line is split on ","; every field except the last is
    converted to float and the last field is the class label.
    "Iris-setosa" maps to 0, "Iris-versicolor" to 1 and any other label to 2.

    file -- path of the file to read (passed through str()).
    Returns (data, cluster_location): the list of feature rows and the
    parallel list of class indices.
    Raises ValueError if a feature field is not a valid float literal.
    """
    label_index = {"Iris-setosa": 0, "Iris-versicolor": 1}
    data = []
    cluster_location = []
    # The context manager closes the file even when a field fails to parse.
    with open(str(file), 'r') as f:
        for line in f:
            # Strip the line ending first so the label matches whether or not
            # the line (e.g. the last one) ends with a newline.
            stripped = line.strip()
            if not stripped:
                # A blank line carries no record; skip it instead of indexing
                # into an empty field list.
                continue
            current = stripped.split(",")
            data.append([float(value) for value in current[:-1]])
            cluster_location.append(label_index.get(current[-1].strip(), 2))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("finished importing data")
    return data, cluster_location
def randomise_data(data):
    """
    Shuffle the rows of data and remember where each row came from.

    data -- sequence of rows; it is not modified.
    Returns (new_data, order) where new_data[i] is data[order[i]]. Pass
    order to de_randomise_data() to restore the original ordering.
    """
    # random.shuffle() mutates its argument in place, so it needs a real
    # list; on Python 3 range() returns an immutable range object.
    order = list(range(len(data)))
    random.shuffle(order)
    new_data = [data[source_index] for source_index in order]
    return new_data, order
def de_randomise_data(data, order):
    """
    Undo a shuffle: put every row of data back at its original position.

    order is the index list returned by randomise_data(); the row found at
    position p of data is stored at position order[p] of the result.
    """
    restored = [[] for _ in range(len(data))]
    for position, original_index in enumerate(order):
        restored[original_index] = data[position]
    return restored
def print_matrix(list):
    """
    Print the matrix in a more readable way: one row per line.

    list -- iterable of rows. The parameter name shadows the builtin but is
    kept unchanged so existing callers continue to work.
    """
    for row in list:
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print(row)
def end_conditon(U, U_old, epsilon=None):
    """
    Tell whether the membership matrix has stopped changing.

    U       -- current membership matrix (list of rows).
    U_old   -- membership matrix from the previous iteration, same shape.
    epsilon -- largest per-entry change still counted as "no change";
               when omitted the module-level Epsilon is used.
    Returns True when no entry of U differs from the matching entry of
    U_old by more than epsilon, False otherwise.
    """
    if epsilon is None:
        epsilon = Epsilon
    for i, row in enumerate(U):
        old_row = U_old[i]
        for j, value in enumerate(row):
            # One entry that still moves is enough to keep iterating.
            if abs(value - old_row[j]) > epsilon:
                return False
    return True
def initialise_U(data, cluster_number):
    """
    Build a random membership matrix whose rows each add up to 1.

    One row is produced per entry of data and one column per cluster. Every
    entry starts as a random integer between 1 and int(MAX) (the global MAX)
    and is then divided by the sum of its row.
    """
    U = []
    for _point in data:
        draws = [random.randint(1, int(MAX)) for _cluster in range(cluster_number)]
        # Float total so the division below is a true division.
        row_total = float(sum(draws))
        U.append([draw / row_total for draw in draws])
    return U
def distance(point, center):
    """
    Euclidean distance between two points given as equal-length lists.

    Returns -1 when the two lists do not have the same number of coordinates.
    """
    if len(point) != len(center):
        return -1
    squared_total = 0.0
    for coordinate, center_coordinate in zip(point, center):
        squared_total += abs(coordinate - center_coordinate) ** 2
    return math.sqrt(squared_total)
def normalise_U(U):
    """
    De-fuzzify U in place once clustering is finished.

    In every row, each entry equal to the row maximum becomes 1 and every
    other entry becomes 0, i.e. a point is assigned to the cluster(s) where
    its membership is largest. The same (mutated) U is returned.
    """
    for row in U:
        peak = max(row)
        for column in range(len(U[0])):
            row[column] = 1 if row[column] == peak else 0
    return U
def _cluster_centers(data, U, cluster_number, m):
    """Return, for each cluster, the mean of data weighted by U[k][j] ** m."""
    dimensions = len(data[0])
    centers = []
    for j in range(cluster_number):
        # The weights depend only on the cluster, so compute them once
        # instead of once per dimension.
        weights = [row[j] ** m for row in U]
        weight_total = sum(weights, 0.0)
        centers.append([
            sum((w * point[d] for w, point in zip(weights, data)), 0.0) / weight_total
            for d in range(dimensions)
        ])
    return centers


def _memberships_for_point(distances, exponent):
    """Return one row of U from a point's distances to every cluster center."""
    coincident = [d == 0 for d in distances]
    if any(coincident):
        # The point sits exactly on one or more centers: the usual ratio
        # would divide by zero, so share full membership among those centers.
        share = 1.0 / sum(coincident)
        return [share if hit else 0.0 for hit in coincident]
    return [
        1.0 / sum(((d_j / d_k) ** exponent for d_k in distances), 0.0)
        for d_j in distances
    ]


def fuzzy(data, cluster_number, m = 2):
    """
    This is the main function: it runs fuzzy c-means and returns the final
    normalised membership matrix U.

    data           -- non-empty list of equal-length numeric rows (points).
    cluster_number -- number of clusters to form.
    m              -- the fuzzifier; m == 1 raises ZeroDivisionError because
                      the membership exponent is 2 / (m - 1).

    Starting from a random U (initialise_U), each iteration computes the
    cluster centers from U, rebuilds U from the point-to-center distances,
    and stops once end_conditon() reports that U no longer changes.
    Returns U after normalise_U(): one row per point, holding 1 in the
    column(s) of maximum membership and 0 elsewhere.
    """
    ## initialise the U matrix:
    U = initialise_U(data, cluster_number)
    # Float numerator: under Python 2, 2/(m-1) is integer division and
    # evaluates to 0 for an integer m >= 4.
    exponent = 2.0 / (m - 1)
    while True:
        # cluster center vector
        C = _cluster_centers(data, U, cluster_number, m)
        # keep the previous matrix to check the end condition; a new matrix
        # is built below, so no copy is needed
        U_old = U
        U = [
            _memberships_for_point([distance(point, center) for center in C], exponent)
            for point in data
        ]
        if end_conditon(U, U_old):
            print("finished clustering")
            break
    U = normalise_U(U)
    print("normalised U")
    return U
## main
if __name__ == '__main__':
    # The path of the data file is the only command-line argument; exit with
    # a usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python %s <data-file>" % sys.argv[0])
    # import the data
    data = import_data(str(sys.argv[1]))
    # Alternative for the iris data set, optionally shuffled:
    #   data, cluster_location = import_data_format_iris("iris.txt")
    #   data, order = randomise_data(data)
    start = time.time()
    # now we have the data in a list called data, this is only numbers
    # call the fuzzy c-means function with 2 clusters and fuzzifier m = 2
    final_location = fuzzy(data, 2, 2)
    # If the data was shuffled above, restore the original order with:
    #   final_location = de_randomise_data(final_location, order)
    #   print_matrix(final_location)
    # %-formatting keeps this a single-argument print, which behaves the
    # same on Python 2 and Python 3.
    print("time elapsed= %s" % (time.time() - start))