I have this code thanks to experts on year and i have recreated it to fit my nee
ID: 3784428 • Letter: I
Question
I have this code thanks to experts on year and i have recreated it to fit my needs. However, I'm getting two an error on lines 47 and 61 of a ZeroDivisionError.
line 61 new_centroids=update_location(data_points, clusters, centroids)
a Float division by zero.
line 47 new_centroids={k:sum(v)/float(len(v)) for k,v in clusters.items()}
Code
import os.path
import sys
input_name='input2.txt' #raw_input("Enter the name of the input file:")
output_name='output.txt'#raw_input("Enter the name of the output file:")
num_clusters=5 #input("Enter the number of clusters:");
if os.path.exists(input_name):
with open(input_name) as in_f:
content = in_f.readline()
data_points = [float(i) for i in content.strip().split(" ")]
k=num_clusters
centroids = dict(zip(range(k),data_points[0:k]))
clusters = dict(zip(range(k),[[] for i in range(k)]))
point_assignments= dict(zip(range(k), clusters))
old_point_assignments=dict()
def assign_to_clusters(data_points, clusters, centroids,point_assignments):
for key,point in enumerate(data_points):
closest_index = ('inf')
index = 0
for i in range(len(centroids)):
distance = abs(point-centroids[i])
if distance < closest_index:
closest_index = distance
index = i
clusters[index].append(point)
point_assignments[index]=closest_index
return point_assignments
def update_location(data_points, clusters, centroids):
new_centroids={k:sum(v)/float(len(v)) for k,v in clusters.items()}
centroids.update(new_centroids)
return centroids
#Algorithm
iteration=0
point_assignments = assign_to_clusters(data_points, clusters, centroids,point_assignments)
np=dict(old_point_assignments)
while point_assignments != np:
iteration += 1
print " ","Iteration", iteration
for t, o in clusters.items():
print t, '', o
new_centroids=update_location(data_points, clusters, centroids)
old_point_assignments = point_assignments
np=dict(old_point_assignments)
clusters = dict(zip(range(k),[[] for i in range(k)]))
point_assignments = assign_to_clusters(data_points, clusters, new_centroids,point_assignments)
print ""
####do output
with open(output_name, 'w') as f:
for c,p in clusters.items():
for points in p:
f.write("Point " + str(points) + " in " + str(c) + " ")
f.close()
It prints start to print the first interation but does not complete. Any help would be great.
Iteration 1
0 [1.8]
1 []
2 []
3 []
4 []
Thanks
Explanation / Answer
Please find the working code for the problem, previous code had many bugs, so I have changed the code a lot.
Please note that you should keep the input.txt file in same folder.
import os.path
import sys
# print "Please enter the input_data filename"
#in_file = raw_input().strip()
# print "Please enter the output filename"
#out_file = raw_input().strip()
# print "Please enter the number of clusters"
#k = int(raw_input().strip())
in_file = "input.txt"
out_file = "output.txt"
k = 4
if os.path.exists(in_file):
with open(in_file) as in_f:
content = in_f.readline()
l = [float(i) for i in content.strip().split(" ")]
centroids = l[0:k]
centroids_prev = l[0:k]
it = 0
change = 1000
while((change > 0.01)and (it < 1000)):
it = it + 1
print "Iteration ", it
clusters = []
for i in range(0,k):
clusters.append([])
for e in l:
distances = []
for c in centroids:
distances.append(abs(c - e))
clusters[distances.index(min(distances)) - 1].append(e)
for ww in range(0,k):
print ww, clusters[ww]
#update centroids
for x in range(0,len(clusters)):
summ = 0.0
for y in range(0,len(clusters[x])):
summ = summ + clusters[x][y]
if(len(clusters[x]) > 0):
tmp = float(summ)/float(len(clusters[x]))
centroids[x] = tmp
else:
tmp = 0.0
centroids[x] = tmp
centroids_prev.sort()
centroids.sort()
change = 0
for s in range(0,k):
change = change + abs(centroids[s] - centroids_prev[s])
centroids_prev[s] = centroids[s]
f= open(out_file,"w+")
for i in range(0,len(clusters)):
for j in range(0,len(clusters[i])):
f.write("Point ")
f.write(str(clusters[i][j]))
f.write(" in clusters ")
f.write(str(i))
f.write(" ")
f.close()
Sample input.txt
1 800 400 2 3 4 5 401 402 403 404 801 802 803 805
Sample output.txt
Point 3.0 in clusters 0
Point 4.0 in clusters 0
Point 5.0 in clusters 0
Point 400.0 in clusters 1
Point 401.0 in clusters 1
Point 402.0 in clusters 1
Point 403.0 in clusters 1
Point 404.0 in clusters 1
Point 800.0 in clusters 2
Point 801.0 in clusters 2
Point 802.0 in clusters 2
Point 803.0 in clusters 2
Point 805.0 in clusters 2
Point 1.0 in clusters 3
Point 2.0 in clusters 3