Please implement k-means on 2dimensional numerical data by C++, which should be
ID: 3747843 • Letter: P
Question
Please implement k-means on 2dimensional numerical data by C++, which should be fairly easy to derive from this.
Let k be the number of clusters you want
Let S be the set of data samples (|S| is the size of the set)
Let A be the set of associate clusters for each data sample
Let sim(x,y) be the similarity function
Let c[k] be the vectors for cluster centers
Init:
Let S' = S
//choose k random vectors to start our clusters
for i=1 to k
j = rand(|S'|)
c[k] = S'[j]
S' = S' - {c[n]} //remove that vector from S' so we can't choose it again end
//assign initial clusters
for i=1 to |S|
A[i] = argmin(j = 1 to k) { sim(S[i], c[j]) }
end
Run:
Let change = true
//recalculate cluster locations if a change occurred
if change
for i = 1 to k mean, count = 0
for j = 1 to |S|
if A[j] == i
mean = mean + S[j]
count = count + 1
end
end c[i] = mean/count
end
end
while change
change = false //assume there is no change
//reassign feature vectors to clusters
for i = 1 to |S|
a = argmin(j = 1 to k) { sim(S[i], c[j]) }
if a != A[i]
A[i] = a
change = true //a vector changed affiliations -- so we need to
//recompute our cluster vectors and run again
end
end
Use your code to cluster the following eight points (with (x, y) representing locations) into three clusters A1(2, 10) A2(2, 5) A3(8, 4) A4(5, 8) A5(7, 5) A6(6, 4) A7(1, 2) A8(4, 9). The distance (similarity) function between two points a=(x1, y1) and b=(x2, y2) is defined as: (a, b) = |x2 – x1| + |y2 – y1| .
To be simple, you can just set k=3 and set S equals to the above eight points. Your program needs output the final clustering result (members in each cluster).
For example:
Cluster1: {A1, A4, …}
Cluster2: {A3, A5, ..}
Cluster3: {A2, …}
Please run your program at least 5 times.
Round 1: Please choose three initial cluster centers as A1, A7, and A8 Round 2: Please choose three initial cluster centers as A2, A6, and A8
Round 3: Please choose three initial cluster centers as A3, A5, and A6
Round 4: Please choose three initial cluster centers as A2, A3, and A7
Round 5: Please randomly choose any three points from a two dimensional space (0,0) to (10,10) as initial three cluster centers.
Among all of these clustering results, please tell us which clustering result is the best one and why?
Explanation / Answer
In each Round main difference is regarding choosing starting point.Its better we choose the center point among available coordiantes rather than randomly choosing.Round 2 was most optimal because in this round algo rans for 2 times only till you arrived at central cluster points.In other round it was 3 times.
//C++ CODE FOR K-Means
#include <iostream>
#include<bits/stdc++.h>
using namespace std;
int sx[8]={2,2,8,5,7,6,1,4};
int sy[8]={10,5,4,8,5,4,2,9};
int A[8]={-1};
double diff(double x1,double y1,double x2,double y2)
{
double x=x2-x1;
double y=y2-y1;
double h= (abs(x)+abs(y));
return h;
}
int func(vector<int >x,vector<int> y)
{
int i,j;
int no_of_times_algo_run=0;
double cx[3];
double cy[3];
for(i=0;i<3;i++)
{
cx[i]=x[i];
cy[i]=y[i];
}
for(i=0;i<8;i++)
{
double mi=20;
for(j=0;j<3;j++)
{
double f=diff(sx[i],sy[i],cx[j],cy[j]);
if(f<mi)
{
mi=f;
A[i]=j;
}
}
}
no_of_times_algo_run++;
bool change=1;
while(change)
{
double count[3]={0};
double meanx[3]={0};
double meany[3]={0};
for(i=0;i<8;i++)
{
count[A[i]]++;
meanx[A[i]]+=sx[i];
meany[A[i]]+=sy[i];
}
for(i=0;i<3;i++)
{
cx[i]=meanx[i]/count[i];
cy[i]=meany[i]/count[i];
}
no_of_times_algo_run++;
change=0;
int cnt=0;
for(i=0;i<8;i++)
{
double mi=20;
int no;
for(j=0;j<3;j++)
{
double f=diff(sx[i],sy[i],cx[j],cy[j]);
if(f<mi)
{
mi=f;
no=j;
}
}
if(no==A[i])
{
cnt++;
}
else
{
A[i]=no;
}
}
if(cnt!=8)
{
change=1;
}
}
return no_of_times_algo_run;
}
int main() {
int k=3;//No of Cluster
int arr1[]={1,7,8};// Round 1
vector<int> x;
vector<int> y;
int i;
for(i=0;i<3;i++)
{
x.push_back(sx[arr1[i]-1]);
y.push_back(sy[arr1[i]-1]);
}
int res=func(x,y);
cout<<"Round 1 Results"<<" ";
cout<<"No of times Algorithm Runs "<<res<<" ";
for(i=0;i<8;i++)
{
cout<<(A[i]+1)<<" ";
}
cout<<" ";
x.clear();
y.clear();
int arr2[]={2,6,8};
for(i=0;i<3;i++)
{
x.push_back(sx[arr2[i]-1]);
y.push_back(sy[arr2[i]-1]);
}
res=func(x,y);
cout<<"Round 2 Results"<<" ";
cout<<"No of times Algorithm Runs "<<res<<" ";
for(i=0;i<8;i++)
{
cout<<(A[i]+1)<<" ";
}
cout<<" ";
x.clear();
y.clear();
int arr3[]={3,5,6};
for(i=0;i<3;i++)
{
x.push_back(sx[arr3[i]-1]);
y.push_back(sy[arr3[i]-1]);
}
res=func(x,y);
cout<<"Round 3 Results"<<" ";
cout<<"No of times Algorithm Runs "<<res<<" ";
for(i=0;i<8;i++)
{
cout<<(A[i]+1)<<" ";
}
cout<<" ";
x.clear();
y.clear();
int arr4[]={2,3,7};
for(i=0;i<3;i++)
{
x.push_back(sx[arr4[i]-1]);
y.push_back(sy[arr4[i]-1]);
}
res=func(x,y);
cout<<"Round 4 Results"<<" ";
cout<<"No of times Algorithm Runs "<<res<<" ";
for(i=0;i<8;i++)
{
cout<<(A[i]+1)<<" ";
}
cout<<" ";
x.clear();
y.clear();
int iSecret;
for(i=0;i<3;i++)
{
iSecret = rand() % 10 + 1;
x.push_back(iSecret);
//cout<<iSecret<<" ";
iSecret = rand() % 10 + 1;
y.push_back(iSecret);
}
res=func(x,y);
cout<<"Round 5 Results"<<" ";
cout<<"No of times Algorithm Runs "<<res<<" ";
for(i=0;i<8;i++)
{
cout<<(A[i]+1)<<" ";
}
cout<<" ";
return 0;
}