Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

Please implement k-means on 2dimensional numerical data by C++, which should be

ID: 3747843 • Letter: P

Question

Please implement k-means on 2dimensional numerical data by C++, which should be fairly easy to derive from this.

Let k be the number of clusters you want

Let S be the set of data samples (|S| is the size of the set)

Let A be the set of associate clusters for each data sample

Let sim(x,y) be the similarity function

Let c[k] be the vectors for cluster centers

Init:

Let S' = S

//choose k random vectors to start our clusters

for i=1 to k

j = rand(|S'|)

c[k] = S'[j]

S' = S' - {c[n]} //remove that vector from S' so we can't choose it again end

//assign initial clusters

for i=1 to |S|

A[i] = argmin(j = 1 to k) { sim(S[i], c[j]) }

end

Run:

Let change = true

//recalculate cluster locations if a change occurred

if change

for i = 1 to k mean, count = 0

for j = 1 to |S|

if A[j] == i

mean = mean + S[j]

count = count + 1

end

end c[i] = mean/count

end

end

while change

change = false //assume there is no change

//reassign feature vectors to clusters

for i = 1 to |S|

a = argmin(j = 1 to k) { sim(S[i], c[j]) }

if a != A[i]

A[i] = a

change = true //a vector changed affiliations -- so we need to

//recompute our cluster vectors and run again

end

end

Use your code to cluster the following eight points (with (x, y) representing locations) into three clusters A1(2, 10) A2(2, 5) A3(8, 4) A4(5, 8) A5(7, 5) A6(6, 4) A7(1, 2) A8(4, 9). The distance (similarity) function between two points a=(x1, y1) and b=(x2, y2) is defined as: (a, b) = |x2 – x1| + |y2 – y1| .

To be simple, you can just set k=3 and set S equals to the above eight points. Your program needs output the final clustering result (members in each cluster).

For example:

Cluster1: {A1, A4, …}

Cluster2: {A3, A5, ..}

Cluster3: {A2, …}

Please run your program at least 5 times.

Round 1: Please choose three initial cluster centers as A1, A7, and A8 Round 2: Please choose three initial cluster centers as A2, A6, and A8

Round 3: Please choose three initial cluster centers as A3, A5, and A6

Round 4: Please choose three initial cluster centers as A2, A3, and A7

Round 5: Please randomly choose any three points from a two dimensional space (0,0) to (10,10) as initial three cluster centers.

Among all of these clustering results, please tell us which clustering result is the best one and why?

Explanation / Answer

In each Round main difference is regarding choosing starting point.Its better we choose the center point among available coordiantes rather than randomly choosing.Round 2 was most optimal because in this round algo rans for 2 times only till you arrived at central cluster points.In other round it was 3 times.

//C++ CODE FOR K-Means

#include <iostream>

#include<bits/stdc++.h>

using namespace std;

int sx[8]={2,2,8,5,7,6,1,4};

int sy[8]={10,5,4,8,5,4,2,9};

int A[8]={-1};

double diff(double x1,double y1,double x2,double y2)

{

double x=x2-x1;

double y=y2-y1;

double h= (abs(x)+abs(y));

return h;

}

int func(vector<int >x,vector<int> y)

{

int i,j;

int no_of_times_algo_run=0;

double cx[3];

double cy[3];

for(i=0;i<3;i++)

{

cx[i]=x[i];

cy[i]=y[i];

}

for(i=0;i<8;i++)

{

double mi=20;

for(j=0;j<3;j++)

{

double f=diff(sx[i],sy[i],cx[j],cy[j]);

if(f<mi)

{

mi=f;

A[i]=j;

}

}

}

no_of_times_algo_run++;

bool change=1;

while(change)

{

double count[3]={0};

double meanx[3]={0};

double meany[3]={0};

for(i=0;i<8;i++)

{

count[A[i]]++;

meanx[A[i]]+=sx[i];

meany[A[i]]+=sy[i];

}

for(i=0;i<3;i++)

{

cx[i]=meanx[i]/count[i];

cy[i]=meany[i]/count[i];

}

no_of_times_algo_run++;

change=0;

int cnt=0;

for(i=0;i<8;i++)

{

double mi=20;

int no;

for(j=0;j<3;j++)

{

double f=diff(sx[i],sy[i],cx[j],cy[j]);

if(f<mi)

{

mi=f;

no=j;

}

}

if(no==A[i])

{

cnt++;

}

else

{

A[i]=no;

}

}

if(cnt!=8)

{

change=1;

}

}

return no_of_times_algo_run;

}

int main() {

int k=3;//No of Cluster

int arr1[]={1,7,8};// Round 1

vector<int> x;

vector<int> y;

int i;

for(i=0;i<3;i++)

{

x.push_back(sx[arr1[i]-1]);

y.push_back(sy[arr1[i]-1]);

}

int res=func(x,y);

cout<<"Round 1 Results"<<" ";

cout<<"No of times Algorithm Runs "<<res<<" ";

for(i=0;i<8;i++)

{

cout<<(A[i]+1)<<" ";

}

cout<<" ";

x.clear();

y.clear();

int arr2[]={2,6,8};

for(i=0;i<3;i++)

{

x.push_back(sx[arr2[i]-1]);

y.push_back(sy[arr2[i]-1]);

}

res=func(x,y);

cout<<"Round 2 Results"<<" ";

cout<<"No of times Algorithm Runs "<<res<<" ";

for(i=0;i<8;i++)

{

cout<<(A[i]+1)<<" ";

}

cout<<" ";

x.clear();

y.clear();

int arr3[]={3,5,6};

for(i=0;i<3;i++)

{

x.push_back(sx[arr3[i]-1]);

y.push_back(sy[arr3[i]-1]);

}

res=func(x,y);

cout<<"Round 3 Results"<<" ";

cout<<"No of times Algorithm Runs "<<res<<" ";

for(i=0;i<8;i++)

{

cout<<(A[i]+1)<<" ";

}

cout<<" ";

x.clear();

y.clear();

int arr4[]={2,3,7};

for(i=0;i<3;i++)

{

x.push_back(sx[arr4[i]-1]);

y.push_back(sy[arr4[i]-1]);

}

res=func(x,y);

cout<<"Round 4 Results"<<" ";

cout<<"No of times Algorithm Runs "<<res<<" ";

for(i=0;i<8;i++)

{

cout<<(A[i]+1)<<" ";

}

cout<<" ";

x.clear();

y.clear();

int iSecret;

for(i=0;i<3;i++)

{

iSecret = rand() % 10 + 1;

x.push_back(iSecret);

//cout<<iSecret<<" ";

iSecret = rand() % 10 + 1;

y.push_back(iSecret);

}

res=func(x,y);

cout<<"Round 5 Results"<<" ";

cout<<"No of times Algorithm Runs "<<res<<" ";

for(i=0;i<8;i++)

{

cout<<(A[i]+1)<<" ";

}

cout<<" ";

  

return 0;

}