Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

CS4347: Homework # 1. (50 pt) The k-means algorithm is pretty straight forward.

ID: 3752530 • Letter: C

Question

CS4347: Homework # 1. (50 pt) The k-means algorithm is pretty straight forward. Here is the pseudo-code for it. Please implement k-means on 2-dimensional numerical data by C++, which should be fairly easy to derive from this. Let k be the number of clusters vou want Let S be the set of data samples (S is the size of the set) Let A be the set of associate clusters for each data sample Let sim(x.y) be the similarity function Let c[k] be the vectors for cluster centers Init Let S. = S /choose k random vectors to start our clusters for i-1 to k rand(S S-S- e[n]) //remove that vector from S' so we can't choose it again end Wassign initial clusters for i= l to SI A[i] = arginin(j = 1 to k) { sim(S[i], c(j]); end Run: Let change-true /recalculate cluster locations if a change occurred if change for i = l to k mean, count 0 for j-1 to S if Ali]-. mean = mean + S[j] count = count + 1 end end c[i] mean/count end end while change = false /assume there is no change change- /reassign feature vectors to clusters for i = l to S a = argmin(j = l to k) { sim(s[i], cli) } if a!-A[i) A[i] = a

Explanation / Answer

#include <iostream>

#include <vector>

#include <math.h>

#include <stdlib.h>

#include <time.h>

#include <algorithm>

using namespace std;

class Point

{

private:

int id_point, id_cluster;

vector<double> values;

int total_values;

string name;

public:

Point(int id_point, vector<double>& values, string name = "")

{

this->id_point = id_point;

total_values = values.size();

for(int i = 0; i < total_values; i++)

this->values.push_back(values[i]);

this->name = name;

id_cluster = -1;

}

int getID()

{

return id_point;

}

void setCluster(int id_cluster)

{

this->id_cluster = id_cluster;

}

int getCluster()

{

return id_cluster;

}

double getValue(int index)

{

return values[index];

}

int getTotalValues()

{

return total_values;

}

void addValue(double value)

{

values.push_back(value);

}

string getName()

{

return name;

}

};

class Cluster

{

private:

int id_cluster;

vector<double> central_values;

vector<Point> points;

public:

Cluster(int id_cluster, Point point)

{

this->id_cluster = id_cluster;

int total_values = point.getTotalValues();

for(int i = 0; i < total_values; i++)

central_values.push_back(point.getValue(i));

points.push_back(point);

}

void addPoint(Point point)

{

points.push_back(point);

}

bool removePoint(int id_point)

{

int total_points = points.size();

for(int i = 0; i < total_points; i++)

{

if(points[i].getID() == id_point)

{

points.erase(points.begin() + i);

return true;

}

}

return false;

}

double getCentralValue(int index)

{

return central_values[index];

}

void setCentralValue(int index, double value)

{

central_values[index] = value;

}

Point getPoint(int index)

{

return points[index];

}

int getTotalPoints()

{

return points.size();

}

int getID()

{

return id_cluster;

}

};

class KMeans

{

private:

int K; // number of clusters

int total_values, total_points, max_iterations;

vector<Cluster> clusters;

// return ID of nearest center (uses euclidean distance)

int getIDNearestCenter(Point point)

{

double sum = 0.0, min_dist;

int id_cluster_center = 0;

for(int i = 0; i < total_values; i++)

{

sum += pow(clusters[0].getCentralValue(i) -

point.getValue(i), 2.0);

}

min_dist = sqrt(sum);

for(int i = 1; i < K; i++)

{

double dist;

sum = 0.0;

for(int j = 0; j < total_values; j++)

{

sum += pow(clusters[i].getCentralValue(j) -

point.getValue(j), 2.0);

}

dist = sqrt(sum);

if(dist < min_dist)

{

min_dist = dist;

id_cluster_center = i;

}

}

return id_cluster_center;

}

public:

KMeans(int K, int total_points, int total_values, int max_iterations)

{

this->K = K;

this->total_points = total_points;

this->total_values = total_values;

this->max_iterations = max_iterations;

}

void run(vector<Point> & points)

{

if(K > total_points)

return;

vector<int> prohibited_indexes;

// choose K distinct values for the centers of the clusters

for(int i = 0; i < K; i++)

{

while(true)

{

int index_point = rand() % total_points;

if(find(prohibited_indexes.begin(), prohibited_indexes.end(),

index_point) == prohibited_indexes.end())

{

prohibited_indexes.push_back(index_point);

points[index_point].setCluster(i);

Cluster cluster(i, points[index_point]);

clusters.push_back(cluster);

break;

}

}

}

int iter = 1;

while(true)

{

bool done = true;

// associates each point to the nearest center

for(int i = 0; i < total_points; i++)

{

int id_old_cluster = points[i].getCluster();

int id_nearest_center = getIDNearestCenter(points[i]);

if(id_old_cluster != id_nearest_center)

{

if(id_old_cluster != -1)

clusters[id_old_cluster].removePoint(points[i].getID());

points[i].setCluster(id_nearest_center);

clusters[id_nearest_center].addPoint(points[i]);

done = false;

}

}

// recalculating the center of each cluster

for(int i = 0; i < K; i++)

{

for(int j = 0; j < total_values; j++)

{

int total_points_cluster = clusters[i].getTotalPoints();

double sum = 0.0;

if(total_points_cluster > 0)

{

for(int p = 0; p < total_points_cluster; p++)

sum += clusters[i].getPoint(p).getValue(j);

clusters[i].setCentralValue(j, sum / total_points_cluster);

}

}

}

if(done == true || iter >= max_iterations)

{

cout << "Break in iteration " << iter << " ";

break;

}

iter++;

}

// shows elements of clusters

for(int i = 0; i < K; i++)

{

int total_points_cluster = clusters[i].getTotalPoints();

cout << "Cluster " << clusters[i].getID() + 1 << endl;

for(int j = 0; j < total_points_cluster; j++)

{

cout << "Point " << clusters[i].getPoint(j).getID() + 1 << ": ";

for(int p = 0; p < total_values; p++)

cout << clusters[i].getPoint(j).getValue(p) << " ";

string point_name = clusters[i].getPoint(j).getName();

if(point_name != "")

cout << "- " << point_name;

cout << endl;

}

cout << "Cluster values: ";

for(int j = 0; j < total_values; j++)

cout << clusters[i].getCentralValue(j) << " ";

cout << " ";

}

}

};

int main(int argc, char *argv[])

{

srand (time(NULL));

int total_points, total_values, K, max_iterations, has_name;

cin >> total_points >> total_values >> K >> max_iterations >> has_name;

vector<Point> points;

string point_name;

for(int i = 0; i < total_points; i++)

{

vector<double> values;

for(int j = 0; j < total_values; j++)

{

double value;

cin >> value;

values.push_back(value);

}

if(has_name)

{

cin >> point_name;

Point p(i, values, point_name);

points.push_back(p);

}

else

{

Point p(i, values);

points.push_back(p);

}

}

KMeans kmeans(K, total_points, total_values, max_iterations);

kmeans.run(points);

return 0;

}