### K Means Clustering

K means Clustering is one of the simplest and most commonly used unsupervised clustering algorithms around.

The general approach is as follows:

Created by Pretty R at inside-R.org

The general approach is as follows:

- Choose k centroids randomly.
- Calculate the distance from each point in the dataset to be classified to each centroid.
- Assign each point to the nearest centroid.
- Calculate the centroids of the resulting clusters.
- Repeat until the centroids don't move too much.

########################################### # R code to implement k means classification ########################################## # NB - to make the animation - make sure you have ImageMajick installed from http://www.imagemagick.org/ ########################################## # initiate libraries library(animation) # set working directory setwd('C:/Users/RF186004/Desktop') ######################################### # Define some functions to be used later ######################################### make_animation <- function(){ #given x, this function finds the y values to create a circular cluster find_y<- function(x) runif(1, -(sqrt((radius)^2-(x-cent_x)^2)+cent_y)+2*cent_y, sqrt((radius)^2-(x-cent_x)^2)+cent_y) #finds the distances from each point to each centroid get_distances<- function(x) sqrt((x[1] - centroids[1:k,1])^2 + (x[2] - centroids[1:k,2])^2) #finds the centroids of a the clusters find_centroids<- function(i) c(mean(data[current_cluster == i,1]), mean(data[current_cluster == i,2])) #finds how far the centroids have moved find_delta <- function(i) sqrt((new_centroids[i, 1]-centroids[i, 1])^2+(new_centroids[i, 2]-centroids[i, 2])^2) #plots the centroids on the graph plot_centroids <- function(i) points(new_centroids[i,1], new_centroids[i,2], pch = 16, cex=1.5, col = "red") ######################################### # set the parameters ######################################### span <- 3 # see belwo for definiion radius = 4 # of the circles of data to generate num_in_group = 500 k <- 4 # specify the number of clusters to identify ########################################## # generate the data to be clustered. It comprises four circular groups of num_in_group points, centered on (span, span), (-span, span), (-span, -span), (-span, span) with a radius of radius ########################################## #make the 1st groups of data cent_x<- span cent_y<- span x = runif(num_in_group, cent_x-radius, cent_x+radius) y = apply(as.matrix(x), 1, find_y) g1 <- cbind(x, y, group = rep(1, num_in_group)) #make the 2nd groups of data cent_x<- -span cent_y<- span x = runif(num_in_group, cent_x-radius, cent_x+radius) y = apply(as.matrix(x), 1, find_y) g2 <- cbind(x, y, rep(2, num_in_group)) #make the 3rd groups of data cent_x<- -span cent_y<- -span x = runif(num_in_group, cent_x-radius, cent_x+radius) y = apply(as.matrix(x), 1, find_y) g3 <- cbind(x, y, rep(3, num_in_group)) #make the 4th groups of data cent_x<- span cent_y<- -span x = runif(num_in_group, cent_x-radius, cent_x+radius) y = apply(as.matrix(x), 1, find_y) g4 <- cbind(x, y, rep(4, num_in_group)) data <- rbind(g1, g2, g3, g4) ########################################## # do the clustering ########################################## #randomly select 4 centroids centroids_indicies <- sample(c(1:length(data[,1])), k, replace = FALSE) centroids <- data[centroids_indicies,1:2] delta_avg <- 10 num_interations <- 0 while(delta_avg > 0.01 && num_interations < 50){ #find the distance between each point and each of the 4 centroids distance <- t(apply(as.matrix(data), 1, get_distances)) # assign each point to a cluster current_cluster <- apply(as.matrix(distance), 1, which.min) # find the new centroids - a k by 2 matrix new_centroids <- t(apply(as.matrix(c(1:k)), 1, find_centroids)) #plot the data and the centroids plot(data[,1], data[,2], col = current_cluster, pch = 3, cex=0.5, xlab="x", ylab="y", main="K Means Clustering") apply(as.matrix(c(1:k)), 1, plot_centroids) #find how much each centroid moved delta <- t(apply(as.matrix(c(1:k)), 1, find_delta)) delta_avg = mean(delta) centroids <- new_centroids num_interations <- num_interations+1 } } saveMovie(make_animation(),interval = 0.01, width = 580, height = 400) paste("number of iterations =", num_interations) paste("Last avg delta =", delta_avg)

## Comments