K Means Clustering

K-means clustering is one of the simplest and most commonly used unsupervised clustering algorithms.

The general approach is as follows:

Choose k centroids randomly.
Calculate the distance from each point in the dataset to be classified to each centroid.
Assign each point to the nearest centroid.
Calculate the centroids of the resulting clusters.
Repeat the previous three steps until the centroids move by less than a chosen tolerance (or a maximum number of iterations is reached).

Here is some R code which generates a data set and implements the algorithm. Click here to see the animation.

###########################################
# R code to implement k-means clustering
##########################################
#  NB - to make the animation, make sure you have ImageMagick installed from http://www.imagemagick.org/
##########################################
 
# initiate libraries
library(animation)
 
# set working directory
# The path below is specific to the original author's Windows machine
# (presumably chosen so the generated animation lands on the desktop - confirm).
# Only switch to it when it actually exists; otherwise keep the current working
# directory instead of aborting the whole script with a setwd() error.
output_dir <- "C:/Users/RF186004/Desktop"
if (dir.exists(output_dir)) {
  setwd(output_dir)
} else {
  message("Directory '", output_dir, "' not found; staying in ", getwd())
}
 
 
#########################################
# Define some functions to be used later
#########################################
 
# Generate four circular groups of points and run k-means on them, drawing one
# plot per iteration. Each pass through the loop draws the data coloured by
# current cluster with the new centroids in red; the function is intended to be
# run inside saveMovie() so that each plot becomes one animation frame.
#
# Arguments (the defaults reproduce the original hard-coded settings):
#   span           the four groups are centred on (span, span), (-span, span),
#                  (-span, -span) and (span, -span)
#   radius         radius of each circular group
#   num_in_group   number of points generated per group
#   k              number of clusters to identify
#   tolerance      stop once the mean centroid movement is <= tolerance
#   max_iterations hard cap on the number of iterations
#
# Returns (invisibly) a list with
#   num_iterations number of iterations performed
#   delta_avg      mean distance the centroids moved in the last iteration
#                  (Inf if no iteration was performed)
make_animation <- function(span = 3, radius = 4, num_in_group = 500, k = 4,
                           tolerance = 0.01, max_iterations = 50) {
  stopifnot(
    is.numeric(span), is.numeric(radius), radius > 0,
    num_in_group >= 1,
    k >= 1, k == round(k),
    k <= 4 * num_in_group
  )

  #########################################
  # generate the data to be clustered
  #########################################
  # Builds one circular group centred on (cent_x, cent_y): x is uniform over
  # the circle's width and y is uniform along the vertical chord at that x.
  # pmax() guards against a tiny negative value under the square root caused
  # by floating point rounding at the edge of the circle.
  make_group <- function(cent_x, cent_y, label) {
    x <- runif(num_in_group, cent_x - radius, cent_x + radius)
    half_chord <- sqrt(pmax(0, radius^2 - (x - cent_x)^2))
    y <- runif(num_in_group, cent_y - half_chord, cent_y + half_chord)
    cbind(x, y, group = rep(label, num_in_group))
  }

  data <- rbind(
    make_group(span, span, 1),
    make_group(-span, span, 2),
    make_group(-span, -span, 3),
    make_group(span, -span, 4)
  )
  n_points <- nrow(data)

  ##########################################
  # do the clustering
  ##########################################
  # randomly select k distinct data points as the starting centroids;
  # drop = FALSE keeps a k by 2 matrix even when k is 1
  centroids <- data[sample.int(n_points, k), 1:2, drop = FALSE]
  delta_avg <- Inf
  num_iterations <- 0

  while (delta_avg > tolerance && num_iterations < max_iterations) {

    # distance from every point to every centroid - an n_points by k matrix
    distance <- vapply(
      seq_len(k),
      function(j) {
        sqrt((data[, 1] - centroids[j, 1])^2 + (data[, 2] - centroids[j, 2])^2)
      },
      numeric(n_points)
    )

    # assign each point to its nearest centroid
    current_cluster <- apply(distance, 1, which.min)

    # find the new centroids - a k by 2 matrix. A cluster that received no
    # points keeps its previous centroid: the mean of zero points is NaN, which
    # would otherwise make delta_avg NaN and break the while() condition.
    new_centroids <- centroids
    for (i in seq_len(k)) {
      members <- current_cluster == i
      if (any(members)) {
        new_centroids[i, ] <- colMeans(data[members, 1:2, drop = FALSE])
      }
    }

    # plot the data and the centroids
    plot(data[, 1], data[, 2], col = current_cluster, pch = 3, cex = 0.5,
         xlab = "x", ylab = "y", main = "K Means Clustering")
    points(new_centroids[, 1], new_centroids[, 2],
           pch = 16, cex = 1.5, col = "red")

    # average distance the centroids moved in this iteration
    delta_avg <- mean(sqrt(rowSums((new_centroids - centroids)^2)))

    centroids <- new_centroids
    num_iterations <- num_iterations + 1
  }

  invisible(list(num_iterations = num_iterations, delta_avg = delta_avg))
}

# The iteration count and final delta are local to make_animation(), so they
# are captured from its return value. R evaluates a function argument in the
# caller's environment, so the assignment inside the saveMovie() call creates
# run_stats here at top level when saveMovie() evaluates the expression.
run_stats <- NULL
saveMovie(run_stats <- make_animation(),
          interval = 0.01, width = 580, height = 400)
print(paste("number of iterations =", run_stats$num_iterations))
print(paste("Last avg delta =", run_stats$delta_avg))

Created by Pretty R at inside-R.org

Search This Blog

Ross Farrelly

K Means Clustering

Comments

Popular posts from this blog

Why Study Shakespeare?

Can Data Mining Algorithms Extract Value from your Personal Data (and should you get a piece of the action?)

Please support me on Kaggle