Thursday, February 14, 2013

Here is the R code for the competition entry mentioned in my previous post. See http://www.datamilk.com/leaderboard_animation.gif for the animation.

library(lubridate)
library(plyr)
library(sqldf)
library(ggplot2)
library(animation)
 
#clear everything
rm(list=ls(all=TRUE))
 
# Injest data
data <- read.csv("unimelb_public_leaderboard.csv", header=TRUE)
 
# calculate days and date time as numeric
data <- data.frame(
  data
  , SubmissionDate_datetime = strptime(data$SubmissionDate, format="%m/%d/%Y %H:%M:%S %p")
  , Submission_day = round(strptime(data$SubmissionDate, format="%m/%d/%Y %H:%M:%S %p"), "day")
  , Submission_time_num  = as.numeric(strptime(data$SubmissionDate, format="%m/%d/%Y %H:%M:%S %p"))
  )
 
start_time <- min(na.omit(data$SubmissionDate_datetime))
end_time <- max(na.omit(data$SubmissionDate_datetime))
start_day <- min(na.omit(data$Submission_day))
end_day <- max(na.omit(data$Submission_day))
duration<- round(end_time - start_time,0)
team_names <- sqldf("select distinct TeamName from data")
num_teams <- nrow(team_names)
 
competition_days<- seq(start_day, b = "days", length = as.numeric(round(end_day - start_day,0)))
 
####################################################
#make a new column with the leaderboard scores in it
####################################################
 
#sort data by team and submission date
data <- data[with(data, order(TeamName, Submission_time_num)),]
data <- data.frame(data, lb_score = rep(0, nrow(data)))
 
 
current_team <- data$TeamName[1]
current_score <- data$Score[1]
data$lb_score[1] <- current_score
 
for (i in 2:nrow(data)){
  if(data$TeamName[i]==current_team){
    current_score<- max(current_score, data$Score[i])
    data$lb_score[i] <- current_score  
  }    
  else{
    current_team <- data$TeamName[i]
    current_score <- data$Score[i]
    data$lb_score[i] <- current_score
  }
}
 
# Make the animation
 
make_animation <- function(){
  start_time <- min(na.omit(data$SubmissionDate_datetime))
  end_time <- max(na.omit(data$SubmissionDate_datetime))
  start_day <- min(na.omit(data$Submission_day))
  end_day <- max(na.omit(data$Submission_day))
  duration<- round(end_time - start_time,0)
  #fix the start time a an exact number of days before the end time
  t <- end_time - as.numeric(duration)*24*60*60
 
  while(t < end_time){
    t <- min(c(end_time, t+24*60*60))
    days_left <- round((end_time - t), 0)
 
    #extract the data prior to time t
    temp_dat<- na.omit(data[data$SubmissionDate_datetime <= t,])
 
      #find the top 5 teams
      top_teams <- sqldf("select TeamName, max(lb_score) as lb_score
        from temp_dat
          group by Teamname
            order  by 2 desc
              limit 7")
      #get the ranking for each team
      top_teams <- data.frame(top_teams, Leaderboard = paste(rownames(top_teams), " ", top_teams$TeamName, " (", top_teams$lb_score, ")"), sep = "")
 
      # select just the top teams
      temp_dat <- sqldf("select temp_dat.*, top_teams.Leaderboard as Leaderboard
        from temp_dat, top_teams
          where temp_dat.TeamName = top_teams.TeamName and temp_dat.TeamName in (select TeamName from top_teams)")
 
      #turn off the scale for alpha
      sc <- scale_alpha_continuous()
      sc$legend <- FALSE
         #plot the data
         p <- ggplot(temp_dat, aes(x=SubmissionDate_datetime, y=lb_score, group=Leaderboard, colour = Leaderboard, alpha = rank(lb_score)))
         theme_set(theme_gray(base_size = 18))
         print(p+ geom_line(size = 2)
                + geom_point(size = 4, aes(colour = Leaderboard, alpha = rank(lb_score)))
                + xlab("Submission Date")
                + coord_cartesian(xlim = c(min(na.omit(data$SubmissionDate_datetime)), max(na.omit(data$SubmissionDate_datetime))+20*24*60*60)
                                  , #ylim = c(max(c(0, min(na.omit(data$Score)))), max(na.omit(data$Score))+0.01)
                                  ylim = c(0.85, max(na.omit(data$Score))+0.01)
                )
                #add a verticle line to show the current date
                + geom_vline(linetype = 2, xintercept = as.numeric(t))
 
                #add a vertical line to show the end date
                + geom_vline(linetype = 2, xintercept = as.numeric(end_time))
 
                #lable the end date line
                + annotate("text"
                  , label = paste("Competition ends: "
                  , end_day)
                  , x = as.numeric(end_time)+(48*60*60)
                  , y = 0.86
                  , hjust=0
                  , vjust=0
                  , angle = 90
                  )
 
                # add a vertical line to show the current submission date
                + geom_hline(linetype = 2, yintercept = max(temp_dat$Score))
 
                # add a lable to show the current top score
                + annotate("text"
                  , label = paste("Top Score:"
                  , round(max(temp_dat$Score),4))
                  , x = as.numeric(t)+12*60*60
                  , y = max(temp_dat$Score)
                  , hjust=0
                  , vjust=-1
                  )
                #add a lable to show the number of days left
                + annotate("text"
                  , label = paste(days_left, "days left.")
                  , x = as.numeric(t)+12*60*60
                  , y = max(temp_dat$Score)-0.002
                  , hjust=0
                  , vjust=1
                  )
 
                 # add a title
                 + opts(title = paste("'Grant Applications Comp' as of", max(temp_dat$Submission_day)))
                 #turn off the legend for alpha
                 + sc
         )
 
  }
}
 
saveGIF(
    make_animation()
, title = "Predict Grant Applications"
, description = "Predict Grant Applications"
,ani.width = 900
,ani.height = 600
)
 
#create the file
saveHTML(
    make_animation()
, title = "Predict Grant Applications"
, description = "Predict Grant Applications"
,ani.width = 900
,ani.height = 600
)

No comments: