Saturday, August 25, 2012

Create a pdf data dictionary from a dataset

Here is some R code you can use to create a pdf data dictionary from a dataset. It's a quick and easy way to get an overview of a dataset.

You need to have a latext program like MiKTeX installed to make it work. 

You can see the sort of output it produces here.
#######################################
# code to produce a pdf data dictionary of a dataset using sweave
#
# df: the data frame you want to create the data dictionary of
# file_loc: the location where you want the resulting report stored
# filename: the name you want the report to have
# author: the name of the person who created the data dictionary
#
#######################################
 
 
data_dict <- function(df, file_loc, file_name, author, num_lev = 2){
  # initiate libraries
  library(tools)
  library(ggplot2)
  library(gplots)
  library(lattice)
  library(sqldf)
 
  #Make some subsidary functions
 
  # initial funtion to print without quotes and line numbers
  cat_nl<-function(x){
    cat(x, fill = T)
  }
 
  #this function takes a factor f and an integer num_levels and returns a subset of f 
  reduce <- function(f, n){
  levels(f) <- names(sort(table(f), descreasing = T))
  n <- min(n, length(levels(f)))
  f<- factor(f[f %in% levels(f)[1:n]])
  return(f)
  }
 
  #finds the class of each variable in a dataset
 find_class <- function(df){
  l<-c()
  for(i in 1 : ncol(df)){
   l<-c(l, class(df[, i]))
  }
  return(l)
 }
 
 classes <- find_class(df)
 
 #returns the levels of a factor as a string
 find_levels<- function(v){
  sort(table(v), decreasing = T)
 }
 
  #create the output filenames
  rnw_file_name <- paste(file_name, ".rnw", sep = "")
  tex_file_name <- paste(file_name, ".tex", sep = "")
 
  #direct the output correctly
  setwd(file_loc)
  sink(rnw_file_name, type = "output")
  # initial funtion to print without quotes and line numbers
  cat_nl<-function(x){
    cat(x, fill = T)
  }
 
  setwd(file_loc)
  df_name <- deparse(substitute(df))
  cat_nl('\\documentclass[a4paper]{article}')
  cat_nl(paste('\\title{Data Dictionary for dataset: ', df_name, '}', sep = ""))
  cat_nl(paste('\\author{', author, '}', sep = ""))
  cat_nl('\\begin{document}')
  cat_nl('\\maketitle')
 
  cat_nl('\\noindent')
 
  cat_nl("Dataset\\\\")
  cat_nl("------------------------\\\\")
  cat_nl("Variables\\\\")
  cat_nl("------------------------\\\\")
  cat_nl(paste("Names of variables (alphabetical order):", paste(sort(names(df)), collapse=", "), '\\\\'))
  cat_nl(paste("Number of variables:", ncol(df), "\\\\"))
  if (ncol(df[classes =="numeric"])>0){
    cat_nl(paste("Number of numeric variables:", ncol(df[classes =="numeric"]), "\\\\"))
  }
  if (ncol(df[classes =="integer"])>0){
    cat_nl(paste("Number of integer variables:", ncol(df[classes =="integer"]), "\\\\"))
  }
  if (ncol(df[classes =="character"])>0){
    cat_nl(paste("Number of character variables:", ncol(df[classes =="character"]), "\\\\"))
  }
  if (ncol(df[classes =="logical"])>0){
    cat_nl(paste("Number of logical variables:", ncol(df[classes =="logical"]), "\\\\"))
  }
  if (ncol(df[classes =="factor"])>0){
    cat_nl(paste("Number of factors:", ncol(df[classes =="factor"]), "\\\\"))
  }
  if (ncol(df[classes =="Date"])>0){
    cat_nl(paste("Number of Date variables:", ncol(df[classes =="Date"]), "\\\\"))
  }
  if (ncol(df[classes =="ts"])>0){
    cat_nl(paste("Number of time series variables:", ncol(df[classes =="ts"]), "\\\\"))
  }
  cat_nl('\\newline')
  cat_nl("Observations\\\\")
  cat_nl("------------------------\\\\")
  cat_nl(paste("Number of observations:", nrow(df), "\\\\"))
  cat_nl(paste("Number of complete observations:", nrow(na.omit(df)), "\\\\"))
 
 
  var_num <- 0
  for(i in 1:ncol(df)){
  var_num <- var_num + 1
 if(class(df[, i]) == "numeric" | class(df[, i]) == "integer" | class(df[, i]) == "Date"){
    cat_nl('\\pagebreak')
    cat_nl('\\newline')
   cat_nl(paste("Variable", var_num, "\\\\"))
    cat_nl("------------------------\\\\")
    cat_nl(paste("Name:", names(df)[i], "\\\\"))
    cat_nl(paste("Class:", classes[i], "\\\\"))
    cat_nl(paste("Total obs:", length(df[, i]), "\\\\"))
    cat_nl(paste("Missing obs:", length(df[, i])-length(na.omit(df[, i])), "\\\\"))
    cat_nl(paste("Mean of non missing:", round(mean(na.omit(df[, i])), 3), "\\\\"))
    cat_nl(paste("Median of non missing:", round(median(na.omit(df[, i])),3), "\\\\"))
    cat_nl(paste("SD of non missing:", round(sd(na.omit(df[, i])),3), "\\\\"))
    cat_nl(paste("Range of non missing:", round(max(na.omit(df[, i]))- min(na.omit(df[, i])),3), "\\\\" ))
    cat_nl('\\newline')
 
    #plot a histogram
    cat_nl('<<echo = FALSE, fig = TRUE>>=')
    cat_nl(paste('print(qplot(',df_name,'[,',i,'], data=',df_name,', geom="histogram", fill=..count.., xlab=names(',df_name,')[',i,'])+ opts(title = paste("Histogram of", names(',df_name,')[',i,'])))', sep = ""))
    cat_nl('@')
    cat_nl('\\newline')
  }
 
  if(class(df[, i]) == "factor" | class(df[, i]) == "character"){
    cat_nl('\\pagebreak')
    cat_nl('\\newline')
   cat_nl(paste("Variable", var_num, "\\\\"))
    cat_nl("------------------------\\\\")
    cat_nl(paste("Name:", names(df)[i], "\\\\"))
    cat_nl(paste("Class:", classes[i], "\\\\"))
    cat_nl(paste("Levels:", paste(names(find_levels(df[, i])), collapse=", "), "\\\\"))
    cat_nl('\\newline')
 
    cat_nl('<<echo = FALSE, fig = TRUE>>=')  
    # keep only data from the top num_levels
    # comprising only the num_levels most populus levels of f, or the whole of f if num_levels > length(levels(f))
    cat_nl('reduce <- function(f, n){')
    cat_nl('levels(f) <- names(sort(table(f), descreasing = T))')
    cat_nl('n <- min(n, length(levels(f)))')
    cat_nl('f<- factor(f[f %in% levels(f)[1:n]])')
    cat_nl('return(f)')
    cat_nl('}') 
 
    cat_nl(paste('temp<- data.frame(f = factor(',df_name,'[, ',i,'], levels = names(sort(table(',df_name,'[, ',i,']), decreasing = T))))'))
    cat_nl(paste('temp<- data.frame(g = reduce(temp$f, ',num_lev,'))')) 
    cat_nl('c <- ggplot(temp, aes(x = g, fill = g))')
    cat_nl(paste('print(c + geom_bar()+ xlab(names(',df_name,')[',i,']) + opts(axis.text.x=theme_text(angle=-45, hjust=0), title = paste("Bar chart showing ",',min(num_lev, length(levels(as.factor(df[,i])))),',"of",',length(levels(as.factor(df[,i]))), ',"levels of", names(',df_name,')[',i,']), legend.title=theme_blank()))'))
 
 
    cat_nl('@')
    cat_nl('\\newline')
  }
  if(class(df[, i]) == "logical"){
    cat_nl('\\pagebreak')
    cat_nl('\\newline')
   cat_nl(paste("Variable", var_num, "\\\\"))
    cat_nl("------------------------\\\\")
    cat_nl(paste("Name:", names(df)[i], "\\\\"))
    cat_nl(paste("Class:", classes[i], "\\\\"))
    cat_nl(paste("Total obs:", length(df[, i]), "\\\\"))
    cat_nl(paste("Missing obs:", length(df[, i])-length(na.omit(df[, i])), "\\\\"))
    cat_nl(paste("count of obs = T:", sum(na.omit(df[, i])), "\\\\"))
    cat_nl(paste("count of obs = F:", sum(na.omit(df[, i]==F)), "\\\\"))
    cat_nl(paste("\\% non missing which = T: ", round(sum(na.omit(df[, i]))/length(na.omit(df[, i]))*100, 3), "\\%", "\\\\", sep = ""))
    cat_nl('\\newline')
 
    cat_nl('<<echo = FALSE, fig = TRUE>>=')  
    # keep only data from the top num_levels
    # comprising only the num_levels most populus levels of f, or the whole of f if num_levels > length(levels(f))
    cat_nl('reduce <- function(f, n){')
    cat_nl('levels(f) <- names(sort(table(f), descreasing = T))')
    cat_nl('n <- min(n, length(levels(f)))')
    cat_nl('f<- factor(f[f %in% levels(f)[1:n]])')
    cat_nl('return(f)')
    cat_nl('}') 
 
    cat_nl(paste('temp<- data.frame(f = factor(',df_name,'[, ',i,'], levels = names(sort(table(',df_name,'[, ',i,']), decreasing = T))))'))
        cat_nl(paste('temp<- data.frame(g = reduce(temp$f, ',num_lev,'))')) 
    cat_nl('c <- ggplot(temp, aes(x = g, fill = g))')
    cat_nl(paste('print(c + geom_bar()+ xlab(names(',df_name,')[',i,']) + opts(axis.text.x=theme_text(angle=-45, hjust=0), title = paste("Bar chart of", names(',df_name,')[',i,']), legend.title=theme_blank()))'))
 
    cat_nl('@')
    cat_nl('\\newline')
  }
 
   if(class(df[, i]) == "ts"){
    cat_nl('\\pagebreak')
    cat_nl('\\newline')
   cat_nl(paste("Variable", var_num, "\\\\"))
    cat_nl("------------------------\\\\")
    cat_nl(paste("Name:", names(df)[i], "\\\\"))
    cat_nl(paste("Class:", classes[i], "(time series) \\\\"))
    cat_nl(paste("Total obs:", length(df[, i]), "\\\\"))
    cat_nl(paste("Missing obs:", length(df[, i])-length(na.omit(df[, i])), "\\\\"))
    cat_nl(paste("First obs observed at: ",start(df[,i])[1] , ' \\\\', sep = ''))
    cat_nl(paste("Last obs observed at: ",end(df[,i])[1] , ' \\\\', sep = ''))
    cat_nl(paste("Number of obs per unit time:", frequency(df[,i])))
 
    cat_nl('\\newline')
    cat_nl('<<echo = FALSE, fig = TRUE>>=')
    cat_nl(paste("plot(", df_name, "[", i, "], ylab = names(", df_name, "[", i, "]), main = names(", df_name, "[", i, "]))", sep = ""))
    cat_nl('@')
  } 
  }
 
  cat_nl('\\end{document}')
  #return the output to the screen
  sink()
  Sweave(rnw_file_name)
  texi2dvi(tex_file_name, pdf=TRUE)
}
 
#make a test data set
 
#x <- iris[,4:5]
#x<- data.frame(x, vir = (x$Species=="setosa"))
 
#data_dict(x, "C:/Users/Ross/Desktop/test", "x_test", "Ross Farrelly")
# print.ts(.)
## Using July 1954 as start date:
 
df_1 <- data.frame(
  weight = rnorm(100), 
  height = rnorm(10), 
  class = as.factor(c(rep("class 1", 90), rep(NA, 2), rep("class 2", 8))), 
  name = as.character(c(rep("fred", 40), rep("jim",20), rep("sally", 30), rep("bill", 10))), 
  date = seq(as.Date("2000/1/1"), by="month", length.out=100),
  paid = rep(c(F, NA, T, T),25) ,
  gnp = ts(cumsum(1 + round(rnorm(100), 2)), start = c(1954, 7), frequency = 12),
  stringsAsFactors=FALSE
)
 
data_dict(df_1, "C:/Users/Ross/Desktop/test", "df_1_test", "Ross Farrelly", 3)
#data_dict(trees , "C:/Users/Ross/Desktop/test", "trees", "Ross Farrelly")
#data_dict(stackloss , "C:/Users/Ross/Desktop/test", "stackloss", "Ross Farrelly")
Created by Pretty R at inside-R.org

No comments: