Here is some R code you can use to create a pdf data dictionary from a dataset. It's a quick and easy way to get an overview of a dataset.
You need to have a latext program like MiKTeX installed to make it work.
You can see the sort of output it produces here.
#######################################
# code to produce a pdf data dictionary of a dataset using sweave
#
# df: the data frame you want to create the data dictionary of
# file_loc: the location where you want the resulting report stored
# filename: the name you want the report to have
# author: the name of the person who created the data dictionary
#
#######################################
data_dict <- function(df, file_loc, file_name, author, num_lev = 2){
# initiate libraries
library(tools)
library(ggplot2)
library(gplots)
library(lattice)
library(sqldf)
#Make some subsidary functions
# initial funtion to print without quotes and line numbers
cat_nl<-function(x){
cat(x, fill = T)
}
#this function takes a factor f and an integer num_levels and returns a subset of f
reduce <- function(f, n){
levels(f) <- names(sort(table(f), descreasing = T))
n <- min(n, length(levels(f)))
f<- factor(f[f %in% levels(f)[1:n]])
return(f)
}
#finds the class of each variable in a dataset
find_class <- function(df){
l<-c()
for(i in 1 : ncol(df)){
l<-c(l, class(df[, i]))
}
return(l)
}
classes <- find_class(df)
#returns the levels of a factor as a string
find_levels<- function(v){
sort(table(v), decreasing = T)
}
#create the output filenames
rnw_file_name <- paste(file_name, ".rnw", sep = "")
tex_file_name <- paste(file_name, ".tex", sep = "")
#direct the output correctly
setwd(file_loc)
sink(rnw_file_name, type = "output")
# initial funtion to print without quotes and line numbers
cat_nl<-function(x){
cat(x, fill = T)
}
setwd(file_loc)
df_name <- deparse(substitute(df))
cat_nl('\\documentclass[a4paper]{article}')
cat_nl(paste('\\title{Data Dictionary for dataset: ', df_name, '}', sep = ""))
cat_nl(paste('\\author{', author, '}', sep = ""))
cat_nl('\\begin{document}')
cat_nl('\\maketitle')
cat_nl('\\noindent')
cat_nl("Dataset\\\\")
cat_nl("------------------------\\\\")
cat_nl("Variables\\\\")
cat_nl("------------------------\\\\")
cat_nl(paste("Names of variables (alphabetical order):", paste(sort(names(df)), collapse=", "), '\\\\'))
cat_nl(paste("Number of variables:", ncol(df), "\\\\"))
if (ncol(df[classes =="numeric"])>0){
cat_nl(paste("Number of numeric variables:", ncol(df[classes =="numeric"]), "\\\\"))
}
if (ncol(df[classes =="integer"])>0){
cat_nl(paste("Number of integer variables:", ncol(df[classes =="integer"]), "\\\\"))
}
if (ncol(df[classes =="character"])>0){
cat_nl(paste("Number of character variables:", ncol(df[classes =="character"]), "\\\\"))
}
if (ncol(df[classes =="logical"])>0){
cat_nl(paste("Number of logical variables:", ncol(df[classes =="logical"]), "\\\\"))
}
if (ncol(df[classes =="factor"])>0){
cat_nl(paste("Number of factors:", ncol(df[classes =="factor"]), "\\\\"))
}
if (ncol(df[classes =="Date"])>0){
cat_nl(paste("Number of Date variables:", ncol(df[classes =="Date"]), "\\\\"))
}
if (ncol(df[classes =="ts"])>0){
cat_nl(paste("Number of time series variables:", ncol(df[classes =="ts"]), "\\\\"))
}
cat_nl('\\newline')
cat_nl("Observations\\\\")
cat_nl("------------------------\\\\")
cat_nl(paste("Number of observations:", nrow(df), "\\\\"))
cat_nl(paste("Number of complete observations:", nrow(na.omit(df)), "\\\\"))
var_num <- 0
for(i in 1:ncol(df)){
var_num <- var_num + 1
if(class(df[, i]) == "numeric" | class(df[, i]) == "integer" | class(df[, i]) == "Date"){
cat_nl('\\pagebreak')
cat_nl('\\newline')
cat_nl(paste("Variable", var_num, "\\\\"))
cat_nl("------------------------\\\\")
cat_nl(paste("Name:", names(df)[i], "\\\\"))
cat_nl(paste("Class:", classes[i], "\\\\"))
cat_nl(paste("Total obs:", length(df[, i]), "\\\\"))
cat_nl(paste("Missing obs:", length(df[, i])-length(na.omit(df[, i])), "\\\\"))
cat_nl(paste("Mean of non missing:", round(mean(na.omit(df[, i])), 3), "\\\\"))
cat_nl(paste("Median of non missing:", round(median(na.omit(df[, i])),3), "\\\\"))
cat_nl(paste("SD of non missing:", round(sd(na.omit(df[, i])),3), "\\\\"))
cat_nl(paste("Range of non missing:", round(max(na.omit(df[, i]))- min(na.omit(df[, i])),3), "\\\\" ))
cat_nl('\\newline')
#plot a histogram
cat_nl('<<echo = FALSE, fig = TRUE>>=')
cat_nl(paste('print(qplot(',df_name,'[,',i,'], data=',df_name,', geom="histogram", fill=..count.., xlab=names(',df_name,')[',i,'])+ opts(title = paste("Histogram of", names(',df_name,')[',i,'])))', sep = ""))
cat_nl('@')
cat_nl('\\newline')
}
if(class(df[, i]) == "factor" | class(df[, i]) == "character"){
cat_nl('\\pagebreak')
cat_nl('\\newline')
cat_nl(paste("Variable", var_num, "\\\\"))
cat_nl("------------------------\\\\")
cat_nl(paste("Name:", names(df)[i], "\\\\"))
cat_nl(paste("Class:", classes[i], "\\\\"))
cat_nl(paste("Levels:", paste(names(find_levels(df[, i])), collapse=", "), "\\\\"))
cat_nl('\\newline')
cat_nl('<<echo = FALSE, fig = TRUE>>=')
# keep only data from the top num_levels
# comprising only the num_levels most populus levels of f, or the whole of f if num_levels > length(levels(f))
cat_nl('reduce <- function(f, n){')
cat_nl('levels(f) <- names(sort(table(f), descreasing = T))')
cat_nl('n <- min(n, length(levels(f)))')
cat_nl('f<- factor(f[f %in% levels(f)[1:n]])')
cat_nl('return(f)')
cat_nl('}')
cat_nl(paste('temp<- data.frame(f = factor(',df_name,'[, ',i,'], levels = names(sort(table(',df_name,'[, ',i,']), decreasing = T))))'))
cat_nl(paste('temp<- data.frame(g = reduce(temp$f, ',num_lev,'))'))
cat_nl('c <- ggplot(temp, aes(x = g, fill = g))')
cat_nl(paste('print(c + geom_bar()+ xlab(names(',df_name,')[',i,']) + opts(axis.text.x=theme_text(angle=-45, hjust=0), title = paste("Bar chart showing ",',min(num_lev, length(levels(as.factor(df[,i])))),',"of",',length(levels(as.factor(df[,i]))), ',"levels of", names(',df_name,')[',i,']), legend.title=theme_blank()))'))
cat_nl('@')
cat_nl('\\newline')
}
if(class(df[, i]) == "logical"){
cat_nl('\\pagebreak')
cat_nl('\\newline')
cat_nl(paste("Variable", var_num, "\\\\"))
cat_nl("------------------------\\\\")
cat_nl(paste("Name:", names(df)[i], "\\\\"))
cat_nl(paste("Class:", classes[i], "\\\\"))
cat_nl(paste("Total obs:", length(df[, i]), "\\\\"))
cat_nl(paste("Missing obs:", length(df[, i])-length(na.omit(df[, i])), "\\\\"))
cat_nl(paste("count of obs = T:", sum(na.omit(df[, i])), "\\\\"))
cat_nl(paste("count of obs = F:", sum(na.omit(df[, i]==F)), "\\\\"))
cat_nl(paste("\\% non missing which = T: ", round(sum(na.omit(df[, i]))/length(na.omit(df[, i]))*100, 3), "\\%", "\\\\", sep = ""))
cat_nl('\\newline')
cat_nl('<<echo = FALSE, fig = TRUE>>=')
# keep only data from the top num_levels
# comprising only the num_levels most populus levels of f, or the whole of f if num_levels > length(levels(f))
cat_nl('reduce <- function(f, n){')
cat_nl('levels(f) <- names(sort(table(f), descreasing = T))')
cat_nl('n <- min(n, length(levels(f)))')
cat_nl('f<- factor(f[f %in% levels(f)[1:n]])')
cat_nl('return(f)')
cat_nl('}')
cat_nl(paste('temp<- data.frame(f = factor(',df_name,'[, ',i,'], levels = names(sort(table(',df_name,'[, ',i,']), decreasing = T))))'))
cat_nl(paste('temp<- data.frame(g = reduce(temp$f, ',num_lev,'))'))
cat_nl('c <- ggplot(temp, aes(x = g, fill = g))')
cat_nl(paste('print(c + geom_bar()+ xlab(names(',df_name,')[',i,']) + opts(axis.text.x=theme_text(angle=-45, hjust=0), title = paste("Bar chart of", names(',df_name,')[',i,']), legend.title=theme_blank()))'))
cat_nl('@')
cat_nl('\\newline')
}
if(class(df[, i]) == "ts"){
cat_nl('\\pagebreak')
cat_nl('\\newline')
cat_nl(paste("Variable", var_num, "\\\\"))
cat_nl("------------------------\\\\")
cat_nl(paste("Name:", names(df)[i], "\\\\"))
cat_nl(paste("Class:", classes[i], "(time series) \\\\"))
cat_nl(paste("Total obs:", length(df[, i]), "\\\\"))
cat_nl(paste("Missing obs:", length(df[, i])-length(na.omit(df[, i])), "\\\\"))
cat_nl(paste("First obs observed at: ",start(df[,i])[1] , ' \\\\', sep = ''))
cat_nl(paste("Last obs observed at: ",end(df[,i])[1] , ' \\\\', sep = ''))
cat_nl(paste("Number of obs per unit time:", frequency(df[,i])))
cat_nl('\\newline')
cat_nl('<<echo = FALSE, fig = TRUE>>=')
cat_nl(paste("plot(", df_name, "[", i, "], ylab = names(", df_name, "[", i, "]), main = names(", df_name, "[", i, "]))", sep = ""))
cat_nl('@')
}
}
cat_nl('\\end{document}')
#return the output to the screen
sink()
Sweave(rnw_file_name)
texi2dvi(tex_file_name, pdf=TRUE)
}
#make a test data set
#x <- iris[,4:5]
#x<- data.frame(x, vir = (x$Species=="setosa"))
#data_dict(x, "C:/Users/Ross/Desktop/test", "x_test", "Ross Farrelly")
# print.ts(.)
## Using July 1954 as start date:
df_1 <- data.frame(
weight = rnorm(100),
height = rnorm(10),
class = as.factor(c(rep("class 1", 90), rep(NA, 2), rep("class 2", 8))),
name = as.character(c(rep("fred", 40), rep("jim",20), rep("sally", 30), rep("bill", 10))),
date = seq(as.Date("2000/1/1"), by="month", length.out=100),
paid = rep(c(F, NA, T, T),25) ,
gnp = ts(cumsum(1 + round(rnorm(100), 2)), start = c(1954, 7), frequency = 12),
stringsAsFactors=FALSE
)
data_dict(df_1, "C:/Users/Ross/Desktop/test", "df_1_test", "Ross Farrelly", 3)
#data_dict(trees , "C:/Users/Ross/Desktop/test", "trees", "Ross Farrelly")
#data_dict(stackloss , "C:/Users/Ross/Desktop/test", "stackloss", "Ross Farrelly")
Comments