samyishak
11/1/2015 - 6:44 AM

Use this code to turn a list of SQL queries into a visual of how your database is used.

Use this code to turn a list of SQL queries into a visual of how your database is used.


## Install packages if you don't already have them
install.packages(c("stringr", "tm", "igraph"), dependencies = TRUE)

## Load the packages
library(stringr)
library(tm)
library(igraph)

## Read in the data
queries <- read.csv("~/Downloads/queries.csv")
tables <- read.csv("~/Downloads/tables.csv")

# See what tables look like
head(queries,1)
head(tables)

## Create an empty data frame, with rows equal to number of queries being analyzed
x <- data.frame(rep(0,length(queries$query)))

## Populate data frame
# In the data frame x, each row is a query and each column is the name of a table from your database; each cell is an indicator for whether a particular table name appears in a particular query
for (i in 1:length(tables$tables)) {
  x[,i] <- str_extract(queries$query, as.character(tables$tables[i]))
}

## Name columns and replace NAs
colnames(x) <- tables$tables
x[is.na(x)] <- 0

##Concatenate each row into a single string
x_args <- c(x, sep = " ")
x$list <- do.call(paste, x_args)

## Clean out zeroes and whitespace
x$list <- str_trim(gsub(0, "", x$list))
x$list <- gsub("[ ]+", " ", x$list)

## Check out data frame
x$list[520:524]

## Convert x$list into a corpus of documents
corpus <- Corpus(VectorSource(x$list))


## Create term-document matrix
tdm <- TermDocumentMatrix(corpus)
inspect(tdm)
termDocMatrix <- as.matrix(tdm)

## Create term-term matrix
termMatrix <- termDocMatrix %*% t(termDocMatrix)

## Create graph-adjacency matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")

## Remove loops
g <- simplify(g)

## Set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

## Plot a Graph
plot(g, layout=layout_in_circle(g))

## Plot a prettier graph
V(g)$label.cex <- 3 * (0.06125 * V(g)$degree / max(V(g)$degree) + .2)
V(g)$label.color <- rgb(0, 0, .2, .49 * V(g)$degree / max(V(g)$degree) + .5)
V(g)$frame.color <- rgb(0, 0, .2, .39 * V(g)$degree / max(V(g)$degree) + .6)
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4)
#E(g)$color <- rgb(0, .3, .6, egam)
E(g)$color <- rgb((colorRamp(c("blue", "yellow", "red"))(E(g)$weight/max(E(g)$weight)))/255, alpha=egam)
E(g)$width <- egam

plot(g, layout=layout_in_circle(g), vertex.color = rgb((colorRamp(c("blue", "yellow", "red"))(degree(g)/max(degree(g))))/255), vertex.size = ((V(g)$degree)*2/3)+2, edge.width = 6 * E(g)$weight / max(E(g)$weight))