daniel-s
8/11/2015 - 10:42 AM

R ggplot2 library

R ggplot2 library

# Cube-root transformation for ggplot2 continuous scales.
#
# Returns a `scales` transformation object named 'cuberoot': the forward
# function takes the cube root, the inverse cubes the value.
# The forward function is sign-aware because a plain `x^(1/3)` evaluates to
# NaN for negative x in R, which would not round-trip through the `x^3`
# inverse. For x >= 0 the result is the same as `x^(1/3)`.
# `trans_new` is namespace-qualified so this works without `scales` attached.
cuberoot_trans <- function() {
  scales::trans_new(
    'cuberoot',
    transform = function(x) sign(x) * abs(x)^(1/3),
    inverse = function(x) x^3
  )
}

# Let's try it: price vs. carat with a cube-root x axis and a log10 y axis
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  scale_x_continuous(trans = cuberoot_trans()) +
  scale_y_log10() +
  geom_point()
# Base-graphics normal Q-Q plot of df$y with a red reference line
qqnorm(df$y)
qqline(df$y, col = 'red')

# ggplot2 version: sample quantiles drawn as points, plus a red line whose
# intercept and slope are the sample mean and standard deviation of df$y
ggplot(data = df, mapping = aes(sample = y)) +
  stat_qq() +
  geom_abline(slope = sd(df$y), intercept = mean(df$y), color = 'red')
# Load the birthdays example and parse the date strings
# (format: month/day/two-digit year)
fb <- read.csv('birthdaysExample.csv')
fb$dates <- as.Date(fb$dates, '%m/%d/%y')

# Base plot reused by the date histograms below
p_dates <- ggplot(fb, aes(x = dates))

# Histogram by day: for a Date variable the binwidth is a number of days,
# so binwidth = 1 draws one bar per calendar day
p_dates +
  geom_histogram(binwidth = 1) +
  scale_x_date(breaks = date_breaks('months'),
               labels = date_format('%B')) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle('Birthdays by day of the year') +
  xlab('Month') +
  ylab('Nº of birthdays')
  
# Histogram aggregating by month: bin edges are placed one calendar month
# apart, from the earliest date up to the day after the latest one
p_dates +
  stat_bin(
    breaks = as.numeric(
      seq(from = min(fb$dates), to = max(fb$dates) + 1, by = '1 month')
    ),
    color = 'white',
    position = 'identity'
  ) +
  scale_x_date(labels = date_format('%B'), breaks = date_breaks('months')) +
  labs(title = 'Birthdays by month', x = 'Month', y = 'Nº of birthdays') +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
  
# Histogram aggregating by week: weekly bin edges over the same date range,
# with the axis labelled by week number ('%W')
p_dates +
  stat_bin(
    breaks = as.numeric(
      seq(from = min(fb$dates), to = max(fb$dates) + 1, by = '1 week')
    ),
    color = 'white',
    position = 'identity'
  ) +
  scale_x_date(labels = date_format('%W'), breaks = date_breaks('weeks')) +
  labs(title = 'Birthdays by week', x = 'Week', y = 'Nº of birthdays')
  
# Other way of doing it: cut() buckets each date by month, the bucket label
# is converted back to a Date, and the histogram is drawn over those dates
fb$month <- as.Date(cut(fb$dates, breaks = 'month'))
ggplot(data = fb, mapping = aes(x = month)) +
  geom_histogram(binwidth = 15) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  scale_x_date(labels = date_format('%B'), breaks = date_breaks('months'))

# Same approach with weekly buckets
fb$week <- as.Date(cut(fb$dates, breaks = 'week'))
ggplot(fb, aes(x = week)) +
  geom_histogram(binwidth = 1) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  scale_x_date(labels = date_format('Week %W (%b %Y)'),
               breaks = date_breaks('weeks'))
  
# Other way, creating the aggregated variables
#  Day of the month, extracted from the date as a number
fb$day <- as.numeric(format(fb$dates, '%d'))
# NOTE(review): `origin` is the older ggplot2 name for what newer releases
# call `boundary`; kept as written for the version this script targets.
ggplot(fb, aes(x = day)) +
  geom_histogram(binwidth = 1, color = 'white', origin = -0.5) +
  scale_x_continuous(limits = c(0, 32), breaks = seq(0, 32, 1)) +
  ggtitle('Birthdays by day of the month') +
  xlab('Day of the month') +
  ylab('Nº of birthdays')

#  Day of the week
# '%w' gives the weekday as a digit (0 = Sunday ... 6 = Saturday) in every
# locale. weekdays() returns localized names, which would not match the
# English levels below outside an English locale and would become NA.
fb$week_day <- factor(format(fb$dates, '%w'),
                      levels = as.character(0:6),
                      labels = c("Sunday", "Monday", "Tuesday", "Wednesday",
                                 "Thursday", "Friday", "Saturday"),
                      ordered = TRUE)
# week_day is a factor, so it is counted with geom_bar(); geom_histogram()
# bins a continuous variable and rejects a discrete x in current ggplot2.
ggplot(fb, aes(week_day)) +
  geom_bar() +
  ggtitle('Birthdays by day of week') +
  xlab('Day of week') +
  ylab('Nº of birthdays')
# Load the pseudo-Facebook data set (tab-separated file)
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')

# Friend count against age, ages 13-90, on a square-root y axis.
# `height = 0` (spelled out instead of the partially matched `h`) restricts
# the jitter to the x direction, so y values are never moved below zero,
# where the square root is undefined.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
  
# Conditional Means
# One row per age with the mean and median friend count and the number of
# users of that age, ordered by age.
# The median column was previously misspelled `fiend_count_median`.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()) %>%
  arrange(age)

# Mean friend count as a function of age
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()
  
# Overlaying Summaries with Raw Data
# Orange raw points, with the per-age mean (black) and the 10%, 50% and 90%
# quantiles (blue; outer ones dashed) drawn on top.
# `height = 0` is spelled out instead of relying on partial matching of `h`.
# NOTE(review): `fun.y` was renamed `fun` in ggplot2 3.3.0; kept as written
# for the ggplot2 version this script targets.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0), color = 'orange') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, probs = 0.1, linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, probs = 0.5, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, probs = 0.9, linetype = 2, color = 'blue') +
  # Zoom without dropping data, so the summaries are computed on all rows
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))
  
# Correlation
# Pearson correlation test between age and friend count
cor.test(pf$age, pf$friend_count, method = 'pearson')
with(pf, cor.test(age, friend_count)) # Same test via with(); 'pearson' is cor.test's default method

# Scatter plots
# Likes received through the web vs. all likes received, with both axes cut
# at their 95th percentile.
# The chain ends here: a trailing `+` would try to add the next ggplot()
# object to this plot, which is an error.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 0.05) +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95))

# Same variables on square-root axes limited to 0-250, with the 10%, 50% and
# 90% quantiles of y overlaid in blue (outer ones dashed)
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = .1, color = 'orange') +
  coord_trans(x = 'sqrt', y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = quantile, probs = 0.1, linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, probs = 0.5, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, probs = 0.9, linetype = 2, color = 'blue') +
  xlim(c(0, 250)) +
  ylim(c(0, 250))
  
# Making sense of data
# Temperature against month index, with an axis break every 12 months
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 204, 12))

# Fold the months onto a single year with the modulo operator (Month %% 12),
# then overlay the mean per folded month and a red linear fit.
# Every layer must be joined with `+`; without it geom_smooth() is evaluated
# on its own and never added to the plot.
ggplot(Mitchell, aes(x = Month%%12, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 204, 12)) +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_smooth(method = 'lm', color = 'red')
# Install ggplot2 and ggthemes only when they are missing, so re-running the
# script does not reinstall them every time, then attach them.
if (!requireNamespace('ggplot2', quietly = TRUE)) {
  install.packages('ggplot2')
}
library(ggplot2)

if (!requireNamespace('ggthemes', quietly = TRUE)) {
  install.packages('ggthemes', dependencies = TRUE)
}
library(ggthemes)
# Make the minimal theme with base font size 24 the default for later plots
theme_set(theme_minimal(24))

# Load the pseudo-Facebook data set (tab-separated file)
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')

# Histogram
# Distribution of the day of the month users were born on
qplot(x = dob_day, data = pf)

# Same histogram with an axis break at every day 1-31.
# The breaks are numbers, so they go on a continuous scale; a discrete scale
# does not match a numeric variable.
# NOTE(review): assumes read.csv parsed dob_day as numeric - confirm.
qplot(x = dob_day, data = pf) +
  scale_x_continuous(breaks = 1:31)

# Faceting
#   facet_wrap(~variable)
#   facet_grid(vertical~horizontal)
# Each plot below uses a continuous x scale because the breaks 1:31 are
# numbers; a discrete scale does not match a numeric variable.
# NOTE(review): assumes read.csv parsed dob_day as numeric - confirm.

# One panel per birth month, wrapped into 3 columns
qplot(x = dob_day, data = pf) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)

# ggplot() equivalent with the default wrapping
ggplot(data = pf, aes(x = dob_day)) +
  geom_histogram() +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month)

# One row of panels per birth month
ggplot(data = pf, aes(x = dob_day)) +
  geom_histogram() +
  scale_x_continuous(breaks = 1:31) +
  facet_grid(dob_month ~ .)
  
# Limiting axes: three ways to restrict the friend_count axis to 0-1000
qplot(friend_count, data = pf, xlim = c(0, 1000))

qplot(friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 1000))

ggplot(data = pf, mapping = aes(x = friend_count)) +
  scale_x_continuous(limits = c(0, 1000)) +
  geom_histogram()

# Adjusting bin width: bins of 25 friends, axis breaks every 50
qplot(friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

ggplot(data = pf, mapping = aes(x = friend_count)) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  geom_histogram(binwidth = 25)
  
# Omitting NA values: three ways to drop rows before faceting by gender
# 1) logical row indexing on the gender column
qplot(friend_count, data = pf[!is.na(pf$gender), ], binwidth = 25) +
  facet_wrap(~gender) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

# 2) subset() with the same condition
qplot(friend_count, data = subset(pf, !is.na(gender)), binwidth = 25) +
  facet_wrap(~gender) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

# 3) na.omit(), which drops rows with an NA in any column of pf, not only gender
qplot(friend_count, data = na.omit(pf), binwidth = 25) +
  facet_wrap(~gender) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

# Color: black bar outlines with a blue fill; I() passes the colour through
# as a literal value
qplot(tenure, data = pf, fill = I('#099DD9'), color = I('black'), binwidth = 30)

ggplot(data = pf, mapping = aes(x = tenure)) +
  geom_histogram(fill = '#099DD9', color = 'black', binwidth = 30)

# Labeling plots: tenure divided by 365, shown over 0-7 with explicit axis titles
qplot(tenure/365, data = pf,
      fill = I('#099DD9'), color = I('black'),
      ylab = 'Number of users in sample',
      xlab = 'Number of years using Facebook') +
  scale_x_continuous(limits = c(0, 7), breaks = seq(0, 7, 1))

ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(fill = '#F79420', color = 'black') +
  scale_x_continuous(limits = c(0, 7), breaks = seq(0, 7, 1)) +
  labs(x = 'Number of years using Facebook',
       y = 'Number of users in sample')
  
# Grids & Transforming data
# Install gridExtra only when it is missing, so re-running the script does not
# reinstall it; `TRUE` is spelled out because `T` can be reassigned.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra", dependencies = TRUE)
}
library(gridExtra)

# The same friend_count histogram on a linear, a log10 and a square-root x
# axis, stacked in a single column for comparison
p1 <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
grid.arrange(p1, p2, p3, ncol=1)

# Frequency polygons & Color parameter
# Share of all users (count divided by total count), one line per gender
qplot(x = friend_count, y = ..count../sum(..count..),
      data = subset(pf, !is.na(gender)),
      geom = 'freqpoly',
      binwidth = 10,
      color = gender,
      ylab = 'Proportion of Users with that friend count',
      xlab = 'Friend Count') +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

# ggplot() version of the same plot: percentages of total users
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = friend_count, y = ..count../sum(..count..))) +
  geom_freqpoly(binwidth = 10, mapping = aes(color = gender)) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  labs(x = 'Friend Count',
       y = 'Percentage of users with that friend count')

# Using ..density.. instead: percentages within each group
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = friend_count, y = ..density..)) +
  geom_freqpoly(binwidth = 10, mapping = aes(color = gender)) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  labs(x = 'Friend Count',
       y = 'Percentage of users with that friend count')
  
# Box Plots
# Friend count by gender (rows with a missing gender removed)
qplot(gender, friend_count,
      geom = 'boxplot',
      data = subset(pf, !is.na(gender)))

ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot()

# Same plots zoomed to 0-1000 with coord_cartesian()
qplot(gender, friend_count,
      geom = 'boxplot',
      data = subset(pf, !is.na(gender))) +
  coord_cartesian(ylim = c(0, 1000))

ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000))
  
# Free Scales: one price histogram per cut, each panel with its own y axis
qplot(price, data = diamonds) +
  facet_wrap(~cut, scales = 'free_y')
# Install RColorBrewer only when it is missing, so re-running the script does
# not reinstall it, then attach it.
if (!requireNamespace('RColorBrewer', quietly = TRUE)) {
  install.packages('RColorBrewer')
}
library(RColorBrewer)

# Price against carat coloured by clarity, on a cube-root x axis and a log10
# y axis. Points are jittered and semi-transparent; the legend overrides
# alpha and size so its keys stay readable, and lists the levels in reverse.
# `reverse = TRUE` is spelled out because `T` can be reassigned.
ggplot(aes(x = carat, y = price, color = clarity), data = diamonds) +
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(
    type = 'div',
    guide = guide_legend(
      title = 'Clarity',
      reverse = TRUE,
      override.aes = list(alpha = 1, size = 2)
    )
  ) +
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
    breaks = c(0.2, 0.5, 1, 2, 3)) +
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
    breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')