# R ggplot2 library
# Build a cube-root transformation object for use in ggplot2 scales
# (e.g. scale_x_continuous(trans = cuberoot_trans())).
#
# trans_new() lives in the scales package, which this script never attaches,
# so it is called with an explicit namespace.
# The forward transform keeps the sign of x so that negative inputs map to
# their real cube root instead of NaN; for x >= 0 it equals x^(1/3).
cuberoot_trans <- function() {
  scales::trans_new(
    "cuberoot",
    transform = function(x) sign(x) * abs(x)^(1 / 3),
    inverse = function(x) x^3
  )
}
# Let's try it: diamond price against carat, with the cube-root
# transformation on the x axis and a log10 scale on the y axis.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point() +
  scale_y_log10() +
  scale_x_continuous(trans = cuberoot_trans())
# Normal Q-Q plot of df$y with base graphics. qqline() draws its reference
# line through the first and third quartiles of the sample.
# NOTE(review): `df` is not created in this script; it must already exist.
qqnorm(df$y)
qqline(df$y, col = "red")
# ggplot2 version
# stat_qq_line() (ggplot2 >= 3.0.0) draws the same quartile-based reference
# line as qqline(). A line with intercept mean(df$y) and slope sd(df$y) is a
# different line and would not match the base-graphics plot above.
ggplot(df, aes(sample = y)) +
  stat_qq() +
  stat_qq_line(color = "red")
# Load the birthdays example and parse the `dates` column, which is read as
# month/day/two-digit-year text.
fb <- read.csv("birthdaysExample.csv")
fb$dates <- as.Date(fb$dates, format = "%m/%d/%y")
# Base plot shared by the date histograms that follow.
p_dates <- ggplot(fb, aes(x = dates))
# Histogram by day. On a Date axis the binwidth is measured in days, so
# binwidth = 1 gives one bar per calendar day.
# date_breaks() / date_format() come from the scales package, which this
# script never attaches, hence the explicit namespace.
p_dates +
  geom_histogram(binwidth = 1) +
  scale_x_date(
    breaks = scales::date_breaks("months"),
    labels = scales::date_format("%B")
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Birthdays by day of the year") +
  xlab("Month") +
  ylab("Nº of birthdays")
# Histogram aggregating by month
# The bin edges are a monthly date sequence from the earliest birthday to one
# day past the latest, converted to numbers because stat_bin() takes numeric
# breaks. date_breaks() / date_format() are called through the scales
# namespace since the package is never attached in this script.
p_dates +
  stat_bin(
    breaks = as.numeric(
      seq(min(fb$dates), max(fb$dates) + 1, by = "1 month")
    ),
    position = "identity",
    color = "white"
  ) +
  scale_x_date(
    breaks = scales::date_breaks("months"),
    labels = scales::date_format("%B")
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Birthdays by month") +
  xlab("Month") +
  ylab("Nº of birthdays")
# Histogram aggregating by week
# Same approach as the monthly histogram, with a weekly sequence of bin edges
# and week-number ("%W") axis labels. The scales helpers are namespaced
# because the package is never attached in this script.
p_dates +
  stat_bin(
    breaks = as.numeric(
      seq(min(fb$dates), max(fb$dates) + 1, by = "1 week")
    ),
    position = "identity",
    color = "white"
  ) +
  scale_x_date(
    breaks = scales::date_breaks("weeks"),
    labels = scales::date_format("%W")
  ) +
  ggtitle("Birthdays by week") +
  xlab("Week") +
  ylab("Nº of birthdays")
# Other way of doing it, with cut(): bucket every date into its month / week
# first and then plot the bucketed Date column.
# The scales helpers are namespaced because the package is never attached in
# this script.
fb$month <- as.Date(cut(fb$dates, breaks = "month"))
ggplot(fb, aes(x = month)) +
  geom_histogram(binwidth = 15) +
  scale_x_date(
    breaks = scales::date_breaks("months"),
    labels = scales::date_format("%B")
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
fb$week <- as.Date(cut(fb$dates, breaks = "week"))
ggplot(data = fb, aes(x = week)) +
  geom_histogram(binwidth = 1) +
  scale_x_date(
    breaks = scales::date_breaks("weeks"),
    labels = scales::date_format("Week %W (%b %Y)")
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Yet another approach: derive the aggregation variable explicitly.
# Day of the month, extracted from the date as a number.
fb$day <- as.numeric(format(fb$dates, "%d"))
# Bins are one day wide and start at -0.5, which centres each bar on an
# integer day.
ggplot(data = fb, mapping = aes(x = day)) +
  geom_histogram(origin = -0.5, binwidth = 1, color = "white") +
  scale_x_continuous(breaks = seq(0, 32, 1), limits = c(0, 32)) +
  labs(
    title = "Birthdays by day of the month",
    x = "Day of the month",
    y = "Nº of birthdays"
  )
# Day of the week
# format(x, "%w") gives the weekday as a number (0 = Sunday ... 6 = Saturday)
# whatever the session locale is. weekdays() returns localized names, which
# would all become NA against hard-coded English levels on a non-English
# system.
fb$week_day <- factor(
  as.integer(format(fb$dates, "%w")),
  levels = 0:6,
  labels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  ),
  ordered = TRUE
)
# week_day is discrete, so counts are drawn with geom_bar(); geom_histogram()
# requires a continuous x variable.
ggplot(fb, aes(x = week_day)) +
  geom_bar() +
  ggtitle("Birthdays by day of week") +
  xlab("Day of week") +
  ylab("Nº of birthdays")
# Load the tab-separated pseudo-Facebook data set.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
# Friend count against age with heavy transparency and a sqrt-transformed
# y coordinate. Points are jittered horizontally only: `height = 0` (spelled
# out rather than relying on partial matching of `h`) leaves friend_count
# untouched, so jitter cannot push a count below zero before the sqrt.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
# Conditional Means
# dplyr provides %>%, group_by(), summarise(), n() and arrange(); it is not
# attached anywhere else in this script, so attach it here.
library(dplyr)
# One row per age with the mean and median friend count and the number of
# users, sorted by age. (The median column was previously misspelled
# `fiend_count_median`.)
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
# Mean friend count as a function of age.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()
# Overlaying Summaries with Raw Data
# Raw points in orange, with the per-age mean (black) and the 10th, 50th and
# 90th percentiles (blue; dashed for 10th/90th) drawn on top.
#
# The summary function is passed as `fun` and its extra arguments through
# `fun.args` (ggplot2 >= 3.3.0). When given to geom_line(stat = "summary"),
# `fun.y = ...` and a bare `probs = ...` are reported as unknown parameters
# and ignored, so every line would fall back to the default summary.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    color = "blue"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))
# Correlation between age and friend count, written two equivalent ways.
# 1) Columns referenced through `$`, with the method spelled out.
cor.test(
  pf$age,
  pf$friend_count,
  method = "pearson"
)
# 2) Columns resolved inside with(); the method is left at its default.
with(
  pf,
  cor.test(age, friend_count)
)
# Scatter plots
# likes_received against www_likes_received, with both axes cut at their
# 95th percentile. This plot must end here: a trailing `+` would try to add
# the next ggplot() object to this one, which is an error.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 0.05) +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95))
# Same variables on sqrt-transformed coordinates, limited to 0-250, with the
# 10th / 50th / 90th percentile of y for each x overlaid in blue.
# The summary function goes in `fun` and its arguments in `fun.args`
# (ggplot2 >= 3.3.0); `fun.y` and a bare `probs` are ignored when passed
# through geom_line(stat = "summary").
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 0.1, color = "orange") +
  coord_trans(x = "sqrt", y = "sqrt") +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    color = "blue"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  xlim(c(0, 250)) +
  ylim(c(0, 250))
# Making sense of data
# NOTE(review): `Mitchell` is not created in this script; presumably it is
# the Mitchell data set from the alr3 package. Confirm it is loaded first.
# Temp against Month over the whole series, with a tick every 12 months.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 204, 12))
# Overlay all years by wrapping Month with the modulo operator (%%).
# Month %% 12 only takes the values 0-11, so the breaks cover that range
# (the 0-204 sequence would leave a single tick at 0).
# The mean line uses `fun` (ggplot2 >= 3.3.0); `fun.y` is ignored when passed
# through geom_line(stat = "summary").
# NOTE(review): geom_smooth() used to sit on its own line without a joining
# `+`, so it was never part of any plot; it is attached here. Confirm that
# this is the plot it was meant for.
ggplot(Mitchell, aes(x = Month %% 12, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = 0:11) +
  geom_line(stat = "summary", fun = mean) +
  geom_smooth(method = "lm", color = "red")
# Session setup: plotting packages, default theme and data.
# Packages are installed only when missing, so re-running the script does not
# hit the network or reinstall them every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes", dependencies = TRUE)
}
library(ggthemes)
# Minimal theme with a base font size of 24 for every plot that follows.
theme_set(theme_minimal(base_size = 24))
# Load the tab-separated pseudo-Facebook data set.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
# Histogram
# Day of birth, first with default axis breaks and then with one tick per day.
# dob_day is numeric, so the breaks are set on a continuous scale; a discrete
# scale does not fit a numeric variable and drops the axis labels (or fails,
# depending on the ggplot2 version).
qplot(x = dob_day, data = pf)
qplot(x = dob_day, data = pf) +
  scale_x_continuous(breaks = 1:31)
# Faceting
# facet_wrap(~variable)
# facet_grid(vertical~horizontal)
# Day-of-birth histograms split by month of birth, three ways. dob_day is
# numeric, so the per-day ticks are set with scale_x_continuous(); a discrete
# scale does not fit a numeric variable.
# 1) qplot, panels wrapped into three columns.
qplot(x = dob_day, data = pf) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
# 2) ggplot, panels wrapped with the default layout.
ggplot(data = pf, aes(x = dob_day)) +
  geom_histogram() +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month)
# 3) ggplot, one row of panels per month.
ggplot(data = pf, aes(x = dob_day)) +
  geom_histogram() +
  scale_x_continuous(breaks = 1:31) +
  facet_grid(dob_month ~ .)
# Limiting axes: three equivalent ways to restrict the friend_count
# histogram to the 0-1000 range.
# 1) qplot with its own xlim argument.
qplot(
  x = friend_count,
  data = pf,
  xlim = c(0, 1000)
)
# 2) qplot plus an explicit continuous scale.
qplot(data = pf, x = friend_count) +
  scale_x_continuous(limits = c(0, 1000))
# 3) Full ggplot syntax.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000))
# Adjusting bin width: 25-wide bins on the 0-1000 range, with a tick every 50.
# qplot form.
qplot(
  x = friend_count,
  data = pf,
  binwidth = 25
) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))
# ggplot form.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))
# Omitting NA values: friend_count histograms per gender, with the rows to
# keep selected in three different ways.
# 1) Logical row indexing on the gender column.
qplot(
  x = friend_count,
  data = pf[!is.na(pf$gender), ],
  binwidth = 25
) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(~gender)
# 2) subset() on the same condition.
qplot(
  x = friend_count,
  data = subset(pf, !is.na(gender)),
  binwidth = 25
) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(~gender)
# 3) na.omit(): drops every row with an NA in any column of pf, not only
#    those where gender is missing.
qplot(
  x = friend_count,
  data = na.omit(pf),
  binwidth = 25
) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(~gender)
# Color: tenure histogram with black outlines and a blue fill.
# In qplot the constants are wrapped in I() so they are used as literal
# colours rather than mapped as aesthetics.
qplot(
  x = tenure,
  data = pf,
  binwidth = 30,
  fill = I("#099DD9"),
  color = I("black")
)
# ggplot form: the constants are set directly on the layer.
ggplot(data = pf, mapping = aes(x = tenure)) +
  geom_histogram(fill = "#099DD9", color = "black", binwidth = 30)
# Labeling plots: tenure divided by 365, limited to 0-7 with a tick at each
# whole number, and explicit axis titles.
# qplot form: axis titles are arguments of qplot itself.
qplot(
  x = tenure/365,
  data = pf,
  xlab = "Number of years using Facebook",
  ylab = "Number of users in sample",
  fill = I("#099DD9"),
  color = I("black")
) +
  scale_x_continuous(limits = c(0, 7), breaks = seq(0, 7, 1))
# ggplot form: axis titles are added as a separate component.
ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(fill = "#F79420", color = "black") +
  scale_x_continuous(limits = c(0, 7), breaks = seq(0, 7, 1)) +
  labs(
    x = "Number of years using Facebook",
    y = "Number of users in sample"
  )
# Grids & Transforming data
# gridExtra is installed only when missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra", dependencies = TRUE)
}
library(gridExtra)
# The same friend_count histogram on a linear, a log10 and a sqrt x scale,
# stacked in a single column for comparison.
p1 <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
grid.arrange(p1, p2, p3, ncol = 1)
# Frequency polygons & Color parameter
# Friend-count distributions per gender, with rows lacking a gender removed.
# qplot form: y is each bin's count divided by the sum of the counts.
qplot(
  x = friend_count,
  y = ..count../sum(..count..),
  data = subset(pf, !is.na(gender)),
  geom = "freqpoly",
  binwidth = 10,
  color = gender,
  xlab = "Friend Count",
  ylab = "Proportion of Users with that friend count"
) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))
# ggplot form of the same count / sum(count) ratio (share of total users).
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = friend_count, y = ..count../sum(..count..))
) +
  geom_freqpoly(mapping = aes(color = gender), binwidth = 10) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  labs(
    x = "Friend Count",
    y = "Percentage of users with that friend count"
  )
# Variant using the density statistic, which is computed within each group.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = friend_count, y = ..density..)
) +
  geom_freqpoly(mapping = aes(color = gender), binwidth = 10) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  labs(
    x = "Friend Count",
    y = "Percentage of users with that friend count"
  )
# Box Plots
# Friend count by gender (rows without a gender removed), in qplot and
# ggplot form, first on the full range and then zoomed in.
qplot(
  data = subset(pf, !is.na(gender)),
  x = gender,
  y = friend_count,
  geom = "boxplot"
)
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = friend_count)
) +
  geom_boxplot()
# Zoomed versions: coord_cartesian() restricts the visible y range to 0-1000.
qplot(
  data = subset(pf, !is.na(gender)),
  x = gender,
  y = friend_count,
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 1000))
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = friend_count)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000))
# Free Scales
# Price histogram per cut; scales = "free_y" lets each panel use its own
# y-axis range.
qplot(data = diamonds, x = price) +
  facet_wrap(
    ~cut,
    scales = "free_y"
  )
# RColorBrewer is installed only when missing, so re-running the script does
# not reinstall it every time.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)
# Diamond price against carat, coloured by clarity with a diverging Brewer
# palette. x uses the cube-root transformation defined in this file and y a
# log10 transformation.
# log10_trans() comes from the scales package, which this script never
# attaches, hence the explicit namespace.
ggplot(aes(x = carat, y = price, color = clarity), data = diamonds) +
  geom_point(alpha = 0.5, size = 1, position = "jitter") +
  scale_color_brewer(
    type = "div",
    guide = guide_legend(
      title = "Clarity",
      reverse = TRUE,
      # Draw the legend keys opaque and larger than the plotted points.
      override.aes = list(alpha = 1, size = 2)
    )
  ) +
  scale_x_continuous(
    trans = cuberoot_trans(), limits = c(0.2, 3),
    breaks = c(0.2, 0.5, 1, 2, 3)
  ) +
  scale_y_continuous(
    trans = scales::log10_trans(), limits = c(350, 15000),
    breaks = c(350, 1000, 5000, 10000, 15000)
  ) +
  ggtitle("Price (log10) by Cube-Root of Carat and Clarity")