cloudspotter86
7/6/2018 - 7:03 PM

dplyr_joins

Playing with filtering joins

##############################################
# Playing around with filtering joins in dplyr
##############################################

library(dplyr)

# Create dat and dat2
cities = c("Vancouver", "Toronto", "Ottawa", "Halifax")

dat = seq(as.Date("2002-01-01", format = "%Y-%m-%d"),as.Date("2012-01-01", format = "%Y-%m-%d"), by = "day")
dat = as.data.frame(dat) ;names(dat) = "date"
dat$value = sample(100, size = nrow(dat), replace = TRUE)
dat$city = sample(cities, size = nrow(dat), replace = TRUE)


dat2 = seq(as.Date("2002-01-01", format = "%Y-%m-%d"),as.Date("2012-01-01", format = "%Y-%m-%d"), by = "day")
dat2 = as.data.frame(dat2); names(dat2) = c("date", "name")
dat2$name = rep(c("cat", "dog", "fish"), length.out = 3653)
dat2$city = sample(cities, size = nrow(dat), replace = TRUE)



# left join based on date only
dat_joined = left_join(dat, dat2, by = "date")


# left join, filtering dates from dat2 (keeps all rows in dat)
dat_joined2 = left_join(dat, filter(dat2, date < as.Date("2002-01-08", format = "%Y-%m-%d")), by = "date")
# vs semi join (only keeps matching rows)
dat_joined2.1 = semi_join(dat, filter(dat2, date < as.Date("2002-01-08", format = "%Y-%m-%d")), by = "date")

# semi join, includes all rows in dat that have a match in dat2 based on date and city
dat_joined3 = left_join(dat, dat2, by = c("date", "city")) # keeps all rows in dat
dat_joined3.1 = semi_join(dat, dat2, by = c("date", "city")) # only keeps rows in dat that match dat2

# join based on dates and city, filtering dates too
dat_joined4 = left_join(dat, filter(dat2, date < as.Date("2002-01-08", format = "%Y-%m-%d")), by = c("date", "city"))
dat_joined4.1 = semi_join(dat, filter(dat2, date < as.Date("2002-01-08", format = "%Y-%m-%d")), by = c("date", "city"))

rm(dat, dat2, list = ls(pattern = "dat_joined"))