simonthompson99
9/27/2019 - 4:18 PM

dplyr Cheatsheet

[dplyr Cheatsheet] Some commands for dplyr #r #dplyr #cheatsheet

#-- convert df to tibble
# tbl_df() is deprecated in current dplyr; as_tibble() is the supported
# replacement and is re-exported by dplyr, so no extra library() call is needed.
tbl <- as_tibble(df)

#-- aggregate by multiple groups
# Template: keep the rows of interest, group by one or more variables, then
# compute one or more summary columns per group. Replace each <...> placeholder
# before running.
d <- df %>%
  filter(<filter_conditional>) %>%
  group_by(<grouping_variable1>, ...) %>%
  summarise(<out_var> = <out_var_function_call>, ...)

#-- get crosstabs table and replace low counts
# One row per gmc/ldp pair, one column per disease, cells holding row counts.
# Cells with a count of 1 to 5 are replaced by a string label, so the count
# columns come back as character. Combinations absent from `d` are left as NA
# by spread() and are not masked.
tab_ldp <- d %>%
  rename(dis = normalised_disease_name) %>%
  group_by(gmc, ldp, dis) %>%
  summarise(n = n()) %>%
  spread(dis, n) %>%
  # Still grouped by gmc/ldp at this point, which keeps mutate_if() away from
  # those identifier columns even if they happen to be numeric.
  # 1:5 includes 5, so the mask label has to read "<=5" rather than "<5".
  mutate_if(is.numeric, function(x) replace(x, x %in% 1:5, "<=5")) %>%
  ungroup() # return a plain tibble so later verbs are not run per group

#-- keep only the columns whose names begin with "m"
d <- select(df, starts_with("m"))

#-- mutate weight to be in kgs and create new var
# mtcars$wt is recorded in units of 1000 lb, so multiplying by 1000 alone
# yields pounds; 0.45359237 kg per lb completes the conversion to kilograms.
# good_mpg flags cars doing more than 25 mpg as "good", all others as "bad".
mtcars <- mtcars %>%
  mutate(
    wt = wt * 1000 * 0.45359237,
    good_mpg = ifelse(mpg > 25, "good", "bad")
  )

#-- equivalent of do.call("rbind", list(dfs))
# Data frames can be passed as separate arguments ...
bind_rows(df1, df2)
# ... or collected in a single list.
bind_rows(list(df1, df2))
# .id names a new column ("id" here) that records which list element each row
# came from, preserving the original list names.
bind_rows(list_of_dfs, .id = "id")

#-- get the first or last or nth row of the group
# Sort so the largest order_var comes first, then keep one row per group.
# slice(1) is the first row of each group; slice(n()) would be the last one
# (n() is the number of rows in the group) and slice(k) the k-th.
d <- df %>%
  arrange(desc(order_var)) %>%
  group_by(grp_var) %>%
  slice(1)
#-- to just get the most recent/max row
# which.max() gives the position of the largest order_var within each group
# (the first one on ties), and slice() keeps the row at that position.
df <- df %>%
  group_by(grp_var) %>%
  slice(which.max(order_var))


#-- long list to crosstab/pivot
# Count rows per week/lab pair, then spread the labs out into columns so that
# each week becomes a single row of per-lab counts.
out <- d %>%
  group_by(week, lab) %>%
  summarise(count = n()) %>%
  spread(lab, count)

#-- change class of a number of specific columns (specified in a vector of column names)
# convert_to_numeric is a character vector naming the columns to convert.
# funs() is deprecated (since dplyr 0.8.0); mutate_at() accepts the function
# itself, so as.numeric is passed directly.
d <- d %>% mutate_at(vars(one_of(convert_to_numeric)), as.numeric)