vgrabovets
11/11/2016 - 4:48 PM

churn all users

churn all users

runQuery = function(query, cyrillic = FALSE){
     if (cyrillic) {
          Sys.setlocale("LC_CTYPE", "russian")
     }
     library(RMySQL)
     mydb = dbConnect(MySQL(), user='WWBQ9O', password='08ZN1D', dbname='megogo2', 
                      host='193.187.76.250')
     dbGetQuery(mydb, "SET NAMES cp1251")
     query.res = dbGetQuery(mydb, query)
     dbDisconnect(mydb)
     return (query.res)
}

add_month_id = function(month_id, num_months) {
     ret_val = month_id - month_id %% 100 + 100 * floor((month_id + num_months) %% 100 / 12.1) + 
          (((month_id + num_months) %% 100) - 12 * floor((month_id + num_months) %% 100 / 12.1))
     return (ret_val)
}

query = "select 
uid,
year(reg_date) * 100 + month(reg_date) month_id_reg
from megogo2.user"

library(dplyr)
users_query = runQuery(query)

# select    
# user_id,    
# region,    
# month_id,    
# sum(duration) duration    
# from   
# (    
#      SELECT             
#      user_id,          
#      case when country_code in ('RU', 'UA') then country_code when country_code in ('UZ','KZ','BY','TJ','TM','KG','AM','GE','MD','LV','AZ','LT','EE') then 'SNG' else 'world' end region,
#      year(date) * 100 + month(date) month_id,           
#      duration     
#      FROM access_to_objects_stat_new          
#      where date >= '2015-09-01' and date <= '2016-10-31'  and user_id > 0        
# ) q    
# group by user_id, region, month_id 

watch_report <- read.table(gzfile("report.gz"), header=F, sep = ",", stringsAsFactors = TRUE)
names(watch_report) = c("user_id", "region", "month_id", "duration")

main = watch_report %>%
       left_join(users_query, by = c("user_id" = "uid")) %>%
       group_by(user_id, month_id, month_id_reg) %>%
       summarise(total_duration = sum(duration)) %>%
       arrange(user_id, month_id)
library(tidyr)

main_mutate = main %>% 
              group_by(user_id) %>%
              mutate(next_period = lead(month_id)) %>%
              filter(month_id != 201610) %>%
              mutate(next_period2 = ifelse((next_period - month_id) %% 88 != 1 | is.na(next_period), 
                                         paste(add_month_id(month_id, 1), " churn"), as.character(next_period))) %>%
              mutate(month_id = as.character(month_id))

new_users = main_mutate %>%
            filter(month_id == month_id_reg) %>%
            mutate(next_period2 = month_id) %>%
            mutate(month_id = 'new user')

#network plot ####
plot = main_mutate %>%
        bind_rows(new_users) %>%
        group_by(month_id, next_period2) %>%
        tally() %>%
        rename(source = month_id, target = next_period2)

node = as.data.frame(unique(c(plot$source, plot$target)), stringsAsFactors=FALSE)
names(node) = c("name")
node$row = as.numeric(rownames(node)) - 1

library(dplyr)
links = plot %>%
     ungroup() %>%
     mutate(group = ifelse(grepl("churn", target), "churn", target),
            source = as.numeric(plyr::mapvalues(source, from = node$name, to = node$row)),
            target = as.numeric(plyr::mapvalues(target, from = node$name, to = node$row))
     ) %>%
     as.data.frame()

library(networkD3)

sankeyNetwork(Links = links, Nodes = node, Source = "source",
              Target = "target", Value = "n", NodeID = "name", LinkGroup = "group",
              units = "qty", fontSize = 12, nodeWidth = 5, margin = list(right=0,bottom=0,left=0,top=0),
              height = 570, width = 1260, iterations = 5) %>%
              saveNetwork("churn_all_geos.html")

#stat ####
stat = plot %>%
       ungroup() %>%
       mutate(target = ifelse(grepl("churn", target), "churn", target)) %>%
       mutate(source2 = source) %>%
       mutate(source = ifelse(source2 == 'new user', target, source),
              target = ifelse(source2 == 'new user', source2, target)
              ) %>%
       mutate(target = ifelse(!target %in% c("new user", "churn"), 'continue', target)) %>%
       select(-source2) %>% 
       spread(key = target, value = n) %>%
       mutate(all_users = continue + churn,
              returning_user = continue + churn - `new user` - lag(continue),
              continue_perc = round(continue / (churn + continue), 2),
              churn_perc = round(churn / (churn + continue), 2),
              new_user_perc = round(`new user` / (churn + continue), 2),
              returning_user_perc = round(returning_user / (churn + continue), 2)
              ) %>%
       rename(month_id = source) %>%
       select(month_id, all_users, `new user`, new_user_perc, continue, continue_perc, returning_user, returning_user_perc, churn,
              churn_perc)

library(xlsx)
write.xlsx(as.data.frame(stat), "churn.xlsx", row.names = FALSE, sheetName = "all_geos", append = TRUE)


#change geos ####

getmode <- function(v) {
     uniqv <- unique(v)
     uniqv[which.max(tabulate(match(v, uniqv)))]
}

main = watch_report %>%
     left_join(users_query, by = c("user_id" = "uid")) %>%
     group_by(user_id) %>%
     mutate(region = getmode(region)) %>%
     group_by(user_id, month_id, month_id_reg, region) %>%
     summarise(total_duration = sum(duration)) %>%
     arrange(user_id, month_id)
library(tidyr)

#change region
main_mutate = main %>% 
     filter(region == 'world') %>%
     group_by(user_id) %>%
     mutate(next_period = lead(month_id)) %>%
     filter(month_id != 201610) %>%
     mutate(next_period2 = ifelse((next_period - month_id) %% 88 != 1 | is.na(next_period), 
                                  paste(add_month_id(month_id, 1), " churn"), as.character(next_period))) %>%
     mutate(month_id = as.character(month_id))

new_users = main_mutate %>%
     filter(month_id == month_id_reg) %>%
     mutate(next_period2 = month_id) %>%
     mutate(month_id = 'new user')

#network plot ####
plot = main_mutate %>%
     bind_rows(new_users) %>%
     group_by(month_id, next_period2) %>%
     tally() %>%
     rename(source = month_id, target = next_period2)

node = as.data.frame(unique(c(plot$source, plot$target)), stringsAsFactors=FALSE)
names(node) = c("name")
node$row = as.numeric(rownames(node)) - 1

library(dplyr)
links = plot %>%
     ungroup() %>%
     mutate(group = ifelse(grepl("churn", target), "churn", target),
            source = as.numeric(plyr::mapvalues(source, from = node$name, to = node$row)),
            target = as.numeric(plyr::mapvalues(target, from = node$name, to = node$row))
     ) %>%
     as.data.frame()

library(networkD3)

sankeyNetwork(Links = links, Nodes = node, Source = "source",
              Target = "target", Value = "n", NodeID = "name", LinkGroup = "group",
              units = "qty", fontSize = 12, nodeWidth = 5, margin = list(right=0,bottom=0,left=0,top=0),
              height = 570, width = 1260, iterations = 5) %>%
     saveNetwork("churn_world.html")

#stat ####
stat = plot %>%
     ungroup() %>%
     mutate(target = ifelse(grepl("churn", target), "churn", target)) %>%
     mutate(source2 = source) %>%
     mutate(source = ifelse(source2 == 'new user', target, source),
            target = ifelse(source2 == 'new user', source2, target)
     ) %>%
     mutate(target = ifelse(!target %in% c("new user", "churn"), 'continue', target)) %>%
     select(-source2) %>% 
     spread(key = target, value = n) %>%
     mutate(all_users = continue + churn,
            returning_user = continue + churn - `new user` - lag(continue),
            continue_perc = round(continue / (churn + continue), 2),
            churn_perc = round(churn / (churn + continue), 2),
            new_user_perc = round(`new user` / (churn + continue), 2),
            returning_user_perc = round(returning_user / (churn + continue), 2)
     ) %>%
     rename(month_id = source) %>%
     select(month_id, all_users, `new user`, new_user_perc, returning_user, returning_user_perc, continue, continue_perc,  churn,
            churn_perc)
names(stat) = c("месяц", "всего", "новые", "новые, %", "вернулись после отсутствия", "вернулись, %", "продолжили использование", "продолжили, %",
                "churn", "churn, %")

library(xlsx)
write.xlsx(as.data.frame(stat), "churn.xlsx", row.names = FALSE, sheetName = "world", append = TRUE)