vgrabovets
12/12/2016 - 3:14 PM

churn

churn

runQuery = function(query, cyrillic = FALSE){
     if (cyrillic) {
          Sys.setlocale("LC_CTYPE", "russian")
     }
     library(RMySQL)
     mydb = dbConnect(MySQL(), user='WWBQ9O', password='08ZN1D', dbname='megogo2', 
                      host='193.187.76.250')
     dbGetQuery(mydb, "SET NAMES cp1251")
     query.res = dbGetQuery(mydb, query)
     dbDisconnect(mydb)
     return (query.res)
}

add_month_id = function(month_id, num_months) {
     ret_val = month_id - month_id %% 100 + 100 * floor((month_id + num_months) %% 100 / 12.1) + 
          (((month_id + num_months) %% 100) - 12 * floor((month_id + num_months) %% 100 / 12.1))
     return (ret_val)
}
#-------------------------------------------------------------------------------------------------------------------------------------------
library(lubridate)
prev_month_id = year(Sys.Date() %m+% months(-1)) * 100 + month(Sys.Date() %m+% months(-1))

query = "select 
uid,
year(reg_date) * 100 + month(reg_date) month_id_reg
from megogo2.user"

library(dplyr)
users_query = runQuery(query)

# select    
# user_id,    
# region,    
# month_id,    
# sum(duration) duration    
# from   
# (    
#      SELECT             
#      user_id,          
#      case when country_code in ('RU', 'UA') then country_code when country_code in ('UZ','KZ','BY','TJ','TM','KG','AM','GE','MD','LV','AZ','LT','EE') then 'SNG' else 'world' end region,
#      year(date) * 100 + month(date) month_id,           
#      duration     
#      FROM access_to_objects_stat_new          
#      where date >= '2015-09-01' and date <= '2016-11-30'  and user_id > 0        
# ) q    
# group by user_id, region, month_id 

watch_report <- read.table(gzfile("report.gz"), header=F, sep = ",", stringsAsFactors = TRUE)
names(watch_report) = c("user_id", "region", "month_id", "duration")

main = watch_report %>%
       left_join(users_query, by = c("user_id" = "uid")) %>%
       group_by(user_id, month_id, month_id_reg) %>%
       summarise(total_duration = sum(duration)) %>%
       arrange(user_id, month_id)
library(tidyr)

main_mutate = main %>% 
              group_by(user_id) %>%
              mutate(next_period = lead(month_id)) %>%
              filter(month_id != prev_month_id) %>%
              mutate(next_period2 = ifelse((next_period - month_id) %% 88 != 1 | is.na(next_period), 
                                         paste(add_month_id(month_id, 1), " churn"), as.character(next_period))) %>%
              mutate(month_id = as.character(month_id))

new_users = main_mutate %>%
            filter(month_id == month_id_reg) %>%
            mutate(next_period2 = month_id) %>%
            mutate(month_id = 'new user')

#network plot ####
plot = main_mutate %>%
        bind_rows(new_users) %>%
        group_by(month_id, next_period2) %>%
        tally() %>%
        rename(source = month_id, target = next_period2)

node = as.data.frame(unique(c(plot$source, plot$target)), stringsAsFactors=FALSE)
names(node) = c("name")
node$row = as.numeric(rownames(node)) - 1

library(dplyr)
links = plot %>%
     ungroup() %>%
     mutate(group = ifelse(grepl("churn", target), "churn", target),
            source = as.numeric(plyr::mapvalues(source, from = node$name, to = node$row)),
            target = as.numeric(plyr::mapvalues(target, from = node$name, to = node$row))
     ) %>%
     as.data.frame()

library(networkD3)

sankeyNetwork(Links = links, Nodes = node, Source = "source",
              Target = "target", Value = "n", NodeID = "name", LinkGroup = "group",
              units = "qty", fontSize = 12, nodeWidth = 5, margin = list(right=0,bottom=0,left=0,top=0),
              height = 570, width = 1260, iterations = 5) %>%
              saveNetwork("churn_all_geos.html")

#stat ####
stat = plot %>%
       ungroup() %>%
       mutate(target = ifelse(grepl("churn", target), "churn", target)) %>%
       mutate(source2 = source) %>%
       mutate(source = ifelse(source2 == 'new user', target, source),
              target = ifelse(source2 == 'new user', source2, target)
              ) %>%
       mutate(target = ifelse(!target %in% c("new user", "churn"), 'continue', target)) %>%
       select(-source2) %>% 
       spread(key = target, value = n) %>%
       mutate(all_users = continue + churn,
              returning_user = continue + churn - `new user` - lag(continue),
              continue_perc = round(continue / (churn + continue), 2),
              churn_perc = round(churn / (churn + continue), 2),
              new_user_perc = round(`new user` / (churn + continue), 2),
              returning_user_perc = round(returning_user / (churn + continue), 2)
              ) %>%
       rename(month_id = source) %>%
       select(month_id, all_users, `new user`, new_user_perc, returning_user, returning_user_perc, continue, continue_perc,  churn, churn_perc)

names(stat) = c("месяц", "всего", "новые", "новые, %", "вернулись после отсутствия", "вернулись, %", "продолжили использование", "продолжили, %",
                "churn", "churn, %")

#library(xlsx)
xlsx::write.xlsx(as.data.frame(stat), paste0("churn_", prev_month_id, ".xlsx"), row.names = FALSE, sheetName = "all_geos", append = TRUE)


#change geos ####

getmode <- function(v) {
     uniqv <- unique(v)
     uniqv[which.max(tabulate(match(v, uniqv)))]
}

main = watch_report %>%
     left_join(users_query, by = c("user_id" = "uid")) %>%
     group_by(user_id) %>%
     mutate(region = getmode(region)) %>%
     group_by(user_id, month_id, month_id_reg, region) %>%
     summarise(total_duration = sum(duration)) %>%
     arrange(user_id, month_id)
library(tidyr)

#change region------------------------------------------------------------------------------------------------------------------------------
geos = c('UA', 'RU', 'SNG', 'world')
for (geo in geos){
     main_mutate = main %>% 
          filter(region == geo) %>%
          group_by(user_id) %>%
          mutate(next_period = lead(month_id)) %>%
          filter(month_id != prev_month_id) %>%
          mutate(next_period2 = ifelse((next_period - month_id) %% 88 != 1 | is.na(next_period), 
                                       paste(add_month_id(month_id, 1), " churn"), as.character(next_period))) %>%
          mutate(month_id = as.character(month_id))
     
     new_users = main_mutate %>%
          filter(month_id == month_id_reg) %>%
          mutate(next_period2 = month_id) %>%
          mutate(month_id = 'new user')
     
     #network plot ####
     plot = main_mutate %>%
          bind_rows(new_users) %>%
          group_by(month_id, next_period2) %>%
          tally() %>%
          rename(source = month_id, target = next_period2)
     
     node = as.data.frame(unique(c(plot$source, plot$target)), stringsAsFactors=FALSE)
     names(node) = c("name")
     node$row = as.numeric(rownames(node)) - 1
     
     library(dplyr)
     links = plot %>%
          ungroup() %>%
          mutate(group = ifelse(grepl("churn", target), "churn", target),
                 source = as.numeric(plyr::mapvalues(source, from = node$name, to = node$row)),
                 target = as.numeric(plyr::mapvalues(target, from = node$name, to = node$row))
          ) %>%
          as.data.frame()
     
     library(networkD3)
     
     sankeyNetwork(Links = links, Nodes = node, Source = "source",
                   Target = "target", Value = "n", NodeID = "name", LinkGroup = "group",
                   units = "qty", fontSize = 12, nodeWidth = 5, margin = list(right=0,bottom=0,left=0,top=0),
                   height = 570, width = 1260, iterations = 5) %>%
          saveNetwork("churn_world.html")
     
     #stat ####
     stat = plot %>%
          ungroup() %>%
          mutate(target = ifelse(grepl("churn", target), "churn", target)) %>%
          mutate(source2 = source) %>%
          mutate(source = ifelse(source2 == 'new user', target, source),
                 target = ifelse(source2 == 'new user', source2, target)
          ) %>%
          mutate(target = ifelse(!target %in% c("new user", "churn"), 'continue', target)) %>%
          select(-source2) %>% 
          spread(key = target, value = n) %>%
          mutate(all_users = continue + churn,
                 returning_user = continue + churn - `new user` - lag(continue),
                 continue_perc = round(continue / (churn + continue), 2),
                 churn_perc = round(churn / (churn + continue), 2),
                 new_user_perc = round(`new user` / (churn + continue), 2),
                 returning_user_perc = round(returning_user / (churn + continue), 2)
          ) %>%
          rename(month_id = source) %>%
          select(month_id, all_users, `new user`, new_user_perc, returning_user, returning_user_perc, continue, continue_perc,  churn,
                 churn_perc)
     names(stat) = c("месяц", "всего", "новые", "новые, %", "вернулись после отсутствия", "вернулись, %", "продолжили использование", "продолжили, %",
                     "churn", "churn, %")
     
     xlsx::write.xlsx(as.data.frame(stat), paste0("churn_", prev_month_id, ".xlsx"), row.names = FALSE, sheetName = geo, append = TRUE)
}