ippromek
2/2/2018 - 12:52 PM

Shiny приложение для аналитики squid

Shiny приложение для аналитики squid

library(tidyverse)
library(lubridate)
library(glue)
library(scales)
library(forcats)
library(readxl)
library(magrittr)
library(stringi)
library(futile.logger)
library(jsonlite)
library(Cairo)
library(RColorBrewer)
library(extrafont)
library(hrbrthemes)
library(shiny)
library(shinyjqui)
library(shinythemes)
library(shinyBS)
library(shinyjs)
library(shinyWidgets)
library(shinycssloaders)
library(anytime)
library(tictoc)
library(assertr)
library(checkmate)

options(shiny.reactlog=TRUE)
options(spinner.type=4)

eval(base::parse("funcs.R", encoding="UTF-8"))

# ================================================================
ui <- 
  navbarPage(
  title="Squid статистика",
  tabPanel("Статистика", value="general_panel"),
  tabPanel("Настройки", value="config_panel"),
  id="tsp",
  theme=shinytheme("yeti"),
  tags$head(tags$style(".rightAlign{float:right;}")), 
  conditionalPanel(
    # general panel -----------------------
    condition="input.tsp=='general_panel'",
    fluidRow(
      column(10, {}),
      column(2, actionButton("process_btn", "Загрузить лог", class='rightAlign'))
      ),
    tabsetPanel(
      id="main_panel",
      selected="graph_tab",
      tabPanel("Таблица", value="table_tab",
               p(),
               mainPanel(
                 fluidRow(
                   column(6, div(withSpinner(DT::dataTableOutput("url_volume_table"))), 
                          style="font-size: 90%"),
                   column(6, div(withSpinner(DT::dataTableOutput("url_host_volume_table"))), 
                          style="font-size: 90%")
                 ), width=10), 
      # ----------------
               sidebarPanel(
                 selectInput("depth_filter", "Глубина данных",
                             choices=c('10 минут'=10,
                                       '1 час'=60, 
                                       '24 часа'=24*60)
                 ),
                 width=2)
      ),
      tabPanel("График", value="graph_tab",
               fluidRow(
                 p(),
                 jqui_sortabled(
                   div(id='top10_plots',
                 column(4, div(withSpinner(plotOutput('top10_left_plot', height="400px")))),
                 column(4, div(withSpinner(plotOutput('top10_center_plot', height="400px")))),
                 column(4, div(withSpinner(plotOutput('top10_right_plot', height="400px"))))
                       ))
                 ),
               fluidRow(
                 column(12, div(withSpinner(plotOutput('timeline_plot', height="400px"))))
               )
               )
      
      )
  ),
 conditionalPanel(
    # config panel -----------------------
    condition = "input.tsp=='config_panel'",
    fluidRow(
      column(2, actionButton("set_test_dates_btn", "На демо дату", class='rightAlign'))
    ),
    fluidRow(
      column(12, verbatimTextOutput("info_text"))
    )
 ),
  shinyjs::useShinyjs()  # Include shinyjs
)

# ================================================================
server <- function(input, output, session) {
  # статические переменные ------------------------------------------------
  log_name <- "app.log"
  
  flog.appender(appender.tee(log_name))
  flog.threshold(TRACE)

  # реактивные переменные -------------------
  raw_df <- reactive({
    input$process_btn # обновлять будем вручную
    # загрузим лог squid -------
    # loadSquidLog("./data/acc.log")
    tic()
    data <- httr::content(httr::GET("http://10.0.0.246/access.log"))
    flog.info(glue("Downloading log file from server: {capture.output(toc())}"))
    tic()
    df <- loadSquidLog(data)
    flog.info(glue("Parsing log file from server: {capture.output(toc())}"))
    df
  })  

  squid_df <- reactive({
    req(raw_df()) %>%
      filter(timestamp>now()-days(2))
  })

  url_df <- reactive({
    df <- req(squid_df()) %>%
      filter(timestamp>now()-minutes(as.numeric(input$depth_filter))) %>%
      select(host, bytes, url) %>%
      group_by(url) %>%
      summarise(volume=round(sum(bytes)/1024/1024, 1)) %>% # Перевели в Мб
      arrange(desc(volume))
    df
  })

  url_host_df <- reactive(({
    # посчитаем сводку по отдельным HOST
    req(url_df())
    ids <- req(input$url_volume_table_rows_selected) # проводим анализ при выборе строки в таблице
    flog.info(glue("Selected row num is {ids}. Data row: {capture.output(str(url_df()[ids, ]))}"))
    
    url <- url_df()[[ids, "url"]]
    url_val <- enquo(url) # превратили в строку
    isolate({
      df <- squid_df() %>%
        filter(timestamp>now()-minutes(as.numeric(input$depth_filter))) %>%
        filter(url==!!url_val) %>%
        select(bytes, host) %>%
        group_by(host) %>%
        summarise(volume=round(sum(bytes)/1024/1024, 1)) %>% # Перевели в Мб
        arrange(desc(volume))
    })

    df
  }))  

  msg <- reactiveVal("")

  # таблица с выборкой по каналам ----------------------------
  output$url_volume_table <- DT::renderDataTable({
    df <- req(url_df())

    DT::datatable(df,
                  class='cell-border stripe',
                  rownames=FALSE,
                  filter='bottom',
                  selection=list(mode="single", target="row"),
                  options=list(dom='fltip', 
                               pageLength=7, 
                               lengthMenu=c(5, 7, 10, 15, 50),
                               order=list(list(1, 'desc')))) # нумерация с 0
    })

  # таблица-детализация по URL в разрезе ----------------------------
  output$url_host_volume_table <- DT::renderDataTable({
    df <- req(url_host_df())

    DT::datatable(df,
                  class='cell-border stripe',
                  rownames=FALSE,
                  filter='bottom',
                  options=list(dom='fltip', #autoWidth=TRUE, 
                               pageLength=7, lengthMenu=c(5, 7, 10, 15)))
  })
  
  # график Топ10  -------------
  output$top10_left_plot <- renderPlot({
    df <- req(squid_df()) %>%
      filter(timestamp>now()-days(1))
    plotTopHostDownload(df, subtitle="за последние сутки")
  })

  output$top10_center_plot <- renderPlot({
    df <- req(squid_df()) %>%
      filter(timestamp>now()-minutes(60))
    plotTopHostDownload(df, subtitle="за последний час")
  })

  output$top10_right_plot <- renderPlot({
    df <- req(squid_df()) %>%
      filter(timestamp>now()-minutes(10))
    plotTopHostDownload(df, subtitle="за последние 10 минут")
  })

  output$timeline_plot <- renderPlot({
    df <- req(squid_df()) %>%
      filter(timestamp>now()-days(1)) %>%
      mutate(timegroup=hgroup.enum(timestamp, mins_bin=10)) %>%
      select(timegroup, host, bytes) %>%
      group_by(timegroup, host) %>%
      summarise(volume=sum(bytes)/1024/1024*8/(10*60)) %>% # Перевели в Мбит/с
      top_n(10, volume)
    
    gp <- ggplot(df, aes(timegroup, volume)) + 
      geom_area(aes(fill=host), alpha=0.5, position="stack") +
      scale_color_brewer(palette="Set1") +
      scale_x_datetime(labels=date_format_tz("%d.%m\n%H:%M", tz="Europe/Moscow"),
                       breaks=date_breaks("4 hours"), 
                       minor_breaks=date_breaks("1 hours")) +
      theme_ipsum_rc(base_size=16, axis_title_size=14) +
      xlab("Дата, время") +
      ylab("Скорость, Mbit/s") +
      ggtitle("Динамика трафика за последние 24 часа")
    
    gp
  })
  
  # служебный вывод ---------------------  
  output$info_text <- renderText({
    msg()
  })

}

shinyApp(ui=ui, server=server)
#' Arrange time vector according equidistant time intervals
#'
#' There are two possible ways of arrangement: by hour's intervals OR by
#' minute's interval. Hour's arrangement has high priority. Hour arrangement
#' must be integer (1, 2, ...) or fractional part of hour (0.25 or 0.5 only)
#'
#' @param date POSIXct vector to arrange
#' @param hours_bin Duration (in hours) between arrangement points
#' @param mins_bin Duration (in minutes) between arrangement points
#' @export
hgroup.enum <- function(date, hours_bin=NULL, mins_bin=5){
  # привязываем все измерения, которые попали в промежуток [0, t] к точке измерения.
  # точки измерения могут быть кратны 1, 2, 3, 4, 6, 12 часам, определяется hour.bin
  # отсчет измерений идет с 0:00
  # поправка для лаборатории. для группировки меньше часа допускается указывать числа меньше 1
  # 0.5 -- раз в полчаса.0.25 -- раз в 15 минут
  # если hour.bin=NULL, то идет привязка к интервалам min.bin, заданном в минутах
  # необходим пакет lubridate
  
  tick_time <- date
  
  if (is.null(hours_bin)){
    # привязываем к минутным интервалам
    n <- floor(lubridate::minute(tick_time)/mins_bin)
    dt <- lubridate::floor_date(tick_time, unit="hour") + lubridate::minutes(n * mins_bin)
  }else{
    # привязываем к часовым интервалам
    if (hours_bin < 1 & !(hours_bin %in% c(0.25, 0.5))) hours_bin=1
    n <- floor((lubridate::hour(tick_time)*60 + lubridate::minute(tick_time))/ (hours_bin*60))
    dt <- lubridate::floor_date(tick_time, unit="day") + lubridate::minutes(n * hours_bin*60)
  }
  
  dt
}

date_format_tz <- function(format = "%Y-%m-%d", tz = "UTC") {
  function(x) format(x, format, tz=tz)
}


loadSquidLog <- function(fname){
  checkmate::qassert(fname, "S=1")
  
  raw_df <- read_table2(fname, 
                        col_names=c("timestamp", "duration", "client_address", "result_codes", 
                                    "bytes", "request_method", "url", "user", "hierarcy_code", "type"),
                        col_types=("nicciccccc")
  )
  # browser()
  df0 <- raw_df %>%
    mutate_at(vars(timestamp), anytime, tz="Europe/Moscow") %>%
    mutate(url=stri_replace_all_regex(url, 
                                      pattern=c("^([a-z]*)://", "^www\\.", "([^/]+).+", ":\\d+"),
                                      replacement=c("", "", "$1", ""),
                                      vectorize_all=FALSE)) %>%
    mutate_at(vars(client_address), as.factor) %>%
    mutate(ip=client_address, host=client_address)
  
  df0
}


#' Title
#'
#' @param df raw traffic dataframe limited by time
#' @param subtitle gplot subtitle
#'
#' @return
#' @export
#'
#' @examples
plotTopHostDownload <- function(df, subtitle) {
  flog.info(paste0("Top10 HOST download plot: nrow = ", nrow(df)))
  if(nrow(df)==0) return(NULL)
  
  # -------------- нарисуем Top10 по HOST за последние N минут ----
  df0 <- df %>%
    select(host, bytes, url) %>%
    group_by(host) %>%
    summarise(volume=round(sum(bytes)/1024/1024, 1)) %>% # Перевели в Мб
    top_n(10, volume) %>%
    # может возникнуть ситуация, когда все значения top_n одинаковы. тогда надо брать выборку
    arrange(desc(volume)) %>%
    filter(row_number()<=10) %>%
    # уберем ненужный дребезг, все кто скачали менее 10Мб -- в топку
    filter(!(volume<10 & row_number()>2)) %>%
    mutate(label=format(volume, big.mark=" ")) %>%
    mutate(hostname=glue("{host}\n ФИО")) %>%
    mutate(hostname=fct_reorder(hostname, volume))
    
  
  gp <- ggplot(df0, aes(hostname, volume)) + 
    geom_bar(fill=brewer.pal(n=9, name="Blues")[4], 
             alpha=0.5, stat="identity") +
    geom_label(aes(label=label), fill="white", colour="black", fontface="bold", hjust=+1.1) +
    theme_ipsum_rc(base_size=16, axis_title_size=14, subtitle_size=13) +
    xlab("HOST") +
    ylab("Суммарный Downlink, Мб") +
    ggtitle("ТОП 10 скачивающих", subtitle=subtitle) +
    coord_flip()
  
  gp
}