hivefans
12/4/2016 - 2:33 PM

NginxLineParser.scala

package spark.example

/**
  * Parses single lines of an Nginx access log into [[NginxLogRecord]] values.
  *
  * The line layout encoded by the regular expression below is:
  * {{{
  * <client> - <user> [dd/Mon/yyyy:HH:mm:ss +zzzz] "<request>" <req-time> <bytes> "<referer>" Agent["<ua>"] <upstream-time> <pipe> <status>
  * }}}
  *
  * Created by shidongjie on 2016/12/4.
  */
class NginxLineParser extends Serializable {

  // One capture group per log field, in the order they appear on the line.
  // The pattern is assembled from fragments so each field can be read on its own.
  private val regex = (
    "([^-]*)\\s+-\\s+" +          // client address, followed by the literal "-" separator
      "(\\S+)\\s+" +              // remote user
      // timestamp in brackets; the zone offset may be signed either way (+0800 as well as -0300)
      "\\[(\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}:\\d{2}:\\d{2}:\\d{2}\\s+[+-]\\d{4})\\]\\s+" +
      "\"(.+)\"\\s+" +            // quoted request line
      "(\\d{1,}\\.\\d{3})\\s+" +  // request time: digits, a dot, three decimals
      "(\\d+)\\s+" +              // byte count
      "\"([^\"]+)\"\\s+" +        // quoted referer
      "Agent\\[\"([^\"]+)\"\\]\\s+" + // user agent wrapped in Agent["..."]
      // upstream time: "-" or a decimal number (the dot is matched literally,
      // and the integer part may have more than one digit)
      "(-|\\d+\\.\\d{3,})\\s+" +
      "(\\S+)\\s+" +              // pipe marker
      "(\\d{1,}).*"               // status code; anything after it is ignored
    ).r

  /**
    * Parses one access-log line.
    *
    * The quoted request is split on single spaces; when it yields exactly three
    * parts they populate the verb, URL and HTTP version of the record, otherwise
    * those three fields are set to empty strings.
    *
    * @param record Assumed to be an Nginx access log line.
    * @return `Some(NginxLogRecord)` when the whole line matches the expected
    *         layout, `None` otherwise.
    */
  def parse(record: String): Option[NginxLogRecord] = {

    // Splits "VERB URL VERSION" into its three components, if it has exactly three.
    def parseRequestField(request: String): Option[(String, String, String)] =
      request.split(" ") match {
        case Array(verb, url, version) => Some((verb, url, version))
        case _                         => None
      }

    record match {
      case regex(ip, ruser, datetime, req, reqtime, recbytes, ref, ua, upstreamtime, pipe, status) =>
        // Fall back to empty strings when the request line is not a clean triple.
        val (verb, url, httpVersion) = parseRequestField(req).getOrElse(("", "", ""))
        Some(
          NginxLogRecord(
            ip,
            ruser,
            datetime,
            verb,
            url,
            httpVersion,
            reqtime,
            recbytes,
            ref,
            ua,
            upstreamtime,
            pipe,
            status
          )
        )
      case _ => None
    }
  }
}

// Companion object of the NginxLineParser class; it currently declares no members.
object NginxLineParser
package spark.example

/**
  * One parsed line of an Nginx access log. Every field is kept as the raw
  * string taken from the log line; no numeric or date conversion is applied.
  *
  * Created by shidongjie on 2016/12/4.
  *
  * @param clientIpAddress       should be an IP address, but may also be the hostname
  *                              if hostname lookups are enabled
  * @param remoteUser            remote user, typically `-`
  * @param dateTime              timestamp in the form `day/month/year:hour:minute:second zone`
  * @param verb                  HTTP verb (GET, POST, etc.)
  * @param URL                   resource that was accessed
  * @param HTTPVersion           HTTP version token of the request (1.1, 1.0)
  * @param RequestProcessingTime request time as logged, e.g. `0.000` (unit not confirmed here)
  * @param ReceivedBytes         byte count logged for the request
  * @param URLReferer            referer URL
  * @param UserAgent             user agent string
  * @param UpstreamResponseTime  upstream response time, typically `-`
  * @param Pipe                  pipe marker, typically `.`
  * @param ResponseCode          HTTP status code
  */
case class NginxLogRecord(
  clientIpAddress: String,
  remoteUser: String,
  dateTime: String,
  verb: String,
  URL: String,
  HTTPVersion: String,
  RequestProcessingTime: String,
  ReceivedBytes: String,
  URLReferer: String,
  UserAgent: String,
  UpstreamResponseTime: String,
  Pipe: String,
  ResponseCode: String
)
package spark.example

import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo entry point: reads an Nginx access log as text, parses each line with
  * [[NginxLineParser]], registers the result as the temporary view `nginxlog`
  * and prints the client address and URL columns.
  */
object WordCount {

  /** Simple name/age pair; `main` in this object does not reference it. */
  case class Person(name: String, age: Long)

  /**
    * Runs the job on a local four-thread master against `./src/nginx.log`.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {

    // The application name is carried by the SparkConf, so it is set only once.
    val sparkConf = new SparkConf().setAppName("sparktopn").setMaster("local[4]")
    val spark = SparkSession
      .builder()
      .config(sparkConf)
      .getOrCreate()

    try {
      import spark.implicits._

      val parser = new NginxLineParser
      // flatMap drops lines the parser rejects, so the Dataset holds plain
      // NginxLogRecord rows rather than Option-wrapped ones.
      val nginxDF = spark.read.textFile("./src/nginx.log")
        .flatMap(line => parser.parse(line).toList)
        .toDF()
      nginxDF.createOrReplaceTempView("nginxlog")
      val logDF = spark.sql("select clientIpAddress,URL from nginxlog")
      logDF.show()
    } finally {
      // Release the session even when reading or querying fails.
      spark.stop()
    }
  }

}
// sbt build definition for the "sparkdemo" project.
name := "sparkdemo"

version := "1.0"

scalaVersion := "2.11.8"

// Single Spark release shared by every Spark module listed below.
val sparkVersion = "2.0.2"

// `%%` appends the Scala binary version derived from scalaVersion (`_2.11` here),
// so the artifact suffix cannot drift from the compiler version.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-streaming" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-mllib" % sparkVersion
)
192.168.0.237 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/lg.php?bannerid=9388 HTTP/1.1" 0.000  43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 2.2; pt-br; GT-I5500B"] - . 200
192.168.0.237 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/lg.php?bannerid=9389&campaignid=680&zoneid=1619&loc=1&referer=http%3A%2F%2Fwap.tim.com.br%2Flp%2Fsbappsclub&cb=e84e481818 HTTP/1.1" 0.000  43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 2.2; pt-br; GT-I5500B Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"] - . 200
192.168.0.237 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/lg.php?bannerid=9432&campaignid=681&zoneid=1637&loc=1&referer=http%3A%2F%2Fwap.tim.com.br%2Flp%2Fsbappsclub&cb=5455b229b0 HTTP/1.1" 0.000  43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 2.2; pt-br; GT-I5500B Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/new_spc.php?zones=1638, HTTP/1.1" 0.000  426 "-" Agent["Mozilla/5.0 (Linux; U; Android 4.1.2; pt-br; LG-P655H Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/lg.php?bannerid=9348&campaignid=676&zoneid=1554&loc=1&cb=da3d687dcc HTTP/1.1" 0.000  43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 4.1.2; pt-br; LG-E467f Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/lg.php?bannerid=9310&campaignid=655&zoneid=1630&loc=1&cb=cb81d8936c HTTP/1.1" 0.000  43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 2.3.6; pt-br; GT-S5830B Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/new_spc.php?zones=1638, HTTP/1.1" 0.000  426 "-" Agent["Mozilla/5.0 (Linux; U; Android 2.3.4; pt-br; GT-S5830B Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/lg.php?bannerid=9432&campaignid=681&zoneid=1637&loc=1&cb=fe8e6f00b0 HTTP/1.1" 0.000  43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (LG-T375 AppleWebkit/531 Browser/Phantom/V2.0 Widget/LGMW/3.0 MMS/LG-MMS-V1.0/1.2 Java/ASVM/1.1 Profile/MIDP-2.1 Configuration/CLDC-1.1)"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/lg.php?bannerid=9349&campaignid=676&zoneid=1555&loc=1&cb=91e0c8a58b HTTP/1.1" 0.000  43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 4.1.2; pt-br; LG-E467f Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300]  "GET /adserver/www/delivery/lg.php?bannerid=9382&campaignid=679&zoneid=1519&cb=40f1441eb8 HTTP/1.1" 0.000  43 "-" Agent["MOT-EX128 Obigo/WAP2.0 MIDP-2.0/CLDC-1.1"] - . 200