package spark.example
/**
* Created by shidongjie on 2016/12/4.
*/
/**
 * Parses single lines of an Nginx access log into [[NginxLogRecord]] values.
 *
 * The expected line layout, as encoded by the pattern below, is:
 * {{{
 * <client> - <user> [dd/Mon/yyyy:HH:mm:ss +zzzz] "<request>" <s.mmm> <bytes> "<referer>" Agent["<ua>"] <upstream|-> <pipe> <status>
 * }}}
 *
 * The class is `Serializable` so that an instance can be captured by a closure
 * (for example the Dataset transformation in `WordCount`).
 */
class NginxLineParser extends Serializable {

  // The pattern is assembled piece by piece; each piece captures exactly one log field.
  private val regex = (
    "([^-]*)\\s+-\\s+" +                 // client address, followed by the literal '-' separator
    "(\\S+)\\s+" +                       // remote user
    "\\[(\\d{2}\\/[a-zA-Z]{3}\\/\\d{4}:\\d{2}:\\d{2}:\\d{2}\\s+[+-]\\d{4})\\]\\s+" + // timestamp; zone offset may be + or -
    "\"(.+)\"\\s+" +                     // quoted request line
    "(\\d{1,}\\.\\d{3})\\s+" +           // request time
    "(\\d+)\\s+" +                       // byte count
    "\"([^\"]+)\"\\s+" +                 // quoted referer
    "Agent\\[\"([^\"]+)\"\\]\\s+" +      // user agent wrapped as Agent["..."]
    "(-|\\d+\\.\\d{3,})\\s+" +           // upstream response time, or '-' when absent
    "(\\S+)\\s+" +                       // pipe marker
    "(\\d{1,}).*"                        // status code; anything after it is ignored
  ).r

  /**
   * Splits a request line into its three space-separated parts.
   *
   * @param request the text found between the quotes of the request field
   * @return `Some((verb, url, httpVersion))` when the text splits into exactly
   *         three parts on single spaces, otherwise `None`
   */
  private def parseRequestField(request: String): Option[(String, String, String)] =
    request.split(" ") match {
      case Array(verb, url, httpVersion) => Some((verb, url, httpVersion))
      case _ => None
    }

  /**
   * @param record Assumed to be an Nginx access log line.
   * @return `Some(NginxLogRecord)` when the line matches the expected layout,
   *         `None` otherwise.
   */
  def parse(record: String): Option[NginxLogRecord] =
    record match {
      case regex(ip, ruser, datetime, req, reqtime, recbytes, ref, ua, upstreamtime, pipe, status) =>
        // A request that is not exactly "<verb> <url> <version>" still produces a
        // record; its verb, URL and HTTP version are left empty.
        val (verb, url, httpVersion) = parseRequestField(req).getOrElse(("", "", ""))
        Some(
          NginxLogRecord(
            ip,
            ruser,
            datetime,
            verb,
            url,
            httpVersion,
            reqtime,
            recbytes,
            ref,
            ua,
            upstreamtime,
            pipe,
            status
          )
        )
      case _ => None
    }
}

// Companion object; currently has no members.
object NginxLineParser
package spark.example
/**
* Created by shidongjie on 2016/12/4.
*/
/**
 * One parsed line of an Nginx access log. All fields are kept as strings.
 *
 * @param clientIpAddress       should be an IP address, but may also be the hostname if
 *                              hostname lookups are enabled
 * @param remoteUser            remote user; typically '-'
 * @param dateTime              timestamp in the form day/month/year:hour:minute:second zone
 * @param verb                  HTTP verb (GET, POST, etc.)
 * @param URL                   resource accessed
 * @param HTTPVersion           HTTP version of the request
 * @param RequestProcessingTime request time (originally noted as milliseconds; unit not
 *                              verifiable from this declaration -- TODO confirm)
 * @param ReceivedBytes         byte count logged for the response
 * @param URLReferer            referer URL
 * @param UserAgent             user agent string
 * @param UpstreamResponseTime  upstream response time; typically '-'
 * @param Pipe                  pipe marker; typically '.'
 * @param ResponseCode          HTTP status code
 */
case class NginxLogRecord(
  clientIpAddress: String,
  remoteUser: String,
  dateTime: String,
  verb: String,
  URL: String,
  HTTPVersion: String,
  RequestProcessingTime: String,
  ReceivedBytes: String,
  URLReferer: String,
  UserAgent: String,
  UpstreamResponseTime: String,
  Pipe: String,
  ResponseCode: String
)
package spark.example
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demo entry point: reads an Nginx access log as text, parses each line with
 * `NginxLineParser`, registers the result as the temporary view `nginxlog`
 * and prints the client address and URL columns.
 */
object WordCount {

  // Not used by the code visible in this object; kept so that any existing
  // reference to WordCount.Person keeps compiling.
  case class Person(name: String, age: Long)

  /**
   * Runs the demo against `./src/nginx.log` on a local 4-thread master.
   *
   * @param args command-line arguments; not read
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("sparktopn").setMaster("local[4]")
    val spark = SparkSession
      .builder()
      .config(sparkConf)
      .appName("sparktopn")
      .getOrCreate()
    import spark.implicits._

    try {
      val parser = new NginxLineParser
      // flatMap keeps only the lines that parsed: the Dataset element type is the
      // case class itself (not an Option around it), so its fields become the
      // columns queried below and unparsable lines cannot produce null rows.
      val nginxDF = spark.read.textFile("./src/nginx.log")
        .flatMap(line => parser.parse(line).toSeq)
        .toDF()
      nginxDF.createOrReplaceTempView("nginxlog")
      val logDF = spark.sql("select clientIpAddress,URL from nginxlog")
      logDF.show()
    } finally {
      // Release the session even when reading or querying fails.
      spark.stop()
    }
  }
}
// sbt build definition for the Spark demo project.
name := "sparkdemo"

version := "1.0"

scalaVersion := "2.11.8"

// Single Spark version shared by all Spark modules below.
val sparkVersion = "2.0.2"

// `%%` appends the Scala binary version taken from scalaVersion (here "_2.11")
// to each artifact name, so the suffix cannot drift from scalaVersion.
libraryDependencies ++= Seq(
  "spark-core",
  "spark-streaming",
  "spark-sql",
  "spark-mllib"
).map(module => "org.apache.spark" %% module % sparkVersion)
192.168.0.237 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/lg.php?bannerid=9388 HTTP/1.1" 0.000 43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 2.2; pt-br; GT-I5500B"] - . 200
192.168.0.237 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/lg.php?bannerid=9389&campaignid=680&zoneid=1619&loc=1&referer=http%3A%2F%2Fwap.tim.com.br%2Flp%2Fsbappsclub&cb=e84e481818 HTTP/1.1" 0.000 43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 2.2; pt-br; GT-I5500B Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"] - . 200
192.168.0.237 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/lg.php?bannerid=9432&campaignid=681&zoneid=1637&loc=1&referer=http%3A%2F%2Fwap.tim.com.br%2Flp%2Fsbappsclub&cb=5455b229b0 HTTP/1.1" 0.000 43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 2.2; pt-br; GT-I5500B Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/new_spc.php?zones=1638, HTTP/1.1" 0.000 426 "-" Agent["Mozilla/5.0 (Linux; U; Android 4.1.2; pt-br; LG-P655H Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/lg.php?bannerid=9348&campaignid=676&zoneid=1554&loc=1&cb=da3d687dcc HTTP/1.1" 0.000 43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 4.1.2; pt-br; LG-E467f Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/lg.php?bannerid=9310&campaignid=655&zoneid=1630&loc=1&cb=cb81d8936c HTTP/1.1" 0.000 43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 2.3.6; pt-br; GT-S5830B Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/new_spc.php?zones=1638, HTTP/1.1" 0.000 426 "-" Agent["Mozilla/5.0 (Linux; U; Android 2.3.4; pt-br; GT-S5830B Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/lg.php?bannerid=9432&campaignid=681&zoneid=1637&loc=1&cb=fe8e6f00b0 HTTP/1.1" 0.000 43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (LG-T375 AppleWebkit/531 Browser/Phantom/V2.0 Widget/LGMW/3.0 MMS/LG-MMS-V1.0/1.2 Java/ASVM/1.1 Profile/MIDP-2.1 Configuration/CLDC-1.1)"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/lg.php?bannerid=9349&campaignid=676&zoneid=1555&loc=1&cb=91e0c8a58b HTTP/1.1" 0.000 43 "http://wap.tim.com.br/" Agent["Mozilla/5.0 (Linux; U; Android 4.1.2; pt-br; LG-E467f Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"] - . 200
192.168.0.102 - - [04/May/2015:23:02:01 -0300] "GET /adserver/www/delivery/lg.php?bannerid=9382&campaignid=679&zoneid=1519&cb=40f1441eb8 HTTP/1.1" 0.000 43 "-" Agent["MOT-EX128 Obigo/WAP2.0 MIDP-2.0/CLDC-1.1"] - . 200