gdequeiroz
7/4/2015 - 11:35 PM

gistfile1.scala

// In the spark shell: load the file from S3 into an RDD of lines.
// NOTE(review): this read uses "s3://" while the writes below use "s3n://" —
// confirm which filesystem scheme this cluster is configured for.
val myFile = sc.textFile("s3://some-s3-bucket/us-constitution.txt")

// Classic word count: lowercase each line, strip periods and commas,
// split on spaces, and sum a count of 1 per occurrence of each token.
// The filter drops the empty tokens that split(" ") produces for runs
// of consecutive spaces (the original counted "" as a word).
val counts = myFile
  .flatMap(line => line.toLowerCase().replace(".", " ").replace(",", " ").split(" "))
  .filter(_.nonEmpty)
  .map(word => (word, 1L))
  .reduceByKey(_ + _)

// Pull the (word, count) tuples to the driver and sort by descending count.
// Fine for a small vocabulary like this; for large data prefer the
// distributed counts.sortBy(-_._2) instead of collect-then-sort.
val sorted_counts = counts.collect().sortBy(wc => -wc._2)

// Print a sample of the 10 most frequent words to eyeball the results.
sorted_counts.take(10).foreach(println)

// Save the sorted results out to the S3 bucket.
// NOTE(review): "constution" looks like a typo in the output key — kept
// as-is so the destination path is unchanged; confirm the intended name.
sc.parallelize(sorted_counts).saveAsTextFile("s3n://some-s3-bucket/wordcount-us-constution")

// Maybe you want to write it out as CSV: format each (word, count) pair
// as a comma+tab separated line.
val csvResults = sorted_counts map { case (key, value) => Array(key, value).mkString(",\t") }

// Save the CSV lines out to S3.
// BUG FIX: the original passed an undefined `results` here; the CSV lines
// were bound to `csvResults` above.
sc.parallelize(csvResults).saveAsTextFile("s3n://some-s3-bucket/wordcount-csv-constitution")