// In the Spark shell, load the file from S3
val myFile = sc.textFile("s3://some-s3-bucket/us-constitution.txt")
// Classic word count: lowercase, strip periods and commas, split on spaces,
// drop empty tokens, then sum a count of 1 per word occurrence
val counts = myFile.flatMap(line => line.toLowerCase.replace(".", " ").replace(",", " ").split(" ")).
  filter(_.nonEmpty).map(word => (word, 1L)).reduceByKey(_ + _)
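// A variation (not in the original): splitting on the regex \W+ strips all
// punctuation, not just periods and commas, so semicolons, quotes, and dashes
// don't leak into the counts; countsRegex is an illustrative name
val countsRegex = myFile.
  flatMap(line => line.toLowerCase.split("\\W+")).
  filter(_.nonEmpty).
  map(word => (word, 1L)).
  reduceByKey(_ + _)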
// Collect the (word, count) tuples to the driver and sort by count, descending
val sorted_counts = counts.collect().sortBy(wc => -wc._2)
// Print the top 10 words to check the results
sorted_counts.take(10).foreach(println)
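// Aside (a sketch, not from the original): if the word list were too large to
// collect, RDD.top pulls just the N heaviest pairs without bringing the whole
// dataset back to the driver
val top10 = counts.top(10)(Ordering.by[(String, Long), Long](_._2))
top10.foreach(println)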
// Save the results out to an S3 bucket
sc.parallelize(sorted_counts).saveAsTextFile("s3n://some-s3-bucket/wordcount-us-constitution")
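// Note (illustrative, with a hypothetical output path): saveAsTextFile writes
// one part-file per partition; forcing a single partition yields a single
// part-00000 file, which is fine for a result set this small
sc.parallelize(sorted_counts, 1).saveAsTextFile("s3n://some-s3-bucket/wordcount-us-constitution-single")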
// Maybe you want to write it out as CSV instead (plain commas, not ",\t")
val csvResults = sorted_counts.map { case (word, count) => s"$word,$count" }
// Save the CSV out to S3
sc.parallelize(csvResults).saveAsTextFile("s3n://some-s3-bucket/wordcount-csv-constitution")