# Author: dgadiraju
# Date: 7/31/2017 - 11:46 PM
#
# pyspark-word-count-config.py

import sys
import ConfigParser as cp
try:
    from pyspark import SparkConf, SparkContext

    props = cp.RawConfigParser()
    props.read("src/main/resources/application.properties")

    conf = SparkConf().setAppName("Word Count").setMaster(props.get(sys.argv[3], "executionMode"))
    sc = SparkContext(conf=conf)
    inputPath = sys.argv[1]
    outputPath = sys.argv[2]

    Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
    FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
    Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration

    fs = FileSystem.get(Configuration())

    if(fs.exists(Path(inputPath)) == False):
        print("Input path does not exists")
    else:
        if(fs.exists(Path(outputPath))):
            fs.delete(Path(outputPath), True)
        sc.textFile(inputPath). \
            flatMap(lambda l: l.split(" ")). \
            map(lambda w: (w, 1)). \
            reduceByKey(lambda t, e: t + e). \
            saveAsTextFile(outputPath)

    print ("Successfully imported Spark Modules")

except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)