beatrice-m
12/4/2017 - 6:16 PM

Apache Spark on Amazon EMR

How to create a cluster and run a Spark application on Amazon EMR.
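
These commands assume the AWS CLI is installed and a named profile exists. A minimal sketch of the one-time setup (the profile name matches $KEY below; the actual keys and region are typed at the prompts):

# One-time setup: create the named profile used by --profile below
aws configure --profile MoissinB
# prompts for: access key ID, secret access key, default region, output format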

# Shell variables reused below
KEY=MoissinB
JAR=/usr/lib/spark/lib/spark-examples.jar

# Start a cluster in a VPC (r4 instances can only launch inside a VPC subnet)
aws emr create-cluster --profile $KEY \
         --name "Test Spark Cluster with VPC" \
         --release-label emr-5.10.0 \
         --applications Name=Hadoop Name=Spark \
         --ec2-attributes KeyName=$KEY,SubnetId=subnet-xxxxxxx \
         --instance-type r4.4xlarge \
         --instance-count 3 \
         --use-default-roles
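
create-cluster prints the new ClusterId. A sketch for checking that the cluster came up (j-XXXXXXXXXXXXX is a placeholder for that ID):

# Block until the cluster reaches RUNNING, then inspect it
aws emr wait cluster-running --profile $KEY --cluster-id j-XXXXXXXXXXXXX
aws emr describe-cluster --profile $KEY --cluster-id j-XXXXXXXXXXXXX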

Important

  • No spaces inside Args=[arg1,arg2,arg3]
  • The whole --steps value must be a single shell word: no spaces after the commas, and no indented continuation lines in the middle of it
# Create cluster with a first step; --auto-terminate shuts the cluster
# down once the last step has finished
aws emr create-cluster --profile $KEY \
                       --name "My Cluster" \
                       --release-label emr-5.10.0 \
                       --applications Name=Hadoop Name=Spark \
                       --ec2-attributes KeyName=$KEY \
                       --instance-type m3.xlarge \
                       --instance-count 3 \
                       --auto-terminate \
                       --steps Type=Spark,Name="Spark Program - Task 1",ActionOnFailure=CONTINUE,Args=[--class,main.scala.task.Task1,$JAR] \
                       --use-default-roles
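
A sketch for watching the step run (the cluster and step IDs are placeholders; list-steps prints the step ID):

# List the cluster's steps, then block until one completes
aws emr list-steps --profile $KEY --cluster-id j-XXXXXXXXXXXXX
aws emr wait step-complete --profile $KEY --cluster-id j-XXXXXXXXXXXXX --step-id s-XXXXXXXXXXXXX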

# Add an execution step (run another class from the jar)
aws emr add-steps --cluster-id j-2AXXXXXXGAPLF \
                  --steps Type=Spark,Name="Spark Program",ActionOnFailure=CONTINUE,Args=[--class,org.apache.spark.examples.SparkPi,$JAR,10]
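
If the target cluster was not started with --auto-terminate, shut it down yourself once the steps finish. A sketch, reusing the cluster ID from above:

# Terminate the cluster manually when the work is done
aws emr terminate-clusters --profile $KEY --cluster-ids j-2AXXXXXXGAPLF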