iceycode
9/18/2017 - 8:05 AM

Apache Spark in R

Spark and R in action - example from blog.rubypdf.com and r-bloggers.com

# adds R lib to .libPaths after installing Spark

Sys.setenv(SPARK_HOME="/Development/Data/spark2")
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
library(SparkR)


######################
# Example 1 
####
# after installing Spark, point to it in R
# Set the system environment variables
Sys.setenv(SPARK_HOME = "C:/Apache/spark-1.4.1")
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))



sc <- sparkR.session(master="local[2]")
df <- createDataFrame(iris)   #R data.frame to Spark DataFrame
createOrReplaceTempView(df,"iris") #register Spark DataFrame as temp view
x<-sql("select * from iris")# test Spark SQL
nrow(x)#
summary(x)

collect(df)#Spark DataFrame to R data.frame
sparkR.session.stop()


#########################################
# Example 2


# Set the system environment variables
Sys.setenv(SPARK_HOME = "C:/Apache/spark-1.4.1")
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))

#load the Sparkr library
library(SparkR)

# Create a spark context and a SQL context
sc <- sparkR.init(master = "local")
sqlContext <- sparkRSQL.init(sc)

#create a sparkR DataFrame
DF <- createDataFrame(sqlContext, faithful)
head(DF)
 
# Create a simple local data.frame
localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))

# Convert local data frame to a SparkR DataFrame
df <- createDataFrame(sqlContext, localDF)

# Print its schema
printSchema(df)
# root
#  |-- name: string (nullable = true)
#  |-- age: double (nullable = true)

# Create a DataFrame from a JSON file
path <- file.path(Sys.getenv("SPARK_HOME"), "examples/src/main/resources/people.json")
peopleDF <- jsonFile(sqlContext, path)
printSchema(peopleDF)

# Register this DataFrame as a table.
registerTempTable(peopleDF, "people")

# SQL statements can be run by using the sql methods provided by sqlContext
teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")

# Call collect to get a local data.frame
teenagersLocalDF <- collect(teenagers)

# Print the teenagers in our dataset 
print(teenagersLocalDF)

# Stop the SparkContext now
sparkR.stop()