''' Data Camp '''
\!h ''' Building Recommendation Engines with PySpark '''
\!h ''' cross validation and hyperparameter tuning '''
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col
# Create test and train set (80% / 20%); the fixed seed makes the split repeatable
(train, test) = ratings.randomSplit([0.8, 0.2], seed=1234)
# Create ALS model.
# coldStartStrategy="drop" discards predictions for users/items that were not
# present in the data the model was fit on. With the default ("nan") those rows
# get a NaN prediction, which turns the RMSE into NaN both inside the
# cross-validation folds and on the held-out test set.
als = ALS(
    userCol="userId",
    itemCol="itemId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
)
# Confirm that a model called "als" was created. Printed explicitly so the
# check is also visible when this runs as a script, not only interactively.
print(type(als))
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Build the hyperparameter grid: every combination of the rank, maxIter and
# regParam values listed below (4 x 4 x 4 = 64 combinations).
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.maxIter, [5, 50, 100, 200])
    .addGrid(als.regParam, [0.01, 0.05, 0.1, 0.15])
    .build()
)
# Score models by RMSE, comparing the "rating" label with the "prediction" column
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
# Report how many parameter combinations the grid contains
print("Num models to be tested: ", len(param_grid))
# Assemble the cross validator from the ALS estimator, the parameter grid and
# the RMSE evaluator, using 5 folds
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
)
# Print cv to confirm it was built
print(cv)
\!h ''' Fit cross validator to the 'train' dataset '''
# Fit the cross validator on the training data
model = cv.fit(train)
# Extract the best model from the fitted cross-validator model above
best_model = model.bestModel
# Print the type of best_model
print(type(best_model))
# Report the winning hyperparameters.
# A fitted ALSModel does not carry the estimator-only getters (getRank,
# getMaxIter, getRegParam), so the values are read from the parameter grid
# instead: model.avgMetrics[i] is the cross-validated score of param_grid[i].
# Whether "best" means lowest or highest score is taken from the evaluator
# (lowest for RMSE).
pick_best = max if evaluator.isLargerBetter() else min
best_index = pick_best(range(len(param_grid)), key=lambda i: model.avgMetrics[i])
best_params = param_grid[best_index]
print("**Best Model**")
# Print "Rank"
print(" Rank:", best_params[als.rank])
# Print "MaxIter"
print(" MaxIter:", best_params[als.maxIter])
# Print "RegParam"
print(" RegParam:", best_params[als.regParam])
\!h ''' generate predictions and calculate RMSE '''
# Score the held-out test set with the best model found by cross validation
test_predictions = best_model.transform(dataset=test)
# Display the first rows of the predictions
test_predictions.show()
# Evaluate the test predictions with the RMSE evaluator and print the result
RMSE = evaluator.evaluate(dataset=test_predictions)
print(RMSE)
''' An RMSE of 0.633 means that the model's predicted ratings are typically about 0.633 above or below the actual values in the original ratings matrix. '''
''' look at 2 users' ratings and recommendations '''
# For each user of interest, show their own ratings (highest first) followed
# by the movies recommended to them, so the two can be compared by eye.
for user_id in (60, 63):
    # Look at the user's ratings
    print("User {}'s Ratings:".format(user_id))
    original_ratings.filter(col("userId") == user_id).sort("rating", ascending=False).show()
    # Look at the movies recommended to the user
    print("User {}'s Recommendations:".format(user_id))
    recommendations.filter(col("userId") == user_id).show()