''' Data Camp '''
\!h ''' Building Recommendation Engines with PySpark '''
# Group Markus' ratings by genre and sum them to see which genres he watches most
markus_ratings.groupBy("Genre").sum().show()
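# A minimal sketch of the same aggregation with a tiny, hand-built stand-in for markus_ratings
# (the Genre and num_views columns here are assumptions; the course supplies the real data):
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
markus_ratings_demo = spark.createDataFrame(
    [("Drama", 3), ("Comedy", 5), ("Drama", 2), ("Action", 1)],
    ["Genre", "num_views"])
# Summing within each genre shows which genres are watched most
markus_ratings_demo.groupBy("Genre").sum().show()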
# matrix multiplication
import numpy as np
# Matrices a and b are provided by the exercise environment
# Use the .head() method to view the contents of matrices a and b
print("Matrix A: ")
print(a.head())
print("Matrix B: ")
print(b.head())
# Complete the matrix with the product of matrices a and b
product = np.array([[10, 12], [15, 18]])
# Run this validation to see how your estimate performs
product == np.dot(a, b)
# If the number of columns in C differs from the number of rows in D, then C and D cannot be multiplied.
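# A minimal sketch of this rule with small, hypothetical numpy matrices (not course data):
import numpy as np
C = np.array([[1, 2, 3], [4, 5, 6]])    # shape (2, 3)
D = np.array([[1, 0], [0, 1], [1, 1]])  # shape (3, 2)
# C has 3 columns and D has 3 rows, so the product exists and has shape (2, 2)
print(np.dot(C, D))
# A shape mismatch such as np.dot(C, C) would raise a ValueError instead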
\!h ''' feature insights '''
# Look at the column names
print(ratings.columns)
# Look at the first few rows of data (.show() prints the rows itself, no print() wrapper needed)
ratings.show()
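# A minimal sketch of how a ratings DataFrame like this could be loaded; the file path
# and read options are assumptions, not from the course (which provides ratings preloaded):
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
ratings = spark.read.csv("ratings.csv", header=True, inferSchema=True)
ratings.printSchema()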
# calculate sparsity
# Count the total number of ratings in the dataset
numerator = ratings.select("rating").count()
# Count the number of distinct userIds and distinct movieIds
num_users = ratings.select("userId").distinct().count()
num_movies = ratings.select("movieId").distinct().count()
# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_movies
# Divide the numerator by the denominator
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")
# groupBy and Filter methods
# Import the requisite packages
from pyspark.sql.functions import col
# View the ratings dataset
ratings.show()
# Filter to show only userIds less than 100
ratings.filter(col("userId") < 100).show()
# Group data by userId, count ratings
ratings.groupBy("userId").count().show()
# Combined: multiple filter conditions joined with & (wrap each condition in parentheses)
ratings.filter((col('userId') < 100) & (col('userId') > 50)).show()
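# A small illustrative sketch (not a course exercise) chaining both methods:
# count ratings per user, restricted to userIds between 50 and 100
(ratings
 .filter((col("userId") > 50) & (col("userId") < 100))
 .groupBy("userId")
 .count()
 .show())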
\!h ''' summary statistics '''
# Import Spark's min and avg so the aggregations below use the SQL functions, not Python builtins
from pyspark.sql.functions import min, avg
# Min num ratings for movies
print("Movie with the fewest ratings: ")
ratings.groupBy("movieId").count().select(min("count")).show()
# Avg num ratings per movie
print("Avg num ratings per movie: ")
ratings.groupBy("movieId").count().select(avg("count")).show()
# Min num ratings for user
print("User with the fewest ratings: ")
ratings.groupBy("userId").count().select(min("count")).show()
# Avg num ratings per user
print("Avg num ratings per user: ")
ratings.groupBy("userId").count().select(avg("count")).show()
\!h ''' ensure proper schema (IDs and ratings need to be numbers) '''
# Use .printSchema() to see the datatypes of the ratings dataset
ratings.printSchema()
# Tell Spark to convert the columns to the proper data types
ratings = ratings.select(
    ratings.userId.cast("integer"),
    ratings.movieId.cast("integer"),
    ratings.rating.cast("double"))
# Call .printSchema() again to confirm the columns are now in the correct format
ratings.printSchema()
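# An equivalent cast using withColumn, a common alternative approach (not from the course):
from pyspark.sql.functions import col
ratings = (ratings
           .withColumn("userId", col("userId").cast("integer"))
           .withColumn("movieId", col("movieId").cast("integer"))
           .withColumn("rating", col("rating").cast("double")))
ratings.printSchema()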