alathrop
10/11/2019 - 9:25 PM

recommenders feature engineering

''' DataCamp '''
\!h ''' Building Recommendation Engines with PySpark '''

# Sum each numeric column per genre for one user's ratings
markus_ratings.groupBy("Genre").sum().show()
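For context, a minimal sketch that makes the line above runnable, assuming a local SparkSession; the toy genre/rating values are made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Toy stand-in for markus_ratings: one user's ratings tagged with a Genre column
markus_ratings = spark.createDataFrame(
    [("Drama", 4), ("Drama", 5), ("Comedy", 3)],
    ["Genre", "rating"],
)

# Sums every numeric column within each genre
markus_ratings.groupBy("Genre").sum().show()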

# matrix multiplication
import numpy as np

# Use the .head() method to view the contents of matrices a and b
# (a and b are small DataFrames in this exercise, hence .head())
print("Matrix A: ")
print(a.head())

print("Matrix B: ")
print(b.head())

# Complete the matrix with the product of matrices a and b
product = np.array([[10, 12], [15, 18]])

# Run this validation to see how your estimate performs
product == np.dot(a, b)

# If the number of columns in C is different from the number of rows in D, then C and D cannot be multiplied (in that order).
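A quick NumPy check of that shape rule; the toy matrices C and D are made up to match the note:

import numpy as np

C = np.ones((2, 3))  # 2 rows, 3 columns
D = np.ones((3, 2))  # 3 rows, 2 columns

print(np.dot(C, D))  # OK: C has 3 columns, D has 3 rows -> 2x2 result

try:
    np.dot(C, np.ones((2, 3)))  # 3 columns vs 2 rows: incompatible
except ValueError as err:
    print(err)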

\!h ''' feature insights '''

# Look at the column names
print(ratings.columns)

# Look at the first few rows of data (.show() prints and returns None, so don't wrap it in print())
ratings.show()

# calculate sparsity
# Count the total number of ratings in the dataset
numerator = ratings.select("rating").count()

# Count the number of distinct userIds and distinct movieIds
num_users = ratings.select("userId").distinct().count()
num_movies = ratings.select("movieId").distinct().count()

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_movies

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator * 1.0) / denominator) * 100
print("The ratings dataframe is %.2f%% empty." % sparsity)

# groupBy and Filter methods
# Import the requisite packages
from pyspark.sql.functions import col

# View the ratings dataset
ratings.show()

# Filter to show only userIds less than 100
ratings.filter(col("userId") < 100).show()

# Group data by userId, count ratings
ratings.groupBy("userId").count().show()

# Combine conditions: userIds between 51 and 99 (each condition needs its own parentheses)
ratings.filter((col("userId") < 100) & (col("userId") > 50)).show()
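The same range filter can also be written with Column.between, which is inclusive on both ends, so the bounds shift to 51 and 99:

ratings.filter(col("userId").between(51, 99)).show()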

\!h ''' summary statistics '''
# min and avg must come from pyspark.sql.functions, not the Python builtins
from pyspark.sql.functions import min, avg

# Min num ratings for movies
print("Movie with the fewest ratings: ")
ratings.groupBy("movieId").count().select(min("count")).show()

# Avg num ratings per movie
print("Avg num ratings per movie: ")
ratings.groupBy("movieId").count().select(avg("count")).show()

# Min num ratings for user
print("User with the fewest ratings: ")
ratings.groupBy("userId").count().select(min("count")).show()

# Avg num ratings per user
print("Avg num ratings per user: ")
ratings.groupBy("userId").count().select(avg("count")).show()

\!h ''' ensure proper schema (IDs and ratings need to be numeric) '''
# Use .printSchema() to see the datatypes of the ratings dataset
ratings.printSchema()

# Tell Spark to convert the columns to the proper data types
ratings = ratings.select(
    ratings.userId.cast("integer"),
    ratings.movieId.cast("integer"),
    ratings.rating.cast("double"),
)

# Call .printSchema() again to confirm the columns are now in the correct format
ratings.printSchema()
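An equivalent cast that leaves any other columns in place, sketched with withColumn:

from pyspark.sql.functions import col

ratings = (ratings
           .withColumn("userId", col("userId").cast("integer"))
           .withColumn("movieId", col("movieId").cast("integer"))
           .withColumn("rating", col("rating").cast("double")))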