10/11/2019 - 9:25 PM

recommenders feature engineering

''' Data Camp '''
\!h ''' Building Recommendation Engines with PySpark '''


# matrix multiplication
# Use the .head() method to view the contents of matrices a and b
print("Matrix A: ")
print(a.head())

print("Matrix B: ")
print(b.head())

# Complete the matrix with the product of matrices a and b
product = np.array([[10, 12], [15, 18]])

# Run this validation to see how your estimate performs: compare the
# hand-written estimate element-wise against the computed product of a and b.
# NOTE(review): the right-hand side was garbled in these notes ("product ==,b)");
# np.dot(a, b) is reconstructed from the surrounding comments -- confirm.
print(product == np.dot(a, b))

# If the number of columns in C is different than the number of rows in D, then C and D cannot be multiplied.

\!h ''' feature insights '''

# Look at the column names

# Look at the first few rows of data

# calculate sparsity
# Sparsity here is the percentage of (user, movie) pairs that have no rating.
# NOTE(review): the `ratings.select(` prefix on the three count statements was
# missing in these notes and is reconstructed from the other `ratings` calls
# in this file -- confirm.

# Count the total number of ratings in the dataset
numerator = ratings.select("rating").count()

# Count the number of distinct userIds and distinct movieIds
num_users = ratings.select("userId").distinct().count()
num_movies = ratings.select("movieId").distinct().count()

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_movies

# Divide the numerator by the denominator; the result is the filled fraction,
# so subtract it from 1 and scale to a percentage.
sparsity = (1.0 - (numerator * 1.0) / denominator) * 100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

# groupBy and Filter methods
# Import the requisite packages
from pyspark.sql.functions import col

# View the ratings dataset

# Show only the rows whose userId is below 100
user_id = col("userId")
below_100 = user_id < 100
ratings.filter(below_100).show()

# Group data by userId, count ratings

# combined: both conditions must hold (userId strictly between 50 and 100)
above_50 = user_id > 50
ratings.filter(below_100 & above_50).show()

\!h ''' summary statistics '''
# Headings for the rating-count summaries, printed one per line in this order.
summary_headings = (
    "Movie with the fewest ratings: ",  # min num ratings for movies
    "Avg num ratings per movie: ",      # avg num ratings per movie
    "User with the fewest ratings: ",   # min num ratings for user
    "Avg num ratings per user: ",       # avg num ratings per user
)
for heading in summary_headings:
    print(heading)

''' ensure proper schema (Ids and ratings need to be numbers) '''
# Use .printSchema() to see the datatypes of the ratings dataset

# Tell Spark to convert the columns to the proper data types:
# userId and movieId become integers, rating becomes a double.
# NOTE(review): the start of this statement was garbled in these notes
# ('ratings ="integer"), ...'); `ratings.select(ratings.userId.cast(` is
# reconstructed from the remaining arguments -- confirm.
ratings = ratings.select(
    ratings.userId.cast("integer"),
    ratings.movieId.cast("integer"),
    ratings.rating.cast("double"),
)

# Call .printSchema() again to confirm the columns are now in the correct format