You can find this dataset within the following repository folder: Movies Data
# Read the movie ratings (u.data) and the movie metadata (u.item).
# Both files are read as tab-delimited text without a header row.
# NOTE(review): the stock MovieLens 100K u.item is usually distributed
# pipe-delimited; this assumes a tab-delimited copy -- confirm.
movie_ratings <- read.delim("u.data", header = FALSE, sep = "\t")
head(movie_ratings, n = 6)
# The titles contain Latin-1 bytes (e.g. 0xE9, e-acute). Declaring the file
# encoding lets R convert them on input instead of keeping invalid strings
# that print as escapes such as "N\xe9nette et Boni (1996)".
movies <- read.delim("u.item", header = FALSE, sep = "\t",
                     fileEncoding = "latin1")
head(movies, n = 6)
# Ratings: drop the 4th column and label the remaining ones
ratings_df <- setNames(movie_ratings[, -4], c("UserId", "MovieId", "Rating"))
head(ratings_df, n = 6)
# Movies: keep the first two columns and label them as id and title
movies_df <- setNames(movies[, 1:2], c("MovieId", "Title"))
head(movies_df, n = 6)
# Join the ratings with the movie titles; a left join keeps every rating row.
# library() stops with an error if dplyr is not installed, whereas require()
# only returns FALSE and lets the script fail later at left_join().
library(dplyr)
df_joined <- left_join(ratings_df, movies_df, by = "MovieId")
head(df_joined)
# Keep only the user, the movie title and the rating, in that order
# (this drops 'MovieId' and moves 'Title' ahead of 'Rating').
# NOTE(review): from here on a movie is identified by its title only, so
# distinct MovieIds that share a title would be treated as one item --
# confirm this is intended.
df_final <- df_joined[, c("UserId", "Title", "Rating")]
head(df_final, n = 6)
# Coerce the (user, title, rating) data frame to a recommenderlab
# realRatingMatrix with as(); users become rows and titles become columns
library(recommenderlab)
rate_max <- as(object = df_final, Class = "realRatingMatrix")
rate_max
943 x 1664 rating matrix of class 'realRatingMatrix' with 99693 ratings.
# Inspect the internal structure (S4 slots) of the rating matrix
str(rate_max)
Formal class 'realRatingMatrix' [package "recommenderlab"] with 2 slots
..@ data :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
.. .. ..@ i : int [1:99693] 654 99 138 205 354 636 654 180 298 373 ...
.. .. ..@ p : int [1:1665] 0 1 7 13 22 44 45 66 69 73 ...
.. .. ..@ Dim : int [1:2] 943 1664
.. .. ..@ Dimnames:List of 2
.. .. .. ..$ : chr [1:943] "1" "2" "3" "4" ...
.. .. .. ..$ : chr [1:1664] "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" "N\xe9nette et Boni (1996)" "Metisse (Caf\xe9 au Lait) (1993)" "Contempt (M\xe9pris, Le) (1963)" ...
.. .. ..@ x : num [1:99693] 3 3 5 1 4 5 3 1 3 3 ...
.. .. ..@ factors : list()
..@ normalize: NULL
# Explore the data: count how often each rating value occurs
rate_freq <- as.data.frame(table(df_final$Rating))
rate_freq
# Bar chart of the rating frequencies.
# Bar labels come from the tabulated rating values (Var1) rather than a
# hard-coded 1:5, so every bar is labelled with the value it actually counts
# and barplot() cannot fail on a label/bar count mismatch.
barplot(
  rate_freq$Freq,
  names.arg = as.character(rate_freq$Var1),
  xlab = "Rating",
  main = "Distribution of Rating Frequencies",
  col = c("red", "green", "blue", "yellow", "salmon"),
  cex.names = 0.7,
  cex.axis = 0.5
)
From the bar chart, the most commonly assigned movie rating is 4, followed by 3 and then 5; a rating of 1 is the least common. This suggests that people tend to like the movies they choose to watch.
# Implement Recommendation Algorithm
# List the recommender methods registered for realRatingMatrix data
recommenderRegistry$get_entries(dataType = "realRatingMatrix")
Reference: NA
Parameters: None
$RERECOMMEND_realRatingMatrix
Recommender method: RERECOMMEND for realRatingMatrix Description: Re-recommends highly rated items (real ratings).
Reference: NA
Parameters:
$SVD_realRatingMatrix
Recommender method: SVD for realRatingMatrix Description: Recommender based on SVD approximation with column-mean imputation.
Reference: NA
Parameters:
$SVDF_realRatingMatrix
Recommender method: SVDF for realRatingMatrix Description: Recommender based on Funk SVD with gradient descend
(https://sifter.org/~simon/journal/20061211.html).
Reference: NA
Parameters:
$UBCF_realRatingMatrix
Recommender method: UBCF for realRatingMatrix Description: Recommender based on user-based collaborative filtering.
Reference: NA
Parameters:
# Hold-out or Split approach: training set proportion 0.8, 5 items given per
# test user, ratings >= 3 counted as good.
# The split is random, so fix the RNG seed to make the partition (and any
# errors reported from it) reproducible between runs.
set.seed(123)
split_scheme <- evaluationScheme(rate_max, method = "split", train = 0.8,
                                 given = 5, goodRating = 3)
split_scheme
Evaluation scheme with 5 items given
Method: 'split' with 1 run(s).
Training set proportion: 0.800
Good ratings: >=3.000000
Data set: 943 x 1664 rating matrix of class 'realRatingMatrix' with 99693 ratings.
# Cross-validation approach (k-fold): 5 folds, all-but-1 items given per test
# user, ratings >= 3 counted as good.
# Fold assignment is random, so fix the RNG seed for reproducible folds.
set.seed(123)
cv_scheme <- evaluationScheme(rate_max, method = "cross-validation", k = 5,
                              given = -1, goodRating = 3)
cv_scheme
Evaluation scheme using all-but-1 items
Method: 'cross-validation' with 5 run(s).
Good ratings: >=3.000000
Data set: 943 x 1664 rating matrix of class 'realRatingMatrix' with 99693 ratings.
# Run the selected algorithms and determine the prediction error.
#
# Trains SVD, POPULAR, UBCF and IBCF recommenders on the training data of an
# evaluation scheme, predicts ratings for the test users from their "known"
# ratings and scores the predictions against the withheld "unknown" ratings.
#
# scheme:  a recommenderlab evaluationScheme (e.g. split or cross-validation).
# returns: a numeric matrix with one row per algorithm (err_svd, err_pop,
#          err_ubcf, err_ibcf) and one column per measure returned by
#          calcPredictionAccuracy(), averaged over all runs of the scheme.
automate <- function(scheme) {
  # Row label of the result -> recommenderlab method name
  algorithms <- c(err_svd = "SVD", err_pop = "POPULAR",
                  err_ubcf = "UBCF", err_ibcf = "IBCF")

  # Prediction errors of every algorithm on one run (fold) of the scheme.
  evaluate_run <- function(run) {
    train <- getData(scheme, "train", run = run)
    known <- getData(scheme, "known", run = run)
    unknown <- getData(scheme, "unknown", run = run)
    errors <- lapply(algorithms, function(algorithm) {
      model <- Recommender(train, algorithm)
      predicted <- predict(model, known, type = "ratings")
      calcPredictionAccuracy(predicted, unknown)
    })
    do.call(rbind, errors)
  }

  # getData() defaults to run = 1, so without this loop a k-fold scheme would
  # be scored on its first fold only. A split scheme has a single run, for
  # which the average equals that run's errors.
  per_run <- lapply(seq_len(scheme@k), evaluate_run)
  Reduce(`+`, per_run) / length(per_run)
}
# Prediction errors under the hold-out split scheme
automate(split_scheme)
# Prediction errors under the cross-validation scheme
automate(cv_scheme)
Based on predictive accuracy, we could use either the SVD or the POPULAR technique to generate recommendations, because these two have the lowest prediction errors.
# Fit the selected SVD recommender on the complete rating matrix
rec <- Recommender(data = rate_max, method = "SVD")
rec
Recommender of type 'SVD' for 'realRatingMatrix' learned using 943 users.
# Top-3 recommendations for the first three users (rows) of the rating matrix
recom_user3 <- predict(rec, newdata = rate_max[seq_len(3)], n = 3)
result3 <- as(recom_user3, "list")
# Print the two banner lines, then the per-user recommendation list
invisible(lapply(
  c("Top 3 Recommended Movies for User with UserId = 1 to 3",
    "==================================================="),
  print
))
result3
[1] "Top 3 Recommended Movies for User with UserId = 1 to 3"
[1] "==================================================="
$`0`
[1] "Butcher Boy, The (1998)" "Ulee's Gold (1997)" "Great Day in Harlem, A (1994)"
$`1`
[1] "Butcher Boy, The (1998)" "Chasing Amy (1997)" "Great Day in Harlem, A (1994)"
$`2`
[1] "Butcher Boy, The (1998)" "Great Day in Harlem, A (1994)" "Aiqing wansui (1994)"
Note: These users are probably similar, so similar movies are recommended to them.