movies-recommender/recommenders.py at main · itsjavi/movies-recommender · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.metrics.pairwise import cosine_similarity
from imdb import Cinemagoer

# create an instance of the Cinemagoer class
cine = Cinemagoer()

"""
TODOS:
 - Ability to search by movie title (match as "OR" by every word)

 - Add a "year" column to the ratings, based on "timestamp" column,
   so we can display most popular movies by year.

 - Extract "year" from the last 6 chars of the movie title "(XXXX)"

 - Create functions for linear-regression and Knn algorithms


OPTIMIZATIONS:

 - To have a more optimal web application, we could save the ratings means and the
   merge with the movies as csv file, and load that one instead of calculating
   everything again over and over. The same for the users ratings.

 - That would make our app "static", but in a real world scenario the data would be dynamic
   and loaded from an API or Database, so it would be calculated on the fly.

"""

@st.cache
def get_movies():
    return pd.read_csv("data/movies-tags.csv")

@st.cache
def get_movies_with_decade():
    movies_df = get_movies()
    df = movies_df.copy()

    # df['title'] = movies_df['title'].str.extract('(.+?) \(')
    df['year'] = movies_df['title'].str.extract(' \(([0-9]{4})\)')
    df['title'] = movies_df['title'].str.replace('( \([0-9]{4}\))', '')
    df['year'] = df['year'].fillna(0)
    df['decade'] = (10 * (df['year'].astype(int) // 10)) #.astype(str) + 's'
    return df

    #decades = pd.unique(movies_df['decade'].dropna())
    #decades = np.sort(years)
    #decades

    #movies_df2.loc[movies_df2['year'] == 0]

@st.cache
def get_genres():
    return pd.read_csv("data/genres.csv")

    return genres

@st.cache
def get_ratings():
    return pd.read_csv("data/ratings.csv")

@st.cache
def get_user_ids():
    return pd.read_csv("data/users.csv")


def get_ratings_means_count(ratings_df):
    ratings = pd.DataFrame(ratings_df.groupby('movieId')['rating'].mean())
    ratings['rating_count'] = ratings_df.groupby('movieId')['rating'].count()

    return ratings


# simple ranking
def get_popular_movies(ratings_df, movies_df, n = 10, min_ratings = 10):
    ratings = get_ratings_means_count(ratings_df)

    popular_movies = (ratings
        .merge(movies_df, on='movieId')
        .sort_values(["rating_count", "rating"], ascending=[False, False])

    )

    return popular_movies[popular_movies['rating_count']>=min_ratings].head(n)


# get movie index,id tuple from highest ranked
def get_most_popular_movie(popular_movies):
    mostPopularMovie = popular_movies.head(1)
    mostPopularMovieIdx = mostPopularMovie.index[0]
    mostPopularMovieId = mostPopularMovie['movieId'][mostPopularMovieIdx]

    return [mostPopularMovieIdx, mostPopularMovieId]


# item-based recommender
def get_similar_movies(movieId, ratings_df, movies_df, n = 10, min_ratings = 10):
    ratings_mean_df = get_ratings_means_count(ratings_df)
    movies_crosstab = pd.pivot_table(data=ratings_df, values='rating', index='userId', columns='movieId')

    # Replace NaNs with zeros
         # not doing this gives different results??
    movies_crosstab = movies_crosstab.fillna(0, inplace=False)

    popular_ratings = movies_crosstab[movieId]
    popular_ratings[popular_ratings>=0] # exclude NaNs in the pivot table cross tab

    # Find PearsonR correlation
    similar_corr = pd.DataFrame(movies_crosstab.corrwith(popular_ratings), columns=['PearsonR_Value'])
    similar_corr = similar_corr.dropna(inplace=False) # exclude NaNs in the corr matrix

    similar_summary = similar_corr.join(ratings_mean_df['rating_count'])
    similar_summary = similar_summary.drop(
        movieId, inplace=False
    ) # drop popular movie itself

    return (similar_summary[similar_summary['rating_count']>=min_ratings]
            .sort_values(['PearsonR_Value', 'rating_count'], ascending=[False,False])
            .merge(movies_df, left_index=True, right_on="movieId")
            .head(n)
          )

# user-based recommender
def get_user_recommendations(for_user_id, ratings_df, movies_df, n = 10):
    # Create the big users-items table, using the userId as index.
    users_items = pd.pivot_table(data=ratings_df, values='rating', index='userId', columns='movieId')

    # Replace NaNs with zeros
    users_items = users_items.fillna(0, inplace=False)

    # Compute pairwise cosine similarities
    user_similarities = pd.DataFrame(
        cosine_similarity(users_items),
        columns=users_items.index,
        index=users_items.index
    )

    # build recommender system
    ## compute weights, excluding target user

    user_similarities_excl = user_similarities.query("userId!=@for_user_id")[for_user_id]
    user_similarities_excl_sums = sum(user_similarities_excl)
    weights = (user_similarities_excl / user_similarities_excl_sums)

    ## find movies that target user did not rate yet
    users_items.loc[for_user_id,:]==0

    not_rated_movies = users_items.loc[users_items.index!=for_user_id, users_items.loc[for_user_id,:]==0]

    ## predict/compute the ratings target user would give to those unrated restaurants.

    ### dot product between the not-rated-movies and the weights
    weighted_averages = pd.DataFrame(not_rated_movies.T.dot(weights), columns=["predicted_rating"])

    ## find the top N movies from the rating predictions
    recommendations = weighted_averages.merge(movies_df, left_index=True, right_on="movieId")
    recommendations = recommendations.sort_values("predicted_rating", ascending=False).head(n)

    return recommendations


def get_movies_with_covers(movies_df): # Warning! This API is very slow
    new_movies_data = movies_df.head(2).copy() # use only 2 for testing it put
    new_movies_data['imgUrl'] = None
    columns = new_movies_data.columns

    for i,row in movies_df.iterrows():
        if row['imdbId']:
            mov = cine.get_movie(row['imdbId'])
            #st.write(mov.keys())
            new_movies_data.at[i, 'imgUrl'] = mov["cover url"]
            st.image(mov["cover url"]) # for quick demo purposes

    return pd.DataFrame(new_movies_data, columns = columns)