diff --git a/examples/structured_data/collaborative_filtering_movielens.ipynb b/examples/structured_data/collaborative_filtering_movielens.ipynb new file mode 100644 index 0000000000..1a28e4c66f --- /dev/null +++ b/examples/structured_data/collaborative_filtering_movielens.ipynb @@ -0,0 +1,379 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Collaborative Filtering for Movie Recommendations\n", + "\n", + "**Author:** [Siddhartha Banerjee](https://twitter.com/sidd2006)
\n", + "**Date created:** 2020/05/24
\n", + "**Last modified:** 2020/05/24
\n", + "**Description:** Recommending movies using a model trained on Movielens dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Introduction\n", + "\n", + "This example demonstrates\n", + "[Collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering)\n", + "using the [Movielens dataset](https://www.kaggle.com/c/movielens-100k)\n", + "to recommend movies to users.\n", + "The MovieLens ratings dataset lists the ratings given by a set of users to a set of movies.\n", + "Our goal is to be able to predict ratings for movies a user has not yet watched.\n", + "The movies with the highest predicted ratings can then be recommended to the user.\n", + "\n", + "The steps in the model are as follows:\n", + "\n", + "1. Map user ID to a \"user vector\" via an embedding matrix\n", + "2. Map movie ID to a \"movie vector\" via an embedding matrix\n", + "3. Compute the dot product between the user vector and movie vector, to obtain\n", + "a match score between the user and the movie (predicted rating).\n", + "4. 
Train the embeddings via gradient descent using all known user-movie pairs.\n", + "\n", + "**References:**\n", + "\n", + "- [Collaborative Filtering](https://dl.acm.org/doi/pdf/10.1145/371920.372071)\n", + "- [Neural Collaborative Filtering](https://dl.acm.org/doi/pdf/10.1145/3038912.3052569)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "\n", + "import keras\n", + "from keras import layers\n", + "from keras import ops" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## First, load the data and apply preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip\"\n", + "# Use the ratings.csv file\n", + "# FIX: Use HTTPS and let Keras handle extraction automatically\n", + "movielens_data_file_url = (\n", + " \"https://files.grouplens.org/datasets/movielens/ml-latest-small.zip\"\n", + ")\n", + "movielens_zipped_file = keras.utils.get_file(\n", + " \"ml-latest-small.zip\", movielens_data_file_url, extract=True\n", + ")\n", + "keras_datasets_path = Path(movielens_zipped_file).parents[0]\n", + "movielens_dir = keras_datasets_path / \"ml-latest-small\"\n", + "\n", + "ratings_file = movielens_dir / \"ratings.csv\"\n", + "df = pd.read_csv(ratings_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "First, we need to perform some preprocessing to encode users and movies as integer indices." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "user_ids = df[\"userId\"].unique().tolist()\n", + "user2user_encoded = {x: i for i, x in enumerate(user_ids)}\n", + "userencoded2user = {i: x for i, x in enumerate(user_ids)}\n", + "movie_ids = df[\"movieId\"].unique().tolist()\n", + "movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}\n", + "movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}\n", + "df[\"user\"] = df[\"userId\"].map(user2user_encoded)\n", + "df[\"movie\"] = df[\"movieId\"].map(movie2movie_encoded)\n", + "\n", + "num_users = len(user2user_encoded)\n", + "num_movies = len(movie_encoded2movie)\n", + "df[\"rating\"] = df[\"rating\"].values.astype(np.float32)\n", + "# min and max ratings will be used to normalize the ratings later\n", + "min_rating = min(df[\"rating\"])\n", + "max_rating = max(df[\"rating\"])\n", + "\n", + "print(\n", + " \"Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}\".format(\n", + " num_users, num_movies, min_rating, max_rating\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Prepare training and validation data" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "df = df.sample(frac=1, random_state=42)\n", + "x = df[[\"user\", \"movie\"]].values\n", + "# Normalize the targets between 0 and 1. 
Makes it easy to train.\n", + "y = df[\"rating\"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values\n", + "# Assuming training on 90% of the data and validating on 10%.\n", + "train_indices = int(0.9 * df.shape[0])\n", + "x_train, x_val, y_train, y_val = (\n", + " x[:train_indices],\n", + " x[train_indices:],\n", + " y[:train_indices],\n", + " y[train_indices:],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Create the model\n", + "\n", + "We embed both users and movies into 50-dimensional vectors.\n", + "\n", + "The model computes a match score between user and movie embeddings via a dot product,\n", + "and adds a per-movie and per-user bias. The match score is scaled to the `[0, 1]`\n", + "interval via a sigmoid (since our ratings are normalized to this range)." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "EMBEDDING_SIZE = 50\n", + "\n", + "\n", + "class RecommenderNet(keras.Model):\n", + " def __init__(self, num_users, num_movies, embedding_size, **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.num_users = num_users\n", + " self.num_movies = num_movies\n", + " self.embedding_size = embedding_size\n", + " self.user_embedding = layers.Embedding(\n", + " num_users,\n", + " embedding_size,\n", + " embeddings_initializer=\"he_normal\",\n", + " embeddings_regularizer=keras.regularizers.l2(1e-6),\n", + " )\n", + " self.user_bias = layers.Embedding(num_users, 1)\n", + " self.movie_embedding = layers.Embedding(\n", + " num_movies,\n", + " embedding_size,\n", + " embeddings_initializer=\"he_normal\",\n", + " embeddings_regularizer=keras.regularizers.l2(1e-6),\n", + " )\n", + " self.movie_bias = layers.Embedding(num_movies, 1)\n", + "\n", + " def call(self, inputs):\n", + " user_vector = self.user_embedding(inputs[:, 0])\n", + " user_bias = self.user_bias(inputs[:, 0])\n", + " 
movie_vector = self.movie_embedding(inputs[:, 1])\n", + " movie_bias = self.movie_bias(inputs[:, 1])\n", + " dot_user_movie = ops.tensordot(user_vector, movie_vector, 2)\n", + " # Add all the components (including bias)\n", + " x = dot_user_movie + user_bias + movie_bias\n", + " # The sigmoid activation forces the rating to between 0 and 1\n", + " return ops.nn.sigmoid(x)\n", + "\n", + "\n", + "model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)\n", + "model.compile(\n", + " loss=keras.losses.BinaryCrossentropy(),\n", + " optimizer=keras.optimizers.Adam(learning_rate=0.001),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Train the model based on the data split" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "history = model.fit(\n", + " x=x_train,\n", + " y=y_train,\n", + " batch_size=64,\n", + " epochs=5,\n", + " verbose=1,\n", + " validation_data=(x_val, y_val),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Plot training and validation loss" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "plt.plot(history.history[\"loss\"])\n", + "plt.plot(history.history[\"val_loss\"])\n", + "plt.title(\"model loss\")\n", + "plt.ylabel(\"loss\")\n", + "plt.xlabel(\"epoch\")\n", + "plt.legend([\"train\", \"test\"], loc=\"upper left\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Show top 10 movie recommendations to a user" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "movie_df = pd.read_csv(movielens_dir / \"movies.csv\")\n", + "\n", + "# Let us get a user and see the top recommendations.\n", + 
"user_id = df.userId.sample(1).iloc[0]\n", + "movies_watched_by_user = df[df.userId == user_id]\n", + "movies_not_watched = movie_df[\n", + " ~movie_df[\"movieId\"].isin(movies_watched_by_user.movieId.values)\n", + "][\"movieId\"]\n", + "movies_not_watched = list(\n", + " set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))\n", + ")\n", + "movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]\n", + "user_encoder = user2user_encoded.get(user_id)\n", + "user_movie_array = np.hstack(\n", + " ([[user_encoder]] * len(movies_not_watched), movies_not_watched)\n", + ")\n", + "ratings = model.predict(user_movie_array).flatten()\n", + "top_ratings_indices = ratings.argsort()[-10:][::-1]\n", + "recommended_movie_ids = [\n", + " movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices\n", + "]\n", + "\n", + "print(\"Showing recommendations for user: {}\".format(user_id))\n", + "print(\"====\" * 9)\n", + "print(\"Movies with high ratings from user\")\n", + "print(\"----\" * 8)\n", + "top_movies_user = (\n", + " movies_watched_by_user.sort_values(by=\"rating\", ascending=False)\n", + " .head(5)\n", + " .movieId.values\n", + ")\n", + "movie_df_rows = movie_df[movie_df[\"movieId\"].isin(top_movies_user)]\n", + "for row in movie_df_rows.itertuples():\n", + " print(row.title, \":\", row.genres)\n", + "\n", + "print(\"----\" * 8)\n", + "print(\"Top 10 movie recommendations\")\n", + "print(\"----\" * 8)\n", + "recommended_movies = movie_df[movie_df[\"movieId\"].isin(recommended_movie_ids)]\n", + "for row in recommended_movies.itertuples():\n", + " print(row.title, \":\", row.genres)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "collaborative_filtering_movielens", + "private_outputs": false, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/structured_data/collaborative_filtering_movielens.py b/examples/structured_data/collaborative_filtering_movielens.py index f5558ff9c3..8b9d8b6056 100644 --- a/examples/structured_data/collaborative_filtering_movielens.py +++ b/examples/structured_data/collaborative_filtering_movielens.py @@ -36,7 +36,7 @@ from pathlib import Path import matplotlib.pyplot as plt import numpy as np -from zipfile import ZipFile + import keras from keras import layers @@ -48,23 +48,16 @@ # Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip" # Use the ratings.csv file +# FIX: Use HTTPS and let Keras handle extraction automatically movielens_data_file_url = ( - "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip" + "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip" ) movielens_zipped_file = keras.utils.get_file( - "ml-latest-small.zip", movielens_data_file_url, extract=False + "ml-latest-small.zip", movielens_data_file_url, extract=True ) keras_datasets_path = Path(movielens_zipped_file).parents[0] movielens_dir = keras_datasets_path / "ml-latest-small" -# Only extract the data the first time the script is run. -if not movielens_dir.exists(): - with ZipFile(movielens_zipped_file, "r") as zip: - # Extract files - print("Extracting all the files now...") - zip.extractall(path=keras_datasets_path) - print("Done!") - ratings_file = movielens_dir / "ratings.csv" df = pd.read_csv(ratings_file)