diff --git a/examples/structured_data/collaborative_filtering_movielens.ipynb b/examples/structured_data/collaborative_filtering_movielens.ipynb
new file mode 100644
index 0000000000..1a28e4c66f
--- /dev/null
+++ b/examples/structured_data/collaborative_filtering_movielens.ipynb
@@ -0,0 +1,379 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "# Collaborative Filtering for Movie Recommendations\n",
+ "\n",
+ "**Author:** [Siddhartha Banerjee](https://twitter.com/sidd2006)<br>\n",
+ "**Date created:** 2020/05/24<br>\n",
+ "**Last modified:** 2020/05/24<br>\n",
+ "**Description:** Recommending movies using a model trained on Movielens dataset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Introduction\n",
+ "\n",
+ "This example demonstrates\n",
+ "[Collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering)\n",
+ "using the [Movielens dataset](https://www.kaggle.com/c/movielens-100k)\n",
+ "to recommend movies to users.\n",
+ "The MovieLens ratings dataset lists the ratings given by a set of users to a set of movies.\n",
+ "Our goal is to be able to predict ratings for movies a user has not yet watched.\n",
+ "The movies with the highest predicted ratings can then be recommended to the user.\n",
+ "\n",
+ "The steps in the model are as follows:\n",
+ "\n",
+ "1. Map user ID to a \"user vector\" via an embedding matrix\n",
+ "2. Map movie ID to a \"movie vector\" via an embedding matrix\n",
+ "3. Compute the dot product between the user vector and movie vector, to obtain\n",
+ "a match score between the user and the movie (predicted rating).\n",
+ "4. Train the embeddings via gradient descent using all known user-movie pairs.\n",
+ "\n",
+ "**References:**\n",
+ "\n",
+ "- [Collaborative Filtering](https://dl.acm.org/doi/pdf/10.1145/371920.372071)\n",
+ "- [Neural Collaborative Filtering](https://dl.acm.org/doi/pdf/10.1145/3038912.3052569)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "\n",
+ "import keras\n",
+ "from keras import layers\n",
+ "from keras import ops"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## First, load the data and apply preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "# Download the actual data from https://files.grouplens.org/datasets/movielens/ml-latest-small.zip\n",
+ "# Use the ratings.csv file\n",
+ "# Use HTTPS and let Keras handle extraction automatically\n",
+ "movielens_data_file_url = (\n",
+ " \"https://files.grouplens.org/datasets/movielens/ml-latest-small.zip\"\n",
+ ")\n",
+ "movielens_zipped_file = keras.utils.get_file(\n",
+ " \"ml-latest-small.zip\", movielens_data_file_url, extract=True\n",
+ ")\n",
+ "keras_datasets_path = Path(movielens_zipped_file).parents[0]\n",
+ "movielens_dir = keras_datasets_path / \"ml-latest-small\"\n",
+ "\n",
+ "ratings_file = movielens_dir / \"ratings.csv\"\n",
+ "df = pd.read_csv(ratings_file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "First, we need to perform some preprocessing to encode users and movies as integer indices."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "user_ids = df[\"userId\"].unique().tolist()\n",
+ "user2user_encoded = {x: i for i, x in enumerate(user_ids)}\n",
+ "userencoded2user = {i: x for i, x in enumerate(user_ids)}\n",
+ "movie_ids = df[\"movieId\"].unique().tolist()\n",
+ "movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}\n",
+ "movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}\n",
+ "df[\"user\"] = df[\"userId\"].map(user2user_encoded)\n",
+ "df[\"movie\"] = df[\"movieId\"].map(movie2movie_encoded)\n",
+ "\n",
+ "num_users = len(user2user_encoded)\n",
+ "num_movies = len(movie_encoded2movie)\n",
+ "df[\"rating\"] = df[\"rating\"].values.astype(np.float32)\n",
+ "# min and max ratings will be used to normalize the ratings later\n",
+ "min_rating = min(df[\"rating\"])\n",
+ "max_rating = max(df[\"rating\"])\n",
+ "\n",
+ "print(\n",
+ " \"Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}\".format(\n",
+ " num_users, num_movies, min_rating, max_rating\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Prepare training and validation data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "df = df.sample(frac=1, random_state=42)\n",
+ "x = df[[\"user\", \"movie\"]].values\n",
+ "# Normalize the targets between 0 and 1. Makes it easy to train.\n",
+ "y = df[\"rating\"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values\n",
+ "# Assuming training on 90% of the data and validating on 10%.\n",
+ "train_indices = int(0.9 * df.shape[0])\n",
+ "x_train, x_val, y_train, y_val = (\n",
+ " x[:train_indices],\n",
+ " x[train_indices:],\n",
+ " y[:train_indices],\n",
+ " y[train_indices:],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Create the model\n",
+ "\n",
+ "We embed both users and movies into 50-dimensional vectors.\n",
+ "\n",
+ "The model computes a match score between user and movie embeddings via a dot product,\n",
+ "and adds a per-movie and per-user bias. The match score is scaled to the `[0, 1]`\n",
+ "interval via a sigmoid (since our ratings are normalized to this range)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "EMBEDDING_SIZE = 50\n",
+ "\n",
+ "\n",
+ "class RecommenderNet(keras.Model):\n",
+ " def __init__(self, num_users, num_movies, embedding_size, **kwargs):\n",
+ " super().__init__(**kwargs)\n",
+ " self.num_users = num_users\n",
+ " self.num_movies = num_movies\n",
+ " self.embedding_size = embedding_size\n",
+ " self.user_embedding = layers.Embedding(\n",
+ " num_users,\n",
+ " embedding_size,\n",
+ " embeddings_initializer=\"he_normal\",\n",
+ " embeddings_regularizer=keras.regularizers.l2(1e-6),\n",
+ " )\n",
+ " self.user_bias = layers.Embedding(num_users, 1)\n",
+ " self.movie_embedding = layers.Embedding(\n",
+ " num_movies,\n",
+ " embedding_size,\n",
+ " embeddings_initializer=\"he_normal\",\n",
+ " embeddings_regularizer=keras.regularizers.l2(1e-6),\n",
+ " )\n",
+ " self.movie_bias = layers.Embedding(num_movies, 1)\n",
+ "\n",
+ " def call(self, inputs):\n",
+ " user_vector = self.user_embedding(inputs[:, 0])\n",
+ " user_bias = self.user_bias(inputs[:, 0])\n",
+ " movie_vector = self.movie_embedding(inputs[:, 1])\n",
+ " movie_bias = self.movie_bias(inputs[:, 1])\n",
+ " dot_user_movie = ops.tensordot(user_vector, movie_vector, 2)\n",
+ " # Add all the components (including bias)\n",
+ " x = dot_user_movie + user_bias + movie_bias\n",
+ " # The sigmoid activation forces the rating to between 0 and 1\n",
+ " return ops.nn.sigmoid(x)\n",
+ "\n",
+ "\n",
+ "model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)\n",
+ "model.compile(\n",
+ " loss=keras.losses.BinaryCrossentropy(),\n",
+ " optimizer=keras.optimizers.Adam(learning_rate=0.001),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Train the model based on the data split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "history = model.fit(\n",
+ " x=x_train,\n",
+ " y=y_train,\n",
+ " batch_size=64,\n",
+ " epochs=5,\n",
+ " verbose=1,\n",
+ " validation_data=(x_val, y_val),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Plot training and validation loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "plt.plot(history.history[\"loss\"])\n",
+ "plt.plot(history.history[\"val_loss\"])\n",
+ "plt.title(\"model loss\")\n",
+ "plt.ylabel(\"loss\")\n",
+ "plt.xlabel(\"epoch\")\n",
+ "plt.legend([\"train\", \"test\"], loc=\"upper left\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Show top 10 movie recommendations to a user"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "movie_df = pd.read_csv(movielens_dir / \"movies.csv\")\n",
+ "\n",
+ "# Let us get a user and see the top recommendations.\n",
+ "user_id = df.userId.sample(1).iloc[0]\n",
+ "movies_watched_by_user = df[df.userId == user_id]\n",
+ "movies_not_watched = movie_df[\n",
+ " ~movie_df[\"movieId\"].isin(movies_watched_by_user.movieId.values)\n",
+ "][\"movieId\"]\n",
+ "movies_not_watched = list(\n",
+ " set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))\n",
+ ")\n",
+ "movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]\n",
+ "user_encoder = user2user_encoded.get(user_id)\n",
+ "user_movie_array = np.hstack(\n",
+ " ([[user_encoder]] * len(movies_not_watched), movies_not_watched)\n",
+ ")\n",
+ "ratings = model.predict(user_movie_array).flatten()\n",
+ "top_ratings_indices = ratings.argsort()[-10:][::-1]\n",
+ "recommended_movie_ids = [\n",
+ " movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices\n",
+ "]\n",
+ "\n",
+ "print(\"Showing recommendations for user: {}\".format(user_id))\n",
+ "print(\"====\" * 9)\n",
+ "print(\"Movies with high ratings from user\")\n",
+ "print(\"----\" * 8)\n",
+ "top_movies_user = (\n",
+ " movies_watched_by_user.sort_values(by=\"rating\", ascending=False)\n",
+ " .head(5)\n",
+ " .movieId.values\n",
+ ")\n",
+ "movie_df_rows = movie_df[movie_df[\"movieId\"].isin(top_movies_user)]\n",
+ "for row in movie_df_rows.itertuples():\n",
+ " print(row.title, \":\", row.genres)\n",
+ "\n",
+ "print(\"----\" * 8)\n",
+ "print(\"Top 10 movie recommendations\")\n",
+ "print(\"----\" * 8)\n",
+ "recommended_movies = movie_df[movie_df[\"movieId\"].isin(recommended_movie_ids)]\n",
+ "for row in recommended_movies.itertuples():\n",
+ " print(row.title, \":\", row.genres)"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [],
+ "name": "collaborative_filtering_movielens",
+ "private_outputs": false,
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/examples/structured_data/collaborative_filtering_movielens.py b/examples/structured_data/collaborative_filtering_movielens.py
index f5558ff9c3..8b9d8b6056 100644
--- a/examples/structured_data/collaborative_filtering_movielens.py
+++ b/examples/structured_data/collaborative_filtering_movielens.py
@@ -36,7 +36,7 @@
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
-from zipfile import ZipFile
+
import keras
from keras import layers
@@ -48,23 +48,16 @@
# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# Use the ratings.csv file
+# Use HTTPS and let Keras handle extraction automatically
movielens_data_file_url = (
- "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
+ "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
- "ml-latest-small.zip", movielens_data_file_url, extract=False
+ "ml-latest-small.zip", movielens_data_file_url, extract=True
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"
-# Only extract the data the first time the script is run.
-if not movielens_dir.exists():
- with ZipFile(movielens_zipped_file, "r") as zip:
- # Extract files
- print("Extracting all the files now...")
- zip.extractall(path=keras_datasets_path)
- print("Done!")
-
ratings_file = movielens_dir / "ratings.csv"
df = pd.read_csv(ratings_file)