Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"\n",
"**Author:** [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)<br>\n",
"**Date created:** 2020/12/30<br>\n",
"**Last modified:** 2025/01/03<br>\n",
"**Last modified:** 2025/01/27<br>\n",
"**Description:** Rating prediction using the Behavior Sequence Transformer (BST) model on the MovieLens dataset."
]
},
Expand Down Expand Up @@ -82,17 +82,16 @@
"source": [
"import os\n",
"\n",
"os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n",
"os.environ[\"KERAS_BACKEND\"] = \"jax\" # or torch, or tensorflow\n",
"\n",
"import math\n",
"from zipfile import ZipFile\n",
"from urllib.request import urlretrieve\n",
"\n",
"import keras\n",
"import numpy as np\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"from keras import layers\n",
"\n",
"import keras\n",
"from keras import layers, ops\n",
"from keras.layers import StringLookup"
]
},
Expand Down Expand Up @@ -408,7 +407,8 @@
"\n",
"USER_FEATURES = [\"sex\", \"age_group\", \"occupation\"]\n",
"\n",
"MOVIE_FEATURES = [\"genres\"]"
"MOVIE_FEATURES = [\"genres\"]\n",
""
]
},
{
Expand All @@ -417,7 +417,30 @@
"colab_type": "text"
},
"source": [
"## Create `tf.data.Dataset` for training and evaluation"
"## Encode input features\n",
"\n",
"The `encode_input_features` function works as follows:\n",
"\n",
"1. Each categorical user feature is encoded using `layers.Embedding`, with an embedding\n",
"dimension equal to the square root of the vocabulary size of the feature.\n",
"The embeddings of these features are concatenated to form a single input tensor.\n",
"\n",
"2. Each movie in the movie sequence and the target movie is encoded using `layers.Embedding`,\n",
"where the dimension size is the square root of the number of movies.\n",
"\n",
"3. A multi-hot genres vector for each movie is concatenated with its embedding vector,\n",
"and processed using a non-linear `layers.Dense` to output a vector of the same movie\n",
"embedding dimensions.\n",
"\n",
"4. A positional embedding is added to each movie embedding in the sequence, and then\n",
"multiplied by its rating from the ratings sequence.\n",
"\n",
"5. The target movie embedding is concatenated to the sequence movie embeddings, producing\n",
"a tensor with the shape of `[batch size, sequence length, embedding size]`, as expected\n",
"by the attention layer for the transformer architecture.\n",
"\n",
"6. The function returns a tuple of two elements: `encoded_transformer_features` and\n",
"`encoded_other_features`."
]
},
{
Expand All @@ -428,25 +451,60 @@
},
"outputs": [],
"source": [
"# Required for tf.data.Dataset\n",
"import tensorflow as tf\n",
"\n",
"\n",
"def get_dataset_from_csv(csv_file_path, batch_size, shuffle=True):\n",
"\n",
" def process(features):\n",
" movie_ids_string = features[\"sequence_movie_ids\"]\n",
" sequence_movie_ids = tf.strings.split(movie_ids_string, \",\").to_tensor()\n",
"\n",
" # The last movie id in the sequence is the target movie.\n",
" features[\"target_movie_id\"] = sequence_movie_ids[:, -1]\n",
" features[\"sequence_movie_ids\"] = sequence_movie_ids[:, :-1]\n",
"\n",
" # Sequence ratings\n",
" ratings_string = features[\"sequence_ratings\"]\n",
" sequence_ratings = tf.strings.to_number(\n",
" tf.strings.split(ratings_string, \",\"), tf.dtypes.float32\n",
" ).to_tensor()\n",
"\n",
" # The last rating in the sequence is the target for the model to predict.\n",
" target = sequence_ratings[:, -1]\n",
" features[\"sequence_ratings\"] = sequence_ratings[:, :-1]\n",
"\n",
" def encoding_helper(feature_name):\n",
"\n",
" # This are target_movie_id and sequence_movie_ids and they have the same\n",
" # vocabulary as movie_id.\n",
" if feature_name not in CATEGORICAL_FEATURES_WITH_VOCABULARY:\n",
" vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[\"movie_id\"]\n",
" index_lookup = StringLookup(\n",
" vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n",
" )\n",
" # Convert the string input values into integer indices.\n",
" value_index = index_lookup(features[feature_name])\n",
" features[feature_name] = value_index\n",
" else:\n",
" # movie_id is not part of the features, hence not processed. It was mainly required\n",
" # for its vocabulary above.\n",
" if feature_name == \"movie_id\":\n",
" pass\n",
" else:\n",
" vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n",
" index_lookup = StringLookup(\n",
" vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n",
" )\n",
" # Convert the string input values into integer indices.\n",
" value_index = index_lookup(features[feature_name])\n",
" features[feature_name] = value_index\n",
"\n",
" # Encode the user features\n",
" for feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY:\n",
" encoding_helper(feature_name)\n",
" # Encoding target_movie_id and returning it as the target variable\n",
" encoding_helper(\"target_movie_id\")\n",
" # Encoding sequence movie_ids.\n",
" encoding_helper(\"sequence_movie_ids\")\n",
" return dict(features), target\n",
"\n",
" dataset = tf.data.experimental.make_csv_dataset(\n",
Expand All @@ -458,94 +516,14 @@
" field_delim=\"|\",\n",
" shuffle=shuffle,\n",
" ).map(process)\n",
"\n",
" return dataset\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text"
},
"source": [
"## Create model inputs"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab_type": "code"
},
"outputs": [],
"source": [
"\n",
"def create_model_inputs():\n",
" return {\n",
" \"user_id\": keras.Input(name=\"user_id\", shape=(1,), dtype=\"string\"),\n",
" \"sequence_movie_ids\": keras.Input(\n",
" name=\"sequence_movie_ids\", shape=(sequence_length - 1,), dtype=\"string\"\n",
" ),\n",
" \"target_movie_id\": keras.Input(\n",
" name=\"target_movie_id\", shape=(1,), dtype=\"string\"\n",
" ),\n",
" \"sequence_ratings\": keras.Input(\n",
" name=\"sequence_ratings\", shape=(sequence_length - 1,), dtype=tf.float32\n",
" ),\n",
" \"sex\": keras.Input(name=\"sex\", shape=(1,), dtype=\"string\"),\n",
" \"age_group\": keras.Input(name=\"age_group\", shape=(1,), dtype=\"string\"),\n",
" \"occupation\": keras.Input(name=\"occupation\", shape=(1,), dtype=\"string\"),\n",
" }\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text"
},
"source": [
"## Encode input features\n",
"\n",
"The `encode_input_features` method works as follows:\n",
"\n",
"1. Each categorical user feature is encoded using `layers.Embedding`, with an embedding\n",
"dimension equal to the square root of the vocabulary size of the feature.\n",
"The embeddings of these features are concatenated to form a single input tensor.\n",
"\n",
"2. Each movie in the movie sequence and the target movie is encoded using `layers.Embedding`,\n",
"where the dimension size is the square root of the number of movies.\n",
"\n",
"3. A multi-hot genres vector for each movie is concatenated with its embedding vector,\n",
"and processed using a non-linear `layers.Dense` to output a vector of the same movie\n",
"embedding dimensions.\n",
"\n",
"4. A positional embedding is added to each movie embedding in the sequence, and then\n",
"multiplied by its rating from the ratings sequence.\n",
"\n",
"5. The target movie embedding is concatenated to the sequence movie embeddings, producing\n",
"a tensor with the shape of `[batch size, sequence length, embedding size]`, as expected\n",
"by the attention layer for the transformer architecture.\n",
"\n",
"6. The method returns a tuple of two elements: `encoded_transformer_features` and\n",
"`encoded_other_features`."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab_type": "code"
},
"outputs": [],
"source": [
"\n",
"def encode_input_features(\n",
" inputs,\n",
" include_user_id=True,\n",
" include_user_features=True,\n",
" include_movie_features=True,\n",
" include_user_id,\n",
" include_user_features,\n",
" include_movie_features,\n",
"):\n",
" encoded_transformer_features = []\n",
" encoded_other_features = []\n",
Expand All @@ -558,11 +536,7 @@
"\n",
" ## Encode user features\n",
" for feature_name in other_feature_names:\n",
" # Convert the string input values into integer indices.\n",
" vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n",
" idx = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)(\n",
" inputs[feature_name]\n",
" )\n",
" # Compute embedding dimensions\n",
" embedding_dims = int(math.sqrt(len(vocabulary)))\n",
" # Create an embedding layer with the specified dimensions.\n",
Expand All @@ -572,7 +546,7 @@
" name=f\"{feature_name}_embedding\",\n",
" )\n",
" # Convert the index values to embedding representations.\n",
" encoded_other_features.append(embedding_encoder(idx))\n",
" encoded_other_features.append(embedding_encoder(inputs[feature_name]))\n",
"\n",
" ## Create a single embedding vector for the user features\n",
" if len(encoded_other_features) > 1:\n",
Expand All @@ -585,13 +559,6 @@
" ## Create a movie embedding encoder\n",
" movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[\"movie_id\"]\n",
" movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))\n",
" # Create a lookup to convert string values to integer indices.\n",
" movie_index_lookup = StringLookup(\n",
" vocabulary=movie_vocabulary,\n",
" mask_token=None,\n",
" num_oov_indices=0,\n",
" name=\"movie_index_lookup\",\n",
" )\n",
" # Create an embedding layer with the specified dimensions.\n",
" movie_embedding_encoder = layers.Embedding(\n",
" input_dim=len(movie_vocabulary),\n",
Expand All @@ -617,11 +584,10 @@
" ## Define a function to encode a given movie id.\n",
" def encode_movie(movie_id):\n",
" # Convert the string input values into integer indices.\n",
" movie_idx = movie_index_lookup(movie_id)\n",
" movie_embedding = movie_embedding_encoder(movie_idx)\n",
" movie_embedding = movie_embedding_encoder(movie_id)\n",
" encoded_movie = movie_embedding\n",
" if include_movie_features:\n",
" movie_genres_vector = movie_genres_lookup(movie_idx)\n",
" movie_genres_vector = movie_genres_lookup(movie_id)\n",
" encoded_movie = movie_embedding_processor(\n",
" layers.concatenate([movie_embedding, movie_genres_vector])\n",
" )\n",
Expand All @@ -640,11 +606,11 @@
" output_dim=movie_embedding_dims,\n",
" name=\"position_embedding\",\n",
" )\n",
" positions = tf.range(start=0, limit=sequence_length - 1, delta=1)\n",
" positions = ops.arange(start=0, stop=sequence_length - 1, step=1)\n",
" encodded_positions = position_embedding_encoder(positions)\n",
" # Retrieve sequence ratings to incorporate them into the encoding of the movie.\n",
" sequence_ratings = inputs[\"sequence_ratings\"]\n",
" sequence_ratings = keras.ops.expand_dims(sequence_ratings, -1)\n",
" sequence_ratings = ops.expand_dims(sequence_ratings, -1)\n",
" # Add the positional encoding to the movie encodings and multiply them by rating.\n",
" encoded_sequence_movies_with_poistion_and_rating = layers.Multiply()(\n",
" [(encoded_sequence_movies + encodded_positions), sequence_ratings]\n",
Expand All @@ -653,18 +619,53 @@
" # Construct the transformer inputs.\n",
" for i in range(sequence_length - 1):\n",
" feature = encoded_sequence_movies_with_poistion_and_rating[:, i, ...]\n",
" feature = keras.ops.expand_dims(feature, 1)\n",
" feature = ops.expand_dims(feature, 1)\n",
" encoded_transformer_features.append(feature)\n",
" encoded_transformer_features.append(encoded_target_movie)\n",
"\n",
" encoded_transformer_features = layers.concatenate(\n",
" encoded_transformer_features, axis=1\n",
" )\n",
"\n",
" return encoded_transformer_features, encoded_other_features\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text"
},
"source": [
"## Create model inputs"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab_type": "code"
},
"outputs": [],
"source": [
"\n",
"def create_model_inputs():\n",
" return {\n",
" \"user_id\": keras.Input(name=\"user_id\", shape=(1,), dtype=\"int32\"),\n",
" \"sequence_movie_ids\": keras.Input(\n",
" name=\"sequence_movie_ids\", shape=(sequence_length - 1,), dtype=\"int32\"\n",
" ),\n",
" \"target_movie_id\": keras.Input(\n",
" name=\"target_movie_id\", shape=(1,), dtype=\"int32\"\n",
" ),\n",
" \"sequence_ratings\": keras.Input(\n",
" name=\"sequence_ratings\", shape=(sequence_length - 1,), dtype=\"float32\"\n",
" ),\n",
" \"sex\": keras.Input(name=\"sex\", shape=(1,), dtype=\"int32\"),\n",
" \"age_group\": keras.Input(name=\"age_group\", shape=(1,), dtype=\"int32\"),\n",
" \"occupation\": keras.Input(name=\"occupation\", shape=(1,), dtype=\"int32\"),\n",
" }\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {
Expand Down Expand Up @@ -692,11 +693,11 @@
"\n",
"\n",
"def create_model():\n",
"\n",
" inputs = create_model_inputs()\n",
" transformer_features, other_features = encode_input_features(\n",
" inputs, include_user_id, include_user_features, include_movie_features\n",
" )\n",
"\n",
" # Create a multi-headed attention layer.\n",
" attention_output = layers.MultiHeadAttention(\n",
" num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate\n",
Expand All @@ -713,7 +714,7 @@
" transformer_features = layers.LayerNormalization()(transformer_features)\n",
" features = layers.Flatten()(transformer_features)\n",
"\n",
" # Included the other features.\n",
" # Included the other_features.\n",
" if other_features is not None:\n",
" features = layers.concatenate(\n",
" [features, layers.Reshape([other_features.shape[-1]])(other_features)]\n",
Expand All @@ -725,7 +726,6 @@
" features = layers.BatchNormalization()(features)\n",
" features = layers.LeakyReLU()(features)\n",
" features = layers.Dropout(dropout_rate)(features)\n",
"\n",
" outputs = layers.Dense(units=1)(features)\n",
" model = keras.Model(inputs=inputs, outputs=outputs)\n",
" return model\n",
Expand Down Expand Up @@ -759,6 +759,7 @@
")\n",
"\n",
"# Read the training data.\n",
"\n",
"train_dataset = get_dataset_from_csv(\"train_data.csv\", batch_size=265, shuffle=True)\n",
"\n",
"# Fit the model with the training data.\n",
Expand Down
Loading