Review suggestions + typo fixes

afoucret · afoucret · commit 8e84eb92c294 · 2024-01-30T21:09:20.000+01:00
diff --git a/notebooks/search/08-learning-to-rank.ipynb b/notebooks/search/08-learning-to-rank.ipynb
@@ -713,18 +713,18 @@
     "feature_logger = FeatureLogger(es_client, MOVIE_INDEX, ltr_config)\n",
     "\n",
     "\n",
-    "# This method will be applied for each group of query in the judgment log:\n",
+    "# This method will be applied for each query group in the judgment log:\n",
     "def _extract_query_features(query_judgements_group):\n",
     "    # Retrieve document ids in the query group as strings.\n",
     "    doc_ids = query_judgements_group[\"doc_id\"].astype(\"str\").to_list()\n",
     "\n",
-    "    # Resolve query paras for the current query group (e.g.: {\"query\": \"batman\"}).\n",
+    "    # Resolve query params for the current query group (e.g.: {\"query\": \"batman\"}).\n",
     "    query_params = {\"query\": query_judgements_group[\"query\"].iloc[0]}\n",
     "\n",
     "    # Extract the features for the documents in the query group:\n",
     "    doc_features = feature_logger.extract_features(query_params, doc_ids)\n",
     "\n",
-    "    # Adding a column to the dataframe for each features:\n",
+    "    # Adding a column to the dataframe for each feature:\n",
     "    for feature_index, feature_name in enumerate(ltr_config.feature_names):\n",
     "        query_judgements_group[feature_name] = numpy.array([doc_features[doc_id][feature_index] for doc_id in doc_ids])\n",
     "\n",
@@ -919,8 +919,8 @@
     "# Split the dataset in two parts respectively used for training and evaluation of the model.\n",
     "group_preserving_splitter = GroupShuffleSplit(n_splits=1, train_size=0.7).split(X, y, groups)\n",
     "train_idx, eval_idx = next(group_preserving_splitter)\n",
-    "train_features, eval_features = X.loc[train_idx], X.loc[eval_idx]\n",
     "\n",
+    "train_features, eval_features = X.loc[train_idx], X.loc[eval_idx]\n",
     "train_target, eval_target = y.loc[train_idx], y.loc[eval_idx]\n",
     "train_query_groups, eval_query_groups = groups.loc[train_idx], groups.loc[eval_idx]\n",
     "\n",
@@ -1138,6 +1138,13 @@
     "    for movie in rescored_search_response[\"hits\"][\"hits\"]\n",
     "]"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We saw above that the title and popularity fields are important ranking features in our model. Here we can see that all results now contain the query terms in the title. Moreover, more popular movies rank higher; for example, `Star Wars: Episode I - The Phantom Menace` is now in third position."
+   ]
   }
  ],
  "metadata": {