From 8e84eb92c29401b6ba1cdb5f1782f8283114bd5d Mon Sep 17 00:00:00 2001 From: Aurelien FOUCRET Date: Tue, 30 Jan 2024 21:09:20 +0100 Subject: [PATCH] Review suggestions + typos fixes --- notebooks/search/08-learning-to-rank.ipynb | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/notebooks/search/08-learning-to-rank.ipynb b/notebooks/search/08-learning-to-rank.ipynb index b1b9e0cc..240bdfd9 100644 --- a/notebooks/search/08-learning-to-rank.ipynb +++ b/notebooks/search/08-learning-to-rank.ipynb @@ -713,18 +713,18 @@ "feature_logger = FeatureLogger(es_client, MOVIE_INDEX, ltr_config)\n", "\n", "\n", - "# This method will be applied for each group of query in the judgment log:\n", + "# This method will be applied for each query group in the judgment log:\n", "def _extract_query_features(query_judgements_group):\n", " # Retrieve document ids in the query group as strings.\n", " doc_ids = query_judgements_group[\"doc_id\"].astype(\"str\").to_list()\n", "\n", - " # Resolve query paras for the current query group (e.g.: {\"query\": \"batman\"}).\n", + " # Resolve query params for the current query group (e.g.: {\"query\": \"batman\"}).\n", " query_params = {\"query\": query_judgements_group[\"query\"].iloc[0]}\n", "\n", " # Extract the features for the documents in the query group:\n", " doc_features = feature_logger.extract_features(query_params, doc_ids)\n", "\n", - " # Adding a column to the dataframe for each features:\n", + " # Adding a column to the dataframe for each feature:\n", " for feature_index, feature_name in enumerate(ltr_config.feature_names):\n", " query_judgements_group[feature_name] = numpy.array([doc_features[doc_id][feature_index] for doc_id in doc_ids])\n", "\n", @@ -919,8 +919,8 @@ "# Split the dataset in two parts respectively used for training and evaluation of the model.\n", "group_preserving_splitter = GroupShuffleSplit(n_splits=1, train_size=0.7).split(X, y, groups)\n", "train_idx, eval_idx = 
next(group_preserving_splitter)\n", - "train_features, eval_features = X.loc[train_idx], X.loc[eval_idx]\n", "\n", + "train_features, eval_features = X.loc[train_idx], X.loc[eval_idx]\n", "train_target, eval_target = y.loc[train_idx], y.loc[eval_idx]\n", "train_query_groups, eval_query_groups = groups.loc[train_idx], groups.loc[eval_idx]\n", "\n", @@ -1138,6 +1138,13 @@ " for movie in rescored_search_response[\"hits\"][\"hits\"]\n", "]" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We saw above that the title and popularity fields are important ranking features in our model. Here we can see that now all results contain the query terms in the title. Moreover, more popular movies rank higher; for example, `Star Wars: Episode I - The Phantom Menace` is now in third position." + ] } ], "metadata": {