diff --git a/notebooks/search/08-learning-to-rank.ipynb b/notebooks/search/08-learning-to-rank.ipynb
index ccf94470..9b2e95f0 100644
--- a/notebooks/search/08-learning-to-rank.ipynb
+++ b/notebooks/search/08-learning-to-rank.ipynb
@@ -10,7 +10,7 @@
     "\n",
     "TODO: udpate the link to elastic/elasticsearch-labs instead of my fork before merging.\n",
     "\n",
-    "[](https://colab.research.google.com/github/afoucret/elasticsearch-labs/blob/ltr-notebook/notebooks/search/08-learning-to-rank.ipynb)\n",
+    "[](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/ltr-notebook/notebooks/search/08-learning-to-rank.ipynb)\n",
     "\n",
     "In this notebook we will see an example on how to train a Learning To Rank model using [XGBoost](https://xgboost.ai/) and how to deploy it to be used as a rescorer in Elasticsearch.\n",
     "\n",
@@ -136,9 +136,7 @@
    "source": [
     "from urllib.parse import urljoin\n",
     "\n",
-    "# TODO: use elastic/elasticsearch-labs instead of afoucret/elasticsearch-labs before merging the PR.\n",
-    "\n",
-    "DATASET_BASE_URL = \"https://raw.githubusercontent.com/afoucret/elasticsearch-labs/ltr-notebook/notebooks/search/sample_data/learning-to-rank/\"\n",
+    "DATASET_BASE_URL = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/ltr-notebook/notebooks/search/sample_data/learning-to-rank/\"\n",
     "\n",
     "CORPUS_URL = urljoin(DATASET_BASE_URL, \"movies-corpus.jsonl.gz\")\n",
     "JUDGEMENTS_FILE_URL = urljoin(DATASET_BASE_URL, \"movies-judgments.tsv.gz\")\n",
@@ -177,7 +175,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 8,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
@@ -192,7 +190,7 @@
      "text": [
       "Deleting index if it already exists: movies\n",
       "Creating index: movies\n",
-      "Loading the corpus from https://raw.githubusercontent.com/afoucret/elasticsearch-labs/ltr-notebook/notebooks/search/sample_data/learning-to-rank/movies-corpus.jsonl.gz\n",
+      "Loading the corpus from https://raw.githubusercontent.com/elastic/elasticsearch-labs/ltr-notebook/notebooks/search/sample_data/learning-to-rank/movies-corpus.jsonl.gz\n",
       "Indexing the corpus into movies ...\n",
       "Indexed 9750 documents into movies\n"
      ]
@@ -1037,12 +1035,12 @@
     "Once the model is uploaded to Elasticsearch, you will be able to use it as a rescorer in the _search API, as shown in this example:\n",
     "\n",
     "```\n",
-    "POST /_search\n",
+    "GET /movies/_search\n",
     "{\n",
     "  \"query\" : {\n",
     "    \"multi_match\" : {\n",
     "      \"query\": \"star wars\",\n",
-    "      \"field\": [\"title\", \"overview\", \"actors\", \"director\", \"tags\", \"characters\"]\n",
+    "      \"fields\": [\"title\", \"overview\", \"actors\", \"director\", \"tags\", \"characters\"]\n",
     "    }\n",
     "  },\n",
     "  \"rescore\" : {\n",
@@ -1154,59 +1152,6 @@
    "source": [
     "We saw above that the title and popularity fields are important ranking feature in our model. Here we can see that now all results contain the query terms in the title. Moreover, more popular movies rank higher, for example `Star Wars: Episode I - The Phantom Menace` is now in third position."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "XGBRanker(base_score=None, booster=None, callbacks=None, colsample_bylevel=None,\n",
-       "          colsample_bynode=None, colsample_bytree=None, device=None,\n",
-       "          early_stopping_rounds=20, enable_categorical=False,\n",
-       "          eval_metric=['ndcg@10'], feature_types=None, gamma=None,\n",
-       "          grow_policy=None, importance_type=None, interaction_constraints=None,\n",
-       "          learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
-       "          max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
-       "          max_leaves=None, min_child_weight=None, missing=nan,\n",
-       "          monotone_constraints=None, multi_strategy=None, n_estimators=None,\n",
-       "          n_jobs=None, num_parallel_tree=None, random_state=None, ...)"
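For reference, the parameters visible in the removed output above (notably `eval_metric=['ndcg@10']` and `early_stopping_rounds=20`) correspond to a ranker trained roughly as in the following sketch. The `judgments` DataFrame, its feature columns, and the `grade` labels are hypothetical stand-ins for the notebook's judgment data, not its actual training code.

```python
import pandas as pd
from xgboost import XGBRanker

# Hypothetical judgment list: one row per (query, document) pair, sorted by query.
# "query_id" groups documents belonging to the same query; the feature columns
# stand in for Elasticsearch-derived features (e.g. title BM25 score, popularity).
judgments = pd.DataFrame(
    {
        "query_id": [0, 0, 0, 1, 1, 1],
        "title_bm25": [12.3, 4.1, 0.0, 7.8, 2.2, 0.5],
        "popularity": [55.0, 10.0, 3.0, 80.0, 12.0, 1.0],
        "grade": [3, 1, 0, 2, 1, 0],  # graded relevance judgments
    }
)

X = judgments[["title_bm25", "popularity"]]
y = judgments["grade"]
qid = judgments["query_id"]

# eval_metric and early_stopping_rounds mirror the repr shown in the removed cell;
# early stopping needs an evaluation set, so the training data is reused here.
ranker = XGBRanker(eval_metric=["ndcg@10"], early_stopping_rounds=20)
ranker.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
```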