From 444e05a42fad8d36939f5c9b133088662e5edc2b Mon Sep 17 00:00:00 2001
From: Max Jakob
Date: Tue, 30 Jan 2024 11:28:41 +0100
Subject: [PATCH] Tokenization notebook: change title (#178) and minor refactoring

---
 notebooks/document-chunking/tokenization.ipynb | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/notebooks/document-chunking/tokenization.ipynb b/notebooks/document-chunking/tokenization.ipynb
index cab24aa1..04b59506 100644
--- a/notebooks/document-chunking/tokenization.ipynb
+++ b/notebooks/document-chunking/tokenization.ipynb
@@ -7,7 +7,7 @@
    "id": "s49gpkvZ7q53"
   },
   "source": [
-    "# Tokenization for Semantic Search (ELSER and E5)\n",
+    "# Calculating tokens for Semantic Search (ELSER and E5)\n",
     "\n",
     "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/search/tokenization.ipynb)\n",
     "\n",
@@ -217,9 +217,10 @@
    "outputs": [],
    "source": [
     "SEMANTIC_SEARCH_TOKEN_LIMIT = 510 # 512 minus space for the 2 special tokens\n",
+    "ELSER_TOKEN_OVERLAP = 0.5 # 50% token overlap between chunks is recommended for ELSER\n",
     "\n",
-    "def chunk(tokens, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT):\n",
-    "    step_size = round(chunk_size * .5) # 50% token overlap between chunks is recommended for ELSER\n",
+    "def chunk(tokens, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT, overlap_ratio=ELSER_TOKEN_OVERLAP):\n",
+    "    step_size = round(chunk_size * overlap_ratio)\n",
     "\n",
     "    for i in range(0, len(tokens), step_size):\n",
     "        yield tokens[i:i+chunk_size]"
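
Note (not part of the patch): a minimal sketch of how the refactored generator slides over a token sequence. The toy values (a 10-token list, chunk_size=4) are made up so the 50% overlap is easy to see; the notebook itself uses SEMANTIC_SEARCH_TOKEN_LIMIT = 510.

    # Same logic as the patched cell, with toy defaults for illustration.
    def chunk(tokens, chunk_size=510, overlap_ratio=0.5):
        step_size = round(chunk_size * overlap_ratio)
        for i in range(0, len(tokens), step_size):
            yield tokens[i:i + chunk_size]

    tokens = list(range(10))
    print(list(chunk(tokens, chunk_size=4, overlap_ratio=0.5)))
    # [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9], [8, 9]]

With overlap_ratio=0.5 the window advances by half a chunk each step, so consecutive chunks share half their tokens, matching the old hard-coded `round(chunk_size * .5)` behavior.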