elastic · maxjakob · Jan 30, 2024 · Jan 29, 2024
diff --git a/notebooks/document-chunking/tokenization.ipynb b/notebooks/document-chunking/tokenization.ipynb
@@ -7,7 +7,7 @@
     "id": "s49gpkvZ7q53"
    },
    "source": [
-    "# Tokenization for Semantic Search (ELSER and E5)\n",
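+    "# Calculating tokens for Semantic Search (ELSER and E5)\n",
     "\n",
-    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/search/tokenization.ipynb)\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/document-chunking/tokenization.ipynb)\n",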
     "\n",
@@ -217,9 +217,10 @@
    "outputs": [],
    "source": [
     "SEMANTIC_SEARCH_TOKEN_LIMIT = 510  # 512 minus space for the 2 special tokens\n",
+    "ELSER_TOKEN_OVERLAP = 0.5  # 50% token overlap between chunks is recommended for ELSER\n",
     "\n",
-    "def chunk(tokens, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT):\n",
-    "    step_size = round(chunk_size * .5)  # 50% token overlap between chunks is recommended for ELSER\n",
+    "def chunk(tokens, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT, overlap_ratio=ELSER_TOKEN_OVERLAP):\n",
+    "    step_size = max(1, round(chunk_size * (1 - overlap_ratio)))  # advance by the non-overlapping part of a chunk; never 0\n",
     "\n",
     "    for i in range(0, len(tokens), step_size):\n",
     "        yield tokens[i:i+chunk_size]"