567-labs · jxnl · Dec 21, 2023 · Dec 21, 2023 · Dec 21, 2023 · Dec 21, 2023
diff --git a/applications/wikipedia/benchmarks.json b/applications/wikipedia/benchmarks.json
@@ -583,3 +583,14 @@
   "characters_per_sec": 21734383,
   "extrapolated_duration": "0:14:59.981332"
 }
+{
+  "downscale": 0.05,
+  "batch_size": 80,
+  "n_gpu": 100,
+  "duration_mins": 2.2918854606833334,
+  "characters_per_sec": 28454999,
+  "extrapolated_duration": "0:11:27.420125",
+  "model": "jinaai/jina-embeddings-v2-small-en",
+  "model_batch_size": 8,
+  "model_token_window": 6000
+}
diff --git a/applications/wikipedia/main.py b/applications/wikipedia/main.py
@@ -3,16 +3,34 @@
 import asyncio
 import subprocess
 from pathlib import Path
-import time
 
 from modal import Image, Stub, Volume, gpu, method, Secret
 
 N_GPU = 100
 GPU_CONFIG = gpu.A10G()
-MODEL_ID = "BAAI/bge-small-en-v1.5"
-MODEL_SLUG = MODEL_ID.split("/")[-1]
 
-BATCH_SIZE = 512
+MODEL_CONFIG = {
+    "jinaai/jina-embeddings-v2-small-en": {
+        "batch_size": 8,
+        "token_window": 6000,
+        "slug": "jina-embeddings-v2-small-en",
+    },
+    "BAAI/bge-small-en-v1.5": {
+        "batch_size": 512,
+        "token_window": 400,
+        "slug": "bge-small-en-v1.5",
+    },
+    "BAAI/bge-base-en-v1.5": {
+        "batch_size": 256,
+        "token_window": 400,
+        "slug": "bge-base-en-v1.5",
+    },
+}
+
+MODEL_ID = "BAAI/bge-small-en-v1.5"
+BATCH_SIZE = MODEL_CONFIG[MODEL_ID]["batch_size"]
+TOKEN_WINDOW = MODEL_CONFIG[MODEL_ID]["token_window"]
+MODEL_SLUG = MODEL_CONFIG[MODEL_ID]["slug"]
 DOCKER_IMAGE = (
     "ghcr.io/huggingface/text-embeddings-inference:86-0.4.0"  # Ampere 86 for A10s.
     # "ghcr.io/huggingface/text-embeddings-inference:0.4.0" # Ampere 80 for A100s.
@@ -25,8 +43,8 @@
 DATA_PATH = Path(data_dir)
 
 SAVE_TO_DISK = True
-dataset_name = f"567-labs/wikipedia-embedding-{MODEL_SLUG}-sample"
-dataset_file = "wiki-embeddings.parquet"
+dataset_name = f"567-labs/wikipedia-embedding-{MODEL_SLUG}-five-percent"
+dataset_file = f"wiki-embeddings-{MODEL_SLUG}.parquet"
 
 LAUNCH_FLAGS = [
     "--model-id",
@@ -36,7 +54,7 @@
     "--max-client-batch-size",
     str(BATCH_SIZE),
     "--max-batch-tokens",
-    str(BATCH_SIZE * 512),
+    str(TOKEN_WINDOW * BATCH_SIZE),
 ]
 
 
@@ -82,13 +100,13 @@ def download_model():
     import numpy as np
 
 
-def generate_chunks_from_dataset(xs, chunk_size: int):
+def generate_chunks_from_dataset(xs, chunk_size: int = 3000, step: int = 1500):
     for data in xs:
         id_ = data["id"]
         url = data["url"]
         title = data["title"]
         text = data["text"]
-        for chunk_start in range(0, len(text), chunk_size):
+        for chunk_start in range(0, len(text) - chunk_size + 1, step):
             yield (
                 id_,
                 url,
@@ -183,7 +201,9 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
 
     print(f"Working with {sample_size} rows")
 
-    text_chunks = generate_chunks_from_dataset(subset, chunk_size=512)
+    text_chunks = generate_chunks_from_dataset(
+        subset, chunk_size=TOKEN_WINDOW, step=TOKEN_WINDOW // 2
+    )
     batches = generate_batches(text_chunks, batch_size=batch_size)
 
     start = time.perf_counter()
@@ -214,6 +234,9 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
         "duration_mins": duration / 60,
         "characters_per_sec": characters_per_sec,
         "extrapolated_duration": extrapolated_duration_cps_fmt,
+        "model": MODEL_ID,
+        "model_batch_size": BATCH_SIZE,
+        "model_token_window": TOKEN_WINDOW,
     }
 
     print(json.dumps(resp, indent=2))
@@ -232,15 +255,15 @@ def embed_dataset(down_scale: float = 0.005, batch_size: int = 512 * 50):
         )
         print(f"Saving to disk at {cache_dir}/{dataset_file}")
         pq.write_table(table, f"{cache_dir}/{dataset_file}")
-        volume.commit()
-
+        dataset = load_dataset("parquet", data_files=f"{cache_dir}/{dataset_file}")
+        dataset.push_to_hub(dataset_name, token=os.environ["HUGGINGFACE_TOKEN"])
     return resp
 
 
 @stub.local_entrypoint()
 def main():
-    scale = 0.01
-    batch_size = 512 * 150
-    with open("benchmarks.json", "a") as f:
-        benchmark = embed_dataset.remote(down_scale=scale, batch_size=batch_size)
-        f.write(json.dumps(benchmark, indent=2) + "\n")
+    for scale, batch_size in product([0.05], [BATCH_SIZE * 10]):
+        with open("benchmarks.json", "a") as f:
+            benchmark = embed_dataset.remote(down_scale=scale, batch_size=batch_size)
+            print(json.dumps(benchmark, indent=2))
+            f.write(json.dumps(benchmark, indent=2) + "\n")
diff --git a/docs/embedding.md b/docs/embedding.md
@@ -1,4 +1,4 @@
-# Embedding All of Wikipedia under 2 hours
+# Embedding All of Wikipedia under 15 minutes
 
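-Embedding hundreds of gigabytes of text data can be a daunting task, especially when limited to making batch requests to a remote API. This article explores how Modal can be used to efficiently embed the huggingface Simple English Wikipedia in under 30 minutes. By mounting the data into a Modal volume and running the embedding function in parallel across multiple GPUs, we can achieve this.
+Embedding hundreds of gigabytes of text data can be a daunting task, especially when limited to making batch requests to a remote API. This article explores how Modal can be used to efficiently embed the huggingface Simple English Wikipedia in under 15 minutes. By mounting the data into a Modal volume and running the embedding function in parallel across multiple GPUs, we can achieve this.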
 
@@ -398,4 +398,4 @@ Today's we went through a few foundational concepts that are key to taking advan
 
 Having the ability to scale unlocks new business use cases for companies that can now iterate on production models more quickly and efficiently. By shortening the feedback loop with Modal's serverless gpus, companies are then freed up to focus on experimentation and deployment.
 
-You can check out some datasets [here](https://huggingface.co/567-labs) containing embeddings that we computed using some popular open source embedding models. We've also uploaded our code [here](#todo) where we also showcase how to upload the generated embeddings to Hugging Face.
+You can check out some datasets [here](https://huggingface.co/567-labs) containing embeddings that we computed using some popular open-source embedding models. We've also uploaded our code [here](#todo), where we showcase how to upload the generated embeddings to Hugging Face.