From d04da867f31bb7969ab0a6595c1f9e9626efcc60 Mon Sep 17 00:00:00 2001
From: ewgenius <hey@ewgenius.me>
Date: Wed, 22 Apr 2026 23:19:13 +0900
Subject: [PATCH 1/7] Add Elasticsearch connector and vector engine recipes

- Add `elasticsearch/connector` recipe: federated SQL access to
  Elasticsearch indices, including sample data generation, Docker
  Compose, and all-types coverage.
- Add `vectors/elasticsearch` recipe: demonstrates using Elasticsearch
  as a vector engine, with sample data, Docker Compose, and Spice
  configuration for embedding and hybrid search.
- Include detailed READMEs, data generation scripts, and example
  `spicepod.yaml` files for both recipes.
---
 elasticsearch/connector/README.md          |  93 +++
 elasticsearch/connector/docker-compose.yml |  49 ++
 elasticsearch/connector/generate_data.py   | 681 +++++++++++++++++++++
 elasticsearch/connector/load_data.py       | 394 ++++++++++++
 elasticsearch/connector/spicepod.yaml      |  32 +
 vectors/elasticsearch/README.md            | 103 ++++
 vectors/elasticsearch/docker-compose.yml   |  28 +
 vectors/elasticsearch/generate_data.py     | 574 +++++++++++++++++
 vectors/elasticsearch/spicepod.yaml        |  43 ++
 9 files changed, 1997 insertions(+)
 create mode 100644 elasticsearch/connector/README.md
 create mode 100644 elasticsearch/connector/docker-compose.yml
 create mode 100644 elasticsearch/connector/generate_data.py
 create mode 100644 elasticsearch/connector/load_data.py
 create mode 100644 elasticsearch/connector/spicepod.yaml
 create mode 100644 vectors/elasticsearch/README.md
 create mode 100644 vectors/elasticsearch/docker-compose.yml
 create mode 100644 vectors/elasticsearch/generate_data.py
 create mode 100644 vectors/elasticsearch/spicepod.yaml

diff --git a/elasticsearch/connector/README.md b/elasticsearch/connector/README.md
new file mode 100644
index 0000000..f238289
--- /dev/null
+++ b/elasticsearch/connector/README.md
@@ -0,0 +1,93 @@
+# Elasticsearch Data Connector
+
+Works with `v2.0+`
+
+This recipe demonstrates how to query Elasticsearch indices from Spice using federated SQL. It includes:
+
+- `articles` — a federated dataset queried directly from Elasticsearch
+- `all_types` — a federated dataset covering supported Elasticsearch field types
+
+The Elasticsearch connector can also power `vector_search`, `text_search`, and `rrf` for indices that contain the required search fields.
+
+## Prerequisites
+
+- [Spice CLI](https://docs.spiceai.org/getting-started) installed
+- Docker installed
+
+## Getting Started
+
+### Step 1: Prepare the recipe directory
+
+Change into the recipe directory:
+
+```bash
+cd cookbook/elasticsearch/connector
+```
+
+### Step 2: Start Elasticsearch and seed the sample indices
+
+Start the local Elasticsearch service and seed the sample data:
+
+```bash
+docker compose up
+```
+
+Keep this running while you use the recipe.
+
+### Step 3: Start the Spice runtime
+
+In a new terminal, start the Spice runtime:
+
+```bash
+spice run
+```
+
+### Step 4: Open the Spice SQL REPL
+
+In another terminal, open the Spice SQL REPL:
+
+```bash
+spice sql
+```
+
+### Step 5: Run a few example queries
+
+Run a few basic federated SQL queries to verify the Elasticsearch datasets are available.
+
+Query the `articles` index:
+
+```sql
+SELECT id, title, category, author
+FROM articles
+WHERE category = 'machine_learning'
+LIMIT 10;
+```
+
+Inspect one row from the `all_types` index:
+
+```sql
+SELECT *
+FROM all_types
+LIMIT 1;
+```
+
+Filter the `all_types` index on a keyword field:
+
+```sql
+SELECT id, field_keyword, field_integer
+FROM all_types
+WHERE field_keyword = 'category_0';
+```
+
+## Notes
+
+- `articles` and `all_types` are queried directly from Elasticsearch.
+- The connector maps Elasticsearch index mappings to Arrow schemas so the data can be queried with SQL in Spice.
+- For indices with compatible search fields, Spice can route `vector_search`, `text_search`, and `rrf` to Elasticsearch.
+
+## Learn more
+
+- [Elasticsearch Data Connector Documentation](https://spiceai.org/docs/components/data-connectors/elasticsearch)
+- [Search Functionality Documentation](https://spiceai.org/docs/features/search)
+- [Datasets Reference](https://docs.spiceai.org/reference/spicepod/datasets)
+- [Spice SQL CLI Reference](https://docs.spiceai.org/cli/reference/sql)
diff --git a/elasticsearch/connector/docker-compose.yml b/elasticsearch/connector/docker-compose.yml
new file mode 100644
index 0000000..561fa6f
--- /dev/null
+++ b/elasticsearch/connector/docker-compose.yml
@@ -0,0 +1,49 @@
+services:
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4
+    container_name: es01
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=true
+      - xpack.security.http.ssl.enabled=false
+      - ES_JAVA_OPTS=-Xms512m -Xmx512m
+      - ELASTIC_PASSWORD=spiceai
+    ports:
+      - "9200:9200"
+    volumes:
+      - esdata:/usr/share/elasticsearch/data
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "curl -sf -u elastic:spiceai http://localhost:9200/_cluster/health | grep -qE '\"status\":\"(green|yellow)\"'",
+        ]
+      interval: 10s
+      timeout: 10s
+      retries: 15
+      start_period: 40s
+
+  es-init:
+    image: python:3.12-slim
+    container_name: es-init
+    depends_on:
+      elasticsearch:
+        condition: service_healthy
+    volumes:
+      - ./generate_data.py:/app/generate_data.py:ro
+      - ./load_data.py:/app/load_data.py:ro
+    working_dir: /app
+    environment:
+      - ES_HOST=http://elasticsearch:9200
+      - ES_USER=elastic
+      - ES_PASS=spiceai
+    command: >
+      bash -c "
+        pip install --quiet pandas pyarrow faker requests &&
+        python generate_data.py &&
+        python load_data.py --all-types
+      "
+
+volumes:
+  esdata:
+    driver: local
diff --git a/elasticsearch/connector/generate_data.py b/elasticsearch/connector/generate_data.py
new file mode 100644
index 0000000..1bf9399
--- /dev/null
+++ b/elasticsearch/connector/generate_data.py
@@ -0,0 +1,681 @@
+#!/usr/bin/env python3
+"""
+Generate articles.parquet — sample data for Spice Elasticsearch connector tests.
+
+Produces ~15 000 records with sufficient lexical and semantic diversity to
+meaningfully exercise:
+  - Data connector (federated SQL via elasticsearch:articles)
+  - Vector search  (vector_search UDTF)
+  - Full-text search (text_search UDTF / BM25)
+  - Hybrid search with RRF (rrf(vector_search(...), text_search(...)))
+
+Usage:
+  pip install pandas pyarrow faker
+  python generate_data.py [--rows N]   # default 15000
+"""
+
+import argparse
+import random
+from datetime import datetime, timedelta
+from itertools import product
+
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+from faker import Faker
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+parser = argparse.ArgumentParser()
+parser.add_argument("--rows", type=int, default=100)
+parser.add_argument("--out", default="articles.parquet")
+parser.add_argument(
+    "--embeddings",
+    action="store_true",
+    help="Compute content_embedding column using all-MiniLM-L6-v2 and write into parquet",
+)
+parser.add_argument(
+    "--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2"
+)
+parser.add_argument("--embedding-batch-size", type=int, default=64)
+args = parser.parse_args()
+
+TARGET_ROWS = args.rows
+OUT_PATH = args.out
+
+fake = Faker()
+random.seed(42)
+Faker.seed(42)
+
+# ---------------------------------------------------------------------------
+# Taxonomy
+# ---------------------------------------------------------------------------
+TOPICS = [
+    {
+        "category": "machine_learning",
+        "subtopics": [
+            "neural networks",
+            "deep learning",
+            "gradient descent",
+            "backpropagation",
+            "transformer models",
+            "attention mechanisms",
+            "fine-tuning",
+            "transfer learning",
+            "reinforcement learning",
+            "generative adversarial networks",
+            "diffusion models",
+            "contrastive learning",
+            "self-supervised learning",
+            "few-shot learning",
+            "model pruning",
+            "knowledge distillation",
+            "hyperparameter tuning",
+            "AutoML",
+            "federated learning",
+            "meta-learning",
+        ],
+        "title_templates": [
+            "A Practical Guide to {sub}",
+            "{sub} Explained: From Theory to Production",
+            "How {sub} Is Transforming AI Applications",
+            "Deep Dive: {sub} in Modern ML Pipelines",
+            "Benchmarking {sub} Across Popular Frameworks",
+            "Common Pitfalls in {sub} and How to Avoid Them",
+            "Scaling {sub} to Billions of Parameters",
+            "{sub}: State of the Art in {year}",
+            "Understanding {sub} Through Mathematical Intuition",
+            "Implementing {sub} Without a PhD",
+        ],
+    },
+    {
+        "category": "search_engines",
+        "subtopics": [
+            "BM25 ranking",
+            "inverted indexes",
+            "vector similarity search",
+            "semantic search",
+            "full-text search",
+            "dense retrieval",
+            "sparse retrieval",
+            "hybrid search",
+            "reciprocal rank fusion",
+            "embedding models",
+            "approximate nearest neighbor",
+            "HNSW graphs",
+            "product quantization",
+            "query expansion",
+            "relevance feedback",
+            "learning to rank",
+            "query rewriting",
+            "cross-encoder reranking",
+            "bi-encoder retrieval",
+            "knowledge graph search",
+        ],
+        "title_templates": [
+            "How {sub} Powers Modern Search",
+            "{sub} Under the Hood: Architecture and Trade-offs",
+            "Building Production-Ready {sub} Systems",
+            "Evaluating {sub}: Metrics That Matter",
+            "{sub} at Scale: Lessons from the Field",
+            "Comparing {sub} Approaches in {year}",
+            "A Developer's Guide to {sub}",
+            "When to Use {sub} vs Traditional Keyword Search",
+            "Optimising {sub} for Low-Latency Workloads",
+            "Open-Source Tools for {sub}",
+        ],
+    },
+    {
+        "category": "data_engineering",
+        "subtopics": [
+            "Apache Parquet",
+            "columnar storage",
+            "data pipelines",
+            "ETL workflows",
+            "stream processing",
+            "Apache Kafka",
+            "data lakes",
+            "schema evolution",
+            "Apache Iceberg",
+            "Delta Lake",
+            "data mesh",
+            "data contracts",
+            "change data capture",
+            "Apache Spark",
+            "dbt transformations",
+            "data quality checks",
+            "lineage tracking",
+            "metadata management",
+            "batch ingestion",
+            "real-time ingestion",
+        ],
+        "title_templates": [
+            "{sub}: The Definitive Overview",
+            "Why {sub} Is Essential for Modern Data Teams",
+            "Getting Started with {sub} in {year}",
+            "{sub} Best Practices for Large-Scale Workloads",
+            "Migrating to {sub}: A Step-by-Step Guide",
+            "Debugging {sub} in Production",
+            "{sub} vs Traditional Approaches: A Comparison",
+            "How {sub} Reduces Engineering Toil",
+            "Monitoring {sub} Pipelines at Scale",
+            "Cost Optimisation Strategies for {sub}",
+        ],
+    },
+    {
+        "category": "databases",
+        "subtopics": [
+            "SQL query optimisation",
+            "index design",
+            "ACID transactions",
+            "NoSQL modelling",
+            "vector databases",
+            "distributed databases",
+            "replication strategies",
+            "sharding",
+            "MVCC",
+            "write-ahead logging",
+            "B-tree indexes",
+            "LSM-tree storage",
+            "columnar databases",
+            "time-series databases",
+            "graph databases",
+            "in-memory databases",
+            "NewSQL",
+            "CockroachDB",
+            "FoundationDB",
+            "database connection pooling",
+        ],
+        "title_templates": [
+            "{sub} for High-Throughput Applications",
+            "A Deep Dive into {sub}",
+            "{sub}: Concepts Every Engineer Should Know",
+            "Production Lessons from Running {sub} at Scale",
+            "Choosing Between {sub} and Alternatives",
+            "How {sub} Impacts Query Performance",
+            "{sub} Internals Explained",
+            "Migrating from Legacy Systems to {sub}",
+            "Benchmarking {sub} in Cloud Environments",
+            "Debugging Slow Queries with {sub}",
+        ],
+    },
+    {
+        "category": "cloud_infrastructure",
+        "subtopics": [
+            "Kubernetes orchestration",
+            "container deployments",
+            "service meshes",
+            "auto-scaling",
+            "observability stacks",
+            "OpenTelemetry",
+            "cost optimisation",
+            "serverless functions",
+            "GitOps workflows",
+            "infrastructure as code",
+            "zero-trust networking",
+            "secrets management",
+            "multi-cloud strategies",
+            "disaster recovery",
+            "chaos engineering",
+            "FinOps",
+            "edge computing",
+            "platform engineering",
+            "developer portals",
+            "SLO management",
+        ],
+        "title_templates": [
+            "{sub} in Practice: Real-World Patterns",
+            "How {sub} Enables Reliable Systems",
+            "{sub} for Platform Engineers",
+            "Scaling {sub} in Enterprise Environments",
+            "Automating {sub} with Modern Tooling",
+            "{sub}: A Comprehensive Tutorial for {year}",
+            "Common Mistakes with {sub} and How to Fix Them",
+            "Security Considerations for {sub}",
+            "How We Cut Costs by Optimising {sub}",
+            "Building Self-Service Infrastructure with {sub}",
+        ],
+    },
+    {
+        "category": "software_engineering",
+        "subtopics": [
+            "clean code principles",
+            "design patterns",
+            "microservices architecture",
+            "domain-driven design",
+            "event sourcing",
+            "CQRS",
+            "API design",
+            "test-driven development",
+            "continuous integration",
+            "code review practices",
+            "technical debt",
+            "refactoring strategies",
+            "distributed tracing",
+            "circuit breakers",
+            "rate limiting",
+            "idempotency",
+            "concurrency models",
+            "async programming",
+            "type systems",
+            "compiler design",
+        ],
+        "title_templates": [
+            "Applying {sub} in Real Projects",
+            "{sub}: From Basics to Advanced Techniques",
+            "How {sub} Improves System Reliability",
+            "A Pragmatic Guide to {sub}",
+            "Interview Questions on {sub} and How to Answer Them",
+            "{sub} Patterns Every Senior Engineer Should Know",
+            "Refactoring Legacy Code with {sub}",
+            "When {sub} Makes Sense (and When It Doesn't)",
+            "{sub} in Distributed Systems",
+            "Teaching {sub} to Junior Developers",
+        ],
+    },
+    {
+        "category": "security",
+        "subtopics": [
+            "zero-trust architecture",
+            "OAuth 2.0",
+            "JWT authentication",
+            "supply chain security",
+            "SBOM",
+            "vulnerability scanning",
+            "penetration testing",
+            "secrets rotation",
+            "mTLS",
+            "RBAC",
+            "OWASP top 10",
+            "runtime security",
+            "SAST and DAST",
+            "CVE management",
+            "encryption at rest",
+            "key management",
+            "phishing prevention",
+            "incident response",
+            "threat modelling",
+            "compliance automation",
+        ],
+        "title_templates": [
+            "Understanding {sub} in Cloud-Native Environments",
+            "Implementing {sub} Without Slowing Down Development",
+            "{sub} for Distributed Systems",
+            "How {sub} Reduces Your Attack Surface",
+            "A Practical Guide to {sub} in {year}",
+            "Auditing {sub} Configurations at Scale",
+            "Automating {sub} in CI/CD Pipelines",
+            "{sub}: Myths and Realities",
+            "Responding to {sub} Incidents Effectively",
+            "Open-Source Tools for {sub}",
+        ],
+    },
+    {
+        "category": "developer_experience",
+        "subtopics": [
+            "developer portals",
+            "internal developer platforms",
+            "CLI tooling",
+            "IDE extensions",
+            "local development environments",
+            "devcontainers",
+            "debugging techniques",
+            "profiling tools",
+            "documentation-as-code",
+            "API mocking",
+            "feature flags",
+            "trunk-based development",
+            "monorepo tooling",
+            "dependency management",
+            "build caching",
+            "test parallelisation",
+            "code generation",
+            "linting and formatting",
+            "onboarding automation",
+            "developer productivity metrics",
+        ],
+        "title_templates": [
+            "Improving {sub} Across Engineering Teams",
+            "{sub}: Building a Great Developer Experience",
+            "How {sub} Speeds Up Delivery",
+            "Measuring the Impact of {sub}",
+            "Setting Up {sub} from Scratch",
+            "{sub} Tooling in {year}: What Works",
+            "How We Adopted {sub} at Our Company",
+            "The Case for Investing in {sub}",
+            "{sub} for Remote Engineering Teams",
+            "Automating {sub} to Eliminate Toil",
+        ],
+    },
+]
+
+# Cross-topic hand-crafted seeds that span multiple categories (good for RRF)
+CROSS_TOPIC_SEEDS = [
+    {
+        "title": "Vector Search Inside PostgreSQL with pgvector",
+        "category": "databases",
+        "keywords": [
+            "vector databases",
+            "BM25 ranking",
+            "SQL query optimisation",
+            "embedding models",
+        ],
+    },
+    {
+        "title": "Running Elasticsearch on Kubernetes: Lessons Learned",
+        "category": "cloud_infrastructure",
+        "keywords": [
+            "Kubernetes orchestration",
+            "observability stacks",
+            "inverted indexes",
+        ],
+    },
+    {
+        "title": "ETL Pipelines for Machine Learning Feature Stores",
+        "category": "data_engineering",
+        "keywords": [
+            "Apache Parquet",
+            "data pipelines",
+            "transformer models",
+            "fine-tuning",
+        ],
+    },
+    {
+        "title": "Securing ML Model Serving Endpoints",
+        "category": "security",
+        "keywords": [
+            "zero-trust architecture",
+            "mTLS",
+            "neural networks",
+            "API design",
+        ],
+    },
+    {
+        "title": "Observability for Real-Time Data Pipelines",
+        "category": "data_engineering",
+        "keywords": [
+            "OpenTelemetry",
+            "Apache Kafka",
+            "distributed tracing",
+            "SLO management",
+        ],
+    },
+    {
+        "title": "Cost-Aware AutoML on Kubernetes",
+        "category": "machine_learning",
+        "keywords": [
+            "AutoML",
+            "Kubernetes orchestration",
+            "cost optimisation",
+            "hyperparameter tuning",
+        ],
+    },
+    {
+        "title": "Hybrid Search with Elasticsearch and pgvector",
+        "category": "search_engines",
+        "keywords": [
+            "hybrid search",
+            "reciprocal rank fusion",
+            "vector databases",
+            "inverted indexes",
+        ],
+    },
+    {
+        "title": "Developer Portals Powered by LLMs",
+        "category": "developer_experience",
+        "keywords": [
+            "developer portals",
+            "transformer models",
+            "API design",
+            "documentation-as-code",
+        ],
+    },
+    {
+        "title": "Zero-Trust Networking for Microservices",
+        "category": "security",
+        "keywords": [
+            "zero-trust architecture",
+            "service meshes",
+            "mTLS",
+            "microservices architecture",
+        ],
+    },
+    {
+        "title": "DuckDB as an Embedded Analytics Engine",
+        "category": "databases",
+        "keywords": [
+            "columnar databases",
+            "Apache Parquet",
+            "SQL query optimisation",
+            "data lakes",
+        ],
+    },
+]
+
+AUTHORS = [
+    "Alice Chen",
+    "Bob Martinez",
+    "Carol Okonkwo",
+    "David Kim",
+    "Eva Lindqvist",
+    "Frank Nguyen",
+    "Grace Patel",
+    "Hiro Tanaka",
+    "Ingrid Sørensen",
+    "James O'Brien",
+    "Kavya Reddy",
+    "Luca Ferrari",
+    "Maya Goldberg",
+    "Nadia Petrov",
+    "Oscar Bergström",
+    "Priya Sharma",
+    "Quinn Walker",
+    "Ravi Subramaniam",
+    "Sofia Andrade",
+    "Tom Brennan",
+]
+
+BASE_DATE = datetime(2024, 1, 1)
+YEAR_RANGE = 2  # docs span 2024–2025
+
+
+# ---------------------------------------------------------------------------
+# Content generation helpers
+# ---------------------------------------------------------------------------
+
+INTRO_TEMPLATES = [
+    "{title} has become one of the most discussed topics among practitioners in {year}. "
+    "This article examines core ideas around {kw1}, {kw2}, and {kw3}.",
+    "Few subjects generate as much debate as {title}. "
+    "In this post we take a structured look at {kw1} and its relationship to {kw2}.",
+    "The rise of {kw1} has put {title} firmly on the radar of engineering teams worldwide. "
+    "Here we break down the key concepts, starting with {kw2} and {kw3}.",
+    "If you have ever struggled to understand {title}, you are not alone. "
+    "We will demystify {kw1} and explain why {kw2} matters more than ever.",
+    "In {year}, {title} continues to evolve rapidly. "
+    "This comprehensive guide covers {kw1}, {kw2}, and practical guidance on {kw3}.",
+]
+
+BODY_SENTENCE_POOLS = [
+    "One of the most important considerations when working with {kw} is understanding its performance implications at scale.",
+    "Teams that have adopted {kw} report significant improvements in both reliability and developer velocity.",
+    "The relationship between {kw} and system latency is often underappreciated.",
+    "Recent benchmarks suggest that {kw} outperforms legacy approaches by up to an order of magnitude in read-heavy workloads.",
+    "Choosing the right abstraction for {kw} requires a clear understanding of your consistency and availability requirements.",
+    "A common misconception about {kw} is that it requires specialised hardware—in practice, commodity cloud instances suffice.",
+    "Observability tooling plays a critical role when debugging issues related to {kw} in production.",
+    "The open-source ecosystem around {kw} has matured considerably since its initial release.",
+    "Engineers often underestimate the operational overhead of {kw} when evaluating it for the first time.",
+    "Pair {kw} with a solid CI/CD pipeline to catch regressions early.",
+    "Security implications of {kw} are frequently discussed but rarely acted on systematically.",
+    "Integrating {kw} into an existing architecture typically requires incremental migration rather than a big-bang rewrite.",
+    "Documentation quality is a persistent challenge in the {kw} ecosystem.",
+    "Load testing is essential before relying on {kw} in a high-traffic environment.",
+    "Proper indexing strategy is inseparable from any discussion of {kw}.",
+]
+
+CLOSING_TEMPLATES = [
+    "In conclusion, mastering {kw} is increasingly a prerequisite for engineers working on {category} problems. "
+    "We encourage you to experiment with the techniques described here and share your findings with the community.",
+    "As the field of {category} advances, {kw} will only grow in importance. "
+    "The patterns outlined in this article provide a solid foundation for further exploration.",
+    "Understanding {kw} deeply—not just at the API surface—will set you apart as a practitioner in {category}. "
+    "We hope this guide serves as a useful reference.",
+    "The journey into {kw} is long but rewarding. "
+    "Start small, measure rigorously, and iterate—that is the path to production-grade {category} systems.",
+]
+
+
+def make_body(title: str, keywords: list[str], category: str, year: int) -> str:
+    kws = keywords[:]
+    random.shuffle(kws)
+    kw1, kw2, kw3 = (kws + kws)[:3]
+
+    intro = random.choice(INTRO_TEMPLATES).format(
+        title=title, year=year, kw1=kw1, kw2=kw2, kw3=kw3
+    )
+
+    paragraphs = [intro]
+    for _ in range(random.randint(3, 6)):
+        n_sentences = random.randint(3, 6)
+        sentences = []
+        for _ in range(n_sentences):
+            kw = random.choice(keywords)
+            tmpl = random.choice(BODY_SENTENCE_POOLS)
+            sentences.append(tmpl.format(kw=kw))
+        # pad with faker sentences for variety
+        sentences += fake.sentences(nb=random.randint(1, 3))
+        random.shuffle(sentences)
+        paragraphs.append(" ".join(sentences))
+
+    closing_kw = random.choice(keywords)
+    paragraphs.append(
+        random.choice(CLOSING_TEMPLATES).format(kw=closing_kw, category=category)
+    )
+    return "\n\n".join(paragraphs)
+
+
+def make_published_at() -> str:
+    days = random.randint(0, 365 * YEAR_RANGE)
+    hours = random.randint(0, 23)
+    minutes = random.randint(0, 59)
+    dt = BASE_DATE + timedelta(days=days, hours=hours, minutes=minutes)
+    return dt.isoformat()
+
+
+# ---------------------------------------------------------------------------
+# Build seed pool from topic × template × subtopic combinations
+# ---------------------------------------------------------------------------
+seed_pool: list[dict] = []
+year = 2025
+
+for topic in TOPICS:
+    for sub, tmpl in product(topic["subtopics"], topic["title_templates"]):
+        title = tmpl.format(sub=sub.title(), year=year)
+        seed_pool.append(
+            {
+                "title": title,
+                "category": topic["category"],
+                "keywords": topic["subtopics"],  # full list for body variety
+                "primary_kw": sub,
+            }
+        )
+
+random.shuffle(seed_pool)
+
+# ---------------------------------------------------------------------------
+# Generate records
+# ---------------------------------------------------------------------------
+records = []
+
+# 1. Cross-topic hand-crafted seeds first (always included)
+for i, seed in enumerate(CROSS_TOPIC_SEEDS, start=1):
+    kws = seed["keywords"]
+    records.append(
+        {
+            "id": i,
+            "title": seed["title"],
+            "content": make_body(seed["title"], kws, seed["category"], year),
+            "author": random.choice(AUTHORS),
+            "category": seed["category"],
+            "tags": ", ".join(random.sample(kws, min(3, len(kws)))),
+            "published_at": make_published_at(),
+            "views": random.randint(5_000, 80_000),
+            "likes": random.randint(200, 8_000),
+        }
+    )
+
+# 2. Generated records from seed pool
+doc_id = len(CROSS_TOPIC_SEEDS) + 1
+remaining = TARGET_ROWS - len(CROSS_TOPIC_SEEDS)
+
+# Cycle through seed pool (with variation) to hit the target count
+pool_cycle = (seed_pool * ((remaining // len(seed_pool)) + 2))[:remaining]
+
+for seed in pool_cycle:
+    kws = seed["keywords"]
+    title = seed["title"]
+
+    # small title variation: occasionally prepend a qualifier
+    qualifiers = ["", "", "", "How to Use ", "Why ", "When to Use ", ""]
+    q = random.choice(qualifiers)
+    if q and not title.startswith(("How", "Why", "When", "A ", "Building", "Running")):
+        title = q + title[0].lower() + title[1:]
+
+    records.append(
+        {
+            "id": doc_id,
+            "title": title,
+            "content": make_body(title, kws, seed["category"], year),
+            "author": random.choice(AUTHORS),
+            "category": seed["category"],
+            "tags": ", ".join(random.sample(kws, min(4, len(kws)))),
+            "published_at": make_published_at(),
+            "views": random.randint(50, 60_000),
+            "likes": random.randint(0, 6_000),
+        }
+    )
+    doc_id += 1
+
+# ---------------------------------------------------------------------------
+# Write Parquet
+# ---------------------------------------------------------------------------
+df = pd.DataFrame(records)
+
+fields = [
+    pa.field("id", pa.int32()),
+    pa.field("title", pa.string()),
+    pa.field("content", pa.string()),
+    pa.field("author", pa.string()),
+    pa.field("category", pa.string()),
+    pa.field("tags", pa.string()),
+    pa.field("published_at", pa.string()),
+    pa.field("views", pa.int32()),
+    pa.field("likes", pa.int32()),
+]
+
+if args.embeddings:
+    print(f"Computing embeddings with '{args.embedding_model}' …")
+    from sentence_transformers import SentenceTransformer
+
+    model = SentenceTransformer(args.embedding_model)
+    texts = df["content"].tolist()
+    vecs = model.encode(
+        texts,
+        batch_size=args.embedding_batch_size,
+        show_progress_bar=True,
+        convert_to_numpy=True,
+    )
+    dims = vecs.shape[1]
+    print(f"Computed {len(vecs)} embeddings, dims={dims}")
+    df["content_embedding"] = [v.tolist() for v in vecs]
+    fields.append(pa.field("content_embedding", pa.list_(pa.float32(), dims)))
+
+schema = pa.schema(fields)
+table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
+pq.write_table(table, OUT_PATH, compression="snappy")
+
+print(f"Wrote {len(records):,} records to {OUT_PATH}")
+cats = df["category"].value_counts()
+print(cats.to_string())
diff --git a/elasticsearch/connector/load_data.py b/elasticsearch/connector/load_data.py
new file mode 100644
index 0000000..e6059ec
--- /dev/null
+++ b/elasticsearch/connector/load_data.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+"""
+Load articles.parquet into Elasticsearch.
+
+Usage:
+  uv run load_data.py                        # load articles index only
+  uv run load_data.py --embeddings           # also compute + load vectors index
+  uv run load_data.py --embeddings --parquet 04-vector-search/articles.parquet
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+
+import pyarrow.parquet as pq
+import requests
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--es-host", default=os.environ.get("ES_HOST", "http://localhost:9200")
+)
+parser.add_argument("--es-user", default=os.environ.get("ES_USER", "elastic"))
+parser.add_argument("--es-pass", default=os.environ.get("ES_PASS", "spiceai"))
+parser.add_argument("--parquet", default="articles.parquet")
+parser.add_argument("--index", default="articles")
+parser.add_argument(
+    "--embeddings",
+    action="store_true",
+    help="Also compute and load vectors into a dense_vector index",
+)
+parser.add_argument("--vectors-index", default="articles_vectors")
+parser.add_argument(
+    "--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2"
+)
+parser.add_argument("--embedding-column", default="content")
+parser.add_argument("--batch-size", type=int, default=64)
+parser.add_argument(
+    "--all-types",
+    action="store_true",
+    help="Also create and load the all_types index covering every ES field type",
+)
+parser.add_argument("--all-types-index", default="all_types")
+parser.add_argument("--all-types-docs", type=int, default=10)
+args = parser.parse_args()
+
+ES_HOST = args.es_host
+ES_AUTH = (args.es_user, args.es_pass) if args.es_user else None
+INDEX = args.index
+PARQUET_PATH = args.parquet
+
+
+# ---------------------------------------------------------------------------
+# Wait for Elasticsearch
+# ---------------------------------------------------------------------------
+def wait_for_es(retries: int = 30, delay: int = 5) -> None:
+    for attempt in range(1, retries + 1):
+        try:
+            r = requests.get(f"{ES_HOST}/_cluster/health", auth=ES_AUTH, timeout=5)
+            status = r.json().get("status", "red")
+            if status in ("green", "yellow"):
+                print(f"Elasticsearch ready (status={status})")
+                return
+            print(
+                f"[{attempt}/{retries}] Cluster status: {status}, retrying in {delay}s …"
+            )
+        except Exception as exc:
+            print(f"[{attempt}/{retries}] Not ready ({exc}), retrying in {delay}s …")
+        time.sleep(delay)
+    print("ERROR: Elasticsearch did not become healthy in time.")
+    sys.exit(1)
+
+
+# ---------------------------------------------------------------------------
+# Articles index
+# ---------------------------------------------------------------------------
+ARTICLES_MAPPING = {
+    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
+    "mappings": {
+        "properties": {
+            "id": {"type": "integer"},
+            "title": {
+                "type": "text",
+                "analyzer": "english",
+                "fields": {"keyword": {"type": "keyword"}},
+            },
+            "content": {"type": "text", "analyzer": "english"},
+            "author": {"type": "keyword"},
+            "category": {"type": "keyword"},
+            "tags": {
+                "type": "text",
+                "analyzer": "english",
+                "fields": {"keyword": {"type": "keyword"}},
+            },
+            "published_at": {
+                "type": "date",
+                "format": "strict_date_time_no_millis||strict_date_optional_time",
+            },
+            "views": {"type": "integer"},
+            "likes": {"type": "integer"},
+        }
+    },
+}
+
+
+def recreate_index(name: str, mapping: dict) -> None:
+    url = f"{ES_HOST}/{name}"
+    if requests.head(url, auth=ES_AUTH).status_code == 200:
+        print(f"Index '{name}' already exists — deleting for a clean load.")
+        requests.delete(url, auth=ES_AUTH).raise_for_status()
+    requests.put(url, json=mapping, auth=ES_AUTH).raise_for_status()
+    print(f"Index '{name}' created.")
+
+
+def bulk_request(lines: list[str]) -> None:
+    body = "\n".join(lines) + "\n"
+    r = requests.post(
+        f"{ES_HOST}/_bulk",
+        data=body.encode(),
+        headers={"Content-Type": "application/x-ndjson"},
+        auth=ES_AUTH,
+        timeout=120,
+    )
+    r.raise_for_status()
+    errors = [
+        item for item in r.json().get("items", []) if "error" in item.get("index", {})
+    ]
+    if errors:
+        print(f"  WARNING: {len(errors)} bulk errors — first: {errors[0]}")
+
+
+def load_articles(df: dict) -> None:
+    n = len(df["id"])
+    lines = []
+    for i in range(n):
+        doc = {col: df[col][i] for col in df}
+        lines.append(json.dumps({"index": {"_index": INDEX, "_id": doc["id"]}}))
+        lines.append(json.dumps(doc))
+    bulk_request(lines)
+    requests.post(f"{ES_HOST}/{INDEX}/_refresh", auth=ES_AUTH)
+    count = (
+        requests.get(f"{ES_HOST}/{INDEX}/_count", auth=ES_AUTH).json().get("count", "?")
+    )
+    print(f"Loaded {n} documents → '{INDEX}' (count={count})")
+
+
+# ---------------------------------------------------------------------------
+# Vectors index
+# ---------------------------------------------------------------------------
+def vectors_mapping(dims: int) -> dict:
+    return {
+        "settings": {"number_of_shards": 1, "number_of_replicas": 0},
+        "mappings": {
+            "properties": {
+                "id": {"type": "integer"},
+                "content": {"type": "text", "index": False},
+                "content_embedding": {
+                    "type": "dense_vector",
+                    "dims": dims,
+                    "index": True,
+                    "similarity": "cosine",
+                },
+            }
+        },
+    }
+
+
+def load_vectors(df: dict) -> None:
+    print(f"Loading embedding model '{args.embedding_model}' …")
+    from sentence_transformers import SentenceTransformer
+
+    model = SentenceTransformer(args.embedding_model)
+
+    texts = df[args.embedding_column]
+    ids = df["id"]
+    n = len(ids)
+    dims = model.get_embedding_dimension()
+    print(
+        f"Model dims={dims}, encoding {n} documents in batches of {args.batch_size} …"
+    )
+
+    recreate_index(args.vectors_index, vectors_mapping(dims))
+
+    for batch_start in range(0, n, args.batch_size):
+        batch_texts = texts[batch_start : batch_start + args.batch_size]
+        batch_ids = ids[batch_start : batch_start + args.batch_size]
+        embeddings = model.encode(
+            batch_texts, show_progress_bar=False, convert_to_numpy=True
+        )
+
+        lines = []
+        for i, (doc_id, text, vec) in enumerate(
+            zip(batch_ids, batch_texts, embeddings)
+        ):
+            lines.append(
+                json.dumps({"index": {"_index": args.vectors_index, "_id": doc_id}})
+            )
+            lines.append(
+                json.dumps(
+                    {
+                        "id": doc_id,
+                        "content": text,
+                        "content_embedding": vec.tolist(),
+                    }
+                )
+            )
+        bulk_request(lines)
+        print(f"  Indexed {min(batch_start + args.batch_size, n)}/{n} vectors …")
+
+    requests.post(f"{ES_HOST}/{args.vectors_index}/_refresh", auth=ES_AUTH)
+    count = (
+        requests.get(f"{ES_HOST}/{args.vectors_index}/_count", auth=ES_AUTH)
+        .json()
+        .get("count", "?")
+    )
+    print(f"Loaded {n} vectors → '{args.vectors_index}' (count={count})")
+
+
+# ---------------------------------------------------------------------------
+# all_types index — one document per supported ES field type
+# ---------------------------------------------------------------------------
+ALL_TYPES_MAPPING = {
+    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
+    "mappings": {
+        "properties": {
+            # Core scalar types
+            "id": {"type": "integer"},
+            "field_text": {"type": "text", "analyzer": "standard"},
+            "field_keyword": {"type": "keyword"},
+            "field_long": {"type": "long"},
+            "field_integer": {"type": "integer"},
+            "field_short": {"type": "short"},
+            "field_byte": {"type": "byte"},
+            "field_double": {"type": "double"},
+            "field_float": {"type": "float"},
+            "field_half_float": {"type": "half_float"},
+            "field_scaled_float": {"type": "scaled_float", "scaling_factor": 100},
+            "field_unsigned_long": {"type": "unsigned_long"},
+            # Boolean
+            "field_boolean": {"type": "boolean"},
+            # Date / time
+            "field_date": {"type": "date", "format": "strict_date_optional_time"},
+            "field_date_nanos": {"type": "date_nanos"},
+            # Binary
+            "field_binary": {"type": "binary"},
+            # Range types
+            "field_integer_range": {"type": "integer_range"},
+            "field_long_range": {"type": "long_range"},
+            "field_float_range": {"type": "float_range"},
+            "field_double_range": {"type": "double_range"},
+            "field_date_range": {
+                "type": "date_range",
+                "format": "strict_date_optional_time",
+            },
+            # Structured / complex types
+            "field_object": {
+                "type": "object",
+                "properties": {
+                    "name": {"type": "keyword"},
+                    "value": {"type": "integer"},
+                },
+            },
+            "field_nested": {
+                "type": "nested",
+                "properties": {
+                    "tag": {"type": "keyword"},
+                    "score": {"type": "float"},
+                },
+            },
+            # Geo types
+            "field_geo_point": {"type": "geo_point"},
+            "field_geo_shape": {"type": "geo_shape"},
+            # Specialised text types
+            "field_ip": {"type": "ip"},
+            "field_version": {"type": "version"},
+            # Completion / search-as-you-type
+            "field_completion": {"type": "completion"},
+            "field_search_as_you_type": {"type": "search_as_you_type"},
+            # Token count
+            "field_token_count": {
+                "type": "token_count",
+                "analyzer": "standard",
+            },
+            # Dense vector (small dim for speed)
+            "field_dense_vector": {
+                "type": "dense_vector",
+                "dims": 4,
+                "index": True,
+                "similarity": "cosine",
+            },
+            # Flattened
+            "field_flattened": {"type": "flattened"},
+        }
+    },
+}
+
+import base64
+import math
+import random
+
+
+def _make_all_types_doc(i: int) -> dict:
+    """Return a single document exercising every mapped field type."""
+    rng = random.Random(i)
+    ts = f"2024-0{(i % 9) + 1}-{(i % 28) + 1:02d}T{i % 24:02d}:00:00Z"
+    return {
+        "id": i,
+        "field_text": f"The quick brown fox jumps over the lazy dog — document {i}",
+        "field_keyword": f"category_{i % 5}",
+        "field_long": rng.randint(-(2**40), 2**40),
+        "field_integer": rng.randint(-100_000, 100_000),
+        "field_short": rng.randint(-32768, 32767),
+        "field_byte": rng.randint(-128, 127),
+        "field_double": rng.uniform(-1e6, 1e6),
+        "field_float": float(round(rng.uniform(-1000, 1000), 4)),
+        "field_half_float": float(round(rng.uniform(-100, 100), 2)),
+        "field_scaled_float": float(round(rng.uniform(0, 999.99), 2)),
+        "field_unsigned_long": rng.randint(0, 2**60),
+        "field_boolean": bool(i % 2),
+        "field_date": ts,
+        "field_date_nanos": ts.replace("Z", ".000000000Z"),
+        "field_binary": base64.b64encode(f"binary_payload_{i}".encode()).decode(),
+        "field_integer_range": {"gte": i * 10, "lte": i * 10 + 5},
+        "field_long_range": {"gte": i * 1000, "lte": i * 1000 + 100},
+        "field_float_range": {"gte": float(i), "lte": float(i) + 0.5},
+        "field_double_range": {"gte": float(i) * 1.1, "lte": float(i) * 1.1 + 0.01},
+        "field_date_range": {
+            "gte": f"2024-01-{(i % 28) + 1:02d}T00:00:00Z",
+            "lte": f"2024-12-{(i % 28) + 1:02d}T23:59:59Z",
+        },
+        "field_object": {"name": f"obj_{i}", "value": i * 7},
+        "field_nested": [
+            {"tag": f"tag_{j}", "score": float(round(rng.uniform(0, 1), 3))}
+            for j in range(1, 3)
+        ],
+        "field_geo_point": {
+            "lat": round(rng.uniform(-90, 90), 6),
+            "lon": round(rng.uniform(-180, 180), 6),
+        },
+        "field_geo_shape": {
+            "type": "Point",
+            "coordinates": [
+                round(rng.uniform(-180, 180), 6),
+                round(rng.uniform(-90, 90), 6),
+            ],
+        },
+        "field_ip": f"192.168.{i % 256}.{(i * 7) % 256}",
+        "field_version": f"1.{i}.0",
+        "field_completion": {"input": [f"suggest_{i}", f"doc_{i}"], "weight": i + 1},
+        "field_search_as_you_type": f"searchable text for document number {i}",
+        # token_count is computed by ES; send the source text
+        "field_token_count": f"token count source text document {i}",
+        "field_dense_vector": [round(math.sin(i + j), 6) for j in range(4)],
+        "field_flattened": {"arbitrary_key": f"value_{i}", "nested_key": {"deep": i}},
+    }
+
+
+def load_all_types(n: int = 10) -> None:
+    recreate_index(args.all_types_index, ALL_TYPES_MAPPING)
+    lines = []
+    for i in range(1, n + 1):
+        doc = _make_all_types_doc(i)
+        lines.append(json.dumps({"index": {"_index": args.all_types_index, "_id": i}}))
+        lines.append(json.dumps(doc))
+    bulk_request(lines)
+    requests.post(f"{ES_HOST}/{args.all_types_index}/_refresh", auth=ES_AUTH)
+    count = (
+        requests.get(f"{ES_HOST}/{args.all_types_index}/_count", auth=ES_AUTH)
+        .json()
+        .get("count", "?")
+    )
+    print(f"Loaded {n} documents → '{args.all_types_index}' (count={count})")
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    wait_for_es()
+
+    print(f"Reading {PARQUET_PATH} …")
+    df = pq.read_table(PARQUET_PATH).to_pydict()
+
+    recreate_index(INDEX, ARTICLES_MAPPING)
+    load_articles(df)
+
+    if args.embeddings:
+        load_vectors(df)
+
+    if args.all_types:
+        load_all_types(args.all_types_docs)
+
+    print("Done.")
diff --git a/elasticsearch/connector/spicepod.yaml b/elasticsearch/connector/spicepod.yaml
new file mode 100644
index 0000000..eba299c
--- /dev/null
+++ b/elasticsearch/connector/spicepod.yaml
@@ -0,0 +1,32 @@
+version: v2
+kind: Spicepod
+name: 01-connector
+
+datasets:
+  - from: elasticsearch:articles
+    name: articles
+    params:
+      elasticsearch_endpoint: http://localhost:9200
+      elasticsearch_user: elastic
+      elasticsearch_pass: spiceai
+
+  # all_types — federated (no acceleration, every query hits ES directly)
+  - from: elasticsearch:all_types
+    name: all_types
+    params:
+      elasticsearch_endpoint: http://localhost:9200
+      elasticsearch_user: elastic
+      elasticsearch_pass: spiceai
+
+  # all_types_accelerated — same index, materialized locally in DuckDB
+  - from: elasticsearch:all_types
+    name: all_types_accelerated
+    params:
+      elasticsearch_endpoint: http://localhost:9200
+      elasticsearch_user: elastic
+      elasticsearch_pass: spiceai
+    acceleration:
+      enabled: true
+      engine: duckdb
+      refresh_mode: full
+      refresh_check_interval: 30s
diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md
new file mode 100644
index 0000000..a106f3d
--- /dev/null
+++ b/vectors/elasticsearch/README.md
@@ -0,0 +1,103 @@
+# Elasticsearch Vector Engine
+
+Works with `v2.0+`
+
+This recipe demonstrates how to use Elasticsearch as a vector engine in Spice.ai. It shows how to:
+
+- Run Elasticsearch locally with Docker Compose
+- Generate a sample articles dataset
+- Start Spice and automatically write embeddings into Elasticsearch
+- Run vector and hybrid search queries against the indexed embeddings
+
+## Prerequisites
+
+- [Spice CLI](https://docs.spiceai.org/getting-started) installed
+- Docker and Docker Compose installed
+- Python 3 installed
+
+## Getting Started
+
+### Step 1: Start Elasticsearch and generate the sample data
+
+From this recipe directory, start Elasticsearch:
+
+```bash
+docker compose up
+```
+
+In a new terminal, install the Python dependencies and generate the sample dataset:
+
+```bash
+pip install pandas pyarrow faker
+python generate_data.py --rows 500 --out articles.parquet
+```
+
+This creates an `articles.parquet` file used by the recipe.
+
+### Step 2: Start the Spice runtime
+
+Start Spice from this directory:
+
+```bash
+spice run
+```
+
+On startup, Spice automatically:
+
+1. Creates the `articles_search_engine` Elasticsearch index with a `dense_vector` mapping
+2. Loads the article records
+3. Computes embeddings locally using `all-MiniLM-L6-v2`
+4. Bulk indexes the vectors into Elasticsearch
+
+When startup completes, the `articles` dataset is ready for vector and hybrid search queries.
+
+### Step 3: Open the Spice SQL REPL
+
+In a new terminal, start the SQL REPL:
+
+```bash
+spice sql
+```
+
+## Run Queries
+
+Run semantic similarity search over the indexed embeddings:
+
+```sql
+SELECT id, title, _score
+FROM vector_search(articles, 'semantic similarity retrieval', 10)
+ORDER BY _score DESC;
+```
+
+Run vector search with a post-filter:
+
+```sql
+SELECT id, title, category, _score
+FROM vector_search(articles, 'cost optimisation cloud infrastructure', 10)
+WHERE category = 'cloud_infrastructure'
+ORDER BY _score DESC;
+```
+
+Fuse vector and keyword results with [Reciprocal Rank Fusion (RRF)](https://spiceai.org/docs/next/features/search#hybrid-search-with-rrf):
+
+```sql
+SELECT id, title, category, fused_score
+FROM rrf(
+    vector_search(articles, 'machine learning algorithms'),
+    text_search(articles, 'machine learning algorithms', content),
+    join_key => 'id'
+)
+ORDER BY fused_score DESC
+LIMIT 10;
+```
+
+## Notes
+
+- The Elasticsearch index is created automatically. No manual mapping step is required.
+- Embeddings are generated during the initial dataset refresh.
+
+## Learn more
+
+- [Elasticsearch Vector Engine Documentation](https://spiceai.org/docs/components/vectors/elasticsearch)
+- [Vector Search Documentation](https://spiceai.org/docs/features/search/vector-search)
+- [Datasets Reference](https://spiceai.org/docs/reference/spicepod/datasets)
diff --git a/vectors/elasticsearch/docker-compose.yml b/vectors/elasticsearch/docker-compose.yml
new file mode 100644
index 0000000..a51606f
--- /dev/null
+++ b/vectors/elasticsearch/docker-compose.yml
@@ -0,0 +1,28 @@
+services:
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4
+    container_name: es01
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=true
+      - xpack.security.http.ssl.enabled=false
+      - ES_JAVA_OPTS=-Xms512m -Xmx512m
+      - ELASTIC_PASSWORD=spiceai
+    ports:
+      - "9200:9200"
+    volumes:
+      - esdata:/usr/share/elasticsearch/data
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "curl -sf -u elastic:spiceai http://localhost:9200/_cluster/health | grep -qE '\"status\":\"(green|yellow)\"'",
+        ]
+      interval: 10s
+      timeout: 10s
+      retries: 15
+      start_period: 40s
+
+volumes:
+  esdata:
+    driver: local
diff --git a/vectors/elasticsearch/generate_data.py b/vectors/elasticsearch/generate_data.py
new file mode 100644
index 0000000..ba4edcb
--- /dev/null
+++ b/vectors/elasticsearch/generate_data.py
@@ -0,0 +1,574 @@
+#!/usr/bin/env python3
+"""
+Generate articles.parquet — sample data for the Spice search engine example.
+
+Usage:
+  pip install pandas pyarrow faker
+  python generate_data.py [--rows N] [--out PATH]
+"""
+
+import argparse
+import random
+from datetime import datetime, timedelta
+from itertools import product
+
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+from faker import Faker
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+parser = argparse.ArgumentParser()
+parser.add_argument("--rows", type=int, default=100)
+parser.add_argument("--out", default="articles.parquet")
+args = parser.parse_args()
+
+TARGET_ROWS = args.rows
+OUT_PATH = args.out
+
+fake = Faker()
+random.seed(42)
+Faker.seed(42)
+
+# ---------------------------------------------------------------------------
+# Taxonomy
+# ---------------------------------------------------------------------------
+TOPICS = [
+    {
+        "category": "machine_learning",
+        "subtopics": [
+            "neural networks",
+            "deep learning",
+            "gradient descent",
+            "backpropagation",
+            "transformer models",
+            "attention mechanisms",
+            "fine-tuning",
+            "transfer learning",
+            "reinforcement learning",
+            "generative adversarial networks",
+            "diffusion models",
+            "contrastive learning",
+            "self-supervised learning",
+            "few-shot learning",
+            "model pruning",
+            "knowledge distillation",
+            "hyperparameter tuning",
+            "AutoML",
+            "federated learning",
+            "meta-learning",
+        ],
+        "title_templates": [
+            "A Practical Guide to {sub}",
+            "{sub} Explained: From Theory to Production",
+            "How {sub} Is Transforming AI Applications",
+            "Deep Dive: {sub} in Modern ML Pipelines",
+            "Benchmarking {sub} Across Popular Frameworks",
+            "Common Pitfalls in {sub} and How to Avoid Them",
+            "Scaling {sub} to Billions of Parameters",
+            "{sub}: State of the Art in {year}",
+            "Understanding {sub} Through Mathematical Intuition",
+            "Implementing {sub} Without a PhD",
+        ],
+    },
+    {
+        "category": "search_engines",
+        "subtopics": [
+            "BM25 ranking",
+            "inverted indexes",
+            "vector similarity search",
+            "semantic search",
+            "full-text search",
+            "dense retrieval",
+            "sparse retrieval",
+            "hybrid search",
+            "reciprocal rank fusion",
+            "embedding models",
+            "approximate nearest neighbor",
+            "HNSW graphs",
+            "product quantization",
+            "query expansion",
+            "relevance feedback",
+            "learning to rank",
+            "query rewriting",
+            "cross-encoder reranking",
+            "bi-encoder retrieval",
+            "knowledge graph search",
+        ],
+        "title_templates": [
+            "How {sub} Powers Modern Search",
+            "{sub} Under the Hood: Architecture and Trade-offs",
+            "Building Production-Ready {sub} Systems",
+            "Evaluating {sub}: Metrics That Matter",
+            "{sub} at Scale: Lessons from the Field",
+            "Comparing {sub} Approaches in {year}",
+            "A Developer's Guide to {sub}",
+            "When to Use {sub} vs Traditional Keyword Search",
+            "Optimising {sub} for Low-Latency Workloads",
+            "Open-Source Tools for {sub}",
+        ],
+    },
+    {
+        "category": "data_engineering",
+        "subtopics": [
+            "Apache Parquet",
+            "columnar storage",
+            "data pipelines",
+            "ETL workflows",
+            "stream processing",
+            "Apache Kafka",
+            "data lakes",
+            "schema evolution",
+            "Apache Iceberg",
+            "Delta Lake",
+            "data mesh",
+            "data contracts",
+            "change data capture",
+            "Apache Spark",
+            "dbt transformations",
+            "data quality checks",
+            "lineage tracking",
+            "metadata management",
+            "batch ingestion",
+            "real-time ingestion",
+        ],
+        "title_templates": [
+            "{sub}: The Definitive Overview",
+            "Why {sub} Is Essential for Modern Data Teams",
+            "Getting Started with {sub} in {year}",
+            "{sub} Best Practices for Large-Scale Workloads",
+            "Migrating to {sub}: A Step-by-Step Guide",
+            "Debugging {sub} in Production",
+            "{sub} vs Traditional Approaches: A Comparison",
+            "How {sub} Reduces Engineering Toil",
+            "Monitoring {sub} Pipelines at Scale",
+            "Cost Optimisation Strategies for {sub}",
+        ],
+    },
+    {
+        "category": "databases",
+        "subtopics": [
+            "SQL query optimisation",
+            "index design",
+            "ACID transactions",
+            "NoSQL modelling",
+            "vector databases",
+            "distributed databases",
+            "replication strategies",
+            "sharding",
+            "MVCC",
+            "write-ahead logging",
+            "B-tree indexes",
+            "LSM-tree storage",
+            "columnar databases",
+            "time-series databases",
+            "graph databases",
+            "in-memory databases",
+            "NewSQL",
+            "CockroachDB",
+            "FoundationDB",
+            "database connection pooling",
+        ],
+        "title_templates": [
+            "{sub} for High-Throughput Applications",
+            "A Deep Dive into {sub}",
+            "{sub}: Concepts Every Engineer Should Know",
+            "Production Lessons from Running {sub} at Scale",
+            "Choosing Between {sub} and Alternatives",
+            "How {sub} Impacts Query Performance",
+            "{sub} Internals Explained",
+            "Migrating from Legacy Systems to {sub}",
+            "Benchmarking {sub} in Cloud Environments",
+            "Debugging Slow Queries with {sub}",
+        ],
+    },
+    {
+        "category": "cloud_infrastructure",
+        "subtopics": [
+            "Kubernetes orchestration",
+            "container deployments",
+            "service meshes",
+            "auto-scaling",
+            "observability stacks",
+            "OpenTelemetry",
+            "cost optimisation",
+            "serverless functions",
+            "GitOps workflows",
+            "infrastructure as code",
+            "zero-trust networking",
+            "secrets management",
+            "multi-cloud strategies",
+            "disaster recovery",
+            "chaos engineering",
+            "FinOps",
+            "edge computing",
+            "platform engineering",
+            "developer portals",
+            "SLO management",
+        ],
+        "title_templates": [
+            "{sub} in Practice: Real-World Patterns",
+            "How {sub} Enables Reliable Systems",
+            "{sub} for Platform Engineers",
+            "Scaling {sub} in Enterprise Environments",
+            "Automating {sub} with Modern Tooling",
+            "{sub}: A Comprehensive Tutorial for {year}",
+            "Common Mistakes with {sub} and How to Fix Them",
+            "Security Considerations for {sub}",
+            "How We Cut Costs by Optimising {sub}",
+            "Building Self-Service Infrastructure with {sub}",
+        ],
+    },
+    {
+        "category": "software_engineering",
+        "subtopics": [
+            "clean code principles",
+            "design patterns",
+            "microservices architecture",
+            "domain-driven design",
+            "event sourcing",
+            "CQRS",
+            "API design",
+            "test-driven development",
+            "continuous integration",
+            "code review practices",
+            "technical debt",
+            "refactoring strategies",
+            "distributed tracing",
+            "circuit breakers",
+            "rate limiting",
+            "idempotency",
+            "concurrency models",
+            "async programming",
+            "type systems",
+            "compiler design",
+        ],
+        "title_templates": [
+            "Applying {sub} in Real Projects",
+            "{sub}: From Basics to Advanced Techniques",
+            "How {sub} Improves System Reliability",
+            "A Pragmatic Guide to {sub}",
+            "Interview Questions on {sub} and How to Answer Them",
+            "{sub} Patterns Every Senior Engineer Should Know",
+            "Refactoring Legacy Code with {sub}",
+            "When {sub} Makes Sense (and When It Doesn't)",
+            "{sub} in Distributed Systems",
+            "Teaching {sub} to Junior Developers",
+        ],
+    },
+    {
+        "category": "security",
+        "subtopics": [
+            "zero-trust architecture",
+            "OAuth 2.0",
+            "JWT authentication",
+            "supply chain security",
+            "SBOM",
+            "vulnerability scanning",
+            "penetration testing",
+            "secrets rotation",
+            "mTLS",
+            "RBAC",
+            "OWASP top 10",
+            "runtime security",
+            "SAST and DAST",
+            "CVE management",
+            "encryption at rest",
+            "key management",
+            "phishing prevention",
+            "incident response",
+            "threat modelling",
+            "compliance automation",
+        ],
+        "title_templates": [
+            "Understanding {sub} in Cloud-Native Environments",
+            "Implementing {sub} Without Slowing Down Development",
+            "{sub} for Distributed Systems",
+            "How {sub} Reduces Your Attack Surface",
+            "A Practical Guide to {sub} in {year}",
+            "Auditing {sub} Configurations at Scale",
+            "Automating {sub} in CI/CD Pipelines",
+            "{sub}: Myths and Realities",
+            "Responding to {sub} Incidents Effectively",
+            "Open-Source Tools for {sub}",
+        ],
+    },
+    {
+        "category": "developer_experience",
+        "subtopics": [
+            "developer portals",
+            "internal developer platforms",
+            "CLI tooling",
+            "IDE extensions",
+            "local development environments",
+            "devcontainers",
+            "debugging techniques",
+            "profiling tools",
+            "documentation-as-code",
+            "API mocking",
+            "feature flags",
+            "trunk-based development",
+            "monorepo tooling",
+            "dependency management",
+            "build caching",
+            "test parallelisation",
+            "code generation",
+            "linting and formatting",
+            "onboarding automation",
+            "developer productivity metrics",
+        ],
+        "title_templates": [
+            "Improving {sub} Across Engineering Teams",
+            "{sub}: Building a Great Developer Experience",
+            "How {sub} Speeds Up Delivery",
+            "Measuring the Impact of {sub}",
+            "Setting Up {sub} from Scratch",
+            "{sub} Tooling in {year}: What Works",
+            "How We Adopted {sub} at Our Company",
+            "The Case for Investing in {sub}",
+            "{sub} for Remote Engineering Teams",
+            "Automating {sub} to Eliminate Toil",
+        ],
+    },
+]
+
+CROSS_TOPIC_SEEDS = [
+    {
+        "title": "Vector Search Inside PostgreSQL with pgvector",
+        "category": "databases",
+        "keywords": ["vector databases", "BM25 ranking", "SQL query optimisation", "embedding models"],
+    },
+    {
+        "title": "Running Elasticsearch on Kubernetes: Lessons Learned",
+        "category": "cloud_infrastructure",
+        "keywords": ["Kubernetes orchestration", "observability stacks", "inverted indexes"],
+    },
+    {
+        "title": "ETL Pipelines for Machine Learning Feature Stores",
+        "category": "data_engineering",
+        "keywords": ["Apache Parquet", "data pipelines", "transformer models", "fine-tuning"],
+    },
+    {
+        "title": "Securing ML Model Serving Endpoints",
+        "category": "security",
+        "keywords": ["zero-trust architecture", "mTLS", "neural networks", "API design"],
+    },
+    {
+        "title": "Observability for Real-Time Data Pipelines",
+        "category": "data_engineering",
+        "keywords": ["OpenTelemetry", "Apache Kafka", "distributed tracing", "SLO management"],
+    },
+    {
+        "title": "Cost-Aware AutoML on Kubernetes",
+        "category": "machine_learning",
+        "keywords": ["AutoML", "Kubernetes orchestration", "cost optimisation", "hyperparameter tuning"],
+    },
+    {
+        "title": "Hybrid Search with Elasticsearch and pgvector",
+        "category": "search_engines",
+        "keywords": ["hybrid search", "reciprocal rank fusion", "vector databases", "inverted indexes"],
+    },
+    {
+        "title": "Developer Portals Powered by LLMs",
+        "category": "developer_experience",
+        "keywords": ["developer portals", "transformer models", "API design", "documentation-as-code"],
+    },
+    {
+        "title": "Zero-Trust Networking for Microservices",
+        "category": "security",
+        "keywords": ["zero-trust architecture", "service meshes", "mTLS", "microservices architecture"],
+    },
+    {
+        "title": "DuckDB as an Embedded Analytics Engine",
+        "category": "databases",
+        "keywords": ["columnar databases", "Apache Parquet", "SQL query optimisation", "data lakes"],
+    },
+]
+
+AUTHORS = [
+    "Alice Chen", "Bob Martinez", "Carol Okonkwo", "David Kim", "Eva Lindqvist",
+    "Frank Nguyen", "Grace Patel", "Hiro Tanaka", "Ingrid Sørensen", "James O'Brien",
+    "Kavya Reddy", "Luca Ferrari", "Maya Goldberg", "Nadia Petrov", "Oscar Bergström",
+    "Priya Sharma", "Quinn Walker", "Ravi Subramaniam", "Sofia Andrade", "Tom Brennan",
+]
+
+BASE_DATE = datetime(2024, 1, 1)
+YEAR_RANGE = 2
+
+# ---------------------------------------------------------------------------
+# Content generation
+# ---------------------------------------------------------------------------
+INTRO_TEMPLATES = [
+    "{title} has become one of the most discussed topics among practitioners in {year}. "
+    "This article examines core ideas around {kw1}, {kw2}, and {kw3}.",
+    "Few subjects generate as much debate as {title}. "
+    "In this post we take a structured look at {kw1} and its relationship to {kw2}.",
+    "The rise of {kw1} has put {title} firmly on the radar of engineering teams worldwide. "
+    "Here we break down the key concepts, starting with {kw2} and {kw3}.",
+    "If you have ever struggled to understand {title}, you are not alone. "
+    "We will demystify {kw1} and explain why {kw2} matters more than ever.",
+    "In {year}, {title} continues to evolve rapidly. "
+    "This comprehensive guide covers {kw1}, {kw2}, and practical guidance on {kw3}.",
+]
+
+BODY_SENTENCE_POOLS = [
+    "One of the most important considerations when working with {kw} is understanding its performance implications at scale.",
+    "Teams that have adopted {kw} report significant improvements in both reliability and developer velocity.",
+    "The relationship between {kw} and system latency is often underappreciated.",
+    "Recent benchmarks suggest that {kw} outperforms legacy approaches by up to an order of magnitude in read-heavy workloads.",
+    "Choosing the right abstraction for {kw} requires a clear understanding of your consistency and availability requirements.",
+    "A common misconception about {kw} is that it requires specialised hardware—in practice, commodity cloud instances suffice.",
+    "Observability tooling plays a critical role when debugging issues related to {kw} in production.",
+    "The open-source ecosystem around {kw} has matured considerably since its initial release.",
+    "Engineers often underestimate the operational overhead of {kw} when evaluating it for the first time.",
+    "Pair {kw} with a solid CI/CD pipeline to catch regressions early.",
+    "Security implications of {kw} are frequently discussed but rarely acted on systematically.",
+    "Integrating {kw} into an existing architecture typically requires incremental migration rather than a big-bang rewrite.",
+    "Documentation quality is a persistent challenge in the {kw} ecosystem.",
+    "Load testing is essential before relying on {kw} in a high-traffic environment.",
+    "Proper indexing strategy is inseparable from any discussion of {kw}.",
+]
+
+CLOSING_TEMPLATES = [
+    "In conclusion, mastering {kw} is increasingly a prerequisite for engineers working on {category} problems. "
+    "We encourage you to experiment with the techniques described here and share your findings with the community.",
+    "As the field of {category} advances, {kw} will only grow in importance. "
+    "The patterns outlined in this article provide a solid foundation for further exploration.",
+    "Understanding {kw} deeply—not just at the API surface—will set you apart as a practitioner in {category}. "
+    "We hope this guide serves as a useful reference.",
+    "The journey into {kw} is long but rewarding. "
+    "Start small, measure rigorously, and iterate—that is the path to production-grade {category} systems.",
+]
+
+
+def make_body(title: str, keywords: list[str], category: str, year: int) -> str:
+    kws = keywords[:]
+    random.shuffle(kws)
+    kw1, kw2, kw3 = (kws + kws)[:3]
+
+    intro = random.choice(INTRO_TEMPLATES).format(
+        title=title, year=year, kw1=kw1, kw2=kw2, kw3=kw3
+    )
+
+    paragraphs = [intro]
+    for _ in range(random.randint(3, 6)):
+        n_sentences = random.randint(3, 6)
+        sentences = []
+        for _ in range(n_sentences):
+            kw = random.choice(keywords)
+            tmpl = random.choice(BODY_SENTENCE_POOLS)
+            sentences.append(tmpl.format(kw=kw))
+        sentences += fake.sentences(nb=random.randint(1, 3))
+        random.shuffle(sentences)
+        paragraphs.append(" ".join(sentences))
+
+    closing_kw = random.choice(keywords)
+    paragraphs.append(
+        random.choice(CLOSING_TEMPLATES).format(kw=closing_kw, category=category)
+    )
+    return "\n\n".join(paragraphs)
+
+
+def make_published_at() -> str:
+    days = random.randint(0, 365 * YEAR_RANGE)
+    hours = random.randint(0, 23)
+    minutes = random.randint(0, 59)
+    dt = BASE_DATE + timedelta(days=days, hours=hours, minutes=minutes)
+    return dt.isoformat()
+
+
+# ---------------------------------------------------------------------------
+# Build seed pool
+# ---------------------------------------------------------------------------
+seed_pool: list[dict] = []
+year = 2025
+
+for topic in TOPICS:
+    for sub, tmpl in product(topic["subtopics"], topic["title_templates"]):
+        title = tmpl.format(sub=sub.title(), year=year)
+        seed_pool.append(
+            {
+                "title": title,
+                "category": topic["category"],
+                "keywords": topic["subtopics"],
+                "primary_kw": sub,
+            }
+        )
+
+random.shuffle(seed_pool)
+
+# ---------------------------------------------------------------------------
+# Generate records
+# ---------------------------------------------------------------------------
+records = []
+
+for i, seed in enumerate(CROSS_TOPIC_SEEDS, start=1):
+    kws = seed["keywords"]
+    records.append(
+        {
+            "id": i,
+            "title": seed["title"],
+            "content": make_body(seed["title"], kws, seed["category"], year),
+            "author": random.choice(AUTHORS),
+            "category": seed["category"],
+            "tags": ", ".join(random.sample(kws, min(3, len(kws)))),
+            "published_at": make_published_at(),
+            "views": random.randint(5_000, 80_000),
+            "likes": random.randint(200, 8_000),
+        }
+    )
+
+doc_id = len(CROSS_TOPIC_SEEDS) + 1
+remaining = TARGET_ROWS - len(CROSS_TOPIC_SEEDS)
+pool_cycle = (seed_pool * ((remaining // len(seed_pool)) + 2))[:remaining]
+
+for seed in pool_cycle:
+    kws = seed["keywords"]
+    title = seed["title"]
+
+    qualifiers = ["", "", "", "How to Use ", "Why ", "When to Use ", ""]
+    q = random.choice(qualifiers)
+    if q and not title.startswith(("How", "Why", "When", "A ", "Building", "Running")):
+        title = q + title[0].lower() + title[1:]
+
+    records.append(
+        {
+            "id": doc_id,
+            "title": title,
+            "content": make_body(title, kws, seed["category"], year),
+            "author": random.choice(AUTHORS),
+            "category": seed["category"],
+            "tags": ", ".join(random.sample(kws, min(4, len(kws)))),
+            "published_at": make_published_at(),
+            "views": random.randint(50, 60_000),
+            "likes": random.randint(0, 6_000),
+        }
+    )
+    doc_id += 1
+
+# ---------------------------------------------------------------------------
+# Write Parquet
+# ---------------------------------------------------------------------------
+df = pd.DataFrame(records)
+
+schema = pa.schema(
+    [
+        pa.field("id", pa.int32()),
+        pa.field("title", pa.string()),
+        pa.field("content", pa.string()),
+        pa.field("author", pa.string()),
+        pa.field("category", pa.string()),
+        pa.field("tags", pa.string()),
+        pa.field("published_at", pa.string()),
+        pa.field("views", pa.int32()),
+        pa.field("likes", pa.int32()),
+    ]
+)
+
+table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
+pq.write_table(table, OUT_PATH, compression="snappy")
+
+print(f"Wrote {len(records):,} records to {OUT_PATH}")
+print(df["category"].value_counts().to_string())
diff --git a/vectors/elasticsearch/spicepod.yaml b/vectors/elasticsearch/spicepod.yaml
new file mode 100644
index 0000000..bfa20fb
--- /dev/null
+++ b/vectors/elasticsearch/spicepod.yaml
@@ -0,0 +1,43 @@
+version: v2
+kind: Spicepod
+name: 06-search-engine
+
+embeddings:
+  - name: local_embeddings
+    from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+
+datasets:
+  - from: file:./articles.parquet
+    name: articles
+    params:
+      file_format: parquet
+    acceleration:
+      enabled: true
+      engine: arrow
+    columns:
+      - name: content
+        embeddings:
+          - from: local_embeddings
+            row_id: id
+            chunking:
+              enabled: true
+              target_chunk_size: 150
+              overlap_size: 20
+        full_text_search:
+          enabled: true
+          row_id:
+            - id
+      - name: title
+        full_text_search:
+          enabled: true
+          row_id:
+            - id
+    vectors:
+      enabled: true
+      engine: elasticsearch
+      params:
+        elasticsearch_endpoint: http://localhost:9200
+        elasticsearch_index: articles_search_engine
+        elasticsearch_vector_field: content_embedding
+        elasticsearch_user: elastic
+        elasticsearch_pass: spiceai

From 591def3008aceebe6736ef6eee2ab76848fac312 Mon Sep 17 00:00:00 2001
From: ewgenius <hey@ewgenius.me>
Date: Wed, 22 Apr 2026 23:52:44 +0900
Subject: [PATCH 2/7] tweak cookbooks

---
 elasticsearch/connector/README.md        | 42 +++++++----
 elasticsearch/connector/generate_data.py | 39 ++--------
 elasticsearch/connector/load_data.py     | 92 +-----------------------
 vectors/elasticsearch/generate_data.py   | 64 ++++-------------
 vectors/elasticsearch/spicepod.yaml      |  5 +-
 5 files changed, 55 insertions(+), 187 deletions(-)

diff --git a/elasticsearch/connector/README.md b/elasticsearch/connector/README.md
index f238289..2b2538c 100644
--- a/elasticsearch/connector/README.md
+++ b/elasticsearch/connector/README.md
@@ -63,27 +63,45 @@ WHERE category = 'machine_learning'
 LIMIT 10;
 ```
 
-Inspect one row from the `all_types` index:
-
-```sql
-SELECT *
-FROM all_types
-LIMIT 1;
+```
++----+----------------------------------------------------------------------------------+------------------+------------------+
+| id |                                       title                                      |     category     |      author      |
+|int32|                                      varchar                                     |      varchar     |      varchar     |
++----+----------------------------------------------------------------------------------+------------------+------------------+
+| 6  | Cost-Aware AutoML on Kubernetes                                                  | machine_learning | Bob Martinez     |
+| 14 | Contrastive Learning Explained: From Theory to Production                        | machine_learning | Priya Sharma     |
+| 27 | How Federated Learning Is Transforming AI Applications                           | machine_learning | Quinn Walker     |
+| 32 | Scaling Few-Shot Learning to Billions of Parameters                              | machine_learning | Tom Brennan      |
+| 38 | Why understanding Generative Adversarial Networks Through Mathematical Intuition | machine_learning | Alice Chen       |
+| 40 | Few-Shot Learning: State of the Art in 2025                                      | machine_learning | Luca Ferrari     |
+| 45 | Attention Mechanisms: State of the Art in 2025                                   | machine_learning | Carol Okonkwo    |
+| 47 | Why understanding Fine-Tuning Through Mathematical Intuition                     | machine_learning | Ravi Subramaniam |
+| 63 | Scaling Self-Supervised Learning to Billions of Parameters                       | machine_learning | Priya Sharma     |
+| 68 | A Practical Guide to Diffusion Models                                            | machine_learning | Luca Ferrari     |
++----+----------------------------------------------------------------------------------+------------------+------------------+
+
+Time: 0.036074709 seconds. 10 rows.
 ```
 
 Filter the `all_types` index on a keyword field:
 
 ```sql
-SELECT id, field_keyword, field_integer
+SELECT *
 FROM all_types
 WHERE field_keyword = 'category_0';
 ```
 
-## Notes
-
-- `articles` and `all_types` are queried directly from Elasticsearch.
-- The connector maps Elasticsearch index mappings to Arrow schemas so the data can be queried with SQL in Spice.
-- For indices with compatible search fields, Spice can route `vector_search`, `text_search`, and `rrf` to Elasticsearch.
+```
++--------------------------+---------------+------------+-----------------------------------------------+----------------------+--------------------------------+-------------------------------------------------------------+--------------------------------------------+--------------------+--------------------------+-------------------------------------------------------+-------------+-------------------------+--------------------------------------+--------------------------------------------------------+------------------+---------------+-----------------------+---------------+---------------+---------------+---------------------------+---------------------------------------------------------------+-------------------+--------------------+--------------------+----------------------------------------+-------------+-----------------------------------------------------------+-------------------------------------+---------------------+---------------+----+
+|       field_binary       | field_boolean | field_byte |                field_completion               |      field_date      |        field_date_nanos        |                       field_date_range                      |             field_dense_vector             |    field_double    |    field_double_range    |                    field_flattened                    | field_float |    field_float_range    |            field_geo_point           |                     field_geo_shape                    | field_half_float | field_integer |  field_integer_range  |    field_ip   | field_keyword |   field_long  |      field_long_range     |                          field_nested                         | field_object.name | field_object.value | field_scaled_float |        field_search_as_you_type        | field_short |                         field_text                        |          field_token_count          | field_unsigned_long | field_version | id |
+|          varchar         |    boolean    |    int8    |                    varchar                    |        varchar       |             varchar            |                           varchar                           |                 float32[4]                 |       float64      |          varchar         |                        varchar                        |   float32   |         varchar         |                varchar               |                         varchar                        |      float32     |     int32     |        varchar        |    varchar    |    varchar    |     int64     |          varchar          |                            varchar                            |      varchar      |        int32       |       float32      |                 varchar                |    int16    |                          varchar                          |               varchar               |       varchar       |    varchar    |int32|
++--------------------------+---------------+------------+-----------------------------------------------+----------------------+--------------------------------+-------------------------------------------------------------+--------------------------------------------+--------------------+--------------------------+-------------------------------------------------------+-------------+-------------------------+--------------------------------------+--------------------------------------------------------+------------------+---------------+-----------------------+---------------+---------------+---------------+---------------------------+---------------------------------------------------------------+-------------------+--------------------+--------------------+----------------------------------------+-------------+-----------------------------------------------------------+-------------------------------------+---------------------+---------------+----+
+| YmluYXJ5X3BheWxvYWRfNQ== | true          | -114       | {"input":["suggest_5","doc_5"],"weight":6}    | 2024-06-06T05:00:00Z | 2024-06-06T05:00:00.000000000Z | {"gte":"2024-01-06T00:00:00Z","lte":"2024-12-06T23:59:59Z"} | [-0.958924, -0.279415, 0.656987, 0.989358] | 680696.2410453358  | {"gte":5.5,"lte":5.51}   | {"arbitrary_key":"value_5","nested_key":{"deep":5}}   | 551.9171    | {"gte":5.0,"lte":5.5}   | {"lat":-21.463575,"lon":-143.289215} | {"type":"Point","coordinates":[-90.240943,41.613069]}  | -50.19           | 94455         | {"gte":50,"lte":55}   | 192.168.5.35  | category_0    | 24150178885   | {"gte":5000,"lte":5100}   | [{"tag":"tag_1","score":0.372},{"tag":"tag_2","score":0.868}] | obj_5             | 35                 | 51.85              | searchable text for document number 5  | 14225       | The quick brown fox jumps over the lazy dog — document 5  | token count source text document 5  | 261035185072990349  | 1.5.0         | 5  |
+| YmluYXJ5X3BheWxvYWRfMTA= | false         | -121       | {"input":["suggest_10","doc_10"],"weight":11} | 2024-02-11T10:00:00Z | 2024-02-11T10:00:00.000000000Z | {"gte":"2024-01-11T00:00:00Z","lte":"2024-12-11T23:59:59Z"} | [-0.544021, -0.99999, -0.536573, 0.420167] | -587803.5357209966 | {"gte":11.0,"lte":11.01} | {"arbitrary_key":"value_10","nested_key":{"deep":10}} | 626.6425    | {"gte":10.0,"lte":10.5} | {"lat":-45.000598,"lon":163.014087}  | {"type":"Point","coordinates":[178.760517,-81.979851]} | 64.72            | 12430         | {"gte":100,"lte":105} | 192.168.10.70 | category_0    | -955323551533 | {"gte":10000,"lte":10100} | [{"tag":"tag_1","score":0.521},{"tag":"tag_2","score":0.328}] | obj_10            | 70                 | 653.47             | searchable text for document number 10 | 30482       | The quick brown fox jumps over the lazy dog — document 10 | token count source text document 10 | 79329244941176303   | 1.10.0        | 10 |
++--------------------------+---------------+------------+-----------------------------------------------+----------------------+--------------------------------+-------------------------------------------------------------+--------------------------------------------+--------------------+--------------------------+-------------------------------------------------------+-------------+-------------------------+--------------------------------------+--------------------------------------------------------+------------------+---------------+-----------------------+---------------+---------------+---------------+---------------------------+---------------------------------------------------------------+-------------------+--------------------+--------------------+----------------------------------------+-------------+-----------------------------------------------------------+-------------------------------------+---------------------+---------------+----+
+
+Time: 0.024743022 seconds. 2 rows.
+```
 
 ## Learn more
 
diff --git a/elasticsearch/connector/generate_data.py b/elasticsearch/connector/generate_data.py
index 1bf9399..077d89f 100644
--- a/elasticsearch/connector/generate_data.py
+++ b/elasticsearch/connector/generate_data.py
@@ -1,17 +1,13 @@
 #!/usr/bin/env python3
 """
-Generate articles.parquet — sample data for Spice Elasticsearch connector tests.
+Generate articles.parquet — sample data for the Spice Elasticsearch connector recipe.
 
-Produces ~15 000 records with sufficient lexical and semantic diversity to
-meaningfully exercise:
-  - Data connector (federated SQL via elasticsearch:articles)
-  - Vector search  (vector_search UDTF)
-  - Full-text search (text_search UDTF / BM25)
-  - Hybrid search with RRF (rrf(vector_search(...), text_search(...)))
+Produces article records with sufficient lexical diversity to exercise the
+Elasticsearch data connector.
 
 Usage:
   pip install pandas pyarrow faker
-  python generate_data.py [--rows N]   # default 15000
+  python generate_data.py [--rows N]
 """
 
 import argparse
@@ -30,15 +26,7 @@
 parser = argparse.ArgumentParser()
 parser.add_argument("--rows", type=int, default=100)
 parser.add_argument("--out", default="articles.parquet")
-parser.add_argument(
-    "--embeddings",
-    action="store_true",
-    help="Compute content_embedding column using all-MiniLM-L6-v2 and write into parquet",
-)
-parser.add_argument(
-    "--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2"
-)
-parser.add_argument("--embedding-batch-size", type=int, default=64)
+
 args = parser.parse_args()
 
 TARGET_ROWS = args.rows
@@ -655,23 +643,6 @@ def make_published_at() -> str:
     pa.field("likes", pa.int32()),
 ]
 
-if args.embeddings:
-    print(f"Computing embeddings with '{args.embedding_model}' …")
-    from sentence_transformers import SentenceTransformer
-
-    model = SentenceTransformer(args.embedding_model)
-    texts = df["content"].tolist()
-    vecs = model.encode(
-        texts,
-        batch_size=args.embedding_batch_size,
-        show_progress_bar=True,
-        convert_to_numpy=True,
-    )
-    dims = vecs.shape[1]
-    print(f"Computed {len(vecs)} embeddings, dims={dims}")
-    df["content_embedding"] = [v.tolist() for v in vecs]
-    fields.append(pa.field("content_embedding", pa.list_(pa.float32(), dims)))
-
 schema = pa.schema(fields)
 table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
 pq.write_table(table, OUT_PATH, compression="snappy")
diff --git a/elasticsearch/connector/load_data.py b/elasticsearch/connector/load_data.py
index e6059ec..55b61f7 100644
--- a/elasticsearch/connector/load_data.py
+++ b/elasticsearch/connector/load_data.py
@@ -3,9 +3,9 @@
 Load articles.parquet into Elasticsearch.
 
 Usage:
-  uv run load_data.py                        # load articles index only
-  uv run load_data.py --embeddings           # also compute + load vectors index
-  uv run load_data.py --embeddings --parquet 04-vector-search/articles.parquet
+  uv run load_data.py
+  uv run load_data.py --all-types
+  uv run load_data.py --parquet articles.parquet --all-types
 """
 
 import argparse
@@ -25,17 +25,6 @@
 parser.add_argument("--es-pass", default=os.environ.get("ES_PASS", "spiceai"))
 parser.add_argument("--parquet", default="articles.parquet")
 parser.add_argument("--index", default="articles")
-parser.add_argument(
-    "--embeddings",
-    action="store_true",
-    help="Also compute and load vectors into a dense_vector index",
-)
-parser.add_argument("--vectors-index", default="articles_vectors")
-parser.add_argument(
-    "--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2"
-)
-parser.add_argument("--embedding-column", default="content")
-parser.add_argument("--batch-size", type=int, default=64)
 parser.add_argument(
     "--all-types",
     action="store_true",
@@ -145,78 +134,6 @@ def load_articles(df: dict) -> None:
     print(f"Loaded {n} documents → '{INDEX}' (count={count})")
 
 
-# ---------------------------------------------------------------------------
-# Vectors index
-# ---------------------------------------------------------------------------
-def vectors_mapping(dims: int) -> dict:
-    return {
-        "settings": {"number_of_shards": 1, "number_of_replicas": 0},
-        "mappings": {
-            "properties": {
-                "id": {"type": "integer"},
-                "content": {"type": "text", "index": False},
-                "content_embedding": {
-                    "type": "dense_vector",
-                    "dims": dims,
-                    "index": True,
-                    "similarity": "cosine",
-                },
-            }
-        },
-    }
-
-
-def load_vectors(df: dict) -> None:
-    print(f"Loading embedding model '{args.embedding_model}' …")
-    from sentence_transformers import SentenceTransformer
-
-    model = SentenceTransformer(args.embedding_model)
-
-    texts = df[args.embedding_column]
-    ids = df["id"]
-    n = len(ids)
-    dims = model.get_embedding_dimension()
-    print(
-        f"Model dims={dims}, encoding {n} documents in batches of {args.batch_size} …"
-    )
-
-    recreate_index(args.vectors_index, vectors_mapping(dims))
-
-    for batch_start in range(0, n, args.batch_size):
-        batch_texts = texts[batch_start : batch_start + args.batch_size]
-        batch_ids = ids[batch_start : batch_start + args.batch_size]
-        embeddings = model.encode(
-            batch_texts, show_progress_bar=False, convert_to_numpy=True
-        )
-
-        lines = []
-        for i, (doc_id, text, vec) in enumerate(
-            zip(batch_ids, batch_texts, embeddings)
-        ):
-            lines.append(
-                json.dumps({"index": {"_index": args.vectors_index, "_id": doc_id}})
-            )
-            lines.append(
-                json.dumps(
-                    {
-                        "id": doc_id,
-                        "content": text,
-                        "content_embedding": vec.tolist(),
-                    }
-                )
-            )
-        bulk_request(lines)
-        print(f"  Indexed {min(batch_start + args.batch_size, n)}/{n} vectors …")
-
-    requests.post(f"{ES_HOST}/{args.vectors_index}/_refresh", auth=ES_AUTH)
-    count = (
-        requests.get(f"{ES_HOST}/{args.vectors_index}/_count", auth=ES_AUTH)
-        .json()
-        .get("count", "?")
-    )
-    print(f"Loaded {n} vectors → '{args.vectors_index}' (count={count})")
-
-
 # ---------------------------------------------------------------------------
 # all_types index — one document per supported ES field type
 # ---------------------------------------------------------------------------
@@ -385,9 +302,6 @@ def load_all_types(n: int = 10) -> None:
     recreate_index(INDEX, ARTICLES_MAPPING)
     load_articles(df)
 
-    if args.embeddings:
-        load_vectors(df)
-
     if args.all_types:
         load_all_types(args.all_types_docs)
 
diff --git a/vectors/elasticsearch/generate_data.py b/vectors/elasticsearch/generate_data.py
index ba4edcb..643fafb 100644
--- a/vectors/elasticsearch/generate_data.py
+++ b/vectors/elasticsearch/generate_data.py
@@ -9,7 +9,6 @@
 
 import argparse
 import random
-from datetime import datetime, timedelta
 from itertools import product
 
 import pandas as pd
@@ -387,16 +386,6 @@
     },
 ]
 
-AUTHORS = [
-    "Alice Chen", "Bob Martinez", "Carol Okonkwo", "David Kim", "Eva Lindqvist",
-    "Frank Nguyen", "Grace Patel", "Hiro Tanaka", "Ingrid Sørensen", "James O'Brien",
-    "Kavya Reddy", "Luca Ferrari", "Maya Goldberg", "Nadia Petrov", "Oscar Bergström",
-    "Priya Sharma", "Quinn Walker", "Ravi Subramaniam", "Sofia Andrade", "Tom Brennan",
-]
-
-BASE_DATE = datetime(2024, 1, 1)
-YEAR_RANGE = 2
-
 # ---------------------------------------------------------------------------
 # Content generation
 # ---------------------------------------------------------------------------
@@ -452,31 +441,21 @@ def make_body(title: str, keywords: list[str], category: str, year: int) -> str:
         title=title, year=year, kw1=kw1, kw2=kw2, kw3=kw3
     )
 
-    paragraphs = [intro]
-    for _ in range(random.randint(3, 6)):
-        n_sentences = random.randint(3, 6)
-        sentences = []
-        for _ in range(n_sentences):
-            kw = random.choice(keywords)
-            tmpl = random.choice(BODY_SENTENCE_POOLS)
-            sentences.append(tmpl.format(kw=kw))
-        sentences += fake.sentences(nb=random.randint(1, 3))
-        random.shuffle(sentences)
-        paragraphs.append(" ".join(sentences))
+    sentences = []
+    for _ in range(random.randint(2, 3)):
+        kw = random.choice(keywords)
+        tmpl = random.choice(BODY_SENTENCE_POOLS)
+        sentences.append(tmpl.format(kw=kw))
+
+    sentences += fake.sentences(nb=1)
+    random.shuffle(sentences)
 
     closing_kw = random.choice(keywords)
-    paragraphs.append(
-        random.choice(CLOSING_TEMPLATES).format(kw=closing_kw, category=category)
+    closing = random.choice(CLOSING_TEMPLATES).format(
+        kw=closing_kw, category=category
     )
-    return "\n\n".join(paragraphs)
-
 
-def make_published_at() -> str:
-    days = random.randint(0, 365 * YEAR_RANGE)
-    hours = random.randint(0, 23)
-    minutes = random.randint(0, 59)
-    dt = BASE_DATE + timedelta(days=days, hours=hours, minutes=minutes)
-    return dt.isoformat()
+    return " ".join([intro, " ".join(sentences), closing])
 
 
 # ---------------------------------------------------------------------------
@@ -510,13 +489,8 @@ def make_published_at() -> str:
         {
             "id": i,
             "title": seed["title"],
-            "content": make_body(seed["title"], kws, seed["category"], year),
-            "author": random.choice(AUTHORS),
             "category": seed["category"],
-            "tags": ", ".join(random.sample(kws, min(3, len(kws)))),
-            "published_at": make_published_at(),
-            "views": random.randint(5_000, 80_000),
-            "likes": random.randint(200, 8_000),
+            "content": make_body(seed["title"], kws, seed["category"], year),
         }
     )
 
@@ -537,13 +511,8 @@ def make_published_at() -> str:
         {
             "id": doc_id,
             "title": title,
-            "content": make_body(title, kws, seed["category"], year),
-            "author": random.choice(AUTHORS),
             "category": seed["category"],
-            "tags": ", ".join(random.sample(kws, min(4, len(kws)))),
-            "published_at": make_published_at(),
-            "views": random.randint(50, 60_000),
-            "likes": random.randint(0, 6_000),
+            "content": make_body(title, kws, seed["category"], year),
         }
     )
     doc_id += 1
@@ -557,13 +526,8 @@ def make_published_at() -> str:
     [
         pa.field("id", pa.int32()),
         pa.field("title", pa.string()),
-        pa.field("content", pa.string()),
-        pa.field("author", pa.string()),
         pa.field("category", pa.string()),
-        pa.field("tags", pa.string()),
-        pa.field("published_at", pa.string()),
-        pa.field("views", pa.int32()),
-        pa.field("likes", pa.int32()),
+        pa.field("content", pa.string()),
     ]
 )
 
diff --git a/vectors/elasticsearch/spicepod.yaml b/vectors/elasticsearch/spicepod.yaml
index bfa20fb..fad524a 100644
--- a/vectors/elasticsearch/spicepod.yaml
+++ b/vectors/elasticsearch/spicepod.yaml
@@ -21,8 +21,9 @@ datasets:
             row_id: id
             chunking:
               enabled: true
-              target_chunk_size: 150
-              overlap_size: 20
+              target_chunk_size: 256
+              overlap_size: 64
+              file_format: md
         full_text_search:
           enabled: true
           row_id:

From 7f8c74927b3f6b435ec9f9a706d873f50708ebd5 Mon Sep 17 00:00:00 2001
From: ewgenius <hey@ewgenius.me>
Date: Wed, 22 Apr 2026 23:53:35 +0900
Subject: [PATCH 3/7] gitignore

---
 elasticsearch/connector/.gitignore | 1 +
 vectors/elasticsearch/.gitignore   | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 elasticsearch/connector/.gitignore
 create mode 100644 vectors/elasticsearch/.gitignore

diff --git a/elasticsearch/connector/.gitignore b/elasticsearch/connector/.gitignore
new file mode 100644
index 0000000..4bed5da
--- /dev/null
+++ b/elasticsearch/connector/.gitignore
@@ -0,0 +1 @@
+*.parquet
diff --git a/vectors/elasticsearch/.gitignore b/vectors/elasticsearch/.gitignore
new file mode 100644
index 0000000..4bed5da
--- /dev/null
+++ b/vectors/elasticsearch/.gitignore
@@ -0,0 +1 @@
+*.parquet

From 566ef580d5d4ee28ac1e08c2198072841f84f5d5 Mon Sep 17 00:00:00 2001
From: ewgenius <hey@ewgenius.me>
Date: Thu, 23 Apr 2026 00:03:45 +0900
Subject: [PATCH 4/7] tweak

---
 vectors/elasticsearch/README.md        | 15 ++++++++-------
 vectors/elasticsearch/generate_data.py | 23 +++++------------------
 2 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md
index a106f3d..74cb9cb 100644
--- a/vectors/elasticsearch/README.md
+++ b/vectors/elasticsearch/README.md
@@ -7,7 +7,7 @@ This recipe demonstrates how to use Elasticsearch as a vector engine in Spice.ai
 - Run Elasticsearch locally with Docker Compose
 - Generate a sample articles dataset
 - Start Spice and automatically write embeddings into Elasticsearch
-- Run vector and hybrid search queries against the indexed embeddings
+- Run vector and hybrid search queries that match the generated article titles and content
 
 ## Prerequisites
 
@@ -50,6 +50,7 @@ On startup, Spice automatically:
 4. Bulk indexes the vectors into Elasticsearch
 
 When startup completes, the `articles` dataset is ready for vector and hybrid search queries.
+Use search prompts that mirror the generated article topics, such as `pgvector`, `Kubernetes`, `AutoML`, and `hybrid search`.
 
 ### Step 3: Open the Spice SQL REPL
 
@@ -64,8 +65,8 @@ spice sql
 Run semantic similarity search over the indexed embeddings:
 
 ```sql
-SELECT id, title, _score
-FROM vector_search(articles, 'semantic similarity retrieval', 10)
+SELECT id, title, category, _score
+FROM vector_search(articles, 'pgvector vector databases sql query optimisation', 10)
 ORDER BY _score DESC;
 ```
 
@@ -73,8 +74,8 @@ Run vector search with a post-filter:
 
 ```sql
 SELECT id, title, category, _score
-FROM vector_search(articles, 'cost optimisation cloud infrastructure', 10)
-WHERE category = 'cloud_infrastructure'
+FROM vector_search(articles, 'kubernetes cost optimisation automl', 10)
+WHERE category = 'machine_learning'
 ORDER BY _score DESC;
 ```
 
@@ -83,8 +84,8 @@ Fuse vector and keyword results with [Reciprocal Rank Fusion (RRF)](https://spic
 ```sql
 SELECT id, title, category, fused_score
 FROM rrf(
-    vector_search(articles, 'machine learning algorithms'),
-    text_search(articles, 'machine learning algorithms', content),
+    vector_search(articles, 'hybrid search elasticsearch pgvector'),
+    text_search(articles, 'hybrid search elasticsearch pgvector', content),
     join_key => 'id'
 )
 ORDER BY fused_score DESC
diff --git a/vectors/elasticsearch/generate_data.py b/vectors/elasticsearch/generate_data.py
index 643fafb..8ef1073 100644
--- a/vectors/elasticsearch/generate_data.py
+++ b/vectors/elasticsearch/generate_data.py
@@ -435,27 +435,14 @@
 def make_body(title: str, keywords: list[str], category: str, year: int) -> str:
     kws = keywords[:]
     random.shuffle(kws)
-    kw1, kw2, kw3 = (kws + kws)[:3]
+    kw1, kw2 = (kws + kws)[:2]
 
-    intro = random.choice(INTRO_TEMPLATES).format(
-        title=title, year=year, kw1=kw1, kw2=kw2, kw3=kw3
-    )
-
-    sentences = []
-    for _ in range(random.randint(2, 3)):
-        kw = random.choice(keywords)
-        tmpl = random.choice(BODY_SENTENCE_POOLS)
-        sentences.append(tmpl.format(kw=kw))
+    intro = f"{title}. {kw1} and {kw2} are key ideas in {category.replace('_', ' ')}."
 
-    sentences += fake.sentences(nb=1)
-    random.shuffle(sentences)
-
-    closing_kw = random.choice(keywords)
-    closing = random.choice(CLOSING_TEMPLATES).format(
-        kw=closing_kw, category=category
-    )
+    kw = random.choice(keywords)
+    body = random.choice(BODY_SENTENCE_POOLS).format(kw=kw)
 
-    return " ".join([intro, " ".join(sentences), closing])
+    return " ".join([intro, body])
 
 
 # ---------------------------------------------------------------------------

From 6077d14388f1b943a61a9155f61cf49d0f4c413b Mon Sep 17 00:00:00 2001
From: ewgenius <hey@ewgenius.me>
Date: Thu, 23 Apr 2026 00:25:35 +0900
Subject: [PATCH 5/7] tweak data

---
 vectors/elasticsearch/README.md        | 31 ++++++---
 vectors/elasticsearch/generate_data.py | 91 ++++++++++++++++++++++++--
 vectors/elasticsearch/spicepod.yaml    |  7 +-
 3 files changed, 115 insertions(+), 14 deletions(-)

diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md
index 74cb9cb..19ad809 100644
--- a/vectors/elasticsearch/README.md
+++ b/vectors/elasticsearch/README.md
@@ -7,7 +7,7 @@ This recipe demonstrates how to use Elasticsearch as a vector engine in Spice.ai
 - Run Elasticsearch locally with Docker Compose
 - Generate a sample articles dataset
 - Start Spice and automatically write embeddings into Elasticsearch
-- Run vector and hybrid search queries that match the generated article titles and content
+- Run vector and hybrid search queries using meaningful phrases and natural-language questions
 
 ## Prerequisites
 
@@ -50,7 +50,7 @@ On startup, Spice automatically:
 4. Bulk indexes the vectors into Elasticsearch
 
 When startup completes, the `articles` dataset is ready for vector and hybrid search queries.
-Use search prompts that mirror the generated article topics, such as `pgvector`, `Kubernetes`, `AutoML`, and `hybrid search`.
+Use phrase-based prompts and natural-language questions that mirror the generated article topics, such as `pgvector`, `Kubernetes`, `AutoML`, and `hybrid search`.
 
 ### Step 3: Open the Spice SQL REPL
 
@@ -62,19 +62,27 @@ spice sql
 
 ## Run Queries
 
-Run semantic similarity search over the indexed embeddings:
+Run semantic similarity search over the indexed embeddings using a phrase-based prompt:
 
 ```sql
 SELECT id, title, category, _score
-FROM vector_search(articles, 'pgvector vector databases sql query optimisation', 10)
+FROM vector_search(
+    articles,
+    'How does pgvector improve SQL query optimisation for vector search?',
+    10
+)
 ORDER BY _score DESC;
 ```
 
-Run vector search with a post-filter:
+Run vector search with a post-filter using a natural-language question:
 
 ```sql
 SELECT id, title, category, _score
-FROM vector_search(articles, 'kubernetes cost optimisation automl', 10)
+FROM vector_search(
+    articles,
+    'What are the trade-offs of cost-aware AutoML on Kubernetes?',
+    10
+)
 WHERE category = 'machine_learning'
 ORDER BY _score DESC;
 ```
@@ -84,8 +92,15 @@ Fuse vector and keyword results with [Reciprocal Rank Fusion (RRF)](https://spic
 ```sql
 SELECT id, title, category, fused_score
 FROM rrf(
-    vector_search(articles, 'hybrid search elasticsearch pgvector'),
-    text_search(articles, 'hybrid search elasticsearch pgvector', content),
+    vector_search(
+        articles,
+        'How can hybrid search combine Elasticsearch and pgvector effectively?'
+    ),
+    text_search(
+        articles,
+        'How can hybrid search combine Elasticsearch and pgvector effectively?',
+        content
+    ),
     join_key => 'id'
 )
 ORDER BY fused_score DESC
diff --git a/vectors/elasticsearch/generate_data.py b/vectors/elasticsearch/generate_data.py
index 8ef1073..9d2559c 100644
--- a/vectors/elasticsearch/generate_data.py
+++ b/vectors/elasticsearch/generate_data.py
@@ -435,14 +435,95 @@
 def make_body(title: str, keywords: list[str], category: str, year: int) -> str:
     kws = keywords[:]
     random.shuffle(kws)
-    kw1, kw2 = (kws + kws)[:2]
+    selected = (kws + kws)[:6]
+    kw1, kw2, kw3, kw4, kw5, kw6 = selected
 
-    intro = f"{title}. {kw1} and {kw2} are key ideas in {category.replace('_', ' ')}."
+    category_label = category.replace("_", " ")
 
-    kw = random.choice(keywords)
-    body = random.choice(BODY_SENTENCE_POOLS).format(kw=kw)
+    intro = (
+        f"{title}\n\n"
+        f"In {year}, {category_label} teams are increasingly asking practical questions such as "
+        f"\"How do we apply {kw1} in production?\", "
+        f"\"When does {kw2} outperform older approaches?\", and "
+        f"\"What trade-offs should we expect when adopting {kw3}?\" "
+        f"This article explores those questions through concrete examples, implementation patterns, "
+        f"and lessons learned from real systems."
+    )
+
+    sections = [
+        (
+            f"What problem does {kw1} solve in modern {category_label} systems?\n\n"
+            f"At a high level, {kw1} helps teams improve how they design, operate, and scale their systems. "
+            f"When engineers first encounter {kw1}, they often focus on surface-level features, but the real value "
+            f"usually appears when it is combined with adjacent ideas such as {kw2} and {kw3}. "
+            f"In practice, this means better clarity around system behavior, faster iteration cycles, and more predictable "
+            f"performance in production environments.\n\n"
+            f"A common question is: \"How should we evaluate {kw1} before rolling it out widely?\" "
+            f"A good starting point is to define one or two measurable goals, such as reducing latency, increasing relevance, "
+            f"or simplifying operational workflows. Teams that skip this step often end up discussing {kw1} in abstract terms "
+            f"without learning whether it actually improves outcomes for users."
+        ),
+        (
+            f"How does {kw2} affect implementation choices?\n\n"
+            f"Implementation details matter. The way a team approaches {kw2} can influence data modeling, indexing strategy, "
+            f"query design, and even incident response. For example, engineers working with {kw2} often discover that the hardest "
+            f"part is not getting a basic demo running, but making it observable, cost-effective, and robust under real traffic.\n\n"
+            f"Another practical question is: \"What should we monitor once {kw2} is live?\" "
+            f"Useful signals include latency percentiles, throughput, error rates, and the quality of outputs returned to users. "
+            f"If those signals drift over time, the team can inspect whether {kw4}, infrastructure constraints, or poor query patterns "
+            f"are introducing regressions."
+        ),
+        (
+            f"Why do teams pair {kw3} with {kw4}?\n\n"
+            f"These topics are often discussed together because they reinforce each other. "
+            f"{kw3} can improve the expressiveness or quality of a system, while {kw4} helps ensure that the system remains stable "
+            f"and understandable as complexity grows. This combination is especially useful in architectures where the same dataset "
+            f"must support multiple access patterns such as analytics, retrieval, filtering, and ranking.\n\n"
+            f"A phrase that often appears in internal design reviews is \"progressive adoption.\" "
+            f"Instead of rewriting an entire platform, teams usually introduce {kw3} and {kw4} in stages. "
+            f"They start with one workflow, validate the impact, and then extend the pattern to adjacent services once the operational "
+            f"trade-offs are clear."
+        ),
+        (
+            f"What are the operational trade-offs of adopting {kw5}?\n\n"
+            f"Every meaningful architectural choice introduces trade-offs. "
+            f"{kw5} may improve developer productivity or system capability, but it can also add moving parts that require careful tuning. "
+            f"Engineers should ask questions like: \"How much additional storage will this require?\" "
+            f"\"Will it change the indexing pipeline?\" and \"How will we debug failures when results look plausible but are subtly wrong?\"\n\n"
+            f"Answering those questions usually requires a mix of benchmarking and qualitative review. "
+            f"Teams that succeed with {kw5} tend to document their assumptions, capture representative workloads, and compare multiple "
+            f"approaches before standardizing on one design."
+        ),
+        (
+            f"When should you choose {kw6} over simpler alternatives?\n\n"
+            f"The best choice depends on context. Sometimes {kw6} is clearly justified because the workload is large, the relevance "
+            f"requirements are strict, or the user experience depends on high-quality retrieval. In other situations, a simpler approach "
+            f"may be easier to explain, cheaper to operate, and good enough for the job.\n\n"
+            f"A useful decision framework is to ask whether {kw6} solves a problem that users can actually feel. "
+            f"If the answer is yes, the investment is often worthwhile. If not, a smaller design may create more value by reducing "
+            f"maintenance burden while keeping the system understandable for the team."
+        ),
+        (
+            f"Practical guidance for evaluation\n\n"
+            f"If you are exploring {title.lower()}, start with a narrow slice of the problem and use realistic data. "
+            f"Test how well the system handles representative phrases and natural-language questions, not just isolated keywords. "
+            f"For example, instead of evaluating only a term such as \"{kw1}\", try prompts like "
+            f"\"What are the trade-offs of {kw1}?\" or "
+            f"\"How does {kw2} improve production reliability?\" "
+            f"Those richer prompts usually reveal whether the system truly captures meaning or only memorizes exact wording.\n\n"
+            f"Over time, the most successful teams treat {kw1}, {kw2}, and {kw3} as part of a larger operating model rather than a single feature. "
+            f"They refine prompts, improve datasets, and monitor how changes affect relevance, latency, and operator confidence."
+        ),
+    ]
+
+    closing = (
+        f"In summary, {title.lower()} is best understood as a practical engineering topic rather than a buzzword. "
+        f"Teams that ask clear questions, benchmark carefully, and connect concepts like {kw1}, {kw2}, and {kw3} to user-facing outcomes "
+        f"are far more likely to see durable results. That is why topics such as {kw4}, {kw5}, and {kw6} continue to matter across modern "
+        f"{category_label} systems."
+    )
 
-    return " ".join([intro, body])
+    return "\n\n".join([intro, *sections, closing])
 
 
 # ---------------------------------------------------------------------------
diff --git a/vectors/elasticsearch/spicepod.yaml b/vectors/elasticsearch/spicepod.yaml
index fad524a..2a5f87d 100644
--- a/vectors/elasticsearch/spicepod.yaml
+++ b/vectors/elasticsearch/spicepod.yaml
@@ -6,6 +6,11 @@ embeddings:
   - name: local_embeddings
     from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2
 
+  - from: openai:text-embedding-3-small
+    name: openai_embeddings
+    params:
+      openai_api_key: ${secrets:OPENAI_API_KEY}
+
 datasets:
   - from: file:./articles.parquet
     name: articles
@@ -17,7 +22,7 @@ datasets:
     columns:
       - name: content
         embeddings:
-          - from: local_embeddings
+          - from: openai_embeddings
             row_id: id
             chunking:
               enabled: true

From e7d0fbec2631e38c93a80b6fd755cc459c7fcc8d Mon Sep 17 00:00:00 2001
From: ewgenius <hey@ewgenius.me>
Date: Thu, 23 Apr 2026 00:30:18 +0900
Subject: [PATCH 6/7] update config

---
 vectors/elasticsearch/README.md     | 9 ++++++---
 vectors/elasticsearch/spicepod.yaml | 6 +-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md
index 19ad809..721e334 100644
--- a/vectors/elasticsearch/README.md
+++ b/vectors/elasticsearch/README.md
@@ -6,7 +6,7 @@ This recipe demonstrates how to use Elasticsearch as a vector engine in Spice.ai
 
 - Run Elasticsearch locally with Docker Compose
 - Generate a sample articles dataset
-- Start Spice and automatically write embeddings into Elasticsearch
+- Start Spice and automatically write OpenAI embeddings into Elasticsearch
 - Run vector and hybrid search queries using meaningful phrases and natural-language questions
 
 ## Prerequisites
@@ -46,12 +46,14 @@ On startup, Spice automatically:
 
 1. Creates the `articles_search_engine` Elasticsearch index with a `dense_vector` mapping
 2. Loads the article records
-3. Computes embeddings locally using `all-MiniLM-L6-v2`
+3. Computes embeddings using OpenAI `text-embedding-3-small`
 4. Bulk indexes the vectors into Elasticsearch
 
 When startup completes, the `articles` dataset is ready for vector and hybrid search queries.
 Use phrase-based prompts and natural-language questions that mirror the generated article topics, such as `pgvector`, `Kubernetes`, `AutoML`, and `hybrid search`.
 
+This avoids `dense_vector` mapping conflicts when the embedding dimensions change, such as `384` vs `1536`.
+
 ### Step 3: Open the Spice SQL REPL
 
 In a new terminal, start the SQL REPL:
@@ -110,7 +112,8 @@ LIMIT 10;
 ## Notes
 
 - The Elasticsearch index is created automatically. No manual mapping step is required.
-- Embeddings are generated during the initial dataset refresh.
+- Embeddings are generated during the initial dataset refresh using OpenAI `text-embedding-3-small`.
+- If the Elasticsearch index already exists from a previous run with a different embedding model, delete it before rerunning the recipe so Spice can recreate the mapping with the correct vector dimensions.
 
 ## Learn more
 
diff --git a/vectors/elasticsearch/spicepod.yaml b/vectors/elasticsearch/spicepod.yaml
index 2a5f87d..4f3abbd 100644
--- a/vectors/elasticsearch/spicepod.yaml
+++ b/vectors/elasticsearch/spicepod.yaml
@@ -3,9 +3,6 @@ kind: Spicepod
 name: 06-search-engine
 
 embeddings:
-  - name: local_embeddings
-    from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2
-
   - from: openai:text-embedding-3-small
     name: openai_embeddings
     params:
@@ -28,7 +25,6 @@ datasets:
               enabled: true
               target_chunk_size: 256
               overlap_size: 64
-              file_format: md
         full_text_search:
           enabled: true
           row_id:
@@ -43,7 +39,7 @@ datasets:
       engine: elasticsearch
       params:
         elasticsearch_endpoint: http://localhost:9200
-        elasticsearch_index: articles_search_engine
+        elasticsearch_index: articles_index
         elasticsearch_vector_field: content_embedding
         elasticsearch_user: elastic
         elasticsearch_pass: spiceai

From 8b7c87b0478e7443e441475779b6694450afc322 Mon Sep 17 00:00:00 2001
From: ewgenius <hey@ewgenius.me>
Date: Thu, 23 Apr 2026 00:35:02 +0900
Subject: [PATCH 7/7] update examples

---
 vectors/elasticsearch/README.md | 51 +++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md
index 721e334..146f9e1 100644
--- a/vectors/elasticsearch/README.md
+++ b/vectors/elasticsearch/README.md
@@ -76,6 +76,22 @@ FROM vector_search(
 ORDER BY _score DESC;
 ```
 
+```
++-----+----------------------------------------------------------------------+----------------+------------+
+|  id |                                 title                                |    category    |   _score   |
+|int32|                                varchar                               |     varchar    |   float64  |
++-----+----------------------------------------------------------------------+----------------+------------+
+| 1   | Vector Search Inside PostgreSQL with pgvector                        | databases      | 0.8442027  |
+| 7   | Hybrid Search with Elasticsearch and pgvector                        | search_engines | 0.82845736 |
+| 211 | How Vector Databases Impacts Query Performance                       | databases      | 0.78836733 |
+| 26  | When to Use choosing Between Sql Query Optimisation and Alternatives | databases      | 0.7725092  |
+| 290 | Write-Ahead Logging for High-Throughput Applications                 | databases      | 0.7686081  |
+| 83  | Why debugging Slow Queries with In-Memory Databases                  | databases      | 0.76624966 |
++-----+----------------------------------------------------------------------+----------------+------------+
+
+Time: 0.491752584 seconds. 6 rows.
+```
+
 Run vector search with a post-filter using a natural-language question:
 
 ```sql
@@ -89,6 +105,19 @@ WHERE category = 'machine_learning'
 ORDER BY _score DESC;
 ```
 
+```
++-----+------------------------------------------------------------+------------------+------------+
+|  id |                            title                           |     category     |   _score   |
+|int32|                           varchar                          |      varchar     |   float64  |
++-----+------------------------------------------------------------+------------------+------------+
+| 6   | Cost-Aware AutoML on Kubernetes                            | machine_learning | 0.8782016  |
+| 208 | Benchmarking Transformer Models Across Popular Frameworks  | machine_learning | 0.80589664 |
+| 106 | Implementing Generative Adversarial Networks Without a PhD | machine_learning | 0.80574334 |
++-----+------------------------------------------------------------+------------------+------------+
+
+Time: 0.246288086 seconds. 3 rows.
+```
+
 Fuse vector and keyword results with [Reciprocal Rank Fusion (RRF)](https://spiceai.org/docs/next/features/search#hybrid-search-with-rrf):
 
 ```sql
@@ -109,11 +138,23 @@ ORDER BY fused_score DESC
 LIMIT 10;
 ```
 
-## Notes
-
-- The Elasticsearch index is created automatically. No manual mapping step is required.
-- Embeddings are generated during the initial dataset refresh using OpenAI `text-embedding-3-small`.
-- If the Elasticsearch index already exists from a previous run with a different embedding model, delete it before rerunning the recipe so Spice can recreate the mapping with the correct vector dimensions.
+```
++-----+---------------------------------------------------------------------+----------------+----------------------+
+|  id |                                title                                |    category    |      fused_score     |
+|int32|                               varchar                               |     varchar    |        float64       |
++-----+---------------------------------------------------------------------+----------------+----------------------+
+| 7   | Hybrid Search with Elasticsearch and pgvector                       | search_engines | 0.03278688524590164  |
+| 1   | Vector Search Inside PostgreSQL with pgvector                       | databases      | 0.03225806451612903  |
+| 164 | How Embedding Models Powers Modern Search                           | search_engines | 0.031746031746031744 |
+| 148 | Why evaluating Sparse Retrieval: Metrics That Matter                | search_engines | 0.030776515151515152 |
+| 21  | Comparing Bi-Encoder Retrieval Approaches in 2025                   | search_engines | 0.02804284323271665  |
+| 295 | Full-Text Search Under the Hood: Architecture and Trade-offs        | search_engines | 0.015625             |
+| 78  | Comparing Sparse Retrieval Approaches in 2025                       | search_engines | 0.015384615384615385 |
+| 264 | Reciprocal Rank Fusion Under the Hood: Architecture and Trade-offs  | search_engines | 0.014925373134328358 |
+| 139 | How to Use cross-Encoder Reranking at Scale: Lessons from the Field | search_engines | 0.014705882352941176 |
+| 336 | A Developer's Guide to Embedding Models                             | search_engines | 0.014492753623188406 |
++-----+---------------------------------------------------------------------+----------------+----------------------+
+```
 
 ## Learn more