From d04da867f31bb7969ab0a6595c1f9e9626efcc60 Mon Sep 17 00:00:00 2001 From: ewgenius Date: Wed, 22 Apr 2026 23:19:13 +0900 Subject: [PATCH 1/7] Add Elasticsearch connector and vector engine recipes - Add `elasticsearch/connector` recipe: federated SQL access to Elasticsearch indices, including sample data generation, Docker Compose, and all-types coverage. - Add `vectors/elasticsearch` recipe: demonstrates using Elasticsearch as a vector engine, with sample data, Docker Compose, and Spice configuration for embedding and hybrid search. - Include detailed READMEs, data generation scripts, and example `spicepod.yaml` files for both recipes. --- elasticsearch/connector/README.md | 93 +++ elasticsearch/connector/docker-compose.yml | 49 ++ elasticsearch/connector/generate_data.py | 681 +++++++++++++++++++++ elasticsearch/connector/load_data.py | 394 ++++++++++++ elasticsearch/connector/spicepod.yaml | 32 + vectors/elasticsearch/README.md | 103 ++++ vectors/elasticsearch/docker-compose.yml | 28 + vectors/elasticsearch/generate_data.py | 574 +++++++++++++++++ vectors/elasticsearch/spicepod.yaml | 43 ++ 9 files changed, 1997 insertions(+) create mode 100644 elasticsearch/connector/README.md create mode 100644 elasticsearch/connector/docker-compose.yml create mode 100644 elasticsearch/connector/generate_data.py create mode 100644 elasticsearch/connector/load_data.py create mode 100644 elasticsearch/connector/spicepod.yaml create mode 100644 vectors/elasticsearch/README.md create mode 100644 vectors/elasticsearch/docker-compose.yml create mode 100644 vectors/elasticsearch/generate_data.py create mode 100644 vectors/elasticsearch/spicepod.yaml diff --git a/elasticsearch/connector/README.md b/elasticsearch/connector/README.md new file mode 100644 index 0000000..f238289 --- /dev/null +++ b/elasticsearch/connector/README.md @@ -0,0 +1,93 @@ +# Elasticsearch Data Connector + +Works with `v2.0+` + +This recipe demonstrates how to query Elasticsearch indices from Spice using federated SQL. It includes: + +- `articles` — a federated dataset queried directly from Elasticsearch +- `all_types` — a federated dataset covering supported Elasticsearch field types + +The Elasticsearch connector can also power `vector_search`, `text_search`, and `rrf` for indices that contain the required search fields. + +## Prerequisites + +- [Spice CLI](https://docs.spiceai.org/getting-started) installed +- Docker installed + +## Getting Started + +### Step 1: Prepare the recipe directory + +Change into the recipe directory: + +```bash +cd cookbook/elasticsearch/connector +``` + +### Step 2: Start Elasticsearch and seed the sample indices + +Start the local Elasticsearch service and seed the sample data: + +```bash +docker compose up +``` + +Keep this running while you use the recipe. + +### Step 3: Start the Spice runtime + +In a new terminal, start the Spice runtime: + +```bash +spice run +``` + +### Step 4: Open the Spice SQL REPL + +In another terminal, open the Spice SQL REPL: + +```bash +spice sql +``` + +### Step 5: Run a few example queries + +Run a few basic federated SQL queries to verify the Elasticsearch datasets are available. + +Query the `articles` index: + +```sql +SELECT id, title, category, author +FROM articles +WHERE category = 'machine_learning' +LIMIT 10; +``` + +Inspect one row from the `all_types` index: + +```sql +SELECT * +FROM all_types +LIMIT 1; +``` + +Filter the `all_types` index on a keyword field: + +```sql +SELECT id, field_keyword, field_integer +FROM all_types +WHERE field_keyword = 'category_0'; +``` + +## Notes + +- `articles` and `all_types` are queried directly from Elasticsearch. +- The connector maps Elasticsearch index mappings to Arrow schemas so the data can be queried with SQL in Spice. +- For indices with compatible search fields, Spice can route `vector_search`, `text_search`, and `rrf` to Elasticsearch. + +## Learn more + +- [Elasticsearch Data Connector Documentation](https://spiceai.org/docs/components/data-connectors/elasticsearch) +- [Search Functionality Documentation](https://spiceai.org/docs/features/search) +- [Datasets Reference](https://docs.spiceai.org/reference/spicepod/datasets) +- [Spice SQL CLI Reference](https://docs.spiceai.org/cli/reference/sql) diff --git a/elasticsearch/connector/docker-compose.yml b/elasticsearch/connector/docker-compose.yml new file mode 100644 index 0000000..561fa6f --- /dev/null +++ b/elasticsearch/connector/docker-compose.yml @@ -0,0 +1,49 @@ +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4 + container_name: es01 + environment: + - discovery.type=single-node + - xpack.security.enabled=true + - xpack.security.http.ssl.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m + - ELASTIC_PASSWORD=spiceai + ports: + - "9200:9200" + volumes: + - esdata:/usr/share/elasticsearch/data + healthcheck: + test: + [ + "CMD-SHELL", + "curl -sf -u elastic:spiceai http://localhost:9200/_cluster/health | grep -qE '\"status\":\"(green|yellow)\"'", + ] + interval: 10s + timeout: 10s + retries: 15 + start_period: 40s + + es-init: + image: python:3.12-slim + container_name: es-init + depends_on: + elasticsearch: + condition: service_healthy + volumes: + - ./generate_data.py:/app/generate_data.py:ro + - ./load_data.py:/app/load_data.py:ro + working_dir: /app + environment: + - ES_HOST=http://elasticsearch:9200 + - ES_USER=elastic + - ES_PASS=spiceai + command: > + bash -c " + pip install --quiet pandas pyarrow faker requests && + python generate_data.py && + python load_data.py --all-types + " + +volumes: + esdata: + driver: local diff --git a/elasticsearch/connector/generate_data.py b/elasticsearch/connector/generate_data.py new file mode 100644 index 0000000..1bf9399 --- /dev/null +++ b/elasticsearch/connector/generate_data.py @@ -0,0 +1,681 @@ +#!/usr/bin/env python3 +""" +Generate articles.parquet — sample data for Spice Elasticsearch connector tests. + +Produces ~15 000 records with sufficient lexical and semantic diversity to +meaningfully exercise: + - Data connector (federated SQL via elasticsearch:articles) + - Vector search (vector_search UDTF) + - Full-text search (text_search UDTF / BM25) + - Hybrid search with RRF (rrf(vector_search(...), text_search(...))) + +Usage: + pip install pandas pyarrow faker + python generate_data.py [--rows N] # default 15000 +""" + +import argparse +import random +from datetime import datetime, timedelta +from itertools import product + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from faker import Faker + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- +parser = argparse.ArgumentParser() +parser.add_argument("--rows", type=int, default=100) +parser.add_argument("--out", default="articles.parquet") +parser.add_argument( + "--embeddings", + action="store_true", + help="Compute content_embedding column using all-MiniLM-L6-v2 and write into parquet", +) +parser.add_argument( + "--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2" +) +parser.add_argument("--embedding-batch-size", type=int, default=64) +args = parser.parse_args() + +TARGET_ROWS = args.rows +OUT_PATH = args.out + +fake = Faker() +random.seed(42) +Faker.seed(42) + +# --------------------------------------------------------------------------- +# Taxonomy +# --------------------------------------------------------------------------- +TOPICS = [ + { + "category": "machine_learning", + "subtopics": [ + "neural networks", + "deep learning", + "gradient descent", + "backpropagation", + "transformer models", + "attention mechanisms", + "fine-tuning", + "transfer learning", + "reinforcement learning", + "generative adversarial networks", + "diffusion models", + "contrastive learning", + "self-supervised learning", + "few-shot learning", + "model pruning", + "knowledge distillation", + "hyperparameter tuning", + "AutoML", + "federated learning", + "meta-learning", + ], + "title_templates": [ + "A Practical Guide to {sub}", + "{sub} Explained: From Theory to Production", + "How {sub} Is Transforming AI Applications", + "Deep Dive: {sub} in Modern ML Pipelines", + "Benchmarking {sub} Across Popular Frameworks", + "Common Pitfalls in {sub} and How to Avoid Them", + "Scaling {sub} to Billions of Parameters", + "{sub}: State of the Art in {year}", + "Understanding {sub} Through Mathematical Intuition", + "Implementing {sub} Without a PhD", + ], + }, + { + "category": "search_engines", + "subtopics": [ + "BM25 ranking", + "inverted indexes", + "vector similarity search", + "semantic search", + "full-text search", + "dense retrieval", + "sparse retrieval", + "hybrid search", + "reciprocal rank fusion", + "embedding models", + "approximate nearest neighbor", + "HNSW graphs", + "product quantization", + "query expansion", + "relevance feedback", + "learning to rank", + "query rewriting", + "cross-encoder reranking", + "bi-encoder retrieval", + "knowledge graph search", + ], + "title_templates": [ + "How {sub} Powers Modern Search", + "{sub} Under the Hood: Architecture and Trade-offs", + "Building Production-Ready {sub} Systems", + "Evaluating {sub}: Metrics That Matter", + "{sub} at Scale: Lessons from the Field", + "Comparing {sub} Approaches in {year}", + "A Developer's Guide to {sub}", + "When to Use {sub} vs Traditional Keyword Search", + "Optimising {sub} for Low-Latency Workloads", + "Open-Source Tools for {sub}", + ], + }, + { + "category": "data_engineering", + "subtopics": [ + "Apache Parquet", + "columnar storage", + "data pipelines", + "ETL workflows", + "stream processing", + "Apache Kafka", + "data lakes", + "schema evolution", + "Apache Iceberg", + "Delta Lake", + "data mesh", + "data contracts", + "change data capture", + "Apache Spark", + "dbt transformations", + "data quality checks", + "lineage tracking", + "metadata management", + "batch ingestion", + "real-time ingestion", + ], + "title_templates": [ + "{sub}: The Definitive Overview", + "Why {sub} Is Essential for Modern Data Teams", + "Getting Started with {sub} in {year}", + "{sub} Best Practices for Large-Scale Workloads", + "Migrating to {sub}: A Step-by-Step Guide", + "Debugging {sub} in Production", + "{sub} vs Traditional Approaches: A Comparison", + "How {sub} Reduces Engineering Toil", + "Monitoring {sub} Pipelines at Scale", + "Cost Optimisation Strategies for {sub}", + ], + }, + { + "category": "databases", + "subtopics": [ + "SQL query optimisation", + "index design", + "ACID transactions", + "NoSQL modelling", + "vector databases", + "distributed databases", + "replication strategies", + "sharding", + "MVCC", + "write-ahead logging", + "B-tree indexes", + "LSM-tree storage", + "columnar databases", + "time-series databases", + "graph databases", + "in-memory databases", + "NewSQL", + "CockroachDB", + "FoundationDB", + "database connection pooling", + ], + "title_templates": [ + "{sub} for High-Throughput Applications", + "A Deep Dive into {sub}", + "{sub}: Concepts Every Engineer Should Know", + "Production Lessons from Running {sub} at Scale", + "Choosing Between {sub} and Alternatives", + "How {sub} Impacts Query Performance", + "{sub} Internals Explained", + "Migrating from Legacy Systems to {sub}", + "Benchmarking {sub} in Cloud Environments", + "Debugging Slow Queries with {sub}", + ], + }, + { + "category": "cloud_infrastructure", + "subtopics": [ + "Kubernetes orchestration", + "container deployments", + "service meshes", + "auto-scaling", + "observability stacks", + "OpenTelemetry", + "cost optimisation", + "serverless functions", + "GitOps workflows", + "infrastructure as code", + "zero-trust networking", + "secrets management", + "multi-cloud strategies", + "disaster recovery", + "chaos engineering", + "FinOps", + "edge computing", + "platform engineering", + "developer portals", + "SLO management", + ], + "title_templates": [ + "{sub} in Practice: Real-World Patterns", + "How {sub} Enables Reliable Systems", + "{sub} for Platform Engineers", + "Scaling {sub} in Enterprise Environments", + "Automating {sub} with Modern Tooling", + "{sub}: A Comprehensive Tutorial for {year}", + "Common Mistakes with {sub} and How to Fix Them", + "Security Considerations for {sub}", + "How We Cut Costs by Optimising {sub}", + "Building Self-Service Infrastructure with {sub}", + ], + }, + { + "category": "software_engineering", + "subtopics": [ + "clean code principles", + "design patterns", + "microservices architecture", + "domain-driven design", + "event sourcing", + "CQRS", + "API design", + "test-driven development", + "continuous integration", + "code review practices", + "technical debt", + "refactoring strategies", + "distributed tracing", + "circuit breakers", + "rate limiting", + "idempotency", + "concurrency models", + "async programming", + "type systems", + "compiler design", + ], + "title_templates": [ + "Applying {sub} in Real Projects", + "{sub}: From Basics to Advanced Techniques", + "How {sub} Improves System Reliability", + "A Pragmatic Guide to {sub}", + "Interview Questions on {sub} and How to Answer Them", + "{sub} Patterns Every Senior Engineer Should Know", + "Refactoring Legacy Code with {sub}", + "When {sub} Makes Sense (and When It Doesn't)", + "{sub} in Distributed Systems", + "Teaching {sub} to Junior Developers", + ], + }, + { + "category": "security", + "subtopics": [ + "zero-trust architecture", + "OAuth 2.0", + "JWT authentication", + "supply chain security", + "SBOM", + "vulnerability scanning", + "penetration testing", + "secrets rotation", + "mTLS", + "RBAC", + "OWASP top 10", + "runtime security", + "SAST and DAST", + "CVE management", + "encryption at rest", + "key management", + "phishing prevention", + "incident response", + "threat modelling", + "compliance automation", + ], + "title_templates": [ + "Understanding {sub} in Cloud-Native Environments", + "Implementing {sub} Without Slowing Down Development", + "{sub} for Distributed Systems", + "How {sub} Reduces Your Attack Surface", + "A Practical Guide to {sub} in {year}", + "Auditing {sub} Configurations at Scale", + "Automating {sub} in CI/CD Pipelines", + "{sub}: Myths and Realities", + "Responding to {sub} Incidents Effectively", + "Open-Source Tools for {sub}", + ], + }, + { + "category": "developer_experience", + "subtopics": [ + "developer portals", + "internal developer platforms", + "CLI tooling", + "IDE extensions", + "local development environments", + "devcontainers", + "debugging techniques", + "profiling tools", + "documentation-as-code", + "API mocking", + "feature flags", + "trunk-based development", + "monorepo tooling", + "dependency management", + "build caching", + "test parallelisation", + "code generation", + "linting and formatting", + "onboarding automation", + "developer productivity metrics", + ], + "title_templates": [ + "Improving {sub} Across Engineering Teams", + "{sub}: Building a Great Developer Experience", + "How {sub} Speeds Up Delivery", + "Measuring the Impact of {sub}", + "Setting Up {sub} from Scratch", + "{sub} Tooling in {year}: What Works", + "How We Adopted {sub} at Our Company", + "The Case for Investing in {sub}", + "{sub} for Remote Engineering Teams", + "Automating {sub} to Eliminate Toil", + ], + }, +] + +# Cross-topic hand-crafted seeds that span multiple categories (good for RRF) +CROSS_TOPIC_SEEDS = [ + { + "title": "Vector Search Inside PostgreSQL with pgvector", + "category": "databases", + "keywords": [ + "vector databases", + "BM25 ranking", + "SQL query optimisation", + "embedding models", + ], + }, + { + "title": "Running Elasticsearch on Kubernetes: Lessons Learned", + "category": "cloud_infrastructure", + "keywords": [ + "Kubernetes orchestration", + "observability stacks", + "inverted indexes", + ], + }, + { + "title": "ETL Pipelines for Machine Learning Feature Stores", + "category": "data_engineering", + "keywords": [ + "Apache Parquet", + "data pipelines", + "transformer models", + "fine-tuning", + ], + }, + { + "title": "Securing ML Model Serving Endpoints", + "category": "security", + "keywords": [ + "zero-trust architecture", + "mTLS", + "neural networks", + "API design", + ], + }, + { + "title": "Observability for Real-Time Data Pipelines", + "category": "data_engineering", + "keywords": [ + "OpenTelemetry", + "Apache Kafka", + "distributed tracing", + "SLO management", + ], + }, + { + "title": "Cost-Aware AutoML on Kubernetes", + "category": "machine_learning", + "keywords": [ + "AutoML", + "Kubernetes orchestration", + "cost optimisation", + "hyperparameter tuning", + ], + }, + { + "title": "Hybrid Search with Elasticsearch and pgvector", + "category": "search_engines", + "keywords": [ + "hybrid search", + "reciprocal rank fusion", + "vector databases", + "inverted indexes", + ], + }, + { + "title": "Developer Portals Powered by LLMs", + "category": "developer_experience", + "keywords": [ + "developer portals", + "transformer models", + "API design", + "documentation-as-code", + ], + }, + { + "title": "Zero-Trust Networking for Microservices", + "category": "security", + "keywords": [ + "zero-trust architecture", + "service meshes", + "mTLS", + "microservices architecture", + ], + }, + { + "title": "DuckDB as an Embedded Analytics Engine", + "category": "databases", + "keywords": [ + "columnar databases", + "Apache Parquet", + "SQL query optimisation", + "data lakes", + ], + }, +] + +AUTHORS = [ + "Alice Chen", + "Bob Martinez", + "Carol Okonkwo", + "David Kim", + "Eva Lindqvist", + "Frank Nguyen", + "Grace Patel", + "Hiro Tanaka", + "Ingrid Sørensen", + "James O'Brien", + "Kavya Reddy", + "Luca Ferrari", + "Maya Goldberg", + "Nadia Petrov", + "Oscar Bergström", + "Priya Sharma", + "Quinn Walker", + "Ravi Subramaniam", + "Sofia Andrade", + "Tom Brennan", +] + +BASE_DATE = datetime(2024, 1, 1) +YEAR_RANGE = 2 # docs span 2024–2025 + + +# --------------------------------------------------------------------------- +# Content generation helpers +# --------------------------------------------------------------------------- + +INTRO_TEMPLATES = [ + "{title} has become one of the most discussed topics among practitioners in {year}. " + "This article examines core ideas around {kw1}, {kw2}, and {kw3}.", + "Few subjects generate as much debate as {title}. " + "In this post we take a structured look at {kw1} and its relationship to {kw2}.", + "The rise of {kw1} has put {title} firmly on the radar of engineering teams worldwide. " + "Here we break down the key concepts, starting with {kw2} and {kw3}.", + "If you have ever struggled to understand {title}, you are not alone. " + "We will demystify {kw1} and explain why {kw2} matters more than ever.", + "In {year}, {title} continues to evolve rapidly. " + "This comprehensive guide covers {kw1}, {kw2}, and practical guidance on {kw3}.", +] + +BODY_SENTENCE_POOLS = [ + "One of the most important considerations when working with {kw} is understanding its performance implications at scale.", + "Teams that have adopted {kw} report significant improvements in both reliability and developer velocity.", + "The relationship between {kw} and system latency is often underappreciated.", + "Recent benchmarks suggest that {kw} outperforms legacy approaches by up to an order of magnitude in read-heavy workloads.", + "Choosing the right abstraction for {kw} requires a clear understanding of your consistency and availability requirements.", + "A common misconception about {kw} is that it requires specialised hardware—in practice, commodity cloud instances suffice.", + "Observability tooling plays a critical role when debugging issues related to {kw} in production.", + "The open-source ecosystem around {kw} has matured considerably since its initial release.", + "Engineers often underestimate the operational overhead of {kw} when evaluating it for the first time.", + "Pair {kw} with a solid CI/CD pipeline to catch regressions early.", + "Security implications of {kw} are frequently discussed but rarely acted on systematically.", + "Integrating {kw} into an existing architecture typically requires incremental migration rather than a big-bang rewrite.", + "Documentation quality is a persistent challenge in the {kw} ecosystem.", + "Load testing is essential before relying on {kw} in a high-traffic environment.", + "Proper indexing strategy is inseparable from any discussion of {kw}.", +] + +CLOSING_TEMPLATES = [ + "In conclusion, mastering {kw} is increasingly a prerequisite for engineers working on {category} problems. " + "We encourage you to experiment with the techniques described here and share your findings with the community.", + "As the field of {category} advances, {kw} will only grow in importance. " + "The patterns outlined in this article provide a solid foundation for further exploration.", + "Understanding {kw} deeply—not just at the API surface—will set you apart as a practitioner in {category}. " + "We hope this guide serves as a useful reference.", + "The journey into {kw} is long but rewarding. " + "Start small, measure rigorously, and iterate—that is the path to production-grade {category} systems.", +] + + +def make_body(title: str, keywords: list[str], category: str, year: int) -> str: + kws = keywords[:] + random.shuffle(kws) + kw1, kw2, kw3 = (kws + kws)[:3] + + intro = random.choice(INTRO_TEMPLATES).format( + title=title, year=year, kw1=kw1, kw2=kw2, kw3=kw3 + ) + + paragraphs = [intro] + for _ in range(random.randint(3, 6)): + n_sentences = random.randint(3, 6) + sentences = [] + for _ in range(n_sentences): + kw = random.choice(keywords) + tmpl = random.choice(BODY_SENTENCE_POOLS) + sentences.append(tmpl.format(kw=kw)) + # pad with faker sentences for variety + sentences += fake.sentences(nb=random.randint(1, 3)) + random.shuffle(sentences) + paragraphs.append(" ".join(sentences)) + + closing_kw = random.choice(keywords) + paragraphs.append( + random.choice(CLOSING_TEMPLATES).format(kw=closing_kw, category=category) + ) + return "\n\n".join(paragraphs) + + +def make_published_at() -> str: + days = random.randint(0, 365 * YEAR_RANGE) + hours = random.randint(0, 23) + minutes = random.randint(0, 59) + dt = BASE_DATE + timedelta(days=days, hours=hours, minutes=minutes) + return dt.isoformat() + + +# --------------------------------------------------------------------------- +# Build seed pool from topic × template × subtopic combinations +# --------------------------------------------------------------------------- +seed_pool: list[dict] = [] +year = 2025 + +for topic in TOPICS: + for sub, tmpl in product(topic["subtopics"], topic["title_templates"]): + title = tmpl.format(sub=sub.title(), year=year) + seed_pool.append( + { + "title": title, + "category": topic["category"], + "keywords": topic["subtopics"], # full list for body variety + "primary_kw": sub, + } + ) + +random.shuffle(seed_pool) + +# --------------------------------------------------------------------------- +# Generate records +# --------------------------------------------------------------------------- +records = [] + +# 1. Cross-topic hand-crafted seeds first (always included) +for i, seed in enumerate(CROSS_TOPIC_SEEDS, start=1): + kws = seed["keywords"] + records.append( + { + "id": i, + "title": seed["title"], + "content": make_body(seed["title"], kws, seed["category"], year), + "author": random.choice(AUTHORS), + "category": seed["category"], + "tags": ", ".join(random.sample(kws, min(3, len(kws)))), + "published_at": make_published_at(), + "views": random.randint(5_000, 80_000), + "likes": random.randint(200, 8_000), + } + ) + +# 2. Generated records from seed pool +doc_id = len(CROSS_TOPIC_SEEDS) + 1 +remaining = TARGET_ROWS - len(CROSS_TOPIC_SEEDS) + +# Cycle through seed pool (with variation) to hit the target count +pool_cycle = (seed_pool * ((remaining // len(seed_pool)) + 2))[:remaining] + +for seed in pool_cycle: + kws = seed["keywords"] + title = seed["title"] + + # small title variation: occasionally prepend a qualifier + qualifiers = ["", "", "", "How to Use ", "Why ", "When to Use ", ""] + q = random.choice(qualifiers) + if q and not title.startswith(("How", "Why", "When", "A ", "Building", "Running")): + title = q + title[0].lower() + title[1:] + + records.append( + { + "id": doc_id, + "title": title, + "content": make_body(title, kws, seed["category"], year), + "author": random.choice(AUTHORS), + "category": seed["category"], + "tags": ", ".join(random.sample(kws, min(4, len(kws)))), + "published_at": make_published_at(), + "views": random.randint(50, 60_000), + "likes": random.randint(0, 6_000), + } + ) + doc_id += 1 + +# --------------------------------------------------------------------------- +# Write Parquet +# --------------------------------------------------------------------------- +df = pd.DataFrame(records) + +fields = [ + pa.field("id", pa.int32()), + pa.field("title", pa.string()), + pa.field("content", pa.string()), + pa.field("author", pa.string()), + pa.field("category", pa.string()), + pa.field("tags", pa.string()), + pa.field("published_at", pa.string()), + pa.field("views", pa.int32()), + pa.field("likes", pa.int32()), +] + +if args.embeddings: + print(f"Computing embeddings with '{args.embedding_model}' …") + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer(args.embedding_model) + texts = df["content"].tolist() + vecs = model.encode( + texts, + batch_size=args.embedding_batch_size, + show_progress_bar=True, + convert_to_numpy=True, + ) + dims = vecs.shape[1] + print(f"Computed {len(vecs)} embeddings, dims={dims}") + df["content_embedding"] = [v.tolist() for v in vecs] + fields.append(pa.field("content_embedding", pa.list_(pa.float32(), dims))) + +schema = pa.schema(fields) +table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) +pq.write_table(table, OUT_PATH, compression="snappy") + +print(f"Wrote {len(records):,} records to {OUT_PATH}") +cats = df["category"].value_counts() +print(cats.to_string()) diff --git a/elasticsearch/connector/load_data.py b/elasticsearch/connector/load_data.py new file mode 100644 index 0000000..e6059ec --- /dev/null +++ b/elasticsearch/connector/load_data.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python3 +""" +Load articles.parquet into Elasticsearch. + +Usage: + uv run load_data.py # load articles index only + uv run load_data.py --embeddings # also compute + load vectors index + uv run load_data.py --embeddings --parquet 04-vector-search/articles.parquet +""" + +import argparse +import json +import os +import sys +import time + +import pyarrow.parquet as pq +import requests + +parser = argparse.ArgumentParser() +parser.add_argument( + "--es-host", default=os.environ.get("ES_HOST", "http://localhost:9200") +) +parser.add_argument("--es-user", default=os.environ.get("ES_USER", "elastic")) +parser.add_argument("--es-pass", default=os.environ.get("ES_PASS", "spiceai")) +parser.add_argument("--parquet", default="articles.parquet") +parser.add_argument("--index", default="articles") +parser.add_argument( + "--embeddings", + action="store_true", + help="Also compute and load vectors into a dense_vector index", +) +parser.add_argument("--vectors-index", default="articles_vectors") +parser.add_argument( + "--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2" +) +parser.add_argument("--embedding-column", default="content") +parser.add_argument("--batch-size", type=int, default=64) +parser.add_argument( + "--all-types", + action="store_true", + help="Also create and load the all_types index covering every ES field type", +) +parser.add_argument("--all-types-index", default="all_types") +parser.add_argument("--all-types-docs", type=int, default=10) +args = parser.parse_args() + +ES_HOST = args.es_host +ES_AUTH = (args.es_user, args.es_pass) if args.es_user else None +INDEX = args.index +PARQUET_PATH = args.parquet + + +# --------------------------------------------------------------------------- +# Wait for Elasticsearch +# --------------------------------------------------------------------------- +def wait_for_es(retries: int = 30, delay: int = 5) -> None: + for attempt in range(1, retries + 1): + try: + r = requests.get(f"{ES_HOST}/_cluster/health", auth=ES_AUTH, timeout=5) + status = r.json().get("status", "red") + if status in ("green", "yellow"): + print(f"Elasticsearch ready (status={status})") + return + print( + f"[{attempt}/{retries}] Cluster status: {status}, retrying in {delay}s …" + ) + except Exception as exc: + print(f"[{attempt}/{retries}] Not ready ({exc}), retrying in {delay}s …") + time.sleep(delay) + print("ERROR: Elasticsearch did not become healthy in time.") + sys.exit(1) + + +# --------------------------------------------------------------------------- +# Articles index +# --------------------------------------------------------------------------- +ARTICLES_MAPPING = { + "settings": {"number_of_shards": 1, "number_of_replicas": 0}, + "mappings": { + "properties": { + "id": {"type": "integer"}, + "title": { + "type": "text", + "analyzer": "english", + "fields": {"keyword": {"type": "keyword"}}, + }, + "content": {"type": "text", "analyzer": "english"}, + "author": {"type": "keyword"}, + "category": {"type": "keyword"}, + "tags": { + "type": "text", + "analyzer": "english", + "fields": {"keyword": {"type": "keyword"}}, + }, + "published_at": { + "type": "date", + "format": "strict_date_time_no_millis||strict_date_optional_time", + }, + "views": {"type": "integer"}, + "likes": {"type": "integer"}, + } + }, +} + + +def recreate_index(name: str, mapping: dict) -> None: + url = f"{ES_HOST}/{name}" + if requests.head(url, auth=ES_AUTH).status_code == 200: + print(f"Index '{name}' already exists — deleting for a clean load.") + requests.delete(url, auth=ES_AUTH).raise_for_status() + requests.put(url, json=mapping, auth=ES_AUTH).raise_for_status() + print(f"Index '{name}' created.") + + +def bulk_request(lines: list[str]) -> None: + body = "\n".join(lines) + "\n" + r = requests.post( + f"{ES_HOST}/_bulk", + data=body.encode(), + headers={"Content-Type": "application/x-ndjson"}, + auth=ES_AUTH, + timeout=120, + ) + r.raise_for_status() + errors = [ + item for item in r.json().get("items", []) if "error" in item.get("index", {}) + ] + if errors: + print(f" WARNING: {len(errors)} bulk errors — first: {errors[0]}") + + +def load_articles(df: dict) -> None: + n = len(df["id"]) + lines = [] + for i in range(n): + doc = {col: df[col][i] for col in df} + lines.append(json.dumps({"index": {"_index": INDEX, "_id": doc["id"]}})) + lines.append(json.dumps(doc)) + bulk_request(lines) + requests.post(f"{ES_HOST}/{INDEX}/_refresh", auth=ES_AUTH) + count = ( + requests.get(f"{ES_HOST}/{INDEX}/_count", auth=ES_AUTH).json().get("count", "?") + ) + print(f"Loaded {n} documents → '{INDEX}' (count={count})") + + +# --------------------------------------------------------------------------- +# Vectors index +# --------------------------------------------------------------------------- +def vectors_mapping(dims: int) -> dict: + return { + "settings": {"number_of_shards": 1, "number_of_replicas": 0}, + "mappings": { + "properties": { + "id": {"type": "integer"}, + "content": {"type": "text", "index": False}, + "content_embedding": { + "type": "dense_vector", + "dims": dims, + "index": True, + "similarity": "cosine", + }, + } + }, + } + + +def load_vectors(df: dict) -> None: + print(f"Loading embedding model '{args.embedding_model}' …") + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer(args.embedding_model) + + texts = df[args.embedding_column] + ids = df["id"] + n = len(ids) + dims = model.get_embedding_dimension() + print( + f"Model dims={dims}, encoding {n} documents in batches of {args.batch_size} …" + ) + + recreate_index(args.vectors_index, vectors_mapping(dims)) + + for batch_start in range(0, n, args.batch_size): + batch_texts = texts[batch_start : batch_start + args.batch_size] + batch_ids = ids[batch_start : batch_start + args.batch_size] + embeddings = model.encode( + batch_texts, show_progress_bar=False, convert_to_numpy=True + ) + + lines = [] + for i, (doc_id, text, vec) in enumerate( + zip(batch_ids, batch_texts, embeddings) + ): + lines.append( + json.dumps({"index": {"_index": args.vectors_index, "_id": doc_id}}) + ) + lines.append( + json.dumps( + { + "id": doc_id, + "content": text, + "content_embedding": vec.tolist(), + } + ) + ) + bulk_request(lines) + print(f" Indexed {min(batch_start + args.batch_size, n)}/{n} vectors …") + + requests.post(f"{ES_HOST}/{args.vectors_index}/_refresh", auth=ES_AUTH) + count = ( + requests.get(f"{ES_HOST}/{args.vectors_index}/_count", auth=ES_AUTH) + .json() + .get("count", "?") + ) + print(f"Loaded {n} vectors → '{args.vectors_index}' (count={count})") + + +# --------------------------------------------------------------------------- +# all_types index — one document per supported ES field type +# --------------------------------------------------------------------------- +ALL_TYPES_MAPPING = { + "settings": {"number_of_shards": 1, "number_of_replicas": 0}, + "mappings": { + "properties": { + # Core scalar types + "id": {"type": "integer"}, + "field_text": {"type": "text", "analyzer": "standard"}, + "field_keyword": {"type": "keyword"}, + "field_long": {"type": "long"}, + "field_integer": {"type": "integer"}, + "field_short": {"type": "short"}, + "field_byte": {"type": "byte"}, + "field_double": {"type": "double"}, + "field_float": {"type": "float"}, + "field_half_float": {"type": "half_float"}, + "field_scaled_float": {"type": "scaled_float", "scaling_factor": 100}, + "field_unsigned_long": {"type": "unsigned_long"}, + # Boolean + "field_boolean": {"type": "boolean"}, + # Date / time + "field_date": {"type": "date", "format": "strict_date_optional_time"}, + "field_date_nanos": {"type": "date_nanos"}, + # Binary + "field_binary": {"type": "binary"}, + # Range types + "field_integer_range": {"type": "integer_range"}, + "field_long_range": {"type": "long_range"}, + "field_float_range": {"type": "float_range"}, + "field_double_range": {"type": "double_range"}, + "field_date_range": { + "type": "date_range", + "format": "strict_date_optional_time", + }, + # Structured / complex types + "field_object": { + "type": "object", + "properties": { + "name": {"type": "keyword"}, + "value": {"type": "integer"}, + }, + }, + "field_nested": { + "type": "nested", + "properties": { + "tag": {"type": "keyword"}, + "score": {"type": "float"}, + }, + }, + # Geo types + "field_geo_point": {"type": "geo_point"}, + "field_geo_shape": {"type": "geo_shape"}, + # Specialised text types + "field_ip": {"type": "ip"}, + "field_version": {"type": "version"}, + # Completion / search-as-you-type + "field_completion": {"type": "completion"}, + "field_search_as_you_type": {"type": "search_as_you_type"}, + # Token count + "field_token_count": { + "type": "token_count", + "analyzer": "standard", + }, + # Dense vector (small dim for speed) + "field_dense_vector": { + "type": "dense_vector", + "dims": 4, + "index": True, + "similarity": "cosine", + }, + # Flattened + "field_flattened": {"type": "flattened"}, + } + }, +} + +import base64 +import math +import random + + +def _make_all_types_doc(i: int) -> dict: + """Return a single document exercising every mapped field type.""" + rng = random.Random(i) + ts = f"2024-0{(i % 9) + 1}-{(i % 28) + 1:02d}T{i % 24:02d}:00:00Z" + return { + "id": i, + "field_text": f"The quick brown fox jumps over the lazy dog — document {i}", + "field_keyword": f"category_{i % 5}", + "field_long": rng.randint(-(2**40), 2**40), + "field_integer": rng.randint(-100_000, 100_000), + "field_short": rng.randint(-32768, 32767), + "field_byte": rng.randint(-128, 127), + "field_double": rng.uniform(-1e6, 1e6), + "field_float": float(round(rng.uniform(-1000, 1000), 4)), + "field_half_float": float(round(rng.uniform(-100, 100), 2)), + "field_scaled_float": float(round(rng.uniform(0, 999.99), 2)), + "field_unsigned_long": rng.randint(0, 2**60), + "field_boolean": bool(i % 2), + "field_date": ts, + "field_date_nanos": ts.replace("Z", ".000000000Z"), + "field_binary": base64.b64encode(f"binary_payload_{i}".encode()).decode(), + "field_integer_range": {"gte": i * 10, "lte": i * 10 + 5}, + "field_long_range": {"gte": i * 1000, "lte": i * 1000 + 100}, + "field_float_range": {"gte": float(i), "lte": float(i) + 0.5}, + "field_double_range": {"gte": float(i) * 1.1, "lte": float(i) * 1.1 + 0.01}, + "field_date_range": { + "gte": f"2024-01-{(i % 28) + 1:02d}T00:00:00Z", + "lte": f"2024-12-{(i % 28) + 1:02d}T23:59:59Z", + }, + "field_object": {"name": f"obj_{i}", "value": i * 7}, + "field_nested": [ + {"tag": f"tag_{j}", "score": float(round(rng.uniform(0, 1), 3))} + for j in range(1, 3) + ], + "field_geo_point": { + "lat": round(rng.uniform(-90, 90), 6), + "lon": round(rng.uniform(-180, 180), 6), + }, + "field_geo_shape": { + "type": "Point", + "coordinates": [ + round(rng.uniform(-180, 180), 6), + round(rng.uniform(-90, 90), 6), + ], + }, + "field_ip": f"192.168.{i % 256}.{(i * 7) % 256}", + "field_version": f"1.{i}.0", + "field_completion": {"input": [f"suggest_{i}", f"doc_{i}"], "weight": i + 1}, + "field_search_as_you_type": f"searchable text for document number {i}", + # token_count is computed by ES; send the source text + "field_token_count": f"token count source text document {i}", + "field_dense_vector": [round(math.sin(i + j), 6) for j in range(4)], + "field_flattened": {"arbitrary_key": f"value_{i}", "nested_key": {"deep": i}}, + } + + +def load_all_types(n: int = 10) -> None: + recreate_index(args.all_types_index, ALL_TYPES_MAPPING) + lines = [] + for i in range(1, n + 1): + doc = _make_all_types_doc(i) + lines.append(json.dumps({"index": {"_index": args.all_types_index, "_id": i}})) + lines.append(json.dumps(doc)) + bulk_request(lines) + requests.post(f"{ES_HOST}/{args.all_types_index}/_refresh", auth=ES_AUTH) + count = ( + requests.get(f"{ES_HOST}/{args.all_types_index}/_count", auth=ES_AUTH) + .json() + .get("count", "?") + ) + print(f"Loaded {n} documents → '{args.all_types_index}' (count={count})") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +if __name__ == "__main__": + wait_for_es() + + print(f"Reading {PARQUET_PATH} …") + df = pq.read_table(PARQUET_PATH).to_pydict() + + recreate_index(INDEX, ARTICLES_MAPPING) + load_articles(df) + + if args.embeddings: + load_vectors(df) + + if args.all_types: + load_all_types(args.all_types_docs) + + print("Done.") diff --git a/elasticsearch/connector/spicepod.yaml b/elasticsearch/connector/spicepod.yaml new file mode 100644 index 0000000..eba299c --- /dev/null +++ b/elasticsearch/connector/spicepod.yaml @@ -0,0 +1,32 @@ +version: v2 +kind: Spicepod +name: 01-connector + +datasets: + - from: elasticsearch:articles + name: articles + params: + elasticsearch_endpoint: http://localhost:9200 + elasticsearch_user: elastic + elasticsearch_pass: spiceai + + # all_types — federated (no acceleration, every query hits ES directly) + - from: elasticsearch:all_types + name: all_types + params: + elasticsearch_endpoint: http://localhost:9200 + elasticsearch_user: elastic + elasticsearch_pass: spiceai + + # all_types_accelerated — same index, materialized locally in DuckDB + - from: elasticsearch:all_types + name: all_types_accelerated + params: + elasticsearch_endpoint: http://localhost:9200 + elasticsearch_user: elastic + elasticsearch_pass: spiceai + acceleration: + enabled: true + engine: duckdb + refresh_mode: full + refresh_check_interval: 30s diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md new file mode 100644 index 0000000..a106f3d --- /dev/null +++ b/vectors/elasticsearch/README.md @@ -0,0 +1,103 @@ +# Elasticsearch Vector Engine + +Works with `v2.0+` + +This recipe demonstrates how to use Elasticsearch as a vector engine in Spice.ai. It shows how to: + +- Run Elasticsearch locally with Docker Compose +- Generate a sample articles dataset +- Start Spice and automatically write embeddings into Elasticsearch +- Run vector and hybrid search queries against the indexed embeddings + +## Prerequisites + +- [Spice CLI](https://docs.spiceai.org/getting-started) installed +- Docker and Docker Compose installed +- Python 3 installed + +## Getting Started + +### Step 1: Start Elasticsearch and generate the sample data + +From this recipe directory, start Elasticsearch: + +```bash +docker compose up +``` + +In a new terminal, install the Python dependencies and generate the sample dataset: + +```bash +pip install pandas pyarrow faker +python generate_data.py --rows 500 --out articles.parquet +``` + +This creates an `articles.parquet` file used by the recipe. + +### Step 2: Start the Spice runtime + +Start Spice from this directory: + +```bash +spice run +``` + +On startup, Spice automatically: + +1. Creates the `articles_search_engine` Elasticsearch index with a `dense_vector` mapping +2. Loads the article records +3. Computes embeddings locally using `all-MiniLM-L6-v2` +4. Bulk indexes the vectors into Elasticsearch + +When startup completes, the `articles` dataset is ready for vector and hybrid search queries. + +### Step 3: Open the Spice SQL REPL + +In a new terminal, start the SQL REPL: + +```bash +spice sql +``` + +## Run Queries + +Run semantic similarity search over the indexed embeddings: + +```sql +SELECT id, title, _score +FROM vector_search(articles, 'semantic similarity retrieval', 10) +ORDER BY _score DESC; +``` + +Run vector search with a post-filter: + +```sql +SELECT id, title, category, _score +FROM vector_search(articles, 'cost optimisation cloud infrastructure', 10) +WHERE category = 'cloud_infrastructure' +ORDER BY _score DESC; +``` + +Fuse vector and keyword results with [Reciprocal Rank Fusion (RRF)](https://spiceai.org/docs/next/features/search#hybrid-search-with-rrf): + +```sql +SELECT id, title, category, fused_score +FROM rrf( + vector_search(articles, 'machine learning algorithms'), + text_search(articles, 'machine learning algorithms', content), + join_key => 'id' +) +ORDER BY fused_score DESC +LIMIT 10; +``` + +## Notes + +- The Elasticsearch index is created automatically. No manual mapping step is required. +- Embeddings are generated during the initial dataset refresh. + +## Learn more + +- [Elasticsearch Vector Engine Documentation](https://spiceai.org/docs/components/vectors/elasticsearch) +- [Vector Search Documentation](https://spiceai.org/docs/features/search/vector-search) +- [Datasets Reference](https://spiceai.org/docs/reference/spicepod/datasets) diff --git a/vectors/elasticsearch/docker-compose.yml b/vectors/elasticsearch/docker-compose.yml new file mode 100644 index 0000000..a51606f --- /dev/null +++ b/vectors/elasticsearch/docker-compose.yml @@ -0,0 +1,28 @@ +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4 + container_name: es01 + environment: + - discovery.type=single-node + - xpack.security.enabled=true + - xpack.security.http.ssl.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m + - ELASTIC_PASSWORD=spiceai + ports: + - "9200:9200" + volumes: + - esdata:/usr/share/elasticsearch/data + healthcheck: + test: + [ + "CMD-SHELL", + "curl -sf -u elastic:spiceai http://localhost:9200/_cluster/health | grep -qE '\"status\":\"(green|yellow)\"'", + ] + interval: 10s + timeout: 10s + retries: 15 + start_period: 40s + +volumes: + esdata: + driver: local diff --git a/vectors/elasticsearch/generate_data.py b/vectors/elasticsearch/generate_data.py new file mode 100644 index 0000000..ba4edcb --- /dev/null +++ b/vectors/elasticsearch/generate_data.py @@ -0,0 +1,574 @@ +#!/usr/bin/env python3 +""" +Generate articles.parquet — sample data for the Spice search engine example. + +Usage: + pip install pandas pyarrow faker + python generate_data.py [--rows N] [--out PATH] +""" + +import argparse +import random +from datetime import datetime, timedelta +from itertools import product + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from faker import Faker + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- +parser = argparse.ArgumentParser() +parser.add_argument("--rows", type=int, default=100) +parser.add_argument("--out", default="articles.parquet") +args = parser.parse_args() + +TARGET_ROWS = args.rows +OUT_PATH = args.out + +fake = Faker() +random.seed(42) +Faker.seed(42) + +# --------------------------------------------------------------------------- +# Taxonomy +# --------------------------------------------------------------------------- +TOPICS = [ + { + "category": "machine_learning", + "subtopics": [ + "neural networks", + "deep learning", + "gradient descent", + "backpropagation", + "transformer models", + "attention mechanisms", + "fine-tuning", + "transfer learning", + "reinforcement learning", + "generative adversarial networks", + "diffusion models", + "contrastive learning", + "self-supervised learning", + "few-shot learning", + "model pruning", + "knowledge distillation", + "hyperparameter tuning", + "AutoML", + "federated learning", + "meta-learning", + ], + "title_templates": [ + "A Practical Guide to {sub}", + "{sub} Explained: From Theory to Production", + "How {sub} Is Transforming AI Applications", + "Deep Dive: {sub} in Modern ML Pipelines", + "Benchmarking {sub} Across Popular Frameworks", + "Common Pitfalls in {sub} and How to Avoid Them", + "Scaling {sub} to Billions of Parameters", + "{sub}: State of the Art in {year}", + "Understanding {sub} Through Mathematical Intuition", + "Implementing {sub} Without a PhD", + ], + }, + { + "category": "search_engines", + "subtopics": [ + "BM25 ranking", + "inverted indexes", + "vector similarity search", + "semantic search", + "full-text search", + "dense retrieval", + "sparse retrieval", + "hybrid search", + "reciprocal rank fusion", + "embedding models", + "approximate nearest neighbor", + "HNSW graphs", + "product quantization", + "query expansion", + "relevance feedback", + "learning to rank", + "query rewriting", + "cross-encoder reranking", + "bi-encoder retrieval", + "knowledge graph search", + ], + "title_templates": [ + "How {sub} Powers Modern Search", + "{sub} Under the Hood: Architecture and Trade-offs", + "Building Production-Ready {sub} Systems", + "Evaluating {sub}: Metrics That Matter", + "{sub} at Scale: Lessons from the Field", + "Comparing {sub} Approaches in {year}", + "A Developer's Guide to {sub}", + "When to Use {sub} vs Traditional Keyword Search", + "Optimising {sub} for Low-Latency Workloads", + "Open-Source Tools for {sub}", + ], + }, + { + "category": "data_engineering", + "subtopics": [ + "Apache Parquet", + "columnar storage", + "data pipelines", + "ETL workflows", + "stream processing", + "Apache Kafka", + "data lakes", + "schema evolution", + "Apache Iceberg", + "Delta Lake", + "data mesh", + "data contracts", + "change data capture", + "Apache Spark", + "dbt transformations", + "data quality checks", + "lineage tracking", + "metadata management", + "batch ingestion", + "real-time ingestion", + ], + "title_templates": [ + "{sub}: The Definitive Overview", + "Why {sub} Is Essential for Modern Data Teams", + "Getting Started with {sub} in {year}", + "{sub} Best Practices for Large-Scale Workloads", + "Migrating to {sub}: A Step-by-Step Guide", + "Debugging {sub} in Production", + "{sub} vs Traditional Approaches: A Comparison", + "How {sub} Reduces Engineering Toil", + "Monitoring {sub} Pipelines at Scale", + "Cost Optimisation Strategies for {sub}", + ], + }, + { + "category": "databases", + "subtopics": [ + "SQL query optimisation", + "index design", + "ACID transactions", + "NoSQL modelling", + "vector databases", + "distributed databases", + "replication strategies", + "sharding", + "MVCC", + "write-ahead logging", + "B-tree indexes", + "LSM-tree storage", + "columnar databases", + "time-series databases", + "graph databases", + "in-memory databases", + "NewSQL", + "CockroachDB", + "FoundationDB", + "database connection pooling", + ], + "title_templates": [ + "{sub} for High-Throughput Applications", + "A Deep Dive into {sub}", + "{sub}: Concepts Every Engineer Should Know", + "Production Lessons from Running {sub} at Scale", + "Choosing Between {sub} and Alternatives", + "How {sub} Impacts Query Performance", + "{sub} Internals Explained", + "Migrating from Legacy Systems to {sub}", + "Benchmarking {sub} in Cloud Environments", + "Debugging Slow Queries with {sub}", + ], + }, + { + "category": "cloud_infrastructure", + "subtopics": [ + "Kubernetes orchestration", + "container deployments", + "service meshes", + "auto-scaling", + "observability stacks", + "OpenTelemetry", + "cost optimisation", + "serverless functions", + "GitOps workflows", + "infrastructure as code", + "zero-trust networking", + "secrets management", + "multi-cloud strategies", + "disaster recovery", + "chaos engineering", + "FinOps", + "edge computing", + "platform engineering", + "developer portals", + "SLO management", + ], + "title_templates": [ + "{sub} in Practice: Real-World Patterns", + "How {sub} Enables Reliable Systems", + "{sub} for Platform Engineers", + "Scaling {sub} in Enterprise Environments", + "Automating {sub} with Modern Tooling", + "{sub}: A Comprehensive Tutorial for {year}", + "Common Mistakes with {sub} and How to Fix Them", + "Security Considerations for {sub}", + "How We Cut Costs by Optimising {sub}", + "Building Self-Service Infrastructure with {sub}", + ], + }, + { + "category": "software_engineering", + "subtopics": [ + "clean code principles", + "design patterns", + "microservices architecture", + "domain-driven design", + "event sourcing", + "CQRS", + "API design", + "test-driven development", + "continuous integration", + "code review practices", + "technical debt", + "refactoring strategies", + "distributed tracing", + "circuit breakers", + "rate limiting", + "idempotency", + "concurrency models", + "async programming", + "type systems", + "compiler design", + ], + "title_templates": [ + "Applying {sub} in Real Projects", + "{sub}: From Basics to Advanced Techniques", + "How {sub} Improves System Reliability", + "A Pragmatic Guide to {sub}", + "Interview Questions on {sub} and How to Answer Them", + "{sub} Patterns Every Senior Engineer Should Know", + "Refactoring Legacy Code with {sub}", + "When {sub} Makes Sense (and When It Doesn't)", + "{sub} in Distributed Systems", + "Teaching {sub} to Junior Developers", + ], + }, + { + "category": "security", + "subtopics": [ + "zero-trust architecture", + "OAuth 2.0", + "JWT authentication", + "supply chain security", + "SBOM", + "vulnerability scanning", + "penetration testing", + "secrets rotation", + "mTLS", + "RBAC", + "OWASP top 10", + "runtime security", + "SAST and DAST", + "CVE management", + "encryption at rest", + "key management", + "phishing prevention", + "incident response", + "threat modelling", + "compliance automation", + ], + "title_templates": [ + "Understanding {sub} in Cloud-Native Environments", + "Implementing {sub} Without Slowing Down Development", + "{sub} for Distributed Systems", + "How {sub} Reduces Your Attack Surface", + "A Practical Guide to {sub} in {year}", + "Auditing {sub} Configurations at Scale", + "Automating {sub} in CI/CD Pipelines", + "{sub}: Myths and Realities", + "Responding to {sub} Incidents Effectively", + "Open-Source Tools for {sub}", + ], + }, + { + "category": "developer_experience", + "subtopics": [ + "developer portals", + "internal developer platforms", + "CLI tooling", + "IDE extensions", + "local development environments", + "devcontainers", + "debugging techniques", + "profiling tools", + "documentation-as-code", + "API mocking", + "feature flags", + "trunk-based development", + "monorepo tooling", + "dependency management", + "build caching", + "test parallelisation", + "code generation", + "linting and formatting", + "onboarding automation", + "developer productivity metrics", + ], + "title_templates": [ + "Improving {sub} Across Engineering Teams", + "{sub}: Building a Great Developer Experience", + "How {sub} Speeds Up Delivery", + "Measuring the Impact of {sub}", + "Setting Up {sub} from Scratch", + "{sub} Tooling in {year}: What Works", + "How We Adopted {sub} at Our Company", + "The Case for Investing in {sub}", + "{sub} for Remote Engineering Teams", + "Automating {sub} to Eliminate Toil", + ], + }, +] + +CROSS_TOPIC_SEEDS = [ + { + "title": "Vector Search Inside PostgreSQL with pgvector", + "category": "databases", + "keywords": ["vector databases", "BM25 ranking", "SQL query optimisation", "embedding models"], + }, + { + "title": "Running Elasticsearch on Kubernetes: Lessons Learned", + "category": "cloud_infrastructure", + "keywords": ["Kubernetes orchestration", "observability stacks", "inverted indexes"], + }, + { + "title": "ETL Pipelines for Machine Learning Feature Stores", + "category": "data_engineering", + "keywords": ["Apache Parquet", "data pipelines", "transformer models", "fine-tuning"], + }, + { + "title": "Securing ML Model Serving Endpoints", + "category": "security", + "keywords": ["zero-trust architecture", "mTLS", "neural networks", "API design"], + }, + { + "title": "Observability for Real-Time Data Pipelines", + "category": "data_engineering", + "keywords": ["OpenTelemetry", "Apache Kafka", "distributed tracing", "SLO management"], + }, + { + "title": "Cost-Aware AutoML on Kubernetes", + "category": "machine_learning", + "keywords": ["AutoML", "Kubernetes orchestration", "cost optimisation", "hyperparameter tuning"], + }, + { + "title": "Hybrid Search with Elasticsearch and pgvector", + "category": "search_engines", + "keywords": ["hybrid search", "reciprocal rank fusion", "vector databases", "inverted indexes"], + }, + { + "title": "Developer Portals Powered by LLMs", + "category": "developer_experience", + "keywords": ["developer portals", "transformer models", "API design", "documentation-as-code"], + }, + { + "title": "Zero-Trust Networking for Microservices", + "category": "security", + "keywords": ["zero-trust architecture", "service meshes", "mTLS", "microservices architecture"], + }, + { + "title": "DuckDB as an Embedded Analytics Engine", + "category": "databases", + "keywords": ["columnar databases", "Apache Parquet", "SQL query optimisation", "data lakes"], + }, +] + +AUTHORS = [ + "Alice Chen", "Bob Martinez", "Carol Okonkwo", "David Kim", "Eva Lindqvist", + "Frank Nguyen", "Grace Patel", "Hiro Tanaka", "Ingrid Sørensen", "James O'Brien", + "Kavya Reddy", "Luca Ferrari", "Maya Goldberg", "Nadia Petrov", "Oscar Bergström", + "Priya Sharma", "Quinn Walker", "Ravi Subramaniam", "Sofia Andrade", "Tom Brennan", +] + +BASE_DATE = datetime(2024, 1, 1) +YEAR_RANGE = 2 + +# --------------------------------------------------------------------------- +# Content generation +# --------------------------------------------------------------------------- +INTRO_TEMPLATES = [ + "{title} has become one of the most discussed topics among practitioners in {year}. " + "This article examines core ideas around {kw1}, {kw2}, and {kw3}.", + "Few subjects generate as much debate as {title}. " + "In this post we take a structured look at {kw1} and its relationship to {kw2}.", + "The rise of {kw1} has put {title} firmly on the radar of engineering teams worldwide. " + "Here we break down the key concepts, starting with {kw2} and {kw3}.", + "If you have ever struggled to understand {title}, you are not alone. " + "We will demystify {kw1} and explain why {kw2} matters more than ever.", + "In {year}, {title} continues to evolve rapidly. " + "This comprehensive guide covers {kw1}, {kw2}, and practical guidance on {kw3}.", +] + +BODY_SENTENCE_POOLS = [ + "One of the most important considerations when working with {kw} is understanding its performance implications at scale.", + "Teams that have adopted {kw} report significant improvements in both reliability and developer velocity.", + "The relationship between {kw} and system latency is often underappreciated.", + "Recent benchmarks suggest that {kw} outperforms legacy approaches by up to an order of magnitude in read-heavy workloads.", + "Choosing the right abstraction for {kw} requires a clear understanding of your consistency and availability requirements.", + "A common misconception about {kw} is that it requires specialised hardware—in practice, commodity cloud instances suffice.", + "Observability tooling plays a critical role when debugging issues related to {kw} in production.", + "The open-source ecosystem around {kw} has matured considerably since its initial release.", + "Engineers often underestimate the operational overhead of {kw} when evaluating it for the first time.", + "Pair {kw} with a solid CI/CD pipeline to catch regressions early.", + "Security implications of {kw} are frequently discussed but rarely acted on systematically.", + "Integrating {kw} into an existing architecture typically requires incremental migration rather than a big-bang rewrite.", + "Documentation quality is a persistent challenge in the {kw} ecosystem.", + "Load testing is essential before relying on {kw} in a high-traffic environment.", + "Proper indexing strategy is inseparable from any discussion of {kw}.", +] + +CLOSING_TEMPLATES = [ + "In conclusion, mastering {kw} is increasingly a prerequisite for engineers working on {category} problems. " + "We encourage you to experiment with the techniques described here and share your findings with the community.", + "As the field of {category} advances, {kw} will only grow in importance. " + "The patterns outlined in this article provide a solid foundation for further exploration.", + "Understanding {kw} deeply—not just at the API surface—will set you apart as a practitioner in {category}. " + "We hope this guide serves as a useful reference.", + "The journey into {kw} is long but rewarding. " + "Start small, measure rigorously, and iterate—that is the path to production-grade {category} systems.", +] + + +def make_body(title: str, keywords: list[str], category: str, year: int) -> str: + kws = keywords[:] + random.shuffle(kws) + kw1, kw2, kw3 = (kws + kws)[:3] + + intro = random.choice(INTRO_TEMPLATES).format( + title=title, year=year, kw1=kw1, kw2=kw2, kw3=kw3 + ) + + paragraphs = [intro] + for _ in range(random.randint(3, 6)): + n_sentences = random.randint(3, 6) + sentences = [] + for _ in range(n_sentences): + kw = random.choice(keywords) + tmpl = random.choice(BODY_SENTENCE_POOLS) + sentences.append(tmpl.format(kw=kw)) + sentences += fake.sentences(nb=random.randint(1, 3)) + random.shuffle(sentences) + paragraphs.append(" ".join(sentences)) + + closing_kw = random.choice(keywords) + paragraphs.append( + random.choice(CLOSING_TEMPLATES).format(kw=closing_kw, category=category) + ) + return "\n\n".join(paragraphs) + + +def make_published_at() -> str: + days = random.randint(0, 365 * YEAR_RANGE) + hours = random.randint(0, 23) + minutes = random.randint(0, 59) + dt = BASE_DATE + timedelta(days=days, hours=hours, minutes=minutes) + return dt.isoformat() + + +# --------------------------------------------------------------------------- +# Build seed pool +# --------------------------------------------------------------------------- +seed_pool: list[dict] = [] +year = 2025 + +for topic in TOPICS: + for sub, tmpl in product(topic["subtopics"], topic["title_templates"]): + title = tmpl.format(sub=sub.title(), year=year) + seed_pool.append( + { + "title": title, + "category": topic["category"], + "keywords": topic["subtopics"], + "primary_kw": sub, + } + ) + +random.shuffle(seed_pool) + +# --------------------------------------------------------------------------- +# Generate records +# --------------------------------------------------------------------------- +records = [] + +for i, seed in enumerate(CROSS_TOPIC_SEEDS, start=1): + kws = seed["keywords"] + records.append( + { + "id": i, + "title": seed["title"], + "content": make_body(seed["title"], kws, seed["category"], year), + "author": random.choice(AUTHORS), + "category": seed["category"], + "tags": ", ".join(random.sample(kws, min(3, len(kws)))), + "published_at": make_published_at(), + "views": random.randint(5_000, 80_000), + "likes": random.randint(200, 8_000), + } + ) + +doc_id = len(CROSS_TOPIC_SEEDS) + 1 +remaining = TARGET_ROWS - len(CROSS_TOPIC_SEEDS) +pool_cycle = (seed_pool * ((remaining // len(seed_pool)) + 2))[:remaining] + +for seed in pool_cycle: + kws = seed["keywords"] + title = seed["title"] + + qualifiers = ["", "", "", "How to Use ", "Why ", "When to Use ", ""] + q = random.choice(qualifiers) + if q and not title.startswith(("How", "Why", "When", "A ", "Building", "Running")): + title = q + title[0].lower() + title[1:] + + records.append( + { + "id": doc_id, + "title": title, + "content": make_body(title, kws, seed["category"], year), + "author": random.choice(AUTHORS), + "category": seed["category"], + "tags": ", ".join(random.sample(kws, min(4, len(kws)))), + "published_at": make_published_at(), + "views": random.randint(50, 60_000), + "likes": random.randint(0, 6_000), + } + ) + doc_id += 1 + +# --------------------------------------------------------------------------- +# Write Parquet +# --------------------------------------------------------------------------- +df = pd.DataFrame(records) + +schema = pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("title", pa.string()), + pa.field("content", pa.string()), + pa.field("author", pa.string()), + pa.field("category", pa.string()), + pa.field("tags", pa.string()), + pa.field("published_at", pa.string()), + pa.field("views", pa.int32()), + pa.field("likes", pa.int32()), + ] +) + +table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) +pq.write_table(table, OUT_PATH, compression="snappy") + +print(f"Wrote {len(records):,} records to {OUT_PATH}") +print(df["category"].value_counts().to_string()) diff --git a/vectors/elasticsearch/spicepod.yaml b/vectors/elasticsearch/spicepod.yaml new file mode 100644 index 0000000..bfa20fb --- /dev/null +++ b/vectors/elasticsearch/spicepod.yaml @@ -0,0 +1,43 @@ +version: v2 +kind: Spicepod +name: 06-search-engine + +embeddings: + - name: local_embeddings + from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2 + +datasets: + - from: file:./articles.parquet + name: articles + params: + file_format: parquet + acceleration: + enabled: true + engine: arrow + columns: + - name: content + embeddings: + - from: local_embeddings + row_id: id + chunking: + enabled: true + target_chunk_size: 150 + overlap_size: 20 + full_text_search: + enabled: true + row_id: + - id + - name: title + full_text_search: + enabled: true + row_id: + - id + vectors: + enabled: true + engine: elasticsearch + params: + elasticsearch_endpoint: http://localhost:9200 + elasticsearch_index: articles_search_engine + elasticsearch_vector_field: content_embedding + elasticsearch_user: elastic + elasticsearch_pass: spiceai From 591def3008aceebe6736ef6eee2ab76848fac312 Mon Sep 17 00:00:00 2001 From: ewgenius Date: Wed, 22 Apr 2026 23:52:44 +0900 Subject: [PATCH 2/7] tweak cookbooks --- elasticsearch/connector/README.md | 42 +++++++---- elasticsearch/connector/generate_data.py | 39 ++-------- elasticsearch/connector/load_data.py | 92 +----------------------- vectors/elasticsearch/generate_data.py | 64 ++++------------- vectors/elasticsearch/spicepod.yaml | 5 +- 5 files changed, 55 insertions(+), 187 deletions(-) diff --git a/elasticsearch/connector/README.md b/elasticsearch/connector/README.md index f238289..2b2538c 100644 --- a/elasticsearch/connector/README.md +++ b/elasticsearch/connector/README.md @@ -63,27 +63,45 @@ WHERE category = 'machine_learning' LIMIT 10; ``` -Inspect one row from the `all_types` index: - -```sql -SELECT * -FROM all_types -LIMIT 1; +``` ++----+----------------------------------------------------------------------------------+------------------+------------------+ +| id | title | category | author | +|int32| varchar | varchar | varchar | ++----+----------------------------------------------------------------------------------+------------------+------------------+ +| 6 | Cost-Aware AutoML on Kubernetes | machine_learning | Bob Martinez | +| 14 | Contrastive Learning Explained: From Theory to Production | machine_learning | Priya Sharma | +| 27 | How Federated Learning Is Transforming AI Applications | machine_learning | Quinn Walker | +| 32 | Scaling Few-Shot Learning to Billions of Parameters | machine_learning | Tom Brennan | +| 38 | Why understanding Generative Adversarial Networks Through Mathematical Intuition | machine_learning | Alice Chen | +| 40 | Few-Shot Learning: State of the Art in 2025 | machine_learning | Luca Ferrari | +| 45 | Attention Mechanisms: State of the Art in 2025 | machine_learning | Carol Okonkwo | +| 47 | Why understanding Fine-Tuning Through Mathematical Intuition | machine_learning | Ravi Subramaniam | +| 63 | Scaling Self-Supervised Learning to Billions of Parameters | machine_learning | Priya Sharma | +| 68 | A Practical Guide to Diffusion Models | machine_learning | Luca Ferrari | ++----+----------------------------------------------------------------------------------+------------------+------------------+ + +Time: 0.036074709 seconds. 10 rows. ``` Filter the `all_types` index on a keyword field: ```sql -SELECT id, field_keyword, field_integer +SELECT * FROM all_types WHERE field_keyword = 'category_0'; ``` -## Notes - -- `articles` and `all_types` are queried directly from Elasticsearch. -- The connector maps Elasticsearch index mappings to Arrow schemas so the data can be queried with SQL in Spice. -- For indices with compatible search fields, Spice can route `vector_search`, `text_search`, and `rrf` to Elasticsearch. +``` ++--------------------------+---------------+------------+-----------------------------------------------+----------------------+--------------------------------+-------------------------------------------------------------+--------------------------------------------+--------------------+--------------------------+-------------------------------------------------------+-------------+-------------------------+--------------------------------------+--------------------------------------------------------+------------------+---------------+-----------------------+---------------+---------------+---------------+---------------------------+---------------------------------------------------------------+-------------------+--------------------+--------------------+----------------------------------------+-------------+-----------------------------------------------------------+-------------------------------------+---------------------+---------------+----+ +| field_binary | field_boolean | field_byte | field_completion | field_date | field_date_nanos | field_date_range | field_dense_vector | field_double | field_double_range | field_flattened | field_float | field_float_range | field_geo_point | field_geo_shape | field_half_float | field_integer | field_integer_range | field_ip | field_keyword | field_long | field_long_range | field_nested | field_object.name | field_object.value | field_scaled_float | field_search_as_you_type | field_short | field_text | field_token_count | field_unsigned_long | field_version | id | +| varchar | boolean | int8 | varchar | varchar | varchar | varchar | float32[4] | float64 | varchar | varchar | float32 | varchar | varchar | varchar | float32 | int32 | varchar | varchar | varchar | int64 | varchar | varchar | varchar | int32 | float32 | varchar | int16 | varchar | varchar | varchar | varchar |int32| ++--------------------------+---------------+------------+-----------------------------------------------+----------------------+--------------------------------+-------------------------------------------------------------+--------------------------------------------+--------------------+--------------------------+-------------------------------------------------------+-------------+-------------------------+--------------------------------------+--------------------------------------------------------+------------------+---------------+-----------------------+---------------+---------------+---------------+---------------------------+---------------------------------------------------------------+-------------------+--------------------+--------------------+----------------------------------------+-------------+-----------------------------------------------------------+-------------------------------------+---------------------+---------------+----+ +| YmluYXJ5X3BheWxvYWRfNQ== | true | -114 | {"input":["suggest_5","doc_5"],"weight":6} | 2024-06-06T05:00:00Z | 2024-06-06T05:00:00.000000000Z | {"gte":"2024-01-06T00:00:00Z","lte":"2024-12-06T23:59:59Z"} | [-0.958924, -0.279415, 0.656987, 0.989358] | 680696.2410453358 | {"gte":5.5,"lte":5.51} | {"arbitrary_key":"value_5","nested_key":{"deep":5}} | 551.9171 | {"gte":5.0,"lte":5.5} | {"lat":-21.463575,"lon":-143.289215} | {"type":"Point","coordinates":[-90.240943,41.613069]} | -50.19 | 94455 | {"gte":50,"lte":55} | 192.168.5.35 | category_0 | 24150178885 | {"gte":5000,"lte":5100} | [{"tag":"tag_1","score":0.372},{"tag":"tag_2","score":0.868}] | obj_5 | 35 | 51.85 | searchable text for document number 5 | 14225 | The quick brown fox jumps over the lazy dog — document 5 | token count source text document 5 | 261035185072990349 | 1.5.0 | 5 | +| YmluYXJ5X3BheWxvYWRfMTA= | false | -121 | {"input":["suggest_10","doc_10"],"weight":11} | 2024-02-11T10:00:00Z | 2024-02-11T10:00:00.000000000Z | {"gte":"2024-01-11T00:00:00Z","lte":"2024-12-11T23:59:59Z"} | [-0.544021, -0.99999, -0.536573, 0.420167] | -587803.5357209966 | {"gte":11.0,"lte":11.01} | {"arbitrary_key":"value_10","nested_key":{"deep":10}} | 626.6425 | {"gte":10.0,"lte":10.5} | {"lat":-45.000598,"lon":163.014087} | {"type":"Point","coordinates":[178.760517,-81.979851]} | 64.72 | 12430 | {"gte":100,"lte":105} | 192.168.10.70 | category_0 | -955323551533 | {"gte":10000,"lte":10100} | [{"tag":"tag_1","score":0.521},{"tag":"tag_2","score":0.328}] | obj_10 | 70 | 653.47 | searchable text for document number 10 | 30482 | The quick brown fox jumps over the lazy dog — document 10 | token count source text document 10 | 79329244941176303 | 1.10.0 | 10 | ++--------------------------+---------------+------------+-----------------------------------------------+----------------------+--------------------------------+-------------------------------------------------------------+--------------------------------------------+--------------------+--------------------------+-------------------------------------------------------+-------------+-------------------------+--------------------------------------+--------------------------------------------------------+------------------+---------------+-----------------------+---------------+---------------+---------------+---------------------------+---------------------------------------------------------------+-------------------+--------------------+--------------------+----------------------------------------+-------------+-----------------------------------------------------------+-------------------------------------+---------------------+---------------+----+ + +Time: 0.024743022 seconds. 2 rows. +``` ## Learn more diff --git a/elasticsearch/connector/generate_data.py b/elasticsearch/connector/generate_data.py index 1bf9399..077d89f 100644 --- a/elasticsearch/connector/generate_data.py +++ b/elasticsearch/connector/generate_data.py @@ -1,17 +1,13 @@ #!/usr/bin/env python3 """ -Generate articles.parquet — sample data for Spice Elasticsearch connector tests. +Generate articles.parquet — sample data for the Spice Elasticsearch connector recipe. -Produces ~15 000 records with sufficient lexical and semantic diversity to -meaningfully exercise: - - Data connector (federated SQL via elasticsearch:articles) - - Vector search (vector_search UDTF) - - Full-text search (text_search UDTF / BM25) - - Hybrid search with RRF (rrf(vector_search(...), text_search(...))) +Produces article records with sufficient lexical diversity to exercise the +Elasticsearch data connector. Usage: pip install pandas pyarrow faker - python generate_data.py [--rows N] # default 15000 + python generate_data.py [--rows N] """ import argparse @@ -30,15 +26,7 @@ parser = argparse.ArgumentParser() parser.add_argument("--rows", type=int, default=100) parser.add_argument("--out", default="articles.parquet") -parser.add_argument( - "--embeddings", - action="store_true", - help="Compute content_embedding column using all-MiniLM-L6-v2 and write into parquet", -) -parser.add_argument( - "--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2" -) -parser.add_argument("--embedding-batch-size", type=int, default=64) + args = parser.parse_args() TARGET_ROWS = args.rows @@ -655,23 +643,6 @@ def make_published_at() -> str: pa.field("likes", pa.int32()), ] -if args.embeddings: - print(f"Computing embeddings with '{args.embedding_model}' …") - from sentence_transformers import SentenceTransformer - - model = SentenceTransformer(args.embedding_model) - texts = df["content"].tolist() - vecs = model.encode( - texts, - batch_size=args.embedding_batch_size, - show_progress_bar=True, - convert_to_numpy=True, - ) - dims = vecs.shape[1] - print(f"Computed {len(vecs)} embeddings, dims={dims}") - df["content_embedding"] = [v.tolist() for v in vecs] - fields.append(pa.field("content_embedding", pa.list_(pa.float32(), dims))) - schema = pa.schema(fields) table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) pq.write_table(table, OUT_PATH, compression="snappy") diff --git a/elasticsearch/connector/load_data.py b/elasticsearch/connector/load_data.py index e6059ec..55b61f7 100644 --- a/elasticsearch/connector/load_data.py +++ b/elasticsearch/connector/load_data.py @@ -3,9 +3,9 @@ Load articles.parquet into Elasticsearch. Usage: - uv run load_data.py # load articles index only - uv run load_data.py --embeddings # also compute + load vectors index - uv run load_data.py --embeddings --parquet 04-vector-search/articles.parquet + uv run load_data.py + uv run load_data.py --all-types + uv run load_data.py --parquet articles.parquet --all-types """ import argparse @@ -25,17 +25,6 @@ parser.add_argument("--es-pass", default=os.environ.get("ES_PASS", "spiceai")) parser.add_argument("--parquet", default="articles.parquet") parser.add_argument("--index", default="articles") -parser.add_argument( - "--embeddings", - action="store_true", - help="Also compute and load vectors into a dense_vector index", -) -parser.add_argument("--vectors-index", default="articles_vectors") -parser.add_argument( - "--embedding-model", default="sentence-transformers/all-MiniLM-L6-v2" -) -parser.add_argument("--embedding-column", default="content") -parser.add_argument("--batch-size", type=int, default=64) parser.add_argument( "--all-types", action="store_true", @@ -145,78 +134,6 @@ def load_articles(df: dict) -> None: print(f"Loaded {n} documents → '{INDEX}' (count={count})") -# --------------------------------------------------------------------------- -# Vectors index -# --------------------------------------------------------------------------- -def vectors_mapping(dims: int) -> dict: - return { - "settings": {"number_of_shards": 1, "number_of_replicas": 0}, - "mappings": { - "properties": { - "id": {"type": "integer"}, - "content": {"type": "text", "index": False}, - "content_embedding": { - "type": "dense_vector", - "dims": dims, - "index": True, - "similarity": "cosine", - }, - } - }, - } - - -def load_vectors(df: dict) -> None: - print(f"Loading embedding model '{args.embedding_model}' …") - from sentence_transformers import SentenceTransformer - - model = SentenceTransformer(args.embedding_model) - - texts = df[args.embedding_column] - ids = df["id"] - n = len(ids) - dims = model.get_embedding_dimension() - print( - f"Model dims={dims}, encoding {n} documents in batches of {args.batch_size} …" - ) - - recreate_index(args.vectors_index, vectors_mapping(dims)) - - for batch_start in range(0, n, args.batch_size): - batch_texts = texts[batch_start : batch_start + args.batch_size] - batch_ids = ids[batch_start : batch_start + args.batch_size] - embeddings = model.encode( - batch_texts, show_progress_bar=False, convert_to_numpy=True - ) - - lines = [] - for i, (doc_id, text, vec) in enumerate( - zip(batch_ids, batch_texts, embeddings) - ): - lines.append( - json.dumps({"index": {"_index": args.vectors_index, "_id": doc_id}}) - ) - lines.append( - json.dumps( - { - "id": doc_id, - "content": text, - "content_embedding": vec.tolist(), - } - ) - ) - bulk_request(lines) - print(f" Indexed {min(batch_start + args.batch_size, n)}/{n} vectors …") - - requests.post(f"{ES_HOST}/{args.vectors_index}/_refresh", auth=ES_AUTH) - count = ( - requests.get(f"{ES_HOST}/{args.vectors_index}/_count", auth=ES_AUTH) - .json() - .get("count", "?") - ) - print(f"Loaded {n} vectors → '{args.vectors_index}' (count={count})") - - # --------------------------------------------------------------------------- # all_types index — one document per supported ES field type # --------------------------------------------------------------------------- @@ -385,9 +302,6 @@ def load_all_types(n: int = 10) -> None: recreate_index(INDEX, ARTICLES_MAPPING) load_articles(df) - if args.embeddings: - load_vectors(df) - if args.all_types: load_all_types(args.all_types_docs) diff --git a/vectors/elasticsearch/generate_data.py b/vectors/elasticsearch/generate_data.py index ba4edcb..643fafb 100644 --- a/vectors/elasticsearch/generate_data.py +++ b/vectors/elasticsearch/generate_data.py @@ -9,7 +9,6 @@ import argparse import random -from datetime import datetime, timedelta from itertools import product import pandas as pd @@ -387,16 +386,6 @@ }, ] -AUTHORS = [ - "Alice Chen", "Bob Martinez", "Carol Okonkwo", "David Kim", "Eva Lindqvist", - "Frank Nguyen", "Grace Patel", "Hiro Tanaka", "Ingrid Sørensen", "James O'Brien", - "Kavya Reddy", "Luca Ferrari", "Maya Goldberg", "Nadia Petrov", "Oscar Bergström", - "Priya Sharma", "Quinn Walker", "Ravi Subramaniam", "Sofia Andrade", "Tom Brennan", -] - -BASE_DATE = datetime(2024, 1, 1) -YEAR_RANGE = 2 - # --------------------------------------------------------------------------- # Content generation # --------------------------------------------------------------------------- @@ -452,31 +441,21 @@ def make_body(title: str, keywords: list[str], category: str, year: int) -> str: title=title, year=year, kw1=kw1, kw2=kw2, kw3=kw3 ) - paragraphs = [intro] - for _ in range(random.randint(3, 6)): - n_sentences = random.randint(3, 6) - sentences = [] - for _ in range(n_sentences): - kw = random.choice(keywords) - tmpl = random.choice(BODY_SENTENCE_POOLS) - sentences.append(tmpl.format(kw=kw)) - sentences += fake.sentences(nb=random.randint(1, 3)) - random.shuffle(sentences) - paragraphs.append(" ".join(sentences)) + sentences = [] + for _ in range(random.randint(2, 3)): + kw = random.choice(keywords) + tmpl = random.choice(BODY_SENTENCE_POOLS) + sentences.append(tmpl.format(kw=kw)) + + sentences += fake.sentences(nb=1) + random.shuffle(sentences) closing_kw = random.choice(keywords) - paragraphs.append( - random.choice(CLOSING_TEMPLATES).format(kw=closing_kw, category=category) + closing = random.choice(CLOSING_TEMPLATES).format( + kw=closing_kw, category=category ) - return "\n\n".join(paragraphs) - -def make_published_at() -> str: - days = random.randint(0, 365 * YEAR_RANGE) - hours = random.randint(0, 23) - minutes = random.randint(0, 59) - dt = BASE_DATE + timedelta(days=days, hours=hours, minutes=minutes) - return dt.isoformat() + return " ".join([intro, " ".join(sentences), closing]) # --------------------------------------------------------------------------- @@ -510,13 +489,8 @@ def make_published_at() -> str: { "id": i, "title": seed["title"], - "content": make_body(seed["title"], kws, seed["category"], year), - "author": random.choice(AUTHORS), "category": seed["category"], - "tags": ", ".join(random.sample(kws, min(3, len(kws)))), - "published_at": make_published_at(), - "views": random.randint(5_000, 80_000), - "likes": random.randint(200, 8_000), + "content": make_body(seed["title"], kws, seed["category"], year), } ) @@ -537,13 +511,8 @@ def make_published_at() -> str: { "id": doc_id, "title": title, - "content": make_body(title, kws, seed["category"], year), - "author": random.choice(AUTHORS), "category": seed["category"], - "tags": ", ".join(random.sample(kws, min(4, len(kws)))), - "published_at": make_published_at(), - "views": random.randint(50, 60_000), - "likes": random.randint(0, 6_000), + "content": make_body(title, kws, seed["category"], year), } ) doc_id += 1 @@ -557,13 +526,8 @@ def make_published_at() -> str: [ pa.field("id", pa.int32()), pa.field("title", pa.string()), - pa.field("content", pa.string()), - pa.field("author", pa.string()), pa.field("category", pa.string()), - pa.field("tags", pa.string()), - pa.field("published_at", pa.string()), - pa.field("views", pa.int32()), - pa.field("likes", pa.int32()), + pa.field("content", pa.string()), ] ) diff --git a/vectors/elasticsearch/spicepod.yaml b/vectors/elasticsearch/spicepod.yaml index bfa20fb..fad524a 100644 --- a/vectors/elasticsearch/spicepod.yaml +++ b/vectors/elasticsearch/spicepod.yaml @@ -21,8 +21,9 @@ datasets: row_id: id chunking: enabled: true - target_chunk_size: 150 - overlap_size: 20 + target_chunk_size: 256 + overlap_size: 64 + file_format: md full_text_search: enabled: true row_id: From 7f8c74927b3f6b435ec9f9a706d873f50708ebd5 Mon Sep 17 00:00:00 2001 From: ewgenius Date: Wed, 22 Apr 2026 23:53:35 +0900 Subject: [PATCH 3/7] gitignore --- elasticsearch/connector/.gitignore | 1 + vectors/elasticsearch/.gitignore | 1 + 2 files changed, 2 insertions(+) create mode 100644 elasticsearch/connector/.gitignore create mode 100644 vectors/elasticsearch/.gitignore diff --git a/elasticsearch/connector/.gitignore b/elasticsearch/connector/.gitignore new file mode 100644 index 0000000..4bed5da --- /dev/null +++ b/elasticsearch/connector/.gitignore @@ -0,0 +1 @@ +*.parquet diff --git a/vectors/elasticsearch/.gitignore b/vectors/elasticsearch/.gitignore new file mode 100644 index 0000000..4bed5da --- /dev/null +++ b/vectors/elasticsearch/.gitignore @@ -0,0 +1 @@ +*.parquet From 566ef580d5d4ee28ac1e08c2198072841f84f5d5 Mon Sep 17 00:00:00 2001 From: ewgenius Date: Thu, 23 Apr 2026 00:03:45 +0900 Subject: [PATCH 4/7] tweak --- vectors/elasticsearch/README.md | 15 ++++++++------- vectors/elasticsearch/generate_data.py | 23 +++++------------------ 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md index a106f3d..74cb9cb 100644 --- a/vectors/elasticsearch/README.md +++ b/vectors/elasticsearch/README.md @@ -7,7 +7,7 @@ This recipe demonstrates how to use Elasticsearch as a vector engine in Spice.ai - Run Elasticsearch locally with Docker Compose - Generate a sample articles dataset - Start Spice and automatically write embeddings into Elasticsearch -- Run vector and hybrid search queries against the indexed embeddings +- Run vector and hybrid search queries that match the generated article titles and content ## Prerequisites @@ -50,6 +50,7 @@ On startup, Spice automatically: 4. Bulk indexes the vectors into Elasticsearch When startup completes, the `articles` dataset is ready for vector and hybrid search queries. +Use search prompts that mirror the generated article topics, such as `pgvector`, `Kubernetes`, `AutoML`, and `hybrid search`. ### Step 3: Open the Spice SQL REPL @@ -64,8 +65,8 @@ spice sql Run semantic similarity search over the indexed embeddings: ```sql -SELECT id, title, _score -FROM vector_search(articles, 'semantic similarity retrieval', 10) +SELECT id, title, category, _score +FROM vector_search(articles, 'pgvector vector databases sql query optimisation', 10) ORDER BY _score DESC; ``` @@ -73,8 +74,8 @@ Run vector search with a post-filter: ```sql SELECT id, title, category, _score -FROM vector_search(articles, 'cost optimisation cloud infrastructure', 10) -WHERE category = 'cloud_infrastructure' +FROM vector_search(articles, 'kubernetes cost optimisation automl', 10) +WHERE category = 'machine_learning' ORDER BY _score DESC; ``` @@ -83,8 +84,8 @@ Fuse vector and keyword results with [Reciprocal Rank Fusion (RRF)](https://spic ```sql SELECT id, title, category, fused_score FROM rrf( - vector_search(articles, 'machine learning algorithms'), - text_search(articles, 'machine learning algorithms', content), + vector_search(articles, 'hybrid search elasticsearch pgvector'), + text_search(articles, 'hybrid search elasticsearch pgvector', content), join_key => 'id' ) ORDER BY fused_score DESC diff --git a/vectors/elasticsearch/generate_data.py b/vectors/elasticsearch/generate_data.py index 643fafb..8ef1073 100644 --- a/vectors/elasticsearch/generate_data.py +++ b/vectors/elasticsearch/generate_data.py @@ -435,27 +435,14 @@ def make_body(title: str, keywords: list[str], category: str, year: int) -> str: kws = keywords[:] random.shuffle(kws) - kw1, kw2, kw3 = (kws + kws)[:3] + kw1, kw2 = (kws + kws)[:2] - intro = random.choice(INTRO_TEMPLATES).format( - title=title, year=year, kw1=kw1, kw2=kw2, kw3=kw3 - ) - - sentences = [] - for _ in range(random.randint(2, 3)): - kw = random.choice(keywords) - tmpl = random.choice(BODY_SENTENCE_POOLS) - sentences.append(tmpl.format(kw=kw)) + intro = f"{title}. {kw1} and {kw2} are key ideas in {category.replace('_', ' ')}." - sentences += fake.sentences(nb=1) - random.shuffle(sentences) - - closing_kw = random.choice(keywords) - closing = random.choice(CLOSING_TEMPLATES).format( - kw=closing_kw, category=category - ) + kw = random.choice(keywords) + body = random.choice(BODY_SENTENCE_POOLS).format(kw=kw) - return " ".join([intro, " ".join(sentences), closing]) + return " ".join([intro, body]) # --------------------------------------------------------------------------- From 6077d14388f1b943a61a9155f61cf49d0f4c413b Mon Sep 17 00:00:00 2001 From: ewgenius Date: Thu, 23 Apr 2026 00:25:35 +0900 Subject: [PATCH 5/7] tweak data --- vectors/elasticsearch/README.md | 31 ++++++--- vectors/elasticsearch/generate_data.py | 91 ++++++++++++++++++++++++-- vectors/elasticsearch/spicepod.yaml | 7 +- 3 files changed, 115 insertions(+), 14 deletions(-) diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md index 74cb9cb..19ad809 100644 --- a/vectors/elasticsearch/README.md +++ b/vectors/elasticsearch/README.md @@ -7,7 +7,7 @@ This recipe demonstrates how to use Elasticsearch as a vector engine in Spice.ai - Run Elasticsearch locally with Docker Compose - Generate a sample articles dataset - Start Spice and automatically write embeddings into Elasticsearch -- Run vector and hybrid search queries that match the generated article titles and content +- Run vector and hybrid search queries using meaningful phrases and natural-language questions ## Prerequisites @@ -50,7 +50,7 @@ On startup, Spice automatically: 4. Bulk indexes the vectors into Elasticsearch When startup completes, the `articles` dataset is ready for vector and hybrid search queries. -Use search prompts that mirror the generated article topics, such as `pgvector`, `Kubernetes`, `AutoML`, and `hybrid search`. +Use phrase-based prompts and natural-language questions that mirror the generated article topics, such as `pgvector`, `Kubernetes`, `AutoML`, and `hybrid search`. ### Step 3: Open the Spice SQL REPL @@ -62,19 +62,27 @@ spice sql ## Run Queries -Run semantic similarity search over the indexed embeddings: +Run semantic similarity search over the indexed embeddings using a phrase-based prompt: ```sql SELECT id, title, category, _score -FROM vector_search(articles, 'pgvector vector databases sql query optimisation', 10) +FROM vector_search( + articles, + 'How does pgvector improve SQL query optimisation for vector search?', + 10 +) ORDER BY _score DESC; ``` -Run vector search with a post-filter: +Run vector search with a post-filter using a natural-language question: ```sql SELECT id, title, category, _score -FROM vector_search(articles, 'kubernetes cost optimisation automl', 10) +FROM vector_search( + articles, + 'What are the trade-offs of cost-aware AutoML on Kubernetes?', + 10 +) WHERE category = 'machine_learning' ORDER BY _score DESC; ``` @@ -84,8 +92,15 @@ Fuse vector and keyword results with [Reciprocal Rank Fusion (RRF)](https://spic ```sql SELECT id, title, category, fused_score FROM rrf( - vector_search(articles, 'hybrid search elasticsearch pgvector'), - text_search(articles, 'hybrid search elasticsearch pgvector', content), + vector_search( + articles, + 'How can hybrid search combine Elasticsearch and pgvector effectively?' + ), + text_search( + articles, + 'How can hybrid search combine Elasticsearch and pgvector effectively?', + content + ), join_key => 'id' ) ORDER BY fused_score DESC diff --git a/vectors/elasticsearch/generate_data.py b/vectors/elasticsearch/generate_data.py index 8ef1073..9d2559c 100644 --- a/vectors/elasticsearch/generate_data.py +++ b/vectors/elasticsearch/generate_data.py @@ -435,14 +435,95 @@ def make_body(title: str, keywords: list[str], category: str, year: int) -> str: kws = keywords[:] random.shuffle(kws) - kw1, kw2 = (kws + kws)[:2] + selected = (kws + kws)[:6] + kw1, kw2, kw3, kw4, kw5, kw6 = selected - intro = f"{title}. {kw1} and {kw2} are key ideas in {category.replace('_', ' ')}." + category_label = category.replace("_", " ") - kw = random.choice(keywords) - body = random.choice(BODY_SENTENCE_POOLS).format(kw=kw) + intro = ( + f"{title}\n\n" + f"In {year}, {category_label} teams are increasingly asking practical questions such as " + f"\"How do we apply {kw1} in production?\", " + f"\"When does {kw2} outperform older approaches?\", and " + f"\"What trade-offs should we expect when adopting {kw3}?\" " + f"This article explores those questions through concrete examples, implementation patterns, " + f"and lessons learned from real systems." + ) + + sections = [ + ( + f"What problem does {kw1} solve in modern {category_label} systems?\n\n" + f"At a high level, {kw1} helps teams improve how they design, operate, and scale their systems. " + f"When engineers first encounter {kw1}, they often focus on surface-level features, but the real value " + f"usually appears when it is combined with adjacent ideas such as {kw2} and {kw3}. " + f"In practice, this means better clarity around system behavior, faster iteration cycles, and more predictable " + f"performance in production environments.\n\n" + f"A common question is: \"How should we evaluate {kw1} before rolling it out widely?\" " + f"A good starting point is to define one or two measurable goals, such as reducing latency, increasing relevance, " + f"or simplifying operational workflows. Teams that skip this step often end up discussing {kw1} in abstract terms " + f"without learning whether it actually improves outcomes for users." + ), + ( + f"How does {kw2} affect implementation choices?\n\n" + f"Implementation details matter. The way a team approaches {kw2} can influence data modeling, indexing strategy, " + f"query design, and even incident response. For example, engineers working with {kw2} often discover that the hardest " + f"part is not getting a basic demo running, but making it observable, cost-effective, and robust under real traffic.\n\n" + f"Another practical question is: \"What should we monitor once {kw2} is live?\" " + f"Useful signals include latency percentiles, throughput, error rates, and the quality of outputs returned to users. " + f"If those signals drift over time, the team can inspect whether {kw4}, infrastructure constraints, or poor query patterns " + f"are introducing regressions." + ), + ( + f"Why do teams pair {kw3} with {kw4}?\n\n" + f"These topics are often discussed together because they reinforce each other. " + f"{kw3} can improve the expressiveness or quality of a system, while {kw4} helps ensure that the system remains stable " + f"and understandable as complexity grows. This combination is especially useful in architectures where the same dataset " + f"must support multiple access patterns such as analytics, retrieval, filtering, and ranking.\n\n" + f"A phrase that often appears in internal design reviews is \"progressive adoption.\" " + f"Instead of rewriting an entire platform, teams usually introduce {kw3} and {kw4} in stages. " + f"They start with one workflow, validate the impact, and then extend the pattern to adjacent services once the operational " + f"trade-offs are clear." + ), + ( + f"What are the operational trade-offs of adopting {kw5}?\n\n" + f"Every meaningful architectural choice introduces trade-offs. " + f"{kw5} may improve developer productivity or system capability, but it can also add moving parts that require careful tuning. " + f"Engineers should ask questions like: \"How much additional storage will this require?\" " + f"\"Will it change the indexing pipeline?\" and \"How will we debug failures when results look plausible but are subtly wrong?\"\n\n" + f"Answering those questions usually requires a mix of benchmarking and qualitative review. " + f"Teams that succeed with {kw5} tend to document their assumptions, capture representative workloads, and compare multiple " + f"approaches before standardizing on one design." + ), + ( + f"When should you choose {kw6} over simpler alternatives?\n\n" + f"The best choice depends on context. Sometimes {kw6} is clearly justified because the workload is large, the relevance " + f"requirements are strict, or the user experience depends on high-quality retrieval. In other situations, a simpler approach " + f"may be easier to explain, cheaper to operate, and good enough for the job.\n\n" + f"A useful decision framework is to ask whether {kw6} solves a problem that users can actually feel. " + f"If the answer is yes, the investment is often worthwhile. If not, a smaller design may create more value by reducing " + f"maintenance burden while keeping the system understandable for the team." + ), + ( + f"Practical guidance for evaluation\n\n" + f"If you are exploring {title.lower()}, start with a narrow slice of the problem and use realistic data. " + f"Test how well the system handles representative phrases and natural-language questions, not just isolated keywords. " + f"For example, instead of evaluating only a term such as \"{kw1}\", try prompts like " + f"\"What are the trade-offs of {kw1}?\" or " + f"\"How does {kw2} improve production reliability?\" " + f"Those richer prompts usually reveal whether the system truly captures meaning or only memorizes exact wording.\n\n" + f"Over time, the most successful teams treat {kw1}, {kw2}, and {kw3} as part of a larger operating model rather than a single feature. " + f"They refine prompts, improve datasets, and monitor how changes affect relevance, latency, and operator confidence." + ), + ] + + closing = ( + f"In summary, {title.lower()} is best understood as a practical engineering topic rather than a buzzword. " + f"Teams that ask clear questions, benchmark carefully, and connect concepts like {kw1}, {kw2}, and {kw3} to user-facing outcomes " + f"are far more likely to see durable results. That is why topics such as {kw4}, {kw5}, and {kw6} continue to matter across modern " + f"{category_label} systems." + ) - return " ".join([intro, body]) + return "\n\n".join([intro, *sections, closing]) # --------------------------------------------------------------------------- diff --git a/vectors/elasticsearch/spicepod.yaml b/vectors/elasticsearch/spicepod.yaml index fad524a..2a5f87d 100644 --- a/vectors/elasticsearch/spicepod.yaml +++ b/vectors/elasticsearch/spicepod.yaml @@ -6,6 +6,11 @@ embeddings: - name: local_embeddings from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2 + - from: openai:text-embedding-3-small + name: openai_embeddings + params: + openai_api_key: ${secrets:OPENAI_API_KEY} + datasets: - from: file:./articles.parquet name: articles @@ -17,7 +22,7 @@ datasets: columns: - name: content embeddings: - - from: local_embeddings + - from: openai_embeddings row_id: id chunking: enabled: true From e7d0fbec2631e38c93a80b6fd755cc459c7fcc8d Mon Sep 17 00:00:00 2001 From: ewgenius Date: Thu, 23 Apr 2026 00:30:18 +0900 Subject: [PATCH 6/7] update config --- vectors/elasticsearch/README.md | 9 ++++++--- vectors/elasticsearch/spicepod.yaml | 6 +----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md index 19ad809..721e334 100644 --- a/vectors/elasticsearch/README.md +++ b/vectors/elasticsearch/README.md @@ -6,7 +6,7 @@ This recipe demonstrates how to use Elasticsearch as a vector engine in Spice.ai - Run Elasticsearch locally with Docker Compose - Generate a sample articles dataset -- Start Spice and automatically write embeddings into Elasticsearch +- Start Spice and automatically write OpenAI embeddings into Elasticsearch - Run vector and hybrid search queries using meaningful phrases and natural-language questions ## Prerequisites @@ -46,12 +46,14 @@ On startup, Spice automatically: 1. Creates the `articles_search_engine` Elasticsearch index with a `dense_vector` mapping 2. Loads the article records -3. Computes embeddings locally using `all-MiniLM-L6-v2` +3. Computes embeddings using OpenAI `text-embedding-3-small` 4. Bulk indexes the vectors into Elasticsearch When startup completes, the `articles` dataset is ready for vector and hybrid search queries. Use phrase-based prompts and natural-language questions that mirror the generated article topics, such as `pgvector`, `Kubernetes`, `AutoML`, and `hybrid search`. +This avoids `dense_vector` mapping conflicts when the embedding dimensions change, such as `384` vs `1536`. + ### Step 3: Open the Spice SQL REPL In a new terminal, start the SQL REPL: @@ -110,7 +112,8 @@ LIMIT 10; ## Notes - The Elasticsearch index is created automatically. No manual mapping step is required. -- Embeddings are generated during the initial dataset refresh. +- Embeddings are generated during the initial dataset refresh using OpenAI `text-embedding-3-small`. +- If the Elasticsearch index already exists from a previous run with a different embedding model, delete it before rerunning the recipe so Spice can recreate the mapping with the correct vector dimensions. ## Learn more diff --git a/vectors/elasticsearch/spicepod.yaml b/vectors/elasticsearch/spicepod.yaml index 2a5f87d..4f3abbd 100644 --- a/vectors/elasticsearch/spicepod.yaml +++ b/vectors/elasticsearch/spicepod.yaml @@ -3,9 +3,6 @@ kind: Spicepod name: 06-search-engine embeddings: - - name: local_embeddings - from: huggingface:huggingface.co/sentence-transformers/all-MiniLM-L6-v2 - - from: openai:text-embedding-3-small name: openai_embeddings params: @@ -28,7 +25,6 @@ datasets: enabled: true target_chunk_size: 256 overlap_size: 64 - file_format: md full_text_search: enabled: true row_id: @@ -43,7 +39,7 @@ datasets: engine: elasticsearch params: elasticsearch_endpoint: http://localhost:9200 - elasticsearch_index: articles_search_engine + elasticsearch_index: articles_index elasticsearch_vector_field: content_embedding elasticsearch_user: elastic elasticsearch_pass: spiceai From 8b7c87b0478e7443e441475779b6694450afc322 Mon Sep 17 00:00:00 2001 From: ewgenius Date: Thu, 23 Apr 2026 00:35:02 +0900 Subject: [PATCH 7/7] update examples --- vectors/elasticsearch/README.md | 51 +++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/vectors/elasticsearch/README.md b/vectors/elasticsearch/README.md index 721e334..146f9e1 100644 --- a/vectors/elasticsearch/README.md +++ b/vectors/elasticsearch/README.md @@ -76,6 +76,22 @@ FROM vector_search( ORDER BY _score DESC; ``` +``` ++-----+----------------------------------------------------------------------+----------------+------------+ +| id | title | category | _score | +|int32| varchar | varchar | float64 | ++-----+----------------------------------------------------------------------+----------------+------------+ +| 1 | Vector Search Inside PostgreSQL with pgvector | databases | 0.8442027 | +| 7 | Hybrid Search with Elasticsearch and pgvector | search_engines | 0.82845736 | +| 211 | How Vector Databases Impacts Query Performance | databases | 0.78836733 | +| 26 | When to Use choosing Between Sql Query Optimisation and Alternatives | databases | 0.7725092 | +| 290 | Write-Ahead Logging for High-Throughput Applications | databases | 0.7686081 | +| 83 | Why debugging Slow Queries with In-Memory Databases | databases | 0.76624966 | ++-----+----------------------------------------------------------------------+----------------+------------+ + +Time: 0.491752584 seconds. 6 rows. +``` + Run vector search with a post-filter using a natural-language question: ```sql @@ -89,6 +105,19 @@ WHERE category = 'machine_learning' ORDER BY _score DESC; ``` +``` ++-----+------------------------------------------------------------+------------------+------------+ +| id | title | category | _score | +|int32| varchar | varchar | float64 | ++-----+------------------------------------------------------------+------------------+------------+ +| 6 | Cost-Aware AutoML on Kubernetes | machine_learning | 0.8782016 | +| 208 | Benchmarking Transformer Models Across Popular Frameworks | machine_learning | 0.80589664 | +| 106 | Implementing Generative Adversarial Networks Without a PhD | machine_learning | 0.80574334 | ++-----+------------------------------------------------------------+------------------+------------+ + +Time: 0.246288086 seconds. 3 rows. +``` + Fuse vector and keyword results with [Reciprocal Rank Fusion (RRF)](https://spiceai.org/docs/next/features/search#hybrid-search-with-rrf): ```sql @@ -109,11 +138,23 @@ ORDER BY fused_score DESC LIMIT 10; ``` -## Notes - -- The Elasticsearch index is created automatically. No manual mapping step is required. -- Embeddings are generated during the initial dataset refresh using OpenAI `text-embedding-3-small`. -- If the Elasticsearch index already exists from a previous run with a different embedding model, delete it before rerunning the recipe so Spice can recreate the mapping with the correct vector dimensions. +``` ++-----+---------------------------------------------------------------------+----------------+----------------------+ +| id | title | category | fused_score | +|int32| varchar | varchar | float64 | ++-----+---------------------------------------------------------------------+----------------+----------------------+ +| 7 | Hybrid Search with Elasticsearch and pgvector | search_engines | 0.03278688524590164 | +| 1 | Vector Search Inside PostgreSQL with pgvector | databases | 0.03225806451612903 | +| 164 | How Embedding Models Powers Modern Search | search_engines | 0.031746031746031744 | +| 148 | Why evaluating Sparse Retrieval: Metrics That Matter | search_engines | 0.030776515151515152 | +| 21 | Comparing Bi-Encoder Retrieval Approaches in 2025 | search_engines | 0.02804284323271665 | +| 295 | Full-Text Search Under the Hood: Architecture and Trade-offs | search_engines | 0.015625 | +| 78 | Comparing Sparse Retrieval Approaches in 2025 | search_engines | 0.015384615384615385 | +| 264 | Reciprocal Rank Fusion Under the Hood: Architecture and Trade-offs | search_engines | 0.014925373134328358 | +| 139 | How to Use cross-Encoder Reranking at Scale: Lessons from the Field | search_engines | 0.014705882352941176 | +| 336 | A Developer's Guide to Embedding Models | search_engines | 0.014492753623188406 | ++-----+---------------------------------------------------------------------+----------------+----------------------+ +``` ## Learn more