Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/index.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ jobs:
run: mergelore-index
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
MERGELORE_EMBED_MODEL: text-embedding-3-small
MERGELORE_QDRANT_URL: ${{ secrets.QDRANT_URL }}
QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_EVENT_PATH: ${{ github.event_path }}
# Default uses local FastEmbed (bge-small-en-v1.5) - no API key needed
# To use OpenAI embeddings instead, install the optional extra (pip install 'mergelore-indexer[openai]') and uncomment:
# MERGELORE_EMBED_MODEL: text-embedding-3-small
# OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
4 changes: 2 additions & 2 deletions indexer/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ branding:

inputs:
embed-model:
description: "Embedding model: text-embedding-3-small | nomic-embed-text | embed-english-v3.0"
default: text-embedding-3-small
description: "Embedding model: bge-small-en-v1.5 (default, local) | all-MiniLM-L6-v2 | nomic-embed-text-v1.5 | text-embedding-3-small (OpenAI) | embed-english-v3.0 (Cohere)"
default: bge-small-en-v1.5
qdrant-url:
description: Qdrant URL (e.g., http://qdrant:6333)
required: true
Expand Down
7 changes: 4 additions & 3 deletions indexer/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@ requires-python = ">=3.12"
license = "MIT"
dependencies = [
"qdrant-client>=1.12.0",
"openai>=1.60.0",
"cohere>=5.0.0",
"fastembed>=0.4.0",
"tiktoken>=0.8.0",
"PyGithub>=2.5.0",
]

[project.optional-dependencies]
local = ["sentence-transformers>=3.0.0"]
openai = ["openai>=1.60.0"]
cohere = ["cohere>=5.0.0"]
all = ["openai>=1.60.0", "cohere>=5.0.0"]
dev = ["pytest>=8.0", "pytest-asyncio>=0.24", "pytest-cov", "ruff"]

[project.scripts]
Expand Down
14 changes: 6 additions & 8 deletions indexer/src/mergelore_indexer/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@
import os
from dataclasses import dataclass

VALID_EMBED_MODELS = [
"text-embedding-3-small",
"nomic-embed-text",
"embed-english-v3.0",
]
from .embed.factory import ALL_MODELS

DEFAULT_EMBED_MODEL = "bge-small-en-v1.5"


@dataclass(frozen=True)
Expand Down Expand Up @@ -37,11 +35,11 @@ def load_config() -> IndexerConfig:
if not github_token:
raise ValueError("GITHUB_TOKEN is required.")

embed_model = os.environ.get("MERGELORE_EMBED_MODEL", "text-embedding-3-small")
if embed_model not in VALID_EMBED_MODELS:
embed_model = os.environ.get("MERGELORE_EMBED_MODEL", DEFAULT_EMBED_MODEL)
if embed_model not in ALL_MODELS:
raise ValueError(
f"Invalid MERGELORE_EMBED_MODEL: '{embed_model}'. "
f"Valid options: {', '.join(VALID_EMBED_MODELS)}"
f"Valid options: {', '.join(ALL_MODELS)}"
)

qdrant_url = os.environ.get("MERGELORE_QDRANT_URL", "")
Expand Down
31 changes: 24 additions & 7 deletions indexer/src/mergelore_indexer/embed/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,49 @@
from __future__ import annotations

from .interface import EmbeddingProvider
from .openai_embed import OpenAIEmbedding
from .nomic import NomicEmbedding
from .cohere_embed import CohereEmbedding
from .fastembed_embed import FastEmbedEmbedding, MODELS as FASTEMBED_MODELS

FASTEMBED_KEYS = list(FASTEMBED_MODELS.keys())

ALL_MODELS = FASTEMBED_KEYS + ["text-embedding-3-small", "embed-english-v3.0"]


def create_embedding_provider(
model_name: str,
openai_api_key: str | None = None,
cohere_api_key: str | None = None,
) -> EmbeddingProvider:
# FastEmbed models (local, no API key needed)
if model_name in FASTEMBED_MODELS:
return FastEmbedEmbedding(model_key=model_name)

match model_name:
case "text-embedding-3-small":
try:
from .openai_embed import OpenAIEmbedding
except ImportError:
raise ImportError(
"OpenAI embedding requires the openai package. "
"Install with: pip install 'mergelore-indexer[openai]'"
)
if not openai_api_key:
raise ValueError("OPENAI_API_KEY is required for text-embedding-3-small.")
return OpenAIEmbedding(api_key=openai_api_key)

case "nomic-embed-text":
return NomicEmbedding()

case "embed-english-v3.0":
try:
from .cohere_embed import CohereEmbedding
except ImportError:
raise ImportError(
"Cohere embedding requires the cohere package. "
"Install with: pip install 'mergelore-indexer[cohere]'"
)
if not cohere_api_key:
raise ValueError("COHERE_API_KEY is required for embed-english-v3.0.")
return CohereEmbedding(api_key=cohere_api_key)

case _:
raise ValueError(
f"Unknown embedding model: '{model_name}'. "
f"Valid options: text-embedding-3-small, nomic-embed-text, embed-english-v3.0"
f"Valid options: {', '.join(ALL_MODELS)}"
)
39 changes: 39 additions & 0 deletions indexer/src/mergelore_indexer/embed/fastembed_embed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""FastEmbed provider - local ONNX-based embedding (no API key needed)."""

from __future__ import annotations

from fastembed import TextEmbedding


# Registry of supported FastEmbed models, keyed by the short model key that
# FastEmbedEmbedding accepts. For each entry:
#   "name"      - model identifier handed to fastembed's TextEmbedding
#   "dimension" - vector size the provider reports via its `dimension` property
MODELS = {
    "bge-small-en-v1.5": {"name": "BAAI/bge-small-en-v1.5", "dimension": 384},
    "all-MiniLM-L6-v2": {"name": "sentence-transformers/all-MiniLM-L6-v2", "dimension": 384},
    "nomic-embed-text-v1.5": {"name": "nomic-ai/nomic-embed-text-v1.5", "dimension": 768},
}

# Key used when FastEmbedEmbedding is constructed without an explicit model_key.
# Must be one of the keys of MODELS.
DEFAULT_MODEL = "bge-small-en-v1.5"


class FastEmbedEmbedding:
    """Embedding provider that runs a FastEmbed ``TextEmbedding`` model locally.

    The model is chosen by a short key from ``MODELS``; no API key is involved.
    """

    def __init__(self, model_key: str = DEFAULT_MODEL) -> None:
        """Look up ``model_key`` in ``MODELS`` and instantiate the model.

        Args:
            model_key: One of the keys of ``MODELS``.

        Raises:
            ValueError: If ``model_key`` is not a key of ``MODELS``.
        """
        spec = MODELS.get(model_key)
        if spec is None:
            valid = ", ".join(MODELS)
            raise ValueError(
                f"Unknown FastEmbed model: '{model_key}'. Valid options: {valid}"
            )
        self._dimension = spec["dimension"]
        self._model = TextEmbedding(model_name=spec["name"])

    @property
    def name(self) -> str:
        """Provider identifier; always ``"fastembed"`` regardless of the model."""
        return "fastembed"

    @property
    def dimension(self) -> int:
        """Vector size recorded in ``MODELS`` for the selected model."""
        return self._dimension

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` and return one list of floats per input text.

        The underlying fastembed call is synchronous and runs inline; it is
        not awaited or offloaded to another thread.
        """
        # Each item yielded by the model is converted with .tolist()
        # (presumably a numpy array - confirm against fastembed).
        return [vector.tolist() for vector in self._model.embed(texts)]
25 changes: 14 additions & 11 deletions indexer/tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,27 @@ def clean_env(monkeypatch):


def set_valid_env(monkeypatch):
"""Set minimal valid env - uses default FastEmbed model, no API keys needed."""
monkeypatch.setenv("GITHUB_TOKEN", "gh-test")
monkeypatch.setenv("MERGELORE_EMBED_MODEL", "text-embedding-3-small")
monkeypatch.setenv("MERGELORE_QDRANT_URL", "http://qdrant:6333")
monkeypatch.setenv("OPENAI_API_KEY", "sk-test")
monkeypatch.setenv("GITHUB_REPOSITORY", "owner/repo")


def test_loads_valid_config(monkeypatch):
def test_loads_valid_config_with_defaults(monkeypatch):
set_valid_env(monkeypatch)
config = load_config()
assert config.embed_model == "text-embedding-3-small"
assert config.embed_model == "bge-small-en-v1.5"
assert config.owner == "owner"
assert config.repo == "repo"
assert config.openai_api_key is None


def test_loads_openai_model(monkeypatch):
set_valid_env(monkeypatch)
monkeypatch.setenv("MERGELORE_EMBED_MODEL", "text-embedding-3-small")
monkeypatch.setenv("OPENAI_API_KEY", "sk-test")
config = load_config()
assert config.embed_model == "text-embedding-3-small"


def test_missing_github_token(monkeypatch):
Expand All @@ -54,26 +62,21 @@ def test_missing_qdrant_url(monkeypatch):


def test_openai_model_requires_key(monkeypatch):
monkeypatch.setenv("GITHUB_TOKEN", "gh-test")
set_valid_env(monkeypatch)
monkeypatch.setenv("MERGELORE_EMBED_MODEL", "text-embedding-3-small")
monkeypatch.setenv("MERGELORE_QDRANT_URL", "http://qdrant:6333")
monkeypatch.setenv("GITHUB_REPOSITORY", "owner/repo")
with pytest.raises(ValueError, match="OPENAI_API_KEY"):
load_config()


def test_cohere_model_requires_key(monkeypatch):
monkeypatch.setenv("GITHUB_TOKEN", "gh-test")
set_valid_env(monkeypatch)
monkeypatch.setenv("MERGELORE_EMBED_MODEL", "embed-english-v3.0")
monkeypatch.setenv("MERGELORE_QDRANT_URL", "http://qdrant:6333")
monkeypatch.setenv("GITHUB_REPOSITORY", "owner/repo")
with pytest.raises(ValueError, match="COHERE_API_KEY"):
load_config()


def test_missing_github_repository(monkeypatch):
monkeypatch.setenv("GITHUB_TOKEN", "gh-test")
monkeypatch.setenv("MERGELORE_EMBED_MODEL", "nomic-embed-text")
monkeypatch.setenv("MERGELORE_QDRANT_URL", "http://qdrant:6333")
with pytest.raises(ValueError, match="GITHUB_REPOSITORY"):
load_config()
30 changes: 18 additions & 12 deletions indexer/tests/test_embed_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,34 @@
from mergelore_indexer.embed.factory import create_embedding_provider


def test_creates_openai_provider():
provider = create_embedding_provider("text-embedding-3-small", openai_api_key="sk-test")
assert provider.name == "text-embedding-3-small"
assert provider.dimension == 1536
def test_creates_fastembed_default():
provider = create_embedding_provider("bge-small-en-v1.5")
assert provider.name == "fastembed"
assert provider.dimension == 384


def test_creates_fastembed_minilm():
provider = create_embedding_provider("all-MiniLM-L6-v2")
assert provider.name == "fastembed"
assert provider.dimension == 384


def test_creates_fastembed_nomic():
provider = create_embedding_provider("nomic-embed-text-v1.5")
assert provider.name == "fastembed"
assert provider.dimension == 768


def test_openai_requires_key():
with pytest.raises(ValueError, match="OPENAI_API_KEY"):
with pytest.raises((ValueError, ImportError)):
create_embedding_provider("text-embedding-3-small")


def test_cohere_requires_key():
with pytest.raises(ValueError, match="COHERE_API_KEY"):
with pytest.raises((ValueError, ImportError)):
create_embedding_provider("embed-english-v3.0")


def test_creates_cohere_provider():
provider = create_embedding_provider("embed-english-v3.0", cohere_api_key="co-test")
assert provider.name == "embed-english-v3.0"
assert provider.dimension == 1024


def test_unknown_model_raises():
with pytest.raises(ValueError, match="Unknown embedding model"):
create_embedding_provider("invalid-model")
21 changes: 10 additions & 11 deletions samples/tier2-qdrant-index.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
# Tier 2: Qdrant vector memory - PR indexer
# Runs after every merge to main. Extracts the merged PR content,
# chunks it, embeds it with OpenAI, and stores it in Qdrant.
# chunks it, embeds it locally with FastEmbed, and stores it in Qdrant.
# Use this together with tier2-qdrant-review.yml.
#
# No embedding API key needed - FastEmbed runs locally on CPU.
#
# Secrets needed:
# OPENAI_API_KEY - get one at https://platform.openai.com (for embeddings)
# QDRANT_URL - your Qdrant instance (e.g. https://xyz.cloud.qdrant.io)
# QDRANT_API_KEY - from Qdrant Cloud dashboard
#
# Embedding models (set via MERGELORE_EMBED_MODEL):
# text-embedding-3-small - OpenAI, 1536d, ~$0.02/1M tokens (default)
# nomic-embed-text - local CPU, 768d, free
# embed-english-v3.0 - Cohere, 1024d, ~$0.01/1M tokens
# bge-small-en-v1.5 - FastEmbed, 384d, fast, no API key (default)
# all-MiniLM-L6-v2 - FastEmbed, 384d, well-tested, no API key
# nomic-embed-text-v1.5 - FastEmbed, 768d, best quality, no API key
# text-embedding-3-small - OpenAI, 1536d, requires OPENAI_API_KEY and the [openai] extra
# embed-english-v3.0 - Cohere, 1024d, requires COHERE_API_KEY and the [cohere] extra

name: mergelore-index
on:
Expand All @@ -30,15 +33,11 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install indexer
run: pip install mergelore-indexer
- name: Index merged PR
run: mergelore-index
- run: pip install mergelore-indexer
- run: mergelore-index
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
MERGELORE_EMBED_MODEL: text-embedding-3-small
MERGELORE_QDRANT_URL: ${{ secrets.QDRANT_URL }}
QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_EVENT_PATH: ${{ github.event_path }}
Loading