Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions rre-tools/embedding-model-evaluator/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ candidates_path: "resources/mteb_datasets/nfcorpus/test/candidates.jsonl"
# - graded: 0 (not relevant), 1 (maybe ok), 2 (that’s my result)
relevance_scale: "graded"

# (Optional) Path to write mteb output, if not given it will be written to output dir in the root folder
# (Optional) Path to write mteb output; if not given it will be written to the resources dir in the root folder
output_dest: "resources"

# (Optional) Path to write mteb document and query embeddings, if not given it will be written to output/embeddings dir
# (Optional) Path to write mteb document and query embeddings, if not given it will be written to resources/embeddings dir
embeddings_dest: "resources/embeddings"

Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from pathlib import Path

# Lookup from the short task key used in config/CLI to the task class name
# registered with MTEB.
TASKS_NAME_MAPPING = dict(
    retrieval="CustomRetrievalTask",
    reranking="CustomRerankingTask",
)

# Shared on-disk location for cached embeddings.
CACHE_PATH = Path("resources/cache")

__all__ = [
    "TASKS_NAME_MAPPING",
    "CACHE_PATH",
]
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ class Config(BaseModel):
dataset_name: str = Field("custom-dataset", description="Dataset name for MTEB task")
split: str = Field("test", description="Dataset split (train/dev/test)")
output_dest: Optional[Path] = Field(
None, description="Path to save mteb output, by default saved in output dir."
None, description="Path to save mteb output, by default saved in resource dir."
)
embeddings_dest: Optional[Path] = Field(
None,
description="Path to save mteb embeddings, by default saved in <output/embeddings> folder.",
description="Path to save mteb embeddings, by default saved in <resource/embeddings> folder.",
)

@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from mteb.overview import TASKS_REGISTRY

from embedding_model_evaluator.config import Config
from embedding_model_evaluator.utilities.helper import read_corpus, read_queries, read_candidates
from embedding_model_evaluator.utilities.helper import read_corpus_reranking, read_queries, read_candidates

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -92,7 +92,7 @@ def load_data(self, config: Config, **kwargs: Any) -> None:
"Pass your internal Config via MTEB.run(..., config=Config)."
)

corpus = read_corpus(config.corpus_path)
corpus = read_corpus_reranking(config.corpus_path)
queries = read_queries(config.queries_path)
candidates = read_candidates(config.candidates_path)["candidates"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from mteb.overview import TASKS_REGISTRY

from embedding_model_evaluator.config import Config
from embedding_model_evaluator.utilities.helper import read_corpus, read_queries, read_candidates
from embedding_model_evaluator.utilities.helper import read_corpus_retrieval, read_queries, read_candidates

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -37,7 +37,7 @@ class CustomRetrievalTask(AbsTaskRetrieval):

def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.corpus: dict[str, dict[str, dict[str, str]]] = {}
self.corpus: dict[str, dict[str, str]] = {}
self.queries: dict[str, dict[str, str]] = {}
self.relevant_docs: dict[str, dict[str, dict[str, int]]] = {}

Expand All @@ -51,7 +51,7 @@ def load_data(self, config: Config | None, **kwargs: Any) -> None:
log.error(message)
raise ValueError(message)

self.corpus = {"test": read_corpus(config.corpus_path)}
self.corpus = {"test": read_corpus_retrieval(config.corpus_path)}
self.queries = {"test": read_queries(config.queries_path)}
self.relevant_docs = {
"test": read_candidates(config.candidates_path)["relevant_docs"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

import argparse
import logging
from pathlib import Path
from typing import Any

import mteb
Expand All @@ -25,19 +24,13 @@
CustomRetrievalTask,
)
from embedding_model_evaluator.writers import EmbeddingWriter
from embedding_model_evaluator import TASKS_NAME_MAPPING, CACHE_PATH
from commons.logger import configure_logging # type: ignore[import]

log = logging.getLogger(__name__)

CACHE_PATH = Path("resources/cache")
CACHE_PATH.mkdir(parents=True, exist_ok=True)

# Map simple "task key" -> registered MTEB task class name
TASKS_NAME_MAPPING = {
"retrieval": "CustomRetrievalTask",
"reranking": "CustomRerankingTask",
}


def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Embedding Model Evaluator (MTEB, in-memory).")
Expand All @@ -48,8 +41,18 @@ def _parse_args() -> argparse.Namespace:
required=False,
default="embedding-model-evaluator/config.yaml",
)
parser.add_argument('-v', '--verbose', action='store_true',
help='Activate debug mode for logging [default: False]')

return parser.parse_args()

def setup_logging(verbose: bool = False) -> None:
    """Configure application-wide logging.

    Args:
        verbose: When True, log at DEBUG level; otherwise at INFO.
    """
    # Single call with a conditional level replaces the duplicated
    # if/else branches; the bare trailing `return` was redundant.
    configure_logging(logging.DEBUG if verbose else logging.INFO)


def _build_task(task_key: str, dataset_name: str, split: str) -> Any:
"""
Expand Down Expand Up @@ -79,8 +82,8 @@ def _build_task(task_key: str, dataset_name: str, split: str) -> Any:


def main() -> None:
configure_logging()
args = _parse_args()
setup_logging(args.verbose)
config: Config = Config.load(args.config)

# --- Sanity logs (explicit & helpful) ---
Expand All @@ -89,7 +92,9 @@ def main() -> None:

# --- Model + caching wrapper ---
model = mteb.get_model(config.model_id)
model_with_cache = CachedEmbeddingWrapper(model, cache_path=CACHE_PATH)
model_name_additional_path = config.model_id.replace("/", "__").replace(" ", "_")
model_with_cache_path = CACHE_PATH / model_name_additional_path
model_with_cache = CachedEmbeddingWrapper(model, cache_path=model_with_cache_path)

# --- Task instance (in-memory) ---
try:
Expand Down Expand Up @@ -117,7 +122,7 @@ def main() -> None:
writer = EmbeddingWriter(
config=config,
cached=model_with_cache,
cache_path=CACHE_PATH,
cache_path=model_with_cache_path,
task_name=TASKS_NAME_MAPPING.get(config.task_to_evaluate, "CustomRetrievalTask"),
batch_size=256,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,20 @@

from jsonlines import jsonlines


def read_corpus(path: Path) -> dict[str, dict[str, str]]:
def read_corpus_reranking(path: Path) -> dict[str, dict[str, str]]:
    """Read a JSONL corpus file for reranking.

    Returns a mapping of document id to its {"title": ..., "text": ...}
    record; every row must provide "id", "title" and "text".
    """
    with jsonlines.open(path) as rows:
        return {
            row["id"]: {"title": row["title"], "text": row["text"]}
            for row in rows
        }

def read_corpus_retrieval(path: Path) -> dict[str, str]:
corpus_dict: dict[str, str] = {}
with jsonlines.open(path) as rows:
for row in rows:
corpus_dict[row["id"]] = row.get("title", "") + " " + row["text"]
return corpus_dict


def read_queries(path: Path) -> dict[str, str]:
queries_dict: dict[str, str] = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

from embedding_model_evaluator.config import Config
from embedding_model_evaluator.custom_tasks.reranking_task import compose_text
from embedding_model_evaluator.utilities.helper import read_corpus, read_queries
from embedding_model_evaluator.utilities.helper import read_corpus_retrieval, read_corpus_reranking, read_queries
from embedding_model_evaluator import TASKS_NAME_MAPPING

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,21 +51,29 @@ def write(self, embedding_path: str | Path | None) -> None:
"""
Write embeddings to <embedding_path>.
"""
# by default embeddings will be written into <output/embeddings>
# by default embeddings will be written into <resources/embeddings>
if embedding_path is None:
embedding_path = "output/embeddings"
embedding_path = "resources/embeddings"

path = Path(embedding_path)
path.mkdir(parents=True, exist_ok=True)

# documents
documents_path = path / "documents_embeddings.jsonl"
doc_dict = read_corpus(Path(self.config.corpus_path))
doc_ids = list(doc_dict.keys())
doc_texts = [
compose_text(doc_dict[_id].get("title"), doc_dict[_id].get("text"))
for _id in doc_ids
]
if self.task_name == TASKS_NAME_MAPPING["retrieval"]:
doc_dict_retrieval= read_corpus_retrieval(Path(self.config.corpus_path))
doc_ids = list(doc_dict_retrieval.keys())
doc_texts = list(doc_dict_retrieval.values())
elif self.task_name == TASKS_NAME_MAPPING["reranking"]:
doc_dict_reranking = read_corpus_reranking(Path(self.config.corpus_path))
doc_ids = list(doc_dict_reranking.keys())
doc_texts = [
compose_text(doc_dict_reranking[_id].get("title"), doc_dict_reranking[_id].get("text"))
for _id in doc_ids
]
else:
raise ValueError(f"Unknown task: {self.task_name}")


doc_vectors = self.cached.encode(
texts=doc_texts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from embedding_model_evaluator.config import Config
from embedding_model_evaluator.writers.embedding_writer import EmbeddingWriter
from embedding_model_evaluator import TASKS_NAME_MAPPING


@pytest.fixture
Expand All @@ -17,8 +18,7 @@ def config() -> Config:


def _create_fake_cache_wrapper(
doc_vectors: Sequence[Sequence[float] | np.ndarray],
query_vectors: Sequence[Sequence[float] | np.ndarray],
vectors: Sequence[Sequence[float] | np.ndarray]
) -> CachedEmbeddingWrapper:
cached: CachedEmbeddingWrapper = create_autospec(
CachedEmbeddingWrapper, instance=True
Expand All @@ -31,11 +31,8 @@ def _encode(
batch_size: int,
) -> np.ndarray:
assert batch_size == 32
if task_name.endswith("-corpus"):
return np.vstack([np.asarray(vector) for vector in doc_vectors])
if task_name.endswith("-queries"):
return np.vstack([np.asarray(vector) for vector in query_vectors])
raise AssertionError(f"Unexpected encode name: {task_name}")
assert task_name == TASKS_NAME_MAPPING["retrieval"]
return np.vstack([np.asarray(vector) for vector in vectors])

cached.encode.side_effect = _encode
return cached
Expand All @@ -45,17 +42,21 @@ def test_embeddings_writer_with_valid_inputs__expects__creates_jsonl_files_with_
config: Config, tmp_path: Path
) -> None:
doc_vectors = [[0.1, 0.2, 0.3]]
cached_doc = _create_fake_cache_wrapper(
vectors=doc_vectors
)
query_vectors = [[1.0, 1.1, 1.2]]
cached = _create_fake_cache_wrapper(
doc_vectors=doc_vectors, query_vectors=query_vectors
cached_query = _create_fake_cache_wrapper(
vectors=query_vectors
)

config.embeddings_dest = tmp_path / "output" / "embeddings"

writer = EmbeddingWriter(
config=config,
cached=cached,
cached=cached_doc,
cache_path=tmp_path / "cache",
task_name="test_custom_task-corpus",
task_name=TASKS_NAME_MAPPING["retrieval"],
batch_size=32,
)

Expand All @@ -72,9 +73,9 @@ def test_embeddings_writer_with_valid_inputs__expects__creates_jsonl_files_with_
# recreating again because of fake cached embedding wrapper for queries and corpus vectors
writer = EmbeddingWriter(
config=config,
cached=cached,
cached=cached_query,
cache_path=tmp_path / "cache",
task_name="test_custom_task-queries",
task_name=TASKS_NAME_MAPPING["retrieval"],
batch_size=32,
)

Expand Down
23 changes: 10 additions & 13 deletions rre-tools/tests/test_cross_plataform.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from commons.writers.quepid_writer import QuepidWriter
from embedding_model_evaluator.config import Config as MTEBConfig
from embedding_model_evaluator.writers.embedding_writer import EmbeddingWriter
from embedding_model_evaluator import TASKS_NAME_MAPPING
from dataset_generator.config import Config as DGConfig


Expand Down Expand Up @@ -40,16 +41,11 @@ def test_save_and_load_nested_paths__expects__data_persisted_correctly(tmp_path:
# ---------------- embedding-model-evaluator ----------------

class _FakeCache:
def __init__(self, doc_vectors: Sequence[Sequence[float]], query_vectors: Sequence[Sequence[float]]):
self._doc_vectors = doc_vectors
self._query_vectors = query_vectors
def __init__(self, vectors: Sequence[Sequence[float]]):
self._vectors = vectors

def encode(self, texts, *, task_name: str, batch_size: int):
if task_name.endswith("-corpus"):
return self._doc_vectors
if task_name.endswith("-queries"):
return self._query_vectors
raise AssertionError(f"Unexpected encode name: {task_name}")
return self._vectors

def close(self) -> None:
pass
Expand All @@ -65,13 +61,14 @@ def test_embedding_writer_with_nested_dirs__expects__creates_files_in_nested_dir

dest = tmp_path / "out" / "embeddings"

cached = _FakeCache(doc_vectors=[[0.1, 0.2, 0.3]], query_vectors=[[1.0, 1.1, 1.2]])
cached_doc = _FakeCache(vectors=[[0.1, 0.2, 0.3]])
cached_query = _FakeCache(vectors=[[1.0, 1.1, 1.2]])

writer = EmbeddingWriter(
config=cfg,
cached=cached,
cached=cached_doc,
cache_path=tmp_path / "cache",
task_name="test_custom_task-corpus",
task_name=TASKS_NAME_MAPPING["retrieval"],
batch_size=32,
)

Expand All @@ -85,9 +82,9 @@ def test_embedding_writer_with_nested_dirs__expects__creates_files_in_nested_dir

writer = EmbeddingWriter(
config=cfg,
cached=cached,
cached=cached_query,
cache_path=tmp_path / "cache",
task_name="test_custom_task-queries",
task_name=TASKS_NAME_MAPPING["retrieval"],
batch_size=32,
)

Expand Down