Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions mteb/benchmarks/_create_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,16 @@ def _format_max_tokens(max_tokens: float | None) -> float | None:
return float(max_tokens)


def _get_embedding_size(embed_dim: int | list[int] | None) -> int | None:
if embed_dim is None:
return None
if isinstance(embed_dim, int):
return int(embed_dim)
if isinstance(embed_dim, list) and len(embed_dim) > 0:
return int(max(embed_dim))
return None


def _get_means_per_types(per_task: pd.DataFrame):
task_names_per_type = defaultdict(list)
for task_name in per_task.columns:
Expand Down Expand Up @@ -139,7 +149,7 @@ def _create_summary_table_from_benchmark_results(
joint_table.insert(
1,
"Embedding Dimensions",
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
        model_metas.map(lambda m: _get_embedding_size(m.embed_dim)),
)
joint_table.insert(
1,
Expand Down Expand Up @@ -382,7 +392,7 @@ def _create_summary_table_mean_public_private(
joint_table.insert(
1,
"Embedding Dimensions",
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
        model_metas.map(lambda m: _get_embedding_size(m.embed_dim)),
)
joint_table.insert(
1,
Expand Down Expand Up @@ -503,7 +513,7 @@ def _create_summary_table_mean_subset(
joint_table.insert(
1,
"Embedding Dimensions",
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
        model_metas.map(lambda m: _get_embedding_size(m.embed_dim)),
)
joint_table.insert(
1,
Expand Down Expand Up @@ -621,7 +631,7 @@ def _create_summary_table_mean_task_type(
joint_table.insert(
1,
"Embedding Dimensions",
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
        model_metas.map(lambda m: _get_embedding_size(m.embed_dim)),
)
joint_table.insert(
1,
Expand Down
5 changes: 4 additions & 1 deletion mteb/models/get_model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ def get_model(
model_name: str,
revision: str | None = None,
device: str | None = None,
*,
embed_dim: int | None = None,
**kwargs: Any,
) -> MTEBModels:
"""A function to fetch and load model object by name.
Expand All @@ -104,13 +106,14 @@ def get_model(
model_name: Name of the model to fetch
revision: Revision of the model to fetch
device: Device used to load the model
embed_dim: Optional embedding dimension to load the model with. This is only used for models that support loading with a specified embedding dimension, and will be ignored for other models.
**kwargs: Additional keyword arguments to pass to the model loader

Returns:
A model object
"""
meta = get_model_meta(model_name, revision).model_copy(deep=True)
model = meta.load_model(device=device, **kwargs)
model = meta.load_model(device=device, embed_dim=embed_dim, **kwargs)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about letting embed_dim be part of kwargs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added it to args of get_model for more visibility of this feature


if kwargs:
logger.info(
Expand Down
4 changes: 2 additions & 2 deletions mteb/models/model_implementations/jina_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ def encode(
n_embedding_parameters=155582464,
memory_usage_mb=1137.0,
max_tokens=32768,
embed_dim=1024,
embed_dim=[32, 64, 128, 256, 512, 768, 1024],
license="cc-by-nc-4.0",
similarity_fn_name=ScoringFunction.COSINE,
framework=[
Expand Down Expand Up @@ -858,7 +858,7 @@ def encode(
n_embedding_parameters=98500608,
memory_usage_mb=404.0,
max_tokens=8192,
embed_dim=768,
embed_dim=[32, 64, 128, 256, 512, 768],
license="cc-by-nc-4.0",
similarity_fn_name=ScoringFunction.COSINE,
framework=[
Expand Down
4 changes: 2 additions & 2 deletions mteb/models/model_implementations/random_baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,11 @@ def __init__(
revision: str | None,
array_framework: Literal["numpy", "torch"] = "numpy",
dtype: torch.dtype | np.floating = np.float32,
embed_dim: int = _EMBEDDING_DIM,
embed_dim: int | None = None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the change here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To match with SentenceTransformerWrapper

**kwargs: Any,
) -> None:
self.rng_state = np.random.default_rng(42)
self.embedding_dim = embed_dim
self.embedding_dim = embed_dim or _EMBEDDING_DIM
self.array_framework = array_framework
self.dtype = dtype

Expand Down
38 changes: 35 additions & 3 deletions mteb/models/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class ModelMeta(BaseModel):
max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary
models).
embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
If annotated as list this will be treated as a range of possible embedding dimensions (Matryoshka).
revision: The revision number of the model. If None, it is assumed that the metadata (including the loader) is valid for all revisions of the model.
release_date: The date the model's revision was released. If None, then release date will be added based on 1st commit in hf repository of model.
license: The license under which the model is released. Required if open_weights is True.
Expand Down Expand Up @@ -146,7 +147,7 @@ class ModelMeta(BaseModel):
n_embedding_parameters: int | None = None
memory_usage_mb: float | None
max_tokens: float | None
embed_dim: int | None
embed_dim: int | list[int] | None
license: Licenses | StrURL | None
open_weights: bool | None
public_training_code: str | None
Expand Down Expand Up @@ -274,7 +275,13 @@ def _check_name(cls, v: str | None) -> str | None:
)
return v

def load_model(self, device: str | None = None, **kwargs: Any) -> MTEBModels:
def load_model(
self,
device: str | None = None,
*,
embed_dim: int | None = None,
**kwargs: Any,
) -> MTEBModels:
"""Loads the model using the specified loader function."""
if self.loader is None:
raise NotImplementedError(
Expand All @@ -283,6 +290,26 @@ def load_model(self, device: str | None = None, **kwargs: Any) -> MTEBModels:
if self.name is None:
raise ValueError("name is not set for ModelMeta. Cannot load model.")

if embed_dim is not None:
if (
self.embed_dim is not None
and isinstance(self.embed_dim, int)
and self.embed_dim != embed_dim
):
raise ValueError(
f"Requested embedding dimension {embed_dim} does not match the model's embedding dimension {self.embed_dim}."
                    " Model does not support loading with a different embedding dimension."
)
elif isinstance(self.embed_dim, list) and embed_dim not in self.embed_dim:
raise ValueError(
f"Requested embedding dimension {embed_dim} is not in the model's supported embedding dimensions {self.embed_dim}."
)
self.embed_dim = embed_dim
if self.experiment_kwargs is None:
self.experiment_kwargs = {"embed_dim": embed_dim}
else:
self.experiment_kwargs["embed_dim"] = embed_dim

if self.experiment_kwargs is None:
self.experiment_kwargs = kwargs if len(kwargs) > 0 else None
elif len(kwargs) > 0 and self.experiment_kwargs is not None:
Expand All @@ -295,7 +322,12 @@ def load_model(self, device: str | None = None, **kwargs: Any) -> MTEBModels:
if device is not None:
_kwargs["device"] = device

model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
model: MTEBModels = self.loader(
self.name,
revision=self.revision,
embed_dim=embed_dim,
**_kwargs,
)
model.mteb_model_meta = self # type: ignore[misc]
return model

Expand Down
9 changes: 8 additions & 1 deletion mteb/models/sentence_transformer_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ def __init__(
revision: str | None = None,
device: str | None = None,
model_prompts: dict[str, str] | None = None,
*,
embed_dim: int | None = None,
**kwargs,
) -> None:
"""Wrapper for SentenceTransformer models.
Expand All @@ -66,13 +68,18 @@ def __init__(
First priority is given to the composed prompt of task name + prompt type (query or passage), then to the specific task prompt,
then to the composed prompt of task type + prompt type, then to the specific task type prompt,
and finally to the specific prompt type.
embed_dim: The embedding dimension of the model to use.
**kwargs: Additional arguments to pass to the SentenceTransformer model.
"""
from sentence_transformers import SentenceTransformer

if isinstance(model, str):
self.model = SentenceTransformer(
model, revision=revision, device=device, **kwargs
model,
revision=revision,
device=device,
truncate_dim=embed_dim,
**kwargs,
)
else:
self.model = model
Expand Down
Loading