Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@ jobs:
python-version: "3.12"

- name: Install dependencies
run: pip install -e "python/[dev]"
run: pip install -e "python/[dev,serve]"

- name: Run tests
run: |
cd python
pytest tests/ -v --ignore=tests/test_model.py
pytest tests/ -v \
--ignore=tests/test_model.py \
--ignore=tests/test_server_live.py
313 changes: 296 additions & 17 deletions cactus-engine/src/model.cpp

Large diffs are not rendered by default.

27 changes: 23 additions & 4 deletions cactus-engine/src/rag.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,13 @@ std::string retrieve_rag_context(CactusModelHandle* handle, const std::string& q
std::vector<uint32_t> query_tokens = tokenizer->encode(query);
if (query_tokens.empty()) return "";

std::vector<float> query_embedding = handle->model->get_embeddings(query_tokens, true, true);
std::vector<float> query_embedding;
try {
query_embedding = handle->model->get_embeddings(query_tokens, true, true);
} catch (const std::exception& e) {
CACTUS_LOG_WARN("rag", "get_embeddings unavailable, skipping RAG context: " << e.what());
return "";
}
if (query_embedding.size() != handle->corpus_embedding_dim) {
CACTUS_LOG_WARN("rag", "Query embedding dimension mismatch");
return "";
Expand Down Expand Up @@ -262,8 +268,15 @@ std::vector<cactus::ffi::ToolFunction> select_relevant_tools(

std::vector<uint32_t> tokens = tokenizer->encode(text);
if (!tokens.empty()) {
std::vector<float> emb = handle->model->get_embeddings(tokens, true, true);
handle->tool_embeddings.push_back(std::move(emb));
try {
std::vector<float> emb = handle->model->get_embeddings(tokens, true, true);
handle->tool_embeddings.push_back(std::move(emb));
} catch (const std::exception& e) {
CACTUS_LOG_WARN("tool_rag", "get_embeddings unavailable, returning all tools: " << e.what());
handle->tool_texts.clear();
handle->tool_embeddings.clear();
return all_tools;
}
} else {
handle->tool_embeddings.push_back({});
}
Expand All @@ -277,7 +290,13 @@ std::vector<cactus::ffi::ToolFunction> select_relevant_tools(
return all_tools;
}

std::vector<float> query_embedding = handle->model->get_embeddings(query_tokens, true, true);
std::vector<float> query_embedding;
try {
query_embedding = handle->model->get_embeddings(query_tokens, true, true);
} catch (const std::exception& e) {
CACTUS_LOG_WARN("tool_rag", "get_embeddings unavailable, returning all tools: " << e.what());
return all_tools;
}
if (query_embedding.empty()) {
CACTUS_LOG_WARN("tool_rag", "Failed to get query embedding, returning all tools");
return all_tools;
Expand Down
5 changes: 4 additions & 1 deletion cactus-engine/src/sp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,8 +235,11 @@ std::string SPTokenizer::preprocess_text(const std::string& text) const {
}

std::string processed = "";
if (sp_add_dummy_prefix_) processed += "\xE2\x96\x81";

for (size_t i = text.find_first_not_of(" "); i < text.length(); i++) {
size_t start = text.find_first_not_of(" ");
if (start == std::string::npos) return processed;
for (size_t i = start; i < text.length(); i++) {
char c = text[i];
if (c == ' ') {
processed += "▁";
Expand Down
6 changes: 3 additions & 3 deletions python/cactus/bindings/cactus.py
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,7 @@ def cactus_embed(model, text, normalize=True):
"""
buf = (ctypes.c_float * 4096)()
dim = ctypes.c_size_t()
rc = _lib.cactus_embed(model, _enc(text), buf, 4096, ctypes.byref(dim), normalize)
rc = _lib.cactus_embed(model, _enc(text), buf, ctypes.sizeof(buf), ctypes.byref(dim), normalize)
if rc < 0:
raise RuntimeError(_err("Embedding failed"))
return list(buf[:dim.value])
Expand All @@ -1071,7 +1071,7 @@ def cactus_image_embed(model, image_path):
"""Compute an image embedding. Returns a list of floats."""
buf = (ctypes.c_float * 4096)()
dim = ctypes.c_size_t()
rc = _lib.cactus_image_embed(model, _enc(image_path), buf, 4096, ctypes.byref(dim))
rc = _lib.cactus_image_embed(model, _enc(image_path), buf, ctypes.sizeof(buf), ctypes.byref(dim))
if rc < 0:
raise RuntimeError(_err("Image embedding failed"))
return list(buf[:dim.value])
Expand All @@ -1081,7 +1081,7 @@ def cactus_audio_embed(model, audio_path):
"""Compute an audio embedding. Returns a list of floats."""
buf = (ctypes.c_float * 4096)()
dim = ctypes.c_size_t()
rc = _lib.cactus_audio_embed(model, _enc(audio_path), buf, 4096, ctypes.byref(dim))
rc = _lib.cactus_audio_embed(model, _enc(audio_path), buf, ctypes.sizeof(buf), ctypes.byref(dim))
if rc < 0:
raise RuntimeError(_err("Audio embedding failed"))
return list(buf[:dim.value])
Expand Down
26 changes: 23 additions & 3 deletions python/cactus/convert/cactus_adapters/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,16 @@ def convert_hf_tokenizer(tokenizer, output_dir, token=None, model_id=None, label

tokenizer_model = tokenizer_json_data.get("model", {}) if tokenizer_json_data else {}
tokenizer_model_type = str(tokenizer_model.get("type", "")).upper()
is_sentencepiece = tokenizer_model_type != "BPE" and model_type in SENTENCEPIECE_MODEL_TYPES
is_unigram = tokenizer_model_type == "UNIGRAM"
is_sentencepiece = is_unigram or (tokenizer_model_type != "BPE" and model_type in SENTENCEPIECE_MODEL_TYPES)

# Unigram (e.g. XLM-RoBERTa, used by nomic-embed) carries per-token scores that
# the SentencePiece runtime needs for Viterbi segmentation.
unigram_scores: dict[int, float] = {}
if is_unigram and isinstance(tokenizer_model.get("vocab"), list):
for token_id, entry in enumerate(tokenizer_model["vocab"]):
if isinstance(entry, (list, tuple)) and len(entry) == 2:
unigram_scores[token_id] = float(entry[1])

if tokenizer_model_type == "BPE" and tokenizer_model.get("vocab"):
vocab = tokenizer_model["vocab"]
Expand Down Expand Up @@ -376,8 +385,12 @@ def write_merges_file(merges_list):
with open(vocab_output, 'w', encoding='utf-8') as f:
for token_id, token_str in enumerate(id_to_token):
if token_str:
f.write(f"{token_id}\t{token_str}\n")
print(f" Saved tokenizer vocabulary (ID\\ttoken format)")
if unigram_scores:
f.write(f"{token_id}\t{token_str}\t{unigram_scores.get(token_id, 0.0)}\n")
else:
f.write(f"{token_id}\t{token_str}\n")
vocab_fmt = "ID\\ttoken\\tscore" if unigram_scores else "ID\\ttoken"
print(f" Saved tokenizer vocabulary ({vocab_fmt} format)")

special_tokens_output = output_dir / "special_tokens.json"
with open(special_tokens_output, 'w', encoding='utf-8') as f:
Expand Down Expand Up @@ -410,6 +423,9 @@ def write_merges_file(merges_list):
decoder = "byte_level"
elif is_sentencepiece:
tokenizer_type = "sentencepiece"
if is_unigram:
normalizer = "metaspace"
decoder = "replace_metaspace"

tokenizer_config_output = output_dir / "tokenizer_config.txt"
with open(tokenizer_config_output, 'w') as f:
Expand All @@ -422,6 +438,10 @@ def write_merges_file(merges_list):
f.write(f"normalizer={normalizer}\n")
f.write(f"decoder={decoder}\n")
f.write(f"byte_fallback={'true' if byte_fallback else 'false'}\n")
if is_unigram:
f.write("sp_model_type=unigram\n")
f.write("sp_add_dummy_prefix=true\n")
f.write("sp_byte_fallback=false\n")

if chat_template_data:
f.write("has_chat_template=true\n")
Expand Down
13 changes: 13 additions & 0 deletions python/cactus/convert/model_adapters/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,22 @@ def name_tensor(self, source_name: str, tensor: Any, num_layers: int | None) ->
norm2 = _nomic_layer_suffix(source_name, ".norm2.weight")
if norm2 is not None:
return NameMatch(source_name, f"layer_{norm2}_norm2.weights", "language", True, hf_name=source_name, adapter_name=source_name)
for suffix, template, transpose in (
(".attn.Wqkv.weight", "layer_{i}_attn_qkv.weights", False),
(".attn.Wqkv.bias", "layer_{i}_attn_qkv.bias", False),
(".mlp.experts.mlp.w1", "layer_{i}_mlp_experts_w1.weights", False),
(".mlp.experts.mlp.w2", "layer_{i}_mlp_experts_w2.weights", True),
):
layer = _nomic_layer_suffix(source_name, suffix)
if layer is not None:
return NameMatch(source_name, template.format(i=layer), "language", True, transpose, hf_name=source_name, adapter_name=source_name)
return cactus_name_for_tensor(source_name, self.family, num_layers)

def policy(self, match: NameMatch, shape: tuple[int, ...], requested_bits: int) -> TensorPolicy:
# The MoE router is tiny ([num_experts, hidden]) but decides expert selection;
# 4-bit quantizing it corrupts routing, so keep it in FP16.
if ".mlp.router.layer.weight" in match.source_name:
return TensorPolicy("fallback", "FP16", None, match.component, False, "none", "moe router precision-sensitive")
policy = super().policy(match, shape, requested_bits)
if policy.use_gptq and ".mlp.experts.mlp." in match.source_name:
return replace(policy, use_gptq=False)
Expand Down
61 changes: 61 additions & 0 deletions python/cactus/convert/tests/test_lfm2_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from __future__ import annotations

from cactus.convert.model_adapters.adapters import adapter_for_family


def test_lfm2_vl_adapter_selects_runtime_safe_model_class():
    """The lfm2 adapter maps the VL architecture name to the matching transformers class.

    Builds a minimal config naming ``Lfm2VlForConditionalGeneration`` as the
    architecture and checks that ``model_class`` returns that exact class object.
    """
    # Imported lazily so that collecting this module does not require transformers.
    from transformers import Lfm2VlForConditionalGeneration

    config = {
        "model_type": "lfm2",
        "architectures": ["Lfm2VlForConditionalGeneration"],
    }
    selected = adapter_for_family("lfm2").model_class(config)
    assert selected is Lfm2VlForConditionalGeneration


def test_lfm2_processor_fallback_handles_tokenizers_backend(tmp_path):
    """``load_processor`` yields an ``Lfm2VlProcessor`` for a ``TokenizersBackend`` checkpoint.

    A tiny on-disk checkpoint is assembled in ``tmp_path`` (a word-level
    ``tokenizer.json``, a ``tokenizer_config.json`` whose ``tokenizer_class`` is
    ``TokenizersBackend``, and a ``preprocessor_config.json``). The test then
    checks the type of the loaded processor and that its image token resolves to
    the id assigned in the vocabulary.
    """
    import json

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace
    from transformers import Lfm2VlProcessor

    # Five-entry vocabulary; "<image>" is id 3, which the final assertion relies on.
    vocab = {"<|pad|>": 0, "<|startoftext|>": 1, "<|im_end|>": 2, "<image>": 3, "hello": 4}
    word_level = Tokenizer(WordLevel(vocab, unk_token="<|pad|>"))
    word_level.pre_tokenizer = Whitespace()
    word_level.save(str(tmp_path / "tokenizer.json"))

    tokenizer_config = {
        "tokenizer_class": "TokenizersBackend",
        "bos_token": "<|startoftext|>",
        "eos_token": "<|im_end|>",
        "pad_token": "<|pad|>",
        "image_token": "<image>",
        "image_start_token": "<image>",
        "image_end_token": "<image>",
        "image_thumbnail": "<image>",
    }
    preprocessor_config = {
        "image_processor_type": "Lfm2VlImageProcessorFast",
        "do_resize": True,
        "size": {"height": 512, "width": 512},
        "do_rescale": True,
        "rescale_factor": 1 / 255,
        "do_normalize": True,
        "image_mean": [0.5, 0.5, 0.5],
        "image_std": [0.5, 0.5, 0.5],
        "do_pad": True,
        "data_format": "channels_first",
    }
    # Written in the same order as before: tokenizer config first, then preprocessor config.
    config_files = (
        ("tokenizer_config.json", tokenizer_config),
        ("preprocessor_config.json", preprocessor_config),
    )
    for filename, payload in config_files:
        (tmp_path / filename).write_text(json.dumps(payload), encoding="utf-8")

    loaded = adapter_for_family("lfm2").load_processor(str(tmp_path))
    assert isinstance(loaded, Lfm2VlProcessor)
    assert loaded.image_token == "<image>"
    assert loaded.image_token_id == 3
113 changes: 113 additions & 0 deletions python/cactus/convert/tests/test_naming_qdq.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from cactus.convert.cactus_adapters.tensor_io import save_tensor_with_header
from cactus.convert.export.qdq import convert_qdq
from cactus.convert.model_adapters.adapters import adapter_for_family
from cactus.convert.model_adapters.naming import cactus_name_for_tensor, restore_hf_key_for_family


Expand Down Expand Up @@ -289,3 +290,115 @@ def test_parakeet_batchnorm_tracking_tensors_are_ignored():
match = cactus_name_for_tensor("encoder.layers.0.conv.norm.num_batches_tracked", "parakeet", 24)
assert match.recognized
assert match.output_name is None


def test_nomic_normalizes_global_tensors():
    """``normalize_state_dict`` renames and fuses the nomic global (non-layer) tensors.

    The word and token-type embeddings are expected to collapse into a single
    ``token_embeddings`` tensor whose values match their broadcast sum
    (1 + 2 = 3), with provenance recording both source names. The embedding
    layer-norm tensors are expected under ``embedding_layernorm.*``.
    """
    adapter = adapter_for_family("nomic")
    word_key = "embeddings.word_embeddings.weight"
    type_key = "embeddings.token_type_embeddings.weight"
    raw_state = {
        word_key: torch.ones(4, 3),
        type_key: torch.full((1, 3), 2.0),
        "emb_ln.weight": torch.arange(3.0),
        "emb_ln.bias": torch.arange(3.0) + 10,
    }

    result = adapter.normalize_state_dict(raw_state)

    expected_keys = {"token_embeddings", "embedding_layernorm.weight", "embedding_layernorm.bias"}
    assert set(result.state_dict) == expected_keys
    assert torch.equal(result.state_dict["token_embeddings"], torch.full((4, 3), 3.0))

    fused_provenance = result.provenance["token_embeddings"]
    assert fused_provenance.source_names == [word_key, type_key]
    assert fused_provenance.qdq_restore == "adapter_key"

    # Normalized keys must then map to the expected on-disk tensor names.
    expected_output_names = (
        ("token_embeddings", "token_embeddings.weights"),
        ("embedding_layernorm.weight", "embedding_layernorm.weight"),
    )
    for key, output_name in expected_output_names:
        assert adapter.name_tensor(key, result.state_dict[key], 12).output_name == output_name


def test_nomic_norm2_weight_uses_runtime_name():
    """A per-layer ``norm2.weight`` is recognized and named ``layer_<i>_norm2.weights``."""
    source_name = "encoder.layers.3.norm2.weight"
    resolved = adapter_for_family("nomic").name_tensor(source_name, torch.ones(768), 12)
    assert resolved.recognized
    assert resolved.output_name == "layer_3_norm2.weights"


def test_nomic_keeps_qkv_and_moe_experts_fused():
    """Fused QKV and MoE expert weights are emitted as one tensor per HF parameter.

    The v2 transpile path binds graph weights by their HF parameter name, so the
    converter must not split q/k/v or individual experts. ``w2`` is additionally
    stored transposed so the second expert matmul can consume it as a direct
    linear weight in the transpiled graph.
    """
    adapter = adapter_for_family("nomic")
    adapter.num_experts = 8

    # (HF source name, input tensor, expected output name, expected emitted shape)
    cases = (
        (
            "encoder.layers.0.attn.Wqkv.weight",
            torch.arange(2304 * 2, dtype=torch.float32).reshape(2304, 2),
            "layer_0_attn_qkv.weights",
            (2304, 2),
        ),
        (
            "encoder.layers.1.mlp.experts.mlp.w1",
            torch.empty(24576, 2),
            "layer_1_mlp_experts_w1.weights",
            (24576, 2),
        ),
        (
            # Only w2 changes shape: it is expected to come out transposed.
            "encoder.layers.1.mlp.experts.mlp.w2",
            torch.empty(24576, 2),
            "layer_1_mlp_experts_w2.weights",
            (2, 24576),
        ),
    )
    for source_name, tensor, expected_name, expected_shape in cases:
        match = adapter.name_tensor(source_name, tensor, 12)
        emitted = adapter.expand_tensor(match, tensor)
        assert [item.output_name for item in emitted] == [expected_name]
        assert tuple(emitted[0].tensor.shape) == expected_shape


def test_nomic_qdq_runtime_keys_are_unique(tmp_path):
    """Manifest entries sharing one HF source name still restore to distinct keys.

    Two FP16 tensors are saved and described by a manifest in which both entries
    carry the same ``source_name``/``hf_name`` but different ``output_file``
    values and ``qdq_restore`` set to ``runtime_key``. After ``convert_qdq`` the
    safetensors output must contain both tensors under separate keys.
    """
    cactus_dir = tmp_path / "cactus"
    qdq_dir = tmp_path / "qdq"
    cactus_dir.mkdir()

    weight_files = (
        ("layer_0_attn_q.weights", torch.ones(2, 3)),
        ("layer_0_attn_k.weights", torch.ones(2, 3) * 2),
    )
    for filename, tensor in weight_files:
        save_tensor_with_header(tensor, cactus_dir / filename, precision="FP16")

    manifest_text = """[
{
"source_name": "encoder.layers.0.attn.Wqkv.weight",
"hf_name": "encoder.layers.0.attn.Wqkv.weight",
"adapter_name": "encoder.layers.0.attn.Wqkv.weight",
"output_file": "layer_0_attn_q.weights",
"shape": [2, 3],
"dtype": "torch.float32",
"component": "language",
"policy": "fallback",
"precision": "FP16",
"status": "fallback",
"required": true,
"qdq_restore": "runtime_key",
"scale_factor": 1.0
},
{
"source_name": "encoder.layers.0.attn.Wqkv.weight",
"hf_name": "encoder.layers.0.attn.Wqkv.weight",
"adapter_name": "encoder.layers.0.attn.Wqkv.weight",
"output_file": "layer_0_attn_k.weights",
"shape": [2, 3],
"dtype": "torch.float32",
"component": "language",
"policy": "fallback",
"precision": "FP16",
"status": "fallback",
"required": true,
"qdq_restore": "runtime_key",
"scale_factor": 1.0
}
]"""
    (cactus_dir / "conversion_manifest.json").write_text(manifest_text, encoding="utf-8")

    cli_args = SimpleNamespace(
        input=cactus_dir,
        out=qdq_dir,
        dtype="float16",
        model_family="nomic",
        shard_size_gb=1.0,
        row_batch_size=64,
        tmp_dir=None,
        force=True,
    )
    report = convert_qdq(cli_args)

    restored = load_file(qdq_dir / "model.safetensors")
    assert report["written_count"] == 2
    # Keys come from the output file names, not the shared HF source name.
    assert set(restored) == {"layer_0_attn_q", "layer_0_attn_k"}
Loading
Loading