cactus-compute · HenryNdubuaku · Jun 2, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
@@ -28,9 +28,11 @@ jobs:
           python-version: "3.12"
 
       - name: Install dependencies
-        run: pip install -e "python/[dev]"
+        run: pip install -e "python/[dev,serve]"
 
       - name: Run tests
         run: |
           cd python
-          pytest tests/ -v --ignore=tests/test_model.py
+          pytest tests/ -v \
+            --ignore=tests/test_model.py \
+            --ignore=tests/test_server_live.py
diff --git a/cactus-engine/src/model.cpp b/cactus-engine/src/model.cpp
diff --git a/cactus-engine/src/rag.cpp b/cactus-engine/src/rag.cpp
@@ -124,7 +124,13 @@ std::string retrieve_rag_context(CactusModelHandle* handle, const std::string& q
     std::vector<uint32_t> query_tokens = tokenizer->encode(query);
     if (query_tokens.empty()) return "";
 
-    std::vector<float> query_embedding = handle->model->get_embeddings(query_tokens, true, true);
+    std::vector<float> query_embedding;
+    try {
+        query_embedding = handle->model->get_embeddings(query_tokens, true, true);
+    } catch (const std::exception& e) {
+        CACTUS_LOG_WARN("rag", "get_embeddings unavailable, skipping RAG context: " << e.what());
+        return "";
+    }
     if (query_embedding.size() != handle->corpus_embedding_dim) {
         CACTUS_LOG_WARN("rag", "Query embedding dimension mismatch");
         return "";
@@ -262,8 +268,15 @@ std::vector<cactus::ffi::ToolFunction> select_relevant_tools(
 
             std::vector<uint32_t> tokens = tokenizer->encode(text);
             if (!tokens.empty()) {
-                std::vector<float> emb = handle->model->get_embeddings(tokens, true, true);
-                handle->tool_embeddings.push_back(std::move(emb));
+                try {
+                    std::vector<float> emb = handle->model->get_embeddings(tokens, true, true);
+                    handle->tool_embeddings.push_back(std::move(emb));
+                } catch (const std::exception& e) {
+                    CACTUS_LOG_WARN("tool_rag", "get_embeddings unavailable, returning all tools: " << e.what());
+                    handle->tool_texts.clear();
+                    handle->tool_embeddings.clear();
+                    return all_tools;
+                }
             } else {
                 handle->tool_embeddings.push_back({});
             }
@@ -277,7 +290,13 @@ std::vector<cactus::ffi::ToolFunction> select_relevant_tools(
         return all_tools;
     }
 
-    std::vector<float> query_embedding = handle->model->get_embeddings(query_tokens, true, true);
+    std::vector<float> query_embedding;
+    try {
+        query_embedding = handle->model->get_embeddings(query_tokens, true, true);
+    } catch (const std::exception& e) {
+        CACTUS_LOG_WARN("tool_rag", "get_embeddings unavailable, returning all tools: " << e.what());
+        return all_tools;
+    }
     if (query_embedding.empty()) {
         CACTUS_LOG_WARN("tool_rag", "Failed to get query embedding, returning all tools");
         return all_tools;

diff --git a/cactus-engine/src/sp.cpp b/cactus-engine/src/sp.cpp
@@ -235,8 +235,11 @@ std::string SPTokenizer::preprocess_text(const std::string& text) const {
     }
 
     std::string processed = "";
+    if (sp_add_dummy_prefix_) processed += "\xE2\x96\x81";
 
-    for (size_t i = text.find_first_not_of(" "); i < text.length(); i++) {
+    size_t start = text.find_first_not_of(" ");
+    if (start == std::string::npos) return processed;
+    for (size_t i = start; i < text.length(); i++) {
         char c = text[i];
         if (c == ' ') {
             processed += "▁";

diff --git a/python/cactus/bindings/cactus.py b/python/cactus/bindings/cactus.py
@@ -1061,7 +1061,7 @@ def cactus_embed(model, text, normalize=True):
     """
     buf = (ctypes.c_float * 4096)()
     dim = ctypes.c_size_t()
-    rc = _lib.cactus_embed(model, _enc(text), buf, 4096, ctypes.byref(dim), normalize)
+    rc = _lib.cactus_embed(model, _enc(text), buf, ctypes.sizeof(buf), ctypes.byref(dim), normalize)
     if rc < 0:
         raise RuntimeError(_err("Embedding failed"))
     return list(buf[:dim.value])
@@ -1071,7 +1071,7 @@ def cactus_image_embed(model, image_path):
     """Compute an image embedding. Returns a list of floats."""
     buf = (ctypes.c_float * 4096)()
     dim = ctypes.c_size_t()
-    rc = _lib.cactus_image_embed(model, _enc(image_path), buf, 4096, ctypes.byref(dim))
+    rc = _lib.cactus_image_embed(model, _enc(image_path), buf, ctypes.sizeof(buf), ctypes.byref(dim))
     if rc < 0:
         raise RuntimeError(_err("Image embedding failed"))
     return list(buf[:dim.value])
@@ -1081,7 +1081,7 @@ def cactus_audio_embed(model, audio_path):
     """Compute an audio embedding. Returns a list of floats."""
     buf = (ctypes.c_float * 4096)()
     dim = ctypes.c_size_t()
-    rc = _lib.cactus_audio_embed(model, _enc(audio_path), buf, 4096, ctypes.byref(dim))
+    rc = _lib.cactus_audio_embed(model, _enc(audio_path), buf, ctypes.sizeof(buf), ctypes.byref(dim))
     if rc < 0:
         raise RuntimeError(_err("Audio embedding failed"))
     return list(buf[:dim.value])

diff --git a/python/cactus/convert/cactus_adapters/tokenizer.py b/python/cactus/convert/cactus_adapters/tokenizer.py
@@ -143,7 +143,16 @@ def convert_hf_tokenizer(tokenizer, output_dir, token=None, model_id=None, label
 
     tokenizer_model = tokenizer_json_data.get("model", {}) if tokenizer_json_data else {}
     tokenizer_model_type = str(tokenizer_model.get("type", "")).upper()
-    is_sentencepiece = tokenizer_model_type != "BPE" and model_type in SENTENCEPIECE_MODEL_TYPES
+    is_unigram = tokenizer_model_type == "UNIGRAM"
+    is_sentencepiece = is_unigram or (tokenizer_model_type != "BPE" and model_type in SENTENCEPIECE_MODEL_TYPES)
+
+    # Unigram (e.g. XLM-RoBERTa, used by nomic-embed) carries per-token scores that
+    # the SentencePiece runtime needs for Viterbi segmentation.
+    unigram_scores: dict[int, float] = {}
+    if is_unigram and isinstance(tokenizer_model.get("vocab"), list):
+        for token_id, entry in enumerate(tokenizer_model["vocab"]):
+            if isinstance(entry, (list, tuple)) and len(entry) == 2:
+                unigram_scores[token_id] = float(entry[1])
 
     if tokenizer_model_type == "BPE" and tokenizer_model.get("vocab"):
         vocab = tokenizer_model["vocab"]
@@ -376,8 +385,12 @@ def write_merges_file(merges_list):
     with open(vocab_output, 'w', encoding='utf-8') as f:
         for token_id, token_str in enumerate(id_to_token):
             if token_str:
-                f.write(f"{token_id}\t{token_str}\n")
-    print(f"  Saved tokenizer vocabulary (ID\\ttoken format)")
+                if unigram_scores:
+                    f.write(f"{token_id}\t{token_str}\t{unigram_scores.get(token_id, 0.0)}\n")
+                else:
+                    f.write(f"{token_id}\t{token_str}\n")
+    vocab_fmt = "ID\\ttoken\\tscore" if unigram_scores else "ID\\ttoken"
+    print(f"  Saved tokenizer vocabulary ({vocab_fmt} format)")
 
     special_tokens_output = output_dir / "special_tokens.json"
     with open(special_tokens_output, 'w', encoding='utf-8') as f:
@@ -410,6 +423,9 @@ def write_merges_file(merges_list):
             decoder = "byte_level"
     elif is_sentencepiece:
         tokenizer_type = "sentencepiece"
+        if is_unigram:
+            normalizer = "metaspace"
+            decoder = "replace_metaspace"
 
     tokenizer_config_output = output_dir / "tokenizer_config.txt"
     with open(tokenizer_config_output, 'w') as f:
@@ -422,6 +438,10 @@ def write_merges_file(merges_list):
         f.write(f"normalizer={normalizer}\n")
         f.write(f"decoder={decoder}\n")
         f.write(f"byte_fallback={'true' if byte_fallback else 'false'}\n")
+        if is_unigram:
+            f.write("sp_model_type=unigram\n")
+            f.write("sp_add_dummy_prefix=true\n")
+            f.write("sp_byte_fallback=false\n")
 
         if chat_template_data:
             f.write("has_chat_template=true\n")

diff --git a/python/cactus/convert/model_adapters/adapters.py b/python/cactus/convert/model_adapters/adapters.py
@@ -349,9 +349,22 @@ def name_tensor(self, source_name: str, tensor: Any, num_layers: int | None) ->
         norm2 = _nomic_layer_suffix(source_name, ".norm2.weight")
         if norm2 is not None:
             return NameMatch(source_name, f"layer_{norm2}_norm2.weights", "language", True, hf_name=source_name, adapter_name=source_name)
+        for suffix, template, transpose in (
+            (".attn.Wqkv.weight", "layer_{i}_attn_qkv.weights", False),
+            (".attn.Wqkv.bias", "layer_{i}_attn_qkv.bias", False),
+            (".mlp.experts.mlp.w1", "layer_{i}_mlp_experts_w1.weights", False),
+            (".mlp.experts.mlp.w2", "layer_{i}_mlp_experts_w2.weights", True),
+        ):
+            layer = _nomic_layer_suffix(source_name, suffix)
+            if layer is not None:
+                return NameMatch(source_name, template.format(i=layer), "language", True, transpose, hf_name=source_name, adapter_name=source_name)
         return cactus_name_for_tensor(source_name, self.family, num_layers)
 
     def policy(self, match: NameMatch, shape: tuple[int, ...], requested_bits: int) -> TensorPolicy:
+        # The MoE router is tiny ([num_experts, hidden]) but decides expert selection;
+        # 4-bit quantizing it corrupts routing, so keep it in FP16.
+        if ".mlp.router.layer.weight" in match.source_name:
+            return TensorPolicy("fallback", "FP16", None, match.component, False, "none", "moe router precision-sensitive")
         policy = super().policy(match, shape, requested_bits)
         if policy.use_gptq and ".mlp.experts.mlp." in match.source_name:
             return replace(policy, use_gptq=False)

diff --git a/python/cactus/convert/tests/test_lfm2_adapter.py b/python/cactus/convert/tests/test_lfm2_adapter.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from cactus.convert.model_adapters.adapters import adapter_for_family
+
+
+def test_lfm2_vl_adapter_selects_runtime_safe_model_class():  # the lfm2 adapter must resolve the VL architecture name to its transformers class
+    from transformers import Lfm2VlForConditionalGeneration  # imported inside the test body rather than at module level
+
+    adapter = adapter_for_family("lfm2")
+    cfg = {"model_type": "lfm2", "architectures": ["Lfm2VlForConditionalGeneration"]}  # config that names the VL architecture
+    assert adapter.model_class(cfg) is Lfm2VlForConditionalGeneration  # identity check: must be the exact class object
+
+
+def test_lfm2_processor_fallback_handles_tokenizers_backend(tmp_path):  # load_processor must return an Lfm2VlProcessor when tokenizer_config.json names "TokenizersBackend"
+    import json
+
+    from tokenizers import Tokenizer
+    from tokenizers.models import WordLevel
+    from tokenizers.pre_tokenizers import Whitespace
+    from transformers import Lfm2VlProcessor
+
+    tokenizer = Tokenizer(WordLevel({"<|pad|>": 0, "<|startoftext|>": 1, "<|im_end|>": 2, "<image>": 3, "hello": 4}, unk_token="<|pad|>"))  # five-token vocab; "<image>" is id 3, which the last assertion relies on
+    tokenizer.pre_tokenizer = Whitespace()
+    tokenizer.save(str(tmp_path / "tokenizer.json"))  # fixture file 1 of 3, written into pytest's tmp_path
+    (tmp_path / "tokenizer_config.json").write_text(  # fixture file 2 of 3
+        json.dumps(
+            {
+                "tokenizer_class": "TokenizersBackend",  # the tokenizer class name this test is about
+                "bos_token": "<|startoftext|>",
+                "eos_token": "<|im_end|>",
+                "pad_token": "<|pad|>",
+                "image_token": "<image>",
+                "image_start_token": "<image>",  # start/end/thumbnail markers all reuse the single "<image>" vocab entry
+                "image_end_token": "<image>",
+                "image_thumbnail": "<image>",
+            }
+        ),
+        encoding="utf-8",
+    )
+    (tmp_path / "preprocessor_config.json").write_text(  # fixture file 3 of 3
+        json.dumps(
+            {
+                "image_processor_type": "Lfm2VlImageProcessorFast",
+                "do_resize": True,
+                "size": {"height": 512, "width": 512},
+                "do_rescale": True,
+                "rescale_factor": 1 / 255,  # evaluated by Python before json.dumps, so the file stores a float
+                "do_normalize": True,
+                "image_mean": [0.5, 0.5, 0.5],
+                "image_std": [0.5, 0.5, 0.5],
+                "do_pad": True,
+                "data_format": "channels_first",
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    processor = adapter_for_family("lfm2").load_processor(str(tmp_path))  # load from the directory populated above
+    assert isinstance(processor, Lfm2VlProcessor)
+    assert processor.image_token == "<image>"
+    assert processor.image_token_id == 3  # matches the id given to "<image>" in the WordLevel vocab
diff --git a/python/cactus/convert/tests/test_naming_qdq.py b/python/cactus/convert/tests/test_naming_qdq.py
@@ -7,6 +7,7 @@
 
 from cactus.convert.cactus_adapters.tensor_io import save_tensor_with_header
 from cactus.convert.export.qdq import convert_qdq
+from cactus.convert.model_adapters.adapters import adapter_for_family
 from cactus.convert.model_adapters.naming import cactus_name_for_tensor, restore_hf_key_for_family
 
 
@@ -289,3 +290,115 @@ def test_parakeet_batchnorm_tracking_tensors_are_ignored():
     match = cactus_name_for_tensor("encoder.layers.0.conv.norm.num_batches_tracked", "parakeet", 24)
     assert match.recognized
     assert match.output_name is None
+
+
+def test_nomic_normalizes_global_tensors():  # normalize_state_dict must fold the embedding tensors into one key and rename the embedding layernorm
+    adapter = adapter_for_family("nomic")
+    state = {
+        "embeddings.word_embeddings.weight": torch.ones(4, 3),
+        "embeddings.token_type_embeddings.weight": torch.full((1, 3), 2.0),  # a single token-type row filled with 2.0
+        "emb_ln.weight": torch.arange(3.0),
+        "emb_ln.bias": torch.arange(3.0) + 10,
+    }
+    normalized = adapter.normalize_state_dict(state)
+    assert set(normalized.state_dict) == {"token_embeddings", "embedding_layernorm.weight", "embedding_layernorm.bias"}  # four source tensors become three normalized keys
+    assert torch.equal(normalized.state_dict["token_embeddings"], torch.full((4, 3), 3.0))  # 3.0 == 1.0 + 2.0, consistent with the token-type row being added to every word-embedding row
+    assert normalized.provenance["token_embeddings"].source_names == [  # provenance must list both contributing tensors, in this order
+        "embeddings.word_embeddings.weight",
+        "embeddings.token_type_embeddings.weight",
+    ]
+    assert normalized.provenance["token_embeddings"].qdq_restore == "adapter_key"
+    assert adapter.name_tensor("token_embeddings", normalized.state_dict["token_embeddings"], 12).output_name == "token_embeddings.weights"  # merged embedding gets the ".weights" output name
+    assert adapter.name_tensor("embedding_layernorm.weight", normalized.state_dict["embedding_layernorm.weight"], 12).output_name == "embedding_layernorm.weight"  # layernorm weight keeps its normalized name as the output name
+
+
+def test_nomic_norm2_weight_uses_runtime_name():  # a layer's ".norm2.weight" must map to the per-layer "layer_<i>_norm2.weights" output name
+    adapter = adapter_for_family("nomic")
+    match = adapter.name_tensor("encoder.layers.3.norm2.weight", torch.ones(768), 12)  # source tensor belongs to layer index 3
+    assert match.recognized
+    assert match.output_name == "layer_3_norm2.weights"  # the layer index from the source name is carried into the output name
+
+
+def test_nomic_keeps_qkv_and_moe_experts_fused():  # each fused HF parameter must produce exactly one emission from expand_tensor
+    # The v2 transpile path binds graph weights by their HF parameter name, so the
+    # converter emits one fused tensor per HF parameter (no q/k/v or per-expert split).
+    # w2 is stored transposed so the second expert matmul can consume it as a direct
+    # linear weight in the transpiled graph.
+    adapter = adapter_for_family("nomic")
+    adapter.num_experts = 8  # NOTE(review): if adapter_for_family returns a shared instance, this mutation outlives the test; confirm
+
+    qkv = torch.arange(2304 * 2, dtype=torch.float32).reshape(2304, 2)  # (2304, 2) tensor with distinct values
+    match = adapter.name_tensor("encoder.layers.0.attn.Wqkv.weight", qkv, 12)
+    emissions = adapter.expand_tensor(match, qkv)
+    assert [e.output_name for e in emissions] == ["layer_0_attn_qkv.weights"]  # a single emission: Wqkv is not split
+    assert tuple(emissions[0].tensor.shape) == (2304, 2)  # shape is unchanged
+
+    w1 = torch.empty(24576, 2)  # uninitialized values; only the shape is asserted below
+    match = adapter.name_tensor("encoder.layers.1.mlp.experts.mlp.w1", w1, 12)
+    emissions = adapter.expand_tensor(match, w1)
+    assert [e.output_name for e in emissions] == ["layer_1_mlp_experts_w1.weights"]
+    assert tuple(emissions[0].tensor.shape) == (24576, 2)  # w1 keeps its original shape
+
+    w2 = torch.empty(24576, 2)  # same input shape as w1
+    match = adapter.name_tensor("encoder.layers.1.mlp.experts.mlp.w2", w2, 12)
+    emissions = adapter.expand_tensor(match, w2)
+    assert [e.output_name for e in emissions] == ["layer_1_mlp_experts_w2.weights"]
+    assert tuple(emissions[0].tensor.shape) == (2, 24576)  # w2 comes out transposed: (24576, 2) -> (2, 24576)
+
+
+def test_nomic_qdq_runtime_keys_are_unique(tmp_path):  # convert_qdq must write one tensor per output file even when manifest entries share a source_name
+    cactus = tmp_path / "cactus"  # input directory holding the tensors and manifest
+    out = tmp_path / "qdq"  # output directory passed to convert_qdq
+    cactus.mkdir()
+    save_tensor_with_header(torch.ones(2, 3), cactus / "layer_0_attn_q.weights", precision="FP16")
+    save_tensor_with_header(torch.ones(2, 3) * 2, cactus / "layer_0_attn_k.weights", precision="FP16")  # filled with 2.0, unlike the q tensor's 1.0
+    (cactus / "conversion_manifest.json").write_text(  # the two entries below differ only in "output_file"
+        """[
+  {
+    "source_name": "encoder.layers.0.attn.Wqkv.weight",
+    "hf_name": "encoder.layers.0.attn.Wqkv.weight",
+    "adapter_name": "encoder.layers.0.attn.Wqkv.weight",
+    "output_file": "layer_0_attn_q.weights",
+    "shape": [2, 3],
+    "dtype": "torch.float32",
+    "component": "language",
+    "policy": "fallback",
+    "precision": "FP16",
+    "status": "fallback",
+    "required": true,
+    "qdq_restore": "runtime_key",
+    "scale_factor": 1.0
+  },
+  {
+    "source_name": "encoder.layers.0.attn.Wqkv.weight",
+    "hf_name": "encoder.layers.0.attn.Wqkv.weight",
+    "adapter_name": "encoder.layers.0.attn.Wqkv.weight",
+    "output_file": "layer_0_attn_k.weights",
+    "shape": [2, 3],
+    "dtype": "torch.float32",
+    "component": "language",
+    "policy": "fallback",
+    "precision": "FP16",
+    "status": "fallback",
+    "required": true,
+    "qdq_restore": "runtime_key",
+    "scale_factor": 1.0
+  }
+]""",
+        encoding="utf-8",
+    )
+    report = convert_qdq(  # arguments are supplied as attributes of a SimpleNamespace
+        SimpleNamespace(
+            input=cactus,
+            out=out,
+            dtype="float16",
+            model_family="nomic",
+            shard_size_gb=1.0,
+            row_batch_size=64,
+            tmp_dir=None,
+            force=True,
+        )
+    )
+    tensors = load_file(out / "model.safetensors")  # read back the file convert_qdq is expected to have written
+    assert report["written_count"] == 2  # both manifest entries were written despite the shared source_name
+    assert set(tensors) == {"layer_0_attn_q", "layer_0_attn_k"}  # keys are the output file names without the ".weights" suffix