add missing cactus transpile flags

jakmro · jakmro · commit fd61cbd28729 · 2026-06-08T17:49:28.000+02:00
Signed-off-by: jakmro <kubamroz124@gmail.com>
diff --git a/python/cactus/cli/__init__.py b/python/cactus/cli/__init__.py
@@ -121,8 +121,35 @@ def create_parser():
 
   cactus transpile <model>             build a runnable bundle from CQ weights
     --weights-dir <path>               path to CQ weights (default: weights/<model>)
-    --task <auto|...>                  force task type (default: auto)
+    --task <auto|...>                  task (default: auto, inferred from model config)
     --artifact-dir <path>              write bundle here (default: weights/<model>)
+    --prompt <text>                    representative prompt for shape capture
+    --system-prompt <text>             system prompt for multimodal chat
+    --enable-thinking                  enable thinking markers when supported
+    --input-ids <a,b,...>              token ids for causal-LM shape capture
+    --image-file <path>                representative image (repeatable)
+    --audio-file <path>                representative audio file (WAV)
+    --max-new-tokens <n>               preallocate decode context for causal LM
+    --component-pipeline auto|on|off   force component-pipeline transpilation
+    --components <a,b,...>             subset of components to transpile
+    --torch-dtype <dtype>              float16 | float32 | bfloat16
+    --token <token>                    HuggingFace token (defaults to $HF_TOKEN)
+    --trust-remote-code                allow HF remote code at transpile time
+    --local-files-only                 require model/processor to be local
+    --allow-unconverted-weights        debug-only: skip the CQ-weights check
+    --execute-after-transpile          run a reference execution after lowering
+    --graph-filename <name>            override saved graph filename
+    --skip-reference-compare           skip PyTorch comparison (with --execute-…)
+    --no-fuse-rms-norm                 disable RMSNorm fusion
+    --no-fuse-rope                     disable RoPE fusion
+    --no-fuse-attention                disable attention fusion
+    --no-fuse-attention-block          disable attention-block fusion
+    --no-fuse-add-clipped              disable add-clipped fusion
+    --no-fuse-gated-deltanet           disable gated DeltaNet fusion
+    --npu                              also emit CoreML .mlpackage(s) for NPU
+    --npu-quantize 0|4|8               force both NPU encoders to this quant
+    --npu-audio-quantize 0|4|8         audio encoder quant (default int8)
+    --npu-vision-quantize 0|4|8        vision encoder quant (default fp16)
 
   cactus serve [model]                 OpenAI-compatible local HTTP server
     --host <addr>                      bind address (default: 127.0.0.1)
@@ -289,44 +316,71 @@ def create_parser():
     transpile_parser = subparsers.add_parser("transpile",
                                              help="Build a runnable bundle from CQ weights")
     transpile_parser.add_argument("model_id", type=_hf_id_or_path,
-                                  help="HuggingFace model id (e.g. openai/whisper-base) or local PyTorch checkpoint path")
+                                  help="HuggingFace model id or local checkpoint path")
     transpile_parser.add_argument("--weights-dir",
-                                  help="Path to converted CQ weights (default: weights/<model_name>)")
+                                  help="CQ weights directory (default: weights/<model>)")
     transpile_parser.add_argument("--task", default="auto",
                                   choices=["auto", "causal_lm_logits", "multimodal_causal_lm_logits",
                                            "ctc_logits", "encoder_hidden_states",
                                            "seq2seq_transcription", "tdt_transcription"],
-                                  help="Transpile task (default: auto, inferred from weights)")
+                                  help="Transpile task (default: auto, from model config)")
     transpile_parser.add_argument("--prompt",
-                                  help="Representative prompt for causal/multimodal graph shape capture")
+                                  help="Prompt for causal/multimodal shape capture")
+    transpile_parser.add_argument("--system-prompt", default=None,
+                                  help="System prompt for multimodal chat formats")
+    transpile_parser.add_argument("--enable-thinking", action="store_true",
+                                  help="Enable thinking markers when the prompt supports them")
+    transpile_parser.add_argument("--input-ids", default=None,
+                                  help="Comma-separated token ids for causal-LM shape capture")
     transpile_parser.add_argument("--image-file", action="append", default=[],
-                                  help="Representative image file for multimodal transpile (repeatable)")
+                                  help="Image for multimodal shape capture (repeatable)")
     transpile_parser.add_argument("--audio-file",
-                                  help="Representative audio file for audio/multimodal transpile")
+                                  help="Audio file (WAV) for audio/multimodal shape capture")
     transpile_parser.add_argument("--max-new-tokens", type=_positive_int, default=None,
-                                  help="Generation room to preallocate for causal decode graphs")
+                                  help="Decode context to preallocate for causal LM (default: 32)")
     transpile_parser.add_argument("--component-pipeline", default="auto", choices=["auto", "on", "off"],
-                                  help="Use split component graph transpilation when supported")
+                                  help="Split-component transpilation when supported (default: auto)")
     transpile_parser.add_argument("--components",
-                                  help="Comma-separated component subset for component-pipeline models")
+                                  help="Comma-separated component subset (e.g. vision_encoder,decoder)")
+    transpile_parser.add_argument("--torch-dtype", default=None,
+                                  choices=["float16", "float32", "bfloat16"],
+                                  help="Torch dtype for HF loading (default: float16)")
+    transpile_parser.add_argument("--token", default=None,
+                                  help="HuggingFace token for gated models (default: $HF_TOKEN)")
     transpile_parser.add_argument("--trust-remote-code", action="store_true",
-                                  help="Allow HF remote code during the transpile phase")
+                                  help="Pass trust_remote_code=True to HF loaders")
     transpile_parser.add_argument("--local-files-only", action="store_true",
-                                  help="Require HF model/processor files to already be local during transpile")
+                                  help="Require model/processor to already be local")
     transpile_parser.add_argument("--allow-unconverted-weights", action="store_true",
-                                  help="Transpile against an unconverted source checkpoint (skip the CQ weights check)")
+                                  help="Debug: transpile without CQ weights")
     transpile_parser.add_argument("--execute-after-transpile", action="store_true",
-                                  help="Run a reference execution against the produced bundle after transpiling")
+                                  help="Run a reference execution after lowering")
     transpile_parser.add_argument("--artifact-dir",
                                   help="Output directory (default: weights/<model>)")
+    transpile_parser.add_argument("--graph-filename", default=None,
+                                  help="Saved graph filename (default: graph.cactus)")
     transpile_parser.add_argument("--skip-reference-compare", action="store_true",
-                                  help="Skip PyTorch vs transpiled output comparison")
+                                  help="Skip PyTorch comparison (requires --execute-after-transpile)")
     transpile_parser.add_argument("--no-fuse-rms-norm", action="store_true",
                                   help="Disable RMSNorm fusion")
     transpile_parser.add_argument("--no-fuse-rope", action="store_true",
                                   help="Disable RoPE fusion")
     transpile_parser.add_argument("--no-fuse-attention", action="store_true",
                                   help="Disable attention fusion")
+    transpile_parser.add_argument("--no-fuse-attention-block", action="store_true",
+                                  help="Disable attention-block fusion")
+    transpile_parser.add_argument("--no-fuse-add-clipped", action="store_true",
+                                  help="Disable add-clipped fusion")
+    transpile_parser.add_argument("--no-fuse-gated-deltanet", action="store_true",
+                                  help="Disable gated DeltaNet fusion")
+    transpile_parser.add_argument("--npu", action="store_true",
+                                  help="Also emit CoreML .mlpackage(s) for Apple NPU encoders")
+    transpile_parser.add_argument("--npu-quantize", type=int, choices=[0, 4, 8], default=None,
+                                  help="Legacy: force both NPU encoders to same quant (0=fp16, 4=int4, 8=int8)")
+    transpile_parser.add_argument("--npu-audio-quantize", type=int, choices=[0, 4, 8], default=None,
+                                  help="NPU audio encoder quant: 0=fp16, 4=int4, 8=int8 (default: 8)")
+    transpile_parser.add_argument("--npu-vision-quantize", type=int, choices=[0, 4, 8], default=None,
+                                  help="NPU vision encoder quant: 0=fp16, 4=int4, 8=int8 (default: 0; int4 degrades Gemma4 vision)")
 
     return parser
 
diff --git a/python/cactus/cli/convert.py b/python/cactus/cli/convert.py
@@ -87,6 +87,12 @@ def cmd_transpile(args):
         extra_args.extend(["--task", args.task])
     if args.prompt is not None:
         extra_args.extend(["--prompt", args.prompt])
+    if args.system_prompt is not None:
+        extra_args.extend(["--system-prompt", args.system_prompt])
+    if args.enable_thinking:
+        extra_args.append("--enable-thinking")
+    if args.input_ids is not None:
+        extra_args.extend(["--input-ids", args.input_ids])
 
     image_files = list(args.image_file or [])
     audio_file = args.audio_file
@@ -107,12 +113,18 @@ def cmd_transpile(args):
         extra_args.extend(["--component-pipeline", args.component_pipeline])
     if args.components:
         extra_args.extend(["--components", args.components])
+    if args.torch_dtype:
+        extra_args.extend(["--torch-dtype", args.torch_dtype])
+    if args.token:
+        extra_args.extend(["--token", args.token])
     if args.trust_remote_code:
         extra_args.append("--trust-remote-code")
     if args.local_files_only:
         extra_args.append("--local-files-only")
     if args.artifact_dir:
         extra_args.extend(["--artifact-dir", args.artifact_dir])
+    if args.graph_filename:
+        extra_args.extend(["--graph-filename", args.graph_filename])
     if args.skip_reference_compare:
         extra_args.append("--skip-reference-compare")
     if args.no_fuse_rms_norm:
@@ -121,6 +133,20 @@ def cmd_transpile(args):
         extra_args.append("--no-fuse-rope")
     if args.no_fuse_attention:
         extra_args.append("--no-fuse-attention")
+    if args.no_fuse_attention_block:
+        extra_args.append("--no-fuse-attention-block")
+    if args.no_fuse_add_clipped:
+        extra_args.append("--no-fuse-add-clipped")
+    if args.no_fuse_gated_deltanet:
+        extra_args.append("--no-fuse-gated-deltanet")
+    if args.npu:
+        extra_args.append("--npu")
+        if args.npu_quantize is not None:
+            extra_args.extend(["--npu-quantize", str(args.npu_quantize)])
+        if args.npu_audio_quantize is not None:
+            extra_args.extend(["--npu-audio-quantize", str(args.npu_audio_quantize)])
+        if args.npu_vision_quantize is not None:
+            extra_args.extend(["--npu-vision-quantize", str(args.npu_vision_quantize)])
 
     return run_transpile(
         args.model_id,
diff --git a/python/cactus/transpile/npu/audio.py b/python/cactus/transpile/npu/audio.py
@@ -26,11 +26,11 @@ def forward(self, input_features: torch.Tensor) -> torch.Tensor:
 def _import_coremltools() -> Any:
     try:
         import coremltools as ct
-        from .coremltools_patches import apply_all_coremltools_patches
-        apply_all_coremltools_patches()
-        return ct
-    except Exception:
-        return None
+    except ImportError as exc:
+        raise RuntimeError("--npu requires `pip install coremltools`") from exc
+    from .coremltools_patches import apply_all_coremltools_patches
+    apply_all_coremltools_patches()
+    return ct
 
 
 def _apply_weight_quantization(mlmodel: Any, bits: int) -> Any:
@@ -65,9 +65,6 @@ def emit_audio_encoder_mlpackage(
     quantize_bits: int | None = None,
 ) -> str | None:
     ct = _import_coremltools()
-    if ct is None:
-        print("npu.audio: coremltools not installed; skipping mlpackage emit")
-        return None
 
     wrapper = AudioEncoderWrapper(audio_module, baked_inputs)
     wrapper.eval()
diff --git a/python/cactus/transpile/npu/vision.py b/python/cactus/transpile/npu/vision.py
@@ -26,11 +26,11 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
 def _import_coremltools() -> Any:
     try:
         import coremltools as ct
-        from .coremltools_patches import apply_all_coremltools_patches
-        apply_all_coremltools_patches()
-        return ct
-    except Exception:
-        return None
+    except ImportError as exc:
+        raise RuntimeError("--npu requires `pip install coremltools`") from exc
+    from .coremltools_patches import apply_all_coremltools_patches
+    apply_all_coremltools_patches()
+    return ct
 
 
 def _apply_weight_quantization(mlmodel: Any, bits: int) -> Any:
@@ -65,9 +65,6 @@ def emit_vision_encoder_mlpackage(
     quantize_bits: int | None = None,
 ) -> str | None:
     ct = _import_coremltools()
-    if ct is None:
-        print("npu.vision: coremltools not installed; skipping mlpackage emit")
-        return None
 
     wrapper = VisionEncoderWrapper(vision_module, baked_inputs)
     wrapper.eval()