PrismML-Eng · bong-water-water-bong · May 7, 2026 · May 7, 2026 · May 7, 2026 · May 24, 2026
diff --git a/.github/CI.md b/.github/CI.md
@@ -70,3 +70,8 @@ Always-on (run every time):
 | `macos-metal` | `macos-14` | `build_mac.sh` | Metal + CPU (Apple Silicon) |
 | `macos-intel` | `macos-15-intel` | `build_mac.sh` | CPU only (Intel) |
 
+Optional self-hosted builds (manual `workflow_dispatch` runs with `enable_linux_amd` enabled):
+
+| Job | Runner | Script | What it builds |
+|-----|--------|--------|---------------|
+| `linux-amd` | `[self-hosted, Linux, X64, amd]` | `build_rocm_linux.sh` | ROCm/HIP (`gfx1151` by default) |
diff --git a/.github/workflows/build-from-source-smoke.yml b/.github/workflows/build-from-source-smoke.yml
@@ -2,6 +2,12 @@ name: Build from source smoke tests
 
 on:
   workflow_dispatch:
+    inputs:
+      enable_linux_amd:  # opt-in switch read by the linux-amd job's `if:` condition
+        description: "Run optional self-hosted Linux AMD/ROCm build"
+        required: false
+        default: false  # off unless explicitly enabled when dispatching
+        type: boolean
   pull_request:
     types: [labeled]
 
@@ -138,3 +144,24 @@ jobs:
           BONSAI_FAMILY: ternary
         run: './scripts/run_llama.sh -c 256 -st -n 50 -p "The meaning of life is"'
 
+  linux-amd:
+    name: Linux AMD ROCm build (self-hosted)
+    if: ${{ github.event_name == 'workflow_dispatch' && inputs.enable_linux_amd }}  # manual dispatch + opt-in input only
+    needs: [download-model]  # presumably the job that uploads the `all-models` artifact; confirm
+    runs-on: [self-hosted, Linux, X64, amd]  # runner must carry all four labels
+    timeout-minutes: 90
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download models
+        uses: actions/download-artifact@v4
+        with:
+          name: all-models
+          path: models  # artifact contents are placed under ./models
+      - name: Build from source
+        run: ./scripts/build_rocm_linux.sh
+      - name: llama-cli smoke test (Bonsai Q1_0, ROCm)
+        run: './scripts/run_llama.sh -c 256 -st -n 50 -p "The meaning of life is"'  # BONSAI_FAMILY not set in this step
+      - name: llama-cli smoke test (Ternary-Bonsai Q2_0, ROCm)
+        env:
+          BONSAI_FAMILY: ternary  # only difference from the previous smoke-test step
+        run: './scripts/run_llama.sh -c 256 -st -n 50 -p "The meaning of life is"'
diff --git a/.github/workflows/check-env-vars.yml b/.github/workflows/check-env-vars.yml
@@ -22,6 +22,14 @@ jobs:
         run: |
           sh -n scripts/common.sh
           sh -n scripts/download_models.sh
+          sh -n scripts/download_binaries.sh
+          sh -n scripts/run_llama.sh
+          sh -n scripts/start_llama_server.sh
+          sh -n scripts/start_openwebui.sh
+          bash -n scripts/build_cuda_linux.sh
+          bash -n scripts/build_rocm_linux.sh
+          sh -n scripts/build_cpu_linux.sh
+          sh -n scripts/build_mac.sh
           sh -n setup.sh
           echo "OK: shell scripts parse"
 

diff --git a/README.md b/README.md
@@ -63,7 +63,7 @@ More compact ternary formats are TBD. llama.cpp already has `TQ1_0` and `TQ2_0`
 | CUDA | `prism` fork | [e380897e](https://github.com/PrismML-Eng/llama.cpp/commit/e380897e); PR coming soon |
 | CPU (optimized x86) | ⏳ TBD | — |
 | Vulkan | ⏳ TBD | — |
-| ROCm / HIP | ⏳ TBD | — |
+| ROCm / HIP | `prism` fork; validated locally on gfx1151 | [d104cf1b](https://github.com/PrismML-Eng/llama.cpp/commit/d104cf1b); [benchmark](community-benchmarks/ternary-bonsai/rocm-hip-strix-halo-128gb-linux.md) |
 | MLX (2-bit) | Already supported in stock [MLX](https://github.com/ml-explore/mlx) | - |
 
 ## Benchmarks
@@ -326,12 +326,10 @@ cmake --build build -j$(nproc)
 ### Linux (ROCm / AMD GPU)
 
 ```bash
-# Requires ROCm toolkit (hipcc)
-git clone -b prism https://github.com/PrismML-Eng/llama.cpp.git
-cd llama.cpp
-cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON
-cmake --build build -j$(nproc)
-# Binaries in build/bin/
+./scripts/build_rocm_linux.sh
+
+# To target Strix Halo (gfx1151) explicitly:
+./scripts/build_rocm_linux.sh --targets gfx1151
 ```
 
 ### Windows (CUDA)
@@ -407,6 +405,7 @@ Bonsai-demo/
 │   ├── build_mac.sh                # Build llama.cpp for Mac
 │   ├── build_cpu_linux.sh          # Build llama.cpp for Linux (CPU only)
 │   ├── build_cuda_linux.sh         # Build llama.cpp for Linux CUDA
+│   ├── build_rocm_linux.sh         # Build llama.cpp for Linux ROCm/HIP
 │   └── build_cuda_windows.ps1      # Build llama.cpp for Windows CUDA
 ├── models/                         # ← downloaded by setup
 │   ├── gguf/

diff --git a/benchmarks/data/ternary-bonsai-rocm-strix-halo-20260507T003049Z.jsonl b/benchmarks/data/ternary-bonsai-rocm-strix-halo-20260507T003049Z.jsonl
diff --git a/benchmarks/data/ternary-bonsai-rocm-strix-halo-combined-20260507T003257Z.jsonl b/benchmarks/data/ternary-bonsai-rocm-strix-halo-combined-20260507T003257Z.jsonl
diff --git a/benchmarks/data/ternary-bonsai-rocm-strix-halo-fa-compare-20260507T005616Z.jsonl b/benchmarks/data/ternary-bonsai-rocm-strix-halo-fa-compare-20260507T005616Z.jsonl
@@ -0,0 +1,12 @@
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/1.7B/Ternary-Bonsai-1.7B-Q2_0.gguf", "model_type": "qwen3 1.7B Q2_0", "model_size": 457345184, "model_n_params": 1720028160, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2026-05-07T00:56:16Z", "avg_ns": 105712742, "stddev_ns": 797694, "avg_ts": 4843.498607, "stddev_ts": 36.703200, "samples_ns": [ 106239328, 106103931, 104794967 ],"samples_ts": [ 4819.31, 4825.46, 4885.73 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/1.7B/Ternary-Bonsai-1.7B-Q2_0.gguf", "model_type": "qwen3 1.7B Q2_0", "model_size": 457345184, "model_n_params": 1720028160, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2026-05-07T00:56:17Z", "avg_ns": 641547919, "stddev_ns": 1761539, "avg_ts": 199.518445, "stddev_ts": 0.547783, "samples_ns": [ 641509251, 639806032, 643328474 ],"samples_ts": [ 199.529, 200.061, 198.965 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/1.7B/Ternary-Bonsai-1.7B-Q2_0.gguf", "model_type": "qwen3 1.7B Q2_0", "model_size": 457345184, "model_n_params": 1720028160, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": true, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2026-05-07T00:56:19Z", "avg_ns": 103939420, "stddev_ns": 1506310, "avg_ts": 4926.637669, "stddev_ts": 71.569664, "samples_ns": [ 102357530, 104104283, 105356449 ],"samples_ts": [ 5002.07, 4918.15, 4859.69 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/1.7B/Ternary-Bonsai-1.7B-Q2_0.gguf", "model_type": "qwen3 1.7B Q2_0", "model_size": 457345184, "model_n_params": 1720028160, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": true, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2026-05-07T00:56:20Z", "avg_ns": 605813025, "stddev_ns": 59208958, "avg_ts": 212.567922, "stddev_ts": 19.669961, "samples_ns": [ 573295306, 569988785, 674154984 ],"samples_ts": [ 223.271, 224.566, 189.867 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/4B/Ternary-Bonsai-4B-Q2_0.gguf", "model_type": "qwen3 4B Q2_0", "model_size": 1069018824, "model_n_params": 4021784576, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2026-05-07T00:56:22Z", "avg_ns": 269780568, "stddev_ns": 28834220, "avg_ts": 1911.581370, "stddev_ts": 192.898358, "samples_ns": [ 302947715, 255720297, 250673694 ],"samples_ts": [ 1690.06, 2002.19, 2042.5 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/4B/Ternary-Bonsai-4B-Q2_0.gguf", "model_type": "qwen3 4B Q2_0", "model_size": 1069018824, "model_n_params": 4021784576, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2026-05-07T00:56:23Z", "avg_ns": 1273087226, "stddev_ns": 10885293, "avg_ts": 100.547910, "stddev_ts": 0.862642, "samples_ns": [ 1260983126, 1276205389, 1282073163 ],"samples_ts": [ 101.508, 100.297, 99.8383 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/4B/Ternary-Bonsai-4B-Q2_0.gguf", "model_type": "qwen3 4B Q2_0", "model_size": 1069018824, "model_n_params": 4021784576, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": true, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2026-05-07T00:56:27Z", "avg_ns": 223728886, "stddev_ns": 7164531, "avg_ts": 2290.078617, "stddev_ts": 74.689449, "samples_ns": [ 215475042, 227370417, 228341201 ],"samples_ts": [ 2376.15, 2251.83, 2242.26 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/4B/Ternary-Bonsai-4B-Q2_0.gguf", "model_type": "qwen3 4B Q2_0", "model_size": 1069018824, "model_n_params": 4021784576, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": true, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2026-05-07T00:56:28Z", "avg_ns": 1137567067, "stddev_ns": 41682101, "avg_ts": 112.619619, "stddev_ts": 4.043857, "samples_ns": [ 1185528586, 1117074129, 1110098488 ],"samples_ts": [ 107.969, 114.585, 115.305 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/8B/Ternary-Bonsai-8B-Q2_0.gguf", "model_type": "qwen3 8B Q2_0", "model_size": 2176234112, "model_n_params": 8188548096, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2026-05-07T00:56:32Z", "avg_ns": 448349692, "stddev_ns": 6090611, "avg_ts": 1142.107276, "stddev_ts": 15.625663, "samples_ns": [ 441394749, 452730861, 450923467 ],"samples_ts": [ 1159.96, 1130.91, 1135.45 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/8B/Ternary-Bonsai-8B-Q2_0.gguf", "model_type": "qwen3 8B Q2_0", "model_size": 2176234112, "model_n_params": 8188548096, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2026-05-07T00:56:34Z", "avg_ns": 1818481529, "stddev_ns": 8507533, "avg_ts": 70.389422, "stddev_ts": 0.328540, "samples_ns": [ 1828150863, 1812145433, 1815148292 ],"samples_ts": [ 70.0161, 70.6345, 70.5177 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/8B/Ternary-Bonsai-8B-Q2_0.gguf", "model_type": "qwen3 8B Q2_0", "model_size": 2176234112, "model_n_params": 8188548096, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": true, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2026-05-07T00:56:40Z", "avg_ns": 393272958, "stddev_ns": 15794699, "avg_ts": 1303.328419, "stddev_ts": 53.546525, "samples_ns": [ 375104243, 400980695, 403733938 ],"samples_ts": [ 1364.95, 1276.87, 1268.16 ]}
+{"build_commit": "d104cf1b6", "build_number": 8846, "cpu_info": "AMD RYZEN AI MAX+ 395 w/ Radeon 8060S", "gpu_info": "AMD Radeon Graphics", "backends": "ROCm", "model_filename": "models/ternary-gguf/8B/Ternary-Bonsai-8B-Q2_0.gguf", "model_type": "qwen3 8B Q2_0", "model_size": 2176234112, "model_n_params": 8188548096, "n_batch": 2048, "n_ubatch": 512, "n_threads": 16, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": true, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2026-05-07T00:56:42Z", "avg_ns": 1635848246, "stddev_ns": 5158727, "avg_ts": 78.247385, "stddev_ts": 0.246330, "samples_ns": [ 1641750524, 1633591035, 1632203181 ],"samples_ts": [ 77.9656, 78.355, 78.4216 ]}
diff --git a/community-benchmarks/README.md b/community-benchmarks/README.md
@@ -14,7 +14,7 @@ Combined view across both model families. See the per-family subfolders below fo
 | Bonsai (1-bit) | AMD Strix Halo 128 GB | llama.cpp ROCm HIP | 1,325 | 96 | [link](bonsai/rocm-hip-strix-halo-128gb-archlinux.md) |
 | Bonsai (1-bit) | NVIDIA GeForce RTX 3080 10 GB | llama.cpp CUDA | 4,770 | 197 | [link](bonsai/cuda-rtx3080-linux.md) |
 | Bonsai (1-bit) | NVIDIA RTX A2000 Laptop (4 GB) | llama.cpp CUDA | 1,387 | 63 | [link](bonsai/cuda-rtxa2000-debian.md) |
-| Ternary-Bonsai (1.58-bit) | *coming soon* | | | | |
+| Ternary-Bonsai (1.58-bit) | AMD Strix Halo 128 GB | llama.cpp ROCm HIP | 1,323 | 79 | [link](ternary-bonsai/rocm-hip-strix-halo-128gb-linux.md) |
 
 ## Model Families
 

diff --git a/community-benchmarks/ternary-bonsai/README.md b/community-benchmarks/ternary-bonsai/README.md
@@ -4,11 +4,9 @@ Benchmark results submitted by the community running [Ternary-Bonsai](https://hu
 
 ## Results
 
-Coming soon...
-
 | Hardware | Backend | 8B PP512 (t/s) | 8B TG128 (t/s) | Details |
 |----------|---------|----------------|----------------|---------|
-| | | | | |
+| AMD Strix Halo 128 GB | llama.cpp ROCm HIP | 1,323 | 79 | [link](rocm-hip-strix-halo-128gb-linux.md) |
 
 ## Available Formats