
Commit 8287746

Rewrite AI benchmark to default to llama.cpp, break out variables into group_data/all.py.
1 parent d491318 commit 8287746

File tree: 7 files changed, +200 −61 lines

README.md

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ I often test a variety of other board-specific features, too, though it depends
 
 The benchmark scripts are run using `pyinfra`. It can be installed with `pip3 install pyinfra`.
 
-Inside the `benchmark` directory, modify `inventory.py` to point at the system under test, and run:
+Inside the `benchmark` directory, modify `inventory.py` to point at the system under test, modify `group_data/all.py` with the variables appropriate for your system, and run:
 
 ```
 pyinfra inventory.py main.py -y

benchmark/group_data/all.py

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+# Inventory variables applied to all hosts.
+
+# These two variables, when multiplied, should generally equal the core count.
+hpl_ps = 1
+hpl_qs = 4
+
+# PHP version available in system package manager (used for PTS installation).
+php_version = "8.3"
+
+# Select from 'llama.cpp' or 'ollama'.
+ai_benchmark = 'llama.cpp'
+
+# llama.cpp build options (e.g. '-DGGML_VULKAN=1' or '-DGGML_CUDA=1')
+llama_cpp_build_opts = '-DGGML_VULKAN=1'
+# For Nvidia DGX Spark / GB10 systems:
+# llama_cpp_build_opts = '-DGGML_CUDA=1 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc'
+
+# https://github.com/ggml-org/llama.cpp/blob/master/tools/llama-bench/README.md
+llama_bench_opts = '-n 128 -p 512,4096 -pg 4096,128 -ngl 99 -r 2'
+
+# Select which models to benchmark. Ideally they will run entirely in VRAM.
+# The `urls` list can include multiple URLs for larger multi-part models.
+llama_cpp_models = {
+    'tinyllama-1.1b-1t-openorca.Q4_K_M.gguf': {
+        'urls': ['https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_K_M.gguf'],
+        'size_in_gb': 0.7,
+    },
+    'Llama-3.2-3B-Instruct-Q4_K_M.gguf': {
+        'urls': ['https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf'],
+        'size_in_gb': 1.9,
+    },
+    # 'llama-2-13b.Q4_K_M.gguf': {
+    #     'urls': ['https://huggingface.co/TheBloke/Llama-2-13B-GGUF/resolve/main/llama-2-13b.Q4_K_M.gguf'],
+    #     'size_in_gb': 7.87,
+    # },
+    # 'DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf': {
+    #     'urls': ['https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf'],
+    #     'size_in_gb': 9.0,
+    # },
+    # 'gpt-oss-20b-Q4_K_M.gguf': {
+    #     'urls': ['https://huggingface.co/unsloth/gpt-oss-20b-GGUF/resolve/main/gpt-oss-20b-Q4_K_M.gguf'],
+    #     'size_in_gb': 11.6,
+    # },
+    # 'Qwen3-30B-A3B-Q4_K_M.gguf': {
+    #     'urls': ['https://huggingface.co/bartowski/Qwen_Qwen3-32B-GGUF/resolve/main/Qwen_Qwen3-32B-Q4_K_M.gguf'],
+    #     'size_in_gb': 18.6,
+    # },
+    # 'Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf': {
+    #     'urls': ['https://huggingface.co/bartowski/Meta-Llama-3.1-70B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf'],
+    #     'size_in_gb': 42.5,
+    # },
+    # 'gpt-oss-120b-Q4_K_M-00001-of-00002.gguf': {
+    #     'urls': [
+    #         'https://huggingface.co/unsloth/gpt-oss-120b-GGUF/resolve/main/Q4_K_M/gpt-oss-120b-Q4_K_M-00001-of-00002.gguf',
+    #         'https://huggingface.co/unsloth/gpt-oss-120b-GGUF/resolve/main/Q4_K_M/gpt-oss-120b-Q4_K_M-00002-of-00002.gguf',
+    #     ],
+    #     'size_in_gb': 62.9,
+    # },
+}
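
A note on the HPL values: as the comment in this file says, `hpl_ps * hpl_qs` should generally equal the core count, with Q conventionally the larger of the two. A minimal helper for picking a balanced split, purely as an illustration (not part of this commit):

```
def pick_hpl_grid(core_count):
    """Split core_count into the most balanced P x Q pair (P <= Q)."""
    best = (1, core_count)
    for p in range(1, int(core_count ** 0.5) + 1):
        if core_count % p == 0:
            best = (p, core_count // p)
    return best

# Examples: pick_hpl_grid(6) -> (2, 3), pick_hpl_grid(16) -> (4, 4).
```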

benchmark/inventory.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 hosts = (
-    ["framework-13.local"],
+    ["10.0.2.202"],
     {"ssh_user": "jgeerling"},
 )
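
pyinfra inventories can also list more than one host per group, and individual hosts can carry their own data overrides. A hypothetical variant (hostnames and users are placeholders, not from the commit):

```
# Hypothetical inventory.py targeting two systems; the second host overrides
# the group-level ssh_user for itself only.
hosts = (
    [
        "10.0.2.202",
        ("rpi5.local", {"ssh_user": "pi"}),
    ],
    {"ssh_user": "jgeerling"},
)
```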

benchmark/main.py

Lines changed: 5 additions & 5 deletions

@@ -8,11 +8,11 @@
 
 pause_seconds = 60
 tasks = [
-    'tinymembench.py',
-    'geekbench.py',
-    'disk-benchmark.py',
-    'top500.py',
-    'sbc-general-benchmark.py',
+    #'tinymembench.py',
+    #'geekbench.py',
+    #'disk-benchmark.py',
+    #'top500.py',
+    #'sbc-general-benchmark.py',
     'ai-benchmark.py',
 ]
 
benchmark/tasks/ai-benchmark.py

Lines changed: 131 additions & 51 deletions

@@ -1,63 +1,143 @@
+import os
 from pyinfra import host, logger
 from pyinfra.facts.files import File
 from pyinfra.facts.hardware import Memory
-from pyinfra.facts.server import Arch, Home
-from pyinfra.operations import files, git, python, server
+from pyinfra.facts.server import Arch, Home, LinuxName
+from pyinfra.operations import apt, dnf, files, git, python, server
+from urllib.parse import urlparse
 
 host_ram_size=host.get_fact(Memory)
 working_dir=host.get_fact(Home) + "/Downloads"
-ollama_models={
-    'llama3.2:3b': 2000,
-    'llama3.1:8b': 4900,
-    'llama2:13b': 7400,
-    'deepseek-r1:1.5b': 1100,
-    'deepseek-r1:8b': 4900,
-    'deepseek-r1:14b': 9000,
-    'deepseek-r1:70b': 43000,
-}
-
-files.download(
-    name="Download Ollama Install Script",
-    src="https://ollama.com/install.sh",
-    dest="{}/install.sh".format(working_dir),
-)
-
-# Install Ollama if necessary (but not on RISC-V, for now). For RISC-V, see:
-# https://github.com/geerlingguy/sbc-reviews/issues/65#issuecomment-2637866212
-host_arch = host.get_fact(Arch)
-if not host_arch == 'riscv64':
-    if not host.get_fact(File, path='/usr/local/bin/ollama'):
-        server.shell(
-            name="Run Ollama Install Script",
-            commands="sh {}/install.sh".format(working_dir),
-        )
 
-git.repo(
-    name="Clone ai-benchmarks with git.",
-    src="https://github.com/geerlingguy/ai-benchmarks.git",
-    dest="{}/ai-benchmarks".format(working_dir),
-)
-
-def ollama_loop_callback():
-    for model, model_size in ollama_models.items():
-        # Skip a model if it's larger than the system RAM.
-        if (host_ram_size - (host_ram_size / 8)) < model_size:
-            logger.info(f"\nSkipping model {model} as it is too large.\n\n")
-            continue
-
-        server.shell(
-            name="Download Ollama model: {}".format(model),
-            commands="ollama pull {}".format(model),
+if host.data.ai_benchmark == 'llama.cpp':
+    linux_name=host.get_fact(LinuxName)
+
+    if linux_name in ["Debian", "Ubuntu"]:
+        apt.packages(
+            name="Ensure prerequisites are installed (Debian).",
+            packages=[
+                "libvulkan-dev",
+                "glslc",
+                "cmake",
+                "libcurl4-openssl-dev",
+            ],
+            _sudo=True,
         )
 
-        ollama_benchmark_result = server.shell(
-            name="Benchmark Ollama model: {}".format(model),
-            commands="{}/ai-benchmarks/obench.sh -m {} -c 3 --markdown".format(working_dir, model),
+    if linux_name in ["CentOS", "RedHat", "Fedora"]:
+        dnf.packages(
+            name="Ensure prerequisites are installed (RedHat).",
+            packages=[
+                "vulkan-loader-devel",
+                "vulkan-validation-layers-devel",
+                "vulkan-tools",
+                "glslc",
+                "cmake",
+                "libcurl-devel",
+            ],
+            _sudo=True,
        )
 
-        logger.info(f"\n{ollama_benchmark_result.stdout}\n\n")
+    git.repo(
+        name="Clone llama.cpp with git.",
+        src="https://github.com/ggerganov/llama.cpp.git",
+        dest="{}/llama.cpp".format(working_dir),
+    )
+
+    llama_cpp_build_opts=host.data.llama_cpp_build_opts
+    server.shell(
+        name="Build llama.cpp",
+        commands=[
+            "cd {}/llama.cpp && cmake -B build {}".format(working_dir, llama_cpp_build_opts),
+            "cd {}/llama.cpp && cmake --build build --config Release".format(working_dir)
+        ]
+    )
+
+    llama_bench_opts=host.data.llama_bench_opts
+    def llama_cpp_loop_callback():
+        for model, model_details in host.data.llama_cpp_models.items():
+            # Accounting for multiple URL models.
+            counter = 0
+            total = len(model_details['urls'])
+
+            for url in model_details['urls']:
+                counter = counter + 1
+                filename = os.path.basename(urlparse(url).path)
+                files.download(
+                    name="Downloading model: {} (file {} of {})".format(model, counter, total),
+                    src=url,
+                    dest="{}/llama.cpp/models/{}".format(working_dir, filename),
+                )
+
+            llama_bench_result = server.shell(
+                name="Run llama-bench",
+                commands="cd {}/llama.cpp && ./build/bin/llama-bench -m models/{} {}".format(working_dir, model, llama_bench_opts),
+            )
+
+            logger.info(f"\n{llama_bench_result.stdout}\n")
+
+    python.call(
+        name="Execute llama.cpp loop",
+        function=llama_cpp_loop_callback,
+    )
+
+# TODO: Currently breaks, see https://github.com/pyinfra-dev/pyinfra/issues/1355
+elif host.data.ai_benchmark == 'ollama':
+    ollama_models={
+        'llama3.2:3b': 2000,
+        'llama3.1:8b': 4900,
+        'llama2:13b': 7400,
+        'deepseek-r1:1.5b': 1100,
+        'deepseek-r1:8b': 4900,
+        'deepseek-r1:14b': 9000,
+        'deepseek-r1:70b': 43000,
+    }
+
+    files.download(
+        name="Download Ollama Install Script",
+        src="https://ollama.com/install.sh",
+        dest="{}/install.sh".format(working_dir),
+    )
+
+    # Install Ollama if necessary (but not on RISC-V, for now). For RISC-V, see:
+    # https://github.com/geerlingguy/sbc-reviews/issues/65#issuecomment-2637866212
+    host_arch = host.get_fact(Arch)
+    if not host_arch == 'riscv64':
+        if not host.get_fact(File, path='/usr/local/bin/ollama'):
+            server.shell(
+                name="Run Ollama Install Script",
+                commands="sh {}/install.sh".format(working_dir),
+            )
+
+    git.repo(
+        name="Clone ai-benchmarks with git.",
+        src="https://github.com/geerlingguy/ai-benchmarks.git",
+        dest="{}/ai-benchmarks".format(working_dir),
+    )
+
+    def ollama_loop_callback():
+        for model, model_size in ollama_models.items():
+            # Skip a model if it's larger than the system RAM.
+            if (host_ram_size - (host_ram_size / 8)) < model_size:
+                logger.info(f"\nSkipping model {model} as it is too large.\n\n")
+                continue
+
+            server.shell(
+                name="Download Ollama model: {}".format(model),
+                commands="ollama pull {}".format(model),
+            )
+
+            ollama_benchmark_result = server.shell(
+                name="Benchmark Ollama model: {}".format(model),
+                commands="{}/ai-benchmarks/obench.sh -m {} -c 3 --markdown".format(working_dir, model),
+            )
+
+            logger.info(f"\n{ollama_benchmark_result.stdout}\n\n")
+
+    python.call(
+        name="Execute Ollama loop",
+        function=ollama_loop_callback,
+    )
 
-python.call(
-    name="Execute Ollama loop",
-    function=ollama_loop_callback,
-)
+else:
+    logger.info(f"Please specify a valid ai-benchmark option.")

benchmark/tasks/sbc-general-benchmark.py

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 from pyinfra.operations import files, python, server
 from pyinfra.facts.server import Home
 
-php_version="8.4" # TODO Maybe map 8.x for different Ubuntu / Debian versions.
+php_version=host.data.php_version # TODO Map 8.x for different deb versions.
 working_dir=host.get_fact(Home) + "/Downloads"
 
 files.download(
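
The TODO here suggests deriving the PHP version from the distribution release rather than hard-coding it in `group_data/all.py`. A rough sketch of that idea, assuming pyinfra's `LinuxDistribution` fact (which reports the distro name and major release) and a hand-maintained table; the versions shown are illustrative, so check what your release actually packages:

```
from pyinfra import host
from pyinfra.facts.server import LinuxDistribution

# Illustrative (distro name, major release) -> packaged PHP version table.
PHP_VERSIONS = {
    ("Ubuntu", 24): "8.3",
    ("Ubuntu", 22): "8.1",
    ("Debian", 12): "8.2",
}

distro = host.get_fact(LinuxDistribution)
php_version = PHP_VERSIONS.get(
    (distro["name"], distro["major"]),
    host.data.php_version,  # fall back to the group_data default
)
```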

benchmark/tasks/top500.py

Lines changed: 2 additions & 2 deletions

@@ -7,8 +7,8 @@
 
 # TODO: Make this dynamic based on CPU core count?
 # See: https://gist.github.com/CJCShadowsan/94efdf21539f3156414c1224b1c76605
-hpl_ps=1
-hpl_qs=6
+hpl_ps=host.data.hpl_ps
+hpl_qs=host.data.hpl_qs
 
 git.repo(
     name="Clone top500 with git.",
