11 changes: 11 additions & 0 deletions kubernetes/observability/guidellm/Containerfile
@@ -0,0 +1,11 @@
FROM registry.access.redhat.com/ubi9/python-312:9.5-1744198409

RUN pip install --upgrade pip && pip install guidellm

# Default values; override these env vars in guidellm-job.yaml
ENV TARGET=http://localhost:8000/v1 \
    MODEL=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 \
    DATA_TYPE=emulated \
    DATA=prompt_tokens=512,generated_tokens=128

ENTRYPOINT ["guidellm"]
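
To build and push your own image rather than using the prebuilt one referenced in `guidellm-job.yaml`, a minimal sketch; the `quay.io/<your-org>` repository is a placeholder for your own registry:

```bash
podman build -t quay.io/<your-org>/guidellm:entrypoint -f Containerfile .
podman push quay.io/<your-org>/guidellm:entrypoint
```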
49 changes: 49 additions & 0 deletions kubernetes/observability/guidellm/README.md
@@ -0,0 +1,49 @@
## Run Guidellm to Evaluate & Optimize LLMs

[Guidellm](https://github.com/neuralmagic/guidellm/blob/main/README.md) is a powerful tool for evaluating and optimizing the deployment of large
language models (LLMs).

Here's an example of running `guidellm` against `meta-llama/Llama-3.2-3B-Instruct` deployed with the
[llama-serve manifest](../../llama-serve/llama3.2-3b/vllm.yaml). To evaluate any other served LLM, replace the
`--target` value and the references to `Llama-3.2-3B` in [guidellm-job.yaml](./guidellm-job.yaml), as shown in the sketch below.
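
A minimal retargeting sketch with `sed`; both substitution values are placeholders for your own service URL and model name:

```bash
# Point the evaluation at your own vLLM service and rename the report file.
sed -i \
  -e 's|http://llama32-3b.llama-serve.svc.cluster.local:8000/v1|http://my-model.my-namespace.svc.cluster.local:8000/v1|' \
  -e 's|llama32-3b|my-model|g' \
  guidellm-job.yaml
```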

### Run evaluation

```bash
oc apply -f pvc.yaml
oc apply -f guidellm-job.yaml
```
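
To follow progress while the evaluation runs, tail the job logs (the job name comes from [guidellm-job.yaml](./guidellm-job.yaml)):

```bash
oc logs -f job/guidellm-llama32-3b
```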

> **📝 NOTE:** The `HF_TOKEN` is passed to the job, but it is not necessary if you use the same PVC that stores your model.
> Guidellm uses the model's tokenizer/processor files during its evaluation. You can pass a local path instead with `--tokenizer=/path/to/model`,
> which eliminates the need for Guidellm to download the files from Hugging Face.
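
For reference, here is a sketch of the equivalent command-line flags with a local tokenizer path; the flags mirror those in [guidellm-job.yaml](./guidellm-job.yaml), and the tokenizer path is a placeholder:

```bash
guidellm \
  --target=http://llama32-3b.llama-serve.svc.cluster.local:8000/v1 \
  --tokenizer=/path/to/model \
  --data-type=emulated \
  --data=prompt_tokens=512,generated_tokens=128
```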

The PVC is RWX (ReadWriteMany), so all guidellm pods can access the results.
The job logs display formatted tables summarizing the results, and a large YAML report file is also written to the PVC. The evaluation for this model
takes roughly 25 minutes.

### Extract Guidellm Report

To extract the results, first run a job that compresses the report file. Then create an accessor pod from which you can
download the results with `oc rsync`.

```bash
oc apply -f retriever-job.yaml
oc apply -f accessor-pod.yaml
mkdir guidellm-reports
oc rsync guidellm-accessor:/mnt/output/guidellm-reports.tgz ./guidellm-reports
# ignore the "WARNING: cannot use rsync: rsync not available in container"; oc falls back to tar
```

You will now have a local `./guidellm-reports/guidellm-reports.tgz`. To extract it, run:

```bash
cd guidellm-reports
tar -xvf guidellm-reports.tgz
```

You will now have a local file `./guidellm-reports/llama32-3b.yaml`.
You can remove the accessor pod with:

```bash
oc delete pod guidellm-accessor
```
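
When you are finished, you can clean up the remaining resources as well (names taken from the manifests in this directory):

```bash
oc delete job guidellm-llama32-3b guidellm-extract
oc delete pvc guidellm-output-pvc
```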
18 changes: 18 additions & 0 deletions kubernetes/observability/guidellm/accessor-pod.yaml
@@ -0,0 +1,18 @@
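# Helper pod: stays alive for an hour so the compressed report can be copied
# off the results PVC with `oc rsync`.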
apiVersion: v1
kind: Pod
metadata:
  name: guidellm-accessor
spec:
  containers:
  - command:
    - sleep
    - "3600"
    image: registry.access.redhat.com/ubi9/ubi
    name: accessor
    volumeMounts:
    - mountPath: /mnt/output
      name: output
  volumes:
  - name: output
    persistentVolumeClaim:
      claimName: guidellm-output-pvc
49 changes: 49 additions & 0 deletions kubernetes/observability/guidellm/guidellm-job.yaml
@@ -0,0 +1,49 @@
# This job takes ~20min to complete.
# It creates a very large YAML report file. To extract the file, run:
#   oc apply -f retriever-job.yaml
#   oc apply -f accessor-pod.yaml
#   mkdir ./guidellm-reports
#   oc rsync guidellm-accessor:/mnt/output/guidellm-reports.tgz ./guidellm-reports
# You will now have a local ./guidellm-reports/guidellm-reports.tgz. To extract it, run:
#   cd guidellm-reports && tar -xvf guidellm-reports.tgz
# You will now have a local file ./guidellm-reports/llama32-3b.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: guidellm-llama32-3b
spec:
  template:
    spec:
      containers:
      - name: guidellm-llama32-3b
        image: quay.io/sallyom/guidellm:entrypoint
        imagePullPolicy: Always
        args:
        - --target=$(TARGET)
        - --data-type=$(DATA_TYPE)
        - --data=$(DATA)
        - --output-path=/output/llama32-3b.yaml
        env:
        # HF_TOKEN is not necessary if you share/use the model PVC. Guidellm needs to access the tokenizer file.
        # You can provide a path to the tokenizer file by passing `--tokenizer=/path/to/model`. If you do not
        # pass the tokenizer path, Guidellm will get the tokenizer file(s) from Hugging Face.
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              key: HF_TOKEN
              name: huggingface-secret
        - name: TARGET
          value: "http://llama32-3b.llama-serve.svc.cluster.local:8000/v1"
        - name: DATA_TYPE
          value: "emulated"
        - name: DATA
          value: "prompt_tokens=512,generated_tokens=128"
        volumeMounts:
        - name: output
          mountPath: /output
      restartPolicy: Never
      volumes:
      - name: output
        persistentVolumeClaim:
          claimName: guidellm-output-pvc
  backoffLimit: 0
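
Before launching the job, you can sanity-check that the target service is reachable from inside the cluster; a sketch using a throwaway pod (the URL matches `TARGET` above):

```bash
oc run target-check --rm -it --restart=Never \
  --image=registry.access.redhat.com/ubi9/ubi -- \
  curl -s http://llama32-3b.llama-serve.svc.cluster.local:8000/v1/models
```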
11 changes: 11 additions & 0 deletions kubernetes/observability/guidellm/pvc.yaml
@@ -0,0 +1,11 @@
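# Shared results PVC: the guidellm job writes reports here; the retriever job and
# accessor pod read them. NOTE: storageClassName assumes an RWX-capable nfs-csi
# provisioner exists on your cluster; adjust if yours differs.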
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: guidellm-output-pvc
spec:
  storageClassName: nfs-csi
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 1Gi
24 changes: 24 additions & 0 deletions kubernetes/observability/guidellm/retriever-job.yaml
@@ -0,0 +1,24 @@
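# Compresses all report YAML files on the results PVC into guidellm-reports.tgz
# so they can be copied out via the accessor pod.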
apiVersion: batch/v1
kind: Job
metadata:
  name: guidellm-extract
spec:
  template:
    spec:
      containers:
      - name: extract
        image: registry.access.redhat.com/ubi9/ubi
        command: ["sh", "-c"]
        args:
        - |
          echo "Packing reports...";
          cd /output && \
          tar czf guidellm-reports.tgz *.yaml
        volumeMounts:
        - name: output
          mountPath: /output
      restartPolicy: Never
      volumes:
      - name: output
        persistentVolumeClaim:
          claimName: guidellm-output-pvc