Skip to content

Commit a6de86a

Browse files
jia-gao and claude authored
Add description and keywords metadata to experiment config (llm-d#898)
* feat: Add optional description and keywords metadata to experiment config

  Allow users to add a description (text + keywords) to their experiment
  configuration. The metadata is propagated to the harness pod via env vars,
  saved as description.yaml in each results directory, and included in the
  benchmark report v0.2 run section.

  Closes llm-d#715

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* chore: Update secrets baseline for shifted line numbers

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3cb4813 commit a6de86a

8 files changed

Lines changed: 54 additions & 3 deletions

File tree

.secrets.baseline

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"files": "^.secrets.baseline$",
44
"lines": null
55
},
6-
"generated_at": "2026-03-28T18:30:48Z",
6+
"generated_at": "2026-04-04T14:46:16Z",
77
"plugins_used": [
88
{
99
"name": "AWSKeyDetector"
@@ -155,7 +155,7 @@
155155
{
156156
"hashed_secret": "ea269867b057855f42084f04739114058bb26bb5",
157157
"is_verified": false,
158-
"line_number": 207,
158+
"line_number": 212,
159159
"type": "Base64 High Entropy String",
160160
"verified_result": null
161161
}

benchmark_report/br_v0_2_example.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ run: # These are details on the benchmark run that produced these results
1515
end: 2025-11-05T18:17:03Z
1616
duration: PT49.97206788184121S
1717
user: namasluk
18+
description: "Baseline latency test for Qwen3-0.6B on inference-scheduling stack"
19+
keywords:
20+
- baseline
21+
- latency
22+
- qwen3
1823

1924
# System inputs, combination of component configurations and workload
2025
scenario:

benchmark_report/native_to_br0_2.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ def _populate_run(ev_dict: dict) -> dict:
230230
except FileNotFoundError:
231231
namespace = ev_dict.get("vllm_common_namespace", "")
232232

233+
description_text = os.environ.get("LLMDBENCH_DESCRIPTION_TEXT")
234+
description_keywords = os.environ.get("LLMDBENCH_DESCRIPTION_KEYWORDS")
235+
233236
br_dict = {
234237
"run": {
235238
"eid": eid,
@@ -241,6 +244,12 @@ def _populate_run(ev_dict: dict) -> dict:
241244
"end": _get_harness_meta("harness_stop", "LLMDBENCH_HARNESS_STOP"),
242245
"duration": _get_harness_meta("harness_delta", "LLMDBENCH_HARNESS_DELTA"),
243246
},
247+
"description": description_text if description_text else None,
248+
"keywords": (
249+
[k.strip() for k in description_keywords.split(",") if k.strip()]
250+
if description_keywords
251+
else None
252+
),
244253
},
245254
}
246255
return br_dict

benchmark_report/schema_v0_2.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -921,6 +921,10 @@ class Run(BaseModel):
921921
"""Time details of experiment."""
922922
user: str | None = None
923923
"""Username that executed experiment."""
924+
description: str | None = None
925+
"""User-provided description of the experiment."""
926+
keywords: list[str] | None = None
927+
"""User-provided keywords/tags for the experiment."""
924928

925929

926930
class Scenario(BaseModel):

existing_stack/config_template.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ endpoint:
55
base_url: &url http://${GATEWAY_SVC}.${NAMESPACE}.svc.cluster.local:80 # Base URL of inference endpoint
66
hf_token_secret: llm-d-hf-token # The name of secret that contains the HF token of the stack
77

8+
# description: # optional: user-defined metadata to help identify and organize results
9+
# text: "My first benchmark test" # free-text description of the experiment
10+
# keywords: # tags / keywords for filtering results
11+
# - benchmark
12+
# - test_env
13+
814
control:
915
work_dir: $HOME/llm-d-bench-work # working directory to store temporary and autogenerated files.
1016
# Do not edit content manually.

existing_stack/run_only.sh

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ spec:
207207
${is_dataset_url} value: "${harness_dataset_url}"
208208
- name: LLMDBENCH_HARNESS_STACK_NAME
209209
value: "${endpoint_stack_name}"
210+
- name: LLMDBENCH_DESCRIPTION_TEXT
211+
value: "${_description_text}"
212+
- name: LLMDBENCH_DESCRIPTION_KEYWORDS
213+
value: "${_description_keywords}"
210214
volumeMounts:
211215
- name: results
212216
mountPath: ${RESULTS_DIR_PREFIX}
@@ -312,7 +316,11 @@ if ! [[ -f $_config_file ]]; then
312316
announce "❌ ERROR: could not find config file \"$_config_file\""
313317
exit 1
314318
fi
315-
eval $( yq -o shell '. | del(.workload)| del (.env)' "$_config_file")
319+
eval $( yq -o shell '. | del(.workload)| del (.env) | del(.description)' "$_config_file")
320+
321+
# Extract optional description metadata
322+
_description_text=$(yq '.description.text // ""' "$_config_file")
323+
_description_keywords=$(yq '.description.keywords // [] | join(",")' "$_config_file")
316324

317325
# Resolve workload hooks (CLI flags override config file values)
318326
# ========================================================
@@ -549,6 +557,16 @@ RUN_WORKLOAD
549557
: | ${_timeout} $control_kubectl exec -i ${_pod_name} -n ${harness_namespace} -- bash -c "$run_workload"
550558
res=$?
551559

560+
# Save description metadata to results directory
561+
if [[ -n "$_description_text" || -n "$_description_keywords" ]]; then
562+
_results_dir=$(results_dir_name "$endpoint_stack_name" "$harness_name" "$_run_experiment_id")
563+
$control_kubectl exec -i ${_pod_name} -n ${harness_namespace} -- bash -c "cat > ${_results_dir}/description.yaml <<'DESCEOF'
564+
description:
565+
text: \"${_description_text}\"
566+
keywords: [${_description_keywords}]
567+
DESCEOF"
568+
fi
569+
552570
# Run post-workload hook
553571
if [[ -n "${_post_workload}" ]]; then
554572
announce "🔧 Running post-workload hook..."

llmdbenchmark/analysis/benchmark_report/br_v0_2_example.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ run: # These are details on the benchmark run that produced these results
1515
end: 2025-11-05T18:17:03Z
1616
duration: PT49.97206788184121S
1717
user: namasluk
18+
description: "Baseline latency test for Qwen3-0.6B on inference-scheduling stack"
19+
keywords:
20+
- baseline
21+
- latency
22+
- qwen3
1823

1924
# System inputs, combination of component configurations and workload
2025
scenario:

llmdbenchmark/analysis/benchmark_report/schema_v0_2.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -928,6 +928,10 @@ class Run(BaseModel):
928928
"""Time details of experiment."""
929929
user: str | None = None
930930
"""Username that executed experiment."""
931+
description: str | None = None
932+
"""User-provided description of the experiment."""
933+
keywords: list[str] | None = None
934+
"""User-provided keywords/tags for the experiment."""
931935

932936

933937
class Scenario(BaseModel):

0 commit comments

Comments (0)