Skip to content

Commit a6de86a

Browse files
jia-gao and claude authored
Add description and keywords metadata to experiment config (llm-d#898)
* feat: Add optional description and keywords metadata to experiment config

  Allow users to add a description (text + keywords) to their experiment
  configuration. The metadata is propagated to the harness pod via env vars,
  saved as description.yaml in each results directory, and included in the
  benchmark report v0.2 run section.

  Closes llm-d#715

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* chore: Update secrets baseline for shifted line numbers

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3cb4813 commit a6de86a

8 files changed

Lines changed: 54 additions & 3 deletions

File tree

.secrets.baseline

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"files": "^.secrets.baseline$",
44
"lines": null
55
},
6-
"generated_at": "2026-03-28T18:30:48Z",
6+
"generated_at": "2026-04-04T14:46:16Z",
77
"plugins_used": [
88
{
99
"name": "AWSKeyDetector"
@@ -155,7 +155,7 @@
155155
{
156156
"hashed_secret": "ea269867b057855f42084f04739114058bb26bb5",
157157
"is_verified": false,
158-
"line_number": 207,
158+
"line_number": 212,
159159
"type": "Base64 High Entropy String",
160160
"verified_result": null
161161
}

benchmark_report/br_v0_2_example.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ run: # These are details on the benchmark run that produced these results
1515
end: 2025-11-05T18:17:03Z
1616
duration: PT49.97206788184121S
1717
user: namasluk
18+
description: "Baseline latency test for Qwen3-0.6B on inference-scheduling stack"
19+
keywords:
20+
- baseline
21+
- latency
22+
- qwen3
1823

1924
# System inputs, combination of component configurations and workload
2025
scenario:

benchmark_report/native_to_br0_2.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ def _populate_run(ev_dict: dict) -> dict:
230230
except FileNotFoundError:
231231
namespace = ev_dict.get("vllm_common_namespace", "")
232232

233+
description_text = os.environ.get("LLMDBENCH_DESCRIPTION_TEXT")
234+
description_keywords = os.environ.get("LLMDBENCH_DESCRIPTION_KEYWORDS")
235+
233236
br_dict = {
234237
"run": {
235238
"eid": eid,
@@ -241,6 +244,12 @@ def _populate_run(ev_dict: dict) -> dict:
241244
"end": _get_harness_meta("harness_stop", "LLMDBENCH_HARNESS_STOP"),
242245
"duration": _get_harness_meta("harness_delta", "LLMDBENCH_HARNESS_DELTA"),
243246
},
247+
"description": description_text if description_text else None,
248+
"keywords": (
249+
[k.strip() for k in description_keywords.split(",") if k.strip()]
250+
if description_keywords
251+
else None
252+
),
244253
},
245254
}
246255
return br_dict

benchmark_report/schema_v0_2.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -921,6 +921,10 @@ class Run(BaseModel):
921921
"""Time details of experiment."""
922922
user: str | None = None
923923
"""Username that executed experiment."""
924+
description: str | None = None
925+
"""User-provided description of the experiment."""
926+
keywords: list[str] | None = None
927+
"""User-provided keywords/tags for the experiment."""
924928

925929

926930
class Scenario(BaseModel):

existing_stack/config_template.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ endpoint:
55
base_url: &url http://${GATEWAY_SVC}.${NAMESPACE}.svc.cluster.local:80 # Base URL of inference endpoint
66
hf_token_secret: llm-d-hf-token # The name of secret that contains the HF token of the stack
77

8+
# description: # optional: user-defined metadata to help identify and organize results
9+
# text: "My first benchmark test" # free-text description of the experiment
10+
# keywords: # tags / keywords for filtering results
11+
# - benchmark
12+
# - test_env
13+
814
control:
915
work_dir: $HOME/llm-d-bench-work # working directory to store temporary and autogenerated files.
1016
# Do not edit content manually.

existing_stack/run_only.sh

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ spec:
207207
${is_dataset_url} value: "${harness_dataset_url}"
208208
- name: LLMDBENCH_HARNESS_STACK_NAME
209209
value: "${endpoint_stack_name}"
210+
- name: LLMDBENCH_DESCRIPTION_TEXT
211+
value: "${_description_text}"
212+
- name: LLMDBENCH_DESCRIPTION_KEYWORDS
213+
value: "${_description_keywords}"
210214
volumeMounts:
211215
- name: results
212216
mountPath: ${RESULTS_DIR_PREFIX}
@@ -312,7 +316,11 @@ if ! [[ -f $_config_file ]]; then
312316
announce "❌ ERROR: could not find config file \"$_config_file\""
313317
exit 1
314318
fi
315-
eval $( yq -o shell '. | del(.workload)| del (.env)' "$_config_file")
319+
eval $( yq -o shell '. | del(.workload)| del (.env) | del(.description)' "$_config_file")
320+
321+
# Extract optional description metadata
322+
_description_text=$(yq '.description.text // ""' "$_config_file")
323+
_description_keywords=$(yq '.description.keywords // [] | join(",")' "$_config_file")
316324

317325
# Resolve workload hooks (CLI flags override config file values)
318326
# ========================================================
@@ -549,6 +557,16 @@ RUN_WORKLOAD
549557
: | ${_timeout} $control_kubectl exec -i ${_pod_name} -n ${harness_namespace} -- bash -c "$run_workload"
550558
res=$?
551559

560+
# Save description metadata to results directory
561+
if [[ -n "$_description_text" || -n "$_description_keywords" ]]; then
562+
_results_dir=$(results_dir_name "$endpoint_stack_name" "$harness_name" "$_run_experiment_id")
563+
$control_kubectl exec -i ${_pod_name} -n ${harness_namespace} -- bash -c "cat > ${_results_dir}/description.yaml <<'DESCEOF'
564+
description:
565+
text: \"${_description_text}\"
566+
keywords: [${_description_keywords}]
567+
DESCEOF"
568+
fi
569+
552570
# Run post-workload hook
553571
if [[ -n "${_post_workload}" ]]; then
554572
announce "🔧 Running post-workload hook..."

llmdbenchmark/analysis/benchmark_report/br_v0_2_example.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ run: # These are details on the benchmark run that produced these results
1515
end: 2025-11-05T18:17:03Z
1616
duration: PT49.97206788184121S
1717
user: namasluk
18+
description: "Baseline latency test for Qwen3-0.6B on inference-scheduling stack"
19+
keywords:
20+
- baseline
21+
- latency
22+
- qwen3
1823

1924
# System inputs, combination of component configurations and workload
2025
scenario:

llmdbenchmark/analysis/benchmark_report/schema_v0_2.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -928,6 +928,10 @@ class Run(BaseModel):
928928
"""Time details of experiment."""
929929
user: str | None = None
930930
"""Username that executed experiment."""
931+
description: str | None = None
932+
"""User-provided description of the experiment."""
933+
keywords: list[str] | None = None
934+
"""User-provided keywords/tags for the experiment."""
931935

932936

933937
class Scenario(BaseModel):

0 commit comments

Comments (0)