Merge pull request #103 from opendatahub-io/main

saichandrapandraju · web-flow · commit cf74509cf6bd · 2026-04-09T08:42:57.000-04:00
sync: main to incubation
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,7 +19,7 @@ dependencies = [
     "kfp-server-api>=2.14.6",
     "boto3>=1.35.88",
     # eval-hub integration
-    "eval-hub-sdk[adapter]>=0.1.4",
+    "eval-hub-sdk[adapter]==0.1.4",
     "pandas>=2.3.3",
     "Jinja2>=3.1.6",
 ]
diff --git a/src/llama_stack_provider_trustyai_garak/evalhub/garak_adapter.py b/src/llama_stack_provider_trustyai_garak/evalhub/garak_adapter.py
@@ -416,11 +416,18 @@ def _run_simple(
             )
         )
 
+        env: dict[str, str] = {}
+        hf_cache = (config.parameters or {}).get("hf_cache_path", "")
+        if hf_cache:
+            env["HF_HUB_CACHE"] = hf_cache
+            logger.info("Using HF cache from mounted path: %s", hf_cache)
+
         result = run_garak_scan(
             config_file=config_file,
             timeout_seconds=timeout,
             log_file=log_file,
             report_prefix=report_prefix,
+            env=env if env else None,
         )
 
         # AVID conversion
@@ -554,6 +561,7 @@ def _run_via_kfp(
             "sdg_max_concurrency": ip.get("sdg_max_concurrency", DEFAULT_SDG_MAX_CONCURRENCY),
             "sdg_num_samples": ip.get("sdg_num_samples", DEFAULT_SDG_NUM_SAMPLES),
             "sdg_max_tokens": ip.get("sdg_max_tokens", DEFAULT_SDG_MAX_TOKENS),
+            "hf_cache_path": benchmark_config.get("hf_cache_path", ""),
         }
         if model_auth_secret:
             pipeline_args["model_auth_secret_name"] = model_auth_secret
@@ -595,7 +603,6 @@ def _run_via_kfp(
                     ),
                 )
             )
-            s3_bucket = kfp_config.s3_bucket or os.getenv("AWS_S3_BUCKET", "")
             creds = (
                 self._read_s3_credentials_from_secret(
                     kfp_config.s3_secret_name,
@@ -616,11 +623,13 @@ def _run_via_kfp(
                     kfp_config.s3_secret_name,
                     kfp_config.namespace,
                 )
+            sec_bucket, sec_endpoint = creds.pop("bucket", ""), creds.pop("endpoint_url", "")  # keep out of **creds
+            s3_bucket = kfp_config.s3_bucket or sec_bucket or os.getenv("AWS_S3_BUCKET", "")
             self._download_results_from_s3(
                 s3_bucket,
                 s3_prefix,
                 scan_dir,
-                endpoint_url=kfp_config.s3_endpoint or None,
+                endpoint_url=kfp_config.s3_endpoint or sec_endpoint or None,
                 **creds,
             )
 
@@ -770,6 +779,8 @@ def _decode(key: str) -> str:
                 "access_key": _decode("AWS_ACCESS_KEY_ID"),
                 "secret_key": _decode("AWS_SECRET_ACCESS_KEY"),
                 "region": _decode("AWS_DEFAULT_REGION"),
+                "bucket": _decode("AWS_S3_BUCKET"),
+                "endpoint_url": _decode("AWS_S3_ENDPOINT"),
             }
         except Exception as exc:
             logger.warning("Could not read S3 credentials from secret %s/%s: %s", namespace, secret_name, exc)
@@ -1289,6 +1300,22 @@ def _parse_results(
         )
         overall_summary = combined.get("scores", {}).get("_overall", {}).get("aggregated_results", {})
 
+        overall_asr = overall_summary.get("attack_success_rate")
+        if overall_asr is not None:
+            try:
+                overall_asr = float(overall_asr)
+            except (TypeError, ValueError):
+                overall_asr = None
+        if overall_asr is not None:
+            metrics.append(
+                EvaluationResult(
+                    metric_name="attack_success_rate",
+                    metric_value=overall_asr,
+                    metric_type="percentage",
+                    num_samples=overall_summary.get("total_attempts"),
+                )
+            )
+
         # Convert to EvaluationResult format (one per probe)
         for probe_name, score_data in combined["scores"].items():
             if probe_name == "_overall":
diff --git a/src/llama_stack_provider_trustyai_garak/evalhub/kfp_pipeline.py b/src/llama_stack_provider_trustyai_garak/evalhub/kfp_pipeline.py
@@ -441,6 +441,7 @@ def garak_scan(
     config_json: str,
     s3_prefix: str,
     timeout_seconds: int,
+    hf_cache_path: str,
     prompts_dataset: dsl.Input[dsl.Dataset],
 ) -> NamedTuple("Outputs", [("success", bool), ("return_code", int)]):
     """Run a Garak scan and upload output to S3 via Data Connection credentials.
@@ -469,6 +470,54 @@ def garak_scan(
     )
     from llama_stack_provider_trustyai_garak.errors import GarakError
 
+    if hf_cache_path and hf_cache_path.strip():
+        from llama_stack_provider_trustyai_garak.evalhub.s3_utils import create_s3_client
+
+        if hf_cache_path.startswith("s3://"):
+            parts = hf_cache_path[len("s3://") :].split("/", 1)
+            bucket = parts[0]
+            prefix = parts[1] if len(parts) > 1 else ""
+        else:
+            bucket = os.environ.get("AWS_S3_BUCKET", "")
+            prefix = hf_cache_path.lstrip("/")
+
+        if not bucket:
+            raise GarakError(
+                "Cannot determine S3 bucket for HF cache. "
+                "Provide a full s3://bucket/prefix URI in hf_cache_path, "
+                "or set AWS_S3_BUCKET."
+            )
+
+        if not prefix:
+            log.warning(
+                "hf_cache_path has no sub-prefix; downloading all objects from bucket '%s'.",
+                bucket,
+            )
+
+        hf_cache_dir = Path(tempfile.mkdtemp(prefix="hf-cache-"))
+        s3 = create_s3_client()
+        downloaded = 0
+
+        paginator = s3.get_paginator("list_objects_v2")
+        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+            for obj in page.get("Contents", []):
+                rel = obj["Key"][len(prefix) :].lstrip("/")
+                if not rel or rel.endswith("/"):  # also skip zero-byte "folder" marker keys
+                    continue
+                dest = hf_cache_dir / rel
+                dest.parent.mkdir(parents=True, exist_ok=True)
+                s3.download_file(bucket, obj["Key"], str(dest))
+                downloaded += 1
+
+        os.environ["HF_HUB_CACHE"] = str(hf_cache_dir)
+        log.info(
+            "Populated HF cache from s3://%s/%s -> %s (%d files)",
+            bucket,
+            prefix,
+            hf_cache_dir,
+            downloaded,
+        )
+
     scan_dir = Path(tempfile.mkdtemp(prefix="garak-scan-"))
 
     prompts_path = Path(prompts_dataset.path)
@@ -639,6 +688,7 @@ def evalhub_garak_pipeline(
     sdg_max_concurrency: int = DEFAULT_SDG_MAX_CONCURRENCY,
     sdg_num_samples: int = DEFAULT_SDG_NUM_SAMPLES,
     sdg_max_tokens: int = DEFAULT_SDG_MAX_TOKENS,
+    hf_cache_path: str = "",
 ):
     """Six-step pipeline: validate, resolve taxonomy, SDG, prepare prompts, scan, write outputs.
 
@@ -736,6 +786,7 @@ def evalhub_garak_pipeline(
         config_json=config_json,
         s3_prefix=s3_prefix,
         timeout_seconds=timeout_seconds,
+        hf_cache_path=hf_cache_path,
         prompts_dataset=prep_task.outputs["prompts_dataset"],
     )
     scan_task.set_caching_options(False)
diff --git a/src/llama_stack_provider_trustyai_garak/remote/garak_remote_eval.py b/src/llama_stack_provider_trustyai_garak/remote/garak_remote_eval.py
@@ -211,6 +211,7 @@ async def run_eval(self, request: RunEvalRequest) -> Job:
                         provider_params.get("sdg_max_tokens", DEFAULT_SDG_MAX_TOKENS),
                         DEFAULT_SDG_MAX_TOKENS,
                     ),
+                    "hf_cache_path": provider_params.get("hf_cache_path", ""),
                 },
                 run_name=f"garak-{benchmark_id.split('::')[-1]}-{job_id.removeprefix(JOB_ID_PREFIX)}",
                 namespace=self._config.kubeflow_config.namespace,
diff --git a/src/llama_stack_provider_trustyai_garak/remote/kfp_utils/components.py b/src/llama_stack_provider_trustyai_garak/remote/kfp_utils/components.py
@@ -397,6 +397,7 @@ def garak_scan(
     job_id: str,
     timeout_seconds: int,
     verify_ssl: str,
+    hf_cache_path: str,
     prompts_dataset: dsl.Input[dsl.Dataset],
 ) -> NamedTuple(
     "Outputs",
@@ -419,13 +420,19 @@ def garak_scan(
     )
     log = logging.getLogger("garak_scan")
 
+    import os
+
     from llama_stack_client import LlamaStackClient
     from llama_stack_provider_trustyai_garak.utils import get_http_client_with_tls
     from llama_stack_provider_trustyai_garak.core.pipeline_steps import (
         setup_and_run_garak,
         redact_api_keys,
     )
 
+    if hf_cache_path and hf_cache_path.strip():
+        os.environ["HF_HUB_CACHE"] = hf_cache_path
+        log.info("Set HF_HUB_CACHE=%s for disconnected mode", hf_cache_path)
+
     scan_dir = Path(tempfile.mkdtemp(prefix="garak-scan-"))
 
     prompts_path = Path(prompts_dataset.path)
diff --git a/src/llama_stack_provider_trustyai_garak/remote/kfp_utils/pipeline.py b/src/llama_stack_provider_trustyai_garak/remote/kfp_utils/pipeline.py
@@ -53,6 +53,7 @@ def garak_scan_pipeline(
     sdg_max_concurrency: int = DEFAULT_SDG_MAX_CONCURRENCY,
     sdg_num_samples: int = DEFAULT_SDG_NUM_SAMPLES,
     sdg_max_tokens: int = DEFAULT_SDG_MAX_TOKENS,
+    hf_cache_path: str = "",
 ):
     """Six-step pipeline: validate, resolve taxonomy, SDG, prepare prompts, scan, parse.
 
@@ -120,6 +121,7 @@ def garak_scan_pipeline(
         job_id=job_id,
         timeout_seconds=timeout_seconds,
         verify_ssl=verify_ssl,
+        hf_cache_path=hf_cache_path,
         prompts_dataset=prep_task.outputs["prompts_dataset"],
     )
     scan_task.set_caching_options(False)
diff --git a/tests/test_evalhub_adapter.py b/tests/test_evalhub_adapter.py
@@ -189,6 +189,88 @@ def create_oci_artifact(self, _spec):
     assert captured["timeout_seconds"] == 42
 
 
+def test_simple_mode_passes_hf_cache_env(monkeypatch, tmp_path):
+    """When hf_cache_path is set, _run_simple passes HF_HUB_CACHE via env to run_garak_scan."""
+    module = _load_evalhub_garak_adapter(monkeypatch)
+    adapter = module.GarakAdapter()
+    monkeypatch.setenv("GARAK_SCAN_DIR", str(tmp_path))  # NOTE(review): presumably keeps scan output under tmp_path; confirm
+
+    captured: dict[str, object] = {}  # filled in by the fake scan below
+
+    def _fake_run_garak_scan(config_file, timeout_seconds, report_prefix, env=None, log_file=None):
+        captured["env"] = env  # record the env kwarg for the final assertion
+        report_prefix.with_suffix(".report.jsonl").write_text("{}", encoding="utf-8")  # stub report file
+        return module.GarakScanResult(returncode=0, stdout="", stderr="", report_prefix=report_prefix)
+
+    monkeypatch.setattr(module, "run_garak_scan", _fake_run_garak_scan)  # swap in the recording fake
+    monkeypatch.setattr(module, "convert_to_avid_report", lambda _path: True)  # stub always returns True
+    monkeypatch.setattr(  # replace result parsing with a fixed 4-tuple
+        module.GarakAdapter,
+        "_parse_results",
+        lambda self, result, eval_threshold, art_intents=False: ([], None, 0, {"total_attempts": 0}),
+    )
+
+    class _Callbacks:  # minimal callbacks object handed to run_benchmark_job
+        def report_status(self, _update):
+            return None
+
+        def create_oci_artifact(self, _spec):
+            return SimpleNamespace(reference="oci://ref", digest="sha256:test")
+
+    job = SimpleNamespace(
+        id="hf-cache-job",
+        benchmark_id="trustyai_garak::quick",
+        benchmark_index=0,
+        model=SimpleNamespace(url="http://localhost:8000", name="test-model"),
+        parameters={"hf_cache_path": "/test_data/hf-cache"},  # value expected to surface as HF_HUB_CACHE
+        exports=None,
+    )
+
+    adapter.run_benchmark_job(job, _Callbacks())
+    assert captured["env"] == {"HF_HUB_CACHE": "/test_data/hf-cache"}  # env carries exactly one key
+
+
+def test_simple_mode_no_hf_cache_passes_none_env(monkeypatch, tmp_path):
+    """When hf_cache_path is not set, env=None is passed (default behavior)."""
+    module = _load_evalhub_garak_adapter(monkeypatch)
+    adapter = module.GarakAdapter()
+    monkeypatch.setenv("GARAK_SCAN_DIR", str(tmp_path))  # NOTE(review): presumably keeps scan output under tmp_path; confirm
+
+    captured: dict[str, object] = {}  # filled in by the fake scan below
+
+    def _fake_run_garak_scan(config_file, timeout_seconds, report_prefix, env=None, log_file=None):
+        captured["env"] = env  # record the env kwarg for the final assertion
+        report_prefix.with_suffix(".report.jsonl").write_text("{}", encoding="utf-8")  # stub report file
+        return module.GarakScanResult(returncode=0, stdout="", stderr="", report_prefix=report_prefix)
+
+    monkeypatch.setattr(module, "run_garak_scan", _fake_run_garak_scan)  # swap in the recording fake
+    monkeypatch.setattr(module, "convert_to_avid_report", lambda _path: True)  # stub always returns True
+    monkeypatch.setattr(  # replace result parsing with a fixed 4-tuple
+        module.GarakAdapter,
+        "_parse_results",
+        lambda self, result, eval_threshold, art_intents=False: ([], None, 0, {"total_attempts": 0}),
+    )
+
+    class _Callbacks:  # minimal callbacks object handed to run_benchmark_job
+        def report_status(self, _update):
+            return None
+
+        def create_oci_artifact(self, _spec):
+            return SimpleNamespace(reference="oci://ref", digest="sha256:test")
+
+    job = SimpleNamespace(
+        id="no-hf-cache-job",
+        benchmark_id="trustyai_garak::quick",
+        benchmark_index=0,
+        model=SimpleNamespace(url="http://localhost:8000", name="test-model"),
+        parameters={},  # no hf_cache_path supplied
+        exports=None,
+    )
+
+    adapter.run_benchmark_job(job, _Callbacks())
+    assert captured["env"] is None  # an empty env dict must be passed as None, not {}
+
+
 def test_parse_results_uses_overall_without_double_count(monkeypatch, tmp_path):
     module = _load_evalhub_garak_adapter(monkeypatch)
     adapter = module.GarakAdapter()
@@ -245,8 +327,11 @@ def test_parse_results_uses_overall_without_double_count(monkeypatch, tmp_path):
     )
     metrics, overall_score, num_examples, overall_summary = adapter._parse_results(result, 0.5)
 
-    assert len(metrics) == 1
-    assert metrics[0].metric_name == "probe.alpha_asr"
+    assert len(metrics) == 2
+    assert metrics[0].metric_name == "attack_success_rate"
+    assert metrics[0].metric_value == 30.0
+    assert metrics[0].num_samples == 10
+    assert metrics[1].metric_name == "probe.alpha_asr"
     assert overall_score == 30.0
     assert num_examples == 10
     assert overall_summary["tbsa"] == 4.1
@@ -1882,15 +1967,18 @@ def test_parse_results_with_art_intents(self, monkeypatch, tmp_path):
             art_intents=True,
         )
 
-        assert len(metrics) == 1
-        assert metrics[0].metric_name == "spo.SPOIntent_asr"
+        assert len(metrics) == 2
+        assert metrics[0].metric_name == "attack_success_rate"
         assert metrics[0].metric_value == 30.0
-        assert metrics[0].num_samples is None
-        assert metrics[0].metadata["total_attempts"] == 20
-        assert metrics[0].metadata["unsafe_stubs"] == 3
-        assert metrics[0].metadata["safe_stubs"] == 7
-        assert "intent_breakdown" in metrics[0].metadata
-        assert metrics[0].metadata["intent_breakdown"]["S001"]["unsafe_stubs"] == 2
+        assert metrics[0].num_samples == 20
+        assert metrics[1].metric_name == "spo.SPOIntent_asr"
+        assert metrics[1].metric_value == 30.0
+        assert metrics[1].num_samples is None
+        assert metrics[1].metadata["total_attempts"] == 20
+        assert metrics[1].metadata["unsafe_stubs"] == 3
+        assert metrics[1].metadata["safe_stubs"] == 7
+        assert "intent_breakdown" in metrics[1].metadata
+        assert metrics[1].metadata["intent_breakdown"]["S001"]["unsafe_stubs"] == 2
         assert overall_score == 30.0
         assert num_examples == 20
 

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ dependencies = [`
`19`	`19`	`"kfp-server-api>=2.14.6",`
`20`	`20`	`"boto3>=1.35.88",`
`21`	`21`	`# eval-hub integration`
`22`		`- "eval-hub-sdk[adapter]>=0.1.4",`
	`22`	`+ "eval-hub-sdk[adapter]==0.1.4",`
`23`	`23`	`"pandas>=2.3.3",`
`24`	`24`	`"Jinja2>=3.1.6",`
`25`	`25`	`]`