Commit 76c865c

feat(byob): add explicit few-shot dataset support (#993)
## Summary

Adds explicit BYOB few-shot controls for benchmarks where a split-only rewrite is not enough.

- Add `fewshot_dataset` to `@benchmark` so tasks can provide an exact few-shot dataset URI/path, including filters, configs, `data_files`, and other query params.
- Add `fewshot_prefix` to prepend static text before rendered few-shot examples.
- Fix `--num-fewshot 0` so it overrides non-zero benchmark defaults and enables true 0-shot validation.
- Save BYOB predictions by default from generated command templates.
- Expose `fewshot_dataset` and `fewshot_prefix` in generated FDF `config.params.extra.dataset`.
- Update docs and tests for precedence, fallback, prefix rendering, and explicit 0-shot behavior.

## Why

`fewshot_split` only works for simple datasets where changing `split` is enough. Some datasets also require `filter_field` / `filter_value`, `data_files`, configs, or other URI parameters. Reconstructing those generically is fragile and can produce mixed-language or wrong-source few-shot examples. This change lets benchmark authors provide the exact few-shot source when needed while preserving existing `fewshot_split` behavior.

## Test Plan

```bash
cd packages/nemo-evaluator
uv run python -m pytest \
  tests/unit_tests/byob/test_byob_decorators.py::TestBenchmarkLogprobFields \
  tests/unit_tests/byob/test_byob_eval_logic.py::TestFewshotPrefix \
  tests/unit_tests/byob/test_byob_eval_logic.py::TestBuildFewshotExamples \
  tests/unit_tests/byob/test_byob_compiler.py::TestBuildFdfHelper::test_fdf_groups_dataset_config_under_extra_dataset \
  tests/unit_tests/byob/test_byob_runner.py::TestFewshotOverride
```

Signed-off-by: kanishks <kanishks@nvidia.com>
1 parent 231526c commit 76c865c

10 files changed: 430 additions & 57 deletions

docs/libraries/nemo-evaluator/extending/byob/benchmark-decorator.md

Lines changed: 3 additions & 1 deletion
```diff
@@ -30,7 +30,9 @@ def check(sample: ScorerInput) -> dict:
 | `choices` | `list[str]` | `None` | Static candidate continuations for `endpoint_type="completions_logprob"` |
 | `choices_field` | `str` | `None` | Dataset field containing per-row candidate continuations for `endpoint_type="completions_logprob"`; dotted paths such as `choices.text` are supported |
 | `num_fewshot` | `int` | `0` | Number of few-shot examples to prepend to each prompt |
-| `fewshot_split` | `str` | `None` | Optional split to sample few-shot examples from |
+| `fewshot_dataset` | `str` | `None` | Optional explicit dataset URI/path to sample few-shot examples from. Use when the few-shot source needs filters, `data_files`, configs, or other URI options that cannot be expressed by a split name alone. Takes precedence over `fewshot_split`. |
+| `fewshot_split` | `str` | `None` | Optional split name to sample few-shot examples from when the primary `dataset` is an `hf://` URI. Used only if `fewshot_dataset` is not set or fails to load. |
+| `fewshot_prefix` | `str` | `""` | Optional static text prepended once before the rendered few-shot examples (e.g. `"The following are multiple-choice questions...\n\n"`). |
 | `fewshot_template` | `str` | `None` | Optional template for rendering few-shot examples |
 | `fewshot_separator` | `str` | `"\n\n"` | Separator between rendered few-shot examples |
 
```
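For orientation, a hedged sketch of how the three few-shot parameters from this table combine on one benchmark. The task name, URIs, and scorer body are illustrative placeholders, not from this commit; real scorers take a `ScorerInput` and return a `dict` per the decorator docs.

```python
from nemo_evaluator.contrib.byob.decorators import benchmark

@benchmark(
    name="example-task",                       # hypothetical task name
    dataset="hf://my-org/example?split=test",  # placeholder URI
    prompt="Question: {question}\nAnswer:",
    target_field="answer",
    num_fewshot=3,
    # Explicit few-shot source; takes precedence over fewshot_split.
    fewshot_dataset="hf://my-org/example?split=train",
    # Used only if fewshot_dataset is unset or fails to load.
    fewshot_split="train",
    # Emitted once, before the rendered examples.
    fewshot_prefix="Answer the following questions.\n\n",
)
def check(sample):
    # Scorer body elided in this sketch.
    ...
```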

docs/libraries/nemo-evaluator/extending/byob/datasets.md

Lines changed: 87 additions & 15 deletions
````diff
@@ -144,28 +144,31 @@ Otherwise the shell treats `&` as a background-command separator.
 
 ### `extra.dataset.*` namespace
 
-BYOB groups dataset-related configuration under
-`config.params.extra.dataset.*` in the FDF / run_config:
+BYOB exposes two dataset-related keys under `config.params.extra.dataset.*`
+that can be overridden at run time without rebuilding the benchmark:
 
-| Key | Description |
-|-----|-------------|
-| `path` | Dataset file path or `hf://` URI (compile-time default from `@benchmark(dataset=...)`). |
-| `num_fewshot` | Optional few-shot example count (lm-eval-harness parity). |
-| `field_mapping` | Informational mirror of `@benchmark(field_mapping=...)`. |
-| `choices` / `choices_field` | Informational mirror of `@benchmark(choices=...)` / `@benchmark(choices_field=...)`. |
+| Key | CLI flag | Description |
+|-----|----------|-------------|
+| `path` | `--dataset` | Dataset file path or `hf://` URI. Compile-time default from `@benchmark(dataset=...)`. |
+| `num_fewshot` | `--num-fewshot` | Few-shot example count (lm-eval-harness parity). Pass `0` to force true 0-shot for a benchmark that declares a non-zero default. |
+
+All other dataset-related options (`field_mapping`, `choices`, `choices_field`,
+`fewshot_dataset`, `fewshot_prefix`, `fewshot_split`, etc.) are baked into the
+benchmark at compile time from the `@benchmark(...)` decorator and are not
+runtime-overridable — change them in your benchmark module and recompile with
+`nemo-evaluator-byob compile`.
 
 ### Overriding the dataset at run time
 
-The `@benchmark` decorator's `dataset=` value is the compile-time default. To
-swap it for a single run without rebuilding the benchmark, set
-`config.params.extra.dataset.path` via the launcher's run_config or CLI. The
-launcher deep-merges via OmegaConf, so sibling keys under `extra.dataset`
-(`num_fewshot`, `field_mapping`, etc.) and under `extra` (`benchmark_module`,
-`requirements`, …) are preserved.
+To swap `path` or `num_fewshot` for a single run, set the corresponding key
+under `config.params.extra.dataset.*` via the launcher's run_config or CLI.
+The launcher deep-merges via OmegaConf, so sibling keys (and unrelated keys
+under `extra` such as `benchmark_module`, `requirements`, …) are preserved.
 
 ```bash
 nemo-evaluator-launcher run --config my_config.yaml \
-  -o 'evaluation.tasks.<task_name>.nemo_evaluator_config.config.params.extra.dataset.path=hf://other/foo?split=test'
+  -o 'evaluation.tasks.<task_name>.nemo_evaluator_config.config.params.extra.dataset.path=hf://other/foo?split=test' \
+  -o 'evaluation.tasks.<task_name>.nemo_evaluator_config.config.params.extra.dataset.num_fewshot=0'
 ```
 
 Or in a run_config YAML:
@@ -183,6 +186,75 @@ evaluation:
         num_fewshot: 5
 ```
 
+## Few-shot Examples
+
+BYOB resolves the few-shot example pool with this precedence:
+
+1. **`fewshot_dataset`** — explicit URI/path. Use this when the few-shot
+   source needs filters, `data_files`, configs, or any other URI options
+   that cannot be expressed by a split name (e.g.
+   `hf://my-org/foo?data_files=train.json&filter_field=lang&filter_value=hi`).
+2. **`fewshot_split`** — split name reused with the primary `hf://` dataset.
+   Used only when `fewshot_dataset` is unset *or* fails to load.
+3. **Tail of the primary dataset** — last-resort fallback. Logs a loud
+   warning because the few-shot pool overlaps with rows being evaluated,
+   risking gold-answer leakage into the prompt.
+
+### Examples
+
+Few-shot from a different split of the same HuggingFace dataset:
+
+```python
+@benchmark(
+    name="mmlu-mini",
+    dataset="hf://my-org/mmlu?split=test",
+    prompt="Question: {question}\nAnswer:",
+    target_field="answer",
+    num_fewshot=5,
+    fewshot_split="dev",
+)
+```
+
+Few-shot from a completely different dataset URI (filters, data_files, etc.):
+
+```python
+@benchmark(
+    name="boolq-hi",
+    dataset="hf://sarvamai/boolq-indic?split=validation&filter_field=language&filter_value=hi",
+    prompt="Passage: {passage}\nQuestion: {question}\nAnswer:",
+    target_field="answer",
+    num_fewshot=4,
+    fewshot_dataset="hf://sarvamai/boolq-indic?split=train&filter_field=language&filter_value=hi",
+)
+```
+
+Add a static introduction before the few-shot examples:
+
+```python
+@benchmark(
+    name="indommlu",
+    dataset="hf://indolem/IndoMMLU?split=test&trust_remote_code=true",
+    prompt="{question}\n\n{options}\n\nAnswer:",
+    target_field="answer",
+    num_fewshot=5,
+    fewshot_split="train",
+    fewshot_prefix="The following are multiple-choice questions. Choose the best answer.\n\n",
+)
+```
+
+The final prompt sent to the model is:
+
+```text
+<fewshot_prefix><example_1><fewshot_separator>...<example_N><fewshot_separator><test_prompt>
+```
+
+:::{tip}
+At run time you can force a true 0-shot evaluation against a benchmark
+that declares a non-zero `num_fewshot` by passing `--num-fewshot 0` on
+the `nemo-evaluator run_eval` CLI. The flag is `None` by default; an
+explicit `0` overrides the benchmark default.
+:::
+
 ## Field Mapping
 
 Use `field_mapping` to rename dataset columns so they match the `{placeholder}` names in your prompt template. The mapping is applied after loading the dataset and before prompt rendering.
````

packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/compiler.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -84,6 +84,10 @@
     " and config.params.extra.dataset.num_fewshot is not none %}"
     " --num-fewshot {{config.params.extra.dataset.num_fewshot}}"
     "{% endif %}"
+    "{% if config.params.extra.save_predictions is not defined"
+    " or config.params.extra.save_predictions %}"
+    " --save-predictions"
+    "{% endif %}"
 )
 
 
@@ -113,6 +117,10 @@ def _build_fdf(
         dataset_params["field_mapping"] = bench.field_mapping
     if bench.num_fewshot:
        dataset_params["num_fewshot"] = bench.num_fewshot
+    if bench.fewshot_dataset:
+        dataset_params["fewshot_dataset"] = bench.fewshot_dataset
+    if bench.fewshot_prefix:
+        dataset_params["fewshot_prefix"] = bench.fewshot_prefix
     # Multiple-choice loglikelihood metadata (informational; the runner
     # picks up choices/choices_field from the @benchmark registry itself).
     if bench.choices is not None:
```
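A minimal sketch of how the added conditional behaves when the template fragment is rendered on its own (assumes `jinja2` is installed; the surrounding command template is elided):

```python
from jinja2 import Template

# The exact fragment added above, rendered in isolation. Adjacent Python
# string literals concatenate into one Jinja template string.
fragment = Template(
    "{% if config.params.extra.save_predictions is not defined"
    " or config.params.extra.save_predictions %}"
    " --save-predictions"
    "{% endif %}"
)

# Key absent -> flag emitted, i.e. predictions are saved by default.
assert fragment.render(config={"params": {"extra": {}}}) == " --save-predictions"
# Explicit opt-out suppresses the flag.
assert fragment.render(config={"params": {"extra": {"save_predictions": False}}}) == ""
```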

packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/decorators.py

Lines changed: 16 additions & 3 deletions
```diff
@@ -92,6 +92,8 @@ class BenchmarkDefinition:
     choices: Optional[List[str]] = None
     choices_field: Optional[str] = None
     num_fewshot: int = 0
+    fewshot_dataset: Optional[str] = None
+    fewshot_prefix: str = ""
     fewshot_split: Optional[str] = None
     fewshot_template: Optional[str] = None
     fewshot_separator: str = "\n\n"
@@ -170,6 +172,8 @@ def benchmark(
     choices: Optional[List[str]] = None,
     choices_field: Optional[str] = None,
     num_fewshot: int = 0,
+    fewshot_dataset: Optional[str] = None,
+    fewshot_prefix: str = "",
     fewshot_split: Optional[str] = None,
     fewshot_template: Optional[str] = None,
     fewshot_separator: str = "\n\n",
@@ -209,11 +213,18 @@
             when both are set on a per-row basis.
         num_fewshot: Number of few-shot examples to prepend to each
             prompt. Examples are sampled deterministically from
-            ``fewshot_split`` (or the first ``num_fewshot`` rows of the
-            evaluation dataset when ``fewshot_split`` is None).
+            ``fewshot_dataset`` if provided, then ``fewshot_split`` if
+            provided, otherwise from the evaluation dataset fallback pool.
+        fewshot_dataset: Optional explicit dataset path or URI to sample
+            few-shot examples from. Prefer this over ``fewshot_split`` when
+            the few-shot source requires filters, data files, configs, or
+            other URI options that cannot be represented by a split name.
+        fewshot_prefix: Optional static text prepended before rendered
+            few-shot examples. Useful for introducing or delimiting examples.
         fewshot_split: HuggingFace split name to sample few-shot examples
             from (e.g. ``"train"`` or ``"dev"``). Only meaningful when the
-            primary ``dataset`` is an ``hf://`` URI.
+            primary ``dataset`` is an ``hf://`` URI and ``fewshot_dataset``
+            is not set.
         fewshot_template: Optional template string used to render each
             few-shot example. ``None`` reuses the main ``prompt`` template
             and appends the rendered ``target_field`` value.
@@ -308,6 +319,8 @@ def decorator(fn):
         choices=list(choices) if choices is not None else None,
         choices_field=choices_field,
         num_fewshot=num_fewshot,
+        fewshot_dataset=fewshot_dataset,
+        fewshot_prefix=fewshot_prefix,
         fewshot_split=fewshot_split,
         fewshot_template=resolved_fewshot_template,
         fewshot_separator=fewshot_separator,
```

packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/eval_logic.py

Lines changed: 38 additions & 27 deletions
```diff
@@ -589,8 +589,9 @@ def render_fewshot_example(bench: BenchmarkDefinition, row: Dict) -> Optional[st
 def build_fewshot_prefix(bench: BenchmarkDefinition, examples: List[Dict]) -> str:
     """Render *examples* into a prefix string ready to prepend to each prompt.
 
-    Skips examples that fail to render (missing fields). Always appends the
-    benchmark's ``fewshot_separator`` after the last example so the test
+    Skips examples that fail to render (missing fields). If configured,
+    ``bench.fewshot_prefix`` is prepended before the examples. Always appends
+    the benchmark's ``fewshot_separator`` after the last example so the test
     prompt starts on a fresh boundary.
     """
     if not examples:
@@ -602,45 +603,71 @@ def build_fewshot_prefix(bench: BenchmarkDefinition, examples: List[Dict]) -> st
         rendered.append(text)
     if not rendered:
         return ""
-    # Use ``is None`` rather than ``or`` so an explicit empty-string
-    # separator (concat with no delimiter) is honoured.
     sep = bench.fewshot_separator if bench.fewshot_separator is not None else "\n\n"
-    return sep.join(rendered) + sep
+    prefix = getattr(bench, "fewshot_prefix", "") or ""
+    return prefix + sep.join(rendered) + sep
 
 
 def build_fewshot_examples(
     primary_dataset_uri: str,
     primary_dataset: List[Dict],
     num_fewshot: int,
     fewshot_split: Optional[str],
+    fewshot_dataset: Optional[str] = None,
     field_mapping: Optional[Dict[str, str]] = None,
     seed: int = 42,
 ) -> List[Dict]:
     """Sample ``num_fewshot`` examples deterministically (lm-eval style).
 
     Selection rules (in order):
 
-    1. If ``fewshot_split`` is set and the primary dataset URI is an
+    1. If ``fewshot_dataset`` is set, load that exact dataset URI/path.
+       Use this when the safe few-shot source needs filters, data files,
+       configs, or other options that cannot be inferred from a split name.
+    2. If ``fewshot_split`` is set and the primary dataset URI is an
       ``hf://`` URI, load that split via the dataset module and sample
       ``num_fewshot`` rows. This is the **safe** path — examples come
       from a different split than the test set, so there is no
       contamination.
-    2. Otherwise, sample ``num_fewshot`` rows from the **tail** of
+    3. Otherwise, sample ``num_fewshot`` rows from the **tail** of
       ``primary_dataset`` (i.e. the rows least likely to be evaluated
       when ``--limit-samples`` is set). A loud warning is logged
       because the fewshot pool overlaps with the evaluation set when
       running the full dataset, which can leak gold answers into the
       prompt.
 
-    To guarantee no contamination, declare a ``fewshot_split`` on the
-    ``@benchmark`` (e.g. ``"train"`` or ``"dev"``) so this function
-    samples from a disjoint split.
+    To guarantee no contamination, declare a ``fewshot_dataset`` or
+    ``fewshot_split`` on the ``@benchmark`` so this function samples from a
+    disjoint source.
     """
     if num_fewshot <= 0:
         return []
 
     pool: List[Dict] = []
-    if fewshot_split and primary_dataset_uri.startswith("hf://"):
+    if fewshot_dataset:
+        try:
+            from nemo_evaluator.contrib.byob.dataset import load_dataset
+
+            pool = load_dataset(
+                fewshot_dataset,
+                limit=max(num_fewshot * 4, 16),
+                field_mapping=field_mapping,
+            )
+            if not pool:
+                logger.debug(
+                    "fewshot_dataset loaded successfully but returned 0 rows; "
+                    "falling back to fewshot_split or primary dataset",
+                    fewshot_dataset=fewshot_dataset,
+                )
+        except Exception as e:
+            logger.warning(
+                "Failed to load fewshot_dataset, falling back to fewshot_split or primary dataset",
+                fewshot_dataset=fewshot_dataset,
+                error=str(e),
+            )
+            pool = []
+
+    if not pool and fewshot_split and primary_dataset_uri.startswith("hf://"):
         try:
             from nemo_evaluator.contrib.byob.dataset import load_dataset
 
@@ -662,13 +689,6 @@ def build_fewshot_examples(
             pool = []
 
     if not pool:
-        # Fallback: no separate fewshot split is available. Sample from
-        # the tail of the primary dataset to minimise overlap with the
-        # eval set when the user passes --limit-samples (which iterates
-        # from the head). When the full dataset is evaluated, the
-        # fewshot pool is a strict subset of the eval set and gold
-        # answers can leak — warn loudly so the user knows to declare
-        # ``fewshot_split=`` on the @benchmark.
         logger.warning(
             "fewshot_split not available; sampling from primary dataset. "
             "This risks test-set contamination because the fewshot pool "
@@ -679,8 +699,6 @@
             primary_dataset_size=len(primary_dataset),
         )
         pool_size = max(num_fewshot * 4, num_fewshot)
-        # Tail slice — falls back to the head only if the dataset is
-        # smaller than the desired pool.
         if len(primary_dataset) > pool_size:
             pool = primary_dataset[-pool_size:]
         else:
@@ -741,11 +759,6 @@ def run_eval_loop(
         endpoint_type == "completions_logprob"
         or bench.endpoint_type == "completions_logprob"
     ):
-        # Logprob-mode MCQ ranking is the only strategy that requires
-        # ``choices`` / ``choices_field``; the @benchmark decorator
-        # already validates that pairing. Don't auto-pick MCQ just
-        # because choices are declared — a user may declare them as
-        # informational metadata while running the chat endpoint.
         strategy = MultipleChoiceStrategy()
     else:
         strategy = StandardStrategy()
@@ -816,8 +829,6 @@ def _run_eval_loop_sequential(
     progress_interval = max(1, min(10, total // 10)) if total > 0 else 1
 
     for idx, row in enumerate(dataset):
-        # Pass fewshot_prefix only when non-empty so legacy strategy
-        # implementations (without the kwarg) continue to work.
         kwargs = {"fewshot_prefix": fewshot_prefix} if fewshot_prefix else {}
         scores, prediction = strategy.evaluate_sample(
             idx,
```
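To make the new prefix assembly concrete, a self-contained sketch of the string `build_fewshot_prefix` now produces (the rendered examples below are made up; the real function renders them from dataset rows via `render_fewshot_example`):

```python
# Stand-ins for examples already rendered by render_fewshot_example().
rendered = ["Q: Is water wet?\nA: yes", "Q: Is fire cold?\nA: no"]
sep = "\n\n"                              # bench.fewshot_separator
fewshot_prefix = "Answer yes or no.\n\n"  # bench.fewshot_prefix

# Same expression as the new return statement: prefix, then the joined
# examples, then a trailing separator so the test prompt starts cleanly.
prompt_prefix = fewshot_prefix + sep.join(rendered) + sep
assert prompt_prefix == (
    "Answer yes or no.\n\n"
    "Q: Is water wet?\nA: yes\n\n"
    "Q: Is fire cold?\nA: no\n\n"
)
```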

packages/nemo-evaluator/src/nemo_evaluator/contrib/byob/runner.py

Lines changed: 12 additions & 10 deletions
```diff
@@ -747,10 +747,10 @@ def main():
     parser.add_argument(
         "--num-fewshot",
         type=int,
-        default=0,
+        default=None,
         help=(
             "Number of few-shot examples to prepend to each prompt "
-            "(default: 0). Examples are sampled deterministically from the "
+            "(default: benchmark default, usually 0). Examples are sampled deterministically from the "
             "benchmark's fewshot_split (or the first --num-fewshot rows of "
             "the same dataset when fewshot_split is not declared)."
         ),
@@ -781,15 +781,16 @@
         field_mapping=bench.field_mapping,
     )
 
-    # Resolve few-shot examples: precedence is CLI flag > benchmark default.
-    # Robust to mocked benchmark objects (tests use MagicMock) where
-    # ``bench.num_fewshot`` may not be a real int.
+    # Resolve few-shot examples: an explicit CLI value, including 0, must
+    # override the benchmark default. This is required for true 0-shot
+    # validation of benchmarks that declare a non-zero default.
     effective_num_fewshot = 0
-    try:
-        effective_num_fewshot = int(args.num_fewshot or 0)
-    except (TypeError, ValueError):
-        effective_num_fewshot = 0
-    if not effective_num_fewshot:
+    if args.num_fewshot is not None:
+        try:
+            effective_num_fewshot = int(args.num_fewshot)
+        except (TypeError, ValueError):
+            effective_num_fewshot = 0
+    else:
         try:
             effective_num_fewshot = int(getattr(bench, "num_fewshot", 0) or 0)
         except (TypeError, ValueError):
@@ -801,6 +802,7 @@
         primary_dataset=dataset,
         num_fewshot=effective_num_fewshot,
         fewshot_split=bench.fewshot_split,
+        fewshot_dataset=bench.fewshot_dataset,
         field_mapping=bench.field_mapping,
         seed=args.fewshot_seed,
     )
```
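The override semantics reduce to a small decision rule; a hedged stand-alone restatement (the helper below is hypothetical, not part of `runner.py`):

```python
from typing import Optional

def resolve_num_fewshot(cli_value: Optional[int], bench_default: int) -> int:
    # Mirrors the runner: an explicit CLI value, including 0, wins;
    # otherwise fall back to the @benchmark default.
    if cli_value is not None:
        return cli_value
    return bench_default

assert resolve_num_fewshot(None, 5) == 5  # no flag -> benchmark default
assert resolve_num_fewshot(0, 5) == 0     # --num-fewshot 0 -> true 0-shot
assert resolve_num_fewshot(3, 5) == 3     # explicit override
```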
