NVIDIA-NeMo
diff --git a/‎.github/workflows/gpu_tests.yml‎
Lines changed: 12 additions & 9 deletions b/‎.github/workflows/gpu_tests.yml‎
Lines changed: 12 additions & 9 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 1 deletion b/‎.gitignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 3 additions & 2 deletions b/‎README.md‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎cluster_configs/example-local.yaml‎
Lines changed: 2 additions & 2 deletions b/‎cluster_configs/example-local.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cluster_configs/example-slurm.yaml‎
Lines changed: 2 additions & 2 deletions b/‎cluster_configs/example-slurm.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎dockerfiles/Dockerfile.nemo-skills‎
Lines changed: 1 addition & 1 deletion b/‎dockerfiles/Dockerfile.nemo-skills‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/index.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/index.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/pipelines/evaluation.md‎
Lines changed: 67 additions & 0 deletions b/‎docs/pipelines/evaluation.md‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎nemo_skills/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎nemo_skills/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎nemo_skills/dataset/prepare.py‎
Lines changed: 6 additions & 5 deletions b/‎nemo_skills/dataset/prepare.py‎
Lines changed: 6 additions & 5 deletions
@@ -41,18 +41,19 @@ jobs:
         pip install -r requirements/common-tests.txt
         ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval
     - name: Run GPU tests
-      timeout-minutes: 120
+      timeout-minutes: 180
       env:
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
         cd ${{ github.run_id }}
         nvidia-smi
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         ./tests/gpu-tests/run_llama.sh
-    - name: Cleanup test directory
+    - name: Cleanup
       if: always()
       run: |
-        docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
+        docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
+        docker ps -a -q | xargs -r docker stop
 
   gpu-tests-qwen:
     runs-on: self-hosted-nemo-gpus-1
@@ -79,18 +80,19 @@ jobs:
         pip install -r requirements/common-tests.txt
         ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval
     - name: Run GPU tests
-      timeout-minutes: 120
+      timeout-minutes: 180
       env:
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
         cd ${{ github.run_id }}
         nvidia-smi
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         ./tests/gpu-tests/run_qwen.sh
-    - name: Cleanup test directory
+    - name: Cleanup
       if: always()
       run: |
-        docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
+        docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
+        docker ps -a -q | xargs -r docker stop
 
   gpu-tests-rm:
     runs-on: self-hosted-nemo-gpus-1
@@ -114,15 +116,16 @@ jobs:
         pip install -e .
         pip install -r requirements/common-tests.txt
     - name: Run GPU tests
-      timeout-minutes: 120
+      timeout-minutes: 180
       env:
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
         cd ${{ github.run_id }}
         nvidia-smi
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         ./tests/gpu-tests/run_rm.sh
-    - name: Cleanup test directory
+    - name: Cleanup
       if: always()
       run: |
-        docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
+        docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
+        docker ps -a -q | xargs -r docker stop
@@ -33,4 +33,6 @@ __pycache__
 .ipynb_checkpoints
 
 cluster_configs/*
-!cluster_configs/example-*.yaml
+!cluster_configs/example-*.yaml
+
+nemo_skills/dataset/ruler/*/
@@ -12,6 +12,7 @@ Here are some of the things we support.
     - Coding skills: human-eval, mbpp
     - Chat/instruction following: ifeval, arena-hard, mt-bench
     - General knowledge: mmlu, mmlu-pro, gpqa
+    - Long context: RULER
 - [Model training](https://nvidia.github.io/NeMo-Skills/pipelines/training): Train models at speed-of-light using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/).
 
 You can find the full documentation [here](https://nvidia.github.io/NeMo-Skills/).
@@ -24,12 +25,12 @@ commands and their options.
 Using our pipelines we created [OpenMathReasoning dataset](https://huggingface.co/datasets/nvidia/OpenMathReasoning).
 This dataset contains
 
-* 306K unique mathematical problems sourced from [AoPS forums](https://artofproblemsolving.com/community) with: 
+* 306K unique mathematical problems sourced from [AoPS forums](https://artofproblemsolving.com/community) with:
     * 3.2M long chain-of-thought (CoT) solutions
     * 1.7M long tool-integrated reasoning (TIR) solutions
     * 566K samples that select the most promising solution out of many candidates (GenSelect)
 * Additional 193K problems sourced from AoPS forums (problems only, no solutions)
-  
+
 We used [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) to preprocess problems, and
 [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) and [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) to generate solutions.
 
 
@@ -21,9 +21,9 @@ containers:
   nemo: igitman/nemo-skills-nemo:0.6.0
   megatron: igitman/nemo-skills-megatron:0.6.0
   sandbox: igitman/nemo-skills-sandbox:0.6.1
-  nemo-skills: igitman/nemo-skills:0.6.0
+  nemo-skills: igitman/nemo-skills:0.6.1
   verl: igitman/nemo-skills-verl:0.6.1
-  nemo-rl: igitman/nemo-skills-nemo-rl:0.6.0
+  nemo-rl: igitman/nemo-skills-nemo-rl:0.6.1
 
 # add required mounts for models/data here
 # the code is mounted automatically inside /nemo_run/code
 
@@ -21,9 +21,9 @@ containers:
   nemo: igitman/nemo-skills-nemo:0.6.0
   megatron: igitman/nemo-skills-megatron:0.6.0
   sandbox: igitman/nemo-skills-sandbox:0.6.1
-  nemo-skills: igitman/nemo-skills:0.6.0
+  nemo-skills: igitman/nemo-skills:0.6.1
   verl: igitman/nemo-skills-verl:0.6.1
-  nemo-rl: igitman/nemo-skills-nemo-rl:0.6.0
+  nemo-rl: igitman/nemo-skills-nemo-rl:0.6.1
 
 job_name_prefix: "nemo_skills:"
 
 
@@ -1,6 +1,6 @@
 FROM python:3.10
 
-RUN apt-get update && apt-get -y install curl git
+RUN apt-get update && apt-get -y install curl git git-lfs
 
 # for ifeval benchmark
 # TODO: can we get just a single dir?
 
@@ -16,6 +16,7 @@ Here are some of the things we support.
     - Coding skills: human-eval, mbpp
     - Chat/instruction following: ifeval, arena-hard, mt-bench
     - General knowledge: mmlu, mmlu-pro, gpqa
+    - Long context: RULER
 - [Model training](pipelines/training.md): Train models at speed-of-light using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/).
 
 To get started, follow this [tutorial](basics/index.md), browse available [pipelines](./pipelines/index.md) or run `ns --help` to see all available
 
@@ -14,6 +14,7 @@ We support many popular benchmarks and it's easy to add new in the future. E.g.
 - Coding skills: human-eval, mbpp
 - Chat/instruction following: ifeval, arena-hard, mt-bench
 - General knowledge: mmlu, mmlu-pro, gpqa
+- Long context: RULER
 
 See [nemo_skills/dataset](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset) where each folder is a benchmark we support.
 
@@ -44,6 +45,13 @@ and if you installed from pip, they will be downloaded to wherever the repo is i
 python -c "import nemo_skills; print(nemo_skills.__path__)"
 ```
 
+Some benchmarks (e.g. ruler) require extra parameters to be passed to the prepare_data script, so you need to explicitly
+call `ns prepare_data` for each of them. For example, for ruler you can use
+
+```bash
+ns prepare_data ruler --setup=llama_128k --tokenizer_path=meta-llama/Llama-3.1-8B-Instruct --max_seq_length=131072
+```
+
 ## Greedy decoding
 
 ```bash
@@ -121,6 +129,65 @@ pass@4          | 164         | 78.66              | 72.56
 
 If you want to get both multiple samples and greedy results, use `--add_greedy` parameter.
 
+
+## Using data on cluster
+
+Some benchmarks (e.g. ruler) have very large input datasets, and it's inefficient to prepare them on a local machine and
+re-upload them to the cluster with every evaluation job. Instead, you can prepare them directly on the cluster. To do that,
+run the `prepare_data` command with the `--data_dir` and `--cluster` options, e.g.
+
+```bash
+ns prepare_data \
+    --data_dir=/workspace/ns-data \
+    --cluster=slurm \
+    ruler --setup llama_128k --tokenizer_path meta-llama/Llama-3.1-8B-Instruct --max_seq_length 130900
+```
+
+Then, during evaluation, provide the same `data_dir` argument and the data will be read directly from the cluster.
+You can also set the `NEMO_SKILLS_DATA_DIR` environment variable instead of passing an explicit argument.
+
+Here is an example evaluation command for ruler that uses the `data_dir` parameter
+
+```python
+from nemo_skills.pipeline.cli import eval, run_cmd, wrap_arguments
+
+tasks = [
+    "niah_single_1", "niah_single_2","niah_single_3",
+    "niah_multikey_1", "niah_multikey_2", "niah_multikey_3",
+    "niah_multivalue", "niah_multiquery",
+    "vt", "cwe", "fwe", "qa_1", "qa_2",
+]
+benchmarks = ",".join([f"ruler.llama_128k.{task}:0" for task in tasks])
+
+eval(
+    # using a low number of concurrent requests since it's almost entirely prefill stage
+    ctx=wrap_arguments("++max_concurrent_requests=32"),
+    cluster="slurm",
+    model="/hf_models/Meta-Llama-3.1-8B-Instruct",
+    server_type="sglang",
+    output_dir="/workspace/eval-ruler",
+    data_dir="/workspace/ns-data",
+    benchmarks=benchmarks,
+    server_gpus=8,
+    expname="eval-ruler",
+)
+
+# running summarize results on the cluster as well to avoid downloading the data
+# you can find results in /workspace/eval-ruler/eval-results/metrics.json
+# or add --wandb_name parameter to log to W&B
+cmd = (
+    "python -m nemo_skills.pipeline.summarize_results "
+    "    --data_dir /workspace/ns-data /workspace/eval-ruler/eval-results "
+)
+run_cmd(
+    ctx=wrap_arguments(cmd),
+    cluster="slurm",
+    log_dir="/workspace/eval-ruler/eval-results/summarize_results",
+    expname="summarize-results",
+    run_after="eval-ruler",
+)
+```
+
 ## How the benchmarks are defined
 
 Each benchmark exists as a separate folder inside
 
@@ -22,7 +22,7 @@
     'nemo': 'igitman/nemo-skills-nemo:0.6.0',
     'megatron': 'igitman/nemo-skills-megatron:0.6.0',
     'sandbox': 'igitman/nemo-skills-sandbox:0.6.1',
-    'nemo-skills': 'igitman/nemo-skills:0.6.0',
+    'nemo-skills': 'igitman/nemo-skills:0.6.1',
     'verl': 'igitman/nemo-skills-verl:0.6.1',
-    'nemo-rl': 'igitman/nemo-skills-nemo-rl:0.6.0',
+    'nemo-rl': 'igitman/nemo-skills-nemo-rl:0.6.1',
 }
@@ -20,7 +20,7 @@
 from nemo_skills.dataset.utils import add_header_to_jsonl_inplace, get_lean4_header
 
 
-def prepare_datasets(datasets=None, dataset_groups=None, add_lean4_header=False):
+def prepare_datasets(datasets=None, dataset_groups=None, add_lean4_header=False, extra_args=""):
     if datasets and dataset_groups:
         raise ValueError("Cannot specify both datasets and dataset_groups")
 
@@ -41,7 +41,7 @@ def prepare_datasets(datasets=None, dataset_groups=None, add_lean4_header=False)
     for dataset in datasets:
         print(f"Preparing {dataset}")
         dataset_path = datasets_dir / dataset
-        subprocess.run(f"{sys.executable} {dataset_path / 'prepare.py'}", shell=True, check=True)
+        subprocess.run(f"{sys.executable} {dataset_path / 'prepare.py'} {extra_args}", shell=True, check=True)
         dataset_module = importlib.import_module(f"nemo_skills.dataset.{dataset}")
 
         if dataset_module.DATASET_GROUP == "math":
@@ -62,12 +62,13 @@ def prepare_datasets(datasets=None, dataset_groups=None, add_lean4_header=False)
         '--dataset_groups',
         default=[],
         nargs="*",
-        choices=["math", "code", "chat", "multichoice"],
+        choices=["math", "code", "chat", "multichoice", "long-context"],
         help='Can specify a dataset groups here',
     )
     parser.add_argument(
         '--add_lean4_header', action='store_true', help='Add Lean4 header to JSONL files during preparation'
     )
-    args = parser.parse_args()
+    args, unknown = parser.parse_known_args()
+    extra_args = " ".join(unknown)
 
-    prepare_datasets(args.datasets, args.dataset_groups, args.add_lean4_header)
+    prepare_datasets(args.datasets, args.dataset_groups, args.add_lean4_header, extra_args=extra_args)