NVIDIA-NeMo
diff --git a/‎.github/workflows/gpu_tests.yml‎
Lines changed: 2 additions & 5 deletions b/‎.github/workflows/gpu_tests.yml‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎cluster_configs/example-local.yaml‎
Lines changed: 2 additions & 2 deletions b/‎cluster_configs/example-local.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cluster_configs/example-slurm.yaml‎
Lines changed: 2 additions & 2 deletions b/‎cluster_configs/example-slurm.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎dockerfiles/README.md‎
Lines changed: 1 addition & 1 deletion b/‎dockerfiles/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/basics/index.md‎
Lines changed: 5 additions & 5 deletions b/‎docs/basics/index.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎docs/openmathinstruct2/dataset.md‎
Lines changed: 10 additions & 9 deletions b/‎docs/openmathinstruct2/dataset.md‎
Lines changed: 10 additions & 9 deletions
diff --git a/‎docs/openmathinstruct2/evaluation.md‎
Lines changed: 3 additions & 4 deletions b/‎docs/openmathinstruct2/evaluation.md‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎docs/openmathreasoning1/evaluation.md‎
Lines changed: 4 additions & 5 deletions b/‎docs/openmathreasoning1/evaluation.md‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎docs/pipelines/decontamination.md‎
Lines changed: 14 additions & 11 deletions b/‎docs/pipelines/decontamination.md‎
Lines changed: 14 additions & 11 deletions
diff --git a/‎docs/pipelines/evaluation.md‎
Lines changed: 15 additions & 15 deletions b/‎docs/pipelines/evaluation.md‎
Lines changed: 15 additions & 15 deletions
@@ -39,15 +39,14 @@ jobs:
         pip uninstall -y nemo-skills nemo_run
         pip install -e .
         pip install -r requirements/common-tests.txt
-        ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval
+        ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24
     - name: Run GPU tests
       timeout-minutes: 180
       env:
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
         cd ${{ github.run_id }}
         nvidia-smi
-        export DOCKER_CLIENT_TIMEOUT=120
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         ./tests/gpu-tests/run_llama.sh
     - name: Cleanup
@@ -79,15 +78,14 @@ jobs:
         pip uninstall -y nemo-skills nemo_run
         pip install -e .
         pip install -r requirements/common-tests.txt
-        ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval
+        ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24
     - name: Run GPU tests
       timeout-minutes: 180
       env:
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
         cd ${{ github.run_id }}
         nvidia-smi
-        export DOCKER_CLIENT_TIMEOUT=120
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         ./tests/gpu-tests/run_qwen.sh
     - name: Cleanup
@@ -124,7 +122,6 @@ jobs:
       run: |
         cd ${{ github.run_id }}
         nvidia-smi
-        export DOCKER_CLIENT_TIMEOUT=120
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         ./tests/gpu-tests/run_rm.sh
     - name: Cleanup
 
@@ -18,8 +18,8 @@ containers:
   trtllm: igitman/nemo-skills-trtllm:0.6.1
   vllm: vllm/vllm-openai:v0.9.0
   sglang: igitman/nemo-skills-sglang:0.6.1
-  nemo: igitman/nemo-skills-nemo:0.6.0
-  megatron: igitman/nemo-skills-megatron:0.6.0
+  nemo: igitman/nemo-skills-nemo:0.6.1
+  megatron: igitman/nemo-skills-megatron:0.6.1
   sandbox: igitman/nemo-skills-sandbox:0.6.1
   nemo-skills: igitman/nemo-skills:0.6.1
   verl: igitman/nemo-skills-verl:0.6.1
 
@@ -18,8 +18,8 @@ containers:
   trtllm: igitman/nemo-skills-trtllm:0.6.1
   vllm: vllm/vllm-openai:v0.9.0
   sglang: igitman/nemo-skills-sglang:0.6.1
-  nemo: igitman/nemo-skills-nemo:0.6.0
-  megatron: igitman/nemo-skills-megatron:0.6.0
+  nemo: igitman/nemo-skills-nemo:0.6.1
+  megatron: igitman/nemo-skills-megatron:0.6.1
   sandbox: igitman/nemo-skills-sandbox:0.6.1
   nemo-skills: igitman/nemo-skills:0.6.1
   verl: igitman/nemo-skills-verl:0.6.1
 
@@ -4,7 +4,7 @@ Some dockerfiles are directly included in this folder and for some others the in
 To build one of the existing dockerfiles use a command like this
 
 ```
-docker build -t igitman/nemo-skills-nemo:0.6.0 -f dockerfiles/Dockerfile.nemo .
+docker build -t igitman/nemo-skills-nemo:0.6.1 -f dockerfiles/Dockerfile.nemo .
 ```
 It might take a long time for some of the images.
 
 
@@ -54,7 +54,7 @@ You can either use [OpenAI models](https://platform.openai.com/docs/overview) or
         --model=meta/llama-3.1-8b-instruct \
         --server_address=https://integrate.api.nvidia.com/v1 \
         --output_dir=./generation \
-        ++input_file=./input.jsonl \
+        --input_file=./input.jsonl \
         ++prompt_config=./prompt.yaml
     ```
 
@@ -67,7 +67,7 @@ You can either use [OpenAI models](https://platform.openai.com/docs/overview) or
         --model=gpt-4o-mini \
         --server_address=https://api.openai.com/v1 \
         --output_dir=./generation \
-        ++input_file=./input.jsonl \
+        --input_file=./input.jsonl \
         ++prompt_config=./prompt.yaml
     ```
 
@@ -144,7 +144,7 @@ ns generate \
     --model=Qwen/Qwen2.5-1.5B-Instruct \
     --server_gpus=1 \
     --output_dir=/workspace/generation-local \
-    ++input_file=/workspace/input.jsonl \
+    --input_file=/workspace/input.jsonl \
     ++prompt_config=/workspace/prompt.yaml
 ```
 
@@ -176,7 +176,7 @@ ns generate \
     --model=/workspace/qwen2.5-1.5b-instruct-trtllm \
     --server_gpus=1 \
     --output_dir=/workspace/generation-local-trtllm \
-    ++input_file=/workspace/input.jsonl \
+    --input_file=/workspace/input.jsonl \
     ++prompt_config=/workspace/prompt.yaml \
     ++prompt_template=qwen-instruct # (3)!
 ```
@@ -215,7 +215,7 @@ ns generate \
     --server_type=vllm \
     --model=Qwen/Qwen2.5-1.5B-Instruct \
     --server_gpus=1 \
-    ++input_file=/nemo_run/code/input.jsonl \
+    --input_file=/nemo_run/code/input.jsonl \
     ++prompt_config=/nemo_run/code/prompt.yaml \
     --output_dir=/workspace/generation # (2)!
 ```
 
@@ -31,7 +31,7 @@ ns generate \
     --num_random_seeds=512 \
     --output_dir=/workspace/solution-augmentation/math \
     --eval_args="++eval_type=math" \
-    ++input_file=/nemo_run/code/nemo_skills/dataset/math/train.jsonl \
+    --input_file=/nemo_run/code/nemo_skills/dataset/math/train.jsonl \
     ++prompt_config=generic/math-base \
     ++examples_type=math_text_detailed \
     ++prompt_template=llama3-base
@@ -49,7 +49,7 @@ ns generate \
     --num_random_seeds=64 \
     --output_dir=/workspace/solution-augmentation/gsm8k \
     --eval_args="++eval_type=math" \
-    ++input_file=/nemo_run/code/nemo_skills/dataset/gsm8k/train.jsonl \
+    --input_file=/nemo_run/code/nemo_skills/dataset/gsm8k/train.jsonl \
     ++prompt_config=generic/math-base \
     ++examples_type=gsm8k_text_detailed \
     ++prompt_template=llama3-base
@@ -69,7 +69,7 @@ ns generate \
     --server_nodes=2 \
     --num_random_seeds=80 \
     --output_dir=/workspace/problem-augmentation/math \
-    ++input_file=/nemo_run/code/nemo_skills/dataset/math/train.jsonl \
+    --input_file=/nemo_run/code/nemo_skills/dataset/math/train.jsonl \
     ++prompt_config=generic/problem-augmentation \
     ++examples_type=math_problem_augmentation \
     ++prompt_template=llama3-instruct \
@@ -87,7 +87,7 @@ ns generate \
     --server_nodes=2 \
     --num_random_seeds=10 \
     --output_dir=/workspace/problem-augmentation/gsm8k \
-    ++input_file=/nemo_run/code/nemo_skills/dataset/gsm8k/train.jsonl \
+    --input_file=/nemo_run/code/nemo_skills/dataset/gsm8k/train.jsonl \
     ++prompt_config=generic/problem-augmentation-similar \
     ++examples_type=gsm8k_problem_augmentation \
     ++prompt_template=llama3-instruct \
@@ -117,8 +117,8 @@ for i in range(80):
         server_nodes=2,
         num_random_seeds=32,
         output_dir=f"/workspace/new-problems-solution-augmentation/math/problem-set{i}",
+        input_file=f"/workspace/solution-augmentation/math/generation/output-rs{i}",
         ctx=wrap_arguments(
-            f"++input_file=/workspace/solution-augmentation/math/generation/output-rs{i} "
             f"++prompt_config=generic/math-base "
             f"++examples_type=math_text_detailed "
             f"++prompt_template=llama3-base "
@@ -142,8 +142,8 @@ for i in range(10):
         server_nodes=2,
         num_random_seeds=32,
         output_dir=f"/workspace/new-problems-solution-augmentation/gsm8k/problem-set{i}",
+        input_file=f"/workspace/solution-augmentation/gsm8k/generation/output-rs{i}",
         ctx=wrap_arguments(
-            f"++input_file=/workspace/solution-augmentation/gsm8k/generation/output-rs{i} "
             f"++prompt_config=generic/math-base "
             f"++examples_type=gsm8k_text_detailed "
             f"++prompt_template=llama3-base "
@@ -231,10 +231,11 @@ Next, you need to run LLM inference to check those closest found problems from t
 We use the Llama3.1-405B-Instruct model for this; here is one way of doing it via the NVIDIA API catalog.
 
 ```bash
-ns check_contamination \
+ns generate \
     --cluster=slurm \
+    --generation_type=check_contamination \
     --input_file=/workspace/new-problems-solution-augmentation/contamination-retrieved.jsonl \
-    --output_file=/workspace/new-problems-solution-augmentation/contamination-llm.jsonl \
+    --output_dir=/workspace/new-problems-solution-augmentation/contamination-llm \
     --server_type=openai \
     --model=meta/llama-3.1-405b-instruct \
     --server_address=https://integrate.api.nvidia.com/v1 \
@@ -267,7 +268,7 @@ python -m nemo_skills.training.prepare_data \
     ++hf_model_name="meta-llama/Meta-Llama-3.1-8B" \
     ++max_solution_length=1024 \
     ++filters.remove_contaminated=true \
-    ++contamination_file=/workspace/new-problems-solution-augmentation/contamination-llm.jsonl
+    ++contamination_file=/workspace/new-problems-solution-augmentation/contamination-llm/output.jsonl
 ```
 
 ## Dataset contamination explorer
 
@@ -73,7 +73,7 @@ for dataset in aime24 amc23 math gsm8k omni-math; do
         --server_type=openai \
         --server_address=https://api.openai.com/v1 \
         --output_dir=/workspace/openmath2-llama3.1-8b-eval-judged/eval-results/${dataset} \
-        ++input_dir=/workspace/openmath2-llama3.1-8b-eval/eval-results/${dataset}
+        --input_dir=/workspace/openmath2-llama3.1-8b-eval/eval-results/${dataset}
 done
 ```
 
@@ -155,14 +155,13 @@ for dataset in aime24 amc23 math gsm8k omni-math; do
         --server_type=openai \
         --server_address=https://api.openai.com/v1 \
         --output_dir=/workspace/openmath2-llama3.1-8b-eval-judged/eval-results-majority/${dataset} \
-        ++input_file=/workspace/openmath2-llama3.1-8b-eval/eval-results-majority/${dataset}/output-agg.jsonl \
-        ++output_file=/workspace/openmath2-llama3.1-8b-eval/eval-results-majority/${dataset}/output-rs0.jsonl
+        --input_file=/workspace/openmath2-llama3.1-8b-eval/eval-results-majority/${dataset}/output-agg.jsonl
 done
 ```
 
 ```bash
 ns summarize_results /workspace/openmath2-llama3.1-8b-eval-judged/eval-results-majority --cluster local
 ```
 
-This will print majority results (they will be labeled as `majority@1` since we fused them into a single file).
+This will print majority results (they will be labeled as `greedy` since we fused them into a single file).
 You can also ignore the symbolic score, as it is no longer accurate once the majority answers have been filled in.
@@ -141,7 +141,7 @@ ns generate \
     --server_type=trtllm \
     --server_gpus=4 \
     --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle \
-    ++input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results/hle
+    --input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results/hle
 ```
 
 Alternatively, you can use an API model like gpt-4o, but the results might be different.
@@ -155,7 +155,7 @@ ns generate \
     --server_type=openai \
     --server_address=https://api.openai.com/v1 \
     --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle \
-    ++input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results/hle
+    --input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results/hle
 ```
 
 To print the metrics run
@@ -227,9 +227,8 @@ All other commands are the same as in the [CoT part](#run-cot-evaluations).
 Here is a sample command to run GenSelect evaluation:
 
 ```bash
-ns generate \
-    --generation_type=genselect \
-    --genselect_args="++input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle" \
+ns genselect \
+    --preprocess_args="++input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle" \
     --model=/trt_models/openmath-nemotron-1.5b \
     ++prompt_template=qwen-instruct \
     --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/self_genselect_hle \
 
@@ -2,7 +2,7 @@
 
 !!! info
 
-    This pipeline starting script is [nemo_skills/pipeline/check_contamination.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/check_contamination.py)
+    This pipeline starting script is [nemo_skills/pipeline/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/generate.py)
 
     All extra parameters are passed to [nemo_skills/inference/check_contamination.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/check_contamination.py)
 
@@ -27,7 +27,7 @@ you have `/workspace` defined in your [cluster config](../basics/cluster-configs
 you can do it in the following way
 
 ```python
-from nemo_skills.pipeline.cli import wrap_arguments, run_cmd
+from nemo_skills.pipeline.cli import wrap_arguments, run_cmd, generate
 
 
 test_sets = ['math', 'amc23', 'aime24']
@@ -52,14 +52,16 @@ run_cmd(
 Next, you need to run LLM inference to check the closest retrieved questions from the output file. Here is an example
 using Llama-405B from the NVIDIA API catalog, but you can replace it with OpenAI models or self-hosted models.
 
-```
-ns check_contamination \
-    --cluster=local \
-    --input_file=/workspace/math-contamination-retrieved.jsonl \
-    --output_file=/workspace/math-contamination-results.jsonl \
-    --server_type=openai \
-    --model=meta/llama-3.1-405b-instruct \
-    --server_address=https://integrate.api.nvidia.com/v1
+```python
+generate(
+    cluster="local",
+    generation_type="check_contamination",
+    input_file="/workspace/math-contamination-retrieved.jsonl",
+    output_dir="/workspace/math-contamination-results",
+    model="meta/llama-3.1-405b-instruct",
+    server_type="openai",
+    server_address="https://integrate.api.nvidia.com/v1",
+)
 ```
 
 This script will print an output that looks like this
@@ -74,7 +76,8 @@ If you want instead to clean your training data from contaminated examples all t
 you need to swap the values of the `retrieve_from` and `compare_to` arguments in the `retrieve_similar` step,
 since we now want to check each training set example and find the closest test set problems.
 
-After you get `/workspace/math-contamination-results.jsonl`, you can pass it into [prepare_data command](training.md#preparing-the-data)
+After you get `/workspace/math-contamination-results/output.jsonl`,
+you can pass it to the [prepare_data command](training.md#preparing-the-data)
 with the `++contamination_file=...` option.
 
 See a more detailed example in [OpenMathInstruct-2 dataset construction pipeline](../openmathinstruct2/dataset.md#decontamination).
@@ -77,14 +77,14 @@ ns summarize_results --cluster local /workspace/test-eval
 Which should print the following
 
 ```
-------------------------- gsm8k -------------------------
-evaluation_mode | num_entries | symbolic_correct | no_answer
-greedy          | 1319        | 82.34            | 0.91
+--------------------------------- gsm8k ---------------------------------
+evaluation_mode | num_entries | avg_tokens | symbolic_correct | no_answer
+greedy          | 1319        | 169        | 83.40%           | 1.97%
 
 
------------------------------- human-eval -----------------------------
-evaluation_mode | num_entries | passing_base_tests | passing_plus_tests
-greedy          | 164         | 67.68              | 62.20
+------------------------------------ human-eval ------------------------------------
+evaluation_mode | num_entries | avg_tokens | passing_base_tests | passing_plus_tests
+greedy          | 164         | 228        | 70.12%             | 62.80%
 ```
 
 The [summarize_results](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/summarize_results.py) script
@@ -116,19 +116,19 @@ ns eval \
 you will see the following output after summarizing results
 
 ```
--------------------------- gsm8k ---------------------------
-evaluation_mode | num_entries | symbolic_correct | no_answer
-majority@4      | 1319        | 87.95            | 0.00
-pass@4          | 1319        | 93.78            | 0.00
+--------------------------------- gsm8k ---------------------------------
+evaluation_mode | num_entries | avg_tokens | symbolic_correct | no_answer
+pass@1[4]       | 1319        | 161        | 78.96%           | 6.01%
+majority@4      | 1319        | 161        | 88.10%           | 0.08%
+pass@4          | 1319        | 161        | 93.25%           | 0.08%
 
 
------------------------------- human-eval -----------------------------
-evaluation_mode | num_entries | passing_base_tests | passing_plus_tests
-pass@4          | 164         | 78.66              | 72.56
+------------------------------------ human-eval ------------------------------------
+evaluation_mode | num_entries | avg_tokens | passing_base_tests | passing_plus_tests
+pass@1[4]       | 164         | 251        | 64.18%             | 59.30%
+pass@4          | 164         | 251        | 82.32%             | 78.05%
 ```
 
-If you want to get both multiple samples and greedy results, use `--add_greedy` parameter.
-
 
 ## Using data on cluster