Update omr recipe (#973)

wedu-nvidia · dgtm777 · commit 3fd84cc0729b · 2025-10-29T12:34:03.000+04:00
Signed-off-by: Wei Du <wedu@nvidia.com>
diff --git a/recipes/openmathreasoning/scripts/simplified_recipe.py b/recipes/openmathreasoning/scripts/simplified_recipe.py
@@ -24,7 +24,7 @@
 )
 
 
-def prepare(workspace, cluster, num_gpus, expname_prefix, wandb_params):
+def prepare(workspace, cluster, expname_prefix):
     # data preparation needs to run locally without container, so not wrapping with run_cmd
     prepare_datasets(["aime24", "aime25"])
 
@@ -90,7 +90,7 @@ def run_sdg(workspace, cluster, num_gpus, expname_prefix, wandb_params):
     )
 
 
-def run_training(workspace, cluster, num_gpus, expname_prefix, wandb_params):
+def run_training(workspace, cluster, num_gpus, expname_prefix, backend, wandb_params):
     # convert the generated solutions to a format that can be used for training
     run_cmd(
         ctx=wrap_arguments(
@@ -110,47 +110,54 @@ def run_training(workspace, cluster, num_gpus, expname_prefix, wandb_params):
     )
 
     # train the model
-
-    sft_nemo_rl(
-        ctx=wrap_arguments(
-            "++policy.max_total_sequence_length=8192 "
-            "++policy.train_global_batch_size=32 "
-            "++policy.tensor_model_parallel_size=4 "
-            "++policy.context_parallel_size=2 "
-            "++policy.lr=1e-5 "
-            "++sft.max_num_epochs=2 "
-        ),
-        cluster=cluster,
-        output_dir=f"{workspace}/training",
-        hf_model="Qwen/Qwen2.5-14B-Instruct",
-        backend="megatron",
-        num_gpus=num_gpus,
-        num_nodes=1,
-        disable_wandb=wandb_params["disable_wandb"],
-        wandb_project=wandb_params["wandb_project"],
-        training_data=f"{workspace}/sft-data.jsonl",
-        expname=f"{expname_prefix}-training",
-        run_after=f"{expname_prefix}-prepare-training-data",
-        final_hf_path=f"{workspace}/training/qwen2.5-14b-improved-hf",
-    )
-
-
-def final_eval(workspace, cluster, num_gpus, expname_prefix, wandb_params):
+    base_args = [
+        "++policy.max_total_sequence_length=8192",
+        "++policy.train_global_batch_size=32",
+        "++policy.tensor_model_parallel_size=4",
+        "++policy.context_parallel_size=2",
+        "++policy.lr=1e-5",
+        "++sft.max_num_epochs=2",
+    ]
+    # For FSDP, sequence_packing cannot be used with context parallel
+    for training_backend in backend:
+        args = list(base_args)
+        if training_backend == "fsdp":
+            args.append("++policy.sequence_packing.enabled=False")
+
+        sft_nemo_rl(
+            ctx=wrap_arguments(" ".join(args)),
+            cluster=cluster,
+            output_dir=f"{workspace}/training-{training_backend}",
+            hf_model="Qwen/Qwen2.5-14B-Instruct",
+            backend=training_backend,
+            num_gpus=num_gpus,
+            num_nodes=1,
+            disable_wandb=wandb_params["disable_wandb"],
+            wandb_project=wandb_params["wandb_project"],
+            training_data=f"{workspace}/sft-data.jsonl",
+            expname=f"{expname_prefix}-training-{training_backend}",
+            run_after=f"{expname_prefix}-prepare-training-data",
+            final_hf_path=f"{workspace}/training-{training_backend}/qwen2.5-14b-improved-hf",
+        )
+
+
+def final_eval(workspace, cluster, num_gpus, expname_prefix, backend, wandb_params):
     # launching evaluation
-    eval(
-        ctx=wrap_arguments("++inference.tokens_to_generate=16384 ++parse_reasoning=True "),
-        cluster=cluster,
-        model=f"{workspace}/training/qwen2.5-14b-improved-hf",
-        server_type="vllm",
-        server_gpus=num_gpus,
-        benchmarks="aime24:8,aime25:8",
-        output_dir=f"{workspace}/evals/after-training",
-        num_jobs=1,
-        expname=f"{expname_prefix}-final-eval",
-        run_after=f"{expname_prefix}-training",
-        wandb_name=f"{expname_prefix}-final-eval" if not wandb_params["disable_wandb"] else None,
-        wandb_project=wandb_params["wandb_project"],
-    )
+    for training_backend in backend:
+        eval(
+            ctx=wrap_arguments("++inference.tokens_to_generate=16384 ++parse_reasoning=True "),
+            cluster=cluster,
+            model=f"{workspace}/training-{training_backend}/qwen2.5-14b-improved-hf",
+            server_type="vllm",
+            server_gpus=num_gpus,
+            benchmarks="aime24:8,aime25:8",
+            output_dir=f"{workspace}/evals/after-training-{training_backend}",
+            num_jobs=1,
+            expname=f"{expname_prefix}-final-eval-{training_backend}",
+            run_after=f"{expname_prefix}-training-{training_backend}",
+            wandb_name=f"{expname_prefix}-final-eval" if not wandb_params["disable_wandb"] else None,
+            wandb_project=wandb_params["wandb_project"],
+        )
 
 
 def initial_eval(workspace, cluster, num_gpus, expname_prefix, wandb_params):
@@ -203,21 +210,42 @@ def initial_eval(workspace, cluster, num_gpus, expname_prefix, wandb_params):
         default="nemo-skills",
         help="WandB project name for tracking experiments.",
     )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        nargs="+",
+        choices=["megatron", "fsdp"],
+        default=["megatron"],
+    )
+
     args = parser.parse_args()
 
     wandb_params = {
         "disable_wandb": args.disable_wandb,
         "wandb_project": args.wandb_project,
     }
-    args = (
+    common_args = (
         args.workspace,
         args.cluster,
         args.num_gpus,
         args.expname_prefix,
+        args.backend,
         wandb_params,
     )
-    prepare(*args)
-    initial_eval(*args)
-    run_sdg(*args)
-    run_training(*args)
-    final_eval(*args)
+    prepare(workspace=args.workspace, cluster=args.cluster, expname_prefix=args.expname_prefix)
+    initial_eval(
+        workspace=args.workspace,
+        cluster=args.cluster,
+        num_gpus=args.num_gpus,
+        expname_prefix=args.expname_prefix,
+        wandb_params=wandb_params,
+    )
+    run_sdg(
+        workspace=args.workspace,
+        cluster=args.cluster,
+        num_gpus=args.num_gpus,
+        expname_prefix=args.expname_prefix,
+        wandb_params=wandb_params,
+    )
+    run_training(*common_args)
+    final_eval(*common_args)
diff --git a/tests/slurm-tests/omr_simple_recipe/check_results.py b/tests/slurm-tests/omr_simple_recipe/check_results.py
@@ -32,7 +32,7 @@
 }
 
 
-def check_results(benchmark: str, baseline_results: dict, after_training_results: dict):
+def check_results(benchmark: str, baseline_results: dict, after_training_results: dict, backend: str):
     for metric in ["pass@1[avg-of-8]", "majority@8"]:
         baseline_acc = baseline_results[benchmark][metric]["symbolic_correct"]
         after_acc = after_training_results[benchmark][metric]["symbolic_correct"]
@@ -46,22 +46,33 @@ def check_results(benchmark: str, baseline_results: dict, after_training_results
         )
         soft_assert(
             lo_a <= after_acc <= hi_a,
-            f"{benchmark}: after_training {after_acc}% out of range [{lo_a}%, {hi_a}%] for metric {metric}",
+            f"{benchmark} for {backend}: after_training {after_acc}% out of range [{lo_a}%, {hi_a}%] for metric {metric}",
         )
 
 
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--workspace", required=True, help="Workspace directory containing eval results.")
+    ap.add_argument(
+        "--backend",
+        type=str,
+        nargs="+",
+        choices=["megatron", "fsdp"],
+        default=["megatron"],
+    )
     args = ap.parse_args()
-
-    for benchmark in ("aime24", "aime25"):
-        common_path = Path(args.workspace) / "evals"
-        baseline_results = load_json(common_path / "baseline" / "eval-results" / benchmark / "metrics.json")
-        after_training_results = load_json(
-            common_path / "after-training" / "eval-results" / benchmark / "metrics.json"
-        )
-        check_results(benchmark, baseline_results, after_training_results)
+    for training_backend in args.backend:
+        for benchmark in ("aime24", "aime25"):
+            common_path = Path(args.workspace) / "evals"
+            baseline_results = load_json(common_path / "baseline" / "eval-results" / benchmark / "metrics.json")
+            after_training_results = load_json(
+                common_path
+                / "after-training-{}".format(training_backend)
+                / "eval-results"
+                / benchmark
+                / "metrics.json"
+            )
+            check_results(benchmark, baseline_results, after_training_results, training_backend)
 
     assert_all()
 
diff --git a/tests/slurm-tests/omr_simple_recipe/run_test.py b/tests/slurm-tests/omr_simple_recipe/run_test.py
@@ -24,13 +24,21 @@ def main():
     ap.add_argument("--wandb_project", default="nemo-skills-slurm-ci", help="W&B project name")
     ap.add_argument("--expname_prefix", required=True, help="Experiment name prefix used inside the recipe")
     ap.add_argument("--disable_wandb", action="store_true", help="Disable W&B logging in the recipe")
+    ap.add_argument(
+        "--backend",
+        type=str,
+        nargs="+",
+        choices=["megatron", "fsdp"],
+        default=["megatron"],
+    )
     args = ap.parse_args()
 
     cmd = (
         f"python -m recipes.openmathreasoning.scripts.simplified_recipe "
-        f"    --cluster {args.cluster} "
-        f"    --workspace {args.workspace} "
-        f"    --expname_prefix {args.expname_prefix} "
+        f" --cluster {args.cluster} "
+        f" --workspace {args.workspace} "
+        f" --expname_prefix {args.expname_prefix} "
+        f" --backend {' '.join(args.backend)} "
     )
 
     if args.disable_wandb:
@@ -40,17 +48,18 @@ def main():
 
     subprocess.run(cmd, shell=True, check=True)
 
-    checker_cmd = f"python tests/slurm-tests/omr_simple_recipe/check_results.py --workspace {args.workspace}"
+    checker_cmd = f"python tests/slurm-tests/omr_simple_recipe/check_results.py --workspace {args.workspace} --backend {' '.join(args.backend)}"
+
+    final_eval_name = [f"{args.expname_prefix}-final-eval-{training_backend}" for training_backend in args.backend]
 
     run_cmd(
         ctx=wrap_arguments(checker_cmd),
         cluster=args.cluster,
         expname=args.expname_prefix + "-check-results",
         log_dir=f"{args.workspace}/check-results-logs",
-        run_after=[  # these are launched in simplified recipe
-            f"{args.expname_prefix}-final-eval",
-            f"{args.expname_prefix}-baseline-eval",
-        ],
+        # these are launched in simplified recipe
+        run_after=final_eval_name + [f"{args.expname_prefix}-baseline-eval"],
+        reuse_code=True,
     )