26 changes: 8 additions & 18 deletions tests/integration_tests/Launch_Integration_Tests.sh
@@ -19,32 +19,22 @@ export HF_DATASETS_OFFLINE="1"
export TRANSFORMERS_OFFLINE="1"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"

mkdir -p /checkpoints && \
ln -s /home/TestData/nemo2_ckpt/llama-3_2-1b-instruct_v2.0 /checkpoints/llama-3_2-1b-instruct_v2.0

# FIXME(martas): temporary WAR for broken deps in NeMo FW nightly image
pip install nvidia-lm-eval[math]

SCRIPT_DIR=$(dirname "$0")
PROJECT_DIR=$SCRIPT_DIR/../../
cd $PROJECT_DIR

nemo2_ckpt_path="/home/TestData/nemo2_ckpt/llama-3_2-1b-instruct_v2.0"
model_name="megatron_model"
port=8886

python /opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_inframework.py \
--nemo_checkpoint $nemo2_ckpt_path \
--num_gpus 1 \
--tensor_model_parallel_size 1 \
--pipeline_model_parallel_size 1 \
--model_id $model_name \
--port $port &

deploy_pid=$!

coverage run \
--data-file=.coverage.integration_tests \
--source=src/ \
-m pytest \
-o log_cli=true \
-o log_cli_level=INFO \
-m "not pleasefixme" \
-v -s \
tests/integration_tests/nemo_fw/test_deployment.py

kill $deploy_pid
tests/integration_tests
coverage combine -q
41 changes: 0 additions & 41 deletions tests/integration_tests/nemo_fw/test_deployment.py

This file was deleted.

14 changes: 3 additions & 11 deletions tests/integration_tests/nemo_fw/test_hf_ray.py
@@ -27,11 +27,6 @@
EvaluationTarget,
)

# FIXME(martas): EF packages pre 25.09 use old imports from nvidia_eval_commons
from nvidia_eval_commons.api.api_dataclasses import (
EvaluationResult as LegacyEvaluationResult,
)

logger = logging.getLogger(__name__)


@@ -59,7 +54,7 @@ def deployment_process():
endpoint_url=f"http://0.0.0.0:{port}/v1/completions/",
endpoint_type="completions",
model_name=model_name,
max_retries=600,
max_retries=100,
)
assert completions_ready, (
"Completions endpoint is not ready. Please look at the deploy process log for the error"
@@ -69,7 +64,7 @@ def deployment_process():
endpoint_url=f"http://0.0.0.0:{port}/v1/chat/completions",
endpoint_type="chat",
model_name=model_name,
max_retries=600,
max_retries=1, # if completions endpoint is ready, chat should be ready too
)
assert chat_ready, (
"Chat endpoint is not ready. Please look at the deploy process log for the error"
@@ -128,8 +123,5 @@ def test_evaluation(eval_type, endpoint_type, eval_params, tmp_path):
type=eval_type, params=ConfigParams(**eval_params), output_dir=str(tmp_path)
)
results = evaluate(target_cfg=eval_target, eval_cfg=eval_config)
# FIXME(martas): EF packages pre 25.09 use old imports from nvidia_eval_commons
assert isinstance(results, EvaluationResult) or isinstance(
results, LegacyEvaluationResult
)
assert isinstance(results, EvaluationResult)
logger.info("Evaluation completed.")
17 changes: 6 additions & 11 deletions tests/integration_tests/nemo_fw/test_nemo2_ray.py
@@ -27,11 +27,6 @@
EvaluationTarget,
)

# FIXME(martas): EF packages pre 25.09 use old imports from nvidia_eval_commons
from nvidia_eval_commons.api.api_dataclasses import (
EvaluationResult as LegacyEvaluationResult,
)

logger = logging.getLogger(__name__)


@@ -65,7 +60,7 @@ def deployment_process():
endpoint_url=f"http://0.0.0.0:{port}/v1/completions/",
endpoint_type="completions",
model_name=model_name,
max_retries=600,
max_retries=100,
)
assert completions_ready, (
"Completions endpoint is not ready. Please look at the deploy process log for the error"
@@ -75,7 +70,7 @@ def deployment_process():
endpoint_url=f"http://0.0.0.0:{port}/v1/chat/completions",
endpoint_type="chat",
model_name=model_name,
max_retries=600,
max_retries=1, # if completions endpoint is ready, chat should be ready too
)
assert chat_ready, (
"Chat endpoint is not ready. Please look at the deploy process log for the error"
@@ -95,6 +90,9 @@ def deployment_process():
subprocess.run(["pkill", f"-{signal.SIGTERM}", "tritonserver"], check=False)


# FIXME(martas): Errors out due to an MCore bug on deployment side
# enable once fixed in Export-Deploy
@pytest.mark.pleasefixme
@pytest.mark.run_only_on("GPU")
@pytest.mark.parametrize(
"eval_type,endpoint_type,eval_params",
@@ -133,8 +131,5 @@ def test_evaluation(eval_type, endpoint_type, eval_params, tmp_path):
type=eval_type, params=ConfigParams(**eval_params), output_dir=str(tmp_path)
)
results = evaluate(target_cfg=eval_target, eval_cfg=eval_config)
# FIXME(martas): EF packages pre 25.09 use old imports from nvidia_eval_commons
assert isinstance(results, EvaluationResult) or isinstance(
results, LegacyEvaluationResult
)
assert isinstance(results, EvaluationResult)
logger.info("Evaluation completed.")
21 changes: 8 additions & 13 deletions tests/integration_tests/nemo_fw/test_nemo2_triton.py
@@ -27,11 +27,6 @@
EvaluationTarget,
)

# FIXME(martas): EF packages pre 25.09 use old imports from nvidia_eval_commons
from nvidia_eval_commons.api.api_dataclasses import (
EvaluationResult as LegacyEvaluationResult,
)

logger = logging.getLogger(__name__)


@@ -52,9 +47,9 @@ def deployment_process():
"1",
"--num_nodes",
"1",
"--tensor_parallelism_size",
"--tensor_model_parallel_size",
"1",
"--pipeline_parallelism_size",
"--pipeline_model_parallel_size",
"1",
"--triton_model_name",
model_name,
@@ -69,7 +64,7 @@ def deployment_process():
endpoint_url=f"http://0.0.0.0:{port}/v1/completions/",
endpoint_type="completions",
model_name=model_name,
max_retries=600,
max_retries=100,
)
assert completions_ready, (
"Completions endpoint is not ready. Please look at the deploy process log for the error"
@@ -79,7 +74,7 @@ def deployment_process():
endpoint_url=f"http://0.0.0.0:{port}/v1/chat/completions",
endpoint_type="chat",
model_name=model_name,
max_retries=600,
max_retries=1, # if completions endpoint is ready, chat should be ready too
)
assert chat_ready, (
"Chat endpoint is not ready. Please look at the deploy process log for the error"
@@ -99,6 +94,9 @@ def deployment_process():
subprocess.run(["pkill", f"-{signal.SIGTERM}", "tritonserver"], check=False)


# FIXME(martas): Errors out due to an MCore bug on deployment side
# enable once fixed in Export-Deploy
@pytest.mark.pleasefixme
@pytest.mark.run_only_on("GPU")
@pytest.mark.parametrize(
"eval_type,endpoint_type,eval_params",
@@ -137,8 +135,5 @@ def test_evaluation(eval_type, endpoint_type, eval_params, tmp_path):
type=eval_type, params=ConfigParams(**eval_params), output_dir=str(tmp_path)
)
results = evaluate(target_cfg=eval_target, eval_cfg=eval_config)
# FIXME(martas): EF packages pre 25.09 use old imports from nvidia_eval_commons
assert isinstance(results, EvaluationResult) or isinstance(
results, LegacyEvaluationResult
)
assert isinstance(results, EvaluationResult)
logger.info("Evaluation completed.")
3 changes: 3 additions & 0 deletions tests/integration_tests/nemo_fw/test_notebooks.py
@@ -32,6 +32,9 @@ def uninstall_nvidia_simple_evals():
subprocess.run(["pip", "uninstall", "-y", "nvidia-simple-evals"])


# FIXME(martas): Errors out due to an MCore bug on deployment side
# enable once fixed in Export-Deploy
@pytest.mark.pleasefixme
@pytest.mark.parametrize(
"notebook_path",
[