Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ permissions:

env:
UV_HTTP_TIMEOUT: 60
PYTHONFAULTHANDLER: 1

jobs:
pre-flight:
Expand Down Expand Up @@ -70,6 +71,7 @@ jobs:
pyproject.toml
docker/**
tests/**
uv.lock
base_sha: ${{ steps.base-ref.outputs.base }}

- name: Check if docs only
Expand Down Expand Up @@ -183,7 +185,7 @@ jobs:
uv add InternVideo/InternVideo2/multi_modality
FOLDER="${{ matrix.folder }}"
FOLDER="${FOLDER/stages-/stages/}"
uv run coverage run --branch --source=nemo_curator -m pytest -v "tests/$FOLDER" -m "not gpu"
uv run coverage run --branch --source=nemo_curator -m pytest -vv -s "tests/$FOLDER" -m "not gpu"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change originates from commit bb2daef, whose message is "add verbose logging in pytest; DO NOT MERGE". The -vv -s flags increase pytest verbosity and disable output capture, which may significantly increase CI log volume. Please confirm whether these verbose pytest flags are intended to land on main, or whether they were added only for debugging and should be reverted before merge.


- name: Generate report
id: check
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ deduplication_cuda12 = [
"pylibraft-cu12==25.10.*",
"raft-dask-cu12==25.10.*",
"rapidsmpf-cu12==25.10.*",
"scikit-learn<1.8.0", # cuml 25.10 is not compatible with sklearn 1.8+
]

audio_cpu = [
Expand Down Expand Up @@ -166,7 +167,7 @@ test = [
"pytest-asyncio",
"pytest-cov",
"pytest-loguru",
"scikit-learn",
"scikit-learn<1.8.0", # cuml 25.10 is not compatible with sklearn 1.8+
"s3fs", # added for testing cloud fs
]

Expand Down
10 changes: 8 additions & 2 deletions tests/backends/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,12 @@ def test_ray_data_execution_plan(self):
"""Test that Ray Data creates the expected execution plan with correct stage organization."""
if self.backend_cls != RayDataExecutor:
pytest.skip("Execution plan test only applies to RayDataExecutor")
from packaging import version
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this import to the top of the file, alongside the other imports (around line 24).

Suggested change
from packaging import version
import ray

Then add at line 24 after other imports:

from packaging import version

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!


if version.parse(ray.__version__) >= version.parse("2.53.0"):
streaming_partitioning_stage = "StreamingRepartition[num_rows_per_block=1]"
else:
streaming_partitioning_stage = "StreamingRepartition"

# Look for execution plan in logs with multiple possible patterns
matches = re.findall(r"Execution plan of Dataset.*?:\s*(.+)", self.all_logs, re.MULTILINE)
Expand All @@ -205,10 +211,10 @@ def test_ray_data_execution_plan(self):
expected_stages = [
"InputDataBuffer[Input]",
"TaskPoolMapOperator[MapBatches(FilePartitioningStageTask)]",
"TaskPoolMapOperator[StreamingRepartition]",
f"TaskPoolMapOperator[{streaming_partitioning_stage}]",
"ActorPoolMapOperator[MapBatches(JsonlReaderStageTask)->MapBatches(AddLengthStageActor)]",
"ActorPoolMapOperator[MapBatches(SplitIntoRowsStageActor)]",
"TaskPoolMapOperator[StreamingRepartition]",
f"TaskPoolMapOperator[{streaming_partitioning_stage}]",
"ActorPoolMapOperator[MapBatches(AddLengthStageActor)]",
"ActorPoolMapOperator[MapBatches(StageWithSetupActor)]",
"TaskPoolMapOperator[MapBatches(JsonlWriterTask)]",
Expand Down
Loading
Loading