NVIDIA
diff --git a/.github/workflows/_example_tests_runner.yml b/.github/workflows/_example_tests_runner.yml
Lines changed: 6 additions & 2 deletions
diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
Lines changed: 6 additions & 5 deletions
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
Lines changed: 4 additions & 4 deletions
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
Lines changed: 4 additions & 4 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
Lines changed: 14 additions & 8 deletions
diff --git a/examples/diffusers/quantization/diffusion_trt.py b/examples/diffusers/quantization/diffusion_trt.py
Lines changed: 12 additions & 4 deletions
diff --git a/examples/llm_eval/run_simple_eval.sh b/examples/llm_eval/run_simple_eval.sh
Lines changed: 7 additions & 1 deletion
diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
Lines changed: 1 addition & 1 deletion
diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh
Lines changed: 3 additions & 1 deletion
diff --git a/examples/llm_sparsity/weight_sparsity/data_prep.py b/examples/llm_sparsity/weight_sparsity/data_prep.py
Lines changed: 15 additions & 0 deletions
@@ -47,8 +47,12 @@ jobs:
           echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
       - name: Install dependencies
         run: |
-          # use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers
-          python -m pip install ".${{ inputs.pip_install_extras }}"
+          # Uninstall conflicting system-wide installed modelopt in nemo containers
+          pip uninstall -y nvidia-modelopt || true
+
+          # Use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers
+          # Editable install so example scripts launched as subprocesses resolve modelopt to the same source path as the test process
+          python -m pip install -e ".${{ inputs.pip_install_extras }}"
 
           if [[ "${{ inputs.example }}" == *"diffusers"* ]]; then
             echo "Uninstalling apex for diffusers: T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391"
 
@@ -35,14 +35,14 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        example: [llm_distill, llm_qat, llm_sparsity, diffusers_sparsity, specdec_bench]
+        example: [diffusers_sparsity, gpt-oss, llm_distill, llm_qat, llm_sparsity, specdec_bench]
         include:
           - example: speculative_decoding
             docker_image: "26.01"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.04' }}-py3"
+      docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.05' }}-py3"
       example: ${{ matrix.example }}
       timeout_minutes: 30
       pip_install_extras: "[hf,dev-test]"
@@ -59,7 +59,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16"
+      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc17"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-rtxpro6000-latest-1
@@ -73,7 +73,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16"
+      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc17"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-rtxpro6000-latest-2
@@ -102,8 +102,9 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt:26.04-py3"
+      docker_image: "nvcr.io/nvidia/tensorrt:26.05-py3"
       example: ${{ matrix.example }}
+      timeout_minutes: 45
       pip_install_extras: "[onnx,hf,dev-test]"
       runner: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
 
 
@@ -39,16 +39,16 @@ jobs:
       matrix:
         include:
           - example: gpu
-            timeout: 75
-            container_image: nvcr.io/nvidia/pytorch:26.04-py3
+            timeout: 60
+            container_image: nvcr.io/nvidia/pytorch:26.05-py3
           - example: gpu_megatron
             timeout: 60
             container_image: nvcr.io/nvidia/nemo:26.04
           - example: gpu_trtllm
             timeout: 30
-            container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16
+            container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc17
           - example: gpu_vllm
-            timeout: 30
+            timeout: 15
             container_image: docker.io/vllm/vllm-openai:v0.20.0
     runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
     timeout-minutes: ${{ matrix.timeout }}
 
@@ -58,7 +58,7 @@ jobs:
   linux:
     needs: [check-dco]
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 15
     steps:
       - uses: actions/checkout@v6
       - uses: ./.github/actions/ubuntu-setup
@@ -78,7 +78,7 @@ jobs:
     if: needs.check-file-changes.outputs.any_changed == 'true'
     needs: [linux, check-file-changes]
     runs-on: windows-latest
-    timeout-minutes: 30
+    timeout-minutes: 15
     steps:
       - uses: actions/checkout@v6
       - uses: actions/setup-python@v6
@@ -90,7 +90,7 @@ jobs:
     if: needs.check-file-changes.outputs.any_changed == 'true'
     needs: [linux, check-file-changes]
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 15
     strategy:
       fail-fast: false
       matrix:
@@ -115,7 +115,7 @@ jobs:
     if: needs.check-file-changes.outputs.any_changed == 'true'
     needs: [linux, check-file-changes]
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 15
     strategy:
       fail-fast: false
       matrix:
 
@@ -163,14 +163,20 @@ nox -s "unit-3.12(torch_211, tf_latest)"
 
 ### Test design principles
 
-- **Develop with focused tests.** During development, write as many focused
-  tests as needed, including lower-level unit tests or internal probes, to
-  understand and harden behavior.
-- **Curate production tests and keep them lean.** Before staging or committing,
-  decide which tests should be checked in. Checked-in tests should document
-  expected behavior, protect against regressions, or flag backward-incompatible
-  behavior changes. Remove redundant lower-level tests when a higher-level test
-  already covers the same behavior, keeping CI/CD fast and lean.
+- **Develop with focused tests.** During development, write as many focused tests as needed, including lower-level
+  unit tests or internal probes, to understand and harden behavior.
+- **Curate production tests and keep them lean.** Before staging or committing, decide which tests should be checked
+  in. Checked-in tests should document expected behavior, protect against regressions, or flag backward-incompatible
+  behavior changes. Remove redundant lower-level tests when a higher-level test already covers the same behavior,
+  keeping CI/CD fast and lean.
+- **Keep `tests/unit` offline — no Hugging Face Hub access.** Unit tests must be hermetic so they never flake on
+  network/timeout issues. Do not call `from_pretrained("<org>/<model>")`, `load_dataset("<hub-id>")`,
+  `snapshot_download(...)`, etc. with Hub IDs. Instead build dummy models, tokenizers, configs, and datasets locally —
+  e.g. the `create_tiny_*` helpers and `get_tiny_tokenizer()` in `tests/_test_utils/`, or a small on-disk dataset
+  directory written with `datasets.Dataset.from_dict(...).to_parquet(...)`.
+- **Respect the per-test timeout.** `tests/conftest.py` applies a default per-test call timeout by directory; override a
+  single slow test with `@pytest.mark.timeout(<seconds>)`, and register any new top-level `tests/<group>/` in that
+  mapping (test collection fails with an error until you do).
 
 ## ✍️ Signing your work
 
 
@@ -65,14 +65,14 @@
 
 
 @torch.inference_mode()
-def generate_image(pipe, prompt, image_name, torch_autocast=False):
+def generate_image(pipe, prompt, image_name, torch_autocast=False, num_inference_steps=30):
     context = torch.autocast("cuda") if torch_autocast else nullcontext()
     seed = 42
     with context:
         image = pipe(
             prompt,
             output_type="pil",
-            num_inference_steps=30,
+            num_inference_steps=num_inference_steps,
             generator=torch.Generator("cuda").manual_seed(seed),
         ).images[0]
     image.save(image_name)
@@ -186,6 +186,12 @@ def main():
         help="Use torch.autocast() during inference or benchmarking",
     )
     parser.add_argument("--skip-image", action="store_true", help="Skip image generation")
+    parser.add_argument(
+        "--num-inference-steps",
+        type=int,
+        default=30,
+        help="Number of denoising steps for image generation (lower is faster; tests use few).",
+    )
     args = parser.parse_args()
 
     image_name = args.save_image_as if args.save_image_as else f"{args.model}.png"
@@ -235,7 +241,9 @@ def main():
             )
 
         if not args.skip_image:
-            generate_image(pipe, args.prompt, image_name, args.torch_autocast)
+            generate_image(
+                pipe, args.prompt, image_name, args.torch_autocast, args.num_inference_steps
+            )
         return
 
     backbone.to("cuda")
@@ -322,7 +330,7 @@ def main():
     pipe.to("cuda")
 
     if not args.skip_image:
-        generate_image(pipe, args.prompt, image_name, args.torch_autocast)
+        generate_image(pipe, args.prompt, image_name, args.torch_autocast, args.num_inference_steps)
         print(f"Image generated using {args.model} model saved as {image_name}")
 
     if args.benchmark:
 
@@ -22,6 +22,7 @@ MODEL_NAME=$1
 EVALS=$2
 BUILD_MAX_OUTPUT_LEN=${3:-2048}
 PORT=${4:-8000}
+NUM_EXAMPLES=${5:-}  # optional: limit examples per eval (default: full eval)
 
 if [ ! -d "human-eval" ]; then
     git clone https://github.com/openai/human-eval.git
@@ -42,4 +43,9 @@ popd
 export OPENAI_API_KEY="local"
 export OPENAI_BASE_URL="http://localhost:$PORT/v1"
 
-python -m simple-evals.simple_evals --model $MODEL_NAME --evals $EVALS --max_tokens $BUILD_MAX_OUTPUT_LEN
+examples_flag=""
+if [ -n "$NUM_EXAMPLES" ]; then
+    examples_flag="--examples $NUM_EXAMPLES"
+fi
+
+python -m simple-evals.simple_evals --model $MODEL_NAME --evals $EVALS --max_tokens $BUILD_MAX_OUTPUT_LEN $examples_flag
@@ -328,7 +328,7 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
 
     if [[ $TASKS =~ "simple_eval" ]]; then
         echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"
-        bash run_simple_eval.sh $MODEL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt
+        bash run_simple_eval.sh $MODEL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT $SIMPLE_EVAL_LIMIT | tee $SAVE_PATH/simple_eval.txt
         echo "Simple eval results are saved under $SAVE_PATH/simple_eval.txt."
     fi
 
 
@@ -38,7 +38,7 @@ parse_options() {
     CAST_MXFP4_TO_NVFP4=false
 
   # Parse command-line options
-  ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
+  ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,simple_eval_limit:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
 
   eval set -- "$ARGS"
   while true; do
@@ -60,6 +60,7 @@ parse_options() {
       --lm_eval_tasks ) LM_EVAL_TASKS="$2"; shift 2;;
       --lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;;
       --simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;;
+      --simple_eval_limit ) SIMPLE_EVAL_LIMIT="$2"; shift 2;;
       --trust_remote_code ) TRUST_REMOTE_CODE=true; shift;;
       --use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;;
       --gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;;
@@ -159,6 +160,7 @@ parse_options() {
   echo "lm_eval_tasks: $LM_EVAL_TASKS"
   echo "lm_eval_limit: $LM_EVAL_LIMIT"
   echo "simple_eval_tasks: $SIMPLE_EVAL_TASKS"
+  echo "simple_eval_limit: $SIMPLE_EVAL_LIMIT"
   echo "num_sample: $NUM_SAMPLES"
   echo "use_seq_device_map: $USE_SEQ_DEVICE_MAP"
   echo "gpu_max_mem_percentage: $GPU_MAX_MEM_PERCENTAGE"
 
@@ -39,6 +39,13 @@ def preprocess_function(sample):
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--save_path", type=str, default="data")
+    parser.add_argument(
+        "--max_samples",
+        type=int,
+        default=None,
+        help="If set, keep only the first N rows of each split before processing. Greatly "
+        "speeds up preparation for smoke tests (cnn_dailymail train is ~287k rows).",
+    )
     return parser.parse_args()
 
 
@@ -48,6 +55,14 @@ def main():
     # Load dataset from the hub
     dataset = load_dataset(dataset_id, name=dataset_config)
 
+    if args.max_samples is not None:
+        dataset = type(dataset)(
+            {
+                split: ds.select(range(min(args.max_samples, len(ds))))
+                for split, ds in dataset.items()
+            }
+        )
+
     # process dataset
     tokenized_dataset = dataset.map(
         preprocess_function, batched=True, remove_columns=list(dataset["train"].features)