NVIDIA · Edenzzzz · Nov 1, 2025 · Nov 1, 2025
diff --git a/examples/python/CuTeDSL/ampere/call_bypass_dlpack.py b/examples/python/CuTeDSL/ampere/call_bypass_dlpack.py
@@ -47,7 +47,7 @@
 
 .. code-block:: bash
 
-    python examples/ampere/call_bypass_dlpack.py
+    python examples/python/CuTeDSL/ampere/call_bypass_dlpack.py
 
 
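-It's worth to mention that by-passing dlpack protocol can resolve the issue that dlpack doesn't handle shape-1
+It's worth mentioning that bypassing the dlpack protocol can resolve the issue that dlpack doesn't handle shape-1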

diff --git a/examples/python/CuTeDSL/ampere/call_from_jit.py b/examples/python/CuTeDSL/ampere/call_from_jit.py
@@ -44,7 +44,7 @@
 
 .. code-block:: bash
 
-    python examples/ampere/call_from_jit.py
+    python examples/python/CuTeDSL/ampere/call_from_jit.py
 
 Default configuration:
 - Batch dimension (L): 16

diff --git a/examples/python/CuTeDSL/ampere/elementwise_add.py b/examples/python/CuTeDSL/ampere/elementwise_add.py
@@ -118,16 +118,16 @@
 
 .. code-block:: bash
 
-    python examples/ampere/elementwise_add.py --M 3 --N 12
-    python examples/ampere/elementwise_add.py --M 1024 --N 512
-    python examples/ampere/elementwise_add.py --M 1024 --N 1024 --benchmark --warmup_iterations 2 --iterations 1000
+    python examples/python/CuTeDSL/ampere/elementwise_add.py --M 3 --N 12
+    python examples/python/CuTeDSL/ampere/elementwise_add.py --M 1024 --N 512
+    python examples/python/CuTeDSL/ampere/elementwise_add.py --M 1024 --N 1024 --benchmark --warmup_iterations 2 --iterations 1000
 
 To collect performance with NCU profiler:
 
 .. code-block:: bash
 
     # Don't iterate too many times when profiling with ncu
-    ncu python examples/ampere/elementwise_add.py --M 2048 --N 2048 --benchmark --iterations 10 --skip_ref_check
+    ncu python examples/python/CuTeDSL/ampere/elementwise_add.py --M 2048 --N 2048 --benchmark --iterations 10 --skip_ref_check
 """
 
 

diff --git a/examples/python/CuTeDSL/ampere/elementwise_apply.py b/examples/python/CuTeDSL/ampere/elementwise_apply.py
@@ -60,16 +60,16 @@
 .. code-block:: bash
 
     # Run with addition operation
-    python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op add
+    python examples/python/CuTeDSL/ampere/elementwise_apply.py --M 1024 --N 512 --op add
 
     # Run with multiplication operation
-    python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op mul
+    python examples/python/CuTeDSL/ampere/elementwise_apply.py --M 1024 --N 512 --op mul
 
     # Run with subtraction operation
-    python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op sub
+    python examples/python/CuTeDSL/ampere/elementwise_apply.py --M 1024 --N 512 --op sub
 
     # Benchmark performance
-    python examples/ampere/elementwise_apply.py --M 2048 --N 2048 --op add --benchmark --warmup_iterations 2 --iterations 10
+    python examples/python/CuTeDSL/ampere/elementwise_apply.py --M 2048 --N 2048 --op add --benchmark --warmup_iterations 2 --iterations 10
 
 The example demonstrates how to express complex CUDA kernels with customizable operations
 while maintaining high performance through efficient memory access patterns.

diff --git a/examples/python/CuTeDSL/ampere/flash_attention_v2.py b/examples/python/CuTeDSL/ampere/flash_attention_v2.py
@@ -67,7 +67,7 @@
 
 .. code-block:: bash
 
-    python examples/ampere/flash_attention_v2.py                                            \
+    python examples/python/CuTeDSL/ampere/flash_attention_v2.py                                            \
       --dtype Float16 --head_dim 128 --m_block_size 128 --n_block_size 128                  \
       --num_threads 128 --batch_size 1 --seqlen_q 1280 --seqlen_k 1536                      \
       --num_head 16 --softmax_scale 1.0 --is_causal
@@ -81,7 +81,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/ampere/flash_attention_v2.py                                        \
+    ncu python examples/python/CuTeDSL/ampere/flash_attention_v2.py                                        \
         --dtype Float16 --head_dim 128 --m_block_size 128 --n_block_size 128                \
         --num_threads 128 --batch_size 1 --seqlen_q 1280 --seqlen_k 1536                    \
         --num_head 16 --softmax_scale 1.0 --is_causal --skip_ref_check

diff --git a/examples/python/CuTeDSL/ampere/sgemm.py b/examples/python/CuTeDSL/ampere/sgemm.py
@@ -66,15 +66,15 @@
 
 .. code-block:: bash
 
-    python examples/ampere/sgemm.py                       \
+    python examples/python/CuTeDSL/ampere/sgemm.py                       \
       --mnk 8192,8192,8192                                \
       --a_major m --b_major n --c_major n
 
 To collect performance with NCU profiler:
 
 .. code-block:: bash
 
-    ncu python examples/ampere/sgemm.py                   \
+    ncu python examples/python/CuTeDSL/ampere/sgemm.py                   \
       --mnk 8192,8192,8192                                \
       --a_major m --b_major n --c_major n                 \
       --skip_ref_check --iterations 2

diff --git a/examples/python/CuTeDSL/ampere/smem_allocator.py b/examples/python/CuTeDSL/ampere/smem_allocator.py
@@ -52,7 +52,7 @@
 
 .. code-block:: bash
 
-    python examples/ampere/smem_allocator.py
+    python examples/python/CuTeDSL/ampere/smem_allocator.py
 
 The example will allocate shared memory, perform tensor operations, and verify the results.
 """

diff --git a/examples/python/CuTeDSL/ampere/tensorop_gemm.py b/examples/python/CuTeDSL/ampere/tensorop_gemm.py
@@ -65,7 +65,7 @@
 
 .. code-block:: bash
 
-    python examples/ampere/tensorop_gemm.py                                  \
+    python examples/python/CuTeDSL/ampere/tensorop_gemm.py                                  \
       --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1                        \
       --ab_dtype Float16                                                     \
       --c_dtype Float16 --acc_dtype Float32                                  \
@@ -80,7 +80,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/ampere/tensorop_gemm.py                              \
+    ncu python examples/python/CuTeDSL/ampere/tensorop_gemm.py                              \
       --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1                        \
       --ab_dtype Float16                                                     \
       --c_dtype Float16 --acc_dtype Float32                                  \

diff --git a/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py b/examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py
@@ -85,7 +85,7 @@
 
 .. code-block:: bash
 
-    python examples/blackwell/dense_blockscaled_gemm_persistent.py             \
+    python examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py            \
       --ab_dtype Float4E2M1FN --sf_dtype Float8E8M0FNU --sf_vec_size 16        \
       --c_dtype Float16                                                        \
       --mma_tiler_mn 256,128 --cluster_shape_mn 2,1                            \
@@ -95,7 +95,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/blackwell/dense_blockscaled_gemm_persistent.py         \
+    ncu python examples/python/CuTeDSL/blackwell/dense_blockscaled_gemm_persistent.py        \
       --ab_dtype Float4E2M1FN --sf_dtype Float8E8M0FNU --sf_vec_size 16        \
       --c_dtype Float16                                                        \
       --mma_tiler_mn 256,128 --cluster_shape_mn 2,1                            \

diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm.py b/examples/python/CuTeDSL/blackwell/dense_gemm.py
@@ -75,7 +75,7 @@
 
 .. code-block:: bash
 
-    python examples/blackwell/dense_gemm.py                                     \
+    python examples/python/CuTeDSL/blackwell/dense_gemm.py                                     \
       --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32                  \
       --mma_tiler_mn 256,128 --cluster_shape_mn 2,1                             \
       --mnkl 8192,8192,8192,1                                                   \
@@ -90,7 +90,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/blackwell/dense_gemm.py                                \
+    ncu python examples/python/CuTeDSL/blackwell/dense_gemm.py                                \
       --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32                 \
       --mma_tiler_mn 256,128 --cluster_shape_mn 2,1                            \
       --mnkl 8192,8192,8192,1                                                  \

diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py
@@ -76,7 +76,7 @@
 
 .. code-block:: bash
 
-    python examples/blackwell/dense_gemm_persistent.py                          \
+    python examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py                          \
       --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32                  \
       --mma_tiler_mn 256,128 --cluster_shape_mn 2,1                             \
       --mnkl 8192,8192,8192,1                                                   \
@@ -86,7 +86,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/blackwell/dense_gemm_persistent.py                     \
+    ncu python examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py                     \
       --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32                 \
       --mma_tiler_mn 256,128 --cluster_shape_mn 2,1                            \
       --mnkl 8192,8192,8192,1                                                  \

diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm_software_pipeline.py b/examples/python/CuTeDSL/blackwell/dense_gemm_software_pipeline.py
@@ -74,7 +74,7 @@
 
 .. code-block:: bash
 
-    python examples/blackwell/dense_gemm_software_pipeline.py                   \
+    python examples/python/CuTeDSL/blackwell/dense_gemm_software_pipeline.py                   \
       --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32                  \
       --mma_tiler_mn 256,128 --cluster_shape_mn 2,1                             \
       --mnkl 8192,8192,8192,1                                                   \
@@ -89,7 +89,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/blackwell/dense_gemm_software_pipeline.py              \
+    ncu python examples/python/CuTeDSL/blackwell/dense_gemm_software_pipeline.py              \
       --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32                 \
       --mma_tiler_mn 256,128 --cluster_shape_mn 2,1                            \
       --mnkl 8192,8192,8192,1                                                  \

diff --git a/examples/python/CuTeDSL/blackwell/fmha.py b/examples/python/CuTeDSL/blackwell/fmha.py
@@ -70,7 +70,7 @@
 
 .. code-block:: bash
 
-    python examples/blackwell/fmha.py                                     \
+    python examples/python/CuTeDSL/blackwell/fmha.py                                     \
       --qk_acc_dtype Float32 --pv_acc_dtype Float32                       \
       --mma_tiler_mn 128,128                                              \
       --q_shape 4,1024,8,64 --k_shape 4,1024,8,64                         \
@@ -84,7 +84,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/blackwell/fmha.py                                 \
+    ncu python examples/python/CuTeDSL/blackwell/fmha.py                                 \
       --qk_acc_dtype Float32 --pv_acc_dtype Float32                       \
       --mma_tiler_mn 128,128                                              \
       --q_shape 4,1024,8,64 --k_shape 4,1024,8,64                         \

diff --git a/examples/python/CuTeDSL/blackwell/grouped_gemm.py b/examples/python/CuTeDSL/blackwell/grouped_gemm.py
@@ -58,7 +58,7 @@
 
 .. code-block:: bash
 
-    python examples/blackwell/grouped_gemm.py                                                 \
+    python examples/python/CuTeDSL/blackwell/grouped_gemm.py                                                 \
       --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32                                \
       --mma_tiler_mn 128,64 --cluster_shape_mn 1,1                                            \
       --problem_sizes_mnkl "(8192,1280,32,1),(16,384,1536,1),(640,1280,16,1),(640,160,16,1)"  \
@@ -72,7 +72,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/blackwell/grouped_gemm.py                                             \
+    ncu python examples/python/CuTeDSL/blackwell/grouped_gemm.py                                             \
       --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32                                \
       --mma_tiler_mn 128,64 --cluster_shape_mn 1,1                                            \
       --problem_sizes_mnkl "(8192,1280,32,1),(16,384,1536,1),(640,1280,16,1),(640,160,16,1)"  \

diff --git a/examples/python/CuTeDSL/hopper/dense_gemm.py b/examples/python/CuTeDSL/hopper/dense_gemm.py
@@ -69,7 +69,7 @@
 
 .. code-block:: bash
 
-    python examples/hopper/dense_gemm.py                                   \
+    python examples/python/CuTeDSL/hopper/dense_gemm.py                                   \
       --mnkl 8192,8192,8192,1 --tile_shape_mn 128,256                      \
       --cluster_shape_mn 1,1 --a_dtype Float16 --b_dtype Float16           \
       --c_dtype Float16 --acc_dtype Float32                                \
@@ -84,7 +84,7 @@
 
 .. code-block:: bash
 
-    ncu python examples/hopper/dense_gemm.py                               \
+    ncu python examples/python/CuTeDSL/hopper/dense_gemm.py                               \
       --mnkl 8192,8192,8192,1 --tile_shape_mn 128,256                      \
       --cluster_shape_mn 1,1 --a_dtype Float16 --b_dtype Float16           \
       --c_dtype Float16 --acc_dtype Float32                                \