|
| 1 | +import numpy as np |
| 2 | +import paddle |
| 3 | + |
| 4 | +import tilelang |
| 5 | +import tilelang.language as T |
| 6 | + |
| 7 | + |
| 8 | +# @tilelang.jit(target="cuda") |
| 9 | +# target currently can be "cuda" or "hip" or "cpu". |
| 10 | +# if not specified, it will be inferred from the input tensors during compile time |
@tilelang.jit
def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
    """Build and JIT-compile a tiled GEMM + ReLU kernel computing C = relu(A @ B).

    Args:
        M, N, K: problem sizes — A is (M, K), B is (K, N), C is (M, N).
        block_M, block_N, block_K: per-threadblock tile sizes along M, N, K.
        dtype: element type of A, B and C (default "float16").
        accum_dtype: accumulation type for the local tile (default "float",
            i.e. fp32 accumulation over fp16 inputs).

    Returns:
        The JIT-compiled kernel; called as kernel(A, B, C), writing into C.
    """

    @T.prim_func
    def matmul_relu_kernel(
            A: T.Tensor((M, K), dtype),
            B: T.Tensor((K, N), dtype),
            C: T.Tensor((M, N), dtype),
    ):
        # Initialize Kernel Context: 2-D launch grid — bx indexes N-tiles,
        # by indexes M-tiles; 128 threads per block.
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            # Shared-memory staging tiles for A and B, and a per-thread
            # fragment accumulator for the C tile.
            A_shared = T.alloc_shared((block_M, block_K), dtype)
            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)

            # Enable rasterization for better L2 cache locality (Optional)
            # T.use_swizzle(panel_size=10, enable=True)

            # Clear local accumulation before the K-loop.
            T.clear(C_local)

            # Software-pipelined loop over K tiles (3 stages overlap the
            # global->shared copies with the tile GEMM).
            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
                # Copy tile of A
                # This is a sugar syntax for parallelized copy
                T.copy(A[by * block_M, ko * block_K], A_shared)

                # Copy tile of B
                T.copy(B[ko * block_K, bx * block_N], B_shared)

                # Perform a tile-level GEMM on the shared buffers
                # Currently we dispatch to the cute/hip on Nvidia/AMD GPUs
                T.gemm(A_shared, B_shared, C_local)

            # Fused ReLU applied element-wise on the accumulator.
            for i, j in T.Parallel(block_M, block_N):
                C_local[i, j] = T.max(C_local[i, j], 0)

            # Copy result back to global memory (implicitly casts the
            # accum_dtype fragment back to C's dtype).
            T.copy(C_local, C[by * block_M, bx * block_N])

    return matmul_relu_kernel
| 51 | + |
| 52 | + |
def test_quick_start():
    """Compile the tiled matmul+ReLU kernel and validate it against a Paddle reference.

    Builds a 1024x1024x1024 fp16 GEMM with 128x128x32 tiles, runs the
    compiled kernel on GPU tensors, and checks the output against
    relu(a @ b) computed by Paddle within fp16-appropriate tolerances.
    """
    M = 1024  # M = T.dynamic("m") if you want to use dynamic shape
    N = 1024
    K = 1024
    block_M = 128
    block_N = 128
    block_K = 32

    # Define the kernel (matmul) and compile/lower it into an executable module
    matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K)

    # Place new tensors on the GPU; Paddle has no per-call `device=` argument
    # (unlike torch.randn), so placement is controlled globally.
    paddle.set_device("gpu")

    # Create random input tensors on the GPU.
    # NOTE: Paddle's randn/empty take the shape as a list, not as separate
    # positional dims — the torch-style randn(M, K, device=...) call raises.
    a = paddle.randn([M, K], dtype=paddle.float16)
    b = paddle.randn([K, N], dtype=paddle.float16)
    c = paddle.empty([M, N], dtype=paddle.float16)

    # Run the compiled kernel; the result is written into `c` in place.
    matmul_relu_kernel(a, b, c)

    print(c)
    # Reference computation using Paddle
    ref_c = paddle.nn.functional.relu(a @ b)

    # Validate correctness (loose tolerances to allow fp16 rounding)
    np.testing.assert_allclose(c.numpy(), ref_c.numpy(), rtol=1e-2, atol=1e-2)
0 commit comments