[CK DSL] Parameterize conv integration test across shapes.

DarylHawkinsAMD · claude · DarylHawkinsAMD · commit afd4f82a9b2f · 2026-05-30T18:55:48.000-04:00
Convert IntegrationGpuCkDslConvFp16 from a single hardcoded
BakeOffConv TEST_F into a parameterized TEST_P over a ConvCase
shape list. The original bake-off shape is preserved as the
BakeOff case.

The bake-off shape is fully tile-aligned (GEMM-M=N*Ho*Wo, GEMM-N=K,
GEMM-K=C*R*S all multiples of the kernel's 64-wide tile), leaving
partial-tile boundary handling unverified. The shape set now adds
tile-aligned variants (stride 2, 1x1, C=K=128, non-square R!=S,
dilation 2) and partial-tile probes (partial GEMM-N, GEMM-K,
GEMM-M, and all three at once).

All 10 shapes pass on gfx950, with per-shape worst absolute differences
between 6e-5 and 2.4e-4 against the CPU reference (5e-2 tolerance).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/dnn-providers/ck-dsl-provider/integration_tests/IntegrationGpuCkDslConvFp16.cpp b/dnn-providers/ck-dsl-provider/integration_tests/IntegrationGpuCkDslConvFp16.cpp
@@ -15,6 +15,7 @@
 #include <hipdnn_test_sdk/utilities/FlatbufferGraphTestUtils.hpp>
 #include <memory>
 #include <sstream>
+#include <string>
 #include <vector>
 
 #include "CkDslContainer.hpp"
@@ -40,24 +41,45 @@ using ck_dsl_provider::PerfResult;
 using hipdnn_data_sdk::types::half;
 using hipdnn_test_sdk::utilities::CpuFpReferenceConvolution;
 
-/// I-10 capstone: end-to-end M1 integration test.
+/// One forward-convolution problem the parameterized integration test
+/// drives end to end. Spatial fields are per-dimension so non-square
+/// kernels, padding, strides, and dilations can each be exercised.
+struct ConvCase {
+    const char* name;
+    std::int64_t n, c, hi, wi;  // input  (logical NCHW)
+    std::int64_t k, r, s;       // weight (logical KCRS)
+    std::int64_t strideH, strideW;
+    std::int64_t padH, padW;  // symmetric (pre == post) per spatial dim
+    std::int64_t dilH, dilW;
+};
+
+/// Forward-conv output extent for one spatial dim; valid only if dil*(k-1)+1 <= in+2*pad.
+std::int64_t convOutputDim(std::int64_t in, std::int64_t pad, std::int64_t dil, std::int64_t k,
+                           std::int64_t stride) {
+    return (in + 2 * pad - dil * (k - 1) - 1) / stride + 1;
+}
+
+/// End-to-end M1 integration coverage for the implicit-GEMM conv path.
 ///
-/// Per plan §1 the test:
-///   1. Builds a single-op conv-fwd graph (bake-off shape: N=8,
-///      56x56x64 -> 64, 3x3, stride 1, pad 1, FP16, NHWC).
+/// Each case:
+///   1. Builds a single-op conv-fwd graph for the parameterized shape.
 ///   2. Runs it through the JIT pipeline (engine, plan-builder,
 ///      adapter, bridge, compile_service, JitCache, HipModule).
 ///   3. Validates the output against CpuFpReferenceConvolution::fprop
 ///      from the test SDK within tolerance.
 ///   4. Logs achieved kernel time and TFLOPS via PerfMeasurement.
 ///
+/// The shape set spans tile-aligned variants (M = N*Ho*Wo, GEMM-N = K,
+/// and GEMM-K = C*R*S each a multiple of the kernel's 64-wide tile) and
+/// partial-tile probes where one or more of those dimensions is not a
+/// multiple of 64 -- the latter directly exercise the last-tile
+/// boundary handling the tile-aligned bake-off shape never touches.
+///
 /// **Adaptation from plan §1:** the test bypasses the hipDNN frontend
 /// API and the backend's .so-loading plugin path. Both surfaces are
 /// architecturally additive on top of what the unit-test suite
 /// already proves -- the plan-builder + plan-execute path here is
-/// the exact same code the backend would call after dlopen. The
-/// frontend-API integration lands as M1.5 (or as part of I-11) once
-/// the .so installs cleanly into a hipDNN that can find it.
+/// the exact same code the backend would call after dlopen.
 ///
 /// Tensor layout convention (PREP_FINDINGS P-6 + miopen-provider
 /// precedent): host-side tensors carry logical NCHW dims for X/Y and
@@ -66,7 +88,7 @@ using hipdnn_test_sdk::utilities::CpuFpReferenceConvolution;
 /// iterates logical dims and resolves via strides, so a direct
 /// element-wise compare over the packed NHWC buffers walks the same
 /// logical positions in the same order.
-class IntegrationGpuCkDslConvFp16Gpu : public ::testing::Test {
+class IntegrationGpuCkDslConvFp16Gpu : public ::testing::TestWithParam<ConvCase> {
    protected:
     void SetUp() override {
         CK_DSL_PROVIDER_SKIP_IF_NOT_GFX950("IntegrationGpuCkDslConvFp16Gpu");
@@ -82,33 +104,36 @@ class IntegrationGpuCkDslConvFp16Gpu : public ::testing::Test {
     std::unique_ptr<ConvImplicitGemmPlanBuilder> _planBuilder;
 };
 
-TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
-    // Bake-off shape from plan §4.
-    constexpr std::int64_t kN = 8;
-    constexpr std::int64_t kC = 64;
-    constexpr std::int64_t kHi = 56;
-    constexpr std::int64_t kWi = 56;
-    constexpr std::int64_t kK = 64;
-    constexpr std::int64_t kR = 3;
-    constexpr std::int64_t kS = 3;
-    // Ho = (Hi + 2*pH - dH*(R-1) - 1)/sH + 1 = (56 + 2 - 2 - 1)/1 + 1 = 56.
-    constexpr std::int64_t kHo = 56;
-    constexpr std::int64_t kWo = 56;
-
-    // FB graph -- exact same shape as ConvImplicitGemmPlanBuilderTest's
-    // ``makeBakeOffConvFwdGraph``. Tensor UIDs from
-    // createValidConvFwdGraph: x=1, w=2, y=3.
+TEST_P(IntegrationGpuCkDslConvFp16Gpu, Conv) {
+    const ConvCase& cse = GetParam();
+    const std::int64_t kN = cse.n;
+    const std::int64_t kC = cse.c;
+    const std::int64_t kHi = cse.hi;
+    const std::int64_t kWi = cse.wi;
+    const std::int64_t kK = cse.k;
+    const std::int64_t kR = cse.r;
+    const std::int64_t kS = cse.s;
+    const std::int64_t kHo = convOutputDim(kHi, cse.padH, cse.dilH, kR, cse.strideH);
+    const std::int64_t kWo = convOutputDim(kWi, cse.padW, cse.dilW, kS, cse.strideW);
+    ASSERT_GT(kHo, 0) << "shape '" << cse.name << "' yields non-positive Ho=" << kHo;
+    ASSERT_GT(kWo, 0) << "shape '" << cse.name << "' yields non-positive Wo=" << kWo;
+
+    // FB graph. Strides are the NHWC physical layout expressed over the
+    // logical NCHW (X/Y) and KCRS (W) dim order: the channel stride is
+    // 1, the W stride is C, the H stride is W*C, the N/K stride is the
+    // full per-image span. Tensor UIDs from createValidConvFwdGraph:
+    // x=1, w=2, y=3.
     auto fbBuilder = hipdnn_test_sdk::utilities::createValidConvFwdGraph(
         /*xDims=*/{kN, kC, kHi, kWi},
         /*xStrides=*/{kC * kHi * kWi, 1, kWi * kC, kC},
         /*wDims=*/{kK, kC, kR, kS},
         /*wStrides=*/{kC * kR * kS, 1, kS * kC, kC},
         /*yDims=*/{kN, kK, kHo, kWo},
         /*yStrides=*/{kK * kHo * kWo, 1, kWo * kK, kK},
-        /*convPrePadding=*/{1, 1},
-        /*convPostPadding=*/{1, 1},
-        /*convStrides=*/{1, 1},
-        /*convDilation=*/{1, 1},
+        /*convPrePadding=*/{cse.padH, cse.padW},
+        /*convPostPadding=*/{cse.padH, cse.padW},
+        /*convStrides=*/{cse.strideH, cse.strideW},
+        /*convDilation=*/{cse.dilH, cse.dilW},
         /*dataType=*/data_objects::DataType::HALF);
     flatbuffer_utilities::GraphWrapper graph(fbBuilder.GetBufferPointer(), fbBuilder.GetSize());
 
@@ -123,18 +148,18 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
     utilities::Tensor<half> tensorYGpu({kN, kK, kHo, kWo}, nhwc);
     utilities::Tensor<half> tensorYCpu({kN, kK, kHo, kWo}, nhwc);
 
-    // Seed both inputs. Small range so the K_gemm=576 accumulation
-    // stays in a numerically friendly part of FP16 (max accumulator
-    // value is bounded by |x|*|w|*K = 0.1*0.1*576 = 5.76). Random
-    // distributions still exercise every codepath the kernel takes;
-    // adjusting the range only reduces the tail accumulator error.
+    // Seed both inputs. Small range so the K_gemm = C*R*S accumulation
+    // stays in a numerically friendly part of FP16 (the accumulator is
+    // bounded by |x|*|w|*K_gemm = 0.1*0.1*K_gemm). Random distributions
+    // still exercise every codepath the kernel takes; adjusting the
+    // range only reduces the tail accumulator error.
     constexpr unsigned kSeedX = 0x4242u;
     constexpr unsigned kSeedW = 0x5555u;
     tensorX.fillWithRandomValues(half(-0.1f), half(0.1f), kSeedX);
     tensorW.fillWithRandomValues(half(-0.1f), half(0.1f), kSeedW);
 
     // Build the plan. This compiles the kernel on a cold cache
-    // (multi-second the first time per provider session).
+    // (multi-second the first time per unique shape).
     flatbuffer_utilities::EngineConfigWrapper engineConfig(nullptr, 0);
     CkDslContext ctx;
     _planBuilder->buildPlan(*_handle, graph, engineConfig, ctx);
@@ -161,10 +186,11 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
     // resolves to memory via inner_product(indices, strides) per
     // P-6's stride math. Output Y_cpu is written into a freshly-
     // allocated host tensor with matching strides.
-    CpuFpReferenceConvolution::fprop<half, half, half, float>(tensorX, tensorW, tensorYCpu,
-                                                              /*strides=*/{1, 1},
-                                                              /*dilations=*/{1, 1},
-                                                              /*padding=*/{1, 1});
+    CpuFpReferenceConvolution::fprop<half, half, half, float>(
+        tensorX, tensorW, tensorYCpu,
+        /*strides=*/{cse.strideH, cse.strideW},
+        /*dilations=*/{cse.dilH, cse.dilW},
+        /*padding=*/{cse.padH, cse.padW});
 
     // Force D->H on the GPU output so subsequent host reads see the
     // kernel's writes. markDeviceModified is what tells the migration
@@ -175,9 +201,9 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
     const half* cpuOut = tensorYCpu.memory().hostData();
 
     // Tolerance bound (per plan §1 + PREP_FINDINGS): expected error
-    // for K_gemm=576 random-uniform fp16 accumulations is roughly
-    // sqrt(K_gemm) * fp16_eps * |max_input * max_weight| =
-    // 24 * 1e-3 * 0.01 = 2.4e-4 typical. We use a generous 5e-2
+    // for K_gemm random-uniform fp16 accumulations is roughly
+    // sqrt(K_gemm) * fp16_eps * |max_input * max_weight|, which for the
+    // shapes here stays well under 1e-3. We use a generous 5e-2
     // absolute tolerance to accommodate accumulation tail behaviour
     // without making the test brittle to minor codegen reshufflings.
     constexpr float kAbsTol = 5.0e-2f;
@@ -209,8 +235,8 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
         }
     }
 
-    EXPECT_EQ(mismatches, 0u) << "found " << mismatches << " elements outside the " << kAbsTol
-                              << " tolerance ("
+    EXPECT_EQ(mismatches, 0u) << "shape '" << cse.name << "': found " << mismatches
+                              << " elements outside the " << kAbsTol << " tolerance ("
                               << static_cast<double>(mismatches) /
                                      static_cast<double>(numElements) * 100.0
                               << "%); first mismatch at linear index " << firstMismatchIdx
@@ -219,10 +245,10 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
 
     // Perf measurement (no perf-target assertion, log only per Q9).
     // FLOPS formula from plan §4: 2 * N * Ho * Wo * K * C * R * S.
-    constexpr double kFlops = 2.0 * static_cast<double>(kN) * static_cast<double>(kHo) *
-                              static_cast<double>(kWo) * static_cast<double>(kK) *
-                              static_cast<double>(kC) * static_cast<double>(kR) *
-                              static_cast<double>(kS);
+    const double kFlops = 2.0 * static_cast<double>(kN) * static_cast<double>(kHo) *
+                          static_cast<double>(kWo) * static_cast<double>(kK) *
+                          static_cast<double>(kC) * static_cast<double>(kR) *
+                          static_cast<double>(kS);
     PerfMeasurement pm;
     auto launchFn = [&]() {
         ctx.plan().execute(*_handle, deviceBuffers.data(),
@@ -235,10 +261,10 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
     // harness's recorder captures it. Also stamp the worst element
     // diff in the result message so a passing test still leaves a
     // breadcrumb of the numerical agreement quality.
-    pm.log("conv_implicit_gemm_bake_off_N8H56W56C64_K64R3S3", result);
+    pm.log(std::string("conv_implicit_gemm_") + cse.name, result);
 
     std::ostringstream summary;
-    summary << "IntegrationGpuCkDslConvFp16Gpu.BakeOffConv: numerical agreement "
+    summary << "IntegrationGpuCkDslConvFp16Gpu.Conv/" << cse.name << ": numerical agreement "
             << "(worst abs diff = " << worstError << " < tol = " << kAbsTol
             << "), perf min_us = " << result.minUs << ", median_us = " << result.medianUs
             << ", tflops = " << result.tflops;
@@ -248,4 +274,27 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
     RecordProperty("ck_dsl_perf_summary", summary.str());
 }
 
+// Shape set. Cases 1-6 keep M = N*Ho*Wo, GEMM-N = K, and GEMM-K = C*R*S
+// each a multiple of the kernel's 64-wide tile. Cases 7-10 leave one or
+// more of those dimensions partial to probe last-tile boundary handling
+// the tile-aligned bake-off shape never exercises. All 10 must pass.
+const std::vector<ConvCase> kConvCases = {
+    // name                  N   C  Hi  Wi    K  R  S   sH sW  pH pW  dH dW
+    {"BakeOff", 8, 64, 56, 56, 64, 3, 3, 1, 1, 1, 1, 1, 1},
+    {"Stride2", 8, 64, 56, 56, 64, 3, 3, 2, 2, 1, 1, 1, 1},
+    {"OneByOne", 8, 64, 56, 56, 64, 1, 1, 1, 1, 0, 0, 1, 1},
+    {"BigChannels128", 8, 128, 56, 56, 128, 3, 3, 1, 1, 1, 1, 1, 1},
+    {"NonSquare3x1", 8, 64, 56, 56, 64, 3, 1, 1, 1, 1, 0, 1, 1},
+    {"Dilation2", 8, 64, 56, 56, 64, 3, 3, 1, 1, 2, 2, 2, 2},
+    {"PartialGemmN_K96", 8, 64, 56, 56, 96, 3, 3, 1, 1, 1, 1, 1, 1},
+    {"PartialGemmK_C48", 8, 48, 56, 56, 64, 3, 3, 1, 1, 1, 1, 1, 1},
+    {"PartialGemmM_1x7x7", 1, 64, 7, 7, 64, 3, 3, 1, 1, 1, 1, 1, 1},
+    {"AllPartial", 1, 48, 7, 7, 96, 3, 3, 1, 1, 1, 1, 1, 1},
+};
+
+INSTANTIATE_TEST_SUITE_P(Shapes, IntegrationGpuCkDslConvFp16Gpu, ::testing::ValuesIn(kConvCases),
+                         [](const ::testing::TestParamInfo<ConvCase>& info) {
+                             return std::string(info.param.name);
+                         });
+
 }  // namespace