Skip to content

Commit afd4f82

Browse files
[CK DSL] Parameterize conv integration test across shapes.
Convert IntegrationGpuCkDslConvFp16 from a single hardcoded BakeOffConv TEST_F into a parameterized TEST_P over a ConvCase shape list. The original bake-off shape is preserved as the BakeOff case. The bake-off shape is fully tile-aligned (M=N*Ho*Wo, GEMM-N=K, GEMM-K=C*R*S all multiples of the kernel's 64-wide tile), leaving partial-tile boundary handling unverified. The shape set now adds tile-aligned variants (stride 2, 1x1, C/K=128, non-square R!=S, dilation 2) and partial-tile probes (partial GEMM-N, GEMM-K, GEMM-M, and all three at once). All 10 shapes pass on gfx950 with worst abs diff 6e-5..2.4e-4 against the CPU reference (5e-2 tolerance). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 0f21fcb commit afd4f82

1 file changed

Lines changed: 98 additions & 49 deletions

File tree

dnn-providers/ck-dsl-provider/integration_tests/IntegrationGpuCkDslConvFp16.cpp

Lines changed: 98 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <hipdnn_test_sdk/utilities/FlatbufferGraphTestUtils.hpp>
1616
#include <memory>
1717
#include <sstream>
18+
#include <string>
1819
#include <vector>
1920

2021
#include "CkDslContainer.hpp"
@@ -40,24 +41,45 @@ using ck_dsl_provider::PerfResult;
4041
using hipdnn_data_sdk::types::half;
4142
using hipdnn_test_sdk::utilities::CpuFpReferenceConvolution;
4243

43-
/// I-10 capstone: end-to-end M1 integration test.
44+
/// One forward-convolution problem the parameterized integration test
45+
/// drives end to end. Spatial fields are per-dimension so non-square
46+
/// kernels, padding, strides, and dilations can each be exercised.
47+
struct ConvCase {
48+
const char* name;
49+
std::int64_t n, c, hi, wi; // input (logical NCHW)
50+
std::int64_t k, r, s; // weight (logical KCRS)
51+
std::int64_t strideH, strideW;
52+
std::int64_t padH, padW; // symmetric (pre == post) per spatial dim
53+
std::int64_t dilH, dilW;
54+
};
55+
56+
/// Standard forward-conv output extent for one spatial dimension.
57+
std::int64_t convOutputDim(std::int64_t in, std::int64_t pad, std::int64_t dil, std::int64_t k,
58+
std::int64_t stride) {
59+
return (in + 2 * pad - dil * (k - 1) - 1) / stride + 1;
60+
}
61+
62+
/// End-to-end M1 integration coverage for the implicit-GEMM conv path.
4463
///
45-
/// Per plan §1 the test:
46-
/// 1. Builds a single-op conv-fwd graph (bake-off shape: N=8,
47-
/// 56x56x64 -> 64, 3x3, stride 1, pad 1, FP16, NHWC).
64+
/// Each case:
65+
/// 1. Builds a single-op conv-fwd graph for the parameterized shape.
4866
/// 2. Runs it through the JIT pipeline (engine, plan-builder,
4967
/// adapter, bridge, compile_service, JitCache, HipModule).
5068
/// 3. Validates the output against CpuFpReferenceConvolution::fprop
5169
/// from the test SDK within tolerance.
5270
/// 4. Logs achieved kernel time and TFLOPS via PerfMeasurement.
5371
///
72+
/// The shape set spans tile-aligned variants (M = N*Ho*Wo, GEMM-N = K,
73+
/// and GEMM-K = C*R*S each a multiple of the kernel's 64-wide tile) and
74+
/// partial-tile probes where one or more of those dimensions is not a
75+
/// multiple of 64 -- the latter directly exercise the last-tile
76+
/// boundary handling the tile-aligned bake-off shape never touches.
77+
///
5478
/// **Adaptation from plan §1:** the test bypasses the hipDNN frontend
5579
/// API and the backend's .so-loading plugin path. Both surfaces are
5680
/// architecturally additive on top of what the unit-test suite
5781
/// already proves -- the plan-builder + plan-execute path here is
58-
/// the exact same code the backend would call after dlopen. The
59-
/// frontend-API integration lands as M1.5 (or as part of I-11) once
60-
/// the .so installs cleanly into a hipDNN that can find it.
82+
/// the exact same code the backend would call after dlopen.
6183
///
6284
/// Tensor layout convention (PREP_FINDINGS P-6 + miopen-provider
6385
/// precedent): host-side tensors carry logical NCHW dims for X/Y and
@@ -66,7 +88,7 @@ using hipdnn_test_sdk::utilities::CpuFpReferenceConvolution;
6688
/// iterates logical dims and resolves via strides, so a direct
6789
/// element-wise compare over the packed NHWC buffers walks the same
6890
/// logical positions in the same order.
69-
class IntegrationGpuCkDslConvFp16Gpu : public ::testing::Test {
91+
class IntegrationGpuCkDslConvFp16Gpu : public ::testing::TestWithParam<ConvCase> {
7092
protected:
7193
void SetUp() override {
7294
CK_DSL_PROVIDER_SKIP_IF_NOT_GFX950("IntegrationGpuCkDslConvFp16Gpu");
@@ -82,33 +104,36 @@ class IntegrationGpuCkDslConvFp16Gpu : public ::testing::Test {
82104
std::unique_ptr<ConvImplicitGemmPlanBuilder> _planBuilder;
83105
};
84106

85-
TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
86-
// Bake-off shape from plan §4.
87-
constexpr std::int64_t kN = 8;
88-
constexpr std::int64_t kC = 64;
89-
constexpr std::int64_t kHi = 56;
90-
constexpr std::int64_t kWi = 56;
91-
constexpr std::int64_t kK = 64;
92-
constexpr std::int64_t kR = 3;
93-
constexpr std::int64_t kS = 3;
94-
// Ho = (Hi + 2*pH - dH*(R-1) - 1)/sH + 1 = (56 + 2 - 2 - 1)/1 + 1 = 56.
95-
constexpr std::int64_t kHo = 56;
96-
constexpr std::int64_t kWo = 56;
97-
98-
// FB graph -- exact same shape as ConvImplicitGemmPlanBuilderTest's
99-
// ``makeBakeOffConvFwdGraph``. Tensor UIDs from
100-
// createValidConvFwdGraph: x=1, w=2, y=3.
107+
TEST_P(IntegrationGpuCkDslConvFp16Gpu, Conv) {
108+
const ConvCase& cse = GetParam();
109+
const std::int64_t kN = cse.n;
110+
const std::int64_t kC = cse.c;
111+
const std::int64_t kHi = cse.hi;
112+
const std::int64_t kWi = cse.wi;
113+
const std::int64_t kK = cse.k;
114+
const std::int64_t kR = cse.r;
115+
const std::int64_t kS = cse.s;
116+
const std::int64_t kHo = convOutputDim(kHi, cse.padH, cse.dilH, kR, cse.strideH);
117+
const std::int64_t kWo = convOutputDim(kWi, cse.padW, cse.dilW, kS, cse.strideW);
118+
ASSERT_GT(kHo, 0) << "shape '" << cse.name << "' yields non-positive Ho=" << kHo;
119+
ASSERT_GT(kWo, 0) << "shape '" << cse.name << "' yields non-positive Wo=" << kWo;
120+
121+
// FB graph. Strides are the NHWC physical layout expressed over the
122+
// logical NCHW (X/Y) and KCRS (W) dim order: the channel stride is
123+
// 1, the W stride is C, the H stride is W*C, the N/K stride is the
124+
// full per-image span. Tensor UIDs from createValidConvFwdGraph:
125+
// x=1, w=2, y=3.
101126
auto fbBuilder = hipdnn_test_sdk::utilities::createValidConvFwdGraph(
102127
/*xDims=*/{kN, kC, kHi, kWi},
103128
/*xStrides=*/{kC * kHi * kWi, 1, kWi * kC, kC},
104129
/*wDims=*/{kK, kC, kR, kS},
105130
/*wStrides=*/{kC * kR * kS, 1, kS * kC, kC},
106131
/*yDims=*/{kN, kK, kHo, kWo},
107132
/*yStrides=*/{kK * kHo * kWo, 1, kWo * kK, kK},
108-
/*convPrePadding=*/{1, 1},
109-
/*convPostPadding=*/{1, 1},
110-
/*convStrides=*/{1, 1},
111-
/*convDilation=*/{1, 1},
133+
/*convPrePadding=*/{cse.padH, cse.padW},
134+
/*convPostPadding=*/{cse.padH, cse.padW},
135+
/*convStrides=*/{cse.strideH, cse.strideW},
136+
/*convDilation=*/{cse.dilH, cse.dilW},
112137
/*dataType=*/data_objects::DataType::HALF);
113138
flatbuffer_utilities::GraphWrapper graph(fbBuilder.GetBufferPointer(), fbBuilder.GetSize());
114139

@@ -123,18 +148,18 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
123148
utilities::Tensor<half> tensorYGpu({kN, kK, kHo, kWo}, nhwc);
124149
utilities::Tensor<half> tensorYCpu({kN, kK, kHo, kWo}, nhwc);
125150

126-
// Seed both inputs. Small range so the K_gemm=576 accumulation
127-
// stays in a numerically friendly part of FP16 (max accumulator
128-
// value is bounded by |x|*|w|*K = 0.1*0.1*576 = 5.76). Random
129-
// distributions still exercise every codepath the kernel takes;
130-
// adjusting the range only reduces the tail accumulator error.
151+
// Seed both inputs. Small range so the K_gemm = C*R*S accumulation
152+
// stays in a numerically friendly part of FP16 (the accumulator is
153+
// bounded by |x|*|w|*K_gemm = 0.1*0.1*K_gemm). Random distributions
154+
// still exercise every codepath the kernel takes; adjusting the
155+
// range only reduces the tail accumulator error.
131156
constexpr unsigned kSeedX = 0x4242u;
132157
constexpr unsigned kSeedW = 0x5555u;
133158
tensorX.fillWithRandomValues(half(-0.1f), half(0.1f), kSeedX);
134159
tensorW.fillWithRandomValues(half(-0.1f), half(0.1f), kSeedW);
135160

136161
// Build the plan. This compiles the kernel on a cold cache
137-
// (multi-second the first time per provider session).
162+
// (multi-second the first time per unique shape).
138163
flatbuffer_utilities::EngineConfigWrapper engineConfig(nullptr, 0);
139164
CkDslContext ctx;
140165
_planBuilder->buildPlan(*_handle, graph, engineConfig, ctx);
@@ -161,10 +186,11 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
161186
// resolves to memory via inner_product(indices, strides) per
162187
// P-6's stride math. Output Y_cpu is written into a freshly-
163188
// allocated host tensor with matching strides.
164-
CpuFpReferenceConvolution::fprop<half, half, half, float>(tensorX, tensorW, tensorYCpu,
165-
/*strides=*/{1, 1},
166-
/*dilations=*/{1, 1},
167-
/*padding=*/{1, 1});
189+
CpuFpReferenceConvolution::fprop<half, half, half, float>(
190+
tensorX, tensorW, tensorYCpu,
191+
/*strides=*/{cse.strideH, cse.strideW},
192+
/*dilations=*/{cse.dilH, cse.dilW},
193+
/*padding=*/{cse.padH, cse.padW});
168194

169195
// Force D->H on the GPU output so subsequent host reads see the
170196
// kernel's writes. markDeviceModified is what tells the migration
@@ -175,9 +201,9 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
175201
const half* cpuOut = tensorYCpu.memory().hostData();
176202

177203
// Tolerance bound (per plan §1 + PREP_FINDINGS): expected error
178-
// for K_gemm=576 random-uniform fp16 accumulations is roughly
179-
// sqrt(K_gemm) * fp16_eps * |max_input * max_weight| =
180-
// 24 * 1e-3 * 0.01 = 2.4e-4 typical. We use a generous 5e-2
204+
// for K_gemm random-uniform fp16 accumulations is roughly
205+
// sqrt(K_gemm) * fp16_eps * |max_input * max_weight|, which for the
206+
// shapes here stays well under 1e-3. We use a generous 5e-2
181207
// absolute tolerance to accommodate accumulation tail behaviour
182208
// without making the test brittle to minor codegen reshufflings.
183209
constexpr float kAbsTol = 5.0e-2f;
@@ -209,8 +235,8 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
209235
}
210236
}
211237

212-
EXPECT_EQ(mismatches, 0u) << "found " << mismatches << " elements outside the " << kAbsTol
213-
<< " tolerance ("
238+
EXPECT_EQ(mismatches, 0u) << "shape '" << cse.name << "': found " << mismatches
239+
<< " elements outside the " << kAbsTol << " tolerance ("
214240
<< static_cast<double>(mismatches) /
215241
static_cast<double>(numElements) * 100.0
216242
<< "%); first mismatch at linear index " << firstMismatchIdx
@@ -219,10 +245,10 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
219245

220246
// Perf measurement (no perf-target assertion, log only per Q9).
221247
// FLOPS formula from plan §4: 2 * N * Ho * Wo * K * C * R * S.
222-
constexpr double kFlops = 2.0 * static_cast<double>(kN) * static_cast<double>(kHo) *
223-
static_cast<double>(kWo) * static_cast<double>(kK) *
224-
static_cast<double>(kC) * static_cast<double>(kR) *
225-
static_cast<double>(kS);
248+
const double kFlops = 2.0 * static_cast<double>(kN) * static_cast<double>(kHo) *
249+
static_cast<double>(kWo) * static_cast<double>(kK) *
250+
static_cast<double>(kC) * static_cast<double>(kR) *
251+
static_cast<double>(kS);
226252
PerfMeasurement pm;
227253
auto launchFn = [&]() {
228254
ctx.plan().execute(*_handle, deviceBuffers.data(),
@@ -235,10 +261,10 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
235261
// harness's recorder captures it. Also stamp the worst element
236262
// diff in the result message so a passing test still leaves a
237263
// breadcrumb of the numerical agreement quality.
238-
pm.log("conv_implicit_gemm_bake_off_N8H56W56C64_K64R3S3", result);
264+
pm.log(std::string("conv_implicit_gemm_") + cse.name, result);
239265

240266
std::ostringstream summary;
241-
summary << "IntegrationGpuCkDslConvFp16Gpu.BakeOffConv: numerical agreement "
267+
summary << "IntegrationGpuCkDslConvFp16Gpu.Conv/" << cse.name << ": numerical agreement "
242268
<< "(worst abs diff = " << worstError << " < tol = " << kAbsTol
243269
<< "), perf min_us = " << result.minUs << ", median_us = " << result.medianUs
244270
<< ", tflops = " << result.tflops;
@@ -248,4 +274,27 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
248274
RecordProperty("ck_dsl_perf_summary", summary.str());
249275
}
250276

277+
// Shape set. Cases 1-6 keep M = N*Ho*Wo, GEMM-N = K, and GEMM-K = C*R*S
278+
// each a multiple of the kernel's 64-wide tile (expected to pass). Cases
279+
// 7-10 leave one or more of those dimensions partial to probe last-tile
280+
// boundary handling the tile-aligned bake-off shape never exercises.
281+
const std::vector<ConvCase> kConvCases = {
282+
// name N C Hi Wi K R S sH sW pH pW dH dW
283+
{"BakeOff", 8, 64, 56, 56, 64, 3, 3, 1, 1, 1, 1, 1, 1},
284+
{"Stride2", 8, 64, 56, 56, 64, 3, 3, 2, 2, 1, 1, 1, 1},
285+
{"OneByOne", 8, 64, 56, 56, 64, 1, 1, 1, 1, 0, 0, 1, 1},
286+
{"BigChannels128", 8, 128, 56, 56, 128, 3, 3, 1, 1, 1, 1, 1, 1},
287+
{"NonSquare3x1", 8, 64, 56, 56, 64, 3, 1, 1, 1, 1, 0, 1, 1},
288+
{"Dilation2", 8, 64, 56, 56, 64, 3, 3, 1, 1, 2, 2, 2, 2},
289+
{"PartialGemmN_K96", 8, 64, 56, 56, 96, 3, 3, 1, 1, 1, 1, 1, 1},
290+
{"PartialGemmK_C48", 8, 48, 56, 56, 64, 3, 3, 1, 1, 1, 1, 1, 1},
291+
{"PartialGemmM_1x7x7", 1, 64, 7, 7, 64, 3, 3, 1, 1, 1, 1, 1, 1},
292+
{"AllPartial", 1, 48, 7, 7, 96, 3, 3, 1, 1, 1, 1, 1, 1},
293+
};
294+
295+
INSTANTIATE_TEST_SUITE_P(Shapes, IntegrationGpuCkDslConvFp16Gpu, ::testing::ValuesIn(kConvCases),
296+
[](const ::testing::TestParamInfo<ConvCase>& info) {
297+
return std::string(info.param.name);
298+
});
299+
251300
} // namespace

0 commit comments

Comments
 (0)