1515#include < hipdnn_test_sdk/utilities/FlatbufferGraphTestUtils.hpp>
1616#include < memory>
1717#include < sstream>
18+ #include < string>
1819#include < vector>
1920
2021#include " CkDslContainer.hpp"
@@ -40,24 +41,45 @@ using ck_dsl_provider::PerfResult;
4041using hipdnn_data_sdk::types::half;
4142using hipdnn_test_sdk::utilities::CpuFpReferenceConvolution;
4243
43- // / I-10 capstone: end-to-end M1 integration test.
44+ // / One forward-convolution problem the parameterized integration test
45+ // / drives end to end. Spatial fields are per-dimension so non-square
46+ // / kernels, padding, strides, and dilations can each be exercised.
47+ struct ConvCase {
48+ const char * name;
49+ std::int64_t n, c, hi, wi; // input (logical NCHW)
50+ std::int64_t k, r, s; // weight (logical KCRS)
51+ std::int64_t strideH, strideW;
52+ std::int64_t padH, padW; // symmetric (pre == post) per spatial dim
53+ std::int64_t dilH, dilW;
54+ };
55+
56+ // / Standard forward-conv output extent for one spatial dimension.
57+ std::int64_t convOutputDim (std::int64_t in, std::int64_t pad, std::int64_t dil, std::int64_t k,
58+ std::int64_t stride) {
59+ return (in + 2 * pad - dil * (k - 1 ) - 1 ) / stride + 1 ;
60+ }
61+
62+ // / End-to-end M1 integration coverage for the implicit-GEMM conv path.
4463// /
45- // / Per plan §1 the test:
46- // / 1. Builds a single-op conv-fwd graph (bake-off shape: N=8,
47- // / 56x56x64 -> 64, 3x3, stride 1, pad 1, FP16, NHWC).
64+ // / Each case:
65+ // / 1. Builds a single-op conv-fwd graph for the parameterized shape.
4866// / 2. Runs it through the JIT pipeline (engine, plan-builder,
4967// / adapter, bridge, compile_service, JitCache, HipModule).
5068// / 3. Validates the output against CpuFpReferenceConvolution::fprop
5169// / from the test SDK within tolerance.
5270// / 4. Logs achieved kernel time and TFLOPS via PerfMeasurement.
5371// /
72+ // / The shape set spans tile-aligned variants (M = N*Ho*Wo, GEMM-N = K,
73+ // / and GEMM-K = C*R*S each a multiple of the kernel's 64-wide tile) and
74+ // / partial-tile probes where one or more of those dimensions is not a
75+ // / multiple of 64 -- the latter directly exercise the last-tile
76+ // / boundary handling the tile-aligned bake-off shape never touches.
77+ // /
5478// / **Adaptation from plan §1:** the test bypasses the hipDNN frontend
5579// / API and the backend's .so-loading plugin path. Both surfaces are
5680// / architecturally additive on top of what the unit-test suite
5781// / already proves -- the plan-builder + plan-execute path here is
58- // / the exact same code the backend would call after dlopen. The
59- // / frontend-API integration lands as M1.5 (or as part of I-11) once
60- // / the .so installs cleanly into a hipDNN that can find it.
82+ // / the exact same code the backend would call after dlopen.
6183// /
6284// / Tensor layout convention (PREP_FINDINGS P-6 + miopen-provider
6385// / precedent): host-side tensors carry logical NCHW dims for X/Y and
@@ -66,7 +88,7 @@ using hipdnn_test_sdk::utilities::CpuFpReferenceConvolution;
6688// / iterates logical dims and resolves via strides, so a direct
6789// / element-wise compare over the packed NHWC buffers walks the same
6890// / logical positions in the same order.
69- class IntegrationGpuCkDslConvFp16Gpu : public ::testing::Test {
91+ class IntegrationGpuCkDslConvFp16Gpu : public ::testing::TestWithParam<ConvCase> {
7092 protected:
7193 void SetUp () override {
7294 CK_DSL_PROVIDER_SKIP_IF_NOT_GFX950 (" IntegrationGpuCkDslConvFp16Gpu" );
@@ -82,33 +104,36 @@ class IntegrationGpuCkDslConvFp16Gpu : public ::testing::Test {
82104 std::unique_ptr<ConvImplicitGemmPlanBuilder> _planBuilder;
83105};
84106
85- TEST_F (IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
86- // Bake-off shape from plan §4.
87- constexpr std::int64_t kN = 8 ;
88- constexpr std::int64_t kC = 64 ;
89- constexpr std::int64_t kHi = 56 ;
90- constexpr std::int64_t kWi = 56 ;
91- constexpr std::int64_t kK = 64 ;
92- constexpr std::int64_t kR = 3 ;
93- constexpr std::int64_t kS = 3 ;
94- // Ho = (Hi + 2*pH - dH*(R-1) - 1)/sH + 1 = (56 + 2 - 2 - 1)/1 + 1 = 56.
95- constexpr std::int64_t kHo = 56 ;
96- constexpr std::int64_t kWo = 56 ;
97-
98- // FB graph -- exact same shape as ConvImplicitGemmPlanBuilderTest's
99- // ``makeBakeOffConvFwdGraph``. Tensor UIDs from
100- // createValidConvFwdGraph: x=1, w=2, y=3.
107+ TEST_P (IntegrationGpuCkDslConvFp16Gpu, Conv) {
108+ const ConvCase& cse = GetParam ();
109+ const std::int64_t kN = cse.n ;
110+ const std::int64_t kC = cse.c ;
111+ const std::int64_t kHi = cse.hi ;
112+ const std::int64_t kWi = cse.wi ;
113+ const std::int64_t kK = cse.k ;
114+ const std::int64_t kR = cse.r ;
115+ const std::int64_t kS = cse.s ;
116+ const std::int64_t kHo = convOutputDim (kHi , cse.padH , cse.dilH , kR , cse.strideH );
117+ const std::int64_t kWo = convOutputDim (kWi , cse.padW , cse.dilW , kS , cse.strideW );
118+ ASSERT_GT (kHo , 0 ) << " shape '" << cse.name << " ' yields non-positive Ho=" << kHo ;
119+ ASSERT_GT (kWo , 0 ) << " shape '" << cse.name << " ' yields non-positive Wo=" << kWo ;
120+
121+ // FB graph. Strides are the NHWC physical layout expressed over the
122+ // logical NCHW (X/Y) and KCRS (W) dim order: the channel stride is
123+ // 1, the W stride is C, the H stride is W*C, the N/K stride is the
124+ // full per-image span. Tensor UIDs from createValidConvFwdGraph:
125+ // x=1, w=2, y=3.
101126 auto fbBuilder = hipdnn_test_sdk::utilities::createValidConvFwdGraph (
102127 /* xDims=*/ {kN , kC , kHi , kWi },
103128 /* xStrides=*/ {kC * kHi * kWi , 1 , kWi * kC , kC },
104129 /* wDims=*/ {kK , kC , kR , kS },
105130 /* wStrides=*/ {kC * kR * kS , 1 , kS * kC , kC },
106131 /* yDims=*/ {kN , kK , kHo , kWo },
107132 /* yStrides=*/ {kK * kHo * kWo , 1 , kWo * kK , kK },
108- /* convPrePadding=*/ {1 , 1 },
109- /* convPostPadding=*/ {1 , 1 },
110- /* convStrides=*/ {1 , 1 },
111- /* convDilation=*/ {1 , 1 },
133+ /* convPrePadding=*/ {cse. padH , cse. padW },
134+ /* convPostPadding=*/ {cse. padH , cse. padW },
135+ /* convStrides=*/ {cse. strideH , cse. strideW },
136+ /* convDilation=*/ {cse. dilH , cse. dilW },
112137 /* dataType=*/ data_objects::DataType::HALF );
113138 flatbuffer_utilities::GraphWrapper graph (fbBuilder.GetBufferPointer (), fbBuilder.GetSize ());
114139
@@ -123,18 +148,18 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
123148 utilities::Tensor<half> tensorYGpu ({kN , kK , kHo , kWo }, nhwc);
124149 utilities::Tensor<half> tensorYCpu ({kN , kK , kHo , kWo }, nhwc);
125150
126- // Seed both inputs. Small range so the K_gemm=576 accumulation
127- // stays in a numerically friendly part of FP16 (max accumulator
128- // value is bounded by |x|*|w|*K = 0.1*0.1*576 = 5.76 ). Random
129- // distributions still exercise every codepath the kernel takes;
130- // adjusting the range only reduces the tail accumulator error.
151+ // Seed both inputs. Small range so the K_gemm = C*R*S accumulation
152+ // stays in a numerically friendly part of FP16 (the accumulator is
153+ // bounded by |x|*|w|*K_gemm = 0.1*0.1*K_gemm ). Random distributions
154+ // still exercise every codepath the kernel takes; adjusting the
155+ // range only reduces the tail accumulator error.
131156 constexpr unsigned kSeedX = 0x4242u ;
132157 constexpr unsigned kSeedW = 0x5555u ;
133158 tensorX.fillWithRandomValues (half (-0 .1f ), half (0 .1f ), kSeedX );
134159 tensorW.fillWithRandomValues (half (-0 .1f ), half (0 .1f ), kSeedW );
135160
136161 // Build the plan. This compiles the kernel on a cold cache
137- // (multi-second the first time per provider session ).
162+ // (multi-second the first time per unique shape ).
138163 flatbuffer_utilities::EngineConfigWrapper engineConfig (nullptr , 0 );
139164 CkDslContext ctx;
140165 _planBuilder->buildPlan (*_handle, graph, engineConfig, ctx);
@@ -161,10 +186,11 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
161186 // resolves to memory via inner_product(indices, strides) per
162187 // P-6's stride math. Output Y_cpu is written into a freshly-
163188 // allocated host tensor with matching strides.
164- CpuFpReferenceConvolution::fprop<half, half, half, float >(tensorX, tensorW, tensorYCpu,
165- /* strides=*/ {1 , 1 },
166- /* dilations=*/ {1 , 1 },
167- /* padding=*/ {1 , 1 });
189+ CpuFpReferenceConvolution::fprop<half, half, half, float >(
190+ tensorX, tensorW, tensorYCpu,
191+ /* strides=*/ {cse.strideH , cse.strideW },
192+ /* dilations=*/ {cse.dilH , cse.dilW },
193+ /* padding=*/ {cse.padH , cse.padW });
168194
169195 // Force D->H on the GPU output so subsequent host reads see the
170196 // kernel's writes. markDeviceModified is what tells the migration
@@ -175,9 +201,9 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
175201 const half* cpuOut = tensorYCpu.memory ().hostData ();
176202
177203 // Tolerance bound (per plan §1 + PREP_FINDINGS): expected error
178- // for K_gemm=576 random-uniform fp16 accumulations is roughly
179- // sqrt(K_gemm) * fp16_eps * |max_input * max_weight| =
180- // 24 * 1e-3 * 0.01 = 2.4e-4 typical . We use a generous 5e-2
204+ // for K_gemm random-uniform fp16 accumulations is roughly
205+ // sqrt(K_gemm) * fp16_eps * |max_input * max_weight|, which for the
206+ // shapes here stays well under 1e-3 . We use a generous 5e-2
181207 // absolute tolerance to accommodate accumulation tail behaviour
182208 // without making the test brittle to minor codegen reshufflings.
183209 constexpr float kAbsTol = 5 .0e-2f ;
@@ -209,8 +235,8 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
209235 }
210236 }
211237
212- EXPECT_EQ (mismatches, 0u ) << " found " << mismatches << " elements outside the " << kAbsTol
213- << " tolerance ("
238+ EXPECT_EQ (mismatches, 0u ) << " shape ' " << cse. name << " ': found " << mismatches
239+ << " elements outside the " << kAbsTol << " tolerance ("
214240 << static_cast <double >(mismatches) /
215241 static_cast <double >(numElements) * 100.0
216242 << " %); first mismatch at linear index " << firstMismatchIdx
@@ -219,10 +245,10 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
219245
220246 // Perf measurement (no perf-target assertion, log only per Q9).
221247 // FLOPS formula from plan §4: 2 * N * Ho * Wo * K * C * R * S.
222- constexpr double kFlops = 2.0 * static_cast <double >(kN ) * static_cast <double >(kHo ) *
223- static_cast <double >(kWo ) * static_cast <double >(kK ) *
224- static_cast <double >(kC ) * static_cast <double >(kR ) *
225- static_cast <double >(kS );
248+ const double kFlops = 2.0 * static_cast <double >(kN ) * static_cast <double >(kHo ) *
249+ static_cast <double >(kWo ) * static_cast <double >(kK ) *
250+ static_cast <double >(kC ) * static_cast <double >(kR ) *
251+ static_cast <double >(kS );
226252 PerfMeasurement pm;
227253 auto launchFn = [&]() {
228254 ctx.plan ().execute (*_handle, deviceBuffers.data (),
@@ -235,10 +261,10 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
235261 // harness's recorder captures it. Also stamp the worst element
236262 // diff in the result message so a passing test still leaves a
237263 // breadcrumb of the numerical agreement quality.
238- pm.log (" conv_implicit_gemm_bake_off_N8H56W56C64_K64R3S3 " , result);
264+ pm.log (std::string ( " conv_implicit_gemm_ " ) + cse. name , result);
239265
240266 std::ostringstream summary;
241- summary << " IntegrationGpuCkDslConvFp16Gpu.BakeOffConv : numerical agreement "
267+ summary << " IntegrationGpuCkDslConvFp16Gpu.Conv/ " << cse. name << " : numerical agreement "
242268 << " (worst abs diff = " << worstError << " < tol = " << kAbsTol
243269 << " ), perf min_us = " << result.minUs << " , median_us = " << result.medianUs
244270 << " , tflops = " << result.tflops ;
@@ -248,4 +274,27 @@ TEST_F(IntegrationGpuCkDslConvFp16Gpu, BakeOffConv) {
248274 RecordProperty (" ck_dsl_perf_summary" , summary.str ());
249275}
250276
277+ // Shape set. Cases 1-6 keep M = N*Ho*Wo, GEMM-N = K, and GEMM-K = C*R*S
278+ // each a multiple of the kernel's 64-wide tile (expected to pass). Cases
279+ // 7-10 leave one or more of those dimensions partial to probe last-tile
280+ // boundary handling the tile-aligned bake-off shape never exercises.
281+ const std::vector<ConvCase> kConvCases = {
282+ // name N C Hi Wi K R S sH sW pH pW dH dW
283+ {" BakeOff" , 8 , 64 , 56 , 56 , 64 , 3 , 3 , 1 , 1 , 1 , 1 , 1 , 1 },
284+ {" Stride2" , 8 , 64 , 56 , 56 , 64 , 3 , 3 , 2 , 2 , 1 , 1 , 1 , 1 },
285+ {" OneByOne" , 8 , 64 , 56 , 56 , 64 , 1 , 1 , 1 , 1 , 0 , 0 , 1 , 1 },
286+ {" BigChannels128" , 8 , 128 , 56 , 56 , 128 , 3 , 3 , 1 , 1 , 1 , 1 , 1 , 1 },
287+ {" NonSquare3x1" , 8 , 64 , 56 , 56 , 64 , 3 , 1 , 1 , 1 , 1 , 0 , 1 , 1 },
288+ {" Dilation2" , 8 , 64 , 56 , 56 , 64 , 3 , 3 , 1 , 1 , 2 , 2 , 2 , 2 },
289+ {" PartialGemmN_K96" , 8 , 64 , 56 , 56 , 96 , 3 , 3 , 1 , 1 , 1 , 1 , 1 , 1 },
290+ {" PartialGemmK_C48" , 8 , 48 , 56 , 56 , 64 , 3 , 3 , 1 , 1 , 1 , 1 , 1 , 1 },
291+ {" PartialGemmM_1x7x7" , 1 , 64 , 7 , 7 , 64 , 3 , 3 , 1 , 1 , 1 , 1 , 1 , 1 },
292+ {" AllPartial" , 1 , 48 , 7 , 7 , 96 , 3 , 3 , 1 , 1 , 1 , 1 , 1 , 1 },
293+ };
294+
295+ INSTANTIATE_TEST_SUITE_P (Shapes, IntegrationGpuCkDslConvFp16Gpu, ::testing::ValuesIn(kConvCases ),
296+ [](const ::testing::TestParamInfo<ConvCase>& info) {
297+ return std::string (info.param .name );
298+ });
299+
251300} // namespace
0 commit comments