Skip to content

Commit 2602169

Browse files
ggml : add GGML_OP_COL2IM_1D (#24206)
* cpu: add GGML_OP_COL2IM_1D

  Add the overlap-add (scatter-add) step of a 1D transposed convolution. A ConvTranspose1d factorizes as a GEMM followed by col2im: a weight pre-permuted to [IC, K*OC] is contracted against the [IC, T_in] input with mul_mat to produce a column matrix [K*OC, T_in], and col2im_1d scatters those columns back into the [T_out, OC] signal, with T_out = (T_in - 1)*s0 + K - 2*p0. Keeping the contraction as a plain mul_mat leaves the heavy work on the optimized (and quantizable) matmul kernels, so col2im_1d only does the cheap overlap-add. CPU uses a gather formulation parallelized over output channels, supporting F32, F16 and BF16 with an F32 accumulator.

* tests: add backend coverage for GGML_OP_COL2IM_1D

  Add test_col2im_1d next to the conv_transpose_1d cases, covering F32, F16 and BF16 across eight geometries: the canonical kernel = 2*stride DAC upsampling shape, overlap, no overlap, cropping (p0 = 1 and p0 = stride/2), kernel < stride with zeroed gaps, kernel not a multiple of stride, and a single column unfold. Perf mode gets three real vocoder stage shapes reporting memory bandwidth. max_nmse_err relaxes to 5e-4 for F16 and BF16.

* cpu: harden GGML_OP_COL2IM_1D

  ggml_col2im_1d validates s0, oc, p0 and input contiguity at graph build time, before the oc division, protecting every backend at once. The kernel asserts the contiguity its flat indexing assumes, and its doc states the full output length including the crop term. The kernel now parallelizes over the time axis: the split stays balanced down to OC = 1, where the previous channel split was single-threaded. Values are bit-identical on the three real vocoder chains, and two out of the three improve.

* tests: extend the GGML_OP_COL2IM_1D grid

  The eval grid grows to eleven geometries: OC = 1 (mono output stage), K = 1 with stride > 1 (sparse scatter, every gap position zeroed) and a crop down to T_out = 2 where all the gather bounds act at once.

* tests: add col2im_1d equivalence test

  tests/test-col2im-1d.cpp proves mul_mat + col2im_1d matches the native ggml_conv_transpose_1d on the CPU backend: F32 bit-exact, F16 and BF16 through casts of the column matrix. test-backend-ops cannot cover this for a CPU-only op since the CPU backend is its own reference there.

* rpc: bump protocol patch version for GGML_OP_COL2IM_1D

  GGML_OP_COUNT goes from 96 to 97 with the new op, which trips the static_assert in ggml-rpc.h. Bump RPC_PROTO_PATCH_VERSION since the op is appended and no existing op code shifts.
1 parent 961e9a3 commit 2602169

9 files changed

Lines changed: 343 additions & 4 deletions

File tree

ggml/include/ggml-rpc.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ extern "C" {
88

99
#define RPC_PROTO_MAJOR_VERSION 4
1010
#define RPC_PROTO_MINOR_VERSION 0
11-
#define RPC_PROTO_PATCH_VERSION 0
11+
#define RPC_PROTO_PATCH_VERSION 1
1212

1313
#ifdef __cplusplus
14-
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
14+
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
1515
#endif
1616

1717
#define GGML_RPC_MAX_SERVERS 16

ggml/include/ggml.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ extern "C" {
535535
GGML_OP_IM2COL,
536536
GGML_OP_IM2COL_BACK,
537537
GGML_OP_IM2COL_3D,
538+
GGML_OP_COL2IM_1D,
538539
GGML_OP_CONV_2D,
539540
GGML_OP_CONV_3D,
540541
GGML_OP_CONV_2D_DW,
@@ -2007,6 +2008,16 @@ extern "C" {
20072008
int d1, // dilation dimension 1
20082009
bool is_2D);
20092010

2011+
// col2im_1d: scatter-add GEMM columns back to 1D signal
2012+
// a: [K*OC, T_in] (columns from matmul, K = a->ne[0]/OC)
2013+
// result: [T_out, OC] where T_out = (T_in - 1)*s0 + K - 2*p0
2014+
GGML_API struct ggml_tensor * ggml_col2im_1d(
2015+
struct ggml_context * ctx,
2016+
struct ggml_tensor * a, // columns [K*OC, T_in]
2017+
int s0, // stride
2018+
int oc, // output channels
2019+
int p0); // padding to crop from both sides
2020+
20102021
GGML_API struct ggml_tensor * ggml_conv_1d(
20112022
struct ggml_context * ctx,
20122023
struct ggml_tensor * a, // convolution kernel

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1912,6 +1912,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
19121912
{
19131913
ggml_compute_forward_im2col_3d(params, tensor);
19141914
} break;
1915+
case GGML_OP_COL2IM_1D:
1916+
{
1917+
ggml_compute_forward_col2im_1d(params, tensor);
1918+
} break;
19151919
case GGML_OP_CONV_2D:
19161920
{
19171921
ggml_compute_forward_conv_2d(params, tensor);
@@ -2343,6 +2347,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
23432347
case GGML_OP_CONV_2D:
23442348
case GGML_OP_CONV_3D:
23452349
case GGML_OP_CONV_2D_DW:
2350+
case GGML_OP_COL2IM_1D:
23462351
case GGML_OP_CONV_TRANSPOSE_1D:
23472352
case GGML_OP_CONV_TRANSPOSE_2D:
23482353
{

ggml/src/ggml-cpu/ops.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6730,6 +6730,78 @@ static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
67306730
return (coord + size) % size; // adding size avoids negative number weirdness
67316731
}
67326732

6733+
// ggml_compute_forward_col2im_1d
6734+
//
6735+
// Scatter-add columns [K*OC, T_in] -> signal [T_out, OC]
6736+
// where T_out = (T_in - 1)*s + K - 2*p. Gather approach: each output reads ceil(K/s) inputs.
6737+
// Parallelized over the time axis so the split stays balanced whatever OC is.
6738+
// Supports F32, F16, BF16 input/output (same type), F32 accumulator.
6739+
6740+
template <typename elem_t>
6741+
static void ggml_compute_forward_col2im_1d_impl(
6742+
const ggml_compute_params * params,
6743+
ggml_tensor * dst) {
6744+
6745+
const ggml_tensor * src = dst->src[0]; // [K*OC, T_in]
6746+
6747+
GGML_ASSERT(ggml_is_contiguous(src));
6748+
GGML_ASSERT(ggml_is_contiguous(dst));
6749+
6750+
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
6751+
const int32_t OC = ((const int32_t *)(dst->op_params))[1];
6752+
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
6753+
6754+
const int64_t K_OC = src->ne[0];
6755+
const int64_t T_in = src->ne[1];
6756+
const int64_t K = K_OC / OC;
6757+
const int64_t T_out = dst->ne[0];
6758+
6759+
const elem_t * col_data = (const elem_t *) src->data;
6760+
elem_t * dst_data = (elem_t *) dst->data;
6761+
6762+
const int ith = params->ith;
6763+
const int nth = params->nth;
6764+
6765+
// Parallelize over the time axis: the split stays balanced whatever OC is,
6766+
// down to OC = 1 for mono audio, and threads read disjoint column bands
6767+
const int64_t dr = (T_out + nth - 1) / nth;
6768+
const int64_t it0 = dr * ith;
6769+
const int64_t it1 = it0 + dr < T_out ? it0 + dr : T_out;
6770+
6771+
for (int64_t oc = 0; oc < OC; oc++) {
6772+
for (int64_t t_out = it0; t_out < it1; t_out++) {
6773+
const int64_t t_abs = t_out + p0; // absolute position in uncropped signal
6774+
// Gather: find all (t_in, k) where t_in * s + k == t_abs, 0 <= k < K
6775+
int64_t t_in_min = (t_abs - K + 1 + s0 - 1) / s0; // ceil((t_abs-K+1)/s)
6776+
if (t_in_min < 0) t_in_min = 0;
6777+
int64_t t_in_max = t_abs / s0;
6778+
if (t_in_max >= T_in) t_in_max = T_in - 1;
6779+
6780+
float sum = 0.0f;
6781+
for (int64_t t_in = t_in_min; t_in <= t_in_max; t_in++) {
6782+
int64_t k = t_abs - t_in * s0;
6783+
if (k >= 0 && k < K) {
6784+
// col layout: [K*OC, T_in], element (oc*K+k, t_in)
6785+
sum += type_conversion_table<elem_t>::to_f32(col_data[(oc * K + k) + t_in * K_OC]);
6786+
}
6787+
}
6788+
// dst layout: [T_out, OC], element (t_out, oc)
6789+
dst_data[t_out + oc * T_out] = type_conversion_table<elem_t>::from_f32(sum);
6790+
}
6791+
}
6792+
}
6793+
6794+
void ggml_compute_forward_col2im_1d(
6795+
const ggml_compute_params * params,
6796+
ggml_tensor * dst) {
6797+
switch (dst->src[0]->type) {
6798+
case GGML_TYPE_F32: ggml_compute_forward_col2im_1d_impl<float> (params, dst); break;
6799+
case GGML_TYPE_F16: ggml_compute_forward_col2im_1d_impl<ggml_fp16_t>(params, dst); break;
6800+
case GGML_TYPE_BF16: ggml_compute_forward_col2im_1d_impl<ggml_bf16_t>(params, dst); break;
6801+
default: GGML_ABORT("col2im_1d: unsupported type %d", dst->src[0]->type);
6802+
}
6803+
}
6804+
67336805
// ggml_compute_forward_conv_2d
67346806

67356807

ggml/src/ggml-cpu/ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
6868
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
6969
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7070
void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
71+
void ggml_compute_forward_col2im_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7172
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7273
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
7374
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);

ggml/src/ggml.c

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1031,6 +1031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
10311031
"IM2COL",
10321032
"IM2COL_BACK",
10331033
"IM2COL_3D",
1034+
"COL2IM_1D",
10341035
"CONV_2D",
10351036
"CONV_3D",
10361037
"CONV_2D_DW",
@@ -1080,7 +1081,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
10801081
"GLU",
10811082
};
10821083

1083-
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
1084+
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
10841085

10851086
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
10861087
"none",
@@ -1141,6 +1142,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
11411142
"im2col(x)",
11421143
"im2col_back(x)",
11431144
"im2col_3d(x)",
1145+
"col2im_1d(x)",
11441146
"conv_2d(x)",
11451147
"conv_3d(x)",
11461148
"conv_2d_dw(x)",
@@ -1190,7 +1192,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
11901192
"glu(x)",
11911193
};
11921194

1193-
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
1195+
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
11941196

11951197
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
11961198

@@ -4541,6 +4543,41 @@ struct ggml_tensor * ggml_conv_1d_dw_ph(
45414543
return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
45424544
}
45434545

4546+
// ggml_col2im_1d
4547+
4548+
struct ggml_tensor * ggml_col2im_1d(
4549+
struct ggml_context * ctx,
4550+
struct ggml_tensor * a,
4551+
int s0,
4552+
int oc,
4553+
int p0) {
4554+
GGML_ASSERT(ggml_is_matrix(a));
4555+
GGML_ASSERT(ggml_is_contiguous(a));
4556+
GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16);
4557+
GGML_ASSERT(s0 > 0);
4558+
GGML_ASSERT(oc > 0);
4559+
GGML_ASSERT(p0 >= 0);
4560+
4561+
const int64_t K_OC = a->ne[0];
4562+
const int64_t T_in = a->ne[1];
4563+
const int64_t K = K_OC / oc;
4564+
const int64_t T_out = (T_in - 1) * s0 + K - 2 * p0;
4565+
4566+
GGML_ASSERT(K_OC == K * oc); // a->ne[0] must be a whole number of oc blocks
4567+
GGML_ASSERT(K > 0 && T_out > 0);
4568+
4569+
const int64_t ne[4] = { T_out, oc, 1, 1 };
4570+
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 2, ne);
4571+
4572+
int32_t params[] = { s0, (int32_t)oc, (int32_t)p0 };
4573+
ggml_set_op_params(result, params, sizeof(params));
4574+
4575+
result->op = GGML_OP_COL2IM_1D;
4576+
result->src[0] = a;
4577+
4578+
return result;
4579+
}
4580+
45444581
// ggml_conv_transpose_1d
45454582

45464583
static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ if (NOT GGML_BACKEND_DL)
265265
llama_build_and_test(test-quantize-fns.cpp)
266266
llama_build_and_test(test-quantize-perf.cpp)
267267
llama_build_and_test(test-rope.cpp)
268+
llama_build_and_test(test-col2im-1d.cpp)
268269
endif()
269270

270271
# libmtmd

tests/test-backend-ops.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5098,6 +5098,39 @@ struct test_conv_transpose_1d : public test_case {
50985098
}
50995099
};
51005100

5101+
// GGML_OP_COL2IM_1D
5102+
struct test_col2im_1d : public test_case {
5103+
const ggml_type type;
5104+
const int64_t K; // kernel size
5105+
const int64_t OC; // output channels
5106+
const int64_t T_in; // input length (number of columns)
5107+
const int s0; // stride
5108+
const int p0; // padding cropped from both sides
5109+
5110+
std::string vars() override {
5111+
return VARS_TO_STR6(type, K, OC, T_in, s0, p0);
5112+
}
5113+
5114+
double max_nmse_err() override {
5115+
return type == GGML_TYPE_F32 ? 1e-7 : 5e-4;
5116+
}
5117+
5118+
test_col2im_1d(ggml_type type = GGML_TYPE_F32,
5119+
int64_t K = 4, int64_t OC = 3, int64_t T_in = 7,
5120+
int s0 = 2, int p0 = 0)
5121+
: type(type), K(K), OC(OC), T_in(T_in), s0(s0), p0(p0) {}
5122+
5123+
ggml_tensor * build_graph(ggml_context * ctx) override {
5124+
ggml_tensor * cols = ggml_new_tensor_2d(ctx, type, K*OC, T_in);
5125+
ggml_set_name(cols, "cols");
5126+
5127+
ggml_tensor * out = ggml_col2im_1d(ctx, cols, s0, (int) OC, p0);
5128+
ggml_set_name(out, "out");
5129+
5130+
return out;
5131+
}
5132+
};
5133+
51015134
// GGML_OP_CONV_TRANSPOSE_2D
51025135
struct test_conv_transpose_2d : public test_case {
51035136
// Dimensions
@@ -8013,6 +8046,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
80138046
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
80148047
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
80158048

8049+
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16}) {
8050+
// ConvTranspose1d expressed as mul_mat + col2im (DAC decoder upsampling)
8051+
test_cases.emplace_back(new test_col2im_1d(type, 16, 32, 197, 8, 0)); // kernel = 2*stride
8052+
test_cases.emplace_back(new test_col2im_1d(type, 4, 3, 7, 2, 0));
8053+
test_cases.emplace_back(new test_col2im_1d(type, 1, 5, 13, 1, 0)); // stride 1, no overlap
8054+
test_cases.emplace_back(new test_col2im_1d(type, 6, 4, 11, 3, 1)); // with cropping
8055+
test_cases.emplace_back(new test_col2im_1d(type, 2, 3, 9, 3, 0)); // kernel < stride, gap positions are zeroed
8056+
test_cases.emplace_back(new test_col2im_1d(type, 5, 4, 11, 2, 0)); // kernel not a multiple of stride, alternating overlap
8057+
test_cases.emplace_back(new test_col2im_1d(type, 8, 4, 13, 4, 2)); // padding = stride/2 (DAC causal cropping)
8058+
test_cases.emplace_back(new test_col2im_1d(type, 4, 3, 1, 2, 0)); // single column, pure kernel unfold
8059+
test_cases.emplace_back(new test_col2im_1d(type, 16, 1, 197, 8, 0)); // OC = 1, mono output stage
8060+
test_cases.emplace_back(new test_col2im_1d(type, 1, 5, 13, 3, 0)); // K = 1 with stride > 1, sparse scatter
8061+
test_cases.emplace_back(new test_col2im_1d(type, 8, 2, 3, 2, 5)); // cropping eats most of the signal, T_out = 2
8062+
}
8063+
80168064
for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
80178065
test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1, kernel_type));
80188066
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
@@ -9366,6 +9414,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
93669414
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
93679415
}
93689416

9417+
// Memory bound overlap-add of the GEMM + col2im_1d transposed conv path, real vocoder stage shapes
9418+
test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F32, 16, 512, 2048, 8, 0));
9419+
test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F32, 4, 128, 65536, 2, 0));
9420+
test_cases.emplace_back(new test_col2im_1d(GGML_TYPE_F16, 16, 512, 2048, 8, 0));
9421+
93699422
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
93709423

93719424

0 commit comments

Comments
 (0)