@@ -47,11 +47,6 @@ layout(push_constant) uniform restrict Block {
47
47
48
48
layout (local_size_x_id = 0 , local_size_y_id = 1 , local_size_z_id = 2 ) in ;
49
49
50
- // For performance improvement, reduce register usage by caching positions in shared memory.
51
- // Offset index by 1 every 16 points to avoid bank access conflict.
52
- #define offset_pos_index(index) (index + ((index) >> 4 ))
53
- shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
54
-
55
50
/*
56
51
* Computes a depthwise convolution. Each shader invocation calculates the
57
52
* output at a single output location.
@@ -77,8 +72,6 @@ void main() {
77
72
return ;
78
73
}
79
74
80
- pos_shared[offset_pos_index(gl_LocalInvocationIndex)] = pos;
81
-
82
75
// Compute the index of the top-left element of the overlay region. Negative
83
76
// indices indicate that the top-left element is in a region added by padding.
84
77
const ivec2 ipos = pos.xy * stride - padding;
@@ -89,13 +82,10 @@ void main() {
89
82
const ivec2 end = ipos + overlay_region.xy;
90
83
91
84
// sum outputs
92
- VEC4_T sum[BATCH_SIZE_Y][ BATCH_SIZE_X];
85
+ VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X];
93
86
94
- sum[0 ][0 ] = texelFetch(t_bias, ivec2 (pos.z, 0 ), 0 );
95
- for (int y = 0 ; y < BATCH_SIZE_Y; y++ ) {
96
- for (int x = 0 ; x < BATCH_SIZE_X; x++ ) {
97
- sum[y][x] = sum[0 ][0 ];
98
- }
87
+ for (int i = 0 ; i < BATCH_SIZE_Y * BATCH_SIZE_X; i++ ) {
88
+ sum[i] = VEC4_T(0 );
99
89
}
100
90
101
91
// array to store input texels
@@ -115,7 +105,7 @@ void main() {
115
105
if (i > 0 ) {
116
106
for (int j = 0 ; j < TILE_SIZE; j++ ) {
117
107
for (int s = 0 ; s < BATCH_SIZE_X; s++ ) {
118
- sum[1 ][ s] = fma(in_texels[j + s], prev_kernel_line[j], sum[1 ][ s]);
108
+ sum[BATCH_SIZE_X + s] = fma(in_texels[j + s], prev_kernel_line[j], sum[BATCH_SIZE_X + s]);
119
109
}
120
110
}
121
111
}
@@ -125,19 +115,19 @@ void main() {
125
115
for (int j = 0 ; j < TILE_SIZE; j++ , kx++ ) {
126
116
prev_kernel_line[j] = texelFetch(t_kernel, ivec2 (kx, pos.z), 0 );
127
117
for (int s = 0 ; s < BATCH_SIZE_X; s++ ) {
128
- sum[0 ][ s] = fma(in_texels[j + s], prev_kernel_line[j], sum[ 0 ] [s]);
118
+ sum[s] = fma(in_texels[j + s], prev_kernel_line[j], sum[s]);
129
119
}
130
120
}
131
121
}
132
122
}
133
123
134
- const ivec3 out_pos = pos_shared[offset_pos_index(gl_LocalInvocationIndex)] ;
124
+ const VEC4_T bias = texelFetch(t_bias, ivec2 (pos.z, 0 ), 0 ) ;
135
125
for (int y = 0 ; y < BATCH_SIZE_Y; y++ ) {
136
126
for (int x = 0 ; x < BATCH_SIZE_X; x++ ) {
137
- if (any (greaterThanEqual (ivec3 (out_pos.x + x, out_pos.y + y, out_pos.z), out_limits.xyz))) {
138
- continue ;
127
+ const ivec3 out_pos = ivec3 (pos.x + x, pos.y + y, pos.z);
128
+ if (all (lessThan (out_pos.xy, out_limits.xy))) {
129
+ imageStore(t_out, out_pos, op(sum[y * BATCH_SIZE_X + x] + bias, out_min, out_max));
139
130
}
140
- imageStore(t_out, ivec3 (out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max));
141
131
}
142
132
}
143
133
}
0 commit comments