Skip to content

Commit d12753e

Browse files
gwoltman authored and preda committed
Dialed back one of the MM2_CHAIN=0 changes to keep rocm optimizer happy
1 parent 626bfc4 commit d12753e

File tree

1 file changed

+19
-15
lines changed

1 file changed

+19
-15
lines changed

src/cl/fft-middle.cl

Lines changed: 19 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -114,26 +114,18 @@ void middleMul(T2 *u, u32 s, Trig trig) {
114114
#if MM_CHAIN == 0
115115
WADDF(1, w);
116116
T2 base;
117-
if (MIDDLE >= 10) {
118-
base = csqTrigFancy(w);
119-
WADDF(2, base);
120-
base = ccubeTrigFancy(base, w);
121-
WADDF(3, base);
122-
base.x += 1;
123-
} else {
124-
base = csqTrigFancy(w);
125-
WADDF(2, base);
126-
base = ccubeTrigFancy(base, w);
127-
WADDF(3, base);
128-
base.x += 1;
129-
}
117+
base = csqTrigFancy(w);
118+
WADDF(2, base);
119+
base = ccubeTrigFancy(base, w);
120+
WADDF(3, base);
121+
base.x += 1;
130122

131123
for (u32 k = 4; k < MIDDLE; ++k) {
132124
base = cmulFancy(base, w);
133125
WADD(k, base);
134126
}
135127

136-
#elif 0 && MM_CHAIN == 1 // This is fewer F64 ops, but slower on Radeon 7 -- probably the optimizer being weird. It also has somewhat worse Z.
128+
#elif 0 && MM_CHAIN == 1 // This is fewer F64 ops, but may be slower on Radeon 7 -- probably the optimizer being weird. It also has somewhat worse Z.
137129
for (u32 k = 3 + (MIDDLE - 2) % 3; k < MIDDLE; k += 3) {
138130
T2 base, base_minus1, base_plus1;
139131
base = slowTrig_N(WIDTH * k * s, WIDTH * SMALL_HEIGHT * k);
@@ -204,7 +196,19 @@ void middleMul2(T2 *u, u32 x, u32 y, double factor, Trig trig) {
204196
} else { // MIDDLE >= 5
205197
// T2 w = slowTrig_N(x * SMALL_HEIGHT, ND / MIDDLE);
206198

207-
#if MM2_CHAIN == 0
199+
#if AMDGPU && MM2_CHAIN == 0 // Oddly, Radeon 7 is faster with this version that uses more F64 ops
200+
201+
T2 base = slowTrig_N(x * y + x * SMALL_HEIGHT, ND / MIDDLE * 2) * factor;
202+
WADD(0, base);
203+
WADD(1, base);
204+
205+
for (u32 k = 2; k < MIDDLE; ++k) {
206+
base = cmulFancy(base, w);
207+
WADD(k, base);
208+
}
209+
WSUBF(0, w);
210+
211+
#elif MM2_CHAIN == 0
208212

209213
u32 mid = MIDDLE / 2;
210214
T2 base = slowTrig_N(x * y + x * SMALL_HEIGHT * mid, ND / MIDDLE * (mid + 1)) * factor;

0 commit comments

Comments
 (0)