@@ -114,26 +114,18 @@ void middleMul(T2 *u, u32 s, Trig trig) {
 #if MM_CHAIN == 0
   WADDF(1, w);
   T2 base;
-  if (MIDDLE >= 10) {
-    base = csqTrigFancy(w);
-    WADDF(2, base);
-    base = ccubeTrigFancy(base, w);
-    WADDF(3, base);
-    base.x += 1;
-  } else {
-    base = csqTrigFancy(w);
-    WADDF(2, base);
-    base = ccubeTrigFancy(base, w);
-    WADDF(3, base);
-    base.x += 1;
-  }
+  base = csqTrigFancy(w);
+  WADDF(2, base);
+  base = ccubeTrigFancy(base, w);
+  WADDF(3, base);
+  base.x += 1;

   for (u32 k = 4; k < MIDDLE; ++k) {
     base = cmulFancy(base, w);
     WADD(k, base);
   }

-#elif 0 && MM_CHAIN == 1  // This is fewer F64 ops, but slower on Radeon 7 -- probably the optimizer being weird. It also has somewhat worse Z.
+#elif 0 && MM_CHAIN == 1  // This is fewer F64 ops, but may be slower on Radeon 7 -- probably the optimizer being weird. It also has somewhat worse Z.
   for (u32 k = 3 + (MIDDLE - 2) % 3; k < MIDDLE; k += 3) {
     T2 base, base_minus1, base_plus1;
     base = slowTrig_N(WIDTH * k * s, WIDTH * SMALL_HEIGHT * k);
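
Aside on the MM_CHAIN == 0 path above (a reading of the code, not part of the change): the *Fancy helpers and WADDF appear to work on twiddles whose real part is stored as cos(angle) - 1, and the "base.x += 1" line converts back to an ordinary complex value before the plain-WADD loop. Keeping cos - 1 avoids cancellation for angles close to zero, where cos is close to 1. A minimal stand-alone C sketch of that cancellation, assuming nothing from the kernel:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      double t = 1e-7;                                      // a tiny twiddle angle
      double naive  = cos(t) - 1.0;                         // subtracting nearly equal values: few correct digits
      double stable = -2.0 * sin(0.5 * t) * sin(0.5 * t);   // same quantity, no cancellation
      printf("naive : %.17e\n", naive);
      printf("stable: %.17e\n", stable);
      return 0;
    }
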
@@ -204,7 +196,19 @@ void middleMul2(T2 *u, u32 x, u32 y, double factor, Trig trig) {
   } else {  // MIDDLE >= 5
     // T2 w = slowTrig_N(x * SMALL_HEIGHT, ND / MIDDLE);

-#if MM2_CHAIN == 0
+#if AMDGPU && MM2_CHAIN == 0  // Oddly, Radeon 7 is faster with this version that uses more F64 ops
+
+    T2 base = slowTrig_N(x * y + x * SMALL_HEIGHT, ND / MIDDLE * 2) * factor;
+    WADD(0, base);
+    WADD(1, base);
+
+    for (u32 k = 2; k < MIDDLE; ++k) {
+      base = cmulFancy(base, w);
+      WADD(k, base);
+    }
+    WSUBF(0, w);
+
+#elif MM2_CHAIN == 0

     u32 mid = MIDDLE / 2;
     T2 base = slowTrig_N(x * y + x * SMALL_HEIGHT * mid, ND / MIDDLE * (mid + 1)) * factor;
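
Aside on the two MM2_CHAIN == 0 variants above (an interpretation, not part of the change): the AMDGPU branch chains cmulFancy upward from the k = 1 twiddle, while the generic #elif branch anchors its chain at mid = MIDDLE / 2, presumably because rounding error accumulates with the number of chained multiplications, so starting from the middle halves the longest chain. A small stand-alone C sketch of that error growth, with an arbitrary transform size standing in for ND:

    #include <math.h>
    #include <stdio.h>

    typedef struct { double x, y; } T2;

    static T2 cmul(T2 a, T2 b) {   // ordinary complex multiply
      return (T2){ a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x };
    }

    int main(void) {
      const double PI = 3.141592653589793;
      const double N = 4194304.0;            // example size, not the kernel's ND
      T2 w = { cos(2 * PI / N), -sin(2 * PI / N) };
      T2 base = w;
      for (int k = 2; k <= 16; ++k) {
        base = cmul(base, w);                // chained w^k, k-1 multiplications deep
        T2 exact = { cos(2 * PI * k / N), -sin(2 * PI * k / N) };
        printf("k=%2d  err=%.3e\n", k,
               fmax(fabs(base.x - exact.x), fabs(base.y - exact.y)));
      }
      return 0;
    }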