@@ -52,17 +52,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Don't change following FR unless you know the effects. */
#define res1 $vr19
#define res2 $vr20
+ #define RCP $f2
+ #define VALPHA $vr3
+
+ // The snrm2 optimization cannot simply widen the data from float
+ // to double and then sum the squares: LAPACK tests have shown that
+ // this approach can still overflow. Instead, find the maximum
+ // absolute value of the whole array and divide every element by it
+ // before summing the squares. This avoids overflow without widening
+ // the data type.
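As a rough scalar sketch of what the kernel does (find the maximum absolute value via the samax_k call, scale every element by its reciprocal, accumulate the squares, then take the square root and multiply the maximum back in), the logic could be written in C as below. The helper name snrm2_scaled and the plain loops are illustrative stand-ins for the vectorized LSX code, not part of the patch.

#include <math.h>
#include <stddef.h>

/* Illustrative scalar version of the scaled sum-of-squares trick. */
static float snrm2_scaled(size_t n, const float *x, size_t incx)
{
    if (n == 0 || incx == 0) return 0.0f;

    /* Pass 1: maximum absolute value (what samax_k computes). */
    float amax = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float a = fabsf(x[i * incx]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) return 0.0f;   /* all-zero input: norm is 0 */

    /* Pass 2: every scaled value lies in [-1, 1], so the squares
       cannot overflow and the sum is bounded by n. */
    float rcp = 1.0f / amax;         /* the kernel uses frecip.s here */
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float v = x[i * incx] * rcp;
        sum += v * v;
    }

    return sqrtf(sum) * amax;        /* undo the scaling */
}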
PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
- vxor.v res1, res1, res1
- vxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
+
+ addi.d $sp, $sp, -32
+ st.d $ra, $sp, 0
+ st.d N, $sp, 8
+ st.d X, $sp, 16
+ st.d INCX, $sp, 24
+ #ifdef DYNAMIC_ARCH
+ bl samax_k_LA264
+ #else
+ bl samax_k
+ #endif
+ ld.d $ra, $sp, 0
+ ld.d N, $sp, 8
+ ld.d X, $sp, 16
+ ld.d INCX, $sp, 24
+ addi.d $sp, $sp, 32
+
+ frecip.s RCP, $f0
+ vreplvei.w VALPHA, $vr2, 0
+ vxor.v res1, res1, res1
+ vxor.v res2, res2, res2
+ fcmp.ceq.s $fcc0, $f0, $f19
+ bcnez $fcc0, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
@@ -75,14 +106,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vld VX5, X, 4 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
- vfcvtl.d.s VX1, VX0
- vfcvth.d.s VX2, VX0
- vfcvtl.d.s VX3, VX5
- vfcvth.d.s VX4, VX5
- vfmadd.d res1, VX1, VX1, res1
- vfmadd.d res2, VX2, VX2, res2
- vfmadd.d res1, VX3, VX3, res1
- vfmadd.d res2, VX4, VX4, res2
+
+ vfmul.s VX0, VX0, VALPHA
+ vfmul.s VX5, VX5, VALPHA
+
+ vfmadd.s res1, VX0, VX0, res1
+ vfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10
b .L996
.align 3
@@ -104,10 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
- vfcvtl.d.s VX1, VX0
- vfcvth.d.s VX2, VX0
- vfmadd.d res1, VX1, VX1, res1
- vfmadd.d res2, VX2, VX2, res2
+ vfmul.s VX0, VX0, VALPHA
+ vfmadd.s res1, VX0, VX0, res1
+
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
@@ -120,19 +148,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
- vfcvtl.d.s VX3, VX0
- vfcvth.d.s VX4, VX0
- vfmadd.d res1, VX3, VX3, res1
- vfmadd.d res2, VX4, VX4, res2
+ vfmul.s VX0, VX0, VALPHA
+ vfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
- b .L996
.align 3

.L996:
- vfadd.d res1, res1, res2
- vreplvei.d VX1, res1, 1
- vfadd.d res1, VX1, res1
+ vfadd.s res1, res1, res2
+ vreplvei.w VX1, res1, 1
+ vreplvei.w VX2, res1, 2
+ vreplvei.w VX3, res1, 3
+ vfadd.s res1, VX1, res1
+ vfadd.s res1, VX2, res1
+ vfadd.s res1, VX3, res1
.align 3

.L997:
@@ -143,16 +172,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L998:
fld.s $f15, X, 0
addi.d I, I, -1
- fcvt.d.s $f15, $f15
- fmadd.d $f19, $f15, $f15, $f19
+ fmul.s $f15, $f15, RCP
+ fmadd.s $f19, $f15, $f15, $f19
add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
- fsqrt.d $f19, $f19
+ fsqrt.s $f19, $f19
+ fmul.s $f0, $f19, $f0
move $r4, $r17
- fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3