Commit 68f9c81

Committed Feb 12, 2025
LoongArch64: Fixed snrm2_lsx.S and cnrm2_lsx.S
When the data type is single-precision real or single-precision complex, converting the inputs to double precision does not prevent overflow (as exposed by the LAPACK tests). The only reliable solution is to follow the C implementation's approach: find the maximum absolute value in the array and divide each element by that maximum before accumulating the sum of squares.
1 parent 9e75d6b commit 68f9c81
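
In outline, the fixed kernels make two passes over the vector: a call into the existing amax kernel (camax_k / samax_k in the diffs below) to find the largest magnitude, then a scaled accumulation of squares. A minimal C sketch of that strategy (a hypothetical standalone function for illustration, assuming incx > 0; it is not the actual OpenBLAS kernel):

    #include <math.h>

    /* Sketch of the scaling strategy described above; samax_k plays
     * the role of the first loop in the real kernel. */
    static float snrm2_scaled(long n, const float *x, long incx)
    {
        /* Pass 1: find the maximum absolute value. */
        float amax = 0.0f;
        for (long i = 0; i < n; i++) {
            float a = fabsf(x[i * incx]);
            if (a > amax)
                amax = a;
        }
        if (amax == 0.0f)        /* all zeros: the kernel's fcmp.ceq.s early exit */
            return 0.0f;

        /* Pass 2: accumulate (x[i] * (1/amax))^2; every scaled term is
         * at most 1.0f, so the partial sums cannot overflow. */
        float rcp = 1.0f / amax; /* frecip.s in the kernel */
        float sum = 0.0f;
        for (long i = 0; i < n; i++) {
            float s = x[i * incx] * rcp;
            sum += s * s;
        }

        /* Undo the scaling: ||x|| = amax * sqrt(sum), matching the
         * final fsqrt.s + fmul.s sequence in the kernels. */
        return amax * sqrtf(sum);
    }

For example, for x = (3, 4): amax = 4, sum = 0.75^2 + 1.0^2 = 1.5625, and 4 * sqrt(1.5625) = 4 * 1.25 = 5. The complex kernel follows the same pattern, scaling real and imaginary parts alike before accumulating.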

File tree

2 files changed (+98, -54 lines):

kernel/loongarch64/cnrm2_lsx.S
kernel/loongarch64/snrm2_lsx.S

kernel/loongarch64/cnrm2_lsx.S (47 additions, 28 deletions)
@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VX4 $vr21
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
 
     PROLOGUE
 
@@ -55,10 +57,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LDINT INCX, 0(INCX)
 #endif
 
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+    bl camax_k
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+
+    frecip.s RCP, $f0
+    vreplvei.w VALPHA, $vr2, 0
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, 1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
@@ -69,16 +87,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .L10:
     vld VX0, X, 0 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vld VX0, X, 4 * SIZE
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
     addi.d I, I, -1
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX1, VX1, VALPHA
+
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX1, VX1, res2
+
     addi.d X, X, 8 * SIZE
     blt $r0, I, .L10
     b .L996
@@ -99,10 +116,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
     add.d X, X, INCX
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+
     ld.w t1, X, 0 * SIZE
     ld.w t2, X, 1 * SIZE
     add.d X, X, INCX
@@ -113,19 +129,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
     add.d X, X, INCX
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
+
     addi.d I, I, -1
     blt $r0, I, .L21
     b .L996
     .align 3
 
 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3
 
 .L997:
@@ -137,18 +156,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fld.s a1, X, 0 * SIZE
     fld.s a2, X, 1 * SIZE
     addi.d I, I, -1
-    fcvt.d.s a1, a1
-    fcvt.d.s a2, a2
-    fmadd.d res, a1, a1, res
-    fmadd.d res, a2, a2, res
+    fmul.s a1, a1, RCP
+    fmul.s a2, a2, RCP
+    fmadd.s res, a1, a1, res
+    fmadd.s res, a2, a2, res
    add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 
 .L999:
-    fsqrt.d res, res
+    fsqrt.s res, res
+    fmul.s $f0, res, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3
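Because the accumulators now stay in single precision, the horizontal reduction at .L996 above must sum four 32-bit lanes instead of two 64-bit ones, hence the extra vreplvei.w / vfadd.s pairs. A scalar model of that block (illustration only, not kernel code):

    /* Fold res2 into res1, then collapse the four float lanes; each
     * vreplvei.w broadcasts one lane so the next vfadd.s can add it
     * into lane 0, which holds the final sum. */
    static float reduce_l996(const float res1[4], const float res2[4])
    {
        float acc[4];
        for (int i = 0; i < 4; i++)
            acc[i] = res1[i] + res2[i];       /* vfadd.s res1, res1, res2 */
        return acc[0] + acc[1] + acc[2] + acc[3];
    }
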
kernel/loongarch64/snrm2_lsx.S (51 additions, 26 deletions)
@@ -52,17 +52,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* Don't change following FR unless you know the effects. */
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
+
+// The optimization for snrm2 cannot simply involve
+// extending the data type from float to double and
+// then summing the squares of the data. LAPACK tests
+// have shown that this approach can still lead to data overflow.
+// Instead, we need to find the maximum absolute value in the entire
+// array and divide each data element by this maximum value before
+// performing the calculation. This approach can avoid overflow (and does not require extending the data type).
 
     PROLOGUE
 
 #ifdef F_INTERFACE
     LDINT N, 0(N)
     LDINT INCX, 0(INCX)
 #endif
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+    bl samax_k
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+
+    frecip.s RCP, $f0
+    vreplvei.w VALPHA, $vr2, 0
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, SIZE
     slli.d INCX, INCX, BASE_SHIFT
     srai.d I, N, 3
@@ -75,14 +102,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vld VX5, X, 4 * SIZE
     addi.d I, I, -1
     addi.d X, X, 8 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfcvtl.d.s VX3, VX5
-    vfcvth.d.s VX4, VX5
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX5, VX5, VALPHA
+
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX5, VX5, res2
     blt $r0, I, .L10
     b .L996
     .align 3
@@ -104,10 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+
     ld.w t1, X, 0
     add.d X, X, INCX
     ld.w t2, X, 0
@@ -120,19 +144,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
     addi.d I, I, -1
     blt $r0, I, .L21
-    b .L996
     .align 3
 
 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3
 
 .L997:
@@ -143,16 +168,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L998:
     fld.s $f15, X, 0
     addi.d I, I, -1
-    fcvt.d.s $f15, $f15
-    fmadd.d $f19, $f15, $f15, $f19
+    fmul.s $f15, $f15, RCP
+    fmadd.s $f19, $f15, $f15, $f19
     add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 
 .L999:
-    fsqrt.d $f19, $f19
+    fsqrt.s $f19, $f19
+    fmul.s $f0, $f19, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3