Skip to content

Commit 6dcc401

Browse files
committed
[loong64] Replace optimised string operations
The current implementation of the optimised string operations appears to have been ported from the (old) arm64 implementation, and does not cleanly match the LoongArch64 instruction set.

Replace with code derived from the riscv64 implementation, modified to use indexed load and store instructions.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
1 parent 63eeb23 commit 6dcc401

2 files changed

Lines changed: 148 additions & 162 deletions

File tree

Lines changed: 128 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
/*
2-
* Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>.
3-
* Copyright (c) 2023, Xiaotian Wu <wuxiaotian@loongson.cn>
2+
* Copyright (C) 2026 Michael Brown <mbrown@fensystems.co.uk>.
43
*
54
* This program is free software; you can redistribute it and/or
65
* modify it under the terms of the GNU General Public License as
@@ -29,6 +28,7 @@
2928
*/
3029

3130
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
31+
FILE_SECBOOT ( PERMITTED );
3232

3333
#include <string.h>
3434

@@ -41,68 +41,65 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
4141
* @ret dest Destination address
4242
*/
4343
void loong64_memcpy ( void *dest, const void *src, size_t len ) {
44-
void *discard_dest;
45-
void *discard_end;
46-
const void *discard_src;
47-
size_t discard_offset;
44+
size_t len_pre;
45+
size_t len_mid;
46+
size_t len_post;
47+
size_t offset;
4848
unsigned long discard_data;
49-
unsigned long discard_low;
50-
unsigned long discard_high;
5149

52-
/* If length is too short, then just copy individual bytes.
50+
/* Calculate pre-aligned, aligned, and post-aligned lengths.
51+
* (Align on the destination address, on the assumption that
52+
* misaligned stores are likely to be more expensive than
53+
* misaligned loads.)
5354
*/
54-
if ( len < 16 ) {
55-
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
56-
"\n1:\n\t"
57-
"addi.d %0, %0, -1\n\t"
58-
"ldx.b %1, %3, %0\n\t"
59-
"stx.b %1, %2, %0\n\t"
60-
"bnez %0, 1b\n\t"
61-
"\n2:\n\t"
62-
: "=&r" ( discard_offset ),
63-
"=&r" ( discard_data )
64-
: "r" ( dest ), "r" ( src ), "0" ( len )
65-
: "memory", "t0" );
66-
return;
67-
}
55+
len_pre = ( ( sizeof ( unsigned long ) - ( ( intptr_t ) dest ) ) &
56+
( sizeof ( unsigned long ) - 1 ) );
57+
if ( len_pre > len )
58+
len_pre = len;
59+
len -= len_pre;
60+
len_mid = ( len & ~( sizeof ( unsigned long ) - 1 ) );
61+
len -= len_mid;
62+
len_post = len;
63+
64+
/* Copy pre-aligned section */
65+
__asm__ __volatile__ ( "b 2f\n\t"
66+
"\n1:\n\t"
67+
"ldx.b %1, %3, %0\n\t"
68+
"stx.b %1, %2, %0\n\t"
69+
"addi.d %0, %0, 1\n\t"
70+
"\n2:\n\t"
71+
"bne %0, %4, 1b\n\t"
72+
: "=&r" ( offset ), "=&r" ( discard_data )
73+
: "r" ( dest ), "r" ( src ), "r" ( len_pre ),
74+
"0" ( 0 )
75+
: "memory" );
6876

69-
/* Copy 16 bytes at a time: one initial
70-
* potentially unaligned access, multiple destination-aligned
71-
* accesses, one final potentially unaligned access.
72-
*/
73-
__asm__ __volatile__ ( "ld.d %3, %1, 0\n\t"
74-
"ld.d %4, %1, 8\n\t"
75-
"addi.d %1, %1, 16\n\t"
76-
"st.d %3, %0, 0\n\t"
77-
"st.d %4, %0, 8\n\t"
78-
"addi.d %0, %0, 16\n\t"
79-
"andi %3, %0, 15\n\t"
80-
"sub.d %0, %0, %3\n\t"
81-
"sub.d %1, %1, %3\n\t"
82-
"addi.d $t0, $zero, 0xf\n\t"
83-
"andn %2, %5, $t0\n\t"
84-
"b 2f\n\t"
77+
/* Copy aligned section */
78+
__asm__ __volatile__ ( "b 2f\n\t"
8579
"\n1:\n\t"
86-
"ld.d %3, %1, 0\n\t"
87-
"ld.d %4, %1, 8\n\t"
88-
"addi.d %1, %1, 16\n\t"
89-
"st.d %3, %0, 0\n\t"
90-
"st.d %4, %0, 8\n\t"
91-
"addi.d %0, %0, 16\n\t"
80+
"ldx.d %1, %3, %0\n\t"
81+
"stx.d %1, %2, %0\n\t"
82+
"addi.d %0, %0, %5\n\t"
9283
"\n2:\n\t"
93-
"bne %0, %2, 1b\n\t"
94-
"ld.d %3, %6, -16\n\t"
95-
"ld.d %4, %6, -8\n\t"
96-
"st.d %3, %5, -16\n\t"
97-
"st.d %4, %5, -8\n\t"
98-
: "=&r" ( discard_dest ),
99-
"=&r" ( discard_src ),
100-
"=&r" ( discard_end ),
101-
"=&r" ( discard_low ),
102-
"=&r" ( discard_high )
103-
: "r" ( dest + len ), "r" ( src + len ),
104-
"0" ( dest ), "1" ( src )
105-
: "memory", "t0" );
84+
"bne %0, %4, 1b\n\t"
85+
: "+r" ( offset ), "=&r" ( discard_data )
86+
: "r" ( dest ), "r" ( src ),
87+
"r" ( offset + len_mid ),
88+
"i" ( sizeof ( unsigned long ) )
89+
: "memory" );
90+
91+
/* Copy post-aligned section */
92+
__asm__ __volatile__ ( "b 2f\n\t"
93+
"\n1:\n\t"
94+
"ldx.b %1, %3, %0\n\t"
95+
"stx.b %1, %2, %0\n\t"
96+
"addi.d %0, %0, 1\n\t"
97+
"\n2:\n\t"
98+
"bne %0, %4, 1b\n\t"
99+
: "+r" ( offset ), "=&r" ( discard_data )
100+
: "r" ( dest ), "r" ( src ),
101+
"r" ( offset + len_post )
102+
: "memory" );
106103
}
107104

108105
/**
@@ -112,50 +109,54 @@ void loong64_memcpy ( void *dest, const void *src, size_t len ) {
112109
* @v len Length
113110
*/
114111
void loong64_bzero ( void *dest, size_t len ) {
115-
size_t discard_offset;
116-
void *discard_dest;
117-
void *discard_end;
118-
119-
/* If length is too short, then just zero individual bytes.
120-
*/
121-
if ( len < 16 ) {
122-
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
123-
"\n1:\n\t"
124-
"addi.d %0, %0, -1\n\t"
125-
"stx.b $zero, %1, %0\n\t"
126-
"bnez %0, 1b\n\t"
127-
"\n2:\n\t"
128-
: "=&r" ( discard_offset )
129-
: "r" ( dest ), "0" ( len )
130-
: "memory" );
131-
return;
132-
}
112+
size_t len_pre;
113+
size_t len_mid;
114+
size_t len_post;
115+
size_t offset;
116+
117+
/* Calculate pre-aligned, aligned, and post-aligned lengths */
118+
len_pre = ( ( sizeof ( unsigned long ) - ( ( intptr_t ) dest ) ) &
119+
( sizeof ( unsigned long ) - 1 ) );
120+
if ( len_pre > len )
121+
len_pre = len;
122+
len -= len_pre;
123+
len_mid = ( len & ~( sizeof ( unsigned long ) - 1 ) );
124+
len -= len_mid;
125+
len_post = len;
126+
127+
/* Zero pre-aligned section */
128+
__asm__ __volatile__ ( "b 2f\n\t"
129+
"\n1:\n\t"
130+
"stx.b $zero, %1, %0\n\t"
131+
"addi.d %0, %0, 1\n\t"
132+
"\n2:\n\t"
133+
"bne %0, %2, 1b\n\t"
134+
: "=&r" ( offset )
135+
: "r" ( dest ), "r" ( len_pre ), "0" ( 0 )
136+
: "memory" );
133137

134-
/* To zero 16 bytes at a time: one initial
135-
* potentially unaligned access, multiple aligned accesses,
136-
* one final potentially unaligned access.
137-
*/
138+
/* Zero aligned section */
139+
__asm__ __volatile__ ( "b 2f\n\t"
140+
"\n1:\n\t"
141+
"stx.d $zero, %1, %0\n\t"
142+
"addi.d %0, %0, %3\n\t"
143+
"\n2:\n\t"
144+
"bne %0, %2, 1b\n\t"
145+
: "+r" ( offset )
146+
: "r" ( dest ), "r" ( offset + len_mid ),
147+
"i" ( sizeof ( unsigned long ) )
148+
: "memory" );
138149

139-
__asm__ __volatile__ ( "st.d $zero, %0, 0\n\t"
140-
"st.d $zero, %0, 8\n\t"
141-
"addi.d %0, %0, 16\n\t"
142-
"addi.w $t0, $zero, 15\n\t"
143-
"andn %0, %0, $t0\n\t"
144-
"addi.w $t0, $zero, 15\n\t"
145-
"andn %1, %2, $t0\n\t"
146-
"b 2f\n\t"
150+
/* Zero post-aligned section */
151+
__asm__ __volatile__ ( "b 2f\n\t"
147152
"\n1:\n\t"
148-
"st.d $zero, %0, 0\n\t"
149-
"st.d $zero, %0, 8\n\t"
150-
"addi.d %0, %0, 16\n\t"
153+
"stx.b $zero, %1, %0\n\t"
154+
"addi.d %0, %0, 1\n\t"
151155
"\n2:\n\t"
152-
"bne %0, %1, 1b\n\t"
153-
"st.d $zero, %2, -16\n\t"
154-
"st.d $zero, %2, -8\n\t"
155-
: "=&r" ( discard_dest ),
156-
"=&r" ( discard_end )
157-
: "r" ( dest + len ), "0" ( dest )
158-
: "memory", "t0" );
156+
"bne %0, %2, 1b\n\t"
157+
: "+r" ( offset )
158+
: "r" ( dest ), "r" ( offset + len_post )
159+
: "memory" );
159160
}
160161

161162
/**
@@ -166,10 +167,14 @@ void loong64_bzero ( void *dest, size_t len ) {
166167
* @v character Fill character
167168
*
168169
* The unusual parameter order is to allow for more efficient
169-
* tail-calling to loong64_memset() when zeroing a region.
170+
* tail-calling to loong64_bzero() when zeroing a region.
170171
*/
171172
void loong64_memset ( void *dest, size_t len, int character ) {
172-
size_t discard_offset;
173+
size_t offset;
174+
175+
/* Do nothing if length is zero */
176+
if ( ! len )
177+
return;
173178

174179
/* Use optimised zeroing code if applicable */
175180
if ( character == 0 ) {
@@ -181,86 +186,49 @@ void loong64_memset ( void *dest, size_t len, int character ) {
181186
* value is relatively rare and unlikely to be
182187
* performance-critical.
183188
*/
184-
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
185-
"\n1:\n\t"
186-
"addi.d %0, %0, -1\n\t"
189+
__asm__ __volatile__ ( "\n1:\n\t"
187190
"stx.b %2, %1, %0\n\t"
188-
"bnez %0, 1b\n\t"
189-
"\n2:\n\t"
190-
: "=&r" ( discard_offset )
191-
: "r" ( dest ), "r" ( character ), "0" ( len )
192-
: "memory" );
193-
}
194-
195-
/**
196-
* Copy (possibly overlapping) memory region forwards
197-
*
198-
* @v dest Destination region
199-
* @v src Source region
200-
* @v len Length
201-
*/
202-
void loong64_memmove_forwards ( void *dest, const void *src, size_t len ) {
203-
void *discard_dest;
204-
const void *discard_src;
205-
unsigned long discard_data;
206-
207-
/* Assume memmove() is not performance-critical, and perform a
208-
* bytewise copy for simplicity.
209-
*/
210-
__asm__ __volatile__ ( "b 2f\n\t"
211-
"\n1:\n\t"
212-
"ld.b %2, %1, 0\n\t"
213-
"addi.d %1, %1, 1\n\t"
214-
"st.b %2, %0, 0\n\t"
215191
"addi.d %0, %0, 1\n\t"
216192
"\n2:\n\t"
217193
"bne %0, %3, 1b\n\t"
218-
: "=&r" ( discard_dest ),
219-
"=&r" ( discard_src ),
220-
"=&r" ( discard_data )
221-
: "r" ( dest + len ), "0" ( dest ), "1" ( src )
194+
: "=&r" ( offset )
195+
: "r" ( dest ), "r" ( character ), "r" ( len ),
196+
"0" ( 0 )
222197
: "memory" );
223198
}
224199

225200
/**
226-
* Copy (possibly overlapping) memory region backwards
201+
* Copy (possibly overlapping) memory region
227202
*
228203
* @v dest Destination region
229204
* @v src Source region
230205
* @v len Length
231206
*/
232-
void loong64_memmove_backwards ( void *dest, const void *src, size_t len ) {
233-
size_t discard_offset;
207+
void loong64_memmove ( void *dest, const void *src, size_t len ) {
208+
size_t offset;
234209
unsigned long discard_data;
235210

211+
/* Do nothing if length is zero */
212+
if ( ! len )
213+
return;
214+
215+
/* Use memcpy() if copy direction is forwards */
216+
if ( dest <= src ) {
217+
memcpy ( dest, src, len );
218+
return;
219+
}
220+
236221
/* Assume memmove() is not performance-critical, and perform a
237-
* bytewise copy for simplicity.
222+
* bytewise copy backwards for simplicity.
238223
*/
239-
__asm__ __volatile__ ( "beqz %0, 2f\n\t"
240-
"\n1:\n\t"
224+
__asm__ __volatile__ ( "\n1:\n\t"
241225
"addi.d %0, %0, -1\n\t"
242226
"ldx.b %1, %3, %0\n\t"
243227
"stx.b %1, %2, %0\n\t"
244-
"bnez %0, 1b\n\t"
245228
"\n2:\n\t"
246-
: "=&r" ( discard_offset ),
247-
"=&r" ( discard_data )
248-
: "r" ( dest ), "r" ( src ), "0" ( len )
229+
"bnez %0, 1b\n\t"
230+
: "=&r" ( offset ), "=&r" ( discard_data )
231+
: "r" ( dest ), "r" ( src ),
232+
"0" ( len )
249233
: "memory" );
250234
}
251-
252-
/**
253-
* Copy (possibly overlapping) memory region
254-
*
255-
* @v dest Destination region
256-
* @v src Source region
257-
* @v len Length
258-
*/
259-
void loong64_memmove ( void *dest, const void *src, size_t len ) {
260-
261-
if ( dest <= src ) {
262-
loong64_memmove_forwards ( dest, src, len );
263-
} else {
264-
loong64_memmove_backwards ( dest, src, len );
265-
}
266-
}

0 commit comments

Comments
 (0)