diff --git a/qualcomm-software/patches/picolibc/0005-RISCV-Add-optimized-memset-memcpy-strcpy-strcmp-for-.patch b/qualcomm-software/patches/picolibc/0005-RISCV-Add-optimized-memset-memcpy-strcpy-strcmp-for-.patch new file mode 100644 index 000000000000..0c4159f5cade --- /dev/null +++ b/qualcomm-software/patches/picolibc/0005-RISCV-Add-optimized-memset-memcpy-strcpy-strcmp-for-.patch @@ -0,0 +1,818 @@ +From e74d2d3be07c72f13a54684b223bb7447b1ef5f6 Mon Sep 17 00:00:00 2001 +From: Venkata Ramanaiah Nalamothu +Date: Mon, 16 Mar 2026 03:31:04 -0700 +Subject: [PATCH] [RISCV] Add optimized memset/memcpy/strcpy/strcmp for Xqci + +The optimized implementations for Xqci will override the other +existing corresponding variants when Xqci extensions are enabled. + +The overriding happens using the interface implemented in the +upstream Picolibc pull requests 1090, 1092 and 1098. +--- + newlib/libc/machine/riscv/CMakeLists.txt | 4 + + newlib/libc/machine/riscv/memcpy-xqci.S | 283 +++++++++++++++++++++++ + newlib/libc/machine/riscv/memset-xqci.S | 161 +++++++++++++ + newlib/libc/machine/riscv/memset.S | 6 +- + newlib/libc/machine/riscv/meson.build | 4 + + newlib/libc/machine/riscv/rv_string.h | 26 ++- + newlib/libc/machine/riscv/strcmp-xqci.S | 83 +++++++ + newlib/libc/machine/riscv/strcmp.S | 6 +- + newlib/libc/machine/riscv/strcpy-xqci.S | 87 +++++++ + newlib/libc/machine/riscv/strcpy.c | 5 + + 10 files changed, 660 insertions(+), 5 deletions(-) + create mode 100644 newlib/libc/machine/riscv/memcpy-xqci.S + create mode 100644 newlib/libc/machine/riscv/memset-xqci.S + create mode 100644 newlib/libc/machine/riscv/strcmp-xqci.S + create mode 100644 newlib/libc/machine/riscv/strcpy-xqci.S + +diff --git a/newlib/libc/machine/riscv/CMakeLists.txt b/newlib/libc/machine/riscv/CMakeLists.txt +index 1ccfaad01..82a8b9ec5 100644 +--- a/newlib/libc/machine/riscv/CMakeLists.txt ++++ b/newlib/libc/machine/riscv/CMakeLists.txt +@@ -38,13 +38,17 @@ add_subdirectory(machine) + 
picolibc_sources_flags("-fno-builtin" + ieeefp.c + memcpy-asm.S ++ memcpy-xqci.S + memcpy.c + memmove.S + memmove.c + memset.S ++ memset-xqci.S + setjmp.S + stpcpy.c ++ strcmp-xqci.S + strcmp.S ++ strcpy-xqci.S + strcpy.c + strlen.c + ) +diff --git a/newlib/libc/machine/riscv/memcpy-xqci.S b/newlib/libc/machine/riscv/memcpy-xqci.S +new file mode 100644 +index 000000000..c3bb96743 +--- /dev/null ++++ b/newlib/libc/machine/riscv/memcpy-xqci.S +@@ -0,0 +1,283 @@ ++/***************************************************************** ++Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. ++SPDX-License-Identifier: BSD-3-Clause-Clear ++*****************************************************************/ ++ ++#include "rv_string.h" ++ ++#ifdef _MACHINE_RISCV_MEMCPY_ASM_XQCI_ ++ ++.text ++ ++/*=========================================================================== ++ ++ void *memcpy(void *dest, const void *src, size_t n) ++ ++ void __xqci_memcpy(void *dest, const void *src, size_t n) ++ ++ void __xqci_memcpy_aligned(void *dest, void *src, size_t n) ++ ++ void __xqci_memcpy_words(void *dest, void *src, size_t nwords) ++ ++===========================================================================*/ ++/*! ++ @brief ++ memcpy() is standard libc memcpy(). ++ It returns the "dest" argument. ++ ++ __xqci_memcpy is called by the compiler memcpy() builtin if it can't ++ inline. It is identical to standard memcpy(), except it does not ++ return a value. ++ ++ __xqci_memcpy_aligned assumes that dest/src are 32-bit word aligned, ++ and n is a multiple of words (n==0 is allowed). ++ Alignment is not checked, behavior is undefined if not satisfied. ++ ++ __xqci_memcpy_words is like __xqci_memcpy_aligned, except that length ++ is in units of 32-bit words instead of bytes. ++ ++ For word-aligned src/dest/size, these functions are guaranteed to ++ do only word accesses, in sequential order, so they are safe to use ++ for HW peripherals. 
++ For byte alignment on src/dest/size, access order may be non-sequential, ++ and some locations may be read/written multiple times. ++ In all cases, only bytes strictly within the src/dest regions will be ++ accessed, with no over-read. ++*/ ++/*=========================================================================*/ ++ ++// Inputs: ++#define dest a0 ++#define src a1 ++#define n a2 ++#define nwords n ++ ++#define BUFSZ 4 ++ ++// Locals: ++#define dst a3 ++#define buf00 a4 ++#define buf01 a5 ++#define buf02 a6 ++#define buf03 a7 ++#define buf10 t3 ++#define buf11 t4 ++#define buf12 t5 ++#define buf13 t6 ++#define buf_size t0 ++#define len0 t1 ++#define len1 t2 ++#define tmp buf00 ++#define src_end len0 ++#define dst_end len1 ++ ++#ifndef MEMSET_TEST ++.global memcpy ++.type memcpy, @function ++memcpy: ++#endif ++ ++.global __xqci_memcpy ++.type __xqci_memcpy, @function ++__xqci_memcpy: ++ // Check for src/dest/size alignment ++ or tmp, dest, src ++ or tmp, tmp, n ++ andi tmp, tmp, 0x3 # LSB of dest, src, and n are 0 ++ bnez tmp,.Lunaligned # Unaligned, take slow path ++ ++.size memcpy, . 
- memcpy ++ ++.global __xqci_memcpy_aligned ++.type __xqci_memcpy_aligned, @function ++__xqci_memcpy_aligned: ++ ++ srli nwords, n, 2 # Number of words ++ ++.global __xqci_memcpy_words ++.type __xqci_memcpy_words, @function ++__xqci_memcpy_words: ++ mv dst, dest ++ ++.Lmemcpy_words_cont: ++ qc.bgeui nwords, (BUFSZ*2+1), .Lmemcpy_words_long ++ ++.Lmemcpy_words_short: ++ li buf_size, BUFSZ ++ minu len0, nwords, buf_size # Limit to nwords or bufsize (can be 0) ++ qc.lwm buf00, len0, 0(src) # Load first buffer (can be 0 size) ++ sub nwords, nwords, len0 # Adjust remaining word count ++ minu len1, nwords, buf_size # Second buffer size (can be 0) ++ qc.lwm buf10, len1, (BUFSZ*4)(src) # Load second buffer (can be 0 size) ++ qc.swm buf00, len0, 0(dst) # Store first buffer (can be 0 size) ++ qc.swm buf10, len1, (BUFSZ*4)(dst) # Store second buffer (can be 0 size) ++ ret ++ ++.Lmemcpy_words_long: ++ addi dst, dst, -(BUFSZ*4*2) # Pre-decrement destination pointer ++ ++.Lmemcpy_words_loop: ++ qc.lwmi buf00, BUFSZ, 0(src) # Load first buffer (BUFSZ words) ++ qc.lwmi buf10, BUFSZ, (BUFSZ*4)(src) # Load second buffer (BUFSZ words) ++ addi src, src, (BUFSZ*4*2) # Increment source pointer ++ addi dst, dst, (BUFSZ*4*2) # Increment destination pointer (pre-decremented above) ++ qc.swmi buf00, BUFSZ, 0(dst) # Store first buffer (BUFSZ words) ++ qc.swmi buf10, BUFSZ, (BUFSZ*4)(dst) # Store second buffer (BUFSZ words) ++ addi nwords, nwords, -(BUFSZ*2) # Adjust remaining word count ++ qc.bgeui nwords, (BUFSZ*2), .Lmemcpy_words_loop ++ ++ addi dst, dst, (BUFSZ*4*2) # Undo the pre-decrement before the tail copy ++ bnez nwords, .Lmemcpy_words_short ++ ret ++ ++// src and/or dest and/or size are not word aligned. ++.Lunaligned: ++ mv dst, dest ++ qc.bltui n, 16, .Lbytecopy # Buffer is <= 15 bytes, copy directly ++ ++ // Copy 3 bytes from the beginning and end of the buffer to handle ++ // realignment and over-read prevention. These will never overlap. ++ // Some of these may be re-copied after realignment. 
++ ++ lbu buf00, 0(src) # Start of buffer ++ lbu buf01, 1(src) ++ lbu buf02, 2(src) ++ add src_end, src, n # doing this add here to avoid bubble ++ sb buf00, 0(dst) ++ sb buf01, 1(dst) ++ sb buf02, 2(dst) ++ ++ lbu buf00, -3(src_end) ++ lbu buf01, -2(src_end) ++ lbu buf02, -1(src_end) ++ add dst_end, dst, n # doing this add here to avoid bubble ++ sb buf00, -3(dst_end) ++ sb buf01, -2(dst_end) ++ sb buf02, -1(dst_end) ++ ++ // Round dst up to a word boundary (+3, then mask); the 3 head bytes stored above cover the gap. ++ // Round dst_end down to a word boundary; the 3 tail bytes stored above cover the rest. ++ addi dst, dst, 3 ++ andi dst, dst, -4 # dest+3, rounded down to a word boundary ++ sub tmp, dst, dest # Offset from original dest ++ add src, src, tmp # Offset src by same amount (may not align) ++ andi dst_end, dst_end, -4 # Round end of dest down to a word boundary ++ sub n, dst_end, dst # Updated n. Word multiple, >= 4 ++ ++ // dest and n are now word aligned. ++ // If src is also aligned, do remainder, as word aligned copy. ++ srli nwords, n, 2 # Number of words ++ andi tmp, src, 0x3 ++ beqz tmp, .Lmemcpy_words_cont # we are ready to branch to aligned word loop ++ ++#define buf04 s2 ++#define buf05 s3 ++#define buf06 s4 ++#define buf07 s5 ++#define desc s0 ++#define len s1 ++#define limit buf_size ++#define LIMIT_WORDS (BUFSZ*2-1) ++#define LIMIT_BYTES (LIMIT_WORDS*4) ++ ++ // src is not aligned to dest. Realign src data. ++ qc.cm.push {ra,s0-s5}, -32 ++ li desc, 0x200000 # Width=32 in upper halfword ++ qc.insb desc, src, 2, 3 # Byte offset*8 in lower halfword ++ andi src, src, -4 # Word align src ++ lw buf07, 0(src) # Load first word ++ addi dst, dst, -LIMIT_BYTES # Pre-subtract dest ++ ++ // We don't expect to be doing unaligned access to cache. ++ // Pipeline less aggressively to save code, do bookkeeping ++ // between load and access which should hide memory latency. ++ // This loop copies <= 7 words per iteration. 
++ ++ li limit, LIMIT_WORDS ++.Lunaligned_word_loop: ++ mv buf00, buf07 # Copy last word from previous buf ++ minu len, nwords, limit # Maximum 7 words per iteration ++ qc.lwm buf01, len, 4(src) # Load next buffer, offset by 1 word ++ addi src, src, LIMIT_BYTES # Inc src for next iteration ++ addi dst, dst, LIMIT_BYTES # Pre-inc dest ++ qc.extdur buf00, buf00, desc # Realign each word ++ qc.extdur buf01, buf01, desc ++ qc.extdur buf02, buf02, desc ++ qc.extdur buf03, buf03, desc ++ qc.extdur buf04, buf04, desc ++ qc.extdur buf05, buf05, desc ++ qc.extdur buf06, buf06, desc ++ qc.swm buf00, len, 0(dst) # Store realigned data ++ sub nwords, nwords, len ++ bnez nwords, .Lunaligned_word_loop ++ qc.cm.popret {ra,s0-s5}, 32 ++ ++#undef buf04 ++#undef buf05 ++#undef buf06 ++#undef buf07 ++#undef limit ++#undef desc ++#undef LIMIT_WORDS ++#undef LIMIT_BYTES ++ ++// Copy buffer directly, size is <= 15 bytes ++// Try to hide load latency, within reason. ++.Lbytecopy: ++ qc.bltui n, 4, .Lbytecopy_2 ++ ++.Lbytecopy_4: ++ // Copy 4 bytes per iteration ++ lbu buf00, 0(src) ++ lbu buf01, 1(src) ++ lbu buf02, 2(src) ++ lbu buf03, 3(src) ++ addi src, src, 4 ++ sb buf00, 0(dst) ++ sb buf01, 1(dst) ++ sb buf02, 2(dst) ++ sb buf03, 3(dst) ++ addi dst, dst, 4 ++ addi n, n, -4 ++ qc.bgeui n, 4, .Lbytecopy_4 # Repeat until < 4 bytes remaining ++ ++.Lbytecopy_2: ++ qc.bltui n, 2, .Lbytecopy_1_check ++ // Copy 2 bytes. ++ lbu buf00, 0(src) ++ lbu buf01, 1(src) ++ addi src, src, 2 ++ addi n, n, -2 ++ sb buf00, 0(dst) ++ sb buf01, 1(dst) ++ addi dst, dst, 2 ++ ++.Lbytecopy_1_check: ++ bnez n, .Lbytecopy_1 ++ ret ++ ++.Lbytecopy_1: ++ lbu buf00, 0(src) ++ sb buf00, 0(dst) ++ ret ++ ++#undef BUFSZ ++#undef src_end ++#undef dst_end ++#undef tmp ++#undef dst ++#undef len0 ++#undef len1 ++#undef buf_size ++#undef buf00 ++#undef buf01 ++#undef buf02 ++#undef buf03 ++#undef buf10 ++#undef buf11 ++#undef buf12 ++#undef buf13 ++ ++.size __xqci_memcpy, . 
- __xqci_memcpy ++ ++#endif /*_MACHINE_RISCV_MEMCPY_ASM_XQCI_ */ +diff --git a/newlib/libc/machine/riscv/memset-xqci.S b/newlib/libc/machine/riscv/memset-xqci.S +new file mode 100644 +index 000000000..361fdb7b3 +--- /dev/null ++++ b/newlib/libc/machine/riscv/memset-xqci.S +@@ -0,0 +1,161 @@ ++/***************************************************************** ++Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. ++SPDX-License-Identifier: BSD-3-Clause-Clear ++*****************************************************************/ ++ ++#include "rv_string.h" ++ ++#ifdef _MACHINE_RISCV_MEMSET_ASM_XQCI_ ++ ++.text ++ ++/*=========================================================================== ++ ++ void *memset(void *s, int c, size_t n) ++ ++ void __xqci_memset(void *s, int c, size_t n) ++ ++ void __xqci_memset_aligned(unsigned *s, int c, size_t n) ++ ++ void __xqci_memset_words(unsigned *s, unsigned w, size_t nwords) ++ ++===========================================================================*/ ++/*! ++ @brief ++ memset() is the standard libc implementation. ++ It returns the "s" argument. ++ ++ __xqci_memset() is called by the compiler memset() builtin if it can't ++ inline. It is identical to standard memset(), except it does not ++ return a value. ++ ++ __xqci_memset_aligned() works similarly to __xqci_memset(), except it assumes ++ s is 32-bit aligned, and n is a multiple of 32 bits (n==0 is allowed). ++ Alignment is not checked, behavior is undefined if not satisfied. ++ ++ __xqci_memset_words() also requires s to be 32-bit aligned. ++ w has the value replicated in each byte, and nwords is the length ++ in units of 32-bit words. ++ ++ For word aligned address/length, these functions are guaranteed to do ++ only word access, in sequential order, so they are safe to use for ++ HW peripherals. ++ If address or length have byte alignment, then sequential access ++ is not guaranteed, and some locations may be written multiple times. 
++*/ ++/*=========================================================================*/ ++ ++#ifndef MEMSET_TEST ++.global memset ++.type memset, @function ++memset: ++#endif ++ ++.global __xqci_memset ++.type __xqci_memset, @function ++__xqci_memset: ++// Inputs: ++#define s a0 ++#define c a1 ++#define w c ++#define n a2 ++#define nwords n ++// Locals: ++#define len a3 ++#define end a5 ++ ++#define BLOCK_WORDS (28) ++#define BLOCK_BYTES (BLOCK_WORDS*4) ++ ++ // Common case: if s and n are word aligned, use aligned function. ++#define a a4 ++ or a, s, n ++ andi a, a, 0x3 ++ bnez a, .Lunaligned # Skip if unaligned ++#undef a ++ // Fallthrough to aligned memset ++ ++#define p a4 ++// External entry for __xqci_memset_aligned ++.global __xqci_memset_aligned ++.type __xqci_memset_aligned, @function ++__xqci_memset_aligned: ++ mv p, s # keep s value since it is return value ++.L__xqci_memset_aligned: ++ ++ srli nwords, n, 2 # Convert to nwords ++ qc.insb w, c, 8, 8 # Splat c over all 4 bytes ++ qc.insb w, w, 16, 16 ++ qc.e.bgeui nwords, 32, .Llong # Long memset if >= 32 words ++ qc.setwm w, nwords, 0(p) # Short memset (1..31 words) is fast path ++ ret ++ ++.Llong: ++ // Long memset, 32 or more words. ++ // Align the address to a 128-bit PDMEM boundary so block stores ++ // are more efficient. ++ qc.extu len, p, 2, 2 # Word address mod 4 ++ not len, len ++ addi len, len, 5 # Residual to next 4-word multiple ++ qc.setwm w, len, 0(p) # Store 1..4 words for alignment ++ sh2add p, len, p # p += alignment words*4 ++ sub nwords, nwords, len # nwords -= alignment words ++ qc.e.bltui nwords, 32, .Ltail ++ ++ // Store in blocks of 28 words, except for last block. 
++.Lblocks: ++ qc.setwmi w, BLOCK_WORDS, 0(p) # Store one block of BLOCK_WORDS words ++ addi p, p, BLOCK_BYTES # Increment ptr; may overflow, but is not used in that case ++ addi nwords, nwords, -BLOCK_WORDS # nwords -= length ++ qc.bgeui nwords, BLOCK_WORDS, .Lblocks ++ ++.Ltail: ++ qc.setwm w, nwords, 0(p) # Store remainder of words, if any ++ ret ++ ++ // Unaligned start address and/or length. ++.Lunaligned: ++ // Do byte stores at start/end of buffer. ++ // Some locations may be written multiple times for small buffers. ++ mv p, s # keep s value since it is return value ++ add end, p, n # End of buffer ++ beqz n, .Lroundup ++ sb c, 0(p) ++ sb c, -1(end) ++ addi n, n, -1 ++ beqz n, .Lroundup ++ sb c, 1(p) ++ sb c, -2(end) ++ addi n, n, -1 ++ beqz n, .Lroundup ++ sb c, 2(p) ++ sb c, -3(end) ++ ++ // Any residual bytes at start/end of buffer have been set. ++ // Round up starting address and round down size to word boundaries and ++ // continue with the aligned memset. ++ // Some bytes may be written again by word writes. ++.Lroundup: ++ addi p, p, 3 ++ andi p, p, -4 # Round up p ++ sub n, end, p # n = end - p (may be negative) ++ # __xqci_memset_aligned will round down n ++ qc.bgei n, 4, .L__xqci_memset_aligned # Set middle of buffer as words ++ ++ // Short buffer, return immediately ++ ret ++ ++// External entry for __xqci_memset_words ++.global __xqci_memset_words ++.type __xqci_memset_words, @function ++__xqci_memset_words: ++ mv p, s # keep s value since it is return value ++ qc.e.bgeui nwords, 32, .Llong # Long memset if >= 32 words ++ qc.setwm w, nwords, 0(s) # Short memset (1..31 words) is fast path ++ ret ++ ++#undef p ++ ++.size __xqci_memset, . - __xqci_memset ++ ++#endif /* _MACHINE_RISCV_MEMSET_ASM_XQCI_ */ +diff --git a/newlib/libc/machine/riscv/memset.S b/newlib/libc/machine/riscv/memset.S +index 3f09fc067..5355d6898 100644 +--- a/newlib/libc/machine/riscv/memset.S ++++ b/newlib/libc/machine/riscv/memset.S +@@ -9,7 +9,9 @@ + http://www.opensource.org/licenses. 
+ */ + +-#include ++#include "rv_string.h" ++ ++#ifdef _MACHINE_RISCV_MEMSET_ASM_ + + .section .text.memset + .global memset +@@ -113,3 +115,5 @@ memset: + j .Laligned + #endif + .size memset, .-memset ++ ++#endif /* _MACHINE_RISCV_MEMSET_ASM_ */ +diff --git a/newlib/libc/machine/riscv/meson.build b/newlib/libc/machine/riscv/meson.build +index bf5a279d6..af2060e3c 100644 +--- a/newlib/libc/machine/riscv/meson.build ++++ b/newlib/libc/machine/riscv/meson.build +@@ -35,13 +35,17 @@ + srcs_machine = [ + 'ieeefp.c', + 'memcpy-asm.S', ++ 'memcpy-xqci.S', + 'memcpy.c', + 'memmove.S', + 'memmove.c', + 'memset.S', ++ 'memset-xqci.S', + 'setjmp.S', + 'stpcpy.c', ++ 'strcmp-xqci.S', + 'strcmp.S', ++ 'strcpy-xqci.S', + 'strcpy.c', + 'strlen.c', + ] +diff --git a/newlib/libc/machine/riscv/rv_string.h b/newlib/libc/machine/riscv/rv_string.h +index 4f40ab690..92e59f997 100644 +--- a/newlib/libc/machine/riscv/rv_string.h ++++ b/newlib/libc/machine/riscv/rv_string.h +@@ -23,16 +23,36 @@ + + #include + +-#if defined(__PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) ++#ifdef __riscv_xqci ++# define _MACHINE_RISCV_MEMCPY_ASM_XQCI_ ++#elif defined(__PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) + # define _MACHINE_RISCV_MEMCPY_ASM_ + #else + # define _MACHINE_RISCV_MEMCPY_C_ +-# endif ++#endif + + #if !defined(__PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) + # define _MACHINE_RISCV_MEMMOVE_GENERIC_ + #else + # define _MACHINE_RISCV_MEMMOVE_ASM_ +-# endif ++#endif ++ ++#if defined(__riscv_xqci) ++# define _MACHINE_RISCV_MEMSET_ASM_XQCI_ ++#else ++# define _MACHINE_RISCV_MEMSET_ASM_ ++#endif ++ ++#if defined(__riscv_xqci) ++# define _MACHINE_RISCV_STRCMP_ASM_XQCI_ ++#else ++# define _MACHINE_RISCV_STRCMP_ASM_ ++#endif ++ ++#if defined(__riscv_xqci) ++# define _MACHINE_RISCV_STRCPY_ASM_XQCI_ ++#else ++# define _MACHINE_RISCV_STRCPY_ASM_ ++#endif + + #endif /* _RV_STRING_H_ */ +diff --git a/newlib/libc/machine/riscv/strcmp-xqci.S 
b/newlib/libc/machine/riscv/strcmp-xqci.S +new file mode 100644 +index 000000000..1a7fdda89 +--- /dev/null ++++ b/newlib/libc/machine/riscv/strcmp-xqci.S +@@ -0,0 +1,83 @@ ++/***************************************************************** ++Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. ++SPDX-License-Identifier: BSD-3-Clause-Clear ++*****************************************************************/ ++ ++#include "rv_string.h" ++ ++#ifdef _MACHINE_RISCV_STRCMP_ASM_XQCI_ ++ ++.text ++ ++/*=========================================================================== ++ ++ int strcmp(const char *s1, const char *s2) ++ ++===========================================================================*/ ++ ++ /* ++ * Returns ++ * a0 - comparison result, value like strcmp ++ * ++ * Parameters ++ * a0 - string1 ++ * a1 - string2 ++ * ++ * Clobbers ++ * a2, a3, a4, a5, a6 ++ */ ++.global strcmp ++.type strcmp, @function ++strcmp: ++ mv a6, a0 ++ or a5, a6, a1 ++ and a5, a5, 3 ++ li a4, 0 ++ bnez a5, 3f ++ ++ /* Main loop for aligned string. */ ++1: ++ qc.lrw a2, a6, a4, 0 ++ qc.lrw a3, a1, a4, 0 ++ orc.b a5, a2 ++ qc.bnei a5, -1, 2f ++ addi a4, a4, 4 ++ beq a2, a3, 1b ++ ++ /* ++ * Words don't match, and no null byte in the first word. ++ * Compute the first differing byte and return unsigned difference. ++ */ ++ xor a5, a2, a3 ++ ctz a5, a5 # bit offset to first differing bit ++ andi a5, a5, -0x8 # bit offset to first differing byte ++ srl a0, a2, a5 ++ andi a0, a0, 0xFF ++ srl a3, a3, a5 ++ andi a3, a3, 0xFF ++ sub a0, a0, a3 ++ ret ++ ++2: ++ /* ++ * Found a null byte. ++ * If words don't match, fall back to simple loop. ++ */ ++ xor a0, a2, a3 ++ bnez a0, 3f ++ /* Otherwise, strings are equal. */ ++ ret ++ ++ /* Simple loop for misaligned strings. 
*/ ++3: ++ qc.lrbu a2, a6, a4, 0 ++ qc.lrbu a3, a1, a4, 0 ++ addi a4, a4, 1 ++ bne a2, a3, 4f ++ bnez a2, 3b ++ ++4: ++ sub a0, a2, a3 ++ ret ++ ++#endif /* _MACHINE_RISCV_STRCMP_ASM_XQCI_ */ +diff --git a/newlib/libc/machine/riscv/strcmp.S b/newlib/libc/machine/riscv/strcmp.S +index 1b9869e72..ffb7f2215 100644 +--- a/newlib/libc/machine/riscv/strcmp.S ++++ b/newlib/libc/machine/riscv/strcmp.S +@@ -9,7 +9,9 @@ + http://www.opensource.org/licenses. + */ + +-#include ++#include "rv_string.h" ++ ++#ifdef _MACHINE_RISCV_STRCMP_ASM_ + + #include "asm.h" + +@@ -202,3 +204,5 @@ mask: + .size strcmp, .-strcmp + + #endif ++ ++#endif /* _MACHINE_RISCV_STRCMP_ASM_ */ +diff --git a/newlib/libc/machine/riscv/strcpy-xqci.S b/newlib/libc/machine/riscv/strcpy-xqci.S +new file mode 100644 +index 000000000..56d82fe85 +--- /dev/null ++++ b/newlib/libc/machine/riscv/strcpy-xqci.S +@@ -0,0 +1,87 @@ ++/***************************************************************** ++Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
++SPDX-License-Identifier: BSD-3-Clause-Clear ++*****************************************************************/ ++ ++#include "rv_string.h" ++ ++#ifdef _MACHINE_RISCV_STRCPY_ASM_XQCI_ ++ ++.text ++ ++/*=========================================================================== ++ ++ char * strcpy(char *dst, const char *src) ++ ++===========================================================================*/ ++ ++ /* ++ * Returns ++ * a0 - destination string ++ * ++ * Parameters ++ * a0 - destination ++ * a1 - source ++ * ++ * Clobbers ++ * a2, a3, a4, a5, a6, a7, t3, t4, t5, t6 ++ */ ++.global strcpy ++.type strcpy, @function ++strcpy: ++ mv a7, a0 # a7 = working dst; a0 is kept as the return value ++ or a2, a0, a1 # Combine dst and src address bits ++ and a2, a2, 3 # Nonzero if either pointer is not word aligned ++ bnez a2, 4f # Misaligned dst or src: use the byte loop ++1: ++ qc.lwmi t3, 4, 0(a1) # Load 4 words into t3..t6 ++ add a1, a1, 16 ++ li a2, -1 ++ orc.b a3, t3 ++ and a2, a2, a3 ++ orc.b a4, t4 ++ and a2, a2, a4 ++ orc.b a5, t5 ++ and a2, a2, a5 ++ orc.b a6, t6 ++ and a2, a2, a6 ++ qc.bnei a2, -1, 2f # Some byte in these 16 is NUL ++ qc.swmi t3, 4, 0(a7) # No NUL: store all 4 words ++ add a7, a7, 16 ++ j 1b ++ ++2: ++ add a1, a1, -16 # Rewind src to the start of this 16-byte group ++ li a2, 0 # a2 = byte offset of the words stored so far ++ qc.bnei a3, -1, 3f ++ qc.srw t3, a7, a2, 0 ++ add a2, a2, 4 ++ qc.bnei a4, -1, 3f ++ qc.srw t4, a7, a2, 0 ++ add a2, a2, 4 ++ qc.bnei a5, -1, 3f ++ qc.srw t5, a7, a2, 0 ++ add a2, a2, 4 ++ ++3: ++ add a7, a7, a2 # Advance both pointers by a2 bytes ++ add a1, a1, a2 ++4: ++ lbu a2, 0(a1) ++ lbu a3, 1(a1) ++ lbu a4, 2(a1) ++ lbu a5, 3(a1) ++ sb a2, 0(a7) ++ beqz a2, 5f ++ sb a3, 1(a7) ++ beqz a3, 5f ++ sb a4, 2(a7) ++ beqz a4, 5f ++ sb a5, 3(a7) ++ beqz a5, 5f ++ li a2, 4 ++ j 3b ++5: ++ ret ++ ++#endif /* _MACHINE_RISCV_STRCPY_ASM_XQCI_ */ +diff --git a/newlib/libc/machine/riscv/strcpy.c b/newlib/libc/machine/riscv/strcpy.c +index ad4658f55..f1791c602 100644 +--- a/newlib/libc/machine/riscv/strcpy.c ++++ b/newlib/libc/machine/riscv/strcpy.c +@@ -11,6 +11,9 @@ + + #include + #include "rv_strcpy.h" ++#include "rv_string.h" ++ ++#ifdef _MACHINE_RISCV_STRCPY_ASM_ + + #undef strcpy + +@@ -18,3 +21,5 @@ char *strcpy(char *dst, const char *src) + { + return __libc_strcpy(dst, src, true); + } ++ ++#endif /* 
_MACHINE_RISCV_STRCPY_ASM_ */ +-- +2.34.1 +