From: Pavel Kozlov <pavel.kozlov@synopsys.com>
Add the ability to use optimized versions of string functions for ARCv3
32-bit CPUs via the UCLIBC_HAS_STRING_ARCH_OPT option. Add optimized
memcpy/memset/memcmp code for ARCv3 CPUs, based on the code from newlib,
and adapt the existing optimized strchr/strcmp/strcpy/strlen for ARCv3.
The ARCv3 code lives in the Synopsys newlib repo on GitHub:
https://github.com/foss-for-synopsys-dwc-arc-processors/newlib
Signed-off-by: Pavel Kozlov <pavel.kozlov@synopsys.com>
---
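Note (below the fold, not part of the commit message): the new ARCv3
routines must keep the standard C semantics of memcmp/memcpy/memset.
A minimal sketch in plain ISO C, useful as a cross-check against the
assembly below:

	#include <assert.h>
	#include <string.h>

	int main(void)
	{
		/* memcmp compares bytes as unsigned char; the sign of the
		   first differing byte decides the result. */
		unsigned char a[2] = { 0x01, 0x80 };
		unsigned char b[2] = { 0x01, 0x7f };
		assert(memcmp(a, b, 2) > 0);

		/* memcpy and memset return the destination unchanged. */
		char dst[16];
		assert(memset(dst, 0xaa, sizeof dst) == dst);
		assert(memcpy(dst, a, sizeof a) == dst);
		return 0;
	}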
libc/string/arc/memcmp.S | 94 +++++++++++++++++++++++++++++++++++-
libc/string/arc/memcpy.S | 65 +++++++++++++++++++++----
libc/string/arc/memset.S | 61 +++++++++++++++++++----
libc/string/arc/strchr.S | 25 +++++-----
libc/string/arc/strcmp.S | 21 ++++----
libc/string/arc/strlen.S | 7 +--
libc/sysdeps/linux/arc/asm.h | 39 +++++++++++++++
7 files changed, 267 insertions(+), 45 deletions(-)
diff --git a/libc/string/arc/memcmp.S b/libc/string/arc/memcmp.S
index a60757e7a254..20122a2967eb 100644
--- a/libc/string/arc/memcmp.S
+++ b/libc/string/arc/memcmp.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
* Copyright (C) 2007 ARC International (UK) LTD
*
* Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -17,6 +17,8 @@
#endif
ENTRY(memcmp)
+
+#if defined(__ARC700__) || defined(__ARCHS__)
or r12,r0,r1
asl_s r12,r12,30
sub r3,r2,1
@@ -149,6 +151,96 @@ ENTRY(memcmp)
.Lnil:
j_s.d [blink]
mov r0,0
+
+#elif defined(__ARC64_ARCH32__)
+ ;; Based on Synopsys code from newlib's arc64/memcmp.S
+ cmp r2, 32
+ bls.d @.L_compare_1_bytes
+ mov r3, r0 ; "r0" will be used as return value
+
+ lsr r12, r2, 4 ; counter for 16-byte chunks
+ xor r13, r13, r13 ; mask of register pairs that compared unequal
+
+.L_compare_16_bytes:
+ ld.ab r4, [r3, +4]
+ ld.ab r5, [r1, +4]
+ ld.ab r6, [r3, +4]
+ ld.ab r7, [r1, +4]
+ ld.ab r8, [r3, +4]
+ ld.ab r9, [r1, +4]
+ ld.ab r10, [r3, +4]
+ ld.ab r11, [r1, +4]
+ xor.f 0, r4, r5
+ xor.ne r13, r13, 0b0001
+ xor.f 0, r6, r7
+ xor.ne r13, r13, 0b0010
+ xor.f 0, r8, r9
+ xor.ne r13, r13, 0b0100
+ xor.f 0, r10, r11
+ xor.ne r13, r13, 0b1000
+ brne r13, 0, @.L_unequal_find
+ dbnz r12, @.L_compare_16_bytes
+
+ ;; Adjust the pointers to account for the extra loads at the end
+ sub r1, r1, 4
+ sub r3, r3, 4
+ bmsk_s r2, r2, 3 ; any remaining bytes to compare
+
+.L_compare_1_bytes:
+ cmp r2, 0
+ jeq.d [blink]
+ xor_s r0, r0, r0
+
+2:
+ ldb.ab r4, [r3, +1]
+ ldb.ab r5, [r1, +1]
+ sub.f r0, r4, r5
+ jne [blink]
+ dbnz r2, @2b
+ j_s [blink]
+
+ ;; At this point, we want to find the _first_ comparison that marked the
+ ;; inequality of "lhs" and "rhs"
+.L_unequal_find:
+ ffs r13, r13
+ asl r13, r13, 2
+ bi [r13]
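+ ;; bi (branch indexed) jumps to next_pc + (r13 << 2); with the asl
+ ;; above this steps in 16-byte units, one per four-instruction entry
+ ;; below (assumes the standard ARC bi semantics)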
+.L_unequal_r4r5:
+ mov r1, r4
+ b.d @.L_diff_byte_in_regs
+ mov r2, r5
+ nop
+.L_unequal_r6r7:
+ mov r1, r6
+ b.d @.L_diff_byte_in_regs
+ mov r2, r7
+ nop
+.L_unequal_r8r9:
+ mov r1, r8
+ b.d @.L_diff_byte_in_regs
+ mov r2, r9
+ nop
+.L_unequal_r10r11:
+ mov r1, r10
+ mov r2, r11
+
+ ;; fall-through
+ ;; If we're here, that means the two operands are not equal.
+.L_diff_byte_in_regs:
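+ ;; Little-endian: the first differing byte is the least significant
+ ;; one that differs, so round the position of the first set bit down
+ ;; to a byte boundary (0, 8, 16 or 24) and extract that byte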
+ xor r0, r1, r2
+ ffs r0, r0
+ and r0, r0, 0x18
+ lsr r1, r1, r0
+ lsr r2, r2, r0
+ bmsk_s r1, r1, 7
+ bmsk_s r2, r2, 7
+ j_s.d [blink]
+ sub r0, r1, r2
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
+
END(memcmp)
libc_hidden_def(memcmp)
diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S
index 69d7220b8a7e..153083765ebb 100644
--- a/libc/string/arc/memcpy.S
+++ b/libc/string/arc/memcpy.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
* Copyright (C) 2007 ARC International (UK) LTD
*
* Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,13 +7,9 @@
#include <sysdep.h>
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
-
ENTRY(memcpy)
-#ifdef __ARC700__
+#if defined(__ARC700__)
/* This memcpy implementation does not support objects of 1GB or larger -
the check for alignment does not work then. */
/* We assume that most sources and destinations are aligned, and
@@ -73,9 +69,9 @@ ENTRY(memcpy)
.Lendbloop:
j_s.d [blink]
stb r12,[r5,0]
-#endif /* __ARC700__ */
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
+
#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; <<
# define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >>
@@ -299,7 +295,58 @@ ENTRY(memcpy)
stb.ab r6, [r3,1]
.Lcopybytewise_3:
j [blink]
-#endif /* __ARCHS__ */
+
+#elif defined(__ARC64_ARCH32__)
+ ;; Based on Synopsys code from newlib's arc64/memcpy.S
+ lsr.f r11, r2, 4 ; counter for 16-byte chunks
+ beq.d @.L_write_15_bytes
+ mov r3, r0 ; work on a copy of "r0"
+
+.L_write_16_bytes:
+#if defined(__ARC64_LL64__)
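+ ;; LL64 available: move each 16-byte chunk as two 64-bit ldd/std pairs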
+ ldd.ab r4, [r1, 8]
+ ldd.ab r6, [r1, 8]
+ std.ab r4, [r3, 8]
+ std.ab r6, [r3, 8]
+ dbnz r11, @.L_write_16_bytes
+#else
+ ld.ab r4, [r1, 4]
+ ld.ab r5, [r1, 4]
+ ld.ab r6, [r1, 4]
+ ld.ab r7, [r1, 4]
+ st.ab r4, [r3, 4]
+ st.ab r5, [r3, 4]
+ st.ab r6, [r3, 4]
+ dbnz.d r11, @.L_write_16_bytes
+ st.ab r7, [r3, 4]
+#endif
+ bmsk_s r2, r2, 3
+
+.L_write_15_bytes:
+ bbit0.d r2, 1, @1f
+ lsr r11, r2, 2
+ ldh.ab r4, [r1, 2]
+ sth.ab r4, [r3, 2]
+1:
+ bbit0.d r2, 0, @1f
+ xor r11, r11, 3
+ ldb.ab r4, [r1, 1]
+ stb.ab r4, [r3, 1]
+1:
+ asl r11, r11, 1
+ bi [r11]
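+ ;; r11 = 2 * (3 - words_left), so bi skips (3 - words_left) of the
+ ;; ld/st pairs below and exactly words_left word copies execute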
+ ld.ab r4,[r1, 4]
+ st.ab r4,[r3, 4]
+ ld.ab r4,[r1, 4]
+ st.ab r4,[r3, 4]
+ ld r4,[r1]
+ st r4,[r3]
+
+ j_s [blink]
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
END(memcpy)
libc_hidden_def(memcpy)
diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S
index 0b74ddc7fca8..5aa5d6c655c8 100644
--- a/libc/string/arc/memset.S
+++ b/libc/string/arc/memset.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
* Copyright (C) 2007 ARC International (UK) LTD
*
* Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,13 +7,9 @@
#include <sysdep.h>
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
-
ENTRY(memset)
-#ifdef __ARC700__
+#if defined(__ARC700__)
#define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */
mov_s r4,r0
@@ -52,9 +48,8 @@ ENTRY(memset)
stb.ab r1,[r4,1]
.Ltiny_end:
j_s [blink]
-#endif /* __ARC700__ */
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
#ifdef DONT_USE_PREALLOC
#define PREWRITE(A,B) prefetchw [(A),(B)]
#else
@@ -156,7 +151,55 @@ ENTRY(memset)
.Lcopy3bytes:
j [blink]
-#endif /* __ARCHS__ */
+
+#elif defined(__ARC64_ARCH32__)
+ ;; Based on Synopsys code from newlib's arc64/memset.S
+
+ ;; Replicate the fill byte across a 32-bit word
+ bmsk_s r1, r1, 7 ; treat it like unsigned char
+ lsl8 r3, r1
+ or_s r1, r1, r3
+ lsl16 r3, r1
+ or r6, r1, r3
+ mov r7,r6
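+ ;; r6:r7 hold the pattern as a 64-bit register pair for std.ab (LL64)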
+
+ lsr.f r5, r2, 4 ; counter for 16-byte chunks
+ beq.d @.L_write_15_bytes
+ mov r4, r0 ; work on a copy of "r0"
+
+.L_write_16_bytes:
+#if defined(__ARC64_LL64__)
+ std.ab r6, [r4, 8]
+ std.ab r6, [r4, 8]
+ dbnz r5, @.L_write_16_bytes
+#else
+ st.ab r6, [r4, 4]
+ st.ab r6, [r4, 4]
+ st.ab r6, [r4, 4]
+ dbnz.d r5, @.L_write_16_bytes
+ st.ab r6, [r4, 4]
+#endif
+ bmsk_s r2, r2, 3
+
+.L_write_15_bytes:
+ bbit0.d r2, 1, @1f
+ lsr r3, r2, 2
+ sth.ab r6, [r4, 2]
+1:
+ bbit0.d r2, 0, @1f
+ xor r3, r3, 3
+ stb.ab r6, [r4, 1]
+1:
+ bi [r3]
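+ ;; r3 = 3 - words_left; each entry below is a single st.ab, so bi
+ ;; skips (3 - words_left) stores and exactly words_left run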
+ st.ab r6,[r4, 4]
+ st.ab r6,[r4, 4]
+ st.ab r6,[r4, 4]
+
+ j_s [blink]
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
END(memset)
libc_hidden_def(memset)
diff --git a/libc/string/arc/strchr.S b/libc/string/arc/strchr.S
index 443993589966..df25eb3be225 100644
--- a/libc/string/arc/strchr.S
+++ b/libc/string/arc/strchr.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
* Copyright (C) 2007 ARC International (UK) LTD
*
* Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,6 +7,7 @@
#include <sysdep.h>
#include <features.h>
+#include <asm.h>
/* ARC700 has a relatively long pipeline and branch prediction, so we want
to avoid branches that are hard to predict. On the other hand, the
@@ -21,7 +22,7 @@ ENTRY(strchr)
mov_s r3,0x01010101
breq.d r2,r0,.Laligned
asl r4,r5,16
- sub_s r0,r0,r2
+ SUBR_S r0,r0,r2
asl r7,r2,3
ld_s r2,[r0]
#ifdef __LITTLE_ENDIAN__
@@ -77,10 +78,10 @@ ENTRY(strchr)
sub r3,r7,1
bic r3,r3,r7
norm r2,r3
- sub_s r0,r0,1
- asr_s r2,r2,3
+ SUBR_S r0,r0,1
+ ASRR_S r2,r2,3
j.d [blink]
- sub_s r0,r0,r2
+ SUBR_S r0,r0,r2
.balign 4
.Lfound0_ua:
@@ -90,13 +91,13 @@ ENTRY(strchr)
bic r3,r3,r6
and r2,r3,r4
or_s r12,r12,r2
- sub_s r3,r12,1
+ SUBR_S r3,r12,1
bic_s r3,r3,r12
norm r3,r3
- add_s r0,r0,3
- asr_s r12,r3,3
+ ADDR_S r0,r0,3
+ ASRR_S r12,r3,3
asl.f 0,r2,r3
- sub_s r0,r0,r12
+ SUBR_S r0,r0,r12
j_s.d [blink]
mov.pl r0,0
#else /* BIG ENDIAN */
@@ -106,10 +107,10 @@ ENTRY(strchr)
bic r2,r7,r6
.Lfound_char_b:
norm r2,r2
- sub_s r0,r0,4
+ SUBR_S r0,r0,4
asr_s r2,r2,3
j.d [blink]
- add_s r0,r0,r2
+ ADDR_S r0,r0,r2
.Lfound0_ua:
mov_s r3,r7
@@ -126,7 +127,7 @@ ENTRY(strchr)
add.pl r3,r3,1
asr_s r12,r3,3
asl.f 0,r2,r3
- add_s r0,r0,r12
+ ADDR_S r0,r0,r12
j_s.d [blink]
mov.mi r0,0
#endif /* ENDIAN */
diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S
index ad38d9e00c8a..3f64ac421acc 100644
--- a/libc/string/arc/strcmp.S
+++ b/libc/string/arc/strcmp.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
* Copyright (C) 2007 ARC International (UK) LTD
*
* Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,14 +7,11 @@
#include <features.h>
#include <sysdep.h>
-
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
+#include <asm.h>
ENTRY(strcmp)
-#ifdef __ARC700__
+#if defined(__ARC700__) || defined(__ARC64_ARCH32__)
/* This is optimized primarily for the ARC700.
It would be possible to speed up the loops by one cycle / word
respective one cycle / byte by forcing double source 1 alignment, unrolling
@@ -38,7 +35,7 @@ ENTRY(strcmp)
breq r2,r3,.Lwordloop
#ifdef __LITTLE_ENDIAN__
xor r0,r2,r3 ; mask for difference
- sub_s r1,r0,1
+ SUBR_S r1,r0,1
bic_s r0,r0,r1 ; mask for least significant difference bit
sub r1,r5,r0
xor r0,r5,r1 ; mask for least significant difference byte
@@ -55,7 +52,7 @@ ENTRY(strcmp)
.Lfound0:
xor r0,r2,r3 ; mask for difference
or r0,r0,r4 ; or in zero indicator
- sub_s r1,r0,1
+ SUBR_S r1,r0,1
bic_s r0,r0,r1 ; mask for least significant difference bit
sub r1,r5,r0
xor r0,r5,r1 ; mask for least significant difference byte
@@ -99,9 +96,8 @@ ENTRY(strcmp)
.Lcmpend:
j_s.d [blink]
sub r0,r2,r3
-#endif /* __ARC700__ */
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
or r2, r0, r1
bmsk_s r2, r2, 1
brne r2, 0, @.Lcharloop
@@ -168,7 +164,10 @@ ENTRY(strcmp)
.Lcmpend:
j_s.d [blink]
sub r0, r2, r3
-#endif /* __ARCHS__ */
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
END(strcmp)
libc_hidden_def(strcmp)
diff --git a/libc/string/arc/strlen.S b/libc/string/arc/strlen.S
index 0b9b9381576c..0d1d3aa4e787 100644
--- a/libc/string/arc/strlen.S
+++ b/libc/string/arc/strlen.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
* Copyright (C) 2007 ARC International (UK) LTD
*
* Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,6 +7,7 @@
#include <sysdep.h>
+#include <asm.h>
ENTRY(strlen)
or r3,r0,7
@@ -15,7 +16,7 @@ ENTRY(strlen)
mov r4,0x01010101
; uses long immediate
#ifdef __LITTLE_ENDIAN__
- asl_s r1,r0,3
+ ASLR_S r1,r0,3
btst_s r0,2
asl r7,r4,r1
ror r5,r4
@@ -59,7 +60,7 @@ ENTRY(strlen)
sub.ne r3,r3,4
mov.eq r1,r12
#ifdef __LITTLE_ENDIAN__
- sub_s r2,r1,1
+ SUBR_S r2,r1,1
bic_s r2,r2,r1
norm r1,r2
sub_s r0,r0,3
diff --git a/libc/sysdeps/linux/arc/asm.h b/libc/sysdeps/linux/arc/asm.h
index f15dff841aec..f83075ea1191 100644
--- a/libc/sysdeps/linux/arc/asm.h
+++ b/libc/sysdeps/linux/arc/asm.h
@@ -7,6 +7,13 @@
#ifndef _ARC_ASM_H
#define _ARC_ASM_H
+/*
+ * Some 16-bit instructions were excluded from the ARCv3 ISA.
+ * The following macros handle these differences in one place: existing
+ * ARCv2 code stays unchanged and keeps using the 16-bit instruction
+ * variants, while ARCv3 builds get the 32-bit versions instead.
+ */
+
#if defined (__ARC64_ARCH32__)
.macro PUSHR reg
@@ -25,6 +32,22 @@
pop \reg
.endm
+.macro SUBR_S dst,src1,src2
+ sub \dst, \src1, \src2
+.endm
+
+.macro ADDR_S dst,src1,src2
+ add \dst, \src1, \src2
+.endm
+
+.macro ASRR_S dst,src1,src2
+ asr \dst, \src1, \src2
+.endm
+
+.macro ASLR_S dst,src1,src2
+ asl \dst, \src1, \src2
+.endm
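+
+/*
+ * For example, "SUBR_S r0,r0,r2" assembles to the 32-bit "sub" here on
+ * ARCv3, and to the 16-bit "sub_s" in the ARCv2 block further down.
+ */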
+
#elif defined (__ARC64_ARCH64__)
# error ARCv3 64-bit is not supported by uClibc-ng
@@ -47,6 +70,22 @@
pop_s \reg
.endm
+.macro SUBR_S dst,src1,src2
+ sub_s \dst, \src1, \src2
+.endm
+
+.macro ADDR_S dst,src1,src2
+ add_s \dst, \src1, \src2
+.endm
+
+.macro ASRR_S dst,src1,src2
+ asr_s \dst, \src1, \src2
+.endm
+
+.macro ASLR_S dst,src1,src2
+ asl_s \dst, \src1, \src2
+.endm
+
#endif
#endif /* _ARC_ASM_H */
--
2.25.1