Hi Vladimir,
both patches applied and pushed,
best regards
Waldemar
Vladimir Murzin wrote,
O0 build result in the following codegen
00000000 <ldrex>:
0: b480 push {r7}
2: b085 sub sp, #20
4: af00 add r7, sp, #0
6: 6078 str r0, [r7, #4]
8: 687b ldr r3, [r7, #4]
a: e853 3f00 ldrex r3, [r3]
e: 60fb str r3, [r7, #12]
10: 68fb ldr r3, [r7, #12]
12: 4618 mov r0, r3
14: 3714 adds r7, #20
16: 46bd mov sp, r7
18: f85d 7b04 ldr.w r7, [sp], #4
1c: 4770 bx lr
0000001e <strex>:
1e: b480 push {r7}
20: b085 sub sp, #20
22: af00 add r7, sp, #0
24: 6078 str r0, [r7, #4]
26: 6039 str r1, [r7, #0]
28: 687b ldr r3, [r7, #4]
2a: 683a ldr r2, [r7, #0]
2c: e842 3300 strex r3, r3, [r2]
30: 60fb str r3, [r7, #12]
32: 68fb ldr r3, [r7, #12]
34: 4618 mov r0, r3
36: 3714 adds r7, #20
38: 46bd mov sp, r7
3a: f85d 7b04 ldr.w r7, [sp], #4
3e: 4770 bx lr
00000040 <testandset>:
40: b590 push {r4, r7, lr}
42: b083 sub sp, #12
44: af00 add r7, sp, #0
46: 6078 str r0, [r7, #4]
48: 6878 ldr r0, [r7, #4]
4a: f7ff fffe bl 0 <ldrex>
4e: 4603 mov r3, r0
50: 461c mov r4, r3
52: 6879 ldr r1, [r7, #4]
54: 2001 movs r0, #1
56: f7ff fffe bl 1e <strex>
5a: 4603 mov r3, r0
5c: 2b00 cmp r3, #0
5e: d1f3 bne.n 48 <testandset+0x8>
60: 4623 mov r3, r4
62: 4618 mov r0, r3
64: 370c adds r7, #12
66: 46bd mov sp, r7
68: bd90 pop {r4, r7, pc}
ARM ARM suggests that LoadExcl/StoreExcl loops are guaranteed to make
forward progress only if, for any LoadExcl/StoreExcl loop within a
single thread of execution, the software meets all of the following
conditions:
1 Between the Load-Exclusive and the Store-Exclusive, there are no
explicit memory accesses, preloads, direct or indirect System
register writes, address translation instructions, cache or TLB
maintenance instructions, exception generating instructions,
exception returns, or indirect branches.
...
Obviously condition is not met for O0 builds.
O2 build (which is highly likely the most common setting) able to do
the right thing resulting in
00000000 <ldrex>:
0: e850 0f00 ldrex r0, [r0]
4: 4770 bx lr
6: bf00 nop
00000008 <strex>:
8: e841 0000 strex r0, r0, [r1]
c: 4770 bx lr
e: bf00 nop
00000010 <testandset>:
10: 2101 movs r1, #1
12: 4603 mov r3, r0
14: e853 0f00 ldrex r0, [r3]
18: e843 1200 strex r2, r1, [r3]
1c: 2a00 cmp r2, #0
1e: d1f9 bne.n 14 <testandset+0x4>
20: 4770 bx lr
22: bf00 nop
Rather than depending on level of optimisation implement whole
ldrex/strex loop in inline assembly.
Signed-off-by: Vladimir Murzin <vladimir.murzin(a)arm.com>
---
.../linuxthreads/sysdeps/arm/pt-machine.h | 34 +++++--------------
1 file changed, 9 insertions(+), 25 deletions(-)
diff --git a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
index fc17e9bc7..b00b10495 100644
--- a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
+++ b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
@@ -29,35 +29,19 @@
#endif
#if defined(__thumb2__)
-PT_EI long int ldrex(int *spinlock)
-{
- long int ret;
- __asm__ __volatile__(
- "ldrex %0, [%1]\n"
- : "=r"(ret)
- : "r"(spinlock) : "memory");
- return ret;
-}
-
-PT_EI long int strex(int val, int *spinlock)
-{
- long int ret;
- __asm__ __volatile__(
- "strex %0, %1, [%2]\n"
- : "=r"(ret)
- : "r" (val), "r"(spinlock) : "memory");
- return ret;
-}
-
/* Spinlock implementation; required. */
PT_EI long int
testandset (int *spinlock)
{
- register unsigned int ret;
-
- do {
- ret = ldrex(spinlock);
- } while (strex(1, spinlock));
+ unsigned int ret, tmp, val = 1;
+
+ __asm__ __volatile__ (
+"0: ldrex %0, [%2] \n"
+" strex %1, %3, [%2] \n"
+" cmp %1, #0 \n"
+" bne 0b"
+ : "=&r" (ret), "=&r" (tmp)
+ : "r" (spinlock), "r" (val) : "memory",
"cc");
return ret;
}
--
2.25.1