commit    1d47c0512a265d4bb3ab9e56259fd1e4f4d42c75
author    Jakub Jelinek <jakub@redhat.com>  2022-03-17 18:49:00 +0100
committer Jakub Jelinek <jakub@redhat.com>  2022-03-17 18:49:00 +0100
tree      082d567edef178aee086ebdb67cb4de18ebd46d8 /libatomic
parent    libstdc++: Fix comment in testsuite utility
libatomic: Improve 16-byte atomics on Intel AVX [PR104688]
As mentioned in the PR, the latest Intel SDM has added:

"Processors that enumerate support for Intel® AVX (by setting the feature
flag CPUID.01H:ECX.AVX[bit 28]) guarantee that the 16-byte memory operations
performed by the following instructions will always be carried out
atomically:
• MOVAPD, MOVAPS, and MOVDQA.
• VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
• VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with EVEX.128 and
k0 (masking disabled).
(Note that these instructions require the linear addresses of their memory
operands to be 16-byte aligned.)"

The following patch deals with it just on the libatomic library side so far;
currently (since ~ 2017) we emit all the __atomic_* 16-byte builtins as
library calls, and this is something that we can hopefully backport.

The patch introduces yet another ifunc variant that takes priority over the
pure CMPXCHG16B one.  It is selected when both the AVX and CMPXCHG16B bits
are set (on non-Intel CPUs the AVX bit is cleared during detection for now;
if AMD comes with the same guarantee, we could revert the config/x86/init.c
hunk), and it implements the 16-byte atomic load as vmovdqa and the 16-byte
atomic store as vmovdqa followed by mfence.

2022-03-17  Jakub Jelinek  <jakub@redhat.com>

	PR target/104688
	* Makefile.am (IFUNC_OPTIONS): Change on x86_64 to -mcx16 -mcx16.
	(libatomic_la_LIBADD): Add $(addsuffix _16_2_.lo,$(SIZEOBJS)) for
	x86_64.
	* Makefile.in: Regenerated.
	* config/x86/host-config.h (IFUNC_COND_1): For x86_64 define to
	both AVX and CMPXCHG16B bits.
	(IFUNC_COND_2): Define.
	(IFUNC_NCOND): For x86_64 define to 2 * (N == 16).
	(MAYBE_HAVE_ATOMIC_CAS_16, MAYBE_HAVE_ATOMIC_EXCHANGE_16,
	MAYBE_HAVE_ATOMIC_LDST_16): Define to IFUNC_COND_2 rather than
	IFUNC_COND_1.
	(HAVE_ATOMIC_CAS_16): Redefine to 1 whenever IFUNC_ALT != 0.
	(HAVE_ATOMIC_LDST_16): Redefine to 1 whenever IFUNC_ALT == 1.
	(atomic_compare_exchange_n): Define whenever IFUNC_ALT != 0
	on x86_64 for N == 16.
	(__atomic_load_n, __atomic_store_n): Redefine whenever IFUNC_ALT == 1
	on x86_64 for N == 16.
	(atomic_load_n, atomic_store_n): New functions.
	* config/x86/init.c (__libat_feat1_init): On x86_64 clear bit_AVX
	if CPU vendor is not Intel.
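For context, a minimal sketch (not part of the patch; variable names are
illustrative) of the kind of code this dispatch affects.  Since ~2017 GCC
emits 16-byte __atomic_* operations as libatomic calls, so with this patch
the calls below resolve to the vmovdqa variant on AVX-capable Intel CPUs:

/* Build with: gcc -O2 demo.c -latomic  */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic __int128 shared;	/* _Atomic forces 16-byte alignment */

int
main (void)
{
  __int128 v = ((__int128) 0xdeadbeefULL << 64) | 0xcafeULL;
  atomic_store (&shared, v);		/* -> __atomic_store_16 in libatomic */
  __int128 r = atomic_load (&shared);	/* -> __atomic_load_16 in libatomic */
  printf ("%llx %llx\n", (unsigned long long) (r >> 64),
	  (unsigned long long) r);
  return 0;
}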
Diffstat (limited to 'libatomic')

 -rw-r--r--  libatomic/Makefile.am               |  5
 -rw-r--r--  libatomic/Makefile.in               |  6
 -rw-r--r--  libatomic/config/x86/host-config.h  | 43
 -rw-r--r--  libatomic/config/x86/init.c         | 12

 4 files changed, 55 insertions(+), 11 deletions(-)
diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
index 389f3dd1c60..d88515e4a03 100644
--- a/libatomic/Makefile.am
+++ b/libatomic/Makefile.am
@@ -138,8 +138,9 @@ IFUNC_OPTIONS = -march=i586
 libatomic_la_LIBADD += $(addsuffix _8_1_.lo,$(SIZEOBJS))
 endif
 if ARCH_X86_64
-IFUNC_OPTIONS = -mcx16
-libatomic_la_LIBADD += $(addsuffix _16_1_.lo,$(SIZEOBJS))
+IFUNC_OPTIONS = -mcx16 -mcx16
+libatomic_la_LIBADD += $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
+		$(addsuffix _16_2_.lo,$(SIZEOBJS))
 endif
 endif
 
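(Why -mcx16 twice: if I read libatomic's Makefile.am pattern rules
correctly, the Nth word of IFUNC_OPTIONS supplies the extra compile flag
for ifunc alternative N, via IFUNC_OPT = $(word $(PAT_S),$(IFUNC_OPTIONS)),
so both the AVX alternative (_16_1_, IFUNC_ALT == 1) and the CMPXCHG16B
alternative (_16_2_, IFUNC_ALT == 2) are built with -mcx16.  The
repetition is deliberate, not a typo.)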
diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in
index 0a51bd55f01..80d25653dc7 100644
--- a/libatomic/Makefile.in
+++ b/libatomic/Makefile.in
@@ -96,7 +96,9 @@ target_triplet = @target@
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	$(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	_8_2_.lo,$(SIZEOBJS))
 @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(addsuffix _8_1_.lo,$(SIZEOBJS))
-@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix _16_1_.lo,$(SIZEOBJS))
+@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
+@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@	$(addsuffix _16_2_.lo,$(SIZEOBJS))
+
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
@@ -435,7 +437,7 @@ libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix \
 @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv8-a+lse
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv7-a+fp -DHAVE_KERNEL64
 @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=i586
-@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -mcx16
+@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -mcx16 -mcx16
 libatomic_convenience_la_SOURCES = $(libatomic_la_SOURCES)
 libatomic_convenience_la_LIBADD = $(libatomic_la_LIBADD)
 MULTISRCTOP =
diff --git a/libatomic/config/x86/host-config.h b/libatomic/config/x86/host-config.h
index f20ce0941a1..007b7e14718 100644
--- a/libatomic/config/x86/host-config.h
+++ b/libatomic/config/x86/host-config.h
@@ -55,31 +55,37 @@ load_feat1 (void)
 }
 
 #ifdef __x86_64__
-# define IFUNC_COND_1	(load_feat1 () & bit_CMPXCHG16B)
+# define IFUNC_COND_1	((load_feat1 () & (bit_AVX | bit_CMPXCHG16B)) \
+			 == (bit_AVX | bit_CMPXCHG16B))
+# define IFUNC_COND_2	(load_feat1 () & bit_CMPXCHG16B)
 #else
 # define IFUNC_COND_1	(load_feat1 () & bit_CMPXCHG8B)
 #endif
 
 #ifdef __x86_64__
-# define IFUNC_NCOND(N)	(N == 16)
+# define IFUNC_NCOND(N)	(2 * (N == 16))
 #else
 # define IFUNC_NCOND(N)	(N == 8)
 #endif
 
 #ifdef __x86_64__
 # undef MAYBE_HAVE_ATOMIC_CAS_16
-# define MAYBE_HAVE_ATOMIC_CAS_16	IFUNC_COND_1
+# define MAYBE_HAVE_ATOMIC_CAS_16	IFUNC_COND_2
 # undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
-# define MAYBE_HAVE_ATOMIC_EXCHANGE_16	IFUNC_COND_1
+# define MAYBE_HAVE_ATOMIC_EXCHANGE_16	IFUNC_COND_2
 # undef MAYBE_HAVE_ATOMIC_LDST_16
-# define MAYBE_HAVE_ATOMIC_LDST_16	IFUNC_COND_1
+# define MAYBE_HAVE_ATOMIC_LDST_16	IFUNC_COND_2
 /* Since load and store are implemented with CAS, they are not fast.  */
 # undef FAST_ATOMIC_LDST_16
 # define FAST_ATOMIC_LDST_16	0
-# if IFUNC_ALT == 1
+# if IFUNC_ALT != 0
 #  undef HAVE_ATOMIC_CAS_16
 #  define HAVE_ATOMIC_CAS_16 1
 # endif
+# if IFUNC_ALT == 1
+#  undef HAVE_ATOMIC_LDST_16
+#  define HAVE_ATOMIC_LDST_16 1
+# endif
 #else
 # undef MAYBE_HAVE_ATOMIC_CAS_8
 # define MAYBE_HAVE_ATOMIC_CAS_8	IFUNC_COND_1
@@ -93,7 +99,7 @@ load_feat1 (void)
 # endif
 #endif
 
-#if defined(__x86_64__) && N == 16 && IFUNC_ALT == 1
+#if defined(__x86_64__) && N == 16 && IFUNC_ALT != 0
 static inline bool
 atomic_compare_exchange_n (UTYPE *mptr, UTYPE *eptr, UTYPE newval,
 			   bool weak_p UNUSED, int sm UNUSED, int fm UNUSED)
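The body of atomic_compare_exchange_n lies outside this hunk's context
window; for reference, a hedged sketch of what a cmpxchg16b-based 16-byte
CAS looks like (illustrative only, assuming -mcx16; the legacy __sync
builtin is still expanded inline, unlike the 16-byte __atomic form):

/* cas16: returns true on success; on failure stores the value actually
   observed into *eptr.  With -mcx16 the __sync builtin compiles to
   lock cmpxchg16b.  */
static inline _Bool
cas16 (unsigned __int128 *mptr, unsigned __int128 *eptr,
       unsigned __int128 newval)
{
  unsigned __int128 expected = *eptr;
  unsigned __int128 old = __sync_val_compare_and_swap (mptr, expected, newval);
  if (old == expected)
    return 1;
  *eptr = old;	/* report the value actually seen */
  return 0;
}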
@@ -108,6 +114,29 @@ atomic_compare_exchange_n (UTYPE *mptr, UTYPE *eptr, UTYPE newval,
 # define atomic_compare_exchange_n atomic_compare_exchange_n
 #endif /* Have CAS 16 */
 
+#if defined(__x86_64__) && N == 16 && IFUNC_ALT == 1
+#define __atomic_load_n(ptr, model) \
+  (sizeof (*ptr) == 16 ? atomic_load_n (ptr, model) \
+   : (__atomic_load_n) (ptr, model))
+#define __atomic_store_n(ptr, val, model) \
+  (sizeof (*ptr) == 16 ? atomic_store_n (ptr, val, model) \
+   : (__atomic_store_n) (ptr, val, model))
+
+static inline UTYPE
+atomic_load_n (UTYPE *ptr, int model UNUSED)
+{
+  UTYPE ret;
+  __asm__ ("vmovdqa\t{%1, %0|%0, %1}" : "=x" (ret) : "m" (*ptr));
+  return ret;
+}
+
+static inline void
+atomic_store_n (UTYPE *ptr, UTYPE val, int model UNUSED)
+{
+  __asm__ ("vmovdqa\t{%1, %0|%0, %1}\n\tmfence" : "=m" (*ptr) : "x" (val));
+}
+#endif
+
 #endif /* HAVE_IFUNC */
 
 #include_next <host-config.h>
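A hedged, self-contained sketch of the selection order the two conditions
above encode (this is not libatomic's actual ifunc machinery; in the real
library, the init.c change below additionally clears bit_AVX on non-Intel
CPUs before this test is made):

#include <cpuid.h>	/* bit_AVX, bit_CMPXCHG16B */
#include <stdio.h>

static const char *
select_16 (unsigned int feat1)	/* feat1 = CPUID.01H:ECX */
{
  /* IFUNC_COND_1: AVX + CMPXCHG16B -> vmovdqa load/store (IFUNC_ALT == 1).  */
  if ((feat1 & (bit_AVX | bit_CMPXCHG16B)) == (bit_AVX | bit_CMPXCHG16B))
    return "avx (IFUNC_ALT == 1)";
  /* IFUNC_COND_2: CMPXCHG16B only -> CAS-based load/store (IFUNC_ALT == 2).  */
  if (feat1 & bit_CMPXCHG16B)
    return "cmpxchg16b (IFUNC_ALT == 2)";
  return "locked fallback (IFUNC_ALT == 0)";
}

int
main (void)
{
  unsigned int eax, ebx, ecx = 0, edx;
  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx))
    printf ("16-byte variant: %s\n", select_16 (ecx));
  return 0;
}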
diff --git a/libatomic/config/x86/init.c b/libatomic/config/x86/init.c
index 7bdec722725..6f6499c58c3 100644
--- a/libatomic/config/x86/init.c
+++ b/libatomic/config/x86/init.c
@@ -34,6 +34,18 @@ __libat_feat1_init (void)
   unsigned int eax, ebx, ecx, edx;
   FEAT1_REGISTER = 0;
   __get_cpuid (1, &eax, &ebx, &ecx, &edx);
+#ifdef __x86_64__
+  if ((FEAT1_REGISTER & (bit_AVX | bit_CMPXCHG16B))
+      == (bit_AVX | bit_CMPXCHG16B))
+    {
+      /* Intel SDM guarantees that 16-byte VMOVDQA on 16-byte aligned address
+	 is atomic, but so far we don't have this guarantee from AMD.  */
+      unsigned int ecx2 = 0;
+      __get_cpuid (0, &eax, &ebx, &ecx2, &edx);
+      if (ecx2 != signature_INTEL_ecx)
+	FEAT1_REGISTER &= ~bit_AVX;
+    }
+#endif
   /* See the load in load_feat1.  */
   __atomic_store_n (&__libat_feat1, FEAT1_REGISTER, __ATOMIC_RELAXED);
   return FEAT1_REGISTER;
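A standalone sketch of the same vendor check, runnable outside libatomic
(signature_INTEL_ecx is the "ntel" chunk of the "GenuineIntel" vendor
string, defined by GCC's <cpuid.h>):

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx = 0, edx;
  /* Leaf 0: maximum leaf in EAX, vendor string in EBX:EDX:ECX.  */
  if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx))
    return 1;
  printf ("AVX 16-byte atomicity %s\n",
	  ecx == signature_INTEL_ecx
	  ? "assumed (GenuineIntel)" : "not assumed (non-Intel vendor)");
  return 0;
}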