On Thu, Jul 18, 2024 at 01:57:11PM +0200, Uros Bizjak wrote:
> Attached patch illustrates the proposed improvement with nested cpuid
> calls. Bootstrapped and teased with libatomic testsuite.
>
> Jakub, WDYT?
I'd probably keep the FEAT1_REGISTER = 0; before the if (__get_cpuid (1, ...)
to avoid the else, I think that could result in smaller code, but otherwise
LGTM, especially the use of just __cpuid there. And note your patch doesn't
incorporate the Zhaoxin changes.
> diff --git a/libatomic/config/x86/init.c b/libatomic/config/x86/init.c
> index a75be3f175c..94d45683567 100644
> --- a/libatomic/config/x86/init.c
> +++ b/libatomic/config/x86/init.c
> @@ -32,22 +32,25 @@ unsigned int
> __libat_feat1_init (void)
> {
> unsigned int eax, ebx, ecx, edx;
> - FEAT1_REGISTER = 0;
> - __get_cpuid (1, &eax, &ebx, &ecx, &edx);
> -#ifdef __x86_64__
> - if ((FEAT1_REGISTER & (bit_AVX | bit_CMPXCHG16B))
> - == (bit_AVX | bit_CMPXCHG16B))
> + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx))
> {
> - /* Intel SDM guarantees that 16-byte VMOVDQA on 16-byte aligned address
> - is atomic, and AMD is going to do something similar soon.
> - We don't have a guarantee from vendors of other CPUs with AVX,
> - like Zhaoxin and VIA. */
> - unsigned int ecx2 = 0;
> - __get_cpuid (0, &eax, &ebx, &ecx2, &edx);
> - if (ecx2 != signature_INTEL_ecx && ecx2 != signature_AMD_ecx)
> - FEAT1_REGISTER &= ~bit_AVX;
> - }
> +#ifdef __x86_64__
> + if ((FEAT1_REGISTER & (bit_AVX | bit_CMPXCHG16B))
> + == (bit_AVX | bit_CMPXCHG16B))
> + {
> + /* Intel SDM guarantees that 16-byte VMOVDQA on 16-byte aligned
> + address is atomic, and AMD is going to do something similar soon.
> + We don't have a guarantee from vendors of other CPUs with AVX,
> + like Zhaoxin and VIA. */
> + unsigned int ecx2;
> + __cpuid (0, eax, ebx, ecx2, edx);
> + if (ecx2 != signature_INTEL_ecx && ecx2 != signature_AMD_ecx)
> + FEAT1_REGISTER &= ~bit_AVX;
> + }
> #endif
> + }
> + else
> + FEAT1_REGISTER = 0;
> /* See the load in load_feat1. */
> __atomic_store_n (&__libat_feat1, FEAT1_REGISTER, __ATOMIC_RELAXED);
> return FEAT1_REGISTER;
Jakub