https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114576
Bug ID: 114576
Summary: [13 regression][config/i386] GCC 14/trunk emits
VEX-prefixed AES instruction without AVX enabled
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: thiago at kde dot org
Target Milestone: ---
Re: https://bugreports.qt.io/browse/QTBUG-123965
Re: https://bugzilla.redhat.com/show_bug.cgi?id=2262640,
https://bugzilla.redhat.com/show_bug.cgi?id=2272758
Godbolt link: https://gcc.godbolt.org/z/6P9fMvoxW
Found while compiling Qt 6.6 or 6.7 with GCC 14 (current trunk). This is a
regression from GCC 13.
This function from qhash.cpp
<https://github.com/qt/qtbase/blob/v6.7.0/src/corelib/tools/qhash.cpp#L581-L588>:
// Excerpt: returns a second 128-bit state, computed by one AES encryption
// round over the member state0 with the member mseed2 as the round key.
Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const
{
{
// unlike the Go code, we don't have more per-process seed
__m128i state1 = _mm_aesenc_si128(state0, mseed2);
return state1;
}
}
Is apparently getting assembled to:
.L2:
leaq (%rdi,%rsi), %rdx
vaesenc %xmm1, %xmm0, %xmm1
Though there's no AVX enabled in this code (the original version in Qt has some
AVX/VAES and AVX512 code but the reduced example does not).
This function:
// hash twice 16 bytes, running 2 scramble rounds of AES on itself
// Excerpt: state0 and state1 are updated in place through the references.
// src0 and src1 are each read with one unaligned 16-byte load.
static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL
hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, const
__m128i *src1)
{
// load both 16-byte input blocks
__m128i data0 = _mm_loadu_si128(src0);
__m128i data1 = _mm_loadu_si128(src1);
// mix each input block into its state
state0 = _mm_xor_si128(data0, state0);
state1 = _mm_xor_si128(data1, state1);
// two AES rounds per state, each using the state itself as the round key;
// the two chains are independent of each other and are interleaved here
state0 = _mm_aesenc_si128(state0, state0);
state1 = _mm_aesenc_si128(state1, state1);
state0 = _mm_aesenc_si128(state0, state0);
state1 = _mm_aesenc_si128(state1, state1);
}
Is even emitting:
.L20:
movdqu (%rax), %xmm2
pxor %xmm0, %xmm2
movdqu -16(%rdx), %xmm0
pxor %xmm0, %xmm1
vaesenc %xmm2, %xmm2, %xmm0
aesenc %xmm1, %xmm1
aesenc %xmm0, %xmm0
aesenc %xmm1, %xmm1
It makes no sense to use the AVX (VEX) encoding for only one of the four AES
instructions generated from the same source function.
For reference, GCC 13 generates respectively:
.L2:
movdqa %xmm0, %xmm1
leaq (%rdi,%rsi), %rdx
aesenc %xmm2, %xmm1
and
.L20:
movdqu (%rax), %xmm2
pxor %xmm0, %xmm2
movdqu -16(%rdx), %xmm0
aesenc %xmm2, %xmm2
pxor %xmm0, %xmm1
movdqa %xmm2, %xmm0
aesenc %xmm1, %xmm1
aesenc %xmm2, %xmm0
aesenc %xmm1, %xmm1
You can tell that they are the same source block because the labels are the
same.
Sources:
#include <immintrin.h>
// Compiler portability macros for the reduced test case.
#ifdef _MSC_VER
// MSVC: force inlining, use the vector calling convention, and expand the
// per-function target macro to nothing.
# define Q_ALWAYS_INLINE __forceinline
# define QT_VECTORCALL __vectorcall
# define QT_FUNCTION_TARGET(x)
#else
// Other compilers: QT_FUNCTION_TARGET(AES) expands to
// __attribute__((target("sse4.2,aes"))). Note that the target string does
// not mention AVX.
# define Q_ALWAYS_INLINE inline __attribute__((always_inline))
# define QT_VECTORCALL
# define QT_FUNCTION_TARGET(x) __attribute__((target(QT_FUNCTION_TARGET_##x)))
# define QT_FUNCTION_TARGET_AES "sse4.2,aes"
//# define qCpuHasFeature(x) __builtin_cpu_supports(QT_FUNCTION_TARGET_ ## x)
#endif
// Always evaluates to true in this reduced test case.
#define QT_COMPILER_SUPPORTS_HERE(x) true
// Word-size helper names, mapped here to the 64-bit intrinsic variants.
# define mm_set1_epz _mm_set1_epi64x
# define mm_cvtsz_si128 _mm_cvtsi64_si128
# define mm_cvtsi128_sz _mm_cvtsi128_si64
# define mm256_set1_epz _mm256_set1_epi64x
// Feature query declared here and defined elsewhere. The macro of the same
// name stringifies its argument before calling the function.
extern bool qCpuHasFeature(const char *) noexcept;
#define qCpuHasFeature(x) qCpuHasFeature(#x)
// Local stand-ins for the Qt integer type aliases used below.
using uchar = unsigned char;
using quintptr = unsigned long long;
using qint8 = signed char;
// hash 16 bytes, running 3 scramble rounds of AES on itself (like label
"final1")
static void Q_ALWAYS_INLINE QT_FUNCTION_TARGET(AES) QT_VECTORCALL
hash16bytes(__m128i &state0, __m128i data)
{
state0 = _mm_xor_si128(state0, data);
state0 = _mm_aesenc_si128(state0, state0);
state0 = _mm_aesenc_si128(state0, state0);
state0 = _mm_aesenc_si128(state0, state0);
}
// hash twice 16 bytes, running 2 scramble rounds of AES on itself
// state0 and state1 are updated in place through the references. src0 and
// src1 are each read with one unaligned 16-byte load.
static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL
hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, const
__m128i *src1)
{
// load both 16-byte input blocks
__m128i data0 = _mm_loadu_si128(src0);
__m128i data1 = _mm_loadu_si128(src1);
// mix each input block into its state
state0 = _mm_xor_si128(data0, state0);
state1 = _mm_xor_si128(data1, state1);
// two AES rounds per state, each using the state itself as the round key;
// the two chains are independent of each other and are interleaved here
state0 = _mm_aesenc_si128(state0, state0);
state1 = _mm_aesenc_si128(state1, state1);
state0 = _mm_aesenc_si128(state0, state0);
state1 = _mm_aesenc_si128(state1, state1);
}
// Seed-derived starting state for the AES-based hash functions in this file.
struct AESHashSeed
{
// scrambled key: one AES round over (mseed ^ mseed2), set by the constructor
__m128i state0;
// seed2 replicated into both 64-bit lanes, set by the constructor
__m128i mseed2;
// Both members below are declared with the AES function-target attribute
// (which expands to nothing on MSVC).
AESHashSeed(size_t seed, size_t seed2) QT_FUNCTION_TARGET(AES);
__m128i state1() const QT_FUNCTION_TARGET(AES);
};
// Builds the initial hash state from the two seeds.
//
// `seed` is placed in the low 64 bits of a vector and the upper four 16-bit
// words are filled from it; `seed2` is broadcast to both 64-bit lanes and kept
// in `mseed2`. The XOR of the two vectors is scrambled with one AES round and
// stored in `state0`.
Q_ALWAYS_INLINE AESHashSeed::AESHashSeed(size_t seed, size_t seed2)
{
    __m128i mseed = mm_cvtsz_si128(seed);
    mseed2 = mm_set1_epz(seed2);

    // NOTE(review): the two layout comments below say "len", but the value
    // actually inserted is short(seed), i.e. the low 16 bits of the seed.
    // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, 0, 0, 0 ]
    mseed = _mm_insert_epi16(mseed, short(seed), 4);
    // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, len, len, len ]
    mseed = _mm_shufflehi_epi16(mseed, 0);

    // merge with the process-global seed
    __m128i key = _mm_xor_si128(mseed, mseed2);

    // scramble the key
    __m128i state0 = _mm_aesenc_si128(key, key);
    this->state0 = state0;
}
// Returns a second 128-bit state, computed by one AES encryption round over
// the member state0 with the member mseed2 as the round key. Neither member
// is modified.
Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const
{
{
// unlike the Go code, we don't have more per-process seed
__m128i state1 = _mm_aesenc_si128(state0, mseed2);
return state1;
}
}
// Hashes the bytes remaining in [src, srcend) and folds the two states into
// the result. The return value is the low 64 bits of state0 ^ state1.
// state0 and state1 are passed by value, so the caller's copies are untouched.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_16to32(__m128i state0, __m128i state1, const __m128i *src, const
__m128i *srcend)
{
{
if (src + 1 < srcend) {
// epilogue: between 16 and 31 bytes
// second block is the last 16 bytes of the input, so it may overlap the first
hash2x16bytes(state0, state1, src, srcend - 1);
} else if (src != srcend) {
// epilogue: between 1 and 16 bytes, overlap with the end
__m128i data = _mm_loadu_si128(srcend - 1);
hash16bytes(state0, data);
}
// combine results:
state0 = _mm_xor_si128(state0, state1);
}
return mm_cvtsi128_sz(state0);
}
// Hashes an input of fewer than 16 bytes.
//
// The only caller in this file takes this path when len < sizeof(__m128i);
// the table offsets below (15 - len) rely on that. For len == 0 the state is
// returned unhashed. The result is the low 64 bits of the final state.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_lt16(__m128i state0, const uchar *p, size_t len)
{
    if (len) {
        // We're going to load 16 bytes and mask zero the part we don't care
        // (the hash of a short string is different from the hash of a longer
        // including NULLs at the end because the length is in the key)
        // WARNING: this may produce valgrind warnings, but it's safe

        constexpr quintptr PageSize = 4096;
        __m128i data;

        // The address bit tested here selects which half of a 4096-byte page
        // p lies in; the 16-byte load is then placed so that it stays inside
        // that page even though it reads bytes outside [p, p + len).
        if ((quintptr(p) & (PageSize / 2)) == 0) {
            // lower half of the page:
            // load all 16 bytes and mask off the bytes past the end of the source
            static const qint8 maskarray[] = {
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            };
            // starting at offset 15 - len yields len bytes of -1 followed by zeros
            __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(maskarray + 15 - len));
            data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
            data = _mm_and_si128(data, mask);
        } else {
            // upper half of the page:
            // load 16 bytes ending at the data end, then shuffle them to the beginning
            static const qint8 shufflecontrol[] = {
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            };
            // starting at offset 15 - len selects source bytes 16 - len .. 15 for
            // the first len output bytes; the -1 entries zero the remaining ones
            __m128i control = _mm_loadu_si128(reinterpret_cast<const __m128i *>(shufflecontrol + 15 - len));
            data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p + len) - 1);
            data = _mm_shuffle_epi8(data, control);
        }

        hash16bytes(state0, data);
    }
    return mm_cvtsi128_sz(state0);
}
// Hashes [src, srcend) two 16-byte blocks at a time, then hands whatever is
// left to aeshash128_16to32() and returns its result.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_ge32(__m128i state0, __m128i state1, const __m128i *src, const
__m128i *srcend)
{
// main loop: scramble two 16-byte blocks
// (the loop stops while at least one byte is still unprocessed, so the
// epilogue call below always has data to hash)
for ( ; src + 2 < srcend; src += 2)
hash2x16bytes(state0, state1, src, src + 1);
return aeshash128_16to32(state0, state1, src, srcend);
}
// Hashes `len` bytes starting at `p` with the 128-bit AES code paths.
//
// Builds the seed state from `seed` and `seed2`, then picks a helper by input
// length: under 16 bytes, 16 to 32 bytes, or more than 32 bytes.
static size_t QT_FUNCTION_TARGET(AES)
aeshash128(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
{
    AESHashSeed seeds(seed, seed2);
    const __m128i *const first = reinterpret_cast<const __m128i *>(p);
    const __m128i *const last = reinterpret_cast<const __m128i *>(p + len);

    if (len < sizeof(__m128i)) {
        // short input: a single masked or shuffled 16-byte load
        return aeshash128_lt16(seeds.state0, p, len);
    }

    if (len <= sizeof(__m256i)) {
        // medium input: at most two (possibly overlapping) 16-byte blocks
        return aeshash128_16to32(seeds.state0, seeds.state1(), first, last);
    }

    // long input: main loop followed by the epilogue
    return aeshash128_ge32(seeds.state0, seeds.state1(), first, last);
}
// Forwards to aeshash128() and returns its result unchanged. Unlike the
// helpers it calls, this wrapper carries no function-target attribute.
static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
{
    const size_t hash = aeshash128(p, len, seed, seed2);
    return hash;
}
extern size_t qt_qhash_seed;
size_t qHashBits(const void *p, size_t size, size_t seed) noexcept
{
size_t seed2 = size;
if (seed)
seed2 = qt_qhash_seed;
return aeshash(reinterpret_cast<const uchar *>(p), size, seed, seed2);
}