https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68793
ktkachov at gcc dot gnu.org changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |ktkachov at gcc dot gnu.org
--- Comment #4 from ktkachov at gcc dot gnu.org ---
The testcase doesn't compile for me.
Did you mean the below?
#include <arm_neon.h>
typedef unsigned int uint;
/*
 * Convert `count` 4-byte pixels from RGBA to BGRA byte order, i.e. swap
 * byte 0 and byte 2 of every pixel while leaving bytes 1 and 3 in place.
 *
 * src:   input pixels (count elements).
 * dst:   output pixels (count elements).
 * count: number of pixels to convert.
 *
 * Pixels are processed eight at a time with 64-bit NEON registers; any
 * remaining pixels are handled one by one in the scalar tail.
 * Assumes `uint` is 4 bytes wide, as the NEON path already does.
 */
void RGBA2BGRA_neon64(const uint* src, uint* dst, uint count)
{
    uint i = 0;

    /* Written as `count - i >= 8` instead of `i < count - 7`: count is
       unsigned, so `count - 7` wraps to a huge value when count < 7 and the
       vector loop would run past the end of both buffers. i never exceeds
       count here, so `count - i` cannot wrap. */
    for (; count - i >= 8; i += 8) {
        /* The 4-way structure load splits the pixels into per-byte lanes:
           val[k] holds byte k of each of the eight pixels. */
        uint8x8x4_t tmp = vld4_u8((const uint8_t*)(src + i));
        uint8x8x4_t tmp2 = { tmp.val[2], tmp.val[1], tmp.val[0], tmp.val[3] };
        vst4_u8((uint8_t*)(dst + i), tmp2);
    }

    /* Scalar tail. Done byte-wise so it performs exactly the same swap as
       the vector loop regardless of endianness (the previous mask/shift
       version swapped bytes 1 and 3 on little-endian targets). All four
       bytes are read before any is written so that src == dst also works. */
    for (; i < count; ++i) {
        const uint8_t *in = (const uint8_t*)(src + i);
        uint8_t *out = (uint8_t*)(dst + i);
        const uint8_t b0 = in[0];
        const uint8_t b1 = in[1];
        const uint8_t b2 = in[2];
        const uint8_t b3 = in[3];

        out[0] = b2;
        out[1] = b1;
        out[2] = b0;
        out[3] = b3;
    }
}
/*
 * Convert `count` 4-byte pixels from RGBA to BGRA byte order, i.e. swap
 * byte 0 and byte 2 of every pixel while leaving bytes 1 and 3 in place.
 *
 * src:   input pixels (count elements).
 * dst:   output pixels (count elements).
 * count: number of pixels to convert.
 *
 * Pixels are processed sixteen at a time with 128-bit NEON registers; any
 * remaining pixels are handled one by one in the scalar tail.
 * Assumes `uint` is 4 bytes wide, as the NEON path already does.
 */
void RGBA2BGRA_neon128(const uint* src, uint* dst, uint count)
{
    uint i = 0;

    /* Written as `count - i >= 16` instead of `i < count - 15`: count is
       unsigned, so `count - 15` wraps to a huge value when count < 15 and
       the vector loop would run past the end of both buffers. i never
       exceeds count here, so `count - i` cannot wrap. */
    for (; count - i >= 16; i += 16) {
        /* The 4-way structure load splits the pixels into per-byte lanes:
           val[k] holds byte k of each of the sixteen pixels. */
        uint8x16x4_t tmp = vld4q_u8((const uint8_t*)(src + i));
        uint8x16x4_t tmp2 = { tmp.val[2], tmp.val[1], tmp.val[0], tmp.val[3] };
        vst4q_u8((uint8_t*)(dst + i), tmp2);
    }

    /* Scalar tail. Done byte-wise so it performs exactly the same swap as
       the vector loop regardless of endianness (the previous mask/shift
       version swapped bytes 1 and 3 on little-endian targets). All four
       bytes are read before any is written so that src == dst also works. */
    for (; i < count; ++i) {
        const uint8_t *in = (const uint8_t*)(src + i);
        uint8_t *out = (uint8_t*)(dst + i);
        const uint8_t b0 = in[0];
        const uint8_t b1 = in[1];
        const uint8_t b2 = in[2];
        const uint8_t b3 = in[3];

        out[0] = b2;
        out[1] = b1;
        out[2] = b0;
        out[3] = b3;
    }
}
Can you please try a trunk compiler?
I do see the extra umovs with a GCC 5 compiler, but for me the latest trunk at
-O2 -mcpu=generic generates good code for that loop:
ld4 {v4.16b - v7.16b}, [x6]
orr v0.16b, v6.16b, v6.16b
orr v1.16b, v5.16b, v5.16b
orr v2.16b, v4.16b, v4.16b
orr v3.16b, v7.16b, v7.16b
st4 {v0.16b - v3.16b}, [x3]