https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68793
ktkachov at gcc dot gnu.org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |ktkachov at gcc dot gnu.org

--- Comment #4 from ktkachov at gcc dot gnu.org ---
The testcase doesn't compile for me.
Did you mean the below?

#include <arm_neon.h>

typedef unsigned int uint;

void RGBA2BGRA_neon64(const uint* src, uint* dst, uint count)
{
    uint i = 0;
    for (; i < count - 7; i += 8) {
        uint8x8x4_t tmp = vld4_u8((const uint8_t*)(src + i));
        uint8x8x4_t tmp2 = { tmp.val[2], tmp.val[1], tmp.val[0], tmp.val[3] };
        vst4_u8((uint8_t*)(dst + i), tmp2);
    }
    for (; i < count; ++i) {
        dst[i] = src[i] & 0x00ff00ff;
        uint tmp = src[i] & 0xff00ff00;
        dst[i] |= (tmp << 16) | (tmp >> 16);
    }
}

void RGBA2BGRA_neon128(const uint* src, uint* dst, uint count)
{
    uint i = 0;
    for (; i < count - 15; i += 16) {
        uint8x16x4_t tmp = vld4q_u8((const uint8_t*)(src + i));
        uint8x16x4_t tmp2 = {tmp.val[2], tmp.val[1], tmp.val[0], tmp.val[3]};
        vst4q_u8((uint8_t*)(dst + i), tmp2);
    }
    for (; i < count; ++i) {
        dst[i] = src[i] & 0x00ff00ff;
        uint tmp = src[i] & 0xff00ff00;
        dst[i] |= (tmp << 16) | (tmp >> 16);
    }
}

Can you please try a trunk compiler?
I indeed get the extra umovs with a GCC 5 compiler but latest trunk at -O2
-mcpu=generic for me generates the good code for that loop:

        ld4     {v4.16b - v7.16b}, [x6]
        orr     v0.16b, v6.16b, v6.16b
        orr     v1.16b, v5.16b, v5.16b
        orr     v2.16b, v4.16b, v4.16b
        orr     v3.16b, v7.16b, v7.16b
        st4     {v0.16b - v3.16b}, [x3]