https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68793

ktkachov at gcc dot gnu.org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |ktkachov at gcc dot gnu.org

--- Comment #4 from ktkachov at gcc dot gnu.org ---
The testcase doesn't compile for me.
Did you mean the below?
#include <arm_neon.h>

/* Shorthand for unsigned int. The functions below reinterpret each element
   as four consecutive bytes, i.e. they assume unsigned int is 32 bits wide. */
typedef unsigned int uint;

/*
 * Copy `count` 4-byte pixels from `src` to `dst`, exchanging byte 0 and
 * byte 2 of every pixel (R <-> B for RGBA byte order in memory) and leaving
 * bytes 1 and 3 unchanged.
 *
 * Pixels are processed 8 at a time with 64-bit NEON registers; the remaining
 * `count % 8` pixels are converted one at a time.  Any `count`, including 0,
 * is valid.
 */
void RGBA2BGRA_neon64(const uint* src, uint* dst, uint count)
{
    uint i = 0;

    /* `i <= count` always holds here, so `count - i` cannot wrap.  The former
       bound `i < count - 7` wrapped to a huge value whenever count < 7. */
    for (; count - i >= 8; i += 8) {
        /* De-interleave 8 pixels into four per-channel vectors. */
        uint8x8x4_t tmp = vld4_u8((const uint8_t*)(src + i));
        uint8x8x4_t tmp2 = { tmp.val[2], tmp.val[1], tmp.val[0], tmp.val[3] };
        vst4_u8((uint8_t*)(dst + i), tmp2);
    }

    /* Tail: work on bytes in memory order so the result matches the vector
       loop regardless of endianness. */
    for (; i < count; ++i) {
        const uint8_t* in = (const uint8_t*)(src + i);
        uint8_t* out = (uint8_t*)(dst + i);
        /* Read the whole pixel before writing, so dst == src is safe. */
        const uint8_t b0 = in[0];
        const uint8_t b1 = in[1];
        const uint8_t b2 = in[2];
        const uint8_t b3 = in[3];

        out[0] = b2;
        out[1] = b1;
        out[2] = b0;
        out[3] = b3;
    }
}

/*
 * Copy `count` 4-byte pixels from `src` to `dst`, exchanging byte 0 and
 * byte 2 of every pixel (R <-> B for RGBA byte order in memory) and leaving
 * bytes 1 and 3 unchanged.
 *
 * Pixels are processed 16 at a time with 128-bit NEON registers; the
 * remaining `count % 16` pixels are converted one at a time.  Any `count`,
 * including 0, is valid.
 */
void RGBA2BGRA_neon128(const uint* src, uint* dst, uint count)
{
    uint i = 0;

    /* `i <= count` always holds here, so `count - i` cannot wrap.  The former
       bound `i < count - 15` wrapped to a huge value whenever count < 15. */
    for (; count - i >= 16; i += 16) {
        /* De-interleave 16 pixels into four per-channel vectors. */
        uint8x16x4_t tmp = vld4q_u8((const uint8_t*)(src + i));
        uint8x16x4_t tmp2 = {tmp.val[2], tmp.val[1], tmp.val[0], tmp.val[3]};
        vst4q_u8((uint8_t*)(dst + i), tmp2);
    }

    /* Tail: work on bytes in memory order so the result matches the vector
       loop regardless of endianness. */
    for (; i < count; ++i) {
        const uint8_t* in = (const uint8_t*)(src + i);
        uint8_t* out = (uint8_t*)(dst + i);
        /* Read the whole pixel before writing, so dst == src is safe. */
        const uint8_t b0 = in[0];
        const uint8_t b1 = in[1];
        const uint8_t b2 = in[2];
        const uint8_t b3 = in[3];

        out[0] = b2;
        out[1] = b1;
        out[2] = b0;
        out[3] = b3;
    }
}

Can you please try a trunk compiler?
I do get the extra umovs with a GCC 5 compiler, but for me the latest trunk at
-O2 -mcpu=generic generates good code for that loop:
        ld4     {v4.16b - v7.16b}, [x6]
        orr     v0.16b, v6.16b, v6.16b
        orr     v1.16b, v5.16b, v5.16b
        orr     v2.16b, v4.16b, v4.16b
        orr     v3.16b, v7.16b, v7.16b
        st4     {v0.16b - v3.16b}, [x3]

Reply via email to