https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68793
--- Comment #7 from ktkachov at gcc dot gnu.org ---
(In reply to Allan Jensen from comment #6)
> I mean the neon64 case, not 32-bit.
Seems so. I get:
// Compiler output for
//   RGBA2BGRA_neon64(const unsigned int *, unsigned int *, unsigned int)
// (signature demangled from the label below).
//
// Register use, as read from the code:
//   x0  base address of the words that are loaded   (source)
//   x1  base address of the words that are stored   (destination)
//   w2  n, the number of 32-bit words
//   w5  i, the current word index
//   w7  n - 7, the bound of the vector loop
// Registers written: x2-x5, x7, v0-v7 and the condition flags.
// The function makes no calls and does not touch the stack.
//
// Structure: loop .L8 converts 8 words per iteration with ld4/st4;
// loop .L7 then converts one word per iteration until index n is reached.
_Z16RGBA2BGRA_neon64PKjPjj:
.LFB3215:
.cfi_startproc
// w7 = n - 7 (32-bit, wraps for n < 7); the flags from this subtraction are
// consumed by the beq below, since the intervening mov leaves them untouched.
subs w7, w2, #7
// i = 0
mov w5, 0
// Skip the vector loop only when n - 7 == 0, i.e. n == 7.
// For n < 7 the subtraction has wrapped to a large unsigned value and the
// vector loop is still entered.
// NOTE(review): presumably this mirrors an unsigned "i < n - 7" test in the
// C++ source - confirm that callers never pass n < 7.
beq .L4
.p2align 2
// ---- Vector loop: 8 words (32 bytes) per iteration ----
.L8:
// x3 = (zero-extended i) * 4 = byte offset of word i
ubfiz x3, x5, 2, 32
// i += 8
add w5, w5, 8
// x4 = load address, x3 = store address for this group of 8 words
add x4, x0, x3
add x3, x1, x3
// Compare the updated i with n - 7; the result is used by the bcc at the
// bottom of the loop (the vector instructions in between do not set flags).
cmp w5, w7
// De-interleaving load of 32 bytes: v4 receives the bytes at offsets 0,4,8,...
// v5 those at 1,5,9,..., v6 those at 2,6,10,..., v7 those at 3,7,11,...
ld4 {v4.8b - v7.8b}, [x4]
// Copy into the consecutive registers st4 needs, exchanging the first and
// third byte planes (v0 <- v6, v2 <- v4); v5 and v7 keep their positions.
mov v0.8b, v6.8b
mov v1.8b, v5.8b
mov v2.8b, v4.8b
mov v3.8b, v7.8b
// Interleaving store: each 4-byte group is written back with the bytes at
// memory offsets 0 and 2 swapped.
st4 {v0.8b - v3.8b}, [x3]
// Continue while i < n - 7 (unsigned).
bcc .L8
// ---- Scalar loop setup ----
.L4:
// Nothing left to do if i >= n (unsigned).
cmp w5, w2
bcs .L10
// x3 = zero-extended i
uxtw x3, w5
// x2 = n - 1 - i; this cannot wrap here because i < n was just established.
sub w2, w2, #1
sub w2, w2, w5
// x5 = (i + 1) + (n - 1 - i) = n, now as a 64-bit value
add x5, x3, 1
add x5, x2, x5
// x2 = byte offset of word i (running offset), x5 = byte offset of word n (end)
lsl x2, x3, 2
lsl x5, x5, 2
.p2align 2
// ---- Scalar loop: one word per iteration ----
.L7:
// w4 = source word & 0x00FF00FF (bits 7:0 and 23:16 stay where they are)
ldr w3, [x0, x2]
and w4, w3, 16711935
// The partial result is written to the destination first ...
str w4, [x1, x2]
// ... and the source word is loaded again afterwards. If the destination
// overlaps the source, this load observes the value just stored.
ldr w3, [x0, x2]
// w3 = source word & 0xFF00FF00 (bits 15:8 and 31:24)
and w3, w3, -16711936
// Rotating the masked value right by 16 swaps those two bytes; OR it with
// the two bytes kept in w4. The operand (32 - 16) evaluates to 16.
// NOTE(review): this swaps the bytes under mask 0xFF00FF00, whereas the
// vector loop swaps memory bytes 0 and 2 of each word. The two agree on a
// big-endian target only - confirm against the C++ source and the target
// endianness.
orr w3, w4, w3, ror (32 - 16)
// Store the final word over the partial result.
str w3, [x1, x2]
// Advance to the next word and loop until the end offset is reached.
add x2, x2, 4
cmp x2, x5
bne .L7
ret
// Early exit taken when no words remain after the vector loop.
.L10:
ret