https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106533
--- Comment #3 from Vineet Gupta <vineetg at rivosinc dot com> --- FWIW, this was seen with a riscv64 build of gcc, but the same tree behavior is seen with aarch64 gcc 12.1. For the single copy-loop source, the final output is an inline copy loop: -->8-- <bb 11> [local count: 1063004409]: # j_131 = PHI <j_92(28), 0(10)> # ivtmp_135 = PHI <ivtmp_88(28), 10000000(10)> _10 = a[j_131]; c[j_131] = _10; j_92 = j_131 + 1; ivtmp_88 = ivtmp_135 - 1; if (ivtmp_88 != 0) goto <bb 28>; [99.00%] else goto <bb 12>; [1.00%] .L74: // ../stream-4-loop.c:315: c[j] = a[j]; ldr q0, [x27, x0] // MEM <vector(2) double> [(double *)&a + ivtmp.224_247 * 1], MEM <vector(2) double> [(double *)&a + ivtmp.224_247 * 1] str q0, [x19, x0] // MEM <vector(2) double> [(double *)&a + ivtmp.224_247 * 1], MEM <vector(2) double> [(double *)&c + ivtmp.224_247 * 1] add x0, x0, 16 // ivtmp.224, ivtmp.224, cmp x0, x28 // ivtmp.224, tmp291 bne .L74 -->8-- Whereas for the multi-loop source, we see the copy turned into a memcpy call: -->8-- MEM <unsigned char[80000000]> [(char * {ref-all})&c] = MEM <unsigned char[80000000]> [(char * {ref-all})&a]; bl memcpy -->8--