/*******/
#include <arm_neon.h>

/*
 * Add the float at *a to the float at *b and store the sum in *out.
 *
 * Each scalar is inserted into lane 0 of a 64-bit NEON vector with
 * vset_lane_f32, the vectors are added with vadd_f32, and lane 0 of the
 * result is written back.  Lane 1 of both vectors is zero and its sum is
 * discarded.
 *
 * out: destination for the sum (one float is written).
 * a:   first addend (one float is read).
 * b:   second addend (one float is read).
 */
void neon_add(float * __restrict out, float * __restrict a, float * __restrict b)
{
    /*
     * vset_lane_f32 takes the existing vector as an input operand, so the
     * vectors must hold defined values before a lane is replaced; passing
     * an uninitialized variable would read an indeterminate value.
     */
    float32x2_t tmp1 = vdup_n_f32(0.0f);
    float32x2_t tmp2 = vdup_n_f32(0.0f);

    tmp1 = vset_lane_f32(*a, tmp1, 0);
    tmp2 = vset_lane_f32(*b, tmp2, 0);
    tmp1 = vadd_f32(tmp1, tmp2);
    *out = vget_lane_f32(tmp1, 0);
}
/*******/
Generated code:

00000000 <neon_add>:
   0: e5913000  ldr      r3, [r1]
   4: eddf0b07  vldr     d16, [pc, #28]   ; 28 <neon_add+0x28>
   8: e5922000  ldr      r2, [r2]
   c: eddf1b05  vldr     d17, [pc, #20]   ; 28 <neon_add+0x28>
  10: ee003b90  vmov.32  d16[0], r3
  14: ee012b90  vmov.32  d17[0], r2
  18: f2400da1  vadd.f32 d16, d16, d17
  1c: f4c0080f  vst1.32  {d16[0]}, [r0]
  20: e12fff1e  bx       lr
  24: e1a00000  nop      (mov r0,r0)

GCC fails to load the lane with the single instruction

  vld1.32 {d16[0]}, [r1]

and instead emits the following three-instruction sequence:

   0: e5913000  ldr      r3, [r1]
   4: eddf0b07  vldr     d16, [pc, #28]   ; 28 <neon_add+0x28>
  10: ee003b90  vmov.32  d16[0], r3

--
Summary: Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
Product: gcc
Version: 4.4.3
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: siarhei dot siamashka at gmail dot com
GCC build triplet: arm-unknown-linux-gnueabi
GCC host triplet: arm-unknown-linux-gnueabi
GCC target triplet: arm-unknown-linux-gnueabi

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364