------- Comment #4 from siarhei dot siamashka at gmail dot com 2010-06-15 20:34 ------- (In reply to comment #3) > Or use multiple alternatives feature for inline assembly constraints to emit > either VMOV or VLD1?
Well, this kind of works :) But is very ugly and fragile: /***************************************/ #include <arm_neon.h> /* Override a slow 'vdup_n_f32' intrinsic with something better */ static inline float32x2_t vdup_n_f32_fast(float x) { float32x2_t result; asm ( ".set vdup_n_f32_fast_CODE_EMITTED,0\n" ".irp regname,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14\n" ".ifeqs \"\\regname\", \"%1\"\n" " vdup.32 %P0, %1\n" " .set vdup_n_f32_fast_CODE_EMITTED,1\n" ".endif\n" ".ifeqs \"[\\regname, #0]\", \"%1\"\n" " vld1.f32 {%P0[]}, [\\regname, :32]\n" " .set vdup_n_f32_fast_CODE_EMITTED,1\n" ".endif\n" ".endr\n" ".if vdup_n_f32_fast_CODE_EMITTED == 0\n" ".error \"Fixme: icky macros from 'vdup_n_f32_fast' failed\"\n" ".endif\n" : "=w,w" (result) : "r,Q" (x) : "memory"); return result; } #define vdup_n_f32(x) vdup_n_f32_fast(x) /* Now let's test it for accessing data in registers */ float neon_add_regs(float a, float b) { float32x2_t tmp1, tmp2; tmp1 = vdup_n_f32(a); tmp2 = vdup_n_f32(b); tmp1 = vadd_f32(tmp1, tmp2); return vget_lane_f32(tmp1, 0); } /* ... and in memory */ void neon_add_mem(float * __restrict out, float * __restrict a, float * __restrict b) { float32x2_t tmp1, tmp2; tmp1 = vdup_n_f32(*a); tmp2 = vdup_n_f32(*b); tmp1 = vadd_f32(tmp1, tmp2); *out = vget_lane_f32(tmp1, 0); } /***************************************/ $ objdump -d test.o 00000000 <neon_add_mem>: 0: f4e10c9f vld1.32 {d16[]}, [r1, :32] 4: f4e21c9f vld1.32 {d17[]}, [r2, :32] 8: f2400da1 vadd.f32 d16, d16, d17 c: f4c0080f vst1.32 {d16[0]}, [r0] 10: e12fff1e bx lr 00000014 <neon_add_regs>: 14: ee800b90 vdup.32 d16, r0 18: ee811b90 vdup.32 d17, r1 1c: f2400da1 vadd.f32 d16, d16, d17 20: ee100b90 vmov.32 r0, d16[0] 24: e12fff1e bx lr -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364