http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51980
--- Comment #4 from Ramana Radhakrishnan <ramana at gcc dot gnu.org> 2012-03-30 07:58:49 UTC --- Your testcase is broken - it doesn't honour reinterpret_casts properly . This is a better testcase.
#include <arm_neon.h>

/*
 * Testcase from this bug-report comment, built only from ARM NEON intrinsics.
 *
 * Takes two vectors of sixteen unsigned bytes and returns four 32-bit sums.
 * Each output lane is the sum of four squared absolute differences
 * |A[i] - B[i]|^2.
 *
 * NOTE(review): which four byte positions land in each output lane depends on
 * how vreinterpretq maps 16-bit lanes onto 32-bit lanes.  Presumably lane j
 * covers bytes 4j..4j+3 on a little-endian target -- confirm.
 * NOTE(review): the local names suggest r,g / b,a colour-channel pairs;
 * nothing in this snippet establishes the pixel format.
 */
uint32x4_t sqrlen4D_16u8( const uint8x16_t A, const uint8x16_t B )
{
  /* Per-byte absolute difference of the two inputs. */
  const uint8x16_t absAB = vabdq_u8( A, B );
  /* Widening 8x8 -> 16-bit multiply of each half with itself, i.e. the
     squares of the differences.  255 * 255 = 65025 still fits in 16 bits. */
  const uint16x8_t square_l = vmull_u8( vget_low_u8( absAB ), vget_low_u8( absAB ) );
  const uint16x8_t square_h = vmull_u8( vget_high_u8( absAB ), vget_high_u8( absAB ) );
  /* View the squares as 32-bit lanes (two 16-bit squares per lane) and
     de-interleave them: val[0] receives the even-numbered 32-bit lanes of
     the two inputs, val[1] the odd-numbered ones.  These reinterpret casts
     are what the bug-report comment says the earlier testcase got wrong. */
  const uint32x4x2_t rgrgrgrg_babababa = vuzpq_u32( vreinterpretq_u32_u16 (square_l), vreinterpretq_u32_u16 (square_h) );
  /* Back to 16-bit lanes so the pairwise widening adds below can be used. */
  const uint16x8_t rgrgrgrg = vreinterpretq_u16_u32 (rgrgrgrg_babababa.val[0]);
  const uint16x8_t babababa = vreinterpretq_u16_u32 (rgrgrgrg_babababa.val[1]);
  /* Add each adjacent pair of 16-bit squares into one 32-bit lane. */
  const uint32x4_t rpg_rpg_rpg_rpg = vpaddlq_u16( rgrgrgrg );
  /* Pairwise-add the second set of squares and accumulate onto the first,
     giving a sum of four squares per 32-bit lane. */
  const uint32x4_t dp = vpadalq_u16( rpg_rpg_rpg_rpg, babababa );
  return ( dp );
}