Hi,
The attached patch replaces calls to builtins in the vmul_n* intrinsics
with __a * __b. For the float variants, the multiplication is gated on
__FAST_MATH__, so the builtin is still used unless -ffast-math is in
effect. Since we are not removing all calls to the builtins, I am not
sure whether we should also remove the vmul_n entry from
arm_neon_builtins.def?
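
For context, the __a * __b form relies on GCC's generic vector
extensions, which splat a scalar operand across the lanes of the vector
operand. A minimal standalone sketch (my own illustration, not part of
the patch):

typedef short v4hi __attribute__ ((vector_size (8)));

v4hi
mul_n_sketch (v4hi a, short b)
{
  /* b is broadcast to all four lanes, i.e. each lane of a is
     multiplied by b, which matches the vmul_n semantics.  */
  return a * b;
}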

Testing the patch showed fallout in armv8_2-fp16-neon-2.c: the patch
generates better code, so the existing scan-assembler patterns no
longer match.
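
For reference, the function shown in the diff below comes from
BINOP_N_TEST (vmul) in the test; its expansion is roughly equivalent to
(my paraphrase, assuming the macro in the test file):

float16x4_t
test_vmul_n_16x4 (float16x4_t a, float16_t b)
{
  return vmul_n_f16 (a, b);
}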
Code-gen diff:

--- armv8_2-fp16-neon-2.s       2021-05-26 11:34:30.870304900 +0530
+++ armv8_2-fp16-neon-2-after.s 2021-05-26 11:19:13.990304900 +0530
@@ -84,21 +84,9 @@
 test_vmul_n_16x4:
-       @ args = 0, pretend = 0, frame = 8
+       @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
-       sub     sp, sp, #8
-       vldr    d7, .L8
-       add     r3, sp, #6
-       vst1.16 {d1[0]}, [r3]
-       vld1.16 {d7[0]}, [r3]
-       vmul.f16        d0, d0, d7[0]
-       add     sp, sp, #8
-       @ sp needed
+       vmov.f16        r3, s2  @ __fp16
+       vdup.16 d16, r3
+       vmul.f16        d0, d16, d0
        bx      lr
-.L9:
-       .align  3
-.L8:
-       .short  0
-       .short  0
-       .short  0
-       .short  0
        .size   test_vmul_n_16x4, .-test_vmul_n_16x4
@@ -113,21 +101,9 @@
 test_vmul_n_16x8:
-       @ args = 0, pretend = 0, frame = 8
+       @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
-       sub     sp, sp, #8
-       vldr    d7, .L12
-       add     r3, sp, #6
-       vst1.16 {d2[0]}, [r3]
-       vld1.16 {d7[0]}, [r3]
-       vmul.f16        q0, q0, d7[0]
-       add     sp, sp, #8
-       @ sp needed
+       vmov.f16        r3, s4  @ __fp16
+       vdup.16 q8, r3
+       vmul.f16        q0, q8, q0
        bx      lr
-.L13:
-       .align  3
-.L12:
-       .short  0
-       .short  0
-       .short  0
-       .short  0
        .size   test_vmul_n_16x8, .-test_vmul_n_16x8

I have adjusted the test accordingly.
OK to commit if testing passes?

Thanks,
Prathamesh
2021-05-26  Prathamesh Kulkarni  <prathamesh.kulka...@linaro.org>

        PR target/66791
        * config/arm/arm_neon.h (vmul_n_s16): Replace call to builtin
        with __a * __b.
        (vmul_n_s32): Likewise.
        (vmul_n_u16): Likewise.
        (vmul_n_u32): Likewise.
        (vmulq_n_s16): Likewise.
        (vmulq_n_s32): Likewise.
        (vmulq_n_u16): Likewise.
        (vmulq_n_u32): Likewise.
        (vmul_n_f32): Gate __a * __b on __FAST_MATH__.
        (vmulq_n_f32): Likewise.
        (vmul_n_f16): Likewise.
        (vmulq_n_f16): Likewise.

testsuite/
        * gcc.target/arm/armv8_2-fp16-neon-2.c: Adjust.

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index dcd533fd003..8ac00774e6c 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8331,70 +8331,78 @@ __extension__ extern __inline int16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_s16 (int16x4_t __a, int16_t __b)
 {
-  return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_s32 (int32x2_t __a, int32_t __b)
 {
-  return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline float32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_f32 (float32x2_t __a, float32_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b);
+#endif
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_u16 (uint16x4_t __a, uint16_t __b)
 {
-  return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_u32 (uint32x2_t __a, uint32_t __b)
 {
-  return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_f32 (float32x4_t __a, float32_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b);
+#endif
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x4_t
@@ -17661,7 +17669,11 @@ __extension__ extern __inline float16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_f16 (float16x4_t __a, float16_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return __builtin_neon_vmul_nv4hf (__a, __b);
+#endif
 }
 
 __extension__ extern __inline float16x8_t
@@ -17686,7 +17698,11 @@ __extension__ extern __inline float16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_f16 (float16x8_t __a, float16_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return __builtin_neon_vmul_nv8hf (__a, __b);
+#endif
 }
 
 __extension__ extern __inline float16x4_t
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
index 50f689352ca..2d26bc0ac26 100644
--- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
@@ -327,13 +327,13 @@ BINOP_TEST (vminnm)
 
 BINOP_TEST (vmul)
 /* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
-   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } }  */
+   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }  */
 BINOP_LANE_TEST (vmul, 2)
 /* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } }
    { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } }  */
 BINOP_N_TEST (vmul)
-/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } }
-   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/
+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
+   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }*/
 
 float16x4_t
 test_vpadd_16x4 (float16x4_t a, float16x4_t b)
@@ -387,7 +387,7 @@ test_vdup_n_f16 (float16_t a)
 {
   return vdup_n_f16 (a);
 }
-/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 3 } }  */
 
 float16x8_t
 test_vmovq_n_f16 (float16_t a)
