On 19 January 2016 at 15:51, Alan Lawrence <alan.lawre...@foss.arm.com> wrote:
> On 19/01/16 11:15, Christophe Lyon wrote:
>
>>>> For neon_vdupn, I chose to implement neon_vdup_nv4hf and
>>>> neon_vdup_nv8hf instead of updating the VX iterator because I thought
>>>> it was not desirable to impact neon_vrev32<mode>.
>>>
>>> Well, the same instruction will suffice for vrev32'ing vectors of HF
>>> just as well as vectors of HI, so I think I'd argue that's harmless
>>> enough. To gain the benefit, we'd need to update arm_evpc_neon_vrev
>>> with a few new cases, though.
>>>
>> Since this is more intrusive, I'd rather leave that part for later. OK?
>
> Sure.
>
>>>> +#ifdef __ARM_BIG_ENDIAN
>>>> +  /* Here, 3 is (4-1) where 4 is the number of lanes. This is also the
>>>> +     right value for vectors with 8 lanes.  */
>>>> +#define __arm_lane(__vec, __idx) (__idx ^ 3)
>>>> +#else
>>>> +#define __arm_lane(__vec, __idx) __idx
>>>> +#endif
>>>> +
>>>
>>> Looks right, but sounds... my concern here is that I'm hoping at some
>>> point we will move the *other* vget/set_lane intrinsics to use GCC
>>> vector extensions too. At which time (unlike __aarch64_lane, which can
>>> be used everywhere) this will be the wrong formula. Can we name (and/or
>>> comment) it to avoid misleading anyone? The key characteristic seems to
>>> be that it is for vectors of 16-bit elements only.
>>>
>> I'm not sure I follow, here. Looking at the patterns for
>> neon_vget_lane<mode>_*internal in neon.md, I can see 2 flavours: one for
>> VD, one for VQ2. The latter uses "halfelts".
>>
>> Do you prefer that I create 2 macros (say __arm_lane and __arm_laneq),
>> that would be similar to the aarch64 ones (by computing the number of
>> lanes of the input vector), but the "q" one would use half the total
>> number of lanes instead?
>
> That works for me! Sthg like:
>
> #define __arm_lane(__vec, __idx) NUM_LANES(__vec) - __idx
> #define __arm_laneq(__vec, __idx) (__idx & (NUM_LANES(__vec)/2)) +
>                                   (NUM_LANES(__vec)/2 - __idx)
> // or similarly
> #define __arm_laneq(__vec, __idx) (__idx ^ (NUM_LANES(__vec)/2 - 1))
>
> Alternatively I'd been thinking
>
> #define __arm_lane_32xN(__idx) __idx ^ 1
> #define __arm_lane_16xN(__idx) __idx ^ 3
> #define __arm_lane_8xN(__idx) __idx ^ 7
>
> Bear in mind PR64893 that we had on AArch64 :-(
>
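To make the chosen mapping concrete: below is a small stand-alone sketch (illustration only, not part of the patch) of how the new lane formulas behave for vectors of 16-bit elements on big-endian. NUM_LANES, LANE_D and LANE_Q are hypothetical stand-ins for __ARM_NUM_LANES, __arm_lane and __arm_laneq, with the big-endian definitions hard-coded so the sketch can run anywhere.

/* Illustration only: big-endian lane remapping for 16-bit elements.
   LANE_D models __arm_lane (64-bit vectors), LANE_Q models
   __arm_laneq (128-bit vectors).  */
#include <stdio.h>

#define NUM_LANES(vec) (sizeof (vec) / sizeof ((vec)[0]))
#define LANE_D(vec, idx) ((idx) ^ (NUM_LANES (vec) - 1))
#define LANE_Q(vec, idx) ((idx) ^ (NUM_LANES (vec) / 2 - 1))

int
main (void)
{
  short d[4]; /* models float16x4_t (one D register)      */
  short q[8]; /* models float16x8_t (one Q register)      */

  for (unsigned i = 0; i < 4; i++)
    printf ("D lane %u -> vector index %u\n", i, (unsigned) LANE_D (d, i));
  /* Prints 0->3, 1->2, 2->1, 3->0: the whole 64-bit vector is reversed.  */

  for (unsigned i = 0; i < 8; i++)
    printf ("Q lane %u -> vector index %u\n", i, (unsigned) LANE_Q (q, i));
  /* Prints 0->3 ... 3->0 and 4->7 ... 7->4: lanes are reversed only
     within each 64-bit half of the Q register.  */
  return 0;
}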
Here is a new version, based on the comments above.
I've also removed the addition of the arm_fp_ok effective target since
I added that in my other testsuite patch.

OK now?

Thanks,

Christophe

> Cheers, Alan
gcc/ChangeLog:

2016-01-20  Christophe Lyon  <christophe.l...@linaro.org>

        PR target/68620
        * config/arm/arm.c (neon_valid_immediate): Handle FP16 vectors.
        * config/arm/arm_neon.h (__ARM_NUM_LANES, __arm_lane, __arm_laneq):
        New helper macros.
        (vget_lane_f16): Handle big-endian.
        (vgetq_lane_f16): Likewise.
        (vset_lane_f16): Likewise.
        (vsetq_lane_f16): Likewise.
        * config/arm/iterators.md (VQXMOV): Add V8HF.
        (VDQ): Add V4HF and V8HF.
        (V_reg): Handle V4HF and V8HF.
        (Is_float_mode): Likewise.
        * config/arm/neon.md (movv4hf, movv8hf, neon_vdup_nv4hf)
        (neon_vdup_nv8hf): New patterns.
        (vec_set<mode>_internal, vec_extract<mode>, neon_vld1_dup<mode>):
        Use VD_LANE iterator.
        (neon_vld1_dup<mode>): Use VQ2 iterator.

gcc/testsuite/ChangeLog:

2016-01-20  Christophe Lyon  <christophe.l...@linaro.org>

        PR target/68620
        * gcc.target/arm/pr68620.c: New test.
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 3588b83..b1f408c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12370,6 +12370,10 @@ neon_valid_immediate (rtx op, machine_mode mode, int inverse,
       if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
         return -1;
 
+      /* FP16 vectors cannot be represented.  */
+      if (innersize == 2)
+        return -1;
+
       r0 = CONST_DOUBLE_REAL_VALUE (el0);
 
       for (i = 1; i < n_elts; i++)
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 0a33d21..69b28c8 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -5252,14 +5252,26 @@ vget_lane_s32 (int32x2_t __a, const int __b)
    were marked always-inline so there were no call sites, the declaration
    would nonetheless raise an error.  Hence, we must use a macro instead.  */
 
-#define vget_lane_f16(__v, __idx)		\
-  __extension__					\
-    ({						\
-      float16x4_t __vec = (__v);		\
-      __builtin_arm_lane_check (4, __idx);	\
-      float16_t __res = __vec[__idx];		\
-      __res;					\
-    })
+  /* For big-endian, GCC's vector indices are reversed within each 64
+     bits compared to the architectural lane indices used by Neon
+     intrinsics.  */
+#ifdef __ARM_BIG_ENDIAN
+#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
+#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
+#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
+#else
+#define __arm_lane(__vec, __idx) __idx
+#define __arm_laneq(__vec, __idx) __idx
+#endif
+
+#define vget_lane_f16(__v, __idx)			\
+  __extension__						\
+    ({							\
+      float16x4_t __vec = (__v);			\
+      __builtin_arm_lane_check (4, __idx);		\
+      float16_t __res = __vec[__arm_lane(__vec, __idx)]; \
+      __res;						\
+    })
 #endif
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
@@ -5329,14 +5341,14 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
 }
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-#define vgetq_lane_f16(__v, __idx)		\
-  __extension__					\
-    ({						\
-      float16x8_t __vec = (__v);		\
-      __builtin_arm_lane_check (8, __idx);	\
-      float16_t __res = __vec[__idx];		\
-      __res;					\
-    })
+#define vgetq_lane_f16(__v, __idx)			\
+  __extension__						\
+    ({							\
+      float16x8_t __vec = (__v);			\
+      __builtin_arm_lane_check (8, __idx);		\
+      float16_t __res = __vec[__arm_laneq(__vec, __idx)]; \
+      __res;						\
+    })
 #endif
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
@@ -5408,13 +5420,13 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 #define vset_lane_f16(__e, __v, __idx)		\
   __extension__					\
-    ({						\
-      float16_t __elem = (__e);			\
-      float16x4_t __vec = (__v);		\
-      __builtin_arm_lane_check (4, __idx);	\
-      __vec[__idx] = __elem;			\
-      __vec;					\
-    })
+    ({							\
+      float16_t __elem = (__e);				\
+      float16x4_t __vec = (__v);			\
+      __builtin_arm_lane_check (4, __idx);		\
+      __vec[__arm_lane (__vec, __idx)] = __elem;	\
+      __vec;						\
+    })
 #endif
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -5486,13 +5498,13 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 #define vsetq_lane_f16(__e, __v, __idx)		\
   __extension__					\
-    ({						\
-      float16_t __elem = (__e);			\
-      float16x8_t __vec = (__v);		\
-      __builtin_arm_lane_check (8, __idx);	\
-      __vec[__idx] = __elem;			\
-      __vec;					\
-    })
+    ({							\
+      float16_t __elem = (__e);				\
+      float16x8_t __vec = (__v);			\
+      __builtin_arm_lane_check (8, __idx);		\
+      __vec[__arm_laneq (__vec, __idx)] = __elem;	\
+      __vec;						\
+    })
 #endif
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 6a54125..88e1c3d 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -99,7 +99,7 @@
 (define_mode_iterator VQI [V16QI V8HI V4SI])
 
 ;; Quad-width vector modes, with TImode added, for moves.
-(define_mode_iterator VQXMOV [V16QI V8HI V4SI V4SF V2DI TI])
+(define_mode_iterator VQXMOV [V16QI V8HI V8HF V4SI V4SF V2DI TI])
 
 ;; Opaque structure types wider than TImode.
 (define_mode_iterator VSTRUCT [EI OI CI XI])
@@ -114,7 +114,7 @@
 (define_mode_iterator VN [V8HI V4SI V2DI])
 
 ;; All supported vector modes (except singleton DImode).
-(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DI])
+(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V4HF V8HF V2SF V4SF V2DI])
 
 ;; All supported vector modes (except those with 64-bit integer elements).
 (define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -424,6 +424,7 @@
 ;; Register width from element mode
 (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
                          (V4HI "P") (V8HI  "q")
+                         (V4HF "P") (V8HF  "q")
                          (V2SI "P") (V4SI  "q")
                          (V2SF "P") (V4SF  "q")
                          (DI   "P") (V2DI  "q")
@@ -572,6 +573,7 @@
 (define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
                                  (V4HI "false") (V8HI "false")
                                  (V2SI "false") (V4SI "false")
+                                 (V4HF "true") (V8HF "true")
                                  (V2SF "true") (V4SF "true")
                                  (DI "false") (V2DI "false")])
 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 62fb6da..5e0aed2 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -137,6 +137,30 @@
     }
 })
 
+(define_expand "movv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand")
+        (match_operand:V4HF 1 "s_register_operand"))]
+  "TARGET_NEON && TARGET_FP16"
+{
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+        operands[1] = force_reg (V4HFmode, operands[1]);
+    }
+})
+
+(define_expand "movv8hf"
+  [(set (match_operand:V8HF 0 "")
+        (match_operand:V8HF 1 ""))]
+  "TARGET_NEON && TARGET_FP16"
+{
+  if (can_create_pseudo_p ())
+    {
+      if (!REG_P (operands[0]))
+        operands[1] = force_reg (V8HFmode, operands[1]);
+    }
+})
+
 (define_insn "*neon_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "=w,Ut,w")
         (match_operand:VSTRUCT 1 "general_operand"       " w,w, Ut"))]
@@ -299,11 +323,11 @@
   [(set_attr "type" "neon_load1_1reg<q>")])
 
 (define_insn "vec_set<mode>_internal"
-  [(set (match_operand:VD 0 "s_register_operand" "=w,w")
-        (vec_merge:VD
-          (vec_duplicate:VD
+  [(set (match_operand:VD_LANE 0 "s_register_operand" "=w,w")
+        (vec_merge:VD_LANE
+          (vec_duplicate:VD_LANE
             (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r"))
-          (match_operand:VD 3 "s_register_operand" "0,0")
+          (match_operand:VD_LANE 3 "s_register_operand" "0,0")
           (match_operand:SI 2 "immediate_operand" "i,i")))]
   "TARGET_NEON"
 {
@@ -385,7 +409,7 @@
 (define_insn "vec_extract<mode>"
   [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r")
         (vec_select:<V_elem>
-          (match_operand:VD 1 "s_register_operand" "w,w")
+          (match_operand:VD_LANE 1 "s_register_operand" "w,w")
           (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))]
   "TARGET_NEON"
 {
@@ -2806,6 +2830,22 @@ if (BYTES_BIG_ENDIAN)
   [(set_attr "type" "neon_from_gp<q>")]
 )
 
+(define_insn "neon_vdup_nv4hf"
+  [(set (match_operand:V4HF 0 "s_register_operand" "=w")
+        (vec_duplicate:V4HF (match_operand:HF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%P0, %1"
+  [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8hf"
+  [(set (match_operand:V8HF 0 "s_register_operand" "=w")
+        (vec_duplicate:V8HF (match_operand:HF 1 "s_register_operand" "r")))]
+  "TARGET_NEON"
+  "vdup.16\t%q0, %1"
+  [(set_attr "type" "neon_from_gp_q")]
+)
+
 (define_insn "neon_vdup_n<mode>"
   [(set (match_operand:V32 0 "s_register_operand" "=w,w")
         (vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
@@ -4305,8 +4345,8 @@ if (BYTES_BIG_ENDIAN)
 )
 
 (define_insn "neon_vld1_dup<mode>"
-  [(set (match_operand:VD 0 "s_register_operand" "=w")
-        (vec_duplicate:VD (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+  [(set (match_operand:VD_LANE 0 "s_register_operand" "=w")
+        (vec_duplicate:VD_LANE (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
   "TARGET_NEON"
   "vld1.<V_sz_elem>\t{%P0[]}, %A1"
   [(set_attr "type" "neon_load1_all_lanes<q>")]
@@ -4322,8 +4362,8 @@ if (BYTES_BIG_ENDIAN)
 )
 
 (define_insn "neon_vld1_dup<mode>"
-  [(set (match_operand:VQ 0 "s_register_operand" "=w")
-        (vec_duplicate:VQ (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
+  [(set (match_operand:VQ2 0 "s_register_operand" "=w")
+        (vec_duplicate:VQ2 (match_operand:<V_elem> 1 "neon_struct_operand" "Um")))]
   "TARGET_NEON"
 {
   return "vld1.<V_sz_elem>\t{%e0[], %f0[]}, %A1";
diff --git a/gcc/testsuite/gcc.target/arm/pr68620.c b/gcc/testsuite/gcc.target/arm/pr68620.c
new file mode 100644
index 0000000..984992f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr68620.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-options "-mfp16-format=ieee" } */
+/* { dg-add-options arm_fp } */
+
+#include "arm_neon.h"
+
+float16x4_t __attribute__((target("fpu=neon-fp16")))
+foo (float32x4_t arg)
+{
+  return vcvt_f16_f32 (arg);
+}
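For reference, a hypothetical companion test (not included in this patch) could exercise the big-endian lane handling of the updated intrinsics along these lines; the function names are illustrative only, and the dg directives simply mirror pr68620.c above:

/* Hypothetical follow-up test, not part of this patch.  */
/* { dg-do compile } */
/* { dg-require-effective-target arm_fp_ok } */
/* { dg-options "-mfp16-format=ieee" } */
/* { dg-add-options arm_fp } */

#include "arm_neon.h"

float16_t __attribute__((target("fpu=neon-fp16")))
get_lane2 (float16x8_t v)
{
  /* Architectural lane 2, regardless of endianness.  */
  return vgetq_lane_f16 (v, 2);
}

float16x4_t __attribute__((target("fpu=neon-fp16")))
set_lane1 (float16x4_t v, float16_t x)
{
  return vset_lane_f16 (x, v, 1);
}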