On 7 October 2015 at 17:09, James Greenhalgh <james.greenha...@arm.com> wrote:
> On Tue, Sep 15, 2015 at 05:25:25PM +0100, Christophe Lyon wrote:
>> This patch re-implements vtbl[34] and vtbx4 AdvSIMD intrinsics using
>> existing builtins, and fixes the behaviour on aarch64_be.
>>
>> Tested on aarch64_be-none-elf and aarch64-none-elf using the Foundation
>> Model.
>>
>> OK?
>
> Hi Christophe,
>
> Sorry for the delay getting back to you, comments below.
>
>> 2015-09-15  Christophe Lyon  <christophe.l...@linaro.org>
>>
>>      * config/aarch64/aarch64-builtins.c
>>      (aarch64_types_tbl_qualifiers): New static data.
>>      (TYPES_TBL): Define.
>>      * config/aarch64/aarch64-simd-builtins.def: Update builtins
>>      tables.
>>      * config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
>>      * config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
>>      (vtbl4_s8, vtbl4_u8, vtbl4_p8): Rewrite using builtin functions.
>>      (vtbx4_s8, vtbx4_u8, vtbx4_p8): Emulate behaviour using other
>>      intrinsics.
>>      * config/aarch64/iterators.md (V8Q): New.
>
>> diff --git a/gcc/config/aarch64/aarch64-builtins.c
>> b/gcc/config/aarch64/aarch64-builtins.c
>> index 0f4f2b9..7ca3917 100644
>> --- a/gcc/config/aarch64/aarch64-builtins.c
>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>> @@ -253,6 +253,11 @@
>> aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>>       qualifier_none, qualifier_struct_load_store_lane_index };
>>  #define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
>>
>> +static enum aarch64_type_qualifiers
>> +aarch64_types_tbl_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>> +  = { qualifier_none, qualifier_none, qualifier_none };
>> +#define TYPES_TBL (aarch64_types_tbl_qualifiers)
>> +
>
> Do we need these? This looks like TYPES_BINOP (the predicate on the
> instruction pattern will prevent the "qualifier_maybe_immediate" from
> becoming a problem).
>

I'll give it a try; indeed, I feared "qualifier_maybe_immediate" would
cause problems.
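
For the record, here is roughly what I understand the simpler variant to
look like; this is only a sketch (the existing binop qualifiers are quoted
from memory, and I have not tested the change yet):

  /* Existing qualifiers in aarch64-builtins.c, for comparison; only the
     last entry differs from the new TYPES_TBL.  As you say, operand 2 of
     aarch64_tbl3v8qi is constrained to "register_operand", so the
     qualifier_maybe_immediate case should never trigger.  */
  static enum aarch64_type_qualifiers
  aarch64_types_binop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
    = { qualifier_none, qualifier_none, qualifier_maybe_immediate };
  #define TYPES_BINOP (aarch64_types_binop_qualifiers)

  /* ...and the aarch64-simd-builtins.def entry then becomes the one you
     suggest below:  */
  VAR1 (BINOP, tbl3, 0, v8qi)

That would also let me drop the new TYPES_TBL/aarch64_types_tbl_qualifiers
block entirely.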
>> #define CF0(N, X) CODE_FOR_aarch64_##N##X
>> #define CF1(N, X) CODE_FOR_##N##X##1
>> #define CF2(N, X) CODE_FOR_##N##X##2
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def
>> b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index d0f298a..62f1b13 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -405,3 +405,5 @@
>>   VAR1 (BINOPP, crypto_pmull, 0, di)
>>   VAR1 (BINOPP, crypto_pmull, 0, v2di)
>>
>> +  /* Implemented by aarch64_tbl3v8qi. */
>> +  BUILTIN_V8Q (TBL, tbl3, 0)
>
> This can be:
>
>   VAR1 (BINOP, tbl3, 0, v8qi)
>
> It would be good if we could eliminate the casts in arm_neon.h by also
> defining a "BINOPU" version of this, but I imagine that gets stuck on the
> types accepted by __builtin_aarch64_set_qregoiv16qi - so don't worry about
> making that change.

OK

>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>> b/gcc/config/aarch64/aarch64-simd.md
>> index 9777418..84a61d5 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -4716,6 +4714,16 @@
>>   [(set_attr "type" "neon_tbl2_q")]
>> )
>>
>> +(define_insn "aarch64_tbl3v8qi"
>> +  [(set (match_operand:V8QI 0 "register_operand" "=w")
>> +        (unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
>> +                      (match_operand:V8QI 2 "register_operand" "w")]
>> +                      UNSPEC_TBL))]
>> +  "TARGET_SIMD"
>> +  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
>> +  [(set_attr "type" "neon_tbl3")]
>> +)
>> +
>>  (define_insn_and_split "aarch64_combinev16qi"
>>   [(set (match_operand:OI 0 "register_operand" "=w")
>>        (unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 87bbf6e..91704de 100644
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index 6dfebe7..e8ee318 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>>  /* End of temporary inline asm. */
>>
>>  /* Start of optimal implementations in approved order. */
>> @@ -23221,6 +23182,36 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab,
>> uint8x8_t __idx)
>>    return vbsl_p8 (__mask, __tbl, __r);
>>  }
>>
>> +/* vtbx4 */
>> +
>> +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
>> +                              vmov_n_u8 (32));
>> +  int8x8_t __tbl = vtbl4_s8 (__tab, __idx);
>> +
>> +  return vbsl_s8 (__mask, __tbl, __r);
>> +}
>> +
>> +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
>> +  uint8x8_t __tbl = vtbl4_u8 (__tab, __idx);
>> +
>> +  return vbsl_u8 (__mask, __tbl, __r);
>> +}
>> +
>> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
>> +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
>> +{
>> +  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (32));
>> +  poly8x8_t __tbl = vtbl4_p8 (__tab, __idx);
>> +
>> +  return vbsl_p8 (__mask, __tbl, __r);
>> +}
>> +
>
> Why do we want this for vtbx4 rather than putting out a VTBX instruction
> directly (as in the inline asm versions you replace)?
>
I just followed the pattern used for vtbx3.

> This sequence does make sense for vtbx3.

In fact, I don't see why vtbx3 and vtbx4 should be different?
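
To make the semantics we are trying to preserve concrete, here is a small
usage sketch (my own example values, not something from the testsuite):
for vtbx4, any lane whose index is out of range for the 32-byte table
(>= 32) must keep the corresponding lane of __r, which is exactly what the
vclt_u8 (__idx, vmov_n_u8 (32)) mask plus vbsl reproduce.

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  /* 32-byte table: table[i] = 100 + i.  */
  uint8_t table_bytes[32];
  for (int i = 0; i < 32; i++)
    table_bytes[i] = 100 + i;

  uint8x8x4_t tab;
  tab.val[0] = vld1_u8 (table_bytes + 0);
  tab.val[1] = vld1_u8 (table_bytes + 8);
  tab.val[2] = vld1_u8 (table_bytes + 16);
  tab.val[3] = vld1_u8 (table_bytes + 24);

  const uint8_t r_init[8]   = { 0, 1, 2, 3, 4, 5, 6, 7 };
  const uint8_t idx_init[8] = { 0, 31, 32, 200, 5, 16, 64, 7 };
  uint8x8_t r = vld1_u8 (r_init);
  uint8x8_t idx = vld1_u8 (idx_init);

  /* Lanes whose index is >= 32 (here lanes 2, 3 and 6) must keep r's
     value; the others select from the table.
     Expected output: 100 131 2 3 105 116 6 107.  */
  uint8_t out[8];
  vst1_u8 (out, vtbx4_u8 (r, tab, idx));
  for (int i = 0; i < 8; i++)
    printf ("%d ", out[i]);
  printf ("\n");
  return 0;
}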
>> /* vtrn */
>>
>> __extension__ static __inline float32x2_t __attribute__
>> ((__always_inline__))
>> diff --git a/gcc/config/aarch64/iterators.md
>> b/gcc/config/aarch64/iterators.md
>> index b8a45d1..dfbd9cd 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -100,6 +100,8 @@
>> ;; All modes.
>> (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF
>> V2DF])
>>
>> +(define_mode_iterator V8Q [V8QI])
>> +
>
> This can be dropped if you use VAR1 in aarch64-builtins.c.
>
> Thanks for working on this, with your patch applied, the only
> remaining intrinsics I see failing for aarch64_be are:
>
> vqtbl2_*8
> vqtbl2q_*8
> vqtbl3_*8
> vqtbl3q_*8
> vqtbl4_*8
> vqtbl4q_*8
>
> vqtbx2_*8
> vqtbx2q_*8
> vqtbx3_*8
> vqtbx3q_*8
> vqtbx4_*8
> vqtbx4q_*8
>

Quite possibly. Which tests are you looking at? Since these are
aarch64-specific, they are not part of the tests I added
(advsimd-intrinsics). Do you mean gcc.target/aarch64/table-intrinsics.c?

> Thanks,
> James
>
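
P.S. On the test-coverage question above: the kind of minimal execution
check I have in mind for one of the remaining intrinsics is sketched below
(the values and expectations are mine, not an existing testcase). Run on
aarch64_be, something like this should show whether vqtbl2_u8 really
misbehaves, which a code-generation scan alone cannot.

/* TBL semantics: out-of-range indices (>= 32 for a 2-Q-register table)
   produce 0 in the result lane.  */
#include <arm_neon.h>
#include <stdlib.h>

int
main (void)
{
  /* 32-byte table: table[i] = 200 - i.  */
  uint8_t table_bytes[32];
  for (int i = 0; i < 32; i++)
    table_bytes[i] = 200 - i;

  uint8x16x2_t tab;
  tab.val[0] = vld1q_u8 (table_bytes + 0);
  tab.val[1] = vld1q_u8 (table_bytes + 16);

  const uint8_t idx_init[8] = { 0, 15, 16, 31, 32, 255, 7, 24 };
  uint8x8_t res = vqtbl2_u8 (tab, vld1_u8 (idx_init));

  uint8_t out[8];
  vst1_u8 (out, res);

  /* Indices 32 and 255 are out of range and must yield 0.  */
  const uint8_t expected[8] = { 200, 185, 184, 169, 0, 0, 193, 176 };
  for (int i = 0; i < 8; i++)
    if (out[i] != expected[i])
      abort ();

  return 0;
}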