Dhruv Chawla <dhr...@nvidia.com> writes:
> On 20/05/25 16:35, Richard Sandiford wrote:
>> Dhruv Chawla <dhr...@nvidia.com> writes:
>>> [...]
>>> Would it be a good idea to add tests for the bad codegen as well? I have
>>> added tests for lsl/usra in the next round of patches.
>>
>> Nah, I don't think it's worth testing for something that we don't want.
>> If we can agree on what the "good" code would look like, it might be
>> worth adding that with an xfail, so that we notice when we start to get
>> the good codegen.  But IMO it's also fine to test only the dummy-argument
>> case, as in the new patch.
>
> Yeah, that sounds good to me. For the following test:
>
> svuint16_t
> lsl_usra_8_sve_lsl_operand (svuint16_t r)
> {
>   svbool_t pt = svptrue_b16 ();
>   return svorr_u16_z (pt, svlsl_n_u16_z (pt, r, 6),
>                       svlsr_n_u16_z (pt, r, 10));
> }
>
> Currently:
>
>         lsl     z31.h, z0.h, #6
>         mov     z30.d, z0.d
>         movprfx z0, z31
>         usra    z0.h, z30.h, #10
>         ret
>
> is generated. Would something like:
>
>         mov     z31.d, z0.d
>         lsl     z0.h, z0.h, #6
>         usra    z0.h, z31.h, #10
>         ret
>
> be better?

Yeah, that looks better, although we wouldn't need to use specifically
z31 as the temporary register.  It would also be possible to do the mov
at the end, so if we were going to match this, we'd want to allow both
move positions.
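For concreteness, one possible shape for such an xfail'd test, allowing
either position for the mov.  This is only a sketch: the dg- directives
and regexes are illustrative, it assumes the usual SVE testsuite setup
provides the right -march flags, and it assumes check-function-bodies
accepts an xfail selector.  A stricter version would use capture groups
to force the temporary registers in each alternative to match.

/* { dg-do compile } */
/* { dg-options "-O2" } */
/* { dg-final { check-function-bodies "**" "" "" { xfail *-*-* } } } */

#include <arm_sve.h>

/*
** lsl_usra_8_sve_lsl_operand:
** (
**	mov	z[0-9]+\.d, z0\.d
**	lsl	z0\.h, z0\.h, #6
**	usra	z0\.h, z[0-9]+\.h, #10
** |
**	lsl	z[0-9]+\.h, z0\.h, #6
**	usra	z[0-9]+\.h, z0\.h, #10
**	mov	z0\.d, z[0-9]+\.d
** )
**	ret
*/
svuint16_t
lsl_usra_8_sve_lsl_operand (svuint16_t r)
{
  svbool_t pt = svptrue_b16 ();
  return svorr_u16_z (pt, svlsl_n_u16_z (pt, r, 6),
                      svlsr_n_u16_z (pt, r, 10));
}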
Thanks,
Richard

>
>>
>> On the new patch:
>>
>>> +/* { dg-final { scan-assembler-times "revb" 0 } } */
>>> +/* { dg-final { scan-assembler-times "revh" 0 } } */
>>> +/* { dg-final { scan-assembler-times "revw" 0 } } */
>>
>> It would be better to add \ts around the mnemonics, so that we don't
>> accidentally match pieces of filenames that happen to contain "revb":
>>
>>   /* { dg-final { scan-assembler-times "\trevb\t" 0 } } */
>>   /* { dg-final { scan-assembler-times "\trevh\t" 0 } } */
>>   /* { dg-final { scan-assembler-times "\trevw\t" 0 } } */
>>
>> Thanks,
>> Richard
>>
>>
>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> index 90dd5c97a10..b4396837c24 100644
>> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> @@ -2086,37 +2086,6 @@ public:
>>    {
>>      return f.fold_const_binary (LSHIFT_EXPR);
>>    }
>> -
>> -  rtx expand (function_expander &e) const override
>> -  {
>> -    tree pred = TREE_OPERAND (e.call_expr, 3);
>> -    tree shift = TREE_OPERAND (e.call_expr, 5);
>> -    if (is_ptrue (pred, GET_MODE_UNIT_SIZE (e.result_mode ()))
>> -	&& uniform_integer_cst_p (shift))
>> -      return e.use_unpred_insn (e.direct_optab_handler (ashl_optab));
>> -    return rtx_code_function::expand (e);
>> -  }
>> -};
>> -
>> -class svlsr_impl : public rtx_code_function
>> -{
>> -public:
>> -  CONSTEXPR svlsr_impl () : rtx_code_function (LSHIFTRT, LSHIFTRT) {}
>> -
>> -  gimple *fold (gimple_folder &f) const override
>> -  {
>> -    return f.fold_const_binary (RSHIFT_EXPR);
>> -  }
>> -
>> -  rtx expand (function_expander &e) const override
>> -  {
>> -    tree pred = TREE_OPERAND (e.call_expr, 3);
>> -    tree shift = TREE_OPERAND (e.call_expr, 5);
>> -    if (is_ptrue (pred, GET_MODE_UNIT_SIZE (e.result_mode ()))
>> -	&& uniform_integer_cst_p (shift))
>> -      return e.use_unpred_insn (e.direct_optab_handler (lshr_optab));
>> -    return rtx_code_function::expand (e);
>> -  }
>>  };
>>
>>  class svmad_impl : public function_base
>> @@ -3617,7 +3586,7 @@ FUNCTION (svldnt1, svldnt1_impl,)
>>  FUNCTION (svlen, svlen_impl,)
>>  FUNCTION (svlsl, svlsl_impl,)
>>  FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE))
>> -FUNCTION (svlsr, svlsr_impl,)
>> +FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT))
>>  FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE))
>>  FUNCTION (svmad, svmad_impl,)
>>  FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX,
>> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
>> index 0156afc1e7d..fa431c9c060 100644
>> --- a/gcc/config/aarch64/aarch64-sve.md
>> +++ b/gcc/config/aarch64/aarch64-sve.md
>> @@ -4931,9 +4931,7 @@ (define_expand "<ASHIFT:optab><mode>3"
>>    if (CONST_INT_P (operands[2]))
>>      {
>>        amount = gen_const_vec_duplicate (<MODE>mode, operands[2]);
>> -      if (!aarch64_sve_<lr>shift_operand (operands[2], <MODE>mode)
>> -	  && !aarch64_simd_shift_imm_p (operands[2], <MODE>mode,
>> -					<optab>_optab == ashl_optab))
>> +      if (!aarch64_sve_<lr>shift_operand (amount, <MODE>mode))
>>  	amount = force_reg (<MODE>mode, amount);
>>      }
>>    else
>> @@ -4957,8 +4955,7 @@ (define_expand "v<optab><mode>3"
>>  	 UNSPEC_PRED_X))]
>>    "TARGET_SVE"
>>    {
>> -    if (aarch64_simd_shift_imm_p (operands[2], <MODE>mode,
>> -				  <optab>_optab == ashl_optab))
>> +    if (CONSTANT_P (operands[2]))
>>        {
>>  	emit_insn (gen_aarch64_v<optab><mode>3_const (operands[0], operands[1],
>>  						      operands[2]));
>> @@ -4968,11 +4965,30 @@ (define_expand "v<optab><mode>3"
>>    }
>>  )
>>
>> -;; Shift by a vector, predicated with a PTRUE.  We don't actually need
>> -;; the predicate for the first alternative, but using Upa or X isn't
>> -;; likely to gain much and would make the instruction seem less uniform
>> -;; to the register allocator.
>> -(define_insn_and_split "@aarch64_pred_<optab><mode>"
>> +;; Shift by a vector, predicated with a PTRUE.
>> +(define_expand "@aarch64_pred_<optab><mode>"
>> +  [(set (match_operand:SVE_I 0 "register_operand")
>> +	(unspec:SVE_I
>> +	  [(match_operand:<VPRED> 1 "register_operand")
>> +	   (ASHIFT:SVE_I
>> +	     (match_operand:SVE_I 2 "register_operand")
>> +	     (match_operand:SVE_I 3 "aarch64_sve_<lr>shift_operand"))]
>> +	  UNSPEC_PRED_X))]
>> +  "TARGET_SVE"
>> +  {
>> +    if (CONSTANT_P (operands[3]))
>> +      {
>> +	emit_insn (gen_aarch64_v<optab><mode>3_const (operands[0], operands[2],
>> +						      operands[3]));
>> +	DONE;
>> +      }
>> +  }
>> +)
>> +
>> +;; We don't actually need the predicate for the first alternative, but
>> +;; using Upa or X isn't likely to gain much and would make the instruction
>> +;; seem less uniform to the register allocator.
>> +(define_insn_and_split "*aarch64_pred_<optab><mode>"
>>    [(set (match_operand:SVE_I 0 "register_operand")
>>  	(unspec:SVE_I
>>  	  [(match_operand:<VPRED> 1 "register_operand")
>> @@ -4987,8 +5003,7 @@ (define_insn_and_split "@aarch64_pred_<optab><mode>"
>>      [ w , Upl , w , 0 ; * ] <shift>r\t%0.<Vetype>, %1/m, %3.<Vetype>, %2.<Vetype>
>>      [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<shift>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
>>    }
>> -  "&& reload_completed
>> -     && !register_operand (operands[3], <MODE>mode)"
>> +  "&& !register_operand (operands[3], <MODE>mode)"
>>    [(set (match_dup 0) (ASHIFT:SVE_I (match_dup 2) (match_dup 3)))]
>>    ""
>>  )
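(As an illustration of what the expander changes above are aiming at,
my own example rather than anything from the patch: constant shift
amounts are now routed to the aarch64_v<optab><mode>3_const pattern, so
a ptrue-predicated shift by an in-range immediate should no longer need
the amount forced into a register.  The function name lsr_imm_4 is
hypothetical.)

#include <arm_sve.h>

/* Expected, though not guaranteed by anything in this thread, to
   compile to a single unpredicated "lsr z0.h, z0.h, #4" at -O2.  */
svuint16_t
lsr_imm_4 (svuint16_t x)
{
  svbool_t pt = svptrue_b16 ();
  return svlsr_n_u16_z (pt, x, 4);
}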