Steve Ellcey <sell...@cavium.com> writes: > @@ -1005,6 +1005,15 @@ static const struct processor *selected_tune; > /* The current tuning set. */ > struct tune_params aarch64_tune_params = generic_tunings; > > +/* Table of machine attributes. */ > +static const struct attribute_spec aarch64_attribute_table[] = > +{ > + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, > + affects_type_identity, handler, exclude } */ > + { "aarch64_vector_pcs", 0, 0, true, false, false, false, NULL, NULL }, > + { NULL, 0, 0, false, false, false, false, NULL, NULL } > +};
Maybe it would be better to make this a type attribute instead, so that it's possible to create pointers to PCS functions without losing the ABI information. > @@ -1383,6 +1392,31 @@ aarch64_hard_regno_mode_ok (unsigned regno, > machine_mode mode) > return false; > } > > +/* Return true if this is a definition of a vectorized simd function. */ > + > +static bool > +aarch64_simd_decl_p (tree fndecl) > +{ > + if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) != > NULL) > + return true; > + if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL) > + return false; > + return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl)))); Why's only the return type relevant here? Think this deserves a comment. > @@ -3181,7 +3215,9 @@ static bool > aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, > tree exp ATTRIBUTE_UNUSED) > { > - /* Currently, always true. */ > + if (aarch64_simd_decl_p (cfun->decl)) > + return false; This should be OK if the target is also a vector PCS function. > @@ -4012,7 +4061,8 @@ aarch64_layout_frame (void) > { > /* If there is an alignment gap between integer and fp callee-saves, > allocate the last fp register to it if possible. */ > - if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0) > + if (regno == last_fp_reg && has_align_gap > + && !simd_function && (offset & 8) == 0) > { > cfun->machine->frame.reg_offset[regno] = max_int_offset; > break; Nit: one condition per line once the whole thing no longer fits on a line. > @@ -4024,7 +4074,7 @@ aarch64_layout_frame (void) > else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM > && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) > cfun->machine->frame.wb_candidate2 = regno; > - offset += UNITS_PER_WORD; > + offset += simd_function ? 
UNITS_PER_VREG : UNITS_PER_WORD; > } > > offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); > @@ -4167,6 +4217,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, > rtx reg, rtx reg2, > return gen_storewb_pairdf_di (base, base, reg, reg2, > GEN_INT (-adjustment), > GEN_INT (UNITS_PER_WORD - adjustment)); > + case E_TFmode: > + return gen_storewb_pairtf_di (base, base, reg, reg2, > + GEN_INT (-adjustment), > + GEN_INT (UNITS_PER_VREG - adjustment)); > default: > gcc_unreachable (); > } > @@ -4179,7 +4233,7 @@ static void > aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT > adjustment) > { > rtx_insn *insn; > - machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode; > + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); > > if (regno2 == INVALID_REGNUM) > return aarch64_pushwb_single_reg (mode, regno1, adjustment); > @@ -4209,6 +4263,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, > rtx reg, rtx reg2, > case E_DFmode: > return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT > (adjustment), > GEN_INT (UNITS_PER_WORD)); > + case E_TFmode: > + return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT > (adjustment), > + GEN_INT (UNITS_PER_VREG)); > default: > gcc_unreachable (); > } > @@ -4222,7 +4279,7 @@ static void > aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, > rtx *cfi_ops) > { > - machine_mode mode = (regno1 <= R30_REGNUM) ? 
E_DImode : E_DFmode; > + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); > rtx reg1 = gen_rtx_REG (mode, regno1); > > *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); > @@ -4257,6 +4314,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, > rtx reg1, rtx mem2, > case E_DFmode: > return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2); > > + case E_TFmode: > + return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2); > + > default: > gcc_unreachable (); > } > @@ -4277,6 +4337,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx > mem1, rtx reg2, > case E_DFmode: > return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2); > > + case E_TFmode: > + return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2); > + > default: > gcc_unreachable (); > } > @@ -4309,6 +4372,10 @@ aarch64_save_callee_saves (machine_mode mode, > poly_int64 start_offset, > rtx_insn *insn; > unsigned regno; > unsigned regno2; > + HOST_WIDE_INT mode_size; > + > + if (!GET_MODE_SIZE (mode).is_constant(&mode_size)) > + gcc_unreachable (); Just make this poly_int64 and use known_eq instead of ==... > @@ -4334,7 +4401,7 @@ aarch64_save_callee_saves (machine_mode mode, > poly_int64 start_offset, > > if (regno2 <= limit > && !cfun->machine->reg_is_wrapped_separately[regno2] > - && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) > + && ((cfun->machine->frame.reg_offset[regno] + mode_size) > == cfun->machine->frame.reg_offset[regno2])) > > { ...here. That avoids having to justify why the size is known to be constant. 
> @@ -4375,6 +4442,9 @@ aarch64_restore_callee_saves (machine_mode mode, > unsigned regno; > unsigned regno2; > poly_int64 offset; > + HOST_WIDE_INT mode_size; > + > + gcc_assert (GET_MODE_SIZE (mode).is_constant(&mode_size)); > > for (regno = aarch64_next_callee_save (start, limit); > regno <= limit; > @@ -4398,7 +4468,7 @@ aarch64_restore_callee_saves (machine_mode mode, > > if (regno2 <= limit > && !cfun->machine->reg_is_wrapped_separately[regno2] > - && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) > + && ((cfun->machine->frame.reg_offset[regno] + mode_size) > == cfun->machine->frame.reg_offset[regno2])) > { > rtx reg2 = gen_rtx_REG (mode, regno2); Same here. (gcc_assert (...) calls with necessary side effects don't work when the compiler is built with --enable-checking=no.) > @@ -4611,8 +4683,10 @@ aarch64_process_components (sbitmap components, bool > prologue_p) > while (regno != last_regno) > { > /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved > - so DFmode for the vector registers is enough. */ > - machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode; > + so DFmode for the vector registers is enough. For simd functions > + we want to save the entire register. */ Would be good to fix the indentation of the second line while you're there. Maybe s/the entire register/the low 128 bits/ since it's possible to have Advanced SIMD vector PCS functions when compiling for SVE. > @@ -4712,6 +4787,25 @@ aarch64_set_handled_components (sbitmap components) > cfun->machine->reg_is_wrapped_separately[regno] = true; > } > > +/* Return 1 if the register is used by the epilogue. We need to say the > + return register is used, but only after epilogue generation is complete. > + Note that in the case of sibcalls, the values "used by the epilogue" are > + considered live at the start of the called function. 
> + > + For SIMD functions we need to return 1 for FP registers that are saved and > + restored by a function but not zero in call_used_regs. If we do not do > + this optimizations may remove the restore of the register. */ > + > +int > +aarch64_epilogue_uses (int regno) > +{ > + if (epilogue_completed && regno == LR_REGNUM) > + return 1; > + if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno)) > + return 1; > + return 0; Shouldn't this also depend on epilogue_completed? We won't have saved the register otherwise, and it could cause the register to be marked live unnecessarily. If we check epilogue_completed, we can also check whether the function actually saves the register, which would make things even more precise. > @@ -4884,6 +4982,19 @@ aarch64_use_return_insn_p (void) > return known_eq (cfun->machine->frame.frame_size, 0); > } > > +/* Return false for non-leaf SIMD functions in order to avoid > + shrink-wrapping them. Doing this will lose the necessary > + save/restore of FP registers. */ > + > +bool > +aarch64_use_simple_return_insn_p (void) > +{ > + if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf) > + return false; > + > + return true; > +} How hard would it be to avoid this? Shrink-wrapping could be a very useful optimisation for vector routines, if the routine has to use calls to other functions to handle rare but difficult cases. > @@ -6185,7 +6300,7 @@ aarch64_fixed_condition_code_regs (unsigned int *p1, > unsigned int *p2) > void > aarch64_expand_call (rtx result, rtx mem, bool sibcall) > { > - rtx call, callee, tmp; > + rtx call, call_insn, callee, tmp; > rtvec vec; > machine_mode mode; > > @@ -6203,7 +6318,8 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall) > : !REG_P (callee)) > XEXP (mem, 0) = force_reg (mode, callee); > > - call = gen_rtx_CALL (VOIDmode, mem, const0_rtx); > + call_insn = gen_rtx_CALL (VOIDmode, mem, const0_rtx); > + call = call_insn; "call_insn" is a bit misleading, since it's not a full insn. 
Maybe: call = gen_rtx_CALL (VOIDmode, mem, const0_rtx); pat = call; or something instead? LGTM otherwise, but I'll leave the AArch64 maintainers to do the main review. Thanks, Richard