Steve Ellcey <sell...@cavium.com> writes: > @@ -1005,6 +1005,15 @@ static const struct processor *selected_tune; > /* The current tuning set. */ > struct tune_params aarch64_tune_params = generic_tunings; > > +/* Table of machine attributes. */ > +static const struct attribute_spec aarch64_attribute_table[] = > +{ > + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, > + affects_type_identity, handler, exclude } */ > + { "aarch64_vector_pcs", 0, 0, true, false, false, false, NULL, NULL }, > + { NULL, 0, 0, false, false, false, false, NULL, NULL } > +};
Maybe it would be better to make this a type attribute instead, so that it's possible to create pointers to PCS functions without losing the ABI information. > @@ -1383,6 +1392,31 @@ aarch64_hard_regno_mode_ok (unsigned regno, > machine_mode mode) > return false; > } > > +/* Return true if this is a definition of a vectorized simd function. */ > + > +static bool > +aarch64_simd_decl_p (tree fndecl) > +{ > + if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) != > NULL) > + return true; > + if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL) > + return false; > + return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl)))); Why's only the return type relevant here? Think this deserves a comment. > @@ -3181,7 +3215,9 @@ static bool > aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, > tree exp ATTRIBUTE_UNUSED) > { > - /* Currently, always true. */ > + if (aarch64_simd_decl_p (cfun->decl)) > + return false; This should be OK if the target is also a vector PCS function. > @@ -4012,7 +4061,8 @@ aarch64_layout_frame (void) > { > /* If there is an alignment gap between integer and fp callee-saves, > allocate the last fp register to it if possible. */ > - if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0) > + if (regno == last_fp_reg && has_align_gap > + && !simd_function && (offset & 8) == 0) > { > cfun->machine->frame.reg_offset[regno] = max_int_offset; > break; Nit: one condition per line once the whole thing no longer fits on a line. > @@ -4024,7 +4074,7 @@ aarch64_layout_frame (void) > else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM > && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) > cfun->machine->frame.wb_candidate2 = regno; > - offset += UNITS_PER_WORD; > + offset += simd_function ? 
UNITS_PER_VREG : UNITS_PER_WORD; > } > > offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); > @@ -4167,6 +4217,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, > rtx reg, rtx reg2, > return gen_storewb_pairdf_di (base, base, reg, reg2, > GEN_INT (-adjustment), > GEN_INT (UNITS_PER_WORD - adjustment)); > + case E_TFmode: > + return gen_storewb_pairtf_di (base, base, reg, reg2, > + GEN_INT (-adjustment), > + GEN_INT (UNITS_PER_VREG - adjustment)); > default: > gcc_unreachable (); > } > @@ -4179,7 +4233,7 @@ static void > aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT > adjustment) > { > rtx_insn *insn; > - machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode; > + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); > > if (regno2 == INVALID_REGNUM) > return aarch64_pushwb_single_reg (mode, regno1, adjustment); > @@ -4209,6 +4263,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, > rtx reg, rtx reg2, > case E_DFmode: > return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT > (adjustment), > GEN_INT (UNITS_PER_WORD)); > + case E_TFmode: > + return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT > (adjustment), > + GEN_INT (UNITS_PER_VREG)); > default: > gcc_unreachable (); > } > @@ -4222,7 +4279,7 @@ static void > aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, > rtx *cfi_ops) > { > - machine_mode mode = (regno1 <= R30_REGNUM) ? 
E_DImode : E_DFmode; > + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); > rtx reg1 = gen_rtx_REG (mode, regno1); > > *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); > @@ -4257,6 +4314,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, > rtx reg1, rtx mem2, > case E_DFmode: > return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2); > > + case E_TFmode: > + return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2); > + > default: > gcc_unreachable (); > } > @@ -4277,6 +4337,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx > mem1, rtx reg2, > case E_DFmode: > return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2); > > + case E_TFmode: > + return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2); > + > default: > gcc_unreachable (); > } > @@ -4309,6 +4372,10 @@ aarch64_save_callee_saves (machine_mode mode, > poly_int64 start_offset, > rtx_insn *insn; > unsigned regno; > unsigned regno2; > + HOST_WIDE_INT mode_size; > + > + if (!GET_MODE_SIZE (mode).is_constant(&mode_size)) > + gcc_unreachable (); Just make this poly_int64 and use known_eq instead of ==... > @@ -4334,7 +4401,7 @@ aarch64_save_callee_saves (machine_mode mode, > poly_int64 start_offset, > > if (regno2 <= limit > && !cfun->machine->reg_is_wrapped_separately[regno2] > - && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) > + && ((cfun->machine->frame.reg_offset[regno] + mode_size) > == cfun->machine->frame.reg_offset[regno2])) > > { ...here. That avoids having to justify why the size is known to be constant. 
> @@ -4375,6 +4442,9 @@ aarch64_restore_callee_saves (machine_mode mode, > unsigned regno; > unsigned regno2; > poly_int64 offset; > + HOST_WIDE_INT mode_size; > + > + gcc_assert (GET_MODE_SIZE (mode).is_constant(&mode_size)); > > for (regno = aarch64_next_callee_save (start, limit); > regno <= limit; > @@ -4398,7 +4468,7 @@ aarch64_restore_callee_saves (machine_mode mode, > > if (regno2 <= limit > && !cfun->machine->reg_is_wrapped_separately[regno2] > - && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) > + && ((cfun->machine->frame.reg_offset[regno] + mode_size) > == cfun->machine->frame.reg_offset[regno2])) > { > rtx reg2 = gen_rtx_REG (mode, regno2); Same here. (gcc_assert (...) calls with necessary side effects don't work when the compiler is built with --enable-checking=no.) > @@ -4611,8 +4683,10 @@ aarch64_process_components (sbitmap components, bool > prologue_p) > while (regno != last_regno) > { > /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved > - so DFmode for the vector registers is enough. */ > - machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode; > + so DFmode for the vector registers is enough. For simd functions > + we want to save the entire register. */ Would be good to fix the indentation of the second line while you're there. Maybe s/the entire register/the low 128 bits/ since it's possible to have Advanced SIMD vector PCS functions when compiling for SVE. > @@ -4712,6 +4787,25 @@ aarch64_set_handled_components (sbitmap components) > cfun->machine->reg_is_wrapped_separately[regno] = true; > } > > +/* Return 1 if the register is used by the epilogue. We need to say the > + return register is used, but only after epilogue generation is complete. > + Note that in the case of sibcalls, the values "used by the epilogue" are > + considered live at the start of the called function. 
> + > + For SIMD functions we need to return 1 for FP registers that are saved and > + restored by a function but not zero in call_used_regs. If we do not do > + this optimizations may remove the restore of the register. */ > + > +int > +aarch64_epilogue_uses (int regno) > +{ > + if (epilogue_completed && regno == LR_REGNUM) > + return 1; > + if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno)) > + return 1; > + return 0; Shouldn't this also depend on epilogue_completed? We won't have saved the register otherwise, and it could cause the register to be marked live unnecessarily. If we check epilogue_completed, we can also check whether the function actually saves the register, which would make things even more precise. > @@ -4884,6 +4982,19 @@ aarch64_use_return_insn_p (void) > return known_eq (cfun->machine->frame.frame_size, 0); > } > > +/* Return false for non-leaf SIMD functions in order to avoid > + shrink-wrapping them. Doing this will lose the necessary > + save/restore of FP registers. */ > + > +bool > +aarch64_use_simple_return_insn_p (void) > +{ > + if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf) > + return false; > + > + return true; > +} How hard would it be to avoid this? Shrink-wrapping could be a very useful optimisation for vector routines, if the routine has to use calls to other functions to handle rare but difficult cases. > @@ -6185,7 +6300,7 @@ aarch64_fixed_condition_code_regs (unsigned int *p1, > unsigned int *p2) > void > aarch64_expand_call (rtx result, rtx mem, bool sibcall) > { > - rtx call, callee, tmp; > + rtx call, call_insn, callee, tmp; > rtvec vec; > machine_mode mode; > > @@ -6203,7 +6318,8 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall) > : !REG_P (callee)) > XEXP (mem, 0) = force_reg (mode, callee); > > - call = gen_rtx_CALL (VOIDmode, mem, const0_rtx); > + call_insn = gen_rtx_CALL (VOIDmode, mem, const0_rtx); > + call = call_insn; "call_insn" is a bit misleading, since it's not a full insn. 
Maybe: call = gen_rtx_CALL (VOIDmode, mem, const0_rtx); pat = call; or something instead? LGTM otherwise, but I'll leave the AArch64 maintainers to do the main review. Thanks, Richard