On 20/07/16 14:02, Jiong Wang wrote:
> This patch optimizes immediate addition sequences generated by
> aarch64_add_constant.
>
> The current addition sequences generated are:
>
> * If the immediate fits into the unsigned 12bit range, generate a single add/sub.
> * Otherwise, if it fits into the unsigned 24bit range, generate two
> add/sub instructions.
>
> * Otherwise invoke general constant build function.
>
>
> This hasn't considered the situation where the immediate can't fit into
> the unsigned 12bit range, but can fit into a single mov instruction, in which
> case we generate one move and one addition. The move won't touch the
> destination register, thus the sequence is better than two additions
> which both touch the destination register.
>
>
> This patch thus optimizes the addition sequences into:
>
> * If the immediate fits into the unsigned 12bit range, generate a single add/sub.
>
> * Otherwise, if it fits into the unsigned 24bit range, generate two add/sub
> instructions. But don't do this if it fits into a single move instruction, in
> which case move the immediate to the scratch register first, then generate one
> addition to add the scratch register to the destination register.
> * Otherwise invoke the general constant build function.
>
>
> OK for trunk?
>
> gcc/
> 2016-07-20 Jiong Wang <[email protected]>
>
> * config/aarch64/aarch64.c (aarch64_add_constant): Optimize
> instruction sequences.
>
>
OK with the updates to the comments as mentioned below.
> build-const-2.patch
>
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index
> aeea3b3ebc514663043ac8d7cd13361f06f78502..41844a101247c939ecb31f8a8c17cf79759255aa
> 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1865,6 +1865,47 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
> aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
> }
>
> +/* Add DELTA onto REGNUM in MODE, using SCRATCHREG to held intermediate
> value if
> + it is necessary. */
Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold an
intermediate value if necessary.
> +
> +static void
> +aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
> + HOST_WIDE_INT delta)
> +{
> + HOST_WIDE_INT mdelta = abs_hwi (delta);
> + rtx this_rtx = gen_rtx_REG (mode, regnum);
> +
> + /* Do nothing if mdelta is zero. */
> + if (!mdelta)
> + return;
> +
> + /* We only need single instruction if the offset fit into add/sub. */
> + if (aarch64_uimm12_shift (mdelta))
> + {
> + emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
> + return;
> + }
> +
> + /* We need two add/sub instructions, each one perform part of the
> + addition/subtraction, but don't this if the addend can be loaded into
> + register by single instruction, in that case we prefer a move to scratch
> + register following by addition. */
We need two add/sub instructions, each one performing part of the
calculation. Don't do this if the addend can be loaded into a
register with a single instruction; in that case we prefer a move to a
scratch register followed by an addition.
> + if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode))
> + {
> + HOST_WIDE_INT low_off = mdelta & 0xfff;
> +
> + low_off = delta < 0 ? -low_off : low_off;
> + emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
> + emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
> + return;
> + }
> +
> + /* Otherwise use generic function to handle all other situations. */
> + rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
> + aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
> + emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
> +}
> +
> static bool
> aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
> tree exp ATTRIBUTE_UNUSED)
> @@ -3337,44 +3378,6 @@ aarch64_final_eh_return_addr (void)
> - 2 * UNITS_PER_WORD));
> }
>
> -static void
> -aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
> - HOST_WIDE_INT delta)
> -{
> - HOST_WIDE_INT mdelta = delta;
> - rtx this_rtx = gen_rtx_REG (mode, regnum);
> - rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
> -
> - if (mdelta < 0)
> - mdelta = -mdelta;
> -
> - if (mdelta >= 4096 * 4096)
> - {
> - aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true,
> mode);
> - emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
> - }
> - else if (mdelta > 0)
> - {
> - if (mdelta >= 4096)
> - {
> - emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
> - rtx shift = gen_rtx_ASHIFT (mode, scratch_rtx, GEN_INT (12));
> - if (delta < 0)
> - emit_insn (gen_rtx_SET (this_rtx,
> - gen_rtx_MINUS (mode, this_rtx, shift)));
> - else
> - emit_insn (gen_rtx_SET (this_rtx,
> - gen_rtx_PLUS (mode, this_rtx, shift)));
> - }
> - if (mdelta % 4096 != 0)
> - {
> - scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
> - emit_insn (gen_rtx_SET (this_rtx,
> - gen_rtx_PLUS (mode, this_rtx, scratch_rtx)));
> - }
> - }
> -}
> -
> /* Output code to add DELTA to the first argument, and then jump
> to FUNCTION. Used for C++ multiple inheritance. */
> static void
>