https://gcc.gnu.org/g:c0ded050cd29cc73f78cb4ab23674c7bc024969e
commit r15-965-gc0ded050cd29cc73f78cb4ab23674c7bc024969e
Author: Jeff Law <j...@ventanamicro.com>
Date:   Fri May 31 21:45:01 2024 -0600

    [to-be-committed] [RISC-V] Use Zbkb for general 64 bit constants when profitable

    Basically this adds the ability to generate two independent constants
    during synthesis, then bring them together with a pack instruction.  Thus
    we never need to go out to the constant pool when zbkb is enabled.  The
    worst sequence we ever generate is lui+addi+lui+addi+pack.

    Obviously if either half can be synthesized with just a lui or just an
    addi, then we'll DTRT automagically.  So for example:

    unsigned long foo_0x1425000028000000(void) { return 0x1425000028000000; }

    The high and low halves are just a lui.  So the final synthesis is:

    >        li      a5,671088640            # 15    [c=4 l=4]  *movdi_64bit/1
    >        li      a0,337969152            # 16    [c=4 l=4]  *movdi_64bit/1
    >        pack    a0,a5,a0                # 17    [c=12 l=4]  riscv_xpack_di_si_2

    On the implementation side, I think the bits I've put in here likely can
    be used to handle the repeating constant case for !zbkb.  I think they
    likely could also help capture cases where the upper half can be derived
    from the lower half (say by turning a bit on or off, shifting, or
    something similar).  The key in both of these cases is that we need a
    temporary register holding an intermediate value.

    Ventana's internal tester enables zbkb, but I don't think any of the
    other testers currently exercise zbkb.  We'll probably want to change
    that at some point, but I don't think it's super-critical yet.

    While I can envision a few more cases where we could improve constant
    synthesis, I have no immediate plans to work in this space.  If someone
    is interested, some thoughts are recorded here:

    > https://wiki.riseproject.dev/display/HOME/CT_00_031+--+Additional+Constant+Synthesis+Improvements

    gcc/
            * config/riscv/riscv.cc (riscv_integer_op): Add new field.
            (riscv_build_integer_1): Initialize the new field.
            (riscv_build_integer): Recognize more cases where Zbkb's
            pack instruction is profitable.
            (riscv_move_integer): Loop over all the codes.  If requested,
            save the current constant into a temporary.  Generate pack
            for more cases using the saved constant.

    gcc/testsuite/
            * gcc.target/riscv/synthesis-10.c: New test.
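To make the worked example concrete, here is a minimal host-side C sketch of
the pack semantics the sequence above relies on.  It assumes the RV64 Zbkb
definition of pack (rs1 supplies the low 32 bits of the result, rs2 the high
32 bits); pack_rv64 is a name invented for illustration, not something from
GCC or the ISA:

    #include <assert.h>
    #include <stdint.h>

    /* Model of RV64 Zbkb pack: low 32 bits of rs1 become the low half
       of rd, low 32 bits of rs2 become the high half.  */
    static uint64_t pack_rv64 (uint64_t rs1, uint64_t rs2)
    {
      return ((uint64_t) (uint32_t) rs2 << 32) | (uint32_t) rs1;
    }

    int main (void)
    {
      uint64_t a5 = 671088640;   /* li a5,671088640 -> 0x28000000  */
      uint64_t a0 = 337969152;   /* li a0,337969152 -> 0x14250000  */
      /* pack a0,a5,a0 from the dump above.  */
      assert (pack_rv64 (a5, a0) == 0x1425000028000000UL);
      return 0;
    }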
Diff:
---
 gcc/config/riscv/riscv.cc                     | 108 ++++++++++++++++++++++----
 gcc/testsuite/gcc.target/riscv/synthesis-10.c |  18 +++++
 2 files changed, 110 insertions(+), 16 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 91fefacee80..10af38a5a81 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -250,6 +250,7 @@ struct riscv_arg_info {
    and each VALUE[i] is a constant integer.  CODE[0] is undefined.  */
 struct riscv_integer_op {
   bool use_uw;
+  bool save_temporary;
   enum rtx_code code;
   unsigned HOST_WIDE_INT value;
 };
@@ -759,6 +760,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
       codes[0].code = UNKNOWN;
       codes[0].value = value;
       codes[0].use_uw = false;
+      codes[0].save_temporary = false;
       return 1;
     }
   if (TARGET_ZBS && SINGLE_BIT_MASK_OPERAND (value))
@@ -767,6 +769,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
       codes[0].code = UNKNOWN;
       codes[0].value = value;
       codes[0].use_uw = false;
+      codes[0].save_temporary = false;

       /* RISC-V sign-extends all 32bit values that live in a 32bit register.
          To avoid paradoxes, we thus need to use the
@@ -796,6 +799,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[alt_cost-1].code = PLUS;
           alt_codes[alt_cost-1].value = low_part;
           alt_codes[alt_cost-1].use_uw = false;
+          alt_codes[alt_cost-1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
@@ -810,6 +814,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[alt_cost-1].code = XOR;
           alt_codes[alt_cost-1].value = low_part;
           alt_codes[alt_cost-1].use_uw = false;
+          alt_codes[alt_cost-1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
@@ -852,6 +857,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
               alt_codes[alt_cost-1].code = ASHIFT;
               alt_codes[alt_cost-1].value = shift;
               alt_codes[alt_cost-1].use_uw = use_uw;
+              alt_codes[alt_cost-1].save_temporary = false;
               memcpy (codes, alt_codes, sizeof (alt_codes));
               cost = alt_cost;
             }
@@ -873,9 +879,11 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
       codes[0].value = (((unsigned HOST_WIDE_INT) value >> trailing_ones)
                         | (value << (64 - trailing_ones)));
       codes[0].use_uw = false;
+      codes[0].save_temporary = false;
       codes[1].code = ROTATERT;
       codes[1].value = 64 - trailing_ones;
       codes[1].use_uw = false;
+      codes[1].save_temporary = false;
       cost = 2;
     }
   /* Handle the case where the 11 bit range of zero bits wraps around.  */
@@ -888,9 +896,11 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
                         | ((unsigned HOST_WIDE_INT) value >> (32 + upper_trailing_ones)));
       codes[0].use_uw = false;
+      codes[0].save_temporary = false;
       codes[1].code = ROTATERT;
       codes[1].value = 32 - upper_trailing_ones;
       codes[1].use_uw = false;
+      codes[1].save_temporary = false;
       cost = 2;
     }
@@ -917,6 +927,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
               alt_codes[alt_cost].code = AND;
               alt_codes[alt_cost].value = ~(1UL << bit);
               alt_codes[alt_cost].use_uw = false;
+              alt_codes[alt_cost].save_temporary = false;
               alt_cost++;
               nval &= ~(1UL << bit);
             }
@@ -938,6 +949,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[alt_cost - 1].code = FMA;
           alt_codes[alt_cost - 1].value = 9;
           alt_codes[alt_cost - 1].use_uw = false;
+          alt_codes[alt_cost - 1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
@@ -948,6 +960,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[alt_cost - 1].code = FMA;
           alt_codes[alt_cost - 1].value = 5;
           alt_codes[alt_cost - 1].use_uw = false;
+          alt_codes[alt_cost - 1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
@@ -958,6 +971,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[alt_cost - 1].code = FMA;
           alt_codes[alt_cost - 1].value = 3;
           alt_codes[alt_cost - 1].use_uw = false;
+          alt_codes[alt_cost - 1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
@@ -978,6 +992,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[alt_cost - 1].code = PLUS;
           alt_codes[alt_cost - 1].value = adjustment;
           alt_codes[alt_cost - 1].use_uw = false;
+          alt_codes[alt_cost - 1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
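The ROTATERT entries in the two hunks above are easy to sanity-check on the
host.  The sketch below is not from the patch; the constant and the
trailing_ones count are assumed example values, and it only verifies the
rotation identity the synthesis depends on (materialize the value pre-rotated
right by trailing_ones, then rotate right by 64 - trailing_ones to restore it):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t rotr64 (uint64_t x, unsigned n)
    {
      n &= 63;
      return n == 0 ? x : (x >> n) | (x << (64 - n));
    }

    int main (void)
    {
      /* Assumed example: ones wrap around the low end, leaving a zero
         "hole" that is only reachable after rotating.  */
      uint64_t value = 0xc0000000000003ffUL;
      unsigned trailing_ones = 10;

      /* codes[0].value: the constant pre-rotated right by trailing_ones.  */
      uint64_t step0 = rotr64 (value, trailing_ones);

      /* codes[1]: ROTATERT by 64 - trailing_ones restores the original.  */
      assert (rotr64 (step0, 64 - trailing_ones) == value);
      return 0;
    }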
@@ -995,6 +1010,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
           alt_codes[i].value = value & 0x7ffff000;
           alt_codes[i].use_uw = false;
+          alt_codes[i].save_temporary = false;
           value &= ~0x7ffff000;
           i++;
         }
@@ -1005,6 +1021,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[i].code = (i == 0 ? UNKNOWN : PLUS);
           alt_codes[i].value = value & 0x7ff;
           alt_codes[i].use_uw = false;
+          alt_codes[i].save_temporary = false;
           value &= ~0x7ff;
           i++;
         }
@@ -1016,6 +1033,7 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
           alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
           alt_codes[i].value = 1UL << bit;
           alt_codes[i].use_uw = false;
+          alt_codes[i].save_temporary = false;
           value &= ~(1ULL << bit);
           i++;
         }
@@ -1057,6 +1075,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
           alt_codes[alt_cost-1].code = LSHIFTRT;
           alt_codes[alt_cost-1].value = shift;
           alt_codes[alt_cost-1].use_uw = false;
+          alt_codes[alt_cost-1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
@@ -1069,6 +1088,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
           alt_codes[alt_cost-1].code = LSHIFTRT;
           alt_codes[alt_cost-1].value = shift;
           alt_codes[alt_cost-1].use_uw = false;
+          alt_codes[alt_cost-1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
@@ -1093,6 +1113,7 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
           alt_codes[alt_cost - 1].code = XOR;
           alt_codes[alt_cost - 1].value = -1;
           alt_codes[alt_cost - 1].use_uw = false;
+          alt_codes[alt_cost - 1].save_temporary = false;
           memcpy (codes, alt_codes, sizeof (alt_codes));
           cost = alt_cost;
         }
@@ -1128,13 +1149,55 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
   if (cost > 3 && TARGET_64BIT && TARGET_ZBKB)
     {
       unsigned HOST_WIDE_INT loval = value & 0xffffffff;
-      unsigned HOST_WIDE_INT hival = value & ~loval;
-      if (hival >> 32 == loval)
+      unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32;
+      if (hival == loval)
         {
           cost = 1 + riscv_build_integer_1 (codes, sext_hwi (loval, 32), mode);
           codes[cost - 1].code = CONCAT;
           codes[cost - 1].value = 0;
           codes[cost - 1].use_uw = false;
+          codes[cost - 1].save_temporary = false;
+        }
+
+      /* An arbitrary 64 bit constant can be synthesized in 5 instructions
+         using zbkb.  We may do better than that if the upper or lower half
+         can be synthesized with a single LUI, ADDI or BSET.  Regardless the
+         basic steps are the same.  */
+      if (cost > 3 && can_create_pseudo_p ())
+        {
+          struct riscv_integer_op hi_codes[RISCV_MAX_INTEGER_OPS];
+          struct riscv_integer_op lo_codes[RISCV_MAX_INTEGER_OPS];
+          int hi_cost, lo_cost;
+
+          /* Synthesize and get cost for each half.  */
+          lo_cost
+            = riscv_build_integer_1 (lo_codes, sext_hwi (loval, 32), mode);
+          hi_cost
+            = riscv_build_integer_1 (hi_codes, sext_hwi (hival, 32), mode);
+
+          /* If profitable, finish synthesis using zbkb.  */
+          if (cost > hi_cost + lo_cost + 1)
+            {
+              /* We need the low half independent of the high half.  So
+                 mark it as creating a temporary we'll use later.  */
+              memcpy (codes, lo_codes,
+                      lo_cost * sizeof (struct riscv_integer_op));
+              codes[lo_cost - 1].save_temporary = true;
+
+              /* Now the high half synthesis.  */
+              memcpy (codes + lo_cost, hi_codes,
+                      hi_cost * sizeof (struct riscv_integer_op));
+
+              /* Adjust the cost.  */
+              cost = hi_cost + lo_cost + 1;
+
+              /* And finally (ab)use VEC_MERGE to indicate we want to
+                 merge the two parts together.  */
+              codes[cost - 1].code = VEC_MERGE;
+              codes[cost - 1].value = 0;
+              codes[cost - 1].use_uw = false;
+              codes[cost - 1].save_temporary = false;
+            }
+        }
     }
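The profitability test in the hunk above reduces to: two independent half
syntheses plus one pack must beat the current cost.  A rough host-side sketch
follows; cost_half is a deliberately crude stand-in for riscv_build_integer_1
(an assumption for illustration, not GCC's real costing):

    #include <stdint.h>

    /* Crude cost for one sign-extended 32-bit half.  */
    static int cost_half (int32_t half)
    {
      if (half >= -2048 && half < 2048)
        return 1;                           /* one li/addi  */
      if ((half & 0xfff) == 0)
        return 1;                           /* one lui  */
      return 2;                             /* lui + addi  */
    }

    static int zbkb_cost (uint64_t value, int current_cost)
    {
      int32_t lo = (int32_t) (value & 0xffffffff);  /* sext_hwi (loval, 32) */
      int32_t hi = (int32_t) (value >> 32);         /* sext_hwi (hival, 32) */
      int candidate = cost_half (lo) + cost_half (hi) + 1;  /* +1 for pack */
      return candidate < current_cost ? candidate : current_cost;
    }

    int main (void)
    {
      /* Both halves need lui+addi, so the Zbkb path costs
         lui+addi+lui+addi+pack = 5, the worst case from the
         commit message.  */
      return zbkb_cost (0xf057f2def857f2deUL, 8) == 5 ? 0 : 1;
    }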
@@ -2656,23 +2719,25 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
     x = riscv_split_integer (value, mode);
   else
     {
-      codes[0].value = trunc_int_for_mode (codes[0].value, mode);
-      /* Apply each binary operation to X.  */
-      x = GEN_INT (codes[0].value);
-
-      for (i = 1; i < num_ops; i++)
+      rtx old_value = NULL_RTX;
+      for (i = 0; i < num_ops; i++)
         {
-          if (!can_create_pseudo_p ())
+          if (i != 0 && !can_create_pseudo_p ())
             x = riscv_emit_set (temp, x);
-          else
+          else if (i != 0)
             x = force_reg (mode, x);

           codes[i].value = trunc_int_for_mode (codes[i].value, mode);
-          /* If the sequence requires using a "uw" form of an insn, we're
-             going to have to construct the RTL ourselves and put it in
-             a register to avoid force_reg/force_operand from mucking things
-             up.  */
-          if (codes[i].use_uw)
+          if (codes[i].code == UNKNOWN)
             {
+              /* UNKNOWN means load the constant value into X.  */
+              x = GEN_INT (codes[i].value);
+            }
+          else if (codes[i].use_uw)
+            {
+              /* If the sequence requires using a "uw" form of an insn, we're
+                 going to have to construct the RTL ourselves and put it in
+                 a register to avoid force_reg/force_operand from mucking
+                 things up.  */
               gcc_assert (TARGET_64BIT || TARGET_ZBA);
               rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
@@ -2695,16 +2760,27 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
               rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
               x = riscv_emit_set (t, x);
             }
-          else if (codes[i].code == CONCAT)
+          else if (codes[i].code == CONCAT || codes[i].code == VEC_MERGE)
             {
               rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
-              rtx t2 = gen_lowpart (SImode, x);
+              rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
+              gcc_assert (t2);
+              t2 = gen_lowpart (SImode, t2);
               emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
               x = t;
             }
           else
             x = gen_rtx_fmt_ee (codes[i].code, mode,
                                 x, GEN_INT (codes[i].value));
+
+          /* If this entry in the code table indicates we should save away
+             the temporary holding the current value of X, then do so.  */
+          if (codes[i].save_temporary)
+            {
+              gcc_assert (old_value == NULL_RTX);
+              x = force_reg (mode, x);
+              old_value = x;
+            }
         }
     }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-10.c b/gcc/testsuite/gcc.target/riscv/synthesis-10.c
new file mode 100644
index 00000000000..0838ec9af47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-10.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbkb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori)" 10 } } */
+
+unsigned long foo_0x1425000028000000(void) { return 0x1425000028000000UL; }
+unsigned long foo_0xf057f2def857f2de(void) { return 0xf057f2def857f2deUL; }
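For completeness, here is a host-side trace, again our own sketch rather than
GCC code, of how riscv_move_integer consumes the new codes[] entries for
0x1425000028000000: the low half is built and saved via save_temporary, the
high half is built next, and VEC_MERGE packs the saved low half with it:

    #include <assert.h>
    #include <stdint.h>

    int main (void)
    {
      uint64_t x, old_value;

      /* codes[0]: UNKNOWN -- load the low half (a single li).  */
      x = 0x28000000;

      /* codes[0].save_temporary -- keep the low half live.  */
      old_value = x;

      /* codes[1]: UNKNOWN -- load the high half into a fresh value.  */
      x = 0x14250000;

      /* codes[2]: VEC_MERGE -- pack old_value's low 32 bits below x's,
         mirroring gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2).  */
      x = ((uint64_t) (uint32_t) x << 32) | (uint32_t) old_value;

      assert (x == 0x1425000028000000UL);
      return 0;
    }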