On Mon, Dec 29, 2025 at 7:19 AM Tamar Christina <[email protected]> wrote:
>
> Since g:b219cbeda72d23b7ad6ff12cd159784b7ef00667
>
> The following
>
> void f(const int *restrict in,
> int *restrict out,
> int n, int threshold)
> {
> for (int i = 0; i < n; ++i) {
> int v = in[i];
> if (v > threshold) {
> int t = v * 3;
> t += 7;
> t ^= 0x55;
> t *= 0x55;
> t -= 0x5;
> t &= 0xFE;
> t ^= 0x55;
> out[i] = t;
> } else {
> out[i] = v;
> }
> }
> }
>
> compiled at -O2
>
> results in aggressive if-conversion which increases the number of dynamic
> instructions and the latency of the loop, since t now has to be calculated
> in all cases.
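>
> Roughly, the converted loop behaves like the following C sketch (an
> illustration of the effect only, not the actual code GCC emits):
>
> void f_ifcvt(const int *restrict in,
>              int *restrict out,
>              int n, int threshold)
> {
>   for (int i = 0; i < n; ++i) {
>     int v = in[i];
>     int t = v * 3;     /* the whole chain now runs unconditionally */
>     t += 7;
>     t ^= 0x55;
>     t *= 0x55;
>     t -= 0x5;
>     t &= 0xFE;
>     t ^= 0x55;
>     out[i] = v > threshold ? t : v;   /* a csel picks t or v at the end */
>   }
> }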
>
> This has led to big performance losses in packages like zstd [1], which in
> turn affects packaging and LTO speed.
>
> The default cost model for if-conversion is overly permissive and allows
> if-conversion on the assumption that branches are very expensive.
>
> This patch implements an if-conversion cost model for AArch64.  AArch64 has a
> number of conditional instructions that need to be accounted for; however, this
> initial version keeps things simple and is only really concerned with csel.
>
> The issue specifically with csel is that it may have to wait for two arguments
> to be evaluated before it can be executed.  This means it correlates directly
> with increases in dynamic instructions.
>
> To fix this I add a new tuning parameter that gives a rough estimate of the
> misprediction cost of a branch.  We then accept if-conversion only while the
> cost of the converted sequence is cheaper than the branch cost multiplied by
> this factor.
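>
> As a minimal sketch of the resulting limit, using the generic tuning values
> added below (predictable = 1, unpredictable = 3, br_mispredict_factor = 6;
> the helper name is illustrative only):
>
> static unsigned int
> max_seq_cost_sketch (int predictable_p)
> {
>   int branch_cost = predictable_p ? 1 : 3;  /* generic_branch_cost values.  */
>   return branch_cost * 6;                   /* * br_mispredict_factor: 6 or 18.  */
> }
>
> The converted sequence is then kept only while its cost is at or below this
> limit.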
>
> There is basic detection of CINC and CSET because these usually are ok.  We
> also accept all if-conversion when not inside a loop.  Because CE is not an
> RTL SSA pass we can't do more extensive checks like checking whether the csel
> is a loop-carried dependency.  As such this is a best-effort check that
> intends to catch the most egregious cases like the above.
>
> This recovers the ~25% performance loss in zstd decoding and gives better
> results than GCC 14, which predates the regression.
>
> Additionally I've benchmarked all the attached examples on a number of cores
> and checked various cases.  On average the patch gives an improvement of
> 20-40%.
>
> [1] https://github.com/facebook/zstd/pull/4418#issuecomment-3004606000
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master? Will commit after a week if no comment.
Since I have been looking into ifcvt lately, let me try to review this one.
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR target/123017
> * config/aarch64/aarch64-json-schema.h: Add br_mispredict_factor.
> * config/aarch64/aarch64-json-tunings-parser-generated.inc
> (parse_branch_costs): Add br_mispredict_factor.
> * config/aarch64/aarch64-json-tunings-printer-generated.inc
> (serialize_branch_costs): Add br_mispredict_factor.
> * config/aarch64/aarch64-protos.h (struct cpu_branch_cost): Add
> br_mispredict_factor.
> * config/aarch64/aarch64.cc (aarch64_max_noce_ifcvt_seq_cost,
> aarch64_noce_conversion_profitable_p,
> TARGET_MAX_NOCE_IFCVT_SEQ_COST,
> TARGET_NOCE_CONVERSION_PROFITABLE_P): New.
> * config/aarch64/tuning_models/generic.h (generic_branch_cost): Add
> br_mispredict_factor.
> * config/aarch64/tuning_models/generic_armv8_a.h: Remove
> generic_armv8_a_branch_cost and use generic_branch_cost.
>
> gcc/testsuite/ChangeLog:
>
> PR target/123017
> * gcc.target/aarch64/pr123017_1.c: New test.
> * gcc.target/aarch64/pr123017_2.c: New test.
> * gcc.target/aarch64/pr123017_3.c: New test.
> * gcc.target/aarch64/pr123017_4.c: New test.
> * gcc.target/aarch64/pr123017_5.c: New test.
> * gcc.target/aarch64/pr123017_6.c: New test.
> * gcc.target/aarch64/pr123017_7.c: New test.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-json-schema.h
> b/gcc/config/aarch64/aarch64-json-schema.h
> index
> 0c1863fad18aaec9bb73f0df7310ff0554d1ffb4..32fb39300feb0c5a4aa68f6cf524c158de1bcfa1
> 100644
> --- a/gcc/config/aarch64/aarch64-json-schema.h
> +++ b/gcc/config/aarch64/aarch64-json-schema.h
> @@ -222,7 +222,11 @@ static const char *schema_json = R"json(
> }
> }
> },
> - "branch_costs": { "predictable": "int", "unpredictable": "int" },
> + "branch_costs": {
> + "predictable": "int",
> + "unpredictable": "int",
> + "br_mispredict_factor": "int"
> + },
> "approx_modes": { "division": "int", "sqrt": "int", "recip_sqrt": "int"
> },
> "sve_width": "uint",
> "memmov_cost": {
> diff --git a/gcc/config/aarch64/aarch64-json-tunings-parser-generated.inc
> b/gcc/config/aarch64/aarch64-json-tunings-parser-generated.inc
> index
> cf31e539d3ba6b78a2c6bb069165855a30a20f35..882d8f799755a3d304cb3adf7d8b6b79f6d80443
> 100644
> --- a/gcc/config/aarch64/aarch64-json-tunings-parser-generated.inc
> +++ b/gcc/config/aarch64/aarch64-json-tunings-parser-generated.inc
> @@ -237,6 +237,7 @@ parse_branch_costs (const json::object *jo, T
> &branch_costs)
> {
> PARSE_INTEGER_FIELD (jo, "predictable", branch_costs.predictable);
> PARSE_INTEGER_FIELD (jo, "unpredictable", branch_costs.unpredictable);
> + PARSE_INTEGER_FIELD (jo, "br_mispredict_factor", branch_costs.br_mispredict_factor);
> }
>
> template <typename T>
> diff --git a/gcc/config/aarch64/aarch64-json-tunings-printer-generated.inc
> b/gcc/config/aarch64/aarch64-json-tunings-printer-generated.inc
> index
> 6ffc4427a8bdb97a433f400dd65719adcf60156e..498be96ff33ef5e169882cd5a1cd0071411e6534
> 100644
> --- a/gcc/config/aarch64/aarch64-json-tunings-printer-generated.inc
> +++ b/gcc/config/aarch64/aarch64-json-tunings-printer-generated.inc
> @@ -287,6 +287,7 @@ serialize_branch_costs (const T &branch_costs)
>
> SERIALIZE_INTEGER_FIELD (branch_costs_obj, "predictable",
> branch_costs.predictable);
> SERIALIZE_INTEGER_FIELD (branch_costs_obj, "unpredictable",
> branch_costs.unpredictable);
> + SERIALIZE_INTEGER_FIELD (branch_costs_obj, "br_mispredict_factor", branch_costs.br_mispredict_factor);
>
> return branch_costs_obj;
> }
> diff --git a/gcc/config/aarch64/aarch64-protos.h
> b/gcc/config/aarch64/aarch64-protos.h
> index
> 89a0aed35326e3e68ede1a8f93af08e77f9040fd..8a59c3657392728ba5e6f714ab7d4aeb8fb278e9
> 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -481,6 +481,7 @@ struct cpu_branch_cost
> {
> int predictable; /* Predictable branch or optimizing for size. */
> int unpredictable; /* Unpredictable branch or optimizing for speed. */
> + int br_mispredict_factor; /* Scale factor for cost of misprediction on branches. */
> };
>
> /* Control approximate alternatives to certain FP operators. */
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index
> 8b7253b11fef1b7e0cd647251436da19ee177f0d..56469445d83a4dab933b2bbbbd2ac670214fd807
> 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -99,6 +99,7 @@
> #include "ipa-prop.h"
> #include "ipa-fnsummary.h"
> #include "hash-map.h"
> +#include "ifcvt.h"
> #include "aarch64-sched-dispatch.h"
> #include "aarch64-json-tunings-printer.h"
> #include "aarch64-json-tunings-parser.h"
> @@ -2269,6 +2270,112 @@ aarch64_instruction_selection (function * /* fun */,
> gimple_stmt_iterator *gsi)
> return true;
> }
>
> +/* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. If an explicit max was set then
> + honor it, otherwise apply a tuning specific scale to branch costs. */
> +
> +static unsigned int
> +aarch64_max_noce_ifcvt_seq_cost (edge e)
> +{
> + bool predictable_p = predictable_edge_p (e);
> + if (predictable_p)
> + {
> + if (OPTION_SET_P (param_max_rtl_if_conversion_predictable_cost))
> + return param_max_rtl_if_conversion_predictable_cost;
> + }
> + else
> + {
> + if (OPTION_SET_P (param_max_rtl_if_conversion_unpredictable_cost))
> + return param_max_rtl_if_conversion_unpredictable_cost;
> + }
> +
> + /* For modern machines with long speculative execution chains and modern
> + branch prediction the penalty of the branch misprediction needs to be
> + weighed against the cost of executing the instructions unconditionally.
> + RISC cores tend not to have such deep pipelines and so the cost of
> + mispredictions can be reasonably cheap. */
> +
> + bool speed_p = optimize_function_for_speed_p (cfun);
> + return BRANCH_COST (speed_p, predictable_p)
> + * aarch64_tune_params.branch_costs->br_mispredict_factor;
> +}
> +
> +/* Return true if SEQ is a good candidate as a replacement for the
> + if-convertible sequence described in IF_INFO. AArch64 has a range of
> + branchless instructions and not all of them are potentially problematic.
> + For instance a cset is usually beneficial whereas a csel is more
> + complicated. */
> +
> +static bool
> +aarch64_noce_conversion_profitable_p (rtx_insn *seq,
> + struct noce_if_info *if_info)
> +{
> + /* If not in a loop, just accept all if-conversion as the branch predictor
> + won't have anything to train on. So assume sequences are essentially
> + unpredictable. ce1 is in CFG mode still while ce2 is outside. For ce2
> + accept limited if-conversion based on the shape of the instruction. */
> + if (current_loops
> + && (!if_info->test_bb->loop_father
> + || !if_info->test_bb->loop_father->header))
> + return true;
> +
> + /* For now we only care about csel speciifcally. */
spelling mistake.
> + bool is_csel_p = true;
> +
> + if (if_info->then_simple
> + && if_info->a != NULL_RTX
> + && !REG_P (if_info->a))
> + is_csel_p = false;
REG_P might be too conservative. E.g. a SUBREG might show up here and I
suspect we want that to be considered a csel too,
or a constant 0, -1 or 1.
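A minimal example of the constant-0 case (my illustration, not from the
patch's testcases):
```
long h (long c, long y)
{
  /* if_info->b would be (const_int 0) rather than a REG here, yet the
     result is still a single cmp + csel against xzr.  */
  return c ? y : 0;
}
```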
> +
> + if (if_info->else_simple
> + && if_info->b != NULL_RTX
> + && !REG_P (if_info->b))
> + is_csel_p = false;
> +
> + for (rtx_insn *insn = seq; is_csel_p && insn; insn = NEXT_INSN (insn))
> + {
> + rtx set = single_set (insn);
> + if (!set)
> + continue;
> +
> + rtx src = SET_SRC (set);
> + rtx dst = SET_DEST (set);
> + machine_mode mode = GET_MODE (src);
> + if (GET_MODE_CLASS (mode) != MODE_INT)
> + continue;
> +
> + switch (GET_CODE (src))
> + {
> + case PLUS:
> + case MINUS:
> + {
> + /* Likely a CINC. */
> + if (REG_P (dst)
> + && REG_P (XEXP (src, 0))
> + && XEXP (src, 1) == CONST1_RTX (mode))
> + is_csel_p = false;
> + break;
> + }
> + default:
> + break;
> + }
> + }
This loop has one issue: we might have the csel before the operation.
E.g. we might have:
```
csel x0, x1, xzr, ne
orr x0, x0, x2
```
That is the same as:
x0 = NE ? x1 | x2 : x2
as produced by noce_try_cond_zero_arith. Maybe that will be accepted
by the other case below.
But that might also be rejected by the above if_info->a/if_info->b
check, which would be `REG` and `REG IOR REG`.
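For reference, a source-level pattern that typically gives that shape (a
hedged C illustration, not one of the patch's testcases):
```
long g (long c, long x1, long x2)
{
  /* noce_try_cond_zero_arith can turn this into a conditional zero of x1
     (csel against xzr) followed by the orr, instead of a branch.  */
  return c ? (x1 | x2) : x2;
}
```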
Thanks,
Andrew Pinski
> +
> + /* For now accept every variant but csel unconditionally because CSEL
> + usually means you have to wait for two values to be computed. */
> + if (!is_csel_p)
> + return true;
> +
> + /* TODO: Add detection of csel, cinc, cset etc. and take profiling into
> + consideration. For now this basic costing is enough to cover
> + most cases. */
> + if (if_info->speed_p)
> + {
> + unsigned cost = seq_cost (seq, true);
> + return cost <= if_info->max_seq_cost;
> + }
> +
> + return default_noce_conversion_profitable_p (seq, if_info);
> +}
> +
> /* Implement TARGET_HARD_REGNO_NREGS. */
>
> static unsigned int
> @@ -33453,6 +33560,13 @@ aarch64_libgcc_floating_mode_supported_p
> #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
> aarch64_atomic_assign_expand_fenv
>
> +/* If-conversion costs. */
> +#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
> +#define TARGET_MAX_NOCE_IFCVT_SEQ_COST aarch64_max_noce_ifcvt_seq_cost
> +
> +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
> +#define TARGET_NOCE_CONVERSION_PROFITABLE_P \
> + aarch64_noce_conversion_profitable_p
> +
> /* Section anchor support. */
>
> #undef TARGET_MIN_ANCHOR_OFFSET
> diff --git a/gcc/config/aarch64/tuning_models/generic.h
> b/gcc/config/aarch64/tuning_models/generic.h
> index
> 4a880b99673c6be6950c8c832f393f29b6406998..e90f406b34c794a965129da2899a8dc1b7c9fd87
> 100644
> --- a/gcc/config/aarch64/tuning_models/generic.h
> +++ b/gcc/config/aarch64/tuning_models/generic.h
> @@ -128,7 +128,8 @@ static const struct cpu_vector_cost generic_vector_cost =
> static const struct cpu_branch_cost generic_branch_cost =
> {
> 1, /* Predictable. */
> - 3 /* Unpredictable. */
> + 3, /* Unpredictable. */
> + 6, /* br_mispredict_factor. */
> };
>
> /* Generic approximation modes. */
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> index
> a448d046db7a7194b786fa9528990fdbc2d37105..54567f5160026c8755adf63790aef46ff3b51865
> 100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> @@ -125,13 +125,6 @@ static const struct cpu_vector_cost
> generic_armv8_a_vector_cost =
> nullptr /* issue_info */
> };
>
> -/* Generic costs for branch instructions. */
> -static const struct cpu_branch_cost generic_armv8_a_branch_cost =
> -{
> - 1, /* Predictable. */
> - 3 /* Unpredictable. */
> -};
> -
> /* Generic approximation modes. */
> static const cpu_approx_modes generic_armv8_a_approx_modes =
> {
> @@ -158,7 +151,7 @@ static const struct tune_params generic_armv8_a_tunings =
> &generic_armv8_a_addrcost_table,
> &generic_armv8_a_regmove_cost,
> &generic_armv8_a_vector_cost,
> - &generic_armv8_a_branch_cost,
> + &generic_branch_cost,
> &generic_armv8_a_approx_modes,
> SVE_NOT_IMPLEMENTED, /* sve_width */
> { 4, /* load_int. */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr123017_1.c
> b/gcc/testsuite/gcc.target/aarch64/pr123017_1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..38e4334c1ea7979a5c535a06e85d027bb4f8a4cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr123017_1.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "csel\t" 2 } } */
> +
> +#include <stdint.h>
> +#include <stddef.h>
> +
> +// Branchy: compiler can emit a short compare+branch.
> +size_t branchy(size_t cur, size_t dist, size_t fallback, const uint8_t *src, uint8_t *dst, size_t len) {
> + if (dist < cur && dist < fallback) {
> + const uint8_t *p = src - dist;
> + for (size_t i = 0; i < len; ++i)
> + dst[i] = p[i];
> + return dist;
> + } else {
> + const uint8_t *p = src - fallback;
> + for (size_t i = 0; i < len; ++i)
> + dst[i] = p[i];
> + return fallback;
> + }
> +}
> +
> +// CSEL-heavy: chain ternaries so both paths stay live; compilers emit cmps + csel.
> +size_t selecty(size_t cur, size_t dist, size_t fallback, const uint8_t *src, uint8_t *dst, size_t len) {
> + // Compute both candidates unconditionally, then pick with ternaries.
> + size_t candA_off = dist;
> + size_t candB_off = fallback;
> + const uint8_t *candA = src - candA_off;
> + const uint8_t *candB = src - candB_off;
> +
> + size_t useA = (dist < cur && dist < fallback);
> + const uint8_t *p = useA ? candA : candB;
> + size_t chosen = useA ? candA_off : candB_off;
> +
> + for (size_t i = 0; i < len; ++i)
> + dst[i] = p[i];
> + return chosen;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr123017_2.c
> b/gcc/testsuite/gcc.target/aarch64/pr123017_2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..72adb418e9e84bd54c2980232a053d88b85ed0b0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr123017_2.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "csel\t" 0 } } */
> +
> +void f(const int *restrict in,
> + int *restrict out,
> + int n, int threshold)
> +{
> + for (int i = 0; i < n; i+=4) {
> + out[i+0] = in[i+0] > threshold ? in[i+0] : in[i+0] + i;
> + out[i+1] = in[i+1] > threshold ? in[i+1] : in[i+1] + i;
> + out[i+2] = in[i+2] > threshold ? in[i+2] : in[i+2] + i;
> + out[i+3] = in[i+3] > threshold ? in[i+3] : in[i+3] + i;
> + }
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr123017_3.c
> b/gcc/testsuite/gcc.target/aarch64/pr123017_3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..ec39d785ee80cdc49db14bbe30d9a6c3668e64dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr123017_3.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "csel\t" 0 } } */
> +
> +void f(const int *restrict in,
> + int *restrict out,
> + int n, int threshold)
> +{
> + for (int i = 0; i < n; ++i) {
> + int v = in[i];
> + if (v > threshold) {
> + int t = v * 3;
> + t += 7;
> + t ^= 0x55;
> + t *= 0x55;
> + t -= 0x5;
> + t &= 0xFE;
> + t ^= 0x55;
> + out[i] = t;
> + } else {
> + int t = v * 5;
> + t += 4;
> + t ^= 0x65;
> + t *= 0x35;
> + t -= 0x7;
> + t &= 0x0E;
> + t ^= 0x45;
> + out[i] = t;
> + }
> + }
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr123017_4.c
> b/gcc/testsuite/gcc.target/aarch64/pr123017_4.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..b350d492ae46cf79f8107712fafc388554b76654
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr123017_4.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "csel\t" 0 } } */
> +
> +void f(const int *restrict in,
> + int *restrict out,
> + int n, int threshold)
> +{
> + for (int i = 0; i < n; ++i) {
> + int v = in[i];
> + if (v > threshold) {
> + int t = v * 3;
> + out[i] = t;
> + }
> + }
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr123017_5.c
> b/gcc/testsuite/gcc.target/aarch64/pr123017_5.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..6d7cd088bd024eeac5821c890cc25d608118dd24
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr123017_5.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "csel\t" 1 } } */
> +
> +void f(const int *restrict in,
> + int *restrict out,
> + int n, int threshold)
> +{
> + for (int i = 0; i < n; ++i) {
> + int v = in[i];
> + if (v > threshold) {
> + int t = v * 3;
> + t += 7;
> + out[i] = t;
> + } else {
> + out[i] = v;
> + }
> + }
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr123017_6.c
> b/gcc/testsuite/gcc.target/aarch64/pr123017_6.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..0402c4fdd86d53b2c5ad58792539cef035be115f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr123017_6.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "csel\t" 0 } } */
> +
> +void f(const int *restrict in,
> + int *restrict out,
> + int n, int threshold)
> +{
> + for (int i = 0; i < n; ++i) {
> + int v = in[i];
> + if (v > threshold) {
> + int t = v * 3;
> + t += 7;
> + t ^= 0x55;
> + t *= 0x55;
> + out[i] = t;
> + } else {
> + out[i] = v;
> + }
> + }
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr123017_7.c
> b/gcc/testsuite/gcc.target/aarch64/pr123017_7.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..048a69e1c79c11d3b9eddbd3375e05b778da79f4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr123017_7.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "csel\t" 0 } } */
> +
> +void f(const int *restrict in,
> + int *restrict out,
> + int n, int threshold)
> +{
> + for (int i = 0; i < n; ++i) {
> + int v = in[i];
> + if (v > threshold) {
> + int t = v * 3;
> + t += 7;
> + t ^= 0x55;
> + t *= 0x55;
> + t -= 0x5;
> + t &= 0xFE;
> + t ^= 0x55;
> + out[i] = t;
> + } else {
> + out[i] = v;
> + }
> + }
> +}
> +
>
>
> --