https://gcc.gnu.org/g:e311dd13a9adbc51d56971cba06e1ff15a4256d2
commit r15-3671-ge311dd13a9adbc51d56971cba06e1ff15a4256d2
Author: Jennifer Schmitz <jschm...@nvidia.com>
Date:   Mon Sep 2 06:46:57 2024 -0700

    SVE intrinsics: Fold svdiv with all-zero operands to zero vector

    This patch folds svdiv where one of the operands is all-zeros to a zero
    vector, if one of the following conditions holds:
    - the dividend is all zeros, or
    - the divisor is all zeros, and the predicate is ptrue or the
      predication is _x or _z.
    This case was not covered by the recent patch that implemented constant
    folding, because that covered only cases where both operands are
    constant vectors.  Here, the operation is folded as soon as one of the
    operands is a constant zero vector.
    Folding of division by 0 to return 0 is in accordance with the
    semantics of sdiv and udiv.

    The patch was bootstrapped and regtested on aarch64-linux-gnu, no
    regression.
    OK for mainline?

    Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>

    gcc/
	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
	Add folding of all-zero operands to zero vector.

    gcc/testsuite/
	* gcc.target/aarch64/sve/fold_div_zero.c: New test.
	* gcc.target/aarch64/sve/const_fold_div_1.c: Adjust expected
	outcome.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc |  29 +-
 .../gcc.target/aarch64/sve/const_fold_div_1.c   |  12 +-
 .../gcc.target/aarch64/sve/fold_div_zero.c      | 369 +++++++++++++++++++++
 3 files changed, 393 insertions(+), 17 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 8f781e26cc84..9f8af9b59319 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -758,30 +758,41 @@ public:
     if (auto *res = f.fold_const_binary (TRUNC_DIV_EXPR))
       return res;
 
-    /* If the divisor is a uniform power of 2, fold to a shift
-       instruction.  */
+    /* If the dividend is all zeros, fold to zero vector.  */
+    tree op1 = gimple_call_arg (f.call, 1);
+    if (integer_zerop (op1))
+      return gimple_build_assign (f.lhs, op1);
+
+    /* If the divisor is all zeros, fold to zero vector.  */
+    tree pg = gimple_call_arg (f.call, 0);
     tree op2 = gimple_call_arg (f.call, 2);
-    tree divisor_cst = uniform_integer_cst_p (op2);
+    if (integer_zerop (op2)
+	&& (f.pred != PRED_m
+	    || is_ptrue (pg, f.type_suffix (0).element_bytes)))
+      return gimple_build_assign (f.lhs, build_zero_cst (TREE_TYPE (f.lhs)));
 
-    if (!divisor_cst || !integer_pow2p (divisor_cst))
+    /* If the divisor is a uniform power of 2, fold to a shift
+       instruction.  */
+    tree op2_cst = uniform_integer_cst_p (op2);
+    if (!op2_cst || !integer_pow2p (op2_cst))
       return NULL;
 
     tree new_divisor;
     gcall *call;
 
-    if (f.type_suffix (0).unsigned_p && tree_to_uhwi (divisor_cst) != 1)
+    if (f.type_suffix (0).unsigned_p && tree_to_uhwi (op2_cst) != 1)
       {
 	function_instance instance ("svlsr", functions::svlsr,
 				    shapes::binary_uint_opt_n, MODE_n,
 				    f.type_suffix_ids, GROUP_none, f.pred);
 	call = f.redirect_call (instance);
-	tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
+	tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : op2_cst;
 	new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
       }
     else
       {
-	if (tree_int_cst_sign_bit (divisor_cst)
-	    || tree_to_shwi (divisor_cst) == 1)
+	if (tree_int_cst_sign_bit (op2_cst)
+	    || tree_to_shwi (op2_cst) == 1)
 	  return NULL;
 
 	function_instance instance ("svasrd", functions::svasrd,
@@ -789,7 +800,7 @@ public:
 				    f.type_suffix_ids, GROUP_none, f.pred);
 	call = f.redirect_call (instance);
 	new_divisor = wide_int_to_tree (scalar_types[VECTOR_TYPE_svuint64_t],
-					tree_log2 (divisor_cst));
+					tree_log2 (op2_cst));
       }
 
     gimple_call_set_arg (call, 2, new_divisor);
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
index c15b3fc3aa0a..92e0005c0fee 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
@@ -45,7 +45,7 @@ svint64_t s64_z_pg (svbool_t pg)
 
 /*
 ** s64_z_pg_0:
-**	mov	z[0-9]+\.d, p[0-7]/z, #0
+**	mov	z[0-9]+\.b, #0
 **	ret
 */
 svint64_t s64_z_pg_0 (svbool_t pg)
@@ -55,9 +55,7 @@ svint64_t s64_z_pg_0 (svbool_t pg)
 
 /*
 ** s64_z_pg_by0:
-**	mov	(z[0-9]+\.d), #5
-**	mov	(z[0-9]+)\.b, #0
-**	sdivr	\2\.d, p[0-7]/m, \2\.d, \1
+**	mov	z[0-9]+\.b, #0
 **	ret
 */
 svint64_t s64_z_pg_by0 (svbool_t pg)
@@ -149,7 +147,7 @@ svint64_t s64_z_pg_n (svbool_t pg)
 
 /*
 ** s64_z_pg_n_s64_0:
-**	mov	z[0-9]+\.d, p[0-7]/z, #0
+**	mov	z[0-9]+\.b, #0
 **	ret
 */
 svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
@@ -159,9 +157,7 @@ svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
 
 /*
 ** s64_z_pg_n_s64_by0:
-**	mov	(z[0-9]+\.d), #5
-**	mov	(z[0-9]+)\.b, #0
-**	sdivr	\2\.d, p[0-7]/m, \2\.d, \1
+**	mov	z[0-9]+\.b, #0
 **	ret
 */
 svint64_t s64_z_pg_n_s64_by0 (svbool_t pg)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c
new file mode 100644
index 000000000000..0dcd018cadc8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c
@@ -0,0 +1,369 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O2" } */
+
+#include "arm_sve.h"
+
+/*
+** s64_x_pg_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_x_pg_op1 (svbool_t pg, svint64_t op2)
+{
+  return svdiv_x (pg, svdup_s64 (0), op2);
+}
+
+/*
+** s64_z_pg_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_z_pg_op1 (svbool_t pg, svint64_t op2)
+{
+  return svdiv_z (pg, svdup_s64 (0), op2);
+}
+
+/*
+** s64_m_pg_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_m_pg_op1 (svbool_t pg, svint64_t op2)
+{
+  return svdiv_m (pg, svdup_s64 (0), op2);
+}
+
+/*
+** s64_x_ptrue_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_x_ptrue_op1 (svint64_t op2)
+{
+  return svdiv_x (svptrue_b64 (), svdup_s64 (0), op2);
+}
+
+/*
+** s64_z_ptrue_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_z_ptrue_op1 (svint64_t op2)
+{
+  return svdiv_z (svptrue_b64 (), svdup_s64 (0), op2);
+}
+
+/*
+** s64_m_ptrue_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_m_ptrue_op1 (svint64_t op2)
+{
+  return svdiv_m (svptrue_b64 (), svdup_s64 (0), op2);
+}
+
+/*
+** s64_x_pg_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_x_pg_op2 (svbool_t pg, svint64_t op1)
+{
+  return svdiv_x (pg, op1, svdup_s64 (0));
+}
+
+/*
+** s64_z_pg_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_z_pg_op2 (svbool_t pg, svint64_t op1)
+{
+  return svdiv_z (pg, op1, svdup_s64 (0));
+}
+
+/*
+** s64_m_pg_op2:
+**	mov	(z[0-9]+)\.b, #0
+**	sdiv	(z[0-9]+\.d), p[0-7]/m, \2, \1\.d
+**	ret
+*/
+svint64_t s64_m_pg_op2 (svbool_t pg, svint64_t op1)
+{
+  return svdiv_m (pg, op1, svdup_s64 (0));
+}
+
+/*
+** s64_x_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_x_ptrue_op2 (svint64_t op1)
+{
+  return svdiv_x (svptrue_b64 (), op1, svdup_s64 (0));
+}
+
+/*
+** s64_z_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_z_ptrue_op2 (svint64_t op1)
+{
+  return svdiv_z (svptrue_b64 (), op1, svdup_s64 (0));
+}
+
+/*
+** s64_m_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_m_ptrue_op2 (svint64_t op1)
+{
+  return svdiv_m (svptrue_b64 (), op1, svdup_s64 (0));
+}
+
+/*
+** s64_n_x_pg_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_n_x_pg_op2 (svbool_t pg, svint64_t op1)
+{
+  return svdiv_n_s64_x (pg, op1, 0);
+}
+
+/*
+** s64_n_z_pg_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_n_z_pg_op2 (svbool_t pg, svint64_t op1)
+{
+  return svdiv_n_s64_z (pg, op1, 0);
+}
+
+/*
+** s64_n_m_pg_op2:
+**	mov	(z[0-9]+)\.b, #0
+**	sdiv	(z[0-9]+\.d), p[0-7]/m, \2, \1\.d
+**	ret
+*/
+svint64_t s64_n_m_pg_op2 (svbool_t pg, svint64_t op1)
+{
+  return svdiv_n_s64_m (pg, op1, 0);
+}
+
+/*
+** s64_n_x_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_n_x_ptrue_op2 (svint64_t op1)
+{
+  return svdiv_n_s64_x (svptrue_b64 (), op1, 0);
+}
+
+/*
+** s64_n_z_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_n_z_ptrue_op2 (svint64_t op1)
+{
+  return svdiv_n_s64_z (svptrue_b64 (), op1, 0);
+}
+
+/*
+** s64_n_m_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_n_m_ptrue_op2 (svint64_t op1)
+{
+  return svdiv_n_s64_m (svptrue_b64 (), op1, 0);
+}
+
+/*
+** u64_x_pg_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_x_pg_op1 (svbool_t pg, svuint64_t op2)
+{
+  return svdiv_x (pg, svdup_u64 (0), op2);
+}
+
+/*
+** u64_z_pg_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_z_pg_op1 (svbool_t pg, svuint64_t op2)
+{
+  return svdiv_z (pg, svdup_u64 (0), op2);
+}
+
+/*
+** u64_m_pg_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_m_pg_op1 (svbool_t pg, svuint64_t op2)
+{
+  return svdiv_m (pg, svdup_u64 (0), op2);
+}
+
+/*
+** u64_x_ptrue_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_x_ptrue_op1 (svuint64_t op2)
+{
+  return svdiv_x (svptrue_b64 (), svdup_u64 (0), op2);
+}
+
+/*
+** u64_z_ptrue_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_z_ptrue_op1 (svuint64_t op2)
+{
+  return svdiv_z (svptrue_b64 (), svdup_u64 (0), op2);
+}
+
+/*
+** u64_m_ptrue_op1:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_m_ptrue_op1 (svuint64_t op2)
+{
+  return svdiv_m (svptrue_b64 (), svdup_u64 (0), op2);
+}
+
+/*
+** u64_x_pg_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_x_pg_op2 (svbool_t pg, svuint64_t op1)
+{
+  return svdiv_x (pg, op1, svdup_u64 (0));
+}
+
+/*
+** u64_z_pg_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_z_pg_op2 (svbool_t pg, svuint64_t op1)
+{
+  return svdiv_z (pg, op1, svdup_u64 (0));
+}
+
+/*
+** u64_m_pg_op2:
+**	mov	(z[0-9]+)\.b, #0
+**	udiv	(z[0-9]+\.d), p[0-7]/m, \2, \1\.d
+**	ret
+*/
+svuint64_t u64_m_pg_op2 (svbool_t pg, svuint64_t op1)
+{
+  return svdiv_m (pg, op1, svdup_u64 (0));
+}
+
+/*
+** u64_x_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_x_ptrue_op2 (svuint64_t op1)
+{
+  return svdiv_x (svptrue_b64 (), op1, svdup_u64 (0));
+}
+
+/*
+** u64_z_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_z_ptrue_op2 (svuint64_t op1)
+{
+  return svdiv_z (svptrue_b64 (), op1, svdup_u64 (0));
+}
+
+/*
+** u64_m_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_m_ptrue_op2 (svuint64_t op1)
+{
+  return svdiv_m (svptrue_b64 (), op1, svdup_u64 (0));
+}
+
+/*
+** u64_n_x_pg_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_n_x_pg_op2 (svbool_t pg, svuint64_t op1)
+{
+  return svdiv_n_u64_x (pg, op1, 0);
+}
+
+/*
+** u64_n_z_pg_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_n_z_pg_op2 (svbool_t pg, svuint64_t op1)
+{
+  return svdiv_n_u64_z (pg, op1, 0);
+}
+
+/*
+** u64_n_m_pg_op2:
+**	mov	(z[0-9]+)\.b, #0
+**	udiv	(z[0-9]+\.d), p[0-7]/m, \2, \1\.d
+**	ret
+*/
+svuint64_t u64_n_m_pg_op2 (svbool_t pg, svuint64_t op1)
+{
+  return svdiv_n_u64_m (pg, op1, 0);
+}
+
+/*
+** u64_n_x_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_n_x_ptrue_op2 (svuint64_t op1)
+{
+  return svdiv_n_u64_x (svptrue_b64 (), op1, 0);
+}
+
+/*
+** u64_n_z_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_n_z_ptrue_op2 (svuint64_t op1)
+{
+  return svdiv_n_u64_z (svptrue_b64 (), op1, 0);
+}
+
+/*
+** u64_n_m_ptrue_op2:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svuint64_t u64_n_m_ptrue_op2 (svuint64_t op1)
+{
+  return svdiv_n_u64_m (svptrue_b64 (), op1, 0);
+}
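For reference, the behaviour exercised by the testsuite above can also be seen with a minimal standalone sketch; the file name, function names, and the -march flag below are illustrative assumptions and not part of the patch. With this change, the first two functions should compile to a single "mov z0.b, #0", while the third one keeps an sdiv, because _m predication with an arbitrary predicate has to preserve the inactive lanes of op1:

/* fold_div_zero_demo.c: illustrative only, not part of the commit.
   Compile with something like: gcc -O2 -march=armv8.2-a+sve -S fold_div_zero_demo.c  */
#include <arm_sve.h>

/* Dividend is all zeros: folded to a zero vector for any predication.  */
svint64_t zero_dividend (svbool_t pg, svint64_t op2)
{
  return svdiv_x (pg, svdup_s64 (0), op2);
}

/* Divisor is all zeros with _z predication: folded to a zero vector,
   matching the sdiv semantics of x / 0 == 0.  */
svint64_t zero_divisor_z (svbool_t pg, svint64_t op1)
{
  return svdiv_z (pg, op1, svdup_s64 (0));
}

/* Divisor is all zeros with _m predication and a non-ptrue predicate:
   not folded, since inactive lanes must keep the value of op1.  */
svint64_t zero_divisor_m (svbool_t pg, svint64_t op1)
{
  return svdiv_m (pg, op1, svdup_s64 (0));
}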