On Mon, 23 Jun 2025, Tamar Christina wrote:
> This patch fixes a bug where the current code assumed that exact_log2 returns
> NULL on failure, but it instead returns -1. So there are some cases where the
> right shift could shift out the entire value.
>
> Secondly it also removes the requirement that VF be a power of two. With an
> uneven unroll factor we can easily end up with a non-power of two VF which SLP
> can handle. This replaces shifts with multiplication and division.
>
> The 32-bit x86 testcase from PR64110 was always wrong, it used to match by
> pure
> coincidence a vmovd inside the vector loop. What it intended to match was
> that
> the argument to the function isn't spilled and then reloaded from the stack
> for
> no reason.
>
> But on 32-bit x86 all arguments are passed on the stack anyway and so the
> match
> would have never worked. The patch seems to simplify the loop preheader which
> gets it to remove an intermediate zero extend which causes the match to now
> properly fail.
>
> As such I'm skipping the test on 32-bit x86.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 and no issues.
>
> Ok for master?
OK.
Thanks,
Richard.
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * tree-vect-loop-manip.cc (vect_gen_vector_loop_niters,
> vect_gen_vector_loop_niters_mult_vf): Remove uses of log_vf.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr64110.c: Update testcase.
>
> ---
> diff --git a/gcc/testsuite/gcc.target/i386/pr64110.c
> b/gcc/testsuite/gcc.target/i386/pr64110.c
> index
> 99e391916cb7a94e7dd40207f35f29f62acc412c..11a6929835f4e6490d3ff5022a4f33050559b022
> 100644
> --- a/gcc/testsuite/gcc.target/i386/pr64110.c
> +++ b/gcc/testsuite/gcc.target/i386/pr64110.c
> @@ -1,6 +1,6 @@
> /* { dg-do compile } */
> /* { dg-options "-O3 -march=core-avx2" } */
> -/* { dg-final { scan-assembler "vmovd\[\\t \]" } } */
> +/* { dg-final { scan-assembler "vmovd\[\\t \]" { target { ! ilp32 } } } } */
>
> int foo (void);
> int a;
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index
> 56a4e9a8b63f3cae0bf596bf5d22893887dc80e8..90034fe2d681e2740f54486cfd9e7ddc778e84e1
> 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -2794,7 +2794,6 @@ vect_gen_vector_loop_niters (loop_vec_info loop_vinfo,
> tree niters,
> tree niters_vector, step_vector, type = TREE_TYPE (niters);
> poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
> - tree log_vf = NULL_TREE;
>
> /* If epilogue loop is required because of data accesses with gaps, we
> subtract one iteration from the total number of iterations here for
> @@ -2820,22 +2819,25 @@ vect_gen_vector_loop_niters (loop_vec_info
> loop_vinfo, tree niters,
> if (vf.is_constant (&const_vf)
> && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
> {
> - /* Create: niters >> log2(vf) */
> + /* Create: niters / vf, which is equivalent to niters >> log2(vf) when
> + vf is a power of two, and when not we approximate using a
> + truncating division. */
> /* If it's known that niters == number of latch executions + 1 doesn't
> - overflow, we can generate niters >> log2(vf); otherwise we generate
> - (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
> + overflow, we can generate niters / vf; otherwise we generate
> + (niters - vf) / vf + 1 by using the fact that we know ratio
> will be at least one. */
> - log_vf = build_int_cst (type, exact_log2 (const_vf));
> + tree var_vf = build_int_cst (type, const_vf);
> if (niters_no_overflow)
> - niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
> + niters_vector = fold_build2 (TRUNC_DIV_EXPR, type, ni_minus_gap,
> + var_vf);
> else
> niters_vector
> = fold_build2 (PLUS_EXPR, type,
> - fold_build2 (RSHIFT_EXPR, type,
> + fold_build2 (TRUNC_DIV_EXPR, type,
> fold_build2 (MINUS_EXPR, type,
> ni_minus_gap,
> - build_int_cst (type, vf)),
> - log_vf),
> + var_vf),
> + var_vf),
> build_int_cst (type, 1));
> step_vector = build_one_cst (type);
> }
> @@ -2854,16 +2856,17 @@ vect_gen_vector_loop_niters (loop_vec_info
> loop_vinfo, tree niters,
> /* Peeling algorithm guarantees that vector loop bound is at least ONE,
> we set range information to make niters analyzer's life easier.
> Note the number of latch iteration value can be TYPE_MAX_VALUE so
> - we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf. */
> - if (stmts != NULL && log_vf)
> + we have to represent the vector niter TYPE_MAX_VALUE + 1 / vf. */
> + if (stmts != NULL && const_vf > 0)
> {
> if (niters_no_overflow)
> {
> int_range<1> vr (type,
> wi::one (TYPE_PRECISION (type)),
> - wi::rshift (wi::max_value (TYPE_PRECISION (type),
> - TYPE_SIGN (type)),
> - exact_log2 (const_vf),
> + wi::div_trunc (wi::max_value
> + (TYPE_PRECISION (type),
> + TYPE_SIGN (type)),
> + const_vf,
> TYPE_SIGN (type)));
> set_range_info (niters_vector, vr);
> }
> @@ -2901,13 +2904,12 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info
> loop_vinfo,
> /* We should be using a step_vector of VF if VF is variable. */
> int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
> tree type = TREE_TYPE (niters_vector);
> - tree log_vf = build_int_cst (type, exact_log2 (vf));
> tree tree_vf = build_int_cst (type, vf);
> basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
>
> gcc_assert (niters_vector_mult_vf_ptr != NULL);
> - tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
> - niters_vector, log_vf);
> + tree niters_vector_mult_vf = fold_build2 (MULT_EXPR, type,
> + niters_vector, tree_vf);
>
> /* If we've peeled a vector iteration then subtract one full vector
> iteration. */
>
>
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)