On Wed, 22 Oct 2025, Pengfei Li wrote:
> When compiling the following code with SIMDe on AArch64:
>
> __m128i lo = _mm_srli_si128(a, 12);
> __m128i hi = _mm_slli_si128(b, 4);
> __m128i res = _mm_blend_epi16(hi, lo, 3);
>
> current GCC produces:
>
> movi v31.4s, 0
> ext v30.16b, v0.16b, v31.16b, #12
> ext v0.16b, v31.16b, v1.16b, #12
> ins v0.s[0], v30.s[0]
>
> instead of a single EXT, which extracts a contiguous 16-byte window
> from the concatenation of its two source registers:
>
> ext v0.16b, v0.16b, v1.16b, #12
>
> GCC builds three VEC_PERM_EXPRs for these intrinsic calls: the first
> two implement the vector shifts and the third implements the blend,
> but they use different vector modes. Forward propagation fails to
> optimize this case because the VIEW_CONVERT_EXPRs in between block
> the folding.
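>
> For reference, the GIMPLE before forwprop looks roughly like this
> (illustrative SSA names; the vector types and selectors mirror the
> testcase below):
>
>   x_1 = VEC_PERM_EXPR <a_2, { 0, 0, 0, 0 }, { 3, 4, 5, 6 }>;
>   y_3 = VEC_PERM_EXPR <{ 0, 0, 0, 0 }, b_4, { 3, 4, 5, 6 }>;
>   x_5 = VIEW_CONVERT_EXPR<v8hi>(x_1);
>   y_6 = VIEW_CONVERT_EXPR<v8hi>(y_3);
>   res_7 = VEC_PERM_EXPR <x_5, y_6, { 0, 1, 10, 11, 12, 13, 14, 15 }>;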
>
> This patch adds a match.pd pattern that recognizes this
> concat-and-extract idiom and folds the VEC_PERM_EXPR chain into a
> single VEC_PERM_EXPR, even when VIEW_CONVERT_EXPRs split the chain.
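>
> With the new pattern the whole chain folds to a single permute on the
> original operands; for the testcase this is essentially:
>
>   _8 = VEC_PERM_EXPR <a_2, b_4, { 3, 4, 5, 6 }>;
>   res_7 = VIEW_CONVERT_EXPR<v8hi>(_8);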
>
> Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.
OK.
Thanks,
Richard.
> gcc/ChangeLog:
>
> * match.pd: Fold VEC_PERM_EXPR chains implementing vector
> concat-and-extract.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/fold-vecperm-1.c: New test.
> ---
> gcc/match.pd | 53 +++++++++++++++++++++++++++
> gcc/testsuite/gcc.dg/fold-vecperm-1.c | 23 ++++++++++++
> 2 files changed, 76 insertions(+)
> create mode 100644 gcc/testsuite/gcc.dg/fold-vecperm-1.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a4248a521cf..7b86db54a78 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -11872,6 +11872,59 @@ and,
> (if (full_perm_p)
> (vec_perm (op@3 @0 @1) @3 @2))))))
>
> +/* Fold
> + x = VEC_PERM_EXPR <a, ANY, sel0>;
> + y = VEC_PERM_EXPR <ANY, b, sel0>;
> + c = VEC_PERM_EXPR <x, y, sel1>;
> + into
> + c = VEC_PERM_EXPR <a, b, sel0>;
> + if sel0 combined with sel1 denotes extracting a contiguous subvector from
> + the conceptual concatenation [ a | b ]. */
> +(simplify
> + (vec_perm (view_convert? (vec_perm @0 @4 VECTOR_CST@2))
> + (view_convert? (vec_perm @5 @1 VECTOR_CST@2))
> + VECTOR_CST@3)
> + (with
> + {
> + bool can_fold = false;
> + unsigned HOST_WIDE_INT nelts;
> + vec_perm_builder builder;
> + if (TYPE_VECTOR_SUBPARTS (type).is_constant (&nelts)
> + && tree_to_vec_perm_builder (&builder, @2))
> + {
> + /* Set can_fold to true when
> + - sel0 is a vector of consecutive indices.
> + - sel1 is composed of two parts of consecutive indices [ ia | ib ],
> + selecting the elements originally in 'a' and 'b', respectively. */
> + vec_perm_indices sel0 (builder, 2, VECTOR_CST_NELTS (@2));
> + unsigned int sel0_first_idx = sel0[0].to_constant ();
> + unsigned int elt_size = vector_element_bits (TREE_TYPE (@0));
> + unsigned int ia_size = tree_to_uhwi (TYPE_SIZE (type))
> + - elt_size * sel0_first_idx;
> + unsigned int ib_start;
> + if (sel0.series_p (0, 1, sel0_first_idx, 1)
> + && multiple_p (ia_size, vector_element_bits (type), &ib_start)
> + && tree_to_vec_perm_builder (&builder, @3))
> + {
> + /* Check if the ib part contains consecutive indices starting from
> + 'nelts + ib_start'. */
> + vec_perm_indices sel1 (builder, 2, VECTOR_CST_NELTS (@3));
> + can_fold = sel1.series_p (ib_start, 1, nelts + ib_start, 1);
> +
> + /* Check if the ia part contains indices [0 ... ib_start - 1]. */
> + if (can_fold)
> + for (unsigned int i = 0; i < ib_start; i++)
> + if (sel1[i].to_constant () != i)
> + {
> + can_fold = false;
> + break;
> + }
> + }
> + }
> + }
> + (if (can_fold)
> + (view_convert (vec_perm @0 @1 @2)))))
> +
> #if GIMPLE
> /* Simplify (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b).
> Similar for (a | b) - ((a ^ b) >> 1). */
> diff --git a/gcc/testsuite/gcc.dg/fold-vecperm-1.c b/gcc/testsuite/gcc.dg/fold-vecperm-1.c
> new file mode 100644
> index 00000000000..5d4456b98b1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/fold-vecperm-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef short v8hi __attribute__((vector_size(16)));
> +
> +typedef union {
> + v4si s;
> + v8hi h;
> +} int128;
> +
> +int128 concat (int128 a, int128 b) {
> + int128 x, y, res;
> + v4si zero = { 0, 0, 0, 0 };
> + v4si sel0 = { 3, 4, 5, 6 };
> + v8hi sel1 = { 0, 1, 10, 11, 12, 13, 14, 15 };
> + x.s = __builtin_shuffle (a.s, zero, sel0);
> + y.s = __builtin_shuffle (zero, b.s, sel0);
> + res.h = __builtin_shuffle (x.h, y.h, sel1);
> + return res;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "optimized" } } */
>
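> To illustrate the pattern's index bookkeeping with the testcase's
> values (v4si inner vectors, v8hi result type; a worked example, not
> part of the patch):
>
>   sel0_first_idx = 3                  (sel0 = { 3, 4, 5, 6 })
>   elt_size       = 32                 (bits per v4si element)
>   ia_size        = 128 - 32 * 3 = 32  (bits kept from 'a')
>   ib_start       = 32 / 16 = 2        (v8hi elements coming from 'a')
>
> sel1 must therefore start with { 0, 1 } and continue with consecutive
> indices from nelts + ib_start = 10, i.e. { 0, 1, 10, 11, 12, 13, 14,
> 15 }, which matches the testcase's sel1.
>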
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)