Victor Do Nascimento <[email protected]> writes:
> This patch finalizes adding support for the generation of SVE simd clones when
> no simdlen is provided, following the ABI rules where the widest data type
> determines the minimum number of elements in a length-agnostic vector.
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-protos.h (add_sve_type_attribute): Declare.
> * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make
> visibility global and support use for non_acle types.
> * config/aarch64/aarch64.cc
> (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone
> when no simdlen is provided, according to ABI rules.
> (simd_clone_adjust_sve_vector_type): New helper function.
> (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones
> and modify types to use SVE types.
> * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
> (simd_clone_adjust): Adapt safelen check to be compatible with VLA
> simdlen.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.
> * gcc.target/aarch64/vect-simd-clone-1.c: New test.
As a general comment, there are a few instances of "a SVE" instead
of "an SVE". Otherwise it mostly looks good, but some comments below:
> ---
> gcc/config/aarch64/aarch64-protos.h | 2 +
> gcc/config/aarch64/aarch64-sve-builtins.cc | 6 +-
> gcc/config/aarch64/aarch64.cc | 181 +++++++++++++++---
> gcc/omp-simd-clone.cc | 13 +-
> .../gcc.target/aarch64/declare-simd-2.c | 1 +
> .../gcc.target/aarch64/vect-simd-clone-1.c | 137 +++++++++++++
> 6 files changed, 306 insertions(+), 34 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
>
> [...]
> static void
> aarch64_simd_clone_adjust (struct cgraph_node *node)
> {
> - /* Add aarch64_vector_pcs target attribute to SIMD clones so they
> - use the correct ABI. */
> -
> tree t = TREE_TYPE (node->decl);
> - TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
> - TYPE_ATTRIBUTES (t));
> + cl_target_option cur_target;
> + bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
> +
> + if (node->simdclone->vecsize_mangle == 's')
> + {
> + /* This is additive and has no effect if SVE, or a superset thereof, is
> + already enabled. */
> + tree target = build_string (strlen ("+sve") + 1, "+sve");
> + if (!aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0))
> + gcc_unreachable ();
> + cl_target_option_save (&cur_target, &global_options, &global_options_set);
> + tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
> + cl_target_option_restore (&global_options, &global_options_set,
> + TREE_TARGET_OPTION (new_target));
> + aarch64_override_options_internal (&global_options);
> + memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
> + sizeof (have_regs_of_mode));
> + for (int i = 0; i < NUM_MACHINE_MODES; ++i)
> + if (aarch64_sve_mode_p ((machine_mode) i))
> + have_regs_of_mode[i] = true;
I've now pushed g:9d14f677a0da80bc6355955469c69709b1d3c67e to add a
push_function_decl/pop_function_decl pair that should be usable here.
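For instance, with that pair the save/restore sequence above could shrink to something like this (untested sketch; I'm assuming push_function_decl (node->decl) switches the global target context to the clone's DECL_FUNCTION_SPECIFIC_TARGET and pop_function_decl () restores the previous state):

```c
  if (node->simdclone->vecsize_mangle == 's')
    {
      /* This is additive and has no effect if SVE, or a superset thereof,
	 is already enabled.  */
      tree target = build_string (strlen ("+sve") + 1, "+sve");
      if (!aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0))
	gcc_unreachable ();
      /* Switch to the clone's target context for the duration of the
	 adjustments, instead of saving and restoring by hand.  */
      push_function_decl (node->decl);
      /* ... SVE-specific adjustments to have_regs_of_mode and the
	 clone's types go here ...  */
      pop_function_decl ();
    }
```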
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
> new file mode 100644
> index 00000000000..b33541cc329
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
> @@ -0,0 +1,137 @@
> +/* { dg-do compile } */
> +/* { dg-options "-std=c99" } */
> +/* { dg-additional-options "-O3 -march=armv8-a" } */
We should add -fno-schedule-insns and -fno-schedule-insns2, to make the
order more predictable.
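I.e. something like:

```c
/* { dg-options "-std=c99" } */
/* { dg-additional-options "-O3 -march=armv8-a -fno-schedule-insns -fno-schedule-insns2" } */
```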
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/* Ensure correct creation of SVE Vector-length agnostic (VLA SVE) vector
> + function calls from scalar versions in accordance with the Vector Function
> + Application Binary Interface Specification for AArch64 (AAVPCS).
> +
> + We check for correctness in:
> + - Vector function name mangling, with the grammar:
> +
> + vector name := prefix "_" name
> + prefix := "_ZGV" <isa> <mask> <len> <parameters>
> +
> + Whereby:
> + - <isa> := "s" for SVE
> + - <mask> := "M" for Mask
> + - <len> := "x" for VLA SVE
> +
> + resulting in:
> + <prefix> := "_ZGVsMx" <parameters>
> +
> + with each vector parameter contributing a "v" to the prefix.
> +
> + - Parameter and return value mapping:
> + - Unless marked with uniform or linear OpenMP clauses, parameters and
> + return values are expected to map to vectors.
> + - Where the lane-size of a parameter is less than the widest data size
> + for a given function, the resulting vector should be unpacked and
> + populated via the use of extending loads.
> +
> + - Finally, we also make sure we can correctly generate calls to the same
> + function, differing only in the target architecture (i.e. SVE vs SIMD),
> + ensuring that each call points to the correctly-mangled vector function
> + and employs the correct ABI. For example, for `fn' we may expect:
> +
> + for #pragma GCC target("+sve"): _ZGVsMxvv_fn
> + for #pragma GCC target("+simd"): _ZGVnN4vv_fn */
> +
> +#pragma GCC target ("+sve")
> +/* scan-assembler {_ZGVsMxv_fn0} } } */
Should be:
/* { dg-final { scan-assembler {...} } } */
But I think it'd be more robust to add ":\n" at the end, to make sure
that there's a definition rather than a reference.
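So for the first scan, something like (assuming the clone does end up defined in this TU; otherwise the plain reference scan is the fallback):

```c
/* { dg-final { scan-assembler {_ZGVsMxv_fn0:\n} } } */
```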
It would also be good to have a C++ test for the C++ names.
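A C++ counterpart might look like the following (sketch only; the clone name should be the AAVPCS prefix applied to the Itanium-mangled name, so for fn0(int), whose mangled name is _Z3fn0i, I'd expect _ZGVsMxv__Z3fn0i, though that's worth checking against what the compiler actually emits):

```cpp
/* { dg-do compile } */
/* { dg-options "-O3 -march=armv8-a" } */

#pragma GCC target ("+sve")
/* { dg-final { scan-assembler {_ZGVsMxv__Z3fn0i} } } */
extern int fn0 (int) __attribute__ ((simd, const));

void test_fn0 (int *a, int *b, int n)
{
  for (int i = 0; i < n; ++i)
    a[i] += fn0 (b[i]);
}
```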
> +extern int __attribute__ ((simd, const)) fn0 (int);
> +void test_fn0 (int *a, int *b, int n)
> +{
> + for (int i = 0; i < n; ++i)
> + a[i] += fn0 (b[i]);
> +}
> +
> +/*
> +** test_fn1:
> +** ...
> +** ld1w z1\.s, p7/z, \[x22, x19, lsl 2\]
> +** ld1h z0\.s, p7/z, \[x23, x19, lsl 1\]
> +** mov p0\.b, p7\.b
> +** bl _ZGVsMxvv_fn1
> +** st1w z0\.s, p7, \[x21, x19, lsl 2\]
We shouldn't hard-code any register numbers other than the arguments
and return value (z0, p0, z0, z1). Same for later tests.
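E.g. for test_fn1, keeping only the argument and return registers fixed, something like (untested against the actual output):

```c
/*
** test_fn1:
** ...
** ld1w	z1\.s, p[0-9]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]
** ld1h	z0\.s, p[0-9]+/z, \[x[0-9]+, x[0-9]+, lsl 1\]
** mov	p0\.b, p[0-9]+\.b
** bl	_ZGVsMxvv_fn1
** st1w	z0\.s, p[0-9]+, \[x[0-9]+, x[0-9]+, lsl 2\]
** ...
*/
```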
> +** ...
> +*/
> +extern int __attribute__ ((simd, const)) fn1 (short, int);
> +void test_fn1 (int *a, int *b, short *c, int n)
> +{
> + for (int i = 0; i < n; ++i)
> + a[i] = fn1 (c[i], b[i]);
> +}
> +
> +/*
> +** test_fn2:
> +** ...
> +** ld1w z1\.s, p7/z, \[x23, x19, lsl 2\]
> +** ld1h z0\.s, p7/z, \[x22, x19, lsl 1\]
> +** mov p0\.b, p7\.b
> +** bl _ZGVsMxvv_fn2
> +** st1h z0\.s, p7, \[x21, x19, lsl 1\]
> +** ...
> +*/
> +extern short __attribute__ ((simd, const)) fn2 (short, int);
> +void test_fn2 (short *a, int *b, short *c, int n)
> +{
> + for (int i = 0; i < n; ++i)
> + a[i] = fn2 (c[i], b[i]);
> +}
> +
> +/*
> +** test_fn3:
> +** ...
> +** ld1b z23\.s, p7/z, \[x22, x19\]
> +** ld1w z0\.s, p7/z, \[x23, x19, lsl 2\]
> +** ptrue p6\.b, all
> +** mov p0\.b, p7\.b
> +** mov z1\.d, z23\.d
> +** uxtb z23\.h, p6/m, z23\.h
> +** bl _ZGVsMxvv_fn3
> +** uxtb z0\.h, p6/m, z0\.h
I don't think we should match the first uxtb (we can just stub it out
with "..."). And I think it's enough to stop matching at the first use
of the return value (the uxtb z0 in this case).
Same for later tests.
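Applying both comments, the test_fn3 body could then be cut down to something like:

```c
/*
** test_fn3:
** ...
** bl	_ZGVsMxvv_fn3
** uxtb	z0\.h, p[0-9]+/m, z0\.h
** ...
*/
```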
Thanks,
Richard
> +** add z0\.h, z0\.h, z23\.h
> +** uxth z0\.s, p6/m, z0\.s
> +** st1w z0\.s, p7, \[x20, x19, lsl 2\]
> +** ...
> +*/
> +extern char __attribute__ ((simd, const)) fn3 (int, char);
> +void test_fn3 (int *a, int *b, char *c, int n)
> +{
> + for (int i = 0; i < n; ++i)
> + a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
> +}
> +
> +/*
> +** test_fn4:
> +** ...
> +** ld1h z23\.s, p7/z, \[x23, x19, lsl 1\]
> +** ld1w z0\.s, p7/z, \[x22, x19, lsl 2\]
> +** mov p0\.b, p7\.b
> +** mov z1\.d, z23\.d
> +** ptrue p6\.b, all
> +** bl _ZGVsMxvv_fn4
> +** sxth z23\.s, p6/m, z23\.s
> +** sxth z0\.s, p6/m, z0\.s
> +** add z0\.s, z0\.s, z23\.s
> +** st1w z0\.s, p7, \[x21, x19, lsl 2\]
> +** ...
> +*/
> +extern short __attribute__ ((simd, const)) fn4 (int, short);
> +void test_fn4 (int *a, int *b, short *c, int n)
> +{
> + for (int i = 0; i < n; ++i)
> + a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
> +}
> +
> +#pragma GCC reset_options
> +#pragma GCC target ("+simd")
> +/* scan-assembler {_ZGVnN4vv_fn4} } } */
> +extern short __attribute__ ((simd, const)) fn4 (int, short);
> +void test_fn5 (int *a, int *b, short *c, int n)
> +{
> + for (int i = 0; i < n; ++i)
> + a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
> +}