Re: [PATCH] s390: Floating point vector lane handling

Stefan Schulze Frielinghaus Fri, 16 May 2025 08:52:27 -0700

On Wed, May 14, 2025 at 04:30:35PM +0200, Juergen Christ wrote:
> Since floating point and vector registers overlap on s390, more
> efficient code can be generated to extract FPRs from VRs.
> Additionally, for double vectors, more efficient code can be generated
> to load specific lanes.
> 
> gcc/ChangeLog:
> 
>       * config/s390/vector.md (VF): New mode iterator.
>       (VEC_SET_NONFLOAT): New mode iterator.
>       (VEC_SET_SINGLEFLOAT): New mode iterator.
>       (*vec_set<mode>): Split pattern in two.
>       (*vec_setv2df): Extract special handling for V2DF mode.
>       (*vec_extract<mode>): Split pattern in two.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/s390/vector/vec-extract-1.c: New test.
>       * gcc.target/s390/vector/vec-set-1.c: New test.
> 
> Signed-off-by: Juergen Christ <jchr...@linux.ibm.com>
> ---
>  gcc/config/s390/vector.md                     | 135 ++++++++++++-
>  .../gcc.target/s390/vector/vec-extract-1.c    | 190 ++++++++++++++++++
>  .../gcc.target/s390/vector/vec-set-1.c        |  67 ++++++
>  3 files changed, 381 insertions(+), 11 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-extract-1.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-set-1.c
> 
> diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
> index e29255fe1116..580cf6fc71f6 100644
> --- a/gcc/config/s390/vector.md
> +++ b/gcc/config/s390/vector.md
> @@ -75,6 +75,8 @@
>                          V1DF V2DF
>                          (V1TF "TARGET_VXE") (TF "TARGET_VXE")])
>  
> +(define_mode_iterator VF [(V2SF "TARGET_VXE") (V4SF "TARGET_VXE") V2DF])
> +
>  ; All modes present in V_HW1 and VFT.
>  (define_mode_iterator V_HW1_FT [V16QI V8HI V4SI V2DI V1TI V1DF
>                              V2DF (V1SF "TARGET_VXE") (V2SF "TARGET_VXE")
> @@ -506,26 +508,90 @@
>                  UNSPEC_VEC_SET))]
>    "TARGET_VX")
>  
> +; Iterator for vec_set that does not use special float/vect overlay tricks
> +(define_mode_iterator VEC_SET_NONFLOAT
> +  [V1QI V2QI V4QI V8QI V16QI V1HI V2HI V4HI V8HI V1SI V2SI V4SI V1DI V2DI 
> V2SF V4SF])
> +; Iterator for single element float vectors
> +(define_mode_iterator VEC_SET_SINGLEFLOAT [(V1SF "TARGET_VXE") V1DF (V1TF 
> "TARGET_VXE")])
> +
>  ; FIXME: Support also vector mode operands for 1
>  ; FIXME: A target memory operand seems to be useful otherwise we end
>  ; up with vl vlvgg vst.  Shouldn't the middle-end be able to handle
>  ; that itself?
>  ; vlvgb, vlvgh, vlvgf, vlvgg, vleb, vleh, vlef, vleg, vleib, vleih, vleif, 
> vleig
>  (define_insn "*vec_set<mode>"
> -  [(set (match_operand:V                    0 "register_operand"  "=v,v,v")
> -     (unspec:V [(match_operand:<non_vec> 1 "general_operand"    "d,R,K")
> -                (match_operand:SI        2 "nonmemory_operand" "an,I,I")
> -                (match_operand:V         3 "register_operand"   "0,0,0")]
> -               UNSPEC_VEC_SET))]
> +  [(set (match_operand:VEC_SET_NONFLOAT      0 "register_operand"  "=v,v,v")
> +     (unspec:VEC_SET_NONFLOAT
> +       [(match_operand:<non_vec>          1 "general_operand"    "d,R,K")
> +        (match_operand:SI                 2 "nonmemory_operand" "an,I,I")
> +        (match_operand:VEC_SET_NONFLOAT   3 "register_operand"   "0,0,0")]
> +       UNSPEC_VEC_SET))]
>    "TARGET_VX
>     && (!CONST_INT_P (operands[2])
> -       || UINTVAL (operands[2]) < GET_MODE_NUNITS (<V:MODE>mode))"
> +       || UINTVAL (operands[2]) < GET_MODE_NUNITS 
> (<VEC_SET_NONFLOAT:MODE>mode))"
>    "@
>     vlvg<bhfgq>\t%v0,%1,%Y2
>     vle<bhfgq>\t%v0,%1,%2
>     vlei<bhfgq>\t%v0,%1,%2"
>    [(set_attr "op_type" "VRS,VRX,VRI")])
>  
> +(define_insn "*vec_set<mode>"
> +  [(set (match_operand:VEC_SET_SINGLEFLOAT     0 "register_operand"  "=v,v")
> +     (unspec:VEC_SET_SINGLEFLOAT
> +       [(match_operand:<non_vec>            1 "general_operand"    "f,R")
                                                                       ^
Constraint v instead of f gives more flexibility to the RA.  Note, on
s390 we allow values of mode SF and DF in vector registers which do not
overlap with floating-point registers, i.e., with REGNO >= 38.  Of
course, if a value of SFmode was created via a floating-point instruction,
then it initially lives in an FPR.  However, we could give RA the freedom
to move those values to VRs in case, e.g., register pressure increases
for FPRs or in case SF/DFmode values were not created by floating-point
instructions in the first place.  Therefore, at the moment I don't see
that this could hurt us.


> +        (match_operand:SI                   2 "nonmemory_operand" "an,I")

Although the modes ensure to a certain degree that we deal with lane 0
in this case, instead of ignoring operand 2 it would be better
to check that it is indeed lane zero by replacing it with (const_int 0).

> +        (match_operand:VEC_SET_SINGLEFLOAT  3 "register_operand"   "0,0")]
> +       UNSPEC_VEC_SET))]
> +  "TARGET_VX"
> +  "@
> +  vlr\t%v0,%v1
> +  vle<bhfgq>\t%v0,%1,0"
     ^
Multiple output patterns are aligned with the @ symbol.

> + [(set_attr "op_type" "VRR,VRX")])
> +
> +(define_insn "*vec_setv2df"
> +  [(set (match_operand:V2DF                    0 "register_operand"  
> "=v,v,v,v")
> +     (unspec:V2DF [(match_operand:DF        1 "general_operand"    "d,R,K,f")
                                                                             ^
                                                                   constraint v

> +                   (match_operand:SI        2 "nonmemory_operand" "an,I,I,n")
> +                   (match_operand:V2DF      3 "register_operand"   
> "0,0,0,0")]
> +                  UNSPEC_VEC_SET))]
> +  "TARGET_VX
> +   && (!CONST_INT_P (operands[2])
> +       || UINTVAL (operands[2]) < GET_MODE_NUNITS (V2DFmode))"
> +  "@
> +   vlvgg\t%v0,%1,%Y2
> +   vleg\t%v0,%1,%2
> +   vleig\t%v0,%1,%2
> +   #"
> +  [(set_attr "op_type" "VRS,VRX,VRI,*")])
> +
> +(define_split
> +  [(set (match_operand:V2DF                 0 "register_operand"  "")
                                               ^
Should be aligned with the other operands (doesn't show up here because
of how the tabs and the leading + play together).

> +     (unspec:V2DF [(match_operand:DF        1 "register_operand"  "")
> +                   (match_operand:SI        2 "const_int_operand" "")
> +                   (match_operand:V2DF      3 "register_operand"  "")]
> +                  UNSPEC_VEC_SET))]
> +  "TARGET_VX
> +   && (UINTVAL (operands[2]) < GET_MODE_NUNITS (V2DFmode))
> +   && reload_completed
> +   && FP_REGNO_P (REGNO (operands[1]))"
          ^
If constraint v is used, then of course VECTOR_REGNO_P must be used.

> +   [(set (match_dup 0)
> +      (vec_select:V2DF
> +        (vec_concat:V4DF
> +       (match_dup 1)
> +       (match_dup 3))
> +     (parallel [(const_int 0) (match_dup 4)])))]
> +  "
> +  {
     ^
For single-line C code we use "..." and for multi-line C code {} where
the curly brackets start/end in column zero, i.e., nesting {} into ""
is not necessary.

> +    operands[1] = gen_rtx_REG (V2DFmode, REGNO (operands[1]));
> +    if (UINTVAL (operands[2]) == 0)
> +      operands[4] = GEN_INT (3);
> +    else
> +    {
> +      std::swap (operands[1], operands[3]);
> +      operands[4] = GEN_INT (2);
> +    }
> +   }")
> +
>  ; vlvgb, vlvgh, vlvgf, vlvgg
>  (define_insn "*vec_set<mode>_plus"
>    [(set (match_operand:V                      0 "register_operand" "=v")
> @@ -554,19 +620,66 @@
>  (define_insn "*vec_extract<mode>"
>    [(set (match_operand:<non_vec> 0 "nonimmediate_operand" "=d,R")
>         (vec_select:<non_vec>
> -         (match_operand:V        1 "nonmemory_operand"  "v,v")
> +         (match_operand:VI       1 "nonmemory_operand"  "v,v")
>           (parallel
>            [(match_operand:SI     2 "nonmemory_operand" "an,I")])))]
>    "TARGET_VX"
>    {
>      if (CONST_INT_P (operands[2]))
> -       operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS 
> (<V:MODE>mode) - 1));
> +       operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS 
> (<VI:MODE>mode) - 1));
>      if (which_alternative == 0)
>        return "vlgv<bhfgq>\t%0,%v1,%Y2";
>       return "vste<bhfgq>\t%v1,%0,%2";
>    }
>    [(set_attr "op_type" "VRS,VRX")])
>  
> +(define_insn "*vec_extract<mode>"
> +  [(set (match_operand:<non_vec> 0 "nonimmediate_operand" "=d,R,f")
                                                                   ^
                                                                v constraint
> +       (vec_select:<non_vec>
> +         (match_operand:VF       1 "nonmemory_operand"  "v,v,f")
                                                                ^
                                                                v constraint
> +         (parallel
> +          [(match_operand:SI     2 "nonmemory_operand" "an,I,n")])))]
> +  "TARGET_VX"
> +  {
> +    if (CONST_INT_P (operands[2]))
> +      operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS 
> (<VF:MODE>mode) - 1));
> +    if (which_alternative == 0)
> +      return "vlgv<bhfgq>\t%0,%v1,%Y2";
> +    else if (which_alternative == 1)
> +      return "vste<bhfgq>\t%v1,%0,%2";
> +    else
> +      return "#";
> +  }
> +  [(set_attr "op_type" "VRS,VRX,*")])
> +
> +(define_split
> +  [(set (match_operand:<non_vec> 0 "register_operand" "")
> +       (vec_select:<non_vec>
> +         (match_operand:VF       1 "register_operand"  "")
> +         (parallel
> +          [(match_operand:SI     2 "const_int_operand" "")])))]
> +  "TARGET_VX && reload_completed && FP_REGNO_P (REGNO (operands[0]))"
                                        ^
If constraint v is used, then of course VECTOR_REGNO_P must be used.

> +  [(set (match_dup 0)
> +        (vec_duplicate:VF
> +           (vec_select:<non_vec>
> +              (match_dup 1)
> +              (parallel [(match_dup 2)]))))]
> +  "
> +  {
     ^
same as above, i.e., "" vs {}

> +    unsigned HOST_WIDE_INT idx = UINTVAL (operands[2]) & (GET_MODE_NUNITS 
> (<VF:MODE>mode) - 1);
> +    if (idx == 0)
> +      {
> +        rtx dest = gen_rtx_REG (<VF:MODE>mode, REGNO (operands[0]));
> +        emit_insn (gen_mov<VF:mode> (dest, operands[1]));
> +        DONE;
> +      }
> +    else
> +      {
> +        operands[0] = gen_rtx_REG (<VF:MODE>mode, REGNO (operands[0]));
> +        operands[2] = GEN_INT (idx);
> +      }
> +  }")
> +
>  ; vlgvb, vlgvh, vlgvf, vlgvg
>  (define_insn "*vec_extract<mode>_plus"
>    [(set (match_operand:<non_vec>       0 "nonimmediate_operand" "=d")
> @@ -603,10 +716,10 @@
>  ; Replicate from vector element
>  ; vrepb, vreph, vrepf, vrepg
>  (define_insn "*vec_splat<mode>"
> -  [(set (match_operand:V_128_NOSINGLE   0 "register_operand" "=v")
> -     (vec_duplicate:V_128_NOSINGLE
> +  [(set (match_operand:V   0 "register_operand" "=v")
> +     (vec_duplicate:V
>        (vec_select:<non_vec>
> -       (match_operand:V_128_NOSINGLE 1 "register_operand"  "v")
> +       (match_operand:V 1 "register_operand"  "v")
>         (parallel
>          [(match_operand:QI 2 "const_mask_operand" "C")]))))]
>    "TARGET_VX && UINTVAL (operands[2]) < GET_MODE_NUNITS (<MODE>mode)"
> diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-extract-1.c 
> b/gcc/testsuite/gcc.target/s390/vector/vec-extract-1.c
> new file mode 100644
> index 000000000000..9df7909a3ea8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/vector/vec-extract-1.c
> @@ -0,0 +1,190 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=z14 -mzarch" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +typedef double V2DF __attribute__((vector_size(16)));
> +typedef float V4SF __attribute__((vector_size(16)));
> +typedef float V2SF __attribute__((vector_size(8)));
> +typedef double V1DF __attribute__((vector_size(8)));
> +typedef float V1SF __attribute__((vector_size(4)));
> +typedef long double V1TF __attribute__((vector_size(16)));
> +
> +/*
> +** extractfirstdouble:
> +**   vlr     %v0,%v24
> +**   br      %r14
> +*/
> +double
> +extractfirstdouble (V2DF x)
> +{
> +  return x[0];
> +}
> +
> +/*
> +** extractseconddouble:
> +**   vrepg   %v0,%v24,1
> +**   br      %r14
> +*/
> +double
> +extractseconddouble (V2DF x)
> +{
> +  return x[1];
> +}
> +
> +/*
> +** extractnthdouble:
> +**   vlgvg   (%r.),%v24,0\(%r2\)
> +**   ldgr    %f0,\1
> +**   br      %r14
> +*/
> +double
> +extractnthdouble (V2DF x, int n)
> +{
> +  return x[n];
> +}
> +
> +/*
> +** sumfirstdouble:
> +**   vfadb   %v0,%v24,%v26
> +**   br      %r14
> +*/
> +double
> +sumfirstdouble (V2DF x, V2DF y)
> +{
> +  return (x + y)[0];
> +}
> +
> +/*
> +** extractfirstfloat:
> +**   vlr     %v0,%v24
> +**   br      %r14
> +*/
> +float
> +extractfirstfloat (V4SF x)
> +{
> +  return x[0];
> +}
> +
> +/*
> +** extractsecondfloat:
> +**   vrepf   %v0,%v24,1
> +**   br      %r14
> +*/
> +float
> +extractsecondfloat (V4SF x)
> +{
> +  return x[1];
> +}
> +
> +/*
> +** extractthirdfloat:
> +**   vrepf   %v0,%v24,2
> +**   br      %r14
> +*/
> +float
> +extractthirdfloat (V4SF x)
> +{
> +  return x[2];
> +}
> +
> +/*
> +** extractfourthfloat:
> +**   vrepf   %v0,%v24,3
> +**   br      %r14
> +*/
> +float
> +extractfourthfloat (V4SF x)
> +{
> +  return x[3];
> +}
> +
> +/*
> +** extractnthfloat:
> +**   vlgvf   (%r.),%v24,0\(%r2\)
> +**   vlvgf   %v0,\1,0
> +**   br      %r14
> +*/
> +float
> +extractnthfloat (V4SF x, int n)
> +{
> +  return x[n];
> +}
> +
> +/*
> +** sumfirstfloat:
> +**   vfasb   %v0,%v24,%v26
> +**   br      %r14
> +*/
> +float
> +sumfirstfloat (V4SF x, V4SF y)
> +{
> +  return (x + y)[0];
> +}
> +
> +/*
> +** extractfirst2:
> +**   vlr     %v0,%v24
> +**   br      %r14
> +*/
> +float
> +extractfirst2 (V2SF x)
> +{
> +  return x[0];
> +}
> +
> +/*
> +** extractsecond2:
> +**   vrepf   %v0,%v24,1
> +**   br      %r14
> +*/
> +float
> +extractsecond2 (V2SF x)
> +{
> +  return x[1];
> +}
> +
> +/*
> +** extractnth2:
> +**   vlgvf   (%r.),%v24,0\(%r2\)
> +**   vlvgf   %v0,\1,0
> +**   br      %r14
> +*/
> +float
> +extractnth2 (V2SF x, int n)
> +{
> +  return x[n];
> +}
> +
> +/*
> +** extractsinglef:
> +**   vlr     %v0,%v24
> +**   br      %r14
> +*/
> +float
> +extractsinglef (V1SF x)
> +{
> +  return x[0];
> +}
> +
> +/*
> +** extractsingled:
> +**   vlr     %v0,%v24
> +**   br      %r14
> +*/
> +double
> +extractsingled (V1DF x)
> +{
> +  return x[0];
> +}
> +
> +/*
> +** extractsingleld:
> +**   vlr     (%v.),%v24
> +**   vst     \1,0\(%r2\),3
> +**   br      %r14
> +*/
> +long double
> +extractsingleld (V1TF x)
> +{
> +  return x[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-set-1.c 
> b/gcc/testsuite/gcc.target/s390/vector/vec-set-1.c
> new file mode 100644
> index 000000000000..2eddb58290f6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/vector/vec-set-1.c
> @@ -0,0 +1,67 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=z14 -mzarch" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +typedef double V2DF __attribute__((vector_size(16)));
> +typedef double V1DF __attribute__((vector_size(8)));
> +
> +/*
> +** setdf0:
> +**   vpdi    %v24,%v0,%v24,1
> +**   br      %r14
> +*/
> +V2DF
> +setdf0 (V2DF x, double y)
> +{
> +  x[0] = y;
> +  return x;
> +}
> +
> +/*
> +** setdf1:
> +**   vmrhg   %v24,%v24,%v0
> +**   br      %r14
> +*/
> +V2DF
> +setdf1 (V2DF x, double y)
> +{
> +  x[1] = y;
> +  return x;
> +}
> +
> +/*
> +** setdfn:
> +**   lgdr    (%r.),%f0
> +**   vlvgg   %v24,\1,0\(%r2\)
> +**   br      %r14
> +*/
> +V2DF
> +setdfn (V2DF x, double y, int n)
> +{
> +  x[n] = y;
> +  return x;
> +}
> +
> +/*
> +** set1df:
> +**   vlr     %v24,%v0
> +**   br      %r14
> +*/
> +V1DF
> +set1df (V1DF x, double y)
> +{
> +  x[0] = y;
> +  return x;
> +}
> +
> +/*
> +** set1dfn:
> +**   vlr     %v24,%v0
> +**   br      %r14
> +*/
> +V1DF
> +set1dfn (V1DF x, double y, int n)
> +{
> +  x[n] = y;
> +  return x;
> +}

I very much like those tests.  Could you add for the sake of completeness
also some SF tests for vec-set-1.c?

> -- 
> 2.43.5
> 

Could you run a bootstrap and regtest?

Thanks,
Stefan

Re: [PATCH] s390: Floating point vector lane handling

Reply via email to