On Wed, May 14, 2025 at 04:30:35PM +0200, Juergen Christ wrote: > Since floating point and vector registers overlap on s390, more > efficient code can be generated to extract FPRs from VRs. > Additionally, for double vectors, more efficient code can be generated > to load specific lanes. > > gcc/ChangeLog: > > * config/s390/vector.md (VF): New mode iterator. > (VEC_SET_NONFLOAT): New mode iterator. > (VEC_SET_SINGLEFLOAT): New mode iterator. > (*vec_set<mode>): Split pattern in two. > (*vec_setv2df): Extract special handling for V2DF mode. > (*vec_extract<mode>): Split pattern in two. > > gcc/testsuite/ChangeLog: > > * gcc.target/s390/vector/vec-extract-1.c: New test. > * gcc.target/s390/vector/vec-set-1.c: New test. > > Signed-off-by: Juergen Christ <jchr...@linux.ibm.com> > --- > gcc/config/s390/vector.md | 135 ++++++++++++- > .../gcc.target/s390/vector/vec-extract-1.c | 190 ++++++++++++++++++ > .../gcc.target/s390/vector/vec-set-1.c | 67 ++++++ > 3 files changed, 381 insertions(+), 11 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-extract-1.c > create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-set-1.c > > diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md > index e29255fe1116..580cf6fc71f6 100644 > --- a/gcc/config/s390/vector.md > +++ b/gcc/config/s390/vector.md > @@ -75,6 +75,8 @@ > V1DF V2DF > (V1TF "TARGET_VXE") (TF "TARGET_VXE")]) > > +(define_mode_iterator VF [(V2SF "TARGET_VXE") (V4SF "TARGET_VXE") V2DF]) > + > ; All modes present in V_HW1 and VFT. 
> (define_mode_iterator V_HW1_FT [V16QI V8HI V4SI V2DI V1TI V1DF > V2DF (V1SF "TARGET_VXE") (V2SF "TARGET_VXE") > @@ -506,26 +508,90 @@ > UNSPEC_VEC_SET))] > "TARGET_VX") > > +; Iterator for vec_set that does not use special float/vect overlay tricks > +(define_mode_iterator VEC_SET_NONFLOAT > + [V1QI V2QI V4QI V8QI V16QI V1HI V2HI V4HI V8HI V1SI V2SI V4SI V1DI V2DI > V2SF V4SF]) > +; Iterator for single element float vectors > +(define_mode_iterator VEC_SET_SINGLEFLOAT [(V1SF "TARGET_VXE") V1DF (V1TF > "TARGET_VXE")]) > + > ; FIXME: Support also vector mode operands for 1 > ; FIXME: A target memory operand seems to be useful otherwise we end > ; up with vl vlvgg vst. Shouldn't the middle-end be able to handle > ; that itself? > ; vlvgb, vlvgh, vlvgf, vlvgg, vleb, vleh, vlef, vleg, vleib, vleih, vleif, > vleig > (define_insn "*vec_set<mode>" > - [(set (match_operand:V 0 "register_operand" "=v,v,v") > - (unspec:V [(match_operand:<non_vec> 1 "general_operand" "d,R,K") > - (match_operand:SI 2 "nonmemory_operand" "an,I,I") > - (match_operand:V 3 "register_operand" "0,0,0")] > - UNSPEC_VEC_SET))] > + [(set (match_operand:VEC_SET_NONFLOAT 0 "register_operand" "=v,v,v") > + (unspec:VEC_SET_NONFLOAT > + [(match_operand:<non_vec> 1 "general_operand" "d,R,K") > + (match_operand:SI 2 "nonmemory_operand" "an,I,I") > + (match_operand:VEC_SET_NONFLOAT 3 "register_operand" "0,0,0")] > + UNSPEC_VEC_SET))] > "TARGET_VX > && (!CONST_INT_P (operands[2]) > - || UINTVAL (operands[2]) < GET_MODE_NUNITS (<V:MODE>mode))" > + || UINTVAL (operands[2]) < GET_MODE_NUNITS > (<VEC_SET_NONFLOAT:MODE>mode))" > "@ > vlvg<bhfgq>\t%v0,%1,%Y2 > vle<bhfgq>\t%v0,%1,%2 > vlei<bhfgq>\t%v0,%1,%2" > [(set_attr "op_type" "VRS,VRX,VRI")]) > > +(define_insn "*vec_set<mode>" > + [(set (match_operand:VEC_SET_SINGLEFLOAT 0 "register_operand" "=v,v") > + (unspec:VEC_SET_SINGLEFLOAT > + [(match_operand:<non_vec> 1 "general_operand" "f,R") ^ Constraint v instead of f gives more flexibility to the RA. 
Note, on s390 we allow values of mode SF and DF in vector registers which do not overlap with floating-point registers, i.e., with REGNO >= 38. Of course, if a value of SFmode was created via a floating-point instruction, then it initially lives in an FPR. However, we could give RA the freedom to move those values to VRs in case, e.g., register pressure increases for FPRs, or in case SF/DFmode values were not created by floating-point instructions in the first place. Therefore, at the moment I don't see that this could hurt us.
> + (match_operand:SI 2 "nonmemory_operand" "an,I") Although the modes ensure to a certain degree that we deal with lane 0 in this case, however, instead of ignoring operand 2 it would be better to check that it is indeed lane zero by replacing it with (const_int 0) > + (match_operand:VEC_SET_SINGLEFLOAT 3 "register_operand" "0,0")] > + UNSPEC_VEC_SET))] > + "TARGET_VX" > + "@ > + vlr\t%v0,%v1 > + vle<bhfgq>\t%v0,%1,0" ^ Multiple output patterns are aligned with the @ symbol. > + [(set_attr "op_type" "VRR,VRX")]) > + > +(define_insn "*vec_setv2df" > + [(set (match_operand:V2DF 0 "register_operand" > "=v,v,v,v") > + (unspec:V2DF [(match_operand:DF 1 "general_operand" "d,R,K,f") ^ constraint v > + (match_operand:SI 2 "nonmemory_operand" "an,I,I,n") > + (match_operand:V2DF 3 "register_operand" > "0,0,0,0")] > + UNSPEC_VEC_SET))] > + "TARGET_VX > + && (!CONST_INT_P (operands[2]) > + || UINTVAL (operands[2]) < GET_MODE_NUNITS (V2DFmode))" > + "@ > + vlvgg\t%v0,%1,%Y2 > + vleg\t%v0,%1,%2 > + vleig\t%v0,%1,%2 > + #" > + [(set_attr "op_type" "VRS,VRX,VRI,*")]) > + > +(define_split > + [(set (match_operand:V2DF 0 "register_operand" "") ^ Should be aligned with the other operands (doesn't show up here because how the tabs and the leading + play together). > + (unspec:V2DF [(match_operand:DF 1 "register_operand" "") > + (match_operand:SI 2 "const_int_operand" "") > + (match_operand:V2DF 3 "register_operand" "")] > + UNSPEC_VEC_SET))] > + "TARGET_VX > + && (UINTVAL (operands[2]) < GET_MODE_NUNITS (V2DFmode)) > + && reload_completed > + && FP_REGNO_P (REGNO (operands[1]))" ^ If constraint v is used, then of course VECTOR_REGNO_P must be used. > + [(set (match_dup 0) > + (vec_select:V2DF > + (vec_concat:V4DF > + (match_dup 1) > + (match_dup 3)) > + (parallel [(const_int 0) (match_dup 4)])))] > + " > + { ^ For single line C code we use "..." and for multi line {} where the curly brackets start/end in row zero, i.e., nesting {} into "" is not necessary. 
> + operands[1] = gen_rtx_REG (V2DFmode, REGNO (operands[1])); > + if (UINTVAL (operands[2]) == 0) > + operands[4] = GEN_INT (3); > + else > + { > + std::swap (operands[1], operands[3]); > + operands[4] = GEN_INT (2); > + } > + }") > + > ; vlvgb, vlvgh, vlvgf, vlvgg > (define_insn "*vec_set<mode>_plus" > [(set (match_operand:V 0 "register_operand" "=v") > @@ -554,19 +620,66 @@ > (define_insn "*vec_extract<mode>" > [(set (match_operand:<non_vec> 0 "nonimmediate_operand" "=d,R") > (vec_select:<non_vec> > - (match_operand:V 1 "nonmemory_operand" "v,v") > + (match_operand:VI 1 "nonmemory_operand" "v,v") > (parallel > [(match_operand:SI 2 "nonmemory_operand" "an,I")])))] > "TARGET_VX" > { > if (CONST_INT_P (operands[2])) > - operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS > (<V:MODE>mode) - 1)); > + operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS > (<VI:MODE>mode) - 1)); > if (which_alternative == 0) > return "vlgv<bhfgq>\t%0,%v1,%Y2"; > return "vste<bhfgq>\t%v1,%0,%2"; > } > [(set_attr "op_type" "VRS,VRX")]) > > +(define_insn "*vec_extract<mode>" > + [(set (match_operand:<non_vec> 0 "nonimmediate_operand" "=d,R,f") ^ v constraint > + (vec_select:<non_vec> > + (match_operand:VF 1 "nonmemory_operand" "v,v,f") ^ v constraint > + (parallel > + [(match_operand:SI 2 "nonmemory_operand" "an,I,n")])))] > + "TARGET_VX" > + { > + if (CONST_INT_P (operands[2])) > + operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS > (<VF:MODE>mode) - 1)); > + if (which_alternative == 0) > + return "vlgv<bhfgq>\t%0,%v1,%Y2"; > + else if (which_alternative == 1) > + return "vste<bhfgq>\t%v1,%0,%2"; > + else > + return "#"; > + } > + [(set_attr "op_type" "VRS,VRX,*")]) > + > +(define_split > + [(set (match_operand:<non_vec> 0 "register_operand" "") > + (vec_select:<non_vec> > + (match_operand:VF 1 "register_operand" "") > + (parallel > + [(match_operand:SI 2 "const_int_operand" "")])))] > + "TARGET_VX && reload_completed && FP_REGNO_P (REGNO 
(operands[0]))" ^ If constraint v is used, then of course VECTOR_REGNO_P must be used. > + [(set (match_dup 0) > + (vec_duplicate:VF > + (vec_select:<non_vec> > + (match_dup 1) > + (parallel [(match_dup 2)]))))] > + " > + { ^ same as above, i.e., "" vs {} > + unsigned HOST_WIDE_INT idx = UINTVAL (operands[2]) & (GET_MODE_NUNITS > (<VF:MODE>mode) - 1); > + if (idx == 0) > + { > + rtx dest = gen_rtx_REG (<VF:MODE>mode, REGNO (operands[0])); > + emit_insn (gen_mov<VF:mode> (dest, operands[1])); > + DONE; > + } > + else > + { > + operands[0] = gen_rtx_REG (<VF:MODE>mode, REGNO (operands[0])); > + operands[2] = GEN_INT (idx); > + } > + }") > + > ; vlgvb, vlgvh, vlgvf, vlgvg > (define_insn "*vec_extract<mode>_plus" > [(set (match_operand:<non_vec> 0 "nonimmediate_operand" "=d") > @@ -603,10 +716,10 @@ > ; Replicate from vector element > ; vrepb, vreph, vrepf, vrepg > (define_insn "*vec_splat<mode>" > - [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v") > - (vec_duplicate:V_128_NOSINGLE > + [(set (match_operand:V 0 "register_operand" "=v") > + (vec_duplicate:V > (vec_select:<non_vec> > - (match_operand:V_128_NOSINGLE 1 "register_operand" "v") > + (match_operand:V 1 "register_operand" "v") > (parallel > [(match_operand:QI 2 "const_mask_operand" "C")]))))] > "TARGET_VX && UINTVAL (operands[2]) < GET_MODE_NUNITS (<MODE>mode)" > diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-extract-1.c > b/gcc/testsuite/gcc.target/s390/vector/vec-extract-1.c > new file mode 100644 > index 000000000000..9df7909a3ea8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/s390/vector/vec-extract-1.c > @@ -0,0 +1,190 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=z14 -mzarch" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > + > +typedef double V2DF __attribute__((vector_size(16))); > +typedef float V4SF __attribute__((vector_size(16))); > +typedef float V2SF __attribute__((vector_size(8))); > +typedef double V1DF __attribute__((vector_size(8))); > 
+typedef float V1SF __attribute__((vector_size(4))); > +typedef long double V1TF __attribute__((vector_size(16))); > + > +/* > +** extractfirstdouble: > +** vlr %v0,%v24 > +** br %r14 > +*/ > +double > +extractfirstdouble (V2DF x) > +{ > + return x[0]; > +} > + > +/* > +** extractseconddouble: > +** vrepg %v0,%v24,1 > +** br %r14 > +*/ > +double > +extractseconddouble (V2DF x) > +{ > + return x[1]; > +} > + > +/* > +** extractnthdouble: > +** vlgvg (%r.),%v24,0\(%r2\) > +** ldgr %f0,\1 > +** br %r14 > +*/ > +double > +extractnthdouble (V2DF x, int n) > +{ > + return x[n]; > +} > + > +/* > +** sumfirstdouble: > +** vfadb %v0,%v24,%v26 > +** br %r14 > +*/ > +double > +sumfirstdouble (V2DF x, V2DF y) > +{ > + return (x + y)[0]; > +} > + > +/* > +** extractfirstfloat: > +** vlr %v0,%v24 > +** br %r14 > +*/ > +float > +extractfirstfloat (V4SF x) > +{ > + return x[0]; > +} > + > +/* > +** extractsecondfloat: > +** vrepf %v0,%v24,1 > +** br %r14 > +*/ > +float > +extractsecondfloat (V4SF x) > +{ > + return x[1]; > +} > + > +/* > +** extractthirdfloat: > +** vrepf %v0,%v24,2 > +** br %r14 > +*/ > +float > +extractthirdfloat (V4SF x) > +{ > + return x[2]; > +} > + > +/* > +** extractfourthfloat: > +** vrepf %v0,%v24,3 > +** br %r14 > +*/ > +float > +extractfourthfloat (V4SF x) > +{ > + return x[3]; > +} > + > +/* > +** extractnthfloat: > +** vlgvf (%r.),%v24,0\(%r2\) > +** vlvgf %v0,\1,0 > +** br %r14 > +*/ > +float > +extractnthfloat (V4SF x, int n) > +{ > + return x[n]; > +} > + > +/* > +** sumfirstfloat: > +** vfasb %v0,%v24,%v26 > +** br %r14 > +*/ > +float > +sumfirstfloat (V4SF x, V4SF y) > +{ > + return (x + y)[0]; > +} > + > +/* > +** extractfirst2: > +** vlr %v0,%v24 > +** br %r14 > +*/ > +float > +extractfirst2 (V2SF x) > +{ > + return x[0]; > +} > + > +/* > +** extractsecond2: > +** vrepf %v0,%v24,1 > +** br %r14 > +*/ > +float > +extractsecond2 (V2SF x) > +{ > + return x[1]; > +} > + > +/* > +** extractnth2: > +** vlgvf (%r.),%v24,0\(%r2\) > +** vlvgf %v0,\1,0 > 
+** br %r14 > +*/ > +float > +extractnth2 (V2SF x, int n) > +{ > + return x[n]; > +} > + > +/* > +** extractsinglef: > +** vlr %v0,%v24 > +** br %r14 > +*/ > +float > +extractsinglef (V1SF x) > +{ > + return x[0]; > +} > + > +/* > +** extractsingled: > +** vlr %v0,%v24 > +** br %r14 > +*/ > +double > +extractsingled (V1DF x) > +{ > + return x[0]; > +} > + > +/* > +** extractsingleld: > +** vlr (%v.),%v24 > +** vst \1,0\(%r2\),3 > +** br %r14 > +*/ > +long double > +extractsingleld (V1TF x) > +{ > + return x[0]; > +} > diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-set-1.c > b/gcc/testsuite/gcc.target/s390/vector/vec-set-1.c > new file mode 100644 > index 000000000000..2eddb58290f6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/s390/vector/vec-set-1.c > @@ -0,0 +1,67 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=z14 -mzarch" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > + > +typedef double V2DF __attribute__((vector_size(16))); > +typedef double V1DF __attribute__((vector_size(8))); > + > +/* > +** setdf0: > +** vpdi %v24,%v0,%v24,1 > +** br %r14 > +*/ > +V2DF > +setdf0 (V2DF x, double y) > +{ > + x[0] = y; > + return x; > +} > + > +/* > +** setdf1: > +** vmrhg %v24,%v24,%v0 > +** br %r14 > +*/ > +V2DF > +setdf1 (V2DF x, double y) > +{ > + x[1] = y; > + return x; > +} > + > +/* > +** setdfn: > +** lgdr (%r.),%f0 > +** vlvgg %v24,\1,0\(%r2\) > +** br %r14 > +*/ > +V2DF > +setdfn (V2DF x, double y, int n) > +{ > + x[n] = y; > + return x; > +} > + > +/* > +** set1df: > +** vlr %v24,%v0 > +** br %r14 > +*/ > +V1DF > +set1df (V1DF x, double y) > +{ > + x[0] = y; > + return x; > +} > + > +/* > +** set1dfn: > +** vlr %v24,%v0 > +** br %r14 > +*/ > +V1DF > +set1dfn (V1DF x, double y, int n) > +{ > + x[n] = y; > + return x; > +} I very much like those tests. Could you add for the sake of completeness also some SF tests for vec-set-1.c? > -- > 2.43.5 > Could you run a bootstrap and regtest? Thanks, Stefan