[PATCH 1/2] [x86] Support smin/smax for V2HF/V4HF

2023-10-07 Thread liuhongt
Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
Ready to push to trunk.
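
As an illustration of what this enables (my sketch, not the literal contents
of the new part-vect-vminmaxph-1.c test), a short _Float16 min loop can now
be vectorized through the new <code><mode>3 expander, using vminph on the
low half of an xmm register:

/* Hypothetical example; compile with -O2 -ffast-math -mavx512fp16 -mavx512vl
   (-ffast-math so the conditional is recognized as MIN_EXPR/smin).  */
void
min4 (_Float16 *restrict r, _Float16 *restrict a, _Float16 *restrict b)
{
  for (int i = 0; i != 4; i++)
    r[i] = a[i] < b[i] ? a[i] : b[i];	/* smin -> vminph  */
}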

gcc/ChangeLog:

* config/i386/mmx.md (VHF_32_64): New mode iterator.
(<insn><mode>3): New define_expand, merged from ..
(<insn>v4hf3): .. this and
(<insn>v2hf3): .. this.
(movd_v2hf_to_sse_reg): New define_expand, split out from ..
(movd_v2hf_to_sse): .. this.
(<code><mode>3): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-vminmaxph-1.c: New test.
* gcc.target/i386/avx512fp16-64-32-vecop-1.c: Scan-assembler
only for { target { ! ia32 } }.
---
 gcc/config/i386/mmx.md| 74 +++
 .../i386/avx512fp16-64-32-vecop-1.c   |  8 +-
 .../gcc.target/i386/part-vect-vminmaxph-1.c   | 36 +
 3 files changed, 83 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vminmaxph-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index ef578222945..77f1db265ab 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1936,25 +1936,7 @@ (define_expand "lroundv2sfv2si2"
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(define_expand "v4hf3"
-  [(set (match_operand:V4HF 0 "register_operand")
-   (plusminusmult:V4HF
- (match_operand:V4HF 1 "nonimmediate_operand")
- (match_operand:V4HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
-{
-  rtx op2 = gen_reg_rtx (V8HFmode);
-  rtx op1 = gen_reg_rtx (V8HFmode);
-  rtx op0 = gen_reg_rtx (V8HFmode);
-
-  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
-  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
-
-  emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
-
-  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
-  DONE;
-})
+(define_mode_iterator VHF_32_64 [V2HF (V4HF "TARGET_MMX_WITH_SSE")])
 
 (define_expand "divv4hf3"
   [(set (match_operand:V4HF 0 "register_operand")
@@ -1976,39 +1958,50 @@ (define_expand "divv4hf3"
   DONE;
 })
 
+(define_mode_attr mov_to_sse_suffix [(V2HF "d") (V4HF "q")])
 (define_expand "movd_v2hf_to_sse"
   [(set (match_operand:V8HF 0 "register_operand")
(vec_merge:V8HF
  (vec_duplicate:V8HF
(match_operand:V2HF 1 "nonimmediate_operand"))
- (match_operand:V8HF 2 "reg_or_0_operand")
+ (match_dup 2)
  (const_int 3)))]
   "TARGET_SSE"
 {
-  if (!flag_trapping_math && operands[2] == CONST0_RTX (V8HFmode))
+  if (!flag_trapping_math)
   {
 rtx op1 = force_reg (V2HFmode, operands[1]);
 emit_move_insn (operands[0], lowpart_subreg (V8HFmode, op1, V2HFmode));
 DONE;
   }
+  operands[2] = CONST0_RTX (V8HFmode);
 })
 
-(define_expand "<insn>v2hf3"
-  [(set (match_operand:V2HF 0 "register_operand")
-   (plusminusmult:V2HF
- (match_operand:V2HF 1 "nonimmediate_operand")
- (match_operand:V2HF 2 "nonimmediate_operand")))]
+(define_expand "movd_v2hf_to_sse_reg"
+  [(set (match_operand:V8HF 0 "register_operand")
+   (vec_merge:V8HF
+ (vec_duplicate:V8HF
+   (match_operand:V2HF 1 "nonimmediate_operand"))
+ (match_operand:V8HF 2 "register_operand")
+ (const_int 3)))]
+  "TARGET_SSE")
+
+(define_expand "3"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+   (plusminusmult:VHF_32_64
+ (match_operand:VHF_32_64 1 "nonimmediate_operand")
+ (match_operand:VHF_32_64 2 "nonimmediate_operand")))]
   "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
   rtx op0 = gen_reg_rtx (V8HFmode);
 
-  emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode)));
-  emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
  emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
 
-  emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
   DONE;
 })
 
@@ -2023,15 +2016,34 @@ (define_expand "divv2hf3"
   rtx op1 = gen_reg_rtx (V8HFmode);
   rtx op0 = gen_reg_rtx (V8HFmode);
 
-  emit_insn (gen_movd_v2hf_to_sse (op2, operands[2],
+  emit_insn (gen_movd_v2hf_to_sse_reg (op2, operands[2],
   force_reg (V8HFmode, CONST1_RTX (V8HFmode))));
-  emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
+  emit_insn (gen_movd_v2hf_to_sse (op1, operands[1]));
   emit_insn (gen_divv8hf3 (op0, op1, op2));
 
   emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
   DONE;
 })
 
+(define_expand "3"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+   (smaxmin:VHF_32_64
+ (match_operand:VHF_32_64 1 "nonimmediate_operand")
+ (match_operand:VHF_32_64 2 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 && TARG

[PATCH 2/2] Support signbit/xorsign/copysign/abs/neg/and/xor/ior/andn for V2HF/V4HF.

2023-10-07 Thread liuhongt
Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
Ready to push to trunk.
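
For a feel of the new expanders (a hedged sketch, not the new tests' literal
contents): abs and neg only need a logic operation against a sign-bit mask
built by ix86_build_signbit_mask, which is why plain TARGET_SSE suffices:

/* Hypothetical example; compile with -O2.  */
void
neg4 (_Float16 *restrict r, _Float16 *restrict a)
{
  for (int i = 0; i != 4; i++)
    r[i] = -a[i];	/* neg -> xorps against the sign-bit mask  */
}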

gcc/ChangeLog:

* config/i386/i386.cc (ix86_build_const_vector): Handle V2HF
and V4HFmode.
(ix86_build_signbit_mask): Ditto.
* config/i386/mmx.md (mmxintvecmode): Ditto.
(<code><mode>2): New define_expand.
(*mmx_<code><mode>2): New define_insn_and_split.
(*mmx_nabs<mode>2): Ditto.
(*mmx_andnot<mode>3): New define_insn.
(<code><mode>3): Ditto.
(copysign<mode>3): New define_expand.
(xorsign<mode>3): Ditto.
(signbit<mode>2): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-absneghf.c: New test.
* gcc.target/i386/part-vect-copysignhf.c: New test.
* gcc.target/i386/part-vect-xorsignhf.c: New test.
---
 gcc/config/i386/i386.cc   |   4 +
 gcc/config/i386/mmx.md| 114 +-
 .../gcc.target/i386/part-vect-absneghf.c  |  91 ++
 .../gcc.target/i386/part-vect-copysignhf.c|  60 +
 .../gcc.target/i386/part-vect-vminmaxph-1.c   |   4 +-
 .../gcc.target/i386/part-vect-xorsignhf.c |  60 +
 6 files changed, 330 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-absneghf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-copysignhf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-xorsignhf.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9557bffd092..46326d3c82e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15752,6 +15752,8 @@ ix86_build_const_vector (machine_mode mode, bool vect, 
rtx value)
 case E_V2DImode:
   gcc_assert (vect);
   /* FALLTHRU */
+case E_V2HFmode:
+case E_V4HFmode:
 case E_V8HFmode:
 case E_V16HFmode:
 case E_V32HFmode:
@@ -15793,6 +15795,8 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, 
bool invert)
 
   switch (mode)
 {
+case E_V2HFmode:
+case E_V4HFmode:
 case E_V8HFmode:
 case E_V16HFmode:
 case E_V32HFmode:
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 77f1db265ab..c68a3d6fe43 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -99,7 +99,8 @@ (define_mode_attr mmxdoublemode
 
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr mmxintvecmode
-  [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")])
+  [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")
+   (V4HF "V4HF") (V2HF "V2HI")])
 
 (define_mode_attr mmxintvecmodelower
   [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")])
@@ -2045,6 +2046,117 @@ (define_expand "<code><mode>3"
   DONE;
 })
 
+(define_expand "2"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+   (absneg:VHF_32_64
+ (match_operand:VHF_32_64 1 "register_operand")))]
+  "TARGET_SSE"
+  "ix86_expand_fp_absneg_operator (, mode, operands); DONE;")
+
+(define_insn_and_split "*mmx_"
+  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
+   (absneg:VHF_32_64
+ (match_operand:VHF_32_64 1 "register_operand" "0,x,x")))
+   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(<absneg_op>:<MODE> (match_dup 1) (match_dup 2)))]
+{
+  if (!TARGET_AVX && operands_match_p (operands[0], operands[2]))
+std::swap (operands[1], operands[2]);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
+(define_insn_and_split "*mmx_nabs2"
+  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
+   (neg:VHF_32_64
+ (abs:VHF_32_64
+	  (match_operand:VHF_32_64 1 "register_operand" "0,x,x"))))
+   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(ior:<MODE> (match_dup 1) (match_dup 2)))])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel half-precision floating point logical operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "*mmx_andnot3"
+  [(set (match_operand:VHF_32_64 0 "register_operand""=x,x")
+   (and:VHF_32_64
+ (not:VHF_32_64
+   (match_operand:VHF_32_64 1 "register_operand" "0,x"))
+ (match_operand:VHF_32_64 2 "register_operand"   "x,x")))]
+  "TARGET_SSE"
+  "@
+   andnps\t{%2, %0|%0, %2}
+   vandnps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "3"
+  [(set (match_operand:VHF_32_64 0 "register_operand"   "=x,x")
+   (any_logic:VHF_32_64
+ (match_operand:VHF_32_64 1 "register_operand" "%0,x")
+ (match_operand:VHF_32_64 2 "register_operand" " x,x")))]
+  "TARGET_SSE"
+  "@
+   <logic>ps\t{%2, %0|%0, %2}
+   v<logic>ps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog,sselog")
+   (set_attr "prefix" 

[PATCH] [x86] Refine predicate of operands[2] in divv4hf3 with register_operand.

2023-10-10 Thread liuhongt
In the expander, it will emit the insn below.

rtx tmp = gen_rtx_VEC_CONCAT (V4SFmode, operands[2],
force_reg (V2SFmode, CONST1_RTX (V2SFmode)));

but *vec_concat only allows register_operand.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/111745
* config/i386/mmx.md (divv4hf3): Refine predicate of
operands[2] with register_operand.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111745.c: New test.
---
 gcc/config/i386/mmx.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr111745.c | 18 ++
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111745.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index c84a37a8444..4707cfae93f 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1950,7 +1950,7 @@ (define_expand "divv4hf3"
   [(set (match_operand:V4HF 0 "register_operand")
(div:V4HF
  (match_operand:V4HF 1 "nonimmediate_operand")
- (match_operand:V4HF 2 "nonimmediate_operand")))]
+ (match_operand:V4HF 2 "register_operand")))]
   "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
diff --git a/gcc/testsuite/gcc.target/i386/pr111745.c 
b/gcc/testsuite/gcc.target/i386/pr111745.c
new file mode 100644
index 000..e8989d96abf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111745.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -ffloat-store -O2" } */
+
+char c;
+_Float16 __attribute__((__vector_size__ (4 * sizeof (_Float16)))) f;
+_Float16 __attribute__((__vector_size__ (2 * sizeof (_Float16)))) f1;
+
+void
+foo (void)
+{
+  f /= c;
+}
+
+void
+foo1 (void)
+{
+  f1 /= c;
+}
-- 
2.31.1



[PATCH 2/2] Support 32/64-bit vectorization for conversion between _Float16 and integer/float.

2023-10-11 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
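
As an illustration (my example, not the new test's contents), a small
_Float16-to-integer conversion loop now goes through movd/movq to an xmm
register, a full V8HF conversion, and a lowpart subreg back:

/* Hypothetical example; compile with -O2 -mavx512fp16 -mavx512vl.  */
void
cvt4 (short *restrict dst, _Float16 *restrict src)
{
  for (int i = 0; i != 4; i++)
    dst[i] = (short) src[i];	/* V4HF -> V4HI fix_trunc  */
}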

gcc/ChangeLog:

* config/i386/mmx.md (V2FI_32): New mode iterator.
(movd_v2hf_to_sse): Rename to ..
(movd_<mode>_to_sse): .. this.
(movd_v2hf_to_sse_reg): Rename to ..
(movd_<mode>_to_sse_reg): .. this.
(fix_trunc2): New
expander.
(fix_truncv2hfv2si2): Ditto.
(float2): Ditto.
(floatv2siv2hf2): Ditto.
(extendv2hfv2sf2): Ditto.
(truncv2sfv2hf2): Ditto.
* config/i386/sse.md (*vec_concatv8hf_movss): Rename to ..
(*vec_concat<mode>_movss): .. this.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-hf-convert-1.c: New test.
---
 gcc/config/i386/mmx.md| 164 --
 gcc/config/i386/sse.md|  12 +-
 .../gcc.target/i386/part-vect-hf-convert-1.c  | 111 
 3 files changed, 262 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-hf-convert-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 8375100d4bf..be2a9026c44 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -60,6 +60,7 @@ (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
 ;; All 4-byte integer/float16 vector modes
 (define_mode_iterator V_32 [V4QI V2HI V1SI V2HF V2BF])
 
+(define_mode_iterator V2FI_32 [V2HF V2BF V2HI])
 ;; 4-byte integer vector modes
 (define_mode_iterator VI_32 [V4QI V2HI])
 
@@ -79,7 +80,7 @@ (define_mode_iterator V_16_32_64
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
-(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF])
+(define_mode_iterator V24FI [V2SF V2SI V4HF V4HI])
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
@@ -100,7 +101,7 @@ (define_mode_attr mmxdoublemode
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr mmxintvecmode
   [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")
-   (V4HF "V4HF") (V2HF "V2HI")])
+   (V4HF "V4HI") (V2HF "V2HI")])
 
 (define_mode_attr mmxintvecmodelower
   [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")
@@ -108,7 +109,7 @@ (define_mode_attr mmxintvecmodelower
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr mmxdoublevecmode
-  [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")])
+  [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF") (V4HI "V8HI")])
 
 ;; Mapping of vector modes back to the scalar modes
 (define_mode_attr mmxscalarmode
@@ -600,7 +601,7 @@ (define_insn "sse_movntq"
 (define_expand "movq__to_sse"
   [(set (match_operand: 0 "register_operand")
(vec_concat:
- (match_operand:V2FI_V4HF 1 "nonimmediate_operand")
+ (match_operand:V24FI 1 "nonimmediate_operand")
  (match_dup 2)))]
   "TARGET_SSE2"
 {
@@ -1967,31 +1968,40 @@ (define_expand "divv4hf3"
   DONE;
 })
 
-(define_mode_attr mov_to_sse_suffix [(V2HF "d") (V4HF "q")])
-(define_expand "movd_v2hf_to_sse"
-  [(set (match_operand:V8HF 0 "register_operand")
-   (vec_merge:V8HF
- (vec_duplicate:V8HF
-   (match_operand:V2HF 1 "nonimmediate_operand"))
+(define_mode_attr mov_to_sse_suffix
+  [(V2HF "d") (V4HF "q") (V2HI "d") (V4HI "q")])
+
+(define_mode_attr mmxxmmmode
+  [(V2HF "V8HF") (V2HI "V8HI") (V2BF "V8BF")])
+
+(define_mode_attr mmxxmmmodelower
+  [(V2HF "v8hf") (V2HI "v8hi") (V2BF "v8bf")])
+
+(define_expand "movd__to_sse"
+  [(set (match_operand: 0 "register_operand")
+   (vec_merge:
+ (vec_duplicate:
+   (match_operand:V2FI_32 1 "nonimmediate_operand"))
  (match_dup 2)
  (const_int 3)))]
   "TARGET_SSE"
 {
   if (!flag_trapping_math)
   {
-rtx op1 = force_reg (V2HFmode, operands[1]);
-emit_move_insn (operands[0], lowpart_subreg (V8HFmode, op1, V2HFmode));
+    rtx op1 = force_reg (<MODE>mode, operands[1]);
+    emit_move_insn (operands[0],
+		    lowpart_subreg (<mmxxmmmode>mode, op1, <MODE>mode));
 DONE;
   }
-  operands[2] = CONST0_RTX (V8HFmode);
+  operands[2] = CONST0_RTX (<mmxxmmmode>mode);
 })
 
-(define_expand "movd_v2hf_to_sse_reg"
-  [(set (match_operand:V8HF 0 "register_operand")
-   (vec_merge:V8HF
- (vec_duplicate:V8HF
-   (match_operand:V2HF 1 "nonimmediate_operand"))
- (match_operand:V8HF 2 "register_operand")
+(define_expand "movd__to_sse_reg"
+  [(set (match_operand: 0 "register_operand")
+   (vec_merge:
+ (vec_duplicate:
+   (match_operand:V2FI_32 1 "nonimmediate_operand"))
+ (match_operand: 2 "register_operand")
  (const_int 3)))]
   "TARGET_SSE")
 
@@ -2353,6 +2363,122 @@ (define_expand "signbit<mode>2"
   "TARGET_SSE2"
   "operands[2] = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode)-1);")
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point conversion operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_ex

[PATCH 1/2] Enable vectorization for V2HF/V4HF rounding operations and sqrt.

2023-10-11 Thread liuhongt
lrint/lround/lceil/lfloor are not vectorized due to a vectorization
restriction: when the input element size differs from the output element size,
vectorization relies on the old TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
hook instead of the modern standard pattern names. The patch only supports the
standard pattern names and doesn't update ix86_builtin_vectorized_function.
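
As an illustration of the standard-pattern route (a hedged sketch, assuming
the __builtin_*f16 builtins are available), rounding and sqrt on small
_Float16 vectors now vectorize through the new expanders:

/* Hypothetical example; compile with -O2 -mavx512fp16 -mavx512vl.  */
void
trunc4 (_Float16 *restrict dst, _Float16 *restrict src)
{
  for (int i = 0; i != 4; i++)
    dst[i] = __builtin_truncf16 (src[i]);	/* btrunc -> vrndscaleph  */
}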

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_sse_copysign_to_positive):
Handle HFmode.
(ix86_expand_round_sse4): Ditto.
* config/i386/i386.md (roundhf2): New expander.
(lroundhf<mode>2): Ditto.
(lrinthf<mode>2): Ditto.
(l<rounding_insn>hf<mode>2): Ditto.
* config/i386/mmx.md (sqrt<mode>2): Ditto.
(btrunc<mode>2): Ditto.
(nearbyint<mode>2): Ditto.
(rint<mode>2): Ditto.
(lrint<mode><mmxintvecmodelower>2): Ditto.
(floor<mode>2): Ditto.
(lfloor<mode><mmxintvecmodelower>2): Ditto.
(ceil<mode>2): Ditto.
(lceil<mode><mmxintvecmodelower>2): Ditto.
(round<mode>2): Ditto.
(lround<mode><mmxintvecmodelower>2): Ditto.
* config/i386/sse.md (lrint<mode><sseintvecmodelower>2): Ditto.
(lfloor<mode><sseintvecmodelower>2): Ditto.
(lceil<mode><sseintvecmodelower>2): Ditto.
(lround<mode><sseintvecmodelower>2): Ditto.
(sse4_1_round<ssescalarmodesuffix>): Extend to V8HF.
(round<mode>2): Extend to V8HF/V16HF/V32HF.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-roundhf.c: New test.
* gcc.target/i386/part-vect-sqrtph-1.c: New test.
---
 gcc/config/i386/i386-expand.cc|   6 +
 gcc/config/i386/i386.md   |  38 +++
 gcc/config/i386/mmx.md| 191 ++-
 gcc/config/i386/sse.md|  60 -
 .../gcc.target/i386/part-vect-roundhf.c   | 217 ++
 .../gcc.target/i386/part-vect-sqrtph-1.c  |  20 ++
 6 files changed, 521 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-roundhf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-sqrtph-1.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 425f3531862..b81b5cc030c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -18434,6 +18434,8 @@ ix86_sse_copysign_to_positive (rtx result, rtx 
abs_value, rtx sign, rtx mask)
vmode = V4SFmode;
   else if (mode == DFmode)
vmode = V2DFmode;
+  else if (mode == HFmode)
+   vmode = V8HFmode;
   else
vmode = mode;
 
@@ -18970,6 +18972,10 @@ ix86_expand_round_sse4 (rtx op0, rtx op1)
 
   switch (mode)
 {
+case E_HFmode:
+  gen_copysign = gen_copysignhf3;
+  gen_round = gen_sse4_1_roundhf2;
+  break;
 case E_SFmode:
   gen_copysign = gen_copysignsf3;
   gen_round = gen_sse4_1_roundsf2;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 65a0dd025c7..41173cb3452 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -21741,6 +21741,15 @@ (define_expand "nearbyint<mode>2"
   DONE;
 })
 
+(define_expand "roundhf2"
+  [(match_operand:HF 0 "register_operand")
+   (match_operand:HF 1 "register_operand")]
+  "TARGET_AVX512FP16 && !flag_trapping_math && !flag_rounding_math"
+{
+  ix86_expand_round_sse4 (operands[0], operands[1]);
+  DONE;
+})
+
 (define_expand "round2"
   [(match_operand:X87MODEF 0 "register_operand")
(match_operand:X87MODEF 1 "nonimmediate_operand")]
@@ -21792,6 +21801,22 @@ (define_insn "lrintxf<mode>2"
   [(set_attr "type" "fpspc")
    (set_attr "mode" "<MODE>")])
 
+(define_expand "lroundhf2"
+  [(set (match_operand:SWI248 0 "register_operand")
+ (unspec:SWI248 [(match_operand:HF 1 "nonimmediate_operand")]
+  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_AVX512FP16 && !flag_trapping_math && !flag_rounding_math"
+{
+  ix86_expand_lround (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "lrinthf2"
+  [(set (match_operand:SWI48 0 "register_operand")
+ (unspec:SWI48 [(match_operand:HF 1 "nonimmediate_operand")]
+  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_AVX512FP16")
+
 (define_expand "lrint2"
   [(set (match_operand:SWI48 0 "register_operand")
  (unspec:SWI48 [(match_operand:MODEF 1 "nonimmediate_operand")]
@@ -22034,6 +22059,19 @@ (define_expand "l<rounding_insn>xf<mode>2"
&& (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
&& flag_unsafe_math_optimizations")
 
+(define_expand "lhf2"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand")
+   (unspec:SWI48 [(match_operand:HF 1 "register_operand")]
+   FIST_ROUNDING))]
+  "TARGET_AVX512FP16"
+{
+  rtx tmp = gen_reg_rtx (HFmode);
+  emit_insn (gen_sse4_1_roundhf2 (tmp, operands[1],
+				  GEN_INT (ROUND_<ROUNDING> | ROUND_NO_EXC)));
+  emit_insn (gen_fix_trunchf<mode>2 (operands[0], tmp));
+  DONE;
+})
+
 (define_expand "l2"
   [(parallel [(set (match_operand:SWI48 0 "nonimmediate_operand")
   (unspec:SWI48 [(match_operand:MODEF 1 "register_operand")]
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index c84a37a8444..8375100d4bf 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc

[PATCH] Support 32/64-bit vectorization for _Float16 fma related operations.

2023-10-16 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
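
A minimal sketch of what now vectorizes (hypothetical example; with the
default -ffp-contract=fast the multiply-add below is contracted to an FMA):

/* Compile with -O2 -mavx512fp16 -mavx512vl.  */
void
madd4 (_Float16 *restrict r, _Float16 *a, _Float16 *b, _Float16 *c)
{
  for (int i = 0; i != 4; i++)
    r[i] = a[i] * b[i] + c[i];	/* fmav4hf4 -> vfmadd213ph  */
}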

gcc/ChangeLog:

* config/i386/mmx.md (fma<mode>4): New expander.
(fms<mode>4): Ditto.
(fnma<mode>4): Ditto.
(fnms<mode>4): Ditto.
(vec_fmaddsubv4hf4): Ditto.
(vec_fmsubaddv4hf4): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-fmaddsubhf-1.c: New test.
* gcc.target/i386/part-vect-fmahf-1.c: New test.
---
 gcc/config/i386/mmx.md| 152 +-
 .../gcc.target/i386/part-vect-fmaddsubhf-1.c  |  22 +++
 .../gcc.target/i386/part-vect-fmahf-1.c   |  58 +++
 3 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-fmaddsubhf-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-fmahf-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 82ca49c207b..491a0a51272 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2365,7 +2365,157 @@ (define_expand "signbit2"
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
-;; Parallel single-precision floating point conversion operations
+;; Parallel half-precision FMA multiply/accumulate instructions.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "fma4"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+   (fma:VHF_32_64
+ (match_operand:VHF_32_64 1 "nonimmediate_operand")
+ (match_operand:VHF_32_64 2 "nonimmediate_operand")
+ (match_operand:VHF_32_64 3 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
+
+  emit_insn (gen_fmav8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "fms4"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+   (fma:VHF_32_64
+ (match_operand:VHF_32_64   1 "nonimmediate_operand")
+ (match_operand:VHF_32_64   2 "nonimmediate_operand")
+ (neg:VHF_32_64
+	    (match_operand:VHF_32_64 3 "nonimmediate_operand"))))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
+
+  emit_insn (gen_fmsv8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "fnma4"
+  [(set (match_operand:VHF_32_64 0 "register_operand")
+   (fma:VHF_32_64
+ (neg:VHF_32_64
+   (match_operand:VHF_32_64 1 "nonimmediate_operand"))
+ (match_operand:VHF_32_64   2 "nonimmediate_operand")
+ (match_operand:VHF_32_64   3 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
+
+  emit_insn (gen_fnmav8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "fnms4"
+  [(set (match_operand:VHF_32_64 0 "register_operand" "=v,v,x")
+   (fma:VHF_32_64
+ (neg:VHF_32_64
+   (match_operand:VHF_32_64 1 "nonimmediate_operand"))
+ (match_operand:VHF_32_64   2 "nonimmediate_operand")
+ (neg:VHF_32_64
+	    (match_operand:VHF_32_64 3 "nonimmediate_operand"))))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op3, operands[3]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op2, operands[2]));
+  emit_insn (gen_mov<mov_to_sse_suffix>_<mode>_to_sse (op1, operands[1]));
+
+  emit_insn (gen_fnmsv8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "vec_fmaddsubv4hf4"
+  [(match_operand:V4HF 0 "register_operand")
+   (match_operand:V4HF 1 "nonimmediate_operand")
+   (match_operand:V4HF 2 "nonimmediate_operand")
+   (match_operand:V4HF 3 "nonimmediate_operand")]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp

[PATCH] Avoid compile time hog on vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when iteration count is too big.

2023-10-18 Thread liuhongt
Also give up vectorization when niters_skip is negative, which will be
used for fully masked loops.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/111820
PR tree-optimization/111833
* tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Give
up vectorization for nonlinear iv vect_step_op_mul when
step_expr is not exact_log2 and niters is greater than
TYPE_PRECISION (TREE_TYPE (step_expr)). Also don't vectorize
for negative niters_skip which will be used by fully masked
loop.
(vect_can_advance_ivs_p): Pass whole phi_info to
vect_can_peel_nonlinear_iv_p.
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Optimize
init_expr * pow (step_expr, skipn) to init_expr
<< (log2 (step_expr) * skipn) when step_expr is exact_log2.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111820-1.c: New test.
* gcc.target/i386/pr111820-2.c: New test.
* gcc.target/i386/pr103144-mul-1.c: Adjust testcase.
---
 .../gcc.target/i386/pr103144-mul-1.c  |  6 ++--
 gcc/testsuite/gcc.target/i386/pr111820-1.c| 16 ++
 gcc/testsuite/gcc.target/i386/pr111820-2.c| 17 ++
 gcc/tree-vect-loop-manip.cc   | 28 ++--
 gcc/tree-vect-loop.cc | 32 ---
 5 files changed, 88 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-2.c

diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
index 640c34fd959..f80d1094097 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -23,7 +23,7 @@ foo_mul_const (int* a)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -34,7 +34,7 @@ foo_mul_peel (int* a, int b)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -46,6 +46,6 @@ foo_mul_peel_const (int* a)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr111820-1.c 
b/gcc/testsuite/gcc.target/i386/pr111820-1.c
new file mode 100644
index 000..50e960c39d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111820-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fno-tree-vrp -Wno-aggressive-loop-optimizations 
-fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump "Avoid compile time hog on 
vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when 
iteration count is too big" "vect" } } */
+
+int r;
+int r_0;
+
+void f1 (void)
+{
+  int n = 0;
+  while (-- n)
+{
+  r_0 += r;
+  r  *= 3;
+}
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr111820-2.c 
b/gcc/testsuite/gcc.target/i386/pr111820-2.c
new file mode 100644
index 000..bbdb40798c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111820-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fno-tree-vrp -fdump-tree-vect-details 
-Wno-aggressive-loop-optimizations" } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+int r;
+int r_0;
+
+void f (void)
+{
+  int n = 0;
+  while (-- n)
+{
+  r_0 += r ;
+  r  *= 2;
+}
+}
+
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 2608c286e5d..a530088b61d 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1783,8 +1783,10 @@ iv_phi_p (stmt_vec_info stmt_info)
 /* Return true if vectorizer can peel for nonlinear iv.  */
 static bool
 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
- enum vect_induction_op_type induction_type)
+ stmt_vec_info stmt_info)
 {
+  enum vect_induction_op_type induction_type
+= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
   tree niters_skip;
   /* Init_expr will be update by vect_update_ivs_after_vectorizer,
  if niters or vf is unkown:
@@ -1805,11 +1807,31 @@ vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
   return false;
 }
 
+  /* Avoid compile time hog on vect_peel_nonlinear_iv_init.  */
+  if (induction_type == vect_step_op_mul)
+{
+  tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
+  tree type = TREE_TYPE (step_expr);
+
+  if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
+ && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"Avoid compile time hog on"
+" vect_peel_nonlinear_iv_init"
+" for nonlinear induction vec_step_op_mul"
+ 

[PATCH] Avoid compile time hog on vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when iteration count is too big.

2023-10-18 Thread liuhongt
>So the bugs were not fixed without this hunk?  IIRC in the audit
>trail we concluded the value is always positive ... (but of course
>a large unsigned value can appear negative if you test it this way?)
No, I added this in case there's a negative skip_niters in the future, as
you mentioned in the PR; it's just defensive programming.

>I think you can use one of the mpz_pow* functions and
>wi::to_mpz/from_mpz for this.  See tree-ssa-loop-niter.cc for the
>most heavy user of mpz (but not pow I think).
Changed.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

There's a loop in vect_peel_nonlinear_iv_init to compute init_expr *
pow (step_expr, skip_niters). When skip_niters is too big, compile time
blows up. To avoid that, optimize init_expr * pow (step_expr, skip_niters) to
init_expr << (exact_log2 (step_expr) * skip_niters) when step_expr is a
power of 2; otherwise give up vectorization when skip_niters >=
TYPE_PRECISION (TREE_TYPE (init_expr)).

Also give up vectorization when niters_skip is negative, which will be
used for fully masked loops.
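
A small worked example of the equivalence (my illustration; the patch does
the arithmetic in wide_int, so the shift is modular rather than C-level UB):

/* step == 4 == 1 << 2, so init * 4^skipn == init << (2 * skipn) mod 2^32.  */
unsigned
peel_init (unsigned init, unsigned skipn)
{
  unsigned log2_step = __builtin_ctz (4);	/* 2 */
  return init << (log2_step * skipn);		/* valid while the shift < 32 */
}

This is also why the pr103144-mul testcases switch the induction step from 3
to 4: with a non-power-of-two step and a large iteration count, the loops
would now be rejected for vectorization.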

gcc/ChangeLog:

PR tree-optimization/111820
PR tree-optimization/111833
* tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Give
up vectorization for nonlinear iv vect_step_op_mul when
step_expr is not exact_log2 and niters is greater than
TYPE_PRECISION (TREE_TYPE (step_expr)). Also don't vectorize
for negative niters_skip which will be used by fully masked
loop.
(vect_can_advance_ivs_p): Pass whole phi_info to
vect_can_peel_nonlinear_iv_p.
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Optimize
init_expr * pow (step_expr, skipn) to init_expr
<< (log2 (step_expr) * skipn) when step_expr is exact_log2.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111820-1.c: New test.
* gcc.target/i386/pr111820-2.c: New test.
* gcc.target/i386/pr111820-3.c: New test.
* gcc.target/i386/pr103144-mul-1.c: Adjust testcase.
* gcc.target/i386/pr103144-mul-2.c: Adjust testcase.
---
 .../gcc.target/i386/pr103144-mul-1.c  |  8 ++---
 .../gcc.target/i386/pr103144-mul-2.c  |  8 ++---
 gcc/testsuite/gcc.target/i386/pr111820-1.c| 16 +
 gcc/testsuite/gcc.target/i386/pr111820-2.c| 16 +
 gcc/testsuite/gcc.target/i386/pr111820-3.c| 16 +
 gcc/tree-vect-loop-manip.cc   | 28 +--
 gcc/tree-vect-loop.cc | 34 ---
 7 files changed, 110 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-3.c

diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
index 640c34fd959..913d7737dcd 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -11,7 +11,7 @@ foo_mul (int* a, int b)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -23,7 +23,7 @@ foo_mul_const (int* a)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -34,7 +34,7 @@ foo_mul_peel (int* a, int b)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -46,6 +46,6 @@ foo_mul_peel_const (int* a)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
index 39fdea3a69d..b2ff186e335 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
@@ -16,12 +16,12 @@ avx2_test (void)
 
   __builtin_memset (epi32_exp, 0, N * sizeof (int));
   int b = 8;
-  v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, 
b * 729, b * 2187 };
+  v8si init = __extension__(v8si) { b, b * 4, b * 16, b * 64, b * 256, b * 
1024, b * 4096, b * 16384 };
 
   for (int i = 0; i != N / 8; i++)
 {
   memcpy (epi32_exp + i * 8, &init, 32);
-  init *= 6561;
+  init *= 65536;
 }
 
   foo_mul (epi32_dst, b);
@@ -32,11 +32,11 @@ avx2_test (void)
   if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
 __builtin_abort ();
 
-  init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
+  init = __extension__(v8si) { 1, 4, 16, 64, 256, 1024, 4096, 16384 };
   for (int i = 0; i != N / 8; i++)
 {
   memcpy (epi32_exp + i * 8, &init, 32);
-  init *= 6561;
+  init *= 65536;
 }
 
   foo_mul_const (epi32_dst);
diff --git a/gcc/testsuite/gcc.target/i386/pr111820-1.c 
b/gcc/testsuite/gcc.target/i386/pr111820-1.c
new file mode 100644
index 000..50e960c39d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111820-1.c
@@ -0,0 +

[PATCH] Avoid compile time hog on vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when iteration count is too big.

2023-10-19 Thread liuhongt
>So with pow being available this limit shouldn't be necessary any more and
>the testcase adjustment can be avoided?
I tried; compile time still blows up in mpz_powm (3, INT_MAX), so I'll just
keep this.

>and to avoid undefined behavior with too large shift just go the gmp
>way unconditionally.
Changed.

>this comment is now resolved I think.
Removed.

>mpz_pow_ui uses unsigned long while i think we constrain known niters
>to uint64 - so I suggest to use mpz_powm instead (limiting to a possibly
>host specific limit - unsigned long - is unfortunately a no-go).
Changed.
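
For reference, a standalone sketch of the gmp computation (hypothetical
names; the patch does the equivalent with wi::to_mpz/wi::from_mpz inside
vect_peel_nonlinear_iv_init):

#include <gmp.h>

/* Compute init * step^skipn modulo 2^prec without an O(skipn) loop.  */
unsigned long
peel_init (unsigned long init, unsigned long step,
	   unsigned long skipn, unsigned int prec)
{
  mpz_t base, exp, mod, res;
  mpz_init_set_ui (base, step);
  mpz_init_set_ui (exp, skipn);
  mpz_init (mod);
  mpz_init (res);
  mpz_ui_pow_ui (mod, 2, prec);		/* mod = 2^prec  */
  mpz_powm (res, base, exp, mod);	/* res = step^skipn mod 2^prec  */
  mpz_mul_ui (res, res, init);
  mpz_mod (res, res, mod);		/* res = init * step^skipn mod 2^prec  */
  unsigned long r = mpz_get_ui (res);
  mpz_clears (base, exp, mod, res, NULL);
  return r;
}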

There's a loop in vect_peel_nonlinear_iv_init to compute init_expr *
pow (step_expr, skip_niters). When skip_niters is too big, compile time
blows up. To avoid that, optimize init_expr * pow (step_expr, skip_niters) to
init_expr << (exact_log2 (step_expr) * skip_niters) when step_expr is a
power of 2; otherwise give up vectorization when skip_niters >=
TYPE_PRECISION (TREE_TYPE (init_expr)).

Also give up vectorization when niters_skip is negative, which will be
used for fully masked loops.

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/111820
PR tree-optimization/111833
* tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Give
up vectorization for nonlinear iv vect_step_op_mul when
step_expr is not exact_log2 and niters is greater than
TYPE_PRECISION (TREE_TYPE (step_expr)). Also don't vectorize
for negative niters_skip which will be used by fully masked
loop.
(vect_can_advance_ivs_p): Pass whole phi_info to
vect_can_peel_nonlinear_iv_p.
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Optimize
init_expr * pow (step_expr, skipn) to init_expr
<< (log2 (step_expr) * skipn) when step_expr is exact_log2.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111820-1.c: New test.
* gcc.target/i386/pr111820-2.c: New test.
* gcc.target/i386/pr111820-3.c: New test.
* gcc.target/i386/pr103144-mul-1.c: Adjust testcase.
* gcc.target/i386/pr103144-mul-2.c: Adjust testcase.
---
 .../gcc.target/i386/pr103144-mul-1.c  |  8 +++---
 .../gcc.target/i386/pr103144-mul-2.c  |  8 +++---
 gcc/testsuite/gcc.target/i386/pr111820-1.c| 16 +++
 gcc/testsuite/gcc.target/i386/pr111820-2.c| 16 +++
 gcc/testsuite/gcc.target/i386/pr111820-3.c| 16 +++
 gcc/tree-vect-loop-manip.cc   | 28 +--
 gcc/tree-vect-loop.cc | 13 ++---
 7 files changed, 90 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-3.c

diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
index 640c34fd959..913d7737dcd 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -11,7 +11,7 @@ foo_mul (int* a, int b)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -23,7 +23,7 @@ foo_mul_const (int* a)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -34,7 +34,7 @@ foo_mul_peel (int* a, int b)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -46,6 +46,6 @@ foo_mul_peel_const (int* a)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
index 39fdea3a69d..b2ff186e335 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
@@ -16,12 +16,12 @@ avx2_test (void)
 
   __builtin_memset (epi32_exp, 0, N * sizeof (int));
   int b = 8;
-  v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, 
b * 729, b * 2187 };
+  v8si init = __extension__(v8si) { b, b * 4, b * 16, b * 64, b * 256, b * 
1024, b * 4096, b * 16384 };
 
   for (int i = 0; i != N / 8; i++)
 {
   memcpy (epi32_exp + i * 8, &init, 32);
-  init *= 6561;
+  init *= 65536;
 }
 
   foo_mul (epi32_dst, b);
@@ -32,11 +32,11 @@ avx2_test (void)
   if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
 __builtin_abort ();
 
-  init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
+  init = __extension__(v8si) { 1, 4, 16, 64, 256, 1024, 4096, 16384 };
   for (int i = 0; i != N / 8; i++)
 {
   memcpy (epi32_exp + i * 8, &init, 32);
-  init *= 6561;
+  init *= 65536;
 }
 
   foo_mul_const (epi32_dst);
diff --git a/gcc/testsuite/gcc.target/i386/pr111820-1.c 
b/gcc/testsuite/gcc.target/i386/pr111820-1.c
new file mode 100644
index 000..50e960c39d4
--- /dev/null

[PATCH] [x86] Remove unused mmx_pinsrw.

2023-10-19 Thread liuhongt
While working on enabling more 32/64-bit vectorization for _Float16,
I noticed a redundant define_expand; this patch removes the expander.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/mmx.md (mmx_pinsrw): Removed.
---
 gcc/config/i386/mmx.md | 14 --
 1 file changed, 14 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 491a0a51272..d1e1f733566 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -4574,20 +4574,6 @@ (define_insn "*mmx_pinsrd"
(set_attr "prefix" "orig,vex")
(set_attr "mode" "TI")])
 
-(define_expand "mmx_pinsrw"
-  [(set (match_operand:V4HI 0 "register_operand")
-(vec_merge:V4HI
-  (vec_duplicate:V4HI
-(match_operand:SI 2 "nonimmediate_operand"))
- (match_operand:V4HI 1 "register_operand")
-  (match_operand:SI 3 "const_0_to_3_operand")))]
-  "(TARGET_MMX || TARGET_MMX_WITH_SSE)
-   && (TARGET_SSE || TARGET_3DNOW_A)"
-{
-  operands[2] = gen_lowpart (HImode, operands[2]);
-  operands[3] = GEN_INT (1 << INTVAL (operands[3]));
-})
-
 (define_insn "*mmx_pinsrw"
   [(set (match_operand:V4HI 0 "register_operand" "=y,x,YW")
 (vec_merge:V4HI
-- 
2.31.1



[PATCH] Support vec_cmpmn/vcondmn for v2hf/v4hf.

2023-10-23 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
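
A minimal example of the kind of code this covers (my sketch, not the new
test's literal contents): a 4 x _Float16 compare-and-select, where the
compare produces a QImode kmask and the select becomes a masked blend:

/* Hypothetical example; compile with -O2 -mavx512fp16 -mavx512vl.  */
typedef _Float16 v4hf __attribute__((vector_size (8)));

v4hf
vmax4 (v4hf a, v4hf b)
{
  return a > b ? a : b;	/* vec_cmp -> kmask, then a mask-driven blend  */
}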

gcc/ChangeLog:

PR target/103861
* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Handle
V2HF/V2BF/V4HF/V4BFmode.
* config/i386/mmx.md (vec_cmpv4hfqi): New expander.
(vcond<mode>v4hf): Ditto.
(vcond<mode>v4hi): Ditto.
(vcondu<mode>v4hi): Ditto.
(vcond_mask_<mode>v4hi): Ditto.
(vcond_mask_<mode>qi): Ditto.
(vec_cmpv2hfqi): Ditto.
(vcond<mode>v2hf): Ditto.
(vcond<mode>v2hi): Ditto.
(vcondu<mode>v2hi): Ditto.
(vcond_mask_<mode>v2hi): Ditto.
* config/i386/sse.md (vcond): Merge this with ..
(vcond): .. this into ..
(vcond): .. this,
and extend to V8BF/V16BF/V32BFmode.

gcc/testsuite/ChangeLog:

* g++.target/i386/part-vect-vcondhf.C: New test.
* gcc.target/i386/part-vect-vec_cmphf.c: New test.
---
 gcc/config/i386/i386-expand.cc|   4 +
 gcc/config/i386/mmx.md| 237 +-
 gcc/config/i386/sse.md|  25 +-
 .../g++.target/i386/part-vect-vcondhf.C   |  34 +++
 .../gcc.target/i386/part-vect-vec_cmphf.c |  26 ++
 5 files changed, 304 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/part-vect-vcondhf.C
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vec_cmphf.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1eae9d7c78c..9658f9c5a2d 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4198,6 +4198,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   break;
 case E_V8QImode:
 case E_V4HImode:
+case E_V4HFmode:
+case E_V4BFmode:
 case E_V2SImode:
   if (TARGET_SSE4_1)
{
@@ -4207,6 +4209,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   break;
 case E_V4QImode:
 case E_V2HImode:
+case E_V2HFmode:
+case E_V2BFmode:
   if (TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v4qi;
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 491a0a51272..b9617e9d8c6 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -61,6 +61,9 @@ (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
 (define_mode_iterator V_32 [V4QI V2HI V1SI V2HF V2BF])
 
 (define_mode_iterator V2FI_32 [V2HF V2BF V2HI])
+(define_mode_iterator V4FI_64 [V4HF V4BF V4HI])
+(define_mode_iterator V4F_64 [V4HF V4BF])
+(define_mode_iterator V2F_32 [V2HF V2BF])
 ;; 4-byte integer vector modes
 (define_mode_iterator VI_32 [V4QI V2HI])
 
@@ -1972,10 +1975,12 @@ (define_mode_attr mov_to_sse_suffix
   [(V2HF "d") (V4HF "q") (V2HI "d") (V4HI "q")])
 
 (define_mode_attr mmxxmmmode
-  [(V2HF "V8HF") (V2HI "V8HI") (V2BF "V8BF")])
+  [(V2HF "V8HF") (V2HI "V8HI") (V2BF "V8BF")
+   (V4HF "V8HF") (V4HI "V8HI") (V4BF "V8BF")])
 
 (define_mode_attr mmxxmmmodelower
-  [(V2HF "v8hf") (V2HI "v8hi") (V2BF "v8bf")])
+  [(V2HF "v8hf") (V2HI "v8hi") (V2BF "v8bf")
+   (V4HF "v8hf") (V4HI "v8hi") (V4BF "v8bf")])
 
 (define_expand "movd__to_sse"
   [(set (match_operand: 0 "register_operand")
@@ -2114,6 +2119,234 @@ (define_insn_and_split "*mmx_nabs<mode>2"
   [(set (match_dup 0)
	(ior:<MODE> (match_dup 1) (match_dup 2)))])
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel half-precision floating point comparisons
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "vec_cmpv4hfqi"
+  [(set (match_operand:QI 0 "register_operand")
+   (match_operator:QI 1 ""
+ [(match_operand:V4HF 2 "nonimmediate_operand")
+  (match_operand:V4HF 3 "nonimmediate_operand")]))]
+  "TARGET_MMX_WITH_SSE && TARGET_AVX512FP16 && TARGET_AVX512VL
+   && ix86_partial_vec_fp_math"
+{
+  rtx ops[4];
+  ops[3] = gen_reg_rtx (V8HFmode);
+  ops[2] = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (ops[3], operands[3]));
+  emit_insn (gen_movq_v4hf_to_sse (ops[2], operands[2]));
+  emit_insn (gen_vec_cmpv8hfqi (operands[0], operands[1], ops[2], ops[3]));
+  DONE;
+})
+
+(define_expand "vcondv4hf"
+  [(set (match_operand:V4FI_64 0 "register_operand")
+   (if_then_else:V4FI_64
+ (match_operator 3 ""
+   [(match_operand:V4HF 4 "nonimmediate_operand")
+(match_operand:V4HF 5 "nonimmediate_operand")])
+ (match_operand:V4FI_64 1 "general_operand")
+ (match_operand:V4FI_64 2 "general_operand")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL
+  && TARGET_MMX_WITH_SSE && ix86_partial_vec_fp_math"
+{
+  rtx ops[6];
+  ops[5] = gen_reg_rtx (V8HFmode);
+  ops[4] = gen_reg_rtx (V8HFmode);
+  ops[0] = gen_reg_rtx (<mmxxmmmode>mode);
+  ops[1] = lowpart_subreg (<mmxxmmmode>mode,
+			   force_reg (<MODE>mode, operands[1]),
+			   <MODE>mode);
+  ops[2] = lowpart_subreg (<mmxxmmmode>mode,
+			   force_reg (<MODE>mode, operands[2]),
+			   <MODE>mode);
+  ops

[PATCH GCC13 backport] Avoid compile time hog on vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when iteration count is too big.

2023-10-24 Thread liuhongt
This is the backport patch for the releases/gcc-13 branch; the original patch
for main trunk is at [1].
The only difference between this backport patch and [1] is that GCC 13 doesn't
support auto_mpz, so this patch manually uses mpz_init/mpz_clear.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633661.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for backport to releases/gcc-13?

There's a loop in vect_peel_nonlinear_iv_init to compute init_expr *
pow (step_expr, skip_niters). When skip_niters is too big, compile time
blows up. To avoid that, optimize init_expr * pow (step_expr, skip_niters) to
init_expr << (exact_log2 (step_expr) * skip_niters) when step_expr is a
power of 2; otherwise give up vectorization when skip_niters >=
TYPE_PRECISION (TREE_TYPE (init_expr)).

Also give up vectorization when niters_skip is negative, which will be
used for fully masked loops.

gcc/ChangeLog:

PR tree-optimization/111820
PR tree-optimization/111833
* tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Give
up vectorization for nonlinear iv vect_step_op_mul when
step_expr is not exact_log2 and niters is greater than
TYPE_PRECISION (TREE_TYPE (step_expr)). Also don't vectorize
for negative niters_skip which will be used by fully masked
loop.
(vect_can_advance_ivs_p): Pass whole phi_info to
vect_can_peel_nonlinear_iv_p.
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Optimize
init_expr * pow (step_expr, skipn) to init_expr
<< (log2 (step_expr) * skipn) when step_expr is exact_log2.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111820-1.c: New test.
* gcc.target/i386/pr111820-2.c: New test.
* gcc.target/i386/pr111820-3.c: New test.
* gcc.target/i386/pr103144-mul-1.c: Adjust testcase.
* gcc.target/i386/pr103144-mul-2.c: Adjust testcase.
---
 .../gcc.target/i386/pr103144-mul-1.c  |  8 +++---
 .../gcc.target/i386/pr103144-mul-2.c  |  8 +++---
 gcc/testsuite/gcc.target/i386/pr111820-1.c| 16 +++
 gcc/testsuite/gcc.target/i386/pr111820-2.c| 16 +++
 gcc/testsuite/gcc.target/i386/pr111820-3.c| 16 +++
 gcc/tree-vect-loop-manip.cc   | 28 +--
 gcc/tree-vect-loop.cc | 21 +++---
 7 files changed, 98 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-3.c

diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
index 640c34fd959..913d7737dcd 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -11,7 +11,7 @@ foo_mul (int* a, int b)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -23,7 +23,7 @@ foo_mul_const (int* a)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -34,7 +34,7 @@ foo_mul_peel (int* a, int b)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -46,6 +46,6 @@ foo_mul_peel_const (int* a)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
index 39fdea3a69d..b2ff186e335 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
@@ -16,12 +16,12 @@ avx2_test (void)
 
   __builtin_memset (epi32_exp, 0, N * sizeof (int));
   int b = 8;
-  v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, 
b * 729, b * 2187 };
+  v8si init = __extension__(v8si) { b, b * 4, b * 16, b * 64, b * 256, b * 
1024, b * 4096, b * 16384 };
 
   for (int i = 0; i != N / 8; i++)
 {
   memcpy (epi32_exp + i * 8, &init, 32);
-  init *= 6561;
+  init *= 65536;
 }
 
   foo_mul (epi32_dst, b);
@@ -32,11 +32,11 @@ avx2_test (void)
   if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
 __builtin_abort ();
 
-  init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
+  init = __extension__(v8si) { 1, 4, 16, 64, 256, 1024, 4096, 16384 };
   for (int i = 0; i != N / 8; i++)
 {
   memcpy (epi32_exp + i * 8, &init, 32);
-  init *= 6561;
+  init *= 65536;
 }
 
   foo_mul_const (epi32_dst);
diff --git a/gcc/testsuite/gcc.target/i386/pr111820-1.c 
b/gcc/testsuite/gcc.target/i386/pr111820-1.c
new file mode 100644
index 000..50e960c39d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111820-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fno-tree-vrp -Wno-aggressive-loop-optimizations 
-fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-du

[PATCH V2 1/2] Pass type of comparison operands instead of comparison result to truth_type_for in build_vec_cmp.

2023-10-25 Thread liuhongt
>I think it's indeed on purpose that the result of v1 < v2 is a signed
>integer vector type.
>But build_vec_cmp should not use the truth type for the result but instead the
>truth type for the comparison, so

Changed build_vec_cmp in both C and C++; note that the jit part already uses
the type of the comparison operands instead of the result.
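
A small example of the distinction (my illustration): in GNU C the result of
a vector comparison is a signed integer vector, but the comparison node
itself should be typed from the operands:

typedef float v4sf __attribute__((vector_size (16)));
typedef int v4si __attribute__((vector_size (16)));

v4si
cmp (v4sf a, v4sf b)
{
  /* The result type is v4si, but build_vec_cmp must build the comparison
     with truth_type_for (v4sf), the operand type, not with
     truth_type_for (v4si).  */
  return a < b;
}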

gcc/c/ChangeLog:

* c-typeck.cc (build_vec_cmp): Pass type of arg0 to
truth_type_for.

gcc/cp/ChangeLog:

* typeck.cc (build_vec_cmp): Pass type of arg0 to
truth_type_for.
---
 gcc/c/c-typeck.cc | 2 +-
 gcc/cp/typeck.cc  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/c/c-typeck.cc b/gcc/c/c-typeck.cc
index e55e887da14..41ee38368f2 100644
--- a/gcc/c/c-typeck.cc
+++ b/gcc/c/c-typeck.cc
@@ -11982,7 +11982,7 @@ build_vec_cmp (tree_code code, tree type,
 {
   tree zero_vec = build_zero_cst (type);
   tree minus_one_vec = build_minus_one_cst (type);
-  tree cmp_type = truth_type_for (type);
+  tree cmp_type = truth_type_for (TREE_TYPE (arg0));
   tree cmp = build2 (code, cmp_type, arg0, arg1);
   return build3 (VEC_COND_EXPR, type, cmp, minus_one_vec, zero_vec);
 }
diff --git a/gcc/cp/typeck.cc b/gcc/cp/typeck.cc
index 8132bd7fccc..7b2ad51bde7 100644
--- a/gcc/cp/typeck.cc
+++ b/gcc/cp/typeck.cc
@@ -4826,7 +4826,7 @@ build_vec_cmp (tree_code code, tree type,
 {
   tree zero_vec = build_zero_cst (type);
   tree minus_one_vec = build_minus_one_cst (type);
-  tree cmp_type = truth_type_for (type);
+  tree cmp_type = truth_type_for (TREE_TYPE (arg0));
   tree cmp = build2 (code, cmp_type, arg0, arg1);
   return build3 (VEC_COND_EXPR, type, cmp, minus_one_vec, zero_vec);
 }
-- 
2.31.1



[PATCH V2 2/2] Support vec_cmpmn/vcondmn for v2hf/v4hf.

2023-10-25 Thread liuhongt
>vcond and vcondeq shouldn't be necessary if there's
>vcond_mask and vcmp support which is the "modern"
>way of handling vcond.  Unless the ISA really can do
>compare and select with a single instruction.
The V2 patch removes vcond/vcondu from the initial version [1], but there are
many optimizations done in ix86_expand_int_vcond, so this patch adds many
extra combine splitters to get the same optimizations.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633946.html
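
Conceptually (a hedged sketch of the decomposition, not the patch's literal
output), a vector select now expands as an explicit compare producing a mask
followed by a mask-driven blend:

typedef _Float16 v4hf __attribute__((vector_size (8)));
typedef short v4hi __attribute__((vector_size (8)));

v4hi
sel (v4hf a, v4hf b, v4hi x, v4hi y)
{
  /* Expanded as mask = vec_cmp (a < b) -- a QImode kmask here --
     then vcond_mask (mask, x, y), instead of a single vcond.  */
  return a < b ? x : y;
}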

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/103861
* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Handle
V2HF/V2BF/V4HF/V4BFmode.
* config/i386/i386.cc (ix86_get_mask_mode): Return QImode when
data_mode is V4HF/V2HFmode.
* config/i386/mmx.md (vec_cmpv4hfqi): New expander.
(vcond_mask_<mode>v4hi): Ditto.
(vcond_mask_<mode>qi): Ditto.
(vec_cmpv2hfqi): Ditto.
(vcond_mask_<mode>v2hi): Ditto.
(mmx_pblendvb_<mode>): Add 2 combine splitters after the
patterns.
(mmx_pblendvb_v8qi): Ditto.
(<code>v2hi3): Add a combine splitter after the pattern.
(<code><mode>3): Ditto.
(<code>v8qi3): Ditto.
(<code><mode>3): Ditto.
* config/i386/sse.md (vcond): Merge this with ..
(vcond): .. this into ..
(vcond): .. this,
and extend to V8BF/V16BF/V32BFmode.

gcc/testsuite/ChangeLog:

* g++.target/i386/part-vect-vcondhf.C: New test.
* gcc.target/i386/part-vect-vec_cmphf.c: New test.
---
 gcc/config/i386/i386-expand.cc|   4 +
 gcc/config/i386/i386.cc   |   6 +-
 gcc/config/i386/mmx.md| 269 +-
 gcc/config/i386/sse.md|  25 +-
 .../g++.target/i386/part-vect-vcondhf.C   |  45 +++
 .../gcc.target/i386/part-vect-vec_cmphf.c |  26 ++
 6 files changed, 352 insertions(+), 23 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/part-vect-vcondhf.C
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vec_cmphf.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1eae9d7c78c..9658f9c5a2d 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4198,6 +4198,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   break;
 case E_V8QImode:
 case E_V4HImode:
+case E_V4HFmode:
+case E_V4BFmode:
 case E_V2SImode:
   if (TARGET_SSE4_1)
{
@@ -4207,6 +4209,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   break;
 case E_V4QImode:
 case E_V2HImode:
+case E_V2HFmode:
+case E_V2BFmode:
   if (TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v4qi;
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 641e7680335..c9c07beaeb1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24360,7 +24360,11 @@ ix86_get_mask_mode (machine_mode data_mode)
 
   /* Scalar mask case.  */
   if ((TARGET_AVX512F && TARGET_EVEX512 && vector_size == 64)
-  || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
+  || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16))
+  /* AVX512FP16 only supports vector comparison
+to kmask for _Float16.  */
+  || (TARGET_AVX512VL && TARGET_AVX512FP16
+ && GET_MODE_INNER (data_mode) == E_HFmode))
 {
   if (elem_size == 4
  || elem_size == 8
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 491a0a51272..78bb618f54c 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -61,6 +61,9 @@ (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
 (define_mode_iterator V_32 [V4QI V2HI V1SI V2HF V2BF])
 
 (define_mode_iterator V2FI_32 [V2HF V2BF V2HI])
+(define_mode_iterator V4FI_64 [V4HF V4BF V4HI])
+(define_mode_iterator V4F_64 [V4HF V4BF])
+(define_mode_iterator V2F_32 [V2HF V2BF])
 ;; 4-byte integer vector modes
 (define_mode_iterator VI_32 [V4QI V2HI])
 
@@ -1972,10 +1975,12 @@ (define_mode_attr mov_to_sse_suffix
   [(V2HF "d") (V4HF "q") (V2HI "d") (V4HI "q")])
 
 (define_mode_attr mmxxmmmode
-  [(V2HF "V8HF") (V2HI "V8HI") (V2BF "V8BF")])
+  [(V2HF "V8HF") (V2HI "V8HI") (V2BF "V8BF")
+   (V4HF "V8HF") (V4HI "V8HI") (V4BF "V8BF")])
 
 (define_mode_attr mmxxmmmodelower
-  [(V2HF "v8hf") (V2HI "v8hi") (V2BF "v8bf")])
+  [(V2HF "v8hf") (V2HI "v8hi") (V2BF "v8bf")
+   (V4HF "v8hf") (V4HI "v8hi") (V4BF "v8bf")])
 
 (define_expand "movd__to_sse"
   [(set (match_operand: 0 "register_operand")
@@ -2114,6 +2119,110 @@ (define_insn_and_split "*mmx_nabs<mode>2"
   [(set (match_dup 0)
	(ior:<MODE> (match_dup 1) (match_dup 2)))])
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel half-precision floating point comparisons
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "vec_cmpv4hfqi"
+  [(set (match_op

[PATCH] Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.

2023-10-26 Thread liuhongt
When the 2 vectors are equal, the kmask is all ones and kortest will set CF;
otherwise CF will be cleared.

So the CF bit can be used to check the result of the comparison.
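
For reference, a source-level example that exercises this path (my example;
memcmp whose result is only tested for equality is lowered to the
__memcmpeq-style sequence shown below):

/* Hypothetical example; compile with -O2 -mavx512f.  */
int
eq64 (const char *a, const char *b)
{
  return __builtin_memcmp (a, b, 64) == 0;
}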

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

Before:
vmovdqu (%rsi), %ymm0
vpxorq  (%rdi), %ymm0, %ymm0
vptest  %ymm0, %ymm0
jne .L2
vmovdqu 32(%rsi), %ymm0
vpxorq  32(%rdi), %ymm0, %ymm0
vptest  %ymm0, %ymm0
je  .L5
.L2:
movl    $1, %eax
xorl    $1, %eax
vzeroupper
ret

After:
vmovdqu64   (%rsi), %zmm0
xorl    %eax, %eax
vpcmpeqd    (%rdi), %zmm0, %k0
kortestw    %k0, %k0
setc    %al
vzeroupper
ret

gcc/ChangeLog:

PR target/104610
* config/i386/i386-expand.cc (ix86_expand_branch): Handle
512-bit vector with vpcmpeq + kortest.
* config/i386/i386.md (cbranchxi4): New expander.
* config/i386/sse.md (cbranch<mode>4): Extend to V16SImode
and V8DImode.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr104610-2.c: New test.
---
 gcc/config/i386/i386-expand.cc | 55 +++---
 gcc/config/i386/i386.md| 16 +++
 gcc/config/i386/sse.md | 36 +++---
 gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++
 4 files changed, 99 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1eae9d7c78c..c664cb61e80 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx 
op1, rtx label)
   rtx tmp;
 
   /* Handle special case - vector comparsion with boolean result, transform
- it using ptest instruction.  */
+ it using ptest instruction or vpcmpeq + kortest.  */
   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
   || (mode == TImode && !TARGET_64BIT)
-  || mode == OImode)
+  || mode == OImode
+  || GET_MODE_SIZE (mode) == 64)
 {
-  rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
-  machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+  unsigned msize = GET_MODE_SIZE (mode);
+  machine_mode p_mode
+   = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
+  /* kortest set CF when result is 0xFFFF (op0 == op1).  */
+  rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
 
   gcc_assert (code == EQ || code == NE);
 
-  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+  /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
+  if (msize == 64)
{
- op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
- op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
- mode = p_mode;
+ if (mode != V16SImode)
+   {
+ op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+ op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+   }
+
+ tmp = gen_reg_rtx (HImode);
+ emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
+ emit_insn (gen_kortesthi_ccc (tmp, tmp));
+   }
+  /* Using ptest for 128/256-bit vectors.  */
+  else
+   {
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+   {
+ op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+ op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+ mode = p_mode;
+   }
+
+ /* Generate XOR since we can't check that one operand is zero
+vector.  */
+ tmp = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
+ tmp = gen_lowpart (p_mode, tmp);
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+ gen_rtx_UNSPEC (CCZmode,
+ gen_rtvec (2, tmp, tmp),
+ UNSPEC_PTEST)));
}
-  /* Generate XOR since we can't check that one operand is zero vector.  */
-  tmp = gen_reg_rtx (mode);
-  emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
-  tmp = gen_lowpart (p_mode, tmp);
-  emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
- gen_rtx_UNSPEC (CCZmode,
- gen_rtvec (2, tmp, tmp),
- UNSPEC_PTEST)));
   tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
  gen_rtx_LABEL_REF (VOIDmode, label),
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index abaf2f311e8..51d8d0c3b97 100644
--- a/gcc/config/i386/i386.md
+++ b/gc

[PATCH] Fix wrong code due to incorrect define_split

2023-10-30 Thread liuhongt
-(define_split
-  [(set (match_operand:V2HI 0 "register_operand")
-(eq:V2HI
-  (eq:V2HI
-(us_minus:V2HI
-  (match_operand:V2HI 1 "register_operand")
-  (match_operand:V2HI 2 "register_operand"))
-(match_operand:V2HI 3 "const0_operand"))
-  (match_operand:V2HI 4 "const0_operand")))]
-  "TARGET_SSE4_1"
-  [(set (match_dup 0)
-(umin:V2HI (match_dup 1) (match_dup 2)))
-   (set (match_dup 0)
-(eq:V2HI (match_dup 0) (match_dup 2)))])

The splitter is wrong when op1 == op2 (the original pattern returns 0;
after the split, it returns 1, i.e. true).
So remove the splitter.
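
For illustration, the same algebra in full-width SSE2 intrinsics (a
sketch only; the removed splits acted on the partial V8QI/V4QI/V4HI/V2HI
modes, but the identity is the same):

#include <emmintrin.h>

/* The original pattern ((a -us b) == 0) == 0 computes "a >u b".  */
__m128i
orig (__m128i a, __m128i b)
{
  __m128i z = _mm_setzero_si128 ();
  return _mm_cmpeq_epi8 (_mm_cmpeq_epi8 (_mm_subs_epu8 (a, b), z), z);
}

/* The split computed umin (a, b) == b, i.e. "a >=u b"; for a == b
   the original is all-zeros but this is all-ones.  */
__m128i
wrong_split (__m128i a, __m128i b)
{
  return _mm_cmpeq_epi8 (_mm_min_epu8 (a, b), b);
}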

Also extend another define_split to define_insn_and_split to handle
below pattern

(set (reg:V4QI 112)
    (unspec:V4QI [
            (subreg:V4QI (reg:V2HF 111 [ bf ]) 0)
            (subreg:V4QI (reg:V2HF 110 [ af ]) 0)
            (subreg:V4QI (eq:V2HI (eq:V2HI (reg:V2HI 105)
                        (const_vector:V2HI [
                                (const_int 0 [0]) repeated x2
                            ]))
                    (const_vector:V2HI [
                            (const_int 0 [0]) repeated x2
                        ])) 0)
        ] UNSPEC_BLENDV))

define_split doesn't work since pass_combine assumes it produces at
most 2 insns after the split, but here it produces 3 since we need to
move const0_rtx (V2HImode) into a register.  The move insn can be
eliminated later.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/112276
* config/i386/mmx.md (*mmx_pblendvb_v8qi_1): Change
define_split to define_insn_and_split to handle
immediate_operand for comparison.
(*mmx_pblendvb_v8qi_2): Ditto.
(*mmx_pblendvb__1): Ditto.
(*mmx_pblendvb_v4qi_2): Ditto.
(3): Remove define_split after it.
(v8qi3): Ditto.
(3): Ditto.
(v2hi3): Ditto.

gcc/testsuite/ChangeLog:

* g++.target/i386/part-vect-vcondhf.C: Adjust testcase.
* gcc.target/i386/pr112276.c: New test.
---
 gcc/config/i386/mmx.md| 112 ++
 .../g++.target/i386/part-vect-vcondhf.C   |   1 -
 gcc/testsuite/gcc.target/i386/pr112276.c  |  36 ++
 3 files changed, 70 insertions(+), 79 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112276.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index e3d0fb5b107..2b97bb8fa98 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -3360,21 +3360,6 @@ (define_insn "3"
(set_attr "prefix" "orig,orig,vex")
(set_attr "mode" "TI")])
 
-(define_split
-  [(set (match_operand:V4HI 0 "register_operand")
-   (eq:V4HI
- (eq:V4HI
-   (us_minus:V4HI
- (match_operand:V4HI 1 "register_operand")
- (match_operand:V4HI 2 "register_operand"))
-   (match_operand:V4HI 3 "const0_operand"))
- (match_operand:V4HI 4 "const0_operand")))]
-  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
-  [(set (match_dup 0)
-   (umin:V4HI (match_dup 1) (match_dup 2)))
-   (set (match_dup 0)
-   (eq:V4HI (match_dup 0) (match_dup 2)))])
-
 (define_expand "mmx_v8qi3"
   [(set (match_operand:V8QI 0 "register_operand")
 (umaxmin:V8QI
@@ -3408,21 +3393,6 @@ (define_expand "v8qi3"
  (match_operand:V8QI 2 "register_operand")))]
   "TARGET_MMX_WITH_SSE")
 
-(define_split
-  [(set (match_operand:V8QI 0 "register_operand")
-   (eq:V8QI
- (eq:V8QI
-   (us_minus:V8QI
- (match_operand:V8QI 1 "register_operand")
- (match_operand:V8QI 2 "register_operand"))
-   (match_operand:V8QI 3 "const0_operand"))
- (match_operand:V8QI 4 "const0_operand")))]
-  "TARGET_MMX_WITH_SSE"
-  [(set (match_dup 0)
-   (umin:V8QI (match_dup 1) (match_dup 2)))
-   (set (match_dup 0)
-   (eq:V8QI (match_dup 0) (match_dup 2)))])
-
 (define_insn "3"
   [(set (match_operand:VI1_16_32 0 "register_operand" "=x,Yw")
 (umaxmin:VI1_16_32
@@ -3436,21 +3406,6 @@ (define_insn "3"
(set_attr "type" "sseiadd")
(set_attr "mode" "TI")])
 
-(define_split
-  [(set (match_operand:V4QI 0 "register_operand")
-   (eq:V4QI
- (eq:V4QI
-   (us_minus:V4QI
- (match_operand:V4QI 1 "register_operand")
- (match_operand:V4QI 2 "register_operand"))
-   (match_operand:V4QI 3 "const0_operand"))
- (match_operand:V4QI 4 "const0_operand")))]
-  "TARGET_SSE2"
-  [(set (match_dup 0)
-   (umin:V4QI (match_dup 1) (match_dup 2)))
-   (set (match_dup 0)
-   (eq:V4QI (match_dup 0) (match_dup 2)))])
-
 (define_insn "v2hi3"
   [(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,Yv")
(umaxmin:V2HI
@@ -3467,21 +3422,6 @@ (define_insn "v2hi3"
(set_attr "prefix" "orig,orig,vex")
(set_attr "mode" "TI")])
 
-(define_split
-  [(set (match_operand:V2HI 0 "register_operand")
-   (eq:V2HI
-  

[PATCH] Handle bitop with INTEGER_CST in analyze_and_compute_bitop_with_inv_effect.

2023-10-30 Thread liuhongt
analyze_and_compute_bitop_with_inv_effect assumes the first operand is
the loop-invariant one, but when the invariant is an INTEGER_CST it is
canonicalized to the second operand, so swap the operands in that case.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/105735
PR tree-optimization/111972
* tree-scalar-evolution.cc
(analyze_and_compute_bitop_with_inv_effect): Handle bitop with
INTEGER_CST.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr105735-3.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr105735-3.c | 87 ++
 gcc/tree-scalar-evolution.cc   |  3 +
 2 files changed, 90 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-3.c

diff --git a/gcc/testsuite/gcc.target/i386/pr105735-3.c 
b/gcc/testsuite/gcc.target/i386/pr105735-3.c
new file mode 100644
index 000..9e268a1a997
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105735-3.c
@@ -0,0 +1,87 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 8 "sccp" } } */
+
+unsigned int
+__attribute__((noipa))
+foo (unsigned int tmp)
+{
+  for (int bit = 0; bit < 64; bit++)
+tmp &= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo1 (unsigned int tmp)
+{
+  for (int bit = 63; bit >= 0; bit -=3)
+tmp &= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo2 (unsigned int tmp)
+{
+  for (int bit = 0; bit < 64; bit++)
+tmp |= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo3 (unsigned int tmp)
+{
+  for (int bit = 63; bit >= 0; bit -=3)
+tmp |= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo4 (unsigned int tmp)
+{
+  for (int bit = 0; bit < 64; bit++)
+tmp ^= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo5 (unsigned int tmp)
+{
+  for (int bit = 0; bit < 63; bit++)
+tmp ^= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+f (unsigned int tmp, int bit)
+{
+  unsigned int res = tmp;
+  for (int i = 0; i < bit; i++)
+res &= 11304;
+  return res;
+}
+
+unsigned int
+__attribute__((noipa))
+f1 (unsigned int tmp, int bit)
+{
+  unsigned int res = tmp;
+  for (int i = 0; i < bit; i++)
+res |= 11304;
+  return res;
+}
+
+unsigned int
+__attribute__((noipa))
+f2 (unsigned int tmp, int bit)
+{
+  unsigned int res = tmp;
+  for (int i = 0; i < bit; i++)
+res ^= 11304;
+  return res;
+}
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 95a15fe0988..33c7711f7e3 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3681,6 +3681,9 @@ analyze_and_compute_bitop_with_inv_effect (class loop* 
loop, tree phidef,
   match_op[0] = gimple_assign_rhs1 (def);
   match_op[1] = gimple_assign_rhs2 (def);
 
+  if (TREE_CODE (match_op[1]) == INTEGER_CST)
+std::swap (match_op[0], match_op[1]);
+
   if (TREE_CODE (match_op[1]) != SSA_NAME
   || !expr_invariant_in_loop_p (loop, match_op[0])
   || !(header_phi = dyn_cast  (SSA_NAME_DEF_STMT (match_op[1])))
-- 
2.31.1



[PATCH] Support cmul{_conj}v4hf3/cmla{_conj}v4hf4 with AVX512FP16 instruction.

2023-11-01 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ready push to trunk.

gcc/ChangeLog:

* config/i386/mmx.md (cmlav4hf4): New expander.
(cmla_conjv4hf4): Ditto.
(cmulv4hf3): Ditto.
(cmul_conjv4hf3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-complexhf.c: New test.
---
 gcc/config/i386/mmx.md| 86 +++
 .../gcc.target/i386/part-vect-complexhf.c | 40 +
 2 files changed, 126 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-complexhf.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 2b97bb8fa98..ba81ff72551 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2622,6 +2622,92 @@ (define_expand "vec_fmsubaddv4hf4"
   DONE;
 })
 
+
+;;
+;; Parallel half-precision floating point complex type operations
+;;
+
+
+(define_expand "cmlav4hf4"
+  [(match_operand:V4HF 0 "register_operand")
+   (match_operand:V4HF 1 "vector_operand")
+   (match_operand:V4HF 2 "vector_operand")
+   (match_operand:V4HF 3 "vector_operand")]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_cmlav8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "cmla_conjv4hf4"
+  [(match_operand:V4HF 0 "register_operand")
+   (match_operand:V4HF 1 "vector_operand")
+   (match_operand:V4HF 2 "vector_operand")
+   (match_operand:V4HF 3 "vector_operand")]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+  rtx op3 = gen_reg_rtx (V8HFmode);
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_cmla_conjv8hf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "cmulv4hf3"
+  [(match_operand:V4HF 0 "register_operand")
+   (match_operand:V4HF 1 "vector_operand")
+   (match_operand:V4HF 2 "vector_operand")]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_cmulv8hf3 (op0, op1, op2));
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "cmul_conjv4hf3"
+  [(match_operand:V4HF 0 "register_operand")
+   (match_operand:V4HF 1 "vector_operand")
+   (match_operand:V4HF 2 "vector_operand")]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_cmul_conjv8hf3 (op0, op1, op2));
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
 ;
 ;;
 ;; Parallel half-precision floating point conversion operations
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-complexhf.c 
b/gcc/testsuite/gcc.target/i386/part-vect-complexhf.c
new file mode 100644
index 000..b9f4ba2f4cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-complexhf.c
@@ -0,0 +1,40 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O1 -ftree-vectorize -ffast-math -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vfmaddcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-not "vfmadd\[123]*ph\[ \\t\]"} } */
+/* { dg-final { scan-assembler-not "vfmadd\[123]*sh\[ \\t\]"} } */
+/* { dg-final { scan-assembler-times "vfcmaddcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-times "vfmulcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-times "vfcmulcph\[ \\t\]" 1 } } */
+
+#include
+#define TYPE _Float16
+#define N 2
+
+void fma0 (_Complex TYPE *a, _Complex TYPE *b,
+   _Complex TYPE * __restrict c)
+{
+  for (int i = 0; i < N; i++)
+c[i] += a[i] * b[i];
+}
+
+void fmaconj (_Complex TYPE a[restrict N], _Complex TYPE b[restrict N],
+ _Complex TYPE c[restrict N])
+{
+  for (int i = 0; i < N; i++)
+c[i] += a[i] * ~b[i];
+}
+
+void fmul (_Complex TYPE a[restrict N], _Complex TYPE b[restr

[PATCH] Avoid generating RTL code when d->testing_p.

2023-11-06 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/112393
* config/i386/i386-expand.cc (ix86_expand_vec_perm_vpermt2):
Avoid generating RTL code when d->testing_p.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr112393.c: New test.
---
 gcc/config/i386/i386-expand.cc   |  3 +++
 gcc/testsuite/gcc.target/i386/pr112393.c | 20 
 2 files changed, 23 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112393.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 6ae5830037d..8fad73c1549 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -5245,6 +5245,9 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx 
op0, rtx op1,
   if (gen == NULL)
 return false;
 
+  if (d && d->testing_p)
+return true;
+
   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
  expander, so args are either in d, or in op0, op1 etc.  */
   if (d)
diff --git a/gcc/testsuite/gcc.target/i386/pr112393.c 
b/gcc/testsuite/gcc.target/i386/pr112393.c
new file mode 100644
index 000..c5c5b95b188
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112393.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -Wuninitialized" } */
+typedef _Float16 __attribute__((__vector_size__ (32))) V;
+
+V v;
+
+void
+foo (void)
+{
+  (void) __builtin_shufflevector (v, __builtin_shufflevector (v, (V){},
+  3, 0, 2, 2,
+ 5, 6, 3, 7, 5,
+ 6, 0, 8, 6, 4,
+ 3, 2, 8, 9, 5,
+ 8, 8, 7, 5, 4,
+ 8, 9, 1, 2, 4,
+ 9, 9, 7),
+  40, 33);
+}
+
-- 
2.31.1



[PATCH] sanitizer: [PR110027] Align asan_vec[0] to MAX (alignb, ASAN_RED_ZONE_SIZE)

2024-03-12 Thread liuhongt
If alignb > ASAN_RED_ZONE_SIZE and offset[0] is not a multiple of
alignb, (base_align_bias - base_offset) may not be aligned to alignb
(e.g. a 64-byte-aligned variable after only 32-byte frame alignment),
which caused a segmentation fault.

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk and backport to GCC13?

gcc/ChangeLog:

PR sanitizer/110027
* cfgexpand.cc (expand_stack_vars): Align frame offset to
MAX (alignb, ASAN_RED_ZONE_SIZE).

gcc/testsuite/ChangeLog:

* g++.dg/asan/pr110027.C: New test.
---
 gcc/cfgexpand.cc |  2 +-
 gcc/testsuite/g++.dg/asan/pr110027.C | 20 
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/asan/pr110027.C

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 0de299c62e3..92062378d8e 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -1214,7 +1214,7 @@ expand_stack_vars (bool (*pred) (size_t), class 
stack_vars_data *data)
{
  if (data->asan_vec.is_empty ())
{
- align_frame_offset (ASAN_RED_ZONE_SIZE);
+ align_frame_offset (MAX (alignb, ASAN_RED_ZONE_SIZE));
  prev_offset = frame_offset.to_constant ();
}
  prev_offset = align_base (prev_offset,
diff --git a/gcc/testsuite/g++.dg/asan/pr110027.C 
b/gcc/testsuite/g++.dg/asan/pr110027.C
new file mode 100644
index 000..0067781bc89
--- /dev/null
+++ b/gcc/testsuite/g++.dg/asan/pr110027.C
@@ -0,0 +1,20 @@
+/* PR sanitizer/110027 */
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f_runtime } */
+/* { dg-options "-std=gnu++23 -mavx512f -fsanitize=address -O0 -g 
-fstack-protector-strong" } */
+
+#include 
+#include 
+
+template 
+using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
+
+auto foo() {
+  Vec<8, int64_t> ret{};
+  return ret;
+}
+
+int main() {
+  foo();
+  return 0;
+}
-- 
2.31.1



[PATCH] i386[stv]: Handle REG_EH_REGION note

2024-03-13 Thread liuhongt
When we split
(insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
(mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct 
SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 
{*movdi_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfff5])

into

(insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
(vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 
MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
(const_int 0 [0]))) "test.C":22:42 -1
(nil)))
(insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
(subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfff5])
(nil)))

we must copy the REG_EH_REGION note to the first insn and split the block
after the newly added insn.  The REG_EH_REGION on the second insn will be
removed later since it no longer traps.

Currently we only handle memory_operand; are there any other insns
that need to be handled?
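
For reference, a sketch of the kind of source that hits this (assumed
shape, compiled with -m32 -O2 -fnon-call-exceptions; the real reproducer
is pr111822.C below):

struct S { long long ref; };

long long
f (S *p)
{
  /* On -m32 the DImode load may trap, so its insn carries a
     REG_EH_REGION note that must survive the STV split.  */
  try { return p->ref; }
  catch (...) { return 0; }
}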

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} for trunk and 
gcc-13/gcc-12 release branch.
Ok for trunk and backport?

gcc/ChangeLog:

* config/i386/i386-features.cc
(general_scalar_chain::convert_op): Handle REG_EH_REGION note.
(convert_scalars_to_vector): Ditto.
* config/i386/i386-features.h (class scalar_chain): New
member control_flow_insns.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr111822.C: New test.
---
 gcc/config/i386/i386-features.cc | 48 ++--
 gcc/config/i386/i386-features.h  |  1 +
 gcc/testsuite/g++.target/i386/pr111822.C | 45 ++
 3 files changed, 90 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr111822.C

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 1de2a07ed75..2ed27a9ebdd 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -998,20 +998,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 }
   else if (MEM_P (*op))
 {
+  rtx_insn* eh_insn, *movabs = NULL;
   rtx tmp = gen_reg_rtx (GET_MODE (*op));
 
   /* Handle movabs.  */
   if (!memory_operand (*op, GET_MODE (*op)))
{
  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
+ movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
 
- emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
  *op = tmp2;
}
 
-  emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
-gen_gpr_to_xmm_move_src (vmode, *op)),
-   insn);
+  eh_insn
+   = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
+gen_gpr_to_xmm_move_src (vmode, *op)),
+   insn);
+
+  if (cfun->can_throw_non_call_exceptions)
+   {
+ /* Handle REG_EH_REGION note.  */
+ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ if (note)
+   {
+ if (movabs)
+   eh_insn = movabs;
+ control_flow_insns.safe_push (eh_insn);
+ add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
+   }
+   }
+
   *op = gen_rtx_SUBREG (vmode, tmp, 0);
 
   if (dump_file)
@@ -2494,6 +2510,7 @@ convert_scalars_to_vector (bool timode_p)
 {
   basic_block bb;
   int converted_insns = 0;
+  auto_vec control_flow_insns;
 
   bitmap_obstack_initialize (NULL);
   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
@@ -2575,6 +2592,11 @@ convert_scalars_to_vector (bool timode_p)
 chain->chain_id);
}
 
+ rtx_insn* iter_insn;
+ unsigned int ii;
+ FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
+   control_flow_insns.safe_push (iter_insn);
+
  delete chain;
}
 }
@@ -2643,6 +2665,24 @@ convert_scalars_to_vector (bool timode_p)
  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
  }
  }
+
+  if (!control_flow_insns.is_empty ())
+   {
+ free_dominance_info (CDI_DOMINATORS);
+
+ unsigned int i;
+ rtx_insn* insn;
+ FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
+   if (control_flow_insn_p (insn))
+ {
+   /* Split the block after insn.  There will be a fallthru
+  edge, which is OK so we keep it.  We have to create
+  the exception edges ourselves.  */
+   bb = BLOCK_FOR_INSN (insn);
+   split_block (bb, insn);
+   rtl_make_eh_edge (NULL, bb, BB_END (bb));
+ }
+   }
 }
 
   return 0;
diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h
index 8bab2d8666d..b259cf679af 100644
--- a/gcc/config/i3

[PATCH] Add missing hf/bf patterns.

2024-03-17 Thread liuhongt
This fixes an ICE on an unrecognized logic-operation insn generated by
the lroundmn2 expanders.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/114334
* config/i386/i386.md (mode): Add new number V8BF,V16BF,V32BF.
(MODEF248): New mode iterator.
(ssevecmodesuffix): Handle BF and HF.
* config/i386/sse.md (andnot3): Extend to HF/BF.
(3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114334.c: New test.
---
 gcc/config/i386/i386.md  | 13 +
 gcc/config/i386/sse.md   | 22 +++---
 gcc/testsuite/gcc.target/i386/pr114334.c |  8 
 3 files changed, 28 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114334.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index df97a2d6270..11fdc6af3fa 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -543,8 +543,9 @@ (define_attr "type"
 
 ;; Main data type used by the insn
 (define_attr "mode"
-  "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,BF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
-   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V4BF,V2HF,V2BF"
+  "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,BF,SF,DF,XF,TF,
+   V32HF,V16HF,V8HF,V4HF,V2HF,V32BF,V16BF,V8BF,V4BF,V2BF,
+   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
   (const_string "unknown"))
 
 ;; The CPU unit operations uses.
@@ -1323,6 +1324,8 @@ (define_mode_attr ashl_input_operand
 ;; SSE and x87 SFmode and DFmode floating point modes
 (define_mode_iterator MODEF [SF DF])
 
+(define_mode_iterator MODEF248 [BF HF SF (DF "TARGET_SSE2")])
+
 ;; SSE floating point modes
 (define_mode_iterator MODEFH [(HF "TARGET_AVX512FP16") SF DF])
 
@@ -1347,7 +1350,8 @@ (define_mode_attr ssemodesuffix
(V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q")])
 
 ;; SSE vector suffix for floating point modes
-(define_mode_attr ssevecmodesuffix [(SF "ps") (DF "pd")])
+;; BF HF use same suffix as SF for logic operations.
+(define_mode_attr ssevecmodesuffix [(BF "ps") (HF "ps") (SF "ps") (DF "pd")])
 
 ;; SSE vector mode corresponding to a scalar mode
 (define_mode_attr ssevecmode
@@ -1357,7 +1361,8 @@ (define_mode_attr ssevecmodelower
 
 ;; AVX512F vector mode corresponding to a scalar mode
 (define_mode_attr avx512fvecmode
-  [(QI "V64QI") (HI "V32HI") (SI "V16SI") (DI "V8DI") (SF "V16SF") (DF 
"V8DF")])
+  [(QI "V64QI") (HI "V32HI") (SI "V16SI") (DI "V8DI")
+   (HF "V32HF") (BF "V32BF") (SF "V16SF") (DF "V8DF")])
 
 ;; Instruction suffix for REX 64bit operators.
 (define_mode_attr rex64suffix [(SI "{l}") (DI "{q}")])
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1bc614ab702..3286d3a4fac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5125,12 +5125,12 @@ (define_expand "signbit2"
 ;; because the native instructions read the full 128-bits.
 
 (define_insn "*andnot3"
-  [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v")
-   (and:MODEF
- (not:MODEF
-   (match_operand:MODEF 1 "register_operand" "0,x,v,v"))
-   (match_operand:MODEF 2 "register_operand" "x,x,v,v")))]
-  "SSE_FLOAT_MODE_P (mode)"
+  [(set (match_operand:MODEF248 0 "register_operand" "=x,x,v,v")
+   (and:MODEF248
+ (not:MODEF248
+   (match_operand:MODEF248 1 "register_operand" "0,x,v,v"))
+   (match_operand:MODEF248 2 "register_operand" "x,x,v,v")))]
+  "TARGET_SSE"
 {
   char buf[128];
   const char *ops;
@@ -5257,11 +5257,11 @@ (define_insn "*andnot3"
  (const_string "TI")))])
 
 (define_insn "3"
-  [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v")
-   (any_logic:MODEF
- (match_operand:MODEF 1 "register_operand" "%0,x,v,v")
- (match_operand:MODEF 2 "register_operand" "x,x,v,v")))]
-  "SSE_FLOAT_MODE_P (mode)"
+  [(set (match_operand:MODEF248 0 "register_operand" "=x,x,v,v")
+   (any_logic:MODEF248
+ (match_operand:MODEF248 1 "register_operand" "%0,x,v,v")
+ (match_operand:MODEF248 2 "register_operand" "x,x,v,v")))]
+  "TARGET_SSE"
 {
   char buf[128];
   const char *ops;
diff --git a/gcc/testsuite/gcc.target/i386/pr114334.c 
b/gcc/testsuite/gcc.target/i386/pr114334.c
new file mode 100644
index 000..8e38e24cd16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114334.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512fp16" } */
+
+long
+foo(_Float16 f)
+{
+  return __builtin_lroundf16(f);
+}
-- 
2.31.1



[PATCH] i386 [stv]: Handle REG_EH_REGION note [pr111822].

2024-03-18 Thread liuhongt
Commit r14-9459-g618e34d56cc38e only handles
general_scalar_chain::convert_op.  This patch also handles
timode_scalar_chain::convert_op to avoid a potential similar bug.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport to releases/gcc-13 branch?

gcc/ChangeLog:

PR target/111822
* config/i386/i386-features.cc
(timode_scalar_chain::convert_op): Handle REG_EH_REGION note.
---
 gcc/config/i386/i386-features.cc | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c7d7a965901..38f57d96df5 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1794,12 +1794,26 @@ timode_scalar_chain::convert_op (rtx *op, rtx_insn 
*insn)
 *op = gen_rtx_SUBREG (V1TImode, *op, 0);
   else if (MEM_P (*op))
 {
+  rtx_insn* eh_insn;
   rtx tmp = gen_reg_rtx (V1TImode);
-  emit_insn_before (gen_rtx_SET (tmp,
-gen_gpr_to_xmm_move_src (V1TImode, *op)),
-   insn);
+  eh_insn
+   = emit_insn_before (gen_rtx_SET (tmp,
+gen_gpr_to_xmm_move_src (V1TImode,
+ *op)),
+   insn);
   *op = tmp;
 
+  if (cfun->can_throw_non_call_exceptions)
+   {
+ /* Handle REG_EH_REGION note.  */
+ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ if (note)
+   {
+ control_flow_insns.safe_push (eh_insn);
+ add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
+   }
+   }
+
   if (dump_file)
fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
 INSN_UID (insn), REGNO (tmp));
-- 
2.31.1



[PATCH] Document -fexcess-precision=16.

2024-03-18 Thread liuhongt
Ok for trunk?

gcc/ChangeLog:

* doc/invoke.texi: Document -fexcess-precision=16.
---
 gcc/doc/invoke.texi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 85c938d4a14..673420fdd3e 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14930,6 +14930,9 @@ assignments).  This option is enabled by default for C 
or C++ if a strict
 conformance option such as @option{-std=c99} or @option{-std=c++17} is used.
 @option{-ffast-math} enables @option{-fexcess-precision=fast} by default
 regardless of whether a strict conformance option is used.
+If @option{-fexcess-precision=16} is specified, casts and assignments of
+@code{_Float16} and @code{bfloat16_t} cause values to be rounded to their
+semantic types if they're supported by the target.
 
 @opindex mfpmath
 @option{-fexcess-precision=standard} is not implemented for languages
-- 
2.31.1



[PATCH V2] Document -fexcess-precision=16.

2024-03-19 Thread liuhongt
gcc/ChangeLog:

* doc/invoke.texi: Document -fexcess-precision=16.
---
 gcc/doc/invoke.texi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 85c938d4a14..6bc1ebf9721 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14930,6 +14930,9 @@ assignments).  This option is enabled by default for C 
or C++ if a strict
 conformance option such as @option{-std=c99} or @option{-std=c++17} is used.
 @option{-ffast-math} enables @option{-fexcess-precision=fast} by default
 regardless of whether a strict conformance option is used.
+If @option{-fexcess-precision=16} is specified, constants and the
+results of expressions with types @code{_Float16} and @code{__bf16}
+are computed without excess precision.
 
 @opindex mfpmath
 @option{-fexcess-precision=standard} is not implemented for languages
-- 
2.31.1



[PATCH] Fix runtime error for nonlinear iv vectorization(step_mult).

2024-03-21 Thread liuhongt
wi::from_mpz doesn't take a sign argument; we want the value wrapped
instead of saturated, so pass utype and true to it, which fixes the
bug.
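
For reference, the peeled initial value is init * step^skipn computed
modulo 2^precision; a C sketch of the intended wrapping arithmetic
(illustration only, not the GCC API):

unsigned short
peeled_init (unsigned short init, unsigned short step, unsigned int skipn)
{
  unsigned int mult = 1;
  while (skipn--)
    mult = (mult * step) & 0xffff;	/* wrap modulo 2^16 */
  /* The final multiply must also wrap, like mpz_powm + wi::from_mpz
     with wrapping enabled.  */
  return (unsigned short) (init * mult);
}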

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport to gcc13?

gcc/ChangeLog:

PR tree-optimization/114396
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Pass utype
and true to wi::from_mpz.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114396.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr114396.c | 105 +++
 gcc/tree-vect-loop.cc|   2 +-
 2 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114396.c

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.target/i386/pr114396.c
new file mode 100644
index 000..4c4015f871f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114396.c
@@ -0,0 +1,105 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+
+short a = 0xF;
+short b[16];
+unsigned short ua = 0xF;
+unsigned short ub[16];
+
+short
+__attribute__((noipa))
+foo (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa))
+foo1 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou1 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo1_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou1_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+int main() {
+  unsigned short uexp, ures;
+  short exp, res;
+  exp = foo (a);
+  res = foo_o3 (a);
+  if (exp != res)
+__builtin_abort ();
+
+  exp = foo1 (a);
+  res = foo1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou (a);
+  ures = foou_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou1 (a);
+  ures = foou1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 4375ebdcb49..2921a9e6aa1 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9454,7 +9454,7 @@ vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree 
init_expr,
wi::to_mpz (skipn, exp, UNSIGNED);
mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
mpz_powm (res, base, exp, mod);
-   begin = wi::from_mpz (type, res, TYPE_SIGN (type));
+   begin = wi::from_mpz (utype, res, true);
tree mult_expr = wide_int_to_tree (utype, begin);
init_expr = gimple_build (stmts, MULT_EXPR, utype,
  init_expr, mult_expr);
-- 
2.31.1



[PATCH] Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

2024-03-21 Thread liuhongt
Also fixed a typo in the testcase.

Commit as an obvious fix.

gcc/testsuite/ChangeLog:

PR tree-optimization/114396
* gcc.target/i386/pr114396.c: Move to...
* gcc.c-torture/execute/pr114396.c: ...here.
---
 .../{gcc.target/i386 => gcc.c-torture/execute}/pr114396.c   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename gcc/testsuite/{gcc.target/i386 => gcc.c-torture/execute}/pr114396.c 
(92%)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
similarity index 92%
rename from gcc/testsuite/gcc.target/i386/pr114396.c
rename to gcc/testsuite/gcc.c-torture/execute/pr114396.c
index 4c4015f871f..baf90eafabf 100644
--- a/gcc/testsuite/gcc.target/i386/pr114396.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
@@ -1,5 +1,5 @@
-/* { dg-do run } */
-/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+/* PR tree-optimization/114396 */
+/* { dg-additional-options "-fwrapv -fno-vect-cost-model" } */
 
 short a = 0xF;
 short b[16];
@@ -88,7 +88,7 @@ int main() {
 
   exp = foo1 (a);
   res = foo1_o3 (a);
-  if (uexp != ures)
+  if (exp != res)
 __builtin_abort ();
 
   uexp = foou (a);
-- 
2.31.1



[PATCH V2] sanitizer: [PR110027] Align asan_vec[0] to MAX (BIGGEST_ALIGNMENT / BITS_PER_UNIT, ASAN_RED_ZONE_SIZE)

2024-03-25 Thread liuhongt
> > So, try to add some other variable with larger size and smaller alignment
> > to the frame (and make sure it isn't optimized away).
> >
> > alignb above is the alignment of the first partition's var, if
> > align_frame_offset really needs to depend on the var alignment, it probably
> > should be the maximum alignment of all the vars with alignment
> > alignb * BITS_PER_UNIT <= MAX_SUPPORTED_STACK_ALIGNMENT
> >

In asan_emit_stack_protection, when it allocates the fake stack, it
assumes the bottom of the stack is also aligned to alignb.  The place
that violates this is the first var partition, which uses a 32-byte
offset; it should be BIGGEST_ALIGNMENT / BITS_PER_UNIT.
So I think we need to use MAX (BIGGEST_ALIGNMENT /
BITS_PER_UNIT, ASAN_RED_ZONE_SIZE) for the first var partition.

Ok?

gcc/ChangeLog:

PR sanitizer/110027
* cfgexpand.cc (expand_stack_vars): Align frame offset to
MAX (BIGGEST_ALIGNMENT / BITS_PER_UNIT, ASAN_RED_ZONE_SIZE).

gcc/testsuite/ChangeLog:

* g++.dg/asan/pr110027.C: New test.
---
 gcc/cfgexpand.cc |  3 ++-
 gcc/testsuite/g++.dg/asan/pr110027.C | 20 
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/asan/pr110027.C

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 0de299c62e3..70cc89b29f4 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -1214,7 +1214,8 @@ expand_stack_vars (bool (*pred) (size_t), class 
stack_vars_data *data)
{
  if (data->asan_vec.is_empty ())
{
- align_frame_offset (ASAN_RED_ZONE_SIZE);
+ align_frame_offset (MAX (BIGGEST_ALIGNMENT / BITS_PER_UNIT,
+  ASAN_RED_ZONE_SIZE));
  prev_offset = frame_offset.to_constant ();
}
  prev_offset = align_base (prev_offset,
diff --git a/gcc/testsuite/g++.dg/asan/pr110027.C 
b/gcc/testsuite/g++.dg/asan/pr110027.C
new file mode 100644
index 000..0067781bc89
--- /dev/null
+++ b/gcc/testsuite/g++.dg/asan/pr110027.C
@@ -0,0 +1,20 @@
+/* PR sanitizer/110027 */
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f_runtime } */
+/* { dg-options "-std=gnu++23 -mavx512f -fsanitize=address -O0 -g 
-fstack-protector-strong" } */
+
+#include 
+#include 
+
+template 
+using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
+
+auto foo() {
+  Vec<8, int64_t> ret{};
+  return ret;
+}
+
+int main() {
+  foo();
+  return 0;
+}
-- 
2.31.1



[PATCH wwwdoc] Hardware-assisted AddressSanitizer now works for x86_64 with LAM_U57

2024-02-08 Thread liuhongt
---
 htdocs/gcc-14/changes.html | 5 +
 1 file changed, 5 insertions(+)

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index 6d917535..a022357a 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -499,6 +499,11 @@ a work-in-progress.
 -march=knm, -mtune=knl or -mtune=knm
 compiler switches. Support will be removed in GCC 15.
   
+  <li><a href="https://gcc.gnu.org/gcc-11/changes.html">Hardware-assisted
+    AddressSanitizer</a> now works for the x86-64 target with LAM_U57.
+    -fsanitize=hwaddress will enable -mlam=u57
+    by default.
+  </li>
 
 
 
-- 
2.31.1



[PATCH] Fix testcase for platform without gnu/stubs-x32.h

2024-02-18 Thread liuhongt
target maybe_x32 doesn't check whether the platform has
gnu/stubs-x32.h, but it's included by stdint.h in the testcase.
Adjust the testcase: remove stdint.h and use 'typedef long long
int64_t' instead.

Commit as an obvious patch.

gcc/testsuite/ChangeLog:

PR target/113711
* gcc.target/i386/apx-ndd-x32-1.c: Adjust testcase.
---
 gcc/testsuite/gcc.target/i386/apx-ndd-x32-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/apx-ndd-x32-1.c 
b/gcc/testsuite/gcc.target/i386/apx-ndd-x32-1.c
index 4280d400458..74197dbc641 100644
--- a/gcc/testsuite/gcc.target/i386/apx-ndd-x32-1.c
+++ b/gcc/testsuite/gcc.target/i386/apx-ndd-x32-1.c
@@ -3,7 +3,7 @@
 /* { dg-require-effective-target maybe_x32 } */
 /* { dg-options "-mapxf -O2 -mx32" } */
 
-#include 
+typedef long long int64_t;
 
 #define FOO(TYPE, OP_NAME, OP, IMM)\
 TYPE   \
-- 
2.31.1



[PATCH] Update documents for fcf-protection=

2024-01-09 Thread liuhongt
After r14-2692-g1c6231c05bdcca, the option is defined as an EnumSet and
-fcf-protection=branch won't unset any other bits since they're in
different groups.  So to override -fcf-protection, an explicit
-fcf-protection=none needs to be given first, followed by
-fcf-protection=XXX.
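
For example (hypothetical command line): the second option resets the
protection, and the third then enables only branch protection:

  gcc -O2 -fcf-protection=full -fcf-protection=none \
      -fcf-protection=branch foo.c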

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* doc/invoke.texi (fcf-protection=): Update documents.
---
 gcc/doc/invoke.texi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 68d1f364ac0..d1e6fafb98c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -17734,6 +17734,9 @@ function.  The value @code{full} is an alias for 
specifying both
 @code{branch} and @code{return}. The value @code{none} turns off
 instrumentation.
 
+To override @option{-fcf-protection}, @option{-fcf-protection=none}
+needs to be given explicitly first, followed by @option{-fcf-protection=xxx}.
+
 The value @code{check} is used for the final link with link-time
 optimization (LTO).  An error is issued if LTO object files are
 compiled with different @option{-fcf-protection} values.  The
-- 
2.31.1



[PATCH] Document refactoring of the option -fcf-protection=x.

2024-01-09 Thread liuhongt
To override -fcf-protection, -fcf-protection=none needs to be given
first, followed by -fcf-protection=xxx.
---
 htdocs/gcc-14/changes.html | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index e3a68998..72b0d291 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -40,6 +40,12 @@ a work-in-progress.
   https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html#index-Wflex-array-member-not-at-end";>-Wflex-array-member-not-at-end
 to
   identify all such cases in the source code and modify them.
   
+  <li><a href="https://gcc.gnu.org/onlinedocs/gcc/Instrumentation-Options.html">-fcf-protection=[full|branch|return|none|check]</a>
+  is refactored; to override -fcf-protection,
+  -fcf-protection=none needs to be given first, followed
+  by -fcf-protection=xxx.
+  </li>
+
 
 
 
-- 
2.31.1



[PATCH] Fix testcase failure on many platforms which don't support vect_int_max.

2024-01-18 Thread liuhongt
After r14-7124-g6686e16fda4190, the testcase can be optimized to
MAX_EXPR if the backend supports that.  So I adjusted the testcase to
scan for MAX_EXPR, but it failed on many platforms which don't support
it.
As pinski mentioned, the target selector vect_no_int_min_max is only
available under the vect directory, so for simplicity, adjust the
testcase to scan for either MAX_EXPR or the original VEC_COND_EXPR.

Commit as an obvious fix.

gcc/testsuite/ChangeLog:

PR testsuite/113437
* gcc.dg/tree-ssa/pr95906.c: Scan either MAX_EXPR or
VEC_COND_EXPR.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr95906.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
index d15670f3e9e..ce43983f341 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
@@ -9,4 +9,4 @@ v16i8 f(v16i8 a, v16i8 b)
 }
 
 /* { dg-final { scan-tree-dump-not "bit_(and|ior)_expr" "forwprop3" } } */
-/* { dg-final { scan-tree-dump-times "max_expr" 1 "forwprop3" } } */
+/* { dg-final { scan-tree-dump-times {(?n)(?:max_expr|vec_cond_expr)} 1 
"forwprop3" } } */
-- 
2.31.1



[PATCH] Adjust testcase gcc.target/i386/part-vect-copysignhf.c.

2024-01-18 Thread liuhongt
After vect_early_break was supported, more vectorization is enabled
(3 COPYSIGNs), so adjust the testcase for that.

Commit as obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-copysignhf.c: Remove
-ftree-vectorize from dg-options.
---
 gcc/testsuite/gcc.target/i386/part-vect-copysignhf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/part-vect-copysignhf.c 
b/gcc/testsuite/gcc.target/i386/part-vect-copysignhf.c
index 811617bc3dd..0fdcbaea363 100644
--- a/gcc/testsuite/gcc.target/i386/part-vect-copysignhf.c
+++ b/gcc/testsuite/gcc.target/i386/part-vect-copysignhf.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target avx512fp16 } } */
-/* { dg-options "-O1 -mavx512fp16 -mavx512vl -ftree-vectorize 
-fdump-tree-slp-details -fdump-tree-optimized" } */
+/* { dg-options "-O1 -mavx512fp16 -mavx512vl -fdump-tree-slp-details 
-fdump-tree-optimized" } */
 
 extern void abort ();
 
-- 
2.31.1



[PATCH 2/2] [x86] Enable -mlam=u57 by default when compiled with -fsanitize=hwaddress.

2024-01-22 Thread liuhongt
Ready push to trunk.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
Enable -mlam=u57 by default when compiled with
-fsanitize=hwaddress.
---
 gcc/config/i386/i386-options.cc | 9 +
 1 file changed, 9 insertions(+)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index b6f634e9a32..e66a58ed926 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2189,6 +2189,15 @@ ix86_option_override_internal (bool main_args_p,
   && opts->x_ix86_abi != DEFAULT_ABI)
 error ("%<-mabi=%s%> not supported with %<-fsanitize=thread%>", abi_name);
 
+  /* Hwasan is supported with lam_u57 only.  */
+  if (opts->x_flag_sanitize & SANITIZE_HWADDRESS)
+{
+  if (ix86_lam_type == lam_u48)
+   warning (0, "%<-mlam=u48%> is not compatible with Hardware-assisted "
+"AddressSanitizer, override to %<-mlam=u57%>");
+  ix86_lam_type = lam_u57;
+}
+
   /* For targets using ms ABI enable ms-extensions, if not
  explicit turned off.  For non-ms ABI we turn off this
  option.  */
-- 
2.31.1



[PATCH 1/2] Adjust hwasan testcase for x86 target.

2024-01-22 Thread liuhongt
There're 2 cases:
1. hwasan-poison-optimisation.c is supposed to scan for the call to
__hwasan_tag_mismatch4, and x86 has a different mnemonic (call) from
aarch64 (bl), so adjust the testcase to scan for either call or bl.

2. alloca-outside-caught.c/vararray-outside-caught.c are supposed to
scan for mismatched tags and expect the tag corresponding to
out-of-bounds memory to be 00, but for x86 the contiguous stack is
allocated to another local variable/array which is assigned a
different tag, so there are still mismatches.  Adjust the testcases to
scan for XX/XX instead of XX/00.

Ok for trunk?

gcc/testsuite/ChangeLog:

* c-c++-common/hwasan/alloca-outside-caught.c: Adjust
testcase.
* c-c++-common/hwasan/hwasan-poison-optimisation.c: Ditto.
* c-c++-common/hwasan/vararray-outside-caught.c: Ditto.
---
 gcc/testsuite/c-c++-common/hwasan/alloca-outside-caught.c  | 2 +-
 gcc/testsuite/c-c++-common/hwasan/hwasan-poison-optimisation.c | 2 +-
 gcc/testsuite/c-c++-common/hwasan/vararray-outside-caught.c| 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/c-c++-common/hwasan/alloca-outside-caught.c 
b/gcc/testsuite/c-c++-common/hwasan/alloca-outside-caught.c
index 6f3825bee7c..f31484a2613 100644
--- a/gcc/testsuite/c-c++-common/hwasan/alloca-outside-caught.c
+++ b/gcc/testsuite/c-c++-common/hwasan/alloca-outside-caught.c
@@ -20,6 +20,6 @@ main ()
 }
 
 /* { dg-output "HWAddressSanitizer: tag-mismatch on address 0x\[0-9a-f\]*.*" } 
*/
-/* { dg-output "READ of size 4 at 0x\[0-9a-f\]* tags: 
\[\[:xdigit:\]\]\[\[:xdigit:\]\]/00.* \\(ptr/mem\\) in thread T0.*" } */
+/* { dg-output "READ of size 4 at 0x\[0-9a-f\]* tags: 
\[\[:xdigit:\]\]\[\[:xdigit:\]\]/\[\[:xdigit:\]\]\[\[:xdigit:\]\].* 
\\(ptr/mem\\) in thread T0.*" } */
 /* { dg-output "Address 0x\[0-9a-f\]* is located in stack of thread T0.*" } */
 /* { dg-output "SUMMARY: HWAddressSanitizer: tag-mismatch \[^\n\]*.*" } */
diff --git a/gcc/testsuite/c-c++-common/hwasan/hwasan-poison-optimisation.c 
b/gcc/testsuite/c-c++-common/hwasan/hwasan-poison-optimisation.c
index 2d6bab4c578..48cf88744eb 100644
--- a/gcc/testsuite/c-c++-common/hwasan/hwasan-poison-optimisation.c
+++ b/gcc/testsuite/c-c++-common/hwasan/hwasan-poison-optimisation.c
@@ -22,7 +22,7 @@ main ()
 }
 
 /* { dg-final { scan-tree-dump-times "ASAN_POISON" 1 "asan1" }  } */
-/* { dg-final { scan-assembler-times "bl\\s*__hwasan_tag_mismatch4" 1 } } */
+/* { dg-final { scan-assembler-times "(?:bl|call)\\s*__hwasan_tag_mismatch4" 1 
} } */
 /* { dg-output "HWAddressSanitizer: tag-mismatch on address 0x\[0-9a-f\]*.*" } 
*/
 /* { dg-output "READ of size 4 at 0x\[0-9a-f\]* tags: 
\[\[:xdigit:\]\]\[\[:xdigit:\]\]/00 \\(ptr/mem\\) in thread T0.*" } */
 /* { dg-output "Address 0x\[0-9a-f\]* is located in stack of thread T0.*" } */
diff --git a/gcc/testsuite/c-c++-common/hwasan/vararray-outside-caught.c 
b/gcc/testsuite/c-c++-common/hwasan/vararray-outside-caught.c
index 35a344def42..743a894ede9 100644
--- a/gcc/testsuite/c-c++-common/hwasan/vararray-outside-caught.c
+++ b/gcc/testsuite/c-c++-common/hwasan/vararray-outside-caught.c
@@ -17,6 +17,6 @@ main ()
 }
 
 /* { dg-output "HWAddressSanitizer: tag-mismatch on address 0x\[0-9a-f\]*.*" } 
*/
-/* { dg-output "READ of size 4 at 0x\[0-9a-f\]* tags: 
\[\[:xdigit:\]\]\[\[:xdigit:\]\]/00 \\(ptr/mem\\) in thread T0.*" } */
+/* { dg-output "READ of size 4 at 0x\[0-9a-f\]* tags: 
\[\[:xdigit:\]\]\[\[:xdigit:\]\]/\[\[:xdigit:\]\]\[\[:xdigit:\]\].*\\(ptr/mem\\)
 in thread T0.*" } */
 /* { dg-output "Address 0x\[0-9a-f\]* is located in stack of thread T0.*" } */
 /* { dg-output "SUMMARY: HWAddressSanitizer: tag-mismatch \[^\n\]*.*" } */
-- 
2.31.1



[PATCH] Optimize A < B ? A : B to MIN_EXPR.

2023-12-18 Thread liuhongt
Similarly for A < B ? B : A to MAX_EXPR.
There is code in the frontend to optimize such patterns, but it failed
to handle the testcase in the PR since the pattern is only exposed at
the GIMPLE level when folding backend builtins.

pr95906 can now be optimized to MAX_EXPR, as the comment in the
testcase anticipates.

// FIXME: this should further optimize to a MAX_EXPR
 typedef signed char v16i8 __attribute__((vector_size(16)));
 v16i8 f(v16i8 a, v16i8 b)


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk? (or maybe wait for GCC 15).

gcc/ChangeLog:

PR target/104401
* match.pd (A < B ? A : B -> MIN_EXPR): New pattern match.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr104401.c: New test.
* gcc.dg/tree-ssa/pr95906.c: Adjust testcase.
---
 gcc/match.pd | 20 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr95906.c  |  3 +--
 gcc/testsuite/gcc.target/i386/pr104401.c | 27 
 3 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104401.c

diff --git a/gcc/match.pd b/gcc/match.pd
index d57e29bfe1d..9584a70aa3d 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5263,6 +5263,26 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (view_convert:type
(vec_cond @4 (view_convert:vtype @2) (view_convert:vtype @3)))
 
+/* Optimize A < B ? A : B to MIN (A, B)
+   A > B ? A : B to MAX (A, B).  */
+(for cmp (lt le gt ge)
+ minmax (min min max max)
+ MINMAX (MIN_EXPR MIN_EXPR MAX_EXPR MAX_EXPR)
+ (simplify
+  (vec_cond (cmp @0 @1) @0 @1)
+   (if (VECTOR_INTEGER_TYPE_P (type)
+   && target_supports_op_p (type, MINMAX, optab_vector))
+(minmax @0 @1
+
+(for cmp (lt le gt ge)
+ minmax (max max min min)
+ MINMAX (MAX_EXPR MAX_EXPR MIN_EXPR MIN_EXPR)
+ (simplify
+  (vec_cond (cmp @0 @1) @1 @0)
+   (if (VECTOR_INTEGER_TYPE_P (type)
+   && target_supports_op_p (type, MINMAX, optab_vector))
+(minmax @0 @1
+
 /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
 (simplify
  (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
index 3d820a58e93..d15670f3e9e 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
@@ -1,7 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -fdump-tree-forwprop3-raw -w -Wno-psabi" } */
 
-// FIXME: this should further optimize to a MAX_EXPR
 typedef signed char v16i8 __attribute__((vector_size(16)));
 v16i8 f(v16i8 a, v16i8 b)
 {
@@ -10,4 +9,4 @@ v16i8 f(v16i8 a, v16i8 b)
 }
 
 /* { dg-final { scan-tree-dump-not "bit_(and|ior)_expr" "forwprop3" } } */
-/* { dg-final { scan-tree-dump-times "vec_cond_expr" 1 "forwprop3" } } */
+/* { dg-final { scan-tree-dump-times "max_expr" 1 "forwprop3" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr104401.c 
b/gcc/testsuite/gcc.target/i386/pr104401.c
new file mode 100644
index 000..8ce7ff88d9e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104401.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler-times "pminsd" 2 } } */
+/* { dg-final { scan-assembler-times "pmaxsd" 2 } } */
+
+#include 
+
+__m128i min32(__m128i value, __m128i input)
+{
+  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
+}
+
+__m128i max32(__m128i value, __m128i input)
+{
+  return _mm_blendv_epi8(input, value, _mm_cmpgt_epi32(value, input));
+}
+
+__m128i min32_1(__m128i value, __m128i input)
+{
+  return _mm_blendv_epi8(input, value, _mm_cmpgt_epi32(input, value));
+}
+
+__m128i max32_1(__m128i value, __m128i input)
+{
+  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(input, value));
+}
+
-- 
2.31.1



[PATCH] Optimize A < B ? A : B to MIN_EXPR.

2024-01-09 Thread liuhongt
> I wonder if you can amend the existing patterns instead by iterating
> over cond/vec_cond.  There are quite some (look for uses of
> minmax_from_comparison) that could be adapted to vectors.
>
> The ones matching the simple form you match are
>
> #if GIMPLE
> /* A >= B ? A : B -> max (A, B) and friends.  The code is still
>    in fold_cond_expr_with_comparison for GENERIC folding with
>    some extra constraints.  */
> (for cmp (eq ne le lt unle unlt ge gt unge ungt uneq ltgt)
>  (simplify
>   (cond (cmp:c (nop_convert1?@c0 @0) (nop_convert2?@c1 @1))
>         (convert3? @0) (convert4? @1))
>   (if (!HONOR_SIGNED_ZEROS (type)
> ...
This pattern is a conditional operation that treats a vector as a
complete unit; it's more like cbranchm, which is different from
vec_cond_expr.  So I added my patterns after this one.
>
> I think.  Consider at least placing the new patterns next to that.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

Similarly for A < B ? B : A to MAX_EXPR.
There is code in the frontend to optimize such patterns, but it failed
to handle the testcase in the PR since the pattern is only exposed at
the GIMPLE level when folding backend builtins.

pr95906 can now be optimized to MAX_EXPR, as the comment in the
testcase anticipates.

// FIXME: this should further optimize to a MAX_EXPR
 typedef signed char v16i8 __attribute__((vector_size(16)));
 v16i8 f(v16i8 a, v16i8 b)

gcc/ChangeLog:

PR target/104401
* match.pd (VEC_COND_EXPR: A < B ? A : B -> MIN_EXPR): New pattern match.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr104401.c: New test.
* gcc.dg/tree-ssa/pr95906.c: Adjust testcase.
---
 gcc/match.pd | 21 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr95906.c  |  3 +--
 gcc/testsuite/gcc.target/i386/pr104401.c | 27 
 3 files changed, 49 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104401.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 7b4b15acc41..d8e2009a83f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5672,6 +5672,27 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (if (VECTOR_TYPE_P (type))
(view_convert @c0)
(convert @c0
+
+/* This is for VEC_COND_EXPR
+   Optimize A < B ? A : B to MIN (A, B)
+   A > B ? A : B to MAX (A, B).  */
+(for cmp (lt le ungt unge gt ge unlt unle)
+ minmax (min min min min max max max max)
+ MINMAX (MIN_EXPR MIN_EXPR MIN_EXPR MIN_EXPR MAX_EXPR MAX_EXPR MAX_EXPR 
MAX_EXPR)
+ (simplify
+  (vec_cond (cmp @0 @1) @0 @1)
+   (if (VECTOR_INTEGER_TYPE_P (type)
+   && target_supports_op_p (type, MINMAX, optab_vector))
+(minmax @0 @1
+
+(for cmp (lt le ungt unge gt ge unlt unle)
+ minmax (max max max max min min min min)
+ MINMAX (MAX_EXPR MAX_EXPR MAX_EXPR MAX_EXPR MIN_EXPR MIN_EXPR MIN_EXPR 
MIN_EXPR)
+ (simplify
+  (vec_cond (cmp @0 @1) @1 @0)
+   (if (VECTOR_INTEGER_TYPE_P (type)
+   && target_supports_op_p (type, MINMAX, optab_vector))
+(minmax @0 @1
 #endif
 
 (for cnd (cond vec_cond)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
index 3d820a58e93..d15670f3e9e 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
@@ -1,7 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -fdump-tree-forwprop3-raw -w -Wno-psabi" } */
 
-// FIXME: this should further optimize to a MAX_EXPR
 typedef signed char v16i8 __attribute__((vector_size(16)));
 v16i8 f(v16i8 a, v16i8 b)
 {
@@ -10,4 +9,4 @@ v16i8 f(v16i8 a, v16i8 b)
 }
 
 /* { dg-final { scan-tree-dump-not "bit_(and|ior)_expr" "forwprop3" } } */
-/* { dg-final { scan-tree-dump-times "vec_cond_expr" 1 "forwprop3" } } */
+/* { dg-final { scan-tree-dump-times "max_expr" 1 "forwprop3" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr104401.c 
b/gcc/testsuite/gcc.target/i386/pr104401.c
new file mode 100644
index 000..8ce7ff88d9e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104401.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler-times "pminsd" 2 } } */
+/* { dg-final { scan-assembler-times "pmaxsd" 2 } } */
+
+#include 
+
+__m128i min32(__m128i value, __m128i input)
+{
+  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
+}
+
+__m128i max32(__m128i value, __m128i input)
+{
+  return _mm_blendv_epi8(input, value, _mm_cmpgt_epi32(value, input));
+}
+
+__m128i min32_1(__m128i value, __m128i input)
+{
+  return _mm_blendv_epi8(input, value, _mm_cmpgt_epi32(input, value));
+}
+
+__m128i max32_1(__m128i value, __m128i input)
+{
+  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(input, value));
+}
+
-- 
2.31.1



[PATCH] Take register pressure into account for vec_construct/scalar_to_vec when the components are not loaded from memory.

2023-11-30 Thread liuhongt
> Hmm, I would suggest you put reg_needed into the class and accumulate
> over all vec_construct, with your patch you pessimize a single v32qi
> over two separate v16qi for example.  Also currently the whole block is
> gated with INTEGRAL_TYPE_P but register pressure would be also
> a concern for floating point vectors.  finish_cost would then apply an
> adjustment.

Changed.

> 'target_avail_regs' is for GENERAL_REGS, does that include APX regs?
> I don't see anything similar for FP regs, but I guess the target should know
> or maybe there's a #regs in regclass query already.
Haven't seen any; I use the setting below.

unsigned target_avail_sse = TARGET_64BIT ? (TARGET_AVX512F ? 32 : 16) : 8;

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big impact on SPEC2017.
Observed one big improvement in another benchmark from avoiding
vectorization with vec_construct v32qi, which caused lots of spills.

Ok for trunk?

For vec_construct, the components must be live at the same time if
they're not loaded from memory; when the number of those components
exceeds the available registers, spills happen. Try to account for
that with a rough estimate, as sketched below.
??? Ideally, we should have an overall estimate of register pressure,
if we knew the live ranges of all variables.
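
A minimal sketch of the intended adjustment; the function name and the
exact formula here are illustrative only, not the committed code:

/* Charge extra cost only when the number of simultaneously-live
   components exceeds the available hard registers; each excess
   register is assumed to cause roughly one spill/reload pair.  */
static unsigned
estimate_reg_pressure_cost (unsigned nregs_needed, unsigned nregs_avail,
                            unsigned spill_cost)
{
  if (nregs_needed <= nregs_avail)
    return 0;
  return (nregs_needed - nregs_avail) * spill_cost;
}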

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Count sse_reg/gpr_regs for components not loaded from memory.
(ix86_vector_costs:ix86_vector_costs): New constructor.
(ix86_vector_costs::m_num_gpr_needed[3]): New private member.
(ix86_vector_costs::m_num_sse_needed[3]): Ditto.
(ix86_vector_costs::finish_cost): Estimate overall register
pressure cost.
(ix86_vector_costs::ix86_vect_estimate_reg_pressure): New
function.
---
 gcc/config/i386/i386.cc | 54 ++---
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9390f525b99..dcaea6c2096 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24562,15 +24562,34 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, 
struct noce_if_info *if_info)
 /* x86-specific vector costs.  */
 class ix86_vector_costs : public vector_costs
 {
-  using vector_costs::vector_costs;
+public:
+  ix86_vector_costs (vec_info *, bool);
 
   unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
  stmt_vec_info stmt_info, slp_tree node,
  tree vectype, int misalign,
  vect_cost_model_location where) override;
   void finish_cost (const vector_costs *) override;
+
+private:
+
+  /* Estimate register pressure of the vectorized code.  */
+  void ix86_vect_estimate_reg_pressure ();
+  /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
+ estimation of register pressure.
+ ??? Currently it's only used by vec_construct/scalar_to_vec
+ where we know it's not loaded from memory.  */
+  unsigned m_num_gpr_needed[3];
+  unsigned m_num_sse_needed[3];
 };
 
+ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
+  : vector_costs (vinfo, costing_for_scalar),
+m_num_gpr_needed (),
+m_num_sse_needed ()
+{
+}
+
 /* Implement targetm.vectorize.create_costs.  */
 
 static vector_costs *
@@ -24748,8 +24767,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
 }
   else if ((kind == vec_construct || kind == scalar_to_vec)
   && node
-  && SLP_TREE_DEF_TYPE (node) == vect_external_def
-  && INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
+  && SLP_TREE_DEF_TYPE (node) == vect_external_def)
 {
   stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
   unsigned i;
@@ -24785,7 +24803,15 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  && (gimple_assign_rhs_code (def) != BIT_FIELD_REF
  || !VECTOR_TYPE_P (TREE_TYPE
(TREE_OPERAND (gimple_assign_rhs1 (def), 0))
-   stmt_cost += ix86_cost->sse_to_integer;
+   {
+ if (fp)
+   m_num_sse_needed[where]++;
+ else
+   {
+ m_num_gpr_needed[where]++;
+ stmt_cost += ix86_cost->sse_to_integer;
+   }
+   }
}
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
if (TREE_CODE (op) == SSA_NAME)
@@ -24821,6 +24847,24 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   return retval;
 }
 
+void
+ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
+{
+  unsigned gpr_spill_cost = COSTS_N_INSNS (ix86_cost->int_store [2]) / 2;
+  unsigned sse_spill_cost = COSTS_N_INSNS (ix86_cost->sse_store[0]) / 2;
+
+  /* Any better way to get the number of available FP registers?
+     Currently use SSE_REGS.  */
+  unsigned target_avail_sse = TARGET_64BIT ? (T

[PATCH] Don't vectorize when vector stmts are only vec_contruct and stores

2023-12-03 Thread liuhongt
I.e. for cases like the following:
   a[0] = b1;
   a[1] = b2;
   ..
   a[n] = bn;

There are extra dependences when constructing the vector that the
scalar stores don't have. According to experiments, it's generally
worse.

The patch adds a cut-off heuristic for when the vector statements are
just vec_construct and vector stores (see the sketch after the
benchmark table). It improves SPEC2017 a little bit.

BenchMarks  Ratio
500.perlbench_r 2.60%
502.gcc_r   0.30%
505.mcf_r   0.40%
520.omnetpp_r   -1.00%
523.xalancbmk_r 0.90%
525.x264_r  0.00%
531.deepsjeng_r 0.30%
541.leela_r 0.90%
548.exchange2_r 3.20%
557.xz_r1.40%
503.bwaves_r0.00%
507.cactuBSSN_r 0.00%
508.namd_r  0.30%
510.parest_r0.00%
511.povray_r0.20%
519.lbm_r   SAME BIN
521.wrf_r   -0.30%
526.blender_r   -1.20%
527.cam4_r  -0.20%
538.imagick_r   4.00%
544.nab_r   0.40%
549.fotonik3d_r 0.00%
554.roms_r  0.00%
Geomean-int 0.90%
Geomean-fp  0.30%
Geomean-all 0.50%
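
In shape, the cut-off amounts to something like the sketch below; the
name and counters are hypothetical, the real check lives in
finish_cost:

/* Give up on vectorization when the only vector statements produced
   are vec_constructs feeding vector stores, since the equivalent
   scalar stores carry no construction dependence.  */
static int
vect_construct_store_only_p (unsigned n_vec_construct,
                             unsigned n_vec_store,
                             unsigned n_other_vec_stmts)
{
  return n_other_vec_stmts == 0 && n_vec_construct > 0 && n_vec_store > 0;
}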

And
Regressed testcases:

gcc.target/i386/part-vect-absneghf.c
gcc.target/i386/part-vect-copysignhf.c
gcc.target/i386/part-vect-xorsignhf.c

These regressed under -m32 since it now generates 2 vector
.ABS/NEG/XORSIGN/COPYSIGN vs the original 1 64-bit vec_construct. The
original testcases are meant to test the vectorization capability for
.ABS/NEG/XORSIGN/COPYSIGN, so just restrict the testcases to TARGET_64BIT.

gcc.target/i386/pr111023-2.c
gcc.target/i386/pr111023.c
Regressed under -m32

The testcase is as below:

void
v8hi_v8qi (v8hi *dst, v16qi src)
{
  short tem[8];
  tem[0] = src[0];
  tem[1] = src[1];
  tem[2] = src[2];
  tem[3] = src[3];
  tem[4] = src[4];
  tem[5] = src[5];
  tem[6] = src[6];
  tem[7] = src[7];
  dst[0] = *(v8hi *) tem;
}

Under a 64-bit target, the vectorizer realizes it's just a permutation
of the original src vector, but under -m32 the vectorizer relies on
vec_construct for vectorization. I think the missed optimization for
this case under a 32-bit target may not matter much, so just add
-fno-vect-cost-model.

gcc.target/i386/pr91446.c: This testcase is guard for cost model of
vector store, not vectorization capability, so just adjust testcase.

gcc.target/i386/pr108938-3.c: This testcase relies on vec_construct to
optimize for bswap; as with the other cases, the vectorizer can't
recover the optimization after the cut-off. So the current solution is
to add -fno-vect-cost-model to the testcase.

costmodel-pr104582-1.c
costmodel-pr104582-2.c
costmodel-pr104582-4.c

These failed since they're now not vectorized; looking at the PR, that
is exactly what's wanted, so adjust the testcases to scan-tree-dump-not.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/99881
PR target/104582
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Check if kind is vec_construct or vector store.
(ix86_vector_costs::finish_cost): Don't do vectorization when
vector stmts are only vec_construct and stores.
(ix86_vector_costs::ix86_vect_construct_store_only_p): New
function.
(ix86_vector_costs::ix86_vect_cut_off): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-absneghf.c: Restrict testcase to
TARGET_64BIT.
* gcc.target/i386/part-vect-copysignhf.c: Ditto.
* gcc.target/i386/part-vect-xorsignhf.c: Ditto.
* gcc.target/i386/pr91446.c: Adjust testcase.
* gcc.target/i386/pr108938-3.c: Add -fno-vect-cost-model.
* gcc.target/i386/pr111023-2.c: Ditto.
* gcc.target/i386/pr111023.c: Ditto.
* gcc.target/i386/pr99881.c: Remove xfail.
* gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c: Changed
to scan-tree-dump-not.
* gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c: Ditto.
* gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c: Ditto.
---
 gcc/config/i386/i386.cc   | 81 ++-
 .../costmodel/x86_64/costmodel-pr104582-1.c   |  2 +-
 .../costmodel/x86_64/costmodel-pr104582-3.c   |  2 +-
 .../costmodel/x86_64/costmodel-pr104582-4.c   |  2 +-
 .../gcc.target/i386/part-vect-absneghf.c  |  4 +-
 .../gcc.target/i386/part-vect-copysignhf.c|  4 +-
 .../gcc.target/i386/part-vect-xorsignhf.c |  4 +-
 gcc/testsuite/gcc.target/i386/pr108938-3.c|  2 +-
 gcc/testsuite/gcc.target/i386/pr111023-2.c|  2 +-
 gcc/testsuite/gcc.target/i386/pr111023.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pr91446.c   | 14 ++--
 gcc/testsuite/gcc.target/i386/pr99881.c   |  2 +-
 12 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index dcaea6c2096..a4b23e29eba 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24573,6 +24573,10 @@ public:
 
 private:
 
+  /* Don't do vectorization for certain patterns.  */
+  void ix86_vect_cut_off ();
+

[PATCH] Support udot_prodv*qi with emulation sdot_prodv*hi

2023-12-03 Thread liuhongt
Like r14-5990-gb4a7c1c8c59d19, but this patch optimizes udot_prod.

Since zero_extend (unsigned char -> int) is equal to
zero_extend (unsigned char -> short)
followed by sign_extend (short -> int).

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

It should be safe to emulate udot_prodv*qi with

 vec_unpacku_lo_v32qi
 vec_unpacku_lo_v32qi
 vec_unpacku_hi_v32qi
 vec_unpacku_hi_v32qi
 sdot_prodv16hi
 sdot_prodv16hi
 add3v8si
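
As a sanity check, the whole sequence computes the same result as this
scalar reference of mine (illustrative, not part of the patch):

/* Scalar reference for udot_prod on unsigned bytes: each 32-bit lane
   accumulates four adjacent byte products.  After zero-extending the
   bytes to 16 bits the inputs are in 0..255, so the signed 16-bit
   multiply-add of sdot_prod (vpmaddwd) cannot overflow.  */
void
udot_prod_ref (int *acc, const unsigned char *a, const unsigned char *b,
               int nlanes)
{
  for (int i = 0; i < nlanes; i++)
    for (int j = 0; j < 4; j++)
      acc[i] += (int) a[4 * i + j] * (int) b[4 * i + j];
}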

gcc/ChangeLog:

* config/i386/sse.md (udot_prodv64qi): New expander.
(udot_prod): Emulates with VEC_UNPACKU_EXPR +
DOT_PROD (short, int).

gcc/testsuite/ChangeLog:

* gcc.target/i386/udotprodint8_emulate.c: New test.
---
 gcc/config/i386/sse.md| 82 ---
 .../gcc.target/i386/udotprodint8_emulate.c| 15 
 2 files changed, 85 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a1d4fec42a2..3244cef483a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30835,20 +30835,78 @@ (define_expand "sdot_prodv64qi"
 
 (define_expand "udot_prod"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI1 1 "register_operand")
-   (match_operand:VI1 2 "register_operand")
+   (match_operand:VI1_AVX2 1 "register_operand")
+   (match_operand:VI1_AVX2 2 "register_operand")
(match_operand: 3 "register_operand")]
-  "TARGET_AVXVNNIINT8"
+  "TARGET_SSE2"
 {
-  operands[1] = lowpart_subreg (mode,
-force_reg (mode, operands[1]),
-mode);
-  operands[2] = lowpart_subreg (mode,
-force_reg (mode, operands[2]),
-mode);
-  emit_insn (gen_rtx_SET (operands[0], operands[3]));
-  emit_insn (gen_vpdpbuud_ (operands[0], operands[3],
-  operands[1], operands[2]));
+  if (TARGET_AVXVNNIINT8)
+{
+  operands[1] = lowpart_subreg (mode,
+   force_reg (mode, operands[1]),
+   mode);
+  operands[2] = lowpart_subreg (mode,
+   force_reg (mode, operands[2]),
+   mode);
+  emit_insn (gen_rtx_SET (operands[0], operands[3]));
+  emit_insn (gen_vpdpbuud_ (operands[0], operands[3],
+ operands[1], operands[2]));
+   }
+ else
+   {
+ /* Emulate with vpdpwssd.  */
+ rtx op1_lo = gen_reg_rtx (mode);
+ rtx op1_hi = gen_reg_rtx (mode);
+ rtx op2_lo = gen_reg_rtx (mode);
+ rtx op2_hi = gen_reg_rtx (mode);
+
+ emit_insn (gen_vec_unpacku_lo_ (op1_lo, operands[1]));
+ emit_insn (gen_vec_unpacku_lo_ (op2_lo, operands[2]));
+ emit_insn (gen_vec_unpacku_hi_ (op1_hi, operands[1]));
+ emit_insn (gen_vec_unpacku_hi_ (op2_hi, operands[2]));
+
+ rtx res1 = gen_reg_rtx (mode);
+ rtx res2 = gen_reg_rtx (mode);
+ rtx sum = gen_reg_rtx (mode);
+
+ emit_move_insn (sum, CONST0_RTX (mode));
+ emit_insn (gen_sdot_prod (res1, op1_lo,
+   op2_lo, sum));
+ emit_insn (gen_sdot_prod (res2, op1_hi,
+   op2_hi, operands[3]));
+ emit_insn (gen_add3 (operands[0], res1, res2));
+   }
+
+  DONE;
+})
+
+(define_expand "udot_prodv64qi"
+  [(match_operand:V16SI 0 "register_operand")
+   (match_operand:V64QI 1 "register_operand")
+   (match_operand:V64QI 2 "register_operand")
+   (match_operand:V16SI 3 "register_operand")]
+  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
+{
+  /* Emulate with vpdpwssd.  */
+  rtx op1_lo = gen_reg_rtx (V32HImode);
+  rtx op1_hi = gen_reg_rtx (V32HImode);
+  rtx op2_lo = gen_reg_rtx (V32HImode);
+  rtx op2_hi = gen_reg_rtx (V32HImode);
+
+  emit_insn (gen_vec_unpacku_lo_v64qi (op1_lo, operands[1]));
+  emit_insn (gen_vec_unpacku_lo_v64qi (op2_lo, operands[2]));
+  emit_insn (gen_vec_unpacku_hi_v64qi (op1_hi, operands[1]));
+  emit_insn (gen_vec_unpacku_hi_v64qi (op2_hi, operands[2]));
+
+  rtx res1 = gen_reg_rtx (V16SImode);
+  rtx res2 = gen_reg_rtx (V16SImode);
+  rtx sum = gen_reg_rtx (V16SImode);
+
+  emit_move_insn (sum, CONST0_RTX (V16SImode));
+  emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum));
+  emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3]));
+
+  emit_insn (gen_addv16si3 (operands[0], res1, res2));
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c 
b/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c
new file mode 100644
index 000..1e8f2cfe521
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mavxvnni -O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "DOT_PROD_EXPR" 1 "

[PATCH] Don't assume it's AVX_U128_CLEAN after call_insn whose abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS.

2023-12-07 Thread liuhongt
If the function doesn't clobber any SSE registers, or clobbers only
their 128-bit parts, then no vzeroupper is issued before the function
exits; the status after the call is not CLEAN but ANY.

Also, for a sibling call it's safe to issue a vzeroupper, and there
could otherwise be a missing vzeroupper since there's no mode_exit for
sibling_call_p.

Compared to the patch in the PR, this patch adds the sibling_call part.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport?

gcc/ChangeLog:

PR target/112891
* config/i386/i386.cc (ix86_avx_u128_mode_after): Return
AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
align with ix86_avx_u128_mode_needed.
(ix86_avx_u128_mode_needed): Return AVX_U128_CLEAN for
sibling_call.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr112891.c: New test.
* gcc.target/i386/pr112891-2.c: New test.
---
 gcc/config/i386/i386.cc| 22 +---
 gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++
 gcc/testsuite/gcc.target/i386/pr112891.c   | 29 +
 3 files changed, 78 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 7c5cab4e2c6..fe259cdb789 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15038,8 +15038,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
 vzeroupper if all SSE registers are clobbered.  */
   const function_abi &abi = insn_callee_abi (insn);
   if (vzeroupper_pattern (PATTERN (insn), VOIDmode)
- || !hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
-abi.mode_clobbers (V4DImode)))
+ /* Should be safe to issue a vzeroupper before sibling_call_p.
+    Also there is no mode_exit for sibling_call, so there could be
+    a missing vzeroupper for that.  */
+ || !(SIBLING_CALL_P (insn)
+  || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+abi.mode_clobbers (V4DImode
return AVX_U128_ANY;
 
   return AVX_U128_CLEAN;
@@ -15177,7 +15181,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
   bool avx_upper_reg_found = false;
   note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
 
-  return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
+  if (avx_upper_reg_found)
+   return AVX_U128_DIRTY;
+
+  /* If the function doesn't clobber any sse registers or only clobbers
+     the 128-bit part, then vzeroupper isn't issued before the function
+     exit; the status is not CLEAN but ANY after the function.  */
+  const function_abi &abi = insn_callee_abi (insn);
+  if (!(SIBLING_CALL_P (insn)
+   || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+ abi.mode_clobbers (V4DImode
+   return AVX_U128_ANY;
+
+   return AVX_U128_CLEAN;
 }
 
   /* Otherwise, return current mode.  Remember that if insn
diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c 
b/gcc/testsuite/gcc.target/i386/pr112891-2.c
new file mode 100644
index 000..164c3985d50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3" } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
+
+void
+__attribute__((noinline))
+bar (double* a)
+{
+  a[0] = 1.0;
+  a[1] = 2.0;
+}
+
+double
+__attribute__((noinline))
+foo (double* __restrict a, double* b)
+{
+  a[0] += b[0];
+  a[1] += b[1];
+  a[2] += b[2];
+  a[3] += b[3];
+  bar (b);
+  return a[5] + b[5];
+}
+
+double
+foo1 (double* __restrict a, double* b)
+{
+  double c = foo (a, b);
+  return __builtin_exp (c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c 
b/gcc/testsuite/gcc.target/i386/pr112891.c
new file mode 100644
index 000..dbf6c67948a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112891.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3" } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
+
+void
+__attribute__((noinline))
+bar (double* a)
+{
+  a[0] = 1.0;
+  a[1] = 2.0;
+}
+
+void
+__attribute__((noinline))
+foo (double* __restrict a, double* b)
+{
+  a[0] += b[0];
+  a[1] += b[1];
+  a[2] += b[2];
+  a[3] += b[3];
+  bar (b);
+}
+
+double
+foo1 (double* __restrict a, double* b)
+{
+  foo (a, b);
+  return __builtin_exp (b[1]);
+}
-- 
2.31.1



[PATCH] [ICE] Support vpcmov for V4HF/V4BF/V2HF/V2BF under TARGET_XOP.

2023-12-07 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/112904
* config/i386/mmx.md (*xop_pcmov_): New define_insn.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr112904.C: New test.
---
 gcc/config/i386/mmx.md   | 22 +++
 gcc/testsuite/g++.target/i386/pr112904.C | 27 
 2 files changed, 49 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr112904.C

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index a07a921b739..06d6c57876b 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -4136,6 +4136,17 @@ (define_insn "*xop_pcmov_"
   [(set_attr "type" "sse4arg")
(set_attr "mode" "TI")])
 
+(define_insn "*xop_pcmov_"
+  [(set (match_operand:V4F_64 0 "register_operand" "=x")
+(if_then_else:V4F_64
+  (match_operand:V4F_64 3 "register_operand" "x")
+  (match_operand:V4F_64 1 "register_operand" "x")
+  (match_operand:V4F_64 2 "register_operand" "x")))]
+  "TARGET_XOP && TARGET_MMX_WITH_SSE"
+  "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
 (define_insn "*xop_pcmov_"
   [(set (match_operand:VI_16_32 0 "register_operand" "=x")
 (if_then_else:VI_16_32
@@ -4147,6 +4158,17 @@ (define_insn "*xop_pcmov_"
   [(set_attr "type" "sse4arg")
(set_attr "mode" "TI")])
 
+(define_insn "*xop_pcmov_"
+  [(set (match_operand:V2F_32 0 "register_operand" "=x")
+(if_then_else:V2F_32
+  (match_operand:V2F_32 3 "register_operand" "x")
+  (match_operand:V2F_32 1 "register_operand" "x")
+  (match_operand:V2F_32 2 "register_operand" "x")))]
+  "TARGET_XOP"
+  "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
 ;; XOP permute instructions
 (define_insn "mmx_ppermv64"
   [(set (match_operand:V8QI 0 "register_operand" "=x")
diff --git a/gcc/testsuite/g++.target/i386/pr112904.C 
b/gcc/testsuite/g++.target/i386/pr112904.C
new file mode 100644
index 000..556be921197
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr112904.C
@@ -0,0 +1,27 @@
+typedef _Float16 v4hf __attribute__((vector_size(8)));
+typedef short v4hi __attribute__((vector_size(8)));
+typedef _Float16 v2hf __attribute__((vector_size(4)));
+typedef short v2hi __attribute__((vector_size(4)));
+
+typedef __bf16 v4bf __attribute__((vector_size(8)));
+typedef __bf16 v2bf __attribute__((vector_size(4)));
+
+v4hf foo(v4hf a, v4hf b, v4hi c)
+{
+  return c ? a : b;
+}
+
+v2hf foo1(v2hf a, v2hf b, v2hi c)
+{
+  return c ? a : b;
+}
+
+v4bf foo(v4bf a, v4bf b, v4hi c)
+{
+  return c ? a : b;
+}
+
+v2bf foo1(v2bf a, v2bf b, v2hi c)
+{
+  return c ? a : b;
+}
-- 
2.31.1



[v3 PATCH] Simplify vector ((VCE (a cmp b ? -1 : 0)) < 0) ? c : d to just (VCE ((a cmp b) ? (VCE c) : (VCE d))).

2023-12-10 Thread liuhongt
> since you are looking at TYPE_PRECISION below you want
> VECTOR_INTEGER_TYPE_P here as well?  The alternative
> would be to compare TYPE_SIZE.
>
> Some of the checks feel redundant but are probably good for
> documentation purposes.
>
> OK with using VECTOR_INTEGER_TYPE_P
Actually, the data type doesn't need to be integer, i.e. x86 supports
vblendvps, so I'm using TYPE_SIZE here; the code is adjusted to

&& tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (type)))
&& (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (type)))
   <= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@6

Here's the updated patch.
Ok for trunk?

When I was working on PR112443, I noticed some misoptimizations:
after we fold _mm{,256}_blendv_epi8/pd/ps into gimple, the backend
fails to combine it back to v{,p}blendv{b,ps,pd} since the pattern is
too complicated, so I think maybe we should handle it at the gimple
level.

The dump is like

  _1 = c_3(D) >= { 0, 0, 0, 0 };
  _2 = VEC_COND_EXPR <_1, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;
  _7 = VIEW_CONVERT_EXPR(_2);
  _8 = VIEW_CONVERT_EXPR(b_6(D));
  _9 = VIEW_CONVERT_EXPR(a_5(D));
  _10 = _7 < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  _11 = VEC_COND_EXPR <_10, _8, _9>;

It can be optimized to

  _1 = c_2(D) >= { 0, 0, 0, 0 };
  _6 = VEC_COND_EXPR <_1, b_5(D), a_4(D)>;

since _7 is either -1 or 0, the selection _7 < 0 ? _8 : _9 should be
equal to _1 ? b : a as long as the TYPE_PRECISION of the component type
of the second VEC_COND_EXPR is less than or equal to that of the first
one.  The patch adds a gimple pattern to handle that; a worked
illustration follows.
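
A self-contained way to see why this is safe (my own illustration, not
the testcase): after the view-convert, every byte of a selected int
lane is 0x00 or 0xff, so the byte-wise sign test reproduces the
int-wise mask exactly:

typedef int v4si __attribute__((vector_size(16)));
typedef signed char v16qi __attribute__((vector_size(16)));

/* Returns 1 for every input: the bytewise "< 0" test on the
   view-converted 0/-1 int mask selects the same 4-byte groups as the
   original int-wise comparison.  */
int
mask_view_ok (v4si c)
{
  v4si m = c >= (v4si) { 0, 0, 0, 0 } ? (v4si) { -1, -1, -1, -1 }
                                      : (v4si) { 0, 0, 0, 0 };
  v16qi b = (v16qi) m;              /* VIEW_CONVERT_EXPR  */
  v16qi lt0 = b < (v16qi) { 0 };    /* 0xff -> -1, 0x00 -> 0  */
  return __builtin_memcmp (&lt0, &m, sizeof (m)) == 0;
}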

gcc/ChangeLog:

* match.pd (VCE (a cmp b ? -1 : 0) < 0) ? c : d ---> (VCE ((a
cmp b) ? (VCE c) : (VCE d))): New gimple simplification.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512vl-blendv-3.c: New test.
* gcc.target/i386/blendv-3.c: New test.
---
 gcc/match.pd  | 23 ++
 .../gcc.target/i386/avx512vl-blendv-3.c   |  6 +++
 gcc/testsuite/gcc.target/i386/blendv-3.c  | 46 +++
 3 files changed, 75 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/blendv-3.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 4d554ba4721..359c7b07dc3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5190,6 +5190,29 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (optimize_vectors_before_lowering_p () && types_match (@0, @3))
   (vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
 
+/*  ((VCE (a cmp b ? -1 : 0)) < 0) ? c : d is just
+(VCE ((a cmp b) ? (VCE c) : (VCE d))) when TYPE_PRECISION of the
+component type of the outer vec_cond is greater than or equal to the inner one.  */
+(for cmp (simple_comparison)
+ (simplify
+  (vec_cond
+(lt (view_convert@5 (vec_cond@6 (cmp@4 @0 @1)
+   integer_all_onesp
+   integer_zerop))
+ integer_zerop) @2 @3)
+  (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && VECTOR_INTEGER_TYPE_P (TREE_TYPE (@5))
+   && !TYPE_UNSIGNED (TREE_TYPE (@5))
+   && VECTOR_TYPE_P (TREE_TYPE (@6))
+   && VECTOR_TYPE_P (type)
+   && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (type)))
+   && (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (type)))
+ <= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@6
+   && TYPE_SIZE (type) == TYPE_SIZE (TREE_TYPE (@6)))
+   (with { tree vtype = TREE_TYPE (@6);}
+ (view_convert:type
+   (vec_cond @4 (view_convert:vtype @2) (view_convert:vtype @3)))
+
 /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
 (simplify
  (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
new file mode 100644
index 000..2777e72ab5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
+/* { dg-final { scan-assembler-not {vpcmp} } } */
+
+#include "blendv-3.c"
diff --git a/gcc/testsuite/gcc.target/i386/blendv-3.c 
b/gcc/testsuite/gcc.target/i386/blendv-3.c
new file mode 100644
index 000..fa0fb067a73
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blendv-3.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
+/* { dg-final { scan-assembler-not {vpcmp} } } */
+
+#include 
+
+__m256i
+foo (__m256i a, __m256i b, __m256i c)
+{
+  return _mm256_blendv_epi8 (a, b, ~c < 0);
+}
+
+__m256d
+foo1 (__m256d a, __m256d b, __m256i c)
+{
+  __m256i d = ~c < 0;
+  return _mm256_blendv_pd (a, b, (__m256d)d);
+}
+
+__m256
+foo2 (__m256 a, __m256 b, __m256i c)
+{
+  __m256i d = ~c < 0;
+  return _mm256_blendv_ps (a, b, (__m256)d);
+}
+
+__m128i
+foo4 (__m128i a, __m128i b, __m128i c)
+{
+  return _m

[PATCH] Adjust vectorized cost for reduction.

2023-12-11 Thread liuhongt
x86 doesn't support horizontal reduction instructions; reduc_op_scal_m
is emulated with vec_extract_half + op (at half the vector length).
Take that into account when calculating the cost for vectorization; a
worked example follows.
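
For example (my own arithmetic), a v8si plus-reduction is emulated as
a log2 ladder of extract-high-half + add, which is what the new cost
formula charges:

typedef int v8si __attribute__((vector_size(32)));

/* Illustrative emulation ladder for reduc_plus_scal_v8si:
   exact_log2 (8) = 3 steps, each one extract plus one add, i.e.
   cost = sse_op * exact_log2 (nunits) * 2.  */
int
reduc_plus_v8si (v8si v)
{
  int t[8];
  __builtin_memcpy (t, &v, sizeof (t));
  for (int width = 4; width >= 1; width >>= 1)  /* 3 halving steps  */
    for (int i = 0; i < width; i++)
      t[i] += t[i + width];
  return t[0];
}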

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big performance impact on SPEC2017 as measured on ICX.
Ok for trunk?

gcc/ChangeLog:

PR target/112325
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Handle reduction vec_to_scalar.
(ix86_vector_costs::ix86_vect_reduc_cost): New function.
---
 gcc/config/i386/i386.cc | 45 +
 1 file changed, 45 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4b6bad37c8f..02c9a5004a1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24603,6 +24603,7 @@ private:
 
   /* Estimate register pressure of the vectorized code.  */
   void ix86_vect_estimate_reg_pressure ();
+  unsigned ix86_vect_reduc_cost (stmt_vec_info, tree);
   /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
  estimation of register pressure.
  ??? Currently it's only used by vec_construct/scalar_to_vec
@@ -24845,6 +24846,12 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
if (TREE_CODE (op) == SSA_NAME)
  TREE_VISITED (op) = 0;
 }
+  /* This is a reduc_*_scal_m; x86 supports reduc_*_scal_m via emulation.  */
+  else if (kind == vec_to_scalar
+  && stmt_info
+  && vect_is_reduction (stmt_info))
+stmt_cost = ix86_vect_reduc_cost (stmt_info, vectype);
+
   if (stmt_cost == -1)
 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
@@ -24875,6 +24882,44 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   return retval;
 }
 
+/* x86 doesn't support horizontal reduction instructions;
+   reduc_op_scal_m is emulated with vec_extract_hi + op.  */
+unsigned
+ix86_vector_costs::ix86_vect_reduc_cost (stmt_vec_info stmt_info,
+tree vectype)
+{
+  gcc_assert (vectype);
+  unsigned cost = 0;
+  machine_mode mode = TYPE_MODE (vectype);
+  unsigned len = GET_MODE_SIZE (mode);
+
+  /* PSADBW is used for reduc_plus_scal_{v16qi, v8qi, v4qi}.  */
+  if (GET_MODE_INNER (mode) == E_QImode
+  && stmt_info
+  && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN
+  && gimple_assign_rhs_code (stmt_info->stmt) == PLUS_EXPR)
+{
+  cost = ix86_cost->sse_op;
+  /* vec_extract_hi + vpaddb for 256/512-bit reduc_plus_scal_v*qi.  */
+  if (len > 16)
+   cost += exact_log2 (len >> 4) * ix86_cost->sse_op * 2;
+}
+  else
+/* vec_extract_hi + op.  */
+cost = ix86_cost->sse_op * exact_log2 (TYPE_VECTOR_SUBPARTS (vectype)) * 2;
+
+  /* Count extra uops for TARGET_*_SPLIT_REGS.  NB: There's no target which
+     supports 512-bit vectors but has TARGET_AVX256/128_SPLIT_REGS.
+     ix86_vect_cost is not used since the reduction sequence consists of
+     mixed vector-length instructions after vec_extract_hi.  */
+  if ((len == 64 && TARGET_AVX512_SPLIT_REGS)
+  || (len == 32 && TARGET_AVX256_SPLIT_REGS)
+  || (len == 16 && TARGET_AVX256_SPLIT_REGS))
+cost += ix86_cost->sse_op;
+
+  return cost;
+}
+
 void
 ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
 {
-- 
2.31.1



[PATCH] Force broadcast constant to mem for vec_dup{v4di, v8si, v4df, v8df} when TARGET_AVX2 is not available.

2023-12-12 Thread liuhongt
vpbroadcastd/vpbroadcastq is available under TARGET_AVX2, but the
vec_dup{v4di,v8si} pattern is available under AVX with a memory
operand. Putting the constant in a register therefore causes
LRA/reload to generate a spill and reload (illustrated below).
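
An illustrative shape of the problem (hypothetical testcase, not from
the patch): with -mavx but not -mavx2, the constant below should be
broadcast straight from memory (e.g. vbroadcastsd from .rodata) rather
than materialized in a GPR and spilled:

typedef long long v4di __attribute__((vector_size(32)));

/* Expect a single memory broadcast here under -mavx, no spill.  */
v4di
dup_const (void)
{
  return (v4di) { 0x1122334455667788LL, 0x1122334455667788LL,
                  0x1122334455667788LL, 0x1122334455667788LL };
}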

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/112992
* config/i386/i386-expand.cc
(ix86_convert_const_wide_int_to_broadcast): Don't convert to
broadcast for vec_dup{v4di,v8si} when TARGET_AVX2 is not
available.
(ix86_broadcast_from_constant): Allow broadcast for V4DI/V8SI
when !TARGET_AVX2 since it will be forced to memory later.
(ix86_expand_vector_move): Force constant to mem for
vec_dup{vssi,v4di} when TARGET_AVX2 is not available.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr100865-7a.c: Adjust testcase.
* gcc.target/i386/pr100865-7c.c: Ditto.
* gcc.target/i386/pr112992.c: New test.
---
 gcc/config/i386/i386-expand.cc  | 48 +
 gcc/testsuite/gcc.target/i386/pr100865-7a.c |  3 +-
 gcc/testsuite/gcc.target/i386/pr100865-7c.c |  3 +-
 gcc/testsuite/gcc.target/i386/pr112992.c| 30 +
 4 files changed, 62 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112992.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a53d69d5400..fad4f34f905 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -297,6 +297,12 @@ ix86_convert_const_wide_int_to_broadcast (machine_mode 
mode, rtx op)
   if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
 return nullptr;
 
+  unsigned int msize = GET_MODE_SIZE (mode);
+
+  /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm.  */
+  if (msize != 16 && msize != 32 && msize != 64)
+return nullptr;
+
   /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
  broadcast only if vector broadcast is available.  */
   if (!TARGET_AVX
@@ -309,18 +315,23 @@ ix86_convert_const_wide_int_to_broadcast (machine_mode 
mode, rtx op)
   HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
   HOST_WIDE_INT val_broadcast;
   scalar_int_mode broadcast_mode;
-  if (TARGET_AVX2
+  /* vpbroadcastb zmm requires TARGET_AVX512BW.  */
+  if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
   && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
 val_broadcast))
 broadcast_mode = QImode;
-  else if (TARGET_AVX2
+  else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
  val_broadcast))
 broadcast_mode = HImode;
-  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
+  /* vbroadcasts[sd] only support memory operands w/o AVX2.
+     When msize == 16, pshufd is used for vec_duplicate.
+     When msize == 64, vpbroadcastd is used, and TARGET_AVX512F must be
+     present.  */
+  else if ((msize != 32 || TARGET_AVX2)
+  && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
   val_broadcast))
 broadcast_mode = SImode;
-  else if (TARGET_64BIT
+  else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
  val_broadcast))
 broadcast_mode = DImode;
@@ -596,23 +607,17 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op)
   && INTEGRAL_MODE_P (mode))
 return nullptr;
 
+  unsigned int msize = GET_MODE_SIZE (mode);
+  unsigned int inner_size = GET_MODE_SIZE (GET_MODE_INNER ((mode)));
+
   /* Convert CONST_VECTOR to a non-standard SSE constant integer
  broadcast only if vector broadcast is available.  */
-  if (!(TARGET_AVX2
-   || (TARGET_AVX
-   && (GET_MODE_INNER (mode) == SImode
-   || GET_MODE_INNER (mode) == DImode))
-   || FLOAT_MODE_P (mode))
-  || standard_sse_constant_p (op, mode))
+  if (standard_sse_constant_p (op, mode))
 return nullptr;
 
-  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
- We can still put 64-bit integer constant in memory when
- avx512 embed broadcast is available.  */
-  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
-  && (!TARGET_AVX512F
- || (GET_MODE_SIZE (mode) == 64 && !TARGET_EVEX512)
- || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
+  /* vpbroadcast[b,w] is available under TARGET_AVX2,
+ or TARGET_AVX512BW for zmm.  */
+  if (inner_size < 4 && !(msize == 64 ? TARGET_AVX512BW : TARGET_AVX2))
 return nullptr;
 
   if (GET_MODE_INNER (mode) == TImode)
@@ -710,7 +715,14 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
 constant or scalar mem.  */
  op1 = gen_reg_rtx (mode);
  if (FLOAT_MODE_P (mode)
- || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
+ || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)
+ /* vbro

[V2 PATCH] Handle bitop with INTEGER_CST in analyze_and_compute_bitop_with_inv_effect.

2023-11-06 Thread liuhongt
analyze_and_compute_bitop_with_inv_effect assumes the first operand is
loop invariant, which is not the case when it's an INTEGER_CST; swap
the operands then (closed forms illustrated below).
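
The closed forms involved are simple (my own illustration): AND and
IOR are idempotent, so a single application suffices, while XOR
depends only on the parity of the trip count:

/* What final value replacement can derive for the loops in the new
   testcase (illustrative):  */
unsigned and_form (unsigned tmp) { return tmp & 11304; } /* 64x &=  */
unsigned ior_form (unsigned tmp) { return tmp | 11304; } /* 64x |=  */
unsigned xor_form (unsigned tmp) { return tmp; }         /* even count of ^=  */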

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/105735
PR tree-optimization/111972
* tree-scalar-evolution.cc
(analyze_and_compute_bitop_with_inv_effect): Handle bitop with
INTEGER_CST.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr105735-3.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr105735-3.c | 87 ++
 gcc/tree-scalar-evolution.cc   |  3 +
 2 files changed, 90 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-3.c

diff --git a/gcc/testsuite/gcc.target/i386/pr105735-3.c 
b/gcc/testsuite/gcc.target/i386/pr105735-3.c
new file mode 100644
index 000..9e268a1a997
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105735-3.c
@@ -0,0 +1,87 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 8 "sccp" } } */
+
+unsigned int
+__attribute__((noipa))
+foo (unsigned int tmp)
+{
+  for (int bit = 0; bit < 64; bit++)
+tmp &= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo1 (unsigned int tmp)
+{
+  for (int bit = 63; bit >= 0; bit -=3)
+tmp &= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo2 (unsigned int tmp)
+{
+  for (int bit = 0; bit < 64; bit++)
+tmp |= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo3 (unsigned int tmp)
+{
+  for (int bit = 63; bit >= 0; bit -=3)
+tmp |= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo4 (unsigned int tmp)
+{
+  for (int bit = 0; bit < 64; bit++)
+tmp ^= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo5 (unsigned int tmp)
+{
+  for (int bit = 0; bit < 63; bit++)
+tmp ^= 11304;
+  return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+f (unsigned int tmp, int bit)
+{
+  unsigned int res = tmp;
+  for (int i = 0; i < bit; i++)
+res &= 11304;
+  return res;
+}
+
+unsigned int
+__attribute__((noipa))
+f1 (unsigned int tmp, int bit)
+{
+  unsigned int res = tmp;
+  for (int i = 0; i < bit; i++)
+res |= 11304;
+  return res;
+}
+
+unsigned int
+__attribute__((noipa))
+f2 (unsigned int tmp, int bit)
+{
+  unsigned int res = tmp;
+  for (int i = 0; i < bit; i++)
+res ^= 11304;
+  return res;
+}
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 70b17c5bca1..f61277c32df 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3689,6 +3689,9 @@ analyze_and_compute_bitop_with_inv_effect (class loop* 
loop, tree phidef,
   match_op[0] = gimple_assign_rhs1 (def);
   match_op[1] = gimple_assign_rhs2 (def);
 
+  if (expr_invariant_in_loop_p (loop, match_op[1]))
+std::swap (match_op[0], match_op[1]);
+
   if (TREE_CODE (match_op[1]) != SSA_NAME
   || !expr_invariant_in_loop_p (loop, match_op[0])
   || !(header_phi = dyn_cast  (SSA_NAME_DEF_STMT (match_op[1])))
-- 
2.31.1



[PATCH] Fix wrong code due to vec_merge + pcmp to blendvb splitter.

2023-11-09 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.
Will test and backport to GCC13/GCC12 release branch.

gcc/ChangeLog:

PR target/112443
* config/i386/sse.md (*avx2_pcmp3_4): Fix swap condition
from LT to GE since there's a `not' in the pattern.
(*avx2_pcmp3_5): Ditto.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr112443.C: New test.
---
 gcc/config/i386/sse.md   |   4 +-
 gcc/testsuite/g++.target/i386/pr112443.C | 108 +++
 2 files changed, 110 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 33198756bb0..9eefe9ed45b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17082,7 +17082,7 @@ (define_insn_and_split "*avx2_pcmp3_4"
 (match_dup 4))]
 UNSPEC_BLENDV))]
 {
-  if (INTVAL (operands[5]) == 1)
+  if (INTVAL (operands[5]) == 5)
 std::swap (operands[1], operands[2]);
   operands[3] = gen_lowpart (mode, operands[3]);
 })
@@ -17112,7 +17112,7 @@ (define_insn_and_split "*avx2_pcmp3_5"
 (match_dup 4))]
 UNSPEC_BLENDV))]
 {
-  if (INTVAL (operands[5]) == 1)
+  if (INTVAL (operands[5]) == 5)
 std::swap (operands[1], operands[2]);
 })
 
diff --git a/gcc/testsuite/g++.target/i386/pr112443.C 
b/gcc/testsuite/g++.target/i386/pr112443.C
new file mode 100644
index 000..ebfa9b4a753
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr112443.C
@@ -0,0 +1,108 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */
+
+#include 
+#include 
+#include 
+#include 
+
+#define AVX512BW
+#define AVX512VL
+
+#include "avx512f-helper.h"
+
+struct TensorIteratorBase{
+  char* in;
+  char* out;
+
+  void for_each(std::function<void(char*, char*, int64_t)> loop){
+loop(out, in, 32);
+  }
+};
+
+class Vectorized {
+protected:
+  __m256i values;
+
+  static inline __m256i invert(const __m256i& v) {
+const auto ones = _mm256_set1_epi64x(-1);
+return _mm256_xor_si256(ones, v);
+  }
+public:
+  operator __m256i() const {
+return values;
+  }
+
+  static constexpr int size() {
+return 32;
+  }
+
+  Vectorized() {}
+  Vectorized(__m256i v) : values(v) {}
+  Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+  const Vectorized& mask) {
+return _mm256_blendv_epi8(a, b, mask);
+  }
+  static Vectorized loadu(const void* ptr) {
+return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  void store(void* ptr) const {
+_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+__m256i max = _mm256_max_epu8(values, other);
+return invert(_mm256_cmpeq_epi8(max, values));
+  }
+  Vectorized operator-(const Vectorized& b) {
+return _mm256_sub_epi8(values, b);
+  }
+};
+
+std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) {
+  uint8_t buf[Vectorized::size()];
+  vec.store(buf);
+  stream << "vec[";
+  for (int i = 0; i != Vectorized::size(); i++) {
+if (i != 0)
+  stream << ", ";
+stream << buf[i]*1;
+  }
+  stream << "]";
+  return stream;
+}
+
+void run(TensorIteratorBase iter){
+  Vectorized zero_vec(0);
+  Vectorized one_vec(1);
+
+  iter.for_each([=](char* out, char* in, int64_t size) {
+for (int64_t i = 0; i <= size - Vectorized::size(); i += 
Vectorized::size()) {
+  auto self_vec = Vectorized::loadu(in + i);
+  auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec);
+  auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec);
+  auto outv = left - right;
+  outv.store(out + i);
+}
+  });
+}
+
+void
+test_256 (){
+  char in[32];
+  char out[32];
+  for(auto& x: in) x = 1;
+  run(TensorIteratorBase{in, out});
+  Vectorized::loadu (out);
+  for (int i = 0; i != 32; i++)
+if (out[i] != 1)
+  __builtin_abort ();
+}
+
+void
+test_128 ()
+{
+}
-- 
2.31.1



[PATCH] Simplify vector ((VCE?(a cmp b ? -1 : 0)) < 0) ? c : d to just (VCE:a cmp VCE:b) ? c : d.

2023-11-09 Thread liuhongt
When I was working on PR112443, I noticed some misoptimizations: after
we fold _mm{,256}_blendv_epi8/pd/ps into gimple, the backend fails to
combine it back to v{,p}blendv{b,ps,pd} since the pattern is too
complicated, so I think maybe we should handle it at the gimple level.

The dump is like

  _1 = c_3(D) >= { 0, 0, 0, 0 };
  _2 = VEC_COND_EXPR <_1, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;
  _7 = VIEW_CONVERT_EXPR(_2);
  _8 = VIEW_CONVERT_EXPR(b_6(D));
  _9 = VIEW_CONVERT_EXPR(a_5(D));
  _10 = _7 < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  _11 = VEC_COND_EXPR <_10, _8, _9>;


It can be optimized to

  _6 = VIEW_CONVERT_EXPR(b_4(D));
  _7 = VIEW_CONVERT_EXPR(a_3(D));
  _10 = VIEW_CONVERT_EXPR(c_1(D));
  _5 = _10 >= { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  _8 = VEC_COND_EXPR <_5, _6, _7>;
  _9 = VIEW_CONVERT_EXPR<__m256i>(_8);

since _7 is either -1 or 0, _7 < 0 should be equal to _1 = c_3(D) >= { 0, 0, 0, 0 };
The patch add a gimple pattern to handle that.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

* match.pd (VCE:(a cmp b ? -1 : 0) < 0) ? c : d ---> (VCE:a cmp
VCE:b) ? c : d): New gimple simplification.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512vl-blendv-3.c: New test.
* gcc.target/i386/blendv-3.c: New test.
---
 gcc/match.pd  | 17 +++
 .../gcc.target/i386/avx512vl-blendv-3.c   |  6 +++
 gcc/testsuite/gcc.target/i386/blendv-3.c  | 46 +++
 3 files changed, 69 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/blendv-3.c

diff --git a/gcc/match.pd b/gcc/match.pd
index dbc811b2b38..e6f9c4fa1fd 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5170,6 +5170,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (optimize_vectors_before_lowering_p () && types_match (@0, @3))
   (vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
 
+(for cmp (simple_comparison)
+ (simplify
+  (vec_cond
+(lt@4 (view_convert?@5 (vec_cond (cmp @0 @1)
+integer_all_onesp
+integer_zerop))
+ integer_zerop) @2 @3)
+  (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && VECTOR_INTEGER_TYPE_P (TREE_TYPE (@5))
+   && TYPE_SIGN (TREE_TYPE (@0)) == TYPE_SIGN (TREE_TYPE (@5))
+   && VECTOR_TYPE_P (type))
+   (with {
+  tree itype = TREE_TYPE (@5);
+  tree vbtype = TREE_TYPE (@4);}
+ (vec_cond (cmp:vbtype (view_convert:itype @0)
+  (view_convert:itype @1)) @2 @3)
+
 /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
 (simplify
  (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
new file mode 100644
index 000..2777e72ab5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
+/* { dg-final { scan-assembler-not {vpcmp} } } */
+
+#include "blendv-3.c"
diff --git a/gcc/testsuite/gcc.target/i386/blendv-3.c 
b/gcc/testsuite/gcc.target/i386/blendv-3.c
new file mode 100644
index 000..fa0fb067a73
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blendv-3.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
+/* { dg-final { scan-assembler-not {vpcmp} } } */
+
+#include 
+
+__m256i
+foo (__m256i a, __m256i b, __m256i c)
+{
+  return _mm256_blendv_epi8 (a, b, ~c < 0);
+}
+
+__m256d
+foo1 (__m256d a, __m256d b, __m256i c)
+{
+  __m256i d = ~c < 0;
+  return _mm256_blendv_pd (a, b, (__m256d)d);
+}
+
+__m256
+foo2 (__m256 a, __m256 b, __m256i c)
+{
+  __m256i d = ~c < 0;
+  return _mm256_blendv_ps (a, b, (__m256)d);
+}
+
+__m128i
+foo4 (__m128i a, __m128i b, __m128i c)
+{
+  return _mm_blendv_epi8 (a, b, ~c < 0);
+}
+
+__m128d
+foo5 (__m128d a, __m128d b, __m128i c)
+{
+  __m128i d = ~c < 0;
+  return _mm_blendv_pd (a, b, (__m128d)d);
+}
+
+__m128
+foo6 (__m128 a, __m128 b, __m128i c)
+{
+  __m128i d = ~c < 0;
+  return _mm_blendv_ps (a, b, (__m128)d);
+}
-- 
2.31.1



[PATCH] Support vec_set/vec_extract/vec_init for V4HF/V2HF.

2023-11-09 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_expand_vector_init_duplicate): Handle V4HF/V4BF and
V2HF/V2BF.
(ix86_expand_vector_init_one_nonzero): Ditto.
(ix86_expand_vector_init_one_var): Ditto.
(ix86_expand_vector_init_general): Ditto.
(ix86_expand_vector_set_var): Ditto.
(ix86_expand_vector_set): Ditto.
(ix86_expand_vector_extract): Ditto.
* config/i386/mmx.md
(mmxdoublevecmode): Extend to V4HF/V4BF/V2HF/V2BF.
(*mmx_pinsrw): Extend to V4FI_64, add a new alternative (&x,
x, x), add a new define_split after the pattern.
(*mmx_pextrw): New define_insn.
(mmx_pshufw_1): Rename to ..
(mmx_pshufw_1): .. this, extend to V4FI_64.
(*mmx_pblendw64): Extend to V4FI_64.
(*vec_dup): New define_insn.
(vec_setv4hi): Rename to ..
(vec_set): .. this, and extend to V4FI_64
(vec_extractv4hihi): Rename to ..
(vec_extract): .. this, and extend
to V4FI_64.
(vec_init): New define_insn.
(*pinsrw): Extend to V2FI_32, add a new alternative (&x,
x, x), and add a new define_split after it.
(*pextrw): New define_insn.
(vec_setv2hi): Rename to ..
(vec_set): .. this, extend to V2FI_32.
(vec_extractv2hihi): Rename to ..
(vec_extract): .. this, extend to
V2FI_32.
(*punpckwd): Extend to V2FI_32.
(*pshufw_1): Rename to ..
(*pshufw_1): .. this, extend to V2FI_32.
(vec_initv2hihi): Rename to ..
(vec_init): .. this, and extend to
V2FI_32.
(*vec_dup): New define_insn.
* config/i386/sse.md (*vec_extract): Refine constraint
from v to Yw.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-vec_elem-1.c: New test.
* gcc.target/i386/part-vect-vec_elem-2.c: New test.
---
 gcc/config/i386/i386-expand.cc|  60 
 gcc/config/i386/mmx.md| 271 ++
 gcc/config/i386/sse.md|   4 +-
 .../gcc.target/i386/part-vect-vec_elem-1.c| 135 +
 .../gcc.target/i386/part-vect-vec_elem-2.c| 135 +
 5 files changed, 541 insertions(+), 64 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vec_elem-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vec_elem-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8fad73c1549..b52ec51fbe4 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15592,6 +15592,17 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
}
   goto widen;
 
+case E_V4HFmode:
+case E_V4BFmode:
+  if (TARGET_MMX_WITH_SSE)
+   {
+ val = force_reg (GET_MODE_INNER (mode), val);
+ rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
+ emit_insn (gen_rtx_SET (target, x));
+ return true;
+   }
+  return false;
+
 case E_V2HImode:
   if (TARGET_SSE2)
{
@@ -15605,6 +15616,17 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
}
   return false;
 
+case E_V2HFmode:
+case E_V2BFmode:
+  if (TARGET_SSE2)
+   {
+ val = force_reg (GET_MODE_INNER (mode), val);
+ rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
+ emit_insn (gen_rtx_SET (target, x));
+ return true;
+   }
+  return false;
+
 case E_V8QImode:
 case E_V4QImode:
   if (!mmx_ok)
@@ -15815,6 +15837,8 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, 
machine_mode mode,
   use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   break;
 case E_V4HImode:
+case E_V4HFmode:
+case E_V4BFmode:
   use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
   break;
 case E_V4QImode:
@@ -16051,6 +16075,8 @@ ix86_expand_vector_init_one_var (bool mmx_ok, 
machine_mode mode,
 case E_V4SImode:
 case E_V8HImode:
 case E_V4HImode:
+case E_V4HFmode:
+case E_V4BFmode:
   break;
 
 case E_V16QImode:
@@ -16438,6 +16464,7 @@ ix86_expand_vector_init_general (bool mmx_ok, 
machine_mode mode,
   rtx ops[64], op0, op1, op2, op3, op4, op5;
   machine_mode half_mode = VOIDmode;
   machine_mode quarter_mode = VOIDmode;
+  machine_mode int_inner_mode = VOIDmode;
   int n, i;
 
   switch (mode)
@@ -16582,6 +16609,13 @@ quarter:
   ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
   return;
 
+case E_V4HFmode:
+case E_V4BFmode:
+case E_V2HFmode:
+case E_V2BFmode:
+  int_inner_mode = HImode;
+  break;
+
 case E_V4HImode:
 case E_V8QImode:
 
@@ -16613,6 +16647,16 @@ quarter:
  for (j = 0; j < n_elt_per_word; ++j)
{
  rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
+ if (int_inn

[PATCH] Simplify vector ((VCE?(a cmp b ? -1 : 0)) < 0) ? c : d to just VCE:((a cmp b) ? (VCE c) : (VCE d)).

2023-11-09 Thread liuhongt
When I was working on PR112443, I noticed some misoptimizations:
after we fold _mm{,256}_blendv_epi8/pd/ps into gimple, the backend
fails to combine it back to v{,p}blendv{b,ps,pd} since the pattern is
too complicated, so I think maybe we should handle it at the gimple
level.

The dump is like

  _1 = c_3(D) >= { 0, 0, 0, 0 };
  _2 = VEC_COND_EXPR <_1, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;
  _7 = VIEW_CONVERT_EXPR(_2);
  _8 = VIEW_CONVERT_EXPR(b_6(D));
  _9 = VIEW_CONVERT_EXPR(a_5(D));
  _10 = _7 < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  _11 = VEC_COND_EXPR <_10, _8, _9>;

It can be optimized to

  _1 = c_2(D) >= { 0, 0, 0, 0 };
  _6 = VEC_COND_EXPR <_1, b_5(D), a_4(D)>;

since _7 is either -1 or 0, the selection _7 < 0 ? _8 : _9 should be
equal to _1 ? b : a as long as the TYPE_PRECISION of the component type
of the second VEC_COND_EXPR is less than or equal to that of the first
one.  The patch adds a gimple pattern to handle that.

gcc/ChangeLog:

* match.pd (VCE:(a cmp b ? -1 : 0) < 0) ? c : d ---> VCE:((a
cmp b) ? (VCE:c) : (VCE:d)): New gimple simplification.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512vl-blendv-3.c: New test.
* gcc.target/i386/blendv-3.c: New test.
---
 gcc/match.pd  | 19 
 .../gcc.target/i386/avx512vl-blendv-3.c   |  6 +++
 gcc/testsuite/gcc.target/i386/blendv-3.c  | 46 +++
 3 files changed, 71 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/blendv-3.c

diff --git a/gcc/match.pd b/gcc/match.pd
index dbc811b2b38..4d823882a7c 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5170,6 +5170,25 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (optimize_vectors_before_lowering_p () && types_match (@0, @3))
   (vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
 
+(for cmp (simple_comparison)
+ (simplify
+  (vec_cond
+(lt (view_convert?@5 (vec_cond@6 (cmp@4 @0 @1)
+integer_all_onesp
+integer_zerop))
+ integer_zerop) @2 @3)
+  (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && VECTOR_INTEGER_TYPE_P (TREE_TYPE (@5))
+   && !TYPE_UNSIGNED (TREE_TYPE (@5))
+   && VECTOR_TYPE_P (TREE_TYPE (@6))
+   && VECTOR_TYPE_P (type)
+   && (TYPE_PRECISION (TREE_TYPE (type))
+ <= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@6
+   && TYPE_SIZE (type) == TYPE_SIZE (TREE_TYPE (@6)))
+   (with { tree vtype = TREE_TYPE (@6);}
+ (view_convert:type
+   (vec_cond @4 (view_convert:vtype @2) (view_convert:vtype @3)))
+
 /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
 (simplify
  (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
new file mode 100644
index 000..2777e72ab5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
+/* { dg-final { scan-assembler-not {vpcmp} } } */
+
+#include "blendv-3.c"
diff --git a/gcc/testsuite/gcc.target/i386/blendv-3.c 
b/gcc/testsuite/gcc.target/i386/blendv-3.c
new file mode 100644
index 000..fa0fb067a73
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blendv-3.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
+/* { dg-final { scan-assembler-not {vpcmp} } } */
+
+#include 
+
+__m256i
+foo (__m256i a, __m256i b, __m256i c)
+{
+  return _mm256_blendv_epi8 (a, b, ~c < 0);
+}
+
+__m256d
+foo1 (__m256d a, __m256d b, __m256i c)
+{
+  __m256i d = ~c < 0;
+  return _mm256_blendv_pd (a, b, (__m256d)d);
+}
+
+__m256
+foo2 (__m256 a, __m256 b, __m256i c)
+{
+  __m256i d = ~c < 0;
+  return _mm256_blendv_ps (a, b, (__m256)d);
+}
+
+__m128i
+foo4 (__m128i a, __m128i b, __m128i c)
+{
+  return _mm_blendv_epi8 (a, b, ~c < 0);
+}
+
+__m128d
+foo5 (__m128d a, __m128d b, __m128i c)
+{
+  __m128i d = ~c < 0;
+  return _mm_blendv_pd (a, b, (__m128d)d);
+}
+
+__m128
+foo6 (__m128 a, __m128 b, __m128i c)
+{
+  __m128i d = ~c < 0;
+  return _mm_blendv_ps (a, b, (__m128)d);
+}
-- 
2.31.1



[PATCH] Fix ICE in vectorizable_nonlinear_induction with bitfield.

2023-11-13 Thread liuhongt
The ICE is on this assert in vectorizable_nonlinear_induction:

  if (TREE_CODE (init_expr) == INTEGER_CST)
    init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
  else
    gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
                                       TREE_TYPE (init_expr)));

where init_expr has a 24-bit integer type while vectype has 32-bit
components.

The "fix" is to bail out instead of asserting.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?(and backport to GCC-13)

gcc/ChangeLog:

PR tree-optimization/112496
* tree-vect-loop.cc (vectorizable_nonlinear_induction): Return
false when !tree_nop_conversion_p (TREE_TYPE (vectype),
TREE_TYPE (init_expr).

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr112496.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr112496.c |  7 +++
 gcc/tree-vect-loop.cc| 13 ++---
 2 files changed, 17 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112496.c

diff --git a/gcc/testsuite/gcc.target/i386/pr112496.c 
b/gcc/testsuite/gcc.target/i386/pr112496.c
new file mode 100644
index 000..c478fda9cce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112496.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+struct T { int x : 24; } v;
+void f1(int x) {
+  while (v.x - ((v.x <<= 1) - v.x)) ;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1304b7ece79..75d69bbc30e 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9429,9 +9429,16 @@ vectorizable_nonlinear_induction (loop_vec_info 
loop_vinfo,
 
   if (TREE_CODE (init_expr) == INTEGER_CST)
 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
-  else
-gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
-  TREE_TYPE (init_expr)));
+  else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
+{
+  /* INIT_EXPR could be a bit_field, bail out for such a case.  */
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"nonlinear induction vectorization failed:"
+" component type of vectype is not a nop conversion"
+" from type of init_expr.\n");
+  return false;
+}
 
   switch (induction_type)
 {
-- 
2.31.1



[PATCH] Fix ICE of unrecognizable insn.

2023-11-15 Thread liuhongt
The newly added splitter will generate

(insn 58 56 59 2 (set (reg:V4HI 20 xmm0 [129])
(vec_duplicate:V4HI (reg:HI 22 xmm2 [123]))) "testcase.c":16:21 -1

But we only have

(define_insn "*vec_dupv4hi"
  [(set (match_operand:V4HI 0 "register_operand" "=y,Yw")
(vec_duplicate:V4HI
  (truncate:HI
(match_operand:SI 1 "register_operand" "0,Yw"))))]

The patch adds patterns for V4HI and V2HI.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/112532
* config/i386/mmx.md (*vec_dup): Extend for V4HI and
V2HI.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr112532.c: New test.
---
 gcc/config/i386/mmx.md   |  8 
 gcc/testsuite/gcc.target/i386/pr112532.c | 21 +
 2 files changed, 25 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112532.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index a3d08bb9d3b..e4b89160fc0 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -5277,8 +5277,8 @@ (define_insn "*vec_dupv4hi"
(set_attr "mode" "DI,TI")])
 
 (define_insn "*vec_dup"
-  [(set (match_operand:V4F_64 0 "register_operand" "=Yw")
-   (vec_duplicate:V4F_64
+  [(set (match_operand:V4FI_64 0 "register_operand" "=Yw")
+   (vec_duplicate:V4FI_64
  (match_operand: 1 "register_operand" "Yw")))]
   "TARGET_MMX_WITH_SSE"
   "%vpshuflw\t{$0, %1, %0|%0, %1, 0}"
@@ -5869,8 +5869,8 @@ (define_insn "*vec_dupv2hi"
(set_attr "mode" "TI")])
 
 (define_insn "*vec_dup"
-  [(set (match_operand:V2F_32 0 "register_operand" "=Yw")
-   (vec_duplicate:V2F_32
+  [(set (match_operand:V2FI_32 0 "register_operand" "=Yw")
+   (vec_duplicate:V2FI_32
  (match_operand: 1 "register_operand" "Yw")))]
   "TARGET_SSE2"
   "%vpshuflw\t{$0, %1, %0|%0, %1, 0}"
diff --git a/gcc/testsuite/gcc.target/i386/pr112532.c b/gcc/testsuite/gcc.target/i386/pr112532.c
new file mode 100644
index 000..690f1d9670d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112532.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4 -O2" } */
+
+typedef char __attribute__((__vector_size__(2))) v16u8;
+typedef int __attribute__((__vector_size__(8))) v64u8;
+typedef unsigned short __attribute__((__vector_size__(2))) v16u16;
+typedef unsigned short __attribute__((__vector_size__(8))) v64u16;
+v64u16 foo0_v64u16_0;
+int __attribute__((__vector_size__(4 * sizeof(int)))) foo0_v128u32_0;
+__attribute__((__vector_size__(8 * sizeof(short)))) unsigned short foo0_v128u16_0;
+v16u16 foo0_v16u16_0;
+v16u8 foo0() {
+  v16u16 v16u16_1 = __builtin_shufflevector(__builtin_shufflevector(__builtin_convertvector(foo0_v128u32_0, v64u16),foo0_v16u16_0, 1, 4, 2, 0, 0, 2, 2, 2),foo0_v16u16_0, 7);
+  foo0_v64u16_0 -= (short)v16u16_1;
+  v64u16 v64u16_3 = __builtin_shufflevector(v16u16_1, __builtin_shufflevector((v16u16){}, foo0_v128u16_0, 7, 0), 0, 1, 2, 2);
+  return (union {v16u8 b;})
+{((union {
+  v64u8 a;
+  int b;
+})(v64u8)v64u16_3).b}.b + (v16u8)v16u16_1;
+}
-- 
2.31.1



[V2 PATCH] Simplify vector ((VCE (a cmp b ? -1 : 0)) < 0) ? c : d to just (VCE ((a cmp b) ? (VCE c) : (VCE d))).

2023-11-16 Thread liuhongt
Update in V2:
1) Add some comments before the pattern.
2) Remove ? from view_convert.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

While working on PR112443, I noticed some misoptimizations: after we
fold _mm{,256}_blendv_epi8/pd/ps into gimple, the backend fails to
combine it back to v{,p}blendv{b,ps,pd} since the pattern is too
complicated, so I think maybe we should handle it at the gimple
level.

The dump is like

  _1 = c_3(D) >= { 0, 0, 0, 0 };
  _2 = VEC_COND_EXPR <_1, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;
  _7 = VIEW_CONVERT_EXPR(_2);
  _8 = VIEW_CONVERT_EXPR(b_6(D));
  _9 = VIEW_CONVERT_EXPR(a_5(D));
  _10 = _7 < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  _11 = VEC_COND_EXPR <_10, _8, _9>;

It can be optimized to

  _1 = c_2(D) >= { 0, 0, 0, 0 };
  _6 = VEC_COND_EXPR <_1, b_5(D), a_4(D)>;

Since _7 is either -1 or 0, the selection _7 < 0 ? _8 : _9 is equal
to _1 ? b : a as long as the TYPE_PRECISION of the component type of
the second VEC_COND_EXPR is less than or equal to that of the first
one (a 32-bit -1/0 lane viewed as narrower lanes is still all -1/0
lanes, while the converse need not hold).
The patch adds a gimple pattern to handle that.

gcc/ChangeLog:

* match.pd (VCE (a cmp b ? -1 : 0) < 0) ? c : d ---> (VCE ((a
cmp b) ? (VCE:c) : (VCE:d))): New gimple simplication.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512vl-blendv-3.c: New test.
* gcc.target/i386/blendv-3.c: New test.
---
 gcc/match.pd  | 22 +
 .../gcc.target/i386/avx512vl-blendv-3.c   |  6 +++
 gcc/testsuite/gcc.target/i386/blendv-3.c  | 46 +++
 3 files changed, 74 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/blendv-3.c

diff --git a/gcc/match.pd b/gcc/match.pd
index dbc811b2b38..2a69622a300 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5170,6 +5170,28 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (optimize_vectors_before_lowering_p () && types_match (@0, @3))
   (vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
 
+/*  ((VCE (a cmp b ? -1 : 0)) < 0) ? c : d is just
+(VCE ((a cmp b) ? (VCE c) : (VCE d))) when TYPE_PRECISION of the
+component type of the outer vec_cond is greater equal the inner one.  */
+(for cmp (simple_comparison)
+ (simplify
+  (vec_cond
+(lt (view_convert@5 (vec_cond@6 (cmp@4 @0 @1)
+   integer_all_onesp
+   integer_zerop))
+ integer_zerop) @2 @3)
+  (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && VECTOR_INTEGER_TYPE_P (TREE_TYPE (@5))
+   && !TYPE_UNSIGNED (TREE_TYPE (@5))
+   && VECTOR_TYPE_P (TREE_TYPE (@6))
+   && VECTOR_TYPE_P (type)
+   && (TYPE_PRECISION (TREE_TYPE (type))
+ <= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@6
+   && TYPE_SIZE (type) == TYPE_SIZE (TREE_TYPE (@6)))
+   (with { tree vtype = TREE_TYPE (@6);}
+ (view_convert:type
+   (vec_cond @4 (view_convert:vtype @2) (view_convert:vtype @3)))
+
 /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
 (simplify
  (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
new file mode 100644
index 000..2777e72ab5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
+/* { dg-final { scan-assembler-not {vpcmp} } } */
+
+#include "blendv-3.c"
diff --git a/gcc/testsuite/gcc.target/i386/blendv-3.c b/gcc/testsuite/gcc.target/i386/blendv-3.c
new file mode 100644
index 000..fa0fb067a73
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blendv-3.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
+/* { dg-final { scan-assembler-not {vpcmp} } } */
+
+#include <immintrin.h>
+
+__m256i
+foo (__m256i a, __m256i b, __m256i c)
+{
+  return _mm256_blendv_epi8 (a, b, ~c < 0);
+}
+
+__m256d
+foo1 (__m256d a, __m256d b, __m256i c)
+{
+  __m256i d = ~c < 0;
+  return _mm256_blendv_pd (a, b, (__m256d)d);
+}
+
+__m256
+foo2 (__m256 a, __m256 b, __m256i c)
+{
+  __m256i d = ~c < 0;
+  return _mm256_blendv_ps (a, b, (__m256)d);
+}
+
+__m128i
+foo4 (__m128i a, __m128i b, __m128i c)
+{
+  return _mm_blendv_epi8 (a, b, ~c < 0);
+}
+
+__m128d
+foo5 (__m128d a, __m128d b, __m128i c)
+{
+  __m128i d = ~c < 0;
+  return _mm_blendv_pd (a, b, (__m128d)d);
+}
+
+__m128
+foo6 (__m128 a, __m128 b, __m128i c)
+{
+  __m128i d = ~c < 0;
+  return _mm_blendv_ps (a, b, (__m128)d);
+}
-- 
2.31.1



[PATCH 2/2] Add i?86-*-* and x86_64-*-* to vect_logical_reduc

2023-11-16 Thread liuhongt
The x86 backend supports reduc_{and,ior,xor}_scal_m for vector
integer modes.
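
For reference, the kind of reduction this enables is shown below (a
minimal sketch, not one of the new testcases):

  /* With backend patch 1/2, the vectorizer maps this OR-reduction
     onto .REDUC_IOR on x86, which is what vect_logical_reduc
     advertises.  */
  int
  ior_reduc (const int *a, int n)
  {
    int r = 0;
    for (int i = 0; i < n; i++)
      r |= a[i];
    return r;
  }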

Ok for trunk?

gcc/testsuite/ChangeLog:

* lib/target-supports.exp (vect_logical_reduc): Add i?86-*-*
and x86_64-*-*.
---
 gcc/testsuite/lib/target-supports.exp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index b6a2e4fd096..30dd39508f8 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -9257,7 +9257,8 @@ proc check_effective_target_vect_call_roundf { } {
 proc check_effective_target_vect_logical_reduc { } {
 return [expr { [check_effective_target_aarch64_sve]
   || [istarget amdgcn-*-*]
-  || [check_effective_target_riscv_v] }]
+  || [check_effective_target_riscv_v]
+  || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
 }
 
 # Return 1 if the target supports the fold_extract_last optab.
-- 
2.31.1



[PATCH 1/2] Support reduc_{plus, xor, and, ior}_scal_m for vector integer mode.

2023-11-16 Thread liuhongt
The BB vectorizer relies on backend support for
.REDUC_{PLUS,IOR,XOR,AND} to vectorize reductions.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/112325
* config/i386/sse.md (reduc__scal_): New expander.
(REDUC_ANY_LOGIC_MODE): New iterator.
(REDUC_PLUS_MODE): Extend to VxHI/SI/DImode.
(REDUC_SSE_PLUS_MODE): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr112325-1.c: New test.
* gcc.target/i386/pr112325-2.c: New test.
---
 gcc/config/i386/sse.md |  48 -
 gcc/testsuite/gcc.target/i386/pr112325-1.c | 116 +
 gcc/testsuite/gcc.target/i386/pr112325-2.c |  38 +++
 3 files changed, 199 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112325-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112325-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d250a6cb802..f94a77d0b6d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3417,7 +3417,9 @@ (define_insn "sse3_hv4sf3"
 
 (define_mode_iterator REDUC_SSE_PLUS_MODE
  [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE")
-  (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")])
+  (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+  (V8HI "TARGET_SSE2") (V4SI "TARGET_SSE2")
+  (V2DI "TARGET_SSE2")])
 
 (define_expand "reduc_plus_scal_"
  [(plus:REDUC_SSE_PLUS_MODE
@@ -3458,8 +3460,12 @@ (define_mode_iterator REDUC_PLUS_MODE
   (V8DF "TARGET_AVX512F && TARGET_EVEX512")
   (V16SF "TARGET_AVX512F && TARGET_EVEX512")
   (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL && TARGET_EVEX512")
-  (V32QI "TARGET_AVX")
-  (V64QI "TARGET_AVX512F && TARGET_EVEX512")])
+  (V32QI "TARGET_AVX") (V16HI "TARGET_AVX")
+  (V8SI "TARGET_AVX")  (V4DI "TARGET_AVX")
+  (V64QI "TARGET_AVX512F && TARGET_EVEX512")
+  (V32HI "TARGET_AVX512F && TARGET_EVEX512")
+  (V16SI "TARGET_AVX512F && TARGET_EVEX512")
+  (V8DI "TARGET_AVX512F && TARGET_EVEX512")])
 
 (define_expand "reduc_plus_scal_"
  [(plus:REDUC_PLUS_MODE
@@ -3597,6 +3603,42 @@ (define_insn 
"reduces"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_expand "reduc__scal_"
+ [(any_logic:VI_128
+(match_operand: 0 "register_operand")
+(match_operand:VI_128 1 "register_operand"))]
+ "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (mode);
+  ix86_expand_reduc (gen_3, tmp, operands[1]);
+  emit_insn (gen_vec_extract (operands[0],
+  tmp, const0_rtx));
+  DONE;
+})
+
+(define_mode_iterator REDUC_ANY_LOGIC_MODE
+ [(V32QI "TARGET_AVX") (V16HI "TARGET_AVX")
+  (V8SI "TARGET_AVX")  (V4DI "TARGET_AVX")
+  (V64QI "TARGET_AVX512F && TARGET_EVEX512")
+  (V32HI "TARGET_AVX512F && TARGET_EVEX512")
+  (V16SI "TARGET_AVX512F && TARGET_EVEX512")
+  (V8DI "TARGET_AVX512F && TARGET_EVEX512")])
+
+(define_expand "reduc__scal_"
+ [(any_logic:REDUC_ANY_LOGIC_MODE
+   (match_operand: 0 "register_operand")
+   (match_operand:REDUC_ANY_LOGIC_MODE 1 "register_operand"))]
+ ""
+{
+  rtx tmp = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx tmp3 = gen_lowpart (mode, operands[1]);
+  emit_insn (gen_3 (tmp2, tmp, tmp3));
+  emit_insn (gen_reduc__scal_ (operands[0], tmp2));
+  DONE;
+})
+
 ;
 ;;
 ;; Parallel floating point comparisons
diff --git a/gcc/testsuite/gcc.target/i386/pr112325-1.c b/gcc/testsuite/gcc.target/i386/pr112325-1.c
new file mode 100644
index 000..56e20c156f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112325-1.c
@@ -0,0 +1,116 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bw -O2 -mtune=generic 
-mprefer-vector-width=512 -fdump-tree-slp2" } */
+/* { dg-final { scan-tree-dump-times ".REDUC_PLUS" 3 "slp2" } } */
+/* { dg-final { scan-tree-dump-times ".REDUC_IOR" 4 "slp2" } } */
+
+int
+__attribute__((noipa))
+plus_v4si (int* a)
+{
+  int sum = 0;
+  sum += a[0];
+  sum += a[1];
+  sum += a[2];
+  sum += a[3];
+  return sum;
+}
+
+short
+__attribute__((noipa))
+plus_v8hi (short* a)
+{
+  short sum = 0;
+  sum += a[0];
+  sum += a[1];
+  sum += a[2];
+  sum += a[3];
+  sum += a[4];
+  sum += a[5];
+  sum += a[6];
+  sum += a[7];
+  return sum;
+}
+
+long long
+__attribute__((noipa))
+plus_v8di (long long* a)
+{
+  long long sum = 0;
+  sum += a[0];
+  sum += a[1];
+  sum += a[2];
+  sum += a[3];
+  sum += a[4];
+  sum += a[5];
+  sum += a[6];
+  sum += a[7];
+  return sum;
+}
+
+int
+__attribute__((noipa))
+ior_v4si (int* a)
+{
+  int sum = 0;
+  sum |= a[0];
+  sum |= a[1];
+  sum |= a[2];
+  sum |= a[3];
+  return sum;
+}
+
+short
+__attribute__((noipa))
+ior_v8hi (short* a)
+{
+  short sum = 0;
+  sum |= a[0];
+  sum |= a[1];
+  sum |= a[2];
+  sum |= a[3];
+  sum |= a[4];
+  sum |= a[5];
+  sum |= a[6];
+  sum |= a[7];
+  return sum;
+}
+
+long long
+__attribute__((noipa))
+ior_v8di (long long* a)
+

[PATCH] Support cbranchm for Vector HI/QImode.

2023-11-16 Thread liuhongt
The missing cbranchv*{hi,qi}4 patterns may be needed by early break
vectorization.
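
For illustration, early break vectorization turns a search loop like
the sketch below (hypothetical example, not a testcase from this
patch) into code that compares a whole vector chunk and branches on
the result, which is what cbranchv*{hi,qi}4 expands:

  /* With a V16QImode vectorization factor, the exit test becomes a
     16-byte vector compare followed by a conditional branch.  */
  int
  find_zero (const char *a, int n)
  {
    for (int i = 0; i < n; i++)
      if (a[i] == 0)
        return i;
    return -1;
  }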

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (cbranch4): Extend to Vector
HI/QImode.
---
 gcc/config/i386/sse.md | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d250a6cb802..3659660a616 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -514,6 +514,12 @@ (define_mode_iterator VI_AVX2
(V16SI "TARGET_AVX512F && TARGET_EVEX512") (V8SI "TARGET_AVX2") V4SI
(V8DI "TARGET_AVX512F && TARGET_EVEX512") (V4DI "TARGET_AVX2") V2DI])
 
+(define_mode_iterator VI_AVX_AVX512F
+  [(V64QI "TARGET_AVX512F && TARGET_EVEX512") (V32QI "TARGET_AVX") V16QI
+   (V32HI "TARGET_AVX512F && TARGET_EVEX512") (V16HI "TARGET_AVX") V8HI
+   (V16SI "TARGET_AVX512F && TARGET_EVEX512") (V8SI "TARGET_AVX") V4SI
+   (V8DI "TARGET_AVX512F && TARGET_EVEX512") (V4DI "TARGET_AVX") V2DI])
+
 ;; All QImode vector integer modes
 (define_mode_iterator VI1
   [(V32QI "TARGET_AVX") V16QI])
@@ -27868,8 +27874,8 @@ (define_insn "_store_mask"
 
 (define_expand "cbranch4"
   [(set (reg:CC FLAGS_REG)
-   (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
-   (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
+   (compare:CC (match_operand:VI_AVX_AVX512F 1 "register_operand")
+   (match_operand:VI_AVX_AVX512F 2 "nonimmediate_operand")))
(set (pc) (if_then_else
   (match_operator 0 "bt_comparison_operator"
[(reg:CC FLAGS_REG) (const_int 0)])
-- 
2.31.1



[PATCH] [x86] Support reduc_{and, ior, xor}_scal_m for V4HI/V8QI/V4QImode

2023-11-19 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/112325
* config/i386/i386-expand.cc (emit_reduc_half): Handle
V8QImode.
* config/i386/mmx.md (reduc__scal_): New expander.
(reduc__scal_v4qi): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr112325-mmx-1.c: New test.
---
 gcc/config/i386/i386-expand.cc|  1 +
 gcc/config/i386/mmx.md| 31 +-
 .../gcc.target/i386/pr112325-mmx-1.c  | 40 +++
 3 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112325-mmx-1.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a8d871d321e..fe56d2f6153 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -17748,6 +17748,7 @@ emit_reduc_half (rtx dest, rtx src, int i)
   tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
   GEN_INT (i / 2));
   break;
+case E_V8QImode:
 case E_V4HImode:
   d = gen_reg_rtx (V1DImode);
   tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 355538749d1..c77c9719e9a 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -120,13 +120,15 @@ (define_mode_attr mmxscalarmode
   [(V2SI "SI") (V2SF "SF")
(V4HF "HF") (V4BF "BF")
(V2HF "HF") (V2BF "BF")
-   (V4HI "HI") (V2HI "HI")])
+   (V4HI "HI") (V2HI "HI")
+   (V8QI "QI")])
 
 (define_mode_attr mmxscalarmodelower
   [(V2SI "si") (V2SF "sf")
(V4HF "hf") (V4BF "bf")
(V2HF "hf") (V2BF "bf")
-   (V4HI "hi") (V2HI "hi")])
+   (V4HI "hi") (V2HI "hi")
+   (V8QI "qi")])
 
 (define_mode_attr Yv_Yw
   [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")])
@@ -6094,6 +6096,31 @@ (define_insn "*mmx_psadbw"
(set_attr "type" "mmxshft,sseiadd,sseiadd")
(set_attr "mode" "DI,TI,TI")])
 
+(define_expand "reduc__scal_"
+ [(any_logic:MMXMODE12
+(match_operand: 0 "register_operand")
+(match_operand:MMXMODE12 1 "register_operand"))]
+ "TARGET_MMX_WITH_SSE"
+{
+  rtx tmp = gen_reg_rtx (mode);
+  ix86_expand_reduc (gen_3, tmp, operands[1]);
+  emit_insn (gen_vec_extract (operands[0],
+  tmp, const0_rtx));
+  DONE;
+})
+
+(define_expand "reduc__scal_v4qi"
+ [(any_logic:V4QI
+(match_operand:QI 0 "register_operand")
+(match_operand:V4QI 1 "register_operand"))]
+ "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (V4QImode);
+  ix86_expand_reduc (gen_v4qi3, tmp, operands[1]);
+  emit_insn (gen_vec_extractv4qiqi (operands[0], tmp, const0_rtx));
+  DONE;
+})
+
 (define_expand "reduc_plus_scal_v8qi"
  [(plus:V8QI
 (match_operand:QI 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr112325-mmx-1.c b/gcc/testsuite/gcc.target/i386/pr112325-mmx-1.c
new file mode 100644
index 000..887249fc6ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112325-mmx-1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse2 -O2 -fdump-tree-slp2" } */
+/* { dg-final { scan-tree-dump-times ".REDUC_IOR" 3 "slp2" } } */
+
+short
+foo1 (short* a)
+{
+  short sum = 0;
+  sum |= a[0];
+  sum |= a[1];
+  sum |= a[2];
+  sum |= a[3];
+  return sum;
+}
+
+char
+foo2 (char* a)
+{
+  char sum = 0;
+  sum |= a[0];
+  sum |= a[1];
+  sum |= a[2];
+  sum |= a[3];
+  sum |= a[4];
+  sum |= a[5];
+  sum |= a[6];
+  sum |= a[7];
+  return sum;
+}
+
+char
+foo3 (char* a)
+{
+  char sum = 0;
+  sum |= a[0];
+  sum |= a[1];
+  sum |= a[2];
+  sum |= a[3];
+  return sum;
+}
-- 
2.31.1



[PATCH] Set AVOID_256FMA_CHAINS TO m_GENERIC as it's generally good to new platforms

2023-11-21 Thread liuhongt
From: "Zhang, Annita" 

Avoid_fma_chain was enabled in m_SAPPHIRERAPIDS, m_ALDERLAKE and
m_CORE_HYBRID. It can also be enabled in m_GENERIC to improve the
performance of -march=x86-64-v3/v4 with the default -mtune=generic.
One SPEC2017 benchmark, 510.parest_r, improves greatly because of it.
From the experiments, single-thread runs with -O2 -march=x86-64-v3
improve by 26% on SPR and 15% on Zen3. Meanwhile, it didn't cause
notable regressions on previous platforms, including Cascade Lake and
Ice Lake Server.

On znver4, it looks like fadd (3 cycles) is still faster than fma (4
cycles). So in theory, avoid_fma_chain should also be better for
znver4. And according to [1], enabling fma chains is not a generic win
on znver4 anyway?

cut from [1]---
I also added X86_TUNE_AVOID_256FMA_CHAINS. Since fma has improved in
zen4 this flag may not be a win except for very specific benchmarks. I
am still doing some more detailed testing here.
-cut end--

[1] https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607962.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog

* config/i386/x86-tune.def (AVOID_256FMA_CHAINS): Add
m_GENERIC.
---
 gcc/config/i386/x86-tune.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 43fa9e8fd6d..a2e57e01550 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -521,7 +521,7 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", 
m_ZNVER1 | m_ZNVER2
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3
- | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM)
+ | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-- 
2.31.1



[PATCH] Take register pressure into account for vec_construct when the components are not loaded from memory.

2023-11-27 Thread liuhongt
For vec_construct, the components must be live at the same time if
they're not loaded from memory; when the number of those components
exceeds the available registers, spills happen. Try to account for
that with a rough estimation.
??? Ideally, we should have an overall estimation of register
pressure if we knew the live ranges of all variables.

The patch can avoid regressions due to, e.g., vec_construct with 32
chars.
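
As a concrete illustration (a reduced sketch, not the regressed
benchmark), every component below is a computed scalar rather than a
plain load, so all 32 values are live at the CONSTRUCTOR and compete
for the 16 GPRs; the hook now charges a scalar_store per component
beyond target_avail_regs:

  typedef char v32qi __attribute__ ((vector_size (32)));
  #define E(i) ((char) (a[(i)] * b[31 - (i)]))

  v32qi
  build (const char *a, const char *b)
  {
    return (v32qi) { E(0),  E(1),  E(2),  E(3),  E(4),  E(5),  E(6),  E(7),
                     E(8),  E(9),  E(10), E(11), E(12), E(13), E(14), E(15),
                     E(16), E(17), E(18), E(19), E(20), E(21), E(22), E(23),
                     E(24), E(25), E(26), E(27), E(28), E(29), E(30), E(31) };
  }
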
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Take
register pressure into account for vec_construct when the
components are not loaded from memory.
---
 gcc/config/i386/i386.cc | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 683ac643bc8..f8417555930 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24706,6 +24706,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
   unsigned i;
   tree op;
+  unsigned reg_needed = 0;
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
if (TREE_CODE (op) == SSA_NAME)
  TREE_VISITED (op) = 0;
@@ -24737,11 +24738,30 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  && (gimple_assign_rhs_code (def) != BIT_FIELD_REF
  || !VECTOR_TYPE_P (TREE_TYPE
(TREE_OPERAND (gimple_assign_rhs1 (def), 0))
-   stmt_cost += ix86_cost->sse_to_integer;
+   {
+ stmt_cost += ix86_cost->sse_to_integer;
+ reg_needed++;
+   }
}
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
if (TREE_CODE (op) == SSA_NAME)
  TREE_VISITED (op) = 0;
+
+  /* For vec_construct, the components must be live at the same time if
+they're not loaded from memory, when the number of those components
+exceeds available registers, spill happens. Try to account that with a
+rough estimation. Currently only handle integral modes since scalar fp
+shares sse_regs with vectors.
+??? Ideally, we should have an overall estimation of register pressure
+if we know the live range of all variables.  */
+  if (!fp && kind == vec_construct
+ && reg_needed > target_avail_regs)
+   {
+ unsigned spill_cost = ix86_builtin_vectorization_cost (scalar_store,
+vectype,
+misalign);
+ stmt_cost += spill_cost * (reg_needed - target_avail_regs);
+   }
 }
   if (stmt_cost == -1)
 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
-- 
2.31.1



[PATCH] [x86] Support sdot_prodv*qi with emulation of sdot_prodv*hi.

2023-11-28 Thread liuhongt
Currently sdot_prodv*qi is available under TARGET_AVXVNNIINT8, but it
can be emulated by

 vec_unpacks_lo_v32qi
 vec_unpacks_lo_v32qi
 vec_unpacks_hi_v32qi
 vec_unpacks_hi_v32qi
 sdot_prodv16hi
 sdot_prodv16hi
 add3v8si

which is faster than the original

  vect_patt_39.11_48 = WIDEN_MULT_LO_EXPR ;
  vect_patt_39.11_49 = WIDEN_MULT_HI_EXPR ;
  vect_patt_38.14_54 = [vec_unpack_lo_expr] vect_patt_39.11_48;
  vect_patt_38.14_55 = [vec_unpack_hi_expr] vect_patt_39.11_48;
  vect_patt_38.14_56 = [vec_unpack_lo_expr] vect_patt_39.11_49;
  vect_patt_38.14_57 = [vec_unpack_hi_expr] vect_patt_39.11_49;
  vect_sum_15.15_59 = vect_patt_38.14_54 + vect_patt_38.14_55;
  vect_sum_15.15_60 = vect_patt_38.14_56 + vect_sum_15.15_59;
  vect_sum_15.15_61 = vect_patt_38.14_57 + vect_sum_15.15_60;
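
The source-level shape that reaches this expander is a plain signed
dot product (a sketch, not taken from the testsuite):

  /* The vectorizer recognizes this as DOT_PROD_EXPR and queries
     sdot_prodv*qi; without TARGET_AVXVNNIINT8 it is now emulated by
     the vec_unpacks + sdot_prodv16hi sequence listed at the top.  */
  int
  sdot (const signed char *a, const signed char *b, int n)
  {
    int sum = 0;
    for (int i = 0; i < n; i++)
      sum += a[i] * b[i];
    return sum;
  }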

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (sdot_prodv64qi): New expander.
(sseunpackmodelower): New mode attr.
(sdot_prod): Emulate sdot_prodv*qi with sdot_prodv*hi
when TARGET_AVXVNNIINT8 is not available.

gcc/testsuite/ChangeLog:

* gcc.target/i386/sdotprodint8_emulate.c: New test.
---
 gcc/config/i386/sse.md| 87 ---
 .../gcc.target/i386/sdotprodint8_emulate.c| 15 
 2 files changed, 90 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sdotprodint8_emulate.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f94a77d0b6d..e29311d83cc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1291,6 +1291,11 @@ (define_mode_attr sseunpackmode
(V32QI "V16HI") (V16HI "V8SI") (V8SI "V4DI")
(V32HI "V16SI") (V64QI "V32HI") (V16SI "V8DI")])
 
+(define_mode_attr sseunpackmodelower
+  [(V16QI "v8hi") (V8HI "v4si") (V4SI "v2di")
+   (V32QI "v16hi") (V16HI "v8si") (V8SI "v4di")
+   (V32HI "v16si") (V64QI "v32hi") (V16SI "v8di")])
+
 (define_mode_attr ssepackmode
   [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI")
(V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
@@ -30742,20 +30747,78 @@ (define_int_attr vpdotprodtype
 
 (define_expand "sdot_prod"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI1 1 "register_operand")
-   (match_operand:VI1 2 "register_operand")
+   (match_operand:VI1_AVX2 1 "register_operand")
+   (match_operand:VI1_AVX2 2 "register_operand")
(match_operand: 3 "register_operand")]
-  "TARGET_AVXVNNIINT8"
+  "TARGET_SSE2"
 {
-  operands[1] = lowpart_subreg (mode,
-force_reg (mode, operands[1]),
-mode);
-  operands[2] = lowpart_subreg (mode,
-force_reg (mode, operands[2]),
-mode);
-  emit_insn (gen_rtx_SET (operands[0], operands[3]));
-  emit_insn (gen_vpdpbssd_ (operands[0], operands[3],
-  operands[1], operands[2]));
+  if (TARGET_AVXVNNIINT8)
+{
+  operands[1] = lowpart_subreg (mode,
+   force_reg (mode, operands[1]),
+   mode);
+  operands[2] = lowpart_subreg (mode,
+   force_reg (mode, operands[2]),
+   mode);
+  emit_insn (gen_rtx_SET (operands[0], operands[3]));
+  emit_insn (gen_vpdpbssd_ (operands[0], operands[3],
+ operands[1], operands[2]));
+}
+  else
+{
+  /* Emulate with vpdpwssd.  */
+  rtx op1_lo = gen_reg_rtx (mode);
+  rtx op1_hi = gen_reg_rtx (mode);
+  rtx op2_lo = gen_reg_rtx (mode);
+  rtx op2_hi = gen_reg_rtx (mode);
+
+  emit_insn (gen_vec_unpacks_lo_ (op1_lo, operands[1]));
+  emit_insn (gen_vec_unpacks_lo_ (op2_lo, operands[2]));
+  emit_insn (gen_vec_unpacks_hi_ (op1_hi, operands[1]));
+  emit_insn (gen_vec_unpacks_hi_ (op2_hi, operands[2]));
+
+  rtx res1 = gen_reg_rtx (mode);
+  rtx res2 = gen_reg_rtx (mode);
+  rtx sum = gen_reg_rtx (mode);
+
+  emit_move_insn (sum, CONST0_RTX (mode));
+  emit_insn (gen_sdot_prod (res1, op1_lo,
+   op2_lo, sum));
+  emit_insn (gen_sdot_prod (res2, op1_hi,
+   op2_hi, operands[3]));
+  emit_insn (gen_add3 (operands[0], res1, res2));
+}
+
+  DONE;
+})
+
+(define_expand "sdot_prodv64qi"
+  [(match_operand:V16SI 0 "register_operand")
+   (match_operand:V64QI 1 "register_operand")
+   (match_operand:V64QI 2 "register_operand")
+   (match_operand:V16SI 3 "register_operand")]
+  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
+{
+  /* Emulate with vpdpwssd.  */
+  rtx op1_lo = gen_reg_rtx (V32HImode);
+  rtx op1_hi = gen_reg_rtx (V32HImode);
+  rtx op2_lo = gen_reg_rtx (V32HImode);
+  rtx op2_hi = gen_reg_rtx (V32HImode);
+
+  emit_insn (gen_vec_unpacks_lo_v64qi (op1_lo, operands[1]));
+  emit_insn (gen_vec_unpack

[PATCH] Use vec_extact_lo instead of subreg in reduc__scal_m.

2023-11-29 Thread liuhongt
The loop vectorizer will use vec_perm to select the lower part of a
vector; there could be some redundancy when using subreg in
reduc__scal_m, because rtl cse can't figure out that a vec_select of
the lower part is just a subreg.

I'm trying to canonicalize vec_select to subreg like aarch64 did, but
there are so many regressions; some are easy to fix, some require
middle-end adjustments.

So for simplicity, the patch uses vec_select instead of subreg in
reduc__scal_m.
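
For example, on V8SImode the two forms below describe the same lower
half, but rtl cse treats them as distinct expressions (illustrative
RTL, not taken from a dump):

  (subreg:V4SI (reg:V8SI x) 0)
  (vec_select:V4SI (reg:V8SI x)
    (parallel [(const_int 0) (const_int 1)
               (const_int 2) (const_int 3)]))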

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (reduc_plus_scal_): Use
vec_extract_lo instead of subreg.
(reduc__scal_): Ditto.
(reduc__scal_): Ditto.
(reduc__scal_): Ditto.
(reduc__scal_): Ditto.
---
 gcc/config/i386/sse.md | 47 +++---
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 4f511693e3f..5e0e0e9e51f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3480,11 +3480,12 @@ (define_expand "reduc_plus_scal_"
  ""
 {
   rtx tmp = gen_reg_rtx (mode);
-  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
   rtx tmp2 = gen_reg_rtx (mode);
-  rtx tmp3 = gen_lowpart (mode, operands[1]);
-  emit_insn (gen_add3 (tmp2, tmp, tmp3));
-  emit_insn (gen_reduc_plus_scal_ (operands[0], tmp2));
+  rtx tmp3 = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
+  emit_insn (gen_vec_extract_lo_ (tmp2, operands[1]));
+  emit_insn (gen_add3 (tmp3, tmp, tmp2));
+  emit_insn (gen_reduc_plus_scal_ (operands[0], tmp3));
   DONE;
 })
 
@@ -3528,11 +3529,12 @@ (define_expand "reduc__scal_"
   ""
 {
   rtx tmp = gen_reg_rtx (mode);
-  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
   rtx tmp2 = gen_reg_rtx (mode);
-  emit_insn (gen_3
-(tmp2, tmp, gen_lowpart (mode, operands[1])));
-  emit_insn (gen_reduc__scal_ (operands[0], tmp2));
+  rtx tmp3 = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
+  emit_insn (gen_vec_extract_lo_ (tmp2, operands[1]));
+  emit_insn (gen_3 (tmp3, tmp, tmp2));
+  emit_insn (gen_reduc__scal_ (operands[0], tmp3));
   DONE;
 })
 
@@ -3543,11 +3545,12 @@ (define_expand "reduc__scal_"
   "TARGET_AVX512F"
 {
   rtx tmp = gen_reg_rtx (mode);
-  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
   rtx tmp2 = gen_reg_rtx (mode);
-  emit_insn (gen_3
-(tmp2, tmp, gen_lowpart (mode, operands[1])));
-  emit_insn (gen_reduc__scal_ (operands[0], tmp2));
+  rtx tmp3 = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
+  emit_insn (gen_vec_extract_lo_ (tmp2, operands[1]));
+  emit_insn (gen_3 (tmp3, tmp, tmp2));
+  emit_insn (gen_reduc__scal_ (operands[0], tmp3));
   DONE;
 })
 
@@ -3558,14 +3561,15 @@ (define_expand "reduc__scal_"
   "TARGET_AVX2"
 {
   rtx tmp = gen_reg_rtx (mode);
-  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
   rtx tmp2 = gen_reg_rtx (mode);
-  emit_insn (gen_3
-(tmp2, tmp, gen_lowpart (mode, operands[1])));
   rtx tmp3 = gen_reg_rtx (mode);
-  ix86_expand_reduc (gen_3, tmp3, tmp2);
+  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
+  emit_insn (gen_vec_extract_lo_ (tmp2, operands[1]));
+  emit_insn (gen_3 (tmp3, tmp, tmp2));
+  rtx tmp4 = gen_reg_rtx (mode);
+  ix86_expand_reduc (gen_3, tmp4, tmp3);
   emit_insn (gen_vec_extract
-   (operands[0], tmp3, const0_rtx));
+   (operands[0], tmp4, const0_rtx));
   DONE;
 })
 
@@ -3637,11 +3641,12 @@ (define_expand "reduc__scal_"
  ""
 {
   rtx tmp = gen_reg_rtx (mode);
-  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
   rtx tmp2 = gen_reg_rtx (mode);
-  rtx tmp3 = gen_lowpart (mode, operands[1]);
-  emit_insn (gen_3 (tmp2, tmp, tmp3));
-  emit_insn (gen_reduc__scal_ (operands[0], tmp2));
+  rtx tmp3 = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
+  emit_insn (gen_vec_extract_lo_ (tmp2, operands[1]));
+  emit_insn (gen_3 (tmp3, tmp, tmp2));
+  emit_insn (gen_reduc__scal_ (operands[0], tmp3));
   DONE;
 })
 
-- 
2.31.1



[PATCH V2] Fix wrong cost of MEM when addr is a lea.

2024-06-26 Thread liuhongt
> But rtx_cost invokes targetm.rtx_cost which allows to avoid that
> recursive processing at any level.  You're dealing with MEM [addr]
> here, so why's rtx_cost (addr, Pmode, MEM, 0, speed) not always
> the best way to deal with this?  Since this is the MEM [addr] case
> we know it's not LEA, no?
The patch restricts the MEM rtx_cost reduction to register_operand + disp.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
That commit adjusted the rtx_cost of MEM to reduce the cost of
(add op0 disp). But the cost of ADDR could be cheaper than that of
XEXP (addr, 0) when ADDR is a LEA. That is the case in the PR; the
patch adjusts rtx_cost to only handle reg + disp, since the other
forms are basically all LEAs, which have no additional cost for the
ADD.
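
Concretely, only the first address shape below keeps the reduced
cost; the second is LEA-shaped, so its inner PLUS carries no extra
ADD (illustrative RTL, not from the patch):

  (mem (plus (reg A) (const_int 3996)))            ;; reg + disp
  (mem (plus (mult (reg A) (const_int 4))
             (symbol_ref "p1")))                   ;; LEA form, unchanged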

gcc/ChangeLog:

PR target/115462
* config/i386/i386.cc (ix86_rtx_costs): Make cost of MEM (reg +
disp) just a little bit more than MEM (reg).

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr115462.c: New test.
---
 gcc/config/i386/i386.cc  |  5 -
 gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++
 2 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115462.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ccc24be6e..ef2a1e4f4f2 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22339,7 +22339,10 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 address_cost should be used, but it reduce cost too much.
 So current solution is make constant disp as cheap as possible.  */
  if (GET_CODE (addr) == PLUS
- && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode)
+	 /* Only handle (reg + disp) since other forms of addr are mostly LEA,
+	    there's no additional cost for the plus of disp.  */
+ && register_operand (XEXP (addr, 0), Pmode))
{
  *total += 1;
  *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c b/gcc/testsuite/gcc.target/i386/pr115462.c
new file mode 100644
index 000..ad50a6382bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115462.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
+/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } } */
+
+int
+foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, 
long n, int* q)
+{
+  static int p1[1];
+  int* p2 = p1 + 1000;
+  int* p3 = p1 + 4000;
+  int* p4 = p1 + 8000;
+
+  for (long i = 0; i != n; i++)
+{
+  /* scan for movl %edi, p1.0+3996(,%rax,4),
+	 p1.0+3996 should be propagated into the loop.  */
+  p2[indx++] = q[indx++];
+  p3[indx2++] = q[indx2++];
+  p4[indx3++] = q[indx3++];
+}
+  return p1[indx6] + p1[indx5];
+}
-- 
2.31.1



[PATCH 3/7] [x86] Match IEEE min/max with UNSPEC_IEEE_{MIN,MAX}.

2024-06-27 Thread liuhongt
These versions of the min/max patterns implement exactly the operations
   min = (op1 < op2 ? op1 : op2)
   max = (!(op1 < op2) ? op1 : op2)
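
In scalar C terms, the matched semantics are the ones below; the
asymmetric NaN behaviour (an unordered compare selects op2) is
exactly what the SSE min/max instructions implement (a sketch, not a
testcase from this series):

  /* When either operand is a NaN, op1 < op2 is false, so op2 is
     returned, matching the minps/maxps operand order.  */
  float min_ieee (float op1, float op2) { return op1 < op2 ? op1 : op2; }
  float max_ieee (float op1, float op2) { return !(op1 < op2) ? op1 : op2; }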

gcc/ChangeLog:
PR target/115517
* config/i386/sse.md (*minmax3_1): New pre_reload
define_insn_and_split.
(*minmax3_2): Ditto.
---
 gcc/config/i386/sse.md | 63 ++
 1 file changed, 63 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 822159a869b..92f8b74999f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3064,6 +3064,69 @@ (define_insn 
"*3"
(set_attr "prefix" "")
(set_attr "mode" "")])
 
+(define_insn_and_split "*minmax3_1"
+  [(set (match_operand:VFH 0 "register_operand")
+   (vec_merge:VFH
+ (match_operand:VFH 1 "nonimmediate_operand")
+ (match_operand:VFH 2 "nonimmediate_operand")
+ (unspec:
+   [(match_operand:VFH 3 "nonimmediate_operand")
+(match_operand:VFH 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_31_operand")]
+UNSPEC_PCMP)))]
+  "TARGET_SSE && ix86_pre_reload_split ()
+   && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+   || (rtx_equal_p (operands[1], operands[4])
+  && rtx_equal_p (operands[2], operands[3])))
+   && (INTVAL (operands[5]) == 1 || INTVAL (operands[5]) == 14)"
+   "#"
+   "&& 1"
+   [(const_int 0)]
+ {
+   int u = UNSPEC_IEEE_MIN;
+   if ((INTVAL (operands[5]) == 1 && rtx_equal_p (operands[1], operands[4]))
+   || (INTVAL (operands[5]) == 14 && rtx_equal_p (operands[1], 
operands[3])))
+ u = UNSPEC_IEEE_MAX;
+
+   if (MEM_P (operands[1]))
+ operands[1] = force_reg (mode, operands[1]);
+   rtvec v = gen_rtvec (2, operands[1], operands[2]);
+   rtx tmp = gen_rtx_UNSPEC (mode, v, u);
+   emit_move_insn (operands[0], tmp);
+   DONE;
+ })
+
+(define_insn_and_split "*minmax3_2"
+  [(set (match_operand:VF_128_256 0 "register_operand")
+   (unspec:VF_128_256
+ [(match_operand:VF_128_256 1 "nonimmediate_operand")
+  (match_operand:VF_128_256 2 "nonimmediate_operand")
+  (lt:VF_128_256
+(match_operand:VF_128_256 3 "nonimmediate_operand")
+(match_operand:VF_128_256 4 "nonimmediate_operand"))]
+UNSPEC_BLENDV))]
+  "TARGET_SSE && ix86_pre_reload_split ()
+   && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+   || (rtx_equal_p (operands[1], operands[4])
+  && rtx_equal_p (operands[2], operands[3])))"
+   "#"
+   "&& 1"
+   [(const_int 0)]
+ {
+   int u = UNSPEC_IEEE_MIN;
+   if (rtx_equal_p (operands[1], operands[3]))
+ u = UNSPEC_IEEE_MAX;
+
+   if (MEM_P (operands[2]))
+ force_reg (mode, operands[2]);
+   rtvec v = gen_rtvec (2, operands[2], operands[1]);
+   rtx tmp = gen_rtx_UNSPEC (mode, v, u);
+   emit_move_insn (operands[0], tmp);
+   DONE;
+ })
+
 ;; These versions of the min/max patterns implement exactly the operations
 ;;   min = (op1 < op2 ? op1 : op2)
 ;;   max = (!(op1 < op2) ? op1 : op2)
-- 
2.31.1



[PATCH 0/7][x86] Remove vcond{,u,eq} expanders.

2024-06-27 Thread liuhongt
There are several regressions after obsoleting vcond{,u,eq}.
Some regressions are due to the direct optimizations in
ix86_expand_{fp,int}_vcond, i.e. ix86_expand_sse_fp_minmax.
Some regressions are due to optimizations that rely on the
canonicalization in ix86_expand_{fp,int}_vcond.

This series adds define_split or define_insn_and_split patterns to
restore those optimizations at pass_combine. It fixes most
regressions in the GCC testsuite except for ones compiled w/o sse4.1.
W/o sse4.1 it takes 3 instructions for a vector conditional move, and
pass_combine only supports combinations of at most 4 instructions.
One possible solution is to add fake "ssemovcc" instructions to help
combine, and split them back to real instructions afterwards. This
series doesn't handle that, but just adjusts the testcases to XFAIL.

I also tested performance on SPEC2017 with different option sets:
-march=sapphirerapids -O2
-march=x86-64-v3 -O2
-march=x86-64 -O2
I didn't observe obvious performance changes; the binaries are mostly
the same.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

liuhongt (7):
  [x86] Add more splitters to match (unspec [op1 op2 (gt op3
constm1_operand)] UNSPEC_BLENDV)
  Lower AVX512 kmask comparison back to AVX2 comparison when
op_{true,false} is vector -1/0.
  [x86] Match IEEE min/max with UNSPEC_IEEE_{MIN,MAX}.
  Add more splitter for mskmov with avx512 comparison.
  Adjust testcase for the regressed testcases after obsolete of
vcond{,u,eq}.
  [x86] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.
  Remove vcond{,u,eq} expanders since they will be obsolete.

 gcc/config/i386/mmx.md| 149 ++--
 gcc/config/i386/sse.md| 772 +-
 gcc/testsuite/g++.target/i386/avx2-pr115517.C |  60 ++
 .../g++.target/i386/avx512-pr115517.C |  70 ++
 gcc/testsuite/g++.target/i386/pr100637-1b.C   |   4 +-
 gcc/testsuite/g++.target/i386/pr100637-1w.C   |   4 +-
 gcc/testsuite/g++.target/i386/pr103861-1.C|   4 +-
 .../g++.target/i386/sse4_1-pr100637-1b.C  |  17 +
 .../g++.target/i386/sse4_1-pr100637-1w.C  |  17 +
 .../g++.target/i386/sse4_1-pr103861-1.C   |  17 +
 gcc/testsuite/gcc.target/i386/avx2-pr115517.c |  33 +
 .../gcc.target/i386/avx512-pr115517.c |  70 ++
 gcc/testsuite/gcc.target/i386/pr103941-2.c|   2 +-
 gcc/testsuite/gcc.target/i386/pr111023-2.c|   4 +-
 gcc/testsuite/gcc.target/i386/pr88540.c   |   4 +-
 .../gcc.target/i386/sse4_1-pr88540.c  |  10 +
 gcc/testsuite/gcc.target/i386/vect-div-1.c|   3 +-
 17 files changed, 918 insertions(+), 322 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/avx2-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/avx512-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1b.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1w.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr103861-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4_1-pr88540.c

-- 
2.31.1



[PATCH 1/7] [x86] Add more splitters to match (unspec [op1 op2 (gt op3 constm1_operand)] UNSPEC_BLENDV)

2024-06-27 Thread liuhongt
These define_insn_and_split are needed after vcond{,u,eq} is obsolete.

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md
(*_blendv_gt): New
define_insn_and_split.
(*_blendv_gtint):
Ditto.
(*_blendv_not_gtint):
Ditto.
(*_pblendvb_gt): Ditto.
(*_pblendvb_gt_subreg_not): Ditto.
---
 gcc/config/i386/sse.md | 130 +
 1 file changed, 130 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0be2dcd8891..1148ac84f3d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -23016,6 +23016,32 @@ (define_insn_and_split 
"*_blendv_lt"
(set_attr "btver2_decode" "vector,vector,vector") 
(set_attr "mode" "")])
 
+(define_insn_and_split "*_blendv_gt"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
+   (unspec:VF_128_256
+ [(match_operand:VF_128_256 1 "vector_operand" "Yrja,*xja,xjm")
+  (match_operand:VF_128_256 2 "register_operand" "0,0,x")
+  (gt:VF_128_256
+(match_operand: 3 "register_operand" "Yz,Yz,x")
+(match_operand: 4 "vector_all_ones_operand"))]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:VF_128_256
+[(match_dup 2) (match_dup 1) (match_dup 3)] UNSPEC_BLENDV))]
+  "operands[3] = gen_lowpart (mode, operands[3]);"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "addr" "gpr16")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector")
+   (set_attr "mode" "")])
+
 (define_mode_attr ssefltmodesuffix
   [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")
(V2DF "pd") (V4DF "pd") (V4SF "ps") (V8SF "ps")])
@@ -23055,6 +23081,38 @@ (define_insn_and_split 
"*_blendv_ltint"
(set_attr "btver2_decode" "vector,vector,vector") 
(set_attr "mode" "")])
 
+(define_insn_and_split 
"*_blendv_gtint"
+  [(set (match_operand: 0 "register_operand" "=Yr,*x,x")
+   (unspec:
+ [(match_operand: 1 "vector_operand" "Yrja,*xja,xjm")
+  (match_operand: 2 "register_operand" "0,0,x")
+  (subreg:
+(gt:VI48_AVX
+  (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x")
+  (match_operand:VI48_AVX 4 "vector_all_ones_operand")) 0)]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:
+[(match_dup 2) (match_dup 1) (match_dup 3)] UNSPEC_BLENDV))]
+{
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[1] = gen_lowpart (mode, operands[1]);
+  operands[2] = gen_lowpart (mode, operands[2]);
+  operands[3] = gen_lowpart (mode, operands[3]);
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "addr" "gpr16")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector")
+   (set_attr "mode" "")])
+
 ;; PR target/100738: Transform vpcmpeqd + vpxor + vblendvps to vblendvps for 
inverted mask;
 (define_insn_and_split 
"*_blendv_not_ltint"
   [(set (match_operand: 0 "register_operand")
@@ -23082,6 +23140,32 @@ (define_insn_and_split 
"*_blendv_not_lt
   operands[3] = gen_lowpart (mode, operands[3]);
 })
 
+(define_insn_and_split 
"*_blendv_not_gtint"
+  [(set (match_operand: 0 "register_operand")
+   (unspec:
+ [(match_operand: 1 "vector_operand")
+  (match_operand: 2 "register_operand")
+  (subreg:
+(gt:VI48_AVX
+  (subreg:VI48_AVX
+  (not:
+(match_operand: 3 "register_operand")) 0)
+  (match_operand:VI48_AVX 4 "vector_all_ones_operand")) 0)]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:
+[(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
+{
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[2] = gen_lowpart (mode, operands[2]);
+  operands[1] = force_reg (mode,
+  gen_lowpart (mode, operands[1]));
+  operands[3] = gen_lowpart (mode, operands[3]);
+})
+
 (define_insn "_dp"
   [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
(unspec:VF_128_256
@@ -23236,6 +23320,30 @@ (define_insn_and_split "*_pblendvb_lt"
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_pblendvb_gt"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x")
+   (unspec:VI1_AVX2
+ [(match_operand:VI1_AVX2 1 "vector_operand" "Yrja,*xja,xjm")
+  (match_operand:VI1_AVX2 2 "register_operand" "0,0,x")
+  (gt:VI1_AVX2 (match_operand:VI

[PATCH 6/7] [x86] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

2024-06-27 Thread liuhongt
Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
and x < 0 ? 1 : 0 into (unsigned) x >> 31.
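
In scalar terms the two transforms are (a sketch; the new patterns do
the same lanewise on vectors, with 31 replaced by the element
precision minus one):

  int f1 (int x) { return x < 0 ? -1 : 0; } /* (signed) x >> 31    */
  int f2 (int x) { return x < 0 ?  1 : 0; } /* (unsigned) x >> 31  */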

Add define_insn_and_split patterns for the optimizations previously
done in ix86_expand_int_vcond.

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md ("*ashr3_1"): New
define_insn_and_split.
(*avx512_ashr3_1): Ditto.
(*avx2_lshr3_1): Ditto.
(*avx2_lshr3_2): Ditto and add 2 combine splitter after
it.
* config/i386/mmx.md (mmxscalarsize): New mode attribute.
(*mmx_ashr3_1): New define_insn_and_split.
(mmx_3): Add a combine splitter after it.
(*mmx_ashrv2hi3_1): New define_insn_and_split, also add a
combine splitter after it.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-pr115517.c: New test.
* gcc.target/i386/avx512-pr115517.c: New test.
* g++.target/i386/avx2-pr115517.C: New test.
* g++.target/i386/avx512-pr115517.C: New test.
* gcc.target/i386/pr111023-2.c: Adjust testcase.
* gcc.target/i386/vect-div-1.c: Ditto.
---
 gcc/config/i386/mmx.md| 52 
 gcc/config/i386/sse.md| 83 +++
 gcc/testsuite/g++.target/i386/avx2-pr115517.C | 60 ++
 .../g++.target/i386/avx512-pr115517.C | 70 
 gcc/testsuite/gcc.target/i386/avx2-pr115517.c | 33 
 .../gcc.target/i386/avx512-pr115517.c | 70 
 gcc/testsuite/gcc.target/i386/pr111023-2.c|  4 +-
 gcc/testsuite/gcc.target/i386/vect-div-1.c|  3 +-
 8 files changed, 372 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/avx2-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/avx512-pr115517.C
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512-pr115517.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index ea53f516cbb..7262bf146c2 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -135,6 +135,14 @@ (define_mode_attr mmxscalarmodelower
(V4HI "hi") (V2HI "hi")
(V8QI "qi")])
 
+(define_mode_attr mmxscalarsize
+  [(V1DI "64")
+   (V2SI "32") (V2SF "32")
+   (V4HF "16") (V4BF "16")
+   (V2HF "16") (V2BF "16")
+   (V4HI "16") (V2HI "16")
+   (V8QI "8")])
+
 (define_mode_attr Yv_Yw
   [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")])
 
@@ -3608,6 +3616,17 @@ (define_insn "mmx_ashr3"
(const_string "0")))
(set_attr "mode" "DI,TI,TI")])
 
+(define_insn_and_split "*mmx_ashr3_1"
+  [(set (match_operand:MMXMODE24 0 "register_operand")
+   (lt:MMXMODE24
+ (match_operand:MMXMODE24 1 "register_operand")
+ (match_operand:MMXMODE24 2 "const0_operand")))]
+  "TARGET_MMX_WITH_SSE && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (ashiftrt:MMXMODE24 (match_dup 1) (match_dup 3)))]
+  "operands[3] = gen_int_mode ( - 1, DImode);")
+
 (define_expand "ashr3"
   [(set (match_operand:MMXMODE24 0 "register_operand")
 (ashiftrt:MMXMODE24
@@ -3634,6 +3653,17 @@ (define_insn "mmx_3"
(const_string "0")))
(set_attr "mode" "DI,TI,TI")])
 
+(define_split
+  [(set (match_operand:MMXMODE248 0 "register_operand")
+   (and:MMXMODE248
+ (lt:MMXMODE248
+   (match_operand:MMXMODE248 1 "register_operand")
+   (match_operand:MMXMODE248 2 "const0_operand"))
+ (match_operand:MMXMODE248 3 "const1_operand")))]
+  "TARGET_MMX_WITH_SSE && ix86_pre_reload_split ()"
+  [(set (match_dup 0) (lshiftrt:MMXMODE248 (match_dup 1) (match_dup 4)))]
+  "operands[4] = gen_int_mode ( - 1, DImode);")
+
 (define_expand "3"
   [(set (match_operand:MMXMODE24 0 "register_operand")
 (any_lshift:MMXMODE24
@@ -3675,6 +3705,28 @@ (define_insn "v2hi3"
(const_string "0")))
(set_attr "mode" "TI")])
 
+(define_insn_and_split "*mmx_ashrv2hi3_1"
+  [(set (match_operand:V2HI 0 "register_operand")
+   (lt:V2HI
+ (match_operand:V2HI 1 "register_operand")
+ (match_operand:V2HI 2 "const0_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (ashiftrt:V2HI (match_dup 1) (match_dup 3)))]
+  "operands[3] = gen_int_mode (15, DImode);")
+
+(define_split
+  [(set (match_operand:V2HI 0 "register_operand")
+   (and:V2HI
+ (lt:V2HI
+   (match_operand:V2HI 1 "register_operand")
+   (match_operand:V2HI 2 "const0_operand"))
+ (match_operand:V2HI 3 "const1_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  [(set (match_dup 0) (lshiftrt:V2HI (match_dup 1) (match_dup 4)))]
+  "operands[4] = gen_int_mode (15, DImode);")
+
 (define_expand "v8qi3"
   [(set (match_operand:V8QI 0 "register_operand")
(any_shift:V8QI (match_operand:V8QI 1 "register_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5996ad99606..d86b6fa81c0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1

[PATCH 4/7] Add more splitter for mskmov with avx512 comparison.

2024-06-27 Thread liuhongt
gcc/ChangeLog:

PR target/115517
* config/i386/sse.md
(*_movmsk_lt_avx512): New
define_insn_and_split.
(*_movmsk_ext_lt_avx512):
Ditto.
(*_pmovmskb_lt_avx512): Ditto.
(*_pmovmskb_zext_lt_avx512): Ditto.
(*sse2_pmovmskb_ext_lt_avx512): Ditto.
(*pmovsk_kmask_v16qi_avx512): Ditto.
(*pmovsk_mask_v32qi_avx512): Ditto.
(*pmovsk_mask_cmp__avx512): Ditto.
(*pmovsk_ptest__avx512): Ditto.
---
 gcc/config/i386/sse.md | 232 +
 1 file changed, 209 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 92f8b74999f..5996ad99606 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10049,24 +10049,6 @@ (define_insn "*_cvtmask2"
   [(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn_and_split "*_cvtmask2_not"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
-   (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 2 "const0_operand")
- (match_operand:VI12_AVX512VL 3 "vector_all_ones_operand")
- (match_operand: 1 "register_operand")))]
-  "TARGET_AVX512BW && ix86_pre_reload_split ()"
-  "#"
-  "&& 1"
-  [(set (match_dup 4)
-   (not: (match_dup 1)))
-   (set (match_dup 0)
-   (vec_merge:VI12_AVX512VL
- (match_dup 3)
- (match_dup 2)
- (match_dup 4)))]
-  "operands[4] = gen_reg_rtx (mode);")
-
 (define_expand "_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
@@ -10106,10 +10088,10 @@ (define_insn_and_split 
"*_cvtmask2"
(set_attr "mode" "")])
 
 (define_insn_and_split "*_cvtmask2_not"
-  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
-   (vec_merge:VI48_AVX512VL
- (match_operand:VI48_AVX512VL 2 "const0_operand")
- (match_operand:VI48_AVX512VL 3 "vector_all_ones_operand")
+  [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand")
+   (vec_merge:VI1248_AVX512VLBW
+ (match_operand:VI1248_AVX512VLBW 2 "const0_operand")
+ (match_operand:VI1248_AVX512VLBW 3 "vector_all_ones_operand")
  (match_operand: 1 "register_operand")))]
   "TARGET_AVX512F && ix86_pre_reload_split ()"
   "#"
@@ -10117,7 +10099,7 @@ (define_insn_and_split 
"*_cvtmask2_not"
   [(set (match_dup 4)
(not: (match_dup 1)))
(set (match_dup 0)
-   (vec_merge:VI48_AVX512VL
+   (vec_merge:VI1248_AVX512VLBW
  (match_dup 3)
  (match_dup 2)
  (match_dup 4)))]
@@ -21753,6 +21735,30 @@ (define_insn_and_split 
"*_movmsk_lt"
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_movmsk_lt_avx512"
+  [(set (match_operand:SI 0 "register_operand" "=r,jr")
+   (unspec:SI
+ [(subreg:VF_128_256
+   (vec_merge:
+(match_operand: 3 "vector_all_ones_operand")
+(match_operand: 4 "const0_operand")
+(unspec:
+ [(match_operand: 1 "register_operand" "x,x")
+  (match_operand: 2 "const0_operand")
+  (const_int 1)]
+ UNSPEC_PCMP)) 0)]
+ UNSPEC_MOVMSK))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  "operands[1] = gen_lowpart (mode, operands[1]);"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "*_movmsk_ext_lt"
   [(set (match_operand:DI 0 "register_operand" "=r,jr")
(any_extend:DI
@@ -21772,6 +21778,31 @@ (define_insn_and_split 
"*_movmsk_ext_lt"
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "")])
 
+(define_insn_and_split 
"*_movmsk_ext_lt_avx512"
+  [(set (match_operand:DI 0 "register_operand" "=r,jr")
+   (any_extend:DI
+ (unspec:SI
+   [(subreg:VF_128_256
+ (vec_merge:
+  (match_operand: 3 "vector_all_ones_operand")
+  (match_operand: 4 "const0_operand")
+  (unspec:
+   [(match_operand: 1 "register_operand" "x,x")
+(match_operand: 2 "const0_operand")
+(const_int 1)]
+   UNSPEC_PCMP)) 0)]
+   UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (any_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  "operands[1] = gen_lowpart (mode, operands[1]);"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "*_movmsk_shift"
   [(set (match_operand:SI 0 "register_operand" "=r,jr")
(unspec:SI
@@ -21961,6 +21992,34 @@ (define_insn_and_split "*_pmovmskb_lt"
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "SI")])
 
+(define_insn_and_split "*_pmovmskb_lt_avx512"
+  [(set (match_operand:SI 0 "regist

[PATCH 2/7] Lower AVX512 kmask comparison back to AVX2 comparison when op_{true, false} is vector -1/0.

2024-06-27 Thread liuhongt
gcc/ChangeLog
PR target/115517
* config/i386/sse.md
(*_cvtmask2_not): New pre_reload
splitter.
(*_cvtmask2_not): Ditto.
(*avx2_pcmp3_6): Ditto.
(*avx2_pcmp3_7): Ditto.
---
 gcc/config/i386/sse.md | 97 ++
 1 file changed, 97 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1148ac84f3d..822159a869b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -9986,6 +9986,24 @@ (define_insn "*_cvtmask2"
   [(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_cvtmask2_not"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
+   (vec_merge:VI12_AVX512VL
+ (match_operand:VI12_AVX512VL 2 "const0_operand")
+ (match_operand:VI12_AVX512VL 3 "vector_all_ones_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+   (not: (match_dup 1)))
+   (set (match_dup 0)
+   (vec_merge:VI12_AVX512VL
+ (match_dup 3)
+ (match_dup 2)
+ (match_dup 4)))]
+  "operands[4] = gen_reg_rtx (mode);")
+
 (define_expand "_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
@@ -10024,6 +10042,24 @@ (define_insn_and_split 
"*_cvtmask2"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_cvtmask2_not"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
+   (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "const0_operand")
+ (match_operand:VI48_AVX512VL 3 "vector_all_ones_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+   (not: (match_dup 1)))
+   (set (match_dup 0)
+   (vec_merge:VI48_AVX512VL
+ (match_dup 3)
+ (match_dup 2)
+ (match_dup 4)))]
+  "operands[4] = gen_reg_rtx (mode);")
+
 (define_insn "*_cvtmask2_pternlog_false_dep"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
(vec_merge:VI48_AVX512VL
@@ -17675,6 +17711,67 @@ (define_insn_and_split "*avx2_pcmp3_5"
 std::swap (operands[1], operands[2]);
 })
 
+(define_int_attr pcmp_usmin
+  [(UNSPEC_PCMP "smin") (UNSPEC_UNSIGNED_PCMP "umin")])
+
+(define_insn_and_split "*avx2_pcmp3_6"
+ [(set (match_operand:VI_128_256  0 "register_operand")
+   (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "vector_all_ones_operand")
+ (match_operand:VI_128_256 2 "const0_operand")
+ (unspec:
+   [(match_operand:VI_128_256 3 "nonimmediate_operand")
+(match_operand:VI_128_256 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_7_operand")]
+UNSPEC_PCMP_ITER)))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()
+   && (INTVAL (operands[5]) == 2 || INTVAL (operands[5]) == 5)"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dst_min = gen_reg_rtx (mode);
+
+  if (MEM_P (operands[3]) && MEM_P (operands[4]))
+operands[3] = force_reg (mode, operands[3]);
+  emit_insn (gen_3 (dst_min, operands[3], operands[4]));
+  rtx eq_op = INTVAL (operands[5]) == 2 ? operands[3] : operands[4];
+  emit_move_insn (operands[0], gen_rtx_EQ (mode, eq_op, dst_min));
+  DONE;
+})
+
+(define_insn_and_split "*avx2_pcmp3_7"
+ [(set (match_operand:VI_128_256  0 "register_operand")
+   (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "const0_operand")
+ (match_operand:VI_128_256 2 "vector_all_ones_operand")
+ (unspec:
+   [(match_operand:VI_128_256 3 "nonimmediate_operand")
+(match_operand:VI_128_256 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_7_operand")]
+UNSPEC_PCMP_ITER)))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()
+ /* NE is commutative.  */
+   && (INTVAL (operands[5]) == 4
+ /* LE, 3 must be register.  */
+   || INTVAL (operands[5]) == 2
+ /* NLT aka GE, 4 must be register and we swap operands.  */
+   || INTVAL (operands[5]) == 5)"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (INTVAL (operands[5]) == 5)
+std::swap (operands[3], operands[4]);
+
+  if (MEM_P (operands[3]))
+operands[3] = force_reg (mode, operands[3]);
+  enum rtx_code code = INTVAL (operands[5]) != 4 ? GT : EQ;
+  emit_move_insn (operands[0], gen_rtx_fmt_ee (code, mode,
+  operands[3], operands[4]));
+  DONE;
+})
+
 (define_expand "_eq3"
   [(set (match_operand: 0 "register_operand")
(unspec:
-- 
2.31.1



[PATCH 5/7] Adjust testcases regressed after obsoleting vcond{,u,eq}.

2024-06-27 Thread liuhongt
> Richard suggests that we implement the "obvious" transforms like
> inversion in the middle-end but if for example unsigned compares
> are not supported the us_minus + eq + negative trick isn't on
> that list.
>
> The main reason to restrict vec_cmp would be to avoid
> a <= b ? c : d going with an unsupported vec_cmp but instead
> do a > b ? d : c - the alternative is trying to fix this
> on the RTL side via combine.  I understand the non-native

Yes, I have a patch which can fix most regressions via pattern matching
in combine.
Still, there is one situation that is difficult to deal with, mainly the
optimization w/o sse4.1. Because pblendvb/blendvps/blendvpd only exist
under sse4.1, w/o sse4.1 it takes 3 instructions (pand, pandn, por) to
simulate vcond_mask, and combine matches at most 4 instructions, which
makes it currently impossible to use combine to recover those
optimizations from vcond{,u,eq}, i.e. min/max.

In the case of sse4.1 and above, there is basically no regression anymore.
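For context, a rough model of what vcond_mask has to expand to without
SSE4.1 (illustrative intrinsics, not the actual expander):

#include <emmintrin.h>

/* dst = (mask & t) | (~mask & f) -- the pand/pandn/por triple that
   replaces a single pblendvb when SSE4.1 is unavailable.  */
static inline __m128i
blend_sse2 (__m128i mask, __m128i t, __m128i f)
{
  return _mm_or_si128 (_mm_and_si128 (mask, t),
		       _mm_andnot_si128 (mask, f));
}

Together with the compare that produces mask, that is already four
instructions before any min/max pattern can form, which is what pushes the
sequence past combine's window.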

The regressed testcases w/o sse4.1:

FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++14  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++17  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++20  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++98  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++14  scan-assembler-times pcmpeqw 2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++17  scan-assembler-times pcmpeqw 2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++20  scan-assembler-times pcmpeqw 2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++98  scan-assembler-times pcmpeqw 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++14  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++17  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++20  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++98  scan-assembler-times pcmpeqb 2
FAIL: gcc.target/i386/pr88540.c scan-assembler minpd

gcc/testsuite/ChangeLog:

PR target/115517
* g++.target/i386/pr100637-1b.C: Add xfail and -mno-sse4.1.
* g++.target/i386/pr100637-1w.C: Ditto.
* g++.target/i386/pr103861-1.C: Ditto.
* gcc.target/i386/pr88540.c: Ditto.
* gcc.target/i386/pr103941-2.c: Add -mno-avx512f.
* g++.target/i386/sse4_1-pr100637-1b.C: New test.
* g++.target/i386/sse4_1-pr100637-1w.C: New test.
* g++.target/i386/sse4_1-pr103861-1.C: New test.
* gcc.target/i386/sse4_1-pr88540.c: New test.
---
 gcc/testsuite/g++.target/i386/pr100637-1b.C |  4 ++--
 gcc/testsuite/g++.target/i386/pr100637-1w.C |  4 ++--
 gcc/testsuite/g++.target/i386/pr103861-1.C  |  4 ++--
 .../g++.target/i386/sse4_1-pr100637-1b.C| 17 +
 .../g++.target/i386/sse4_1-pr100637-1w.C| 17 +
 .../g++.target/i386/sse4_1-pr103861-1.C | 17 +
 gcc/testsuite/gcc.target/i386/pr103941-2.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pr88540.c |  4 ++--
 gcc/testsuite/gcc.target/i386/sse4_1-pr88540.c  | 10 ++
 9 files changed, 70 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1b.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1w.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr103861-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4_1-pr88540.c

diff --git a/gcc/testsuite/g++.target/i386/pr100637-1b.C 
b/gcc/testsuite/g++.target/i386/pr100637-1b.C
index 35b5df7c9dd..dccb8f5e712 100644
--- a/gcc/testsuite/g++.target/i386/pr100637-1b.C
+++ b/gcc/testsuite/g++.target/i386/pr100637-1b.C
@@ -1,6 +1,6 @@
 /* PR target/100637 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-sse4.1" } */
 
 typedef unsigned char __attribute__((__vector_size__ (4))) __v4qu;
 typedef char __attribute__((__vector_size__ (4))) __v4qi;
@@ -13,5 +13,5 @@ __v4qu us (__v4qi a, __v4qi b) { return (a > b) ? au : bu; }
 __v4qi su (__v4qu a, __v4qu b) { return (a > b) ? as : bs; }
 __v4qi ss (__v4qi a, __v4qi b) { return (a > b) ? as : bs; }
 
-/* { dg-final { scan-assembler-times "pcmpeqb" 2 } } */
+/* { dg-final { scan-assembler-times "pcmpeqb" 2 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times "pcmpgtb" 2 } } */
diff --git a/gcc/testsuite/g++.target/i386/pr100637-1w.C 
b/gcc/testsuite/g++.target/i386/pr100637-1w.C
index a3ed06fddee..a0aab62db33 100644
--- a/gcc/testsuite/g++.target/i386/pr100637-1w.C
+++ b/gcc/testsuite/g++.target/i386/pr100637-1w.C
@@ -1,6 +1,6 @@
 /* PR target/100637 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-sse4.1" } */
 
 typedef unsigned short __attribute__((__vector_size__ (4))) __v2hu;
 typedef short __attribute__((__vector_size__ (4))) __v2hi;
@@ -13,5 +13,5 @@ __v2hu

[PATCH 7/7] Remove vcond{,u,eq} expanders since they are now obsolete.

2024-06-27 Thread liuhongt
gcc/ChangeLog:

PR target/115517
* config/i386/mmx.md (vcondv2sf): Removed.
(vcond): Ditto.
(vcond): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
* config/i386/sse.md (vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcondv2di): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
(vconduv2di): Ditto.
(vcondeqv2di): Ditto.
---
 gcc/config/i386/mmx.md |  97 ---
 gcc/config/i386/sse.md | 213 -
 2 files changed, 310 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 7262bf146c2..17c5205cae2 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1168,39 +1168,6 @@ (define_expand "vec_cmpv2sfv2si"
   DONE;
 })
 
-(define_expand "vcondv2sf"
-  [(set (match_operand:V2FI 0 "register_operand")
-   (if_then_else:V2FI
- (match_operator 3 ""
-   [(match_operand:V2SF 4 "nonimmediate_operand")
-(match_operand:V2SF 5 "nonimmediate_operand")])
- (match_operand:V2FI 1 "general_operand")
- (match_operand:V2FI 2 "general_operand")))]
-  "TARGET_MMX_WITH_SSE && ix86_partial_vec_fp_math"
-{
-  rtx ops[6];
-  ops[5] = gen_reg_rtx (V4SFmode);
-  ops[4] = gen_reg_rtx (V4SFmode);
-  ops[3] = gen_rtx_fmt_ee (GET_CODE (operands[3]), VOIDmode, ops[4], ops[5]);
-  ops[2] = lowpart_subreg (mode,
-  force_reg (mode, operands[2]),
-  mode);
-  ops[1] = lowpart_subreg (mode,
-  force_reg (mode, operands[1]),
-  mode);
-  ops[0] = gen_reg_rtx (mode);
-
-  emit_insn (gen_movq_v2sf_to_sse (ops[5], operands[5]));
-  emit_insn (gen_movq_v2sf_to_sse (ops[4], operands[4]));
-
-  bool ok = ix86_expand_fp_vcond (ops);
-  gcc_assert (ok);
-
-  emit_move_insn (operands[0], lowpart_subreg (mode, ops[0],
-  mode));
-  DONE;
-})
-
 (define_insn "@sse4_1_insertps_"
   [(set (match_operand:V2FI 0 "register_operand" "=Yr,*x,v")
(unspec:V2FI
@@ -4029,70 +3996,6 @@ (define_expand "vec_cmpu"
   DONE;
 })
 
-(define_expand "vcond"
-  [(set (match_operand:MMXMODE124 0 "register_operand")
-   (if_then_else:MMXMODE124
- (match_operator 3 ""
-   [(match_operand:MMXMODEI 4 "register_operand")
-(match_operand:MMXMODEI 5 "register_operand")])
- (match_operand:MMXMODE124 1)
- (match_operand:MMXMODE124 2)))]
-  "TARGET_MMX_WITH_SSE
-   && (GET_MODE_NUNITS (mode)
-   == GET_MODE_NUNITS (mode))"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcond"
-  [(set (match_operand:VI_16_32 0 "register_operand")
-   (if_then_else:VI_16_32
- (match_operator 3 ""
-   [(match_operand:VI_16_32 4 "register_operand")
-(match_operand:VI_16_32 5 "register_operand")])
- (match_operand:VI_16_32 1)
- (match_operand:VI_16_32 2)))]
-  "TARGET_SSE2"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcondu"
-  [(set (match_operand:MMXMODE124 0 "register_operand")
-   (if_then_else:MMXMODE124
- (match_operator 3 ""
-   [(match_operand:MMXMODEI 4 "register_operand")
-(match_operand:MMXMODEI 5 "register_operand")])
- (match_operand:MMXMODE124 1)
- (match_operand:MMXMODE124 2)))]
-  "TARGET_MMX_WITH_SSE
-   && (GET_MODE_NUNITS (mode)
-   == GET_MODE_NUNITS (mode))"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcondu"
-  [(set (match_operand:VI_16_32 0 "register_operand")
-   (if_then_else:VI_16_32
- (match_operator 3 ""
-   [(match_operand:VI_16_32 4 "register_operand")
-(match_operand:VI_16_32 5 "register_operand")])
- (match_operand:VI_16_32 1)
- (match_operand:VI_16_32 2)))]
-  "TARGET_SSE2"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
 (define_expand "vcond_mask_"
   [(set (match_operand:MMXMODE124 0 "register_operand")
(vec_merge:MMXMODE124
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d86b6fa81c0..2d6b39c920f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4816,72 +4816,6 @@ (define_expand "vec_cmpeqv1tiv1ti"
   DONE;
 })
 
-(define_expand "vcond"
-  [(set (match_operand:V_512 0 "register_operand")
-   (if_then_else:V_512
- (match_operator 3 ""
-   [(match_operand:VF_512 4 "nonimmediate_operand")
-(match_operand:VF_512 5 "nonimmediate_operand")])
- (match_operand:V_512 1 "general_operand")
- (match_operand:V_512 2 "general_operand")))]
-  "TARGET_AVX512F
-   && (GET_MODE_NUNITS (mode)
-   == GE

[PATCH] Fix native_encode_vector_part for itype when TYPE_PRECISION (itype) == BITS_PER_UNIT

2024-06-27 Thread liuhongt
For the testcase in PR115406, here is part of the dump.

  char D.4882;
  vector(1) <signed-boolean:8> _1;
  vector(1) signed char _2;
  char _5;

  <bb 2> :
  _1 = { -1 };

When assigning { -1 } to vector(1) <signed-boolean:8>,
since TYPE_PRECISION (itype) <= BITS_PER_UNIT, it sets one bit of the dest
for each vector element. But I think the bit setting should only apply when
TYPE_PRECISION (itype) < BITS_PER_UNIT, i.e. for vector(1)
<signed-boolean:8> the element should be encoded as -1 instead of 1.
Is there any specific reason vector(1) <signed-boolean:8> is handled
differently from vector(1) signed char?
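A worked byte-level example (my illustration) of encoding the constant
{ -1 }:

  vector(8) <signed-boolean:1>: precision 1 < BITS_PER_UNIT, elements are
    packed one bit per element (element 0 in the lsb), so { -1, 0, ... }
    encodes as the byte 0x01.
  vector(1) <signed-boolean:8>: precision 8 == BITS_PER_UNIT, the element
    occupies the whole byte, so { -1 } should encode as 0xff, exactly how
    vector(1) signed char { -1 } is encoded, rather than as 0x01.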

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR middle-end/115406
* fold-const.cc (native_encode_vector_part): Don't set each
bit to the dest when TYPE_PRECISION (itype) == BITS_PER_UNIT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115406.c: New test.
---
 gcc/fold-const.cc|  2 +-
 gcc/testsuite/gcc.target/i386/pr115406.c | 23 +++
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115406.c

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 710d697c021..0f045f851d1 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -8077,7 +8077,7 @@ native_encode_vector_part (const_tree expr, unsigned char 
*ptr, int len,
 {
   tree itype = TREE_TYPE (TREE_TYPE (expr));
   if (VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (expr))
-  && TYPE_PRECISION (itype) <= BITS_PER_UNIT)
+  && TYPE_PRECISION (itype) < BITS_PER_UNIT)
 {
   /* This is the only case in which elements can be smaller than a byte.
 Element 0 is always in the lsb of the containing byte.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr115406.c 
b/gcc/testsuite/gcc.target/i386/pr115406.c
new file mode 100644
index 000..623dff06fc3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115406.c
@@ -0,0 +1,23 @@
+/* { dg-do run } */
+/* { dg-options "-O0 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+typedef __attribute__((__vector_size__ (1))) char V;
+
+char
+foo (V v)
+{
+  return ((V) v == v)[0];
+}
+
+int
+main ()
+{
+  if (!__builtin_cpu_supports ("avx512f"))
+return 0;
+
+  char x = foo ((V) { });
+  if (x != -1)
+__builtin_abort ();
+  return 0;
+}
-- 
2.31.1



[PATCH 1/3] [avx512 testsuite] Define mask as extern instead of uninitialized local variables.

2024-06-27 Thread liuhongt
The testcases are supposed to scan for vpopcnt{b,w,d,q} operations with a
k mask, but the mask is defined as an uninitialized local variable, which
is set to 0 at the RTL expand phase.
The masked operation is then simplified away by late_combine, which caused
the scan-assembler failure.
Move the definition of the mask outside to make the testcases more stable.
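A simplified sketch of the failure mode, shaped after
avx512bitalg-vpopcntb.c (my annotation):

#include <immintrin.h>

extern __m512i z, z1;

int foo ()
{
  __mmask16 msk;  /* uninitialized: RTL expand materializes msk = 0 */
  __m512i c = _mm512_mask_popcnt_epi8 (z1, msk, z);
  /* With msk known to be 0, late_combine folds the masked popcount to
     c = z1, so the expected k-masked vpopcntb never reaches the
     assembler scan.  Making msk extern keeps its value unknown.  */
  asm volatile ("" : "+v" (c));
  return 0;
}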

gcc/testsuite/ChangeLog:

PR target/115610
* gcc.target/i386/avx512bitalg-vpopcntb.c: Define mask as
extern instead of uninitialized local variables.
* gcc.target/i386/avx512bitalg-vpopcntbvl.c: Ditto.
* gcc.target/i386/avx512bitalg-vpopcntw.c: Ditto.
* gcc.target/i386/avx512bitalg-vpopcntwvl.c: Ditto.
* gcc.target/i386/avx512vpopcntdq-vpopcntd.c: Ditto.
* gcc.target/i386/avx512vpopcntdq-vpopcntq.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c| 3 +--
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c  | 4 ++--
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c| 2 +-
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c  | 4 ++--
 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c | 5 +++--
 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
index 44b82c0519d..66d24107c26 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
@@ -7,10 +7,9 @@
 #include 
 
 extern __m512i z, z1;
-
+extern __mmask16 msk;
 int foo ()
 {
-  __mmask16 msk;
   __m512i c = _mm512_popcnt_epi8 (z);
   asm volatile ("" : "+v" (c));
   c = _mm512_mask_popcnt_epi8 (z1, msk, z);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
index 8c2dfaba9c6..8ab05653f7c 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
@@ -11,11 +11,11 @@
 
 extern __m256i y, y_1;
 extern __m128i x, x_1;
+extern __mmask32 msk32;
+extern __mmask16 msk16;
 
 int foo ()
 {
-  __mmask32 msk32;
-  __mmask16 msk16;
   __m256i c256 = _mm256_popcnt_epi8 (y);
   asm volatile ("" : "+v" (c256));
   c256 = _mm256_mask_popcnt_epi8 (y_1, msk32, y);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
index 2ef8589f6c1..c741bf48a51 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
@@ -7,10 +7,10 @@
 #include 
 
 extern __m512i z, z1;
+extern __mmask16 msk;
 
 int foo ()
 {
-  __mmask16 msk;
   __m512i c = _mm512_popcnt_epi16 (z);
   asm volatile ("" : "+v" (c));
   c = _mm512_mask_popcnt_epi16 (z1, msk, z);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
index c976461b12e..79bb3c31e85 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
@@ -11,11 +11,11 @@
 
 extern __m256i y, y_1;
 extern __m128i x, x_1;
+extern __mmask16 msk16;
+extern __mmask8 msk8;
 
 int foo ()
 {
-  __mmask16 msk16;
-  __mmask8 msk8;
   __m256i c256 = _mm256_popcnt_epi16 (y);
   asm volatile ("" : "+v" (c256));
   c256 = _mm256_mask_popcnt_epi16 (y_1, msk16, y);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c 
b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
index b4d82f97032..776a4753d8e 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
@@ -15,11 +15,12 @@
 extern __m128i x, x_1;
 extern __m256i y, y_1;
 extern __m512i z, z_1;
+extern  __mmask16 msk;
+extern  __mmask8 msk8;
+
 
 int foo ()
 {
-  __mmask16 msk;
-  __mmask8 msk8;
   __m128i a = _mm_popcnt_epi32 (x);
   asm volatile ("" : "+v" (a));
   a = _mm_mask_popcnt_epi32 (x_1, msk8, x);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c 
b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
index e87d6c999b6..c6314ac5deb 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
@@ -15,10 +15,10 @@
 extern __m128i x, x_1;
 extern __m256i y, y_1;
 extern __m512i z, z_1;
+extern __mmask8 msk; 
 
 int foo ()
 {
-  __mmask8 msk; 
   __m128i a = _mm_popcnt_epi64 (x);
   asm volatile ("" : "+v" (a));
   a = _mm_mask_popcnt_epi64 (x_1, msk, x);
-- 
2.31.1



[PATCH 0/3][x86] Enable pass_late_combine for x86.

2024-06-27 Thread liuhongt
Because of the issue described in PR115610, late_combine is disabled by
default. This series tries to solve the regressions and enable late_combine.
There are 4 regressions observed.

1. The first one is related to pass_stv2, because late_combine will undo
the transformation done in the pass. Moving the pass after
pass_late_combine solves the issue.

2. The second one is related to pass_rpad: both the pre_reload and
post_reload late_combine would undo the transformation. So besides moving
pass_rpad after the pre_reload late_combine, target_insn_cost is defined to
prevent the post_reload pass_late_combine from reverting the optimization
done in pass_rpad.

3. The third one is related to the avx512 kmask: lshiftrt + zero_extend are
combined into *<insn>si3_1_zext, which doesn't support the k alternative,
and the extra move between GPR and KMASK regressed
gcc.target/i386/zero_extendkmask.c scan-assembler-not (?n)shr[bwl].
The solution is extending the pattern with a ?k alternative, just like what
we did before for other patterns.

4. The fourth one is spurious: pass_late_combine generates better code
but breaks the scan-assembler checks.
I.e., under a 32-bit target, gcc used to generate a broadcast from the
stack and then do the real operation.
After enabling late_combine, they're combined into embedded broadcast
operations.
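A sketch of the broadcast case, shaped after
avx512f-broadcast-pr87767-1.c (simplified):

void
foo (int *a, int *b, int c)
{
  for (int i = 0; i != 16; i++)
    a[i] = b[i] + c;
  /* -m32 used to spill c and vpbroadcastd from the stack before vpaddd;
     late_combine folds the broadcast into the arithmetic as
     vpaddd with a {1to16} memory operand, so scans for a separate
     broadcast instruction now fail.  */
}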

Tested with SPEC2017, late_combine reduces codesize by ~0.6%, which means
there are lots of small improvements.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


liuhongt (3):
  [avx512 testsuite] Define mask as extern instead of uninitialized
local variables.
  Extend lshiftrtsi3_1_zext to ?k alternative.
  [x86] Enable late_combine.

 gcc/config/i386/i386-features.cc  | 16 +++
 gcc/config/i386/i386-options.cc   |  4 ---
 gcc/config/i386/i386-passes.def   |  4 +--
 gcc/config/i386/i386-protos.h |  1 +
 gcc/config/i386/i386.cc   | 18 
 gcc/config/i386/i386.md   | 19 +
 gcc/config/i386/sse.md| 28 +++
 .../gcc.target/i386/avx512bitalg-vpopcntb.c   |  3 +-
 .../gcc.target/i386/avx512bitalg-vpopcntbvl.c |  4 +--
 .../gcc.target/i386/avx512bitalg-vpopcntw.c   |  2 +-
 .../gcc.target/i386/avx512bitalg-vpopcntwvl.c |  4 +--
 .../i386/avx512f-broadcast-pr87767-1.c|  4 +--
 .../i386/avx512f-broadcast-pr87767-5.c|  1 -
 .../gcc.target/i386/avx512f-fmadd-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c |  2 +-
 .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c |  2 +-
 .../i386/avx512vl-broadcast-pr87767-1.c   |  4 +--
 .../i386/avx512vl-broadcast-pr87767-5.c   |  2 --
 .../i386/avx512vpopcntdq-vpopcntd.c   |  5 ++--
 .../i386/avx512vpopcntdq-vpopcntq.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr91333.c   |  2 +-
 .../gcc.target/i386/vect-strided-4.c  |  2 +-
 23 files changed, 93 insertions(+), 40 deletions(-)

-- 
2.31.1



[PATCH 3/3] [x86] Enable late_combine.

2024-06-27 Thread liuhongt
Move pass_stv2 and pass_rpad after the pre_reload pass_late_combine, and
define target_insn_cost to prevent the post_reload pass_late_combine from
reverting the optimization done in pass_rpad.

Adjust testcases since pass_late_combine generates better code but
breaks the scan-assembler checks.

I.e., under a 32-bit target, gcc used to generate a broadcast from the
stack and then do the real operation.
After late_combine, they're combined into embedded broadcast
operations.

gcc/ChangeLog:

* config/i386/i386-features.cc (ix86_rpad_gate): New function.
* config/i386/i386-options.cc (ix86_override_options_after_change):
Don't disable late_combine.
* config/i386/i386-passes.def: Move pass_stv2 and pass_rpad
after the pre_reload pass_late_combine.
* config/i386/i386-protos.h (ix86_rpad_gate): New declare.
* config/i386/i386.cc (ix86_insn_cost): New function.
(TARGET_INSN_COST): Define.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjust
testcase.
* gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto.
* gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto.
* gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto.
* gcc.target/i386/pr91333.c: Ditto.
* gcc.target/i386/vect-strided-4.c: Ditto.
---
 gcc/config/i386/i386-features.cc   | 16 +++-
 gcc/config/i386/i386-options.cc|  4 
 gcc/config/i386/i386-passes.def|  4 ++--
 gcc/config/i386/i386-protos.h  |  1 +
 gcc/config/i386/i386.cc| 18 ++
 .../i386/avx512f-broadcast-pr87767-1.c |  4 ++--
 .../i386/avx512f-broadcast-pr87767-5.c |  1 -
 .../gcc.target/i386/avx512f-fmadd-sf-zmm-7.c   |  2 +-
 .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c   |  2 +-
 .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c  |  2 +-
 .../i386/avx512vl-broadcast-pr87767-1.c|  4 ++--
 .../i386/avx512vl-broadcast-pr87767-5.c|  2 --
 gcc/testsuite/gcc.target/i386/pr91333.c|  2 +-
 gcc/testsuite/gcc.target/i386/vect-strided-4.c |  2 +-
 15 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 607d1991460..fc224ed06b0 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area (gcc::context 
*ctxt)
   return new pass_insert_endbr_and_patchable_area (ctxt);
 }
 
+bool
+ix86_rpad_gate ()
+{
+  return (TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && TARGET_SSE_MATH
+ && optimize
+ && optimize_function_for_speed_p (cfun));
+}
+
 /* At entry of the nearest common dominator for basic blocks with
conversions/rcp/sqrt/rsqrt/round, generate a single
vxorps %xmmN, %xmmN, %xmmN
@@ -3232,11 +3242,7 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
 {
-  return (TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
- && TARGET_SSE_MATH
- && optimize
- && optimize_function_for_speed_p (cfun));
+  return ix86_rpad_gate ();
 }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 9c12d498928..1ef2c71a7a2 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1944,10 +1944,6 @@ ix86_override_options_after_change (void)
flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
 }
 
-  /* Late combine tends to undo some of the effects of STV and RPAD,
- by combining instructions back to their original form.  */
-  if (!OPTION_SET_P (flag_late_combine_instructions))
-flag_late_combine_instructions = 0;
 }
 
 /* Clear stack slot assignments remembered from previous functions.
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 7d96766f7b9..2d29f65da88 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -25,11 +25,11 @@ along with GCC; see the file COPYING3.  If not see
  */
 
   INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper);
-  INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */);
+  INSERT_PASS_AFTER (pass_late_combine, 1, pass_stv, false /* timode_p */);
   /* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
  CONSTM1_RTX generated by the STV pass can be CSEed.  */
   INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
 
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, 
pass_insert_endbr_and

[PATCH 2/3] Extend lshiftrtsi3_1_zext to ?k alternative.

2024-06-27 Thread liuhongt
late_combine will combine lshiftrt + zero_extend into *lshiftrtsi3_1_zext,
which causes an extra mov between GPR and kmask; add ?k to the pattern.
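A hypothetical fragment of the shape involved (my construction; the real
regression is the scan in gcc.target/i386/zero_extendkmask.c):

#include <immintrin.h>

/* needs -mavx512bw -mavx512vl */
unsigned long long
f (__m256i a, __m256i b)
{
  __mmask32 k = _mm256_cmpeq_epi8_mask (a, b);
  /* (zero_extend:DI (lshiftrt:SI ...)): with the ?k alternative the
     shift can be done as kshiftrd while the value lives in a mask
     register, instead of bouncing through a GPR shrl.  */
  return (unsigned int) k >> 8;
}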

gcc/ChangeLog:

PR target/115610
* config/i386/i386.md (*<insn>si3_1_zext): Add alternative ?k,
enable it only for lshiftrt and under avx512bw.
* config/i386/sse.md (*klshrsi3_1_zext): New define_insn, and
add corresponding define_split after it.
---
 gcc/config/i386/i386.md | 19 +--
 gcc/config/i386/sse.md  | 28 
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index fd48e764469..57a10c1af48 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16836,10 +16836,10 @@ (define_insn "*bmi2_si3_1_zext"
(set_attr "mode" "SI")])
 
 (define_insn "*si3_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r,?k")
(zero_extend:DI
- (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "cI,r,cI"
+ (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" 
"0,rm,rm,k")
+ (match_operand:QI 2 "nonmemory_operand" 
"cI,r,cI,I"
(clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT
&& ix86_binary_operator_ok (, SImode, operands, TARGET_APX_NDD)"
@@ -16850,6 +16850,8 @@ (define_insn "*si3_1_zext"
 case TYPE_ISHIFTX:
   return "#";
 
+case TYPE_MSKLOG:
+  return "#";
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
@@ -16860,8 +16862,8 @@ (define_insn "*si3_1_zext"
   : "{l}\t{%2, %k0|%k0, %2}";
 }
 }
-  [(set_attr "isa" "*,bmi2,apx_ndd")
-   (set_attr "type" "ishift,ishiftx,ishift")
+  [(set_attr "isa" "*,bmi2,apx_ndd,avx512bw")
+   (set_attr "type" "ishift,ishiftx,ishift,msklog")
(set (attr "length_immediate")
  (if_then_else
(and (match_operand 2 "const1_operand")
@@ -16869,7 +16871,12 @@ (define_insn "*si3_1_zext"
 (match_test "optimize_function_for_size_p (cfun)")))
(const_string "0")
(const_string "*")))
-   (set_attr "mode" "SI")])
+   (set_attr "mode" "SI")
+   (set (attr "enabled")
+   (if_then_else
+ (eq_attr "alternative" "3")
+ (symbol_ref " == LSHIFTRT && TARGET_AVX512BW")
+ (const_string "*")))])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
 (define_split
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0be2dcd8891..20665a6f097 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2179,6 +2179,34 @@ (define_split
 (match_dup 2)))
   (unspec [(const_int 0)] UNSPEC_MASKOP)])])
 
+(define_insn "*klshrsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+   (zero_extend:DI
+ (lshiftrt:SI (match_operand:SI 1 "register_operand" "k")
+  (match_operand 2 "const_0_to_31_operand" "I"
+  (unspec [(const_int 0)] UNSPEC_MASKOP)]
+  "TARGET_AVX512BW"
+  "kshiftrd\t{%2, %1, %0|%0, %1, %2}"
+[(set_attr "type" "msklog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "SI")])
+
+(define_split
+  [(set (match_operand:DI 0 "mask_reg_operand")
+   (zero_extend:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "mask_reg_operand")
+   (match_operand 2 "const_0_to_31_operand"
+(clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW && reload_completed"
+  [(parallel
+ [(set (match_dup 0)
+  (zero_extend:DI
+(lshiftrt:SI
+  (match_dup 1)
+  (match_dup 2
+  (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "ktest"
   [(set (reg:CC FLAGS_REG)
(unspec:CC
-- 
2.31.1



[PATCH] x86: Update branch hint for Redwood Cove.

2024-07-01 Thread liuhongt
From: "H.J. Lu" 

According to Intel® 64 and IA-32 Architectures Optimization Reference
Manual[1], Branch Hint is updated for Redwood Cove.

cut from [1]-
Starting with the Redwood Cove microarchitecture, if the predictor has
no stored information about a branch, the branch has the Intel® SSE2
branch taken hint (i.e., instruction prefix 3EH). When the codec
decodes the branch, it flips the branch’s prediction from not-taken to
taken. It then flushes the pipeline in front of it and steers this
pipeline to fetch the taken path of the branch.
cut end -

For -mtune-ctrl=branch_prediction_hints, always generate branch hint for
conditional branches, this tune is disabled by default.

[1] 
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/

* config/i386/i386.cc (ix86_print_operand): Always generate
branch hint for conditional branches.
---
 gcc/config/i386/i386.cc | 24 +---
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1f71ed04be6..9992b9d6186 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14050,25 +14050,11 @@ ix86_print_operand (FILE *file, rtx x, int code)
int pred_val = profile_probability::from_reg_br_prob_note
 (XINT (x, 0)).to_reg_br_prob_base ();
 
-   if (pred_val < REG_BR_PROB_BASE * 45 / 100
-   || pred_val > REG_BR_PROB_BASE * 55 / 100)
- {
-   bool taken = pred_val > REG_BR_PROB_BASE / 2;
-   bool cputaken
- = final_forward_branch_p (current_output_insn) == 0;
-
-   /* Emit hints only in the case default branch prediction
-  heuristics would fail.  */
-   if (taken != cputaken)
- {
-   /* We use 3e (DS) prefix for taken branches and
-  2e (CS) prefix for not taken branches.  */
-   if (taken)
- fputs ("ds ; ", file);
-   else
- fputs ("cs ; ", file);
- }
- }
+   bool taken = pred_val > REG_BR_PROB_BASE / 2;
+   /* We use 3e (DS) prefix for taken branches and
+  2e (CS) prefix for not taken branches.  */
+   if (taken)
+ fputs ("ds ; ", file);
  }
return;
  }
-- 
2.31.1



[PATCH][committed] Move runtime check into a separate function and guard it with target ("no-avx")

2024-07-03 Thread liuhongt
The patch avoids a SIGILL on non-AVX512 machines due to kmovd being
generated in the dynamic check.

Committed as an obvious fix.

gcc/testsuite/ChangeLog:

PR target/115748
* gcc.target/i386/avx512-check.h: Move runtime check into a
separate function and guard it with target ("no-avx").
---
 gcc/testsuite/gcc.target/i386/avx512-check.h | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h 
b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0ad9064f637..71858a33dac 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -34,8 +34,9 @@ check_osxsave (void)
   return (ecx & bit_OSXSAVE) != 0;
 }
 
+__attribute__((noipa,target("no-avx")))
 int
-main ()
+avx512_runtime_support_p ()
 {
   unsigned int eax, ebx, ecx, edx;
 
@@ -100,6 +101,17 @@ main ()
   && (edx & bit_AVX512VP2INTERSECT)
 #endif
   && avx512f_os_support ())
+{
+  return 1;
+}
+
+  return 0;
+}
+
+int
+main ()
+{
+  if (avx512_runtime_support_p ())
 {
   DO_TEST ();
 #ifdef DEBUG
-- 
2.31.1



[PATCH V2] x86: Update branch hint for Redwood Cove.

2024-07-03 Thread liuhongt
From: "H.J. Lu" 

>The above reads like it would be worth splitting branch_prediction_hints
>into branch_prediction_hints_taken and branch_prediction_hints_not_taken
>given not-taken is the default and thus will just increase code size?
>According to Intel® 64 and IA-32 Architectures Optimization Reference
>Manual[1], Branch Hint is updated for Redwood Cove.
Changed.

cut from [1]-
Starting with the Redwood Cove microarchitecture, if the predictor has
no stored information about a branch, the branch has the Intel® SSE2
branch taken hint (i.e., instruction prefix 3EH). When the codec
decodes the branch, it flips the branch’s prediction from not-taken to
taken. It then flushes the pipeline in front of it and steers this
pipeline to fetch the taken path of the branch.
cut end -

Split the branch_prediction_hints tune into branch_prediction_hints_taken
and branch_prediction_hints_not_taken, and always generate a branch hint
for conditional branches; both tunes are disabled by default.
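For instance (illustrative assembler output, using the prefixes described
above):

	ds ; jne .L3	# branch_prediction_hints_taken: 3EH, predict taken
	cs ; jne .L3	# branch_prediction_hints_not_taken: 2EH, predict not taken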

[1] 
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/

* config/i386/i386.cc (ix86_print_operand): Always generate
branch hint for conditional branches.
* config/i386/i386.h (TARGET_BRANCH_PREDICTION_HINTS): Split
into ..
(TARGET_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and ..
(TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this.
* config/i386/x86-tune.def (X86_TUNE_BRANCH_PREDICTION_HINTS):
Split into ..
(X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and ..
(X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this.
---
 gcc/config/i386/i386.cc  | 29 +
 gcc/config/i386/i386.h   |  6 --
 gcc/config/i386/x86-tune.def | 13 +++--
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1f71ed04be6..ea9cb620f8d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14041,7 +14041,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
 
if (!optimize
|| optimize_function_for_size_p (cfun)
-   || !TARGET_BRANCH_PREDICTION_HINTS)
+   || (!TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN
+   && !TARGET_BRANCH_PREDICTION_HINTS_TAKEN))
  return;
 
x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
@@ -14050,25 +14051,13 @@ ix86_print_operand (FILE *file, rtx x, int code)
int pred_val = profile_probability::from_reg_br_prob_note
 (XINT (x, 0)).to_reg_br_prob_base ();
 
-   if (pred_val < REG_BR_PROB_BASE * 45 / 100
-   || pred_val > REG_BR_PROB_BASE * 55 / 100)
- {
-   bool taken = pred_val > REG_BR_PROB_BASE / 2;
-   bool cputaken
- = final_forward_branch_p (current_output_insn) == 0;
-
-   /* Emit hints only in the case default branch prediction
-  heuristics would fail.  */
-   if (taken != cputaken)
- {
-   /* We use 3e (DS) prefix for taken branches and
-  2e (CS) prefix for not taken branches.  */
-   if (taken)
- fputs ("ds ; ", file);
-   else
- fputs ("cs ; ", file);
- }
- }
+   bool taken = pred_val > REG_BR_PROB_BASE / 2;
+   /* We use 3e (DS) prefix for taken branches and
+  2e (CS) prefix for not taken branches.  */
+   if (taken && TARGET_BRANCH_PREDICTION_HINTS_TAKEN)
+ fputs ("ds ; ", file);
+   else if (!taken && TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN)
+ fputs ("cs ; ", file);
  }
return;
  }
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 9ed225ec587..50ebed221dc 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -309,8 +309,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_ZERO_EXTEND_WITH_AND \
ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
 #define TARGET_UNROLL_STRLEN   ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
-#define TARGET_BRANCH_PREDICTION_HINTS \
-   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
+#define TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN \
+   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN]
+#define TARGET_BRANCH_PREDICTION_HINTS_TAKEN \
+   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN]
 #define TARGET_DOUBLE_WITH_ADD ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
 #defi

[PATCH] [committed] Use __builtin_cpu_supports instead of __get_cpuid_count.

2024-07-03 Thread liuhongt
>> Hmm, now all avx512 tests SIGILL when testing with -m32:
>>
>> Dump of assembler code for function __get_cpuid_count:
>> => 0x08049500 <+0>:     kmovd  %eax,%k2
>>    0x08049504 <+4>:     kmovd  %edx,%k1
>>    0x08049508 <+8>:     pushf
>>    0x08049509 <+9>:     pushf
>>    0x0804950a <+10>:    pop    %eax
>>    0x0804950b <+11>:    mov    %eax,%edx
>>
>> looks like __get_cpuid_count is no longer inlined but AVX512 is in
>> effect for it.
>>
>> Maybe use #pragma GCC target around the includes instead?
>
>
> Can the built-in cpu supports be used?

Changed, and verified on both AVX512 and non-AVX512 machines.

Push to trunk.
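For reference, a minimal sketch of the resulting idiom (my example):

/* __builtin_cpu_supports checks the CPU feature set cached by libgcc at
   startup, so no cpuid or mask-register instructions are emitted at the
   call site; it also accounts for OS XSAVE support, which is why the
   separate avx512f_os_support () check could be dropped.  */
static int
have_avx512 (void)
{
  return __builtin_cpu_supports ("avx512f")
	 && __builtin_cpu_supports ("avx512vl");
}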
gcc/testsuite/ChangeLog:

PR target/115748
* gcc.target/i386/avx512-check.h: Use __builtin_cpu_supports
instead of __get_cpuid_count.
---
 gcc/testsuite/gcc.target/i386/avx512-check.h | 46 +---
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h 
b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 71858a33dac..8ec1a7ccbae 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -38,69 +38,63 @@ __attribute__((noipa,target("no-avx")))
 int
 avx512_runtime_support_p ()
 {
-  unsigned int eax, ebx, ecx, edx;
-
-  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
-return 0;
-
   /* Run AVX512 test only if host has ISA support.  */
-  if (check_osxsave ()
-  && (ebx & bit_AVX512F)
+  if (__builtin_cpu_supports ("avx512f")
 #ifdef AVX512VL
-  && (ebx & bit_AVX512VL)
+  && __builtin_cpu_supports ("avx512vl")
 #endif
 #ifdef AVX512ER
-  && (ebx & bit_AVX512ER)
+  && __builtin_cpu_supports ("avx512er")
 #endif
 #ifdef AVX512CD
-  && (ebx & bit_AVX512CD)
+  && __builtin_cpu_supports ("avx512cd")
 #endif
 #ifdef AVX512DQ
-  && (ebx & bit_AVX512DQ)
+  && __builtin_cpu_supports ("avx512dq")
 #endif
 #ifdef AVX512BW
-  && (ebx & bit_AVX512BW)
+  && __builtin_cpu_supports ("avx512bw")
 #endif
 #ifdef AVX512IFMA
-  && (ebx & bit_AVX512IFMA)
+  && __builtin_cpu_supports ("avx512ifma")
 #endif
 #ifdef AVX512VBMI
-  && (ecx & bit_AVX512VBMI)
+  && __builtin_cpu_supports ("avx512vbmi")
 #endif
 #ifdef AVX5124FMAPS
-  && (edx & bit_AVX5124FMAPS)
+  && __builtin_cpu_supports ("avx5124fmaps")
 #endif
 #ifdef AVX5124VNNIW
-  && (edx & bit_AVX5124VNNIW)
+  && __builtin_cpu_supports ("avx5124vnniw")
 #endif
 #ifdef AVX512VPOPCNTDQ
-  && (ecx & bit_AVX512VPOPCNTDQ)
+  && __builtin_cpu_supports ("avx512vpopcntdq")
 #endif
 #ifdef AVX512BITALG
-  && (ecx & bit_AVX512BITALG)
+  && __builtin_cpu_supports ("avx512bitalg")
 #endif
 #ifdef GFNI
-  && (ecx & bit_GFNI)
+  && __builtin_cpu_supports ("gfni")
 #endif
 #ifdef AVX512VBMI2
-  && (ecx & bit_AVX512VBMI2)
+  && __builtin_cpu_supports ("avx512vbmi2")
 #endif
 #ifdef AVX512VNNI
-  && (ecx & bit_AVX512VNNI)
+  && __builtin_cpu_supports ("avx512vnni")
 #endif
 #ifdef AVX512FP16
-  && (edx & bit_AVX512FP16)
+  && __builtin_cpu_supports ("avx512fp16")
 #endif
 #ifdef VAES
-  && (ecx & bit_VAES)
+  && __builtin_cpu_supports ("vaes")
 #endif
 #ifdef VPCLMULQDQ
-  && (ecx & bit_VPCLMULQDQ)
+  && __builtin_cpu_supports ("vpclmulqdq")
 #endif
 #ifdef AVX512VP2INTERSECT
-  && (edx & bit_AVX512VP2INTERSECT)
+  && __builtin_cpu_supports ("avx512vp2intersect")
 #endif
-  && avx512f_os_support ())
+  )
 {
   return 1;
 }
-- 
2.31.1



[PATCH] Rename __{float, double}_u to __x86_{float, double}_u to avoid polluting the namespace.

2024-07-07 Thread liuhongt
I have a build failure on NetBSD as the namespace pollution avoidance causes
a direct hit with the system /usr/include/math.h
===

In file included from /usr/src/local/gcc/obj/gcc/include/emmintrin.h:31,
 from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/ext/random:45,
 from 
/usr/src/local/gcc/libstdc++-v3/include/precompiled/extc++.h:65:
/usr/src/local/gcc/obj/gcc/include/xmmintrin.h:75:15: error: conflicting 
declaration 'typedef float __float_u'
   75 | typedef float __float_u __attribute__ ((__may_alias__, __aligned__ 
(1)));
  |   ^
In file included from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/cmath:47,
 from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/x86_64-unknown-netbsd10.99/bits/stdc++.h:114,
 from 
/usr/src/local/gcc/libstdc++-v3/include/precompiled/extc++.h:32:
/usr/src/local/gcc/obj/gcc/include-fixed/math.h:49:7: note: previous 
declaration as 'union __float_u'
   49 | union __float_u {

As pinski suggested in #c2, use __x86_float_u which seems less likely to 
pollute the namespace.

Bootstrapped and regtested on x86_64-pc-linux{-m32,}.
Ready to push to trunk if there are no other concerns.

gcc/ChangeLog:

PR target/115796
* config/i386/emmintrin.h (__double_u): Rename to ..
(__x86_double_u): .. this.
(_mm_load_sd): Ditto.
(_mm_store_sd): Ditto.
(_mm_loadh_pd): Ditto.
(_mm_loadl_pd): Ditto.
* config/i386/xmmintrin.h (__float_u): Rename to ..
(__x86_float_u): .. this.
(_mm_load_ss): Ditto.
(_mm_store_ss): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115796.c: New test.
---
 gcc/config/i386/emmintrin.h  | 10 +-
 gcc/config/i386/xmmintrin.h  |  6 +++---
 gcc/testsuite/gcc.target/i386/pr115796.c | 24 
 3 files changed, 32 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115796.c

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index d58030e5c4f..a3fcd7a869c 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -56,7 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), 
__may_alias__));
 /* Unaligned version of the same types.  */
 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), 
__may_alias__, __aligned__ (1)));
 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef double __double_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef double __x86_double_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
@@ -146,7 +146,7 @@ _mm_load1_pd (double const *__P)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_sd (double const *__P)
 {
-  return __extension__ (__m128d) { *(__double_u *)__P, 0.0 };
+  return __extension__ (__m128d) { *(__x86_double_u *)__P, 0.0 };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -181,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *(__double_u *)__P = ((__v2df)__A)[0] ;
+  *(__x86_double_u *)__P = ((__v2df)__A)[0] ;
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -974,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__double_u*)__B };
+  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__x86_double_u*)__B };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { *(__double_u*)__B, ((__v2df)__A)[1] };
+  return __extension__ (__m128d) { *(__x86_double_u*)__B, ((__v2df)__A)[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 37e5a94cf10..7f10f96d72c 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -72,7 +72,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), 
__may_alias__));
 
 /* Unaligned version of the same type.  */
 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef float __float_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef float __x86_float_u __

[PATCH] Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

2024-07-11 Thread liuhongt
>-  _5 = __atomic_fetch_or_8 (&set_work_pending_p, 1, 0);
>-  # DEBUG old => (long int) _5
>+  _6 = .ATOMIC_BIT_TEST_AND_SET (&set_work_pending_p, 0, 1, 0, 
>__atomic_fetch_or_8);
>+  # DEBUG old => NULL
>   # DEBUG BEGIN_STMT
>-  # DEBUG D#2 => _5 & 1
>+  # DEBUG D#2 => NULL
>...
>-  _10 = ~_5;
>-  _8 = (_Bool) _10;
>-  # DEBUG ret => _8
>+  _8 = _6 == 0;
>+  # DEBUG ret => (_Bool) _10
>
>confirmed.  convert_atomic_bit_not does this, it checks for single_use
>and removes the def, failing to release the name (which would fix this up
>IIRC).
>
>Note the function removes stmts in "wrong" order (before uses of LHS
>are removed), so it requires larger surgery.  And it leaks SSA names.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/115872
* tree-ssa-ccp.cc (convert_atomic_bit_not): Remove use_stmt after 
use_nop_stmt is removed.
(optimize_atomic_bit_test_and): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115872.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr115872.c | 16 
 gcc/tree-ssa-ccp.cc  | 12 
 2 files changed, 24 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115872.c

diff --git a/gcc/testsuite/gcc.target/i386/pr115872.c 
b/gcc/testsuite/gcc.target/i386/pr115872.c
new file mode 100644
index 000..937004456d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115872.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g" } */
+
+long set_work_pending_p;
+_Bool set_work_pending() {
+  _Bool __trans_tmp_1;
+  long mask = 1, old = __atomic_fetch_or(&set_work_pending_p, mask, 0);
+  __trans_tmp_1 = old & mask;
+  return !__trans_tmp_1;
+}
+void __queue_work() {
+  _Bool ret = set_work_pending();
+  if (ret)
+__queue_work();
+}
+
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 3749126b5f7..de83d26d311 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -3332,9 +3332,10 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
 return nullptr;
 
   gimple_stmt_iterator gsi;
-  gsi = gsi_for_stmt (use_stmt);
-  gsi_remove (&gsi, true);
   tree var = make_ssa_name (TREE_TYPE (lhs));
+  /* use_stmt needs to be removed after use_nop_stmt,
+ so use_lhs can be released.  */
+  gimple *use_stmt_removal = use_stmt;
   use_stmt = gimple_build_assign (var, BIT_AND_EXPR, lhs, and_mask);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_insert_before (&gsi, use_stmt, GSI_NEW_STMT);
@@ -3344,6 +3345,8 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
   gsi_insert_after (&gsi, g, GSI_NEW_STMT);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_remove (&gsi, true);
+  gsi = gsi_for_stmt (use_stmt_removal);
+  gsi_remove (&gsi, true);
   return use_stmt;
 }
 
@@ -3646,8 +3649,7 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   */
}
  var = make_ssa_name (TREE_TYPE (use_rhs));
- gsi = gsi_for_stmt (use_stmt);
- gsi_remove (&gsi, true);
+ gimple* use_stmt_removal = use_stmt;
  g = gimple_build_assign (var, BIT_AND_EXPR, use_rhs,
   and_mask);
  gsi = gsi_for_stmt (use_nop_stmt);
@@ -3664,6 +3666,8 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
  gsi = gsi_for_stmt (use_nop_stmt);
  gsi_remove (&gsi, true);
+ gsi = gsi_for_stmt (use_stmt_removal);
+ gsi_remove (&gsi, true);
}
}
  else
-- 
2.31.1



[PATCH] [x86][avx512] Optimize maskstore when mask is 0 or -1 in UNSPEC_MASKMOV

2024-07-16 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.
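As background, a scalar model of the masked-store semantics the splitter
relies on (my sketch, not the RTL):

/* dst[i] is written only where mask[i] is set; with an all-ones mask
   this is a plain copy (so the insn becomes a regular move), and with a
   zero mask it is a no-op and the insn can be deleted.  */
void
maskstore_model (long long *dst, const long long *src,
		 const unsigned char *mask, int n)
{
  for (int i = 0; i < n; i++)
    if (mask[i])
      dst[i] = src[i];
}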

gcc/ChangeLog:

PR target/115843
* config/i386/predicates.md (const0_or_m1_operand): New
predicate.
* config/i386/sse.md (*_store_mask_1): New
pre_reload define_insn_and_split.
(V): Add V32BF,V16BF,V8BF.
(V4SF_V8BF): Rename to ..
(V24F_128): .. this.
(*vec_concat): Adjust with V24F_128.
(*vec_concat_0): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115843.c: New test.
---
 gcc/config/i386/predicates.md|  5 
 gcc/config/i386/sse.md   | 32 
 gcc/testsuite/gcc.target/i386/pr115843.c | 38 
 3 files changed, 69 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115843.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 5d0bb1e0f54..680594871de 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -825,6 +825,11 @@ (define_predicate "constm1_operand"
   (and (match_code "const_int")
(match_test "op == constm1_rtx")))
 
+;; Match 0 or -1.
+(define_predicate "const0_or_m1_operand"
+  (ior (match_operand 0 "const0_operand")
+   (match_operand 0 "constm1_operand")))
+
 ;; Match exactly eight.
 (define_predicate "const8_operand"
   (and (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e44822f705b..e11610f4b88 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -294,6 +294,7 @@ (define_mode_iterator V
(V16SI "TARGET_AVX512F && TARGET_EVEX512") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F && TARGET_EVEX512")  (V4DI "TARGET_AVX") V2DI
(V32HF "TARGET_AVX512F && TARGET_EVEX512") (V16HF "TARGET_AVX") V8HF
+   (V32BF "TARGET_AVX512F && TARGET_EVEX512") (V16BF "TARGET_AVX") V8BF
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F && TARGET_EVEX512")  (V4DF "TARGET_AVX") (V2DF 
"TARGET_SSE2")])
 
@@ -430,8 +431,8 @@ (define_mode_iterator VFB_512
(V16SF "TARGET_EVEX512")
(V8DF "TARGET_EVEX512")])
 
-(define_mode_iterator V4SF_V8HF
-  [V4SF V8HF])
+(define_mode_iterator V24F_128
+  [V4SF V8HF V8BF])
 
 (define_mode_iterator VI48_AVX512VL
   [(V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
@@ -11543,8 +11544,8 @@ (define_insn "*vec_concatv2sf_sse"
(set_attr "mode" "V4SF,SF,DI,DI")])
 
 (define_insn "*vec_concat"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=x,v,x,v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=x,v,x,v")
+   (vec_concat:V24F_128
  (match_operand: 1 "register_operand" " 0,v,0,v")
  (match_operand: 2 "nonimmediate_operand" " 
x,v,m,m")))]
   "TARGET_SSE"
@@ -11559,8 +11560,8 @@ (define_insn "*vec_concat"
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
 
 (define_insn "*vec_concat_0"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=v")
+   (vec_concat:V24F_128
  (match_operand: 1 "nonimmediate_operand" "vm")
  (match_operand: 2 "const0_operand")))]
   "TARGET_SSE2"
@@ -28574,6 +28575,25 @@ (define_insn "_store_mask"
(set_attr "memory" "store")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_store_mask_1"
+  [(set (match_operand:V 0 "memory_operand")
+   (unspec:V
+ [(match_operand:V 1 "register_operand")
+  (match_dup 0)
+  (match_operand: 2 "const0_or_m1_operand")]
+ UNSPEC_MASKMOV))]
+  "TARGET_AVX512F"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (constm1_operand (operands[2], mode))
+  {
+emit_move_insn (operands[0], operands[1]);
+DONE;
+  }
+})
+
 (define_expand "cbranch4"
   [(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI_AVX_AVX512F 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr115843.c 
b/gcc/testsuite/gcc.target/i386/pr115843.c
new file mode 100644
index 000..00d8605757a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115843.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl --param vect-partial-vector-usage=2 
-mtune=znver5 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-not "kxor\[bw]" } } */
+
+typedef unsigned long long BITBOARD;
+BITBOARD KingPressureMask1[64], KingSafetyMask1[64];
+
+void __attribute__((noinline))
+foo()
+{
+  int i;
+
+  for (i = 0; i < 64; i++) {
+if ((i & 7) == 0) {
+  KingPressureMask1[i] = KingSafetyMask1[i + 1];
+} else if ((i & 7) == 7) {
+  KingPressureMask1[i] = KingSafetyMask1[i - 1];
+} else {
+  KingPressureMask1[i] = KingSafetyMask1[i];
+}
+  }
+}
+
+BITBOARD verify[64] = {1, 1, 2, 3, 4, 5, 6, 6, 9, 9, 10, 11, 12, 13, 14, 14, 
17, 17, 18, 19,
+  20, 21, 22, 22, 25, 25, 26, 27, 28, 29,

[PATCH v2] [x86][avx512] Optimize maskstore when mask is 0 or -1 in UNSPEC_MASKMOV

2024-07-17 Thread liuhongt
> Also, in case the insn is deleted, do:
>
> emit_note (NOTE_INSN_DELETED);
>
> DONE;
>
> instead of leaving (const_int 0) in the stream.
>
> So, the above insn preparation statements should read:
>
> --cut here--
> if (constm1_operand (operands[2], mode))
>   emit_move_insn (operands[0], operands[1]);
> else
>   emit_note (NOTE_INSN_DELETED);
>
> DONE;
> --cut here--
Changed.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/115843
* config/i386/predicates.md (const0_or_m1_operand): New
predicate.
* config/i386/sse.md (*_store_mask_1): New
pre_reload define_insn_and_split.
(V): Add V32BF,V16BF,V8BF.
(V4SF_V8BF): Rename to ..
(V24F_128): .. this.
(*vec_concat): Adjust with V24F_128.
(*vec_concat_0): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115843.c: New test.
---
 gcc/config/i386/predicates.md|  5 
 gcc/config/i386/sse.md   | 33 
 gcc/testsuite/gcc.target/i386/pr115843.c | 38 
 3 files changed, 70 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115843.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 5d0bb1e0f54..680594871de 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -825,6 +825,11 @@ (define_predicate "constm1_operand"
   (and (match_code "const_int")
(match_test "op == constm1_rtx")))
 
+;; Match 0 or -1.
+(define_predicate "const0_or_m1_operand"
+  (ior (match_operand 0 "const0_operand")
+   (match_operand 0 "constm1_operand")))
+
 ;; Match exactly eight.
 (define_predicate "const8_operand"
   (and (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e44822f705b..f54e966bdbb 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -294,6 +294,7 @@ (define_mode_iterator V
(V16SI "TARGET_AVX512F && TARGET_EVEX512") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F && TARGET_EVEX512")  (V4DI "TARGET_AVX") V2DI
(V32HF "TARGET_AVX512F && TARGET_EVEX512") (V16HF "TARGET_AVX") V8HF
+   (V32BF "TARGET_AVX512F && TARGET_EVEX512") (V16BF "TARGET_AVX") V8BF
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F && TARGET_EVEX512")  (V4DF "TARGET_AVX") (V2DF 
"TARGET_SSE2")])
 
@@ -430,8 +431,8 @@ (define_mode_iterator VFB_512
(V16SF "TARGET_EVEX512")
(V8DF "TARGET_EVEX512")])
 
-(define_mode_iterator V4SF_V8HF
-  [V4SF V8HF])
+(define_mode_iterator V24F_128
+  [V4SF V8HF V8BF])
 
 (define_mode_iterator VI48_AVX512VL
   [(V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
@@ -11543,8 +11544,8 @@ (define_insn "*vec_concatv2sf_sse"
(set_attr "mode" "V4SF,SF,DI,DI")])
 
 (define_insn "*vec_concat"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=x,v,x,v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=x,v,x,v")
+   (vec_concat:V24F_128
  (match_operand: 1 "register_operand" " 0,v,0,v")
  (match_operand: 2 "nonimmediate_operand" " 
x,v,m,m")))]
   "TARGET_SSE"
@@ -11559,8 +11560,8 @@ (define_insn "*vec_concat"
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
 
 (define_insn "*vec_concat_0"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=v")
+   (vec_concat:V24F_128
  (match_operand: 1 "nonimmediate_operand" "vm")
  (match_operand: 2 "const0_operand")))]
   "TARGET_SSE2"
@@ -28574,6 +28575,26 @@ (define_insn "<avx512>_store<mode>_mask"
(set_attr "memory" "store")
(set_attr "mode" "")])
 
+(define_insn_and_split "*<avx512>_store<mode>_mask_1"
+  [(set (match_operand:V 0 "memory_operand")
+   (unspec:V
+ [(match_operand:V 1 "register_operand")
+  (match_dup 0)
+	  (match_operand:<avx512fmaskmode> 2 "const0_or_m1_operand")]
+ UNSPEC_MASKMOV))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (constm1_operand (operands[2], <avx512fmaskmode>mode))
+    emit_move_insn (operands[0], operands[1]);
+  else
+    emit_note (NOTE_INSN_DELETED);
+
+  DONE;
+})
+
 (define_expand "cbranch4"
   [(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI_AVX_AVX512F 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr115843.c b/gcc/testsuite/gcc.target/i386/pr115843.c
new file mode 100644
index 000..00d8605757a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115843.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl --param vect-partial-vector-usage=2 -mtune=znver5 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-not "kxor\[bw]" } } */
+
+typedef unsigned long long BITBOARD;
+BITBOARD KingPressureMask1[64], KingSafetyMask1[64];
+
+void __attribute__((noinline))

[PATCH] [x86] Optimize ashift >> 7 to vpcmpgtb for vector int8.

2024-05-14 Thread liuhongt
Since there is no corresponding instruction, the shift operation for
vector int8 is implemented using the instructions for vector int16,
but for some special shift counts, it can be transformed into vpcmpgtb.
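
A scalar sketch of the identity the transformation relies on (my own
example, not from the patch): for a signed 8-bit value, arithmetic shift
right by 7 yields 0 for non-negative inputs and -1 for negative inputs,
which is exactly the vector comparison 0 > x, i.e. vpcmpgtb against a
zeroed register.

#include <stdint.h>

int8_t
sar7 (int8_t x)
{
  return x >> 7;		/* 0 if x >= 0, -1 if x < 0 */
}

int8_t
gt0 (int8_t x)
{
  return 0 > x ? -1 : 0;	/* same value for every x */
}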

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/114514
* config/i386/i386-expand.cc
(ix86_expand_vec_shift_qihi_constant): Optimize ashift >> 7 to
vpcmpgtb.
(ix86_expand_vecop_qihi_partial): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114514-shift.c: New test.
---
 gcc/config/i386/i386-expand.cc| 32 
 .../gcc.target/i386/pr114514-shift.c  | 49 +++
 2 files changed, 81 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114514-shift.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1ab22fe7973..ab6631f51e3 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24182,6 +24182,28 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
 return false;
 
   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+
+
+  if (shift_amount == 7
+  && code == ASHIFTRT)
+{
+  if (qimode == V16QImode
+ || qimode == V32QImode)
+   {
+ rtx zero = gen_reg_rtx (qimode);
+ emit_move_insn (zero, CONST0_RTX (qimode));
+ emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
+   }
+  else
+   {
+ gcc_assert (qimode == V64QImode);
+ rtx kmask = gen_reg_rtx (DImode);
+ emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
+ emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
+   }
+  return true;
+}
+
   /* Record sign bit.  */
   xor_constant = 1 << (8 - shift_amount - 1);
 
@@ -24292,6 +24314,16 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   return;
 }
 
+  if (CONST_INT_P (op2)
+  && code == ASHIFTRT
+  && INTVAL (op2) == 7)
+{
+  rtx zero = gen_reg_rtx (qimode);
+  emit_move_insn (zero, CONST0_RTX (qimode));
+  emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
+  return;
+}
+
   switch (code)
 {
 case MULT:
diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shift.c b/gcc/testsuite/gcc.target/i386/pr114514-shift.c
new file mode 100644
index 000..cf8b32b3b1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114514-shift.c
@@ -0,0 +1,49 @@
+/* { dg-do compile  } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "vpxor" 4 } } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 5 { target  ia32 } } } */
+/* { dg-final { scan-assembler-times "vpmovb2m" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovm2b" 1 } } */
+
+
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v8qi __attribute__((vector_size(8)));
+typedef char v4qi __attribute__((vector_size(4)));
+
+v4qi
+__attribute__((noipa))
+foo1 (v4qi a)
+{
+  return a >> 7;
+}
+
+v8qi
+__attribute__((noipa))
+foo2 (v8qi a)
+{
+  return a >> 7;
+}
+
+v16qi
+__attribute__((noipa))
+foo3 (v16qi a)
+{
+  return a >> 7;
+}
+
+v32qi
+__attribute__((noipa))
+foo4 (v32qi a)
+{
+  return a >> 7;
+}
+
+v64qi
+__attribute__((noipa))
+foo5 (v64qi a)
+{
+  return a >> 7;
+}
-- 
2.31.1



[PATCH] [x86] Set d.one_operand_p to true when TARGET_SSSE3 in ix86_expand_vecop_qihi_partial.

2024-05-15 Thread liuhongt
pshufb is available under TARGET_SSSE3, so
ix86_expand_vec_perm_const_1 is guaranteed to return true when
TARGET_SSSE3 is set. W/o TARGET_SSSE3, if we set one_operand_p to true,
ix86_expand_vec_perm_const_1 could return false.

With the patch under -march=x86-64-v2

v8qi
foo (v8qi a)
{
  return a >> 5;
}

<   pmovsxbw %xmm0, %xmm0
<   psraw   $5, %xmm0
<   pshufb  .LC0(%rip), %xmm0
---
>   movdqa  %xmm0, %xmm1
>   pcmpeqd %xmm0, %xmm0
>   pmovsxbw %xmm1, %xmm1
>   psrlw   $8, %xmm0
>   psraw   $5, %xmm1
>   pand    %xmm1, %xmm0
>   packuswb %xmm0, %xmm0

Although there's a memory load from the constant pool, it should be
better when the code is inside a loop: the load from the constant pool
can be hoisted out, so it's 1 instruction vs 4 instructions.

<   pshufb  .LC0(%rip), %xmm0

vs.

>   pcmpeqd %xmm0, %xmm0
>   psrlw   $8, %xmm0
>   pand    %xmm1, %xmm0
>   packuswb %xmm0, %xmm0
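
A hedged sketch of the one-pshufb narrowing (my own example, assuming
SSSE3; the actual permutation is built by ix86_expand_vec_perm_const_1):
a constant selector picks the low byte of each 16-bit element, and the
selector load is loop-invariant, so it can be hoisted.

#include <tmmintrin.h>

/* Compile with -mssse3.  */
__m128i
take_low_bytes (__m128i v)
{
  /* Indices 0,2,...,14 select the low byte of each word; -1 (high bit
     set) zeroes the remaining result bytes.  */
  const __m128i sel = _mm_setr_epi8 (0, 2, 4, 6, 8, 10, 12, 14,
				     -1, -1, -1, -1, -1, -1, -1, -1);
  return _mm_shuffle_epi8 (v, sel);
}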


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/114514
* config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
Set d.one_operand_p to true when TARGET_SSSE3.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114514-shufb.c: New test.
---
 gcc/config/i386/i386-expand.cc|  2 +-
 .../gcc.target/i386/pr114514-shufb.c  | 35 +++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114514-shufb.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ab6631f51e3..ae2e9ab4e05 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24394,7 +24394,7 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx 
dest, rtx op1, rtx op2)
   d.op0 = d.op1 = qres;
   d.vmode = V16QImode;
   d.nelt = 16;
-  d.one_operand_p = false;
+  d.one_operand_p = TARGET_SSSE3;
   d.testing_p = false;
 
   for (i = 0; i < d.nelt; ++i)
diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shufb.c b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c
new file mode 100644
index 000..71fdc9d8daf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "packuswb" } }  */
+/* { dg-final { scan-assembler-times "pshufb" 4 { target { ! ia32 } } } }  */
+/* { dg-final { scan-assembler-times "pshufb" 6 { target  ia32 } } }  */
+
+typedef unsigned char v8uqi __attribute__((vector_size(8)));
+typedef  char v8qi __attribute__((vector_size(8)));
+typedef unsigned char v4uqi __attribute__((vector_size(4)));
+typedef  char v4qi __attribute__((vector_size(4)));
+
+v8qi
+foo (v8qi a)
+{
+  return a >> 5;
+}
+
+v8uqi
+foo1 (v8uqi a)
+{
+  return a >> 5;
+}
+
+v4qi
+foo2 (v4qi a)
+{
+  return a >> 5;
+}
+
+v4uqi
+foo3 (v4uqi a)
+{
+  return a >> 5;
+}
+
-- 
2.31.1



[PATCH] Use pblendw instead of pand to clear upper 16 bits.

2024-05-16 Thread liuhongt
For vec_pack_truncv8si/v4si w/o AVX512,
(const_vector:v4si (const_int 0xffff) x4) is used as a mask to clear the
upper 16 bits, but vpblendw with a zero vector can also be used, and the
zero vector is cheaper than (const_vector:v4si (const_int 0xffff) x4).
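
A minimal intrinsics sketch of the two alternatives (my own example, not
from the patch): both clear the upper 16 bits of every 32-bit element,
but the blend variant only needs an all-zero register, which is one pxor
instead of a constant-pool load.

#include <smmintrin.h>

/* Compile with -msse4.1.  */
__m128i
clear_hi_and (__m128i v)
{
  return _mm_and_si128 (v, _mm_set1_epi32 (0xffff));
}

__m128i
clear_hi_blend (__m128i v)
{
  /* Immediate 0xaa replaces the odd (high) 16-bit words with zero.  */
  return _mm_blend_epi16 (v, _mm_setzero_si128 (), 0xaa);
}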

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/114427
* config/i386/i386-expand.cc (expand_vec_perm_even_odd_pack):
Use pblendw instead of pand to clear upper bits.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114427.c: New test.
---
 gcc/config/i386/i386-expand.cc   | 34 +---
 gcc/testsuite/gcc.target/i386/pr114427.c | 18 +
 2 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114427.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4e16aedc5c1..231e9321d81 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22918,6 +22918,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
 {
   rtx op, dop0, dop1, t;
   unsigned i, odd, c, s, nelt = d->nelt;
+  int pblendw_i = 0;
   bool end_perm = false;
   machine_mode half_mode;
   rtx (*gen_and) (rtx, rtx, rtx);
@@ -22939,6 +22940,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
   gen_and = gen_andv2si3;
   gen_pack = gen_mmx_packusdw;
   gen_shift = gen_lshrv2si3;
+  pblendw_i = 0x5;
   break;
 case E_V8HImode:
   /* Required for "pack".  */
@@ -22950,6 +22952,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
   gen_and = gen_andv4si3;
   gen_pack = gen_sse4_1_packusdw;
   gen_shift = gen_lshrv4si3;
+  pblendw_i = 0x55;
   break;
 case E_V8QImode:
   /* No check as all instructions are SSE2.  */
@@ -22978,6 +22981,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
   gen_and = gen_andv8si3;
   gen_pack = gen_avx2_packusdw;
   gen_shift = gen_lshrv8si3;
+  pblendw_i = 0x5555;
   end_perm = true;
   break;
 case E_V32QImode:
@@ -23013,10 +23017,32 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
   dop1 = gen_reg_rtx (half_mode);
   if (odd == 0)
 {
-  t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
-  t = force_reg (half_mode, t);
-  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
-  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+  /* Use pblendw since const_vector 0 should be cheaper than
+     const_vector 0xffff.  */
+  if (d->vmode == E_V4HImode
+ || d->vmode == E_V8HImode
+ || d->vmode == E_V16HImode)
+   {
+ rtx dop0_t = gen_reg_rtx (d->vmode);
+ rtx dop1_t = gen_reg_rtx (d->vmode);
+ t = gen_reg_rtx (d->vmode);
+ emit_move_insn (t, CONST0_RTX (d->vmode));
+
+ emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
+GEN_INT (pblendw_i)));
+ emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
+GEN_INT (pblendw_i)));
+
+ emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
+ emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
+   }
+  else
+   {
+ t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
+ t = force_reg (half_mode, t);
+ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+   }
 }
   else
 {
diff --git a/gcc/testsuite/gcc.target/i386/pr114427.c b/gcc/testsuite/gcc.target/i386/pr114427.c
new file mode 100644
index 000..58b66db7fff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114427.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+/* { dg-final { scan-assembler-not "65535" } } */
+
+void
+foo (int* a, short* __restrict b, int* c)
+{
+for (int i = 0; i != 16; i++)
+  b[i] = c[i] + a[i];
+}
+
+void
+foo1 (int* a, short* __restrict b, int* c)
+{
+for (int i = 0; i != 8; i++)
+  b[i] = c[i] + a[i];
+}
-- 
2.31.1



[PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

2024-05-20 Thread liuhongt
When the mask is ((1 << (prec - imm)) - 1), which is used to clear the
upper bits of A, the expression can be simplified to LSHIFTRT.

i.e. Simplify
(and:v8hi
  (ashiftrt:v8hi A 8)
  (const_vector 0xff x8))
to
(lshiftrt:v8hi A 8)
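
The scalar form of the identity, for reference (my own sketch): masking
an arithmetic right shift with ((1 << (prec - imm)) - 1) discards exactly
the sign-extended bits, leaving the value a logical shift would produce.

#include <stdint.h>

uint16_t
via_ashiftrt (int16_t x)
{
  return (uint16_t) (x >> 8) & 0xff;	/* ashiftrt + and */
}

uint16_t
via_lshiftrt (int16_t x)
{
  return (uint16_t) x >> 8;		/* lshiftrt, same result */
}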

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/114428
* simplify-rtx.cc
(simplify_context::simplify_binary_operation_1):
Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
specific mask.
---
 gcc/simplify-rtx.cc | 25 +
 1 file changed, 25 insertions(+)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 53f54d1d392..6c91409200e 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4021,6 +4021,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
return tem;
}
 
+  /* (and:v4si
+	(ashiftrt:v4si A 16)
+	(const_vector: (const_int 0xffff) x4))
+     is just (lshiftrt:v4si A 16).  */
+  if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
+ && (CONST_INT_P (XEXP (op0, 1))
+ || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1))))
+ && GET_CODE (op1) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (op1))
+   {
+ unsigned HOST_WIDE_INT shift_count
+   = (CONST_INT_P (XEXP (op0, 1))
+  ? UINTVAL (XEXP (op0, 1))
+  : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
+ unsigned HOST_WIDE_INT inner_prec
+   = GET_MODE_PRECISION (GET_MODE_INNER (mode));
+
+ /* Avoid UD shift count.  */
+ if (shift_count < inner_prec
+ && (UINTVAL (XVECEXP (op1, 0, 0))
+ == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
+	    return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1));
+   }
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
-- 
2.31.1



[PATCH 2/2] [x86] Adjust rtx_cost for MEM to enable more simplication

2024-05-20 Thread liuhongt
For CONST_VECTOR_DUPLICATE_P in the constant pool, it is just a broadcast
or one of the variants in ix86_vector_duplicate_simode_const.
Adjust the cost to COSTS_N_INSNS (2) + speed, which should be a little
larger than a broadcast.
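
A sketch of the kind of MEM this affects (my own example, assuming AVX2):
the duplicated constant below lands in the constant pool but is really
materialized with a single broadcast.

/* Compile with -mavx2.  */
typedef int v8si __attribute__ ((vector_size (32)));

v8si
add42 (v8si a)
{
  /* {42,42,42,42,42,42,42,42} is CONST_VECTOR_DUPLICATE_P: it can be
     loaded with one vpbroadcastd rather than a full 32-byte load.  */
  return a + 42;
}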

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/114428
* config/i386/i386.cc (ix86_rtx_costs): Adjust cost for
CONST_VECTOR_DUPLICATE_P in constant_pool.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114428.c: New test.
---
 gcc/config/i386/i386-expand.cc   |  2 +-
 gcc/config/i386/i386-protos.h|  1 +
 gcc/config/i386/i386.cc  | 13 +
 gcc/testsuite/gcc.target/i386/pr114428.c | 18 ++
 4 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114428.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4e16aedc5c1..d96c365e144 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -588,7 +588,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
 
 /* OP is a memref of CONST_VECTOR, return scalar constant mem
if CONST_VECTOR is a vec_duplicate, else return NULL.  */
-static rtx
+rtx
 ix86_broadcast_from_constant (machine_mode mode, rtx op)
 {
   int nunits = GET_MODE_NUNITS (mode);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..90712769200 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -107,6 +107,7 @@ extern void ix86_expand_clear (rtx);
 extern void ix86_expand_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move_misalign (machine_mode, rtx[]);
+extern rtx ix86_broadcast_from_constant (machine_mode, rtx);
 extern rtx ix86_fixup_binary_operands (enum rtx_code, machine_mode,
   rtx[], bool = false);
 extern void ix86_fixup_binary_operands_no_copy (enum rtx_code, machine_mode,
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b4838b7939e..fdd9343e47a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22197,6 +22197,19 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
   return true;
 
 case MEM:
+      /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast
+	 or variants in ix86_vector_duplicate_simode_const.  */
+
+  if (GET_MODE_SIZE (mode) >= 16
+ && VECTOR_MODE_P (mode)
+ && SYMBOL_REF_P (XEXP (x, 0))
+ && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))
+ && ix86_broadcast_from_constant (mode, x))
+   {
+ *total = COSTS_N_INSNS (2) + speed;
+ return true;
+   }
+
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
diff --git a/gcc/testsuite/gcc.target/i386/pr114428.c b/gcc/testsuite/gcc.target/i386/pr114428.c
new file mode 100644
index 000..bbbc5a080f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -mno-avx512f -O2" } */
+/* { dg-final { scan-assembler-not "vpsra[dw]" } } */
+
+void
+foo2 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 32; i++)
+a[i] = b[i] >> (short)8;
+}
+
+void
+foo3 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 16; i++)
+a[i] = b[i] >> (short)8;
+}
+
-- 
2.31.1



[PATCH] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

2024-05-21 Thread liuhongt
According to the IEEE standard, for conversions from floating point to
integer: when a NaN or infinite operand cannot be represented in the
destination format and this cannot otherwise be indicated, the invalid
operation exception shall be signaled. When a numeric operand would
convert to an integer outside the range of the destination format, the
invalid operation exception shall be signaled if this situation cannot
otherwise be indicated.

The patch prevents simplification of the conversion from floating point
to integer for NaN/Inf/out-of-range constants when flag_trapping_math.
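
A runtime illustration of the exception being preserved (my own sketch,
not part of the patch; on glibc link with -lm):

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  volatile float f = __builtin_inff ();
  feclearexcept (FE_INVALID);
  volatile int i = (int) f;	/* cvttss2si raises FE_INVALID */
  (void) i;
  printf ("FE_INVALID raised: %d\n", fetestexcept (FE_INVALID) != 0);
  return 0;
}

Folding the conversion to a constant at compile time would silently drop
that exception, which is why the simplification is now gated on
!flag_trapping_math.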

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

PR rtl-optimization/100927
PR rtl-optimization/115161
PR rtl-optimization/115115
* simplify-rtx.cc (simplify_const_unary_operation): Prevent
simplication of FIX/UNSIGNED_FIX for NAN/INF/out-of-range
constant when flag_trapping_math.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr100927.c: New test.
---
 gcc/simplify-rtx.cc  | 23 
 gcc/testsuite/gcc.target/i386/pr100927.c | 27 
 2 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100927.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 53f54d1d392..b7a770dad60 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2256,14 +2256,25 @@ simplify_const_unary_operation (enum rtx_code code, machine_mode mode,
   switch (code)
{
case FIX:
+	  /* According to IEEE standard, for conversions from floating point to
+	     integer: when a NaN or infinite operand cannot be represented in
+	     the destination format and this cannot otherwise be indicated, the
+	     invalid operation exception shall be signaled.  When a numeric
+	     operand would convert to an integer outside the range of the
+	     destination format, the invalid operation exception shall be
+	     signaled if this situation cannot otherwise be indicated.  */
  if (REAL_VALUE_ISNAN (*x))
-   return const0_rtx;
+   return flag_trapping_math ? NULL_RTX : const0_rtx;
+
+ if (REAL_VALUE_ISINF (*x) && flag_trapping_math)
+   return NULL_RTX;
 
  /* Test against the signed upper bound.  */
  wmax = wi::max_value (width, SIGNED);
  real_from_integer (&t, VOIDmode, wmax, SIGNED);
  if (real_less (&t, x))
-   return immed_wide_int_const (wmax, mode);
+   return (flag_trapping_math
+   ? NULL_RTX : immed_wide_int_const (wmax, mode));
 
  /* Test against the signed lower bound.  */
  wmin = wi::min_value (width, SIGNED);
@@ -2276,13 +2287,17 @@ simplify_const_unary_operation (enum rtx_code code, machine_mode mode,
 
case UNSIGNED_FIX:
  if (REAL_VALUE_ISNAN (*x) || REAL_VALUE_NEGATIVE (*x))
-   return const0_rtx;
+   return flag_trapping_math ? NULL_RTX : const0_rtx;
+
+ if (REAL_VALUE_ISINF (*x) && flag_trapping_math)
+   return NULL_RTX;
 
  /* Test against the unsigned upper bound.  */
  wmax = wi::max_value (width, UNSIGNED);
  real_from_integer (&t, VOIDmode, wmax, UNSIGNED);
  if (real_less (&t, x))
-   return immed_wide_int_const (wmax, mode);
+   return (flag_trapping_math
+   ? NULL_RTX : immed_wide_int_const (wmax, mode));
 
  return immed_wide_int_const (real_to_integer (x, &fail, width),
   mode);
diff --git a/gcc/testsuite/gcc.target/i386/pr100927.c b/gcc/testsuite/gcc.target/i386/pr100927.c
new file mode 100644
index 000..b137396c30f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100927.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -ftrapping-math" } */
+/* { dg-final { scan-assembler-times "cvttps2dq" 3 } }  */
+
+#include <emmintrin.h>
+
+__m128i foo_ofr() {
+  const __m128i iv = _mm_set_epi32(0x4f000000, 0x4f000000, 0x4f000000, 0x4f000000);
+  const __m128  fv = _mm_castsi128_ps(iv);
+  const __m128i riv = _mm_cvttps_epi32(fv);
+  return riv;
+}
+
+__m128i foo_nan() {
+  const __m128i iv = _mm_set_epi32(0xff810000, 0xff810000, 0xff810000, 0xff810000);
+  const __m128  fv = _mm_castsi128_ps(iv);
+  const __m128i riv = _mm_cvttps_epi32(fv);
+  return riv;
+}
+
+__m128i foo_inf() {
+  const __m128i iv = _mm_set_epi32(0xff800000, 0xff800000, 0xff800000, 0xff800000);
+  const __m128  fv = _mm_castsi128_ps(iv);
+  const __m128i riv = _mm_cvttps_epi32(fv);
+  return riv;
+}
+
-- 
2.31.1



[V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli.

2024-05-21 Thread liuhongt
>> Hard to find a default value satisfying all testcases.
>> some require loop unroll with 7 insns increment, some don't want loop
>> unroll w/ 5 insn increment.
>> The original 2/3 reduction happened to meet all those testcases(or the
>> testcases are constructed based on the old 2/3).
>> Can we define the parameter as the size of the loop, below the size we
>> still do the reduction, so the small loop can be unrolled?

>Yeah, that's also a sensible possibility.  Does it work to have a parameter
>for the unrolled body size?  Thus, amend the existing
>--param max-completely-peeled-insns with a --param
>max-completely-peeled-insns-nogrowth?

Update V2:
It's still hard to find a default value for the loop body size, so I moved the
2 / 3 reduction from estimated_unrolled_size to try_unroll_loop_completely.
For the check of body size shrink, the 2 / 3 reduction is added, so small loops
can still be unrolled.
For the check of the comparison between body size and
param_max_completely_peeled_insns, 2 / 3 is conditionally added for
loop->inner || !cunrolli.
The patch then avoids gcc testsuite regressions, and also prevents big inner
loops from being completely unrolled at cunrolli.

--

For the innermost loop, after complete loop unrolling, it will most likely
not be able to reduce the body size to 2/3. The current 2/3 reduction
will make some of the larger loops completely unrolled during
cunrolli, which will then result in them not being able to be
vectorized. It also increases the register pressure. The patch moves
the 2/3 reduction from estimated_unrolled_size to
try_unroll_loop_completely.
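
To make the heuristic concrete with hypothetical numbers (mine, not from
the patch): a 16-iteration innermost loop with an 18-insn body estimates
to 16 * 18 = 288 insns when fully unrolled. With the unconditional 2 / 3
reduction the estimate becomes 192, which slips under the default
--param max-completely-peeled-insns limit (assuming the usual default of
200) and the loop is unrolled at cunrolli; without the reduction the 288
estimate exceeds the limit and the loop is kept for the vectorizer.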

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/112325
* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
2 / 3 loop body size reduction to ..
(try_unroll_loop_completely): .. here, add it for the check of
body size shrink, and the check of comparison against
param_max_completely_peeled_insns when
(!cunrolli || loop->inner).
(canonicalize_loop_induction_variables): Add new parameter
cunrolli and pass down.
(tree_unroll_loops_completely_1): Ditto.
(tree_unroll_loops_completely): Ditto.
(canonicalize_induction_variables): Handle new parameter.
(pass_complete_unrolli::execute): Ditto.
(pass_complete_unroll::execute): Ditto.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/pr112325.c: New test.
* gcc.dg/vect/pr69783.c: Add extra option --param
max-completely-peeled-insns=300.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 
 gcc/testsuite/gcc.dg/vect/pr69783.c  |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc | 45 ++-
 3 files changed, 83 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
new file mode 100644
index 000..14208b3e7f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+unsigned short s;
+__builtin_memcpy(&s, &f, sizeof(unsigned short));
+return table_f32_f16[s];
+}
+
+typedef struct {
+ggml_fp16_t d;
+ggml_fp16_t m;
+unsigned char qh[4];
+unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+float d;
+float s;
+char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
restrict vx, const void * restrict vy) {
+const int qk = 32;
+const int nb = n / qk;
+
+const block_q5_1 * restrict x = vx;
+const block_q8_1 * restrict y = vy;
+
+float sumf = 0.0;
+
+for (int i = 0; i < nb; i++) {
+unsigned qh;
+__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+int sumi = 0;
+
+for (int j = 0; j < qk/2; ++j) {
+const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+}
+
+sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+}
+
+*s = sumf;
+}
+
+/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c b/gcc/testsuite/gcc.dg/vect/pr69783.c
index 5df95d0ce4e..a1f75514d72 100644
--- a/gcc/testsuite/gcc.dg/vect/pr69783.c
+++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective

[V3 PATCH] Don't reduce estimated unrolled size for innermost loop.

2024-05-24 Thread liuhongt
Update in V3:
> Since this was about vectorization can you instead add a testcase to
> gcc.dg/vect/ and check for
> vectorization to happen?
Move to vect/pr112325.c.
>
> I believe the if (unr_insn <= 0) check can go as well.
Removed.

> as said, you want to do
>
>   curolli = false;
>
> after the above since we are iterating and for a subsequent unrolling
> of an outer loop
> of an unrolled inner loop we _do_ want to apply the 2/3 reduction
> since there's likely
> inter-loop redundancies exposed (as happens in SPEC calculix for example).
>
> Not sure if that changes any of the testsuite outcome - it possibly avoids the
> gcc.dg/vect/pr69783.c FAIL?
Yes, it avoids that, cunrolli is set to false when CHANGED is true.

> Not sure about the arm fallout.
It's the same reason as pr69783.c: there's subsequent unrolling of an outer loop
of an unrolled inner loop, and since the inner loop is completely unrolled,
outer_loop->inner is false and it escapes the check.
The change also fixes 2 arm fallouts.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

For the innermost loop, after complete loop unrolling, it will most likely
not be able to reduce the body size to 2/3. The current 2/3 reduction
will make some of the larger loops completely unrolled during
cunrolli, which will then result in them not being able to be
vectorized. It also increases the register pressure.

The patch moves the 2/3 reduction from estimated_unrolled_size to
tree_unroll_loops_completely.

gcc/ChangeLog:

PR tree-optimization/112325
* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
2 / 3 loop body size reduction to ..
(try_unroll_loop_completely): .. here, add it for the check of
body size shrink, and the check of comparison against
param_max_completely_peeled_insns when
(!cunrolli || loop->inner).
(canonicalize_loop_induction_variables): Add new parameter
cunrolli and pass down.
(tree_unroll_loops_completely_1): Ditto.
(canonicalize_induction_variables): Pass cunrolli as false to
canonicalize_loop_induction_variables.
(tree_unroll_loops_completely): Set cunrolli to true at
beginning and set it to false after CHANGED is true.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr112325.c: New test.
---
 gcc/testsuite/gcc.dg/vect/pr112325.c | 59 
 gcc/tree-ssa-loop-ivcanon.cc | 46 +++---
 2 files changed, 83 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr112325.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c b/gcc/testsuite/gcc.dg/vect/pr112325.c
new file mode 100644
index 000..71cf4099253
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+unsigned short s;
+__builtin_memcpy(&s, &f, sizeof(unsigned short));
+return table_f32_f16[s];
+}
+
+typedef struct {
+ggml_fp16_t d;
+ggml_fp16_t m;
+unsigned char qh[4];
+unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+float d;
+float s;
+char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
restrict vx, const void * restrict vy) {
+const int qk = 32;
+const int nb = n / qk;
+
+const block_q5_1 * restrict x = vx;
+const block_q8_1 * restrict y = vy;
+
+float sumf = 0.0;
+
+for (int i = 0; i < nb; i++) {
+unsigned qh;
+__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+int sumi = 0;
+
+for (int j = 0; j < qk/2; ++j) {
+const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+}
+
+sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+}
+
+*s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..216e81ef15f 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
It is (NUNROLL + 1) * size of loop body with taking into account
the fact that in last copy everything after exit conditional
is dead and that some instructions will be eliminated after
-   peeling.
-
-   L

[PATCH] Fix typo in the testcase.

2024-05-24 Thread liuhongt
Committed as an obvious patch.

gcc/testsuite/ChangeLog:

PR target/114148
* gcc.target/i386/pr106010-7b.c: Refine testcase.
---
 gcc/testsuite/gcc.target/i386/pr106010-7b.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
index 26482cc10f5..917e56e45f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr106010-7b.c
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
@@ -34,11 +34,11 @@ avx_test (void)
 p_init[i] = i % 2 + 3;
 
   memcpy (pd_src, p_init, 2 * N * sizeof (double));
-  memcpy (ps_dst, p_init, 2 * N * sizeof (float));
-  memcpy (epi64_dst, p_init, 2 * N * sizeof (long long));
-  memcpy (epi32_dst, p_init, 2 * N * sizeof (int));
-  memcpy (epi16_dst, p_init, 2 * N * sizeof (short));
-  memcpy (epi8_dst, p_init, 2 * N * sizeof (char));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
 
   foo_pd (pd_dst, pd_src[0]);
   foo_ps (ps_dst, ps_src[0]);
-- 
2.31.1



[PATCH] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

2024-05-26 Thread liuhongt
Update in V2:
Guard constant folding for overflow values in
fold_convert_const_int_from_real with flag_trapping_math.
Add -fno-trapping-math to related testcases which warn for overflow
in the conversion from floating point to integer.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

According to the IEEE standard, for conversions from floating point to
integer: when a NaN or infinite operand cannot be represented in the
destination format and this cannot otherwise be indicated, the invalid
operation exception shall be signaled. When a numeric operand would
convert to an integer outside the range of the destination format, the
invalid operation exception shall be signaled if this situation cannot
otherwise be indicated.

The patch prevents simplification of the conversion from floating point
to integer for NaN/Inf/out-of-range constants when flag_trapping_math.

gcc/ChangeLog:

PR rtl-optimization/100927
PR rtl-optimization/115161
PR rtl-optimization/115115
* simplify-rtx.cc (simplify_const_unary_operation): Prevent
simplication of FIX/UNSIGNED_FIX for NAN/INF/out-of-range
constant when flag_trapping_math.
* fold-const.cc (fold_convert_const_int_from_real): Don't fold
for overflow values when flag_trapping_math.

gcc/testsuite/ChangeLog:

* gcc.dg/pr100927.c: New test.
* c-c++-common/Wconversion-1.c: Add -fno-trapping-math.
* c-c++-common/dfp/convert-int-saturate.c: Ditto.
* g++.dg/ubsan/pr63956.C: Ditto.
* g++.dg/warn/Wconversion-real-integer.C: Ditto.
* gcc.c-torture/execute/20031003-1.c: Ditto.
* gcc.dg/Wconversion-complex-c99.c: Ditto.
* gcc.dg/Wconversion-real-integer.c: Ditto.
* gcc.dg/c90-const-expr-11.c: Ditto.
* gcc.dg/overflow-warn-8.c: Ditto.
---
 gcc/fold-const.cc | 13 +++-
 gcc/simplify-rtx.cc   | 23 +++---
 gcc/testsuite/c-c++-common/Wconversion-1.c|  2 +-
 .../c-c++-common/dfp/convert-int-saturate.c   |  1 +
 gcc/testsuite/g++.dg/ubsan/pr63956.C  |  7 -
 .../g++.dg/warn/Wconversion-real-integer.C|  2 +-
 .../gcc.c-torture/execute/20031003-1.c|  2 ++
 .../gcc.dg/Wconversion-complex-c99.c  |  2 +-
 .../gcc.dg/Wconversion-real-integer.c |  2 +-
 gcc/testsuite/gcc.dg/c90-const-expr-11.c  |  2 +-
 gcc/testsuite/gcc.dg/overflow-warn-8.c|  1 +
 gcc/testsuite/gcc.dg/pr100927.c   | 31 +++
 12 files changed, 77 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr100927.c

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 7b268964acc..0ba01984630 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2246,7 +2246,18 @@ fold_convert_const_int_from_real (enum tree_code code, tree type, const_tree arg
   if (! overflow)
 val = real_to_integer (&r, &overflow, TYPE_PRECISION (type));
 
-  t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  /* According to IEEE standard, for conversions from floating point to
+     integer: when a NaN or infinite operand cannot be represented in the
+     destination format and this cannot otherwise be indicated, the invalid
+     operation exception shall be signaled.  When a numeric operand would
+     convert to an integer outside the range of the destination format, the
+     invalid operation exception shall be signaled if this situation cannot
+     otherwise be indicated.  */
+  if (!flag_trapping_math || !overflow)
+t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  else
+t = NULL_TREE;
+
   return t;
 }
 
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 53f54d1d392..b7a770dad60 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2256,14 +2256,25 @@ simplify_const_unary_operation (enum rtx_code code, machine_mode mode,
   switch (code)
{
case FIX:
+	  /* According to IEEE standard, for conversions from floating point to
+	     integer: when a NaN or infinite operand cannot be represented in
+	     the destination format and this cannot otherwise be indicated, the
+	     invalid operation exception shall be signaled.  When a numeric
+	     operand would convert to an integer outside the range of the
+	     destination format, the invalid operation exception shall be
+	     signaled if this situation cannot otherwise be indicated.  */
  if (REAL_VALUE_ISNAN (*x))
-   return const0_rtx;
+   return flag_trapping_math ? NULL_RTX : const0_rtx;
+
+ if (REAL_VALUE_ISINF (*x) && flag_trapping_math)
+   return NULL_RTX;
 
  /* Test against the signed upper bound.  */
  wmax = wi::max_value (width, SIGNED);
  real_from_integer (&t, VOIDmode, wmax, SIGNED);
  if (real_less (&t, x))
-   return immed_wide_int_const (wmax, mode);
+	    return (flag_trapping_math
+		    ? NULL_RTX : immed_wide_int_const (wmax, mode));

[PATCH] Reduce cost of MEM (A + imm).

2024-05-27 Thread liuhongt
For MEM, rtx_cost iterates each subrtx and adds up the costs,
so for MEM (reg) and MEM (reg + 4), the former costs 5 and
the latter costs 9, which is not accurate for x86. Ideally
address_cost should be used, but it reduces the cost too much.
So the current solution is to make a constant disp as cheap as possible.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/67325
* config/i386/i386.cc (ix86_rtx_costs): Reduce cost of MEM (A
+ imm) to "cost of MEM (A)" + 1.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr67325.c: New test.
---
 gcc/config/i386/i386.cc | 19 ++-
 gcc/testsuite/gcc.target/i386/pr67325.c |  7 +++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr67325.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3e2a3a194f1..3936223bd20 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22194,7 +22194,24 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
-*total += 1;
+   {
+ *total += 1;
+ rtx addr = XEXP (x, 0);
+	 /* For MEM, rtx_cost iterates each subrtx and adds up the costs,
+	    so for MEM (reg) and MEM (reg + 4), the former costs 5 and the
+	    latter costs 9, which is not accurate for x86.  Ideally
+	    address_cost should be used, but it reduces the cost too much.
+	    So the current solution is to make a constant disp as cheap
+	    as possible.  */
+ if (GET_CODE (addr) == PLUS
+ && CONST_INT_P (XEXP (addr, 1))
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+   {
+ *total += 1;
+ *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
+ return true;
+   }
+   }
+
   return false;
 
 case ZERO_EXTRACT:
diff --git a/gcc/testsuite/gcc.target/i386/pr67325.c b/gcc/testsuite/gcc.target/i386/pr67325.c
new file mode 100644
index 000..c3c1e4c5b4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr67325.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "(?:sar|shr)" } } */
+
+int f(long*l){
+  return *l>>32;
+}
-- 
2.31.1



[PATCH][committed] [avx512] Fix predicate mismatch between vfcmaddcph's define_insn and define_expand.

2024-05-27 Thread liuhongt
When I applied Roger's patch [1], there's ICE due to it.
The patch fix the latent bug.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651365.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Pushed to trunk.

gcc/ChangeLog:

* config/i386/sse.md
(___mask): Align
operands' predicate with corresponding expander.
(__):
Ditto.
---
 gcc/config/i386/sse.md | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b59c988fc31..0f4fbcb2c5d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6867,9 +6867,9 @@ (define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=&v")
	(vec_merge:VHF_AVX512VL
	  (unspec:VHF_AVX512VL
-	    [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
-	     (match_operand:VHF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")
-	     (match_operand:VHF_AVX512VL 3 "register_operand" "0")]
+	    [(match_operand:VHF_AVX512VL 1 "<round_nimm_predicate>" "v")
+	     (match_operand:VHF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>")
+	     (match_operand:VHF_AVX512VL 3 "<round_nimm_predicate>" "0")]
	    UNSPEC_COMPLEX_F_C_MA)
	  (match_dup 1)
	  (unspec:<avx512fmaskmode>
@@ -6892,8 +6892,8 @@ (define_expand "cmul<conj_op><mode>3"
 (define_insn "<avx512>_<complexopname>_<mode><round_name>"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=&v")
	 (unspec:VHF_AVX512VL
-	   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
-	    (match_operand:VHF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")]
+	   [(match_operand:VHF_AVX512VL 1 "<round_nimm_predicate>" "v")
+	    (match_operand:VHF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>")]
	    UNSPEC_COMPLEX_F_C_MUL))]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
 {
-- 
2.31.1



[PATCH V2] Reduce cost of MEM (A + imm).

2024-05-28 Thread liuhongt
> IMO, there is no need for CONST_INT_P condition, we should also allow
> symbol_ref, label_ref and const (all allowed by
> x86_64_immediate_operand predicate), these all decay to an immediate
> value.

Changed.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

For MEM, rtx_cost iterates each subrtx and adds up the costs,
so for MEM (reg) and MEM (reg + 4), the former costs 5 and
the latter costs 9, which is not accurate for x86. Ideally
address_cost should be used, but it reduces the cost too much.
So the current solution is to make a constant disp as cheap as possible.

gcc/ChangeLog:

PR target/67325
* config/i386/i386.cc (ix86_rtx_costs): Reduce cost of MEM (A
+ imm) to "cost of MEM (A)" + 1.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr67325.c: New test.
---
 gcc/config/i386/i386.cc | 18 +-
 gcc/testsuite/gcc.target/i386/pr67325.c |  7 +++
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr67325.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3e2a3a194f1..85d87b9f778 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22194,7 +22194,23 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
-*total += 1;
+   {
+ *total += 1;
+ rtx addr = XEXP (x, 0);
+	 /* For MEM, rtx_cost iterates each subrtx and adds up the costs,
+	    so for MEM (reg) and MEM (reg + 4), the former costs 5 and the
+	    latter costs 9, which is not accurate for x86.  Ideally
+	    address_cost should be used, but it reduces the cost too much.
+	    So the current solution is to make a constant disp as cheap
+	    as possible.  */
+ if (GET_CODE (addr) == PLUS
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+   {
+ *total += 1;
+ *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
+ return true;
+   }
+   }
+
   return false;
 
 case ZERO_EXTRACT:
diff --git a/gcc/testsuite/gcc.target/i386/pr67325.c b/gcc/testsuite/gcc.target/i386/pr67325.c
new file mode 100644
index 000..c3c1e4c5b4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr67325.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "(?:sar|shr)" } } */
+
+int f(long*l){
+  return *l>>32;
+}
-- 
2.31.1



[PATCH] [x86] Support vcond_mask_qiqi and friends.

2024-05-28 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.
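
A scalar sketch of what the new expander open-codes (my own illustration;
see the expander in the diff below): vcond_mask on mask modes is a plain
bitwise select, which maps onto kand/kandn/kor when the operands live in
k registers.

unsigned char
kselect (unsigned char a, unsigned char b, unsigned char m)
{
  /* (a & m) | (b & ~m): take a's bit where the mask bit is set.  */
  return (a & m) | (b & ~m);
}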

gcc/ChangeLog:

* config/i386/sse.md (vcond_mask_<mode><mode>): New expander.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114125.c: New test.
---
 gcc/config/i386/sse.md   | 20 
 gcc/testsuite/gcc.target/i386/pr114125.c | 10 ++
 2 files changed, 30 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114125.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0f4fbcb2c5d..7cd912eeeb1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4807,6 +4807,26 @@ (define_expand "vcond_mask_<mode><sseintvecmodelower>"
   DONE;
 })
 
+(define_expand "vcond_mask_"
+  [(match_operand:SWI1248_AVX512BW 0 "register_operand")
+   (match_operand:SWI1248_AVX512BW 1 "register_operand")
+   (match_operand:SWI1248_AVX512BW 2 "register_operand")
+   (match_operand:SWI1248_AVX512BW 3 "register_operand")]
+  "TARGET_AVX512F"
+{
+  /* (operand[1] & operand[3]) | (operand[2] & ~operand[3])  */
+  rtx op1 = gen_reg_rtx (<MODE>mode);
+  rtx op2 = gen_reg_rtx (<MODE>mode);
+  rtx op3 = gen_reg_rtx (<MODE>mode);
+
+  emit_insn (gen_and<mode>3 (op1, operands[1], operands[3]));
+  emit_insn (gen_one_cmpl<mode>2 (op3, operands[3]));
+  emit_insn (gen_and<mode>3 (op2, operands[2], op3));
+  emit_insn (gen_ior<mode>3 (operands[0], op1, op2));
+
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel floating point logical operations
diff --git a/gcc/testsuite/gcc.target/i386/pr114125.c b/gcc/testsuite/gcc.target/i386/pr114125.c
new file mode 100644
index 000..e63fbffe965
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114125.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -fdump-tree-forwprop3-raw " } */
+
+typedef long vec __attribute__((vector_size(16)));
+vec f(vec x){
+  vec y = x < 10;
+  return y & (y == 0);
+}
+
+/* { dg-final { scan-tree-dump-not "_expr" "forwprop3" } } */
-- 
2.31.1



[committed] [x86] Rename double_u to __double_u to avoid polluting the namespace.

2024-05-30 Thread liuhongt
Committed as an obvious patch.
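
A minimal illustration of the clash the rename avoids (hypothetical user
code, not from the patch):

typedef struct { int x; } double_u;	/* perfectly valid user identifier */
#include <emmintrin.h>			/* previously redeclared double_u */

With the old header this failed with a conflicting-declaration error,
since plain "double_u" is not in the implementation's reserved namespace,
while "__double_u" is.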

gcc/ChangeLog:

* config/i386/emmintrin.h (__double_u): Rename from double_u.
(_mm_load_sd): Replace double_u with __double_u.
(_mm_store_sd): Ditto.
(_mm_loadh_pd): Ditto.
(_mm_loadl_pd): Ditto.
* config/i386/xmmintrin.h (__float_u): Rename from float_u.
(_mm_load_ss): Ditto.
(_mm_store_ss): Ditto.
---
 gcc/config/i386/emmintrin.h | 10 +-
 gcc/config/i386/xmmintrin.h |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index fa301103daf..356ca218fcb 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -56,7 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
 /* Unaligned version of the same types.  */
 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
-typedef double double_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef double __double_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
@@ -146,7 +146,7 @@ _mm_load1_pd (double const *__P)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_load_sd (double const *__P)
 {
-  return __extension__ (__m128d) { *(double_u *)__P, 0.0 };
+  return __extension__ (__m128d) { *(__double_u *)__P, 0.0 };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -181,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *(double_u *)__P = ((__v2df)__A)[0] ;
+  *(__double_u *)__P = ((__v2df)__A)[0] ;
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -974,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { ((__v2df)__A)[0], *(double_u*)__B };
+  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__double_u*)__B };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { *(double_u*)__B, ((__v2df)__A)[1] };
+  return __extension__ (__m128d) { *(__double_u*)__B, ((__v2df)__A)[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 87515ecb218..c90fc71331a 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -72,7 +72,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
 
 /* Unaligned version of the same type.  */
 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
-typedef float float_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef float __float_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Internal data types for implementing the intrinsics.  */
 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
@@ -910,7 +910,7 @@ _mm_set_ps1 (float __F)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_load_ss (float const *__P)
 {
-  return __extension__ (__m128) (__v4sf){ *(float_u *)__P, 0.0f, 0.0f, 0.0f };
+  return __extension__ (__m128) (__v4sf){ *(__float_u *)__P, 0.0f, 0.0f, 0.0f };
 }
 
 /* Create a vector with all four elements equal to *P.  */
@@ -966,7 +966,7 @@ _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *(float_u *)__P = ((__v4sf)__A)[0];
+  *(__float_u *)__P = ((__v4sf)__A)[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-- 
2.31.1



[PATCH] [x86] Add some preference for floating point rtl ifcvt when sse4.1 is not available

2024-06-02 Thread liuhongt
W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por) for
movdfcc/movsfcc, and the sequence could possibly fail the cost comparison.
Increasing branch cost could hurt performance for other modes, so instead
add some preference specifically for floating point ifcvt.
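
One shape this affects (a hypothetical example, not from the patch): a
floating point select that rtl ifcvt can only turn into the
three-instruction mask sequence when SSE4.1's blendv is unavailable.

double
sel (double a, double b, double c)
{
  /* Without SSE4.1 the branchless form is roughly
     cmpsd + andpd + andnpd + orpd.  */
  return c > 0.0 ? a : b;
}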

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_noce_conversion_profitable_p): Add
some preference for floating point ifcvt when SSE4.1 is not
available.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115299.c: New test.
* gcc.target/i386/pr86722.c: Adjust testcase.
---
 gcc/config/i386/i386.cc  | 17 +
 gcc/testsuite/gcc.target/i386/pr115299.c | 10 ++
 gcc/testsuite/gcc.target/i386/pr86722.c  |  2 +-
 3 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115299.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1a0206ab573..271da127a89 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24879,6 +24879,23 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
return false;
}
 }
+
+  /* W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por)
+     for movdfcc/movsfcc, and could possibly fail cost comparison.
+     Increasing branch cost will hurt performance for other modes, so
+     specially add some preference for floating point ifcvt.  */
+  if (!TARGET_SSE4_1 && if_info->x
+  && GET_MODE_CLASS (GET_MODE (if_info->x)) == MODE_FLOAT
+  && if_info->speed_p)
+{
+  unsigned cost = seq_cost (seq, true);
+
+  if (cost <= if_info->original_cost)
+   return true;
+
+  return cost <= (if_info->max_seq_cost + COSTS_N_INSNS (2));
+}
+
   return default_noce_conversion_profitable_p (seq, if_info);
 }
 
diff --git a/gcc/testsuite/gcc.target/i386/pr115299.c b/gcc/testsuite/gcc.target/i386/pr115299.c
new file mode 100644
index 000..53c5899136a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115299.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-sse4.1 -msse2" } */
+
+void f(double*d,double*e){
+  for(;d
