from:"liuhongt via Gcc\-patches"

[PATCH] Don't try bswap + rotate when TYPE_PRECISION(n->type) > n->range.

2023-06-01 Thread liuhongt via Gcc-patches

For the testcase in the PR, we have

  br64 = br;
  br64 = ((br64 << 16) & 0x00ffull) | (br64 & 0xff00ull);

  n->n: 0x300200.
  n->range: 32.
  n->type: uint64.

The original code assumes n->range is the same as
TYPE_PRECISION (n->type), and tries to rotate the mask from
0x30200 -> 0x20300, which is incorrect. The patch fixes this bug by not
trying bswap + rotate when TYPE_PRECISION (n->type) is not equal to
n->range.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/110067
* gimple-ssa-store-merging.cc (find_bswap_or_nop): Don't try
bswap + rotate when TYPE_PRECISION(n->type) > n->range.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110067.c: New test.
---
 gcc/gimple-ssa-store-merging.cc  |  3 +
 gcc/testsuite/gcc.target/i386/pr110067.c | 77 
 2 files changed, 80 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110067.c

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index 9cb574fa315..401496a9231 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -1029,6 +1029,9 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number 
*n, bool *bswap,
   /* TODO, handle cast64_to_32 and big/litte_endian memory
 source when rsize < range.  */
   if (n->range == orig_range
+ /* There're case like 0x30200 for uint32->uint64 cast,
+Don't hanlde this.  */
+ && n->range == TYPE_PRECISION (n->type)
  && ((orig_range == 32
   && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing)
  || (orig_range == 64
diff --git a/gcc/testsuite/gcc.target/i386/pr110067.c 
b/gcc/testsuite/gcc.target/i386/pr110067.c
new file mode 100644
index 000..c4208811628
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110067.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-strict-aliasing" } */
+
+#include 
+#define force_inline __inline__ __attribute__ ((__always_inline__))
+
+__attribute__((noipa))
+static void
+fetch_pixel_no_alpha_32_bug (void *out)
+{
+  uint32_t *ret = out;
+  *ret = 0xff499baf;
+}
+
+static force_inline uint32_t
+bilinear_interpolation_local (uint32_t tl, uint32_t tr,
+ uint32_t bl, uint32_t br,
+ int distx, int disty)
+{
+  uint64_t distxy, distxiy, distixy, distixiy;
+  uint64_t tl64, tr64, bl64, br64;
+  uint64_t f, r;
+
+  distx <<= 1;
+  disty <<= 1;
+
+  distxy = distx * disty;
+  distxiy = distx * (256 - disty);
+  distixy = (256 - distx) * disty;
+  distixiy = (256 - distx) * (256 - disty);
+
+  /* Alpha and Blue */
+  tl64 = tl & 0xffff;
+  tr64 = tr & 0xffff;
+  bl64 = bl & 0xffff;
+  br64 = br & 0xffff;
+
+  f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+  r = f & 0xffffull;
+
+  /* Red and Green */
+  tl64 = tl;
+  tl64 = ((tl64 << 16) & 0x00ffull) | (tl64 & 0xff00ull);
+
+  tr64 = tr;
+  tr64 = ((tr64 << 16) & 0x00ffull) | (tr64 & 0xff00ull);
+
+  bl64 = bl;
+  bl64 = ((bl64 << 16) & 0x00ffull) | (bl64 & 0xff00ull);
+
+  br64 = br;
+  br64 = ((br64 << 16) & 0x00ffull) | (br64 & 0xff00ull);
+
+  f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+  r |= ((f >> 16) & 0x00ffull) | (f & 0xff00ull);
+
+  return (uint32_t)(r >> 16);
+}
+
+__attribute__((noipa))
+static void
+bits_image_fetch_pixel_bilinear_32_bug (void *out)
+{
+  uint32_t br;
+  uint32_t *ret = out;
+
+  fetch_pixel_no_alpha_32_bug (&br);
+  *ret = bilinear_interpolation_local (0, 0, 0, br, 0x41, 0x42);
+}
+
+int main() {
+  uint32_t r;
+  bits_image_fetch_pixel_bilinear_32_bug (&r);
+  if (r != 0x4213282d)
+__builtin_abort ();
+  return 0;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] i386: Add missing vector truncate patterns [PR92658].

2023-06-01 Thread liuhongt via Gcc-patches

Add missing insn patterns for v2si -> v2hi/v2qi and v2hi-> v2qi vector
truncate.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/92658
* config/i386/mmx.md (truncv2hiv2qi2): New define_insn.
(truncv2si2): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr92658-avx512bw-trunc-2.c: New test.
---
 gcc/config/i386/mmx.md| 21 +++
 .../i386/pr92658-avx512bw-trunc-2.c   | 27 +++
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92658-avx512bw-trunc-2.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index dbcb850ffde..bb45098f797 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -3667,6 +3667,27 @@ (define_expand "v2qiv2hi2"
   DONE;
 })
 
+(define_insn "truncv2hiv2qi2"
+  [(set (match_operand:V2QI 0 "register_operand" "=v")
+   (truncate:V2QI
+ (match_operand:V2HI 1 "register_operand" "v")))]
+  "TARGET_AVX512VL && TARGET_AVX512BW"
+  "vpmovwb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
+(define_mode_iterator V2QI_V2HI [V2QI V2HI])
+(define_insn "truncv2si2"
+  [(set (match_operand:V2QI_V2HI 0 "register_operand" "=v")
+   (truncate:V2QI_V2HI
+ (match_operand:V2SI 1 "register_operand" "v")))]
+  "TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+  "vpmovd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
 ;; Pack/unpack vector modes
 (define_mode_attr mmxpackmode
   [(V4HI "V8QI") (V2SI "V4HI")])
diff --git a/gcc/testsuite/gcc.target/i386/pr92658-avx512bw-trunc-2.c 
b/gcc/testsuite/gcc.target/i386/pr92658-avx512bw-trunc-2.c
new file mode 100644
index 000..2f5b7dc5668
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92658-avx512bw-trunc-2.c
@@ -0,0 +1,27 @@
+/* PR target/92658 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vpmovwb" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 1 { target { ! ia32 } } } } */
+
+void
+foo (int* __restrict a, char* b)
+{
+b[0] = a[0];
+b[1] = a[1];
+}
+
+void
+foo2 (short* __restrict a, char* b)
+{
+b[0] = a[0];
+b[1] = a[1];
+}
+
+void
+foo3 (int* __restrict a, short* b)
+{
+b[0] = a[0];
+b[1] = a[1];
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [vect] Use intermediate integer type for float_expr/fix_trunc_expr when direct optab does not exist.

2023-06-01 Thread liuhongt via Gcc-patches

We already use an intermediate type in the WIDEN case, but not for NONE;
this patch extends that to NONE.

I didn't do this in pattern recognition, since we need to know whether the
stmt belongs to any slp_node to decide the vectype, and the related optabs
are checked according to vectype_in and vectype_out. For the non-SLP case,
vec_pack/unpack are always used when the lhs has a different size from the
rhs; for the SLP case, sometimes vec_pack/unpack is used and sometimes a
direct conversion is used.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110018
* tree-vect-stmts.cc (vectorizable_conversion): Use
intermediate integer type for float_expr/fix_trunc_expr when
direct optab does not exist.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110018-1.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr110018-1.c | 94 ++
 gcc/tree-vect-stmts.cc | 56 -
 2 files changed, 149 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110018-1.c

diff --git a/gcc/testsuite/gcc.target/i386/pr110018-1.c 
b/gcc/testsuite/gcc.target/i386/pr110018-1.c
new file mode 100644
index 000..b1baffd7af1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110018-1.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq" } */
+/* { dg-final { scan-assembler-times {(?n)vcvttp[dsh]2[dqw]} 5 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvt[dqw]*2p[dsh]} 5 } } */
+
+void
+foo (double* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo1 (float* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo2 (_Float16* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+foo3 (double* __restrict a, short* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo4 (float* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo5 (double* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo6 (float* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo7 (_Float16* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+foo8 (double* __restrict b, short* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo9 (float* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index bd3b07a3aa1..1118c89686d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5162,6 +5162,49 @@ vectorizable_conversion (vec_info *vinfo,
return false;
   if (supportable_convert_operation (code, vectype_out, vectype_in, 
&code1))
break;
+  if ((code == FLOAT_EXPR
+  && GET_MODE_SIZE (lhs_mode) > GET_MODE_SIZE (rhs_mode))
+ || (code == FIX_TRUNC_EXPR
+ && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)))
+   {
+ bool float_expr_p = code == FLOAT_EXPR;
+ scalar_mode imode = float_expr_p ? rhs_mode : lhs_mode;
+ fltsz = GET_MODE_SIZE (float_expr_p ? lhs_mode : rhs_mode);
+ code1 = float_expr_p ? code : NOP_EXPR;
+ codecvt1 = float_expr_p ? NOP_EXPR : code;
+ FOR_EACH_2XWIDER_MODE (rhs_mode_iter, imode)
+   {
+ imode = rhs_mode_iter.require ();
+ if (GET_MODE_SIZE (imode) > fltsz)
+   break;
+
+ cvt_type
+   = build_nonstandard_integer_type (GET_MODE_BITSIZE (imode),
+ 0);
+ cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type,
+ slp_node);
+ /* This should only happened for SLP as long as loop vectorizer
+only supports same-sized vector.  */
+ if (cvt_type == NULL_TREE
+ || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in)
+ || !supportable_convert_operation (code1, vectype_out,
+cvt_type, &code1)
+ || !supportable_convert_operation (codecvt1, cvt_type,
+vectype_in, &codecvt1))
+   continue;
+
+ found_mode = true;
+ break;
+   }
+
+ if (found_mode)
+   {
+ multi_step_cvt++;
+ interm_types.safe_push (cvt_type);
+ cvt_type = NULL_TREE;
+ break;
+   }
+   }
   /* FALLTHRU */
 unsupported:
   if (dump_enabled_p ())
@@ -5381,7 +5424,18 @@ vectorizable_conversion (vec_info *v

[PATCH] [x86] Add missing vec_pack/unpacks patterns for _Float16 <-> int/float conversion.

2023-06-04 Thread liuhongt via Gcc-patches

This patch only supports vec_pack/unpacks optabs for vector modes whose
length is >= 128 bits.
For 32/64-bit vectors, they are more often handled by the BB vectorizer with
truncmn2/extendmn2/fix{,uns}_truncmn2.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (vec_pack_float_): New expander.
(vec_unpack_fix_trunc_lo_): Ditto.
(vec_unpack_fix_trunc_hi_): Ditto.
(vec_unpacks_lo_): Ditto.
(vec_unpacks_hi_): Ditto.
(sse_movlhps_): New define_insn.
(ssse3_palignr_perm): Extend to V_128H.
(V_128H): New mode iterator.
(ssepackPHmode): New mode attribute.
(vunpck_extract_mode): Ditto.
(vpckfloat_concat_mode): Extend to VxSI/VxSF for _Float16.
(vpckfloat_temp_mode): Ditto.
(vpckfloat_op_mode): Ditto.
(vunpckfixt_mode): Extend to VxHF.
(vunpckfixt_model): Ditto.
(vunpckfixt_extract_mode): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vec_pack_fp16-1.c: New test.
* gcc.target/i386/vec_pack_fp16-2.c: New test.
* gcc.target/i386/vec_pack_fp16-3.c: New test.
---
 gcc/config/i386/sse.md| 216 +-
 .../gcc.target/i386/vec_pack_fp16-1.c |  34 +++
 .../gcc.target/i386/vec_pack_fp16-2.c |   9 +
 .../gcc.target/i386/vec_pack_fp16-3.c |   8 +
 4 files changed, 258 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vec_pack_fp16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vec_pack_fp16-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vec_pack_fp16-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a92f50e96b5..1eb2dd077ff 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -291,6 +291,9 @@ (define_mode_iterator V
 (define_mode_iterator V_128
   [V16QI V8HI V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
 
+(define_mode_iterator V_128H
+  [V16QI V8HI V8HF V8BF V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
+
 ;; All 256bit vector modes
 (define_mode_iterator V_256
   [V32QI V16HI V8SI V4DI V8SF V4DF])
@@ -1076,6 +1079,12 @@ (define_mode_attr ssePHmodelower
(V8DI "v8hf") (V4DI "v4hf") (V2DI "v2hf")
(V8DF "v8hf") (V16SF "v16hf") (V8SF "v8hf")])
 
+
+;; Mapping of vector modes to packed vector hf modes of same sized.
+(define_mode_attr ssepackPHmode
+  [(V16SI "V32HF") (V8SI "V16HF") (V4SI "V8HF")
+   (V16SF "V32HF") (V8SF "V16HF") (V4SF "V8HF")])
+
 ;; Mapping of vector modes to packed single mode of the same size
 (define_mode_attr ssePSmode
   [(V16SI "V16SF") (V8DF "V16SF")
@@ -6918,6 +6927,61 @@ (define_mode_attr qq2phsuff
(V16SF "") (V8SF "{y}") (V4SF "{x}")
(V8DF "{z}") (V4DF "{y}") (V2DF "{x}")])
 
+(define_mode_attr vunpck_extract_mode
+  [(V32HF "v32hf") (V16HF "v16hf") (V8HF "v16hf")])
+
+(define_expand "vec_unpacks_lo_"
+  [(match_operand: 0 "register_operand")
+   (match_operand:VF_AVX512FP16VL 1 "register_operand")]
+  "TARGET_AVX512FP16"
+{
+  rtx tem = operands[1];
+  rtx (*gen) (rtx, rtx);
+  if (mode != V8HFmode)
+{
+  tem = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_lo_ (tem,
+  operands[1]));
+  gen = gen_extend2;
+}
+  else
+gen = gen_avx512fp16_float_extend_phv4sf2;
+
+  emit_insn (gen (operands[0], tem));
+  DONE;
+})
+
+(define_expand "vec_unpacks_hi_"
+  [(match_operand: 0 "register_operand")
+   (match_operand:VF_AVX512FP16VL 1 "register_operand")]
+  "TARGET_AVX512FP16"
+{
+  rtx tem = operands[1];
+  rtx (*gen) (rtx, rtx);
+  if (mode != V8HFmode)
+{
+  tem = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_hi_ (tem,
+  operands[1]));
+  gen = gen_extend2;
+}
+  else
+{
+  tem = gen_reg_rtx (V8HFmode);
+  rtvec tmp = rtvec_alloc (8);
+  for (int i = 0; i != 8; i++)
+   RTVEC_ELT (tmp, i) = GEN_INT((i+4)%8);
+
+  rtx selector = gen_rtx_PARALLEL (VOIDmode, tmp);
+  emit_move_insn (tem,
+gen_rtx_VEC_SELECT (V8HFmode, operands[1], selector));
+  gen = gen_avx512fp16_float_extend_phv4sf2;
+}
+
+  emit_insn (gen (operands[0], tem));
+  DONE;
+})
+
 (define_insn 
"avx512fp16_vcvtph2_"
   [(set (match_operand:VI248_AVX512VL 0 "register_operand" "=v")
 (unspec:VI248_AVX512VL
@@ -8314,11 +8378,17 @@ (define_expand "floatv2div2sf2"
 })
 
 (define_mode_attr vpckfloat_concat_mode
-  [(V8DI "v16sf") (V4DI "v8sf") (V2DI "v8sf")])
+  [(V8DI "v16sf") (V4DI "v8sf") (V2DI "v8sf")
+   (V16SI "v32hf") (V8SI "v16hf") (V4SI "v16hf")
+   (V16SF "v32hf") (V8SF "v16hf") (V4SF "v16hf")])
 (define_mode_attr vpckfloat_temp_mode
-  [(V8DI "V8SF") (V4DI "V4SF") (V2DI "V4SF")])
+  [(V8DI "V8SF") (V4DI "V4SF") (V2DI "V4SF")
+   (V16SI "V16HF") (V8SI "V8HF") (V4SI "V8HF")
+   (V16SF "V16HF") (V8SF "V8HF") (V4SF "V8HF")])
 (define_mode_attr vpckfloat_op_mode
-  [(V8DI "v8sf"

[PATCH] Fold _mm{, 256, 512}_abs_{epi8, epi16, epi32, epi64} into gimple ABSU_EXPR + VCE.

2023-06-05 Thread liuhongt via Gcc-patches

r14-1145 folds the intrinsics into gimple ABS_EXPR, which has UB for
TYPE_MIN, but PABSB stores an unsigned result into dst. The patch
uses ABSU_EXPR + VCE (VIEW_CONVERT_EXPR) instead of ABS_EXPR.

Also don't fold _mm_abs_{pi8,pi16,pi32} w/o TARGET_64BIT since 64-bit
vector absm2 is guarded with TARGET_MMX_WITH_SSE.

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?


gcc/ChangeLog:

PR target/110108
* config/i386/i386.cc (ix86_gimple_fold_builtin): Fold
_mm{,256,512}_abs_{epi8,epi16,epi32,epi64} into gimple
ABSU_EXPR + VCE, don't fold _mm_abs_{pi8,pi16,pi32} w/o
TARGET_64BIT.
* config/i386/i386-builtin.def: Replace CODE_FOR_nothing with
real codename for __builtin_ia32_pabs{b,w,d}.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110108.c: New test.
---
 gcc/config/i386/i386-builtin.def |  6 ++--
 gcc/config/i386/i386.cc  | 44 
 gcc/testsuite/gcc.target/i386/pr110108.c | 16 +
 3 files changed, 56 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 383b68a9bb8..7ba5b6a9d11 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -900,11 +900,11 @@ BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv2df3, 
"__builtin_ia32_hsubpd"
 
 /* SSSE3 */
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb128", 
IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, 
(int) V8QI_FTYPE_V8QI)
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw128", 
IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, 
(int) V4HI_FTYPE_V4HI)
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd128", 
IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, 
(int) V2SI_FTYPE_V2SI)
 
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_phaddwv8hi3, 
"__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) 
V8HI_FTYPE_V8HI_V8HI)
 BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 
UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56ee8dd..b09b3c79e99 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18433,6 +18433,7 @@ bool
 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 {
   gimple *stmt = gsi_stmt (*gsi), *g;
+  gimple_seq stmts = NULL;
   tree fndecl = gimple_call_fndecl (stmt);
   gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD));
   int n_args = gimple_call_num_args (stmt);
@@ -18555,7 +18556,6 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  loc = gimple_location (stmt);
  tree type = TREE_TYPE (arg2);
- gimple_seq stmts = NULL;
  if (VECTOR_FLOAT_TYPE_P (type))
{
  tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
@@ -18610,7 +18610,6 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
  tree zero_vec = build_zero_cst (type);
  tree minus_one_vec = build_minus_one_cst (type);
  tree cmp_type = truth_type_for (type);
- gimple_seq stmts = NULL;
  tree cmp = gimple_build (&stmts, tcode, cmp_type, arg0, arg1);
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
  g = gimple_build_assign (gimple_call_lhs (stmt),
@@ -18904,14 +18903,18 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   break;
 
 case IX86_BUILTIN_PABSB:
+case IX86_BUILTIN_PABSW:
+case IX86_BUILTIN_PABSD:
+  /* 64-bit vector abs2 is only supported under TARGET_MMX_WITH_SSE. 
 */
+  if (!TARGET_64BIT)
+   break;
+  /* FALLTHRU.  */
 case IX86_BUILTIN_PABSB128:
 case IX86_BUILTIN_PABSB256:
 case IX86_BUILTIN_PABSB512:
-case IX86_BUILTIN_PABSW:
 case IX86_BUILTIN_PABSW128:
 case IX86_BUILTIN_PABSW256:
 case IX86_BUILTIN_PABSW512:
-case IX86_BUILTIN_PABSD:
 case IX86_BUILTIN_PABSD128:
 case

[PATCH] Don't fold _mm{, 256}_blendv_epi8 into (mask < 0 ? src1 : src2) when -funsigned-char.

2023-06-05 Thread liuhongt via Gcc-patches

With -funsigned-char, mask < 0 is always false, but vpblendvb needs to
check the most significant bit, so the fold gives the wrong result.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport to GCC12/GCC13 release branch?

gcc/ChangeLog:

PR target/110108
* config/i386/i386-builtin.def (BDESC): Replace
CODE_FOR_nothing with real code name for blendvb builtins.
* config/i386/i386.cc (ix86_gimple_fold_builtin): Don't fold
_mm{,256}_blendv_epi8 into (mask < 0 ? src1 : src2) when
-funsigned-char.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110108-2.c: New test.
---
 gcc/config/i386/i386-builtin.def   |  4 ++--
 gcc/config/i386/i386.cc|  7 +++
 gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++
 3 files changed, 23 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 7ba5b6a9d11..b4c99ff62a2 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -944,7 +944,7 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, 
"__builtin_ia32_dppd", I
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", 
IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps_v4sf, 
"__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) 
V4SF_FTYPE_V4SF_V4SF_INT)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, 
"__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI_INT)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, 
"__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI_V16QI)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendvb, 
"__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI_V16QI)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, 
"__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) 
V8HI_FTYPE_V8HI_V8HI_INT)
 
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, 
"__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) 
V8HI_FTYPE_V16QI)
@@ -1198,7 +1198,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_andv4di3, 
"__builtin_ia32_andsi256", IX
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_andnotv4di3, 
"__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) 
V4DI_FTYPE_V4DI_V4DI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv32qi3, 
"__builtin_ia32_pavgb256",  IX86_BUILTIN_PAVGB256, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv16hi3, 
"__builtin_ia32_pavgw256",  IX86_BUILTIN_PAVGW256, UNKNOWN, (int) 
V16HI_FTYPE_V16HI_V16HI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, 
"__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX2, 0,  CODE_FOR_avx2_pblendvb, 
"__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI_V32QI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendw, 
"__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) 
V16HI_FTYPE_V16HI_V16HI_INT)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqb256", 
IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqw256", 
IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b09b3c79e99..f8f6c26c8eb 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18548,6 +18548,13 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   /* FALLTHRU.  */
 case IX86_BUILTIN_PBLENDVB128:
 case IX86_BUILTIN_BLENDVPS:
+  /* Don't fold PBLENDVB when funsigned-char since mask < 0
+will always be false in the gimple level.  */
+  if ((fn_code == IX86_BUILTIN_PBLENDVB128
+  || fn_code == IX86_BUILTIN_PBLENDVB256)
+ && !flag_signed_char)
+   break;
+
   gcc_assert (n_args == 3);
   arg0 = gimple_call_arg (stmt, 0);
   arg1 = gimple_call_arg (stmt, 1);
diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c 
b/gcc/testsuite/gcc.target/i386/pr110108-2.c
new file mode 100644
index 000..2d1d2fd4991
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -funsigned-char" } */
+/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */
+
+#include 
+__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) {
+  __m128i Result = _mm_blendv_epi8(X0, X1, X2);
+  return Result;
+}
+
+__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) {
+  __m256i Result = _mm256_blendv_epi8(X0, X1, X

[PATCH v2] Explicitly view_convert_expr mask to signed type when folding pblendvb builtins.

2023-06-06 Thread liuhongt via Gcc-patches

> I think this is a better patch and will always be correct and still
> get folded at the gimple level (correctly):
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index d4ff56ee8dd..02bf5ba93a5 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -18561,8 +18561,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>   tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
> ? intSI_type_node : intDI_type_node;
>   type = get_same_sized_vectype (itype, type);
> - arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
> }
> + else
> +   type = signed_type_for (type);
> + arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
>   tree zero_vec = build_zero_cst (type);
>   tree cmp_type = truth_type_for (type);
>   tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec);
>
>

Yes, thanks.

Here's the updated patch:

With -funsigned-char, mask < 0 is always false for vector char, but
vpblendvb needs to check the most significant bit. The patch explicitly
view-converts (VCE) the mask to vector signed char.


gcc/ChangeLog:

PR target/110108
* config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly
view_convert_expr mask to signed type when folding pblendvb
builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110108-2.c: New test.
---
 gcc/config/i386/i386.cc|  4 +++-
 gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index da20c2c49de..4e594a9c88e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18561,8 +18561,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
  tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
? intSI_type_node : intDI_type_node;
  type = get_same_sized_vectype (itype, type);
- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
}
+ else
+   type = signed_type_for (type);
+ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
  tree zero_vec = build_zero_cst (type);
  tree cmp_type = truth_type_for (type);
  tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec);
diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c 
b/gcc/testsuite/gcc.target/i386/pr110108-2.c
new file mode 100644
index 000..2d1d2fd4991
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -funsigned-char" } */
+/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */
+
+#include 
+__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) {
+  __m128i Result = _mm_blendv_epi8(X0, X1, X2);
+  return Result;
+}
+
+__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) {
+  __m256i Result = _mm256_blendv_epi8(X0, X1, X2);
+  return Result;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 1/2] Fold _mm{, 256, 512}_abs_{epi8, epi16, epi32, epi64} into gimple ABSU_EXPR + VCE.

2023-06-06 Thread liuhongt via Gcc-patches

r14-1145 folds the intrinsics into gimple ABS_EXPR, which has UB for
TYPE_MIN, but PABSB stores an unsigned result into dst. The patch
uses ABSU_EXPR + VCE (VIEW_CONVERT_EXPR) instead of ABS_EXPR.

Also don't fold _mm_abs_{pi8,pi16,pi32} w/o TARGET_64BIT since 64-bit
vector absm2 is guarded with TARGET_MMX_WITH_SSE.

gcc/ChangeLog:

PR target/110108
* config/i386/i386.cc (ix86_gimple_fold_builtin): Fold
_mm{,256,512}_abs_{epi8,epi16,epi32,epi64} into gimple
ABSU_EXPR + VCE, don't fold _mm_abs_{pi8,pi16,pi32} w/o
TARGET_64BIT.
* config/i386/i386-builtin.def: Replace CODE_FOR_nothing with
real codename for __builtin_ia32_pabs{b,w,d}.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110108.c: New test.
* gcc.target/i386/pr110108-3.c: New test.
---
 gcc/config/i386/i386-builtin.def   |  6 ++---
 gcc/config/i386/i386.cc| 27 --
 gcc/testsuite/gcc.target/i386/pr109900.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr110108-3.c | 22 ++
 gcc/testsuite/gcc.target/i386/pr110108.c   | 16 +
 5 files changed, 62 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 383b68a9bb8..7ba5b6a9d11 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -900,11 +900,11 @@ BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv2df3, 
"__builtin_ia32_hsubpd"
 
 /* SSSE3 */
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb128", 
IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, 
(int) V8QI_FTYPE_V8QI)
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw128", 
IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, 
(int) V4HI_FTYPE_V4HI)
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd128", 
IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, 
(int) V2SI_FTYPE_V2SI)
 
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_phaddwv8hi3, 
"__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) 
V8HI_FTYPE_V8HI_V8HI)
 BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 
UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56ee8dd..da20c2c49de 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18433,6 +18433,7 @@ bool
 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 {
   gimple *stmt = gsi_stmt (*gsi), *g;
+  gimple_seq stmts = NULL;
   tree fndecl = gimple_call_fndecl (stmt);
   gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD));
   int n_args = gimple_call_num_args (stmt);
@@ -18555,7 +18556,6 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  loc = gimple_location (stmt);
  tree type = TREE_TYPE (arg2);
- gimple_seq stmts = NULL;
  if (VECTOR_FLOAT_TYPE_P (type))
{
  tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
@@ -18610,7 +18610,6 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
  tree zero_vec = build_zero_cst (type);
  tree minus_one_vec = build_minus_one_cst (type);
  tree cmp_type = truth_type_for (type);
- gimple_seq stmts = NULL;
  tree cmp = gimple_build (&stmts, tcode, cmp_type, arg0, arg1);
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
  g = gimple_build_assign (gimple_call_lhs (stmt),
@@ -18904,14 +18903,18 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   break;
 
 case IX86_BUILTIN_PABSB:
+case IX86_BUILTIN_PABSW:
+case IX86_BUILTIN_PABSD:
+  /* 64-bit vector abs2 is only supported under TARGET_MMX_WITH_SSE. 
 */
+  if (!TARGET_64BIT)
+   break;
+  /* FALLTHRU.  */
 case IX86_BUILTIN_PABSB128:
 case IX86_BUILTIN_PABSB256:
 case IX86_BUILTIN_PABSB512:
-case IX86_BUILTIN_PABSW:

[PATCH] [x86] Use x instead of v for alternative 2 (v, BH) in mov_internal.

2023-06-13 Thread liuhongt via Gcc-patches

Use x instead of v because there's no EVEX version of vpcmpeqd ymm, ymm, ymm.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk and backport to GCC13.

gcc/ChangeLog:

PR target/110227
* config/i386/sse.md (mov<mode>_internal): Use x instead of v
for alternative 2 since there's no evex version for vpcmpeqd
ymm, ymm, ymm.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110227.c: New test.
---
 gcc/config/i386/sse.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr110227.c | 11 +++
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110227.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9bec09d354a..370ea6418a6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1324,7 +1324,7 @@ (define_expand "mov"
 
 (define_insn "mov_internal"
   [(set (match_operand:VMOVE 0 "nonimmediate_operand"
-"=v,v ,v,v ,m")
+"=v,v ,x,v ,m")
(match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
 " C,,BH,vm,v"))]
   "TARGET_SSE
diff --git a/gcc/testsuite/gcc.target/i386/pr110227.c 
b/gcc/testsuite/gcc.target/i386/pr110227.c
new file mode 100644
index 000..9b59f5b6e49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110227.c
@@ -0,0 +1,11 @@
+/* { dg-do assemble { target { ! ia32 } } } */
+/* { dg-options " -O2 -mavx512vl" } */
+
+#include 
+
+void f()
+{
+  __m256i mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+  register __m256i reg asm("xmm16") = mask;
+  asm(""::"v"(reg));
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 1/2] Reimplement packuswb/packusdw with UNSPEC_US_TRUNCATE instead of original us_truncate.

2023-06-15 Thread liuhongt via Gcc-patches

packuswb/packusdw perform unsigned saturation on a signed source, but RTL
us_truncate means unsigned saturation on an unsigned source.
So for the value -1, packuswb produces 0, but us_truncate produces
255. The patch reimplements the related patterns and functions with
UNSPEC_US_TRUNCATE instead of us_truncate.

The patch fixes the testcases below, which started failing after
g:921b841350c4fc298d09f6c5674663e0f4208610 added constant folding for
US_TRUNCATE:

FAIL: gcc.target/i386/avx-vpackuswb-1.c execution test
FAIL: gcc.target/i386/avx2-vpackusdw-2.c execution test
FAIL: gcc.target/i386/avx2-vpackuswb-2.c execution test
FAIL: gcc.target/i386/sse2-packuswb-1.c execution test

Bootstrapped and regtested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR target/110235
* config/i386/i386-expand.cc (ix86_split_mmx_pack): Use
UNSPEC_US_TRUNCATE instead of original us_truncate for
packusdw/packuswb.
* config/i386/mmx.md (mmx_packswb): Split into the
2 new patterns below.
(mmx_packsswb): New reload_completed define_insn_and_split.
(mmx_packuswb): Ditto.
(mmx_packusdw): Use UNSPEC_US_TRUNCATE instead of original
us_truncate.
(s_trunsuffix): Removed.
(any_s_truncate): Removed.
* config/i386/sse.md (_packuswb): Use
UNSPEC_US_TRUNCATE instead of original us_truncate.
(_packusdw): Ditto.
* config/i386/i386.md (UNSPEC_US_TRUNCATE): New unspec_c_enum.
---
 gcc/config/i386/i386-expand.cc | 20 
 gcc/config/i386/i386.md|  4 
 gcc/config/i386/mmx.md | 43 ++
 gcc/config/i386/sse.md | 20 
 4 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index def060ab562..35e2740f9b6 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1019,6 +1019,7 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
   rtx op0 = operands[0];
   rtx op1 = operands[1];
   rtx op2 = operands[2];
+  rtx src;
 
   machine_mode dmode = GET_MODE (op0);
   machine_mode smode = GET_MODE (op1);
@@ -1042,11 +1043,20 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
 
-  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
-  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
-  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
-   op1, op2));
-  emit_insn (insn);
+  /* For packusdw/packuswb, it does unsigned saturation for
+ signed source which is different for rtl US_TRUNCATE.  */
+  if (code == US_TRUNCATE)
+src = gen_rtx_UNSPEC (sse_dmode,
+ gen_rtvec (2, op1, op2),
+ UNSPEC_US_TRUNCATE);
+  else
+{
+  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+  src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
+}
+
+  emit_move_insn (dest, src);
 
   ix86_move_vector_high_sse_to_mmx (op0);
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 0929115ed4d..070a84d8af9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -129,6 +129,10 @@ (define_c_enum "unspec" [
   UNSPEC_RSQRT
   UNSPEC_PSADBW
 
+  ;; US_TRUNCATE this is different from rtl us_truncate,
+  ;; it does unsigned truncation for signed source.
+  UNSPEC_US_TRUNCATE
+
   ;; For AVX/AVX512F support
   UNSPEC_SCALEF
   UNSPEC_PCMP
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 6fbe3909c8b..315eb4193c4 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -3337,27 +3337,41 @@ (define_split
 ;;
 ;
 
-;; Used in signed and unsigned truncations with saturation.
-(define_code_iterator any_s_truncate [ss_truncate us_truncate])
-;; Instruction suffix for truncations with saturation.
-(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")])
-
-(define_insn_and_split "mmx_packswb"
+(define_insn_and_split "mmx_packsswb"
   [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
(vec_concat:V8QI
- (any_s_truncate:V4QI
+ (ss_truncate:V4QI
(match_operand:V4HI 1 "register_operand" "0,0,Yw"))
- (any_s_truncate:V4QI
+ (ss_truncate:V4QI
(match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yw"]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
   "@
-   packswb\t{%2, %0|%0, %2}
+   packsswb\t{%2, %0|%0, %2}
+   #
+   #"
+  "&& reload_completed
+   && SSE_REGNO_P (REGNO (operands[0]))"
+  [(const_int 0)]
+  "ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;"
+  [(set_attr "mmx_isa" "native,sse_noavx,avx")
+   (set_attr "type" "mmxshft,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
+
+(define_ins

[PATCH 2/2] Refined 256/512-bit vpacksswb/vpackssdw patterns.

2023-06-15 Thread liuhongt via Gcc-patches

The packing in vpacksswb/vpackssdw is not a simple concat; it
interleaves src1 and src2 within each 128-bit lane of the sources
(i.e. every 64 bits of ss_truncate result).

That is:

dst[192-255] = ss_truncate (src2[128-255])
dst[128-191] = ss_truncate (src1[128-255])
dst[64-127] = ss_truncate (src2[0-127])
dst[0-63] = ss_truncate (src1[0-127])

The patch refines those patterns with an extra vec_select for the
interleave.

The patch fixes the testcase below, which started failing after
g:921b841350c4fc298d09f6c5674663e0f4208610 added constant folding for
SS_TRUNCATE:
FAIL: gcc.target/i386/avx2-vpackssdw-2.c execution test.

Bootstrapped and regtested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR target/110235
* config/i386/sse.md (_packsswb): Split
to below 3 new define_insns.
(sse2_packsswb): New define_insn.
(avx2_packsswb): Ditto.
(avx512bw_packsswb): Ditto.
(_packssdw): Split to below 3 new define_insns.
(sse2_packssdw): New define_insn.
(avx2_packssdw): Ditto.
(avx512bw_packssdw): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-vpackssdw-3.c: New test.
* gcc.target/i386/avx512bw-vpacksswb-3.c: New test.
---
 gcc/config/i386/sse.md| 165 --
 .../gcc.target/i386/avx512bw-vpackssdw-3.c|  55 ++
 .../gcc.target/i386/avx512bw-vpacksswb-3.c|  50 ++
 3 files changed, 252 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-vpackssdw-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-vpacksswb-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 83e3f534fd2..cc4e4620257 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17762,14 +17762,14 @@ (define_expand "vec_pack_sbool_trunc_qi"
   DONE;
 })
 
-(define_insn "_packsswb"
-  [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,")
-   (vec_concat:VI1_AVX512
- (ss_truncate:
-   (match_operand: 1 "register_operand" "0,"))
- (ss_truncate:
-   (match_operand: 2 "vector_operand" "xBm,m"]
-  "TARGET_SSE2 &&  && "
+(define_insn "sse2_packsswb"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,Yw")
+   (vec_concat:V16QI
+ (ss_truncate:V8QI
+   (match_operand:V8HI 1 "register_operand" "0,Yw"))
+ (ss_truncate:V8QI
+   (match_operand:V8HI 2 "vector_operand" "xBm,Ywm"]
+  "TARGET_SSE2 &&  && "
   "@
packsswb\t{%2, %0|%0, %2}
vpacksswb\t{%2, %1, %0|%0, %1, %2}"
@@ -1,16 +1,93 @@ (define_insn "_packsswb"
(set_attr "type" "sselog")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix" "orig,")
-   (set_attr "mode" "")])
+   (set_attr "mode" "TI")])
 
-(define_insn "_packssdw"
-  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,")
-   (vec_concat:VI2_AVX2
- (ss_truncate:
-   (match_operand: 1 "register_operand" "0,"))
- (ss_truncate:
-   (match_operand: 2 "vector_operand" "xBm,m"]
-  "TARGET_SSE2 &&  && "
+(define_insn "avx2_packsswb"
+  [(set (match_operand:V32QI 0 "register_operand" "=Yw")
+   (vec_select:V32QI
+ (vec_concat:V32QI
+   (ss_truncate:V16QI
+ (match_operand:V16HI 1 "register_operand" "Yw"))
+   (ss_truncate:V16QI
+ (match_operand:V16HI 2 "vector_operand" "Ywm")))
+ (parallel [(const_int 0)  (const_int 1)
+(const_int 2)  (const_int 3)
+(const_int 4)  (const_int 5)
+(const_int 6)  (const_int 7)
+(const_int 16) (const_int 17)
+(const_int 18) (const_int 19)
+(const_int 20) (const_int 21)
+(const_int 22) (const_int 23)
+(const_int 8)  (const_int 9)
+(const_int 10) (const_int 11)
+(const_int 12) (const_int 13)
+(const_int 14) (const_int 15)
+(const_int 24) (const_int 25)
+(const_int 26) (const_int 27)
+(const_int 28) (const_int 29)
+(const_int 30) (const_int 31)])))]
+  "TARGET_AVX2 &&  && "
+  "vpacksswb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "")
+   (set_attr "mode" "OI")])
+
+(define_insn "avx512bw_packsswb"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+   (vec_select:V64QI
+ (vec_concat:V64QI
+   (ss_truncate:V32QI
+ (match_operand:V32HI 1 "register_operand" "v"))
+   (ss_truncate:V32QI
+ (match_operand:V32HI 2 "vector_operand" "vm")))
+ (parallel [(const_int 0)  (const_int 1)
+(const_int 2)  (const_int 3)
+(const_int 4)  (const_int 5)
+(const_int 6)  (const_int 7)
+(const_int 32) (const_int 33)
+(const_int 34) (const_i

[PATCH] Remove # from one_cmpl2 assemble output.

2023-07-17 Thread liuhongt via Gcc-patches

The optimize_insn_for_speed_p () check in the assembler output code is not
aligned with the splitter condition, and this causes an ICE when building
SPEC2017 blender_r.

Not sure if crtl is supposed to be reliable during assembler output; the patch
just removes the check as a workaround.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ready to push to trunk.

libpng/pngread.c: In function ‘png_read_image’:
libpng/pngread.c:786:1: internal compiler error: in final_scan_insn_1, at 
final.cc:2813
  786 | }
  | ^
0x73ac3d final_scan_insn_1
../../gcc/final.cc:2813
0xb3420b final_scan_insn(rtx_insn*, _IO_FILE*, int, int, int*)
../../gcc/final.cc:2887
0xb344c4 final_1
../../gcc/final.cc:1979
0xb34f64 rest_of_handle_final
../../gcc/final.cc:4240
0xb34f64 execute
../../gcc/final.cc:4318

gcc/ChangeLog:

PR target/110438
* config/i386/sse.md (one_cmpl2):
Remove # from assemble output.
---
 gcc/config/i386/sse.md | 4 
 1 file changed, 4 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6bf9c99a2c1..e1158c5717a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17220,10 +17220,6 @@ (define_insn_and_split 
"one_cmpl2"
|| mode == SImode
|| mode == DImode)"
 {
-  if (! && which_alternative
-  && optimize_insn_for_speed_p ())
-return "#";
-
   if (TARGET_AVX512VL)
 return "vpternlog\t{$0x55, %1, %0, 
%0|%0, %0, %1, 0x55}";
   else
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Fix fp16 related testcase failure for i686.

2023-07-19 Thread liuhongt via Gcc-patches

> I see some regressions most likely with this change on i686-linux,
> in particular:
> +FAIL: gcc.dg/pr107547.c (test for excess errors)
> +FAIL: gcc.dg/torture/floatn-convert.c  -O0 (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O0 compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O1 (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O1 compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O2 (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O2 compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O2 -flto (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O2 -flto compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O2 -flto -flto-partition=none (test 
> for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O2 -flto -flto-partition=none 
> compilation failed to produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O3 -fomit-frame-pointer 
> -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess 
> errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O3 -fomit-frame-pointer 
> -funroll-loops -fpeel-loops -ftracer -finline-functions compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O3 -g (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O3 -g compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -Os (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -Os compilation failed to 
> produce executable
> +FAIL: gcc.target/i386/float16-7.c (test for errors, line 7)
>

> Perhaps we need to tweak
> gcc/testsuite/lib/target-supports.exp (add_options_for_float16)
> so that it adds -msse2 for i?86-*-* x86_64-*-* (that would likely
> fix up floatn-convert) and for the others perhaps
> /* { dg-add-options float16 } */
> ?

I've verified the change fixed those failures.
Ready to push to trunk if there's no objections.

gcc/testsuite/ChangeLog:

* gcc.dg/pr107547.c: Add { dg-add-options float16 }.
* gcc.target/i386/float16-7.c: Add -msse2 to dg-options.
* lib/target-supports.exp (add_options_for_float16): Add
-msse2 for i?86-*-* || x86_64-*-*.
---
 gcc/testsuite/gcc.dg/pr107547.c   | 1 +
 gcc/testsuite/gcc.target/i386/float16-7.c | 2 +-
 gcc/testsuite/lib/target-supports.exp | 3 +++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/pr107547.c b/gcc/testsuite/gcc.dg/pr107547.c
index c6992c8674b..7cd68afc0af 100644
--- a/gcc/testsuite/gcc.dg/pr107547.c
+++ b/gcc/testsuite/gcc.dg/pr107547.c
@@ -1,6 +1,7 @@
 /* PR tree-optimization/107547 */
 /* { dg-do compile } */
 /* { dg-options "-O2" } */
+/* { dg-add-options float16 } */
 
 int x;
 
diff --git a/gcc/testsuite/gcc.target/i386/float16-7.c 
b/gcc/testsuite/gcc.target/i386/float16-7.c
index 86641afeba9..660021b6ccc 100644
--- a/gcc/testsuite/gcc.target/i386/float16-7.c
+++ b/gcc/testsuite/gcc.target/i386/float16-7.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mfpmath=387 -fexcess-precision=16" } */
+/* { dg-options "-O2 -msse2 -mfpmath=387 -fexcess-precision=16" } */
 /* { dg-excess-errors "'-fexcess-precision=16' is not compatible with 
'-mfpmath=387'" } */
 _Float16
 foo (_Float16 a, _Float16 b)
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 8ea0d9feb1c..42024474091 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3487,6 +3487,9 @@ proc add_options_for_float16 { flags } {
 if { [istarget arm*-*-*] } {
return "$flags -mfp16-format=ieee"
 }
+if { [istarget i?86-*-*] || [istarget x86_64-*-*] } {
+   return "$flags -msse2"
+}
 return "$flags"
 }
 
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Optimize vlddqu to vmovdqu for TARGET_AVX

2023-07-20 Thread liuhongt via Gcc-patches

On Intel processors that support AVX (TARGET_AVX), vmovdqu is as fast
as vlddqu, so UNSPEC_LDDQU can be removed to enable more optimizations.
Can someone confirm this with AMD folks?
If AMD doesn't like this optimization, I'll put it under
micro-architecture tuning.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
If AMD also likes this optimization, OK for trunk?

gcc/ChangeLog:

* config/i386/sse.md (_lddqu): Change to
define_expand, expand as simple move when TARGET_AVX
&& ( == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD).
The original define_insn is renamed to
..
(_lddqu): .. this.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vlddqu_vinserti128.c: New test.
---
 gcc/config/i386/sse.md| 15 ++-
 .../gcc.target/i386/vlddqu_vinserti128.c  | 11 +++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2d81347c7b6..d571a78f4c4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1835,7 +1835,20 @@ (define_peephole2
   [(set (match_dup 4) (match_dup 1))]
   "operands[4] = adjust_address (operands[0], V2DFmode, 0);")
 
-(define_insn "_lddqu"
+(define_expand "_lddqu"
+  [(set (match_operand:VI1 0 "register_operand")
+   (unspec:VI1 [(match_operand:VI1 1 "memory_operand")]
+   UNSPEC_LDDQU))]
+  "TARGET_SSE3"
+{
+  if (TARGET_AVX && ( == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD))
+{
+  emit_move_insn (operands[0], operands[1]);
+  DONE;
+}
+})
+
+(define_insn "*_lddqu"
   [(set (match_operand:VI1 0 "register_operand" "=x")
(unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
UNSPEC_LDDQU))]
diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c 
b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
new file mode 100644
index 000..29699a5fa7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
+/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
+
+#include 
+__m256i foo(void *data) {
+__m128i X1 = _mm_lddqu_si128((__m128i*)data);
+__m256i V1 = _mm256_broadcastsi128_si256 (X1);
+return V1;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [x86] Add UNSPEC_MASKOP to vpbroadcastm pattern.

2023-07-27 Thread liuhongt via Gcc-patches

Prevent rtl optimization of vec_duplicate + zero_extend to
vpbroadcastm since there could be an extra kmov after RA.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ready to push to trunk.

gcc/ChangeLog:

PR target/110788
* config/i386/sse.md (avx512cd_maskb_vec_dup): Add
UNSPEC_MASKOP.
(avx512cd_maskw_vec_dup): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110788.c: New test.
---
 gcc/config/i386/sse.md   |  8 ++--
 gcc/testsuite/gcc.target/i386/pr110788.c | 11 +++
 2 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110788.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 35fd66ed4aa..51961bbfc0b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -26778,11 +26778,14 @@ (define_insn 
"avx512dq_broadcast_1"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+;; Use unspec to prevent rtl optimizer to optimize zero_extend + vec_duplicate
+;; to pbroadcastm, there could be an extra kmov after RA.
 (define_insn "avx512cd_maskb_vec_dup"
   [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v")
(vec_duplicate:VI8_AVX512VL
  (zero_extend:DI
-   (match_operand:QI 1 "register_operand" "k"]
+   (match_operand:QI 1 "register_operand" "k"
+   (unspec [(const_int 0)] UNSPEC_MASKOP)]
   "TARGET_AVX512CD"
   "vpbroadcastmb2q\t{%1, %0|%0, %1}"
   [(set_attr "type" "mskmov")
@@ -26793,7 +26796,8 @@ (define_insn "avx512cd_maskw_vec_dup"
   [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v")
(vec_duplicate:VI4_AVX512VL
  (zero_extend:SI
-   (match_operand:HI 1 "register_operand" "k"]
+   (match_operand:HI 1 "register_operand" "k"
+   (unspec [(const_int 0)] UNSPEC_MASKOP)]
   "TARGET_AVX512CD"
   "vpbroadcastmw2d\t{%1, %0|%0, %1}"
   [(set_attr "type" "mskmov")
diff --git a/gcc/testsuite/gcc.target/i386/pr110788.c 
b/gcc/testsuite/gcc.target/i386/pr110788.c
new file mode 100644
index 000..4cf1676ccb6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110788.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=cascadelake --param vect-partial-vector-usage=2" } 
*/
+/* { dg-final { scan-assembler-not "vpbroadcastm" } } */
+
+double a[1024], b[1024];
+
+void foo (int n)
+{
+  for (int i = 0; i < n; ++i)
+a[i] = b[i] * 3.;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Adjust testcase for more optimal codegen.

2023-07-31 Thread liuhongt via Gcc-patches

After
b9d7140c80bd3c7355b8291bb46f0895dcd8c3cb is the first bad commit
commit b9d7140c80bd3c7355b8291bb46f0895dcd8c3cb
Author: Jan Hubicka 
Date:   Fri Jul 28 09:16:09 2023 +0200

loop-split improvements, part 1

Now we have
vpbroadcastd %ecx, %xmm0
vpaddd .LC3(%rip), %xmm0, %xmm0
vpextrd $3, %xmm0, %eax
vmovddup %xmm3, %xmm0
vrndscalepd $9, %xmm0, %xmm0
vunpckhpd %xmm0, %xmm0, %xmm3

For vrndscalepd, there is no need to insert pxor since it reuses the input
register xmm0, which avoids the partial SSE dependence.

Pushed to trunk.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr87007-4.c: Adjust testcase.
* gcc.target/i386/pr87007-5.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr87007-4.c | 6 +++---
 gcc/testsuite/gcc.target/i386/pr87007-5.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr87007-4.c 
b/gcc/testsuite/gcc.target/i386/pr87007-4.c
index e91bdcbac44..23b5c5dcc52 100644
--- a/gcc/testsuite/gcc.target/i386/pr87007-4.c
+++ b/gcc/testsuite/gcc.target/i386/pr87007-4.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
-/* { dg-options "-Ofast -march=skylake-avx512 -mfpmath=sse" } */
-
+/* { dg-options "-O2 -march=skylake-avx512 -mfpmath=sse" } */
+/* Load of d2/d3 is hoisted out, vrndscalesd will reuse loades register to 
avoid partial dependence.  */
 
 #include
 
@@ -15,4 +15,4 @@ foo (int n, int k)
   d1 = ceil (d3);
 }
 
-/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 0 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr87007-5.c 
b/gcc/testsuite/gcc.target/i386/pr87007-5.c
index 20d13cf650b..b0b0a7b70ef 100644
--- a/gcc/testsuite/gcc.target/i386/pr87007-5.c
+++ b/gcc/testsuite/gcc.target/i386/pr87007-5.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
-/* { dg-options "-Ofast -march=skylake-avx512 -mfpmath=sse" } */
-
+/* { dg-options "-O2 -march=skylake-avx512 -mfpmath=sse" } */
+/* Load of d2/d3 is hoisted out, vrndscalesd will reuse loades register to 
avoid partial dependence.  */
 
 #include
 
@@ -15,4 +15,4 @@ foo (int n, int k)
   d1 = sqrt (d3);
 }
 
-/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 0 } } */
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Support vec_fmaddsub/vec_fmsubadd for vector HFmode.

2023-08-01 Thread liuhongt via Gcc-patches

AVX512FP16 supports vfmaddsubXXXph and vfmsubaddXXXph.
Also remove scalar mode from fmaddsub/fmsubadd pattern since there's
no scalar instruction for that.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/81904
* config/i386/sse.md (vec_fmaddsub4): Extend to vector
HFmode, use mode iterator VFH instead.
(vec_fmsubadd4): Ditto.
(fma_fmaddsub_):
Remove scalar mode from iterator, use VFH_AVX512VL instead.
(fma_fmsubadd_):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr81904.c: New test.
---
 gcc/config/i386/sse.md  | 44 -
 gcc/testsuite/gcc.target/i386/pr81904.c | 22 +
 2 files changed, 44 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81904.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 51961bbfc0b..4e75c9addaa 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5803,21 +5803,21 @@ (define_insn "_fnmsub__mask3"
 ;; But this doesn't seem useful in practice.
 
 (define_expand "vec_fmaddsub4"
-  [(set (match_operand:VF 0 "register_operand")
-   (unspec:VF
- [(match_operand:VF 1 "nonimmediate_operand")
-  (match_operand:VF 2 "nonimmediate_operand")
-  (match_operand:VF 3 "nonimmediate_operand")]
+  [(set (match_operand:VFH 0 "register_operand")
+   (unspec:VFH
+ [(match_operand:VFH 1 "nonimmediate_operand")
+  (match_operand:VFH 2 "nonimmediate_operand")
+  (match_operand:VFH 3 "nonimmediate_operand")]
  UNSPEC_FMADDSUB))]
   "TARGET_FMA || TARGET_FMA4 || ( == 64 || TARGET_AVX512VL)")
 
 (define_expand "vec_fmsubadd4"
-  [(set (match_operand:VF 0 "register_operand")
-   (unspec:VF
- [(match_operand:VF 1 "nonimmediate_operand")
-  (match_operand:VF 2 "nonimmediate_operand")
-  (neg:VF
-(match_operand:VF 3 "nonimmediate_operand"))]
+  [(set (match_operand:VFH 0 "register_operand")
+   (unspec:VFH
+ [(match_operand:VFH 1 "nonimmediate_operand")
+  (match_operand:VFH 2 "nonimmediate_operand")
+  (neg:VFH
+(match_operand:VFH 3 "nonimmediate_operand"))]
  UNSPEC_FMADDSUB))]
   "TARGET_FMA || TARGET_FMA4 || ( == 64 || TARGET_AVX512VL)")
 
@@ -5877,11 +5877,11 @@ (define_insn "*fma_fmaddsub_"
(set_attr "mode" "")])
 
 (define_insn "fma_fmaddsub_"
-  [(set (match_operand:VFH_SF_AVX512VL 0 "register_operand" "=v,v,v")
-   (unspec:VFH_SF_AVX512VL
- [(match_operand:VFH_SF_AVX512VL 1 "" "%0,0,v")
-  (match_operand:VFH_SF_AVX512VL 2 "" 
",v,")
-  (match_operand:VFH_SF_AVX512VL 3 "" 
"v,,0")]
+  [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v,v")
+   (unspec:VFH_AVX512VL
+ [(match_operand:VFH_AVX512VL 1 "" "%0,0,v")
+  (match_operand:VFH_AVX512VL 2 "" 
",v,")
+  (match_operand:VFH_AVX512VL 3 "" 
"v,,0")]
  UNSPEC_FMADDSUB))]
   "TARGET_AVX512F &&  && 
"
   "@
@@ -5943,12 +5943,12 @@ (define_insn "*fma_fmsubadd_"
(set_attr "mode" "")])
 
 (define_insn "fma_fmsubadd_"
-  [(set (match_operand:VFH_SF_AVX512VL 0 "register_operand" "=v,v,v")
-   (unspec:VFH_SF_AVX512VL
- [(match_operand:VFH_SF_AVX512VL   1 "" "%0,0,v")
-  (match_operand:VFH_SF_AVX512VL   2 "" 
",v,")
-  (neg:VFH_SF_AVX512VL
-(match_operand:VFH_SF_AVX512VL 3 "" 
"v,,0"))]
+  [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v,v")
+   (unspec:VFH_AVX512VL
+ [(match_operand:VFH_AVX512VL   1 "" "%0,0,v")
+  (match_operand:VFH_AVX512VL   2 "" 
",v,")
+  (neg:VFH_AVX512VL
+(match_operand:VFH_AVX512VL 3 "" 
"v,,0"))]
  UNSPEC_FMADDSUB))]
   "TARGET_AVX512F &&  && 
"
   "@
diff --git a/gcc/testsuite/gcc.target/i386/pr81904.c 
b/gcc/testsuite/gcc.target/i386/pr81904.c
new file mode 100644
index 000..9f5ad0bd952
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr81904.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times "vfmaddsub...ph\[ 
\t\]+\[^\n\]*%zmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsubadd...ph\[ 
\t\]+\[^\n\]*%zmm\[0-9\]" 1 } } */
+
+void vec_fmaddsub_fp16(int n, _Float16 da_r, _Float16 *x, _Float16* y, 
_Float16* __restrict z)
+{
+  for (int i = 0; i < 32; i += 2)
+{
+  z[i] =  da_r * x[i] - y[i];
+  z[i+1]  =  da_r * x[i+1] + y[i+1];
+}
+}
+
+void vec_fmasubadd_fp16(int n, _Float16 da_r, _Float16 *x, _Float16* y, 
_Float16* __restrict z)
+{
+  for (int i = 0; i < 32; i += 2)
+{
+  z[i] =  da_r * x[i] + y[i];
+  z[i+1]  =  da_r * x[i+1] - y[i+1];
+}
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Optimize vlddqu + inserti128 to vbroadcasti128

2023-08-01 Thread liuhongt via Gcc-patches

In [1], I proposed a patch to generate vmovdqu for all vlddqu intrinsics
after AVX2; it was rejected because:
> The instruction is reachable only as __builtin_ia32_lddqu* (aka
> _mm_lddqu_si*), so it was chosen by the programmer for a reason. I
> think that in this case, the compiler should not be too smart and
> change the instruction behind the programmer's back. The caveats are
> also explained at length in the ISA manual.

So this patch is more conservative and only optimizes vlddqu + vinserti128
to vbroadcasti128.
Compared to vbroadcasti128, vlddqu + vinserti128 uses a shuffle port in
addition to the load port. From a latency perspective, vbroadcasti128 is no
worse than vlddqu + vinserti128.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625122.html

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/sse.md (*avx2_lddqu_inserti_to_bcasti): New
pre_reload define_insn_and_split.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vlddqu_vinserti128.c: New test.
---
 gcc/config/i386/sse.md | 18 ++
 .../gcc.target/i386/vlddqu_vinserti128.c   | 11 +++
 2 files changed, 29 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2d81347c7b6..4bdd2b43ba7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -26600,6 +26600,24 @@ (define_insn "avx2_vbroadcasti128_"
(set_attr "prefix" "vex,evex,evex")
(set_attr "mode" "OI")])
 
+;; optimize vlddqu + vinserti128 to vbroadcasti128, the former will use
+;; extra shuffle port in addition to load port than the latter.
+;; For latency perspective,vbroadcasti is no worse.
+(define_insn_and_split "avx2_lddqu_inserti_to_bcasti"
+  [(set (match_operand:V4DI 0 "register_operand" "=x,v,v")
+   (vec_concat:V4DI
+ (subreg:V2DI
+   (unspec:V16QI [(match_operand:V16QI 1 "memory_operand")]
+ UNSPEC_LDDQU) 0)
+ (subreg:V2DI (unspec:V16QI [(match_dup 1)]
+ UNSPEC_LDDQU) 0)))]
+  "TARGET_AVX2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (vec_concat:V4DI (match_dup 1) (match_dup 1)))]
+  "operands[1] = adjust_address (operands[1], V2DImode, 0);")
+
 ;; Modes handled by AVX vec_dup patterns.
 (define_mode_iterator AVX_VEC_DUP_MODE
   [V8SI V8SF V4DI V4DF])
diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c 
b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
new file mode 100644
index 000..29699a5fa7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
+/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
+
+#include 
+__m256i foo(void *data) {
+__m128i X1 = _mm_lddqu_si128((__m128i*)data);
+__m256i V1 = _mm256_broadcastsi128_si256 (X1);
+return V1;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns.

2022-07-10 Thread liuhongt via Gcc-patches

And split it to GPR-version instruction after reload.

This will enable the optimization below for 16/32/64-bit vector bit_op patterns:

-   movd(%rdi), %xmm0
-   movd(%rsi), %xmm1
-   pand%xmm1, %xmm0
-   movd%xmm0, (%rdi)
+   movl(%rsi), %eax
+   andl%eax, (%rdi)

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/106038
* config/i386/mmx.md (3): Expand
with (clobber (reg:CC flags_reg)) under TARGET_64BIT
(mmx_code>3): Ditto.
(*mmx_3_1): New define_insn, add post_reload
splitter after it.
(*3): New define_insn, also add post_reload
splitter after it.
(mmxinsnmode): New mode attribute.
(VI_16_32_64): New mode iterator.
(*mov_imm): Refactor with mmxinsnmode.
* config/i386/predicates.md
(nonimmediate_or_x86_64_vector_cst): New predicate.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106038-1.c: New test.
* gcc.target/i386/pr106038-2.c: New test.
* gcc.target/i386/pr106038-3.c: New test.
---
 gcc/config/i386/mmx.md | 131 +++--
 gcc/config/i386/predicates.md  |   4 +
 gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++
 gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++
 gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
 5 files changed, 213 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..85b06abea27 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
 (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
 (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
 
+(define_mode_iterator VI_16_32_64
+   [V2QI V4QI V2HI
+(V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
+(V2SI "TARGET_64BIT")])
+
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
@@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
(V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +363,7 @@ (define_insn_and_split "*mov_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (mode))
-{
-case 2:
-  mode = HImode;
-  break;
-case 4:
-  mode = SImode;
-  break;
-case 8:
-  mode = DImode;
-  break;
-default:
-  gcc_unreachable ();
-}
-  operands[0] = lowpart_subreg (mode, operands[0], mode);
+  operands[0] = lowpart_subreg (mode, operands[0], mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2948,14 +2946,28 @@ (define_expand "mmx_3"
  (match_operand:MMXMODEI 1 "register_mmxmem_operand")
  (match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
-  "ix86_fixup_binary_operands_no_copy (, mode, operands);")
+{
+  ix86_fixup_binary_operands_no_copy (, mode, operands);
+  if (TARGET_64BIT)
+  {
+ix86_expand_binary_operator (, mode, operands);
+DONE;
+  }
+})
 
 (define_expand "3"
   [(set (match_operand:MMXMODEI 0 "register_operand")
(any_logic:MMXMODEI
  (match_operand:MMXMODEI 1 "register_operand")
  (match_operand:MMXMODEI 2 "register_operand")))]
-  "TARGET_MMX_WITH_SSE")
+  "TARGET_MMX_WITH_SSE"
+{
+  if (TARGET_64BIT)
+{
+  ix86_expand_binary_operator (, mode, operands);
+  DONE;
+}
+})
 
 (define_insn "*mmx_3"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
@@ -2974,33 +2986,82 @@ (define_insn "*mmx_3"
(set_attr "type" "mmxadd,sselog,sselog,sselog")
(set_attr "mode" "DI,TI,TI,TI")])
 
-(define_insn "3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
-(any_logic:VI_16_32
- (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
- (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
+(define_insn "*mmx_3_1"
+  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r")
+(any_logic:MMXMODEI
+ (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0")
+ (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" 
"ym,x,x,v,ri,m")))
+(clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && (TARGET_MMX || TARGET_SSE2)
+   && ix86_binary_operator_ok (, mode, operands)"
+  "#"
+  [(set_attr

[PATCH] [RFC]Support vectorization for Complex type.

2022-07-10 Thread liuhongt via Gcc-patches

The patch only handles load/store (including ctor/permutation, except
gather/scatter) for complex type; other operations don't need to be
handled since they will be lowered by pass cplxlower. (MASK_LOAD is not
supported for complex type, so there is no need to handle it either.)

Instead of supporting vector(2) _Complex double, this patch takes vector(4)
double as the vector type of _Complex double. Since the vectorizer originally
takes TYPE_VECTOR_SUBPARTS as nunits, which is not true for complex
type, the patch handles nunits/ncopies/vf specially for complex type.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Also tested the patch on SPEC2017 and found complex type vectorization
in 510/549 (but no performance impact).

Any comments?

gcc/ChangeLog:

PR tree-optimization/106010
* tree-vect-data-refs.cc (vect_get_data_access_cost):
Pass complex_p to vect_get_num_copies to avoid ICE.
(vect_analyze_data_refs): Support vectorization for Complex
type with vector scalar types.
* tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
be half of TYPE_VECTOR_SUBPARTS when complex_p.
* tree-vect-slp.cc (vect_record_max_nunits): nunits should be
half of TYPE_VECTOR_SUBPARTS when complex_p.
(vect_optimize_slp): Support permutation for complex type.
(vect_slp_analyze_node_operations_1): Double nunits in
vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
when complex_p.
(vect_slp_analyze_node_operations): Ditto.
(vect_create_constant_vectors): Support CTOR for complex type.
(vect_transform_slp_perm_load): Support permutation for
complex type.
* tree-vect-stmts.cc (vect_init_vector): Support complex type.
(vect_get_vec_defs_for_operand): Get vector type for
complex type.
(vectorizable_store): Get right ncopies/nunits for complex
type, also return false when complex_p and
!TYPE_VECTOR_SUBPARTS.is_constant ().
(vectorizable_load): Ditto.
(vect_get_vector_types_for_stmt): Get vector type for complex type.
* tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
(vect_get_num_copies): New overload.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106010-1a.c: New test.
* gcc.target/i386/pr106010-1b.c: New test.
* gcc.target/i386/pr106010-1c.c: New test.
* gcc.target/i386/pr106010-2a.c: New test.
* gcc.target/i386/pr106010-2b.c: New test.
* gcc.target/i386/pr106010-2c.c: New test.
* gcc.target/i386/pr106010-3a.c: New test.
* gcc.target/i386/pr106010-3b.c: New test.
* gcc.target/i386/pr106010-3c.c: New test.
* gcc.target/i386/pr106010-4a.c: New test.
* gcc.target/i386/pr106010-4b.c: New test.
* gcc.target/i386/pr106010-4c.c: New test.
* gcc.target/i386/pr106010-5a.c: New test.
* gcc.target/i386/pr106010-5b.c: New test.
* gcc.target/i386/pr106010-5c.c: New test.
* gcc.target/i386/pr106010-6a.c: New test.
* gcc.target/i386/pr106010-6b.c: New test.
* gcc.target/i386/pr106010-6c.c: New test.
* gcc.target/i386/pr106010-7a.c: New test.
* gcc.target/i386/pr106010-7b.c: New test.
* gcc.target/i386/pr106010-7c.c: New test.
* gcc.target/i386/pr106010-8a.c: New test.
* gcc.target/i386/pr106010-8b.c: New test.
* gcc.target/i386/pr106010-8c.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++
 gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++
 gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +
 gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +
 gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++
 gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++
 gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +
 gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++
 gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 
 gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 
 gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 
 gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++
 gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +
 gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +
 gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++
 gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +
 gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++
 gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +
 gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++
 gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++
 gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +
 gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++
 gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++
 gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +
 gcc/tree-vect-data-refs.cc  |  26 +

[PATCH] Extend 64-bit vector bit_op patterns with ?r alternative

2022-07-13 Thread liuhongt via Gcc-patches

And split it to GPR-version instruction after reload.

> ?r was introduced under the assumption that we want vector values
> mostly in vector registers. Currently there are no instructions with
> memory or immediate operand, so that made sense at the time. Let's
> keep ?r until logic instructions with mem/imm operands are introduced.
> So, for the patch that adds 64-bit vector logic in GPR, I would advise
> to first introduce only register operands. mem/imm operands should be
Update patch to add ?r to 64-bit bit_op patterns.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big impact on SPEC2017 (mostly the same binaries).

Ok for trunk?

gcc/ChangeLog:

PR target/106038
* config/i386/mmx.md (3): Expand
with (clobber (reg:CC flags_reg)) under TARGET_64BIT
(mmx_code>3): Ditto.
(*mmx_3_gpr): New define_insn, add post_reload
splitter after it.
(mmx_andnot3_gpr): Ditto.
(3): Extend follow define_split from VI_16_32 to
VI_16_32_64.
(*andnot3): Ditto.
(mmxinsnmode): New mode attribute.
(VI_16_32_64): New mode iterator.
(*mov_imm): Refactor with mmxinsnmode.
* config/i386/predicates.md

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106038-1.c: New test.
* gcc.target/i386/pr106038-2.c: New test.
* gcc.target/i386/pr106038-3.c: New test.
---
 gcc/config/i386/mmx.md | 131 +++--
 gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++
 gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++
 gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
 4 files changed, 210 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..5f7e40bd7a1 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
 (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
 (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
 
+(define_mode_iterator VI_16_32_64
+   [V2QI V4QI V2HI
+(V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
+(V2SI "TARGET_64BIT")])
+
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
@@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
(V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +363,7 @@ (define_insn_and_split "*mov_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (mode))
-{
-case 2:
-  mode = HImode;
-  break;
-case 4:
-  mode = SImode;
-  break;
-case 8:
-  mode = DImode;
-  break;
-default:
-  gcc_unreachable ();
-}
-  operands[0] = lowpart_subreg (mode, operands[0], mode);
+  operands[0] = lowpart_subreg (mode, operands[0], mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2878,6 +2876,31 @@ (define_insn "mmx_andnot3"
(set_attr "type" "mmxadd,sselog,sselog,sselog")
(set_attr "mode" "DI,TI,TI,TI")])
 
+(define_insn "mmx_andnot3_gpr"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=?r,y,x,x,v")
+   (and:MMXMODEI
+ (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" 
"r,0,0,x,v"))
+ (match_operand:MMXMODEI 2 "register_mmxmem_operand" "r,ym,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && (TARGET_MMX || TARGET_SSE2)"
+  "#"
+  [(set_attr "isa" "bmi,*,sse2_noavx,avx,avx512vl")
+   (set_attr "mmx_isa" "*,native,*,*,*")
+   (set_attr "type" "alu,mmxadd,sselog,sselog,sselog")
+   (set_attr "mode" "DI,DI,TI,TI,TI")])
+
+(define_split
+  [(set (match_operand:MMXMODEI 0 "register_operand")
+   (and:MMXMODEI
+ (not:MMXMODEI (match_operand:MMXMODEI 1 "register_mmxmem_operand"))
+ (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed
+   && (TARGET_MMX || TARGET_MMX_WITH_SSE)
+   && !GENERAL_REGNO_P (REGNO (operands[0]))"
+  [(set (match_dup 0)
+   (and: (not: (match_dup 1)) (match_dup 2)))])
+
 (define_insn "*andnot3"
   [(set (match_operand:VI_16_32 0 "register_operand" "=?&r,?r,x,x,v")
 (and:VI_16_32
@@ -2892,20 +2915,20 @@ (define_insn "*andnot3"
(set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
-(and:VI_16_32
- (not:VI_16_32 (match_operand:VI_16_32 1

[PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative.

2022-07-17 Thread liuhongt via Gcc-patches

And split it after reload.

>IMO, the only case it is worth adding is a direct immediate store to
>memory, which HJ recently added.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/106038
* config/i386/mmx.md (3): Extend to AND mem,imm,
and adjust below define_split.
(mmxinsnmode): New mode attribute.
(*mov_imm): Refactor with mmxinsnmode.
* config/i386/predicates.md
(register_or_x86_64_const_vector_operand): New predicate.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106038-1.c: New test.
---
 gcc/config/i386/mmx.md | 58 +++---
 gcc/config/i386/predicates.md  |  4 ++
 gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++
 3 files changed, 60 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..fbcb34d4395 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
(V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +358,7 @@ (define_insn_and_split "*mov_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (mode))
-{
-case 2:
-  mode = HImode;
-  break;
-case 4:
-  mode = SImode;
-  break;
-case 8:
-  mode = DImode;
-  break;
-default:
-  gcc_unreachable ();
-}
-  operands[0] = lowpart_subreg (mode, operands[0], mode);
+  operands[0] = lowpart_subreg (mode, operands[0], mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2975,32 +2968,39 @@ (define_insn "*mmx_3"
(set_attr "mode" "DI,TI,TI,TI")])
 
 (define_insn "3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
 (any_logic:VI_16_32
- (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
- (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
+ (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+ (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" 
"r,i,x,x,v")))
(clobber (reg:CC FLAGS_REG))]
   ""
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
 (any_logic:VI_16_32
- (match_operand:VI_16_32 1 "general_reg_operand")
- (match_operand:VI_16_32 2 "general_reg_operand")))
+ (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
+ (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))
(clobber (reg:CC FLAGS_REG))]
   "reload_completed"
   [(parallel
  [(set (match_dup 0)
-  (any_logic:SI (match_dup 1) (match_dup 2)))
+  (any_logic: (match_dup 1) (match_dup 2)))
   (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], mode);
+  if (GET_CODE (operands[2]) == CONST_VECTOR)
+{
+  HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+   mode);
+  operands[2] = GEN_INT (val);
+}
+  else
+operands[2] = lowpart_subreg (mode, operands[2], mode);
+  operands[1] = lowpart_subreg (mode, operands[1], mode);
+  operands[0] = lowpart_subreg (mode, operands[0], mode);
 })
 
 (define_split
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index c71c453cceb..5f63a7d52f5 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
   return trunc_int_for_mode (val, SImode) == val;
 })
 
+(define_predicate "register_or_x86_64_const_vector_operand"
+  (ior (match_operand 0 "register_operand")
+   (match_operand 0 "x86_64_const_vector_operand")))
+
 ;; Return true when OP is nonimmediate or standard SSE constant.
 (define_predicate "nonimmediate_or_sse_const_operand"
   (ior (match_operand 0 "nonimmediate_o

[PATCH V2] [RFC]Support vectorization for Complex type.

2022-07-17 Thread liuhongt via Gcc-patches

V2 update:
   Handle VMAT_ELEMENTWISE, VMAT_CONTIGUOUS_PERMUTE, VMAT_STRIDED_SLP,
   VMAT_CONTIGUOUS_REVERSE, VMAT_CONTIGUOUS_DOWN for complex type.

I've run SPECspeed@2017 627.cam4_s; there are some vectorization cases,
but no big performance impact (since this patch only handles load/store).

Any comments?

gcc/ChangeLog:

PR tree-optimization/106010
* tree-vect-data-refs.cc (vect_get_data_access_cost):
Pass complex_p to vect_get_num_copies to avoid ICE.
(vect_analyze_data_refs): Support vectorization for Complex
type with vector scalar types.
(vect_permute_load_chain): Handle Complex type.
* tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
be half of TYPE_VECTOR_SUBPARTS when complex_p.
* tree-vect-slp.cc (vect_record_max_nunits): nunits should be
half of TYPE_VECTOR_SUBPARTS when complex_p.
(vect_optimize_slp): Support permutation for complex type.
(vect_slp_analyze_node_operations_1): Double nunits in
vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
when complex_p.
(vect_slp_analyze_node_operations): Ditto.
(vect_create_constant_vectors): Support CTOR for complex type.
(vect_transform_slp_perm_load): Support permutation for
complex type.
* tree-vect-stmts.cc (vect_init_vector): Support complex type.
(vect_get_vec_defs_for_operand): Get vector type for
complex type.
(vectorizable_store): Get right ncopies/nunits and
elem_type for complex type vector, also return false when
complex_p and !TYPE_VECTOR_SUBPARTS.is_constant ().
(vect_truncate_gather_scatter_offset): Return false for
complex type.
(vectorizable_load): Ditto.
(vect_get_vector_types_for_stmt): Get vector type for
complex type.
(get_group_load_store_type): Handle complex type for
nunits.
(perm_mask_for_reverse): New overload.
(get_negative_load_store_type): Handle complex type,
p_offset should be N - 2 before address of DR.
(vect_check_scalar_mask): Return false for complex type.
* tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
(vect_get_num_copies): New overload.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106010-1a.c: New test.
* gcc.target/i386/pr106010-1b.c: New test.
* gcc.target/i386/pr106010-1c.c: New test.
* gcc.target/i386/pr106010-2a.c: New test.
* gcc.target/i386/pr106010-2b.c: New test.
* gcc.target/i386/pr106010-2c.c: New test.
* gcc.target/i386/pr106010-3a.c: New test.
* gcc.target/i386/pr106010-3b.c: New test.
* gcc.target/i386/pr106010-3c.c: New test.
* gcc.target/i386/pr106010-4a.c: New test.
* gcc.target/i386/pr106010-4b.c: New test.
* gcc.target/i386/pr106010-4c.c: New test.
* gcc.target/i386/pr106010-5a.c: New test.
* gcc.target/i386/pr106010-5b.c: New test.
* gcc.target/i386/pr106010-5c.c: New test.
* gcc.target/i386/pr106010-6a.c: New test.
* gcc.target/i386/pr106010-6b.c: New test.
* gcc.target/i386/pr106010-6c.c: New test.
* gcc.target/i386/pr106010-7a.c: New test.
* gcc.target/i386/pr106010-7b.c: New test.
* gcc.target/i386/pr106010-7c.c: New test.
* gcc.target/i386/pr106010-8a.c: New test.
* gcc.target/i386/pr106010-8b.c: New test.
* gcc.target/i386/pr106010-8c.c: New test.
* gcc.target/i386/pr106010-9a.c: New test.
* gcc.target/i386/pr106010-9b.c: New test.
* gcc.target/i386/pr106010-9c.c: New test.
* gcc.target/i386/pr106010-9d.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +
 gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 ++
 gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 
 gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++
 gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 ++
 gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 
 gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++
 gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 +++
 gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++
 gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 +
 gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++
 gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 +
 gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 ++
 gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++
 gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 ++
 gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++
 gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 +
 gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++
 gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +
 gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 ++
 gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 
 gcc/testsuite/gcc.t

[PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.

2022-07-18 Thread liuhongt via Gcc-patches

And split it after reload.

> You will need ix86_binary_operator_ok insn constraint here with
> corresponding expander using ix86_fixup_binary_operands_no_copy to
> prepare insn operands.
Split out a define_expand with just register_operand, and allow
memory/immediate in the define_insn, assuming combine/forwprop will do the optimization.

> Please use if (!register_operand (operands[2], mode)) instead.
Changed.

Update patch.

gcc/ChangeLog:

PR target/106038
* config/i386/mmx.md (3): New define_expand, it's
original "3".
(*3): New define_insn, it's original
"3" be extended to handle memory and immediate
operand with ix86_binary_operator_ok. Also adjust define_split
after it.
(mmxinsnmode): New mode attribute.
(*mov_imm): Refactor with mmxinsnmode.
* config/i386/predicates.md
(register_or_x86_64_const_vector_operand): New predicate.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106038-1.c: New test.
---
 gcc/config/i386/mmx.md | 71 --
 gcc/config/i386/predicates.md  |  4 ++
 gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 
 3 files changed, 71 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..316b83dd3ac 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
(V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +358,7 @@ (define_insn_and_split "*mov_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (mode))
-{
-case 2:
-  mode = HImode;
-  break;
-case 4:
-  mode = SImode;
-  break;
-case 8:
-  mode = DImode;
-  break;
-default:
-  gcc_unreachable ();
-}
-  operands[0] = lowpart_subreg (mode, operands[0], mode);
+  operands[0] = lowpart_subreg (mode, operands[0], mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2974,33 +2967,49 @@ (define_insn "*mmx_3"
(set_attr "type" "mmxadd,sselog,sselog,sselog")
(set_attr "mode" "DI,TI,TI,TI")])
 
-(define_insn "3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+(define_expand "3"
+  [(parallel
+[(set (match_operand:VI_16_32 0 "register_operand")
+(any_logic:VI_16_32
+ (match_operand:VI_16_32 1 "register_operand")
+ (match_operand:VI_16_32 2 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))])]
+  "")
+
+(define_insn "*3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
 (any_logic:VI_16_32
- (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
- (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
+ (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+ (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" 
"r,i,x,x,v")))
(clobber (reg:CC FLAGS_REG))]
-  ""
+  "ix86_binary_operator_ok (, mode, operands)"
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
 (any_logic:VI_16_32
- (match_operand:VI_16_32 1 "general_reg_operand")
- (match_operand:VI_16_32 2 "general_reg_operand")))
+ (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
+ (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))
(clobber (reg:CC FLAGS_REG))]
   "reload_completed"
   [(parallel
  [(set (match_dup 0)
-  (any_logic:SI (match_dup 1) (match_dup 2)))
+  (any_logic: (match_dup 1) (match_dup 2)))
   (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], mode);
+  if (!register_operand (operands[2], mode))
+{
+  HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+   mode);
+  operands[2] = GEN_INT (val);
+}
+  else
+operands[2] = lowpart_subreg (mode, operands[2], mode);
+

[PATCH] Move pass_cse_sincos after vectorizer.

2022-07-19 Thread liuhongt via Gcc-patches

__builtin_cexpi can't be vectorized since there's a gap between it and
the vectorized sincos version (in libmvec, it takes a double and two
double pointers and returns nothing). We will also lose some
vectorization opportunities if sin & cos are optimized to cexpi before
the vectorizer.

I tried to add vect_recog_cexpi_pattern to split cexpi into sin and
cos, but it failed vectorizable_simd_clone_call since NULL is returned
by cgraph_node::get (fndecl).  So alternatively, the patch tries to move
pass_cse_sincos after the vectorizer, just before pass_cse_reciprocals.

Also, the original pass_cse_sincos additionally expands pow & cabs; this patch
splits that part into a separate pass named pass_expand_powcabs, which
remains at the old pass position.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Observed more libmvec sin/cos vectorization in specfp, but no big performance impact.

Ok for trunk?

gcc/ChangeLog:

* passes.def: (Split pass_cse_sincos to pass_expand_powcabs
and pass_cse_sincos, and move pass_cse_sincos after vectorizer).
* timevar.def (TV_TREE_POWCABS): New timevar.
* tree-pass.h (make_pass_expand_powcabs): Split from pass_cse_sincos.
* tree-ssa-math-opts.cc (gimple_expand_builtin_cabs): Ditto.
(class pass_expand_powcabs): Ditto.
(pass_expand_powcabs::execute): Ditto.
(make_pass_expand_powcabs): Ditto.
(pass_cse_sincos::execute): Remove pow/cabs expand part.
(make_pass_cse_sincos): Ditto.

gcc/testsuite/ChangeLog:

* gcc.dg/pow-sqrt-synth-1.c: Adjust testcase.
---
 gcc/passes.def  |   3 +-
 gcc/testsuite/gcc.dg/pow-sqrt-synth-1.c |   4 +-
 gcc/timevar.def |   1 +
 gcc/tree-pass.h |   1 +
 gcc/tree-ssa-math-opts.cc   | 112 +++-
 5 files changed, 97 insertions(+), 24 deletions(-)

diff --git a/gcc/passes.def b/gcc/passes.def
index 375d3d62d51..6bb92efacd4 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -253,7 +253,7 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_ccp, true /* nonzero_p */);
   /* After CCP we rewrite no longer addressed locals into SSA
 form if possible.  */
-  NEXT_PASS (pass_cse_sincos);
+  NEXT_PASS (pass_expand_powcabs);
   NEXT_PASS (pass_optimize_bswap);
   NEXT_PASS (pass_laddress);
   NEXT_PASS (pass_lim);
@@ -328,6 +328,7 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_simduid_cleanup);
   NEXT_PASS (pass_lower_vector_ssa);
   NEXT_PASS (pass_lower_switch);
+  NEXT_PASS (pass_cse_sincos);
   NEXT_PASS (pass_cse_reciprocals);
   NEXT_PASS (pass_reassoc, false /* early_p */);
   NEXT_PASS (pass_strength_reduction);
diff --git a/gcc/testsuite/gcc.dg/pow-sqrt-synth-1.c 
b/gcc/testsuite/gcc.dg/pow-sqrt-synth-1.c
index 4a94325cdb3..484b29a8fc8 100644
--- a/gcc/testsuite/gcc.dg/pow-sqrt-synth-1.c
+++ b/gcc/testsuite/gcc.dg/pow-sqrt-synth-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target sqrt_insn } } */
-/* { dg-options "-fdump-tree-sincos -Ofast --param max-pow-sqrt-depth=8" } */
+/* { dg-options "-fdump-tree-powcabs -Ofast --param max-pow-sqrt-depth=8" } */
 /* { dg-additional-options "-mfloat-abi=softfp -mfpu=neon-vfpv4" { target 
arm*-*-* } } */
 
 double
@@ -34,4 +34,4 @@ vecfoo (double *a)
 a[i] = __builtin_pow (a[i], 1.25);
 }
 
-/* { dg-final { scan-tree-dump-times "synthesizing" 7 "sincos" } } */
+/* { dg-final { scan-tree-dump-times "synthesizing" 7 "powcabs" } } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index 2dae5e1c760..651af19876f 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -220,6 +220,7 @@ DEFTIMEVAR (TV_TREE_SWITCH_CONVERSION, "tree switch 
conversion")
 DEFTIMEVAR (TV_TREE_SWITCH_LOWERING,   "tree switch lowering")
 DEFTIMEVAR (TV_TREE_RECIP, "gimple CSE reciprocals")
 DEFTIMEVAR (TV_TREE_SINCOS   , "gimple CSE sin/cos")
+DEFTIMEVAR (TV_TREE_POWCABS   , "gimple expand pow/cabs")
 DEFTIMEVAR (TV_TREE_WIDEN_MUL, "gimple widening/fma detection")
 DEFTIMEVAR (TV_TRANS_MEM , "transactional memory")
 DEFTIMEVAR (TV_TREE_STRLEN   , "tree strlen optimization")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 606d1d60b85..4dfe05ed8e0 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -444,6 +444,7 @@ extern gimple_opt_pass *make_pass_early_warn_uninitialized 
(gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_late_warn_uninitialized (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_cse_reciprocals (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_cse_sincos (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_expand_powcabs (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_optimize_bswap (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_store_merging (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_optimize_widening_mul (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-

gcc-patches@gcc.gnu.org

2022-07-19 Thread liuhongt via Gcc-patches

> My original comments still stand (it feels like this should be more generic).
> Can we go the way lowering complex loads/stores first?  A large part
> of the testcases
> added by the patch should pass after that.

This is the patch as suggested, one additional change is handling COMPLEX_CST
for rhs. And it will enable vectorization for pr106010-8a.c.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

2022-07-20  Richard Biener  
Hongtao Liu  

gcc/ChangeLog:

PR tree-optimization/106010
* tree-complex.cc (init_dont_simulate_again): Lower complex
type move.
(expand_complex_move): Also expand COMPLEX_CST for rhs.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106010-1a.c: New test.
* gcc.target/i386/pr106010-1b.c: New test.
* gcc.target/i386/pr106010-1c.c: New test.
* gcc.target/i386/pr106010-2a.c: New test.
* gcc.target/i386/pr106010-2b.c: New test.
* gcc.target/i386/pr106010-2c.c: New test.
* gcc.target/i386/pr106010-3a.c: New test.
* gcc.target/i386/pr106010-3b.c: New test.
* gcc.target/i386/pr106010-3c.c: New test.
* gcc.target/i386/pr106010-4a.c: New test.
* gcc.target/i386/pr106010-4b.c: New test.
* gcc.target/i386/pr106010-4c.c: New test.
* gcc.target/i386/pr106010-5a.c: New test.
* gcc.target/i386/pr106010-5b.c: New test.
* gcc.target/i386/pr106010-5c.c: New test.
* gcc.target/i386/pr106010-6a.c: New test.
* gcc.target/i386/pr106010-6b.c: New test.
* gcc.target/i386/pr106010-6c.c: New test.
* gcc.target/i386/pr106010-7a.c: New test.
* gcc.target/i386/pr106010-7b.c: New test.
* gcc.target/i386/pr106010-7c.c: New test.
* gcc.target/i386/pr106010-8a.c: New test.
* gcc.target/i386/pr106010-8b.c: New test.
* gcc.target/i386/pr106010-8c.c: New test.
* gcc.target/i386/pr106010-9a.c: New test.
* gcc.target/i386/pr106010-9b.c: New test.
* gcc.target/i386/pr106010-9c.c: New test.
* gcc.target/i386/pr106010-9d.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 
 gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 
 gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +
 gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 ++
 gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 
 gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++
 gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 ++
 gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 
 gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 +
 gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 +
 gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 +
 gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 +++
 gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++
 gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 ++
 gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 
 gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++
 gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 
 gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 ++
 gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 
 gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 
 gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +
 gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 
 gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 +++
 gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +
 gcc/testsuite/gcc.target/i386/pr106010-9a.c |  89 +++
 gcc/testsuite/gcc.target/i386/pr106010-9b.c |  90 +++
 gcc/testsuite/gcc.target/i386/pr106010-9c.c |  90 +++
 gcc/testsuite/gcc.target/i386/pr106010-9d.c |  92 
 gcc/tree-complex.cc |   9 +-
 29 files changed, 2141 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr10601

[PATCH V3] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.

2022-07-20 Thread liuhongt via Gcc-patches

And split it after reload.

gcc/ChangeLog:

PR target/106038
* config/i386/mmx.md (<code><mode>3): New define_expand, it's the
original "<code><mode>3".
(*<code><mode>3): New define_insn, it's the original
"<code><mode>3" extended to handle memory and immediate
operands with ix86_binary_operator_ok. Also adjust define_split
after it.
(mmxinsnmode): New mode attribute.
(*mov<mode>_imm): Refactor with mmxinsnmode.
* config/i386/predicates.md
(register_or_x86_64_const_vector_operand): New predicate.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106038-1.c: New test.
---
 gcc/config/i386/mmx.md | 70 --
 gcc/config/i386/predicates.md  |  4 ++
 gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 +
 3 files changed, 70 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..dda4b43f5c1 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
(V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +358,7 @@ (define_insn_and_split "*mov_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (mode))
-{
-case 2:
-  mode = HImode;
-  break;
-case 4:
-  mode = SImode;
-  break;
-case 8:
-  mode = DImode;
-  break;
-default:
-  gcc_unreachable ();
-}
-  operands[0] = lowpart_subreg (mode, operands[0], mode);
+  operands[0] = lowpart_subreg (mode, operands[0], mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2974,33 +2967,48 @@ (define_insn "*mmx_3"
(set_attr "type" "mmxadd,sselog,sselog,sselog")
(set_attr "mode" "DI,TI,TI,TI")])
 
-(define_insn "3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+(define_expand "3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
 (any_logic:VI_16_32
- (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
- (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:VI_16_32 1 "nonimmediate_operand")
+ (match_operand:VI_16_32 2 
"nonimmediate_or_x86_64_const_vector_operand")))]
   ""
+  "ix86_expand_binary_operator (, mode, operands); DONE;")
+
+(define_insn "*3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
+(any_logic:VI_16_32
+ (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+ (match_operand:VI_16_32 2 
"nonimmediate_or_x86_64_const_vector_operand" "r,i,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (, mode, operands)"
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
 (any_logic:VI_16_32
- (match_operand:VI_16_32 1 "general_reg_operand")
- (match_operand:VI_16_32 2 "general_reg_operand")))
+ (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
+ (match_operand:VI_16_32 2 "reg_or_const_vector_operand")))
(clobber (reg:CC FLAGS_REG))]
   "reload_completed"
   [(parallel
  [(set (match_dup 0)
-  (any_logic:SI (match_dup 1) (match_dup 2)))
+  (any_logic: (match_dup 1) (match_dup 2)))
   (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], mode);
+  if (!register_operand (operands[2], mode))
+{
+  HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+   mode);
+  operands[2] = GEN_INT (val);
+}
+  else
+operands[2] = lowpart_subreg (mode, operands[2], mode);
+  operands[1] = lowpart_subreg (mode, operands[1], mode);
+  operands[0] = lowpart_subreg (mode, operands[0], mode);
 })
 
 (define_split
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index c71c453cceb..73dfd46bf90 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md

[PATCH] Adjust testcase.

2022-07-21 Thread liuhongt via Gcc-patches

r13-1762-gf9d4c3b45c5ed5f45c8089c990dbd4e181929c3d lowers complex type
moves to scalars, but testcase pr23911 is supposed to scan for a __complex__
constant, which is now never available, so adjust the testcase to scan for
the IMAGPART/REALPART_EXPR constants separately.

Pushed as obvious patch.

gcc/testsuite/ChangeLog

PR tree-optimization/106010
* gcc.dg/pr23911.c: Scan IMAGPART/REALPART_EXPR = ** instead
of __complex__ since COMPLEX_CST is lowered to scalars.
---
 gcc/testsuite/gcc.dg/pr23911.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/pr23911.c b/gcc/testsuite/gcc.dg/pr23911.c
index 3fa041222de..691f3507db2 100644
--- a/gcc/testsuite/gcc.dg/pr23911.c
+++ b/gcc/testsuite/gcc.dg/pr23911.c
@@ -16,5 +16,6 @@ test (void)
 
 /* After DCE2 which runs after FRE, the expressions should be fully
constant folded.  There should be no loads from b left.  */
-/* { dg-final { scan-tree-dump-times "__complex__ \\\(1.0e\\\+0, 0.0\\\)" 2 
"dce3" } } */
+/* { dg-final { scan-tree-dump-times {(?n)REALPART_EXPR.*= 1\.0e\+0} 2 "dce3" 
} } */
+/* { dg-final { scan-tree-dump-times {(?n)IMAGPART_EXPR.*= 0\.0} 2 "dce3" } } 
*/
 /* { dg-final { scan-tree-dump-times "= b" 0 "dce3" } } */
-- 
2.18.1

[RFC: PATCH] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant.

2022-08-03 Thread liuhongt via Gcc-patches

For neg, the patch creates a vec_init as [ a, -a, a, -a, ... ] and no
vec_step is needed to update the vectorized iv since vf is always a multiple
of 2 (negative * negative is positive).

For shift, the patch creates a vec_init as [ a, a >> c, a >> 2*c, .. ]
and a vec_step as [ c * nunits, c * nunits, c * nunits, ... ]; the vectorized
iv is updated as vec_def = vec_init >>/<< vec_step.

For mul, the patch creates a vec_init as [ a, a * c, a * pow(c, 2), .. ]
and a vec_step as [ pow(c,nunits), pow(c,nunits), ... ]; the iv is updated as
vec_def = vec_init * vec_step.

The patch handles nonlinear iv for
1. Integer type only, floating point is not handled.
2. No slp_node.
3. iv_loop should be same as vector loop, not nested loop.
4. No UB (undefined behavior) is introduced: for mul, pow (c, vf) must not
   overflow; for shift, the shift count should be less than the type precision.

Bootstrapped and regression tested on x86_64-pc-linux-gnu{-m32,}.
There are some cases observed in SPEC2017, but no big performance impact.

Any comments?

gcc/ChangeLog:

PR tree-optimization/103144
* tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function.
(vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function.
(vect_create_nonlinear_iv_init): New function.
(vect_create_nonlinear_iv_step): Ditto
(vect_create_nonlinear_iv_vec_step): Ditto
(vect_update_nonlinear_iv): Ditto
(vectorizable_nonlinear_induction): Ditto.
(vectorizable_induction): Call
vectorizable_nonlinear_induction when induction_type is not
vect_step_op_add.
* tree-vectorizer.h (enum vect_induction_op_type): New enum.
(STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr103144-mul-1.c: New test.
* gcc.target/i386/pr103144-mul-2.c: New test.
* gcc.target/i386/pr103144-neg-1.c: New test.
* gcc.target/i386/pr103144-neg-2.c: New test.
* gcc.target/i386/pr103144-shift-1.c: New test.
* gcc.target/i386/pr103144-shift-2.c: New test.
---
 .../gcc.target/i386/pr103144-mul-1.c  |  25 +
 .../gcc.target/i386/pr103144-mul-2.c  |  43 ++
 .../gcc.target/i386/pr103144-neg-1.c  |  25 +
 .../gcc.target/i386/pr103144-neg-2.c  |  36 ++
 .../gcc.target/i386/pr103144-shift-1.c|  34 +
 .../gcc.target/i386/pr103144-shift-2.c|  61 ++
 gcc/tree-vect-loop.cc | 604 +-
 gcc/tree-vectorizer.h |  11 +
 8 files changed, 834 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c

diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
new file mode 100644
index 000..2357541d95d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited 
-fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+
+#define N 1
+void
+foo_mul (int* a, int b)
+{
+  for (int i = 0; i != N; i++)
+{
+  a[i] = b;
+  b *= 3;
+}
+}
+
+void
+foo_mul_const (int* a)
+{
+  int b = 1;
+  for (int i = 0; i != N; i++)
+{
+  a[i] = b;
+  b *= 3;
+}
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
new file mode 100644
index 000..4ea53e44658
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
@@ -0,0 +1,43 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited 
-mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include 
+#include "pr103144-mul-1.c"
+
+typedef int v8si __attribute__((vector_size(32)));
+
+void
+avx2_test (void)
+{
+  int* epi32_exp = (int*) malloc (N * sizeof (int));
+  int* epi32_dst = (int*) malloc (N * sizeof (int));
+
+  __builtin_memset (epi32_exp, 0, N * sizeof (int));
+  int b = 8;
+  v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, 
b * 729, b * 2187 };
+
+  for (int i = 0; i != N / 8; i++)
+{
+  memcpy (epi32_exp + i * 8, &init, 32);
+  init *= 6561;
+}
+
+  foo_mul (epi32_dst, b);
+  if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0)
+__builtin_abort ();
+
+  init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
+  for (int i = 0; i != N / 8; i++)
+{
+  memcpy (epi32_exp + i * 8, &init, 32);
+  init *= 6561;
+}
+
+

[PATCH] Fix ICE in rtl check when bootstrap.

2023-08-07 Thread liuhongt via Gcc-patches

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/libgfortran/generated/matmul_i1.c:
 In function ‘matmul_i1_avx512f’:
/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/libgfortran/generated/matmul_i1.c:1781:1:
 internal compiler error: RTL check: expected elt 0 type 'i' or 'n', have 'w' 
(rtx const_int) in vpternlog_redundant_operand_mask, at 
config/i386/i386.cc:19460
 1781 | }
  | ^
0x5559de26dc2d rtl_check_failed_type2(rtx_def const*, int, int, int, char 
const*, int, char const*)

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/rtl.cc:761
0x5559de340bfe vpternlog_redundant_operand_mask(rtx_def**)

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/config/i386/i386.cc:19460
0x5559dfec67a6 split_44

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/config/i386/sse.md:12730
0x5559dfec67a6 split_63

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/config/i386/sse.md:28428
0x5559deb8a682 try_split(rtx_def*, rtx_insn*, int)

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/emit-rtl.cc:3800
0x5559deb8adf2 try_split(rtx_def*, rtx_insn*, int)

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/emit-rtl.cc:3972
0x5559def69194 split_insn

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/recog.cc:3385
0x5559def70c57 split_all_insns()

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/recog.cc:3489
0x5559def70d0c execute

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/recog.cc:4413

Use INTVAL (imm_op) instead of XINT (imm_op, 0).

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386-protos.h (vpternlog_redundant_operand_mask):
  Adjust parameter type.
* config/i386/i386.cc (vpternlog_redundant_operand_mask): Use
  INTVAL instead of XINT, also adjust parameter type from rtx* to
  rtx since the function only needs operands[4] in vpternlog
  pattern.
(substitute_vpternlog_operands): Pass operands[4] instead of
  operands to vpternlog_redundant_operand_mask
* config/i386/sse.md: Ditto.
---
 gcc/config/i386/i386-protos.h | 2 +-
 gcc/config/i386/i386.cc   | 6 +++---
 gcc/config/i386/sse.md| 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e547ee64587..fc2f1f13b78 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -70,7 +70,7 @@ extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
 extern int avx_vpermilp_parallel (rtx par, machine_mode mode);
 extern int avx_vperm2f128_parallel (rtx par, machine_mode mode);
 
-extern int vpternlog_redundant_operand_mask (rtx[]);
+extern int vpternlog_redundant_operand_mask (rtx);
 extern void substitute_vpternlog_operands (rtx[]);
 
 extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 8cd26eb54fa..50860050049 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -19454,10 +19454,10 @@ avx_vperm2f128_parallel (rtx par, machine_mode mode)
 /* Return a mask of VPTERNLOG operands that do not affect output.  */
 
 int
-vpternlog_redundant_operand_mask (rtx *operands)
+vpternlog_redundant_operand_mask (rtx pternlog_imm)
 {
   int mask = 0;
-  int imm8 = XINT (operands[4], 0);
+  int imm8 = INTVAL (pternlog_imm);
 
   if (((imm8 >> 4) & 0x0F) == (imm8 & 0x0F))
 mask |= 1;
@@ -19475,7 +19475,7 @@ vpternlog_redundant_operand_mask (rtx *operands)
 void
 substitute_vpternlog_operands (rtx *operands)
 {
-  int mask = vpternlog_redundant_operand_mask (operands);
+  int mask = vpternlog_redundant_operand_mask (operands[4]);
 
   if (mask & 1) /* The first operand is redundant.  */
 operands[1] = operands[2];
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7e2aa3f995c..c53450fd965 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12706,7 +12706,7 @@ (define_split
   (match_operand:V 3 "memory_operand")
   (match_operand:SI 4 "const_0_to_255_operand")]
  UNSPEC_VTERNLOG))]
-  "!reload_completed && vpternlog_redundant_operand_mask (operands) == 3"
+  "!reload_completed && vpternlog_redundant_operand_mask (operands[4]) == 3"
   [(set (match_dup 0)
(match_dup 3))
(set (match_dup 0)
@@ -12727,7 +12727,7 @@ (define_split
   (match_operand:V 3 "nonimmediate_operand")
   (match_operand:SI 4 "const_0_to_255_operand")]
  UNSPEC_VTERNLOG))]
-  "!reload_completed && vpternlog_redundant_operand_mask (operands) != 0"
+  "!reload_completed && vpternlog_redundant_operand_mask (operands[4]) != 0"
   [(set (match_dup 0)

[PATCH] i386: Clear upper bits of XMM register for V4HFmode/V2HFmode operations [PR110762]

2023-08-07 Thread liuhongt via Gcc-patches

Similar to r14-2786-gade30fad6669e5, the patch is for V4HF/V2HFmode.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110762
* config/i386/mmx.md (3): Changed from define_insn
to define_expand and break into ..
(v4hf3): .. this.
(divv4hf3): .. this.
(v2hf3): .. this.
(divv2hf3): .. this.
(movd_v2hf_to_sse): New define_expand.
(movq__to_sse): Extend to V4HFmode.
(mmxdoublevecmode): Ditto.
(V2FI_V4HF): New mode iterator.
* config/i386/sse.md (*vec_concatv4sf): Extend to handle V8HF
by using mode iterator V4SF_V8HF, renamed to ..
(*vec_concat): .. this.
(*vec_concatv4sf_0): Extend to handle V8HF by using mode
iterator V4SF_V8HF, renamed to ..
(*vec_concat_0): .. this.
(*vec_concatv8hf_movss): New define_insn.
(V4SF_V8HF): New mode iterator.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110762-v4hf.c: New test.
---
 gcc/config/i386/mmx.md| 109 +++---
 gcc/config/i386/sse.md|  40 +--
 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c |  57 +
 3 files changed, 177 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 896af76a33f..88bdf084f54 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
-;; 4-byte and 8-byte float16 vector modes
-(define_mode_iterator VHF_32_64 [V4HF V2HF])
-
+(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF])
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
@@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr mmxdoublevecmode
-  [(V2SF "V4SF") (V2SI "V4SI")])
+  [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")])
 
 ;; Mapping of vector modes back to the scalar modes
 (define_mode_attr mmxscalarmode
@@ -594,7 +592,7 @@ (define_insn "sse_movntq"
 (define_expand "movq__to_sse"
   [(set (match_operand: 0 "register_operand")
(vec_concat:
- (match_operand:V2FI 1 "nonimmediate_operand")
+ (match_operand:V2FI_V4HF 1 "nonimmediate_operand")
  (match_dup 2)))]
   "TARGET_SSE2"
   "operands[2] = CONST0_RTX (mode);")
@@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2"
 ;;
 ;
 
-(define_insn "3"
-  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
-   (plusminusmultdiv:VHF_32_64
- (match_operand:VHF_32_64 1 "register_operand" "v")
- (match_operand:VHF_32_64 2 "register_operand" "v")))]
+(define_expand "v4hf3"
+  [(set (match_operand:V4HF 0 "register_operand")
+   (plusminusmult:V4HF
+ (match_operand:V4HF 1 "nonimmediate_operand")
+ (match_operand:V4HF 2 "nonimmediate_operand")))]
   "TARGET_AVX512FP16 && TARGET_AVX512VL"
-  "vph\t{%2, %1, %0|%0, %1, %2}"
-  [(set (attr "type")
-  (cond [(match_test " == MULT")
-   (const_string "ssemul")
-(match_test " == DIV")
-   (const_string "ssediv")]
-(const_string "sseadd")))
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "V8HF")])
+{
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_v8hf3 (op0, op1, op2));
+
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "divv4hf3"
+  [(set (match_operand:V4HF 0 "register_operand")
+   (div:V4HF
+ (match_operand:V4HF 1 "nonimmediate_operand")
+ (match_operand:V4HF 2 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+  rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2],
+   force_reg (V4HFmode, CONST1_RTX (V4HFmode)));
+  emit_insn (gen_rtx_SET (op2, tmp));
+  emit_insn (gen_divv8hf3 (op0, op1, op2));
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "movd_v2hf_to_sse"
+  [(set (match_operand:V8HF 0 "register_operand")
+   (vec_merge:V8HF
+ (vec_duplicate:V8HF
+   (match_operand:V2HF 1 "nonimmediate_operand"))
+ (match_operand:V8HF 2 "reg_or_0_operand")
+ (const_int 3)))]
+  "TARGET_SSE")
+
+(define_expand "v2hf3"
+  [(set (match_operand:V2HF 0 "register_operand")
+   (plusminusmult:V2HF
+

[PATCH] [X86] Workaround possible CPUID bug in Sandy Bridge.

2023-08-08 Thread liuhongt via Gcc-patches

Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
supported via EAX.

Intel documentation says invalid subleaves return 0. We had been
relying on that behavior instead of checking the max subleaf number.

It appears that some Sandy Bridge CPUs return at least the subleaf 0
EDX value for subleaf 1. Best guess is that this is a bug in a
microcode patch since all of the bits we're seeing set in EDX were
introduced after Sandy Bridge was originally released.

This is causing avxvnniint16 to be incorrectly enabled with
-march=native on these CPUs.

BTW: Thanks for the reminder from LLVM folks Phoebe and Craig.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport?

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features): Check
EAX for valid subleaf before using CPUID.
---
 gcc/common/config/i386/cpuinfo.h | 84 +---
 1 file changed, 46 insertions(+), 38 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 30ef0d334ca..24ab2252eb0 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -874,45 +874,53 @@ get_available_features (struct __processor_model 
*cpu_model,
set_feature (FEATURE_AVX512FP16);
}
 
-  __cpuid_count (7, 1, eax, ebx, ecx, edx);
-  if (eax & bit_HRESET)
-   set_feature (FEATURE_HRESET);
-  if (eax & bit_CMPCCXADD)
-   set_feature(FEATURE_CMPCCXADD);
-  if (edx & bit_PREFETCHI)
-   set_feature (FEATURE_PREFETCHI);
-  if (eax & bit_RAOINT)
-   set_feature (FEATURE_RAOINT);
-  if (avx_usable)
-   {
- if (eax & bit_AVXVNNI)
-   set_feature (FEATURE_AVXVNNI);
- if (eax & bit_AVXIFMA)
-   set_feature (FEATURE_AVXIFMA);
- if (edx & bit_AVXVNNIINT8)
-   set_feature (FEATURE_AVXVNNIINT8);
- if (edx & bit_AVXNECONVERT)
-   set_feature (FEATURE_AVXNECONVERT);
- if (edx & bit_AVXVNNIINT16)
-   set_feature (FEATURE_AVXVNNIINT16);
- if (eax & bit_SM3)
-   set_feature (FEATURE_SM3);
- if (eax & bit_SHA512)
-   set_feature (FEATURE_SHA512);
- if (eax & bit_SM4)
-   set_feature (FEATURE_SM4);
-   }
-  if (avx512_usable)
-   {
- if (eax & bit_AVX512BF16)
-   set_feature (FEATURE_AVX512BF16);
-   }
-  if (amx_usable)
+  /* According to document, when subleaf is invalid, EAX,EBX,ECX,EDX should
+return 0 for CPUID (7, 1, EAX, EBX, ECX, EDX).
+But looks like it doesn't satisfy the document on some CPU, refer to
+https://reviews.llvm.org/D155145.
+Manually check valid subleaf here.  */
+  if (eax)
{
- if (eax & bit_AMX_FP16)
-   set_feature (FEATURE_AMX_FP16);
- if (edx & bit_AMX_COMPLEX)
-   set_feature (FEATURE_AMX_COMPLEX);
+ __cpuid_count (7, 1, eax, ebx, ecx, edx);
+ if (eax & bit_HRESET)
+   set_feature (FEATURE_HRESET);
+ if (eax & bit_CMPCCXADD)
+   set_feature(FEATURE_CMPCCXADD);
+ if (edx & bit_PREFETCHI)
+   set_feature (FEATURE_PREFETCHI);
+ if (eax & bit_RAOINT)
+   set_feature (FEATURE_RAOINT);
+ if (avx_usable)
+   {
+ if (eax & bit_AVXVNNI)
+   set_feature (FEATURE_AVXVNNI);
+ if (eax & bit_AVXIFMA)
+   set_feature (FEATURE_AVXIFMA);
+ if (edx & bit_AVXVNNIINT8)
+   set_feature (FEATURE_AVXVNNIINT8);
+ if (edx & bit_AVXNECONVERT)
+   set_feature (FEATURE_AVXNECONVERT);
+ if (edx & bit_AVXVNNIINT16)
+   set_feature (FEATURE_AVXVNNIINT16);
+ if (eax & bit_SM3)
+   set_feature (FEATURE_SM3);
+ if (eax & bit_SHA512)
+   set_feature (FEATURE_SHA512);
+ if (eax & bit_SM4)
+   set_feature (FEATURE_SM4);
+   }
+ if (avx512_usable)
+   {
+ if (eax & bit_AVX512BF16)
+   set_feature (FEATURE_AVX512BF16);
+   }
+ if (amx_usable)
+   {
+ if (eax & bit_AMX_FP16)
+   set_feature (FEATURE_AMX_FP16);
+ if (edx & bit_AMX_COMPLEX)
+   set_feature (FEATURE_AMX_COMPLEX);
+   }
}
 }
 
-- 
2.31.1

[PATCH V2] [X86] Workaround possible CPUID bug in Sandy Bridge.

2023-08-08 Thread liuhongt via Gcc-patches

> Please rather do it in a more self-descriptive way, as proposed in the
> attached patch. You won't need a comment then.
>

Adjusted in V2 patch.

Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
supported via EAX.

Intel documentation says invalid subleaves return 0. We had been
relying on that behavior instead of checking the max subleaf number.

It appears that some Sandy Bridge CPUs return at least the subleaf 0
EDX value for subleaf 1. Best guess is that this is a bug in a
microcode patch since all of the bits we're seeing set in EDX were
introduced after Sandy Bridge was originally released.

This is causing avxvnniint16 to be incorrectly enabled with
-march=native on these CPUs.

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features): Check
EAX for valid subleaf before using CPUID.
---
 gcc/common/config/i386/cpuinfo.h | 82 +---
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 30ef0d334ca..9fa4dec2a7e 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -663,6 +663,7 @@ get_available_features (struct __processor_model *cpu_model,
   unsigned int max_cpuid_level = cpu_model2->__cpu_max_level;
   unsigned int eax, ebx;
   unsigned int ext_level;
+  unsigned int subleaf_level;
 
   /* Get XCR_XFEATURE_ENABLED_MASK register with xgetbv.  */
 #define XCR_XFEATURE_ENABLED_MASK  0x0
@@ -762,7 +763,7 @@ get_available_features (struct __processor_model *cpu_model,
   /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
   if (max_cpuid_level >= 7)
 {
-  __cpuid_count (7, 0, eax, ebx, ecx, edx);
+  __cpuid_count (7, 0, subleaf_level, ebx, ecx, edx);
   if (ebx & bit_BMI)
set_feature (FEATURE_BMI);
   if (ebx & bit_SGX)
@@ -874,45 +875,48 @@ get_available_features (struct __processor_model 
*cpu_model,
set_feature (FEATURE_AVX512FP16);
}
 
-  __cpuid_count (7, 1, eax, ebx, ecx, edx);
-  if (eax & bit_HRESET)
-   set_feature (FEATURE_HRESET);
-  if (eax & bit_CMPCCXADD)
-   set_feature(FEATURE_CMPCCXADD);
-  if (edx & bit_PREFETCHI)
-   set_feature (FEATURE_PREFETCHI);
-  if (eax & bit_RAOINT)
-   set_feature (FEATURE_RAOINT);
-  if (avx_usable)
-   {
- if (eax & bit_AVXVNNI)
-   set_feature (FEATURE_AVXVNNI);
- if (eax & bit_AVXIFMA)
-   set_feature (FEATURE_AVXIFMA);
- if (edx & bit_AVXVNNIINT8)
-   set_feature (FEATURE_AVXVNNIINT8);
- if (edx & bit_AVXNECONVERT)
-   set_feature (FEATURE_AVXNECONVERT);
- if (edx & bit_AVXVNNIINT16)
-   set_feature (FEATURE_AVXVNNIINT16);
- if (eax & bit_SM3)
-   set_feature (FEATURE_SM3);
- if (eax & bit_SHA512)
-   set_feature (FEATURE_SHA512);
- if (eax & bit_SM4)
-   set_feature (FEATURE_SM4);
-   }
-  if (avx512_usable)
-   {
- if (eax & bit_AVX512BF16)
-   set_feature (FEATURE_AVX512BF16);
-   }
-  if (amx_usable)
+  if (subleaf_level >= 1)
{
- if (eax & bit_AMX_FP16)
-   set_feature (FEATURE_AMX_FP16);
- if (edx & bit_AMX_COMPLEX)
-   set_feature (FEATURE_AMX_COMPLEX);
+ __cpuid_count (7, 1, eax, ebx, ecx, edx);
+ if (eax & bit_HRESET)
+   set_feature (FEATURE_HRESET);
+ if (eax & bit_CMPCCXADD)
+   set_feature(FEATURE_CMPCCXADD);
+ if (edx & bit_PREFETCHI)
+   set_feature (FEATURE_PREFETCHI);
+ if (eax & bit_RAOINT)
+   set_feature (FEATURE_RAOINT);
+ if (avx_usable)
+   {
+ if (eax & bit_AVXVNNI)
+   set_feature (FEATURE_AVXVNNI);
+ if (eax & bit_AVXIFMA)
+   set_feature (FEATURE_AVXIFMA);
+ if (edx & bit_AVXVNNIINT8)
+   set_feature (FEATURE_AVXVNNIINT8);
+ if (edx & bit_AVXNECONVERT)
+   set_feature (FEATURE_AVXNECONVERT);
+ if (edx & bit_AVXVNNIINT16)
+   set_feature (FEATURE_AVXVNNIINT16);
+ if (eax & bit_SM3)
+   set_feature (FEATURE_SM3);
+ if (eax & bit_SHA512)
+   set_feature (FEATURE_SHA512);
+ if (eax & bit_SM4)
+   set_feature (FEATURE_SM4);
+   }
+ if (avx512_usable)
+   {
+ if (eax & bit_AVX512BF16)
+   set_feature (FEATURE_AVX512BF16);
+   }
+ if (amx_usable)
+   {
+ if (eax & bit_AMX_FP16)
+   set_feature (FEATURE_AMX_FP16);
+ if (edx & bit_AMX_COMPLEX)
+   set_feature (FEATURE_AMX_COMPLEX);
+   }
}
 }
 
-- 
2.31.1

[PATCH] Rename local variable subleaf_level to max_subleaf_level.

2023-08-08 Thread liuhongt via Gcc-patches

This minor fix is preapproved in [1].
Committed to trunk.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-August/626758.html

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Rename local variable subleaf_level to max_subleaf_level.
---
 gcc/common/config/i386/cpuinfo.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 9fa4dec2a7e..70e8d01e09b 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -663,7 +663,6 @@ get_available_features (struct __processor_model *cpu_model,
   unsigned int max_cpuid_level = cpu_model2->__cpu_max_level;
   unsigned int eax, ebx;
   unsigned int ext_level;
-  unsigned int subleaf_level;
 
   /* Get XCR_XFEATURE_ENABLED_MASK register with xgetbv.  */
 #define XCR_XFEATURE_ENABLED_MASK  0x0
@@ -763,7 +762,8 @@ get_available_features (struct __processor_model *cpu_model,
   /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
   if (max_cpuid_level >= 7)
 {
-  __cpuid_count (7, 0, subleaf_level, ebx, ecx, edx);
+  unsigned int max_subleaf_level;
+  __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx);
   if (ebx & bit_BMI)
set_feature (FEATURE_BMI);
   if (ebx & bit_SGX)
@@ -875,7 +875,7 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX512FP16);
}
 
-  if (subleaf_level >= 1)
+  if (max_subleaf_level >= 1)
{
  __cpuid_count (7, 1, eax, ebx, ecx, edx);
  if (eax & bit_HRESET)
-- 
2.31.1

[PATCH] i386: Do not sanitize upper part of V2HFmode and V4HFmode reg with -fno-trapping-math [PR110832]

2023-08-09 Thread liuhongt via Gcc-patches

Also add ix86_partial_vec_fp_math to the condition of the V2HF/V4HF named
patterns in order to avoid generation of partial vector V8HFmode
trapping instructions.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110832
* config/i386/mmx.md (movq_<mode>_to_sse): Also do not
sanitize upper part of V4HFmode register with
-fno-trapping-math.
(<insn>v4hf3): Enable for ix86_partial_vec_fp_math.
(divv4hf3): Ditto.
(<insn>v2hf3): Ditto.
(divv2hf3): Ditto.
(movd_v2hf_to_sse): Do not sanitize upper part of V2HFmode
register with -fno-trapping-math.
---
 gcc/config/i386/mmx.md | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index d51b3b9dc71..170432a7128 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -596,7 +596,7 @@ (define_expand "movq__to_sse"
  (match_dup 2)))]
   "TARGET_SSE2"
 {
-  if (mode == V2SFmode
+  if (mode != V2SImode
   && !flag_trapping_math)
 {
   rtx op1 = force_reg (mode, operands[1]);
@@ -1941,7 +1941,7 @@ (define_expand "v4hf3"
(plusminusmult:V4HF
  (match_operand:V4HF 1 "nonimmediate_operand")
  (match_operand:V4HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
@@ -1961,7 +1961,7 @@ (define_expand "divv4hf3"
(div:V4HF
  (match_operand:V4HF 1 "nonimmediate_operand")
  (match_operand:V4HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
@@ -1983,14 +1983,22 @@ (define_expand "movd_v2hf_to_sse"
(match_operand:V2HF 1 "nonimmediate_operand"))
  (match_operand:V8HF 2 "reg_or_0_operand")
  (const_int 3)))]
-  "TARGET_SSE")
+  "TARGET_SSE"
+{
+  if (!flag_trapping_math && operands[2] == CONST0_RTX (V8HFmode))
+  {
+rtx op1 = force_reg (V2HFmode, operands[1]);
+emit_move_insn (operands[0], lowpart_subreg (V8HFmode, op1, V2HFmode));
+DONE;
+  }
+})
 
 (define_expand "v2hf3"
   [(set (match_operand:V2HF 0 "register_operand")
(plusminusmult:V2HF
  (match_operand:V2HF 1 "nonimmediate_operand")
  (match_operand:V2HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
@@ -2009,7 +2017,7 @@ (define_expand "divv2hf3"
(div:V2HF
  (match_operand:V2HF 1 "nonimmediate_operand")
  (match_operand:V2HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
-- 
2.31.1

[PATCH] Support -m[no-]gather -m[no-]scatter to enable/disable vectorization for all gather/scatter instructions.

2023-08-09 Thread liuhongt via Gcc-patches

Currently we have 3 independent tunes for gather:
"use_gather,use_gather_2parts,use_gather_4parts".
Similarly, for scatter there are
"use_scatter,use_scatter_2parts,use_scatter_4parts".

The patch supports 2 standardized options to enable/disable
vectorization for all gather/scatter instructions. The options are
translated by the driver into the 3 tunes.

bootstrapped and regtested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.h (DRIVER_SELF_SPECS): Add
GATHER_SCATTER_DRIVER_SELF_SPECS.
(GATHER_SCATTER_DRIVER_SELF_SPECS): New macro.
* config/i386/i386.opt (mgather): New option.
(mscatter): Ditto.
---
 gcc/config/i386/i386.h   | 12 +++-
 gcc/config/i386/i386.opt |  8 
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index ef342fcee9b..d9ac2c29bde 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -565,7 +565,17 @@ extern GTY(()) tree x86_mfence;
 # define SUBTARGET_DRIVER_SELF_SPECS ""
 #endif
 
-#define DRIVER_SELF_SPECS SUBTARGET_DRIVER_SELF_SPECS
+#ifndef GATHER_SCATTER_DRIVER_SELF_SPECS
+# define GATHER_SCATTER_DRIVER_SELF_SPECS \
+  "%{mno-gather:-mtune-ctrl=^use_gather_2parts,^use_gather_4parts,^use_gather} 
\
+   %{mgather:-mtune-ctrl=use_gather_2parts,use_gather_4parts,use_gather} \
+   
%{mno-scatter:-mtune-ctrl=^use_scatter_2parts,^use_scatter_4parts,^use_scatter} 
\
+   %{mscatter:-mtune-ctrl=use_scatter_2parts,use_scatter_4parts,use_scatter}"
+#endif
+
+#define DRIVER_SELF_SPECS \
+  SUBTARGET_DRIVER_SELF_SPECS " " \
+  GATHER_SCATTER_DRIVER_SELF_SPECS
 
 /* -march=native handling only makes sense with compiler running on
an x86 or x86_64 chip.  If changing this condition, also change
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index ddb7f110aa2..99948644a8d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -424,6 +424,14 @@ mdaz-ftz
 Target
 Set the FTZ and DAZ Flags.
 
+mgather
+Target
+Enable vectorization for gather instruction.
+
+mscatter
+Target
+Enable vectorization for scatter instruction.
+
 mpreferred-stack-boundary=
 Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
 Attempt to keep stack aligned to this power of 2.
-- 
2.31.1

[PATCH] Software mitigation: Disable gather generation in vectorization for GDS affected Intel Processors.

2023-08-10 Thread liuhongt via Gcc-patches

For more details of GDS (Gather Data Sampling), refer to
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html

After the microcode update, there is a performance regression. To avoid that,
the patch disables gather generation in autovectorization and uses
scalar emulation of gather instead.

Ready to push to trunk and backport.
Any comments?

gcc/ChangeLog:

* config/i386/i386-options.cc (m_GDS): New macro.
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't
enable for m_GDS.
(X86_TUNE_USE_GATHER_4PARTS): Ditto.
(X86_TUNE_USE_GATHER): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-gather-2.c: Adjust options to keep
gather vectorization.
* gcc.target/i386/avx2-gather-6.c: Ditto.
* gcc.target/i386/avx512f-pr88464-1.c: Ditto.
* gcc.target/i386/avx512f-pr88464-5.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-1.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-11.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-3.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-9.c: Ditto.
* gcc.target/i386/pr88531-1b.c: Ditto.
* gcc.target/i386/pr88531-1c.c: Ditto.
---
 gcc/config/i386/i386-options.cc | 5 +
 gcc/config/i386/x86-tune.def| 6 +++---
 gcc/testsuite/gcc.target/i386/avx2-gather-2.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx2-gather-6.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr88531-1b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr88531-1c.c  | 2 +-
 12 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 127ee24203c..e6ba33c370d 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -141,6 +141,11 @@ along with GCC; see the file COPYING3.  If not see
 #define m_ARROWLAKE (HOST_WIDE_INT_1U<

[PATCH V2] Support -m[no-]gather -m[no-]scatter to enable/disable vectorization for all gather/scatter instructions

2023-08-10 Thread liuhongt via Gcc-patches

Rename the original use_gather to use_gather_8parts. Support
-mtune-ctrl={,^}use_gather to set/clear the tune features
use_gather_{2parts, 4parts, 8parts}. Support the new option -m[no-]gather
as an alias of -mtune-ctrl={,^}use_gather.

Similar for use_scatter.

How about this version?

gcc/ChangeLog:

* config/i386/i386-builtins.cc
(ix86_vectorize_builtin_gather): Adjust for use_gather_8parts.
* config/i386/i386-options.cc (parse_mtune_ctrl_str):
Set/Clear tune features use_{gather,scatter}_{2parts, 4parts,
8parts} for -mtune-ctrl={,^}{use_gather,use_scatter}.
* config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust
for use_scatter_8parts
* config/i386/i386.h (TARGET_USE_GATHER): Rename to ..
(TARGET_USE_GATHER_8PARTS): .. this.
(TARGET_USE_SCATTER): Rename to ..
(TARGET_USE_SCATTER_8PARTS): .. this.
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to
(X86_TUNE_USE_GATHER_8PARTS): .. this.
(X86_TUNE_USE_SCATTER): Rename to
(X86_TUNE_USE_SCATTER_8PARTS): .. this.
* config/i386/i386.opt: Add new options mgather, mscatter.
---
 gcc/config/i386/i386-builtins.cc |  2 +-
 gcc/config/i386/i386-options.cc  | 54 +++-
 gcc/config/i386/i386.cc  |  2 +-
 gcc/config/i386/i386.h   |  8 ++---
 gcc/config/i386/i386.opt |  8 +
 gcc/config/i386/x86-tune.def |  4 +--
 6 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index 356b6dfd5fb..8a0b8dfe073 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -1657,7 +1657,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
  ? !TARGET_USE_GATHER_2PARTS
  : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u)
 ? !TARGET_USE_GATHER_4PARTS
-: !TARGET_USE_GATHER)))
+: !TARGET_USE_GATHER_8PARTS)))
 return NULL_TREE;
 
   if ((TREE_CODE (index_type) != INTEGER_TYPE
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 127ee24203c..b8d038af69d 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1731,20 +1731,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool 
dump)
   curr_feature_string++;
   clear = true;
 }
-  for (i = 0; i < X86_TUNE_LAST; i++)
-{
-  if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
-{
-  ix86_tune_features[i] = !clear;
-  if (dump)
-fprintf (stderr, "Explicitly %s feature %s\n",
- clear ? "clear" : "set", ix86_tune_feature_names[i]);
-  break;
-}
-}
-  if (i == X86_TUNE_LAST)
-   error ("unknown parameter to option %<-mtune-ctrl%>: %s",
-  clear ? curr_feature_string - 1 : curr_feature_string);
+
+  if (!strcmp (curr_feature_string, "use_gather"))
+   {
+ ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear;
+ if (dump)
+   fprintf (stderr, "Explicitly %s features use_gather_2parts,"
+" use_gather_4parts, use_gather_8parts\n",
+clear ? "clear" : "set");
+
+   }
+  else if (!strcmp (curr_feature_string, "use_scatter"))
+   {
+ ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear;
+ if (dump)
+   fprintf (stderr, "Explicitly %s features use_scatter_2parts,"
+" use_scatter_4parts, use_scatter_8parts\n",
+clear ? "clear" : "set");
+   }
+  else
+   {
+ for (i = 0; i < X86_TUNE_LAST; i++)
+   {
+ if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
+   {
+ ix86_tune_features[i] = !clear;
+ if (dump)
+   fprintf (stderr, "Explicitly %s feature %s\n",
+clear ? "clear" : "set", 
ix86_tune_feature_names[i]);
+ break;
+   }
+   }
+
+ if (i == X86_TUNE_LAST)
+   error ("unknown parameter to option %<-mtune-ctrl%>: %s",
+  clear ? curr_feature_string - 1 : curr_feature_string);
+   }
   curr_feature_string = next_feature_string;
 }
   while (curr_feature_string);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d592ece700a..cd49fb9e47a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -19193,7 +19193,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
   ? !TARGET_USE_SCATTER_2

[PATCH] Generate vmovapd instead of vmovsd for moving DFmode between SSE_REGS.

2023-08-13 Thread liuhongt via Gcc-patches

vmovapd can enable register renaming and has the same code size as
vmovsd. Similarly for vmovsh vs. vmovaps: vmovaps is 1 byte shorter than
vmovsh.

When TARGET_AVX512VL is not available, still generate
vmovsd/vmovss/vmovsh to avoid vmovapd/vmovaps zmm16-31.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.md (movdf_internal): Generate vmovapd instead of
vmovsd when moving DFmode between SSE_REGS.
(movhi_internal): Generate vmovdqa instead of vmovsh when
moving HImode between SSE_REGS.
(mov_internal): Use vmovaps instead of vmovsh when
moving HF/BFmode between SSE_REGS.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr89229-4a.c: Adjust testcase.
---
 gcc/config/i386/i386.md| 20 +---
 gcc/testsuite/gcc.target/i386/pr89229-4a.c |  4 +---
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c906d75b13e..77182e34fe1 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2961,8 +2961,12 @@ (define_insn "*movhi_internal"
]
(const_string "TI"))
(eq_attr "alternative" "12")
- (cond [(match_test "TARGET_AVX512FP16")
+ (cond [(match_test "TARGET_AVX512VL")
+  (const_string "TI")
+(match_test "TARGET_AVX512FP16")
   (const_string "HF")
+(match_test "TARGET_AVX512F")
+  (const_string "SF")
 (match_test "TARGET_AVX")
   (const_string "TI")
 (ior (not (match_test "TARGET_SSE2"))
@@ -4099,8 +4103,12 @@ (define_insn "*movdf_internal"
 
   /* movaps is one byte shorter for non-AVX targets.  */
   (eq_attr "alternative" "13,17")
-(cond [(match_test "TARGET_AVX")
+(cond [(match_test "TARGET_AVX512VL")
+ (const_string "V2DF")
+   (match_test "TARGET_AVX512F")
  (const_string "DF")
+   (match_test "TARGET_AVX")
+ (const_string "V2DF")
(ior (not (match_test "TARGET_SSE2"))
 (match_test "optimize_function_for_size_p (cfun)"))
  (const_string "V4SF")
@@ -4380,8 +4388,14 @@ (define_insn "*mov_internal"
   (const_string "HI")
   (const_string "TI"))
   (eq_attr "alternative" "5")
-(cond [(match_test "TARGET_AVX512FP16")
+(cond [(match_test "TARGET_AVX512VL")
+   (const_string "V4SF")
+   (match_test "TARGET_AVX512FP16")
  (const_string "HF")
+   (match_test "TARGET_AVX512F")
+ (const_string "SF")
+   (match_test "TARGET_AVX")
+ (const_string "V4SF")
(ior (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
 (match_test "TARGET_SSE_SPLIT_REGS"))
  (const_string "V4SF")
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-4a.c 
b/gcc/testsuite/gcc.target/i386/pr89229-4a.c
index 5bc10d25619..8869650b0ad 100644
--- a/gcc/testsuite/gcc.target/i386/pr89229-4a.c
+++ b/gcc/testsuite/gcc.target/i386/pr89229-4a.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do assemble { target { ! ia32 } } } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern double d;
@@ -12,5 +12,3 @@ foo1 (double x)
   asm volatile ("" : "+v" (xmm17));
   d = xmm17;
 }
-
-/* { dg-final { scan-assembler-not "vmovapd" } } */
-- 
2.31.1

[PATCH] Support -march=gracemont

2023-08-17 Thread liuhongt via Gcc-patches

Alderlake-N is E-core only, add it as an alias of Alderlake.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_intel_cpu): Detect
Alderlake-N.
* common/config/i386/i386-common.cc (alias_table): Support
-march=gracemont as an alias of -march=alderlake.
---
 gcc/common/config/i386/cpuinfo.h  | 3 +++
 gcc/common/config/i386/i386-common.cc | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 13102b9c5dc..941f728b48b 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -533,6 +533,9 @@ get_intel_cpu (struct __processor_model *cpu_model,
   cpu_model->__cpu_type = INTEL_COREI7;
   cpu_model->__cpu_subtype = INTEL_COREI7_TIGERLAKE;
   break;
+
+case 0xbe:
+  /* Alder Lake N, E-core only.  */
 case 0x97:
 case 0x9a:
   /* Alder Lake.  */
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 26005914079..8aa8bf12d76 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -2190,6 +2190,8 @@ const pta processor_alias_table[] =
 M_CPU_TYPE (INTEL_GOLDMONT_PLUS), P_PROC_SSE4_2},
   {"tremont", PROCESSOR_TREMONT, CPU_HASWELL, PTA_TREMONT,
 M_CPU_TYPE (INTEL_TREMONT), P_PROC_SSE4_2},
+  {"gracemont", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
+   M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
   {"sierraforest", PROCESSOR_SIERRAFOREST, CPU_HASWELL, PTA_SIERRAFOREST,
 M_CPU_SUBTYPE (INTEL_SIERRAFOREST), P_PROC_AVX2},
   {"grandridge", PROCESSOR_GRANDRIDGE, CPU_HASWELL, PTA_GRANDRIDGE,
-- 
2.31.1

[PATCH] Mention Intel -march=gracemont for Alderlake-N.

2023-08-20 Thread liuhongt via Gcc-patches

---
 htdocs/gcc-14/changes.html | 4 
 1 file changed, 4 insertions(+)

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index eae25f1a..2c888660 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -151,6 +151,10 @@ a work-in-progress.
 -march=lunarlake.
 Lunar Lake is based on Arrow Lake S.
   
+  GCC now supports the Intel CPU named Alderlake-N through
+  -march=gracemont.
+  Alderlake-N is E-core only, not hybrid architecture.
+  
 
 
 
-- 
2.31.1

[PATCH] Adjust testcase for Intel GDS.

2023-08-21 Thread liuhongt via Gcc-patches

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-pr88464-2.c: Add -mgather to
options.
* gcc.target/i386/avx512f-pr88464-3.c: Ditto.
* gcc.target/i386/avx512f-pr88464-4.c: Ditto.
* gcc.target/i386/avx512f-pr88464-6.c: Ditto.
* gcc.target/i386/avx512f-pr88464-7.c: Ditto.
* gcc.target/i386/avx512f-pr88464-8.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-10.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-12.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-13.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-14.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-15.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-16.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-2.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-4.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-5.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-6.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-7.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-8.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-8.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-10.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-12.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-13.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-14.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-15.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-16.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-2.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-4.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-5.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-6.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-7.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-8.c  | 2 +-
 18 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c
index 845bf509d82..28827dbd75d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do run { target { avx512f } } } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 
-mtune=skylake-avx512" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-mgather" } */
 
 #include "avx512f-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c
index 9eda4aa9b13..2df64bfa063 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do compile } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-fdump-tree-vect-details -mgather" } */
 /* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 
"vect" } } */
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" 
} } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c
index e347e63b17a..173858aadd5 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do run { target { avx512f } } } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 
-mtune=skylake-avx512" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-mgather" } */
 
 #include "avx512f-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c
index 9ebb72a5bae..0adf3b6726a 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do run { target { avx512f } } } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 
-mtune=skylake-avx512" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-mgather" } */
 
 #include "avx512f-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c
index 738640c2bf5..471ebc1676d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do compile

[PATCH] [x86] Testcase fix.

2023-08-21 Thread liuhongt via Gcc-patches

Committed as an obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/invariant-ternlog-1.c: Only scan %rdx under
TARGET_64BIT.
---
 gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c 
b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
index 21051c6bba0..bf67ed7e43d 100644
--- a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
+++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
 /* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
-/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 } } */
+/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 { target { 
! ia32 } } } } */
 
 #include 
 
-- 
2.31.1

[PATCH] [vect] Use intermediate integer type for float_expr/fix_trunc_expr when direct optab does not exist.

2023-06-20 Thread liuhongt via Gcc-patches

I noticed there was some refactoring in vectorizable_conversion
for code_helper, so I've adjusted my patch accordingly.
Here's the patch I'm going to commit.

We already use an intermediate type in the WIDEN case, but not for NONE;
this patch extends that to the NONE case.

gcc/ChangeLog:

PR target/110018
* tree-vect-stmts.cc (vectorizable_conversion): Use
intermediate integer type for float_expr/fix_trunc_expr when
direct optab does not exist.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110018-1.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr110018-1.c | 94 ++
 gcc/tree-vect-stmts.cc | 66 ++-
 2 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110018-1.c

diff --git a/gcc/testsuite/gcc.target/i386/pr110018-1.c 
b/gcc/testsuite/gcc.target/i386/pr110018-1.c
new file mode 100644
index 000..b1baffd7af1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110018-1.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq" } */
+/* { dg-final { scan-assembler-times {(?n)vcvttp[dsh]2[dqw]} 5 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvt[dqw]*2p[dsh]} 5 } } */
+
+void
+foo (double* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo1 (float* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo2 (_Float16* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+foo3 (double* __restrict a, short* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo4 (float* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo5 (double* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo6 (float* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo7 (_Float16* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+foo8 (double* __restrict b, short* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo9 (float* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 056a0ecb2be..ae24f3e66e6 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5041,7 +5041,7 @@ vectorizable_conversion (vec_info *vinfo,
   tree scalar_dest;
   tree op0, op1 = NULL_TREE;
   loop_vec_info loop_vinfo = dyn_cast  (vinfo);
-  tree_code tc1;
+  tree_code tc1, tc2;
   code_helper code, code1, code2;
   code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
   tree new_temp;
@@ -5249,6 +5249,57 @@ vectorizable_conversion (vec_info *vinfo,
code1 = tc1;
break;
   }
+
+  /* For conversions between float and smaller integer types try whether we
+can use intermediate signed integer types to support the
+conversion.  */
+  if ((code == FLOAT_EXPR
+  && GET_MODE_SIZE (lhs_mode) > GET_MODE_SIZE (rhs_mode))
+ || (code == FIX_TRUNC_EXPR
+ && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)))
+   {
+ bool float_expr_p = code == FLOAT_EXPR;
+ scalar_mode imode = float_expr_p ? rhs_mode : lhs_mode;
+ fltsz = GET_MODE_SIZE (float_expr_p ? lhs_mode : rhs_mode);
+ code1 = float_expr_p ? code : NOP_EXPR;
+ codecvt1 = float_expr_p ? NOP_EXPR : code;
+ FOR_EACH_2XWIDER_MODE (rhs_mode_iter, imode)
+   {
+ imode = rhs_mode_iter.require ();
+ if (GET_MODE_SIZE (imode) > fltsz)
+   break;
+
+ cvt_type
+   = build_nonstandard_integer_type (GET_MODE_BITSIZE (imode),
+ 0);
+ cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type,
+ slp_node);
+ /* This should only happened for SLP as long as loop vectorizer
+only supports same-sized vector.  */
+ if (cvt_type == NULL_TREE
+ || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in)
+ || !supportable_convert_operation ((tree_code) code1,
+vectype_out,
+cvt_type, &tc1)
+ || !supportable_convert_operation ((tree_code) codecvt1,
+cvt_type,
+vectype_in, &tc2))
+   continue;
+
+ found_mode = true;
+ break;
+   }
+
+ if (found_mode)
+   {
+ multi_step_cvt++;
+

[PATCH] Refine maskloadmn pattern with UNSPEC_MASKLOAD.

2023-06-20 Thread liuhongt via Gcc-patches

If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent
it from being transformed into vpblendd.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to master.

gcc/ChangeLog:

PR target/110309
* config/i386/sse.md (maskload):
Refine pattern with UNSPEC_MASKLOAD.
(maskload): Ditto.
(*_load_mask): Extend mode iterator to
VI12HFBF_AVX512VL.
(*_load): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110309.c: New test.
---
 gcc/config/i386/sse.md   | 32 +---
 gcc/testsuite/gcc.target/i386/pr110309.c | 10 
 2 files changed, 28 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 87570357db6..4d1f7ac8d7e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1465,12 +1465,12 @@ (define_expand "_load_mask"
 })
 
 (define_insn "*_load_mask"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
-   (vec_merge:VI12_AVX512VL
- (unspec:VI12_AVX512VL
-   [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "register_operand" "=v")
+   (vec_merge:VI12HFBF_AVX512VL
+ (unspec:VI12HFBF_AVX512VL
+   [(match_operand:VI12HFBF_AVX512VL 1 "memory_operand" "m")]
UNSPEC_MASKLOAD)
- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
+ (match_operand:VI12HFBF_AVX512VL 2 "nonimm_or_0_operand" "0C")
  (match_operand: 3 "register_operand" "Yk")))]
   "TARGET_AVX512BW"
   "vmovdqu\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
@@ -1479,9 +1479,9 @@ (define_insn "*_load_mask"
(set_attr "mode" "")])
 
 (define_insn_and_split "*_load"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
-   (unspec:VI12_AVX512VL
- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "register_operand" "=v")
+   (unspec:VI12HFBF_AVX512VL
+ [(match_operand:VI12HFBF_AVX512VL 1 "memory_operand" "m")]
  UNSPEC_MASKLOAD))]
   "TARGET_AVX512BW"
   "#"
@@ -26883,17 +26883,21 @@ (define_expand "maskload"
   "TARGET_AVX")
 
 (define_expand "maskload"
-  [(set (match_operand:V48H_AVX512VL 0 "register_operand")
-   (vec_merge:V48H_AVX512VL
- (match_operand:V48H_AVX512VL 1 "memory_operand")
+  [(set (match_operand:V48_AVX512VL 0 "register_operand")
+   (vec_merge:V48_AVX512VL
+ (unspec:V48_AVX512VL
+   [(match_operand:V48_AVX512VL 1 "memory_operand")]
+   UNSPEC_MASKLOAD)
  (match_dup 0)
  (match_operand: 2 "register_operand")))]
   "TARGET_AVX512F")
 
 (define_expand "maskload"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
-   (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "memory_operand")
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "register_operand")
+   (vec_merge:VI12HFBF_AVX512VL
+ (unspec:VI12HFBF_AVX512VL
+   [(match_operand:VI12HFBF_AVX512VL 1 "memory_operand")]
+   UNSPEC_MASKLOAD)
  (match_dup 0)
  (match_operand: 2 "register_operand")))]
   "TARGET_AVX512BW")
diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c 
b/gcc/testsuite/gcc.target/i386/pr110309.c
new file mode 100644
index 000..f6e9e9c3c61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110309.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 
-mprefer-vector-width=256" } */
+/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */
+
+
+void foo (int * __restrict a, int *b)
+{
+  for (int i = 0; i < 6; ++i)
+a[i] = b[i] + 42;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 2/3] Don't use intermediate type for FIX_TRUNC_EXPR when ftrapping-math.

2023-06-25 Thread liuhongt via Gcc-patches

> > Hmm, good question.  GENERIC has a direct truncation to unsigned char
> > for example, the C standard generally says if the integral part cannot
> > be represented then the behavior is undefined.  So I think we should be
> > safe here (0x1.0p32 doesn't fit an int).
>
> We should be following Annex F (unspecified value plus "invalid" exception
> for out-of-range floating-to-integer conversions rather than undefined
> behavior).  But we don't achieve that very well at present (see bug 93806
> comments 27-29 for examples of how such conversions produce wobbly
> values).

That would mean guarding this with !flag_trapping_math would be the appropriate
thing to do.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and aarch64-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/110371
PR tree-optimization/110018
* tree-vect-stmts.cc (vectorizable_conversion): Don't use
intermediate type for FIX_TRUNC_EXPR when ftrapping-math.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110018-1.c: Add -fno-trapping-math to dg-options.
* gcc.target/i386/pr110018-2.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr110018-1.c | 2 +-
 gcc/testsuite/gcc.target/i386/pr110018-2.c | 2 +-
 gcc/tree-vect-stmts.cc | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr110018-1.c 
b/gcc/testsuite/gcc.target/i386/pr110018-1.c
index b6a3be7b7a2..24eeca60f6f 100644
--- a/gcc/testsuite/gcc.target/i386/pr110018-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr110018-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq" } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq -fno-trapping-math" } 
*/
 /* { dg-final { scan-assembler-times {(?n)vcvttp[dsh]2[dqw]} 5 } } */
 /* { dg-final { scan-assembler-times {(?n)vcvt[dqw]*2p[dsh]} 5 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/pr110018-2.c 
b/gcc/testsuite/gcc.target/i386/pr110018-2.c
index a663e074698..9a2d9e17894 100644
--- a/gcc/testsuite/gcc.target/i386/pr110018-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr110018-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq" } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq -fno-trapping-math" } 
*/
 /* { dg-final { scan-assembler-times {(?n)vcvttp[dsh]2[dqw]} 5 } } */
 /* { dg-final { scan-assembler-times {(?n)vcvt[dqw]*2p[dsh]} 5 } } */
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 1748555a625..bf61461939b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5263,7 +5263,8 @@ vectorizable_conversion (vec_info *vinfo,
   if ((code == FLOAT_EXPR
   && GET_MODE_SIZE (lhs_mode) > GET_MODE_SIZE (rhs_mode))
  || (code == FIX_TRUNC_EXPR
- && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)))
+ && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)
+ && !flag_trapping_math))
{
  bool float_expr_p = code == FLOAT_EXPR;
  scalar_mode imode = float_expr_p ? rhs_mode : lhs_mode;
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 3/3] [aarch64] Adjust testcase to match assembly output after r14-2007.

2023-06-25 Thread liuhongt via Gcc-patches

The new assembly looks better than the original one, so I adjusted those testcases.
Ok for trunk?

gcc/testsuite/ChangeLog:

PR tree-optimization/110371
PR tree-optimization/110018
* gcc.target/aarch64/sve/unpack_fcvt_signed_1.c: Scan scvtf +
sxtw instead of scvtf + zip1 + zip2.
* gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c: Scan scvtf +
uxtw instead of ucvtf + zip1 + zip2.
---
 gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c | 6 +++---
 .../gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
index 0f96dc2ff00..5edc288ce35 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
@@ -10,6 +10,6 @@ unpack_double_int_plus8 (double *d, int32_t *s, int size)
 d[i] = s[i] + 8;
 }
 
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxtw\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
index 70465f91eba..ecd72176177 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
@@ -10,6 +10,5 @@ unpack_double_int_plus9 (double *d, uint32_t *s, int size)
 d[i] = (double) (s[i] + 9);
 }
 
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxtw\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 1/3] Use cvt_op to save intermediate type operand instead of "subtle" vec_dest.

2023-06-25 Thread liuhongt via Gcc-patches

When there are multiple operands in vec_oprnds0, vec_dest is
overwritten with vectype_out, but in the multi_step_cvt case, cvt_type is
expected. This caused an ICE in verify_gimple_in_cfg.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and aarch64-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/110371
PR tree-optimization/110018
* tree-vect-stmts.cc (vectorizable_conversion): Use cvt_op to
save intermediate type operand instead of "subtle" vec_dest
for case NONE.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr110371.c: New test.
---
 gcc/testsuite/gcc.target/aarch64/pr110371.c | 20 
 gcc/tree-vect-stmts.cc  | 14 ++
 2 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110371.c

diff --git a/gcc/testsuite/gcc.target/aarch64/pr110371.c 
b/gcc/testsuite/gcc.target/aarch64/pr110371.c
new file mode 100644
index 000..444e514e04f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr110371.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+typedef struct dest
+{
+  double m[3][3];
+} dest;
+
+typedef struct src
+{
+  int m[3][3];
+} src;
+
+void
+foo (dest *a, src* s)
+{
+  for (int i = 0; i != 3; i++)
+for (int j = 0; j != 3; j++)
+  a->m[i][j] = s->m[i][j];
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 85d1f3ae52c..1748555a625 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5044,7 +5044,7 @@ vectorizable_conversion (vec_info *vinfo,
 gimple **vec_stmt, slp_tree slp_node,
 stmt_vector_for_cost *cost_vec)
 {
-  tree vec_dest;
+  tree vec_dest, cvt_op = NULL_TREE;
   tree scalar_dest;
   tree op0, op1 = NULL_TREE;
   loop_vec_info loop_vinfo = dyn_cast  (vinfo);
@@ -5568,6 +5568,13 @@ vectorizable_conversion (vec_info *vinfo,
 case NONE:
   vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
 op0, &vec_oprnds0);
+  /* vec_dest is intermediate type operand when multi_step_cvt.  */
+  if (multi_step_cvt)
+   {
+ cvt_op = vec_dest;
+ vec_dest = vec_dsts[0];
+   }
+
   FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
{
  /* Arguments are ready, create the new vector stmt.  */
@@ -5575,12 +5582,11 @@ vectorizable_conversion (vec_info *vinfo,
  if (multi_step_cvt)
{
  gcc_assert (multi_step_cvt == 1);
- new_stmt = vect_gimple_build (vec_dest, codecvt1, vop0);
- new_temp = make_ssa_name (vec_dest, new_stmt);
+ new_stmt = vect_gimple_build (cvt_op, codecvt1, vop0);
+ new_temp = make_ssa_name (cvt_op, new_stmt);
  gimple_assign_set_lhs (new_stmt, new_temp);
  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
  vop0 = new_temp;
- vec_dest = vec_dsts[0];
}
  new_stmt = vect_gimple_build (vec_dest, code1, vop0);
  new_temp = make_ssa_name (vec_dest, new_stmt);
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Issue a warning for conversion between short and __bf16 under TARGET_AVX512BF16.

2023-06-26 Thread liuhongt via Gcc-patches

__bfloat16 has been redefined from typedef short to real __bf16 since GCC
V13. The patch issues a warning for potential silent implicit
conversion between __bf16 and short where users may only expect a
data movement.

To avoid too many false positives, the warning is only issued under
TARGET_AVX512BF16.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_invalid_conversion): New function.
(TARGET_INVALID_CONVERSION): Define as
ix86_invalid_conversion.

gcc/testsuite/ChangeLog:

* gcc.target/i386/bf16_short_warn.c: New test.
---
 gcc/config/i386/i386.cc   | 32 +++
 .../gcc.target/i386/bf16_short_warn.c | 17 ++
 2 files changed, 49 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/bf16_short_warn.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0761965344b..dc02eac6203 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22718,6 +22718,35 @@ x86_emit_floatuns (rtx operands[2])
 
   emit_label (donelab);
 }
+
+/* Return the diagnostic message string if conversion from FROMTYPE to
+   TOTYPE is not allowed, NULL otherwise.
+   Currently it's used to warn for silent implicit conversion between __bf16
+   and short, since __bfloat16 is refined as real __bf16 instead of short
+   since GCC13.  */
+
+static const char *
+ix86_invalid_conversion (const_tree fromtype, const_tree totype)
+{
+  if (element_mode (fromtype) != element_mode (totype)
+  && (TARGET_AVX512BF16 || TARGET_AVXNECONVERT))
+{
+  /* Warn for silent implicit conversion where user may expect
+a bitcast.  */
+  if ((TYPE_MODE (fromtype) == BFmode
+  && TYPE_MODE (totype) == HImode)
+ || (TYPE_MODE (totype) == BFmode
+ && TYPE_MODE (fromtype) == HImode))
+   warning (0, "%<__bfloat16%> is redefined from typedef % "
+   "to real %<__bf16%> since GCC V13, be careful of "
+"implicit conversion between %<__bf16%> and %; "
+"a explicit bitcast may be needed here");
+}
+
+  /* Conversion allowed.  */
+  return NULL;
+}
+
 
 /* Target hook for scalar_mode_supported_p.  */
 static bool
@@ -25009,6 +25038,9 @@ ix86_run_selftests (void)
 #  define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
 #endif
 
+#undef TARGET_INVALID_CONVERSION
+#define TARGET_INVALID_CONVERSION ix86_invalid_conversion
+
 #undef TARGET_COMP_TYPE_ATTRIBUTES
 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
 
diff --git a/gcc/testsuite/gcc.target/i386/bf16_short_warn.c 
b/gcc/testsuite/gcc.target/i386/bf16_short_warn.c
new file mode 100644
index 000..3e47a815200
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bf16_short_warn.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include
+typedef struct {
+short payload;
+} BFloat16;
+
+__attribute__((target("avx512vl,avx512bf16")))
+BFloat16 tobf16_avx512(float f)
+{
+BFloat16 r;
+__m128bh m = _mm_cvtneps_pbh(_mm_set_ss(f));
+r.payload = m[0]; /* { dg-warning " be careful of implicit conversion 
between '__bf16' and 'short'" } */
+return r;
+}
+
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [x86] Refine maskstore patterns with UNSPEC_MASKMOV.

2023-06-26 Thread liuhongt via Gcc-patches

At the RTL level, we cannot guarantee that the maskstore is not optimized
to other full-memory accesses, as the current implementations are equivalent
in terms of pattern. To solve this potential problem, this patch refines
the pattern of the maskstore and the intrinsics with an unspec.

One thing I'm not sure about is VCOND_EXPR: should VCOND_EXPR also expect
fault suppression for masked-out elements?

Currently we're still using vec_merge for both AVX2 and AVX512 target.


Similar to r14-2070-gc79476da46728e.

If mem_addr points to a memory region with less than a whole vector's
worth of accessible bytes, and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent
the access from being transformed into any other whole-memory access instruction.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR rtl-optimization/110237
* config/i386/sse.md (_store_mask): Refine with
UNSPEC_MASKMOV.
(maskstore_store_mask): New define_insn, it's renamed
from original _store_mask.
---
 gcc/config/i386/sse.md | 69 ++
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3b50c7117f8..812cfca4b92 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1608,7 +1608,7 @@ (define_insn "_blendm"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn "_store_mask"
+(define_insn "*_store_mask"
   [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
(vec_merge:V48_AVX512VL
  (match_operand:V48_AVX512VL 1 "register_operand" "v")
@@ -1636,7 +1636,7 @@ (define_insn "_store_mask"
(set_attr "memory" "store")
(set_attr "mode" "")])
 
-(define_insn "_store_mask"
+(define_insn "*_store_mask"
   [(set (match_operand:VI12HFBF_AVX512VL 0 "memory_operand" "=m")
(vec_merge:VI12HFBF_AVX512VL
  (match_operand:VI12HFBF_AVX512VL 1 "register_operand" "v")
@@ -27008,21 +27008,66 @@ (define_expand "maskstore"
   "TARGET_AVX")
 
 (define_expand "maskstore"
-  [(set (match_operand:V48H_AVX512VL 0 "memory_operand")
-   (vec_merge:V48H_AVX512VL
- (match_operand:V48H_AVX512VL 1 "register_operand")
- (match_dup 0)
- (match_operand: 2 "register_operand")))]
+  [(set (match_operand:V48_AVX512VL 0 "memory_operand")
+   (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "register_operand")
+  (match_dup 0)
+  (match_operand: 2 "register_operand")]
+ UNSPEC_MASKMOV))]
   "TARGET_AVX512F")
 
 (define_expand "maskstore"
-  [(set (match_operand:VI12_AVX512VL 0 "memory_operand")
-   (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "register_operand")
- (match_dup 0)
- (match_operand: 2 "register_operand")))]
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "memory_operand")
+   (unspec:VI12HFBF_AVX512VL
+ [(match_operand:VI12HFBF_AVX512VL 1 "register_operand")
+  (match_dup 0)
+  (match_operand: 2 "register_operand")]
+ UNSPEC_MASKMOV))]
   "TARGET_AVX512BW")
 
+(define_insn "_store_mask"
+  [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
+   (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "register_operand" "v")
+  (match_dup 0)
+  (match_operand: 2 "register_operand" "Yk")]
+ UNSPEC_MASKMOV))]
+  "TARGET_AVX512F"
+{
+  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
+{
+  if (misaligned_operand (operands[0], mode))
+   return "vmovu\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+  else
+   return "vmova\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+}
+  else
+{
+  if (misaligned_operand (operands[0], mode))
+   return "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+  else
+   return "vmovdqa\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+}
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "")])
+
+(define_insn "_store_mask"
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "memory_operand" "=m")
+   (unspec:VI12HFBF_AVX512VL
+ [(match_operand:VI12HFBF_AVX512VL 1 "register_operand" "v")
+  (match_dup 0)
+  (match_operand: 2 "register_operand" "Yk")]
+  UNSPEC_MASKMOV))]
+  "TARGET_AVX512BW"
+  "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "")])
+
 (define_expand "cbranch4"
   [(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI48_AVX 1 "register_operand")
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 2/2] Make option mvzeroupper independent of optimization level.

2023-06-26 Thread liuhongt via Gcc-patches

pass_insert_vzeroupper is gated on the condition

TARGET_AVX && TARGET_VZEROUPPER
&& flag_expensive_optimizations && !optimize_size

But the documentation of mvzeroupper doesn't mention that the insertion
requires -O2 or above, which may confuse users when they explicitly
use -Os -mvzeroupper.


mvzeroupper
Target Mask(VZEROUPPER) Save
Generate vzeroupper instruction before a transfer of control flow out of
the function.


The patch moves flag_expensive_optimizations && !optimize_size to
ix86_option_override_internal. It makes -mvzeroupper independent of
optimization level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386-features.cc (pass_insert_vzeroupper:gate):
Move flag_expensive_optimizations && !optimize_size to ..
* config/i386/i386-options.cc (ix86_option_override_internal):
.. this, it makes -mvzeroupper independent of optimization
level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-vzeroupper-29.c: New testcase.
---
 gcc/config/i386/i386-features.cc  |  3 +--
 gcc/config/i386/i386-options.cc   |  4 +++-
 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++
 3 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 4a3b07ae045..92ae08d442e 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2489,8 +2489,7 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
 {
-  return TARGET_AVX && TARGET_VZEROUPPER
-   && flag_expensive_optimizations && !optimize_size;
+  return TARGET_AVX && TARGET_VZEROUPPER;
 }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 2cb0bddcd35..f76e7c5947b 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2727,7 +2727,9 @@ ix86_option_override_internal (bool main_args_p,
 sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH");
 
   if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
-  && TARGET_EMIT_VZEROUPPER)
+  && TARGET_EMIT_VZEROUPPER
+  && flag_expensive_optimizations
+  && !optimize_size)
 opts->x_target_flags |= MASK_VZEROUPPER;
   if (!(opts_set->x_target_flags & MASK_STV))
 opts->x_target_flags |= MASK_STV;
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c 
b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
new file mode 100644
index 000..4af637757f7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */
+
+#include 
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 1/2] Don't issue vzeroupper for vzeroupper call_insn.

2023-06-26 Thread liuhongt via Gcc-patches

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/82735
* config/i386/i386.cc (ix86_avx_u128_mode_needed): Don't emit
vzeroupper for vzeroupper call_insn.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-vzeroupper-30.c: New test.
---
 gcc/config/i386/i386.cc   |  5 +++--
 gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c | 15 +++
 2 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0761965344b..caca74d6dec 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14489,8 +14489,9 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
 modes wider than 256 bits.  It's only safe to issue a
 vzeroupper if all SSE registers are clobbered.  */
   const function_abi &abi = insn_callee_abi (insn);
-  if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
- abi.mode_clobbers (V4DImode)))
+  if (vzeroupper_pattern (PATTERN (insn), VOIDmode)
+ || !hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+abi.mode_clobbers (V4DImode)))
return AVX_U128_ANY;
 
   return AVX_U128_CLEAN;
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c 
b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c
new file mode 100644
index 000..c1c9baa8fc4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -mvzeroupper -dp" } */
+
+#include 
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Break false dependence for vpternlog by inserting vpxor.

2023-07-03 Thread liuhongt via Gcc-patches

vpternlog is also used for optimizations which don't need any valid
input operand. In that case, the destination is used as an input of the
instruction, and that creates a false dependence.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/110438
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog): New
define_insn.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2_pternlog): New
define_insn.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
---
 gcc/config/i386/predicates.md|  8 ++-
 gcc/config/i386/sse.md   | 69 +++-
 gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++
 3 files changed, 94 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index fb07707dcba..df0d9e20def 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 812cfca4b92..93cdd844026 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,28 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9358,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9345,12 +9367,35 @@ (define_insn "*_cvtmask2"
   "TARGET_AVX512F"
   "@
vpmovm2\t{%1, %0|%0, %1}
-   vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+   #"
+  "&& !TARGET_AVX512DQ && reload_completed"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (mode);"
   [(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "*_cvtmask2_pternlog"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+   (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+ (match_operand:VI48_AVX512VL 3 "const0_operand")
+ (match_operand: 1 "register_operand" "Yk")))
+   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_AVX512F && !TARGET_AVX512DQ"
+  "vpternlog\t{$0x81

[PATCH] Disparage slightly for the alternative which move DFmode between SSE_REGS and GENERAL_REGS.

2023-07-05 Thread liuhongt via Gcc-patches

For testcase

void __cond_swap(double* __x, double* __y) {
  bool __r = (*__x < *__y);
  auto __tmp = __r ? *__x : *__y;
  *__y = __r ? *__y : *__x;
  *__x = __tmp;
}

GCC-14 with -O2 and -march=x86-64 options generates the following code:

__cond_swap(double*, double*):
movsd   xmm1, QWORD PTR [rdi]
movsd   xmm0, QWORD PTR [rsi]
comisd  xmm0, xmm1
jbe .L2
movqrax, xmm1
movapd  xmm1, xmm0
movqxmm0, rax
.L2:
movsd   QWORD PTR [rsi], xmm1
movsd   QWORD PTR [rdi], xmm0
ret

rax is used to save and restore a DFmode value. In RA, both GENERAL_REGS
and SSE_REGS cost zero since we didn't disparage the
alternative in the movdf_internal pattern, so according to the register
allocation order, GENERAL_REGS is allocated. The patch adds ? for
alternatives (r,v) and (v,r), just like we did for the movsf/hf/bf_internal
patterns; after that we get optimal RA.

__cond_swap:
.LFB0:
.cfi_startproc
movsd   (%rdi), %xmm1
movsd   (%rsi), %xmm0
comisd  %xmm1, %xmm0
jbe .L2
movapd  %xmm1, %xmm2
movapd  %xmm0, %xmm1
movapd  %xmm2, %xmm0
.L2:
movsd   %xmm1, (%rsi)
movsd   %xmm0, (%rdi)
ret

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?


gcc/ChangeLog:

PR target/110170
* config/i386/i386.md (movdf_internal): Disparage slightly for
2 alternatives (r,v) and (v,r) by adding constraint modifier
'?'.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110170-3.c: New test.
---
 gcc/config/i386/i386.md|  4 ++--
 gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++
 2 files changed, 13 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index a82cc353cfd..e47ced1bb70 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3915,9 +3915,9 @@ (define_split
 ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
 (define_insn "*movdf_internal"
   [(set (match_operand:DF 0 "nonimmediate_operand"
-"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r  
,o ,r  ,m")
+"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r 
 ,o ,r  ,m")
(match_operand:DF 1 "general_operand"
-"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r 
,roF,rF,rmF,rC"))]
+"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, 
r,roF,rF,rmF,rC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& (lra_in_progress || reload_completed
|| !CONST_DOUBLE_P (operands[1])
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c 
b/gcc/testsuite/gcc.target/i386/pr110170-3.c
new file mode 100644
index 000..70daa89e9aa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
+/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
+
+void __cond_swap(double* __x, double* __y) {
+  _Bool __r = (*__x < *__y);
+  double __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 2/2] Adjust rtx_cost for DF/SFmode AND/IOR/XOR/ANDN operations.

2023-07-05 Thread liuhongt via Gcc-patches

They should have the same cost as vector modes, since both generate
pand/pandn/pxor/por instructions.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
DF/SFmode AND/IOR/XOR/ANDN operations.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110170-2.c: New test.
---
 gcc/config/i386/i386.cc|  6 --
 gcc/testsuite/gcc.target/i386/pr110170-2.c | 16 
 2 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56ee8dd..fe31acd7646 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21153,7 +21153,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 
 case IOR:
 case XOR:
-  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || SSE_FLOAT_MODE_P (mode))
*total = ix86_vec_cost (mode, cost->sse_op);
   else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
*total = cost->add * 2;
@@ -21167,7 +21168,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
  *total = cost->lea;
  return true;
}
-  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+  || SSE_FLOAT_MODE_P (mode))
{
  /* pandn is a single instruction.  */
  if (GET_CODE (XEXP (x, 0)) == NOT)
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-2.c 
b/gcc/testsuite/gcc.target/i386/pr110170-2.c
new file mode 100644
index 000..d43e322fc49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse2 -O2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-not "comi" } }  */
+
+double
+foo (double* a, double* b, double c, double d)
+{
+  return *a < *b ? c : d;
+}
+
+float
+foo1 (float* a, float* b, float c, float d)
+{
+  return *a < *b ? c : d;
+}
+
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 1/2] [x86] Add pre_reload splitter to detect fp min/max pattern.

2023-07-05 Thread liuhongt via Gcc-patches

We have ix86_expand_sse_fp_minmax to detect min/max semantics, but
it requires rtx_equal_p for cmp_op0/cmp_op1 and if_true/if_false, for
the testcase in the PR, there's an extra move from cmp_op0 to if_true,
and it failed ix86_expand_sse_fp_minmax.

This patch adds pre_reload splitter to detect the min/max pattern.

Operands order in MINSS matters for signed zero and NANs, since the
instruction always returns second operand when any operand is NAN or
both operands are zero.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110170
* config/i386/i386.md (*ieee_minmax3_1): New pre_reload
splitter to detect fp min/max pattern.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr110170.C: New test.
* gcc.target/i386/pr110170.c: New test.
---
 gcc/config/i386/i386.md  | 30 +
 gcc/testsuite/g++.target/i386/pr110170.C | 78 
 gcc/testsuite/gcc.target/i386/pr110170.c | 18 ++
 3 files changed, 126 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr110170.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e6ebc461e52..353bb21993d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -22483,6 +22483,36 @@ (define_insn "*ieee_s3"
(set_attr "type" "sseadd")
(set_attr "mode" "")])
 
+;; Operands order in min/max instruction matters for signed zero and NANs.
+(define_insn_and_split "*ieee_minmax3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+   (unspec:MODEF
+ [(match_operand:MODEF 1 "register_operand")
+  (match_operand:MODEF 2 "register_operand")
+  (lt:MODEF
+(match_operand:MODEF 3 "register_operand")
+(match_operand:MODEF 4 "register_operand"))]
+ UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
+  && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+  || (rtx_equal_p (operands[1], operands[4])
+ && rtx_equal_p (operands[2], operands[3])))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  int u = (rtx_equal_p (operands[1], operands[3])
+  && rtx_equal_p (operands[2], operands[4]))
+  ? UNSPEC_IEEE_MAX : UNSPEC_IEEE_MIN;
+  emit_move_insn (operands[0],
+ gen_rtx_UNSPEC (mode,
+ gen_rtvec (2, operands[2], operands[1]),
+ u));
+  DONE;
+})
+
 ;; Make two stack loads independent:
 ;;   fld aa  fld aa
 ;;   fld %st(0) ->   fld bb
diff --git a/gcc/testsuite/g++.target/i386/pr110170.C 
b/gcc/testsuite/g++.target/i386/pr110170.C
new file mode 100644
index 000..1e9a781ca74
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr110170.C
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+/* { dg-options " -O2 -march=x86-64 -mfpmath=sse -std=gnu++20" } */
+#include 
+
+void
+__attribute__((noinline))
+__cond_swap(double* __x, double* __y) {
+  bool __r = (*__x < *__y);
+  auto __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
+auto test1() {
+double nan = -0.0;
+double x = 0.0;
+__cond_swap(&nan, &x);
+return x == -0.0 && nan == 0.0;
+}
+
+auto test1r() {
+double nan = NAN;
+double x = 1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 0 && nan == 1.0;
+}
+
+auto test2() {
+double nan = NAN;
+double x = -1.0;
+__cond_swap(&nan, &x);
+return isnan(x) && signbit(x) == 0 && nan == -1.0;
+}
+
+auto test2r() {
+double nan = NAN;
+double x = -1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 0 && nan == -1.0;
+}
+
+auto test3() {
+double nan = -NAN;
+double x = 1.0;
+__cond_swap(&nan, &x);
+return isnan(x) && signbit(x) == 1 && nan == 1.0;
+}
+
+auto test3r() {
+double nan = -NAN;
+double x = 1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 1 && nan == 1.0;
+}
+
+auto test4() {
+double nan = -NAN;
+double x = -1.0;
+__cond_swap(&nan, &x);
+return isnan(x) && signbit(x) == 1 && nan == -1.0;
+}
+
+auto test4r() {
+double nan = -NAN;
+double x = -1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 1 && nan == -1.0;
+}
+
+
+int main() {
+if (
+!test1() || !test1r()
+|| !test2() || !test2r()
+|| !test3() || !test4r()
+|| !test4() || !test4r()
+) __builtin_abort();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr110170.c 
b/gcc/testsuite/gcc.target/i386/pr110170.c
new file mode 100644
index 000..0f98545cce3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options " -O2 -march=x86-64-v2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times {(?n)mins[sd]} 2 } } */
+/* { dg-final { scan-assembl

[PATCH V2] [x86] Add pre_reload splitter to detect fp min/max pattern.

2023-07-06 Thread liuhongt via Gcc-patches

> Please split the above pattern into two, one emitting UNSPEC_IEEE_MAX
> and the other emitting UNSPEC_IEEE_MIN.
Split.

> The test involves blendv instruction, which is SSE4.1, so it is
> pointless to test it without -msse4.1. Please add -msse4.1 instead of
> -march=x86_64 and use sse4_runtime target selector, as is the case
> with gcc.target/i386/pr90358.c.
Changed.

> Please also use -msse4.1 instead of -march here. With -mfpmath=sse,
> the test is valid also for 32bit targets, you should use -msseregparm
> additional options for ia32 (please see gcc.target/i386/pr43546.c
> testcase) in the same way as -mregparm to pass SSE arguments in
> registers.
The 32-bit target still fails to do condition elimination for DFmode due to
the code below in rtx_cost:

  /* A size N times larger than UNITS_PER_WORD likely needs N times as
 many insns, taking N times as long.  */
  factor = mode_size > UNITS_PER_WORD ? mode_size / UNITS_PER_WORD : 1;

It looks like a separate issue for DFmode operation under 32-bit target.

I've enabled 32-bit for the testcase, but only scan for minss/maxss
currently.

Here's updated patch.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

We have ix86_expand_sse_fp_minmax to detect min/max semantics, but
it requires rtx_equal_p for cmp_op0/cmp_op1 and if_true/if_false, for
the testcase in the PR, there's an extra move from cmp_op0 to if_true,
and it failed ix86_expand_sse_fp_minmax.

This patch adds pre_reload splitter to detect the min/max pattern.

Operands order in MINSS matters for signed zero and NANs, since the
instruction always returns second operand when any operand is NAN or
both operands are zero.

gcc/ChangeLog:

PR target/110170
* config/i386/i386.md (*ieee_max3_1): New pre_reload
splitter to detect fp max pattern.
(*ieee_min3_1): Ditto, but for fp min pattern.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr110170.C: New test.
* gcc.target/i386/pr110170.c: New test.
---
 gcc/config/i386/i386.md  | 43 +
 gcc/testsuite/g++.target/i386/pr110170.C | 78 
 gcc/testsuite/gcc.target/i386/pr110170.c | 21 +++
 3 files changed, 142 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr110170.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index a82cc353cfd..6f415f899ae 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -23163,6 +23163,49 @@ (define_insn "*ieee_s3"
(set_attr "type" "sseadd")
(set_attr "mode" "")])
 
+;; Operands order in min/max instruction matters for signed zero and NANs.
+(define_insn_and_split "*ieee_max3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+   (unspec:MODEF
+ [(match_operand:MODEF 1 "register_operand")
+  (match_operand:MODEF 2 "register_operand")
+  (lt:MODEF
+(match_operand:MODEF 3 "register_operand")
+(match_operand:MODEF 4 "register_operand"))]
+ UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
+  && (rtx_equal_p (operands[1], operands[3])
+  && rtx_equal_p (operands[2], operands[4]))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:MODEF
+ [(match_dup 2)
+  (match_dup 1)]
+UNSPEC_IEEE_MAX))])
+
+(define_insn_and_split "*ieee_min3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+   (unspec:MODEF
+ [(match_operand:MODEF 1 "register_operand")
+  (match_operand:MODEF 2 "register_operand")
+  (lt:MODEF
+(match_operand:MODEF 3 "register_operand")
+(match_operand:MODEF 4 "register_operand"))]
+ UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
+  && (rtx_equal_p (operands[1], operands[4])
+  && rtx_equal_p (operands[2], operands[3]))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:MODEF
+ [(match_dup 2)
+  (match_dup 1)]
+UNSPEC_IEEE_MIN))])
+
 ;; Make two stack loads independent:
 ;;   fld aa  fld aa
 ;;   fld %st(0) ->   fld bb
diff --git a/gcc/testsuite/g++.target/i386/pr110170.C 
b/gcc/testsuite/g++.target/i386/pr110170.C
new file mode 100644
index 000..5d6842270d0
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr110170.C
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+/* { dg-options " -O2 -msse4.1 -mfpmath=sse -std=gnu++20" } */
+#include 
+
+void
+__attribute__((noinline))
+__cond_swap(double* __x, double* __y) {
+  bool __r = (*__x < *__y);
+  auto __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
+auto test1() {
+double nan = -0.0;
+double x = 0.0;
+__cond_swap(&nan, &x);
+return x == -0.0 && nan == 0.0;
+}
+
+auto test1r() {
+double nan = NAN;
+double x = 1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 0 && nan == 1.0;
+}

[PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'

2023-07-09 Thread liuhongt via Gcc-patches

False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog_false_dep): New
define_insn.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2): Ditto.
(one_cmpl2): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot3): Ditto.
(iornot3): Ditto.
(*3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
---
 gcc/config/i386/predicates.md|   8 +-
 gcc/config/i386/sse.md   | 113 ---
 gcc/testsuite/gcc.target/i386/pr110438.c |  30 ++
 3 files changed, 135 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 418c337a775..56920a3e1d3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
+  && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog_false_dep"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2"
   "@
vpmovm2\t{%1, %0|%0, %1}
vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+  "&& !TARGET_AVX512DQ && reload_completed
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (mode);"
   [(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "*_cvtmask2_pternlog_false_dep"
+  [(set (match_operand:VI

[PATCH] Add peephole to eliminate redundant comparison after cmpccxadd.

2023-07-10 Thread liuhongt via Gcc-patches

Similar to what we did for cmpxchg, but extended to all
ix86_comparison_int_operator since cmpccxadd set EFLAGS exactly same
as CMP.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,},
Ok for trunk?

gcc/ChangeLog:

PR target/110591
* config/i386/sync.md (cmpccxadd_): Add a new
define_peephole2 after the pattern.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110591.c: New test.
---
 gcc/config/i386/sync.md  | 56 
 gcc/testsuite/gcc.target/i386/pr110591.c | 66 
 2 files changed, 122 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110591.c

diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index e1fa1504deb..43f6421bcb8 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -1105,3 +1105,59 @@ (define_insn "cmpccxadd_"
   output_asm_insn (buf, operands);
   return "";
 })
+
+(define_peephole2
+  [(set (match_operand:SWI48x 0 "register_operand")
+   (match_operand:SWI48x 1 "x86_64_general_operand"))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_operand:SWI48x 2 "memory_operand")
+ (match_dup 0)
+ (match_operand:SWI48x 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (clobber (reg:CC FLAGS_REG))])
+   (set (reg FLAGS_REG)
+   (compare (match_operand:SWI48x 5 "register_operand")
+(match_operand:SWI48x 6 "x86_64_general_operand")))
+   (set (match_operand:QI 7 "nonimmediate_operand")
+   (match_operator:QI 8 "ix86_comparison_int_operator"
+ [(reg FLAGS_REG) (const_int 0)]))]
+  "TARGET_CMPCCXADD && TARGET_64BIT
+   && ((rtx_equal_p (operands[0], operands[5])
+   && rtx_equal_p (operands[1], operands[6]))
+   || ((rtx_equal_p (operands[0], operands[6])
+   && rtx_equal_p (operands[1], operands[5]))
+  && peep2_regno_dead_p (4, FLAGS_REG)))"
+  [(set (match_dup 0)
+   (match_dup 1))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_dup 2)
+ (match_dup 0)
+ (match_dup 3)
+ (match_dup 4)]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 7)
+   (match_op_dup 8
+ [(match_dup 9) (const_int 0)]))]
+{
+  operands[9] = gen_rtx_REG (GET_MODE (XEXP (operands[8], 0)), FLAGS_REG);
+  if (rtx_equal_p (operands[0], operands[6])
+ && rtx_equal_p (operands[1], operands[5])
+ && swap_condition (GET_CODE (operands[8])) != GET_CODE (operands[8]))
+ {
+   operands[8] = shallow_copy_rtx (operands[8]);
+   enum rtx_code ccode = swap_condition (GET_CODE (operands[8]));
+   PUT_CODE (operands[8], ccode);
+   operands[9] = gen_rtx_REG (SELECT_CC_MODE (ccode,
+ operands[6],
+ operands[5]),
+  FLAGS_REG);
+ }
+})
diff --git a/gcc/testsuite/gcc.target/i386/pr110591.c 
b/gcc/testsuite/gcc.target/i386/pr110591.c
new file mode 100644
index 000..32a515b429e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110591.c
@@ -0,0 +1,66 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mcmpccxadd -O2" } */
+/* { dg-final { scan-assembler-not {cmp[lq]?[ \t]+} } } */
+/* { dg-final { scan-assembler-times {cmpoxadd[ \t]+} 12 } } */
+
+#include 
+
+_Bool foo_setg (int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) > v;
+}
+
+_Bool foo_setl (int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) < v;
+}
+
+_Bool foo_sete(int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) == v;
+}
+
+_Bool foo_setne(int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) != v;
+}
+
+_Bool foo_setge(int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) >= v;
+}
+
+_Bool foo_setle(int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) <= v;
+}
+
+_Bool fooq_setg (long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) > v;
+}
+
+_Bool fooq_setl (long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) < v;
+}
+
+_Bool fooq_sete(long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) == v;
+}
+
+_Bool fooq_setne(long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) != v;
+}
+
+_Bool fooq_setge(long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) >= v;
+}
+
+_Bool

[PATCH v2] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'

2023-07-10 Thread liuhongt via Gcc-patches

Here's updated patch.
1. use optimize_insn_for_speed_p instead of using optimize_function_for_speed_p.
2. explicitly move memory to dest register to avoid false dependence in 
one_cmpl pattern.


False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.

gcc/ChangeLog:

PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog_false_dep): New
define_insn.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2): Ditto.
(one_cmpl2): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot3): Ditto.
(iornot3): Ditto.
(*3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
* gcc.target/i386/pr100711.c: Adjust testcase.
---
 gcc/config/i386/predicates.md  |   8 +-
 gcc/config/i386/sse.md | 145 ++---
 gcc/testsuite/gcc.target/i386/pr100711-6.c |   2 +-
 gcc/testsuite/gcc.target/i386/pr110438.c   |  30 +
 4 files changed, 168 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 418c337a775..05485b1792d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
+  && optimize_insn_for_speed_p ()"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog_false_dep"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2"
   "@
vpmovm2\t{%1, %0|%0, %1}
vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+  "&& !TARGET_AVX512DQ && reload_completed
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX

[PATCH] Add peephole to eliminate redundant comparison after cmpccxadd.

2023-07-11 Thread liuhongt via Gcc-patches

Similar to what we did for CMPXCHG, but extended to all
ix86_comparison_int_operator since CMPCCXADD set EFLAGS exactly same
as CMP.

When operand order in CMP insn is same as that in CMPCCXADD,
CMP insn can be eliminated directly.

When operand order is swapped in CMP insn, only optimize
cmpccxadd + cmpl + jcc/setcc to cmpccxadd + jcc/setcc when FLAGS_REG is dead
after jcc/setcc plus adjusting code for jcc/setcc.

gcc/ChangeLog:

PR target/110591
* config/i386/sync.md (cmpccxadd_): Adjust the pattern
to explicitly set FLAGS_REG like *cmp_1, also add extra
3 define_peephole2 after the pattern.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110591.c: New test.
* gcc.target/i386/pr110591-2.c: New test.
---
 gcc/config/i386/sync.md| 160 -
 gcc/testsuite/gcc.target/i386/pr110591-2.c |  90 
 gcc/testsuite/gcc.target/i386/pr110591.c   |  66 +
 3 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110591-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110591.c

diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index e1fa1504deb..e84226cf895 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -1093,7 +1093,9 @@ (define_insn "cmpccxadd_"
  UNSPECV_CMPCCXADD))
(set (match_dup 1)
(unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
-   (clobber (reg:CC FLAGS_REG))]
+   (set (reg:CC FLAGS_REG)
+   (compare:CC (match_dup 1)
+   (match_dup 2)))]
   "TARGET_CMPCCXADD && TARGET_64BIT"
 {
   char buf[128];
@@ -1105,3 +1107,159 @@ (define_insn "cmpccxadd_"
   output_asm_insn (buf, operands);
   return "";
 })
+
+(define_peephole2
+  [(set (match_operand:SWI48x 0 "register_operand")
+   (match_operand:SWI48x 1 "x86_64_general_operand"))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_operand:SWI48x 2 "memory_operand")
+ (match_dup 0)
+ (match_operand:SWI48x 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (set (reg:CC FLAGS_REG)
+  (compare:CC (match_dup 2)
+  (match_dup 0)))])
+   (set (reg FLAGS_REG)
+   (compare (match_operand:SWI48x 5 "register_operand")
+(match_operand:SWI48x 6 "x86_64_general_operand")))]
+  "TARGET_CMPCCXADD && TARGET_64BIT
+   && rtx_equal_p (operands[0], operands[5])
+   && rtx_equal_p (operands[1], operands[6])"
+  [(set (match_dup 0)
+   (match_dup 1))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_dup 2)
+ (match_dup 0)
+ (match_dup 3)
+ (match_dup 4)]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (set (reg:CC FLAGS_REG)
+  (compare:CC (match_dup 2)
+  (match_dup 0)))])
+   (set (match_dup 7)
+   (match_op_dup 8
+ [(match_dup 9) (const_int 0)]))])
+
+(define_peephole2
+  [(set (match_operand:SWI48x 0 "register_operand")
+   (match_operand:SWI48x 1 "x86_64_general_operand"))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_operand:SWI48x 2 "memory_operand")
+ (match_dup 0)
+ (match_operand:SWI48x 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (set (reg:CC FLAGS_REG)
+  (compare:CC (match_dup 2)
+  (match_dup 0)))])
+   (set (reg FLAGS_REG)
+   (compare (match_operand:SWI48x 5 "register_operand")
+(match_operand:SWI48x 6 "x86_64_general_operand")))
+   (set (match_operand:QI 7 "nonimmediate_operand")
+   (match_operator:QI 8 "ix86_comparison_int_operator"
+ [(reg FLAGS_REG) (const_int 0)]))]
+  "TARGET_CMPCCXADD && TARGET_64BIT
+   && rtx_equal_p (operands[0], operands[6])
+   && rtx_equal_p (operands[1], operands[5])
+   && peep2_regno_dead_p (4, FLAGS_REG)"
+  [(set (match_dup 0)
+   (match_dup 1))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_dup 2)
+ (match_dup 0)
+ (match_dup 3)
+ (match_dup 4)]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:S

[PATCH] Fix typo in the testcase.

2023-07-11 Thread liuhongt via Gcc-patches

Antony Polukhin 2023-07-11 09:51:58 UTC
There's a typo at 
https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/testsuite/g%2B%2B.target/i386/pr110170.C;h=e638b12a5ee2264ecef77acca86432a9f24b103b;hb=d41a57c46df6f8f7dae0c0a8b349e734806a837b#l87

It should be `|| !test3() || !test3r()` rather than `|| !test3() || !test4r()`

Committed as an obvious fix.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr110170.C: Fix typo.
---
 gcc/testsuite/g++.target/i386/pr110170.C | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/g++.target/i386/pr110170.C 
b/gcc/testsuite/g++.target/i386/pr110170.C
index e638b12a5ee..21cca8f3805 100644
--- a/gcc/testsuite/g++.target/i386/pr110170.C
+++ b/gcc/testsuite/g++.target/i386/pr110170.C
@@ -84,7 +84,7 @@ TEST()
   if (
   !test1() || !test1r()
   || !test2() || !test2r()
-  || !test3() || !test4r()
+  || !test3() || !test3r()
   || !test4() || !test4r()
   ) __builtin_abort();
 }
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR.

2023-05-10 Thread liuhongt via Gcc-patches

> The quoted patch shows -shared in context and  you didn't post a
> backport version
> to look at.  But yes, we shouldn't change -shared behavior on a
> branch, even less so make it
> inconsistent between targets.
Here's the patch.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for GCC 11/12 backport?

if (mdaz-ftz)
  link crtfastmath.o
else if ((Ofast || ffast-math || funsafe-math-optimizations)
 && !mno-daz-ftz)
  link crtfastmath.o
else
  Don't link crtfastmath.o

gcc/ChangeLog:

* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
whenever -mdaz-ftz is specified. Don't link crtfastmath.o
when -mno-daz-ftz is specified.
* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
* config/i386/gnu-user-common.h
(GNU_USER_TARGET_MATHFILE_SPEC): Ditto.
* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
* config/i386/i386.opt (mdaz-ftz): New option.
* doc/invoke.texi (x86 options): Document mdaz-ftz.
---
 gcc/config/i386/cygwin.h  |  2 +-
 gcc/config/i386/darwin.h  |  4 ++--
 gcc/config/i386/gnu-user-common.h |  2 +-
 gcc/config/i386/i386.opt  |  4 
 gcc/config/i386/mingw32.h |  2 +-
 gcc/doc/invoke.texi   | 11 ++-
 6 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
index d06eda369cf..5412c5d4479 100644
--- a/gcc/config/i386/cygwin.h
+++ b/gcc/config/i386/cygwin.h
@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}}
 \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
 fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h
index a55f6b2b874..2f773924d6e 100644
--- a/gcc/config/i386/darwin.h
+++ b/gcc/config/i386/darwin.h
@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3.  If not see
 "%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} "
 
 #undef ENDFILE_SPEC
-#define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+#define ENDFILE_SPEC
+\  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}}
 \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
diff --git a/gcc/config/i386/gnu-user-common.h 
b/gcc/config/i386/gnu-user-common.h
index 23b54c5be52..3d2a33f1714 100644
--- a/gcc/config/i386/gnu-user-common.h
+++ b/gcc/config/i386/gnu-user-common.h
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3.  If not see
 
 /* Similar to standard GNU userspace, but adding -ffast-math support.  */
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}}
 \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index a3675e515bc..5cfb7cdcbc2 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -420,6 +420,10 @@ mpc80
 Target RejectNegative
 Set 80387 floating-point precision to 80-bit.
 
+mdaz-ftz
+Target
+Set the FTZ and DAZ Flags.
+
 mpreferred-stack-boundary=
 Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
 Attempt to keep stack aligned to this power of 2.
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
index d3ca0cd0279..ddbe6a4054b 100644
--- a/gcc/config/i386/mingw32.h
+++ b/gcc/config/i386/mingw32.h
@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}}
 \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
 fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index cb83dd8a1cc..87eedfffa6c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1434,7 +1434,7 @@ See RS/6000 and PowerPC Options.
 -m96bit-long-double  -mlong-double-64  -mlong-double-80  -mlong-double-128 @gol
 -mregparm=@var{num}  -msseregparm @gol
 -mveclibabi=@var{type}  -mvect8-ret-in-mem @gol
--mpc32  -mpc64  -mpc80  -mstackrealign @gol
+-mpc32  -mpc64  -mpc80 -mdaz-ftz -mstackrealign @gol
 -momit-leaf-frame-pointer  -mno-red-zone  -mno-tls-direct-seg-refs @gol
 -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol
 -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol
@@ -32078,6 +3

[PATCH] Provide -fcf-protection=branch,return.

2023-05-11 Thread liuhongt via Gcc-patches

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/89701
* common.opt: Refactor -fcf-protection= to support combinations
of parameters.
* lto-wrapper.cc (merge_and_complain): Adjusted.
* opts.cc (parse_cf_protection_options): New.
(common_handle_option): Decode argument for -fcf-protection=.
* opts.h (parse_cf_protection_options): Declare.

gcc/testsuite/ChangeLog:

PR target/89701
* c-c++-common/fcf-protection-8.c: New test.
* c-c++-common/fcf-protection-9.c: New test.
* c-c++-common/fcf-protection-10.c: New test.
* gcc.target/i386/pr89701-1.c: New test.
* gcc.target/i386/pr89701-2.c: New test.
* gcc.target/i386/pr89701-3.c: New test.
* gcc.target/i386/pr89701-4.c: New test.
---
 gcc/common.opt| 24 ++
 gcc/lto-wrapper.cc| 21 +++--
 gcc/opts.cc   | 79 +++
 gcc/opts.h|  1 +
 .../c-c++-common/fcf-protection-10.c  |  3 +
 .../c-c++-common/fcf-protection-11.c  |  2 +
 .../c-c++-common/fcf-protection-12.c  |  2 +
 gcc/testsuite/c-c++-common/fcf-protection-8.c |  3 +
 gcc/testsuite/c-c++-common/fcf-protection-9.c |  3 +
 gcc/testsuite/gcc.target/i386/pr89701-1.c |  4 +
 gcc/testsuite/gcc.target/i386/pr89701-2.c |  4 +
 gcc/testsuite/gcc.target/i386/pr89701-3.c |  5 ++
 gcc/testsuite/gcc.target/i386/pr89701-4.c |  5 ++
 13 files changed, 130 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-10.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-11.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-12.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-8.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-4.c

diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..ac12da52733 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -229,6 +229,10 @@ bool dump_base_name_prefixed = false
 Variable
 unsigned int flag_zero_call_used_regs
 
+;; What the CF check should instrument
+Variable
+unsigned int flag_cf_protection = 0
+
 ###
 Driver
 
@@ -1886,28 +1890,10 @@ fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
 fcf-protection=
-Common Joined RejectNegative Enum(cf_protection_level) Var(flag_cf_protection) 
Init(CF_NONE)
+Common Joined
 -fcf-protection=[full|branch|return|none|check]Instrument functions 
with checks to verify jump/call/return control-flow transfer
 instructions have valid targets.
 
-Enum
-Name(cf_protection_level) Type(enum cf_protection_level) UnknownError(unknown 
Control-Flow Protection Level %qs)
-
-EnumValue
-Enum(cf_protection_level) String(full) Value(CF_FULL)
-
-EnumValue
-Enum(cf_protection_level) String(branch) Value(CF_BRANCH)
-
-EnumValue
-Enum(cf_protection_level) String(return) Value(CF_RETURN)
-
-EnumValue
-Enum(cf_protection_level) String(check) Value(CF_CHECK)
-
-EnumValue
-Enum(cf_protection_level) String(none) Value(CF_NONE)
-
 finstrument-functions
 Common Var(flag_instrument_function_entry_exit,1)
 Instrument function entry and exit with profiling calls.
diff --git a/gcc/lto-wrapper.cc b/gcc/lto-wrapper.cc
index 5186d040ce0..568c8af659d 100644
--- a/gcc/lto-wrapper.cc
+++ b/gcc/lto-wrapper.cc
@@ -359,26 +359,33 @@ merge_and_complain (vec 
&decoded_options,
case OPT_fcf_protection_:
  /* Default to link-time option, else append or check identical.  */
  if (!cf_protection_option
- || cf_protection_option->value == CF_CHECK)
+ || !memcmp (cf_protection_option->arg, "check", 5))
{
+ const char* parg = decoded_options[existing_opt].arg;
  if (existing_opt == -1)
decoded_options.safe_push (*foption);
- else if (decoded_options[existing_opt].value != foption->value)
+ else if ((strlen (parg) != strlen (foption->arg))
+  || memcmp (parg, foption->arg, strlen (foption->arg)))
{
  if (cf_protection_option
- && cf_protection_option->value == CF_CHECK)
+ && !memcmp (cf_protection_option->arg, "check", 5))
fatal_error (input_location,
 "option %qs with mismatching values"
 " (%s, %s)",
 "-fcf-protection",
-decoded_options[existing_opt].arg,
+parg,
 foption->arg);

[PATCH V2] Provide -fcf-protection=branch,return.

2023-05-13 Thread liuhongt via Gcc-patches

> I think this could be simplified if you use either EnumSet or
> EnumBitSet instead in common.opt for `-fcf-protection=`.

Use EnumSet instead of EnumBitSet, since CF_FULL is not a power of 2.
Classifying the values into sets is a bit tricky: cf_branch and cf_return
should be in different sets, but they both conflict with cf_full and
cf_none, and the current EnumSet doesn't handle this well.

So in the current implementation, only cf_full and cf_none are mutually
exclusive, but each of them can still be combined with any of cf_branch,
cf_return and cf_check. It's not perfect, but it is still an improvement
over the original.

gcc/ChangeLog:

* common.opt (fcf-protection=): Add EnumSet attribute to
support combination of params.

gcc/testsuite/ChangeLog:

* c-c++-common/fcf-protection-10.c: New test.
* c-c++-common/fcf-protection-11.c: New test.
* c-c++-common/fcf-protection-12.c: New test.
* c-c++-common/fcf-protection-8.c: New test.
* c-c++-common/fcf-protection-9.c: New test.
* gcc.target/i386/pr89701-1.c: New test.
* gcc.target/i386/pr89701-2.c: New test.
* gcc.target/i386/pr89701-3.c: New test.
---
 gcc/common.opt | 12 ++--
 gcc/testsuite/c-c++-common/fcf-protection-10.c |  2 ++
 gcc/testsuite/c-c++-common/fcf-protection-11.c |  2 ++
 gcc/testsuite/c-c++-common/fcf-protection-12.c |  2 ++
 gcc/testsuite/c-c++-common/fcf-protection-8.c  |  2 ++
 gcc/testsuite/c-c++-common/fcf-protection-9.c  |  2 ++
 gcc/testsuite/gcc.target/i386/pr89701-1.c  |  4 
 gcc/testsuite/gcc.target/i386/pr89701-2.c  |  4 
 gcc/testsuite/gcc.target/i386/pr89701-3.c  |  4 
 9 files changed, 28 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-10.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-11.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-12.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-8.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-3.c

diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..02f2472959a 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1886,7 +1886,7 @@ fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
 fcf-protection=
-Common Joined RejectNegative Enum(cf_protection_level) Var(flag_cf_protection) 
Init(CF_NONE)
+Common Joined RejectNegative Enum(cf_protection_level) EnumSet 
Var(flag_cf_protection) Init(CF_NONE)
 -fcf-protection=[full|branch|return|none|check]Instrument functions 
with checks to verify jump/call/return control-flow transfer
 instructions have valid targets.
 
@@ -1894,19 +1894,19 @@ Enum
 Name(cf_protection_level) Type(enum cf_protection_level) UnknownError(unknown 
Control-Flow Protection Level %qs)
 
 EnumValue
-Enum(cf_protection_level) String(full) Value(CF_FULL)
+Enum(cf_protection_level) String(full) Value(CF_FULL) Set(1)
 
 EnumValue
-Enum(cf_protection_level) String(branch) Value(CF_BRANCH)
+Enum(cf_protection_level) String(branch) Value(CF_BRANCH) Set(2)
 
 EnumValue
-Enum(cf_protection_level) String(return) Value(CF_RETURN)
+Enum(cf_protection_level) String(return) Value(CF_RETURN) Set(3)
 
 EnumValue
-Enum(cf_protection_level) String(check) Value(CF_CHECK)
+Enum(cf_protection_level) String(check) Value(CF_CHECK) Set(4)
 
 EnumValue
-Enum(cf_protection_level) String(none) Value(CF_NONE)
+Enum(cf_protection_level) String(none) Value(CF_NONE) Set(1)
 
 finstrument-functions
 Common Var(flag_instrument_function_entry_exit,1)
diff --git a/gcc/testsuite/c-c++-common/fcf-protection-10.c 
b/gcc/testsuite/c-c++-common/fcf-protection-10.c
new file mode 100644
index 000..b271d134e52
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/fcf-protection-10.c
@@ -0,0 +1,2 @@
+/* { dg-do compile { target { "i?86-*-* x86_64-*-*" } } } */
+/* { dg-options "-fcf-protection=branch,check" } */
diff --git a/gcc/testsuite/c-c++-common/fcf-protection-11.c 
b/gcc/testsuite/c-c++-common/fcf-protection-11.c
new file mode 100644
index 000..2e566350ccd
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/fcf-protection-11.c
@@ -0,0 +1,2 @@
+/* { dg-do compile { target { "i?86-*-* x86_64-*-*" } } } */
+/* { dg-options "-fcf-protection=branch,return" } */
diff --git a/gcc/testsuite/c-c++-common/fcf-protection-12.c 
b/gcc/testsuite/c-c++-common/fcf-protection-12.c
new file mode 100644
index 000..b39c2f8e25d
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/fcf-protection-12.c
@@ -0,0 +1,2 @@
+/* { dg-do compile { target { "i?86-*-* x86_64-*-*" } } } */
+/* { dg-options "-fcf-protection=return,branch" } */
diff --git a/gcc/testsuite/c-c++-common/fcf-protection-8.c 
b/gcc/testsuite/c-c++-common/fcf-protection-8.c
new file mode 100644
index 000..3b97095a92c
--- /dev/null

[PATCH] Only use NO_REGS in cost calculation when !hard_regno_mode_ok for GENERAL_REGS and mode.

2023-05-16 Thread liuhongt via Gcc-patches

r14-172-g0368d169492017 replaces GENERAL_REGS with NO_REGS in cost
calculation when the preferred register classes are not known yet.
It regressed powerpc PR109610 and PR109858; it looks too aggressive to use
NO_REGS when the mode can be allocated with GENERAL_REGS.
The patch takes a step back: it still uses GENERAL_REGS when
hard_regno_mode_ok for mode and GENERAL_REGS, and otherwise uses NO_REGS.
Kewen confirmed the patch fixed PR109858; I verified it also fixed PR109610.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big performance impact for SPEC2017 on icelake server.
Ok for trunk?

gcc/ChangeLog:

* ira-costs.cc (scan_one_insn): Only use NO_REGS in cost
calculation when !hard_regno_mode_ok for GENERAL_REGS and
mode, otherwise still use GENERAL_REGS.
---
 gcc/ira-costs.cc | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index d2a801ab9b0..ae8304ff938 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1572,12 +1572,16 @@ scan_one_insn (rtx_insn *insn)
   && (! ira_use_lra_p || ! pic_offset_table_rtx
  || ! contains_symbol_ref_p (XEXP (note, 0
 {
-  /* Costs for NO_REGS are used in cost calculation on the
-1st pass when the preferred register classes are not
-known yet.  In this case we take the best scenario.  */
-  enum reg_class cl = NO_REGS;
+  enum reg_class cl = GENERAL_REGS;
   rtx reg = SET_DEST (set);
   int num = COST_INDEX (REGNO (reg));
+  /* Costs for NO_REGS are used in cost calculation on the
+1st pass when the preferred register classes are not
+known yet.  In this case we take the best scenario when
+mode can't be put into GENERAL_REGS.  */
+  if (!targetm.hard_regno_mode_ok (ira_class_hard_regs[cl][0],
+  GET_MODE (reg)))
+   cl = NO_REGS;
 
   COSTS (costs, num)->mem_cost
-= ira_memory_move_cost[GET_MODE (reg)][cl][1] * frequency;
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Fold _mm{, 256, 512}_abs_{epi8, epi16, epi32, epi64} into gimple ABS_EXPR.

2023-05-22 Thread liuhongt via Gcc-patches

Also for 64-bit vector abs intrinsics _mm_abs_{pi8,pi16,pi32}.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/109900
* config/i386/i386.cc (ix86_gimple_fold_builtin): Fold
_mm{,256,512}_abs_{epi8,epi16,epi32,epi64} and
_mm_abs_{pi8,pi16,pi32} into gimple ABS_EXPR.
(ix86_masked_all_ones): Handle 64-bit mask.
* config/i386/i386-builtin.def: Replace icode of related
non-mask simd abs builtins with CODE_FOR_nothing.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr109900.c: New test.
---
 gcc/config/i386/i386-builtin.def | 18 ++---
 gcc/config/i386/i386.cc  | 86 +++--
 gcc/testsuite/gcc.target/i386/pr109900.c | 95 
 3 files changed, 166 insertions(+), 33 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109900.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index f7b10a6ab1e..c91e3809c75 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -899,12 +899,12 @@ BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv4sf3, 
"__builtin_ia32_hsubps"
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv2df3, 
"__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) 
V2DF_FTYPE_V2DF_V2DF)
 
 /* SSSE3 */
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv16qi2, 
"__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) 
V16QI_FTYPE_V16QI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, 
(int) V8QI_FTYPE_V8QI)
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", 
IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, 
(int) V4HI_FTYPE_V4HI)
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", 
IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, 
(int) V2SI_FTYPE_V2SI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb128", 
IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw128", 
IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd128", 
IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI)
 
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_phaddwv8hi3, 
"__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) 
V8HI_FTYPE_V8HI_V8HI)
 BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 
UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI)
@@ -1178,9 +1178,9 @@ BDESC (OPTION_MASK_ISA_AVX, 0, 
CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_
 
 /* AVX2 */
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_mpsadbw, 
"__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI_INT)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", 
IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", 
IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", 
IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb256", 
IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw256", 
IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd256", 
IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packssdw, 
"__builtin_ia32_packssdw256",  IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) 
V16HI_FTYPE_V8SI_V8SI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packsswb, 
"__builtin_ia32_packsswb256",  IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) 
V32QI_FTYPE_V16HI_V16HI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packusdw, 
"__builtin_ia32_packusdw256",  IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) 
V16HI_FTYPE_V8SI_V8SI)

[PATCH] [x86] Split notl + pbroadcast + pand to pbroadcast + pandn for more modes.

2023-05-25 Thread liuhongt via Gcc-patches

r12-5595-gc39d77f252e895306ef88c1efb3eff04e4232554 adds 2 splitters to
transform notl + pbroadcast + pand to pbroadcast + pandn for
VI124_AVX2, which leaves out all DI-element-size ones as
well as all 512-bit ones.
This patch extends the splitters to VI_AVX2, which will handle DImode for
AVX2, and V64QImode, V32HImode, V16SImode, V8DImode for AVX512.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/100711
* config/i386/sse.md (*andnot3): Extend below splitter
to VI_AVX2 to cover more modes.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr100711-2.c: Add v4di/v2di testcases.
* gcc.target/i386/pr100711-3.c: New test.
---
 gcc/config/i386/sse.md | 12 +++
 gcc/testsuite/gcc.target/i386/pr100711-2.c | 14 +++-
 gcc/testsuite/gcc.target/i386/pr100711-3.c | 40 ++
 3 files changed, 59 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100711-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 26dd0b1aa10..97f883d8083 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17116,17 +17116,17 @@ (define_split
 
 ;; PR target/100711: Split notl; vpbroadcastd; vpand as vpbroadcastd; vpandn
 (define_split
-  [(set (match_operand:VI124_AVX2 0 "register_operand")
-   (and:VI124_AVX2
- (vec_duplicate:VI124_AVX2
+  [(set (match_operand:VI_AVX2 0 "register_operand")
+   (and:VI_AVX2
+ (vec_duplicate:VI_AVX2
(not:
  (match_operand: 1 "register_operand")))
- (match_operand:VI124_AVX2 2 "vector_operand")))]
+ (match_operand:VI_AVX2 2 "vector_operand")))]
   "TARGET_AVX2"
   [(set (match_dup 3)
-   (vec_duplicate:VI124_AVX2 (match_dup 1)))
+   (vec_duplicate:VI_AVX2 (match_dup 1)))
(set (match_dup 0)
-   (and:VI124_AVX2 (not:VI124_AVX2 (match_dup 3))
+   (and:VI_AVX2 (not:VI_AVX2 (match_dup 3))
(match_dup 2)))]
   "operands[3] = gen_reg_rtx (mode);")
 
diff --git a/gcc/testsuite/gcc.target/i386/pr100711-2.c 
b/gcc/testsuite/gcc.target/i386/pr100711-2.c
index ccaf1688e19..f75914fb7fc 100644
--- a/gcc/testsuite/gcc.target/i386/pr100711-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr100711-2.c
@@ -4,10 +4,12 @@
 typedef char v16qi __attribute__ ((vector_size (16)));
 typedef short v8hi __attribute__ ((vector_size (16)));
 typedef int v4si __attribute__ ((vector_size (16)));
+typedef long long v2di __attribute__((vector_size (16)));
 
 typedef char v32qi __attribute__ ((vector_size (32)));
 typedef short v16hi __attribute__ ((vector_size (32)));
 typedef int v8si __attribute__ ((vector_size (32)));
+typedef long long v4di __attribute__((vector_size (32)));
 
 v16qi foo_v16qi (char a, v16qi b)
 {
@@ -25,6 +27,11 @@ v4si foo_v4si (int a, v4si b)
 return (__extension__ (v4si) {~a, ~a, ~a, ~a}) & b;
 }
 
+v2di foo_v2di (long long a, v2di b)
+{
+return (__extension__ (v2di) {~a, ~a}) & b;
+}
+
 v32qi foo_v32qi (char a, v32qi b)
 {
 return (__extension__ (v32qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
@@ -44,4 +51,9 @@ v8si foo_v8si (int a, v8si b)
 return (__extension__ (v8si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,}) & b;
 }
 
-/* { dg-final { scan-assembler-times "vpandn" 6 } } */
+v4di foo_v4di (long long a, v4di b)
+{
+return (__extension__ (v4di) {~a, ~a, ~a, ~a}) & b;
+}
+
+/* { dg-final { scan-assembler-times "vpandn" 8 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100711-3.c 
b/gcc/testsuite/gcc.target/i386/pr100711-3.c
new file mode 100644
index 000..e90f2a48d8d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100711-3.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+
+typedef char v64qi __attribute__ ((vector_size (64)));
+typedef short v32hi __attribute__ ((vector_size (64)));
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__((vector_size (64)));
+
+v64qi foo_v64qi (char a, v64qi b)
+{
+return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) & b;
+}
+
+v32hi foo_v32hi (short a, v32hi b)
+{
+return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) & b;
+}
+
+v16si foo_v16si (int a, v16si b)
+{
+return (__extension__ (v16si) {~a, ~a, ~a, ~a

[PATCH] Disable avoid_false_dep_for_bmi for atom and icelake(and later) core processors.

2023-05-25 Thread liuhongt via Gcc-patches

lzcnt/tzcnt has been fixed since skylake, popcnt has been fixed since
icelake. At least for icelake and later intel Core processors, the
errata tune is not needed. And the tune isn't needed for ATOM either.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.


gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI):
Remove ATOM and ICELAKE (and later) core processors.
---
 gcc/config/i386/x86-tune.def | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 9d603cc84e4..e1c72cddf1f 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -335,7 +335,8 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
 /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
for bit-manipulation instructions.  */
 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
- m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_CORE_ATOM
+ m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512
+ | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
  | m_LUJIAZUI | m_GENERIC)
 
 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Support cond_add/sub/mul/div for vector float/double.

2021-08-01 Thread liuhongt via Gcc-patches

Hi:
  This patch supports cond_add/sub/mul/div expanders for vector float/double.
  cond_fma/fms/fnms/fnma/max/min/xor/ior/and are still left, since I failed
to come up with testcases to validate them, as are cond_add/sub/mul for
vector integer.
  
  Bootstrapped and regression tested on x86_64-linux-gnu{-m32,}.
  Will push to trunk if there are no objections.

gcc/ChangeLog:

* config/i386/sse.md (cond_<insn><mode>): New expander.
(cond_mul<mode>): Ditto.
(cond_div<mode>): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_addsubmuldiv_double-1.c: New test.
* gcc.target/i386/cond_op_addsubmuldiv_double-2.c: New test.
* gcc.target/i386/cond_op_addsubmuldiv_float-1.c: New test.
* gcc.target/i386/cond_op_addsubmuldiv_float-2.c: New test.
---
 gcc/config/i386/sse.md| 54 
 .../i386/cond_op_addsubmuldiv_double-1.c  | 31 +++
 .../i386/cond_op_addsubmuldiv_double-2.c  | 85 +++
 .../i386/cond_op_addsubmuldiv_float-1.c   |  9 ++
 .../i386/cond_op_addsubmuldiv_float-2.c   |  4 +
 5 files changed, 183 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-1.c
 create mode 100644 
gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b5a08988590..8bf1764d3d5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1891,6 +1891,24 @@ (define_insn_and_split "*nabs2"
 }
   [(set_attr "isa" "noavx,noavx,avx,avx")])
 
+(define_expand "cond_"
+  [(set (match_operand:VF 0 "register_operand")
+   (vec_merge:VF
+ (plusminus:VF
+   (match_operand:VF 2 "vector_operand")
+   (match_operand:VF 3 "vector_operand"))
+ (match_operand:VF 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  " == 64 || TARGET_AVX512VL"
+{
+  emit_insn (gen_3_mask (operands[0],
+operands[2],
+operands[3],
+operands[4],
+operands[1]));
+  DONE;
+})
+
 (define_expand "3"
   [(set (match_operand:VF 0 "register_operand")
(plusminus:VF
@@ -1953,6 +1971,24 @@ (define_insn 
"_vm3"
(set_attr "prefix" "")
(set_attr "mode" "")])
 
+(define_expand "cond_mul"
+  [(set (match_operand:VF 0 "register_operand")
+   (vec_merge:VF
+ (mult:VF
+   (match_operand:VF 2 "vector_operand")
+   (match_operand:VF 3 "vector_operand"))
+ (match_operand:VF 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  " == 64 || TARGET_AVX512VL"
+{
+  emit_insn (gen_mul3_mask (operands[0],
+operands[2],
+operands[3],
+operands[4],
+operands[1]));
+  DONE;
+})
+
 (define_expand "mul3"
   [(set (match_operand:VF 0 "register_operand")
(mult:VF
@@ -2041,6 +2077,24 @@ (define_expand "div3"
 }
 })
 
+(define_expand "cond_div"
+  [(set (match_operand:VF 0 "register_operand")
+   (vec_merge:VF
+ (div:VF
+   (match_operand:VF 2 "register_operand")
+   (match_operand:VF 3 "vector_operand"))
+ (match_operand:VF 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  " == 64 || TARGET_AVX512VL"
+{
+  emit_insn (gen__div3_mask (operands[0],
+   operands[2],
+   operands[3],
+   operands[4],
+   operands[1]));
+  DONE;
+})
+
 (define_insn "_div3"
   [(set (match_operand:VF 0 "register_operand" "=x,v")
(div:VF
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-1.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-1.c
new file mode 100644
index 000..1092cba9876
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-vect" } */
+/* { dg-final { scan-tree-dump ".COND_ADD" "vect" } } */
+/* { dg-final { scan-tree-dump ".COND_SUB" "vect" } } */
+/* { dg-final { scan-tree-dump ".COND_MUL" "vect" } } */
+/* { dg-final { scan-tree-dump ".COND_RDIV" "vect" } } */
+
+#ifndef NUM
+#define NUM 800
+#endif
+#ifndef TYPE
+#define TYPE double
+#endif
+
+TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM];
+
+#define BIN(OPNAME, OP)\
+  void \
+  __attribute__ ((noipa,optimize ("O3")))  \
+  foo_##OPNAME ()  \
+  {

[PATCH 2/6] [i386] Enable _Float16 type for TARGET_SSE2 and above.

2021-08-01 Thread liuhongt via Gcc-patches

gcc/ChangeLog:

* config/i386/i386-modes.def (FLOAT_MODE): Define ieee HFmode.
* config/i386/i386.c (enum x86_64_reg_class): Add
X86_64_SSEHF_CLASS.
(merge_classes): Handle X86_64_SSEHF_CLASS.
(examine_argument): Ditto.
(construct_container): Ditto.
(classify_argument): Ditto, and set HFmode/HCmode to
X86_64_SSEHF_CLASS.
(function_value_32): Return _Float16/Complex Float16 by
%xmm0.
(function_value_64): Return _Float16/Complex Float16 by SSE
register.
(ix86_print_operand): Handle CONST_DOUBLE HFmode.
(ix86_secondary_reload): Require gpr as intermediate register
to store _Float16 from sse register when sse4 is not
available.
(ix86_libgcc_floating_mode_supported_p): Enable _Float16 under
sse2.
(ix86_scalar_mode_supported_p): Ditto.
(TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P): Defined.
* config/i386/i386.h (VALID_SSE2_REG_MODE): Add HFmode.
(VALID_INT_MODE_P): Add HFmode and HCmode.
* config/i386/i386.md (*pushhf_rex64): New define_insn.
(*pushhf): Ditto.
(*movhf_internal): Ditto.
* doc/extend.texi (Half-Precision Floating Point): Document
_Float16 for x86.
* emit-rtl.c (validate_subreg): Allow (subreg:SI (reg:HF) 0)
which is used by extract_bit_field but not backends.

gcc/lto/ChangeLog:

* lto-lang.c (lto_type_for_mode): Return float16_type_node
when mode == TYPE_MODE (float16_type_node).

gcc/testsuite/ChangeLog:

* gcc.target/i386/sse2-float16-1.c: New test.
* gcc.target/i386/sse2-float16-2.c: Ditto.
* gcc.target/i386/sse2-float16-3.c: Ditto.
* gcc.target/i386/float16-5.c: Ditto.
---
 gcc/config/i386/i386-modes.def|   1 +
 gcc/config/i386/i386.c|  91 +-
 gcc/config/i386/i386.h|   3 +-
 gcc/config/i386/i386.md   | 118 +-
 gcc/doc/extend.texi   |  13 ++
 gcc/emit-rtl.c|   5 +
 gcc/lto/lto-lang.c|   3 +
 gcc/testsuite/gcc.target/i386/float16-5.c |  12 ++
 .../gcc.target/i386/sse2-float16-1.c  |   8 ++
 .../gcc.target/i386/sse2-float16-2.c  |  16 +++
 .../gcc.target/i386/sse2-float16-3.c  |  12 ++
 11 files changed, 274 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/float16-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-3.c

diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index 4e7014be034..9232f59a925 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see
 
 FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format);
 FLOAT_MODE (TF, 16, ieee_quad_format);
+FLOAT_MODE (HF, 2, ieee_half_format);
 
 /* In ILP32 mode, XFmode has size 12 and alignment 4.
In LP64 mode, XFmode has size and alignment 16.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ff96134fb37..7979e240426 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -387,6 +387,7 @@ enum x86_64_reg_class
 X86_64_INTEGER_CLASS,
 X86_64_INTEGERSI_CLASS,
 X86_64_SSE_CLASS,
+X86_64_SSEHF_CLASS,
 X86_64_SSESF_CLASS,
 X86_64_SSEDF_CLASS,
 X86_64_SSEUP_CLASS,
@@ -2023,8 +2024,10 @@ merge_classes (enum x86_64_reg_class class1, enum 
x86_64_reg_class class2)
 return X86_64_MEMORY_CLASS;
 
   /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
-  if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
-  || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
+  if ((class1 == X86_64_INTEGERSI_CLASS
+   && (class2 == X86_64_SSESF_CLASS || class2 == X86_64_SSEHF_CLASS))
+  || (class2 == X86_64_INTEGERSI_CLASS
+ && (class1 == X86_64_SSESF_CLASS || class1 == X86_64_SSEHF_CLASS)))
 return X86_64_INTEGERSI_CLASS;
   if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
   || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
@@ -2178,6 +2181,8 @@ classify_argument (machine_mode mode, const_tree type,
/* The partial classes are now full classes.  */
if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
  subclasses[0] = X86_64_SSE_CLASS;
+   if (subclasses[0] == X86_64_SSEHF_CLASS && bytes != 2)
+ subclasses[0] = X86_64_SSE_CLASS;
if (subclasses[0] == X86_64_INTEGERSI_CLASS
&& !((bit_offset % 64) == 0 && bytes == 4))
  subclasses[0] = X86_64_INTEGER_CLASS;
@@ -2350,6 +2355,12 @@ classify

[PATCH V3 0/6] Initial support for AVX512FP16

2021-08-01 Thread liuhongt via Gcc-patches

Update from v2:

1. Support -fexcess-precision=16 which will enable
FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when backend supports _Float16. 
2. Update ix86_get_excess_precision, so -fexcess-precision=standard
should not do anything different from -fexcess-precision=fast
 regarding _Float16.
3. Avoiding macroization of HFmode patterns.
4. Allow (subreg:SI (reg:HF)).
5. Update documents corresponding exactly to the code changes in
the same patch.
6. According to 32bit abi, pass vector _Float16 by sse registers
for 32-bit mode, not stack.

Guo, Xuepeng (1):
  AVX512FP16: Initial support for AVX512FP16 feature and scalar _Float16
instructions.

liuhongt (5):
  Update hf soft-fp from glibc.
  [i386] Enable _Float16 type for TARGET_SSE2 and above.
  [i386] libgcc: Enable hfmode soft-sf/df/xf/tf extensions and
truncations.
  Support -fexcess-precision=16 which will enable
FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when backend supports _Float16.
  AVX512FP16: Support vector init/broadcast/set/extract for FP16.

 gcc/ada/gcc-interface/misc.c  |   3 +
 gcc/c-family/c-common.c   |   6 +-
 gcc/c-family/c-cppbuiltin.c   |   6 +-
 gcc/common.opt|   5 +-
 gcc/common/config/i386/cpuinfo.h  |   2 +
 gcc/common/config/i386/i386-common.c  |  26 +-
 gcc/common/config/i386/i386-cpuinfo.h |   1 +
 gcc/common/config/i386/i386-isas.h|   1 +
 gcc/config.gcc|   2 +-
 gcc/config/aarch64/aarch64.c  |   1 +
 gcc/config/arm/arm.c  |   1 +
 gcc/config/i386/avx512fp16intrin.h| 225 ++
 gcc/config/i386/cpuid.h   |   1 +
 gcc/config/i386/i386-builtin-types.def|   7 +-
 gcc/config/i386/i386-builtins.c   |  23 +
 gcc/config/i386/i386-c.c  |   2 +
 gcc/config/i386/i386-expand.c | 129 +-
 gcc/config/i386/i386-isa.def  |   1 +
 gcc/config/i386/i386-modes.def|  13 +-
 gcc/config/i386/i386-options.c|   4 +-
 gcc/config/i386/i386.c| 243 +--
 gcc/config/i386/i386.h|  29 +-
 gcc/config/i386/i386.md   | 291 -
 gcc/config/i386/i386.opt  |   4 +
 gcc/config/i386/immintrin.h   |   4 +
 gcc/config/i386/sse.md| 397 +-
 gcc/config/m68k/m68k.c|   2 +
 gcc/config/s390/s390.c|   2 +
 gcc/coretypes.h   |   3 +-
 gcc/doc/extend.texi   |  22 +
 gcc/doc/invoke.texi   |  10 +-
 gcc/doc/tm.texi   |  14 +-
 gcc/doc/tm.texi.in|   3 +
 gcc/emit-rtl.c|   5 +
 gcc/flag-types.h  |   3 +-
 gcc/fortran/options.c |   3 +
 gcc/lto/lto-lang.c|   3 +
 gcc/target.def|  11 +-
 gcc/testsuite/g++.dg/other/i386-2.C   |   2 +-
 gcc/testsuite/g++.dg/other/i386-3.C   |   2 +-
 gcc/testsuite/g++.target/i386/float16-1.C |   8 +
 gcc/testsuite/g++.target/i386/float16-2.C |  14 +
 gcc/testsuite/g++.target/i386/float16-3.C |  10 +
 gcc/testsuite/gcc.target/i386/avx-1.c |   2 +-
 gcc/testsuite/gcc.target/i386/avx-2.c |   2 +-
 gcc/testsuite/gcc.target/i386/avx512-check.h  |   3 +
 .../gcc.target/i386/avx512fp16-12a.c  |  21 +
 .../gcc.target/i386/avx512fp16-12b.c  |  27 ++
 gcc/testsuite/gcc.target/i386/float16-3a.c|  10 +
 gcc/testsuite/gcc.target/i386/float16-3b.c|  10 +
 gcc/testsuite/gcc.target/i386/float16-4a.c|  10 +
 gcc/testsuite/gcc.target/i386/float16-4b.c|  10 +
 gcc/testsuite/gcc.target/i386/float16-5.c |  12 +
 gcc/testsuite/gcc.target/i386/float16-6.c |   8 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 gcc/testsuite/gcc.target/i386/pr54855-12.c|  14 +
 gcc/testsuite/gcc.target/i386/sse-13.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-14.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-22.c|   4 +-
 gcc/testsuite/gcc.target/i386/sse-23.c|   2 +-
 .../gcc.target/i386/sse2-float16-1.c  |   8 +
 .../gcc.target/i386/sse2-float16-2.c  |  16 +
 .../gcc.target/i386/sse2-float16-3.c  |  12 +
 gcc/testsuite/lib/target-supports.exp |  13 +-
 gcc/tree.c|   3 +-
 libgcc/config.host|   5 +-
 libgcc/config/i386/32/sfp-machine.h   |   1 +
 libgcc/config/i386/32/t-softfp|   1 +
 libgcc/config/i386/64/sfp-machine.h   |   1 +
 libgcc/config/i386/64/t-softfp|   1 +
 libgcc/config/i386/sfp-machine.h  |

[PATCH 4/6] Support -fexcess-precision=16 which will enable FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when backend supports _Float16.

2021-08-01 Thread liuhongt via Gcc-patches

gcc/ada/ChangeLog:

* gcc-interface/misc.c (gnat_post_options): Issue an error for
-fexcess-precision=16.

gcc/c-family/ChangeLog:

* c-common.c (excess_precision_mode_join): Update below comments.
(c_ts18661_flt_eval_method): Set excess_precision_type to
EXCESS_PRECISION_TYPE_FLOAT16 when -fexcess-precision=16.
* c-cppbuiltin.c (cpp_atomic_builtins): Update below comments.
(c_cpp_flt_eval_method_iec_559): Set excess_precision_type to
EXCESS_PRECISION_TYPE_FLOAT16 when -fexcess-precision=16.

gcc/ChangeLog:

* common.opt: Support -fexcess-precision=16.
* config/aarch64/aarch64.c (aarch64_excess_precision): Return
FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when
EXCESS_PRECISION_TYPE_FLOAT16.
* config/arm/arm.c (arm_excess_precision): Ditto.
* config/i386/i386.c (ix86_get_excess_precision): Ditto.
* config/m68k/m68k.c (m68k_excess_precision): Issue an error
when EXCESS_PRECISION_TYPE_FLOAT16.
* config/s390/s390.c (s390_excess_precision): Ditto.
* coretypes.h (enum excess_precision_type): Add
EXCESS_PRECISION_TYPE_FLOAT16.
* doc/tm.texi (TARGET_C_EXCESS_PRECISION): Update documents.
* doc/tm.texi.in (TARGET_C_EXCESS_PRECISION): Ditto.
* doc/extend.texi (Half-Precision): Document
-fexcess-precision=16.
* flag-types.h (enum excess_precision): Add
EXCESS_PRECISION_FLOAT16.
* target.def (excess_precision): Update document.
* tree.c (excess_precision_type): Set excess_precision_type to
EXCESS_PRECISION_FLOAT16 when -fexcess-precision=16.

gcc/fortran/ChangeLog:

* options.c (gfc_post_options): Issue an error for
-fexcess-precision=16.

gcc/testsuite/ChangeLog:

* gcc.target/i386/float16-6.c: New test.
---
 gcc/ada/gcc-interface/misc.c  |  3 +++
 gcc/c-family/c-common.c   |  6 --
 gcc/c-family/c-cppbuiltin.c   |  6 --
 gcc/common.opt|  5 -
 gcc/config/aarch64/aarch64.c  |  1 +
 gcc/config/arm/arm.c  |  1 +
 gcc/config/i386/i386.c|  2 ++
 gcc/config/m68k/m68k.c|  2 ++
 gcc/config/s390/s390.c|  2 ++
 gcc/coretypes.h   |  3 ++-
 gcc/doc/extend.texi   |  3 ++-
 gcc/doc/tm.texi   | 14 ++
 gcc/doc/tm.texi.in|  3 +++
 gcc/flag-types.h  |  3 ++-
 gcc/fortran/options.c |  3 +++
 gcc/target.def| 11 +++
 gcc/testsuite/gcc.target/i386/float16-6.c |  8 
 gcc/tree.c|  3 ++-
 18 files changed, 62 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/float16-6.c

diff --git a/gcc/ada/gcc-interface/misc.c b/gcc/ada/gcc-interface/misc.c
index 186367ac6d1..96199bd4b63 100644
--- a/gcc/ada/gcc-interface/misc.c
+++ b/gcc/ada/gcc-interface/misc.c
@@ -256,6 +256,9 @@ gnat_post_options (const char **pfilename ATTRIBUTE_UNUSED)
   /* Excess precision other than "fast" requires front-end support.  */
   if (flag_excess_precision == EXCESS_PRECISION_STANDARD)
 sorry ("%<-fexcess-precision=standard%> for Ada");
+  else if (flag_excess_precision == EXCESS_PRECISION_FLOAT16)
+sorry ("%<-fexcess-precision=16%> for Ada");
+
   flag_excess_precision = EXCESS_PRECISION_FAST;
 
   /* No psABI change warnings for Ada.  */
diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
index aacdfb46a02..7e72062c77c 100644
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -8772,7 +8772,7 @@ excess_precision_mode_join (enum flt_eval_method x,
 
This relates to the effective excess precision seen by the user,
which is the join point of the precision the target requests for
-   -fexcess-precision={standard,fast} and the implicit excess precision
+   -fexcess-precision={standard,fast,16} and the implicit excess precision
the target uses.  */
 
 static enum flt_eval_method
@@ -8784,7 +8784,9 @@ c_ts18661_flt_eval_method (void)
   enum excess_precision_type flag_type
 = (flag_excess_precision == EXCESS_PRECISION_STANDARD
? EXCESS_PRECISION_TYPE_STANDARD
-   : EXCESS_PRECISION_TYPE_FAST);
+   : (flag_excess_precision == EXCESS_PRECISION_FLOAT16
+ ? EXCESS_PRECISION_TYPE_FLOAT16
+ : EXCESS_PRECISION_TYPE_FAST));
 
   enum flt_eval_method requested
 = targetm.c.excess_precision (flag_type);
diff --git a/gcc/c-family/c-cppbuiltin.c b/gcc/c-family/c-cppbuiltin.c
index f79f939bd10..5f30354a33c 100644
--- a/gcc/c-family/c-cppbuiltin.c
+++ b/gcc/c-family/c-cppbuiltin.c
@@ -753,7 +753,7 @@ cpp_atomic_builtins (cpp_reader *pfile)
 /* Return TRUE if the implicit excess precision in which the back-end will
compute floating-point calculations is

[PATCH 1/6] Update hf soft-fp from glibc.

2021-08-01 Thread liuhongt via Gcc-patches

libgcc/ChangeLog

* soft-fp/eqhf2.c: New file.
* soft-fp/extendhfdf2.c: New file.
* soft-fp/extendhfsf2.c: New file.
* soft-fp/extendhfxf2.c: New file.
* soft-fp/half.h (FP_CMP_EQ_H): New macro.
* soft-fp/truncdfhf2.c: New file
* soft-fp/truncsfhf2.c: New file
* soft-fp/truncxfhf2.c: New file
---
 libgcc/soft-fp/eqhf2.c   | 49 +
 libgcc/soft-fp/extendhfdf2.c | 53 
 libgcc/soft-fp/extendhfsf2.c | 49 +
 libgcc/soft-fp/half.h|  1 +
 libgcc/soft-fp/truncdfhf2.c  | 52 +++
 libgcc/soft-fp/truncsfhf2.c  | 48 
 6 files changed, 252 insertions(+)
 create mode 100644 libgcc/soft-fp/eqhf2.c
 create mode 100644 libgcc/soft-fp/extendhfdf2.c
 create mode 100644 libgcc/soft-fp/extendhfsf2.c
 create mode 100644 libgcc/soft-fp/truncdfhf2.c
 create mode 100644 libgcc/soft-fp/truncsfhf2.c

diff --git a/libgcc/soft-fp/eqhf2.c b/libgcc/soft-fp/eqhf2.c
new file mode 100644
index 000..6d6634e5c54
--- /dev/null
+++ b/libgcc/soft-fp/eqhf2.c
@@ -0,0 +1,49 @@
+/* Software floating-point emulation.
+   Return 0 iff a == b, 1 otherwise
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "soft-fp.h"
+#include "half.h"
+
+CMPtype
+__eqhf2 (HFtype a, HFtype b)
+{
+  FP_DECL_EX;
+  FP_DECL_H (A);
+  FP_DECL_H (B);
+  CMPtype r;
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_H (A, a);
+  FP_UNPACK_RAW_H (B, b);
+  FP_CMP_EQ_H (r, A, B, 1);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
+
+strong_alias (__eqhf2, __nehf2);
diff --git a/libgcc/soft-fp/extendhfdf2.c b/libgcc/soft-fp/extendhfdf2.c
new file mode 100644
index 000..337ba791d48
--- /dev/null
+++ b/libgcc/soft-fp/extendhfdf2.c
@@ -0,0 +1,53 @@
+/* Software floating-point emulation.
+   Return an IEEE half converted to IEEE double
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define FP_NO_EXACT_UNDERFLOW
+#include "soft-fp.h"
+#include "half.h"
+#include "double.h"
+
+DFtype
+__extendhfdf2 (HFtype a)
+{
+  FP_DECL_EX;
+  FP_DECL_H (A);
+  FP_DECL_D (R);
+  DFtype r;
+
+  FP_INIT_EXCEPTIONS;
+  FP_UNPACK_RAW_H (A, a);
+#if _FP_W_TYPE_SIZE < _FP_FRACBITS_D
+  FP_EXTEND (D, H, 2, 1, R, A);
+#else
+  FP_EXTEND (D, H, 1, 1, R, A);
+#endif
+  FP_PACK_RAW_D (r, R);
+  FP_HANDLE_EXCEPTIONS;
+
+  return r;
+}
diff --git a/

[PATCH 3/6] [i386] libgcc: Enable hfmode soft-sf/df/xf/tf extensions and truncations.

2021-08-01 Thread liuhongt via Gcc-patches

libgcc/ChangeLog:

* config/i386/32/sfp-machine.h (_FP_NANFRAC_H): New macro.
* config/i386/64/sfp-machine.h (_FP_NANFRAC_H): Ditto.
* config/i386/sfp-machine.h (_FP_NANSIGN_H): Ditto.
* config/i386/t-softfp: Add hf soft-fp.
* config.host: Add i386/64/t-softfp.
* config/i386/64/t-softfp: New file.
---
 libgcc/config.host  | 5 +
 libgcc/config/i386/32/sfp-machine.h | 1 +
 libgcc/config/i386/32/t-softfp  | 1 +
 libgcc/config/i386/64/sfp-machine.h | 1 +
 libgcc/config/i386/64/t-softfp  | 1 +
 libgcc/config/i386/sfp-machine.h| 1 +
 libgcc/config/i386/t-softfp | 5 +
 7 files changed, 11 insertions(+), 4 deletions(-)
 create mode 100644 libgcc/config/i386/64/t-softfp

diff --git a/libgcc/config.host b/libgcc/config.host
index 50f00062232..96da9ef1cce 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -1540,10 +1540,7 @@ i[34567]86-*-elfiamcu | i[34567]86-*-rtems*)
;;
 i[34567]86-*-* | x86_64-*-*)
tmake_file="${tmake_file} t-softfp-tf"
-   if test "${host_address}" = 32; then
-   tmake_file="${tmake_file} i386/${host_address}/t-softfp"
-   fi
-   tmake_file="${tmake_file} i386/t-softfp t-softfp"
+   tmake_file="${tmake_file} i386/${host_address}/t-softfp i386/t-softfp 
t-softfp"
;;
 esac
 
diff --git a/libgcc/config/i386/32/sfp-machine.h 
b/libgcc/config/i386/32/sfp-machine.h
index 1fa282d7afe..e24cbc8d180 100644
--- a/libgcc/config/i386/32/sfp-machine.h
+++ b/libgcc/config/i386/32/sfp-machine.h
@@ -86,6 +86,7 @@
 #define _FP_DIV_MEAT_D(R,X,Y)   _FP_DIV_MEAT_2_udiv(D,R,X,Y)
 #define _FP_DIV_MEAT_Q(R,X,Y)   _FP_DIV_MEAT_4_udiv(Q,R,X,Y)
 
+#define _FP_NANFRAC_H  _FP_QNANBIT_H
 #define _FP_NANFRAC_S  _FP_QNANBIT_S
 #define _FP_NANFRAC_D  _FP_QNANBIT_D, 0
 /* Even if XFmode is 12byte,  we have to pad it to
diff --git a/libgcc/config/i386/32/t-softfp b/libgcc/config/i386/32/t-softfp
index a48a5b3b116..86478cf5f20 100644
--- a/libgcc/config/i386/32/t-softfp
+++ b/libgcc/config/i386/32/t-softfp
@@ -3,3 +3,4 @@ softfp_int_modes := si di
 
 # Provide fallbacks for __builtin_copysignq and __builtin_fabsq.
 LIB2ADD += $(srcdir)/config/i386/32/tf-signs.c
+
diff --git a/libgcc/config/i386/64/sfp-machine.h 
b/libgcc/config/i386/64/sfp-machine.h
index 1ff94c23ea4..e1c616699bb 100644
--- a/libgcc/config/i386/64/sfp-machine.h
+++ b/libgcc/config/i386/64/sfp-machine.h
@@ -13,6 +13,7 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
 
 #define _FP_DIV_MEAT_Q(R,X,Y)   _FP_DIV_MEAT_2_udiv(Q,R,X,Y)
 
+#define _FP_NANFRAC_H  _FP_QNANBIT_H
 #define _FP_NANFRAC_S  _FP_QNANBIT_S
 #define _FP_NANFRAC_D  _FP_QNANBIT_D
 #define _FP_NANFRAC_E  _FP_QNANBIT_E, 0
diff --git a/libgcc/config/i386/64/t-softfp b/libgcc/config/i386/64/t-softfp
new file mode 100644
index 000..f9d8b3a945c
--- /dev/null
+++ b/libgcc/config/i386/64/t-softfp
@@ -0,0 +1 @@
+softfp_extras := fixhfti fixunshfti floattihf floatuntihf
diff --git a/libgcc/config/i386/sfp-machine.h b/libgcc/config/i386/sfp-machine.h
index 8319f0550bc..f15d29d3755 100644
--- a/libgcc/config/i386/sfp-machine.h
+++ b/libgcc/config/i386/sfp-machine.h
@@ -17,6 +17,7 @@ typedef int __gcc_CMPtype __attribute__ ((mode 
(__libgcc_cmp_return__)));
 #define _FP_KEEPNANFRACP   1
 #define _FP_QNANNEGATEDP 0
 
+#define _FP_NANSIGN_H  1
 #define _FP_NANSIGN_S  1
 #define _FP_NANSIGN_D  1
 #define _FP_NANSIGN_E  1
diff --git a/libgcc/config/i386/t-softfp b/libgcc/config/i386/t-softfp
index 685d9cf8502..4ac214eb0ce 100644
--- a/libgcc/config/i386/t-softfp
+++ b/libgcc/config/i386/t-softfp
@@ -1 +1,6 @@
 LIB2ADD += $(srcdir)/config/i386/sfp-exceptions.c
+
+softfp_extensions := hfsf hfdf hftf hfxf sfdf sftf dftf xftf
+softfp_truncations := tfhf xfhf dfhf sfhf tfsf dfsf tfdf tfxf
+
+softfp_extras += eqhf2
\ No newline at end of file
-- 
2.27.0

[PATCH 6/6] AVX512FP16: Support vector init/broadcast/set/extract for FP16.

2021-08-01 Thread liuhongt via Gcc-patches

gcc/ChangeLog:

* config/i386/avx512fp16intrin.h (_mm_set_ph): New intrinsic.
(_mm256_set_ph): Likewise.
(_mm512_set_ph): Likewise.
(_mm_setr_ph): Likewise.
(_mm256_setr_ph): Likewise.
(_mm512_setr_ph): Likewise.
(_mm_set1_ph): Likewise.
(_mm256_set1_ph): Likewise.
(_mm512_set1_ph): Likewise.
(_mm_setzero_ph): Likewise.
(_mm256_setzero_ph): Likewise.
(_mm512_setzero_ph): Likewise.
(_mm_set_sh): Likewise.
(_mm_load_sh): Likewise.
(_mm_store_sh): Likewise.
* config/i386/i386-builtin-types.def (V8HF): New type.
(DEF_FUNCTION_TYPE (V8HF, V8HI)): New builtin function type
* config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
Support vector HFmodes.
(ix86_expand_vector_init_one_nonzero): Likewise.
(ix86_expand_vector_init_one_var): Likewise.
(ix86_expand_vector_init_interleave): Likewise.
(ix86_expand_vector_init_general): Likewise.
(ix86_expand_vector_set): Likewise.
(ix86_expand_vector_extract): Likewise.
(ix86_expand_vector_init_concat): Likewise.
(ix86_expand_sse_movcc): Handle vector HFmodes.
(ix86_expand_vector_set_var): Ditto.
* config/i386/i386-modes.def: Add HF vector modes in comment.
* config/i386/i386.c (classify_argument): Add HF vector modes.
(ix86_hard_regno_mode_ok): Allow HF vector modes for AVX512FP16.
(ix86_vector_mode_supported_p): Likewise.
(ix86_set_reg_reg_cost): Handle vector HFmode.
(ix86_get_ssemov): Handle vector HFmode.
(function_arg_advance_64): Pass unnamed V16HFmode and V32HFmode
by stack.
(function_arg_32): Pass V8HF/V16HF/V32HF by sse reg for 32bit
mode.
(function_arg_advance_32): Ditto.
* config/i386/i386.h (VALID_AVX512FP16_REG_MODE): New.
(VALID_AVX256_REG_OR_OI_MODE): Rename to ..
(VALID_AVX256_REG_OR_OI_VHF_MODE): .. this, and add V16HF.
(VALID_SSE2_REG_VHF_MODE): New.
(VALID_AVX512VL_128_REG_MODE): Add V8HF and TImode.
(SSE_REG_MODE_P): Add vector HFmode.
* config/i386/i386.md (mode): Add HF vector modes.
(MODE_SIZE): Likewise.
(ssemodesuffix): Add ph suffix for HF vector modes.
* config/i386/sse.md (VFH_128): New mode iterator.
(VMOVE): Adjust for HF vector modes.
(V): Likewise.
(V_256_512): Likewise.
(avx512): Likewise.
(avx512fmaskmode): Likewise.
(shuffletype): Likewise.
(sseinsnmode): Likewise.
(ssedoublevecmode): Likewise.
(ssehalfvecmode): Likewise.
(ssehalfvecmodelower): Likewise.
(ssePScmode): Likewise.
(ssescalarmode): Likewise.
(ssescalarmodelower): Likewise.
(sseintprefix): Likewise.
(i128): Likewise.
(bcstscalarsuff): Likewise.
(xtg_mode): Likewise.
(VI12HF_AVX512VL): New mode_iterator.
(VF_AVX512FP16): Likewise.
(VIHF): Likewise.
(VIHF_256): Likewise.
(VIHF_AVX512BW): Likewise.
(V16_256): Likewise.
(V32_512): Likewise.
(sseintmodesuffix): New mode_attr.
(sse): Add scalar and vector HFmodes.
(ssescalarmode): Add vector HFmode mapping.
(ssescalarmodesuffix): Add sh suffix for HFmode.
(*_vm3): Use VFH_128.
(*_vm3): Likewise.
(*ieee_3): Likewise.
(_blendm): New define_insn.
(vec_setv8hf): New define_expand.
(vec_set_0): New define_insn for HF vector set.
(*avx512fp16_movsh): Likewise.
(avx512fp16_movsh): Likewise.
(vec_extract_lo_v32hi): Rename to ...
(vec_extract_lo_): ... this, and adjust to allow HF
vector modes.
(vec_extract_hi_v32hi): Likewise.
(vec_extract_hi_): Likewise.
(vec_extract_lo_v16hi): Likewise.
(vec_extract_lo_): Likewise.
(vec_extract_hi_v16hi): Likewise.
(vec_extract_hi_): Likewise.
(vec_set_hi_v16hi): Likewise.
(vec_set_hi_): Likewise.
(vec_set_lo_v16hi): Likewise.
(vec_set_lo_): Likewise.
(*vec_extract_0): New define_insn_and_split for HF
vector extract.
(*vec_extracthf): New define_insn.
(VEC_EXTRACT_MODE): Add HF vector modes.
(PINSR_MODE): Add V8HF.
(sse2p4_1): Likewise.
(pinsr_evex_isa): Likewise.
(_pinsr): Adjust to support
insert for V8HFmode.
(pbroadcast_evex_isa): Add HF vector modes.
(AVX2_VEC_DUP_MODE): Likewise.
(VEC_INIT_MODE): Likewise.
(VEC_INIT_HALF_MODE): Likewise.
(avx2_pbroadcast): Adjust to support HF vector mode
broadcast.
(avx2_pbroadcast_1): Likewise.
(_vec_dup_1): Likewise.
(_vec_dup): Likewise.
(_vec_dup_gpr):
Likewise.
---
 gcc/config/i386/avx512fp16intrin.h | 172 +++
 gcc

[PATCH 5/6] AVX512FP16: Initial support for AVX512FP16 feature and scalar _Float16 instructions.

2021-08-01 Thread liuhongt via Gcc-patches

From: "Guo, Xuepeng" 

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Detect FEATURE_AVX512FP16.
* common/config/i386/i386-common.c
(OPTION_MASK_ISA_AVX512FP16_SET,
OPTION_MASK_ISA_AVX512FP16_UNSET,
OPTION_MASK_ISA2_AVX512FP16_SET,
OPTION_MASK_ISA2_AVX512FP16_UNSET): New.
(OPTION_MASK_ISA2_AVX512BW_UNSET,
OPTION_MASK_ISA2_AVX512BF16_UNSET): Add AVX512FP16.
(ix86_handle_option): Handle -mavx512fp16.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_AVX512FP16.
* common/config/i386/i386-isas.h: Add entry for AVX512FP16.
* config.gcc: Add avx512fp16intrin.h.
* config/i386/avx512fp16intrin.h: New intrinsic header.
* config/i386/cpuid.h: Add bit_AVX512FP16.
* config/i386/i386-builtin-types.def: (FLOAT16): New primitive type.
* config/i386/i386-builtins.c: Support _Float16 type for i386
backend.
(ix86_init_float16_builtins): New function.
(ix86_float16_type_node): New.
* config/i386/i386-c.c (ix86_target_macros_internal): Define
__AVX512FP16__.
* config/i386/i386-expand.c (ix86_expand_branch): Support
HFmode.
(ix86_prepare_fp_compare_args): Adjust TARGET_SSE_MATH &&
SSE_FLOAT_MODE_P to SSE_FLOAT_MODE_SSEMATH_OR_HF_P.
(ix86_expand_fp_movcc): Ditto.
* config/i386/i386-isa.def: Add PTA define for AVX512FP16.
* config/i386/i386-options.c (isa2_opts): Add -mavx512fp16.
(ix86_valid_target_attribute_inner_p): Add avx512fp16 attribute.
* config/i386/i386.c (ix86_get_ssemov): Use
vmovdqu16/vmovw/vmovsh for HFmode/HImode scalar or vector.
(ix86_get_excess_precision): Use
FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when TARGET_AVX512FP16
is enabled.
(sse_store_index): Use SFmode cost for HFmode cost.
(inline_memory_move_cost): Add HFmode, and prefer SSE cost over
GPR cost for HFmode.
(ix86_hard_regno_mode_ok): Allow HImode in sse register.
(ix86_mangle_type): Add mangling for _Float16 type.
(inline_secondary_memory_needed): No memory is needed for
16bit movement between gpr and sse reg under
TARGET_AVX512FP16.
(ix86_multiplication_cost): Adjust TARGET_SSE_MATH &&
SSE_FLOAT_MODE_P to SSE_FLOAT_MODE_SSEMATH_OR_HF_P.
(ix86_division_cost): Ditto.
(ix86_rtx_costs): Ditto.
(ix86_add_stmt_cost): Ditto.
(ix86_optab_supported_p): Ditto.
* config/i386/i386.h (VALID_AVX512F_SCALAR_MODE): Add HFmode.
(SSE_FLOAT_MODE_SSEMATH_OR_HF_P): Add HFmode.
(PTA_SAPPHIRERAPIDS): Add PTA_AVX512FP16.
* config/i386/i386.md (mode): Add HFmode.
(MODE_SIZE): Add HFmode.
(isa): Add avx512fp16.
(enabled): Handle avx512fp16.
(ssemodesuffix): Add sh suffix for HFmode.
(comm): Add mult, div.
(plusminusmultdiv): New code iterator.
(insn): Add mult, div.
(*movhf_internal): Adjust for avx512fp16 instruction.
(*movhi_internal): Ditto.
(*cmpihf): New define_insn for HFmode.
(*ieee_shf3): Likewise.
(extendhf2): Likewise.
(trunchf2): Likewise.
(floathf2): Likewise.
(*hf): Likewise.
(cbranchhf4): New expander.
(movhfcc): Likewise.
(hf3): Likewise.
(mulhf3): Likewise.
(divhf3): Likewise.
* config/i386/i386.opt: Add mavx512fp16.
* config/i386/immintrin.h: Include avx512fp16intrin.h.
* doc/invoke.texi: Add mavx512fp16.
* doc/extend.texi: Add avx512fp16 Usage Notes.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add -mavx512fp16 in dg-options.
* gcc.target/i386/avx-2.c: Ditto.
* gcc.target/i386/avx512-check.h: Check cpuid for AVX512FP16.
* gcc.target/i386/funcspec-56.inc: Add new target attribute check.
* gcc.target/i386/sse-13.c: Add -mavx512fp16.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp: (check_effective_target_avx512fp16): New.
* g++.target/i386/float16-1.C: New test.
* g++.target/i386/float16-2.C: Ditto.
* g++.target/i386/float16-3.C: Ditto.
* gcc.target/i386/avx512fp16-12a.c: Ditto.
* gcc.target/i386/avx512fp16-12b.c: Ditto.
* gcc.target/i386/float16-3a.c: Ditto.
* gcc.target/i386/float16-3b.c: Ditto.
* gcc.target/i386/float16-4a.c: Ditto.
* gcc.target/i386/float16-4b.c: Ditto.
* gcc.target/i386/pr54855-12.c: Ditto.
* g++.dg/other/i386-2.C: Ditto.
* g++.dg/other/i386-3.C: Ditto.

Co-Authored-By: H.J. Lu 
Co-Authored-By: Liu Hongtao 
Co-Authored-By: Wang Hongyu 
Co-Authored-By: Xu Dianhong 
---
 gcc/common/config/i386/cpuinfo.h  |   2 +
 gcc/c

[PATCH] Add cond_add/sub/mul for vector integer modes.

2021-08-02 Thread liuhongt via Gcc-patches

Hi:
  This is a follow up of [1].
  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Pushed to trunk.
[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576514.html

gcc/ChangeLog:

* config/i386/sse.md (cond_): New expander.
(cond_mul): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_addsubmul_d-1.c: New test.
* gcc.target/i386/cond_op_addsubmul_d-2.c: New test.
* gcc.target/i386/cond_op_addsubmul_q-1.c: New test.
* gcc.target/i386/cond_op_addsubmul_q-2.c: New test.
* gcc.target/i386/cond_op_addsubmul_w-1.c: New test.
* gcc.target/i386/cond_op_addsubmul_w-2.c: New test.
---
 gcc/config/i386/sse.md| 88 +--
 .../gcc.target/i386/cond_op_addsubmul_d-1.c   | 32 +++
 .../gcc.target/i386/cond_op_addsubmul_d-2.c   | 76 
 .../gcc.target/i386/cond_op_addsubmul_q-1.c   |  7 ++
 .../gcc.target/i386/cond_op_addsubmul_q-2.c   |  4 +
 .../gcc.target/i386/cond_op_addsubmul_w-1.c   |  6 ++
 .../gcc.target/i386/cond_op_addsubmul_w-2.c   |  5 ++
 7 files changed, 210 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8bf1764d3d5..52b2b4214d7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -333,6 +333,14 @@ (define_mode_iterator VI48_AVX512VL
   [V16SI (V8SI  "TARGET_AVX512VL") (V4SI  "TARGET_AVX512VL")
V8DI  (V4DI  "TARGET_AVX512VL") (V2DI  "TARGET_AVX512VL")])
 
+(define_mode_iterator VI1248_AVX512VLBW
+  [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V16QI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V8HI "TARGET_AVX512VL && TARGET_AVX512BW")
+   V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
+   V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
+
 (define_mode_iterator VF_AVX512VL
   [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
@@ -11803,6 +11811,24 @@ (define_expand "3"
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (, mode, operands);")
 
+(define_expand "cond_"
+  [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand")
+   (vec_merge:VI1248_AVX512VLBW
+ (plusminus:VI1248_AVX512VLBW
+   (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand")
+   (match_operand:VI1248_AVX512VLBW 3 "nonimmediate_operand"))
+ (match_operand:VI1248_AVX512VLBW 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_3_mask (operands[0],
+operands[2],
+operands[3],
+operands[4],
+operands[1]));
+  DONE;
+})
+
 (define_expand "3_mask"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
@@ -11929,6 +11955,24 @@ (define_expand "mul3"
   DONE;
 })
 
+(define_expand "cond_mul"
+  [(set (match_operand:VI2_AVX512VL 0 "register_operand")
+   (vec_merge:VI2_AVX512VL
+ (mult:VI2_AVX512VL
+   (match_operand:VI2_AVX512VL 2 "vector_operand")
+   (match_operand:VI2_AVX512VL 3 "vector_operand"))
+ (match_operand:VI2_AVX512VL 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512BW"
+{
+  emit_insn (gen_mul3_mask (operands[0],
+ operands[2],
+ operands[3],
+ operands[4],
+ operands[1]));
+  DONE;
+})
+
 (define_expand "mul3"
   [(set (match_operand:VI2_AVX2 0 "register_operand")
(mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "vector_operand")
@@ -12363,6 +12407,24 @@ (define_insn "*sse2_pmaddwd"
(set_attr "prefix" "orig,vex")
(set_attr "mode" "TI")])
 
+(define_expand "cond_mul"
+  [(set (match_operand:VI8_AVX512VL 0 "register_operand")
+   (vec_merge:VI8_AVX512VL
+ (mult:VI8_AVX512VL
+   (match_operand:VI8_AVX512VL 2 "vector_operand")
+   (match_operand:VI8_AVX512VL 3 "vector_operand"))
+ (match_operand:VI8_AVX512VL 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512DQ"
+{
+  emit_insn (gen_avx512dq_mul3_mask (operands[0],
+  operands[2],
+  operands[3],
+

[PATCH] [i386] Refine predicate of peephole2 to general_reg_operand. [PR target/101743]

2021-08-03 Thread liuhongt via Gcc-patches

Hi:
  The define_peephole2 patterns added by r12-2640-gf7bf03cf69ccb7dc
should only operate on general registers.  Since x86 also supports mov
instructions between GPRs, SSE registers and mask registers, this patch
limits the peephole2 predicates to general_reg_operand.
  I failed to construct a testcase, but I believe that the PR problem
should be solved by this patch.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

PR target/101743
* config/i386/i386.md (peephole2): Refine predicate from
register_operand to general_reg_operand.
---
 gcc/config/i386/i386.md | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 0c23ddb8d1f..51e8b475bca 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -19423,11 +19423,11 @@ (define_peephole2
 ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#1).
 ;; mov r0,r1; dec r0; mov r2,r3; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1
 (define_peephole2
- [(set (match_operand:SWI248 0 "register_operand")
-   (match_operand:SWI248 1 "register_operand"))
+ [(set (match_operand:SWI248 0 "general_reg_operand")
+   (match_operand:SWI248 1 "general_reg_operand"))
   (parallel [(set (reg FLAGS_REG) (match_operand 5))
 (set (match_dup 0) (match_operand:SWI248 6))])
-  (set (match_operand:SWI248 2 "register_operand")
+  (set (match_operand:SWI248 2 "general_reg_operand")
(match_operand:SWI248 3))
   (set (match_dup 0)
(if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator"
@@ -19455,10 +19455,10 @@ (define_peephole2
 ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#2).
 ;; mov r2,r3; mov r0,r1; dec r0; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1
 (define_peephole2
- [(set (match_operand:SWI248 2 "register_operand")
+ [(set (match_operand:SWI248 2 "general_reg_operand")
(match_operand:SWI248 3))
-  (set (match_operand:SWI248 0 "register_operand")
-   (match_operand:SWI248 1 "register_operand"))
+  (set (match_operand:SWI248 0 "general_reg_operand")
+   (match_operand:SWI248 1 "general_reg_operand"))
   (parallel [(set (reg FLAGS_REG) (match_operand 5))
 (set (match_dup 0) (match_operand:SWI248 6))])
   (set (match_dup 0)
-- 
2.27.0

[PATCH] [i386] Support cond_{fma, fms, fnma, fnms} for vector float/double under AVX512.

2021-08-03 Thread liuhongt via Gcc-patches

Hi:
  This patch adds expanders cond_{fma,fms,fnma,fnms}
for vector float/double modes.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Pushed to trunk.

gcc/ChangeLog:

* config/i386/sse.md (cond_fma): New expander.
(cond_fms): Ditto.
(cond_fnma): Ditto.
(cond_fnms): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_fma_double-1.c: New test.
* gcc.target/i386/cond_op_fma_double-2.c: New test.
* gcc.target/i386/cond_op_fma_float-1.c: New test.
* gcc.target/i386/cond_op_fma_float-2.c: New test.
---
 gcc/config/i386/sse.md|  96 
 .../gcc.target/i386/cond_op_fma_double-1.c|  87 
 .../gcc.target/i386/cond_op_fma_double-2.c| 206 ++
 .../gcc.target/i386/cond_op_fma_float-1.c |  20 ++
 .../gcc.target/i386/cond_op_fma_float-2.c |   4 +
 5 files changed, 413 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 52b2b4214d7..f5968e04669 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4438,6 +4438,29 @@ (define_insn 
"fma_fmadd_"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand "cond_fma"
+  [(set (match_operand:VF_AVX512VL 0 "register_operand")
+   (vec_merge:VF_AVX512VL
+ (fma:VF_AVX512VL
+   (match_operand:VF_AVX512VL 2 "vector_operand")
+   (match_operand:VF_AVX512VL 3 "vector_operand")
+   (match_operand:VF_AVX512VL 4 "vector_operand"))
+ (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  rtx tmp = gen_reg_rtx (mode);
+  emit_insn (gen_fma4 (tmp,
+operands[2],
+operands[3],
+operands[4]));
+  emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode,
+ tmp,
+ operands[5],
+ operands[1]));
+  DONE;
+})
+
 (define_insn "_fmadd__mask"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VF_AVX512VL
@@ -4515,6 +4538,30 @@ (define_insn 
"fma_fmsub_"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand "cond_fms"
+  [(set (match_operand:VF_AVX512VL 0 "register_operand")
+   (vec_merge:VF_AVX512VL
+ (fma:VF_AVX512VL
+   (match_operand:VF_AVX512VL 2 "vector_operand")
+   (match_operand:VF_AVX512VL 3 "vector_operand")
+   (neg:VF_AVX512VL
+ (match_operand:VF_AVX512VL 4 "vector_operand")))
+ (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  rtx tmp = gen_reg_rtx (mode);
+  emit_insn (gen_fms4 (tmp,
+operands[2],
+operands[3],
+operands[4]));
+  emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode,
+ tmp,
+ operands[5],
+ operands[1]));
+  DONE;
+})
+
 (define_insn "_fmsub__mask"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VF_AVX512VL
@@ -4594,6 +4641,30 @@ (define_insn 
"fma_fnmadd_"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand "cond_fnma"
+  [(set (match_operand:VF_AVX512VL 0 "register_operand")
+   (vec_merge:VF_AVX512VL
+ (fma:VF_AVX512VL
+   (neg:VF_AVX512VL
+ (match_operand:VF_AVX512VL 2 "vector_operand"))
+   (match_operand:VF_AVX512VL 3 "vector_operand")
+   (match_operand:VF_AVX512VL 4 "vector_operand"))
+ (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  rtx tmp = gen_reg_rtx (mode);
+  emit_insn (gen_fnma4 (tmp,
+ operands[2],
+ operands[3],
+ operands[4]));
+  emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode,
+ tmp,
+ operands[5],
+ operands[1]));
+  DONE;
+})
+
 (define_insn "_fnmadd__mask"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VF_AVX512VL
@@ -4675,6 +4746,31 @@ (define_insn 
"fma_fnmsub_"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand

[PATCH] Add dg-require-effective-target for testcases.

2021-08-03 Thread liuhongt via Gcc-patches

Hi:
  Pushed to trunk as an obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_addsubmul_d-2.c: Add
dg-require-effective-target for avx512.
* gcc.target/i386/cond_op_addsubmul_q-2.c: Ditto.
* gcc.target/i386/cond_op_addsubmul_w-2.c: Ditto.
* gcc.target/i386/cond_op_addsubmuldiv_double-2.c: Ditto.
* gcc.target/i386/cond_op_addsubmuldiv_float-2.c: Ditto.
* gcc.target/i386/cond_op_fma_double-2.c: Ditto.
* gcc.target/i386/cond_op_fma_float-2.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c | 2 ++
 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c | 2 ++
 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c | 2 ++
 gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c | 1 +
 gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c  | 1 +
 gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c  | 2 ++
 gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c   | 1 +
 7 files changed, 11 insertions(+)

diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c
index 490f4afbf18..046804bacbd 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512vl } */
+
 #define AVX512VL
 #ifndef CHECK
 #define CHECK "avx512f-helper.h"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c
index 09a87deb529..56245b143fa 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c
@@ -1,4 +1,6 @@
 /* { dg-do run { target { ! ia32 } } } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -mavx512dq 
-DTYPE=long" } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512dq } */
 
 #include "cond_op_addsubmul_d-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c
index fdcdb34346c..bdcd2ef3db7 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -mavx512bw 
-DTYPE=short" } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
 
 #define AVX512BW
 #include "cond_op_addsubmul_d-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c
index 360891f3d21..5ec38df5933 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512vl } */
 
 #define AVX512VL
 #ifndef CHECK
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c
index 20ed737cbf3..c99c04c0b41 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c
@@ -1,4 +1,5 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float" } */
+/* { dg-require-effective-target avx512vl } */
 
 #include "cond_op_addsubmuldiv_double-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c
index d8180de7491..4c6514e756c 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512vl } */
+
 #define AVX512VL
 #ifndef CHECK
 #define CHECK "avx512f-helper.h"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c
index 0097735dddb..e13d37720fe 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c
@@ -1,4 +1,5 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float 
-D__BUILTIN_FMA=__builtin_fmaf" } */
+/* { dg-require-effective-target avx512vl } */
 
 #include "cond_op_fma_double-2.c"
-- 
2.18.1

[PATCH 0/3] [i386] Support cond_{smax, smin, umax, umin, xor, ior, and} for vector modes under AVX512

2021-08-04 Thread liuhongt via Gcc-patches

Hi:
  Together with the previous 3 patches, all cond_op expanders of vector
modes are supported (if they have a corresponding avx512 mask instruction).

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  
liuhongt (3):
  [i386] Support cond_{smax,smin,umax,umin} for vector integer modes
under AVX512.
  [i386] Support cond_{smax,smin} for vector float/double modes under
AVX512.
  [i386] Support cond_{xor,ior,and} for vector integer mode under
AVX512.

 gcc/config/i386/sse.md| 54 +
 .../gcc.target/i386/cond_op_anylogic_d-1.c| 38 +
 .../gcc.target/i386/cond_op_anylogic_d-2.c| 78 +++
 .../gcc.target/i386/cond_op_anylogic_q-1.c| 10 +++
 .../gcc.target/i386/cond_op_anylogic_q-2.c|  5 ++
 .../gcc.target/i386/cond_op_maxmin_b-1.c  |  8 ++
 .../gcc.target/i386/cond_op_maxmin_b-2.c  |  6 ++
 .../gcc.target/i386/cond_op_maxmin_d-1.c  | 41 ++
 .../gcc.target/i386/cond_op_maxmin_d-2.c  | 67 
 .../gcc.target/i386/cond_op_maxmin_double-1.c | 39 ++
 .../gcc.target/i386/cond_op_maxmin_double-2.c | 67 
 .../gcc.target/i386/cond_op_maxmin_float-1.c  |  8 ++
 .../gcc.target/i386/cond_op_maxmin_float-2.c  |  5 ++
 .../gcc.target/i386/cond_op_maxmin_q-1.c  |  8 ++
 .../gcc.target/i386/cond_op_maxmin_q-2.c  |  5 ++
 .../gcc.target/i386/cond_op_maxmin_ub-1.c |  8 ++
 .../gcc.target/i386/cond_op_maxmin_ub-2.c |  6 ++
 .../gcc.target/i386/cond_op_maxmin_ud-1.c |  8 ++
 .../gcc.target/i386/cond_op_maxmin_ud-2.c |  5 ++
 .../gcc.target/i386/cond_op_maxmin_uq-1.c |  8 ++
 .../gcc.target/i386/cond_op_maxmin_uq-2.c |  5 ++
 .../gcc.target/i386/cond_op_maxmin_uw-1.c |  8 ++
 .../gcc.target/i386/cond_op_maxmin_uw-2.c |  6 ++
 .../gcc.target/i386/cond_op_maxmin_w-1.c  |  8 ++
 .../gcc.target/i386/cond_op_maxmin_w-2.c  |  6 ++
 25 files changed, 507 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c

-- 
2.18.1

[PATCH 1/3] [i386] Support cond_{smax, smin, umax, umin} for vector integer modes under AVX512.

2021-08-04 Thread liuhongt via Gcc-patches

gcc/ChangeLog:

* config/i386/sse.md (cond_): New expander.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_maxmin_b-1.c: New test.
* gcc.target/i386/cond_op_maxmin_b-2.c: New test.
* gcc.target/i386/cond_op_maxmin_d-1.c: New test.
* gcc.target/i386/cond_op_maxmin_d-2.c: New test.
* gcc.target/i386/cond_op_maxmin_q-1.c: New test.
* gcc.target/i386/cond_op_maxmin_q-2.c: New test.
* gcc.target/i386/cond_op_maxmin_ub-1.c: New test.
* gcc.target/i386/cond_op_maxmin_ub-2.c: New test.
* gcc.target/i386/cond_op_maxmin_ud-1.c: New test.
* gcc.target/i386/cond_op_maxmin_ud-2.c: New test.
* gcc.target/i386/cond_op_maxmin_uq-1.c: New test.
* gcc.target/i386/cond_op_maxmin_uq-2.c: New test.
* gcc.target/i386/cond_op_maxmin_uw-1.c: New test.
* gcc.target/i386/cond_op_maxmin_uw-2.c: New test.
* gcc.target/i386/cond_op_maxmin_w-1.c: New test.
* gcc.target/i386/cond_op_maxmin_w-2.c: New test.
---
 gcc/config/i386/sse.md| 18 +
 .../gcc.target/i386/cond_op_maxmin_b-1.c  |  8 +++
 .../gcc.target/i386/cond_op_maxmin_b-2.c  |  6 ++
 .../gcc.target/i386/cond_op_maxmin_d-1.c  | 41 
 .../gcc.target/i386/cond_op_maxmin_d-2.c  | 67 +++
 .../gcc.target/i386/cond_op_maxmin_q-1.c  |  8 +++
 .../gcc.target/i386/cond_op_maxmin_q-2.c  |  5 ++
 .../gcc.target/i386/cond_op_maxmin_ub-1.c |  8 +++
 .../gcc.target/i386/cond_op_maxmin_ub-2.c |  6 ++
 .../gcc.target/i386/cond_op_maxmin_ud-1.c |  8 +++
 .../gcc.target/i386/cond_op_maxmin_ud-2.c |  5 ++
 .../gcc.target/i386/cond_op_maxmin_uq-1.c |  8 +++
 .../gcc.target/i386/cond_op_maxmin_uq-2.c |  5 ++
 .../gcc.target/i386/cond_op_maxmin_uw-1.c |  8 +++
 .../gcc.target/i386/cond_op_maxmin_uw-2.c |  6 ++
 .../gcc.target/i386/cond_op_maxmin_w-1.c  |  8 +++
 .../gcc.target/i386/cond_op_maxmin_w-2.c  |  6 ++
 17 files changed, 221 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f5968e04669..6035411ea75 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -13070,6 +13070,24 @@ (define_insn "*avx2_3"
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
 
+(define_expand "cond_"
+  [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand")
+   (vec_merge:VI1248_AVX512VLBW
+ (maxmin:VI1248_AVX512VLBW
+   (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand")
+   (match_operand:VI1248_AVX512VLBW 3 "nonimmediate_operand"))
+ (match_operand:VI1248_AVX512VLBW 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_3_mask (operands[0],
+operands[2],
+operands[3],
+operands[4],
+operands[1]));
+  DONE;
+})
+
 (define_expand "3_mask"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c 
b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c
new file mode 100644
index 000..78c6600f83b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int8 -fdump-tree-optimized" 
} */
+/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */
+/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */
+/* { dg-final { scan-assembler-times "vpmaxsb"  1 } } */
+/* { dg-final { scan-assembler-times "vpminsb"  1 } } */
+
+#include "cond_op_maxmin_d-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c 
b/gcc/testsuite

[PATCH 3/3] [i386] Support cond_{xor, ior, and} for vector integer mode under AVX512.

2021-08-04 Thread liuhongt via Gcc-patches

gcc/ChangeLog:

* config/i386/sse.md (cond_): New expander.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_anylogic_d-1.c: New test.
* gcc.target/i386/cond_op_anylogic_d-2.c: New test.
* gcc.target/i386/cond_op_anylogic_q-1.c: New test.
* gcc.target/i386/cond_op_anylogic_q-2.c: New test.
---
 gcc/config/i386/sse.md| 18 +
 .../gcc.target/i386/cond_op_anylogic_d-1.c| 38 +
 .../gcc.target/i386/cond_op_anylogic_d-2.c| 78 +++
 .../gcc.target/i386/cond_op_anylogic_q-1.c| 10 +++
 .../gcc.target/i386/cond_op_anylogic_q-2.c|  5 ++
 5 files changed, 149 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 51733a3849d..a46a2373547 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -14063,6 +14063,24 @@ (define_expand "3"
   DONE;
 })
 
+(define_expand "cond_"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
+   (vec_merge:VI48_AVX512VL
+ (any_logic:VI48_AVX512VL
+   (match_operand:VI48_AVX512VL 2 "vector_operand")
+   (match_operand:VI48_AVX512VL 3 "vector_operand"))
+ (match_operand:VI48_AVX512VL 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_3_mask (operands[0],
+operands[2],
+operands[3],
+operands[4],
+operands[1]));
+  DONE;
+})
+
 (define_insn "3"
   [(set (match_operand:VI48_AVX_AVX512F 0 "register_operand" "=x,x,v")
(any_logic:VI48_AVX_AVX512F
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c 
b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c
new file mode 100644
index 000..8951f4a3a27
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump ".COND_AND" "optimized" } } */
+/* { dg-final { scan-tree-dump ".COND_XOR" "optimized" } } */
+/* { dg-final { scan-tree-dump ".COND_IOR" "optimized" } } */
+/* { dg-final { scan-assembler-times "vpxord"  1 } } */
+/* { dg-final { scan-assembler-times "vpord"  1 } } */
+/* { dg-final { scan-assembler-times "vpandd"  1 } } */
+
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+
+#ifndef NUM
+#define NUM 800
+#endif
+#ifndef TYPE
+#define TYPE int
+#endif
+
+TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM];
+
+#define BIN(OPNAME, OP)\
+  void \
+  __attribute__ ((noipa,optimize ("O3")))  \
+  foo_##OPNAME ()  \
+  {\
+for (int i = 0; i != NUM; i++) \
+  if (b[i] < c[i]) \
+   a[i] = d[i] OP e[i];\
+  else \
+   a[i] = d[i] - e[i]; \
+  }
+
+BIN (and, &);
+BIN (ior, |);
+BIN (xor, ^);
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c
new file mode 100644
index 000..23ca4120cf2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#include "cond_op_anylogic_d-1.c"
+#define BINO2(OPNAME, OP)  \
+  void \
+  __attribute__ ((noipa,optimize ("O2")))  \
+  foo_o2_##OPNAME ()   \
+  {\
+for (int i = 0; i != NUM; i++) \
+  if (b[i] < c[i]) \
+   j[i] = d[i] OP e[i];\
+  else \
+   j[i] = d[i] - e[i]; \
+  }
+
+BINO2 (and, &);
+BINO2 (ior, |);
+BINO2 (xor, ^);
+
+static void
+test_256 (void)
+{
+  int sign = -1;
+  for (int i = 0; i != NUM; i++)
+{
+  a[i] = 0;
+  d[i] = i * 2;
+  e[i] = i * i * 3 - i * 9 + 153;
+  b[i] = i * 83;
+  c[i] = b[i] + sign;
+  sign *= -1;
+  j[i] = 1;
+}
+  foo_and ();
+  foo_o2_and ();
+  for (int i = 0; i != NUM; i++)
+{
+  if (a[i] != j[i])
+   abort ();
+  a[i] = 0;

[PATCH 2/3] [i386] Support cond_{smax, smin} for vector float/double modes under AVX512.

2021-08-04 Thread liuhongt via Gcc-patches

gcc/ChangeLog:

* config/i386/sse.md (cond_): New expander.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_maxmin_double-1.c: New test.
* gcc.target/i386/cond_op_maxmin_double-2.c: New test.
* gcc.target/i386/cond_op_maxmin_float-1.c: New test.
* gcc.target/i386/cond_op_maxmin_float-2.c: New test.
---
 gcc/config/i386/sse.md| 18 +
 .../gcc.target/i386/cond_op_maxmin_double-1.c | 39 +++
 .../gcc.target/i386/cond_op_maxmin_double-2.c | 67 +++
 .../gcc.target/i386/cond_op_maxmin_float-1.c  |  8 +++
 .../gcc.target/i386/cond_op_maxmin_float-2.c  |  5 ++
 5 files changed, 137 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6035411ea75..51733a3849d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2376,6 +2376,24 @@ (define_insn "*sse_vmrsqrtv4sf2"
(set_attr "prefix" "orig,vex")
(set_attr "mode" "SF")])
 
+(define_expand "cond_"
+  [(set (match_operand:VF 0 "register_operand")
+   (vec_merge:VF
+ (smaxmin:VF
+   (match_operand:VF 2 "vector_operand")
+   (match_operand:VF 3 "vector_operand"))
+ (match_operand:VF 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  " == 64 || TARGET_AVX512VL"
+{
+  emit_insn (gen_3_mask (operands[0],
+operands[2],
+operands[3],
+operands[4],
+operands[1]));
+  DONE;
+})
+
 (define_expand "3"
   [(set (match_operand:VF 0 "register_operand")
(smaxmin:VF
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c 
b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c
new file mode 100644
index 000..eda8e1974b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */
+/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */
+/* { dg-final { scan-assembler-times "vmaxpd"  1 } } */
+/* { dg-final { scan-assembler-times "vminpd"  1 } } */
+
+#include
+#ifndef NUM
+#define NUM 800
+#endif
+#ifndef TYPE
+#define TYPE double
+#endif
+#ifndef FN_MAX
+#define FN_MAX fmax
+#endif
+#ifndef FN_MIN
+#define FN_MIN fmin
+#endif
+
+TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM];
+#define MAX FN_MAX
+#define MIN FN_MIN
+
+#define BIN(OPNAME, OP)\
+  void \
+  __attribute__ ((noipa,optimize ("Ofast")))   \
+  foo_##OPNAME ()  \
+  {\
+for (int i = 0; i != NUM; i++) \
+  if (b[i] < c[i]) \
+   a[i] = (OP (d[i], e[i]));   \
+  else \
+   a[i] = d[i] - e[i]; \
+  }
+
+BIN (max, MAX);
+BIN (min, MIN);
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c
new file mode 100644
index 000..c50a831000a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -ffast-math" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#include "cond_op_maxmin_double-1.c"
+#define BINO2(OPNAME, OP)  \
+  void \
+  __attribute__ ((noipa))  \
+  foo_o2_##OPNAME ()   \
+  {\
+for (int i = 0; i != NUM; i++) \
+  if (b[i] < c[i]) \
+   j[i] = OP(d[i], e[i]);  \
+  else \
+   j[i] = d[i] - e[i]; \
+  }
+
+BINO2 (max, MAX);
+BINO2 (min, MIN);
+
+static void
+test_256 (void)
+{
+  int sign = -1;
+  for (int i = 0; i != NUM; i++)
+{
+  a[i] = 0;
+  d[i] = i * 2;
+  e[i] = i * i * 3 - i * 9 + 153;
+  b[i] = i * 83;
+  c[i] = b[i] + sign;
+  sign *= -1;
+  j[i] = 1;
+}
+  foo_max ();
+  foo_o2_max ();
+  for (int i = 0; i != NUM; i++)
+{
+  if (a[i] != j[i])
+   abort ();
+  a[i] = 0;
+  b[i] = 1;
+}
+
+  foo_min ();
+  foo_o2_min ();
+  for (int i = 0; i != NUM; i+

[PATCH] Make sure we're playing with integral modes before call extract_integral_bit_field.

2021-08-05 Thread liuhongt via Gcc-patches

Hi:
---
OK, I think sth is amiss here upthread.  insv/extv do look like they
are designed
to work on integer modes (but docs do not say anything about this here).
In fact the caller of extract_bit_field_using_extv is named
extract_integral_bit_field.  Of course nothing seems to check what kind of
modes we're dealing with, but we're for example happily doing
expand_shift in 'mode'.  In the extract_integral_bit_field call 'mode' is
some integer mode and op0 is HFmode?  From the above I get it's
the other way around?  In that case we should wrap the
call to extract_integral_bit_field, extracting in an integer mode with the
same size as 'mode' and then converting the result as (subreg:HF (reg:HI ...)).
---
  This is a separate patch as a follow-up to the comments quoted above.
 
gcc/ChangeLog:

* expmed.c (extract_bit_field_1): Wrap the call to
extract_integral_bit_field, extracting in an integer mode with
the same size as 'tmode' and then converting the result
as (subreg:tmode (reg:imode)).

gcc/testsuite/ChangeLog:
* gcc.target/i386/float16-5.c: New test.
---
 gcc/expmed.c  | 19 +++
 gcc/testsuite/gcc.target/i386/float16-5.c | 12 
 2 files changed, 31 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/float16-5.c

diff --git a/gcc/expmed.c b/gcc/expmed.c
index 3143f38e057..72790693ef0 100644
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -1850,6 +1850,25 @@ extract_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, 
poly_uint64 bitnum,
   op0_mode = opt_scalar_int_mode ();
 }
 
+  /* Make sure we are playing with integral modes.  Pun with subregs
+ if we aren't. When tmode is HFmode, op0 is SImode, there will be ICE
+ in extract_integral_bit_field.  */
+  if (int_mode_for_mode (tmode).exists (&imode)
+  && imode != tmode
+  && imode != GET_MODE (op0))
+{
+  rtx ret = extract_integral_bit_field (op0, op0_mode,
+   bitsize.to_constant (),
+   bitnum.to_constant (), unsignedp,
+   NULL, imode, imode,
+   reverse, fallback_p);
+  gcc_assert (ret);
+
+  if (!REG_P (ret))
+   ret = force_reg (imode, ret);
+  return gen_lowpart_SUBREG (tmode, ret);
+}
+
   /* It's possible we'll need to handle other cases here for
  polynomial bitnum and bitsize.  */
 
diff --git a/gcc/testsuite/gcc.target/i386/float16-5.c 
b/gcc/testsuite/gcc.target/i386/float16-5.c
new file mode 100644
index 000..ebc0af1490b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/float16-5.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+_Float16
+foo (int a)
+{
+  union {
+int a;
+_Float16 b;
+  }c;
+  c.a = a;
+  return c.b;
+}
-- 
2.27.0

[PATCH] [rtl-optimization] Simplify vector shift/rotate with const_vec_duplicate to vector shift/rotate with const_int element.

2021-08-06 Thread liuhongt via Gcc-patches

Hi:
  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}
  Ok for trunk?

gcc/ChangeLog:

PR rtl-optimization/101796
* simplify-rtx.c
(simplify_context::simplify_binary_operation_1): Simplify
vector shift/rotate with const_vec_duplicate to vector
shift/rotate with const_int element.

gcc/testsuite/ChangeLog:

PR rtl-optimization/101796
* gcc.target/i386/pr101796.c: New test.
---
 gcc/simplify-rtx.c   | 15 ++
 gcc/testsuite/gcc.target/i386/pr101796.c | 65 
 2 files changed, 80 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101796.c

diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index a719f57870f..75f3e455562 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -3970,6 +3970,21 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
return simplify_gen_binary (code, mode, op0,
gen_int_shift_amount (mode, val));
}
+
+  /* Optimize vector shift/rotate with const_vec_duplicate
+to vector shift/rotate with const_int element.
+  /* TODO: vec_duplicate with variable can also be simplified,
+but GCC only require operand 2 of shift/rotate to be a scalar type
+which can have different modes in different backends, it makes
+simplication difficult to decide which mode should be choosed
+for shift/rotate count.  */
+  if ((code == ASHIFTRT || code == LSHIFTRT
+  || code == ASHIFT || code == ROTATERT
+  || code == ROTATE)
+ && const_vec_duplicate_p (op1))
+   return simplify_gen_binary (code, mode, op0,
+   unwrap_const_vec_duplicate (op1));
+
   break;
 
 case ASHIFT:
diff --git a/gcc/testsuite/gcc.target/i386/pr101796.c 
b/gcc/testsuite/gcc.target/i386/pr101796.c
new file mode 100644
index 000..c22d6267fe5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101796.c
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2 " } */
+/* { dg-final { scan-assembler-not "vpbroadcast" } }  */
+/* { dg-final { scan-assembler-not "vpsrlv\[dwq\]" } }  */
+/* { dg-final { scan-assembler-not "vpsllv\[dwq\]" } }  */
+/* { dg-final { scan-assembler-not "vpsrav\[dwq\]" } }  */
+/* { dg-final { scan-assembler-times "vpsrl\[dwq\]" 3 } }  */
+/* { dg-final { scan-assembler-times "vpsll\[dwq\]" 3 } }  */
+/* { dg-final { scan-assembler-times "vpsra\[dwq\]" 3 } }  */
+
+#include 
+
+__m512i
+foo (__m512i a)
+{
+  return _mm512_srlv_epi16 (a, _mm512_set1_epi16 (3));
+}
+
+__m512i
+foo1 (__m512i a)
+{
+  return _mm512_srlv_epi32 (a, _mm512_set1_epi32 (3));
+}
+
+__m512i
+foo2 (__m512i a, long long b)
+{
+  return _mm512_srlv_epi64 (a, _mm512_set1_epi64 (3));
+}
+
+__m512i
+foo3 (__m512i a)
+{
+  return _mm512_srav_epi16 (a, _mm512_set1_epi16 (3));
+}
+
+__m512i
+foo4 (__m512i a)
+{
+  return _mm512_srav_epi32 (a, _mm512_set1_epi32 (3));
+}
+
+__m512i
+foo5 (__m512i a, long long b)
+{
+  return _mm512_srav_epi64 (a, _mm512_set1_epi64 (3));
+}
+
+__m512i
+foo6 (__m512i a)
+{
+  return _mm512_sllv_epi16 (a, _mm512_set1_epi16 (3));
+}
+
+__m512i
+foo7 (__m512i a)
+{
+  return _mm512_sllv_epi32 (a, _mm512_set1_epi32 (3));
+}
+
+__m512i
+foo8 (__m512i a, long long b)
+{
+  return _mm512_sllv_epi64 (a, _mm512_set1_epi64 (3));
+}
-- 
2.27.0

[PATCH] [i386] Support cond_ashr/lshr/ashl for vector integer modes under AVX512.

2021-08-09 Thread liuhongt via Gcc-patches

Hi:
  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.

gcc/ChangeLog:

* config/i386/sse.md (cond_): New expander.
(VI248_AVX512VLBW): New mode iterator.
* config/i386/predicates.md
(nonimmediate_or_const_vec_dup_operand): New predicate.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_shift_d-1.c: New test.
* gcc.target/i386/cond_op_shift_d-2.c: New test.
* gcc.target/i386/cond_op_shift_q-1.c: New test.
* gcc.target/i386/cond_op_shift_q-2.c: New test.
* gcc.target/i386/cond_op_shift_ud-1.c: New test.
* gcc.target/i386/cond_op_shift_ud-2.c: New test.
* gcc.target/i386/cond_op_shift_uq-1.c: New test.
* gcc.target/i386/cond_op_shift_uq-2.c: New test.
* gcc.target/i386/cond_op_shift_uw-1.c: New test.
* gcc.target/i386/cond_op_shift_uw-2.c: New test.
* gcc.target/i386/cond_op_shift_w-1.c: New test.
* gcc.target/i386/cond_op_shift_w-2.c: New test.
---
 gcc/config/i386/predicates.md |   4 +
 gcc/config/i386/sse.md|  36 +++
 .../gcc.target/i386/cond_op_shift_d-1.c   |  56 ++
 .../gcc.target/i386/cond_op_shift_d-2.c   | 102 ++
 .../gcc.target/i386/cond_op_shift_q-1.c   |  11 ++
 .../gcc.target/i386/cond_op_shift_q-2.c   |   5 +
 .../gcc.target/i386/cond_op_shift_ud-1.c  |  10 ++
 .../gcc.target/i386/cond_op_shift_ud-2.c  |   5 +
 .../gcc.target/i386/cond_op_shift_uq-1.c  |  10 ++
 .../gcc.target/i386/cond_op_shift_uq-2.c  |   5 +
 .../gcc.target/i386/cond_op_shift_uw-1.c  |   8 ++
 .../gcc.target/i386/cond_op_shift_uw-2.c  |   6 ++
 .../gcc.target/i386/cond_op_shift_w-1.c   |   8 ++
 .../gcc.target/i386/cond_op_shift_w-2.c   |   6 ++
 14 files changed, 272 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_d-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_d-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_q-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_q-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_ud-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_ud-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_uq-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_uq-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_uw-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_uw-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_w-2.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 6aa1ea32627..129205ac3a7 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1161,6 +1161,10 @@ (define_predicate "nonimmediate_or_const_vector_operand"
   (ior (match_operand 0 "nonimmediate_operand")
(match_code "const_vector")))
 
+(define_predicate "nonimmediate_or_const_vec_dup_operand"
+  (ior (match_operand 0 "nonimmediate_operand")
+   (match_test "const_vec_duplicate_p (op)")))
+
 ;; Return true when OP is either register operand, or any
 ;; CONST_VECTOR.
 (define_predicate "reg_or_const_vector_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a46a2373547..45b1ec2add4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -506,6 +506,13 @@ (define_mode_iterator VI248_AVX512VL
(V4DI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")
(V4SI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
 
+(define_mode_iterator VI248_AVX512VLBW
+  [(V32HI "TARGET_AVX512BW")
+   (V16HI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V8HI "TARGET_AVX512VL && TARGET_AVX512BW")
+   V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
+   V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
+
 (define_mode_iterator VI48_AVX2
   [(V8SI "TARGET_AVX2") V4SI
(V4DI "TARGET_AVX2") V2DI])
@@ -22786,6 +22793,35 @@ (define_expand "vec_init"
   DONE;
 })
 
+(define_expand "cond_"
+  [(set (match_operand:VI248_AVX512VLBW 0 "register_operand")
+   (vec_merge:VI248_AVX512VLBW
+ (any_shift:VI248_AVX512VLBW
+   (match_operand:VI248_AVX512VLBW 2 "register_operand")
+   (match_operand:VI248_AVX512VLBW 3 
"nonimmediate_or_const_vec_dup_operand"))
+ (match_operand:VI248_AVX512VLBW 4 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  if (const_vec_duplicate_p (operands[3]))
+{
+  operands[3] = unwrap_const_vec_duplicate (operands[3]);
+  operands[3] = lowpart_subreg (DImode, operands[3], mode);
+  emit_insn (gen_3_mask (operands[0],
+operands[2],
+operands[3],
+operands[4],
+operand

[PATCH] Extend ldexp{s, d}f3 to vscalefs{s, d} when TARGET_AVX512F and TARGET_SSE_MATH.

2021-08-10 Thread liuhongt via Gcc-patches

Hi:
  AVX512F supports vscalefs{s,d}, which is the same as ldexp except that the 
second operand must be floating point.
  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.

gcc/ChangeLog:

PR target/98309
* config/i386/i386.md (ldexp3): Extend to vscalefs[sd]
when TARGET_AVX512F and TARGET_SSE_MATH.

gcc/testsuite/ChangeLog:

PR target/98309
* gcc.target/i386/pr98309-1.c: New test.
* gcc.target/i386/pr98309-2.c: New test.
---
 gcc/config/i386/i386.md   | 34 +++-
 gcc/testsuite/gcc.target/i386/pr98309-1.c | 18 +++
 gcc/testsuite/gcc.target/i386/pr98309-2.c | 39 +++
 3 files changed, 83 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98309-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98309-2.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bc1c30b77f4..56b09c566ed 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -17914,17 +17914,35 @@ (define_expand "ldexp3"
   [(use (match_operand:MODEF 0 "register_operand"))
(use (match_operand:MODEF 1 "general_operand"))
(use (match_operand:SI 2 "register_operand"))]
-  "TARGET_USE_FANCY_MATH_387
-   && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
-   || TARGET_MIX_SSE_I387)
+  "((TARGET_USE_FANCY_MATH_387
+ && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+|| TARGET_MIX_SSE_I387))
+|| (TARGET_AVX512F && TARGET_SSE_MATH))
&& flag_unsafe_math_optimizations"
 {
-  rtx op0 = gen_reg_rtx (XFmode);
-  rtx op1 = gen_reg_rtx (XFmode);
+  /* Prefer avx512f version.  */
+  if (TARGET_AVX512F && TARGET_SSE_MATH)
+   {
+ rtx op2 = gen_reg_rtx (mode);
+ emit_insn (gen_floatsi2 (op2, operands[2]));
+ operands[0] = lowpart_subreg (mode, operands[0], mode);
+ if (MEM_P (operands[1]))
+   operands[1] = force_reg (mode, operands[1]);
+ operands[1] = lowpart_subreg (mode, operands[1], mode);
+ op2 = lowpart_subreg (mode, op2, mode);
+ emit_insn (gen_avx512f_vmscalef (operands[0],
+  operands[1],
+  op2));
+   }
+  else
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+  rtx op1 = gen_reg_rtx (XFmode);
 
-  emit_insn (gen_extendxf2 (op1, operands[1]));
-  emit_insn (gen_ldexpxf3 (op0, op1, operands[2]));
-  emit_insn (gen_truncxf2 (operands[0], op0));
+  emit_insn (gen_extendxf2 (op1, operands[1]));
+  emit_insn (gen_ldexpxf3 (op0, op1, operands[2]));
+  emit_insn (gen_truncxf2 (operands[0], op0));
+  }
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/pr98309-1.c 
b/gcc/testsuite/gcc.target/i386/pr98309-1.c
new file mode 100644
index 000..3a7afb58971
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98309-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -mfpmath=sse -ffast-math" } */
+/* { dg-final { scan-assembler-times "vcvtsi2s\[sd\]" "2" } } */
+/* { dg-final { scan-assembler-times "vscalefs\[sd\]" "2" } } */
+
+double
+__attribute__((noipa))
+foo (double a, int b)
+{
+  return __builtin_ldexp (a, b);
+}
+
+float
+__attribute__((noipa))
+foo2 (float a, int b)
+{
+  return __builtin_ldexpf (a, b);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr98309-2.c 
b/gcc/testsuite/gcc.target/i386/pr98309-2.c
new file mode 100644
index 000..ecfb9168b7d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98309-2.c
@@ -0,0 +1,39 @@
+/* { dg-do run } */
+/* { dg-options "-mavx512f -O2 -mfpmath=sse -ffast-math" } */
+/* { dg-require-effective-target avx512f } */
+
+#define AVX512F
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#include "pr98309-1.c"
+
+double
+__attribute__((noipa, target("fpmath=387")))
+foo_i387 (double a, int b)
+{
+  return __builtin_ldexp (a, b);
+}
+
+float
+__attribute__((noipa, target("fpmath=387")))
+foo2_i387 (float a, int b)
+{
+  return __builtin_ldexpf (a, b);
+}
+
+static void
+test_512 (void)
+{
+  float fa = 14.5;
+  double da = 44.5;
+  int fb = 12;
+  int db = 8;
+  if (foo_i387 (da, db) != foo (da, db))
+abort ();
+  if (foo2_i387 (fa, fb) != foo2 (fa, fb))
+abort ();
+}
-- 
2.27.0

[PATCH] [i386] Combine avx_vec_concatv16si and avx512f_zero_extendv16hiv16si2_1 to avx512f_zero_extendv16hiv16si2_2.

2021-08-10 Thread liuhongt via Gcc-patches

Hi:
  Add define_insn_and_split to combine avx_vec_concatv16si/2 and
avx512f_zero_extendv16hiv16si2_1, since the latter already zero-extends
the upper bits; likewise for the other patterns related to
pmovzx{bw,wd,dq}.

It will do optimization like

-   vmovdqa %ymm0, %ymm0# 7 [c=4 l=6]  avx_vec_concatv16si/2
vpmovzxwd   %ymm0, %zmm0# 22[c=4 l=6]  
avx512f_zero_extendv16hiv16si2
ret # 25[c=0 l=1]  simple_return_internal

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

PR target/101846
* config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_2): New
post_reload define_insn_and_split.
(*avx512bw_zero_extendv32qiv32hi2_2): Ditto.
(*sse4_1_zero_extendv8qiv8hi2_4): Ditto.
(*avx512f_zero_extendv16hiv16si2_2): Ditto.
(*avx2_zero_extendv8hiv8si2_2): Ditto.
(*sse4_1_zero_extendv4hiv4si2_4): Ditto.
(*avx512f_zero_extendv8siv8di2_2): Ditto.
(*avx2_zero_extendv4siv4di2_2): Ditto.
(*sse4_1_zero_extendv2siv2di2_4): Ditto.

gcc/testsuite/ChangeLog:

PR target/101846
* gcc.target/i386/pr101846-1.c: New test.
---
 gcc/config/i386/sse.md | 220 +
 gcc/testsuite/gcc.target/i386/pr101846-1.c |  95 +
 2 files changed, 315 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a46a2373547..6450c058458 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -673,8 +673,14 @@ (define_mode_iterator VI12_128 [V16QI V8HI])
 (define_mode_iterator VI14_128 [V16QI V4SI])
 (define_mode_iterator VI124_128 [V16QI V8HI V4SI])
 (define_mode_iterator VI24_128 [V8HI V4SI])
+(define_mode_iterator VI128_128 [V16QI V8HI V2DI])
 (define_mode_iterator VI248_128 [V8HI V4SI V2DI])
+(define_mode_iterator VI248_256 [V16HI V8SI V4DI])
+(define_mode_iterator VI248_512 [V32HI V16SI V8DI])
 (define_mode_iterator VI48_128 [V4SI V2DI])
+(define_mode_iterator VI148_512 [V64QI V16SI V8DI])
+(define_mode_iterator VI148_256 [V32QI V8SI V4DI])
+(define_mode_iterator VI148_128 [V16QI V4SI V2DI])
 
 ;; Various 256bit and 512 vector integer mode combinations
 (define_mode_iterator VI124_256 [V32QI V16HI V8SI])
@@ -18499,6 +18505,26 @@ (define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
   operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_2"
+  [(set (match_operand:V32QI 0 "register_operand" "=v")
+   (vec_select:V32QI
+ (vec_concat:V64QI
+   (subreg:V32QI
+ (vec_concat:VI248_256
+   (match_operand: 1 "nonimmediate_operand" "vm")
+   (match_operand: 2 "const0_operand" "C")) 0)
+   (match_operand:V32QI 3 "const0_operand" "C"))
+ (match_parallel 4 "pmovzx_parallel"
+   [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
+  operands[1] = lowpart_subreg (V16QImode, operands[1], mode);
+})
+
 (define_expand "v16qiv16hi2"
   [(set (match_operand:V16HI 0 "register_operand")
(any_extend:V16HI
@@ -18533,6 +18559,26 @@ (define_insn_and_split 
"*avx512bw_zero_extendv32qiv32hi2_1"
   operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
 })
 
+(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_2"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+   (vec_select:V64QI
+ (vec_concat:V128QI
+   (subreg:V64QI
+ (vec_concat:VI248_512
+   (match_operand: 1 "nonimmediate_operand" "vm")
+   (match_operand: 2 "const0_operand" "C")) 0)
+   (match_operand:V64QI 3 "const0_operand" "C"))
+ (match_parallel 4 "pmovzx_parallel"
+   [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
+  operands[1] = lowpart_subreg (V32QImode, operands[1], mode);
+})
+
 (define_expand "v32qiv32hi2"
   [(set (match_operand:V32HI 0 "register_operand")
(any_extend:V32HI
@@ -18619,6 +18665,41 @@ (define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_3"
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_4"
+  [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,Yw")
+   (vec_select:V16QI
+ (vec_concat:V32QI
+   (subreg:V16QI
+ (vec_concat:VI248_128
+   (match_operand: 1 "vector_operand" 
"YrBm,*xBm,Ywm")
+   (match_operand: 2 "const0_operand" "C,C,C")) 0)
+   (match_operand:V16QI 3 "const0_operand" "C,C,C"))
+ (match_parallel 4 "pm

[PATCH] [i386] Introduce a scalar version of avx512f_vmscalef and adjust ldexp3 for it.

2021-08-11 Thread liuhongt via Gcc-patches

Hi:
  This is the patch I'm going to check in.
  Bootstrapped and regtested on x86_64-linux-gnu{-m32,};


2021-08-12  Uros Bizjak  

gcc/ChangeLog:

PR target/98309
* config/i386/i386.md (avx512f_scalef2): New
define_insn.
(ldexp3): Adjust for new define_insn.
(UNSPEC_SCALEF): Move from sse.md.
* config/i386/sse.md (UNSPEC_SCALEF): Move to i386.md.
---
 gcc/config/i386/i386.md | 27 +++
 gcc/config/i386/sse.md  |  1 -
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 56b09c566ed..4a8e8fea290 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -125,6 +125,9 @@ (define_c_enum "unspec" [
   UNSPEC_RSQRT
   UNSPEC_PSADBW
 
+  ;; For AVX512F support
+  UNSPEC_SCALEF
+
   ;; Generic math support
   UNSPEC_COPYSIGN
   UNSPEC_XORSIGN
@@ -17894,6 +17897,17 @@ (define_expand "expm12"
   DONE;
 })
 
+(define_insn "avx512f_scalef2"
+  [(set (match_operand:MODEF 0 "register_operand" "=v")
+   (unspec:MODEF
+ [(match_operand:MODEF 1 "register_operand" "v")
+  (match_operand:MODEF 2 "nonimmediate_operand" "vm")]
+ UNSPEC_SCALEF))]
+  "TARGET_AVX512F"
+  "vscalef\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "")])
+
 (define_expand "ldexpxf3"
   [(match_operand:XF 0 "register_operand")
(match_operand:XF 1 "register_operand")
@@ -17924,15 +17938,12 @@ (define_expand "ldexp3"
   if (TARGET_AVX512F && TARGET_SSE_MATH)
{
  rtx op2 = gen_reg_rtx (mode);
- emit_insn (gen_floatsi2 (op2, operands[2]));
- operands[0] = lowpart_subreg (mode, operands[0], mode);
- if (MEM_P (operands[1]))
+
+ if (!nonimmediate_operand (operands[1], mode))
operands[1] = force_reg (mode, operands[1]);
- operands[1] = lowpart_subreg (mode, operands[1], mode);
- op2 = lowpart_subreg (mode, op2, mode);
- emit_insn (gen_avx512f_vmscalef (operands[0],
-  operands[1],
-  op2));
+
+ emit_insn (gen_floatsi2 (op2, operands[2]));
+ emit_insn (gen_avx512f_scalef2 (operands[0], operands[1], op2));
}
   else
 {
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3957c86c3df..9233dfc6150 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -92,7 +92,6 @@ (define_c_enum "unspec" [
   UNSPEC_RCP14
   UNSPEC_RSQRT14
   UNSPEC_FIXUPIMM
-  UNSPEC_SCALEF
   UNSPEC_VTERNLOG
   UNSPEC_GETEXP
   UNSPEC_GETMANT
-- 
2.18.1

[PATCH] [i386] Optimize vec_perm_expr to match vpmov{dw,qd,wb}.

2021-08-11 Thread liuhongt via Gcc-patches

Hi:
  This is another patch to optimize vec_perm_expr to match vpmov{dw,qd,wb}
under AVX512.
  For scenarios(like pr101846-2.c) where the upper half is not used, this patch
generates better code with only one vpmov{wb,dw,qd} instruction. For
scenarios(like pr101846-3.c) where the upper half is actually used,  if the src
vector length is 256/512bits, the patch can still generate better code, but for
128bits, the code generation is worse.

128 bits upper half not used.

-   vpshufb .LC2(%rip), %xmm0, %xmm0
+   vpmovdw %xmm0, %xmm0

128 bits upper half used.
-   vpshufb .LC2(%rip), %xmm0, %xmm0
+   vpmovdw %xmm0, %xmm1
+   vmovq   %xmm1, %rax
+   vpinsrq $0, %rax, %xmm0, %xmm0

  Maybe expand_vec_perm_trunc_vinsert should only deal with 256/512-bit
vectors, but since real-world uses like foo_*_128 in pr101846-3.c are
relatively unlikely, I have kept this part of the code.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

PR target/101846
* config/i386/i386-expand.c (expand_vec_perm_trunc_vinsert):
New function.
(ix86_vectorize_vec_perm_const): Call
expand_vec_perm_trunc_vinsert.
* config/i386/sse.md (vec_set_lo_v32hi): New define_insn.
(vec_set_lo_v64qi): Ditto.
(vec_set_lo_): Extend to no-avx512dq.

gcc/testsuite/ChangeLog:

PR target/101846
* gcc.target/i386/pr101846-2.c: New test.
* gcc.target/i386/pr101846-3.c: New test.
---
 gcc/config/i386/i386-expand.c  | 125 +
 gcc/config/i386/sse.md |  60 +-
 gcc/testsuite/gcc.target/i386/pr101846-2.c |  81 +
 gcc/testsuite/gcc.target/i386/pr101846-3.c |  95 
 4 files changed, 359 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-3.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index bd21efa9530..519caac2e15 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -18317,6 +18317,126 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   return false;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
+   in terms of a pair of vpmovdw + vinserti128 instructions.  */
+static bool
+expand_vec_perm_trunc_vinsert (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt, mask = d->nelt - 1;
+  unsigned half = nelt / 2;
+  machine_mode half_mode, trunc_mode;
+
+  /* vpmov{wb,dw,qd} only available under AVX512.  */
+  if (!d->one_operand_p || !TARGET_AVX512F
+  || (!TARGET_AVX512VL  && GET_MODE_SIZE (d->vmode) < 64)
+  || GET_MODE_SIZE (GET_MODE_INNER (d->vmode)) > 4)
+return false;
+
+  /* TARGET_AVX512BW is needed for vpmovwb.  */
+  if (GET_MODE_INNER (d->vmode) == E_QImode && !TARGET_AVX512BW)
+return false;
+
+  for (i = 0; i < nelt; i++)
+{
+  unsigned idx = d->perm[i] & mask;
+  if (idx != i * 2 && i < half)
+   return false;
+  if (idx != i && i >= half)
+   return false;
+}
+
+  rtx (*gen_trunc) (rtx, rtx) = NULL;
+  rtx (*gen_vec_set_lo) (rtx, rtx, rtx) = NULL;
+  switch (d->vmode)
+{
+case E_V16QImode:
+  gen_trunc = gen_truncv8hiv8qi2;
+  gen_vec_set_lo = gen_vec_setv2di;
+  half_mode = V8QImode;
+  trunc_mode = V8HImode;
+  break;
+case E_V32QImode:
+  gen_trunc = gen_truncv16hiv16qi2;
+  gen_vec_set_lo = gen_vec_set_lo_v32qi;
+  half_mode = V16QImode;
+  trunc_mode = V16HImode;
+  break;
+case E_V64QImode:
+  gen_trunc = gen_truncv32hiv32qi2;
+  gen_vec_set_lo = gen_vec_set_lo_v64qi;
+  half_mode = V32QImode;
+  trunc_mode = V32HImode;
+  break;
+case E_V8HImode:
+  gen_trunc = gen_truncv4siv4hi2;
+  gen_vec_set_lo = gen_vec_setv2di;
+  half_mode = V4HImode;
+  trunc_mode = V4SImode;
+  break;
+case E_V16HImode:
+  gen_trunc = gen_truncv8siv8hi2;
+  gen_vec_set_lo = gen_vec_set_lo_v16hi;
+  half_mode = V8HImode;
+  trunc_mode = V8SImode;
+  break;
+case E_V32HImode:
+  gen_trunc = gen_truncv16siv16hi2;
+  gen_vec_set_lo = gen_vec_set_lo_v32hi;
+  half_mode = V16HImode;
+  trunc_mode = V16SImode;
+  break;
+case E_V4SImode:
+  gen_trunc = gen_truncv2div2si2;
+  gen_vec_set_lo = gen_vec_setv2di;
+  half_mode = V2SImode;
+  trunc_mode = V2DImode;
+  break;
+case E_V8SImode:
+  gen_trunc = gen_truncv4div4si2;
+  gen_vec_set_lo = gen_vec_set_lo_v8si;
+  half_mode = V4SImode;
+  trunc_mode = V4DImode;
+  break;
+case E_V16SImode:
+  gen_trunc = gen_truncv8div8si2;
+  gen_vec_set_lo = gen_vec_set_lo_v16si;
+  half_mode = V8SImode;
+  trunc_mode = V8DImode;
+  break;
+
+default:
+  break;
+}
+
+  if (gen_trunc == NULL)
+return false;
+
+  rtx op_half

[PATCH] [i386] Optimize __builtin_shuffle_vector.

2021-08-15 Thread liuhongt via Gcc-patches

Hi:
  Here's updated patch which does 3 things:
1. Support vpermw/vpermb in ix86_expand_vec_one_operand_perm_avx512.
2. Support 256/128-bits vpermi2b in ix86_expand_vec_perm_vpermt2.
3. Add define_insn_and_split to optimize specific vector permutation to 
vpmov{dw,wb,qd}.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

PR target/101846
* config/i386/i386-expand.c (ix86_expand_vec_perm_vpermt2):
Support vpermi2b for V32QI/V16QImode.
(ix86_extract_perm_from_pool_constant): New function.
(ix86_expand_vec_one_operand_perm_avx512): Support
vpermw/vpermb under TARGET_AVX512BW/TARGET_AVX512VBMI.
(expand_vec_perm_1): Adjust comments for upper.
* config/i386/i386-protos.h (ix86_extract_perm_from_pool_constant):
New declare.
* config/i386/predicates.md (permvar_truncate_operand): New predicate.
(pshufb_truncv4siv4hi_operand): Ditto.
(pshufb_truncv8hiv8qi_operand): Ditto.
* config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1):
New pre_reload define_insn_and_split.
(*avx512f_permvar_truncv8siv8hi_1): Ditto.
(*avx512f_vpermvar_truncv8div8si_1): Ditto.
(*avx512f_permvar_truncv32hiv32qi_1): Ditto.
(*avx512f_permvar_truncv16hiv16qi_1): Ditto.
(*avx512f_permvar_truncv4div4si_1): Ditto.
(*avx512f_pshufb_truncv8hiv8qi_1): Ditto.
(*avx512f_pshufb_truncv4siv4hi_1): Ditto.
(*avx512f_pshufd_truncv2div2si_1): Ditto.

gcc/testsuite/ChangeLog:

PR target/101846
* gcc.target/i386/pr101846-2.c: New test.
* gcc.target/i386/pr101846-3.c: New test.
* gcc.target/i386/pr101846-4.c: New test.
---
 gcc/config/i386/i386-expand.c  |  89 +-
 gcc/config/i386/i386-protos.h  |   1 +
 gcc/config/i386/predicates.md  |  90 ++
 gcc/config/i386/sse.md | 190 +
 gcc/testsuite/gcc.target/i386/pr101846-2.c |  81 +
 gcc/testsuite/gcc.target/i386/pr101846-3.c |  73 
 gcc/testsuite/gcc.target/i386/pr101846-4.c |  40 +
 7 files changed, 559 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-4.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index a652b25f534..56319cb6f6a 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -4778,6 +4778,18 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx 
op0, rtx op1,
 
   switch (mode)
 {
+case E_V16QImode:
+  if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+   gen = gen_avx512vl_vpermt2varv16qi3;
+  break;
+case E_V32QImode:
+  if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+   gen = gen_avx512vl_vpermt2varv32qi3;
+  break;
+case E_V64QImode:
+  if (TARGET_AVX512VBMI)
+   gen = gen_avx512bw_vpermt2varv64qi3;
+  break;
 case E_V8HImode:
   if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermt2varv8hi3;
@@ -4786,10 +4798,6 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx 
op0, rtx op1,
   if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermt2varv16hi3;
   break;
-case E_V64QImode:
-  if (TARGET_AVX512VBMI)
-   gen = gen_avx512bw_vpermt2varv64qi3;
-  break;
 case E_V32HImode:
   if (TARGET_AVX512BW)
gen = gen_avx512bw_vpermt2varv32hi3;
@@ -5487,6 +5495,45 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool 
unsigned_p, bool high_p)
 }
 }
 
+/* Return true if mem is pool constant which contains a const_vector
+   perm index, assign the index to PERM.  */
+bool
+ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
+{
+  machine_mode mode = GET_MODE (mem);
+  int nelt = GET_MODE_NUNITS (mode);
+
+  if (!INTEGRAL_MODE_P (mode))
+return false;
+
+/* Needs to be constant pool.  */
+  if (!(MEM_P (mem))
+  || !SYMBOL_REF_P (XEXP (mem, 0))
+  || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
+   return false;
+
+  rtx constant = get_pool_constant (XEXP (mem, 0));
+
+  if (GET_CODE (constant) != CONST_VECTOR)
+return false;
+
+  /* There could be some rtx like
+ (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+ but with "*.LC1" refer to V2DI constant vector.  */
+  if (GET_MODE (constant) != mode)
+{
+  constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
+
+  if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+   return false;
+}
+
+  for (int i = 0; i != nelt; i++)
+perm[i] = UINTVAL (XVECEXP (constant, 0, i));
+
+  return true;
+}
+
 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
but works for floating pointer parameters and nonoffsetable memories.
For pushes, it returns just stack offsets;

[PATCH] [i386] Fix ICE.

2021-08-16 Thread liuhongt via Gcc-patches

Hi:
  avx512f_scalef2 only accepts register_operand for operands[1], so
force it to a register in ldexp3.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

PR target/101930
* config/i386/i386.md (ldexp3): Force operands[1] to
reg.

gcc/testsuite/ChangeLog:

PR target/101930
* gcc.target/i386/pr101930.c: New test.
---
 gcc/config/i386/i386.md  | 4 +---
 gcc/testsuite/gcc.target/i386/pr101930.c | 9 +
 2 files changed, 10 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101930.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4a8e8fea290..41d85623ad6 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -17938,9 +17938,7 @@ (define_expand "ldexp3"
   if (TARGET_AVX512F && TARGET_SSE_MATH)
{
  rtx op2 = gen_reg_rtx (mode);
-
- if (!nonimmediate_operand (operands[1], mode))
-   operands[1] = force_reg (mode, operands[1]);
+ operands[1] = force_reg (mode, operands[1]);
 
  emit_insn (gen_floatsi2 (op2, operands[2]));
  emit_insn (gen_avx512f_scalef2 (operands[0], operands[1], op2));
diff --git a/gcc/testsuite/gcc.target/i386/pr101930.c 
b/gcc/testsuite/gcc.target/i386/pr101930.c
new file mode 100644
index 000..7207dd18377
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101930.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -mfpmath=sse -ffast-math" } */
+double a;
+double
+__attribute__((noipa))
+foo (int b)
+{
+  return __builtin_ldexp (a, b);
+}
-- 
2.27.0

[PATCH] [i386] Add x86 tune to enable v2df vector reduction by haddpd.

2021-08-17 Thread liuhongt via Gcc-patches

Hi:
  This patch adds a new x86 tune named X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD
to enable haddpd for v2df vector reduction; the tune is disabled by default.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}
  Ok for trunk?

gcc/ChangeLog:

PR target/97147
* config/i386/i386.h (TARGET_V2DF_REDUCTION_PREFER_HADDPD):
New macro.
* config/i386/sse.md (*sse3_haddv2df3_low): Add
TARGET_V2DF_REDUCTION_PREFER_HADDPD.
(*sse3_hsubv2df3_low): Ditto.
* config/i386/x86-tune.def
(X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD): New tune.

gcc/testsuite/ChangeLog:

PR target/97147
* gcc.target/i386/pr54400.c: Adjust testcase.
* gcc.target/i386/pr94147.c: New test.
---
 gcc/config/i386/i386.h  |  2 ++
 gcc/config/i386/sse.md  |  4 ++--
 gcc/config/i386/x86-tune.def|  5 +
 gcc/testsuite/gcc.target/i386/pr54400.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr94147.c | 22 ++
 5 files changed, 32 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94147.c

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 21fe51bba40..b3e57a83846 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -418,6 +418,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_EMIT_VZEROUPPER]
 #define TARGET_EXPAND_ABS \
ix86_tune_features[X86_TUNE_EXPAND_ABS]
+#define TARGET_V2DF_REDUCTION_PREFER_HADDPD \
+   ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 27e25cc7952..13889687793 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2771,7 +2771,7 @@ (define_insn "*sse3_haddv2df3_low"
  (vec_select:DF
(match_dup 1)
(parallel [(match_operand:SI 3 "const_0_to_1_operand")]]
-  "TARGET_SSE3
+  "TARGET_SSE3 && TARGET_V2DF_REDUCTION_PREFER_HADDPD
&& INTVAL (operands[2]) != INTVAL (operands[3])"
   "@
haddpd\t{%0, %0|%0, %0}
@@ -2790,7 +2790,7 @@ (define_insn "*sse3_hsubv2df3_low"
  (vec_select:DF
(match_dup 1)
(parallel [(const_int 1)]]
-  "TARGET_SSE3"
+  "TARGET_SSE3 && TARGET_V2DF_REDUCTION_PREFER_HADDPD"
   "@
hsubpd\t{%0, %0|%0, %0}
vhsubpd\t{%1, %1, %0|%0, %1, %1}"
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index eb057a67750..8f55da89c92 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -452,6 +452,11 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, 
"avoid_fma_chains", m_ZNVER)
smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3)
 
+/* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
+   for v2df vector reduction.  */
+DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
+ "v2df_reduction_prefer_haddpd", m_NONE)
+
 /*/
 /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */
 /*/
diff --git a/gcc/testsuite/gcc.target/i386/pr54400.c 
b/gcc/testsuite/gcc.target/i386/pr54400.c
index 5ed5ba06644..3a450376b9e 100644
--- a/gcc/testsuite/gcc.target/i386/pr54400.c
+++ b/gcc/testsuite/gcc.target/i386/pr54400.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse3 -mfpmath=sse" } */
+/* { dg-options "-O2 -msse3 -mfpmath=sse 
-mtune-ctrl=v2df_reduction_prefer_haddpd" } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/pr94147.c 
b/gcc/testsuite/gcc.target/i386/pr94147.c
new file mode 100644
index 000..8ff5c34834f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr94147.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse3 -mfpmath=sse" } */
+
+#include <x86intrin.h>
+
+double f (__m128d p)
+{
+  return p[0] - p[1];
+}
+
+double g1 (__m128d p)
+{
+  return p[0] + p[1];
+}
+
+double g2 (__m128d p)
+{
+  return p[1] + p[0];
+}
+
+/* { dg-final { scan-assembler-not "hsubpd" } } */
+/* { dg-final { scan-assembler-not "haddpd" } } */
-- 
2.18.1

[PATCH] Revert "Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct."

2021-08-17 Thread liuhongt via Gcc-patches

This reverts commit 872da9a6f664a06d73c987aa0cb2e5b830158a10.

PR target/101936
PR target/101929

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}
  Pushed to master.

---
 gcc/config/i386/i386.c  |  6 +-
 gcc/config/i386/i386.h  |  1 -
 gcc/config/i386/x86-tune-costs.h| 26 -
 gcc/testsuite/gcc.target/i386/pr99881.c |  2 +-
 4 files changed, 2 insertions(+), 33 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 4d4ab6a03d6..46844fab08f 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -22203,11 +22203,7 @@ ix86_builtin_vectorization_cost (enum 
vect_cost_for_stmt type_of_cost,
   case vec_construct:
{
  /* N element inserts into SSE vectors.  */
- int cost
-   = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
-   ix86_cost->sse_op
-   : ix86_cost->integer_to_sse);
-
+ int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
  /* One vinserti128 for combining two SSE vectors for AVX256.  */
  if (GET_MODE_BITSIZE (mode) == 256)
cost += ix86_vec_cost (mode, ix86_cost->addss);
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 21fe51bba40..f9241df3b3d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -165,7 +165,6 @@ struct processor_costs {
   const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
zmm_move;
   const int sse_to_integer;/* cost of moving SSE register to integer.  */
-  const int integer_to_sse;/* cost of moving integer to SSE register.  */
   const int gather_static, gather_per_elt; /* Cost of gather load is computed
   as static + per_item * nelts. */
   const int scatter_static, scatter_per_elt; /* Cost of gather store is
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 67cfa006196..ffe810f2bcb 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -102,7 +102,6 @@ struct processor_costs ix86_size_cost = {/* costs for 
tuning for size */
   in 128bit, 256bit and 512bit */
   3, 3, 3, /* cost of moving XMM,YMM,ZMM register 
*/
   3,   /* cost of moving SSE register to 
integer.  */
-  COSTS_N_BYTES (2),   /* cost of moving integer to sse 
register.  */
   5, 0,/* Gather load static, per_elt. 
 */
   5, 0,/* Gather store static, 
per_elt.  */
   0,   /* size of l1 cache  */
@@ -212,7 +211,6 @@ struct processor_costs i386_cost = {/* 386 specific 
costs */
   {4, 8, 16, 32, 64},  /* cost of unaligned stores.  */
   2, 4, 8, /* cost of moving XMM,YMM,ZMM register 
*/
   3,   /* cost of moving SSE register to 
integer.  */
-  COSTS_N_INSNS (1),   /* cost of moving integer to sse 
register.  */
   4, 4,/* Gather load static, per_elt. 
 */
   4, 4,/* Gather store static, 
per_elt.  */
   0,   /* size of l1 cache  */
@@ -321,7 +319,6 @@ struct processor_costs i486_cost = {/* 486 specific 
costs */
   {4, 8, 16, 32, 64},  /* cost of unaligned stores.  */
   2, 4, 8, /* cost of moving XMM,YMM,ZMM register 
*/
   3,   /* cost of moving SSE register to 
integer.  */
-  COSTS_N_INSNS (1),   /* cost of moving integer to sse 
register.  */
   4, 4,/* Gather load static, per_elt. 
 */
   4, 4,/* Gather store static, 
per_elt.  */
   4,   /* size of l1 cache.  486 has 8kB cache
@@ -432,7 +429,6 @@ struct processor_costs pentium_cost = {
   {4, 8, 16, 32, 64},  /* cost of unaligned stores.  */
   2, 4, 8, /* cost of moving XMM,YMM,ZMM register 
*/
   3,   /* cost of moving SSE register to 
integer.  */
-  COSTS_N_INSNS (1),   /* cost of moving integer to sse 
register.  */
   4, 4,/* Gather load static, per_elt. 
 */
   4, 4,/* Gather store static, 
per_elt.  */
   8,   /* size of l1 cache.  */
@@ -534,7 +530,6 @@ struct processor_costs lakemont_cost = {
   {4, 8, 16, 32, 64},  /* cost of unaligned stores.  */
   2, 4, 8, /* cost of moving XMM,YMM,ZMM register 
*/
   3,

[PATCH] Disable slp in loop vectorizer when cost model is very-cheap.

2021-08-22 Thread liuhongt via Gcc-patches

Performance impact for the commit with option:
-march=x86-64 -O2 -ftree-vectorize -fvect-cost-model=very-cheap

SPEC2017 fprate
503.bwaves_r     BuildSame
507.cactuBSSN_r  -0.04
508.namd_r        0.14
510.parest_r     -0.54
511.povray_r      0.10
519.lbm_r        BuildSame
521.wrf_r         0.64
526.blender_r    -0.32
527.cam4_r        0.17
538.imagick_r     0.09
544.nab_r        BuildSame
549.fotonik3d_r  BuildSame
554.roms_r       BuildSame
997.specrand_fr  -0.09
Geometric mean:   0.02

SPEC2017 intrate
500.perlbench_r   0.26
502.gcc_r         0.21
505.mcf_r        -0.09
520.omnetpp_r    BuildSame
523.xalancbmk_r  BuildSame
525.x264_r       -0.41
531.deepsjeng_r  BuildSame
541.leela_r       0.13
548.exchange2_r  BuildSame
557.xz_r         BuildSame
999.specrand_ir  BuildSame
Geometric mean:   0.02

EEMBC: no regressions; every benchmark either improves or builds the
same.  The improved benchmarks are listed below.

mp2decoddata1   7.59
mp2decoddata2   31.80
mp2decoddata3   12.15
mp2decoddata4   11.16
mp2decoddata5   11.19
mp2decoddata1   7.06
mp2decoddata2   24.12
mp2decoddata3   10.83
mp2decoddata4   10.04
mp2decoddata5   10.07

Survived regression test.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/100089
* tree-vectorizer.c (try_vectorize_loop_1): Disable slp in
loop vectorizer when cost model is very-cheap.
---
 gcc/tree-vectorizer.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index b9709a613d5..8a5b8735546 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1033,7 +1033,10 @@ try_vectorize_loop_1 (hash_table 
*&simduid_to_vf_htab,
 only non-if-converted parts took part in BB vectorization.  */
   if (flag_tree_slp_vectorize != 0
  && loop_vectorized_call
- && ! loop->inner)
+ && ! loop->inner
+ /* This would purely be a workaround and should be removed
+once the PR100089 is fixed.  */
+ && flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
{
  basic_block bb = loop->header;
  bool require_loop_vectorize = false;
-- 
2.18.1

[PATCH] [i386] Fix ICE.

2021-08-23 Thread liuhongt via Gcc-patches

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Pushed to trunk.

gcc/ChangeLog:

PR target/102016
* config/i386/sse.md (*avx512f_pshufb_truncv8hiv8qi_1): Add
TARGET_AVX512BW to condition.

gcc/testsuite/ChangeLog:

PR target/102016
* gcc.target/i386/pr102016.c: New test.
---
 gcc/config/i386/sse.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr102016.c | 10 ++
 2 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102016.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 13889687793..95f95823ea3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -11256,7 +11256,7 @@ (define_insn_and_split "*avx512f_pshufb_truncv8hiv8qi_1"
   (match_operand:V16QI 2 "pshufb_truncv8hiv8qi_operand")]
   UNSPEC_PSHUFB) 0)
  (parallel [(const_int 0)])))]
-  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
   "#"
   "&& 1"
   [(const_int 0)]
diff --git a/gcc/testsuite/gcc.target/i386/pr102016.c 
b/gcc/testsuite/gcc.target/i386/pr102016.c
new file mode 100644
index 000..2ff75cb898a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102016.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -O2" } */
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+v8qi
+foo_wb_128 (v16qi x)
+{
+  return __builtin_shufflevector (x, x,
+ 0, 2, 4, 6, 8, 10, 12, 14);
+}
-- 
2.27.0

[PATCH] [i386] Optimize (a & b) | (c & ~b) to vpternlog instruction.

2021-08-23 Thread liuhongt via Gcc-patches

Also optimize the 3 forms below to vpternlog, where op1, op2 and op3
are each either a register_operand or a unary (not reg):

A: (any_logic (any_logic op1 op2) op3)
B: (any_logic (any_logic op1 op2) (any_logic op3 op4)), where op3/op4
should be equal to op1/op2
C: (any_logic (any_logic (any_logic op1 op2) op3) op4), where op3/op4
should be equal to op1/op2

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.

gcc/ChangeLog:

PR target/101989
* config/i386/i386-protos.h
(ix86_strip_reg_or_notreg_operand): New declaration.
* config/i386/i386.c (ix86_rtx_costs): Define cost for
UNSPEC_VTERNLOG.
(ix86_strip_reg_or_notreg_operand): New function.
* config/i386/predicates.md (reg_or_notreg_operand): New
predicate.
* config/i386/sse.md (*_vternlog_all): New define_insn.
(*_vternlog_1): New pre_reload
define_insn_and_split.
(*_vternlog_2): Ditto.
(*_vternlog_3): Ditto.
(any_logic1,any_logic2): New code iterator.
(logic_op): New code attribute.
(ternlogsuffix): Extend to VNxDF and VNxSF.

gcc/testsuite/ChangeLog:

PR target/101989
* gcc.target/i386/pr101989-1.c: New test.
* gcc.target/i386/pr101989-2.c: New test.
* gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase.
---
 gcc/config/i386/i386-protos.h |   1 +
 gcc/config/i386/i386.c|  13 +
 gcc/config/i386/predicates.md |   7 +
 gcc/config/i386/sse.md| 234 ++
 .../i386/avx512bw-shiftqihi-constant-1.c  |   4 +-
 gcc/testsuite/gcc.target/i386/pr101989-1.c|  51 
 gcc/testsuite/gcc.target/i386/pr101989-2.c| 102 
 7 files changed, 410 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 2fd13074c81..2bdaadcf4f3 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -60,6 +60,7 @@ extern rtx standard_80387_constant_rtx (int);
 extern int standard_sse_constant_p (rtx, machine_mode);
 extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *);
 extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx);
+extern rtx ix86_strip_reg_or_notreg_operand (rtx);
 extern bool ix86_pre_reload_split (void);
 extern bool symbolic_reference_mentioned_p (rtx);
 extern bool extended_reg_mentioned_p (rtx);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 46844fab08f..a69225ccc81 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -5236,6 +5236,14 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn 
*insn, rtx dst)
   return true;
 }
 
+/* If OP is a unary operation (e.g. (not reg)), return its operand;
+   otherwise return OP itself.  */
+rtx
+ix86_strip_reg_or_notreg_operand (rtx op)
+{
+  return UNARY_P (op) ? XEXP (op, 0) : op;
+}
+
 /* Predicate for pre-reload splitters with associated instructions,
which can match any time before the split1 pass (usually combine),
then are unconditionally split in that pass and should not be
@@ -20544,6 +20552,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 case UNSPEC:
   if (XINT (x, 1) == UNSPEC_TP)
*total = 0;
+  else if (XINT(x, 1) == UNSPEC_VTERNLOG)
+   {
+ *total = cost->sse_op;
+ return true;
+   }
   return false;
 
 case VEC_SELECT:
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 9321f332ef9..df5acb425d4 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1044,6 +1044,13 @@ (define_predicate "reg_or_pm1_operand"
(ior (match_test "op == const1_rtx")
 (match_test "op == constm1_rtx")
 
+;; True for registers, or (not: registers).  Used to optimize 3-operand
+;; bitwise operation.
+(define_predicate "reg_or_notreg_operand"
+  (ior (match_operand 0 "register_operand")
+   (and (match_code "not")
+   (match_test "register_operand (XEXP (op, 0), mode)"
+
 ;; True if OP is acceptable as operand of DImode shift expander.
 (define_predicate "shiftdi_operand"
   (if_then_else (match_test "TARGET_64BIT")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 13889687793..0acd749d21c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -933,7 +933,9 @@ (define_mode_attr iptr
 ;; Mapping of vector modes to VPTERNLOG suffix
 (define_mode_attr ternlogsuffix
   [(V8DI "q") (V4DI "q") (V2DI "q")
+   (V8DF "q") (V4DF "q") (V2DF "q")
(V16SI "d") (V8SI "d") (V4SI "d")
+   (V16SF "d") (V8SF "d") (V4SF "d")
(V32HI "d") (V16HI "d") (V8HI "d")
(V64QI "d") (V32QI "d") (V16QI "d")])
 
@@ -10041,6 +10043,238 @@ (define_insn "_vternlog"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "*_v

[PATCH] Change illegitimate constant into memref of constant pool in change_zero_ext.

2021-08-24 Thread liuhongt via Gcc-patches

Hi:
  This patch extends change_zero_ext to change illegitimate constants
into constant pool references; this enables the simplification below:

Trying 5 -> 7:
5: r85:V4SF=[`*.LC0']
  REG_EQUAL const_vector
7: r84:V4SF=vec_select(vec_concat(r85:V4SF,r85:V4SF),parallel)
  REG_DEAD r85:V4SF
  REG_EQUAL const_vector
Failed to match this instruction:
(set (reg:V4SF 84)
(const_vector:V4SF [
(const_double:SF 3.0e+0 [0x0.cp+2])
(const_double:SF 2.0e+0 [0x0.8p+2])
(const_double:SF 4.0e+0 [0x0.8p+3])
(const_double:SF 1.0e+0 [0x0.8p+1])
]))

(insn 5 2 7 2 (set (reg:V4SF 85)
(mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0  S16
A128])) 
1600 {movv4sf_internal}
 (expr_list:REG_EQUAL (const_vector:V4SF [
(const_double:SF 4.0e+0 [0x0.8p+3])
(const_double:SF 3.0e+0 [0x0.cp+2])
(const_double:SF 2.0e+0 [0x0.8p+2])
(const_double:SF 1.0e+0 [0x0.8p+1])
])
(nil)))
(insn 7 5 11 2 (set (reg:V4SF 84)
(vec_select:V4SF (vec_concat:V8SF (reg:V4SF 85)
(reg:V4SF 85))
(parallel [
(const_int 1 [0x1])
(const_int 2 [0x2])
(const_int 4 [0x4])
(const_int 7 [0x7])
])))
3015 {sse_shufps_v4sf}
 (expr_list:REG_DEAD (reg:V4SF 85)
(expr_list:REG_EQUAL (const_vector:V4SF [
(const_double:SF 3.0e+0 [0x0.cp+2])
(const_double:SF 2.0e+0 [0x0.8p+2])
(const_double:SF 4.0e+0 [0x0.8p+3])
(const_double:SF 1.0e+0 [0x0.8p+1])
])
(nil

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

PR rtl-optimization/43147
* combine.c (recog_for_combine_1): Adjust comments of ..
(change_zero_ext):.. this, and extend to change illegitimate
constant into constant pool.

gcc/testsuite/ChangeLog:

PR rtl-optimization/43147
* gcc.target/i386/pr43147.c: New test.
* gcc.target/i386/pr22076.c: Adjust testcase.
---
 gcc/combine.c   | 20 +++-
 gcc/testsuite/gcc.target/i386/pr22076.c |  4 ++--
 gcc/testsuite/gcc.target/i386/pr43147.c | 15 +++
 3 files changed, 36 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr43147.c

diff --git a/gcc/combine.c b/gcc/combine.c
index cb5fa401fcb..0b2afdf45af 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -11404,7 +11404,8 @@ recog_for_combine_1 (rtx *pnewpat, rtx_insn *insn, rtx 
*pnotes)
 
 /* Change every ZERO_EXTRACT and ZERO_EXTEND of a SUBREG that can be
expressed as an AND and maybe an LSHIFTRT, to that formulation.
-   Return whether anything was so changed.  */
+   Return whether anything was so changed.
+   Also change illegitimate constant into memref of constant pool.  */
 
 static bool
 change_zero_ext (rtx pat)
@@ -11417,6 +11418,23 @@ change_zero_ext (rtx pat)
 {
   rtx x = **iter;
   scalar_int_mode mode, inner_mode;
+  machine_mode const_mode = GET_MODE (x);
+
+  /* Change illegitimate constant into memref of constant pool.  */
+  if (CONSTANT_P (x)
+ && !const_vec_duplicate_p (x)
+ && const_mode != BLKmode
+ && GET_CODE (x) != HIGH
+ && GET_MODE_SIZE (const_mode).is_constant ()
+ && !targetm.legitimate_constant_p (const_mode, x)
+ && !targetm.cannot_force_const_mem (const_mode, x))
+   {
+ x = force_const_mem (GET_MODE (x), x);
+ SUBST (**iter, x);
+ changed = true;
+ continue;
+   }
+
   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
continue;
   int size;
diff --git a/gcc/testsuite/gcc.target/i386/pr22076.c 
b/gcc/testsuite/gcc.target/i386/pr22076.c
index 427ffcd4920..866c387280f 100644
--- a/gcc/testsuite/gcc.target/i386/pr22076.c
+++ b/gcc/testsuite/gcc.target/i386/pr22076.c
@@ -15,5 +15,5 @@ void test ()
   x = _mm_add_pi8 (mm0, mm1);
 }
 
-/* { dg-final { scan-assembler-times "movq" 2 } } */
-/* { dg-final { scan-assembler-not "movl" { target nonpic } } } */
+/* { dg-final { scan-assembler-times "movq" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl" 4  { target ia32 } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c 
b/gcc/testsuite/gcc.target/i386/pr43147.c
new file mode 100644
index 000..3c30f917c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr43147.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler "movaps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+#include <xmmintrin.h>
+
+__m128
+foo (void)
+{
+  __m128 m = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
+  m = _mm_shuffle_ps(m, m, 0xC9);
+  m = _mm_shuffle_ps(m, m, 0x2D);
+  return m;
+}
-- 
2.27.0

1 2 3 4 5 >

1 - 100 of 409 matches

Mail list logo