Re: [PATCH] Optimize nested permutation to single VEC_PERM_EXPR [PR54346]

2022-10-12 Thread Levy
Hi Ruoyao,

It’s probably because loongarch64 doesn’t support
can_vec_perm_const_p (result_mode, op_mode, sel2, false).

I’m not sure whether LoongArch will support it, or whether I should just
limit the test target for pr54346.c.
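
If limiting the target turns out to be the right call, something along these
lines in pr54346.c should do it (just a sketch; the directive comment text is
made up):

/* { dg-skip-if "needs can_vec_perm_const_p for the merged permutation" { loongarch*-*-* } } */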

Best Regards
Levy

> On 12 Oct 2022, at 9:51 pm, Xi Ruoyao  wrote:
> 
> pr54346.



[PATCH] RISC-V: Add implementation for builtin overflow

2021-01-21 Thread Levy
Added an implementation of builtin overflow detection; the new patterns are
listed below.

signed addition:
add t0, t1, t2
slti t3, t2, 0
slt t4, t0, t1
bne t3, t4, overflow

unsigned addition:
add t0, t1, t2
bltu t0, t1, overflow

signed subtraction:
sub t0, t1, t2
slti t3, t2, 0
slt t4, t1, t0
bne t3, t4, overflow

unsigned subtraction:
sub t0, t1, t2
bltu t1, t0, overflow

signed multiplication:
mulh t4, t1, t2
mul t0, t1, t2
srai t5, t0, 31/63 (RV32/64)
bne t4, t5, overflow

unsigned multiplication:
mulhu t4, t1, t2
mul t0, t1, t2
bne t4, 0, overflow
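
For reference, a minimal usage sketch (not part of the patch; function names
are made up) of the builtins these expanders serve:

#include <stdbool.h>

/* At -O2 on riscv64 this should now expand through the new addv<mode>4
   pattern into the add/slti/slt/bne sequence above instead of the generic
   expansion.  */
bool
checked_add (long a, long b, long *sum)
{
  return __builtin_add_overflow (a, b, sum);
}

/* Should map onto the mulhu/mul/bne sequence above.  */
bool
checked_umul (unsigned long a, unsigned long b, unsigned long *prod)
{
  return __builtin_mul_overflow (a, b, prod);
}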

gcc/ChangeLog:
* config/riscv/riscv.md: Add expand patterns for addv<mode>4,
subv<mode>4 and mulv<mode>4.
* config/riscv/riscv.h: Define the TARGET_MIN_ARITHMETIC_PRECISION hook
as riscv_min_arithmetic_precision.
* config/riscv/riscv.c: Set riscv_min_arithmetic_precision to 32,
allowing SImode ops to use addw/subw under RV64.

---
 gcc/config/riscv/riscv.c  |   8 +++
 gcc/config/riscv/riscv.h  |   5 ++
 gcc/config/riscv/riscv.md | 104 ++
 3 files changed, 117 insertions(+)

diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
index d489717b2a5..cf94f5c9658 100644
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -351,6 +351,14 @@ static const struct riscv_tune_info 
riscv_tune_info_table[] = {
   { "size", generic, &optimize_size_tune_info },
 };
 
+/* Implement TARGET_MIN_ARITHMETIC_PRECISION.  */
+
+static unsigned int
+riscv_min_arithmetic_precision (void)
+{
+  return 32;
+}
+
 /* Return the riscv_tune_info entry for the given name string.  */
 
 static const struct riscv_tune_info *
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 172c7ca7c98..62cebd08cff 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -115,6 +115,11 @@ extern const char *riscv_default_mtune (int argc, const 
char **argv);
 
 #define MAX_BITS_PER_WORD 64
 
+/* Allows SImode op in builtin overflow pattern, see internal-fn.c.  */
+
+#undef TARGET_MIN_ARITHMETIC_PRECISION
+#define TARGET_MIN_ARITHMETIC_PRECISION riscv_min_arithmetic_precision
+
 /* Width of a word, in units (bytes).  */
 #define UNITS_PER_WORD (TARGET_64BIT ? 8 : 4)
 #ifndef IN_LIBGCC2
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 36012ad1f77..a95654d77e9 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -462,6 +462,41 @@
   [(set_attr "type" "arith")
(set_attr "mode" "DI")])
 
+(define_expand "addv4"
+  [(set (match_operand:GPR 0 "register_operand" "=r,r")
+  (plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
+ (match_operand:GPR 2 "arith_operand"" r,I")))
+  (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  rtx t3 = gen_reg_rtx (mode);
+  rtx t4 = gen_reg_rtx (mode);
+
+  emit_insn (gen_add3_insn (operands[0], operands[1], operands[2]));
+
+  rtx cmp1 = gen_rtx_LT (mode, operands[2], const0_rtx);
+  emit_insn (gen_cstore4 (t3, cmp1, operands[2], const0_rtx));
+
+  rtx cmp2 = gen_rtx_LT (mode, operands[0], operands[1]);
+  emit_insn (gen_cstore4 (t4, cmp2, operands[0], operands[1]));
+
+  riscv_expand_conditional_branch (operands[3], NE, t3, t4);
+  DONE;
+})
+
+(define_expand "uaddv4"
+  [(set (match_operand:GPR 0 "register_operand" "=r,r")
+  (plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
+ (match_operand:GPR 2 "arith_operand"" r,I")))
+  (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  emit_insn (gen_add3_insn (operands[0], operands[1], operands[2]));
+  riscv_expand_conditional_branch (operands[3], LTU, operands[0], operands[1]);
+
+   DONE;
+})
+
 (define_insn "*addsi3_extended"
   [(set (match_operand:DI   0 "register_operand" "=r,r")
(sign_extend:DI
@@ -518,6 +553,41 @@
   [(set_attr "type" "arith")
(set_attr "mode" "SI")])
 
+(define_expand "subv4"
+  [(set (match_operand:GPR 0 "register_operand" "= r")
+  (minus:GPR (match_operand:GPR 1 "reg_or_0_operand" " rJ")
+   (match_operand:GPR 2 "register_operand" "  r")))
+  (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  rtx t3 = gen_reg_rtx (mode);
+  rtx t4 = gen_reg_rtx (mode);
+
+  emit_insn (gen_sub3_insn (operands[0], operands[1], operands[2]));
+
+  rtx cmp1 = gen_rtx_LT (mode, operands[2], const0_rtx);
+  emit_insn (gen_cstore4 (t3, cmp1, operands[2], const0_rtx));
+
+  rtx cmp2 = gen_rtx_LT (mode, operands[1], operands[0]);
+  emit_insn (gen_cstore4 (t4, cmp2, operands[1], operands[0]));
+
+  riscv_expand_conditional_branch (operands[3], NE, t3, t4);
+  DONE;
+})
+
+(define_expand "usubv4"
+  [(set (match_operand:GPR 0 "register_operand" "= r")
+  (minus:GPR (match_operand:GPR 1 "reg_or_0_operand" " rJ")
+  (match_operand:GPR 2 "register_operand" "  r")))
+  (label_ref

[PATCH] [RISCV] Add Pattern for builtin overflow

2021-04-26 Thread Levy Hsu
From: LevyHsu 

Added an implementation of builtin overflow detection; the new patterns are
listed below.

---
Addition:

signed addition (SImode with RV32 || DImode with RV64):
add t0, t1, t2
slti t3, t2, 0
slt t4, t0, t1
bne t3, t4, overflow

signed addition (SImode with RV64):
add t0, t1, t2
sext.w t3, t0
bne t0, t3, overflow

unsigned addition (SImode with RV32 || DImode with RV64):
add t0, t1, t2
bltu t0, t1, overflow

unsigned addition (SImode with RV64):
sext.w t3, t1
addw t0, t1, t2
bltu t0, t3, overflow
---
Subtraction:

signed subtraction (SImode with RV32 || DImode with RV64):
sub t0, t1, t2
slti t3, t2, 0
slt t4, t1, t0
bne t3, t4, overflow

signed subtraction (SImode with RV64):
sub t0, t1, t2
sext.w t3, t0
bne t0, t3, overflow

unsigned subtraction (SImode with RV32 || DImode with RV64):
sub t0, t1, t2
bltu t1, t0, overflow

unsigned subtraction (SImode with RV64):
sext.w t3, t1
subw t0, t1, t2
bltu t0, t3, overflow
---
Multiplication:

signed multiplication (SImode with RV32 || DImode with RV64):
mulh t4, t1, t2
mul t0, t1, t2
srai t5, t0, 31/63 (RV32/64)
bne t4, t5, overflow

signed multiplication (SImode with RV64):
mul t0, t1, t2
sext.w t3, t0
bne t0, t3, overflow

unsigned multiplication (SImode with RV32 || DImode with RV64):
mulhu t4, t1, t2
mul t0, t1, t2
bne t4, 0, overflow

unsigned multiplication (SImode with RV64):
slli t0, t0, 32
slli t1, t1, 32
srli t0, t0, 32
srli t1, t1, 32
mul t0, t0, t1
srai t5, t0, 32
bne t5, 0, overflow
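
The SImode-under-RV64 variants above all reduce to the same idea: do the
operation in the wider mode (or redo it in 32 bits) and branch when the two
results disagree.  A minimal C sketch of the check (not from the patch):

#include <stdbool.h>
#include <stdint.h>

bool
add32_overflows (int32_t a, int32_t b)
{
  int64_t wide   = (int64_t) a + (int64_t) b;  /* full-precision sum      */
  int32_t narrow = (int32_t) wide;             /* truncated 32-bit result */
  return (int64_t) narrow != wide;             /* the sext.w + bne check  */
}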

---
---
 gcc/config/riscv/riscv.c  |   8 ++
 gcc/config/riscv/riscv.h  |   5 +
 gcc/config/riscv/riscv.md | 240 ++
 3 files changed, 253 insertions(+)

diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
index d489717b2a5..cf94f5c9658 100644
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -351,6 +351,14 @@ static const struct riscv_tune_info 
riscv_tune_info_table[] = {
   { "size", generic, &optimize_size_tune_info },
 };
 
+/* Implement TARGET_MIN_ARITHMETIC_PRECISION.  */
+
+static unsigned int
+riscv_min_arithmetic_precision (void)
+{
+  return 32;
+}
+
 /* Return the riscv_tune_info entry for the given name string.  */
 
 static const struct riscv_tune_info *
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 172c7ca7c98..a6f451b97e3 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -121,6 +121,11 @@ extern const char *riscv_default_mtune (int argc, const 
char **argv);
 #define MIN_UNITS_PER_WORD 4
 #endif
 
+/* Allows SImode op in builtin overflow pattern, see internal-fn.c.  */
+
+#undef TARGET_MIN_ARITHMETIC_PRECISION
+#define TARGET_MIN_ARITHMETIC_PRECISION riscv_min_arithmetic_precision
+
 /* The `Q' extension is not yet supported.  */
 #define UNITS_PER_FP_REG (TARGET_DOUBLE_FLOAT ? 8 : 4)
 
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 36012ad1f77..c82017a4bce 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -462,6 +462,81 @@
   [(set_attr "type" "arith")
(set_attr "mode" "DI")])
 
+(define_expand "addv4"
+  [(set (match_operand:GPR 0 "register_operand" "=r,r")
+(plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
+(match_operand:GPR 2 "arith_operand"" r,I")))
+(label_ref (match_operand 3 "" ""))]
+  ""
+{
+  if (TARGET_64BIT && mode == SImode)
+  {
+rtx t3 = gen_reg_rtx (DImode);
+rtx t4 = gen_reg_rtx (DImode);
+rtx t5 = gen_reg_rtx (DImode);
+rtx t6 = gen_reg_rtx (DImode);
+
+if (GET_CODE (operands[1]) != CONST_INT)
+  emit_insn (gen_extend_insn (t4, operands[1], DImode, SImode, 0));
+else
+  t4 = operands[1];
+if (GET_CODE (operands[2]) != CONST_INT)
+  emit_insn (gen_extend_insn (t5, operands[2], DImode, SImode, 0));
+else
+  t5 = operands[2];
+emit_insn (gen_adddi3 (t3, t4, t5));
+
+emit_move_insn (operands[0], gen_lowpart (SImode, t3));
+emit_insn (gen_extend_insn (t6, operands[0], DImode, SImode, 0));
+
+riscv_expand_conditional_branch (operands[3], NE, t6, t3);
+  }
+  else
+  {
+rtx t3 = gen_reg_rtx (mode);
+rtx t4 = gen_reg_rtx (mode);
+
+emit_insn (gen_add3_insn (operands[0], operands[1], operands[2]));
+rtx cmp1 = gen_rtx_LT (mode, operands[2], const0_rtx);
+emit_insn (gen_cstore4 (t3, cmp1, operands[2], const0_rtx));
+rtx cmp2 = g

[PATCH] [RISCV] Add Pattern for builtin overflow

2021-04-28 Thread Levy Hsu
From: LevyHsu 

Added an implementation of builtin overflow detection; the new patterns are
listed below.

---
Addition:

signed addition (SImode in RV32 || DImode in RV64):
add t0, t1, t2
slti t3, t2, 0
slt t4, t0, t1
bne t3, t4, overflow

signed addition (SImode in RV64):
add t0, t1, t2
addw t3, t1, t2
bne t0, t3, overflow

unsigned addition (SImode in RV32 || DImode in RV64):
add t0, t1, t2
bltu t0, t1, overflow

unsigned addition (SImode in RV64):
sext.w t3, t1
addw t0, t1, t2
bltu t0, t3, overflow
---
Subtraction:

signed subtraction (SImode in RV32 || DImode in RV64):
sub t0, t1, t2
slti t3, t2, 0
slt t4, t1, t0
bne t3, t4, overflow

signed subtraction (SImode in RV64):
sub t0, t1, t2
subw t3, t1, t2
bne t0, t3, overflow

unsigned subtraction (SImode in RV32 || DImode in RV64):
sub t0, t1, t2
bltu t1, t0, overflow

unsigned subtraction (SImode in RV64):
sext.w t3, t1
subw t0, t1, t2
bltu t0, t3, overflow
---
Multiplication:

signed multiplication (SImode in RV32 || DImode in RV64):
mulh t3, t1, t2
mul t0, t1, t2
srai t4, t0, 31/63 (RV32/64)
bne t3, t4, overflow

signed multiplication (SImode in RV64):
mul t0, t1, t2
sext.w t3, t0
bne t0, t3, overflow

unsigned multiplication (SImode in RV32 || DImode in RV64):
mulhu t3, t1, t2
mul t0, t1, t2
bne t3, 0, overflow

unsigned multiplication (SImode in RV64):
slli t0, t0, 32
slli t1, t1, 32
mulhu t2, t1, t1
srli t3, t2, 32
bne t3, 0, overflow
sext.w t2, t2
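
The DImode signed-multiply check above is the usual high-half test: the
product fits iff mulh equals the low half's sign bit replicated by srai.  A
minimal C equivalent (not from the patch; __int128 is used only for
exposition):

#include <stdbool.h>
#include <stdint.h>

bool
smul64_overflows (int64_t a, int64_t b)
{
  __int128 full = (__int128) a * b;
  int64_t lo = (int64_t) full;          /* mul  t0, t1, t2 */
  int64_t hi = (int64_t) (full >> 64);  /* mulh t3, t1, t2 */
  return hi != (lo >> 63);              /* srai t4, t0, 63; bne t3, t4 */
}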

Special thanks to:
Jim Wilson, for the thorough help and advice on gcc and gdb.
Craig Topper, for pointing out that the SImode operand needs sext.w for
unsigned add/sub in RV64.
Andrew Waterman, for the better SImode signed add/sub and unsigned mul
patterns in RV64.
Kito Cheng, for the patch submission.
---
 gcc/config/riscv/riscv.c  |   8 ++
 gcc/config/riscv/riscv.h  |   4 +
 gcc/config/riscv/riscv.md | 243 ++
 3 files changed, 255 insertions(+)

diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
index d489717b2a5..cf94f5c9658 100644
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -351,6 +351,14 @@ static const struct riscv_tune_info 
riscv_tune_info_table[] = {
   { "size", generic, &optimize_size_tune_info },
 };
 
+/* Implement TARGET_MIN_ARITHMETIC_PRECISION.  */
+
+static unsigned int
+riscv_min_arithmetic_precision (void)
+{
+  return 32;
+}
+
 /* Return the riscv_tune_info entry for the given name string.  */
 
 static const struct riscv_tune_info *
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 172c7ca7c98..0521c8881ae 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -121,6 +121,10 @@ extern const char *riscv_default_mtune (int argc, const 
char **argv);
 #define MIN_UNITS_PER_WORD 4
 #endif
 
+/* Allows SImode op in builtin overflow pattern, see internal-fn.c.  */
+#undef TARGET_MIN_ARITHMETIC_PRECISION
+#define TARGET_MIN_ARITHMETIC_PRECISION riscv_min_arithmetic_precision
+
 /* The `Q' extension is not yet supported.  */
 #define UNITS_PER_FP_REG (TARGET_DOUBLE_FLOAT ? 8 : 4)
 
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 36012ad1f77..a6e14fdc24d 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -462,6 +462,80 @@
   [(set_attr "type" "arith")
(set_attr "mode" "DI")])
 
+(define_expand "addv4"
+  [(set (match_operand:GPR 0 "register_operand" "=r,r")
+(plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
+(match_operand:GPR 2 "arith_operand"" r,I")))
+(label_ref (match_operand 3 "" ""))]
+  ""
+{
+  if (TARGET_64BIT && mode == SImode)
+  {
+rtx t3 = gen_reg_rtx (DImode);
+rtx t4 = gen_reg_rtx (DImode);
+rtx t5 = gen_reg_rtx (DImode);
+rtx t6 = gen_reg_rtx (DImode);
+
+emit_insn (gen_addsi3 (operands[0], operands[1], operands[2]));
+if (GET_CODE (operands[1]) != CONST_INT)
+  emit_insn (gen_extend_insn (t4, operands[1], DImode, SImode, 0));
+else
+  t4 = operands[1];
+if (GET_CODE (operands[2]) != CONST_INT)
+  emit_insn (gen_extend_insn (t5, operands[2], DImode, SImode, 0));
+else
+  t5 = operands[2];
+emit_insn (gen_adddi3 (t3, t4, t5));
+emit_insn (gen_extend_insn (t6, operands[0], DImode, SImode, 0));
+
+riscv_expand_conditional_branch (operands[3], NE, t6, t3)

[x86_64 PATCH] PR target/107563: Add 3-instruction subroutine vector shift in ix86_expand_vec_perm_const_1

2024-01-04 Thread Levy Hsu
From: Liwei Xu 

This patch optimizes byte swaps in vectors using SSE2 instructions.
It targets 8-byte and 16-byte vectors, efficiently handling patterns like
__builtin_shufflevector(v, v, 1, 0, 3, 2, ...).
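
For illustration, the same three-instruction idea written with SSE2
intrinsics (a hand-written sketch, not code from the patch; the patch emits
the equivalent psrlw/psllw/por RTL from the new subroutine):

#include <emmintrin.h>

__m128i
swap_adjacent_bytes (__m128i v)
{
  __m128i hi = _mm_srli_epi16 (v, 8);  /* psrlw: move each high byte down */
  __m128i lo = _mm_slli_epi16 (v, 8);  /* psllw: move each low byte up    */
  return _mm_or_si128 (hi, lo);        /* por: recombine, bytes swapped   */
}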

PR target/107563

gcc/ChangeLog:

* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
subroutine.
(ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr107563.C: New test.
---
 gcc/config/i386/i386-expand.cc   | 64 
 gcc/testsuite/g++.target/i386/pr107563.C | 23 +
 2 files changed, 87 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 527fcc63506..ba5ea20daf7 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -21826,6 +21826,67 @@ expand_vec_perm_2perm_pblendv (struct 
expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles case:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case E_V8QImode:
+  if (!TARGET_MMX_WITH_SSE)
+   return false;
+  mode = V4HImode;
+  gen_shr = gen_ashrv4hi3;
+  gen_shl = gen_ashlv4hi3;
+  gen_or = gen_iorv4hi3;
+  break;
+case E_V16QImode:
+  mode = V8HImode;
+  gen_shr = gen_vlshrv8hi3;
+  gen_shl = gen_vashlv8hi3;
+  gen_or = gen_iorv8hi3;
+  break;
+default: return false;
+}
+
+  if (!rtx_equal_p (d->op0, d->op1))
+return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together.  */
@@ -23243,6 +23304,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_2perm_pblendv (d, false))
 return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563.C 
b/gcc/testsuite/g++.target/i386/pr107563.C
new file mode 100755
index 000..5b0c648e8f1
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563.C
@@ -0,0 +1,23 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-not "movzbl" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "orq" } } */
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
+void foo (temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+}
+
+using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
+void foo2 (temp_vec_type2& v) noexcept
+{
+  v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
+}
-- 
2.31.1



Support bitwise and/andnot/abs/neg/copysign/xorsign op for V8BF/V16BF/V32BF

2024-07-03 Thread Levy Hsu
This patch extends support for BF16 vector operations in GCC, including bitwise 
AND, ANDNOT, ABS, NEG, COPYSIGN, and XORSIGN for V8BF, V16BF, and V32BF modes.
Bootstrapped and tested on x86_64-linux-gnu. ok for trunk?
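
For illustration, a minimal source-level example (made up, not taken from the
new tests) of what these patterns cover; negation of a __bf16 vector should
now become a single XOR against a sign-bit mask rather than a scalarized
loop:

typedef __bf16 v8bf __attribute__ ((vector_size (16)));

v8bf
v8bf_neg (v8bf x)
{
  return -x;  /* flips the sign bit of every element */
}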

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_fp_absneg_operator): Add VBF 
modes.
(ix86_expand_copysign): Ditto.
(ix86_expand_xorsign): Ditto.
* config/i386/i386.cc (ix86_build_const_vector): Ditto.
(ix86_build_signbit_mask): Ditto.
* config/i386/sse.md: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-bf16-vec-absneg.c: New test.
* gcc.target/i386/avx512f-bf16-vec-absneg.c: New test.

---
 gcc/config/i386/i386-expand.cc| 76 +++--
 gcc/config/i386/i386.cc   |  6 ++
 gcc/config/i386/sse.md| 37 +---
 .../gcc.target/i386/avx2-bf16-vec-absneg.c| 85 +++
 .../gcc.target/i386/avx512f-bf16-vec-absneg.c | 66 ++
 5 files changed, 234 insertions(+), 36 deletions(-)
 create mode 100755 gcc/testsuite/gcc.target/i386/avx2-bf16-vec-absneg.c
 create mode 100755 gcc/testsuite/gcc.target/i386/avx512f-bf16-vec-absneg.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5c29ee1353f..46d13a55e6a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2175,20 +2175,28 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, 
machine_mode mode,
   machine_mode vmode = mode;
   rtvec par;
 
-  if (vector_mode || mode == TFmode || mode == HFmode)
-{
-  use_sse = true;
-  if (mode == HFmode)
-   vmode = V8HFmode;
-}
-  else if (TARGET_SSE_MATH)
-{
-  use_sse = SSE_FLOAT_MODE_P (mode);
-  if (mode == SFmode)
-   vmode = V4SFmode;
-  else if (mode == DFmode)
-   vmode = V2DFmode;
-}
+  switch (mode)
+  {
+  case HFmode:
+use_sse = true;
+vmode = V8HFmode;
+break;
+  case BFmode:
+use_sse = true;
+vmode = V8BFmode;
+break;
+  case SFmode:
+use_sse = TARGET_SSE_MATH;
+vmode = V4SFmode;
+break;
+  case DFmode:
+use_sse = TARGET_SSE_MATH;
+vmode = V2DFmode;
+break;
+  default:
+use_sse = vector_mode || mode == TFmode;
+break;
+  }
 
   dst = operands[0];
   src = operands[1];
@@ -2321,16 +2329,26 @@ ix86_expand_copysign (rtx operands[])
 
   mode = GET_MODE (operands[0]);
 
-  if (mode == HFmode)
+  switch (mode)
+  {
+  case HFmode:
 vmode = V8HFmode;
-  else if (mode == SFmode)
+break;
+  case BFmode:
+vmode = V8BFmode;
+break;
+  case SFmode:
 vmode = V4SFmode;
-  else if (mode == DFmode)
+break;
+  case DFmode:
 vmode = V2DFmode;
-  else if (mode == TFmode)
+break;
+  case TFmode:
 vmode = mode;
-  else
-gcc_unreachable ();
+break;
+  default:
+gcc_unreachable();
+  }
 
   if (rtx_equal_p (operands[1], operands[2]))
 {
@@ -2391,14 +2409,24 @@ ix86_expand_xorsign (rtx operands[])
 
   mode = GET_MODE (dest);
 
-  if (mode == HFmode)
+  switch (mode)
+  {
+  case HFmode:
 vmode = V8HFmode;
-  else if (mode == SFmode)
+break;
+  case BFmode:
+vmode = V8BFmode;
+break;
+  case SFmode:
 vmode = V4SFmode;
-  else if (mode == DFmode)
+break;
+  case DFmode:
 vmode = V2DFmode;
-  else
+break;
+  default:
 gcc_unreachable ();
+break;
+  }
 
   temp = gen_reg_rtx (vmode);
   mask = ix86_build_signbit_mask (vmode, 0, 0);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ccc24be6e..b5768a65e52 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -16353,6 +16353,9 @@ ix86_build_const_vector (machine_mode mode, bool vect, 
rtx value)
 case E_V8DFmode:
 case E_V4DFmode:
 case E_V2DFmode:
+case E_V32BFmode:
+case E_V16BFmode:
+case E_V8BFmode:
   n_elt = GET_MODE_NUNITS (mode);
   v = rtvec_alloc (n_elt);
   scalar_mode = GET_MODE_INNER (mode);
@@ -16389,6 +16392,9 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, 
bool invert)
 case E_V8HFmode:
 case E_V16HFmode:
 case E_V32HFmode:
+case E_V32BFmode:
+case E_V16BFmode:
+case E_V8BFmode:
   vec_mode = mode;
   imode = HImode;
   break;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0be2dcd8891..1703bbb4250 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -351,7 +351,9 @@
 
 ;; 128-, 256- and 512-bit float vector modes for bitwise operations
 (define_mode_iterator VFB
-  [(V32HF "TARGET_AVX512F && TARGET_EVEX512")
+  [(V32BF "TARGET_AVX512F && TARGET_EVEX512")
+   (V16BF "TARGET_AVX") (V8BF "TARGET_SSE2")
+   (V32HF "TARGET_AVX512F && TARGET_EVEX512")
(V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F && TARGET_EVEX512")
@@ -364,7 +366,8 @@
 
 ;; 128- and 256-bit float vector modes for bitwise opera

[PATCH] x86: Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]

2024-05-14 Thread Levy Hsu
Hi All

We've introduced a new subroutine in ix86_expand_vec_perm_const_1
to optimize vector shifting for the V16QI type on x86.
This patch uses a three-instruction sequence psrlw, psllw, and por
to handle specific vector shuffle operations more efficiently.
The change aims to improve assembly code generation for configurations
supporting SSE2.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

Best
Levy

gcc/ChangeLog:

PR target/107563
* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
subroutine.
(ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

PR target/107563
* g++.target/i386/pr107563-a.C: New test.
* g++.target/i386/pr107563-b.C: New test.
---
 gcc/config/i386/i386-expand.cc | 64 ++
 gcc/testsuite/g++.target/i386/pr107563-a.C | 13 +
 gcc/testsuite/g++.target/i386/pr107563-b.C | 12 
 3 files changed, 89 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563-a.C
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563-b.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..5098d2886bb 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct 
expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles case:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case E_V8QImode:
+  if (!TARGET_MMX_WITH_SSE)
+   return false;
+  mode = V4HImode;
+  gen_shr = gen_ashrv4hi3;
+  gen_shl = gen_ashlv4hi3;
+  gen_or = gen_iorv4hi3;
+  break;
+case E_V16QImode:
+  mode = V8HImode;
+  gen_shr = gen_vlshrv8hi3;
+  gen_shl = gen_vashlv8hi3;
+  gen_or = gen_iorv8hi3;
+  break;
+default: return false;
+}
+
+  if (!rtx_equal_p (d->op0, d->op1))
+return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together.  */
@@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_2perm_pblendv (d, false))
 return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C 
b/gcc/testsuite/g++.target/i386/pr107563-a.C
new file mode 100755
index 000..605c1bdf814
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -0,0 +1,13 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
+
+void foo2(temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
+}
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C 
b/gcc/testsuite/g++.target/i386/pr107563-b.C
new file mode 100755
index 000..0ce3e8263bb
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -0,0 +1,12 @@
+/* PR target/107563.C */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__(16)]] = char;
+
+void foo(temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 
12, 15, 14);
+}
-- 
2.31.1



[PATCH] x86: Fix Logical Shift Issue in expand_vec_perm_psrlw_psllw_por [PR115146]

2024-05-20 Thread Levy Hsu
Replaced arithmetic shifts with logical shifts in 
expand_vec_perm_psrlw_psllw_por to avoid sign bit extension issues. Also 
corrected gen_vlshrv8hi3 to gen_lshrv8hi3 and gen_vashlv8hi3 to gen_ashlv8hi3.
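
A scalar illustration of the failure mode (made up, not part of the patch):
swapping the two bytes of the 16-bit lane 0x8001 should give 0x0180, but an
arithmetic right shift (what the old gen_ashrv4hi3 call produced on the V8QI
path) sign-extends the high byte into the result:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint16_t lane = 0x8001;
  uint16_t shl = (uint16_t) (lane << 8);                          /* 0x0100 */

  uint16_t logical    = (uint16_t) (lane >> 8) | shl;             /* 0x0180, correct */
  uint16_t arithmetic = (uint16_t) ((int16_t) lane >> 8) | shl;   /* 0xff80, wrong   */

  printf ("logical %#06x  arithmetic %#06x\n", logical, arithmetic);
  return 0;
}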

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

Co-authored-by: H.J. Lu 

gcc/ChangeLog:

PR target/115146
* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): Replace
the arithmetic shift gen_ashrv4hi3 with the logical shift gen_lshrv4hi3.
Replace gen_vlshrv8hi3 with gen_lshrv8hi3 and gen_vashlv8hi3 with
gen_ashlv8hi3.

gcc/testsuite/ChangeLog:

PR target/115146
* g++.target/i386/pr107563-a.C: Append '-mno-sse3' to compile option
to avoid test failure on hosts with SSE3 support.
* g++.target/i386/pr107563-b.C: Append '-mno-sse3' to compile option
to avoid test failure on hosts with SSE3 support.
* gcc.target/i386/pr115146.c: New test.
---
 gcc/config/i386/i386-expand.cc |  6 ++--
 gcc/testsuite/g++.target/i386/pr107563-a.C |  4 +--
 gcc/testsuite/g++.target/i386/pr107563-b.C |  2 +-
 gcc/testsuite/gcc.target/i386/pr115146.c   | 37 ++
 4 files changed, 43 insertions(+), 6 deletions(-)
 create mode 100755 gcc/testsuite/gcc.target/i386/pr115146.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4e16aedc5c1..945530d6481 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22386,14 +22386,14 @@ expand_vec_perm_psrlw_psllw_por (struct 
expand_vec_perm_d *d)
   if (!TARGET_MMX_WITH_SSE)
return false;
   mode = V4HImode;
-  gen_shr = gen_ashrv4hi3;
+  gen_shr = gen_lshrv4hi3;
   gen_shl = gen_ashlv4hi3;
   gen_or = gen_iorv4hi3;
   break;
 case E_V16QImode:
   mode = V8HImode;
-  gen_shr = gen_vlshrv8hi3;
-  gen_shl = gen_vashlv8hi3;
+  gen_shr = gen_lshrv8hi3;
+  gen_shl = gen_ashlv8hi3;
   gen_or = gen_iorv8hi3;
   break;
 default: return false;
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C 
b/gcc/testsuite/g++.target/i386/pr107563-a.C
index 605c1bdf814..c1c332bb948 100755
--- a/gcc/testsuite/g++.target/i386/pr107563-a.C
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -1,8 +1,8 @@
 /* PR target/107563.C */
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-options "-std=c++2b -O3 -msse2 -mno-sse3" } */
 /* { dg-final { scan-assembler-times "psllw" 1 } } */
-/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
 /* { dg-final { scan-assembler-times "por" 1 } } */
 
 using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C 
b/gcc/testsuite/g++.target/i386/pr107563-b.C
index 0ce3e8263bb..d5cc0300f46 100755
--- a/gcc/testsuite/g++.target/i386/pr107563-b.C
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -1,5 +1,5 @@
 /* PR target/107563.C */
-/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-options "-std=c++2b -O3 -msse2 -mno-sse3" } */
 /* { dg-final { scan-assembler-times "psllw" 1 } } */
 /* { dg-final { scan-assembler-times "psrlw" 1 } } */
 /* { dg-final { scan-assembler-times "por" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr115146.c 
b/gcc/testsuite/gcc.target/i386/pr115146.c
new file mode 100755
index 000..df7d0131968
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115146.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned char v8qi __attribute__((vector_size (8)));
+
+v8qi res, a;
+
+void __attribute__((noipa))
+foo (void)
+{
+  res = __builtin_shufflevector(a, a, 1, 0, 3, 2, 5, 4, 7, 6);
+}
+
+void
+comp (v8qi a, v8qi b, int n)
+{
+  for (unsigned i = 0; i < n; ++i)
+if ((a)[i] != (b)[i])
+  __builtin_abort ();
+}
+
+#define E0 140
+#define E1 141
+#define E2 142
+#define E3 143
+#define E4 144
+#define E5 145
+#define E6 146
+#define E7 147
+
+int main()
+{
+  a = (v8qi) { E0, E1, E2, E3, E4, E5, E6, E7 };
+  foo ();
+  comp (res, ((v8qi) { E1, E0, E3, E2, E5, E4, E7, E6 }), 8);
+  return 0;
+}
+
-- 
2.31.1



[PATCH] x86: Emit cvtne2ps2bf16 for odd increasing perm in __builtin_shufflevector

2024-06-13 Thread Levy Hsu
gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_vectorize_vec_perm_const): Convert BF to HI using subreg.
* config/i386/predicates.md
(vcvtne2ps2bf_parallel): New predicate.
* config/i386/sse.md
(vpermt2_sepcial_bf16_shuffle_<mode>): New define_insn_and_split matching
an odd increasing perm.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vpermt2-special-bf16-shufflue.c: New test.
---
 gcc/config/i386/i386-expand.cc|  4 +--
 gcc/config/i386/predicates.md | 11 ++
 gcc/config/i386/sse.md| 35 +++
 .../i386/vpermt2-special-bf16-shufflue.c  | 27 ++
 4 files changed, 75 insertions(+), 2 deletions(-)
 create mode 100755 
gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 312329e550b..3d599c0651a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23657,8 +23657,8 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, 
machine_mode op_mode,
   if (GET_MODE_SIZE (vmode) == 64 && !TARGET_EVEX512)
 return false;
 
-  /* For HF mode vector, convert it to HI using subreg.  */
-  if (GET_MODE_INNER (vmode) == HFmode)
+  /* For HF and BF mode vector, convert it to HI using subreg.  */
+  if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
 {
   machine_mode orig_mode = vmode;
   vmode = mode_for_vector (HImode,
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7afe3100cb7..1676c50de71 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -2322,3 +2322,14 @@
 
   return true;
 })
+
+;; Check that each element is odd and incrementally increasing from 1
+(define_predicate "vcvtne2ps2bf_parallel"
+  (and (match_code "const_vector")
+   (match_code "const_int" "a"))
+{
+  for (int i = 0; i < XVECLEN (op, 0); ++i)
+if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1))
+  return false;
+  return true;
+})
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 680a46a0b08..5ddd1c0a778 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30698,3 +30698,38 @@
   "TARGET_AVXVNNIINT16"
   "vpdp\t{%3, %2, %0|%0, %2, %3}"
[(set_attr "prefix" "vex")])
+
+(define_mode_attr hi_cvt_bf
+  [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")])
+
+(define_mode_attr HI_CVT_BF
+  [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")])
+
+(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_"
+  [(set (match_operand:VI2_AVX512F 0 "register_operand")
+   (unspec:VI2_AVX512F
+ [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel")
+  (match_operand:VI2_AVX512F 2 "register_operand")
+  (match_operand:VI2_AVX512F 3 "nonimmediate_operand")]
+  UNSPEC_VPERMT2))]
+  "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op0 = gen_reg_rtx (mode);
+  operands[2] = lowpart_subreg (mode,
+   force_reg (mode, operands[2]),
+   mode);
+  operands[3] = lowpart_subreg (mode,
+   force_reg (mode, operands[3]),
+   mode);
+
+  emit_insn (gen_avx512f_cvtne2ps2bf16_(op0,
+  operands[3],
+  operands[2]));
+  emit_move_insn (operands[0], lowpart_subreg (mode, op0,
+  mode));
+  DONE;
+}
+[(set_attr "mode" "")])
diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c 
b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
new file mode 100755
index 000..5c65f2a9884
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */
+/* { dg-final { scan-assembler-not "vpermi2b" } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */
+
+typedef __bf16 v8bf __attribute__((vector_size(16)));
+typedef __bf16 v16bf __attribute__((vector_size(32)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+
+v8bf foo0(v8bf a, v8bf b)
+{
+  return __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15);
+}
+
+v16bf foo1(v16bf a, v16bf b)
+{
+  return __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31);
+}
+
+v32bf foo2(v32bf a, v32bf b)
+{
+  return __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15, 
+ 17, 19, 21, 23, 25, 27, 29, 31, 
+ 33, 35, 37, 39, 41, 43, 45, 47, 
+ 49, 51, 53, 55, 57, 59, 61, 63);
+}
-- 
2.31.1



[PATCH] x86: Emit cvtne2ps2bf16 for odd increasing perm in __builtin_shufflevector

2024-06-13 Thread Levy Hsu
This patch updates the GCC x86 backend to efficiently handle
odd, incrementally increasing permutations of BF16 vectors
using the cvtne2ps2bf16 instruction.
It modifies ix86_vectorize_vec_perm_const to support these operations
and adds a specific predicate to ensure proper sequence handling.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_vectorize_vec_perm_const): Convert BF to HI using subreg.
* config/i386/predicates.md
(vcvtne2ps2bf_parallel): New predicate.
* config/i386/sse.md
(vpermt2_sepcial_bf16_shuffle_<mode>): New define_insn_and_split matching
an odd increasing perm.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vpermt2-special-bf16-shufflue.c: New test.
---
 gcc/config/i386/i386-expand.cc|  4 +--
 gcc/config/i386/predicates.md | 11 ++
 gcc/config/i386/sse.md| 35 +++
 .../i386/vpermt2-special-bf16-shufflue.c  | 27 ++
 4 files changed, 75 insertions(+), 2 deletions(-)
 create mode 100755 
gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 312329e550b..3d599c0651a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23657,8 +23657,8 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, 
machine_mode op_mode,
   if (GET_MODE_SIZE (vmode) == 64 && !TARGET_EVEX512)
 return false;
 
-  /* For HF mode vector, convert it to HI using subreg.  */
-  if (GET_MODE_INNER (vmode) == HFmode)
+  /* For HF and BF mode vector, convert it to HI using subreg.  */
+  if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
 {
   machine_mode orig_mode = vmode;
   vmode = mode_for_vector (HImode,
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7afe3100cb7..1676c50de71 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -2322,3 +2322,14 @@
 
   return true;
 })
+
+;; Check that each element is odd and incrementally increasing from 1
+(define_predicate "vcvtne2ps2bf_parallel"
+  (and (match_code "const_vector")
+   (match_code "const_int" "a"))
+{
+  for (int i = 0; i < XVECLEN (op, 0); ++i)
+if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1))
+  return false;
+  return true;
+})
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 680a46a0b08..5ddd1c0a778 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30698,3 +30698,38 @@
   "TARGET_AVXVNNIINT16"
   "vpdp\t{%3, %2, %0|%0, %2, %3}"
[(set_attr "prefix" "vex")])
+
+(define_mode_attr hi_cvt_bf
+  [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")])
+
+(define_mode_attr HI_CVT_BF
+  [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")])
+
+(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_"
+  [(set (match_operand:VI2_AVX512F 0 "register_operand")
+   (unspec:VI2_AVX512F
+ [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel")
+  (match_operand:VI2_AVX512F 2 "register_operand")
+  (match_operand:VI2_AVX512F 3 "nonimmediate_operand")]
+  UNSPEC_VPERMT2))]
+  "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op0 = gen_reg_rtx (mode);
+  operands[2] = lowpart_subreg (mode,
+   force_reg (mode, operands[2]),
+   mode);
+  operands[3] = lowpart_subreg (mode,
+   force_reg (mode, operands[3]),
+   mode);
+
+  emit_insn (gen_avx512f_cvtne2ps2bf16_(op0,
+  operands[3],
+  operands[2]));
+  emit_move_insn (operands[0], lowpart_subreg (mode, op0,
+  mode));
+  DONE;
+}
+[(set_attr "mode" "")])
diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c 
b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
new file mode 100755
index 000..5c65f2a9884
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */
+/* { dg-final { scan-assembler-not "vpermi2b" } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */
+
+typedef __bf16 v8bf __attribute__((vector_size(16)));
+typedef __bf16 v16bf __attribute__((vector_size(32)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+
+v8bf foo0(v8bf a, v8bf b)
+{
+  return __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15);
+}
+
+v16bf foo1(v16bf a, v16bf b)
+{
+  return __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31);
+}
+
+v32bf foo2(v32bf a, v32bf b)
+{
+  return __builtin_shufflevector(a, b, 

[PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]

2024-05-07 Thread Levy Hsu
PR target/107563

gcc/ChangeLog:

* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
subroutine.
(ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr107563.C: New test.
---
 gcc/config/i386/i386-expand.cc   | 64 
 gcc/testsuite/g++.target/i386/pr107563.C | 23 +
 2 files changed, 87 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..2718b0acb87 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct 
expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles case:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case E_V8QImode:
+  if (!TARGET_MMX_WITH_SSE)
+   return false;
+  mode = V4HImode;
+  gen_shr = gen_ashrv4hi3;
+  gen_shl = gen_ashlv4hi3;
+  gen_or = gen_iorv4hi3;
+  break;
+case E_V16QImode:
+  mode = V8HImode;
+  gen_shr = gen_vlshrv8hi3;
+  gen_shl = gen_vashlv8hi3;
+  gen_or = gen_iorv8hi3;
+  break;
+default: return false;
+}
+
+  if (!rtx_equal_p (d->op0, d->op1))
+return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together.  */
@@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
 
   if (expand_vec_perm_2perm_pblendv (d, false))
 return true;
+
+  if (expand_vec_perm_psrlw_psllw_por (d))
+return true;
 
   /* Try sequences of four instructions.  */
 
diff --git a/gcc/testsuite/g++.target/i386/pr107563.C 
b/gcc/testsuite/g++.target/i386/pr107563.C
new file mode 100755
index 000..5b0c648e8f1
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563.C
@@ -0,0 +1,23 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-not "movzbl" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "orq" } } */
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
+void foo (temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+}
+
+using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
+void foo2 (temp_vec_type2& v) noexcept
+{
+  v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
+}
-- 
2.31.1



[PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]

2024-05-07 Thread Levy Hsu
Hi All

We've introduced a new subroutine in ix86_expand_vec_perm_const_1 
to optimize vector shifting for the V16QI type on x86. 
This patch uses a three-instruction sequence psrlw, psllw, and por
to handle specific vector shuffle operations more efficiently. 
The change aims to improve assembly code generation for configurations 
supporting SSE2. 
This update addresses the issue detailed in Bugzilla report 107563.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

BRs,
Levy

gcc/ChangeLog:

* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
subroutine.
(ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr107563.C: New test.
---
 gcc/config/i386/i386-expand.cc   | 64 
 gcc/testsuite/g++.target/i386/pr107563.C | 23 +
 2 files changed, 87 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..2718b0acb87 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct 
expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles case:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case E_V8QImode:
+  if (!TARGET_MMX_WITH_SSE)
+   return false;
+  mode = V4HImode;
+  gen_shr = gen_ashrv4hi3;
+  gen_shl = gen_ashlv4hi3;
+  gen_or = gen_iorv4hi3;
+  break;
+case E_V16QImode:
+  mode = V8HImode;
+  gen_shr = gen_vlshrv8hi3;
+  gen_shl = gen_vashlv8hi3;
+  gen_or = gen_iorv8hi3;
+  break;
+default: return false;
+}
+
+  if (!rtx_equal_p (d->op0, d->op1))
+return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together.  */
@@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
 
   if (expand_vec_perm_2perm_pblendv (d, false))
 return true;
+
+  if (expand_vec_perm_psrlw_psllw_por (d))
+return true;
 
   /* Try sequences of four instructions.  */
 
diff --git a/gcc/testsuite/g++.target/i386/pr107563.C 
b/gcc/testsuite/g++.target/i386/pr107563.C
new file mode 100755
index 000..5b0c648e8f1
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563.C
@@ -0,0 +1,23 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-not "movzbl" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "orq" } } */
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
+void foo (temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+}
+
+using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
+void foo2 (temp_vec_type2& v) noexcept
+{
+  v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
+}
-- 
2.31.1



[PATCH 1/1] Emit cvtne2ps2bf16 for odd increasing perm in __builtin_shufflevector

2024-05-07 Thread Levy Hsu
Hi All

This patch updates the GCC x86 backend to efficiently handle
odd, incrementally increasing permutations of BF16 vectors
using the cvtne2ps2bf16 instruction.
It modifies ix86_vectorize_vec_perm_const to support these operations
and adds a specific predicate to ensure proper sequence handling.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

BRs,
Levy

gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_vectorize_vec_perm_const): Convert BF to HI using subreg.
* config/i386/predicates.md
(vcvtne2ps2bf_parallel): New predicate.
* config/i386/sse.md
(vpermt2_sepcial_bf16_shuffle_<mode>): New define_insn_and_split.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vpermt2-special-bf16-shufflue.c: New test.
---
 gcc/config/i386/i386-expand.cc|  4 +--
 gcc/config/i386/predicates.md | 12 +++
 gcc/config/i386/sse.md| 35 +++
 .../i386/vpermt2-special-bf16-shufflue.c  | 27 ++
 4 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100755 
gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..e2e1e93f2bb 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23894,8 +23894,8 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, 
machine_mode op_mode,
   if (GET_MODE_SIZE (vmode) == 64 && !TARGET_EVEX512)
 return false;
 
-  /* For HF mode vector, convert it to HI using subreg.  */
-  if (GET_MODE_INNER (vmode) == HFmode)
+  /* For HF and BF mode vector, convert it to HI using subreg.  */
+  if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
 {
   machine_mode orig_mode = vmode;
   vmode = mode_for_vector (HImode,
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 2a97776fc32..9813739daf7 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -2317,3 +2317,15 @@
 
   return true;
 })
+
+;; Check that each element is odd and incrementally increasing from 1
+(define_predicate "vcvtne2ps2bf_parallel"
+  (and (match_code "const_vector")
+   (match_code "const_int" "a"))
+{
+  for (int i = 0; i < XVECLEN (op, 0); ++i)
+if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1))
+  return false;
+  return true;
+})
+
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f57f36ae380..39b52cd00ca 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -31110,3 +31110,38 @@
   "TARGET_AVXVNNIINT16"
   "vpdp\t{%3, %2, %0|%0, %2, %3}"
[(set_attr "prefix" "vex")])
+
+(define_mode_attr hi_cvt_bf
+  [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")])
+
+(define_mode_attr HI_CVT_BF
+  [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")])
+
+(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_"
+  [(set (match_operand:VI2_AVX512F 0 "register_operand")
+   (unspec:VI2_AVX512F
+ [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel")
+  (match_operand:VI2_AVX512F 2 "register_operand")
+  (match_operand:VI2_AVX512F 3 "nonimmediate_operand")]
+ UNSPEC_VPERMT2))]
+  "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op0 = gen_reg_rtx (mode);
+  operands[2] = lowpart_subreg (mode,
+   force_reg (mode, operands[2]),
+   mode);
+  operands[3] = lowpart_subreg (mode,
+   force_reg (mode, operands[3]),
+   mode);
+
+  emit_insn (gen_avx512f_cvtne2ps2bf16_(op0,
+  operands[3],
+  operands[2]));
+  emit_move_insn (operands[0], lowpart_subreg (mode, op0,
+  mode));
+DONE;
+}
+[(set_attr "mode" "")])
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c 
b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
new file mode 100755
index 000..5d36c03442b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */
+/* { dg-final { scan-assembler-not "vpermi2b" } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */
+
+typedef __bf16 v8bf __attribute__((vector_size(16)));
+typedef __bf16 v16bf __attribute__((vector_size(32)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+
+v8bf foo0(v

[PATCH 1/1] [PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]

2024-05-09 Thread Levy Hsu
Hi All

We've introduced a new subroutine in ix86_expand_vec_perm_const_1
to optimize vector shifting for the V16QI type on x86.
This patch uses a three-instruction sequence psrlw, psllw, and por
to handle specific vector shuffle operations more efficiently.
The change aims to improve assembly code generation for configurations
supporting SSE2.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

Best
Levy

gcc/ChangeLog:

PR target/107563
* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
subroutine.
(ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

PR target/107563
* g++.target/i386/pr107563-a.C: New test.
* g++.target/i386/pr107563-b.C: New test.
---
 gcc/config/i386/i386-expand.cc | 64 ++
 gcc/testsuite/g++.target/i386/pr107563-a.C | 13 +
 gcc/testsuite/g++.target/i386/pr107563-b.C | 12 
 3 files changed, 89 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563-a.C
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563-b.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..5098d2886bb 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct 
expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles case:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case E_V8QImode:
+  if (!TARGET_MMX_WITH_SSE)
+   return false;
+  mode = V4HImode;
+  gen_shr = gen_ashrv4hi3;
+  gen_shl = gen_ashlv4hi3;
+  gen_or = gen_iorv4hi3;
+  break;
+case E_V16QImode:
+  mode = V8HImode;
+  gen_shr = gen_vlshrv8hi3;
+  gen_shl = gen_vashlv8hi3;
+  gen_or = gen_iorv8hi3;
+  break;
+default: return false;
+}
+
+  if (!rtx_equal_p (d->op0, d->op1))
+return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together.  */
@@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_2perm_pblendv (d, false))
 return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C 
b/gcc/testsuite/g++.target/i386/pr107563-a.C
new file mode 100755
index 000..605c1bdf814
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -0,0 +1,13 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
+
+void foo2(temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
+}
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C 
b/gcc/testsuite/g++.target/i386/pr107563-b.C
new file mode 100755
index 000..0ce3e8263bb
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -0,0 +1,12 @@
+/* PR target/107563.C */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__(16)]] = char;
+
+void foo(temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 
12, 15, 14);
+}
-- 
2.31.1



[PATCH] i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt

2024-09-02 Thread Levy Hsu
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

This patch introduces a new mode iterator and expanders for the i386 architecture
to support partial vectorization of bf16 operations using AVX10.2 instructions. 
These operations include addition, subtraction, multiplication, division, and 
square root calculations for V2BF and V4BF data types.
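
The expanders work by moving the 32/64-bit operand into the low part of a
V8BF register, performing the full 128-bit operation there, and taking the
low subreg of the result back.  A minimal usage sketch (my example, not
taken from the patch), assuming -mavx10.2 -O2 as in the added tests:

typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));

/* Expected to go through the new <insn><mode>3 expander and end up as a
   single vaddnepbf16 on an xmm register.  */
v4bf
add_v4bf (v4bf a, v4bf b)
{
  return a + b;
}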

gcc/ChangeLog:

* config/i386/mmx.md (VBF_32_64): New mode iterator for partial 
vectorized V2BF/V4BF.
(<insn><mode>3): New define_expand for plusminusmultdiv.
(sqrt<mode>2): New define_expand for sqrt.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c: New test.
* gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c: New test.
---
 gcc/config/i386/mmx.md| 37 
 .../avx10_2-partial-bf-vector-fast-math-1.c   | 22 +++
 .../avx10_2-partial-bf-vector-operations-1.c  | 57 +++
 3 files changed, 116 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
 create mode 100644 
gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index e0065ed4d48..9116ddb5321 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -94,6 +94,8 @@
 
 (define_mode_iterator VHF_32_64 [V2HF (V4HF "TARGET_MMX_WITH_SSE")])
 
+(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
+
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
@@ -2036,6 +2038,26 @@
   DONE;
 })
 
+;; VDIVNEPBF16 does not generate floating point exceptions.
+(define_expand "<insn><mode>3"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+    (plusminusmultdiv:VBF_32_64
+      (match_operand:VBF_32_64 1 "nonimmediate_operand")
+      (match_operand:VBF_32_64 2 "nonimmediate_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[2]), <MODE>mode);
+
+  emit_insn (gen_<insn>v8bf3 (op0, op1, op2));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "divv2hf3"
   [(set (match_operand:V2HF 0 "register_operand")
(div:V2HF
@@ -2091,6 +2113,21 @@
   DONE;
 })
 
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+    (sqrt:VBF_32_64 (match_operand:VBF_32_64 1 "vector_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[1]), <MODE>mode);
+
+  emit_insn (gen_sqrtv8bf2 (op0, op1));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "<code><mode>2"
   [(set (match_operand:VHF_32_64 0 "register_operand")
	(absneg:VHF_32_64
diff --git 
a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c 
b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
new file mode 100644
index 000..fd064f17445
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vrcppbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+
+typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
+typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
+
+
+__attribute__((optimize("fast-math")))
+v4bf
+foo_div_fast_math_4 (v4bf a, v4bf b)
+{
+  return a / b;
+}
+
+__attribute__((optimize("fast-math")))
+v2bf
+foo_div_fast_math_2 (v2bf a, v2bf b)
+{
+  return a / b;
+}
diff --git 
a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c 
b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
new file mode 100644
index 000..e7ee08a20a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
@@ -0,0 +1,57 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vaddnepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vdivnepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vsubnepbf16\[ 
\\t\]+\[^\{

[PATCH] i386: Support partial vectorized V2BF/V4BF smaxmin

2024-09-02 Thread Levy Hsu
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

This patch supports smaxmin for partial vectorized V2BF/V4BF.
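
Since a floating-point ternary like a > b ? a : b is in general only turned
into MAX_EXPR/MIN_EXPR when NaNs and signed zeros can be ignored, the new
test uses -Ofast.  A minimal sketch (my example, mirroring the test) of code
that is expected to map to vmaxpbf16 with -Ofast -mavx10.2:

void
max4 (__bf16 *restrict d, __bf16 *restrict a, __bf16 *restrict b)
{
  for (int i = 0; i < 4; i++)
    d[i] = a[i] > b[i] ? a[i] : b[i];
}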

gcc/ChangeLog:

* config/i386/mmx.md (<code><mode>3): New define_expand for
V2BF/V4BF smaxmin.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-partial-bf-vector-smaxmin-1.c: New test.
---
 gcc/config/i386/mmx.md| 19 ++
 .../avx10_2-partial-bf-vector-smaxmin-1.c | 36 +++
 2 files changed, 55 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-smaxmin-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 9116ddb5321..3f12a1349ab 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2098,6 +2098,25 @@
   DONE;
 })
 
+(define_expand "<code><mode>3"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+    (smaxmin:VBF_32_64
+      (match_operand:VBF_32_64 1 "nonimmediate_operand")
+      (match_operand:VBF_32_64 2 "nonimmediate_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[2]), <MODE>mode);
+
+  emit_insn (gen_<code>v8bf3 (op0, op1, op2));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "sqrt<mode>2"
   [(set (match_operand:VHF_32_64 0 "register_operand")
	(sqrt:VHF_32_64
diff --git 
a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-smaxmin-1.c 
b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-smaxmin-1.c
new file mode 100644
index 000..0a7cc58e29d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-smaxmin-1.c
@@ -0,0 +1,36 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -Ofast" } */
+/* { dg-final { scan-assembler-times "vmaxpbf16" 2 } } */
+/* { dg-final { scan-assembler-times "vminpbf16" 2 } } */
+
+void
+maxpbf16_64 (__bf16* restrict dest, __bf16* restrict src1, __bf16* restrict 
src2)
+{
+  int i;
+  for (i = 0; i < 4; i++)
+dest[i] = src1[i] > src2[i] ? src1[i] : src2[i];
+}
+
+void
+maxpbf16_32 (__bf16* restrict dest, __bf16* restrict src1, __bf16* restrict 
src2)
+{
+  int i;
+  for (i = 0; i < 2; i++)
+dest[i] = src1[i] > src2[i] ? src1[i] : src2[i];
+}
+
+void
+minpbf16_64 (__bf16* restrict dest, __bf16* restrict src1, __bf16* restrict 
src2)
+{
+  int i;
+  for (i = 0; i < 4; i++)
+dest[i] = src1[i] < src2[i] ? src1[i] : src2[i];
+}
+
+void
+minpbf16_32 (__bf16* restrict dest, __bf16* restrict src1, __bf16* restrict 
src2)
+{
+  int i;
+  for (i = 0; i < 2; i++)
+dest[i] = src1[i] < src2[i] ? src1[i] : src2[i];
+}
-- 
2.31.1



[PATCH] i386: Integrate BFmode for Enhanced Vectorization in ix86_preferred_simd_mode

2024-09-03 Thread Levy Hsu
Hi

This change adds BFmode support to the ix86_preferred_simd_mode function,
enabling SIMD vectorization for BF16 operations.  The hook now returns
V32BF, V16BF, or V8BF depending on the enabled ISA and the preferred
vector width, aligning the chosen vector size with processor capabilities.
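
As an illustration (my example, not from the patch), with this hook change a
simple bf16 loop such as the one below becomes a candidate for
auto-vectorization, provided the corresponding AVX10.2 bf16 vector patterns
are available; the vector width then follows -march/-mprefer-vector-width:

void
bf16_add (__bf16 *restrict dst, const __bf16 *restrict a,
          const __bf16 *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = a[i] + b[i];
}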

Bootstrapped and tested on x86-64-pc-linux-gnu. 
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_preferred_simd_mode): Add BFmode Support.
---
 gcc/config/i386/i386.cc | 8 
 1 file changed, 8 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 7af9ceca429..aea138c85ad 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24570,6 +24570,14 @@ ix86_preferred_simd_mode (scalar_mode mode)
}
   return word_mode;
 
+case E_BFmode:
+  if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+   return V32BFmode;
+  else if (TARGET_AVX && !TARGET_PREFER_AVX128)
+   return V16BFmode;
+  else
+   return V8BFmode;
+
 case E_SFmode:
   if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
return V16SFmode;
-- 
2.31.1



[PATCH] i386: Support partial signbit/xorsign/copysign/abs/neg/and/xor/ior/andn for V2BF/V4BF

2024-09-03 Thread Levy Hsu
Hi

This patch adds support for bf16 operations in V2BF and V4BF modes on i386,
handling signbit, xorsign, copysign, abs, neg, and various logical operations.
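
For reference (not part of the patch), these are all sign-bit mask
operations: bf16 negation flips bit 15 and abs clears it, which is why
ix86_build_const_vector / ix86_build_signbit_mask only need the new
V2BF/V4BF cases.  A scalar bit-level sketch:

#include <stdint.h>

/* Illustrative only: operate on the raw bf16 bit pattern.  */
static inline uint16_t bf16_neg_bits (uint16_t x) { return x ^ 0x8000; }
static inline uint16_t bf16_abs_bits (uint16_t x) { return x & 0x7fff; }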

Bootstrapped and tested on x86-64-pc-linux-gnu. 
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_build_const_vector): Add V2BF/V4BF.
(ix86_build_signbit_mask): Add V2BF/V4BF.
* config/i386/mmx.md: Modified supported logic op to use VHBF_32_64.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-absnegbf.c: New test.
---
 gcc/config/i386/i386.cc   |  4 +
 gcc/config/i386/mmx.md| 74 +
 .../gcc.target/i386/part-vect-absnegbf.c  | 81 +++
 3 files changed, 124 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 78bf890f14b..2bbfb1bf5fc 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -16176,6 +16176,8 @@ ix86_build_const_vector (machine_mode mode, bool vect, 
rtx value)
 case E_V32BFmode:
 case E_V16BFmode:
 case E_V8BFmode:
+case E_V4BFmode:
+case E_V2BFmode:
   n_elt = GET_MODE_NUNITS (mode);
   v = rtvec_alloc (n_elt);
   scalar_mode = GET_MODE_INNER (mode);
@@ -16215,6 +16217,8 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, 
bool invert)
 case E_V32BFmode:
 case E_V16BFmode:
 case E_V8BFmode:
+case E_V4BFmode:
+case E_V2BFmode:
   vec_mode = mode;
   imode = HImode;
   break;
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index cb2697537a8..44adcd8d8e0 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -121,7 +121,7 @@
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr mmxintvecmode
   [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")
-   (V4HF "V4HI") (V2HF "V2HI")])
+   (V4HF "V4HI") (V2HF "V2HI") (V4BF "V4HI") (V2BF "V2HI")])
 
 (define_mode_attr mmxintvecmodelower
   [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")
@@ -2091,18 +2091,22 @@
   DONE;
 })
 
+(define_mode_iterator VHBF_32_64
+ [V2BF (V4BF "TARGET_MMX_WITH_SSE")
+  V2HF (V4HF "TARGET_MMX_WITH_SSE")]) 
+
 (define_expand "<code><mode>2"
-  [(set (match_operand:VHF_32_64 0 "register_operand")
-	(absneg:VHF_32_64
-	  (match_operand:VHF_32_64 1 "register_operand")))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand")
+	(absneg:VHBF_32_64
+	  (match_operand:VHBF_32_64 1 "register_operand")))]
   "TARGET_SSE"
   "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
 
 (define_insn_and_split "*mmx_<code><mode>2"
-  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
-   (absneg:VHF_32_64
- (match_operand:VHF_32_64 1 "register_operand" "0,x,x")))
-   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x")
+   (absneg:VHBF_32_64
+ (match_operand:VHBF_32_64 1 "register_operand" "0,x,x")))
+   (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))]
   "TARGET_SSE"
   "#"
   "&& reload_completed"
@@ -2115,11 +2119,11 @@
   [(set_attr "isa" "noavx,noavx,avx")])
 
 (define_insn_and_split "*mmx_nabs<mode>2"
-  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
-   (neg:VHF_32_64
- (abs:VHF_32_64
-   (match_operand:VHF_32_64 1 "register_operand" "0,x,x"
-   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x")
+   (neg:VHBF_32_64
+ (abs:VHBF_32_64
+   (match_operand:VHBF_32_64 1 "register_operand" "0,x,x"
+   (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))]
   "TARGET_SSE"
   "#"
   "&& reload_completed"
@@ -2410,11 +2414,11 @@
 ;
 
 (define_insn "*mmx_andnot<mode>3"
-  [(set (match_operand:VHF_32_64 0 "register_operand""=x,x")
-   (and:VHF_32_64
- (not:VHF_32_64
-   (match_operand:VHF_32_64 1 "register_operand" "0,x"))
- (match_operand:VHF_32_64 2 "register_operand"   "x,x")))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand""=x,x")
+   (and:VHBF_32_64
+ (not:VHBF_32_64
+   (match_operand:VHBF_32_64 1 "register_operand" "0,x"))
+ (match_operand:VHBF_32_64 2 "register_operand"   "x,x")))]
   "TARGET_SSE"
   "@
andnps\t{%2, %0|%0, %2}
@@ -2425,10 +2429,10 @@
(set_attr "mode" "V4SF")])
 
 (define_insn "<code><mode>3"
-  [(set (match_operand:VHF_32_64 0 "register_operand"   "=x,x")
-   (any_logic:VHF_32_64
- (match_operand:VHF_32_64 1 "register_operand" "%0,x")
- (match_operand:VHF_32_64 2 "register_operand" " x,x")))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand"   "=x,x")
+   (any_logic:VHBF_32_64
+ (match_operand:VHBF_32_64 1 "register_operand" "%0,x")
+   

[PATCH] i386: Support partial vectorized FMA for V2BF/V4BF

2024-09-03 Thread Levy Hsu
Hi

Bootstrapped and tested on x86-64-pc-linux-gnu. 
Ok for trunk?

This patch introduces support for vectorized FMA operations for bf16 types in
V2BF and V4BF modes on the i386 architecture. New mode iterators and
define_expand entries for fma, fnma, fms, and fnms operations are added in
mmx.md, enhancing the i386 backend to handle these complex arithmetic 
operations.
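
As an illustration (my example, assuming -mavx10.2 -O2 as in the new test),
a contracted multiply-add on a bf16 vector is expected to go through the new
fma<mode>4 expander and use a vfmadd...nepbf16 instruction:

typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));

v4bf
fma_v4bf (v4bf a, v4bf b, v4bf c)
{
  return a * b + c;  /* contracted to a single FMA under the default
                        -ffp-contract setting */
}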

gcc/ChangeLog:

* config/i386/mmx.md (VBF_32_64): Move the mode iterator ahead of the
new expanders.
(fma<mode>4): New define_expand for V2BF/V4BF fma.
(fnma<mode>4): New define_expand for V2BF/V4BF fnma.
(fms<mode>4): New define_expand for V2BF/V4BF fms.
(fnms<mode>4): New define_expand for V2BF/V4BF fnms.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c: New test.
---
 gcc/config/i386/mmx.md| 84 ++-
 .../i386/avx10_2-partial-bf-vector-fma-1.c| 57 +
 2 files changed, 139 insertions(+), 2 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 10fcd2beda6..22aeb43f436 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2636,6 +2636,88 @@
   DONE;
 })
 
+(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
+
+(define_expand "fma<mode>4"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+	(fma:VBF_32_64
+	  (match_operand:VBF_32_64 1 "nonimmediate_operand")
+	  (match_operand:VBF_32_64 2 "nonimmediate_operand")
+	  (match_operand:VBF_32_64 3 "nonimmediate_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[2]), <MODE>mode);
+  rtx op3 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[3]), <MODE>mode);
+
+  emit_insn (gen_fmav8bf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
+(define_expand "fms<mode>4"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+	(fma:VBF_32_64
+	  (match_operand:VBF_32_64   1 "nonimmediate_operand")
+	  (match_operand:VBF_32_64   2 "nonimmediate_operand")
+	  (neg:VBF_32_64
+	    (match_operand:VBF_32_64 3 "nonimmediate_operand"))))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[2]), <MODE>mode);
+  rtx op3 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[3]), <MODE>mode);
+
+  emit_insn (gen_fmsv8bf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
+(define_expand "fnma<mode>4"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+	(fma:VBF_32_64
+	  (neg:VBF_32_64
+	    (match_operand:VBF_32_64 1 "nonimmediate_operand"))
+	  (match_operand:VBF_32_64   2 "nonimmediate_operand")
+	  (match_operand:VBF_32_64   3 "nonimmediate_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[2]), <MODE>mode);
+  rtx op3 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[3]), <MODE>mode);
+
+  emit_insn (gen_fnmav8bf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
+(define_expand "fnms<mode>4"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+	(fma:VBF_32_64
+	  (neg:VBF_32_64
+	    (match_operand:VBF_32_64 1 "nonimmediate_operand"))
+	  (match_operand:VBF_32_64   2 "nonimmediate_operand")
+	  (neg:VBF_32_64
+	    (match_operand:VBF_32_64 3 "nonimmediate_operand"))))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[2]), <MODE>mode);
+  rtx op3 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[3]), <MODE>mode);
+
+  emit_insn (gen_fnmsv8bf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 
 ;;
 ;; Parallel half-precision floating point complex type operations
@@ -6670,8 +6752,6 @@
(set_attr "modrm" "0")
(set_attr "memory" "none")])
 
-(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
-
 ;; VDIVNEPBF16 does not generate floating point exceptions.
 (define_expand "3"
   [(set (match_operand:VBF_32_64 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c 
b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c
new file mode 100644
index 000..72e17e99603
--- /dev/null
+++ b/

[PATCH] x86: Refine V4BF/V2BF FMA testcase

2024-09-05 Thread Levy Hsu
Simple testcase fix, ok for trunk?

This patch removes the specific register checks to account for possible
register spills and disables the checks in 32-bit mode. This adjustment
is necessary because V4BF operations in 32-bit mode require duplicated
instructions, which leads to unintended test failures. It fixes the
failures seen when testing with
--target_board='unix{-m32\ -march=cascadelake}'.
gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c: Remove specific
register checks to account for potential register spills. Exclude tests
in 32-bit mode to prevent incorrect failure reports due to the need for
multiple instruction executions in handling V4BF operations.
---
 .../gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c 
b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c
index 72e17e99603..17c32c1d36b 100644
--- a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c
@@ -1,9 +1,9 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx10.2 -O2" } */
-/* { dg-final { scan-assembler-times "vfmadd132nepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfmsub132nepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132nepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vfmadd132nepbf16\[^\n\r\]*xmm\[0-9\]" 2 
{ target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfmsub132nepbf16\[^\n\r\]*xmm\[0-9\]" 2 
{ target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfnmadd132nepbf16\[^\n\r\]*xmm\[0-9\]" 2 
{ target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[^\n\r\]*xmm\[0-9\]" 2 
{ target { ! ia32 } } } } */
 
 typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
 typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
-- 
2.31.1



[PATCH] i386: Enable V2BF/V4BF vec_cmp with AVX10.2 vcmppbf16

2024-09-09 Thread Levy Hsu
gcc/ChangeLog:

* config/i386/i386.cc (ix86_get_mask_mode):
Enable BFmode for targetm.vectorize.get_mask_mode with AVX10.2.
* config/i386/mmx.md (vec_cmp<mode>qi):
Implement vec_cmpv2bfqi and vec_cmpv4bfqi.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-vec_cmpbf.c: New test.
---
 gcc/config/i386/i386.cc   |  3 ++-
 gcc/config/i386/mmx.md| 17 
 .../gcc.target/i386/part-vect-vec_cmpbf.c | 26 +++
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vec_cmpbf.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 45320124b91..82267552474 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24682,7 +24682,8 @@ ix86_get_mask_mode (machine_mode data_mode)
   /* AVX512FP16 only supports vector comparison
 to kmask for _Float16.  */
   || (TARGET_AVX512VL && TARGET_AVX512FP16
- && GET_MODE_INNER (data_mode) == E_HFmode))
+ && GET_MODE_INNER (data_mode) == E_HFmode)
+  || TARGET_AVX10_2_256 && GET_MODE_INNER (data_mode) == E_BFmode)
 {
   if (elem_size == 4
  || elem_size == 8
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 4bc191b874b..95d9356694a 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2290,6 +2290,23 @@
   DONE;
 })
 
+;; This instruction does not generate floating point exceptions.
+(define_expand "vec_cmp<mode>qi"
+  [(set (match_operand:QI 0 "register_operand")
+	(match_operator:QI 1 ""
+	  [(match_operand:VBF_32_64 2 "register_operand")
+	   (match_operand:VBF_32_64 3 "nonimmediate_operand")]))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op2 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[2]), <MODE>mode);
+  rtx op3 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[3]), <MODE>mode);
+
+  emit_insn (gen_vec_cmpv8bfqi (operands[0], operands[1], op2, op3));
+  DONE;
+})
+
 ;
 ;;
 ;; Parallel half-precision floating point rounding operations.
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-vec_cmpbf.c 
b/gcc/testsuite/gcc.target/i386/part-vect-vec_cmpbf.c
new file mode 100644
index 000..0bb720b6432
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-vec_cmpbf.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx10.2" } */
+/* { dg-final { scan-assembler-times "vcmppbf16" 10 } } */
+
+typedef __bf16 __attribute__((__vector_size__ (4))) v2bf;
+typedef __bf16 __attribute__((__vector_size__ (8))) v4bf;
+
+
+#define VCMPMN(type, op, name) \
+type  \
+__attribute__ ((noinline, noclone)) \
+vec_cmp_##type##type##name (type a, type b) \
+{ \
+  return a op b;  \
+}
+
+VCMPMN (v4bf, <, lt)
+VCMPMN (v2bf, <, lt)
+VCMPMN (v4bf, <=, le)
+VCMPMN (v2bf, <=, le)
+VCMPMN (v4bf, >, gt)
+VCMPMN (v2bf, >, gt)
+VCMPMN (v4bf, >=, ge)
+VCMPMN (v2bf, >=, ge)
+VCMPMN (v4bf, ==, eq)
+VCMPMN (v2bf, ==, eq)
-- 
2.31.1



[PATCH] x86: Refine V4BF/V2BF FMA Testcase

2024-09-10 Thread Levy Hsu
Simple testcase fix, ok for trunk?

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c: Separate the
32-bit scans and remove the register checks to allow for spills.
---
 .../i386/avx10_2-partial-bf-vector-fma-1.c   | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c 
b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c
index 72e17e99603..8a9096a300a 100644
--- a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fma-1.c
@@ -1,9 +1,13 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx10.2 -O2" } */
-/* { dg-final { scan-assembler-times "vfmadd132nepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfmsub132nepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132nepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vfmadd132nepbf16\[^\n\r\]*xmm\[0-9\]" 3 
{ target ia32 } } } */
+/* { dg-final { scan-assembler-times "vfmsub132nepbf16\[^\n\r\]*xmm\[0-9\]" 3 
{ target ia32 } } } */
+/* { dg-final { scan-assembler-times "vfnmadd132nepbf16\[^\n\r\]*xmm\[0-9\]" 3 
{ target ia32 } } } */
+/* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[^\n\r\]*xmm\[0-9\]" 3 
{ target ia32 } } } */
+/* { dg-final { scan-assembler-times "vfmadd132nepbf16\[^\n\r\]*xmm\[0-9\]" 2 
{ target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfmsub132nepbf16\[^\n\r\]*xmm\[0-9\]" 2 
{ target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfnmadd132nepbf16\[^\n\r\]*xmm\[0-9\]" 2 
{ target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[^\n\r\]*xmm\[0-9\]" 2 
{ target { ! ia32 } } } } */
 
 typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
 typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
-- 
2.31.1



[PATCH v2] Enable V2BF/V4BF vec_cmp with AVX10.2 vcmppbf16

2024-09-11 Thread Levy Hsu
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_get_mask_mode):
Enable BFmode for targetm.vectorize.get_mask_mode with AVX10.2.
* config/i386/mmx.md (vec_cmp<mode>qi):
Implement vec_cmpv2bfqi and vec_cmpv4bfqi.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-vec_cmpbf.c: New test.
---
 gcc/config/i386/i386.cc   |  3 ++-
 gcc/config/i386/mmx.md| 17 
 .../gcc.target/i386/part-vect-vec_cmpbf.c | 26 +++
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vec_cmpbf.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 45320124b91..7dbae1d72e3 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24682,7 +24682,8 @@ ix86_get_mask_mode (machine_mode data_mode)
   /* AVX512FP16 only supports vector comparison
 to kmask for _Float16.  */
   || (TARGET_AVX512VL && TARGET_AVX512FP16
- && GET_MODE_INNER (data_mode) == E_HFmode))
+ && GET_MODE_INNER (data_mode) == E_HFmode)
+  || (TARGET_AVX10_2_256 && GET_MODE_INNER (data_mode) == E_BFmode))
 {
   if (elem_size == 4
  || elem_size == 8
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 4bc191b874b..95d9356694a 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2290,6 +2290,23 @@
   DONE;
 })
 
+;; This instruction does not generate floating point exceptions.
+(define_expand "vec_cmp<mode>qi"
+  [(set (match_operand:QI 0 "register_operand")
+	(match_operator:QI 1 ""
+	  [(match_operand:VBF_32_64 2 "register_operand")
+	   (match_operand:VBF_32_64 3 "nonimmediate_operand")]))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op2 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[2]), <MODE>mode);
+  rtx op3 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[3]), <MODE>mode);
+
+  emit_insn (gen_vec_cmpv8bfqi (operands[0], operands[1], op2, op3));
+  DONE;
+})
+
 ;
 ;;
 ;; Parallel half-precision floating point rounding operations.
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-vec_cmpbf.c 
b/gcc/testsuite/gcc.target/i386/part-vect-vec_cmpbf.c
new file mode 100644
index 000..0bb720b6432
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-vec_cmpbf.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx10.2" } */
+/* { dg-final { scan-assembler-times "vcmppbf16" 10 } } */
+
+typedef __bf16 __attribute__((__vector_size__ (4))) v2bf;
+typedef __bf16 __attribute__((__vector_size__ (8))) v4bf;
+
+
+#define VCMPMN(type, op, name) \
+type  \
+__attribute__ ((noinline, noclone)) \
+vec_cmp_##type##type##name (type a, type b) \
+{ \
+  return a op b;  \
+}
+
+VCMPMN (v4bf, <, lt)
+VCMPMN (v2bf, <, lt)
+VCMPMN (v4bf, <=, le)
+VCMPMN (v2bf, <=, le)
+VCMPMN (v4bf, >, gt)
+VCMPMN (v2bf, >, gt)
+VCMPMN (v4bf, >=, ge)
+VCMPMN (v2bf, >=, ge)
+VCMPMN (v4bf, ==, eq)
+VCMPMN (v2bf, ==, eq)
-- 
2.31.1



[PATCH] x86: Implement Fast-Math Float Truncation to BF16 via PSRLD Instruction

2024-10-08 Thread Levy Hsu
Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?
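
For context (an editorial note, not part of the original cover letter): with
-ffast-math, when NaNs need not be honored, truncating SFmode to BFmode is
simply "keep the upper 16 bits of the float", so a plain psrld/vpsrld by 16
suffices and no AVX512BF16/AVXNECONVERT convert instruction is required.
A scalar sketch of the bit manipulation the new run test verifies:

#include <stdint.h>
#include <string.h>

/* Illustrative: drop the low 16 mantissa bits (no rounding, NaNs ignored),
   matching what psrld $16 does per element.  */
static uint16_t
float_to_bf16_bits (float f)
{
  uint32_t u;
  memcpy (&u, &f, sizeof u);
  return (uint16_t) (u >> 16);
}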

gcc/ChangeLog:

* config/i386/i386.md: Rewrite insn truncsfbf2.

gcc/testsuite/ChangeLog:

* gcc.target/i386/truncsfbf-1.c: New test.
* gcc.target/i386/truncsfbf-2.c: New test.
---
 gcc/config/i386/i386.md | 16 ++---
 gcc/testsuite/gcc.target/i386/truncsfbf-1.c |  9 +++
 gcc/testsuite/gcc.target/i386/truncsfbf-2.c | 65 +
 3 files changed, 83 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/truncsfbf-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/truncsfbf-2.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9c2a0aa6112..d3fee0968d8 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5672,16 +5672,18 @@
(set_attr "mode" "HF")])
 
 (define_insn "truncsfbf2"
-  [(set (match_operand:BF 0 "register_operand" "=x, v")
+  [(set (match_operand:BF 0 "register_operand" "=x,x,v,Yv")
(float_truncate:BF
- (match_operand:SF 1 "register_operand" "x,v")))]
-  "((TARGET_AVX512BF16 && TARGET_AVX512VL) || TARGET_AVXNECONVERT)
-   && !HONOR_NANS (BFmode) && flag_unsafe_math_optimizations"
+ (match_operand:SF 1 "register_operand" "0,x,v,Yv")))]
+  "TARGET_SSE2 && flag_unsafe_math_optimizations && !HONOR_NANS (BFmode)"
   "@
+  psrld\t{$16, %0|%0, 16}
   %{vex%} vcvtneps2bf16\t{%1, %0|%0, %1}
-  vcvtneps2bf16\t{%1, %0|%0, %1}"
-  [(set_attr "isa" "avxneconvert,avx512bf16vl")
-   (set_attr "prefix" "vex,evex")])
+  vcvtneps2bf16\t{%1, %0|%0, %1}
+  vpsrld\t{$16, %1, %0|%0, %1, 16}"
+  [(set_attr "isa" "noavx,avxneconvert,avx512bf16vl,avx")
+   (set_attr "prefix" "orig,vex,evex,vex")
+   (set_attr "type" "sseishft1,ssecvt,ssecvt,sseishft1")])
 
 ;; Signed conversion to DImode.
 
diff --git a/gcc/testsuite/gcc.target/i386/truncsfbf-1.c 
b/gcc/testsuite/gcc.target/i386/truncsfbf-1.c
new file mode 100644
index 000..dd3ff8a50b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/truncsfbf-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -ffast-math" } */
+/* { dg-final { scan-assembler-times "psrld" 1 } } */
+
+__bf16
+foo (float a)
+{
+  return a;
+}
diff --git a/gcc/testsuite/gcc.target/i386/truncsfbf-2.c 
b/gcc/testsuite/gcc.target/i386/truncsfbf-2.c
new file mode 100644
index 000..f4952f88fc9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/truncsfbf-2.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-msse2 -O2 -ffast-math" } */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <stdlib.h>
+
+__bf16
+foo (float a)
+{
+  return a;
+}
+
+static __bf16
+CALC (float *a)
+{
+  uint32_t bits;
+  memcpy (&bits, a, sizeof (bits));
+  bits >>= 16;
+  uint16_t bfloat16_bits = (uint16_t) bits;
+  __bf16 bf16;
+  memcpy (&bf16, &bfloat16_bits, sizeof (bf16));
+  return bf16;
+}
+
+int
+main (void)
+{
+  float test_values[] = { 0.0f, -0.0f, 1.0f, -1.0f, 0.5f, -0.5f, 1000.0f, 
-1000.0f,
+  3.1415926f, -3.1415926f, 1e-8f, -1e-8f,
+  1.0e+38f, -1.0e+38f, 1.0e-38f, -1.0e-38f };
+  size_t num_values = sizeof (test_values) / sizeof (test_values[0]);
+
+  for (size_t i = 0; i < num_values; ++i)
+{
+  float original = test_values[i];
+  __bf16 hw_bf16 = foo (original);
+  __bf16 sw_bf16 = CALC (&original);
+
+  /* Verify psrld $16, %0 == %0 >> 16 */
+  if (memcmp (&hw_bf16, &sw_bf16, sizeof (__bf16)) != 0)
+abort ();
+
+  /* Reconstruct the float value from the __bf16 bits */
+  uint16_t bf16_bits;
+  memcpy (&bf16_bits, &hw_bf16, sizeof (bf16_bits));
+  uint32_t reconstructed_bits = ((uint32_t) bf16_bits) << 16;
+  float converted;
+  memcpy (&converted, &reconstructed_bits, sizeof (converted));
+
+  float diff = fabsf (original - converted);
+
+  /* Expected Maximum Precision Loss */
+  uint32_t orig_bits;
+  memcpy (&orig_bits, &original, sizeof (orig_bits));
+  int exponent = ((orig_bits >> 23) & 0xFF) - 127;
+  float expected_loss = (exponent == -127)
+? ldexpf (1.0f, -126 - 7)
+: ldexpf (1.0f, exponent - 7);
+  if (diff > expected_loss)
+abort ();
+}
+  return 0;
+}
-- 
2.31.1



[PATCH] x86: Extend AVX512 Vectorization for Popcount in Various Modes

2024-09-23 Thread Levy Hsu
This patch enables vectorization of the popcount operation for V2QI, V4QI,
V8QI, V2HI, V4HI, and V2SI modes.

gcc/ChangeLog:

* config/i386/mmx.md:
(VI1_16_32_64): New mode iterator for 8-byte, 4-byte, and 2-byte QImode vectors.
(popcount<mode>2): New pattern for popcount of V2QI/V4QI/V8QI modes.
(popcount<mode>2): New pattern for popcount of V2HI/V4HI modes.
(popcountv2si2): New pattern for popcount of V2SI mode.

gcc/testsuite/ChangeLog:

* gcc.target/i386/part-vect-popcount-1.c: New test.
---
 gcc/config/i386/mmx.md| 24 +
 .../gcc.target/i386/part-vect-popcount-1.c| 49 +++
 2 files changed, 73 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-popcount-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 4bc191b874b..147ae150bf3 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -70,6 +70,9 @@
 ;; 8-byte and 4-byte HImode vector modes
 (define_mode_iterator VI2_32_64 [(V4HI "TARGET_MMX_WITH_SSE") V2HI])
 
+;; 8-byte, 4-byte and 2-byte QImode vector modes
+(define_mode_iterator VI1_16_32_64 [(V8QI "TARGET_MMX_WITH_SSE") V4QI V2QI])
+
 ;; 4-byte and 2-byte integer vector modes
 (define_mode_iterator VI_16_32 [V4QI V2QI V2HI])
 
@@ -6786,3 +6789,24 @@
   [(set_attr "type" "mmx")
(set_attr "modrm" "0")
(set_attr "memory" "none")])
+
+(define_insn "popcount<mode>2"
+  [(set (match_operand:VI1_16_32_64 0 "register_operand" "=v")
+   (popcount:VI1_16_32_64
+ (match_operand:VI1_16_32_64 1 "register_operand" "v")))]
+  "TARGET_AVX512VL && TARGET_AVX512BITALG"
+  "vpopcntb\t{%1, %0|%0, %1}")
+
+(define_insn "popcount<mode>2"
+  [(set (match_operand:VI2_32_64 0 "register_operand" "=v")
+   (popcount:VI2_32_64
+ (match_operand:VI2_32_64 1 "register_operand" "v")))]
+  "TARGET_AVX512VL && TARGET_AVX512BITALG"
+  "vpopcntw\t{%1, %0|%0, %1}")
+
+(define_insn "popcountv2si2"
+  [(set (match_operand:V2SI 0 "register_operand" "=v")
+   (popcount:V2SI
+ (match_operand:V2SI 1 "register_operand" "v")))]
+  "TARGET_AVX512VPOPCNTDQ && TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+  "vpopcntd\t{%1, %0|%0, %1}")
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-popcount-1.c 
b/gcc/testsuite/gcc.target/i386/part-vect-popcount-1.c
new file mode 100644
index 000..a30f6ec4726
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-popcount-1.c
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512vpopcntdq -mavx512bitalg -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vpopcntd\[^\n\r\]*xmm\[0-9\]" 1 { target 
{ ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[^\n\r\]*xmm\[0-9\]" 3 { target 
ia32 } } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[^\n\r\]*xmm\[0-9\]" 2 { target 
{ ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpopcntb\[^\n\r\]*xmm\[0-9\]" 4 { target 
ia32 } } } */
+/* { dg-final { scan-assembler-times "vpopcntb\[^\n\r\]*xmm\[0-9\]" 3 { target 
{ ! ia32 } } } } */
+
+void
+foo1 (int* a, int* __restrict b)
+{
+  for (int i = 0; i != 2; i++)
+a[i] = __builtin_popcount (b[i]);
+}
+
+void
+foo2 (unsigned short* a, unsigned short* __restrict b)
+{
+  for (int i = 0; i != 4; i++)
+a[i] = __builtin_popcount (b[i]);
+}
+
+void
+foo3 (unsigned short* a, unsigned short* __restrict b)
+{
+  for (int i = 0; i != 2; i++)
+a[i] = __builtin_popcount (b[i]);
+}
+
+void
+foo4 (unsigned char* a, unsigned char* __restrict b)
+{
+  for (int i = 0; i != 8; i++)
+a[i] = __builtin_popcount (b[i]);
+}
+
+void
+foo5 (unsigned char* a, unsigned char* __restrict b)
+{
+  for (int i = 0; i != 4; i++)
+a[i] = __builtin_popcount (b[i]);
+}
+
+void
+foo6 (unsigned char* a, unsigned char* __restrict b)
+{
+  for (int i = 0; i != 2; i++)
+a[i] = __builtin_popcount (b[i]);
+}
-- 
2.31.1



[PATCH] i386: Utilize VCOMSBF16 for BF16 Comparisons with AVX10.2

2024-10-16 Thread Levy Hsu
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m64}.
Ok for trunk?

This patch enables the use of the VCOMSBF16 instruction from AVX10.2 for
efficient BF16 comparisons.
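
Illustrative usage (my example, not from the patch): with -mavx10.2 and
trapping math disabled (the expander asserts !flag_trapping_math), a scalar
bf16 comparison like the one below can be emitted as a single vcomsbf16
instead of first extending both operands to SFmode:

int
bf16_less (__bf16 a, __bf16 b)
{
  return a < b;
}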

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_branch): Handle BFmode
when TARGET_AVX10_2_256 is enabled.
(ix86_prepare_fp_compare_args):
Renamed SSE_FLOAT_MODE_SSEMATH_OR_HF_P to 
SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P.
(ix86_expand_fp_compare): For BFmode with IX86_FPCMP_COMI, use cmpibf.
(ix86_expand_fp_movcc):
Renamed SSE_FLOAT_MODE_SSEMATH_OR_HF_P to 
SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P.
* config/i386/i386.cc (ix86_multiplication_cost): Ditto.
(ix86_division_cost): Ditto.
(ix86_rtx_costs): Ditto.
(ix86_vector_costs::add_stmt_cost): Ditto.
* config/i386/i386.h (SSE_FLOAT_MODE_SSEMATH_OR_HF_P):  Ditto.
(SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P): Add BFmode.
* config/i386/i386.md (*cmpibf): New insn for cmpibf.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-comibf-1.c: New test.
* gcc.target/i386/avx10_2-comibf-2.c: New test.
---
 gcc/config/i386/i386-expand.cc|  22 ++--
 gcc/config/i386/i386.cc   |  22 ++--
 gcc/config/i386/i386.h|   7 +-
 gcc/config/i386/i386.md   |  27 +++-
 .../gcc.target/i386/avx10_2-comibf-1.c|  40 ++
 .../gcc.target/i386/avx10_2-comibf-2.c| 115 ++
 6 files changed, 208 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-comibf-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-comibf-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 63f5e348d64..ce413fa0eba 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2530,6 +2530,10 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx 
op1, rtx label)
   emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
   return;
 
+case E_BFmode:
+  gcc_assert (TARGET_AVX10_2_256 && !flag_trapping_math);
+  goto simple;
+
 case E_DImode:
   if (TARGET_64BIT)
goto simple;
@@ -2796,9 +2800,9 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx 
*pop0, rtx *pop1)
   bool unordered_compare = ix86_unordered_fp_compare (code);
   rtx op0 = *pop0, op1 = *pop1;
   machine_mode op_mode = GET_MODE (op0);
-  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
+  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (op_mode);
 
-  if (op_mode == BFmode)
+  if (op_mode == BFmode && (!TARGET_AVX10_2_256 || flag_trapping_math))
 {
   rtx op = gen_lowpart (HImode, op0);
   if (CONST_INT_P (op))
@@ -2917,10 +2921,14 @@ ix86_expand_fp_compare (enum rtx_code code, rtx op0, 
rtx op1)
 {
 case IX86_FPCMP_COMI:
   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
-  if (TARGET_AVX10_2_256 && (code == EQ || code == NE))
-   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
-  if (unordered_compare)
-   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+  /* We only have vcomsbf16, No vcomubf16 nor vcomxbf16 */
+  if (GET_MODE (op0) != E_BFmode)
+{
+ if (TARGET_AVX10_2_256 && (code == EQ || code == NE))
+   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
+ if (unordered_compare)
+   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+   }
   cmp_mode = CCFPmode;
   emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
   break;
@@ -4635,7 +4643,7 @@ ix86_expand_fp_movcc (rtx operands[])
   && !ix86_fp_comparison_operator (operands[1], VOIDmode))
 return false;
 
-  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
+  if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
 {
   machine_mode cmode;
 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index a1f0ae7a7e1..c7132252e48 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21324,7 +21324,7 @@ ix86_multiplication_cost (const struct processor_costs 
*cost,
   if (VECTOR_MODE_P (mode))
 inner_mode = GET_MODE_INNER (mode);
 
-  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
+  if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
   else if (X87_FLOAT_MODE_P (mode))
 return cost->fmul;
@@ -21449,7 +21449,7 @@ ix86_division_cost (const struct processor_costs *cost,
   if (VECTOR_MODE_P (mode))
 inner_mode = GET_MODE_INNER (mode);
 
-  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
+  if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
 return inner_mode == DFmode ? cost->divsd : cost->divss;
   else if (X87_FLOAT_MODE_P (mode))
 return cost->fdiv;
@@ -21991,7 +21991,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
  return true;
}
 
-  if (SSE_FLOAT_M