Re: Add a param to decide stack slot sharing at -O0

2013-10-09 Thread Eric Botcazou
> Seems like an odd thing for a param. If the compile time increase is very
> small (< 1%?) I would just make the new threshold default.

I don't understand the 'odd' here... params are exactly for this purpose, i.e. 
to avoid hardcoding magic numbers in the compiler, so please elaborate.

-- 
Eric Botcazou


[Committed] S/390: Use FPRs as GPR save slots

2013-10-09 Thread Andreas Krebbel
Hi,

with the attached patch we use call-clobbered floating point registers
as save slots for general purpose registers in leaf functions.

Bootstrapped and regtested with various options and -march levels.

Committed to mainline.

Bye,

-Andreas-

2013-10-09  Andreas Krebbel  

* config/s390/s390.c (struct s390_frame_layout): New field
gpr_save_slots.
(cfun_save_arg_fprs_p, cfun_gpr_save_slot): New macros.
(s390_reg_clobbered_rtx, s390_regs_ever_clobbered): Change type of
regs_ever_clobbered to char*.
(s390_regs_ever_clobbered): Check crtl->saves_all_registers
instead of cfun->has_nonlocal_label.  Ignore frame related restore
INSNs.
(s390_register_info): Enable FPR save slots.  Move/Copy some
functionality into ...
(s390_register_info_gprtofpr, s390_register_info_stdarg_fpr)
(s390_register_info_stdarg_gpr, s390_optimize_register_info): New
function.
(s390_frame_info): Do gpr slot allocation here now.  stdarg does
not imply a stack frame.
(s390_init_frame_layout): Remove variable clobbered_regs.
(s390_update_register_info): Remove function.
(s390_hard_regno_rename_ok): Call-saved regs without a save slot
cannot be used for register renaming.
(s390_hard_regno_scratch_ok): New function.
(TARGET_HARD_REGNO_SCRATCH_OK): Define target hook.
(s390_initial_elimination_offset): Change offset calculation of
the return address pointer.
(save_gprs): Deal with only r6 being saved from the call-saved
regs.
(restore_gprs): Set frame related flag.
(s390_save_gprs_to_fprs, s390_restore_gprs_from_fprs): New
functions.
(s390_emit_prologue): Call s390_register_info instead of
s390_update_frame_layout.  Call s390_save_gprs_to_fprs.
(s390_emit_epilogue): Call s390_restore_gprs_from_fprs.
(s390_optimize_prologue): Call s390_optimize_register_info.
Try to remove also FPR slot save/restore INSNs.  Remove frame
related flags from restore INSNs.

---

---
 gcc/config/s390/s390.c |  663 ++--
 1 file changed, 141 insertions(+), 24 deletions(-), 498 modifications(!)

Index: gcc/config/s390/s390.c
===
*** gcc/config/s390/s390.c.orig
--- gcc/config/s390/s390.c
*** struct GTY (()) s390_frame_layout
*** 324,329 
--- 324,335 
int first_save_gpr_slot;
int last_save_gpr_slot;
  
+   /* Location (FP register number) where GPRs (r0-r15) should
+  be saved to.
+   0 - does not need to be saved at all
+  -1 - stack slot  */
+   signed char gpr_save_slots[16];
+ 
/* Number of first and last gpr to be saved, restored.  */
int first_save_gpr;
int first_restore_gpr;
*** struct GTY(()) machine_function
*** 377,388 
  
  #define cfun_frame_layout (cfun->machine->frame_layout)
  #define cfun_save_high_fprs_p (!!cfun_frame_layout.high_fprs)
! #define cfun_gprs_save_area_size ((cfun_frame_layout.last_save_gpr_slot - 
  \
cfun_frame_layout.first_save_gpr_slot + 1) * UNITS_PER_LONG)
  #define cfun_set_fpr_save(REGNO) (cfun->machine->frame_layout.fpr_bitmap |=   
 \
(1 << (REGNO - FPR0_REGNUM)))
  #define cfun_fpr_save_p(REGNO) (!!(cfun->machine->frame_layout.fpr_bitmap &   
 \
(1 << (REGNO - FPR0_REGNUM
  
  /* Number of GPRs and FPRs used for argument passing.  */
  #define GP_ARG_NUM_REG 5
--- 383,399 
  
  #define cfun_frame_layout (cfun->machine->frame_layout)
  #define cfun_save_high_fprs_p (!!cfun_frame_layout.high_fprs)
! #define cfun_save_arg_fprs_p (!!(TARGET_64BIT \
!? cfun_frame_layout.fpr_bitmap & 0x0f  \
!: cfun_frame_layout.fpr_bitmap & 0x03))
! #define cfun_gprs_save_area_size ((cfun_frame_layout.last_save_gpr_slot - \
cfun_frame_layout.first_save_gpr_slot + 1) * UNITS_PER_LONG)
  #define cfun_set_fpr_save(REGNO) (cfun->machine->frame_layout.fpr_bitmap |=   
 \
(1 << (REGNO - FPR0_REGNUM)))
  #define cfun_fpr_save_p(REGNO) (!!(cfun->machine->frame_layout.fpr_bitmap &   
 \
(1 << (REGNO - FPR0_REGNUM
+ #define cfun_gpr_save_slot(REGNO) \
+   cfun->machine->frame_layout.gpr_save_slots[REGNO]
  
  /* Number of GPRs and FPRs used for argument passing.  */
  #define GP_ARG_NUM_REG 5
*** find_unused_clobbered_reg (void)
*** 7364,7370 
  static void
  s390_reg_clobbered_rtx (rtx setreg, const_rtx set_insn ATTRIBUTE_UNUSED, void 
*data)
  {
!   int *regs_ever_clobbered = (int *)data;
unsigned int i, regno;
enum machine_mode mode = GET_MODE (setreg);
  
--- 7375,7381 
  static void
  s390_reg_clobbered_rtx (rtx setreg, const_rtx set_insn ATTRIBUTE_UNUSED, void 
*data)
  {
!   char *regs_ever_clobbered = (char *)data;
unsigned int i, regno;
enum m

[Committed] S/390: Cleanup of s390_frame_info

2013-10-09 Thread Andreas Krebbel
Just a cleanup of the s390_frame_info function.

Committed to mainline after regression test was fine.

Bye,

-Andreas-


2013-10-09  Andreas Krebbel  

* config/s390/s390.c (s390_frame_info): Restructure function.

---
 gcc/config/s390/s390.c |  112 ++!!
 1 file changed, 23 insertions(+), 1 deletion(-), 88 modifications(!)

Index: gcc/config/s390/s390.c
===
*** gcc/config/s390/s390.c.orig
--- gcc/config/s390/s390.c
*** s390_optimize_register_info ()
*** 7781,7787 
  static void
  s390_frame_info (void)
  {
!   int i;
  
cfun_frame_layout.first_save_gpr_slot = cfun_frame_layout.first_save_gpr;
cfun_frame_layout.last_save_gpr_slot = cfun_frame_layout.last_save_gpr;
--- 7781,7787 
  static void
  s390_frame_info (void)
  {
!   HOST_WIDE_INT lowest_offset;
  
cfun_frame_layout.first_save_gpr_slot = cfun_frame_layout.first_save_gpr;
cfun_frame_layout.last_save_gpr_slot = cfun_frame_layout.last_save_gpr;
*** s390_frame_info (void)
*** 7805,7810 
--- 7805,7811 
  
if (!TARGET_PACKED_STACK)
  {
+   /* Fixed stack layout.  */
cfun_frame_layout.backchain_offset = 0;
cfun_frame_layout.f0_offset = 16 * UNITS_PER_LONG;
cfun_frame_layout.f4_offset = cfun_frame_layout.f0_offset + 2 * 8;
*** s390_frame_info (void)
*** 7812,7856 
cfun_frame_layout.gprs_offset = (cfun_frame_layout.first_save_gpr_slot
   * UNITS_PER_LONG);
  }
!   else if (TARGET_BACKCHAIN) /* kernel stack layout */
  {
cfun_frame_layout.backchain_offset = (STACK_POINTER_OFFSET
- UNITS_PER_LONG);
cfun_frame_layout.gprs_offset
!   = (cfun_frame_layout.backchain_offset
!  - (STACK_POINTER_REGNUM - cfun_frame_layout.first_save_gpr_slot + 1)
!  * UNITS_PER_LONG);
  
!   if (TARGET_64BIT)
!   {
! cfun_frame_layout.f4_offset
!   = (cfun_frame_layout.gprs_offset
!  - 8 * (cfun_fpr_save_p (FPR4_REGNUM)
! + cfun_fpr_save_p (FPR6_REGNUM)));
! 
! cfun_frame_layout.f0_offset
!   = (cfun_frame_layout.f4_offset
!  - 8 * (cfun_fpr_save_p (FPR0_REGNUM)
! + cfun_fpr_save_p (FPR2_REGNUM)));
!   }
!   else
!   {
! /* On 31 bit we have to care about alignment of the
!floating point regs to provide fastest access.  */
! cfun_frame_layout.f0_offset
!   = ((cfun_frame_layout.gprs_offset
!   & ~(STACK_BOUNDARY / BITS_PER_UNIT - 1))
!  - 8 * (cfun_fpr_save_p (FPR0_REGNUM)
! + cfun_fpr_save_p (FPR2_REGNUM)));
! 
! cfun_frame_layout.f4_offset
!   = (cfun_frame_layout.f0_offset
!  - 8 * (cfun_fpr_save_p (FPR4_REGNUM)
! + cfun_fpr_save_p (FPR6_REGNUM)));
!   }
  }
!   else /* no backchain */
  {
cfun_frame_layout.f4_offset
= (STACK_POINTER_OFFSET
   - 8 * (cfun_fpr_save_p (FPR4_REGNUM)
--- 7813,7842 
cfun_frame_layout.gprs_offset = (cfun_frame_layout.first_save_gpr_slot
   * UNITS_PER_LONG);
  }
!   else if (TARGET_BACKCHAIN)
  {
+   /* Kernel stack layout - packed stack, backchain, no float  */
+   gcc_assert (TARGET_SOFT_FLOAT);
cfun_frame_layout.backchain_offset = (STACK_POINTER_OFFSET
- UNITS_PER_LONG);
+ 
+   /* The distance between the backchain and the return address
+save slot must not change.  So we always need a slot for the
+stack pointer which resides in between.  */
+   cfun_frame_layout.last_save_gpr_slot = STACK_POINTER_REGNUM;
+ 
cfun_frame_layout.gprs_offset
!   = cfun_frame_layout.backchain_offset - cfun_gprs_save_area_size;
  
!   /* FPRs will not be saved.  Nevertheless pick sane values to
!keep area calculations valid.  */
!   cfun_frame_layout.f0_offset =
!   cfun_frame_layout.f4_offset =
!   cfun_frame_layout.f8_offset = cfun_frame_layout.gprs_offset;
  }
!   else
  {
+   /* Packed stack layout without backchain.  */
cfun_frame_layout.f4_offset
= (STACK_POINTER_OFFSET
   - 8 * (cfun_fpr_save_p (FPR4_REGNUM)
*** s390_frame_info (void)
*** 7863,7909 
  
cfun_frame_layout.gprs_offset
= cfun_frame_layout.f0_offset - cfun_gprs_save_area_size;
  }
  
if (crtl->is_leaf
&& !TARGET_TPF_PROFILING
&& cfun_frame_layout.frame_size == 0
-   && !cfun_save_high_fprs_p
&& !cfun->calls_alloca)
  return;
  
!   if (!TARGET_PACKED_STACK)
! cfun_frame_layout.frame_size += (STACK_POINTER_OFFSET
!+ crtl->outgoing_args_size
!

[Committed] S/390: Use fix stack slots for FPRs saved due to stdarg

2013-10-09 Thread Andreas Krebbel
Hi,

for stdarg the floating point argument registers in the register save
area must reside at an ABI-defined offset relative to the start of the save
area.  So far we were not able to optimize away save instructions with
the packed stack layout since this changed the offsets.

With the attached patch we use dedicated slots when saving FPR
argument registers and are now able to get rid of unnecessary saves
that way.

Bootstrapped and regression tested.

Committed to mainline.

Bye,

-Andreas-


2013-10-09  Andreas Krebbel  

* config/s390/s390.c (s390_register_info_stdarg_fpr): Remove
packed stack special handling.
(s390_frame_info, s390_emit_prologue, s390_emit_epilogue): Switch
back to fixed stack slots for FPRs saved due to stdarg.

---
 gcc/config/s390/s390.c |   39 ++---!!
 1 file changed, 2 insertions(+), 7 deletions(-), 30 modifications(!)

Index: gcc/config/s390/s390.c
===
*** gcc/config/s390/s390.c.orig
--- gcc/config/s390/s390.c
*** s390_register_info_stdarg_fpr ()
*** 7589,7601 
if (max_fpr > FP_ARG_NUM_REG)
  max_fpr = FP_ARG_NUM_REG;
  
-   /* The va_arg algorithm accesses the FPRs in the reg save area using
-  a constant offset from r0.  With the packed stack layout omitting
-  FPRs from the beginning would change the offset for the
-  subsequent FPRs.  */
-   if (TARGET_PACKED_STACK)
- min_fpr = 0;
- 
for (i = min_fpr; i < max_fpr; i++)
  cfun_set_fpr_save (i + FPR0_REGNUM);
  }
--- 7589,7594 
*** s390_frame_info (void)
*** 7836,7851 
  }
else
  {
/* Packed stack layout without backchain.  */
!   cfun_frame_layout.f4_offset
!   = (STACK_POINTER_OFFSET
!  - 8 * (cfun_fpr_save_p (FPR4_REGNUM)
! + cfun_fpr_save_p (FPR6_REGNUM)));
! 
!   cfun_frame_layout.f0_offset
!   = (cfun_frame_layout.f4_offset
!  - 8 * (cfun_fpr_save_p (FPR0_REGNUM)
! + cfun_fpr_save_p (FPR2_REGNUM)));
  
cfun_frame_layout.gprs_offset
= cfun_frame_layout.f0_offset - cfun_gprs_save_area_size;
--- 7829,7848 
  }
else
  {
+   int num_fprs;
+ 
/* Packed stack layout without backchain.  */
! 
!   /* With stdarg FPRs need their dedicated slots.  */
!   num_fprs = (TARGET_64BIT && cfun->stdarg ? 2
! : (cfun_fpr_save_p (FPR4_REGNUM) +
!cfun_fpr_save_p (FPR6_REGNUM)));
!   cfun_frame_layout.f4_offset = STACK_POINTER_OFFSET - 8 * num_fprs;
! 
!   num_fprs = (cfun->stdarg ? 2
! : (cfun_fpr_save_p (FPR0_REGNUM)
!+ cfun_fpr_save_p (FPR2_REGNUM)));
!   cfun_frame_layout.f0_offset = cfun_frame_layout.f4_offset - 8 * 
num_fprs;
  
cfun_frame_layout.gprs_offset
= cfun_frame_layout.f0_offset - cfun_gprs_save_area_size;
*** s390_emit_prologue (void)
*** 8680,8687 
  save_fpr (stack_pointer_rtx, offset, i);
  offset += 8;
}
!   else if (!TARGET_PACKED_STACK)
! offset += 8;
  }
  
/* Save f4 and f6.  */
--- 8677,8684 
  save_fpr (stack_pointer_rtx, offset, i);
  offset += 8;
}
!   else if (!TARGET_PACKED_STACK || cfun->stdarg)
!   offset += 8;
  }
  
/* Save f4 and f6.  */
*** s390_emit_prologue (void)
*** 8693,8704 
  insn = save_fpr (stack_pointer_rtx, offset, i);
  offset += 8;
  
! /* If f4 and f6 are call clobbered they are saved due to stdargs and
!therefore are not frame related.  */
  if (!call_really_used_regs[i])
RTX_FRAME_RELATED_P (insn) = 1;
}
!   else if (!TARGET_PACKED_STACK)
offset += 8;
  }
  
--- 8690,8701 
  insn = save_fpr (stack_pointer_rtx, offset, i);
  offset += 8;
  
! /* If f4 and f6 are call clobbered they are saved due to
!stdargs and therefore are not frame related.  */
  if (!call_really_used_regs[i])
RTX_FRAME_RELATED_P (insn) = 1;
}
!   else if (!TARGET_PACKED_STACK || call_really_used_regs[i])
offset += 8;
  }
  



Re: Cleanup patches

2013-10-09 Thread Thomas Schwinge
Hi!

On Tue, 8 Oct 2013 22:04:23 +0200, Jakub Jelinek  wrote:
> On Tue, Oct 08, 2013 at 09:17:35AM +0200, Thomas Schwinge wrote:
> > Here are a few cleanup patches, mostly in the realm of OpenMP, so Jakub
> > gets a CC.  OK to commit?
> 
> They look ok to me, but I'd prefer if they could go in after the merge
> I've just posted.

Sure, I can wait until then.


Meanwhile, here's another series that I assumed had gotten lost, but now
recovered thanks to »git fsck«.  Also OK to commit?


_OPENMP pre-processor checks, openmp_version Fortran intrinsic checks.

gcc/testsuite/
* c-c++-common/cpp/openmp-define-1.c: New file.
* c-c++-common/cpp/openmp-define-2.c: Likewise.
* c-c++-common/cpp/openmp-define-3.c: Likewise.
* gfortran.dg/openmp-define-1.f90: Likewise.
* gfortran.dg/openmp-define-2.f90: Likewise.
* gfortran.dg/openmp-define-3.f90: Likewise.
libgomp/
* testsuite/libgomp.fortran/openmp_version-1.f: New file.
* testsuite/libgomp.fortran/openmp_version-2.f90: Likewise.
---
 gcc/testsuite/c-c++-common/cpp/openmp-define-1.c   |  6 ++
 gcc/testsuite/c-c++-common/cpp/openmp-define-2.c   |  7 +++
 gcc/testsuite/c-c++-common/cpp/openmp-define-3.c   | 11 +++
 gcc/testsuite/gfortran.dg/openmp-define-1.f90  |  7 +++
 gcc/testsuite/gfortran.dg/openmp-define-2.f90  |  7 +++
 gcc/testsuite/gfortran.dg/openmp-define-3.f90  | 11 +++
 libgomp/testsuite/libgomp.fortran/openmp_version-1.f   |  9 +
 libgomp/testsuite/libgomp.fortran/openmp_version-2.f90 |  9 +
 8 files changed, 67 insertions(+)
 create mode 100644 gcc/testsuite/c-c++-common/cpp/openmp-define-1.c
 create mode 100644 gcc/testsuite/c-c++-common/cpp/openmp-define-2.c
 create mode 100644 gcc/testsuite/c-c++-common/cpp/openmp-define-3.c
 create mode 100644 gcc/testsuite/gfortran.dg/openmp-define-1.f90
 create mode 100644 gcc/testsuite/gfortran.dg/openmp-define-2.f90
 create mode 100644 gcc/testsuite/gfortran.dg/openmp-define-3.f90
 create mode 100644 libgomp/testsuite/libgomp.fortran/openmp_version-1.f
 create mode 100644 libgomp/testsuite/libgomp.fortran/openmp_version-2.f90

diff --git gcc/testsuite/c-c++-common/cpp/openmp-define-1.c 
gcc/testsuite/c-c++-common/cpp/openmp-define-1.c
new file mode 100644
index 000..0e7943a
--- /dev/null
+++ gcc/testsuite/c-c++-common/cpp/openmp-define-1.c
@@ -0,0 +1,6 @@
+/* { dg-require-effective-target fopenmp } */
+/* { dg-do preprocess } */
+
+#ifdef _OPENMP
+# error _OPENMP defined
+#endif
diff --git gcc/testsuite/c-c++-common/cpp/openmp-define-2.c 
gcc/testsuite/c-c++-common/cpp/openmp-define-2.c
new file mode 100644
index 000..f89
--- /dev/null
+++ gcc/testsuite/c-c++-common/cpp/openmp-define-2.c
@@ -0,0 +1,7 @@
+/* { dg-require-effective-target fopenmp } */
+/* { dg-options "-fno-openmp" } */
+/* { dg-do preprocess } */
+
+#ifdef _OPENMP
+# error _OPENMP defined
+#endif
diff --git gcc/testsuite/c-c++-common/cpp/openmp-define-3.c 
gcc/testsuite/c-c++-common/cpp/openmp-define-3.c
new file mode 100644
index 000..6ea0425
--- /dev/null
+++ gcc/testsuite/c-c++-common/cpp/openmp-define-3.c
@@ -0,0 +1,11 @@
+/* { dg-require-effective-target fopenmp } */
+/* { dg-options "-fopenmp" } */
+/* { dg-do preprocess } */
+
+#ifndef _OPENMP
+# error _OPENMP not defined
+#endif
+
+#if _OPENMP != 201107
+# error _OPENMP defined to wrong value
+#endif
diff --git gcc/testsuite/gfortran.dg/openmp-define-1.f90 
gcc/testsuite/gfortran.dg/openmp-define-1.f90
new file mode 100644
index 000..d4ef1d1
--- /dev/null
+++ gcc/testsuite/gfortran.dg/openmp-define-1.f90
@@ -0,0 +1,7 @@
+! { dg-require-effective-target fopenmp }
+! { dg-options "-cpp" }
+! { dg-do preprocess }
+
+#ifdef _OPENMP
+# error _OPENMP defined
+#endif
diff --git gcc/testsuite/gfortran.dg/openmp-define-2.f90 
gcc/testsuite/gfortran.dg/openmp-define-2.f90
new file mode 100644
index 000..651dae7
--- /dev/null
+++ gcc/testsuite/gfortran.dg/openmp-define-2.f90
@@ -0,0 +1,7 @@
+! { dg-require-effective-target fopenmp }
+! { dg-options "-cpp -fno-openmp" }
+! { dg-do preprocess }
+
+#ifdef _OPENMP
+# error _OPENMP defined
+#endif
diff --git gcc/testsuite/gfortran.dg/openmp-define-3.f90 
gcc/testsuite/gfortran.dg/openmp-define-3.f90
new file mode 100644
index 000..dfc45b1
--- /dev/null
+++ gcc/testsuite/gfortran.dg/openmp-define-3.f90
@@ -0,0 +1,11 @@
+! { dg-require-effective-target fopenmp }
+! { dg-options "-cpp -fopenmp" }
+! { dg-do preprocess }
+
+#ifndef _OPENMP
+# error _OPENMP not defined
+#endif
+
+#if _OPENMP != 201107
+# error _OPENMP defined to wrong value
+#endif
diff --git libgomp/testsuite/libgomp.fortran/openmp_version-1.f 
libgomp/testsuite/libgomp.fortran/openmp_version-1.f
new file mode 100644
index 000..aaa8881
--- /dev/null
+++ libgomp/testsuite/libgomp.fortran/openmp_version-1.f
@@ -0,0 +1,9 @@
+! { dg-do run }
+
+  program main
+  implicit none
+  include "omp_

Re: Cleanup patches

2013-10-09 Thread Jakub Jelinek
On Wed, Oct 09, 2013 at 10:56:25AM +0200, Thomas Schwinge wrote:
> On Tue, 8 Oct 2013 22:04:23 +0200, Jakub Jelinek  wrote:
> > On Tue, Oct 08, 2013 at 09:17:35AM +0200, Thomas Schwinge wrote:
> > > Here are a few cleanup patches, mostly in the realm of OpenMP, so Jakub
> > > gets a CC.  OK to commit?
> > 
> > They look ok to me, but I'd prefer if they could go in after the merge
> > I've just posted.
> 
> Sure, I can wait until then.
> 
> 
> Meanwhile, here's another series that I assumed had gotten lost, but now
> recovered thanks to »git fsck«.  Also OK to commit?

The testing _OPENMP value against 201107 won't work after merge, it will be
201307 instead.  The Fortran openmp_version will be still 201107 for now,
as Fortran OpenMP 4.0 support isn't there yet.  Other than that it looks good.

Jakub


Re: Cleanup patches

2013-10-09 Thread Tobias Burnus
Thomas Schwinge wrote:
> Meanwhile, here's another series that I assumed had gotten lost, but now
> recovered thanks to »git fsck«.  Also OK to commit?
> 
> _OPENMP pre-processor checks, openmp_version Fortran intrinsic checks.

Two remarks:
* When Jakub's patch gets in, one needs to update the C/C++ OpenMP version
number for OpenMPv4 (i.e. 201307 instead of 201107). The Fortran
version number hasn't been bumped, yet.

* gfortran's preprocessor also defines _OPENMP. Thus, you could also add
test cases for _OPENMP.

Tobias


[4.8] Backported fix for PR tree-optimization/58539

2013-10-09 Thread Jakub Jelinek
Hi!

I've backported the following patch to 4.8 branch,
bootstrapped/regtested on x86_64-linux and i686-linux,
committed.

2013-10-09  Jakub Jelinek  

Backport from mainline
2013-09-26  Richard Biener  

PR tree-optimization/58539
* tree-vect-loop.c (vect_create_epilog_for_reduction): Honor
the fact that debug statements are not taking part in loop-closed
SSA construction.

* gcc.dg/torture/pr58539.c: New testcase.

--- gcc/tree-vect-loop.c(revision 202941)
+++ gcc/tree-vect-loop.c(revision 202942)
@@ -4411,7 +4411,8 @@ vect_finalize_reduction:
  result.  (The reduction result is expected to have two immediate uses 
-
  one at the latch block, and one at the loop exit).  */
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
-if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p
+if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
+   && !is_gimple_debug (USE_STMT (use_p)))
   phis.safe_push (USE_STMT (use_p));
 
   /* While we expect to have found an exit_phi because of loop-closed-ssa
@@ -4541,7 +4542,10 @@ vect_finalize_reduction:
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
 {
   if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p
-phis.safe_push (USE_STMT (use_p));
+   {
+ if (!is_gimple_debug (USE_STMT (use_p)))
+   phis.safe_push (USE_STMT (use_p));
+   }
   else
 {
   if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
@@ -4551,7 +4555,8 @@ vect_finalize_reduction:
   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
 {
   if (!flow_bb_inside_loop_p (loop,
- gimple_bb (USE_STMT (phi_use_p
+ gimple_bb (USE_STMT (phi_use_p)))
+ && !is_gimple_debug (USE_STMT (phi_use_p)))
 phis.safe_push (USE_STMT (phi_use_p));
 }
 }
--- gcc/testsuite/gcc.dg/torture/pr58539.c  (revision 0)
+++ gcc/testsuite/gcc.dg/torture/pr58539.c  (revision 202942)
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-g" } */
+
+int a, b;
+
+extern void baz (int);
+
+int foo (int p)
+{
+  return p ? p : 1;
+}
+
+void bar ()
+{
+  int *c = &a, *d = &a;
+  for (b = 0; b < 12; b++)
+*d |= 1;
+  foo (*c);
+  baz (*c && 1);
+}

Jakub


[4.8] Fix OMP_THREAD_LIMIT libgomp handling

2013-10-09 Thread Jakub Jelinek
Hi!

This is the minimal fix for the issue mentioned in
http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00333.html
that is being fixed as part of gomp-4_0-branch merge to trunk,
but for 4.8 I've applied this fix instead.
The testcase also includes the follow-up fix from
http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00497.html

Bootstrapped/regtested on x86_64-linux and i686-linux, committed
to 4.8 branch.

2013-10-09  Jakub Jelinek  

* parallel.c (GOMP_parallel_end): Remember team->nthreads and call
gomp_team_end before adjusting gomp_remaining_threads_count, increment
gomp_remaining_threads_count instead of decrementing it again.
* testsuite/libgomp.c/thread-limit-1.c: New test.

--- libgomp/parallel.c.jj   2013-03-16 08:07:43.0 +0100
+++ libgomp/parallel.c  2013-10-09 09:10:38.215245390 +0200
@@ -115,19 +115,22 @@ GOMP_parallel_end (void)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
-  if (team && team->nthreads > 1)
+  unsigned int nthreads = team ? team->nthreads : 1;
+  gomp_team_end ();
+  if (nthreads > 1)
{
 #ifdef HAVE_SYNC_BUILTINS
  __sync_fetch_and_add (&gomp_remaining_threads_count,
-   1UL - team->nthreads);
+   nthreads - 1);
 #else
  gomp_mutex_lock (&gomp_remaining_threads_lock);
- gomp_remaining_threads_count -= team->nthreads - 1;
+ gomp_remaining_threads_count += nthreads - 1;
  gomp_mutex_unlock (&gomp_remaining_threads_lock);
 #endif
}
 }
-  gomp_team_end ();
+  else
+gomp_team_end ();
 }
 
 
--- libgomp/testsuite/libgomp.c/thread-limit-1.c.jj 2013-10-09 
09:05:35.702854964 +0200
+++ libgomp/testsuite/libgomp.c/thread-limit-1.c2013-10-08 
20:30:47.0 +0200
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-set-target-env-var OMP_THREAD_LIMIT "6" } */
+
+#include 
+#include 
+
+int
+main ()
+{
+  if (omp_get_thread_limit () != 6)
+return 0;
+  omp_set_dynamic (0);
+  omp_set_nested (1);
+  #pragma omp parallel num_threads (3)
+  if (omp_get_num_threads () != 3)
+abort ();
+  #pragma omp parallel num_threads (3)
+  if (omp_get_num_threads () != 3)
+abort ();
+  #pragma omp parallel num_threads (8)
+  if (omp_get_num_threads () > 6)
+abort ();
+  #pragma omp parallel num_threads (6)
+  if (omp_get_num_threads () != 6)
+abort ();
+  int cnt = 0;
+  #pragma omp parallel num_threads (5)
+  #pragma omp parallel num_threads (5)
+  #pragma omp parallel num_threads (2)
+  {
+int v;
+#pragma omp atomic capture
+v = ++cnt;
+if (v > 6)
+  abort ();
+usleep (1);
+#pragma omp atomic
+--cnt;
+  }
+  return 0;
+}

Jakub


Re: Cleanup patches

2013-10-09 Thread Thomas Schwinge
Hi!

On Wed, 9 Oct 2013 11:04:29 +0200, Jakub Jelinek  wrote:
> The testing _OPENMP value against 201107 won't work after merge, it will be
> 201307 instead.  The Fortran openmp_version will be still 201107 for now,
> as Fortran OpenMP 4.0 support isn't there yet.

And:

On Wed, 9 Oct 2013 11:13:38 +0200, Tobias Burnus 
 wrote:
> * When Jakub's patch gets it, one needs to update the C/C++ OpenMP version
> number for OpenMPv4 (i.e. 201307 instead of 201107). The Fortran
> version number hasn't been bumped, yet.

Sorry for not pointing this out upfront: I've been aware of that, but
posted the patch I had tested based on yesterday's trunk.


> * gfortran's preprocessor also defines _OPENMP. Thus, you could also add
> test cases for _OPENMP.

That's what I'm testing in the new file
gcc/testsuite/gfortran.dg/openmp-define-3.f90 -- anything I'm missing
there?


Grüße,
 Thomas


pgpgYEUI_Dy1a.pgp
Description: PGP signature


RE: Fix scheduler ix86_issue_rate and ix86_adjust_cost for modern x86 chips

2013-10-09 Thread Gopalasubramanian, Ganesh
Hi Honza, 

> Yep, I think we need to merge only those automata that are the same for both:
> (define_automaton "bdver3,bdver3_ieu,bdver3_load,bdver3_fp,bdver3_agu")
> probably can become
> (define_automaton "bdver3,bdver3_fp")
> with the corresponding reservations using bdver3_ieu,bdver3_load,bdver3_agu 
> changed to bdver1
> automaton.  I think it should result in smaller binary - the fact that all 
> conditionals are
> physically duplicated in bdver1/bdver3.md should be optimized away by 
> genautomata.

Before merging the insn reservations, I need to compare the latency values for 
bdver1 and bdver3. I know that they are different for some of the instructions. 
In that case, the merging should prop up another subset of latency differences. 
I would like to keep these insn reservations in two .md files (one for bdver1 
and one for bdver3) even after the merger.

> Your version has problem that it does not model the thing that the two 
> decoders works sequentially.

The two stage modeling is required so that the decode unit reservations are 
screened from other unit reservations.
But this sort of goes away in bdver3 because of the decode cycle.
In bdver3, the decode units scan two of these windows every "two" cycles 
decoding a maximum of eight instructions.
The hardware scan is done every two cycles in bdver3 whereas it is done every 
single cycle in bdver1/bdver2. (But we have two separate hardware decoders 
which guarantees higher throughput)
This means that the two stage modeling is not required in the scheduler 
descriptions since the hardware sort of guarantees that with its scanning 
mechanism. 
Our job is to make sure that 8 direct instructions get scheduled in two cycles 
or 4 double instructions get scheduled in two cycles.
So, I have modeled the bdver3 decoders such that within a cycle they guarantee 
to issue 4 direct instructions or 2 double instructions. 
This eliminates the sequencing problem in modeling decoders and also ensures 
that the issue rate can be numbered for a single cycle rather than two cycles.
This is one of the reasons why I remodeled only bdver3. Let me know your 
comments on this.

> We can also experiment with defining TARGET_SCHED_VARIABLE_ISSUE to get more 
> realistic estimates on what still can be issued - the value of 6 is 
> unrealistically high.
This would get more complicated if we go by decoder capacity in bdver3. As we 
have two hardware decoders in steamroller (bdver3), they have a capacity to 
decode eight instructions per clock cycle, providing up to twice the decode and 
dispatch bandwidth compared to bdver1.
If we model this in GCC we need to change the issue rate to 8. If 6 is high, 
then 8 would add more joy and excitement.

TARGET_SCHED_VARIABLE_ISSUE is a nice suggestion to schedule instructions in 
different way. 

> We also should enable ia32_multipass_dfa_lookahead - with that scheduler 
> should be able to put double decoded and vector decoded insns on the proper 
> places.
Yes. Whenever we have this scheduler analysis in place we discuss this, 
but unfortunately it is left as it is.
I will look into this after I do the enablement for bdver4.

> I will work on replacing most of the CPU cases into tuning flags + costs.
I am planning to get bdver4 enablement in place once scheduler descriptions for 
bdver3 is done with.
I will have cycles to look into the cost models. Please delegate some tasks if 
you can and I am willing to take them up.

Regards
Ganesh

-Original Message-
From: Jan Hubicka [mailto:hubi...@ucw.cz] 
Sent: Tuesday, October 08, 2013 3:20 PM
To: Gopalasubramanian, Ganesh
Cc: Jan Hubicka; gcc-patches@gcc.gnu.org; hjl.to...@gmail.com
Subject: Re: Fix scheduler ix86_issue_rate and ix86_adjust_cost for modern x86 
chips

> Hi Honza,
> 
> I am planning to update the scheduler descriptions for bdver3 first.
> Attached is the patch. Please let me know your comments if any.
> 
> Though I agree on merging bdver1/2 and bdver3 on most parts, the FP lines and 
> decoding schemes are different. So, let me know how can I approach merging 
> these.

Yep, I think we need to merge only those automata that are the same for both:
(define_automaton "bdver3,bdver3_ieu,bdver3_load,bdver3_fp,bdver3_agu")
probably can become
(define_automaton "bdver3,bdver3_fp")
with the corresponding reservations using bdver3_ieu,bdver3_load,bdver3_agu 
changed to bdver1
automaton.  I think it should result in smaller binary - the fact that all 
conditionals are
physically duplicated in bdver1/bdver3.md should be optimized away by 
genautomata.

I also played a bit with the decoders and I am attaching my version - that 
seems SPEC neutral though.
Your version has the problem that it does not model the fact that the two decoders 
work sequentially.

I removed the bdver1-decodev unit and instead I simply reserve all three 
decoders + I added 
presence set requiring second decoder to be taken only after first one changed 
presence set requiring
decoder 2 to be taken only after decod

Re: [patch] Add tree-ssa-loop.h and friends.

2013-10-09 Thread Richard Biener
On Tue, Oct 8, 2013 at 4:15 PM, Andrew MacLeod  wrote:
> On 10/08/2013 09:18 AM, Richard Biener wrote:
>>
>> On Tue, Oct 8, 2013 at 2:58 PM, Andrew MacLeod 
>> wrote:
>>>
>>>
 I just took a quick stab at it...  I think it's pretty involved and
 someone
 with better loop comprehension should probably look at the followup of
 removing that requirement. estimate_numbers_of_iterations_loop() in
 particular uses last_stmt(), so it requires gimple.. and there is
 sprinkling
 of gimple specific stuff all through it...  I have no idea how this is
 supposed to work for rtl.

 This is the way it is now, so at least by including that header, it
 exposes the hidden problem and either I can revisit it later, or someone
 else can tackle that.  it seems *really* messy.

 OK as is for now?

 Andrew
>>>
>>> heh, make it available, and it will get used :-)  It hasn't been that
>>> way that long.
>>
>> Well, that's just accessing already computed and preserved max-iteration.
>>
>> That is, the accessors used by loop-*.c should probably be moved to
>> cfgloop.[ch].
>>
>>
> huh, deeper than I intended to go but not too bad.   OK...  how about this
> as an add-on to the previous patch.  I'd just check in the combination all
> at once. A couple of the functions needed to be separated... so I created
> get_estimated_loop_iterations() and get_max_loop_iterations()... hopefully I
> did that right. Basically I pulled out everything but the scev estimating,
> and left that in the original.   This bootstraps, and regression tests are
> running.
>
> Assuming it all works out, is this ok?

Yes, that looks good.

Thanks,
Richard.

> Andrew
>


Re: [PATCH i386 3/8] [AVX512] [2/n] Add AVX-512 patterns: Fix missing `v' constraint.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.


Here's 2nd subpatch. It fixes missing `v' constraints.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.


--
Thanks, K

PS. If it is ok - I am going to strip out ChangeLog lines from big patch

---
 gcc/config/i386/sse.md | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 10637cc..2f2fb38 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -880,7 +880,7 @@
 
 (define_insn "_movnt"
   [(set (match_operand:VI8 0 "memory_operand" "=m")
-   (unspec:VI8 [(match_operand:VI8 1 "register_operand" "x")]
+   (unspec:VI8 [(match_operand:VI8 1 "register_operand" "v")]
UNSPEC_MOVNT))]
   "TARGET_SSE2"
   "%vmovntdq\t{%1, %0|%0, %1}"
@@ -1764,10 +1764,10 @@
   [(set (reg:CCFP FLAGS_REG)
(compare:CCFP
  (vec_select:MODEF
-   (match_operand: 0 "register_operand" "x")
+   (match_operand: 0 "register_operand" "v")
(parallel [(const_int 0)]))
  (vec_select:MODEF
-   (match_operand: 1 "nonimmediate_operand" "xm")
+   (match_operand: 1 "nonimmediate_operand" "vm")
(parallel [(const_int 0)]]
   "SSE_FLOAT_MODE_P (mode)"
   "%vcomi\t{%1, %0|%0, %1}"
@@ -1784,10 +1784,10 @@
   [(set (reg:CCFPU FLAGS_REG)
(compare:CCFPU
  (vec_select:MODEF
-   (match_operand: 0 "register_operand" "x")
+   (match_operand: 0 "register_operand" "v")
(parallel [(const_int 0)]))
  (vec_select:MODEF
-   (match_operand: 1 "nonimmediate_operand" "xm")
+   (match_operand: 1 "nonimmediate_operand" "vm")
(parallel [(const_int 0)]]
   "SSE_FLOAT_MODE_P (mode)"
   "%vucomi\t{%1, %0|%0, %1}"
@@ -2594,7 +2594,7 @@
(set_attr "amdfam10_decode" "vector,double,*")
(set_attr "bdver1_decode" "double,direct,*")
(set_attr "btver2_decode" "double,double,double")
-   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "prefix" "orig,orig,maybe_evex")
(set_attr "mode" "SF")])
 
 (define_insn "sse_cvtsi2ssq"
@@ -2617,7 +2617,7 @@
(set_attr "btver2_decode" "double,double,double")
(set_attr "length_vex" "*,*,4")
(set_attr "prefix_rex" "1,1,*")
-   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "prefix" "orig,orig,maybe_evex")
(set_attr "mode" "SF")])
 
 (define_insn "sse_cvtss2si"
@@ -2668,7 +2668,7 @@
 
 (define_insn "sse_cvtss2siq_2"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-   (unspec:DI [(match_operand:SF 1 "nonimmediate_operand" "x,m")]
+   (unspec:DI [(match_operand:SF 1 "nonimmediate_operand" "v,m")]
   UNSPEC_FIX_NOTRUNC))]
   "TARGET_SSE && TARGET_64BIT"
   "%vcvtss2si{q}\t{%1, %0|%0, %k1}"
@@ -2860,11 +2860,11 @@
(set_attr "mode" "DF")])
 
 (define_insn "sse2_cvtsi2sdq"
-  [(set (match_operand:V2DF 0 "register_operand" "=x,x,x")
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x,v")
(vec_merge:V2DF
  (vec_duplicate:V2DF
(float:DF (match_operand:DI 2 "nonimmediate_operand" "r,m,rm")))
- (match_operand:V2DF 1 "register_operand" "0,0,x")
+ (match_operand:V2DF 1 "register_operand" "0,0,v")
  (const_int 1)))]
   "TARGET_SSE2 && TARGET_64BIT"
   "@
@@ -2878,14 +2878,14 @@
(set_attr "bdver1_decode" "double,direct,*")
(set_attr "length_vex" "*,*,4")
(set_attr "prefix_rex" "1,1,*")
-   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "prefix" "orig,orig,maybe_evex")
(set_attr "mode" "DF")])
 
 (define_insn "sse2_cvtsd2si"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
(unspec:SI
  [(vec_select:DF
-(match_operand:V2DF 1 "nonimmediate_operand" "x,m")
+(match_operand:V2DF 1 "nonimmediate_operand" "v,m")
 (parallel [(const_int 0)]))]
  UNSPEC_FIX_NOTRUNC))]
   "TARGET_SSE2"
@@ -2916,7 +2916,7 @@
   [(set (match_operand:DI 0 "register_operand" "=r,r")
(unspec:DI
  [(vec_select:DF
-(match_operand:V2DF 1 "nonimmediate_operand" "x,m")
+(match_operand:V2DF 1 "nonimmediate_operand" "v,m")
 (parallel [(const_int 0)]))]
  UNSPEC_FIX_NOTRUNC))]
   "TARGET_SSE2 && TARGET_64BIT"
@@ -2946,7 +2946,7 @@
   [(set (match_operand:SI 0 "register_operand" "=r,r")
(fix:SI
  (vec_select:DF
-   (match_operand:V2DF 1 "nonimmediate_operand" "x,m")
+   (match_operand:V2DF 1 "nonimmediate_operand" "v,m")
(parallel [(const_int 0)]]
   "TARGET_SSE2"
   "%vcvttsd2si\t{%1, %0|%0, %q1}"
@@ -2963,7 +2963,7 @@
   [(set (match_o

Re: [PATCH i386 3/8] [AVX512] [3/n] Add AVX-512 patterns: VF1 and VI iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 3rd subpatch. It extends VF1 and VI iterators.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/i386.md |   4 ++
 gcc/config/i386/sse.md  | 117 +++-
 2 files changed, 79 insertions(+), 42 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 03b3842..cc332ea 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -827,6 +827,10 @@
 (define_code_attr s [(sign_extend "s") (zero_extend "u")])
 (define_code_attr u_bool [(sign_extend "false") (zero_extend "true")])
 
+;; Used in signed and unsigned fix.
+(define_code_iterator any_fix [fix unsigned_fix])
+(define_code_attr fixsuffix [(fix "") (unsigned_fix "u")])
+
 ;; All integer modes.
 (define_mode_iterator SWI1248x [QI HI SI DI])
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2f2fb38..aa9f1d1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -134,6 +134,10 @@
 
 ;; All SFmode vector float modes
 (define_mode_iterator VF1
+  [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF])
+
+;; 128- and 256-bit SF vector modes
+(define_mode_iterator VF1_128_256
   [(V8SF "TARGET_AVX") V4SF])
 
 ;; All DFmode vector float modes
@@ -154,7 +158,8 @@
 
 ;; All vector integer modes
 (define_mode_iterator VI
-  [(V32QI "TARGET_AVX") V16QI
+  [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+   (V32QI "TARGET_AVX") V16QI
(V16HI "TARGET_AVX") V8HI
(V8SI "TARGET_AVX") V4SI
(V4DI "TARGET_AVX") V2DI])
@@ -162,8 +167,8 @@
 (define_mode_iterator VI_AVX2
   [(V32QI "TARGET_AVX2") V16QI
(V16HI "TARGET_AVX2") V8HI
-   (V8SI "TARGET_AVX2") V4SI
-   (V4DI "TARGET_AVX2") V2DI])
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
 
 ;; All QImode vector integer modes
 (define_mode_iterator VI1
@@ -175,7 +180,7 @@
 
 ;; All DImode vector integer modes
 (define_mode_iterator VI8
-  [(V4DI "TARGET_AVX") V2DI])
+  [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI])
 
 (define_mode_iterator VI1_AVX2
   [(V32QI "TARGET_AVX2") V16QI])
@@ -358,7 +363,8 @@
(V32QI "V32QI") (V16QI "V16QI")])
 
 (define_mode_attr sseintvecmodelower
-  [(V8SF "v8si") (V4DF "v4di")
+  [(V16SF "v16si")
+   (V8SF "v8si") (V4DF "v4di")
(V4SF "v4si") (V2DF "v2di")
(V8SI "v8si") (V4DI "v4di")
(V4SI "v4si") (V2DI "v2di")
@@ -393,10 +399,21 @@
 
 ;; Mapping of vector modes back to the scalar modes
 (define_mode_attr ssescalarmode
-  [(V32QI "QI") (V16HI "HI") (V8SI "SI") (V4DI "DI")
-   (V16QI "QI") (V8HI "HI") (V4SI "SI") (V2DI "DI")
-   (V8SF "SF") (V4DF "DF")
-   (V4SF "SF") (V2DF "DF")])
+  [(V64QI "QI") (V32QI "QI") (V16QI "QI")
+   (V32HI "HI") (V16HI "HI") (V8HI "HI")
+   (V16SI "SI") (V8SI "SI")  (V4SI "SI")
+   (V8DI "DI")  (V4DI "DI")  (V2DI "DI")
+   (V16SF "SF") (V8SF "SF")  (V4SF "SF")
+   (V8DF "DF")  (V4DF "DF")  (V2DF "DF")])
+
+;; Mapping of vector modes to the 128bit modes
+(define_mode_attr ssexmmmode
+  [(V64QI "V16QI") (V32QI "V16QI") (V16QI "V16QI")
+   (V32HI "V8HI")  (V16HI "V8HI") (V8HI "V8HI")
+   (V16SI "V4SI")  (V8SI "V4SI")  (V4SI "V4SI")
+   (V8DI "V2DI")   (V4DI "V2DI")  (V2DI "V2DI")
+   (V16SF "V4SF")  (V8SF "V4SF")  (V4SF "V4SF")
+   (V8DF "V2DF")   (V4DF "V2DF")  (V2DF "V2DF")])
 
 ;; Pointer size override for scalar modes (Intel asm dialect)
 (define_mode_attr iptr
@@ -408,8 +425,10 @@
 
 ;; Number of scalar elements in each vector type
 (define_mode_attr ssescalarnum
-  [(V32QI "32") (V16HI "16") (V8SI "8") (V4DI "4")
+  [(V64QI "64") (V16SI "16") (V8DI "8")
+   (V32QI "32") (V16HI "16") (V8SI "8") (V4DI "4")
(V16QI "16") (V8HI "8") (V4SI "4") (V2DI "2")
+   (V16SF "16") (V8DF "8")
(V8SF "8") (V4DF "4")
(V4SF "4") (V2DF "2")])
 
@@ -1101,9 +1120,9 @@
(set_attr "mode" "")])
 
 (define_insn "_rcp2"
-  [(set (match_operand:VF1 0 "register_operand" "=x")
-   (unspec:VF1
- [(match_operand:VF1 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))]
+  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
+   (unspec:VF1_128_256
+ [(match_operand:VF1_128_256 1 "nonimmediate_operand" "xm")] 
UNSPEC_RCP))]
   "TARGET_SSE"
   "%vrcpps\t{%1, %0|%0, %1}"
   [(set_attr "type" "sse")
@@ -1181,9 +1200,9 @@
(set_attr "mode" "")])
 
 (define_expand "rsqrt2"
-  [(set (match_operand:VF1 0 "register_operand")
-   (unspec:VF1
- [(match_operand:VF1 1 "nonimmediate_operand")] UNSPEC_RSQRT))]
+  [(set (match_operand:VF1_128_256 0 "register_operand")
+   (unspec:VF1_128_256
+   

Re: [PATCH i386 3/8] [AVX512] [4/n] Add AVX-512 patterns: V iterator.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 4th subpatch. It extends V iterator.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/sse.md | 175 -
 1 file changed, 131 insertions(+), 44 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index aa9f1d1..cdb9ae0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -109,10 +109,10 @@
 (define_mode_iterator V
   [(V32QI "TARGET_AVX") V16QI
(V16HI "TARGET_AVX") V8HI
-   (V8SI "TARGET_AVX") V4SI
-   (V4DI "TARGET_AVX") V2DI
-   (V8SF "TARGET_AVX") V4SF
-   (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+   (V8DI "TARGET_AVX512F")  (V4DI "TARGET_AVX") V2DI
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F")  (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
 ;; All 128bit vector modes
 (define_mode_iterator V_128
@@ -122,6 +122,12 @@
 (define_mode_iterator V_256
   [V32QI V16HI V8SI V4DI V8SF V4DF])
 
+;; All 256bit and 512bit vector modes
+(define_mode_iterator V_256_512
+  [V32QI V16HI V8SI V4DI V8SF V4DF
+   (V64QI "TARGET_AVX512F") (V32HI "TARGET_AVX512F") (V16SI "TARGET_AVX512F")
+   (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+
 ;; All vector float modes
 (define_mode_iterator VF
   [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
@@ -142,8 +148,15 @@
 
 ;; All DFmode vector float modes
 (define_mode_iterator VF2
+  [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
+
+;; 128- and 256-bit DF vector modes
+(define_mode_iterator VF2_128_256
   [(V4DF "TARGET_AVX") V2DF])
 
+(define_mode_iterator VF2_512_256
+  [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX")])
+
 ;; All 128bit vector float modes
 (define_mode_iterator VF_128
   [V4SF (V2DF "TARGET_SSE2")])
@@ -380,10 +393,12 @@
 
 ;; Mapping of vector modes to a vector mode of half size
 (define_mode_attr ssehalfvecmode
-  [(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
-   (V16QI  "V8QI") (V8HI  "V4HI") (V4SI "V2SI")
-   (V8SF "V4SF") (V4DF "V2DF")
-   (V4SF "V2SF")])
+  [(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI")
+   (V32QI "V16QI") (V16HI  "V8HI") (V8SI  "V4SI") (V4DI "V2DI")
+   (V16QI  "V8QI") (V8HI   "V4HI") (V4SI  "V2SI")
+   (V16SF "V8SF") (V8DF "V4DF")
+   (V8SF  "V4SF") (V4DF "V2DF")
+   (V4SF  "V2SF")])
 
 ;; Mapping of vector modes ti packed single mode of the same size
 (define_mode_attr ssePSmode
@@ -474,9 +489,11 @@
 (define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
 
 ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise.
+;; i64x4 or f64x4 for 512bit modes.
 (define_mode_attr i128
-  [(V8SF "f128") (V4DF "f128") (V32QI "%~128") (V16HI "%~128")
-   (V8SI "%~128") (V4DI "%~128")])
+  [(V16SF "f64x4") (V8SF "f128") (V8DF "f64x4") (V4DF "f128")
+   (V64QI "i64x4") (V32QI "%~128") (V32HI "i64x4") (V16HI "%~128")
+   (V16SI "i64x4") (V8SI "%~128") (V8DI "i64x4") (V4DI "%~128")])
 
 ;; Mix-n-match
 (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
@@ -3004,14 +3021,20 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "DI")])
 
-(define_insn "floatv4siv4df2"
-  [(set (match_operand:V4DF 0 "register_operand" "=x")
-   (float:V4DF (match_operand:V4SI 1 "nonimmediate_operand" "xm")))]
+;; For float2 insn pattern
+(define_mode_attr si2dfmode
+  [(V8DF "V8SI") (V4DF "V4SI")])
+(define_mode_attr si2dfmodelower
+  [(V8DF "v8si") (V4DF "v4si")])
+
+(define_insn "float2"
+  [(set (match_operand:VF2_512_256 0 "register_operand" "=v")
+   (float:VF2_512_256 (match_operand: 1 "nonimmediate_operand" 
"vm")))]
   "TARGET_AVX"
   "vcvtdq2pd\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V4DF")])
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "")])
 
 (define_insn "avx_cvtdq2pd256_2"
   [(set (match_operand:V4DF 0 "register_operand" "=x")
@@ -3101,6 +3124,15 @@
(set_attr "athlon_decode" "vector")
(set_attr "bdver1_decode" "double")])
 
+(define_insn "fix_truncv8dfv8si2"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+   (any_fix:V8SI (match_operand:V8DF 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vcvttpd2dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "OI")])
+
 (define_insn "fix_truncv4dfv4si2"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
(fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm")))]
@@ -3243,15 +3275,19 @@
(set_attr "prefix" "maybe_vex")

Re: [PATCH i386 3/8] [AVX512] [6/n] Add AVX-512 patterns: VI2 and VI124 iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 6th subpatch. It extends VI2 and VI124 iterators.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/sse.md | 30 --
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 89c31c5..351f5bb 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -201,6 +201,9 @@
 (define_mode_iterator VI2_AVX2
   [(V16HI "TARGET_AVX2") V8HI])
 
+(define_mode_iterator VI2_AVX512F
+  [(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI])
+
 (define_mode_iterator VI4_AVX2
   [(V8SI "TARGET_AVX2") V4SI])
 
@@ -223,6 +226,11 @@
   [(V16HI "TARGET_AVX2") V8HI
(V8SI "TARGET_AVX2") V4SI])
 
+(define_mode_iterator VI124_AVX512F
+  [(V32QI "TARGET_AVX2") V16QI
+   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI])
+
 (define_mode_iterator VI124_AVX2
   [(V32QI "TARGET_AVX2") V16QI
(V16HI "TARGET_AVX2") V8HI
@@ -472,7 +480,8 @@
 ;; Pack/unpack vector modes
 (define_mode_attr sseunpackmode
   [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")
-   (V32QI "V16HI") (V16HI "V8SI") (V8SI "V4DI")])
+   (V32QI "V16HI") (V16HI "V8SI") (V8SI "V4DI")
+   (V32HI "V16SI") (V64QI "V32HI") (V16SI "V8DI")])
 
 (define_mode_attr ssepackmode
   [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI")
@@ -3347,11 +3356,12 @@
   "TARGET_AVX")
 
 (define_mode_attr sseunpackfltmode
-  [(V8HI "V4SF") (V4SI "V2DF") (V16HI "V8SF") (V8SI "V4DF")])
+  [(V8HI "V4SF") (V4SI "V2DF") (V16HI "V8SF")
+  (V8SI "V4DF") (V32HI "V16SF") (V16SI "V8DF")])
 
 (define_expand "vec_unpacks_float_hi_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")]
+   (match_operand:VI2_AVX512F 1 "register_operand")]
   "TARGET_SSE2"
 {
   rtx tmp = gen_reg_rtx (mode);
@@ -3364,7 +3374,7 @@
 
 (define_expand "vec_unpacks_float_lo_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")]
+   (match_operand:VI2_AVX512F 1 "register_operand")]
   "TARGET_SSE2"
 {
   rtx tmp = gen_reg_rtx (mode);
@@ -3377,7 +3387,7 @@
 
 (define_expand "vec_unpacku_float_hi_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")]
+   (match_operand:VI2_AVX512F 1 "register_operand")]
   "TARGET_SSE2"
 {
   rtx tmp = gen_reg_rtx (mode);
@@ -3390,7 +3400,7 @@
 
 (define_expand "vec_unpacku_float_lo_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")]
+   (match_operand:VI2_AVX512F 1 "register_operand")]
   "TARGET_SSE2"
 {
   rtx tmp = gen_reg_rtx (mode);
@@ -7835,25 +7845,25 @@
 
 (define_expand "vec_unpacks_lo_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI124_AVX2 1 "register_operand")]
+   (match_operand:VI124_AVX512F 1 "register_operand")]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;")
 
 (define_expand "vec_unpacks_hi_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI124_AVX2 1 "register_operand")]
+   (match_operand:VI124_AVX512F 1 "register_operand")]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;")
 
 (define_expand "vec_unpacku_lo_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI124_AVX2 1 "register_operand")]
+   (match_operand:VI124_AVX512F 1 "register_operand")]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;")
 
 (define_expand "vec_unpacku_hi_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI124_AVX2 1 "register_operand")]
+   (match_operand:VI124_AVX512F 1 "register_operand")]
   "TARGET_SSE2"
   "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;")
 
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [5/n] Add AVX-512 patterns: Introduce `multdiv' code iterator.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 5th subpatch. It introduces `multdiv' code iterator.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/i386.md |  4 
 gcc/config/i386/sse.md  | 31 +++
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index cc332ea..10ca6cb 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -746,6 +746,8 @@
 
 (define_code_iterator sat_plusminus [ss_plus us_plus ss_minus us_minus])
 
+(define_code_iterator multdiv [mult div])
+
 ;; Base name for define_insn
 (define_code_attr plusminus_insn
   [(plus "add") (ss_plus "ssadd") (us_plus "usadd")
@@ -757,6 +759,8 @@
(minus "sub") (ss_minus "subs") (us_minus "subus")])
 (define_code_attr plusminus_carry_mnemonic
   [(plus "adc") (minus "sbb")])
+(define_code_attr multdiv_mnemonic
+  [(mult "mul") (div "div")])
 
 ;; Mark commutative operators as such in constraints.
 (define_code_attr comm [(plus "%") (ss_plus "%") (us_plus "%")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index cdb9ae0..89c31c5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1061,21 +1061,22 @@
(set_attr "btver2_decode" "direct,double")
(set_attr "mode" "")])
 
-(define_insn "_vmmul3"
+(define_insn "_vm3"
   [(set (match_operand:VF_128 0 "register_operand" "=x,v")
(vec_merge:VF_128
- (mult:VF_128
+ (multdiv:VF_128
(match_operand:VF_128 1 "register_operand" "0,v")
(match_operand:VF_128 2 "nonimmediate_operand" "xm,vm"))
  (match_dup 1)
  (const_int 1)))]
   "TARGET_SSE"
   "@
-   mul\t{%2, %0|%0, %2}
-   vmul\t{%2, %1, %0|%0, %1, %2}"
+   \t{%2, %0|%0, %2}
+   v\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "ssemul")
-   (set_attr "prefix" "orig,vex")
+   (set_attr "type" "sse")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set_attr "btver2_decode" "direct,double")
(set_attr "mode" "")])
 
 (define_expand "div3"
@@ -1118,24 +1119,6 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "")])
 
-(define_insn "_vmdiv3"
-  [(set (match_operand:VF_128 0 "register_operand" "=x,v")
-   (vec_merge:VF_128
- (div:VF_128
-   (match_operand:VF_128 1 "register_operand" "0,v")
-   (match_operand:VF_128 2 "nonimmediate_operand" "xm,vm"))
- (match_dup 1)
- (const_int 1)))]
-  "TARGET_SSE"
-  "@
-   div\t{%2, %0|%0, %2}
-   vdiv\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "ssediv")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "btver2_decode" "direct,double")
-   (set_attr "mode" "")])
-
 (define_insn "_rcp2"
   [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
(unspec:VF1_128_256
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [7/n] Add AVX-512 patterns: VI4 and VI8 iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 7th subpatch. It extends VI4 and VI8 iterators.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/sse.md | 33 -
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 351f5bb..127ecf2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -207,9 +207,15 @@
 (define_mode_iterator VI4_AVX2
   [(V8SI "TARGET_AVX2") V4SI])
 
+(define_mode_iterator VI4_AVX512F
+  [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI])
+
 (define_mode_iterator VI8_AVX2
   [(V4DI "TARGET_AVX2") V2DI])
 
+(define_mode_iterator VI8_AVX2_AVX512F
+  [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
+
 ;; ??? We should probably use TImode instead.
 (define_mode_iterator VIMAX_AVX2
   [(V2TI "TARGET_AVX2") V1TI])
@@ -5854,10 +5860,10 @@
(set_attr "mode" "TI")])
 
 (define_expand "mul3"
-  [(set (match_operand:VI4_AVX2 0 "register_operand")
-   (mult:VI4_AVX2
- (match_operand:VI4_AVX2 1 "general_vector_operand")
- (match_operand:VI4_AVX2 2 "general_vector_operand")))]
+  [(set (match_operand:VI4_AVX512F 0 "register_operand")
+   (mult:VI4_AVX512F
+ (match_operand:VI4_AVX512F 1 "general_vector_operand")
+ (match_operand:VI4_AVX512F 2 "general_vector_operand")))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1)
@@ -5876,10 +5882,10 @@
 })
 
 (define_insn "*_mul3"
-  [(set (match_operand:VI4_AVX2 0 "register_operand" "=x,v")
-   (mult:VI4_AVX2
- (match_operand:VI4_AVX2 1 "nonimmediate_operand" "%0,v")
- (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm,vm")))]
+  [(set (match_operand:VI4_AVX512F 0 "register_operand" "=x,v")
+   (mult:VI4_AVX512F
+ (match_operand:VI4_AVX512F 1 "nonimmediate_operand" "%0,v")
+ (match_operand:VI4_AVX512F 2 "nonimmediate_operand" "xm,vm")))]
   "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, mode, operands)"
   "@
pmulld\t{%2, %0|%0, %2}
@@ -5892,9 +5898,10 @@
(set_attr "mode" "")])
 
 (define_expand "mul3"
-  [(set (match_operand:VI8_AVX2 0 "register_operand")
-   (mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
-  (match_operand:VI8_AVX2 2 "register_operand")))]
+  [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand")
+   (mult:VI8_AVX2_AVX512F
+ (match_operand:VI8_AVX2_AVX512F 1 "register_operand")
+ (match_operand:VI8_AVX2_AVX512F 2 "register_operand")))]
   "TARGET_SSE2"
 {
   ix86_expand_sse2_mulvxdi3 (operands[0], operands[1], operands[2]);
@@ -5941,8 +5948,8 @@
 (define_expand "vec_widen_mult_odd_"
   [(match_operand: 0 "register_operand")
(any_extend:
- (match_operand:VI4_AVX2 1 "general_vector_operand"))
-   (match_operand:VI4_AVX2 2 "general_vector_operand")]
+ (match_operand:VI4_AVX512F 1 "general_vector_operand"))
+   (match_operand:VI4_AVX512F 2 "general_vector_operand")]
   "TARGET_SSE2"
 {
   ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [8/n] Add AVX-512 patterns: VI48 and VI48_AVX2 iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 8th subpatch. It extends VI48 and VI48_AVX2 iterators.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/sse.md | 36 
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 127ecf2..49124ba 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -210,6 +210,10 @@
 (define_mode_iterator VI4_AVX512F
   [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI])
 
+(define_mode_iterator VI48_AVX512F
+  [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F")])
+
 (define_mode_iterator VI8_AVX2
   [(V4DI "TARGET_AVX2") V2DI])
 
@@ -247,9 +251,9 @@
(V8SI "TARGET_AVX2") V4SI
(V4DI "TARGET_AVX2") V2DI])
 
-(define_mode_iterator VI48_AVX2
-  [(V8SI "TARGET_AVX2") V4SI
-   (V4DI "TARGET_AVX2") V2DI])
+(define_mode_iterator VI48_AVX2_48_AVX512F
+  [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
 
 (define_mode_iterator V48_AVX2
   [V4SF V2DF
@@ -11404,26 +11408,26 @@
   DONE;
 })
 
-(define_insn "avx2_ashrv"
-  [(set (match_operand:VI4_AVX2 0 "register_operand" "=v")
-   (ashiftrt:VI4_AVX2
- (match_operand:VI4_AVX2 1 "register_operand" "v")
- (match_operand:VI4_AVX2 2 "nonimmediate_operand" "vm")))]
+(define_insn "_ashrv"
+  [(set (match_operand:VI48_AVX512F 0 "register_operand" "=v")
+   (ashiftrt:VI48_AVX512F
+ (match_operand:VI48_AVX512F 1 "register_operand" "v")
+ (match_operand:VI48_AVX512F 2 "nonimmediate_operand" "vm")))]
   "TARGET_AVX2"
-  "vpsravd\t{%2, %1, %0|%0, %1, %2}"
+  "vpsrav\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseishft")
-   (set_attr "prefix" "vex")
+   (set_attr "prefix" "maybe_evex")
(set_attr "mode" "")])
 
-(define_insn "avx2_v"
-  [(set (match_operand:VI48_AVX2 0 "register_operand" "=v")
-   (any_lshift:VI48_AVX2
- (match_operand:VI48_AVX2 1 "register_operand" "v")
- (match_operand:VI48_AVX2 2 "nonimmediate_operand" "vm")))]
+(define_insn "_v"
+  [(set (match_operand:VI48_AVX2_48_AVX512F 0 "register_operand" "=v")
+   (any_lshift:VI48_AVX2_48_AVX512F
+ (match_operand:VI48_AVX2_48_AVX512F 1 "register_operand" "v")
+ (match_operand:VI48_AVX2_48_AVX512F 2 "nonimmediate_operand" "vm")))]
   "TARGET_AVX2"
   "vpv\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseishft")
-   (set_attr "prefix" "vex")
+   (set_attr "prefix" "maybe_evex")
(set_attr "mode" "")])
 
 ;; For avx_vec_concat insn pattern
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [10/n] Add AVX-512 patterns: VI248_AVX2_8_AVX512F and VI124_256_48_AVX512F iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 10th subpatch. It introduces VI248_AVX2_8_AVX512F and VI124_256_48_512 
iterators.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/sse.md | 35 +--
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index dd310b5..a380690 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -257,6 +257,11 @@
(V8SI "TARGET_AVX2") V4SI
(V4DI "TARGET_AVX2") V2DI])
 
+(define_mode_iterator VI248_AVX2_8_AVX512F
+  [(V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
+
 (define_mode_iterator VI48_AVX2_48_AVX512F
   [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
@@ -341,8 +346,9 @@
 (define_mode_iterator VI248_128 [V8HI V4SI V2DI])
 (define_mode_iterator VI48_128 [V4SI V2DI])
 
-;; Random 256bit vector integer mode combinations
-(define_mode_iterator VI124_256 [V32QI V16HI V8SI])
+;; Various 256bit and 512 vector integer mode combinations
+(define_mode_iterator VI124_256_48_512
+  [V32QI V16HI V8SI (V8DI "TARGET_AVX512F") (V16SI "TARGET_AVX512F")])
 (define_mode_iterator VI48_256 [V8SI V4DI])
 
 ;; Int-float size matches
@@ -503,7 +509,8 @@
 
 (define_mode_attr ssepackmode
   [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI")
-   (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")])
+   (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
+   (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
 
 ;; Mapping of the max integer size for xop rotate immediate constraint
 (define_mode_attr sserotatemax
@@ -6114,23 +6121,23 @@
 
 
 (define_expand "3"
-  [(set (match_operand:VI124_256 0 "register_operand")
-   (maxmin:VI124_256
- (match_operand:VI124_256 1 "nonimmediate_operand")
- (match_operand:VI124_256 2 "nonimmediate_operand")))]
+  [(set (match_operand:VI124_256_48_512 0 "register_operand")
+   (maxmin:VI124_256_48_512
+ (match_operand:VI124_256_48_512 1 "nonimmediate_operand")
+ (match_operand:VI124_256_48_512 2 "nonimmediate_operand")))]
   "TARGET_AVX2"
   "ix86_fixup_binary_operands_no_copy (, mode, operands);")
 
 (define_insn "*avx2_3"
-  [(set (match_operand:VI124_256 0 "register_operand" "=v")
-   (maxmin:VI124_256
- (match_operand:VI124_256 1 "nonimmediate_operand" "%v")
- (match_operand:VI124_256 2 "nonimmediate_operand" "vm")))]
+  [(set (match_operand:VI124_256_48_512 0 "register_operand" "=v")
+   (maxmin:VI124_256_48_512
+ (match_operand:VI124_256_48_512 1 "nonimmediate_operand" "%v")
+ (match_operand:VI124_256_48_512 2 "nonimmediate_operand" "vm")))]
   "TARGET_AVX2 && ix86_binary_operator_ok (, mode, operands)"
   "vp\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseiadd")
(set_attr "prefix_extra" "1")
-   (set_attr "prefix" "vex")
+   (set_attr "prefix" "maybe_evex")
(set_attr "mode" "OI")])
 
 (define_expand "3"
@@ -6777,8 +6784,8 @@
 
 (define_expand "vec_pack_trunc_"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI248_AVX2 1 "register_operand")
-   (match_operand:VI248_AVX2 2 "register_operand")]
+   (match_operand:VI248_AVX2_8_AVX512F 1 "register_operand")
+   (match_operand:VI248_AVX2_8_AVX512F 2 "register_operand")]
   "TARGET_SSE2"
 {
   rtx op1 = gen_lowpart (mode, operands[1]);
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [9/n] Add AVX-512 patterns: VI124_AVX2, VI8F iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 9th subpatch. It extends VI124_AVX2_48 and VI8F iterators.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/sse.md | 38 +++---
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 49124ba..dd310b5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -236,6 +236,12 @@
   [(V16HI "TARGET_AVX2") V8HI
(V8SI "TARGET_AVX2") V4SI])
 
+(define_mode_iterator VI124_AVX2_48_AVX512F
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F")])
+
 (define_mode_iterator VI124_AVX512F
   [(V32QI "TARGET_AVX2") V16QI
(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI
@@ -344,6 +350,8 @@
 (define_mode_iterator VI8F_128 [V2DI V2DF])
 (define_mode_iterator VI4F_256 [V8SI V8SF])
 (define_mode_iterator VI8F_256 [V4DI V4DF])
+(define_mode_iterator VI8F_256_512
+  [V4DI V4DF (V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
 
 ;; Mapping from float mode to required SSE level
 (define_mode_attr sse
@@ -8627,9 +8635,9 @@
(set_attr "mode" "DI")])
 
 (define_insn "abs2"
-  [(set (match_operand:VI124_AVX2 0 "register_operand" "=v")
-   (abs:VI124_AVX2
- (match_operand:VI124_AVX2 1 "nonimmediate_operand" "vm")))]
+  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
+   (abs:VI124_AVX2_48_AVX512F
+ (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
   "TARGET_SSSE3"
   "%vpabs\t{%1, %0|%0, %1}"
   [(set_attr "type" "sselog1")
@@ -10755,25 +10763,25 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
 
-(define_expand "avx2_perm"
-  [(match_operand:VI8F_256 0 "register_operand")
-   (match_operand:VI8F_256 1 "nonimmediate_operand")
+(define_expand "_perm"
+  [(match_operand:VI8F_256_512 0 "register_operand")
+   (match_operand:VI8F_256_512 1 "nonimmediate_operand")
(match_operand:SI 2 "const_0_to_255_operand")]
   "TARGET_AVX2"
 {
   int mask = INTVAL (operands[2]);
-  emit_insn (gen_avx2_perm_1 (operands[0], operands[1],
-   GEN_INT ((mask >> 0) & 3),
-   GEN_INT ((mask >> 2) & 3),
-   GEN_INT ((mask >> 4) & 3),
-   GEN_INT ((mask >> 6) & 3)));
+  emit_insn (gen__perm_1 (operands[0], operands[1],
+   GEN_INT ((mask >> 0) & 3),
+   GEN_INT ((mask >> 2) & 3),
+   GEN_INT ((mask >> 4) & 3),
+   GEN_INT ((mask >> 6) & 3)));
   DONE;
 })
 
-(define_insn "avx2_perm_1"
-  [(set (match_operand:VI8F_256 0 "register_operand" "=v")
-   (vec_select:VI8F_256
- (match_operand:VI8F_256 1 "nonimmediate_operand" "vm")
+(define_insn "_perm_1"
+  [(set (match_operand:VI8F_256_512 0 "register_operand" "=v")
+   (vec_select:VI8F_256_512
+ (match_operand:VI8F_256_512 1 "nonimmediate_operand" "vm")
  (parallel [(match_operand 2 "const_0_to_3_operand")
 (match_operand 3 "const_0_to_3_operand")
 (match_operand 4 "const_0_to_3_operand")
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [11/n] Add AVX-512 patterns: FMA.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's the 11th subpatch. It introduces the AVX-512 FMA instructions.

Is it Ok?

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. SPEC 2000 & 2006 builds show no regressions, both with and without the
     -mavx512f option.
  4. SPEC 2000 & 2006 runs show no stability regressions without the
     -mavx512f option.

--
Thanks, K

PS. If it is OK, I am going to strip the ChangeLog lines out of the big patch.

---
 gcc/config/i386/i386.c |  2 +-
 gcc/config/i386/sse.md | 60 --
 2 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f10113f..5908383 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -34785,7 +34785,7 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, 
int opno, int *total,
rtx sub;
 
 gcc_assert (FLOAT_MODE_P (mode));
-gcc_assert (TARGET_FMA || TARGET_FMA4);
+gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
 
 /* ??? SSE scalar/vector cost should be used here.  */
 /* ??? Bald assumption that fma has the same cost as fmul.  */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a380690..6adcdd3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2254,9 +2254,18 @@
 ;
 
 ;; The standard names for scalar FMA are only available with SSE math enabled.
-(define_mode_iterator FMAMODEM [(SF "TARGET_SSE_MATH")
-   (DF "TARGET_SSE_MATH")
-   V4SF V2DF V8SF V4DF])
+;; CPUID bit AVX512F enables evex encoded scalar and 512-bit fma.  It doesn't
+;; care about FMA bit, so we enable fma for TARGET_AVX512F even when TARGET_FMA
+;; and TARGET_FMA4 are both false.
+(define_mode_iterator FMAMODEM
+  [(SF "TARGET_SSE_MATH && (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)")
+   (DF "TARGET_SSE_MATH && (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)")
+   (V4SF "TARGET_FMA || TARGET_FMA4")
+   (V2DF "TARGET_FMA || TARGET_FMA4")
+   (V8SF "TARGET_FMA || TARGET_FMA4")
+   (V4DF "TARGET_FMA || TARGET_FMA4")
+   (V16SF "TARGET_AVX512F")
+   (V8DF "TARGET_AVX512F")])
 
 (define_expand "fma4"
   [(set (match_operand:FMAMODEM 0 "register_operand")
@@ -2264,7 +2273,7 @@
  (match_operand:FMAMODEM 1 "nonimmediate_operand")
  (match_operand:FMAMODEM 2 "nonimmediate_operand")
  (match_operand:FMAMODEM 3 "nonimmediate_operand")))]
-  "TARGET_FMA || TARGET_FMA4")
+  "")
 
 (define_expand "fms4"
   [(set (match_operand:FMAMODEM 0 "register_operand")
@@ -2272,7 +2281,7 @@
  (match_operand:FMAMODEM 1 "nonimmediate_operand")
  (match_operand:FMAMODEM 2 "nonimmediate_operand")
  (neg:FMAMODEM (match_operand:FMAMODEM 3 "nonimmediate_operand"]
-  "TARGET_FMA || TARGET_FMA4")
+  "")
 
 (define_expand "fnma4"
   [(set (match_operand:FMAMODEM 0 "register_operand")
@@ -2280,7 +2289,7 @@
  (neg:FMAMODEM (match_operand:FMAMODEM 1 "nonimmediate_operand"))
  (match_operand:FMAMODEM 2 "nonimmediate_operand")
  (match_operand:FMAMODEM 3 "nonimmediate_operand")))]
-  "TARGET_FMA || TARGET_FMA4")
+  "")
 
 (define_expand "fnms4"
   [(set (match_operand:FMAMODEM 0 "register_operand")
@@ -2288,10 +2297,17 @@
  (neg:FMAMODEM (match_operand:FMAMODEM 1 "nonimmediate_operand"))
  (match_operand:FMAMODEM 2 "nonimmediate_operand")
  (neg:FMAMODEM (match_operand:FMAMODEM 3 "nonimmediate_operand"]
-  "TARGET_FMA || TARGET_FMA4")
+  "")
 
 ;; The builtins for intrinsics are not constrained by SSE math enabled.
-(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF])
+(define_mode_iterator FMAMODE [(SF "TARGET_FMA || TARGET_FMA4 || 
TARGET_AVX512F")
+  (DF "TARGET_FMA || TARGET_FMA4 || 
TARGET_AVX512F")
+  (V4SF "TARGET_FMA || TARGET_FMA4")
+  (V2DF "TARGET_FMA || TARGET_FMA4")
+  (V8SF "TARGET_FMA || TARGET_FMA4")
+  (V4DF "TARGET_FMA || TARGET_FMA4")
+  (V16SF "TARGET_AVX512F")
+  (V8DF "TARGET_AVX512F")])
 
 (define_expand "fma4i_fmadd_"
   [(set (match_operand:FMAMODE 0 "register_operand")
@@ -2299,7 +2315,7 @@
  (match_operand:FMAMODE 1 "nonimmediate_operand")
  (match_operand:FMAMODE 2 "nonimmediate_operand")
  (match_operand:FMAMODE 3 "nonimmediate_operand")))]
-  "TARGET_FMA || TARGET_FMA4")
+  "")
 
 (define_insn "*fma_fmadd_"
   [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x")
@@ -2307,7 +2323,7 @@
  (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0, v, x,x")
  (match_operand:FMAMODE 2 "nonimmediate_operand" "vm, 

Re: [PATCH i386 3/8] [AVX512] [13/n] Add AVX-512 patterns: VI4_AVX iterator.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's the 13th subpatch. It introduces the VI4_AVX iterator.

Is it Ok?

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. SPEC 2000 & 2006 builds show no regressions, both with and without the
     -mavx512f option.
  4. SPEC 2000 & 2006 runs show no stability regressions without the
     -mavx512f option.

--
Thanks, K

PS. If it is OK, I am going to strip the ChangeLog lines out of the big patch.

---
 gcc/config/i386/i386.c |  4 ++--
 gcc/config/i386/sse.md | 27 +--
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5908383..febceca 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -27751,7 +27751,7 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, 
"__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) 
INT64_FTYPE_V2DF },
   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, 
"__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) 
INT64_FTYPE_V2DF },
 
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", 
IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, 
"__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) 
V4SI_FTYPE_V4SF },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", 
IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, 
"__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) 
V4SI_FTYPE_V4SF },
 
@@ -28099,7 +28099,7 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, 
"__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) 
V4DF_FTYPE_V4SI },
   { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, 
"__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) 
V8SF_FTYPE_V8SI },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, 
"__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) 
V4SF_FTYPE_V4DF },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, 
"__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) 
V8SI_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, 
"__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) 
V8SI_FTYPE_V8SF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, 
"__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) 
V4DF_FTYPE_V4SF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, 
"__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) 
V4SI_FTYPE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, 
"__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) 
V4SI_FTYPE_V4DF },
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0ba1670..40030cf 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -207,6 +207,9 @@
 (define_mode_iterator VI2_AVX512F
   [(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI])
 
+(define_mode_iterator VI4_AVX
+  [(V8SI "TARGET_AVX") V4SI])
+
 (define_mode_iterator VI4_AVX2
   [(V8SI "TARGET_AVX2") V4SI])
 
@@ -2823,20 +2826,16 @@
   DONE;
 })
 
-(define_insn "avx_cvtps2dq256"
-  [(set (match_operand:V8SI 0 "register_operand" "=x")
-   (unspec:V8SI [(match_operand:V8SF 1 "nonimmediate_operand" "xm")]
-UNSPEC_FIX_NOTRUNC))]
-  "TARGET_AVX"
-  "vcvtps2dq\t{%1, %0|%0, %1}"
-  [(set_attr "type" "ssecvt")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "OI")])
 
-(define_insn "sse2_cvtps2dq"
-  [(set (match_operand:V4SI 0 "register_operand" "=x")
-   (unspec:V4SI [(match_operand:V4SF 1 "nonimmediate_operand" "xm")]
-UNSPEC_FIX_NOTRUNC))]
+;; For _fix_notrunc insn pattern
+(define_mode_attr sf2simodelower
+  [(V16SI "v16sf") (V8SI "v8sf") (V4SI "v4sf")])
+
+(define_insn "_fix_notrunc"
+  [(set (match_operand:VI4_AVX 0 "register_operand" "=v")
+   (unspec:VI4_AVX
+ [(match_operand: 1 "nonimmediate_operand" "vm")]
+ UNSPEC_FIX_NOTRUNC))]
   "TARGET_SSE2"
   "%vcvtps2dq\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
@@ -2846,7 +2845,7 @@
  (const_string "*")
  (const_string "1")))
(set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "")])
 
 (define_insn "fix_truncv16sfv16si2"
   [(set (match_operand:V16SI 0 "register_operand" "=v")
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [12/n] Add AVX-512 patterns: V_512 and VI_512 iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's the 12th subpatch. It introduces the V_512 and VI_512 iterators.

Is it Ok?

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. SPEC 2000 & 2006 builds show no regressions, both with and without the
     -mavx512f option.
  4. SPEC 2000 & 2006 runs show no stability regressions without the
     -mavx512f option.

--
Thanks, K

PS. If it is OK, I am going to strip the ChangeLog lines out of the big patch.

---
 gcc/config/i386/sse.md | 59 +-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6adcdd3..0ba1670 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -122,6 +122,9 @@
 (define_mode_iterator V_256
   [V32QI V16HI V8SI V4DI V8SF V4DF])
 
+;; All 512bit vector modes
+(define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF])
+
 ;; All 256bit and 512bit vector modes
 (define_mode_iterator V_256_512
   [V32QI V16HI V8SI V4DI V8SF V4DF
@@ -337,7 +340,10 @@
 ;; All 256bit vector integer modes
 (define_mode_iterator VI_256 [V32QI V16HI V8SI V4DI])
 
-;; Random 128bit vector integer mode combinations
+;; All 512bit vector integer modes
+(define_mode_iterator VI_512 [V64QI V32HI V16SI V8DI])
+
+;; Various 128bit vector integer mode combinations
 (define_mode_iterator VI12_128 [V16QI V8HI])
 (define_mode_iterator VI14_128 [V16QI V4SI])
 (define_mode_iterator VI124_128 [V16QI V8HI V4SI])
@@ -1853,6 +1859,23 @@
  (const_string "0")))
(set_attr "mode" "")])
 
+(define_expand "vcond"
+  [(set (match_operand:V_512 0 "register_operand")
+   (if_then_else:V_512
+ (match_operator 3 ""
+   [(match_operand:VF_512 4 "nonimmediate_operand")
+(match_operand:VF_512 5 "nonimmediate_operand")])
+ (match_operand:V_512 1 "general_operand")
+ (match_operand:V_512 2 "general_operand")))]
+  "TARGET_AVX512F
+   && (GET_MODE_NUNITS (mode)
+   == GET_MODE_NUNITS (mode))"
+{
+  bool ok = ix86_expand_fp_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
 (define_expand "vcond"
   [(set (match_operand:V_256 0 "register_operand")
(if_then_else:V_256
@@ -6457,6 +6480,23 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "TI")])
 
+(define_expand "vcond"
+  [(set (match_operand:V_512 0 "register_operand")
+   (if_then_else:V_512
+ (match_operator 3 ""
+   [(match_operand:VI_512 4 "nonimmediate_operand")
+(match_operand:VI_512 5 "general_operand")])
+ (match_operand:V_512 1)
+ (match_operand:V_512 2)))]
+  "TARGET_AVX512F
+   && (GET_MODE_NUNITS (mode)
+   == GET_MODE_NUNITS (mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
 (define_expand "vcond"
   [(set (match_operand:V_256 0 "register_operand")
(if_then_else:V_256
@@ -6506,6 +6546,23 @@
   DONE;
 })
 
+(define_expand "vcondu"
+  [(set (match_operand:V_512 0 "register_operand")
+   (if_then_else:V_512
+ (match_operator 3 ""
+   [(match_operand:VI_512 4 "nonimmediate_operand")
+(match_operand:VI_512 5 "nonimmediate_operand")])
+ (match_operand:V_512 1 "general_operand")
+ (match_operand:V_512 2 "general_operand")))]
+  "TARGET_AVX512F
+   && (GET_MODE_NUNITS (mode)
+   == GET_MODE_NUNITS (mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
 (define_expand "vcondu"
   [(set (match_operand:V_256 0 "register_operand")
(if_then_else:V_256
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [15/n] Add AVX-512 patterns: VI48F_512 iterator.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's the 15th subpatch. It introduces the VI48F_512 iterator.

Is it Ok?

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. SPEC 2000 & 2006 builds show no regressions, both with and without the
     -mavx512f option.
  4. SPEC 2000 & 2006 runs show no stability regressions without the
     -mavx512f option.

--
Thanks, K

PS. If it is OK, I am going to strip the ChangeLog lines out of the big patch.

---
 gcc/config/i386/sse.md | 332 ++---
 1 file changed, 316 insertions(+), 16 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bfaa3a1..2364ccc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -83,6 +83,11 @@
   UNSPEC_VPERMTI
   UNSPEC_GATHER
   UNSPEC_VSIBADDR
+
+  ;; For AVX512F support
+  UNSPEC_VPERMI2
+  UNSPEC_VPERMT2
+  UNSPEC_SCATTER
 ])
 
 (define_c_enum "unspecv" [
@@ -371,6 +376,7 @@
   [V8SI V8SF
   (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
   (V8DI  "TARGET_AVX512F") (V8DF  "TARGET_AVX512F")])
+(define_mode_iterator VI48F_512 [V16SI V16SF V8DI V8DF])
 
 ;; Mapping from float mode to required SSE level
 (define_mode_attr sse
@@ -409,6 +415,15 @@
(V4SF "V4SF") (V2DF "V2DF")
(TI "TI")])
 
+;; Mapping of vector modes to corresponding mask size
+(define_mode_attr avx512fmaskmode
+  [(V16QI "HI")
+   (V16HI "HI") (V8HI  "QI")
+   (V16SI "HI") (V8SI  "QI") (V4SI  "QI")
+   (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
+   (V16SF "HI") (V8SF  "QI") (V4SF  "QI")
+   (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
+
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr sseintvecmode
   [(V16SF "V16SI") (V8DF  "V8DI")
@@ -501,10 +516,12 @@
 
 ;; SSE prefix for integer vector modes
 (define_mode_attr sseintprefix
-  [(V2DI "p") (V2DF "")
-   (V4DI "p") (V4DF "")
-   (V4SI "p") (V4SF "")
-   (V8SI "p") (V8SF "")])
+  [(V2DI  "p") (V2DF  "")
+   (V4DI  "p") (V4DF  "")
+   (V8DI  "p") (V8DF  "")
+   (V4SI  "p") (V4SF  "")
+   (V8SI  "p") (V8SF  "")
+   (V16SI "p") (V16SF "")])
 
 ;; SSE scalar suffix for vector modes
 (define_mode_attr ssescalarmodesuffix
@@ -549,6 +566,10 @@
 (define_mode_attr blendbits
   [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")])
 
+;; Mapping suffixes for broadcast
+(define_mode_attr bcstscalarsuff
+  [(V16SI "d") (V16SF "ss") (V8DI "q") (V8DF "sd")])
+
 ;; Patterns whose name begins with "sse{,2,3}_" are invoked by intrinsics.
 
 ;
@@ -688,6 +709,18 @@
  ]
  (const_string "")))])
 
+(define_insn "avx512f_blendm"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+   (vec_merge:VI48F_512
+ (match_operand:VI48F_512 2 "nonimmediate_operand" "vm")
+ (match_operand:VI48F_512 1 "register_operand" "v")
+ (match_operand: 3 "register_operand" "k")))]
+  "TARGET_AVX512F"
+  "vblendm\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_insn "sse2_movq128"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
(vec_concat:V2DI
@@ -1826,6 +1859,24 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "")])
 
+(define_mode_attr cmp_imm_predicate
+  [(V16SF "const_0_to_31_operand") (V8DF "const_0_to_31_operand")
+  (V16SI "const_0_to_7_operand") (V8DI "const_0_to_7_operand")])
+
+(define_insn "avx512f_cmp3"
+  [(set (match_operand: 0 "register_operand" "=k")
+   (unspec:
+ [(match_operand:VI48F_512 1 "register_operand" "v")
+  (match_operand:VI48F_512 2 "nonimmediate_operand" "vm")
+  (match_operand:SI 3 "" "n")]
+ UNSPEC_PCMP))]
+  "TARGET_AVX512F"
+  "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_insn "_comi"
   [(set (reg:CCFP FLAGS_REG)
(compare:CCFP
@@ -10927,6 +10978,28 @@
(set_attr "isa" "*,avx2,noavx2")
(set_attr "mode" "V8SF")])
 
+(define_insn "avx512f_vec_dup"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+   (vec_duplicate:VI48F_512
+ (vec_select:
+   (match_operand: 1 "nonimmediate_operand" "vm")
+   (parallel [(const_int 0)]]
+  "TARGET_AVX512F"
+  "vbroadcast\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
+(define_insn "avx512f_vec_dup_mem"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=x")
+   (vec_duplicate:VI48F_512
+ (match_operand: 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX512F"
+  "vbroadcast\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_insn "avx2_vbroadcasti128_"
   [(set (match_operand:VI_2

Re: [PATCH i386 3/8] [AVX512] [14/n] Add AVX-512 patterns: VI48F_256_512 iterator.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's the 14th subpatch. It introduces the VI48F_256_512 iterator.

Is it Ok?

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. SPEC 2000 & 2006 builds show no regressions, both with and without the
     -mavx512f option.
  4. SPEC 2000 & 2006 runs show no stability regressions without the
     -mavx512f option.

--
Thanks, K

PS. If it is OK, I am going to strip the ChangeLog lines out of the big patch.

---
 gcc/config/i386/sse.md | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 40030cf..bfaa3a1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -367,6 +367,10 @@
 (define_mode_iterator VI8F_256 [V4DI V4DF])
 (define_mode_iterator VI8F_256_512
   [V4DI V4DF (V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+(define_mode_iterator VI48F_256_512
+  [V8SI V8SF
+  (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+  (V8DI  "TARGET_AVX512F") (V8DF  "TARGET_AVX512F")])
 
 ;; Mapping from float mode to required SSE level
 (define_mode_attr sse
@@ -10830,17 +10834,17 @@
(set_attr "prefix" "vex")
(set_attr "mode" "")])
 
-(define_insn "avx2_permvar"
-  [(set (match_operand:VI4F_256 0 "register_operand" "=v")
-   (unspec:VI4F_256
- [(match_operand:VI4F_256 1 "nonimmediate_operand" "vm")
-  (match_operand:V8SI 2 "register_operand" "v")]
+(define_insn "_permvar"
+  [(set (match_operand:VI48F_256_512 0 "register_operand" "=v")
+   (unspec:VI48F_256_512
+ [(match_operand:VI48F_256_512 1 "nonimmediate_operand" "vm")
+  (match_operand: 2 "register_operand" "v")]
  UNSPEC_VPERMVAR))]
   "TARGET_AVX2"
   "vperm\t{%1, %2, %0|%0, %2, %1}"
   [(set_attr "type" "sselog")
(set_attr "prefix" "vex")
-   (set_attr "mode" "OI")])
+   (set_attr "mode" "")])
 
 (define_expand "_perm"
   [(match_operand:VI8F_256_512 0 "register_operand")
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [17/n] Add AVX-512 patterns: V8FI and V16FI iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's the 17th subpatch. It introduces the V8FI and V16FI iterators.

Is it Ok?

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. SPEC 2000 & 2006 builds show no regressions, both with and without the
     -mavx512f option.
  4. SPEC 2000 & 2006 runs show no stability regressions without the
     -mavx512f option.

--
Thanks, K

PS. If it is OK, I am going to strip the ChangeLog lines out of the big patch.

---
 gcc/config/i386/predicates.md |  10 ++
 gcc/config/i386/sse.md| 367 +-
 2 files changed, 376 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index eff82eb..e1670f3 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -757,11 +757,21 @@
   (and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 8, 11)")))
 
+;; Match 8 to 15.
+(define_predicate "const_8_to_15_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 8, 15)")))
+
 ;; Match 12 to 15.
 (define_predicate "const_12_to_15_operand"
   (and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 12, 15)")))
 
+;; Match 16 to 31.
+(define_predicate "const_16_to_31_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 16, 31)")))
+
 ;; True if this is a constant appropriate for an increment or decrement.
 (define_predicate "incdec_operand"
   (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8221d61..2b27649f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -243,6 +243,14 @@
 (define_mode_iterator VI8_AVX2_AVX512F
   [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
 
+;; All V8D* modes
+(define_mode_iterator V8FI
+  [V8DF V8DI])
+
+;; All V16S* modes
+(define_mode_iterator V16FI
+  [V16SF V16SI])
+
 ;; ??? We should probably use TImode instead.
 (define_mode_iterator VIMAX_AVX2
   [(V2TI "TARGET_AVX2") V1TI])
@@ -347,8 +355,12 @@
   (V32QI "i") (V16HI "u") (V16QI "i") (V8HI "i")
   (V64QI "i") (V1TI "i") (V2TI "i")])
 
+(define_mode_attr ssequartermode
+  [(V16SF "V4SF") (V8DF "V2DF") (V16SI "V4SI") (V8DI "V2DI")])
+
 (define_mode_attr ssedoublemode
-  [(V16HI "V16SI") (V8HI "V8SI") (V4HI "V4SI")
+  [(V16SF "V32SF") (V16SI "V32SI") (V8DI "V16DI") (V8DF "V16DF")
+   (V16HI "V16SI") (V8HI "V8SI") (V4HI "V4SI")
(V32QI "V32HI") (V16QI "V16HI")])
 
 (define_mode_attr ssebytemode
@@ -1697,6 +1709,15 @@
(set_attr "prefix_rep" "1,*")
(set_attr "mode" "V4SF")])
 
+(define_expand "reduc_splus_v8df"
+  [(match_operand:V8DF 0 "register_operand")
+   (match_operand:V8DF 1 "register_operand")]
+  "TARGET_AVX512F"
+{
+  ix86_expand_reduc (gen_addv8df3, operands[0], operands[1]);
+  DONE;
+})
+
 (define_expand "reduc_splus_v4df"
   [(match_operand:V4DF 0 "register_operand")
(match_operand:V4DF 1 "register_operand")]
@@ -1719,6 +1740,15 @@
   DONE;
 })
 
+(define_expand "reduc_splus_v16sf"
+  [(match_operand:V16SF 0 "register_operand")
+   (match_operand:V16SF 1 "register_operand")]
+  "TARGET_AVX512F"
+{
+  ix86_expand_reduc (gen_addv16sf3, operands[0], operands[1]);
+  DONE;
+})
+
 (define_expand "reduc_splus_v8sf"
   [(match_operand:V8SF 0 "register_operand")
(match_operand:V8SF 1 "register_operand")]
@@ -4748,6 +4778,86 @@
   operands[1] = adjust_address (operands[1], SFmode, INTVAL (operands[2]) * 4);
 })
 
+(define_insn "avx512f_vextract32x4_1"
+  [(set (match_operand: 0 "nonimmediate_operand" "=vm")
+   (vec_select:
+ (match_operand:V16FI 1 "register_operand" "v")
+ (parallel [(match_operand 2  "const_0_to_15_operand")
+(match_operand 3  "const_0_to_15_operand")
+(match_operand 4  "const_0_to_15_operand")
+(match_operand 5  "const_0_to_15_operand")])))]
+  "TARGET_AVX512F && (INTVAL (operands[2]) = INTVAL (operands[3]) - 1)
+  && (INTVAL (operands[3]) = INTVAL (operands[4]) - 1)
+  && (INTVAL (operands[4]) = INTVAL (operands[5]) - 1)"
+{
+  operands[2] = GEN_INT ((INTVAL (operands[2])) >> 2);
+  return "vextract32x4\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set (attr "memory")
+  (if_then_else (match_test "MEM_P (operands[0])")
+   (const_string "store")
+   (const_string "none")))
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
+(define_split
+  [(set (match_operand: 0 "nonimmediate_operand")
+   (vec_select:
+ (match_operand:V8FI 1 "nonimmediate_operand")
+ (parallel [(const_int 0) (const_int 1)
+(const_int 2) (const_int 3)])))]
+  "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))
+  && reload_completed"
+  [(const_int 0)]
+{
+  rtx op1 = operands[1];
+  if (REG_P (op1))
+op1 = gen

Re: [PATCH i386 3/8] [AVX512] [18/n] Add AVX-512 patterns: various RCPs and SQRTs.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's the 18th subpatch. It introduces various new insns.

Is it Ok?

Testing:
  1. Bootstrap passes.
  2. make check shows no regressions.
  3. SPEC 2000 & 2006 builds show no regressions, both with and without the
     -mavx512f option.
  4. SPEC 2000 & 2006 runs show no stability regressions without the
     -mavx512f option.

--
Thanks, K

PS. If it is OK, I am going to strip the ChangeLog lines out of the big patch.

---
 gcc/config/i386/sse.md | 220 -
 1 file changed, 216 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2b27649f..3ab35a7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -91,7 +91,13 @@
   UNSPEC_TESTM
   UNSPEC_TESTNM
   UNSPEC_SCATTER
+  UNSPEC_RCP14
+  UNSPEC_RSQRT14
+  UNSPEC_FIXUPIMM
+  UNSPEC_SCALEF
   UNSPEC_VTERNLOG
+  UNSPEC_GETEXP
+  UNSPEC_GETMANT
   UNSPEC_ALIGN
   UNSPEC_CONFLICT
   UNSPEC_MASKED_EQ
@@ -100,6 +106,11 @@
   ;; For AVX512PF support
   UNSPEC_GATHER_PREFETCH
   UNSPEC_SCATTER_PREFETCH
+
+  ;; For AVX512ER support
+  UNSPEC_EXP2
+  UNSPEC_RCP28
+  UNSPEC_RSQRT28
 ])
 
 (define_c_enum "unspecv" [
@@ -363,6 +374,9 @@
(V16HI "V16SI") (V8HI "V8SI") (V4HI "V4SI")
(V32QI "V32HI") (V16QI "V16HI")])
 
+(define_mode_attr ssefixupmode
+  [(V16SF "V16SI") (V4SF "V4SI") (V8DF "V8DI") (V2DF "V2DI")])
+
 (define_mode_attr ssebytemode
   [(V4DI "V32QI") (V2DI "V16QI")])
 
@@ -1254,6 +1268,32 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "SF")])
 
+(define_insn "rcp14"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+   (unspec:VF_512
+ [(match_operand:VF_512 1 "nonimmediate_operand" "vm")]
+ UNSPEC_RCP14))]
+  "TARGET_AVX512F"
+  "vrcp14\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
+(define_insn "srcp14"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+   (vec_merge:VF_128
+ (unspec:VF_128
+   [(match_operand:VF_128 1 "register_operand" "v")
+(match_operand:VF_128 2 "nonimmediate_operand" "vm")]
+   UNSPEC_RCP14)
+ (match_dup 1)
+ (const_int 1)))]
+  "TARGET_AVX512F"
+  "vrcp14\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_expand "sqrt2"
   [(set (match_operand:VF2 0 "register_operand")
(sqrt:VF2 (match_operand:VF2 1 "nonimmediate_operand")))]
@@ -1324,6 +1364,32 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "")])
 
+(define_insn "rsqrt14"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+   (unspec:VF_512
+ [(match_operand:VF_512 1 "nonimmediate_operand" "vm")]
+ UNSPEC_RSQRT14))]
+  "TARGET_AVX512F"
+  "vrsqrt14\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
+(define_insn "rsqrt14"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+   (vec_merge:VF_128
+ (unspec:VF_128
+   [(match_operand:VF_128 1 "register_operand" "v")
+(match_operand:VF_128 2 "nonimmediate_operand" "vm")]
+   UNSPEC_RSQRT14)
+ (match_dup 1)
+ (const_int 1)))]
+  "TARGET_AVX512F"
+  "vrsqrt14\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_insn "sse_vmrsqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
(vec_merge:V4SF
@@ -5301,6 +5367,29 @@
   operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8);
 })
 
+(define_insn "avx512f_vmscalef"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+   (vec_merge:VF_128
+ (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v")
+ (match_operand:VF_128 2 "nonimmediate_operand" "vm")]
+UNSPEC_SCALEF)
+ (match_dup 1)
+ (const_int 1)))]
+  "TARGET_AVX512F"
+  "%vscalef\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "")])
+
+(define_insn "avx512f_scalef"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+   (unspec:VF_512 [(match_operand:VF_512 1 "register_operand" "v")
+   (match_operand:VF_512 2 "nonimmediate_operand" "vm")]
+  UNSPEC_SCALEF))]
+  "TARGET_AVX512F"
+  "%vscalef\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "")])
+
 (define_insn "avx512f_vternlog"
   [(set (match_operand:VI48_512 0 "register_operand" "=v")
(unspec:VI48_512
@@ -5315,6 +5404,28 @@
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "avx512f_getexp"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+(unspec:VF_512 [(match_operand:VF_512 1 "nonimmediate_operand" "vm")

Re: [PATCH i386 3/8] [AVX512] [20/n] Add AVX-512 patterns: Misc.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 20th subpatch. It introduces last insns of AVX-512F.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

This patch finalizes the 3/8 series.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/i386.md | 16 +++-
 gcc/config/i386/sse.md  | 34 ++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e7e9f2d..91be1ce 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -17531,7 +17531,7 @@
   [(prefetch (match_operand 0 "address_operand")
 (match_operand:SI 1 "const_int_operand")
 (match_operand:SI 2 "const_int_operand"))]
-  "TARGET_PREFETCH_SSE || TARGET_PRFCHW"
+  "TARGET_PREFETCH_SSE || TARGET_PRFCHW || TARGET_AVX512PF"
 {
   bool write = INTVAL (operands[1]) != 0;
   int locality = INTVAL (operands[2]);
@@ -17544,6 +17544,8 @@
  of locality.  */
   if (TARGET_PRFCHW && (write || !TARGET_PREFETCH_SSE))
 operands[2] = GEN_INT (3);
+  else if (TARGET_AVX512PF && (write || !TARGET_PREFETCH_SSE))
+operands[2] = GEN_INT (1);
   else
 operands[1] = const0_rtx;
 })
@@ -17585,6 +17587,18 @@
(symbol_ref "memory_address_length (operands[0], false)"))
(set_attr "memory" "none")])
 
+(define_insn "*prefetch_avx512pf_"
+  [(prefetch (match_operand:P 0 "address_operand" "p")
+(const_int 1)
+(const_int 1))]
+  "TARGET_AVX512PF"
+  "prefetchwt1\t%a0";
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set (attr "length_address")
+   (symbol_ref "memory_address_length (operands[0], false)"))
+   (set_attr "memory" "none")])
+
 (define_expand "stack_protect_set"
   [(match_operand 0 "memory_operand")
(match_operand 1 "memory_operand")]
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f7e9fd5..939cc33 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2013,6 +2013,34 @@
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "avx512f_vmcmp3"
+  [(set (match_operand: 0 "register_operand" "=k")
+   (and:
+ (unspec:
+   [(match_operand:VF_128 1 "register_operand" "v")
+(match_operand:VF_128 2 "nonimmediate_operand" "vm")
+(match_operand:SI 3 "const_0_to_31_operand" "n")]
+   UNSPEC_PCMP)
+ (const_int 1)))]
+  "TARGET_AVX512F"
+  "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
+(define_insn "avx512f_maskcmp3"
+  [(set (match_operand: 0 "register_operand" "=k")
+   (match_operator: 3 "sse_comparison_operator"
+ [(match_operand:VF 1 "register_operand" "v")
+  (match_operand:VF 2 "nonimmediate_operand" "vm")]))]
+  "TARGET_SSE"
+  "vcmp%D3\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_insn "_comi"
   [(set (reg:CCFP FLAGS_REG)
(compare:CCFP
@@ -12154,6 +12182,12 @@
 }
 })
 
+(define_expand "vashrv16si3"
+  [(set (match_operand:V16SI 0 "register_operand")
+   (ashiftrt:V16SI (match_operand:V16SI 1 "register_operand")
+   (match_operand:V16SI 2 "nonimmediate_operand")))]
+  "TARGET_AVX512F")
+
 (define_expand "vashrv8si3"
   [(set (match_operand:V8SI 0 "register_operand")
(ashiftrt:V8SI (match_operand:V8SI 1 "register_operand")
-- 
1.7.11.7



Re: [PATCH i386 3/8] [AVX512] [19/n] Add AVX-512 patterns: Extracts and converts.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 19th subpatch. It extends extract and convert insn patterns.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/i386.md   |   5 +
 gcc/config/i386/predicates.md |  40 ++
 gcc/config/i386/sse.md| 938 +-
 3 files changed, 977 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 10ca6cb..e7e9f2d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -831,6 +831,11 @@
 (define_code_attr s [(sign_extend "s") (zero_extend "u")])
 (define_code_attr u_bool [(sign_extend "false") (zero_extend "true")])
 
+;; Used in signed and unsigned truncations.
+(define_code_iterator any_truncate [ss_truncate truncate us_truncate])
+;; Instruction suffix for truncations.
+(define_code_attr trunsuffix [(ss_truncate "s") (truncate "") (us_truncate 
"us")])
+
 ;; Used in signed and unsigned fix.
 (define_code_iterator any_fix [fix unsigned_fix])
 (define_code_attr fixsuffix [(fix "") (unsigned_fix "u")])
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index e1670f3..261335d 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -752,6 +752,11 @@
   (and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 6, 7)")))
 
+;; Match 8 to 9.
+(define_predicate "const_8_to_9_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 8, 9)")))
+
 ;; Match 8 to 11.
 (define_predicate "const_8_to_11_operand"
   (and (match_code "const_int")
@@ -762,16 +767,51 @@
   (and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 8, 15)")))
 
+;; Match 10 to 11.
+(define_predicate "const_10_to_11_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 10, 11)")))
+
+;; Match 12 to 13.
+(define_predicate "const_12_to_13_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 12, 13)")))
+
 ;; Match 12 to 15.
 (define_predicate "const_12_to_15_operand"
   (and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 12, 15)")))
 
+;; Match 14 to 15.
+(define_predicate "const_14_to_15_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 14, 15)")))
+
+;; Match 16 to 19.
+(define_predicate "const_16_to_19_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 16, 19)")))
+
 ;; Match 16 to 31.
 (define_predicate "const_16_to_31_operand"
   (and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 16, 31)")))
 
+;; Match 20 to 23.
+(define_predicate "const_20_to_23_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 20, 23)")))
+
+;; Match 24 to 27.
+(define_predicate "const_24_to_27_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 24, 27)")))
+
+;; Match 28 to 31.
+(define_predicate "const_28_to_31_operand"
+  (and (match_code "const_int")
+   (match_test "IN_RANGE (INTVAL (op), 28, 31)")))
+
 ;; True if this is a constant appropriate for an increment or decrement.
 (define_predicate "incdec_operand"
   (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3ab35a7..f7e9fd5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -87,6 +87,7 @@
   ;; For AVX512F support
   UNSPEC_VPERMI2
   UNSPEC_VPERMT2
+  UNSPEC_UNSIGNED_FIX_NOTRUNC
   UNSPEC_UNSIGNED_PCMP
   UNSPEC_TESTM
   UNSPEC_TESTNM
@@ -2997,6 +2998,34 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "DI")])
 
+(define_insn "cvtusi232"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+   (vec_merge:VF_128
+ (vec_duplicate:VF_128
+   (unsigned_float:
+ (match_operand:SI 2 "nonimmediate_operand" "rm")))
+ (match_operand:VF_128 1 "register_operand" "v")
+ (const_int 1)))]
+  "TARGET_AVX512F"
+  "vcvtusi2\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
+(define_insn "cvtusi264"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+   (vec_merge:VF_128
+ (vec_duplicate:VF_128
+   (unsigned_float:
+ (match_operand:DI 2 "nonimmediate_operand" "rm")))
+ (match_operand:VF_128 1 "register_operand" "v")
+ (const_int 1)))]
+  "TARGET_AVX512F && TARGET_64BIT"
+  "vcvtusi2\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (se

Re: [PATCH i386 3/8] [AVX512] [16/n] Add AVX-512 patterns: VI48_512 and VI4F_128 iterators.

2013-10-09 Thread Kirill Yukhin
Hello,

> This patch is still far too large.
>
> I think you should split it up based on every single mode iterator that
> you need to add or change.

Here's 16th subpatch. It extends VI4F_128 and introduces VI48_512 iterator.

Is it Ok?

Testing:
  1. Bootstrap pass.
  2. make check shows no regressions.
  3. Spec 2000 & 2006 build show no regressions both with and without -mavx512f 
option.
  4. Spec 2000 & 2006 run shows no stability regressions without -mavx512f 
option.

--
Thanks, K

PS. If it is Ok - I am going to strip out ChangeLog lines from big patch.

---
 gcc/config/i386/predicates.md |   5 +
 gcc/config/i386/sse.md| 344 +-
 2 files changed, 348 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 18f425c..eff82eb 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1332,3 +1332,8 @@
 (define_predicate "general_vector_operand"
   (ior (match_operand 0 "nonimmediate_operand")
(match_code "const_vector")))
+
+;; Return true if OP is either -1 constant or stored in register.
+(define_predicate "register_or_constm1_operand"
+  (ior (match_operand 0 "register_operand")
+   (match_test "op == constm1_rtx")))
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2364ccc..8221d61 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -87,7 +87,19 @@
   ;; For AVX512F support
   UNSPEC_VPERMI2
   UNSPEC_VPERMT2
+  UNSPEC_UNSIGNED_PCMP
+  UNSPEC_TESTM
+  UNSPEC_TESTNM
   UNSPEC_SCATTER
+  UNSPEC_VTERNLOG
+  UNSPEC_ALIGN
+  UNSPEC_CONFLICT
+  UNSPEC_MASKED_EQ
+  UNSPEC_MASKED_GT
+
+  ;; For AVX512PF support
+  UNSPEC_GATHER_PREFETCH
+  UNSPEC_SCATTER_PREFETCH
 ])
 
 (define_c_enum "unspecv" [
@@ -364,6 +376,7 @@
 (define_mode_iterator VI124_256_48_512
   [V32QI V16HI V8SI (V8DI "TARGET_AVX512F") (V16SI "TARGET_AVX512F")])
 (define_mode_iterator VI48_256 [V8SI V4DI])
+(define_mode_iterator VI48_512 [V16SI V8DI])
 
 ;; Int-float size matches
 (define_mode_iterator VI4F_128 [V4SI V4SF])
@@ -1741,7 +1754,9 @@
   [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
-   (V4SF "TARGET_SSE")])
+   (V4SF "TARGET_SSE") (V16SI "TARGET_AVX512F")
+   (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+   (V8DF "TARGET_AVX512F")])
 
 (define_expand "reduc__"
   [(smaxmin:REDUC_SMINMAX_MODE
@@ -1754,6 +1769,16 @@
 })
 
 (define_expand "reduc__"
+  [(umaxmin:VI48_512
+ (match_operand:VI48_512 0 "register_operand")
+ (match_operand:VI48_512 1 "register_operand"))]
+  "TARGET_AVX512F"
+{
+  ix86_expand_reduc (gen_3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc__"
   [(umaxmin:VI_256
  (match_operand:VI_256 0 "register_operand")
  (match_operand:VI_256 1 "register_operand"))]
@@ -1877,6 +1902,20 @@
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "avx512f_ucmp3"
+  [(set (match_operand: 0 "register_operand" "=k")
+   (unspec:
+ [(match_operand:VI48_512 1 "register_operand" "v")
+  (match_operand:VI48_512 2 "nonimmediate_operand" "vm")
+  (match_operand:SI 3 "const_0_to_7_operand" "n")]
+ UNSPEC_UNSIGNED_PCMP))]
+  "TARGET_AVX512F"
+  "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_insn "_comi"
   [(set (reg:CCFP FLAGS_REG)
(compare:CCFP
@@ -5113,6 +5152,31 @@
   operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8);
 })
 
+(define_insn "avx512f_vternlog"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+   (unspec:VI48_512
+ [(match_operand:VI48_512 1 "register_operand" "0")
+  (match_operand:VI48_512 2 "register_operand" "v")
+  (match_operand:VI48_512 3 "nonimmediate_operand" "vm")
+  (match_operand:SI 4 "const_0_to_255_operand")]
+ UNSPEC_VTERNLOG))]
+  "TARGET_AVX512F"
+  "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
+(define_insn "avx512f_align"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+(unspec:VI48_512 [(match_operand:VI48_512 1 "register_operand" "v")
+ (match_operand:VI48_512 2 "nonimmediate_operand" "vm")
+ (match_operand:SI 3 "const_0_to_255_operand")]
+UNSPEC_ALIGN))]
+  "TARGET_AVX512F"
+  "valign\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_insn "avx512f_rndscale"
   [(set (match_operand:VF_512 0 "register_operand" "=v")
(unspec:VF_512
@@ -6137,6 +6201,22 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "")])
 
+(define_insn "3"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v,v")
+ 

Re: [PATCH][ARM]Replace gen_rtx_PLUS with plus_constant

2013-10-09 Thread Kyrill Tkachov

On 01/10/13 11:15, Marcus Shawcroft wrote:

On 30 September 2013 14:23, Renlin Li  wrote:


OK for trunk?

Kind regards,
Renlin Li

gcc/ChangeLog:

2013-09-30  Renlin Li  

 * config/arm/arm.c (arm_output_mi_thunk): Use plus_constant.

OK
/Marcus


Hi Renlin,

Richard Earnshaw also ok'd this patch offline.
I've committed it for you as r203309.

Kyrill



Re: [patch] Fix PR middle-end/58570

2013-10-09 Thread Richard Biener
On Tue, Oct 8, 2013 at 7:52 PM, Eric Botcazou  wrote:
>> Probably because the actual accesses may overlap if we choose to
>> perform a bigger access.
>
> Nope, simply because they share a byte.
>
>> The same can happen if we for struct { char c1; char c2; } perform
>> an HImode access in case the target doesn't support QImode accesses.
>> Basically anytime we go through the bitfield expansion path.  Thus, doesn't
>> that mean that MEM_EXPR is wrong on the MEMs?  Maybe we used to
>> strip all DECL_BIT_FIELD component-refs at some point (adjusting
>> MEM_OFFSET accordingly)?
>
> Yes, we used to strip the MEM_EXPRs as soon as we go through the bitfield
> expansion path until last year, when I changed it:
>
> 2012-09-14  Eric Botcazou  
>
> PR rtl-optimization/44194
> * calls.c (expand_call): In the PARALLEL case, copy the return value
> into pseudos instead of spilling it onto the stack.
> * emit-rtl.c (adjust_address_1): Rename ADJUST into ADJUST_ADDRESS and
> add new ADJUST_OBJECT parameter.
> If ADJUST_OBJECT is set, drop the underlying object if it cannot be
> proved that the adjusted memory access is still within its bounds.
> (adjust_automodify_address_1): Adjust call to adjust_address_1.
> (widen_memory_access): Likewise.
> * expmed.c (store_bit_field_1): Call adjust_bitfield_address instead
> of adjust_address.  Do not drop the underlying object of a MEM.
> (store_fixed_bit_field): Likewise.
> (extract_bit_field_1): Likewise.  Fix oversight in recursion.
> (extract_fixed_bit_field): Likewise.
> * expr.h (adjust_address_1): Adjust prototype.
> (adjust_address): Adjust call to adjust_address_1.
> (adjust_address_nv): Likewise.
> (adjust_bitfield_address): New macro.
> (adjust_bitfield_address_nv): Likewise.
> * expr.c (expand_assignment): Handle a PARALLEL in more cases.
> (store_expr): Likewise.
> (store_field): Likewise.
>
> But this was done carefully, i.e. we still drop the MEM_EXPRs if we cannot
> prove that they are still valid.  Now the granularity of memory accesses at
> the RTL level is the byte so everything is rounded up to byte boundaries,
> that's why bitfields sharing a byte need to be dealt with specially.

Yes, so you need to drop bit-granular parts of MEM_EXPRs at this point
(or when initially creating them from set_mem_attributes_minus_bitpos).

>> Your patch seems to paper over this issue in the wrong place ...
>
> No, it's the proper, albeit conservative, fix in my opinion.

In my opinion the MEM_EXPR is "wrong", as it is supposed to be
the tree equivalent of the memory access.  At gimple level we
handle accesses at bit-granularity so bit-accesses are fine.
Not so at RTL level it seems.

[this also shows we probably should lower bit-granular accesses
at the gimple level, as planned for some time (to read, bit-extract
and read, bit-modify, write)]

Btw, ao_ref_from_mem will AFAIK not correctly handle bit-granular
accesses.  For

struct { int pad : 1; int a : 1; int b : 1; } x;

x.a will have MEM_SIZE == 1 and ref->offset = 1, ref->size == 8
x.b will have MEM_SIZE == 1 and ref->offset = 2, ref->size == 8

so we feed the alias oracle with bit-granular offsets but byte-granular
sizes.  Now I cannot quickly create a testcase that makes offset
based disambiguation disambiguate two accesses that overlap
with the actual memory access, but I'm not 100% sure it's not
possible.  At least it will cause false aliasing for accesses crossing
byte-boundaries.

While the ultimate solution of making gimple match rtl (byte-granular
accesses only) would be best, adjusting the MEM attrs to the
RTL land sounds like a more appropriate fix.

Richard.

> --
> Eric Botcazou


Re: [PATCH][AArch64] Vneg NEON intrinsics modified

2013-10-09 Thread Marcus Shawcroft
On 8 October 2013 17:10, Alex Velenko  wrote:

> gcc/testsuite/
>
> 2013-10-08  Alex Velenko  
>
> * gcc.target/aarch64/vneg_f.c: New testcase.
> * gcc.target/aarch64/vneg_s.c: New testcase.
>
> gcc/
>
> 2013-10-08  Alex Velenko  
>
> * config/aarch64/arm_neon.h (vneg_f32): Asm replaced with C.
> (vneg_f64): New intrinsic.
> (vneg_s8): Asm replaced with C.
> (vneg_s16): Likewise.
> (vneg_s32): Likewise.
> (vneg_s64): New intrinsic.
> (vnegq_f32): Asm replaced with C.
> (vnegq_f64): Likewise.
> (vnegq_s8): Likewise.
> (vnegq_s16): Likewise.
> (vnegq_s32): Likewise.
> (vnegq_s64): Likewise.

OK, and I committed this for you.
/Marcus


Re: [patch] The remainder of tree-flow.h refactored.

2013-10-09 Thread Richard Biener
On Wed, Oct 9, 2013 at 1:31 AM, Andrew MacLeod  wrote:
> On 10/08/2013 07:44 AM, Andrew MacLeod wrote:
>>
>> On 10/08/2013 06:22 AM, Richard Biener wrote:
>>>
>>> graphite.h should be unnecessary with moving the pass struct like you
>>> did for other loop opts.  Likewise tree-parloops.h (well, ok, maybe
>>> you need parallelized_function_p, even though its implementation is
>>> gross ;)).  Likewise tree-predcom.h.
>>
>>
>> fair enough.  Yes, I've already seen a few things that made my skin crawl
>> and I had to resist going down a  rathole for :-)
>>>
>>>
>>> unvisit_body isn't generic enough to warrant moving out of gimplify.c
>>> (the only user).
>>>
>>> The force_gimple_operand_gsi... routines are in gimplify.c because they
>>> ...
>>> gimplify!  And you moved them but not force_gimple_operand[_1]!?
>>
>>
>> OK, let me make the above adjustments, and I'll recreate a patch without
>> the gimple/gimplfy parts, and re-address that separately. I forget the
>> details of my include issues there at the moment.
>
>
> Here's the adjusted patch which doesn't contain the ugly gimple, gimplify,
> and tree stuff.  I'll deal with that once everything else settles.
> I removed tree-predcom.h and graphite.h and also moved the parallel_loops
> pass into tree-parloops.c... but we still need predcom.h :-P.  oh well. I
> think most of it's pretty straightforward.
>
> Bootstraps on x86_64-unknown-linux-gnu, and running regressions. Assuming no
> issues, OK?

Ok.

Thanks,
Richard.

> Andrew
>
>


Re: Add a param to decide stack slot sharing at -O0

2013-10-09 Thread Richard Biener
On Tue, Oct 8, 2013 at 11:04 PM, Easwaran Raman  wrote:
> In cfgexpand.c, variables in non-overlapping lexical scopes are
> assigned same stack locations at -O1 and above. At -O0, this is
> attempted only if the size of the stack objects is above a threshold
> (32). The rationale is at -O0, more variables are going to be in the
> stack and the O(n^2) stack slot sharing algorithm will increase the
> compilation time. This patch replaces the constant with a param which
> is set to 32 by default. We ran into a case where the presence of
> always_inline attribute triggered Wframe-larger-than warnings at -O0
> but not at -O2 since the different inlined copies share the stack. We
> are ok with a slight increase in compilation time to get smaller stack
> frames even at -O0 and this patch would allow us do that easily.
>
> Bootstraps on x86_64/linux. Is this ok for trunk?

Ok with

+DEFPARAM (PARAM_MIN_SIZE_FOR_STACK_SHARING,
+ "min-size-for-stack-sharing",
+ "Attempt to share stack slots among variables in different
lexical blocks "
+ "at O0 only if their sizes exceed this value",
+ 32, 0, 0)

changed to

   "The minimum size of variables taking part in stack slot sharing "
   "when not optimizing"
   32, 0, 0)

And with adding documentation for that param in doc/invoke.texi.

Btw, I'm not sure the sharing algorithm is still quadratic - can you
investigate on that?

Thanks,
Richard.

> Thanks,
> Easwaran
>
>
> 2013-10-08  Easwaran Raman 
>
> * params.def (PARAM_MIN_SIZE_FOR_STACK_SHARING): New param...
> * cfgexpand.c (defer_stack_allocation): ...use here


[C++ Patch] PR 58633 (Take 2)

2013-10-09 Thread Paolo Carlini

Hi,

this is a completely different approach at fixing the bug, which overall 
I like better.


In this case most of the patch touches cp_parser_decltype_expr: instead 
of using cp_parser_postfix_expression only for member access 
expressions, we accept all its valid return values (still identifying 
the member access expressions, of course, because the semantics of 
decltype is special for those). Thus we can also immediately accept the 
pseudo destructor calls which are the matter of this bug. Otherwise, as 
a last resort, if the expression at issue isn't a postfix expression at 
all, we use cp_parser_expression.


Patch works better than my previous try from the diagnostic point of 
view (eg, for c++/47277 - type illegal expressions as arguments of the 
decltype) and seems less "hackish" to me. Also, it seems to me that it's 
a computational waste to redo the parsing of valid postfix-expressions 
!= member access expression.


Tested x86_64-linux.

Thanks!
Paolo.

/
/cp
2013-10-09  Paolo Carlini  

PR c++/58633
* parser.c (cp_parser_decltype_expr): Restructure parsing of
expressions != id-expressions.
(cp_parser_postfix_expression, cp_parser_unary_expression): Adjust.

/testsuite
2013-10-09  Paolo Carlini  

PR c++/58633
* g++.dg/cpp0x/decltype57.C: New.
Index: cp/parser.c
===
--- cp/parser.c (revision 203308)
+++ cp/parser.c (working copy)
@@ -1857,7 +1857,7 @@ static tree cp_parser_nested_name_specifier
 static tree cp_parser_qualifying_entity
   (cp_parser *, bool, bool, bool, bool, bool);
 static tree cp_parser_postfix_expression
-  (cp_parser *, bool, bool, bool, bool, cp_id_kind *);
+  (cp_parser *, bool, bool, bool, bool&, cp_id_kind *);
 static tree cp_parser_postfix_open_square_expression
   (cp_parser *, tree, bool, bool);
 static tree cp_parser_postfix_dot_deref_expression
@@ -5531,14 +5531,14 @@ cp_parser_qualifying_entity (cp_parser *parser,
`&' operator.  CAST_P is true if this expression is the target of a
cast.
 
-   If MEMBER_ACCESS_ONLY_P, we only allow postfix expressions that are
-   class member access expressions [expr.ref].
+   If IS_MEMBER_ACCESS is false, we are not returning a class member
+   access expression [expr.ref].
 
Returns a representation of the expression.  */
 
 static tree
 cp_parser_postfix_expression (cp_parser *parser, bool address_p, bool cast_p,
-  bool member_access_only_p, bool decltype_p,
+  bool decltype_p, bool& is_member_access,
  cp_id_kind * pidk_return)
 {
   cp_token *token;
@@ -5546,7 +5546,7 @@ cp_parser_postfix_expression (cp_parser *parser, b
   enum rid keyword;
   cp_id_kind idk = CP_ID_KIND_NONE;
   tree postfix_expression = NULL_TREE;
-  bool is_member_access = false;
+  is_member_access = false;
 
   /* Peek at the next token.  */
   token = cp_lexer_peek_token (parser->lexer);
@@ -6066,10 +6066,7 @@ cp_parser_postfix_expression (cp_parser *parser, b
default:
  if (pidk_return != NULL)
* pidk_return = idk;
-  if (member_access_only_p)
-return is_member_access? postfix_expression : error_mark_node;
-  else
-return postfix_expression;
+ return postfix_expression;
}
 }
 
@@ -7003,9 +7000,9 @@ cp_parser_unary_expression (cp_parser *parser, boo
   return expression;
 }
 
+  bool is_member_access;
   return cp_parser_postfix_expression (parser, address_p, cast_p,
-   /*member_access_only_p=*/false,
-  decltype_p,
+  decltype_p, is_member_access,
   pidk);
 }
 
@@ -11678,33 +11675,42 @@ cp_parser_decltype_expr (cp_parser *parser,
 id_expression_or_member_access_p = true;
 }
 
-  if (!id_expression_or_member_access_p)
+  if (id_expression_or_member_access_p)
 {
-  /* Abort the id-expression parse.  */
-  cp_parser_abort_tentative_parse (parser);
+  /* We have parsed an id-expression.  */
+  cp_parser_parse_definitely (parser);
+  return expr;
+}
 
-  /* Parsing tentatively, again.  */
-  cp_parser_parse_tentatively (parser);
+  /* Abort the id-expression parse.  */
+  cp_parser_abort_tentative_parse (parser);
 
-  /* Parse a class member access.  */
-  expr = cp_parser_postfix_expression (parser, /*address_p=*/false,
-   /*cast_p=*/false, /*decltype*/true,
-   /*member_access_only_p=*/true, 
NULL);
+  /* Parsing tentatively, again.  */
+  cp_parser_parse_tentatively (parser);
 
-  if (expr 
-  && expr != error_mark_node
-  && cp_lexer_peek_token (parser->lexer)->type == CPP_CLOSE_PAREN)
-/* We have an id-expression.  */
-  

Re: [patch] Fix PR middle-end/58570

2013-10-09 Thread Eric Botcazou
> In my opinion the MEM_EXPR is "wrong", as it is supposed to be
> the tree equivalent of the memory access.  At gimple level we
> handle accesses at bit-granularity so bit-accesses are fine.
> Not so at RTL level it seems.
> 
> [this also shows we probably should lower bit-granular accesses
> at the gimple level, as planned for some time (to read, bit-extract
> and read, bit-modify, write)]
> 
> Btw, ao_ref_from_mem will AFAIK not correctly handle bit-granular
> accesses.  For
> 
> struct { int pad : 1; int a : 1; int b : 1; } x;
> 
> x.a will have MEM_SIZE == 1 and ref->offset = 1, ref->size == 8
> x.b will have MEM_SIZE == 1 and ref->offset = 2, ref->size == 8
> 
> so we feed the alias oracle with bit-granular offsets but byte-granular
> sizes.  Now I cannot quickly create a testcase that makes offset
> based disambiguation disambiguate two accesses that overlap
> with the actual memory access, but I'm not 100% sure it's not
> possible.

I see, but I'm nevertheless waiting for a testcase.

> While the ultimate solution of making gimple match rtl (byte-granular
> accesses only) would be best, adjusting the MEM attrs to the
> RTL land sounds like a more appropriate fix.

This will disable the RTL machinery installed for PR rtl-optimization/44194, 
so we'll need to reopen it...  Given that it seems to be working correctly,
I don't see the point in doing so (and I don't intend to spend time on this 
right now).  So let's apply my patchlet first, close PR middle-end/58570 and 
if you want to discuss an alternate plan for PR rtl-optimization/44194, let's 
do that in a separate thread.

-- 
Eric Botcazou


Re: [gomp4] Adjust some gcc.dg/autopar/ tests

2013-10-09 Thread Thomas Schwinge
Hi!

On Tue, 8 Oct 2013 17:24:14 +0200, Jakub Jelinek  wrote:
> These tests were expecting 5 loopfn matches, 3 on the fn definition, one
> as GOMP_parallel_start argument and one called in between
> GOMP_parallel_start and GOMP_parallel_end.  But the new API is
> to call GOMP_parallel with the function and not call the outlined
> function nor GOMP_parallel_end directly, GOMP_parallel will call it
> indirectly.
> 
> 2013-10-08  Jakub Jelinek  
> 
>   * gcc.dg/autopar/outer-1.c: Expect 4 instead of 5 loopfn matches.
>   * [...]

IRC, OFTC, #gcc, 2013-10-09:

 jakub: On the gomp-4_0-branch as of yesterday, I'm seeing
  PASS -> FAIL for some libgomp.graphite tests; looks very similar to
  the loopfn changes you applied to gcc.dg/autopar tests
  (GOMP_parallel interface change).  Want me to have a look, or are
  you going to?
 By the way, I didn't mean to say that the FAIL appeared
  yesterday, but that this is a test run from yesterday's sources.
 tschwinge: ah, I don't have graphite enabled (except for
  release branch builds for the distro), so it is very well possible
  there are some issues
 sounds sed -i -e 's/" 5 "/" 4 "/'
  libgomp/testsuite/libgomp.graphite/force-parallel-*.c would cure it
 jakub: Yep, something like that.  Will give it a try
  (unless you're going to).
 tschwinge: I could easily cook up a patch, but it would be
  much harder to test it for me (no isl nor cloog readily built)
 jakub: Testing.

OK to commit -- on gomp-4_0-branch, or trunk after your merge patches
have gone in, or are you going to fold it into the respective merge patch
(and later merge it back into gomp-4_0-branch)?  Pick the one that suits
you best.  ;-)

2013-10-09  Thomas Schwinge  
Jakub Jelinek  

libgomp/
* testsuite/libgomp.graphite/force-parallel-1.c: Expect 4 instead
of 5 loopfn matches.
* testsuite/libgomp.graphite/force-parallel-2.c: Likewise.
* testsuite/libgomp.graphite/force-parallel-3.c: Likewise.
* testsuite/libgomp.graphite/force-parallel-4.c: Likewise.
* testsuite/libgomp.graphite/force-parallel-5.c: Likewise.
* testsuite/libgomp.graphite/force-parallel-6.c: Likewise.
* testsuite/libgomp.graphite/force-parallel-7.c: Likewise.
* testsuite/libgomp.graphite/force-parallel-8.c: Likewise.
* testsuite/libgomp.graphite/force-parallel-9.c: Likewise.

As changed in gomp-4_0-branch, r203282.

diff --git libgomp/testsuite/libgomp.graphite/force-parallel-1.c 
libgomp/testsuite/libgomp.graphite/force-parallel-1.c
index 8c40e34..008c9d7 100644
--- libgomp/testsuite/libgomp.graphite/force-parallel-1.c
+++ libgomp/testsuite/libgomp.graphite/force-parallel-1.c
@@ -26,6 +26,6 @@ int main(void)
 /* Check that parallel code generation part make the right answer.  */
 /* { dg-final { scan-tree-dump-times "1 loops carried no dependency" 1 
"graphite" } } */
 /* { dg-final { cleanup-tree-dump "graphite" } } */
-/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "loopfn" 4 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "parloops" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git libgomp/testsuite/libgomp.graphite/force-parallel-2.c 
libgomp/testsuite/libgomp.graphite/force-parallel-2.c
index 1ce0feb..f80401b 100644
--- libgomp/testsuite/libgomp.graphite/force-parallel-2.c
+++ libgomp/testsuite/libgomp.graphite/force-parallel-2.c
@@ -25,6 +25,6 @@ int main(void)
 /* Check that parallel code generation part make the right answer.  */
 /* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 
"graphite" } } */
 /* { dg-final { cleanup-tree-dump "graphite" } } */
-/* { dg-final { scan-tree-dump-times "loopfn" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "loopfn" 4 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "parloops" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git libgomp/testsuite/libgomp.graphite/force-parallel-3.c 
libgomp/testsuite/libgomp.graphite/force-parallel-3.c
index ff8680b..4a2d323 100644
--- libgomp/testsuite/libgomp.graphite/force-parallel-3.c
+++ libgomp/testsuite/libgomp.graphite/force-parallel-3.c
@@ -32,7 +32,7 @@ int main(void)
 /* Check that parallel code generation part make the right answer.  */
 /* { dg-final { scan-tree-dump-times "4 loops carried no dependency" 1 
"graphite" } } */
 /* { dg-final { cleanup-tree-dump "graphite" } } */
-/* { dg-final { scan-tree-dump-times "loopfn.0" 5 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "loopfn.1" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "loopfn.0" 4 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "loopfn.1" 4 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "parloops" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git libgomp/testsuite/libgomp.graphite/force-parallel-4.c 
libgomp/te

Re: [PATCH][AARCH64] Vdiv NEON intrinsic

2013-10-09 Thread Marcus Shawcroft
On 8 October 2013 17:25, Alex Velenko  wrote:

> gcc/testsuite/
>
> 2013-09-10  Alex Velenko  
>
> * gcc.target/aarch64/vdiv_f.c: New testcase.
>
> gcc/
>
> 2013-09-10  Alex Velenko  
>
> * config/aarch64/arm_neon.h (vdiv_f64): Added.

OK. I fixed the date format for the proposed ChangeLog entry and
committed for you.
/Marcus


Re: [PATCH][AArch64] NEON vadd_f64 and vsub_f64 intrinsics modified

2013-10-09 Thread Marcus Shawcroft
On 8 October 2013 17:35, Alex Velenko  wrote:

> 2013-10-08  Alex Velenko  
>
> * gcc.target/aarch64/vadd_f64.c: New testcase.
> * gcc.target/aarch64/vsub_f64.c: New testcase.
>
> gcc/
>
> 2013-10-08  Alex Velenko  
>
> * config/aarch64/arm_neon.h (vadd_f64): Implementation added.
> (vsub_f64): Likewise.

OK, committed.
/Marcus


Re: [patch] Fix PR middle-end/58570

2013-10-09 Thread Richard Biener
On Wed, Oct 9, 2013 at 1:36 PM, Eric Botcazou  wrote:
>> In my opinion the MEM_EXPR is "wrong", as it is supposed to be
>> the tree equivalent of the memory access.  At gimple level we
>> handle accesses at bit-granularity so bit-accesses are fine.
>> Not so at RTL level it seems.
>>
>> [this also shows we probably should lower bit-granular accesses
>> at the gimple level, as planned for some time (to read, bit-extract
>> and read, bit-modify, write)]
>>
>> Btw, ao_ref_from_mem will AFAIK not correctly handle bit-granular
>> accesses.  For
>>
>> struct { int pad : 1; int a : 1; int b : 1; } x;
>>
>> x.a will have MEM_SIZE == 1 and ref->offset = 1, ref->size == 8
>> x.b will have MEM_SIZE == 1 and ref->offset = 2, ref->size == 8
>>
>> so we feed the alias oracle with bit-granular offsets but byte-granular
>> sizes.  Now I cannot quickly create a testcase that makes offset
>> based disambiguation disambiguate two accesses that overlap
>> with the actual memory access, but I'm not 100% sure it's not
>> possible.
>
> I see, but I'm nevertheless waiting for a testcase.
>
>> While the ultimate solution of making gimple match rtl (byte-granular
>> accesses only) would be best, adjusting the MEM attrs to the
>> RTL land sounds like a more appropriate fix.
>
> This will disable the RTL machinery installed for PR rtl-optimization/44194,
> so we'll need to reopen it...  Given that it seems to be working correctly,
> I don't see the point in doing so (and I don't intend to spend time on this
> right now).  So let's apply my patchlet first, close PR middle-end/58570 and
> if you want to discuss an alternate plan for PR rtl-optimization/44194, let's
> do that in a separate thread.

Well, ok.  Please adjust the comment

+  /* Different fields of the same record type cannot overlap, unless they
+are both bitfields and we are at the RTL level.  */

to sth like

  ???  Bitfields can overlap at RTL level so punt if we end up at them.

or sth similar.  You don't check whether we are at the RTL level after all
(not that this would be appropriate).

Thanks,
Richard.

> --
> Eric Botcazou


Re: [PATCH][AArch64] NEON vclz intrinsic modified

2013-10-09 Thread Marcus Shawcroft
On 8 October 2013 17:45, Alex Velenko  wrote:
>
> 2013-10-08  Alex Velenko  
>
> * gcc.target/aarch64/vclz.c: New testcase.
>
> gcc/
>
> 2013-10-08  Alex Velenko  
>
> * config/aarch64/arm_neon.h (vclz_s8): Asm replaced with C
>   (vclz_s16): Likewise.
>   (vclz_s32): Likewise.
>   (vclzq_s8): Likewise.
>   (vclzq_s16): Likewise.
>   (vclzq_s32): Likewise.
>   (vclz_u8): Likewise.
>   (vclz_u16): Likewise.
>   (vclz_u32): Likewise.
>   (vclzq_u8): Likewise.
>   (vclzq_u16): Likewise.
>   (vclzq_u32): Likewise.
>
> * config/aarch64/aarch64.h (CLZ_DEFINED_VALUE_AT_ZERO): Macro
> fixed for clz.
>
> * config/aarch64/aarch64-simd-builtins.def (VAR1 (UNOP, clz, 0,
> v4si)): Replaced with iterator.

OK, committed.

/Marcus


Patch to split out new warning flag for floating point conversion

2013-10-09 Thread Joshua J Cogliati
== Administrivia ==

This is my first patch.  I have emailed in the signed copyright transfer
documents already.

== Description ==

This patch is a fix for Bug 53001

As required by the C and C++ standards, gcc automatically converts
floating point numbers to lower precision or integer values.  Silently
converting these values is a problem for numerical programs.  GCC
already has a flag -Wconversion which does warn about these conversions,
but -Wconversion also warns about integer conversion which means for
many programs the number of warnings will be large.

This patch adds a -Wfloat-conversion that only warns on float
conversions.  Here are three examples that are warned by this new flag:

int main(int argc, char ** argv) {
  int i = 3.14;
  return i;
}

int foo(double x)
{
  return x;
}

float foo2(double x) {
  return x;
}

The -Wfloat-conversion is enabled by both -Wconversion (since it is a
subset) and -Wextra (as suggested on the Bug 53001 discussion)

Because this changes -Wextra, when compiling with -Werror and -Wextra,
some code will not compile now.  The code in gcc that this occurred in
was changed to use explicit casts.  The patch would be shorter if
-Wextra did not enable -Wfloat-conversion, and if that is desired I can
change the patch.

Because this patch enables it for -Wextra, gcc/c-family/c-cppbuiltin.c,
gcc/mcf.c, gcc/predict.c, gcc/real.c and libcpp/symtab.c need to be
changed, otherwise they would be unmodified.

I am not certain that c.opt was modified correctly.

== Testcases ==

This patch passes the existing -Wconversion testcases.
gcc/testsuite/c-c++-common/Wconversion-real.c and other testcases
possibly could be modified.

== Changelog ==

2013-10-08  Joshua Cogliati 

Splitting out a -Wfloat-conversion from -Wconversion for
conversions that lower floating point number precision
or conversion from floating point numbers to integers
* gcc/c-family/c-common.c Switching unsafe_conversion_p to
return an enumeration with more detail, and conversion_warning
to use this information.
* gcc/c-family/c-common.h Adding conversion_safety enumeration
and switching return type of unsafe_conversion_p
* gcc/c-family/c-cppbuiltin.c Making explicit casts
* gcc/c-family/c.opt Adding new warning float-conversion and
enabling it with -Wextra and -Wconversion
* gcc/doc/invoke.texi Adding documentation about
-Wfloat-conversion
* gcc/mcf.c Making explicit casts
* gcc/predict.c Making explicit casts
* gcc/real.c Making explicit casts
* libcpp/symtab.c Avoiding float to int conversion

== Bootstrapping and testing ==

Tested bootstrap on x86_64-unknown-linux-gnu for
--enable-languages=c,c++,fortran,java,lto,objc with trunk on r203051 and
r203112


Thank you for consideration of this patch.

Joshua Cogliati
Index: gcc/c-family/c-common.c
===
--- gcc/c-family/c-common.c	(revision 203112)
+++ gcc/c-family/c-common.c	(working copy)
@@ -2517,10 +2517,10 @@ shorten_binary_op (tree result_type, tre
Function allows conversions between types of different signedness and
does not return true in that case.  Function can produce signedness
warnings if PRODUCE_WARNS is true.  */
-bool
+enum conversion_safety
 unsafe_conversion_p (tree type, tree expr, bool produce_warns)
 {
-  bool give_warning = false;
+  enum conversion_safety give_warning = SAFE_CONVERSION; /* is 0 or false */
   tree expr_type = TREE_TYPE (expr);
   location_t loc = EXPR_LOC_OR_HERE (expr);
 
@@ -2532,7 +2532,7 @@ unsafe_conversion_p (tree type, tree exp
 	  && TREE_CODE (type) == INTEGER_TYPE)
 	{
 	  if (!real_isinteger (TREE_REAL_CST_PTR (expr), TYPE_MODE (expr_type)))
-	give_warning = true;
+	give_warning = UNSAFE_REAL;
 	}
   /* Warn for an integer constant that does not fit into integer type.  */
   else if (TREE_CODE (expr_type) == INTEGER_TYPE
@@ -2553,7 +2553,7 @@ unsafe_conversion_p (tree type, tree exp
 			" constant value to negative integer");
 	}
 	  else
-	give_warning = true;
+	give_warning = UNSAFE_OTHER;
 	}
   else if (TREE_CODE (type) == REAL_TYPE)
 	{
@@ -2562,7 +2562,7 @@ unsafe_conversion_p (tree type, tree exp
 	{
 	  REAL_VALUE_TYPE a = real_value_from_int_cst (0, expr);
 	  if (!exact_real_truncate (TYPE_MODE (type), &a))
-		give_warning = true;
+		give_warning = UNSAFE_REAL;
 	}
 	  /* Warn for a real constant that does not fit into a smaller
 	 real type.  */
@@ -2571,7 +2571,7 @@ unsafe_conversion_p (tree type, tree exp
 	{
 	  REAL_VALUE_TYPE a = TREE_REAL_CST (expr);
 	  if (!exact_real_truncate (TYPE_MODE (type), &a))
-		give_warning = true;
+		give_warning = UNSAFE_REAL;
 	}
 	}
 }
@@ -2580,7 +2580,7 @@ unsafe_conversion_p (tree type, tree exp
   /* Warn for real types converted to integer types.  */
   if (TREE_CODE (e

Re: [patch] Fix PR middle-end/58570

2013-10-09 Thread Eric Botcazou
> Well, ok.  Please adjust the comment
> 
> +  /* Different fields of the same record type cannot overlap, unless
> they +are both bitfields and we are at the RTL level.  */
> 
> to sth like
> 
>   ???  Bitfields can overlap at RTL level so punt if we end up at
> them.
> 
> or sth similar.

Done, thanks.

> You don't check whether we are at the RTL level after all (not that this
> would be appropriate).

Yes, I was sure that you would have criticized that even more ;-)

-- 
Eric Botcazou


Re: Fix scheduler ix86_issue_rate and ix86_adjust_cost for modern x86 chips

2013-10-09 Thread Jan Hubicka
> Before merging the insn reservations, I need to compare the latency values 
> for bdver1 and bdver3. I know that they are different for some of the 
> instructions. 
> In that case, the merging should prop up another subset of latency 
> differences. I would like to keep these insn reservations in two .md files 
> (one for bdver1 and one for bdver3) even after the merger.

I am not really insisting on merging (define_insn_reservation "bdver3*") with
(define_insn_reservation "bdver1*").  What I have in mind is merging actual
automatons in cases where it makes sense.  Latencies are not really encoded in those.

Bdver 1/2 has:
(define_automaton "bdver1,bdver1_ieu,bdver1_load,bdver1_fp,bdver1_agu")
while bdver 3:
(define_automaton "bdver3,bdver3_ieu,bdver3_load,bdver3_fp,bdver3_agu")

Automatons bdver1 and bdver3 are very different, because one handles up to 3 
instructions,
while the other handles only 2.  I am still a bit confused by this every second 
cycle logic,
so let's discuss it incrementally.

I would propose to have
(define_automaton "bdver3")
or perhaps
(define_automaton "bdver3,bdver3_fp")

now if you look at use of bdver3_ieu we have:
jan@linux-9ure:~/trunk/gcc/config/i386> grep bdver1-ieu *.md
bdver1.md:(define_cpu_unit "bdver1-ieu0" "bdver1_ieu")
bdver1.md:(define_cpu_unit "bdver1-ieu1" "bdver1_ieu")
bdver1.md:(define_reservation "bdver1-ieu" "(bdver1-ieu0 | bdver1-ieu1)")
bdver1.md:(define_reservation "bdver1-ivector" "bdver1-ieu0+bdver1-ieu1+
bdver1.md:   "bdver1-direct1,bdver1-ieu1")
bdver1.md:   "bdver1-direct1,bdver1-ieu1")
bdver1.md: "bdver1-direct1,bdver1-load,bdver1-ieu1")
bdver1.md:   "bdver1-direct1,bdver1-load,bdver1-ieu1")
bdver1.md:   
"bdver1-vector,(bdver1-ieu0*6+(bdver1-fpsched,bdver1-fvector))")
bdver1.md:   
"bdver1-vector,((bdver1-load,bdver1-ieu0*6)+(bdver1-fpsched,bdver1-fvector))")
bdver1.md:   "bdver1-vector,bdver1-load,bdver1-ieu0*6")
bdver1.md:   "bdver1-direct,bdver1-ieu")
bdver1.md:   "bdver1-vector,bdver1-ieu,bdver1-ieu")
bdver1.md:   "bdver1-direct,bdver1-load,bdver1-ieu")
bdver1.md:   
"bdver1-vector,bdver1-load,bdver1-ieu,bdver1-ieu")
bdver1.md:bdver1-ieu,bdver1-store,
bdver1.md:bdver1-ieu,
bdver1.md:bdver1-ieu,
bdver1.md:   "bdver1-direct,(bdver1-ieu+bdver1-agu),
bdver1.md:   
"bdver1-vector,(bdver1-ieu+bdver1-agu),bdver1-ieu,
jan@linux-9ure:~/trunk/gcc/config/i386> grep bdver3-ieu *.md
bdver3.md:(define_cpu_unit "bdver3-ieu0" "bdver3_ieu")
bdver3.md:(define_cpu_unit "bdver3-ieu1" "bdver3_ieu")
bdver3.md:(define_reservation "bdver3-ieu" "(bdver3-ieu0|bdver3-ieu1)")
bdver3.md:(define_reservation "bdver3-ivector" "bdver3-ieu0+bdver3-ieu1+
bdver3.md:   "bdver3-double,(bdver3-agu | 
bdver3-ieu),nothing")
bdver3.md:   "bdver3-direct,bdver3-ieu,bdver3-store")
bdver3.md:   "bdver3-direct,bdver3-ieu")
bdver3.md:   "bdver3-direct,bdver3-ieu1")
bdver3.md:   "bdver3-direct,bdver3-ieu1")
bdver3.md:   "bdver3-direct,bdver3-load,bdver3-ieu1")
bdver3.md:   "bdver3-direct,bdver3-load,bdver3-ieu1")
bdver3.md:   "bdver3-direct,(bdver3-ieu|bdver3-agu)")
bdver3.md:   "bdver3-direct,bdver3-load,bdver3-ieu")
bdver3.md:   "bdver3-direct,bdver3-ieu,bdver3-store")
bdver3.md:bdver3-ieu,bdver3-store,
bdver3.md:   "bdver3-direct,(bdver3-ieu+bdver3-agu),

While they are not used always the same way, it seems that bdver3 instructions 
have similar characteristics
to bdver1 instructions, so unifying the automatons would save binary size.
This only means replacing
bdver3.md:(define_cpu_unit "bdver3-ieu0" "bdver1_ieu")
bdver3.md:(define_cpu_unit "bdver3-ieu1" "bdver1_ieu")
and removing bdver3_ieu from list of automatons.

The automaton minimization should take care of the rest and resulting schedules 
should not change.
I will give it a try.
> 
> > Your version has problem that it does not model the thing that the two 
> > decoders works sequentially.
> 
> The two stage modeling is required so that the decode unit reservations are 
> screened from other unit reservations.
> But this sort of goes away in bdver3 because of the decode cycle.
> In bdver3, the decode units scan two of these windows every "two" cycles 
> decoding a maximum of eight instructions.
> The hardware scan is done every two cycles in bdver3 whereas it is done every 
> single cycle in bdver1/bdver2. (But we have two separate hardware decoders 
> which guarantees higher throughput)
> This means that the two stage modeling is not required in the scheduler 
> descriptions s

[committed] Fix up bb-slp-31.c testcase

2013-10-09 Thread Jakub Jelinek
Hi!

I've noticed that this testcase doesn't clean up after itself.
Fixed thusly, committed as obvious to trunk.

2013-10-09  Jakub Jelinek  

* gcc.dg/vect/bb-slp-31.c: Add cleanup-tree-dump.

--- gcc/testsuite/gcc.dg/vect/bb-slp-31.c.jj2013-08-30 14:38:40.0 
+0200
+++ gcc/testsuite/gcc.dg/vect/bb-slp-31.c   2013-10-09 15:46:21.832962906 
+0200
@@ -10,3 +10,4 @@ void f(){
 }
 
 /* { dg-final { scan-tree-dump "basic block vectorized" "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */

Jakub


Re: [C++ Patch] PR 58633

2013-10-09 Thread Jason Merrill

On 10/08/2013 07:03 PM, Paolo Carlini wrote:

.. a curiosity: the cp_parser_commit_to_tentative_parse at the end of
cp_parser_pseudo_destructor_name, which didn't exist in 4.6.x and we can
consider the root of this issue, is also my fault:

 http://gcc.gnu.org/ml/gcc-patches/2011-05/msg02246.html

 From a different angle, I'm happy of the outcome of this detective
work, because it means that the parser_commit isn't there for
correctness: not performing it in some cases shouldn't be a big issue.


Yeah.  I don't understand why cp_parser_commit_to_tentative_parse 
commits all levels, rather than just the current one as the comment 
seems to suggest.  Mark, do you remember anything about that decision?


Jason




[PATCH] Fix libstdc++/58659.cc test case.

2013-10-09 Thread Marcus Shawcroft

The test case added here:

http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00474.html

introduced an unprototyped call to abort() resulting in failures due to 
unexpected warnings in aarch64-none-elf cross testing.


Committed to trunk as obvious.

Cheers
/Marcus

2013-10-09  Marcus Shawcroft  

* testsuite/20_util/shared_ptr/cons/58659.cc: Use 
__builtin_abort().diff --git a/libstdc++-v3/testsuite/20_util/shared_ptr/cons/58659.cc b/libstdc++-v3/testsuite/20_util/shared_ptr/cons/58659.cc
index 5e7c730..1b823bc 100644
--- a/libstdc++-v3/testsuite/20_util/shared_ptr/cons/58659.cc
+++ b/libstdc++-v3/testsuite/20_util/shared_ptr/cons/58659.cc
@@ -48,7 +48,7 @@ namespace std
   void deallocate(value_type* p, size_t n)
   {
 if (n != 1 || p != (void*)storage || !allocated)
-  abort();
+  __builtin_abort();
 allocated = false;
   }
 

[PATCH][ARM][committed] Move comment to correct place below #includes

2013-10-09 Thread Kyrill Tkachov

Hi all,

I've committed this trivial patch as r203322.
It moves the comment for the arm_early_load_addr_dep from above the #includes in 
the file to above the function definition.


Thanks,
Kyrill

2013-10-09  Kyrylo Tkachov  

* config/arm/aarch-common.c (arm_early_load_addr_dep):
Place comment above function.diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c
index 69366af..3111ae9 100644
--- a/gcc/config/arm/aarch-common.c
+++ b/gcc/config/arm/aarch-common.c
@@ -21,9 +21,6 @@
.  */
 
 
-/* Return nonzero if the CONSUMER instruction (a load) does need
-   PRODUCER's value to calculate the address.  */
-
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -34,6 +31,9 @@
 #include "c-family/c-common.h"
 #include "rtl.h"
 
+/* Return nonzero if the CONSUMER instruction (a load) does need
+   PRODUCER's value to calculate the address.  */
+
 int
 arm_early_load_addr_dep (rtx producer, rtx consumer)
 {

Re: [C++ PATCH] PR58635

2013-10-09 Thread Jason Merrill

OK.

Jason


Re: [C++ Patch] PR 58633

2013-10-09 Thread Paolo Carlini

Hi,

On 10/09/2013 03:56 PM, Jason Merrill wrote:

On 10/08/2013 07:03 PM, Paolo Carlini wrote:

.. a curiosity: the cp_parser_commit_to_tentative_parse at the end of
cp_parser_pseudo_destructor_name, which didn't exist in 4.6.x and we can
consider the root of this issue, is also my fault:

 http://gcc.gnu.org/ml/gcc-patches/2011-05/msg02246.html

 From a different angle, I'm happy of the outcome of this detective
work, because it means that the parser_commit isn't there for
correctness: not performing it in some cases shouldn't be a big issue.


Yeah.  I don't understand why cp_parser_commit_to_tentative_parse 
commits all levels, rather than just the current one as the comment 
seems to suggest.  Mark, do you remember anything about that decision?
Well, the below appears to work for the testcase at issue and even 
appears to pass the testsuite. Not sure how far we want to go at this 
time, what should we double check...


Paolo.

//
Index: cp/parser.c
===
--- cp/parser.c (revision 203320)
+++ cp/parser.c (working copy)
@@ -2347,6 +2347,8 @@ static void cp_parser_parse_tentatively
   (cp_parser *);
 static void cp_parser_commit_to_tentative_parse
   (cp_parser *);
+static void cp_parser_commit_to_topmost_tentative_parse
+  (cp_parser *);
 static void cp_parser_abort_tentative_parse
   (cp_parser *);
 static bool cp_parser_parse_definitely
@@ -6693,7 +6695,7 @@ cp_parser_pseudo_destructor_name (cp_parser* parse
 
   /* Once we see the ~, this has to be a pseudo-destructor.  */
   if (!processing_template_decl && !cp_parser_error_occurred (parser))
-cp_parser_commit_to_tentative_parse (parser);
+cp_parser_commit_to_topmost_tentative_parse (parser);
 
   /* Look for the type-name again.  We are not responsible for
  checking that it matches the first type-name.  */
@@ -24346,6 +24348,26 @@ cp_parser_commit_to_tentative_parse (cp_parser* pa
 }
 }
 
+/* Commit to the topmost currently active tentative parse.  */
+
+static void
+cp_parser_commit_to_topmost_tentative_parse (cp_parser* parser)
+{
+  cp_parser_context *context = parser->context;
+  cp_lexer *lexer = parser->lexer;
+
+  if (context)
+{
+  if (context->status == CP_PARSER_STATUS_KIND_COMMITTED)
+   return;
+  context->status = CP_PARSER_STATUS_KIND_COMMITTED;
+
+  while (!cp_lexer_saving_tokens (lexer))
+   lexer = lexer->next;
+  cp_lexer_commit_tokens (lexer);
+}
+}
+
 /* Abort the currently active tentative parse.  All consumed tokens
will be rolled back, and no diagnostics will be issued.  */
 
Index: testsuite/g++.dg/cpp0x/decltype57.C
===
--- testsuite/g++.dg/cpp0x/decltype57.C (revision 0)
+++ testsuite/g++.dg/cpp0x/decltype57.C (working copy)
@@ -0,0 +1,8 @@
+// PR c++/58633
+// { dg-do compile { target c++11 } }
+
+void foo(int i)
+{
+  typedef int I;
+  decltype(i.I::~I())* p;
+}


Re: [gomp4] C++ OpenMP user defined reductions (take 2)

2013-10-09 Thread Jason Merrill

On 10/07/2013 07:02 AM, Jakub Jelinek wrote:

duplicates anywhere, but during error diagnostics.  Without those two decl.c
hunks (either of them), pushdecl will sometimes return a different decl from
the original or error_mark_node, and the original fndecl passed to it has
ggc_free called on it, thus any further use of it ICEs or may ICE.


Right.


Perhaps if pushdecl returns error_mark_node,
then I should expect that the error has been reported already and if
it returns some other FUNCTION_DECL, then I should report it myself,


Makes sense.


but a problem with that is that there are multiple locations that call
pushdecl (two in parser.c, one in pt.c) and more importantly, that for
the diagnostics the new fndecl is ggc_freed and thus I can't use it
for the diagnostics anymore.


True, though probably input_location is enough.


Trying to set DECL_CONTEXT to non-NULL for block_scope UDRs leads to
immediate ICEs as I said earlier, again on udr-3.C testcase:

pushdecl_maybe_friend_1 clearly doesn't expect anything to be in function scope,
and from what I remember from writing the UDR patch, it certainly wasn't the
only spot.


Right.  Looking at the code again I see that block-scope function 
declarations have namespace DECL_CONTEXT and then have 
DECL_LOCAL_FUNCTION_P set on them.  That probably makes sense for UDR 
functions as well.



Normal C++ lookup behavior is to check for ambiguity, so I think
that's the best bet for what the eventual defined semantics will be.


No response from omp-lang yet, so I'm not changing this yet.


Please do change it.  The current behavior is just wrong, and we should 
set a good example for others to follow.  It's ok to fix this in a 
follow-up patch.



Unfortunately it didn't work, again on the udr-3.C testcase.
mark_used was already called during instantiation of the decl, DECL_ODR_USED
got set on it, but it was actually deferred, then when mark_used is called
again on it, it is ignored.  I'd need to clear DECL_ODR_USED explicitly
and call mark_used, perhaps that would work.


If deferring it is a problem you can add UDRs to the group of things 
which are always instantiated immediately in mark_used:



  /* Normally, we can wait until instantiation-time to synthesize DECL.
 However, if DECL is a static data member initialized with a constant
 or a constexpr function, we need it right now because a reference to
 such a data member or a call to such function is not value-dependent.
 For a function that uses auto in the return type, we need to instantiate
 it to find out its type.  */
  if ((decl_maybe_constant_var_p (decl)
   || (TREE_CODE (decl) == FUNCTION_DECL
   && DECL_DECLARED_CONSTEXPR_P (decl))
   || undeduced_auto_decl (decl))
  && DECL_LANG_SPECIFIC (decl)
  && DECL_TEMPLATE_INFO (decl)
  && !uses_template_parms (DECL_TI_ARGS (decl)))



As for not using INIT_EXPR and just use DECL_EXPR gimplification, the problem
is that we do not actually gimplify it with the OMP_PRIV decl which has that
DECL_INITIAL from the parsing, but a different one - the original var referenced
by the clause, and that can have completely different DECL_INITIAL, or none at
all.


Ah, OK.


+   error_at (loc, "predeclared arithmetic type in %qT"
+   error_at (loc, "reference type in %qT"


"%qT in"

Jason



Re: [PATCH] Fix libstdc++/58659.cc test case.

2013-10-09 Thread Jonathan Wakely
On 9 October 2013 15:26, Marcus Shawcroft wrote:
> The test case added here:
>
> http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00474.html
>
> Introduced an unprototyped call to abort() resulting in failures due to
> unexpected warnings in aarch64-none-elf cross testing.
>
> Committed to trunk as obvious.

Thanks, I forgot to change that part when adding it to the testsuite.
I think we should really use a VERIFY there.

Jakub, can I fix the testcase on the 4.8 branch that was committed for
http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00478.html?


> Cheers
> /Marcus
>
> 2013-10-09  Marcus Shawcroft  
>
> * testsuite/20_util/shared_ptr/cons/58659.cc: Use __builtin_abort().


Re: Ping Re: [gomp4] Dumping gimple for offload.

2013-10-09 Thread Ilya Tocar
Ping.

On 03 Oct 20:05, Ilya Tocar wrote:
> On 26 Sep 21:21, Ilya Tocar wrote:
> > On 25 Sep 15:48, Richard Biener wrote:
> > > On Wed, Sep 25, 2013 at 3:29 PM, Ilya Tocar  
> > > wrote:
> > > > On 24 Sep 11:02, Richard Biener wrote:
> > > >> On Mon, Sep 23, 2013 at 3:29 PM, Ilya Tocar  
> > > >> wrote:
> > > >>  thus consider assigning the section
> > > >> name in a different place.
> > > >>
> > > >> Richard.
> > > >
> > > > What do you mean  by different place?
> > > > I can add global dumping_omp_target variable to choose correct name,
> > > > depending on it's value (patch below). Is it better?
> > > 
> > > More like passing down a different abstraction, like for
> > > 
> > > > @@ -907,9 +907,15 @@ output_symtab (void)
> > > >  {
> > > >symtab_node node = lto_symtab_encoder_deref (encoder, i);
> > > >if (cgraph_node *cnode = dyn_cast  (node))
> > > > -lto_output_node (ob, cnode, encoder);
> > > > +   {
> > > > + if (!dumping_omp_target || lookup_attribute ("omp declare 
> > > > target",
> > > > + DECL_ATTRIBUTES 
> > > > (node->symbol.decl)))
> > > > +   lto_output_node (ob, cnode, encoder);
> > > > +   }
> > > >else
> > > > -lto_output_varpool_node (ob, varpool (node), encoder);
> > > > + if (!dumping_omp_target || lookup_attribute ("omp declare 
> > > > target",
> > > > + DECL_ATTRIBUTES 
> > > > (node->symbol.decl)))
> > > > +   lto_output_varpool_node (ob, varpool (node), encoder);
> > > >
> > > >  }
> > > 
> > > have the symtab encoder already not contain the varpool nodes you
> > > don't need.
> > > 
> > > And instead of looking up attributes, mark the symtab node with a flag.
> > 
> > Good idea!
> > I've tried creating 2 encoders, and adding only nodes with
> > "omp declare target" attribute in omp case. There is still some is_omp
> > passing to control  lto_set_symtab_encoder_in_partition behavior, 
> > because I think it's better than a global var.
> > What do you think?
> >
> Updated version of the patch. I've checked that it doesn't break lto on
> SPEC 2006. Streaming for omp is enabled by -fopenmp flag. Works with and
> without enabled lto. Ok for gomp4 branch?
> 
> 
> ---
>  gcc/cgraphunit.c  | 15 +--
>  gcc/ipa-inline-analysis.c |  2 +-
>  gcc/lto-cgraph.c  | 15 ++-
>  gcc/lto-streamer.c|  5 +++--
>  gcc/lto-streamer.h| 10 --
>  gcc/lto/lto-partition.c   |  4 ++--
>  gcc/passes.c  | 12 ++--
>  gcc/tree-pass.h   |  2 +-
>  8 files changed, 44 insertions(+), 21 deletions(-)
> 
> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
> index 1644ca9..d595475 100644
> --- a/gcc/cgraphunit.c
> +++ b/gcc/cgraphunit.c
> @@ -2016,7 +2016,18 @@ ipa_passes (void)
> passes->all_lto_gen_passes);
>  
>if (!in_lto_p)
> -ipa_write_summaries ();
> +{
> +  if (flag_openmp)
> + {
> +   section_name_prefix = OMP_SECTION_NAME_PREFIX;
> +   ipa_write_summaries (true);
> + }
> +  if (flag_lto)
> + {
> +   section_name_prefix = LTO_SECTION_NAME_PREFIX;
> +   ipa_write_summaries (false);
> + }
> +}
>  
>if (flag_generate_lto)
>  targetm.asm_out.lto_end ();
> @@ -2107,7 +2118,7 @@ compile (void)
>cgraph_state = CGRAPH_STATE_IPA;
>  
>/* If LTO is enabled, initialize the streamer hooks needed by GIMPLE.  */
> -  if (flag_lto)
> +  if (flag_lto || flag_openmp)
>  lto_streamer_hooks_init ();
>  
>/* Don't run the IPA passes if there was any error or sorry messages.  */
> diff --git a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c
> index ba6221e..4420213 100644
> --- a/gcc/ipa-inline-analysis.c
> +++ b/gcc/ipa-inline-analysis.c
> @@ -3721,7 +3721,7 @@ inline_generate_summary (void)
>  
>/* When not optimizing, do not bother to analyze.  Inlining is still done
>   because edge redirection needs to happen there.  */
> -  if (!optimize && !flag_lto && !flag_wpa)
> +  if (!optimize && !flag_lto && !flag_wpa && !flag_openmp)
>  return;
>  
>function_insertion_hook_holder =
> diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> index 952588d..4a7d179 100644
> --- a/gcc/lto-cgraph.c
> +++ b/gcc/lto-cgraph.c
> @@ -236,8 +236,13 @@ lto_symtab_encoder_in_partition_p (lto_symtab_encoder_t 
> encoder,
>  
>  void
>  lto_set_symtab_encoder_in_partition (lto_symtab_encoder_t encoder,
> -  symtab_node node)
> +  symtab_node node, bool is_omp)
>  {
> +  /* Ignore non omp target nodes for omp case.  */
> +  if (is_omp && !lookup_attribute ("omp declare target",
> +DECL_ATTRIBUTES (node->symbol.decl)))
> +return;
> +
>int index = lto_symtab_encoder_encode (encoder, (symtab_node)node);
>encoder->nodes[index].in_partition = true;
>  }
> @@ -760,7 +765,7 @@

Re: [PATCH] Fix libstdc++/58659.cc test case.

2013-10-09 Thread Jakub Jelinek
On Wed, Oct 09, 2013 at 04:12:21PM +0100, Jonathan Wakely wrote:
> On 9 October 2013 15:26, Marcus Shawcroft wrote:
> > The test case added here:
> >
> > http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00474.html
> >
> > Introduced an unprototyped call to abort() resulting in failures due to
> > unexpected warnings in aarch64-none-elf cross testing.
> >
> > Committed to trunk as obvious.
> 
> Thanks, I forgot to change that part when adding it to the testsuite.
> I think we should really use a VERIFY there.
> 
> Jakub, can I fix the testcase on the 4.8 branch that was committed for
> http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00478.html?

Ok.

Jakub


[patch] fix broken loop-doloop.c

2013-10-09 Thread Andrew MacLeod

Most of loop-doloop.c is wrapped by
#ifdef HAVE_doloop_end
<...>
#endif

so on my bootstraps/builds, it doesn't exercise the code.  It was 
pointed out to me the restructuring broke this file on some targets.
This should fix it by calling the refactored get_max_loop_iterations() 
function.  cross compiled for rs6000 to verify compilation.


committed as revision 203324

Andrew
	* loop-doloop.c (doloop_modify, doloop_optimize): Use 
	get_max_loop_iterations.

Index: loop-doloop.c
===
*** loop-doloop.c	(revision 203285)
--- loop-doloop.c	(working copy)
*** doloop_modify (struct loop *loop, struct
*** 460,466 
  
/* Determine if the iteration counter will be non-negative.
  	 Note that the maximum value loaded is iterations_max - 1.  */
!   if (max_loop_iterations (loop, &iterations)
  	  && (iterations.ule (double_int_one.llshift
  			   (GET_MODE_PRECISION (mode) - 1,
  GET_MODE_PRECISION (mode)
--- 460,466 
  
/* Determine if the iteration counter will be non-negative.
  	 Note that the maximum value loaded is iterations_max - 1.  */
!   if (get_max_loop_iterations (loop, &iterations)
  	  && (iterations.ule (double_int_one.llshift
  			   (GET_MODE_PRECISION (mode) - 1,
  GET_MODE_PRECISION (mode)
*** doloop_modify (struct loop *loop, struct
*** 552,558 
  double_int iter;
  rtx iter_rtx;
  
! if (!max_loop_iterations (loop, &iter)
  	|| !iter.fits_shwi ())
iter_rtx = const0_rtx;
  else
--- 552,558 
  double_int iter;
  rtx iter_rtx;
  
! if (!get_max_loop_iterations (loop, &iter)
  	|| !iter.fits_shwi ())
iter_rtx = const0_rtx;
  else
*** doloop_optimize (struct loop *loop)
*** 669,675 
  
count = copy_rtx (desc->niter_expr);
iterations = desc->const_iter ? desc->niter_expr : const0_rtx;
!   if (!max_loop_iterations (loop, &iter)
|| !iter.fits_shwi ())
  iterations_max = const0_rtx;
else
--- 669,675 
  
count = copy_rtx (desc->niter_expr);
iterations = desc->const_iter ? desc->niter_expr : const0_rtx;
!   if (!get_max_loop_iterations (loop, &iter)
|| !iter.fits_shwi ())
  iterations_max = const0_rtx;
else


Re: [patch] The remainder of tree-flow.h refactored.

2013-10-09 Thread Andrew MacLeod

On 10/08/2013 06:22 AM, Richard Biener wrote:


unvisit_body isn't generic enough to warrant moving out of gimplify.c
(the only user).


Bah, now I remember.. so there *are* other users.. this routine is 
called from various front ends.. fortran, c-family and cp all call it. 
That is why I wanted to move it to tree.[ch].  it doesn't belong in a 
gimple file..  it operates on trees and is also used by tree front ends.


which then brings with it all the related routines...  unshare_body is 
then exported from tree.c in order for gimplify.c to utilize it.


This patch handles just this part.

Bootstraps on x86_64-unknown-linux-gnu; regression tests are still 
running.  OK?


Andrew


	* tree.h: Adjust prototypes.
	* gimplify.c (mostly_copy_tree_r, copy_if_shared_r, copy_if_shared,
	unshare_body, unshare_expr, prune_expr_location,
	unshare_expr_without_location): Move to tree.c.
	* tree.c (mostly_copy_tree_r, copy_if_shared_r, copy_if_shared,
	unshare_body, unshare_expr, prune_expr_location,
	unshare_expr_without_location): Relocate from gimplify.c.

Index: tree.h
===
*** tree.h	(revision 203320)
--- tree.h	(working copy)
*** extern void cache_integer_cst (tree);
*** 4203,4212 
  /* In cgraph.c */
  extern void change_decl_assembler_name (tree, tree);
  
- /* In gimplify.c */
- extern tree unshare_expr (tree);
- extern tree unshare_expr_without_location (tree);
- 
  /* In stmt.c */
  
  extern void expand_label (tree);
--- 4203,4208 
*** extern bool type_in_anonymous_namespace_
*** 4524,4529 
--- 4520,4528 
  extern bool block_may_fallthru (const_tree);
  extern void using_eh_for_cleanups (void);
  extern bool using_eh_for_cleanups_p (void);
+ extern void unshare_body (tree);
+ extern tree unshare_expr (tree);
+ extern tree unshare_expr_without_location (tree);
  
  /* In tree-nested.c */
  extern tree build_addr (tree, tree);
Index: gimplify.c
===
*** gimplify.c	(revision 203319)
--- gimplify.c	(working copy)
*** annotate_all_with_location (gimple_seq s
*** 813,970 
  }
  }
  
- /* This page contains routines to unshare tree nodes, i.e. to duplicate tree
-nodes that are referenced more than once in GENERIC functions.  This is
-necessary because gimplification (translation into GIMPLE) is performed
-by modifying tree nodes in-place, so gimplication of a shared node in a
-first context could generate an invalid GIMPLE form in a second context.
- 
-This is achieved with a simple mark/copy/unmark algorithm that walks the
-GENERIC representation top-down, marks nodes with TREE_VISITED the first
-time it encounters them, duplicates them if they already have TREE_VISITED
-set, and finally removes the TREE_VISITED marks it has set.
- 
-The algorithm works only at the function level, i.e. it generates a GENERIC
-representation of a function with no nodes shared within the function when
-passed a GENERIC function (except for nodes that are allowed to be shared).
- 
-At the global level, it is also necessary to unshare tree nodes that are
-referenced in more than one function, for the same aforementioned reason.
-This requires some cooperation from the front-end.  There are 2 strategies:
- 
-  1. Manual unsharing.  The front-end needs to call unshare_expr on every
- expression that might end up being shared across functions.
- 
-  2. Deep unsharing.  This is an extension of regular unsharing.  Instead
- of calling unshare_expr on expressions that might be shared across
- functions, the front-end pre-marks them with TREE_VISITED.  This will
- ensure that they are unshared on the first reference within functions
- when the regular unsharing algorithm runs.  The counterpart is that
- this algorithm must look deeper than for manual unsharing, which is
- specified by LANG_HOOKS_DEEP_UNSHARING.
- 
-   If there are only few specific cases of node sharing across functions, it is
-   probably easier for a front-end to unshare the expressions manually.  On the
-   contrary, if the expressions generated at the global level are as widespread
-   as expressions generated within functions, deep unsharing is very likely the
-   way to go.  */
- 
- /* Similar to copy_tree_r but do not copy SAVE_EXPR or TARGET_EXPR nodes.
-These nodes model computations that must be done once.  If we were to
-unshare something like SAVE_EXPR(i++), the gimplification process would
-create wrong code.  However, if DATA is non-null, it must hold a pointer
-set that is used to unshare the subtrees of these nodes.  */
- 
- static tree
- mostly_copy_tree_r (tree *tp, int *walk_subtrees, void *data)
- {
-   tree t = *tp;
-   enum tree_code code = TREE_CODE (t);
- 
-   /* Do not copy SAVE_EXPR, TARGET_EXPR or BIND_EXPR nodes themselves, b

Re: [patch] The remainder of tree-flow.h refactored.

2013-10-09 Thread Diego Novillo
On Wed, Oct 9, 2013 at 11:37 AM, Andrew MacLeod  wrote:

> bootstraps on x86_64-unknown-linux-gnu, regressions test are still running.
> OK?

Sure.


Re: [PATCH]: Fix PR58542, Arguments of __atomic_* functions are converted in unsigned mode

2013-10-09 Thread Richard Henderson
On 10/08/2013 11:37 AM, Uros Bizjak wrote:
> 
> As shown in the attached testcase, arguments of various __atomic
> builtins should be converted as signed, so the immediates get properly
> extended.
> 
> 2013-10-08  Uros Bizjak  
> 
> * optabs.c (maybe_emit_atomic_exchange): Convert operands as signed.
> (maybe_emit_sync_lock_test_and_set): Ditto.
> (expand_atomic_compare_and_swap): Ditto.
> (maybe_emit_op): Ditto.
> 
> testsuite/ChangeLog:
> 
> 2013-10-08  Uros Bizjak  
> 
> * g++.dg/ext/atomic-2.C: New test.
> 
> Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}.
> 
> OK for mainline and release branches?

This doesn't seem right at all.

The bug is that I gets set to UINT64_MAX, right?  Where's the
incorrect conversion from int to __int128_t?  Surely you can
produce a reduced test case that doesn't involve all of <atomic>
to show that.


r~


Re: [gomp4] C++ OpenMP user defined reductions (take 2)

2013-10-09 Thread Jakub Jelinek
On Wed, Oct 09, 2013 at 10:47:22AM -0400, Jason Merrill wrote:
> On 10/07/2013 07:02 AM, Jakub Jelinek wrote:
> >duplicates anywhere, but during error diagnostics.  Without those two decl.c
> >hunks (either of them), pushdecl will sometimes return a different decl from
> >the original or error_mark_node, and the original fndecl passed to it has
> >ggc_free called on it, thus any further use of it ICEs or may ICE.
> 
> Right.
> 
> >Perhaps if pushdecl returns error_mark_node,
> >then I'd should expect that the error has been reported already and if
> >it returns some other FUNCTION_DECL, then I should report it myself,
> 
> Makes sense.
> 
> >but a problem with that is that there are multiple locations that call
> >pushdecl (two in parser.c, one in pt.c) and more importantly, that for
> >the diagnostics the new fndecl is ggc_freed and thus I can't use it
> >for the diagnostics anymore.
> 
> True, though probably input_location is enough.

It turned out to be much easier to just diagnose and return error_mark_node
for UDRs always in duplicate_decls by just moving the if a few lines
earlier, then no changes in decls_match are needed.

> >>Normal C++ lookup behavior is to check for ambiguity, so I think
> >>that's the best bet for what the eventual defined semantics will be.
> >
> >No response from omp-lang yet, so I'm not changing this yet.
> 
> Please do change it.  The current behavior is just wrong, and we
> should set a good example for others to follow.  It's ok to fix this
> in a follow-up patch.

Ok, see the attached patch.
> 
> >Unfortunately it didn't work, again on the udr-3.C testcase.
> >mark_used was already called during instantiation of the decl, DECL_ODR_USED
> >got set on it, but it was actually deferred, then when mark_used is called
> >again on it, it is ignored.  I'd need to clear DECL_ODR_USED explicitly
> >and call mark_used, perhaps that would work.
> 
> If deferring it is a problem you can add UDRs to the group of things
> which are always instantiated immediately in mark_used:

Ok, forced it there.  Just to be sure moved the DECL_LANG_SPECIFIC check
first, because DECL_OMP_DECLARE_REDUCTION_P is in DECL_LANG_SPECIFIC.
Perhaps DECL_TEMPLATE_INFO check could also be less expensive to be done
before calling decl_maybe_constant_var_p or undeduced_auto_decl ?

> >+error_at (loc, "predeclared arithmetic type in %qT"
> >+error_at (loc, "reference type in %qT"
> 
> "%qT in"

Fixed.

Ok?

2013-10-09  Jakub Jelinek  

* decl.c (decls_match): Revert DECL_OMP_DECLARE_REDUCTION_P
special cases.
(duplicate_decls): Move DECL_OMP_DECLARE_REDUCTION_P case
earlier.
* parser.c (cp_parser_omp_declare_reduction): Fix spelling
of some error messages.  Set DECL_CONTEXT (fndecl) to
global_namespace first and set DECL_LOCAL_FUNCTION_P (fndecl).
* decl2.c (mark_used): Force immediate instantiation of
DECL_OMP_DECLARE_REDUCTION_P decls.
* semantics.c (omp_reduction_lookup): Add baselinkp and
ambiguousp arguments, diagnose ambiguities, perform access
check only if non-ambiguous.
(finish_omp_reduction_clause): Adjust omp_reduction_lookup
caller, if it returned error_mark_node, just return true,
use mark_used instead of instantiate_decl.
gcc/testsuite/
* g++.dg/gomp/udr-6.C: New test.
libgomp/
* testsuite/libgomp.c++/udr-6.C: Remove UDR + on type F.

--- gcc/cp/decl.c.jj2013-10-07 14:06:58.0 +0200
+++ gcc/cp/decl.c   2013-10-09 17:41:58.264948392 +0200
@@ -978,15 +978,12 @@ decls_match (tree newdecl, tree olddecl)
   tree t2 = (DECL_USE_TEMPLATE (olddecl)
 ? DECL_TI_TEMPLATE (olddecl)
 : NULL_TREE);
-  if (t1 != t2 && !DECL_OMP_DECLARE_REDUCTION_P (newdecl))
+  if (t1 != t2)
return 0;
 
   if (CP_DECL_CONTEXT (newdecl) != CP_DECL_CONTEXT (olddecl)
  && ! (DECL_EXTERN_C_P (newdecl)
-   && DECL_EXTERN_C_P (olddecl))
- && ! (DECL_OMP_DECLARE_REDUCTION_P (newdecl)
-   && DECL_CONTEXT (newdecl) == NULL_TREE
-   && DECL_CONTEXT (olddecl) == current_function_decl))
+   && DECL_EXTERN_C_P (olddecl)))
return 0;
 
   /* A new declaration doesn't match a built-in one unless it
@@ -1344,6 +1341,15 @@ duplicate_decls (tree newdecl, tree oldd
}
  return NULL_TREE;
}
+  else if (DECL_OMP_DECLARE_REDUCTION_P (olddecl))
+   {
+ gcc_assert (DECL_OMP_DECLARE_REDUCTION_P (newdecl));
+ error_at (DECL_SOURCE_LOCATION (newdecl),
+   "redeclaration of %<pragma omp declare reduction%>");
+ error_at (DECL_SOURCE_LOCATION (olddecl),
+   "previous %<pragma omp declare reduction%> declaration");
+ return error_mark_node;
+   }
   else if (!types_match)
{
  /* Avoid warnings redeclaring built-ins which have not been
@@ -1422,15 +1428,6 @@ duplicate_decls (tree newdecl, tree oldd
  type = cp_build_t

Re: [patch] The remainder of tree-flow.h refactored.

2013-10-09 Thread Richard Biener
Andrew MacLeod  wrote:
>On 10/08/2013 06:22 AM, Richard Biener wrote:
>>
>> unvisit_body isn't generic enough to warrant moving out of gimplify.c
>> (the only user).
>
>Bah, now I remember.. so there *are* other users.. this routine is 
>called from various front ends.. fortran, c-family and cp all call it. 
>That is why I wanted to move it to tree.[ch].  it doesn't belong in a 
>gimple file..  it operates on trees and is also used by tree front
>ends.

But afaik as part of gimplifying. The gimplifier is part of the 
frontend-middleend interface. Lumping all of it into tree.c isn't sustainable.

Richard.

>which then brings with it all the related routines...  unshare_body is 
>then exported from tree.c in order for gimplify.c to utilize it.
>
>This patch handles just this part.
>
>bootstraps on x86_64-unknown-linux-gnu, regressions test are still 
>running.  OK?
>
>Andrew




Re: Add a param to decide stack slot sharing at -O0

2013-10-09 Thread Easwaran Raman
On Wed, Oct 9, 2013 at 4:11 AM, Richard Biener
 wrote:
> On Tue, Oct 8, 2013 at 11:04 PM, Easwaran Raman  wrote:
>> In cfgexpand.c, variables in non-overlapping lexical scopes are
>> assigned same stack locations at -O1 and above. At -O0, this is
>> attempted only if the size of the stack objects is above a threshold
>> (32). The rationale is at -O0, more variables are going to be in the
>> stack and the O(n^2) stack slot sharing algorithm will increase the
>> compilation time. This patch replaces the constant with a param which
>> is set to 32 by default. We ran into a case where the presence of
>> always_inline attribute triggered Wframe-larger-than warnings at -O0
>> but not at -O2 since the different inlined copies share the stack. We
>> are ok with a slight increase in compilation time to get smaller stack
>> frames even at -O0 and this patch would allow us do that easily.
>>
>> Bootstraps on x86_64/linux. Is this ok for trunk?
>
> Ok with
>
> +DEFPARAM (PARAM_MIN_SIZE_FOR_STACK_SHARING,
> + "min-size-for-stack-sharing",
> + "Attempt to share stack slots among variables in different
> lexical blocks "
> + "at O0 only if their sizes exceed this value",
> + 32, 0, 0)
>
> changed to
>
>"The minimum size of variables taking part in stack slot sharing "
>"when not optimizing",
>32, 0, 0)
>
> And with adding documentation for that param in doc/invoke.texi.
>
> Btw, I'm not sure the sharing algorithm is still quadratic - can you
> investigate on that?

partition_stack_vars is still quadratic in the worst case (all live
ranges overlap), but the expensive part is likely to be the
add_scope_conflicts function.

- Easwaran

> Thanks,
> Richard.
>
>> Thanks,
>> Easwaran
>>
>>
>> 2013-10-08  Easwaran Raman 
>>
>> * params.def (PARAM_MIN_SIZE_FOR_STACK_SHARING): New param...
>> * cfgexpand.c (defer_stack_allocation): ...use here


[patch] shuffle a couple of functions.

2013-10-09 Thread Andrew MacLeod
This patch simply moves std_gimplify_va_arg_expr() and the related 
build_va_arg_indirect_ref to gimplify.c where I think it belongs.
It also moves gimple_fold_indirect_ref out of gimplify.c and into 
gimple-fold.c.


bootstraps on x86_64-unknown-linux-gnu... regressions running.  OK?

Andrew


	* builtins.c (std_gimplify_va_arg_expr, build_va_arg_indirect_ref):
	Move to gimplify.c.
	* gimplify.c (gimple_fold_indirect_ref): Move to gimple-fold.c.
	(build_va_arg_indirect_ref): Relocate and make static.
	(std_gimplify_va_arg_expr): Relocate here.
	* gimple-fold.c (gimple_fold_indirect_ref): Relocate here.
	* gimple-fold.h (gimple_fold_indirect_ref): Add prototype.
	* tree-flow.h (gimple_fold_indirect_ref): Remove prototype.


*** R/builtins.c	2013-10-09 13:46:21.687171187 -0400
--- builtins.c	2013-10-09 14:03:42.717421331 -0400
*** along with GCC; see the file COPYING3.  
*** 43,49 
  #include "target.h"
  #include "langhooks.h"
  #include "basic-block.h"
- #include "tree-mudflap.h"
  #include "tree-ssa.h"
  #include "value-prof.h"
  #include "diagnostic-core.h"
--- 43,48 
*** along with GCC; see the file COPYING3.  
*** 51,59 
  #include "ubsan.h"
  
  
- #ifndef PAD_VARARGS_DOWN
- #define PAD_VARARGS_DOWN BYTES_BIG_ENDIAN
- #endif
  static tree do_mpc_arg1 (tree, tree, int (*)(mpc_ptr, mpc_srcptr, mpc_rnd_t));
  
  struct target_builtins default_target_builtins;
--- 50,55 
*** expand_builtin_va_start (tree exp)
*** 4237,4351 
return const0_rtx;
  }
  
- /* The "standard" implementation of va_arg: read the value from the
-current (padded) address and increment by the (padded) size.  */
- 
- tree
- std_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
- 			  gimple_seq *post_p)
- {
-   tree addr, t, type_size, rounded_size, valist_tmp;
-   unsigned HOST_WIDE_INT align, boundary;
-   bool indirect;
- 
- #ifdef ARGS_GROW_DOWNWARD
-   /* All of the alignment and movement below is for args-grow-up machines.
-  As of 2004, there are only 3 ARGS_GROW_DOWNWARD targets, and they all
-  implement their own specialized gimplify_va_arg_expr routines.  */
-   gcc_unreachable ();
- #endif
- 
-   indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false);
-   if (indirect)
- type = build_pointer_type (type);
- 
-   align = PARM_BOUNDARY / BITS_PER_UNIT;
-   boundary = targetm.calls.function_arg_boundary (TYPE_MODE (type), type);
- 
-   /* When we align parameter on stack for caller, if the parameter
-  alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
-  aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  We will match callee
-  here with caller.  */
-   if (boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
- boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
- 
-   boundary /= BITS_PER_UNIT;
- 
-   /* Hoist the valist value into a temporary for the moment.  */
-   valist_tmp = get_initialized_tmp_var (valist, pre_p, NULL);
- 
-   /* va_list pointer is aligned to PARM_BOUNDARY.  If argument actually
-  requires greater alignment, we must perform dynamic alignment.  */
-   if (boundary > align
-   && !integer_zerop (TYPE_SIZE (type)))
- {
-   t = build2 (MODIFY_EXPR, TREE_TYPE (valist), valist_tmp,
- 		  fold_build_pointer_plus_hwi (valist_tmp, boundary - 1));
-   gimplify_and_add (t, pre_p);
- 
-   t = build2 (MODIFY_EXPR, TREE_TYPE (valist), valist_tmp,
- 		  fold_build2 (BIT_AND_EXPR, TREE_TYPE (valist),
- 			   valist_tmp,
- 			   build_int_cst (TREE_TYPE (valist), -boundary)));
-   gimplify_and_add (t, pre_p);
- }
-   else
- boundary = align;
- 
-   /* If the actual alignment is less than the alignment of the type,
-  adjust the type accordingly so that we don't assume strict alignment
-  when dereferencing the pointer.  */
-   boundary *= BITS_PER_UNIT;
-   if (boundary < TYPE_ALIGN (type))
- {
-   type = build_variant_type_copy (type);
-   TYPE_ALIGN (type) = boundary;
- }
- 
-   /* Compute the rounded size of the type.  */
-   type_size = size_in_bytes (type);
-   rounded_size = round_up (type_size, align);
- 
-   /* Reduce rounded_size so it's sharable with the postqueue.  */
-   gimplify_expr (&rounded_size, pre_p, post_p, is_gimple_val, fb_rvalue);
- 
-   /* Get AP.  */
-   addr = valist_tmp;
-   if (PAD_VARARGS_DOWN && !integer_zerop (rounded_size))
- {
-   /* Small args are padded downward.  */
-   t = fold_build2_loc (input_location, GT_EXPR, sizetype,
- 		   rounded_size, size_int (align));
-   t = fold_build3 (COND_EXPR, sizetype, t, size_zero_node,
- 		   size_binop (MINUS_EXPR, rounded_size, type_size));
-   addr = fold_build_pointer_plus (addr, t);
- }
- 
-   /* Compute new value for AP.  */
-   t = fold_build_pointer_plus (valist_tmp, rounded_size);
-   t = build2 (MODIFY_EXPR, TREE_TYPE (valist), valist, t);
-   gimplify_and_add (t, pre_p);
- 
-   addr = fold_convert (build_pointer_type (type), addr);
- 
-

Re: [patch] The remainder of tree-flow.h refactored.

2013-10-09 Thread Andrew MacLeod

On 10/09/2013 01:48 PM, Richard Biener wrote:

Andrew MacLeod  wrote:

On 10/08/2013 06:22 AM, Richard Biener wrote:

unvisit_body isn't generic enough to warrant moving out of gimplify.c
(the only user).

Bah, now I remember.. so there *are* other users.. this routine is
called from various front ends.. fortran, c-family and cp all call it.
That is why I wanted to move it to tree.[ch].  it doesn't belong in a
gimple file..  it operates on trees and is also used by tree front
ends.

But afaik as part of gimplifying. The gimplifier is part of the 
frontend-middleend interface. Lumping all of it into tree.c isn't sustainable.

Richard.



Fair enough.   I'll adjust...  the front end files which use that 
routine will just have to include gimplify.h


Andrew


Re: [patch] The remainder of tree-flow.h refactored.

2013-10-09 Thread Andrew MacLeod

On 10/09/2013 02:15 PM, Andrew MacLeod wrote:

On 10/09/2013 01:48 PM, Richard Biener wrote:

Andrew MacLeod  wrote:

On 10/08/2013 06:22 AM, Richard Biener wrote:

unvisit_body isn't generic enough to warrant moving out of gimplify.c
(the only user).

Bah, now I remember.. so there *are* other users.. this routine is
called from various front ends.. fortran, c-family and cp all call it.
That is why I wanted to move it to tree.[ch].  it doesn't belong in a
gimple file..  it operates on trees and is also used by tree front
ends.
But afaik as part of gimplifying. The gimplifier is part of the 
frontend-middleend interface. Lumping all of it into tree.c isn't 
sustainable.


Richard.



Fair enough.   I'll adjust...  the front end files which use that 
routine will just have to include gimplify.h


Unless maybe we should expand the gimplify module to have a 
gimplify-fe.[ch] which includes the routines the front ends require..?  
There are 3 or 4 other ones in there I have noticed that are called from 
the front ends.


Andrew



Re: [PATCH] Fix libstdc++/58659.cc test case.

2013-10-09 Thread Jonathan Wakely
On 9 October 2013 16:15, Jakub Jelinek wrote:
> On Wed, Oct 09, 2013 at 04:12:21PM +0100, Jonathan Wakely wrote:
>> On 9 October 2013 15:26, Marcus Shawcroft wrote:
>> > The test case added here:
>> >
>> > http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00474.html
>> >
>> > Introduced an unprototyped call to abort() resulting in failures due to
>> > unexpected warnings in aarch64-none-elf cross testing.
>> >
>> > Committed to trunk as obvious.
>>
>> Thanks, I forgot to change that part when adding it to the testsuite.
>> I think we should really use a VERIFY there.
>>
>> Jakub, can I fix the testcase on the 4.8 branch that was committed for
>> http://gcc.gnu.org/ml/gcc-patches/2013-10/msg00478.html?
>
> Ok.

Thanks. The attached changes are committed to the trunk and 4.8 branch.


2013-10-09  Jonathan Wakely  

* testsuite/20_util/shared_ptr/cons/58659.cc: Use VERIFY instead of
aborting.


49.patch
Description: Binary data


48.patch
Description: Binary data


[Google 4.8 Patch] Generate gnu-pubnames for definitions only. Not declarations.

2013-10-09 Thread Sterling Augustine
Hi Cary,

The enclosed patch stops declaration DIEs from generating
gnu-pubnames. Declaration DIEs don't include enough information to be
useful to gdb, so it has to scan all the entries in the index until it
finds a definition. Not including declarations prevents this problem.

Google ref: b/10191963.

Sterling

2013-10-09  Sterling Augustine  

* dwarf2out.c (include_pubname_in_output): Add conditional on
is_declaration_die
and debug_generate_pubnames.


patch.diff
Description: Binary data


Re: [PATCH] Enhance phiopt to handle BIT_AND_EXPR

2013-10-09 Thread Jeff Law

On 09/30/13 03:29, Zhenqiang Chen wrote:

Hi,

The patch enhances phiopt to handle cases like:

   if (a == 0 && (...))
 return 0;
   return a;

Boot strap and no make check regression on X86-64 and ARM.

Is it OK for trunk?

Thanks!
-Zhenqiang

ChangeLog:
2013-09-30  Zhenqiang Chen  

* tree-ssa-phiopt.c (operand_equal_for_phi_arg_p_1): New.
(value_replacement): Move a check to operand_equal_for_phi_arg_p_1.

testsuite/ChangeLog:
2013-09-30  Zhenqiang Chen  

* gcc.dg/tree-ssa/phi-opt-11.c: New test case.
So I made some minor changes.  First, the duplicated code was factored 
out into its own function.  Block comments were added to the two new 
functions and comments within the functions were improved.  Finally, I 
added two additional cases to the testcase to show other cases it handles.


I also fixed some trailing whitespace in tree-ssa-phiopt.c unrelated to 
your changes.



Bootstrapped & regression tested on x86_64-unknown-linux-gnu.  Installed 
on the trunk.  Final patch attached for reference.



diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b427946..b1028bf 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2013-10-09  Zhenqiang Chen  
+
+   * tree-ssa-phiopts.c (rhs_is_fed_for_value_replacement): New function.
+   (operand_equal_for_value_replacement): New function, extracted from
+   value_replacement and enhanced to catch more cases.
+   (value_replacement): Use operand_equal_for_value_replacement.
+
 2013-10-09  Andrew MacLeod  
 
* loop-doloop.c (doloop_modify, doloop_optimize): Use 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 66b3c38..76cce59 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2013-10-09  Zhenqiang Chen  
+
+   * gcc.dg/tree-ssa/phi-opt-11.c: New test.
+
 2013-10-09  Marek Polacek  
 
PR c++/58635
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-11.c 
b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-11.c
new file mode 100644
index 000..7c83007
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-11.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-optimized" } */
+
+int f(int a, int b, int c)
+{
+  if (a == 0 && b > c)
+   return 0;
+ return a;
+}
+
+int g(int a, int b, int c)
+{
+  if (a == 42 && b > c)
+   return 42;
+ return a;
+}
+
+int h(int a, int b, int c, int d)
+{
+  if (a == d && b > c)
+   return d;
+ return a;
+}
+/* { dg-final { scan-tree-dump-times "if" 0 "optimized"} } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c
index 8e1ddab..adf8a28 100644
--- a/gcc/tree-ssa-phiopt.c
+++ b/gcc/tree-ssa-phiopt.c
@@ -110,6 +110,26 @@ static bool gate_hoist_loads (void);
This opportunity can sometimes occur as a result of other
optimizations.
 
+
+   Another case caught by value replacement looks like this:
+
+ bb0:
+   t1 = a == CONST;
+   t2 = b > c;
+   t3 = t1 & t2;
+   if (t3 != 0) goto bb1; else goto bb2;
+ bb1:
+ bb2:
+   x = PHI (CONST, a)
+
+   Gets replaced with:
+ bb0:
+ bb2:
+   t1 = a == CONST;
+   t2 = b > c;
+   t3 = t1 & t2;
+   x = a;
+
ABS Replacement
---
 
@@ -155,7 +175,7 @@ static bool gate_hoist_loads (void);
 
Adjacent Load Hoisting
--
-   
+
This transformation replaces
 
  bb0:
@@ -286,7 +306,7 @@ single_non_singleton_phi_for_edges (gimple_seq seq, edge 
e0, edge e1)
phi optimizations.  Both share much of the infrastructure in how
to match applicable basic block patterns.  DO_STORE_ELIM is true
when we want to do conditional store replacement, false otherwise.
-   DO_HOIST_LOADS is true when we want to hoist adjacent loads out 
+   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
of diamond control flow patterns, false otherwise.  */
 static unsigned int
 tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
@@ -389,7 +409,7 @@ tree_ssa_phiopt_worker (bool do_store_elim, bool 
do_hoist_loads)
  continue;
}
   else
-   continue;  
+   continue;
 
   e1 = EDGE_SUCC (bb1, 0);
 
@@ -437,7 +457,7 @@ tree_ssa_phiopt_worker (bool do_store_elim, bool 
do_hoist_loads)
 
  if (!candorest)
continue;
- 
+
  phi = single_non_singleton_phi_for_edges (phis, e1, e2);
  if (!phi)
continue;
@@ -672,6 +692,93 @@ jump_function_from_stmt (tree *arg, gimple stmt)
   return false;
 }
 
+/* RHS is a source argument in a BIT_AND_EXPR which feeds a conditional
+   of the form SSA_NAME NE 0.
+
+   If RHS is fed by a simple EQ_EXPR comparison of two values, see if
+   the two input values of the EQ_EXPR match arg0 and arg1.
+
+   If so update *code and return TRUE.  Otherwise return FALSE.  */
+
+static bool
+rhs_is_fed_for_value_replacement (const_tree arg0, const_tree arg1,
+  

Re: [PATCH] Enhance phiopt to handle BIT_AND_EXPR

2013-10-09 Thread Jeff Law

On 10/08/13 21:39, Zhenqiang Chen wrote:


I'll go ahead and pull the common bits into a single function and commit on
Zhenqiang's behalf.


Thank you!
No problem.  If you think you'll be contributing regularly, you might 
want to go ahead and ask for write-after-approval privileges.


jeff


Re: [PATCH][RFC] fix reload causing ICE in subreg_get_info on m68k (PR58369)

2013-10-09 Thread Jeff Law

On 09/28/13 09:30, Mikael Pettersson wrote:

This patch fixes PR58369, an ICE in subreg_get_info when compiling
boost for m68k-linux.

choose_reload_regs attempts to reload a DFmode (8-byte) reg, finds
an XFmode (12-byte) reg in "last_reg", and calls subreg_regno_offset
with these two modes and a subreg offset of zero.  However, this is
not a correct lowpart subreg offset for big-endian and these two modes,
so the lowpart subreg check in subreg_get_info fails, and the code
continues to

 gcc_assert ((GET_MODE_SIZE (xmode) % GET_MODE_SIZE (ymode)) == 0);

which fails because (12 % 8) != 0.

choose_reload_regs passes the constant zero, in all cases where the reg
isn't already a subreg, as the subreg offset to subreg_regno_offset, even
though lowpart subregs on big-endian targets require an explicit offset
computation.  I think that is a bug.

I believe other big-endian targets don't see this ICE because
a) they define CANNOT_CHANGE_MODE_CLASS to reject differently-sized
modes in floating-point registers (which prevents this path in
choose_reload_regs), or
b) their differently-sized modes are such that the size of a larger
mode is a whole multiple of the size of the smaller mode (which
allows the gcc_assert above to pass).

This patch changes choose_reload_regs to call subreg_lowpart_offset
to pass an endian-correct offset to subreg_regno_offset, except where
the offset comes from a pre-existing subreg.

[Defining CANNOT_CHANGE_MODE_CLASS appropriately for m68k also fixes
the ICE, but I don't think the m68k backend really wants that, and I
think it just papers over a generic bug.]

Tested with trunk and 4.8 on {m68k,sparc64,powerpc64}-linux (big-endian),
and on x86_64-linux/armv5tel-linux-gnueabi (little-endian).  No regressions.

Comments?
Is this Ok for trunk?

gcc/

2013-09-28  Mikael Pettersson  

PR rtl-optimization/58369
* reload1.c (choose_reload_regs): Use subreg_lowpart_offset
to pass endian-correct lowpart offset to subreg_regno_offset.
Thanks Mikael.  My only concern is the lack of adjustment when the value 
found was already a SUBREG.


ie, let's assume rld[r].in_reg was something like
(subreg:XF (reg:DF) 0)

and our target is (reg:DF)

In this case it seems to me we still want to compute the subreg offset, 
right?


jeff



Re: [PING] 3 patches waiting for approval/review

2013-10-09 Thread Jeff Law

On 08/21/13 03:21, Andreas Krebbel wrote:

[RFC] Allow functions calling mcount before prologue to be leaf functions
http://gcc.gnu.org/ml/gcc-patches/2013-04/msg00993.html
I don't think this is necessarily correct for all targets.  ISTM the 
ability to consider a function calling mcount as a leaf needs to be a 
property of the target.


From a safety standpoint, the default value for that should be false, 
which matches current behaviour.




[PATCH] PR57377: Fix mnemonic attribute
http://gcc.gnu.org/ml/gcc-patches/2013-05/msg01364.html

[PATCH] Doc: Add documentation for the mnemonic attribute
http://gcc.gnu.org/ml/gcc-patches/2013-05/msg01436.html
These are OK.  Sorry for the absurdly long delay in getting this stuff 
reviewed.


jeff


Re: [PATCH]: Fix PR58542, Arguments of __atomic_* functions are converted in unsigned mode

2013-10-09 Thread Uros Bizjak
On Wed, Oct 9, 2013 at 6:23 PM, Richard Henderson  wrote:

> This doesn't seem right at all.
>
> The bug is that I gets set to UINT64_MAX, right?  Where's the
> incorrect conversion from int to __int128_t?  Surely you can

Please see Comment #5 of PR58542:

--cut here--

The problem actually starts in expand_atomic_compare_and_swap, in:

(gdb) list
7339  create_convert_operand_to (&ops[3], expected, mode, true);
7340  create_convert_operand_to (&ops[4], desired, mode, true);
7341  create_integer_operand (&ops[5], is_weak);
7342  create_integer_operand (&ops[6], succ_model);
7343  create_integer_operand (&ops[7], fail_model);
7344  expand_insn (icode, 8, ops);

ops[4] is converted in unsigned mode, so from "desired" operand:

(gdb) p debug_rtx (desired)
(const_int -1 [0x])

we got:

(gdb) p ops[4]
$45 = {type = EXPAND_CONVERT_TO, unsigned_p = 1, unused = 0, mode =
TImode, value = 0x7fffeffc21e0}
(gdb) p debug_rtx (ops[4].value)
(const_double -1 [0x] 0 [0] 0 [0] 0 [0])

So, it is actually expansion of atomic_compare_and_swap, which doesn't
account for signedness of "desired" operand.

Manually changing the argument from "true" to "false" for ops[4]
generates correct code.

--cut here--

> produce a reduced test case that doesn't involve all of 
> to show that.

I did try, but without necessary c++ expertise, I was not able to
create equivalent c testcase.

Uros.


Re: Patch to split out new warning flag for floating point conversion

2013-10-09 Thread Joseph S. Myers
On Wed, 9 Oct 2013, Joshua J Cogliati wrote:

> Because this changes -Wextra, when compiling with -Werror and -Wextra,
> some code will not compile now.  The code in gcc that this occurred in
> was changed to use explicit casts.  The patch would be shorter if

I think those changes should be submitted separately, as cleanups that can 
be proposed on their own merits.

Note that casts use spaces, "(type) expr", as detailed at 
.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: Patch to split out new warning flag for floating point conversion

2013-10-09 Thread Joseph S. Myers
Also note that this patch needs to add testcases to the testsuite 
(gcc/testsuite/c-c++-common/, probably) testing what cases generate 
warnings with the new option and what cases don't.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH i386 3/8] [AVX512] [2/n] Add AVX-512 patterns: Fix missing `v' constraint.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:24 AM, Kirill Yukhin wrote:
> Here's 2nd subpatch. It fixes missing `v' constraints.

And one v constraint that shouldn't have been.

Ok.


r~


Re: [PATCH i386 3/8] [AVX512] [3/n] Add AVX-512 patterns: VF1 and VI iterators.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:24 AM, Kirill Yukhin wrote:
> Here's 3rd subpatch. It extends VF1 and VI iterators.

Ok.



r~


Re: [PATCH i386 3/8] [AVX512] [4/n] Add AVX-512 patterns: V iterator.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:25 AM, Kirill Yukhin wrote:
> Here's 4th subpatch. It extends V iterator.

And much much more that's totally unrelated to changing V.

That said, I didn't see anything wrong in there.

Ok.


r~


Re: [patch] The remainder of tree-flow.h refactored.

2013-10-09 Thread Andrew MacLeod

On 10/09/2013 02:18 PM, Andrew MacLeod wrote:

On 10/09/2013 02:15 PM, Andrew MacLeod wrote:


Fair enough.   I'll adjust...  the front end files which use that 
routine will just have to include gimplify.h


Unless maybe we should expand the gimplify module to have a
gimplify-fe.[ch] which includes the routines the front ends require..?
There are 3 or 4 other ones in there I have noticed that are called
from the front ends.



Ugg. I played around with it some, but it needs to be sorted through a
lot better, so any attempts at that seem premature right now.


Instead, I'll just move the few remaining prototypes from gimplify.c that
are in tree-flow.h into gimple.h which has a lot of the other gimplify
prototypes in it.  I'll deal with gimplify.h and all these issues later
on when I'm more prepared and get to processing gimple.h.


I'll follow it by then reducing the include list for the new tree-cfg.h 
so that it truly represents just tree-cfg routines.


Bootstraps on x86_64-unknown-linux-gnu. regressions running.. OK?

Andrew


	* tree-flow.h: Move some prototypes to gimple.h.
	* gimple.h: Relocate some prototypes from tree-flow.h

Index: tree-flow.h
===
*** tree-flow.h	(revision 203320)
--- tree-flow.h	(working copy)
*** extern tree gimplify_build1 (gimple_stmt
*** 159,173 
  extern void extract_true_false_edges_from_block (basic_block, edge *, edge *);
  extern unsigned int execute_fixup_cfg (void);
  
- /* In gimplify.c  */
- tree force_gimple_operand_1 (tree, gimple_seq *, gimple_predicate, tree);
- tree force_gimple_operand (tree, gimple_seq *, bool, tree);
- tree force_gimple_operand_gsi_1 (gimple_stmt_iterator *, tree,
-  gimple_predicate, tree,
-  bool, enum gsi_iterator_update);
- tree force_gimple_operand_gsi (gimple_stmt_iterator *, tree, bool, tree,
- 			   bool, enum gsi_iterator_update);
- 
- 
  #endif /* _TREE_FLOW_H  */
--- 159,162 
Index: gimple.h
===
*** gimple.h	(revision 203319)
--- gimple.h	(working copy)
*** void gsi_commit_one_edge_insert (edge, b
*** 5354,5359 
--- 5354,5368 
  void gsi_commit_edge_inserts (void);
  gimple gimple_call_copy_skip_args (gimple, bitmap);
  
+ /* In gimplify.c.  */
+ tree force_gimple_operand_1 (tree, gimple_seq *, gimple_predicate, tree);
+ tree force_gimple_operand (tree, gimple_seq *, bool, tree);
+ tree force_gimple_operand_gsi_1 (gimple_stmt_iterator *, tree,
+  gimple_predicate, tree,
+  bool, enum gsi_iterator_update);
+ tree force_gimple_operand_gsi (gimple_stmt_iterator *, tree, bool, tree,
+ 			   bool, enum gsi_iterator_update);
  
  /* Convenience routines to walk all statements of a gimple function.
 Note that this is useful exclusively before the code is converted
*** gimple_seq_set_location (gimple_seq seq,
*** 5478,5482 
  
  #define PERCENT(x,y) ((float)(x) * 100.0 / (float)(y))
  
- 
  #endif  /* GCC_GIMPLE_H */
--- 5487,5490 


Re: [PATCH i386 3/8] [AVX512] [5/n] Add AVX-512 patterns: Introduce `multdiv' code iterator.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:25 AM, Kirill Yukhin wrote:
> Here's 5th subpatch. It introduces `multdiv' code iterator.

This is the sort of patch I like to see.  It's the first one
you've sent that's done exactly one thing.  Congratulations.

Ok.


r~


Re: [PATCH i386 3/8] [AVX512] [6/n] Add AVX-512 patterns: VI2 and VI124 iterators.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:26 AM, Kirill Yukhin wrote:
> Here's 6th subpatch. It extends VI2 and VI124 iterators.

Ok.


r~


Re: [PATCH i386 3/8] [AVX512] [7/n] Add AVX-512 patterns: VI4 and VI8 iterators.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:26 AM, Kirill Yukhin wrote:
> Here's 7th subpatch. It extends VI4 and VI8 iterators.

Ok.


r~


Re: [PATCH i386 3/8] [AVX512] [8/n] Add AVX-512 patterns: VI48 and VI48_AVX2 iterators.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:27 AM, Kirill Yukhin wrote:
> Here's 8th subpatch. It extends VI48 and VI48_AVX2 iterators.

Ok.


r~


Re: [PATCH i386 3/8] [AVX512] [9/n] Add AVX-512 patterns: VI124_AVX2, VI8F iterators.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:27 AM, Kirill Yukhin wrote:
> Here's 9th subpatch. It extends VI124_AVX2_48 and VI8F iterators.

Ok.


r~


Re: [Google 4.8 Patch] Generate gnu-pubnames for definitions only. Not declarations.

2013-10-09 Thread Cary Coutant
> 2013-10-09  Sterling Augustine  
>
> * dwarf2out.c (include_pubname_in_output): Add conditional on
> is_declaration_die
> and debug_generate_pubnames.

OK for google 4.8 branch. Thanks!

-cary


[Patch, Fortran] PR58226 - Avoid invalid mem access with compiler_options

2013-10-09 Thread Tobias Burnus
A rather obvious fix; the memory is freed by the caller 
(gfc_simplify_compiler_options). It is unlikely that the compiler has no 
arguments as the driver tends to send some, e.g. "-mtune=generic 
-march=x86-64" on my system. However, it is better to be safe than sorry.


Built and regtested on x86-64-gnu-linux.
OK?

Tobias
2013-10-09  Tobias Burnus  

	PR fortran/58226
	* options.c (gfc_get_option_string): Handle zero arg case.

diff --git a/gcc/fortran/options.c b/gcc/fortran/options.c
index 3a9c508..6e4e7c1 100644
--- a/gcc/fortran/options.c
+++ b/gcc/fortran/options.c
@@ -1166,6 +1166,10 @@ gfc_get_option_string (void)
   size_t len, pos;
   char *result;
 
+  /* Allocate and return a one-character string with '\0'.  */
+  if (!save_decoded_options_count)
+return XCNEWVEC (char, 1);
+
   /* Determine required string length.  */
 
   len = 0;


Re: [PATCH i386 3/8] [AVX512] [10/n] Add AVX-512 patterns: VI248_AVX2_8_AVX512F and VI124_256_48_AVX512F iterators.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:27 AM, Kirill Yukhin wrote:
> Here's 10th subpatch. It introduces VI248_AVX2_8_AVX512F and VI124_256_48_512 
> iterators.

Ok.


r~


Re: [PATCH i386 3/8] [AVX512] [11/n] Add AVX-512 patterns: FMA.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:28 AM, Kirill Yukhin wrote:
> +;; CPUID bit AVX512F enables evex encoded scalar and 512-bit fma.  It doesn't
> +;; care about FMA bit, so we enable fma for TARGET_AVX512F even when 
> TARGET_FMA
> +;; and TARGET_FMA4 are both false.

How do you force an evex encoding of the instruction?

Do you really mean that cpuid AVX512F, !FMA will not #OP
for a vex (but not evex) encoded version of the same insn?


r~


Re: [PATCH i386 3/8] [AVX512] [12/n] Add AVX-512 patterns: V_512 and VI_512 iterators.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:28 AM, Kirill Yukhin wrote:
> Here's 12th subpatch. It introduces VF_512 and VI_512 iterators.

Ok.


r~


Re: [PATCH i386 3/8] [AVX512] [13/n] Add AVX-512 patterns: VI4_AVX iterator.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:29 AM, Kirill Yukhin wrote:
> Here's 13th subpatch. It introduces VI4_AVX iterator.

Ok.


r~


Re: [PATCH i386 3/8] [AVX512] [14/n] Add AVX-512 patterns: VI48F_256_512 iterator.

2013-10-09 Thread Richard Henderson
On 10/09/2013 03:29 AM, Kirill Yukhin wrote:
> Here's 14th subpatch. It introduces VI48F_256_512 iterator.

Ok.


r~


[RFC] [Testsuite,ARM] Neon intrinsics executable tests

2013-10-09 Thread Christophe Lyon
Hi,

This patch is a first small sample of dejagnu-ization of my ARM Neon
intrinsics tests.

It's derived from my previous work at
http://gitorious.org/arm-neon-tests/arm-neon-tests which supports all
the ARM intrinsics, with executable tests. As I have to manually
transform each test (to include the expected data, and a few other
modifications), it's quite tedious.

I'd like your feedback before continuing, as there are a lot more
files to come.

I have made some cleanup to help review, but the two .h files will
need to grow as more intrinsics will be added (see the original ones).

I'd like to keep the modifications at a minimal level, to save my time
when adapting each test (there are currently 145 test files, so 143
left :-).

Thanks,

Christophe.

This patch only introduces new files.
2013-10-03  Christophe Lyon  

testsuite/gcc.target/arm/neon-intrinsics/
* neon-intrinsics.exp: New driver file.
* arm-neon-ref.h: New file, with common vector construction
helpers.
* compute_ref_data.h: New file, with helpers for input data
initialization.
* ref_vaba.c: New test file for the vaba family of intrinsics.
* ref_vld1.c: New test file for vld1.
diff -rNup '--exclude=.git' 
gcc-fsf/gcc/testsuite/gcc.target/arm/neon-intrinsics/neon-intrinsics.exp 
gcc-fsf-neontests/gcc/testsuite/gcc.target/arm/neon-intrinsics/neon-intrinsics.exp
--- gcc-fsf/gcc/testsuite/gcc.target/arm/neon-intrinsics/neon-intrinsics.exp
1970-01-01 01:00:00.0 +0100
+++ 
gcc-fsf-neontests/gcc/testsuite/gcc.target/arm/neon-intrinsics/neon-intrinsics.exp
  2013-05-08 23:08:46.271786347 +0200
@@ -0,0 +1,35 @@
+# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# .
+
+# GCC testsuite that uses the `dg.exp' driver.
+
+# Exit immediately if this isn't an ARM target.
+if ![istarget arm*-*-*] then {
+  return
+}
+
+# Load support procs.
+load_lib gcc-dg.exp
+
+# Initialize `dg'.
+dg-init
+
+# Main loop.
+gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cCS\]]] \
+   ""
+
+# All done.
+dg-finish
diff -rNup '--exclude=.git' 
gcc-fsf/gcc/testsuite/gcc.target/arm/neon-intrinsics/arm-neon-ref.h 
gcc-fsf-neontests/gcc/testsuite/gcc.target/arm/neon-intrinsics/arm-neon-ref.h
--- gcc-fsf/gcc/testsuite/gcc.target/arm/neon-intrinsics/arm-neon-ref.h 
1970-01-01 01:00:00.0 +0100
+++ 
gcc-fsf-neontests/gcc/testsuite/gcc.target/arm/neon-intrinsics/arm-neon-ref.h   
2013-05-09 00:48:59.395628726 +0200
@@ -0,0 +1,349 @@
+#ifndef _ARM_NEON_REF_H_
+#define _ARM_NEON_REF_H_
+
+#include 
+#include 
+#include 
+#include 
+
+#define xSTR(X) #X
+#define STR(X) xSTR(X)
+
+#define xNAME1(V,T) V ## _ ##  T
+#define xNAME(V,T) xNAME1(V,T)
+
+#define VAR(V,T,W) xNAME(V,T##W)
+#define VAR_DECL(V, T, W) T##W##_t VAR(V,T,W)
+
+#define VECT_NAME(T, W, N) T##W##x##N
+#define VECT_ARRAY_NAME(T, W, N, L) T##W##x##N##x##L
+#define VECT_TYPE(T, W, N) xNAME(VECT_NAME(T,W,N),t)
+#define VECT_ARRAY_TYPE(T, W, N, L) xNAME(VECT_ARRAY_NAME(T,W,N,L),t)
+
+#define VECT_VAR(V,T,W,N) xNAME(V,VECT_NAME(T,W,N))
+#define VECT_VAR_DECL(V, T, W, N) T##W##_t VECT_VAR(V,T,W,N)
+
+/* Array declarations.  */
+#define ARRAY(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[N]
+
+/* Check results vs expected values.  */
+#define CHECK(MSG,T,W,N,FMT)   \
+  for(i=0; i
+#include "arm-neon-ref.h"
+
+/* Initialization helpers; 4 slices are needed for vld2, vld3 and
+   vld4.  */
+#define MY_INIT_TAB(T,W,N) xNAME(INIT_TAB,N)(T##W##_t)
+#define MY_INIT_TAB2(T,W,N) xNAME(INIT_TAB2,N)(T##W##_t)
+#define MY_INIT_TAB3(T,W,N) xNAME(INIT_TAB3,N)(T##W##_t)
+#define MY_INIT_TAB4(T,W,N) xNAME(INIT_TAB4,N)(T##W##_t)
+
+/* Initialized input buffers.  */
+#define VECT_VAR_DECL_INIT(V, T, W, N) \
+  VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,N) };
+
+/* Sample initialization vectors.  */
+#define INIT_TAB_1(T)  \
+  (T)-16,
+
+#define INIT_TAB_2(T)  \
+  (T)-16, (T)-15,
+
+#define INIT_TAB_4(T)  \
+  (T)-16, (T)-15, (T)-14, (T)-13,
+
+#define INIT_TAB_8(T)  \
+  (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9,
+
+#define INIT_TAB_16(T) \
+  (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9,   \

Go patch committed: Fix complex division of NaN / 0

2013-10-09 Thread Ian Lance Taylor
This patch to the Go frontend and libgo fixes complex division of NaN /
0.  Go expects that to produce NaN.  When using libgcc it could produce
Inf.  Specifically NaN+1i / 0+0i produced NaN+Infi, which by the rules
of C99 Annex G is Inf.  This may be correct for C, but it's not correct
for Go.  Bootstrapped and ran Go testsuite on x86_64-unknown-linux-gnu.
Committed to mainline.  Will commit to 4.8 branch when it reopens.

Ian

diff -r c54f26ba8279 go/expressions.cc
--- a/go/expressions.cc	Tue Oct 08 16:55:52 2013 -0700
+++ b/go/expressions.cc	Wed Oct 09 15:17:14 2013 -0700
@@ -5967,6 +5967,43 @@
 right);
 }
 
+  // For complex division Go wants slightly different results than the
+  // GCC library provides, so we have our own runtime routine.
+  if (this->op_ == OPERATOR_DIV && this->left_->type()->complex_type() != NULL)
+{
+  const char *name;
+  tree *pdecl;
+  Type* ctype;
+  static tree complex64_div_decl;
+  static tree complex128_div_decl;
+  switch (this->left_->type()->complex_type()->bits())
+	{
+	case 64:
+	  name = "__go_complex64_div";
+	  pdecl = &complex64_div_decl;
+	  ctype = Type::lookup_complex_type("complex64");
+	  break;
+	case 128:
+	  name = "__go_complex128_div";
+	  pdecl = &complex128_div_decl;
+	  ctype = Type::lookup_complex_type("complex128");
+	  break;
+	default:
+	  go_unreachable();
+	}
+  Btype* cbtype = ctype->get_backend(gogo);
+  tree ctype_tree = type_to_tree(cbtype);
+  return Gogo::call_builtin(pdecl,
+this->location(),
+name,
+2,
+ctype_tree,
+ctype_tree,
+fold_convert_loc(gccloc, ctype_tree, left),
+type,
+fold_convert_loc(gccloc, ctype_tree, right));
+}
+
   tree compute_type = excess_precision_type(type);
   if (compute_type != NULL_TREE)
 {
diff -r c54f26ba8279 go/runtime.cc
--- a/go/runtime.cc	Tue Oct 08 16:55:52 2013 -0700
+++ b/go/runtime.cc	Wed Oct 09 15:17:14 2013 -0700
@@ -42,6 +42,8 @@
   RFT_RUNE,
   // Go type float64, C type double.
   RFT_FLOAT64,
+  // Go type complex64, C type __complex float.
+  RFT_COMPLEX64,
   // Go type complex128, C type __complex double.
   RFT_COMPLEX128,
   // Go type string, C type struct __go_string.
@@ -126,6 +128,10 @@
 	  t = Type::lookup_float_type("float64");
 	  break;
 
+	case RFT_COMPLEX64:
+	  t = Type::lookup_complex_type("complex64");
+	  break;
+
 	case RFT_COMPLEX128:
 	  t = Type::lookup_complex_type("complex128");
 	  break;
@@ -216,6 +222,7 @@
 case RFT_UINTPTR:
 case RFT_RUNE:
 case RFT_FLOAT64:
+case RFT_COMPLEX64:
 case RFT_COMPLEX128:
 case RFT_STRING:
 case RFT_POINTER:
diff -r c54f26ba8279 go/runtime.def
--- a/go/runtime.def	Tue Oct 08 16:55:52 2013 -0700
+++ b/go/runtime.def	Wed Oct 09 15:17:14 2013 -0700
@@ -68,6 +68,12 @@
 	   P1(STRING), R1(SLICE))
 
 
+// Complex division.
+DEF_GO_RUNTIME(COMPLEX64_DIV, "__go_complex64_div",
+	   P2(COMPLEX64, COMPLEX64), R1(COMPLEX64))
+DEF_GO_RUNTIME(COMPLEX128_DIV, "__go_complex128_div",
+	   P2(COMPLEX128, COMPLEX128), R1(COMPLEX128))
+
 // Make a slice.
 DEF_GO_RUNTIME(MAKESLICE1, "__go_make_slice1", P2(TYPE, UINTPTR), R1(SLICE))
 DEF_GO_RUNTIME(MAKESLICE2, "__go_make_slice2", P3(TYPE, UINTPTR, UINTPTR),
diff -r c54f26ba8279 libgo/Makefile.am
--- a/libgo/Makefile.am	Tue Oct 08 16:55:52 2013 -0700
+++ b/libgo/Makefile.am	Wed Oct 09 15:17:14 2013 -0700
@@ -424,6 +424,7 @@
 	runtime/go-caller.c \
 	runtime/go-callers.c \
 	runtime/go-can-convert-interface.c \
+	runtime/go-cdiv.c \
 	runtime/go-cgo.c \
 	runtime/go-check-interface.c \
 	runtime/go-construct-map.c \
diff -r c54f26ba8279 libgo/runtime/go-cdiv.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +
+++ b/libgo/runtime/go-cdiv.c	Wed Oct 09 15:17:14 2013 -0700
@@ -0,0 +1,46 @@
+/* go-cdiv.c -- complex division routines
+
+   Copyright 2013 The Go Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style
+   license that can be found in the LICENSE file.  */
+
+/* Calls to these functions are generated by the Go frontend for
+   division of complex64 or complex128.  We use these because Go's
+   complex division expects slightly different results from the GCC
+   default.  When dividing NaN+1.0i / 0+0i, Go expects NaN+NaNi but
+   GCC generates NaN+Infi.  NaN+Infi seems wrong since the rules of
+   C99 Annex G specify that if either side of a complex number is Inf,
+   the whole number is Inf, but an operation involving NaN ought
+   to result in NaN, not Inf.  */
+
+__complex float
+__go_complex64_div (__complex float a, __complex float b)
+{
+  if (__builtin_expect (b == 0+0i, 0))
+{
+  if (!__builtin_isinff (__real__ a)
+	  && !__builtin_isinff (__imag__ a)
+	  && (__builtin_isnanf (__real__ a) || __builtin_isnanf (__imag__ a)))
+	{
+	  /* Pass "1" to nanf to match math/bits.go.  */
+	  return __builtin_nanf("1") + __builtin_nanf("1")*1i;
+	}
+}
+  return a / b;
+}
+
+__complex double
+__go_complex128_div (__complex double a, __complex 

Re: [Patch, Fortran] PR58226 - Avoid invalid mem access with compiler_options

2013-10-09 Thread Steve Kargl
On Wed, Oct 09, 2013 at 11:51:30PM +0200, Tobias Burnus wrote:
> A rather obvious fix; the memory is freed by the caller 
> (gfc_simplify_compiler_options). It is unlikely that the compiler has no 
> arguments as the driver tends to send some, e.g. "-mtune=generic 
> -march=x86-64" on my system. However, it is better to be safe than sorry.
> 
> Build and regtested on x86-64-gnu-linux.
> OK?
> 

Ok.

-- 
Steve


[PATCH] Workaround errata for the PMC-Sierra RM7000 cpu.

2013-10-09 Thread Moore, Catherine
Hi Richard,

This patch implements a workaround for errors on the PMC-Sierra RM7000 cpu 
while executing the dmult or dmultu instruction.  The workaround is to insert 
three nops after the dmult/dmultu.

Does this look okay to commit?

Thanks,
Catherine

gcc/
2013-10-09  Catherine Moore  
Chao-ying Fu  >

* doc/invoke.texi (mfix-pmc): Document.
* config/mips/mips.md (mul3): Handle PMC errata.
(mul3_internal): Likewise.
(mul3_pmc): New pattern.
(muldi3_highpart_split): Handle PMC errata.
(mulditi3): Likewise.
(mulditi3_internal): Likewise.
(mulditi3_pmc): New pattern.
* config/mips/mips.opt (mfix-pmc): New option.

testsuite/
2013-10-09  Catherine Moore  
Chao-ying Fu  >

* gcc.target/mips/mips.exp (mips_option_groups): Append -mfix-pmc.
* gcc.target/mips/fix-pmc-[1-6].c: New tests.



pmc.patch
Description: pmc.patch


[PATCH, rs6000] Fix variable permute control vectors for little endian

2013-10-09 Thread Bill Schmidt
Hi,

This is a follow-up to the recent patch that fixed constant permute
control vectors for little endian.  When the control vector is constant,
we can adjust the constant and use a vperm without increasing code size.
When the control vector is unknown, however, we have to generate two
additional instructions to subtract each element of the control vector
from 31 (equivalently, from -1, since only 5 bits are pertinent).  This
patch adds the additional code generation.

There are two main paths to the affected permutes:  via the known
pattern vec_perm, and via an altivec builtin.  The builtin path
causes a little difficulty because there's no way to dispatch a builtin
to two different insns for BE and LE.  I solved this by adding two new
unspecs for the builtins (UNSPEC_VPERM_X and UNSPEC_VPERM_UNS_X).  The
insns for the builtins are changed from a define_insn to a
define_insn_and_split.  We create the _X forms at expand time and later
split them into the correct sequences for BE and LE, using the "real"
UNSPEC_VPERM and UNSPEC_VPERM_UNS to generate the vperm instruction.

For the path via the known pattern, I added a new routine in rs6000.c in
similar fashion to the solution for the constant control vector case.

When the permute control vector is a rotate vector loaded by lvsl or
lvsr, we can generate the desired control vector more cheaply by simply
changing to use the opposite instruction.  We are already doing that
when expanding an unaligned load.  The changes in vector.md avoid
undoing that effort by circumventing the subtract-from-splat (going
straight to the UNSPEC_VPERM).

I bootstrapped and tested this for big endian on
powerpc64-unknown-linux-gnu with no new regressions.  I did the same for
little endian on powerpc64le-unknown-linux-gnu.  Here the results were
slightly mixed: the changes fix 32 test failures, but expose an
unrelated bug in 9 others when -mvsx is permitted on LE (not currently
allowed).  The bug is a missing permute for a vector load in the
unaligned vector load logic that will be fixed in a subsequent patch.

Is this okay for trunk?

Thanks,
Bill


2013-10-09  Bill Schmidt  

* config/rs6000/vector.md (vec_realign_load): Generate vperm
directly to circumvent subtract from splat{31} workaround.
* config/rs6000/rs6000-protos.h (altivec_expand_vec_perm_le): New
prototype.
* config/rs6000/rs6000.c (altivec_expand_vec_perm_le): New.
* config/rs6000/altivec.md (define_c_enum "unspec"): Add
UNSPEC_VPERM_X and UNSPEC_VPERM_UNS_X.
(altivec_vperm_): Convert to define_insn_and_split to
separate big and little endian logic.
(*altivec_vperm__internal): New define_insn.
(altivec_vperm__uns): Convert to define_insn_and_split to
separate big and little endian logic.
(*altivec_vperm__uns_internal): New define_insn.
(vec_permv16qi): Add little endian logic.


Index: gcc/config/rs6000/vector.md
===
--- gcc/config/rs6000/vector.md (revision 203246)
+++ gcc/config/rs6000/vector.md (working copy)
@@ -950,8 +950,15 @@
 emit_insn (gen_altivec_vperm_ (operands[0], operands[1],
 operands[2], operands[3]));
   else
-emit_insn (gen_altivec_vperm_ (operands[0], operands[2],
-operands[1], operands[3]));
+{
+  /* Avoid the "subtract from splat31" workaround for vperm since
+ we have changed lvsr to lvsl instead.  */
+  rtx unspec = gen_rtx_UNSPEC (mode,
+   gen_rtvec (3, operands[2],
+  operands[1], operands[3]),
+   UNSPEC_VPERM);
+  emit_move_insn (operands[0], unspec);
+}
   DONE;
 })
 
Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 203246)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -56,6 +56,7 @@ extern void paired_expand_vector_init (rtx, rtx);
 extern void rs6000_expand_vector_set (rtx, rtx, int);
 extern void rs6000_expand_vector_extract (rtx, rtx, int);
 extern bool altivec_expand_vec_perm_const (rtx op[4]);
+extern void altivec_expand_vec_perm_le (rtx op[4]);
 extern bool rs6000_expand_vec_perm_const (rtx op[4]);
 extern void rs6000_expand_extract_even (rtx, rtx, rtx);
 extern void rs6000_expand_interleave (rtx, rtx, rtx, bool);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 203247)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -28608,6 +28608,54 @@ altivec_expand_vec_perm_const_le (rtx operands[4])
   emit_move_insn (target, unspec);
 }
 
+/* Similarly to altivec_expand_vec_perm_const_le, we must adjust the
+   permute control vector.  But here it's not a constant, so we must
+   ge

[GOOGLE] AutoFDO profile propagation should use all dominators

2013-10-09 Thread Dehao Chen
This patch updates the AutoFDO profile propagation of equivalence
classes: instead of looking at just the immediate dominators, traverse all
dominators. This helps improve profile accuracy.

Bootstrapped and passed regression test.

OK for google-4_8 branch?

Thanks,
Dehao

Index: gcc/auto-profile.c
===
--- gcc/auto-profile.c (revision 203331)
+++ gcc/auto-profile.c (working copy)
@@ -891,7 +891,7 @@ afdo_find_equiv_class (void)
   if (bb->aux != NULL)
  continue;
   bb->aux = bb;
-  dom_bbs = get_dominated_by (CDI_DOMINATORS, bb);
+  dom_bbs = get_all_dominated_blocks (CDI_DOMINATORS, bb);
   FOR_EACH_VEC_ELT (dom_bbs, i, bb1)
  if (bb1->aux == NULL
 && dominated_by_p (CDI_POST_DOMINATORS, bb, bb1)
@@ -904,7 +904,7 @@ afdo_find_equiv_class (void)
  bb->flags |= BB_ANNOTATED;
   }
   }
-  dom_bbs = get_dominated_by (CDI_POST_DOMINATORS, bb);
+  dom_bbs = get_all_dominated_blocks (CDI_POST_DOMINATORS, bb);
   FOR_EACH_VEC_ELT (dom_bbs, i, bb1)
  if (bb1->aux == NULL
 && dominated_by_p (CDI_DOMINATORS, bb, bb1)


Re: [GOOGLE] AutoFDO profile propagation should use all dominators

2013-10-09 Thread Xinliang David Li
ok.

David

On Wed, Oct 9, 2013 at 4:47 PM, Dehao Chen  wrote:
> This patch updates the AutoFDO profile propagation of equivalence
> class: instead of looking just immediate dominators, traverse all
> dominators. This helps improving profile accuracy.
>
> Bootstrapped and passed regression test.
>
> OK for google-4_8 branch?
>
> Thanks,
> Dehao
>
> Index: gcc/auto-profile.c
> ===
> --- gcc/auto-profile.c (revision 203331)
> +++ gcc/auto-profile.c (working copy)
> @@ -891,7 +891,7 @@ afdo_find_equiv_class (void)
>if (bb->aux != NULL)
>   continue;
>bb->aux = bb;
> -  dom_bbs = get_dominated_by (CDI_DOMINATORS, bb);
> +  dom_bbs = get_all_dominated_blocks (CDI_DOMINATORS, bb);
>FOR_EACH_VEC_ELT (dom_bbs, i, bb1)
>   if (bb1->aux == NULL
>  && dominated_by_p (CDI_POST_DOMINATORS, bb, bb1)
> @@ -904,7 +904,7 @@ afdo_find_equiv_class (void)
>   bb->flags |= BB_ANNOTATED;
>}
>}
> -  dom_bbs = get_dominated_by (CDI_POST_DOMINATORS, bb);
> +  dom_bbs = get_all_dominated_blocks (CDI_POST_DOMINATORS, bb);
>FOR_EACH_VEC_ELT (dom_bbs, i, bb1)
>   if (bb1->aux == NULL
>  && dominated_by_p (CDI_DOMINATORS, bb, bb1)


[GOOGLE] Represent the callsite with lineno only for AutoFDO

2013-10-09 Thread Dehao Chen
In legacy AutoFDO, callsite is represented as a (lineno, callee_name)
pair because there could be multiple calls in one line. However, as we
enhanced the debug info by assigning discriminators for each function
call in the same line, callee_name is not needed when indexing the
callsite.

This patch will improve AutoFDO performance because some functions
have aliases (e.g. constructors/destructors), which could make the
function name matching fail.

Bootstrapped and passed regression test.

OK for google-4_8 branch?
Dehao

Index: gcc/auto-profile.c
===
--- gcc/auto-profile.c (revision 203331)
+++ gcc/auto-profile.c (working copy)
@@ -174,10 +174,8 @@ class function_instance {
   const function_instance *get_function_instance_by_decl (unsigned lineno,
   tree decl) const;

-  /* Callsite, represented as (decl_lineno, callee_function_name_index).  */
-  typedef std::pair callsite;
-  /* Map from callsite to callee function_instance.  */
-  typedef std::map callsite_map;
+  /* Map from callsite decl_lineno to callee function_instance.  */
+  typedef std::map callsite_map;
   /* Map from source location (decl_lineno) to profile (count_info).  */
   typedef std::map position_count_map;

@@ -430,8 +428,7 @@ const function_instance *function_instance::get_fu
   int func_name_idx = afdo_function_name_map->get_index_by_decl (decl);
   if (func_name_idx != -1)
 {
-  callsite_map::const_iterator ret = callsites.find (
-  std::make_pair (lineno, func_name_idx));
+  callsite_map::const_iterator ret = callsites.find (lineno);
   if (ret != callsites.end ())
  return ret->second;
 }
@@ -439,8 +436,7 @@ const function_instance *function_instance::get_fu
   lang_hooks.dwarf_name (decl, 0));
   if (func_name_idx != -1)
 {
-  callsite_map::const_iterator ret = callsites.find (
-  std::make_pair (lineno, func_name_idx));
+  callsite_map::const_iterator ret = callsites.find (lineno);
   if (ret != callsites.end ())
  return ret->second;
 }
@@ -500,10 +496,7 @@ const function_instance *function_instance::read_f
 }
   for (unsigned i = 0; i < num_callsites; i++) {
 unsigned offset = gcov_read_unsigned ();
-const function_instance *callee_function_instance =
- read_function_instance (stack, 0);
-s->callsites[std::make_pair (offset, callee_function_instance->name ())] =
- callee_function_instance;
+s->callsites[offset] = read_function_instance (stack, 0);
   }
   stack->pop_back();
   return s;


Go patch committed: A type conversion of a constant need not be const

2013-10-09 Thread Ian Lance Taylor
In Go a type conversion of a constant is not necessarily itself a
constant, as in []byte(nil).  This patch fixes the Go frontend to
recognize that.  Bootstrapped and ran Go testsuite on
x86_64-unknown-linux-gnu.  Committed to mainline.  Will commit to 4.8
branch when it reopens.

Ian

diff -r a6f45afcc8a6 go/expressions.cc
--- a/go/expressions.cc	Wed Oct 09 15:28:49 2013 -0700
+++ b/go/expressions.cc	Wed Oct 09 17:02:02 2013 -0700
@@ -3055,8 +3055,7 @@
   do_lower(Gogo*, Named_object*, Statement_inserter*, int);
 
   bool
-  do_is_constant() const
-  { return this->expr_->is_constant(); }
+  do_is_constant() const;
 
   bool
   do_numeric_constant_value(Numeric_constant*) const;
@@ -3198,6 +3197,27 @@
   return this;
 }
 
+// Return whether a type conversion is a constant.
+
+bool
+Type_conversion_expression::do_is_constant() const
+{
+  if (!this->expr_->is_constant())
+return false;
+
+  // A conversion to a type that may not be used as a constant is not
+  // a constant.  For example, []byte(nil).
+  Type* type = this->type_;
+  if (type->integer_type() == NULL
+  && type->float_type() == NULL
+  && type->complex_type() == NULL
+  && !type->is_boolean_type()
+  && !type->is_string_type())
+return false;
+
+  return true;
+}
+
 // Return the constant numeric value if there is one.
 
 bool


  1   2   >