[og7, nvptx, openacc, PR85381, committed] Don't emit barriers for empty loops

2018-04-21 Thread Tom de Vries

Hi,

when compiling this testcase with the og7 branch:
...
int
main (void)
{
  long long v1;
#pragma acc parallel num_gangs (640) num_workers(1) vector_length (128)
#pragma acc loop
  for (v1 = 0; v1 < 20; v1 += 2)
;

  return 0;
}
...

this ptx is generated:
...
{
  // fork 4; 


  bar.sync 0;
  // forked 4; 

  // joining 4; 


  bar.sync 0;
  // join 4; 


  ret;
}
...

This triggers some bug on my quadro m1200 (I'm assuming in the ptxas/JIT 
compiler) that hangs the testcase. I can work around this by adding a 
membar.cta before the bar.sync, or two membar.ctas in between, but I'm not 
really sure what a minimal workaround should look like (I reported the 
bug to nvidia, I'm hoping for them to answer that question).


This patch works around the bug by doing an optimization: we detect that 
this is an empty loop (a forked immediately followed by a joining), and 
don't emit the barriers.


Built x86_64 with nvptx accelerator and tested libgomp.

Committed to og7 branch.

Thanks,
- Tom
[nvptx, openacc] Don't emit barriers for empty loops

2018-04-21  Tom de Vries  

	PR target/85381
	* config/nvptx/nvptx.c (nvptx_process_pars): Don't emit barriers for
	empty loops.

	* testsuite/libgomp.oacc-c-c++-common/pr85381-2.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85381-3.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85381-4.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85381-5.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85381.c: New test.

---
 gcc/config/nvptx/nvptx.c   | 15 +++---
 .../libgomp.oacc-c-c++-common/pr85381-2.c  | 35 ++
 .../libgomp.oacc-c-c++-common/pr85381-3.c  | 34 +
 .../libgomp.oacc-c-c++-common/pr85381-4.c  | 26 
 .../libgomp.oacc-c-c++-common/pr85381-5.c  | 23 ++
 .../testsuite/libgomp.oacc-c-c++-common/pr85381.c  | 17 +++
 6 files changed, 146 insertions(+), 4 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 8c478c8..3aee9cc 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4467,9 +4467,12 @@ nvptx_process_pars (parallel *par)
 {
   nvptx_shared_propagate (false, is_call, par->forked_block,
 			  par->forked_insn, !worker);
-  bool empty = nvptx_shared_propagate (true, is_call,
-	   par->forked_block, par->fork_insn,
-	   !worker);
+  bool no_prop_p
+	= nvptx_shared_propagate (true, is_call, par->forked_block,
+  par->fork_insn, !worker);
+  bool empty_loop_p
+	= !is_call && (NEXT_INSN (par->forked_insn)
+		   && NEXT_INSN (par->forked_insn) == par->joining_insn);
   rtx barrier = GEN_INT (0);
   int threads = 0;
 
@@ -4479,7 +4482,11 @@ nvptx_process_pars (parallel *par)
 	  threads = nvptx_mach_vector_length ();
 	}
 
-  if (!empty || !is_call)
+  if (no_prop_p && empty_loop_p)
+	;
+  else if (no_prop_p && is_call)
+	;
+  else
 	{
 	  /* Insert begin and end synchronizations.  */
 	  emit_insn_before (nvptx_cta_sync (barrier, threads),
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-2.c
new file mode 100644
index 000..e5d02cf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-2.c
@@ -0,0 +1,35 @@
+/* { dg-additional-options "-save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+int
+main (void)
+{
+  int v1;
+
+  #pragma acc parallel
+  #pragma acc loop worker
+  for (v1 = 0; v1 < 20; v1 += 2)
+;
+
+  return 0;
+}
+
+/* Todo: Boths bar.syncs can be removed.
+   Atm we generate this dead code inbetween forked and joining:
+
+ mov.u32 %r28, %ntid.y;
+ mov.u32 %r29, %tid.y;
+ add.u32 %r30, %r29, %r29;
+ setp.gt.s32 %r31, %r30, 19;
+ @%r31   bra $L2;
+ add.u32 %r25, %r28, %r28;
+ mov.u32 %r24, %r30;
+ $L3:
+ add.u32 %r24, %r24, %r25;
+ setp.le.s32 %r33, %r24, 19;
+ @%r33   bra $L3;
+ $L2:
+
+   so the loop is not recognized as empty loop (which we detect by seeing if
+   joining immediately follows forked).  */
+/* { dg-final { scan-assembler-times "bar.sync" 2 } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-3.c
new file mode 100644
index 000..7d9ba1b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85381-3.c
@@ -0,0 +1,34 @@
+/* { dg-additional-options "-save-temps -w" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+int a;
+#pragma acc declare create(a)
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+foo_v (void)
+{
+  a = 1;
+}
+
+#pragma acc routine worker
+void __attribute__((no

[PATCH] x86: Update __CET__ check

2018-04-21 Thread H.J. Lu
__CET__ has been changed by revision 259522:

commit d59cfa9a4064339cf2bd2da828c4c133f13e57f0
Author: hjl 
Date:   Fri Apr 20 13:30:13 2018 +

Define __CET__ for -fcf-protection and remove -mibt

to

(__CET__ & 1) != 0: -fcf-protection=branch or -fcf-protection=full
(__CET__ & 2) != 0: -fcf-protection=return or -fcf-protection=full

We should check (__CET__ & 2) != 0 for shadow stack.

OK for trunk?


H.J.
---
libgcc/

* config/i386/linux-unwind.h: Add (__CET__ & 2) != 0 check
when including "config/i386/shadow-stack-unwind.h".

libitm/

* config/x86/sjlj.S (_ITM_beginTransaction): Add
(__CET__ & 2) != 0 check for shadow stack.
(GTM_longjmp): Likewise.
---
 libgcc/config/i386/linux-unwind.h | 2 +-
 libitm/config/x86/sjlj.S  | 8 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/libgcc/config/i386/linux-unwind.h 
b/libgcc/config/i386/linux-unwind.h
index f1f52334d8d..ea838e4e47b 100644
--- a/libgcc/config/i386/linux-unwind.h
+++ b/libgcc/config/i386/linux-unwind.h
@@ -23,7 +23,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 .  */
 
 /* Unwind shadow stack for -fcf-protection -mshstk.  */
-#if defined __SHSTK__ && defined __CET__
+#if defined __SHSTK__ && defined __CET__ && (__CET__ & 2) != 0
 # include "config/i386/shadow-stack-unwind.h"
 #endif
 
diff --git a/libitm/config/x86/sjlj.S b/libitm/config/x86/sjlj.S
index ee61451394a..3cac9099c9e 100644
--- a/libitm/config/x86/sjlj.S
+++ b/libitm/config/x86/sjlj.S
@@ -141,7 +141,7 @@ SYM(_ITM_beginTransaction):
movq%r15, -24(%rax)
xorq%rdx, %rdx
/* Save zero or shadow stack pointer in the new field.  */
-#if defined(__SHSTK__) && defined(__CET__)
+#if defined __SHSTK__ && defined __CET__ && (__CET__ & 2) != 0
rdsspq  %rdx
 #endif
movq%rdx, -16(%rax)
@@ -177,7 +177,7 @@ SYM(_ITM_beginTransaction):
movl%ebp, 20(%esp)
xorl%edx, %edx
/* Save zero or shadow stack pointer in the new field.  */
-#if defined(__SHSTK__) && defined(__CET__)
+#if defined __SHSTK__ && defined __CET__ && (__CET__ & 2) != 0
rdsspd  %edx
 #endif
movl%edx, 24(%esp)
@@ -221,7 +221,7 @@ SYM(GTM_longjmp):
cfi_offset(%rip, 64)
cfi_register(%rsp, %rcx)
movq%rcx, %rsp
-#if defined(__SHSTK__) && defined(__CET__)
+#if defined __SHSTK__ && defined __CET__ && (__CET__ & 2) != 0
/* Check if Shadow Stack is enabled.  */
xorq%rcx, %rcx
rdsspq  %rcx
@@ -259,7 +259,7 @@ SYM(GTM_longjmp):
cfi_offset(%eip, 24)
cfi_register(%esp, %ecx)
movl%ecx, %esp
-#if defined(__SHSTK__) && defined(__CET__)
+#if defined __SHSTK__ && defined __CET__ && (__CET__ & 2) != 0
/* Check if Shadow Stack is enabled.  */
xorl%ecx, %ecx
rdsspd  %ecx
-- 
2.14.3



RE: [PATCH][i386] PR target/85473, Fix _movdir64b expansion with -mx32

2018-04-21 Thread Peryt, Sebastian
Hi,

I just realized this patch introduces some new regressions.

Sorry, I must have mixed up something in testing. Will update this patch 
shortly.

Sebastian

> -Original Message-
> From: Peryt, Sebastian
> Sent: Friday, April 20, 2018 6:38 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Uros Bizjak ; Kirill Yukhin ;
> H.J. Lu ; Peryt, Sebastian 
> Subject: [PATCH][i386] PR target/85473, Fix _movdir64b expansion with -mx32
> 
> Hi,
> 
> This fixes PR85473 by fixing _movdir64b expansion for -mx32.
> 
> Ok for trunk?
> 
> 2018-04-20  Sebastian Peryt  
> 
> gcc/ChangeLog:
> 
>   PR target/85473
>   * config/i386/i386.c (ix86_expand_builtin): Change memory
>   operand to XI, op0 extend to Pmode.
>   * config/i386/i386.md: Change unspec volatile and operand 1
>   mode to XI, change operand 0 mode to P
> 
> 2018-04-20  Sebastian Peryt  
> 
> gcc/testsuite/ChangeLog:
> 
>   PR target/85473
>   * gcc.target/i386/pr85473-1.c: New test.
>   * gcc.target/i386/pr85473-2.c: New test.
> 
> Sebastian
> 



[PATCH] Add constant folding support for next{after,toward}{,f,l} (PR libstdc++/85466)

2018-04-21 Thread Jakub Jelinek
Hi!

This patch adds constant folding for next{after,toward}{,f,l}.
It doesn't handle decimal (we don't have any builtins that would need it),
nor composite modes (IBM double double; nextafter/nexttoward for variable
precision types isn't really well defined; we handle nexttoward where long
double is IBM double double) and for now punts also on formats without
denormals or infinities (don't really know what the library nextafter does
for those).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for stage1?

2018-04-21  Jakub Jelinek  

PR libstdc++/85466
* real.h (real_nextafter): Declare.
* real.c (real_nextafter): New function.
* fold-const-call.c (fold_const_nextafter): New function.
(fold_const_call_sss): Call it for CASE_CFN_NEXTAFTER and
CASE_CFN_NEXTTOWARD.
(fold_const_call_1): For CASE_CFN_NEXTTOWARD call fold_const_call_sss
even when arg1_mode is different from arg0_mode.

* gcc.dg/nextafter-1.c: New test.
* gcc.dg/nextafter-2.c: New test.
* gcc.dg/nextafter-3.c: New test.
* gcc.dg/nextafter-4.c: New test.

--- gcc/real.h.jj   2018-01-03 10:19:54.349533828 +0100
+++ gcc/real.h  2018-04-20 12:44:30.707350855 +0200
@@ -507,6 +507,10 @@ extern void real_copysign (REAL_VALUE_TY
 extern bool real_isinteger (const REAL_VALUE_TYPE *, format_helper);
 extern bool real_isinteger (const REAL_VALUE_TYPE *, HOST_WIDE_INT *);
 
+/* Calculate nextafter (X, Y) in format FMT.  */
+extern bool real_nextafter (REAL_VALUE_TYPE *, format_helper,
+   const REAL_VALUE_TYPE *, const REAL_VALUE_TYPE *);
+
 /* Write into BUF the maximum representable finite floating-point
number, (1 - b**-p) * b**emax for a given FP format FMT as a hex
float string.  BUF must be large enough to contain the result.  */
--- gcc/real.c.jj   2018-01-03 10:19:55.003533933 +0100
+++ gcc/real.c  2018-04-21 19:29:09.103584254 +0200
@@ -5048,6 +5048,102 @@ real_isinteger (const REAL_VALUE_TYPE *c
   return false;
 }
 
+/* Calculate nextafter (X, Y) or nexttoward (X, Y).  Return true if
+   underflow or overflow needs to be raised.  */
+
+bool
+real_nextafter (REAL_VALUE_TYPE *r, format_helper fmt,
+   const REAL_VALUE_TYPE *x, const REAL_VALUE_TYPE *y)
+{
+  int cmp = do_compare (x, y, 2);
+  /* If either operand is NaN, return qNaN.  */
+  if (cmp == 2)
+{
+  get_canonical_qnan (r, 0);
+  return false;
+}
+  /* If x == y, return y cast to target type.  */
+  if (cmp == 0)
+{
+  real_convert (r, fmt, y);
+  return false;
+}
+
+  if (x->cl == rvc_zero)
+{
+  get_zero (r, y->sign);
+  r->cl = rvc_normal;
+  SET_REAL_EXP (r, fmt->emin - fmt->p + 1);
+  r->sig[SIGSZ - 1] = SIG_MSB;
+  return false;
+}
+
+  int np2 = SIGNIFICAND_BITS - fmt->p;
+  /* For denormals adjust np2 correspondingly.  */
+  if (x->cl == rvc_normal && REAL_EXP (x) < fmt->emin)
+np2 += fmt->emin - REAL_EXP (x);
+
+  REAL_VALUE_TYPE u;
+  get_zero (r, x->sign);
+  get_zero (&u, 0);
+  set_significand_bit (&u, np2);
+  r->cl = rvc_normal;
+  SET_REAL_EXP (r, REAL_EXP (x));
+
+  if (x->cl == rvc_inf)
+{
+  bool borrow = sub_significands (r, r, &u, 0);
+  gcc_assert (borrow);
+  SET_REAL_EXP (r, fmt->emax);
+}
+  else if (cmp == (x->sign ? 1 : -1))
+{
+  if (add_significands (r, x, &u))
+   {
+ /* Overflow.  Means the significand had been all ones, and
+is now all zeros.  Need to increase the exponent, and
+possibly re-normalize it.  */
+ SET_REAL_EXP (r, REAL_EXP (r) + 1);
+ if (REAL_EXP (r) > fmt->emax)
+   {
+ get_inf (r, x->sign);
+ return true;
+   }
+ r->sig[SIGSZ - 1] = SIG_MSB;
+   }
+}
+  else
+{
+  if (REAL_EXP (x) > fmt->emin && x->sig[SIGSZ - 1] == SIG_MSB)
+   {
+ int i;
+ for (i = SIGSZ - 2; i >= 0; i--)
+   if (x->sig[i])
+ break;
+ if (i < 0)
+   {
+ /* When mantissa is 1.0, we need to subtract only
+half of u: nextafter (1.0, 0.0) is 1.0 - __DBL_EPSILON__ / 2
+rather than 1.0 - __DBL_EPSILON__.  */
+ clear_significand_bit (&u, np2);
+ np2--;
+ set_significand_bit (&u, np2);
+   }
+   }
+  sub_significands (r, x, &u, 0);
+}
+
+  /* Clear out trailing garbage.  */
+  clear_significand_below (r, np2);
+  normalize (r);
+  if (REAL_EXP (r) <= fmt->emin - fmt->p)
+{
+  get_zero (r, x->sign);
+  return true;
+}
+  return r->cl == rvc_zero;
+}
+
 /* Write into BUF the maximum representable finite floating-point
number, (1 - b**-p) * b**emax for a given FP format FMT as a hex
float string.  LEN is the size of BUF, and the buffer must be large
--- gcc/fold-const-call.c.jj2018-01-14 17:16:52.873836266 +0100
+++ gcc/fold-const-call.c   20