Re: [gomp4.1] handle undeclared sink variables gracefully

2015-07-17 Thread Jakub Jelinek
On Thu, Jul 16, 2015 at 05:47:09PM -0700, Aldy Hernandez wrote:
> commit 6ec528841cee875cfd0bcac0e35f5a6db1df0f6b
> Author: Aldy Hernandez 
> Date:   Thu Jul 16 16:38:19 2015 -0700
> 
> c/
>   * c-parser.c (c_parser_omp_clause_depend_sink): Handle multiple
>   undeclared sink variables gracefully.
> cp/
>   * parser.c (cp_parser_omp_clause_depend_sink): Handle multiple
>   undeclared sink variables gracefully.
> testsuite/
>   * c-c++-common/gomp/sink-3.c: New test.

Ok, thanks.

Jakub


[PATCH, i386]: Improve FP moves through integer registers

2015-07-17 Thread Uros Bizjak
If the target only has integer registers, we don't have to go through
delicately tuned alternatives, balanced between x87 regs, SSE regs and
integer regs. Attached patch introduces simple and effective
alternatives in this case.

Testcases will be provided later by HJ.

2015-07-17  Uros Bizjak  

PR target/66824
* config/i386/i386.h (TARGET_HARD_SF_REGS): New define.
(TARGET_HARD_DF_REGS): Ditto.
(TARGET_HARD_XF_REGS): Ditto.
* config/i386/i386.md (*movxf_internal): Add alternatives 9 and 10.
Enable alternatives 9 and 10 only for !TARGET_HARD_XF_REGS target.
(*movdf_internal): Add alternatives 22, 23, 24 and 25. Enable
alternatives 22, 23, 24 and 25 only for !TARGET_HARD_DF_REGS target.
(*movsf_internal): Add alternatives 16 and 17. Enable
alternatives 16 and 17 only for !TARGET_HARD_SF_REGS target.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32} and
committed to mainline.

Uros.
Index: config/i386/i386.h
===
--- config/i386/i386.h  (revision 225857)
+++ config/i386/i386.h  (working copy)
@@ -524,6 +524,10 @@
 #define TARGET_MIX_SSE_I387 \
  ((ix86_fpmath & (FPMATH_SSE | FPMATH_387)) == (FPMATH_SSE | FPMATH_387))
 
+#define TARGET_HARD_SF_REGS (TARGET_80387 || TARGET_MMX || TARGET_SSE)
+#define TARGET_HARD_DF_REGS (TARGET_80387 || TARGET_SSE)
+#define TARGET_HARD_XF_REGS (TARGET_80387)
+
 #define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU)
 #define TARGET_GNU2_TLS (ix86_tls_dialect == TLS_DIALECT_GNU2)
 #define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS)
Index: config/i386/i386.md
===
--- config/i386/i386.md (revision 225857)
+++ config/i386/i386.md (working copy)
@@ -3087,9 +3087,9 @@
 ;; in alternatives 4, 6, 7 and 8.
 (define_insn "*movxf_internal"
   [(set (match_operand:XF 0 "nonimmediate_operand"
-"=f,m,f,?r ,!o,?*r ,!o,!o,!o")
+"=f,m,f,?r ,!o,?*r ,!o,!o,!o,r  ,o")
(match_operand:XF 1 "general_operand"
-"fm,f,G,roF,r , *roF,*r,F ,C"))]
+"fm,f,G,roF,r , *roF,*r,F ,C,roF,rF"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& (!can_create_pseudo_p ()
|| (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
@@ -3098,7 +3098,8 @@
   && standard_80387_constant_p (operands[1]) > 0
   && !memory_operand (operands[0], XFmode))
|| (!TARGET_MEMORY_MISMATCH_STALL
-  && memory_operand (operands[0], XFmode)))"
+  && memory_operand (operands[0], XFmode))
+   || !TARGET_HARD_XF_REGS)"
 {
   switch (get_attr_type (insn))
 {
@@ -3122,12 +3123,12 @@
  ]
  (const_string "*")))
(set (attr "type")
-   (cond [(eq_attr "alternative" "3,4,5,6,7,8")
+   (cond [(eq_attr "alternative" "3,4,5,6,7,8,9,10")
 (const_string "multi")
  ]
  (const_string "fmov")))
(set (attr "mode")
-   (cond [(eq_attr "alternative" "3,4,5,6,7,8")
+   (cond [(eq_attr "alternative" "3,4,5,6,7,8,9,10")
 (if_then_else (match_test "TARGET_64BIT")
   (const_string "DI")
   (const_string "SI"))
@@ -3136,14 +3137,24 @@
(set (attr "preferred_for_size")
  (cond [(eq_attr "alternative" "3,4")
   (symbol_ref "false")]
-   (symbol_ref "true")))])
+   (symbol_ref "true")))
+   (set (attr "enabled")
+ (cond [(eq_attr "alternative" "9,10")
+  (if_then_else
+   (match_test "TARGET_HARD_XF_REGS")
+   (symbol_ref "false")
+   (const_string "*"))
+(not (match_test "TARGET_HARD_XF_REGS"))
+ (symbol_ref "false")
+  ]
+  (const_string "*")))])

 ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
 (define_insn "*movdf_internal"
   [(set (match_operand:DF 0 "nonimmediate_operand"
-"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,Yi")
+"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,Yi,r 
 ,o ,r  ,m")
(match_operand:DF 1 "general_operand"
-"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,Yj,r"))]
+"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,Yj,r 
,roF,rF,rmF,rC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& (!can_create_pseudo_p ()
|| (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
@@ -3155,7 +3166,8 @@
   && standard_sse_constant_p (operands[1])))
   && !memory_operand (operands[0], DFmode))
|| ((TARGET_64BIT || !TARGET_MEMORY_MISMATCH_STALL)
-  && memory_operand (operands[0], DFmode)))"
+  && memory_operand (operands[0], DFmode))
+   || !TARGET_HARD_DF_REGS)"
 {
   switch (get_attr_type (insn))
 {
@@ -3216,9 +3228,9 @@
 }
 }
   [(set (attr "isa")
-   

Re: [PATCH 2/2] Set REG_EQUAL

2015-07-17 Thread Kugan
Ping?


On 28/06/15 21:30, Kugan wrote:
> This patch sets REG_EQUAL when emitting arm_emit_movpair.
> 
> Thanks,
> Kugan
> 
> gcc/testsuite/ChangeLog:
> 
> 2015-06-26  Kugan Vivekanandarajah  
> 
>   * gcc.target/arm/reg_equal_test.c: New test.
> 
> gcc.
> 
> 2015-06-26  Kugan Vivekanandarajah  
> 
>   * config/arm/arm.c (arm_emit_movpair): Add REG_EQUAL notes to
>   instruction.
> 


Re: [C/C++ PATCH] Implement -Wshift-overflow (PR c++/55095) (take 3)

2015-07-17 Thread Marek Polacek
Ping^6.

On Fri, Jul 10, 2015 at 03:23:43PM +0200, Marek Polacek wrote:
> Ping^5.
> 
> On Fri, Jul 03, 2015 at 09:42:39AM +0200, Marek Polacek wrote:
> > Ping^4.
> > 
> > On Fri, Jun 26, 2015 at 10:08:51AM +0200, Marek Polacek wrote:
> > > I'm pinging the C++ parts.
> > > 
> > > On Fri, Jun 19, 2015 at 12:44:36PM +0200, Marek Polacek wrote:
> > > > Ping.
> > > > 
> > > > On Fri, Jun 12, 2015 at 11:07:29AM +0200, Marek Polacek wrote:
> > > > > Ping.
> > > > > 
> > > > > On Fri, Jun 05, 2015 at 10:55:08AM +0200, Marek Polacek wrote:
> > > > > > On Thu, Jun 04, 2015 at 09:04:19PM +, Joseph Myers wrote:
> > > > > > > The C changes are OK.
> > > > > > 
> > > > > > Jason, do you want to approve the C++ parts?

Marek


Re: [PATCH 2/2] Set REG_EQUAL

2015-07-17 Thread Kyrill Tkachov

Hi Kugan,

On 28/06/15 12:30, Kugan wrote:

This patch sets REG_EQUAL when emitting arm_emit_movpair.

Thanks,
Kugan

gcc/testsuite/ChangeLog:

2015-06-26  Kugan Vivekanandarajah  

* gcc.target/arm/reg_equal_test.c: New test.

gcc.

2015-06-26  Kugan Vivekanandarajah  

* config/arm/arm.c (arm_emit_movpair): Add REG_EQUAL notes to
instruction.


This is ok for trunk.
Sorry for the delay.
Can you please re-test this on arm and commit if all is clean?

Thanks,
Kyrill




Re: [PATCH v3] [AArch64] PR63870 Improve error messages for NEON single lane memory access intrinsics

2015-07-17 Thread James Greenhalgh
On Fri, Jun 26, 2015 at 08:14:55PM +0100, Charles Baylis wrote:
> Since the last ping, I've tweaked the test cases a bit...
> 
> Since I've been working on doing the same changes for the ARM backend,
> I've moved the tests into the advsimd-intrinsics directory, marked as
> XFAIL for ARM targets for now. The gcc/ part of the patch is
> unchanged.

Hi Charles,

This patch looks OK to me, though please fix the whitespace nits called
out below:

> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3959,7 +3962,7 @@
>  (define_insn "vec_store_lanesoi_lane"
>[(set (match_operand: 0 "aarch64_simd_struct_operand" "=Utv")
>   (unspec: [(match_operand:OI 1 "register_operand" "w")
> -(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
> +(unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
>   (match_operand:SI 2 "immediate_operand" "i")]
> UNSPEC_ST2_LANE))]

8 Spaces to tab.

>"TARGET_SIMD"
> @@ -3967,7 +3970,7 @@
>  operands[2] = GEN_INT (ENDIAN_LANE_N (mode, INTVAL (operands[2])));
>  return "st2\\t{%S1. - %T1.}[%2], %0";
>}
> -  [(set_attr "type" "neon_store3_one_lane")]
> +  [(set_attr "type" "neon_store2_one_lane")]

I would prefer this in a separate patch as it is a separate logical
change. Consider it pre-approved (and obvious) to commit as a one-line
fix on its own.

> @@ -4054,7 +4060,7 @@
>  (define_insn "vec_store_lanesci_lane"
>[(set (match_operand: 0 "aarch64_simd_struct_operand" "=Utv")
>   (unspec: [(match_operand:CI 1 "register_operand" "w")
> -(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
> +(unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
>   (match_operand:SI 2 "immediate_operand" "i")]
> UNSPEC_ST3_LANE))]

8 Spaces to tab.

> @@ -4149,7 +4158,7 @@
>  (define_insn "vec_store_lanesxi_lane"
>[(set (match_operand: 0 "aarch64_simd_struct_operand" "=Utv")
>   (unspec: [(match_operand:XI 1 "register_operand" "w")
> -(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
> +(unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
>   (match_operand:SI 2 "immediate_operand" "i")]

8 Spaces to tab.

> diff --git 
> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f32_indices_1.c
>  
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f32_indices_1.c
> new file mode 100644
> index 000..04be713
> --- /dev/null
> +++ 
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f32_indices_1.c
> @@ -0,0 +1,16 @@
> +#include 
> +
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */

This seems an odd limitation, presumably this is a side effect of waiting
until expand time to throw an error... It does suggest that we're tackling
the problem in the wrong way by pushing this to so late in the compilation
pipeline. The property here is on a type itself, which must take a constant
value within a given range. That feels much more like the sort of thing
we should be detecting and bailing out on closer to the front-end - perhaps
with a more generic extension allowing you to annotate any type with an
expected/required range (both as a helping hand for VRP and as a way to
express programmer defined preconditions).

But, given that adding such an extension is likely more effort than needed
I think this is OK for now!

Cheers,
James



Re: [gomp] Fix PTX worker spill/fill

2015-07-17 Thread Thomas Schwinge
Hi!

On Thu, 16 Jul 2015 12:23:52 -0400, Nathan Sidwell  wrote:
> I've committed this patch to fix a bug in the worker spill/fill code.  We 
> ended 
> up not incrementing the pointer, resulting in the stack frame being filled 
> with 
> the same value.
> 
> Thanks to Jim for finding the failure.

> --- config/nvptx/nvptx.c  (revision 225831)
> +++ config/nvptx/nvptx.c  (working copy)
> @@ -1257,7 +1257,7 @@ nvptx_gen_wcast (rtx reg, propagate_mask
>   
>   emit_insn (res);
>   emit_insn (gen_adddi3 (data->ptr, data->ptr,
> -GEN_INT (GET_MODE_SIZE (GET_MODE (res)))));
> +GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
>   res = get_insns ();
>   end_sequence ();
> }

Nice; this is actually the change to resolve the FAIL for
libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c I had reported in
.
(The testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.c regression
reported earlier in that thread remains to be addressed.)  Committed to
gomp-4_0-branch in r225922:

commit 7961bf7049729aebadf639a52174be14010da499
Author: tschwinge 
Date:   Fri Jul 17 08:30:10 2015 +

libgomp: Remove XFAIL libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c 
for acc_device_nvidia

Problem got addressed in r225896.

libgomp/
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c:
Remove XFAIL.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225922 
138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog.gomp|5 +
 .../libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c  |3 ---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index b2e4b2c..0293ad5 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,3 +1,8 @@
+2015-07-17  Thomas Schwinge  
+
+   * testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c:
+   Remove XFAIL.
+
 2015-07-15  Nathan Sidwell  
 
* plugin/plugin-nvptx.c (nvptx_exec): Show grid dimensions in
diff --git 
libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c
index a710849..b070773 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c
@@ -1,6 +1,3 @@
-/* main: Assertion `arr[i] == i + (i % 8) * 2' failed.
-   { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } 
*/
-
 #include 
 
 /* Test of gang-private array variable declared on loop directive, with


Grüße,
 Thomas


signature.asc
Description: PGP signature


Re: [PATCH] [4/14] Completes renaming of configure.in files to .ac

2015-07-17 Thread Michael Darling
Perhaps the best solution is both sets of patches.

Yours to modify the build system so it can work with either extension,
old versions, and other imported packages.

Mine to complete binutils-gdb and gcc moving from configure.in to .ac
extension.  There have been several commits spread out across the last
year slowly moving the migration along.  People have been slowly
changing the extension over in their areas for the past year, but not
getting everything moved over at once.

Finish it up, have consistency, use the "preferred" extension, prevent
the warning about old ".in" extensions, make it robust so it works
with the old extension, and get combined builds working again.

On Fri, Jul 17, 2015 at 6:43 AM, Jan Beulich  wrote:
 On 17.07.15 at 06:26,  wrote:
>> Which imported packages use configure.in?  I'm happy to submit patches
>> for those, too.
>
> The answer to this may not even matter - consuming components
> (like gcc is in respect to binutils) shouldn't assume only the newer
> name is used: It should remain to be possible to build with older
> versions. I.e. you always have to check for both .ac and .in when
> looking for a file.
>
> Jan
>


Re: [PATCH][AArch64][1/14] Add ident field to struct processor

2015-07-17 Thread James Greenhalgh
On Thu, Jul 16, 2015 at 04:20:30PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> This first patch adds a field to the processor structure that uniquely
> identifies that processor.  Note that the current 'core' field is actually
> just the core for which to schedule the instructions.  With this patch we get
> the nice property that we can reference a processor struct by just indexing
> the all_cores at the index specified by the value of the 'ident' enum.  It's
> not hard to implement either, since we already construct the required enum
> values in aarch64-opts.h and aarch64-cores.def already specifies the correct
> values for each core!
> 
> Thus, to implement the 'back up and restore' functionality we need for
> SWITCHABLE_TARGET the only thing we'd need to save and restore on the tuning
> side is an aarch64_processor enum value.
> 
> Bootstrapped with and without LTO and tested on aarch64 as part of series.
> 
> Ok for trunk?

OK.

Thanks,
James

> 2015-07-16  Kyrylo Tkachov  
> 
>  * config/aarch64/aarch64.c (struct processor): Add ident field.
>  Rename core sched_core.
>  (all_cores): Handle above changes.
>  (all_architectures): Likewise.
>  (aarch64_parse_arch): Likewise.
>  (aarch64_override_options): Likewise.



Re: [Fortran, Patch] Passing function pointer to co_reduce

2015-07-17 Thread Mikael Morin
Le 16/07/2015 16:34, Damian Rouson a écrit :
> Alternatively, if it’s easy, please feel free to add the directives and 
> commit.  
> 
It's not very difficult, but there is some pain getting the number of
escape backslashes right.
Never mind, I'll take care of it all.

Mikael


Re: [gomp] Fix PTX worker spill/fill

2015-07-17 Thread Thomas Schwinge
Hi!

On Thu, 16 Jul 2015 12:23:52 -0400, Nathan Sidwell  wrote:
> I've committed this patch to fix a bug in the worker spill/fill code.  We 
> ended 
> up not incrementing the pointer, resulting in the stack frame being filled 
> with 
> the same value.
> 
> Thanks to Jim for finding the failure.

Cesar had prepared a reduced test case, a slightly altered variant of
which I've now committed to gomp-4_0-branch in r225924:

commit ee7fb343a0d0dbd17ac8dc7d24048d8647e41232
Author: tschwinge 
Date:   Fri Jul 17 09:11:10 2015 +

OpenACC: Add test case for worker state propagation handling the stack frame

... for problem that got addressed in r225896.

libgomp/
* testsuite/libgomp.oacc-c-c++-common/worker-partn-8.c: New file.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225924 
138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog.gomp |  5 ++
 .../libgomp.oacc-c-c++-common/worker-partn-8.c | 53 ++
 2 files changed, 58 insertions(+)

diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index 0293ad5..ec943f5 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,4 +1,9 @@
 2015-07-17  Thomas Schwinge  
+   Cesar Philippidis  
+
+   * testsuite/libgomp.oacc-c-c++-common/worker-partn-8.c: New file.
+
+2015-07-17  Thomas Schwinge  
 
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c:
Remove XFAIL.
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-8.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-8.c
new file mode 100644
index 000..e787947
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-8.c
@@ -0,0 +1,53 @@
+/* { dg-additional-options "-O0" } */
+
+/* With -O0, variables are on the stack, not in registers.  Check that worker
+   state propagation handles the stack frame.  */
+
+int
+main (int argc, char *argv[])
+{
+  int w0 = 0;
+  int w1 = 0;
+  int w2 = 0;
+  int w3 = 0;
+  int w4 = 0;
+  int w5 = 0;
+  int w6 = 0;
+  int w7 = 0;
+
+  int i;
+
+#pragma acc parallel num_gangs (1) num_workers (8) copy (w0, w1, w2, w3, w4, 
w5, w6, w7)
+  {
+int internal = 100;
+
+#pragma acc loop worker
+for (i = 0; i < 8; i++)
+  {
+   switch (i)
+ {
+ case 0: w0 = internal; break;
+ case 1: w1 = internal; break;
+ case 2: w2 = internal; break;
+ case 3: w3 = internal; break;
+ case 4: w4 = internal; break;
+ case 5: w5 = internal; break;
+ case 6: w6 = internal; break;
+ case 7: w7 = internal; break;
+ default: break;
+ }
+  }
+  }
+
+  if (w0 != 100
+  || w1 != 100
+  || w2 != 100
+  || w3 != 100
+  || w4 != 100
+  || w5 != 100
+  || w6 != 100
+  || w7 != 100)
+__builtin_abort ();
+
+  return 0;
+}


Grüße,
 Thomas


signature.asc
Description: PGP signature


Re: [PATCH][AArch64][2/14] Refactor arches handling, add arch enum identifier

2015-07-17 Thread James Greenhalgh
On Thu, Jul 16, 2015 at 04:20:33PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> In this second patch I want to get to the point where I can get an enum that
> I can use to index all_architectures to get the current architecture being
> used, similar to what we
> do in patch 1/N.
> 
> The closest thing to what I want in aarch64-arches.def is the 3rd field which
> specifies the architecture revision. Unfortunately, it is used sometimes as
> an integer and sometimes as a string when defining the __ARM_ARCH macro in
> TARGET_CPU_CPP_BUILTINS.
> 
> I've decided to create a new field that is to be used as part of an enum name
> to uniquely identify each entry in aarch64-arches.def. The revision number
> (currently only '8') is left there since we need it for the ACLE predefs, but
> we might consider moving that out in the future...
> 
> In any case, with this patch we can now get an enum that can be used to
> access the architecture information from all_architectures and can be easily
> saved and restored for SWITCHABLE_TARGET functionality.
> 
> Bootstrapped with and without LTO and tested on aarch64 as part of the series.
> 
> Ok for trunk?

Ok.

Thanks,
James

> P.S. I think we should consider creating a separate struct definition for
> cores and architectures as the information we want to store about each starts
> to diverge and it's sometimes confusing as to what a 'struct processor*'
> pointer is referencing. But such a refactoring would interfere too much with
> what I'm trying to do in this patch series and is not strictly required for
> it. Although, once the dust settles on this series, I believe it will be
> easier to split them up.

I look forward to seeing the patch ;-)

James

> 
> 2015-07-16  Kyrylo Tkachov  
> 
>  * config/aarch64/aarch64.h (TARGET_CPU_CPP_BUILTINS): Define
>  __ARM_ARCH_8A directly rather than with cpp_define_formatted.
>  * config/aarch64/aarch64.c (struct processor): Add arch field.
>  (all_architectures): Handle above, move above all_cores.
>  (all_cores): Handle above.
>  (aarch64_parse_arch): Handle above changes.
>  * config/aarch64/aarch64-arches.def (armv8-a): Extend according to
>  above.  Update comments.
>  (armv8.1-a): Likewise.
>  * config/aarch64/aarch64-cores.def: Update according to above.
>  * config/aarch64/aarch64-opts.h (aarch64_arch): New enum.
>  * config/aarch64/driver-aarch64.c (struct aarch64_arch): Rename to
>  aarch64_arch_driver_info.



[PATCH] Fix partial template specialization syntax in wide-int.h

2015-07-17 Thread Mikhail Maltsev
Hi, all!
The following code (reduced from wide-int.h) is rejected by Intel C++ Compiler
(EDG-based):

$ cat genpreds1_min.cc
template 
class A;

template 
struct B;

template 
struct C;

template <>
template 
struct C  >
{
template
A  >
m_fn(T);
};

template 
template 
A  >
C  >::m_fn (T)
{
}

$ /opt/intel/bin/icpc -c genpreds1_min.cc
genpreds1_min.cc(22): error: incomplete type is not allowed
  C  >::m_fn (T)
  ^

genpreds1_min.cc(22): error: template argument list must match the parameter 
list
  C  >::m_fn (T)

Clang gives the following warning:

$ clang++ -c genpreds1_min.cc
genpreds1_min.cc:10:1: warning: extraneous template parameter list in template
specialization
template <>

I think that the warning is correct, and "template <>" should not be used here.
The attached patch should fix this issue. Bootstrapped and regtested on
x86_64-linux. OK for trunk?

-- 
Regards,
Mikhail Maltsev
diff --git a/gcc/wide-int.h b/gcc/wide-int.h
index d8f7b46..6e0275f 100644
--- a/gcc/wide-int.h
+++ b/gcc/wide-int.h
@@ -360,21 +360,18 @@ namespace wi
  inputs.  Note that CONST_PRECISION and VAR_PRECISION cannot be
  mixed, in order to give stronger type checking.  When both inputs
  are CONST_PRECISION, they must have the same precision.  */
-  template <>
   template 
   struct binary_traits 
   {
 typedef widest_int result_type;
   };
 
-  template <>
   template 
   struct binary_traits 
   {
 typedef wide_int result_type;
   };
 
-  template <>
   template 
   struct binary_traits 
   {
@@ -384,14 +381,12 @@ namespace wi
 			   ::precision> > result_type;
   };
 
-  template <>
   template 
   struct binary_traits 
   {
 typedef wide_int result_type;
   };
 
-  template <>
   template 
   struct binary_traits 
   {
@@ -401,7 +396,6 @@ namespace wi
 			   ::precision> > result_type;
   };
 
-  template <>
   template 
   struct binary_traits 
   {
@@ -412,7 +406,6 @@ namespace wi
 			   ::precision> > result_type;
   };
 
-  template <>
   template 
   struct binary_traits 
   {
@@ -876,7 +869,6 @@ generic_wide_int ::dump () const
 
 namespace wi
 {
-  template <>
   template 
   struct int_traits < generic_wide_int  >
 : public wi::int_traits 
@@ -955,7 +947,6 @@ inline wide_int_ref_storage ::wide_int_ref_storage (const T &x,
 
 namespace wi
 {
-  template <>
   template 
   struct int_traits  >
   {
@@ -1142,7 +1133,6 @@ public:
 
 namespace wi
 {
-  template <>
   template 
   struct int_traits < fixed_wide_int_storage  >
   {
gcc/ChangeLog:

2015-07-17  Mikhail Maltsev  

* wide-int.h (struct binary_traits): Fix partial specialization syntax.
(struct int_traits): Likewise.



[scalar-storage-order] Fix handling of complex types

2015-07-17 Thread Eric Botcazou
Complex types (mathematical sense) weren't correctly handled, in particular 
they were reversed as a whole instead of piecewise so a big-endian complex 
field on a little-endian target didn't have the same representation in memory 
as on a big-endian target.

Tested on x86_64-suse-linux, applied on the branch.


* expmed.c (flip_storage_order): Deal with complex modes specially.
Use the precision of the mode instead of its size to find an integral
counterpart.
(extract_bit_field_1): Deal with complex modes specially.
* expr.c (read_complex_part): Make public.
(get_inner_reference): Call reverse_storage_order_for_component_p.
(expand_expr_real_1): Allow early exit for CONCATs as well.
* expr.h (read_complex_part): Declare.
* tree-dfa.c (get_ref_base_and_extent): Call
reverse_storage_order_for_component_p.
* tree.h (reverse_storage_order_for_component_p): New inline predicate.
ada/
* freeze.adb (Check_Component_Storage_Order): Skip a record component
if it has Complex_Representation.
(Freeze_Record_Type): If the type has Complex_Representation, skip
the regular treatment of Scalar_Storage_Order attribute and instead
issue a warning if it is present.
testsuite/
* c-c++-common/sso/init13.h: New helper.
* c-c++-common/sso/p13.c: New test.
* c-c++-common/sso/q13.c: Likewise.
* c-c++-common/sso/t13.c: Likewise.
* gnat.dg/sso/init13.ads: New helper.
* gnat.dg/sso/p13.adb: New test.
* gnat.dg/sso/q13.adb: Likewise.
* gnat.dg/sso/t13.adb: Likewise.

-- 
Eric Botcazou
Index: tree.h
===
--- tree.h	(revision 225434)
+++ tree.h	(working copy)
@@ -4306,23 +4306,56 @@ handled_component_p (const_tree t)
 }
 }
 
-/* Return true if REF is a storage order barrier, i.e. a VIEW_CONVERT_EXPR
+/* Return true if T is a component with reverse storage order.  */
+
+static inline bool
+reverse_storage_order_for_component_p (tree t)
+{
+  /* The storage order only applies to scalar components.  */
+  if (AGGREGATE_TYPE_P (TREE_TYPE (t)))
+return false;
+
+  if (TREE_CODE (t) == REALPART_EXPR || TREE_CODE (t) == IMAGPART_EXPR)
+t = TREE_OPERAND (t, 0);
+
+  switch (TREE_CODE (t))
+{
+case ARRAY_REF:
+case COMPONENT_REF:
+  /* ??? Fortran can take COMPONENT_REF of a void type.  */
+  return !VOID_TYPE_P (TREE_TYPE (TREE_OPERAND (t, 0)))
+	 && TYPE_REVERSE_STORAGE_ORDER (TREE_TYPE (TREE_OPERAND (t, 0)));
+
+case BIT_FIELD_REF:
+case MEM_REF:
+  return REF_REVERSE_STORAGE_ORDER (t);
+
+case ARRAY_RANGE_REF:
+case VIEW_CONVERT_EXPR:
+default:
+  return false;
+}
+
+  gcc_unreachable ();
+}
+
+/* Return true if T is a storage order barrier, i.e. a VIEW_CONVERT_EXPR
that can modify the storage order of objects.  Note that, even if the
TYPE_REVERSE_STORAGE_ORDER flag is set on both the inner type and the
outer type, a VIEW_CONVERT_EXPR can modify the storage order because
it can change the partition of the aggregate object into scalars.  */
 
 static inline bool
-storage_order_barrier_p (const_tree ref)
+storage_order_barrier_p (const_tree t)
 {
-  if (TREE_CODE (ref) != VIEW_CONVERT_EXPR)
+  if (TREE_CODE (t) != VIEW_CONVERT_EXPR)
 return false;
 
-  if (AGGREGATE_TYPE_P (TREE_TYPE (ref))
-  && TYPE_REVERSE_STORAGE_ORDER (TREE_TYPE (ref)))
+  if (AGGREGATE_TYPE_P (TREE_TYPE (t))
+  && TYPE_REVERSE_STORAGE_ORDER (TREE_TYPE (t)))
 return true;
 
-  tree op = TREE_OPERAND (ref, 0);
+  tree op = TREE_OPERAND (t, 0);
 
   if (AGGREGATE_TYPE_P (TREE_TYPE (op))
   && TYPE_REVERSE_STORAGE_ORDER (TREE_TYPE (op)))
Index: testsuite/gnat.dg/sso/q13.adb
===
--- testsuite/gnat.dg/sso/q13.adb	(revision 0)
+++ testsuite/gnat.dg/sso/q13.adb	(revision 0)
@@ -0,0 +1,53 @@
+-- { dg-do run }
+
+with Init13; use Init13;
+with Ada.Numerics; use Ada.Numerics;
+with Text_IO; use Text_IO;
+with Dump;
+
+procedure Q13 is
+
+  A1 : R1 := My_R1;
+  B1 : R1 := My_R1;
+
+  A2 : R2 := My_R2;
+  B2 : R2 := My_R2;
+
+begin
+  Put ("A1 :");
+  Dump (A1'Address, R1'Max_Size_In_Storage_Elements);
+  New_Line;
+  -- { dg-output "A1 : db 0f 49 40 db 0f 49 c0\n" }
+
+  Put ("B1 :");
+  Dump (B1'Address, R1'Max_Size_In_Storage_Elements);
+  New_Line;
+  -- { dg-output "B1 : db 0f 49 40 db 0f 49 c0\n" }
+
+  Put ("A2 :");
+  Dump (A2'Address, R2'Max_Size_In_Storage_Elements);
+  New_Line;
+  -- { dg-output "A2 : 40 49 0f db c0 49 0f db\n" }
+
+  Put ("B2 :");
+  Dump (B2'Address, R2'Max_Size_In_Storage_Elements);
+  New_Line;
+  -- { dg-output "B2 : 40 49 0f db c0 49 0f db\n" }
+
+  if A1.F /= B1.F then
+raise Program_Error;
+  end if;
+
+  if A1.F /= (Pi, -Pi) then
+raise Program_Error;
+  end if;
+
+  if A2.F /= B2.F then
+raise Program_Error;
+  end if;
+

Re: [PATCH v2, libcpp] Faster line lexer.

2015-07-17 Thread Ondřej Bílka
On Fri, Jul 10, 2015 at 02:06:26PM -0600, Jeff Law wrote:
> On 07/10/2015 07:25 AM, Ondřej Bílka wrote:
> >On Fri, Jul 10, 2015 at 12:43:48PM +0200, Jakub Jelinek wrote:
> >>On Fri, Jul 10, 2015 at 11:37:18AM +0200, Uros Bizjak wrote:
> >>>Have you tried new SSE4.2 implementation (the one with asm flags) with
> >>>unrolled loop?
> >>
> >>Also, the SSE4.2 implementation looks shorter, so more I-cache friendly,
> >>so I wouldn't really say it is redundant if they are roughly same speed.
> >>
> >Ok, I tried to also optimize sse4 and found that main problem was
> >checking that index==16 caused high latency.
> >
> >Trick was checking first 64 bytes in header using flags. Then loop is
> >relatively unlikely as lines longer than 64 bytes are relatively rare.
> >
> >I tested that on more machines. On haswell sse4 is noticeably faster, on
> >nehalem sse2 is still a bit faster and on amd fx10 it's a lot slower. How
> >do I check the processor to select sse2 on amd processors where it's
> >considerably slower?
> I doubt any of this is worth the maintenance burden.  I think we
> should pick a reasonably performant implementation and move on to
> bigger issues.
> 
Then we could proceed with this patch on the basis that intel processors
are more common than amd ones. On fx10 a new sse2 implementation is 20%
faster than sse4, but on haswell it's the opposite and sse4 is 20% faster
than sse2.




Re: [Patch, Fortran, 66035, v2] [5/6 Regression] gfortran ICE segfault

2015-07-17 Thread Andre Vehreschild
Hi Paul, 

thanks for the review, committed as r225928.

Regards,
Andre

On Wed, 15 Jul 2015 13:40:29 +0200
Paul Richard Thomas  wrote:

> Dear Andre,
> 
> I am still in the bizarre situation that the testcase compiles and
> runs correctly on a clean trunk!
> 
> That said, the patch applies cleanly and, at very least from my point
> of view, does not do any harm :-)
> 
> OK for trunk
> 
> Thanks for the patch
> 
> Paul
> 
> On 11 July 2015 at 14:08, Andre Vehreschild  wrote:
> > Hi Mikael,
> >
> >> > @@ -7030,7 +7053,8 @@ gfc_trans_subcomponent_assign (tree dest,
> >> > gfc_component * cm, gfc_expr * expr, gfc_add_expr_to_block (&block, tmp);
> >> >  }
> >> >else if (init && (cm->attr.allocatable
> >> > -  || (cm->ts.type == BT_CLASS && CLASS_DATA
> >> > (cm)->attr.allocatable)))
> >> > +  || (cm->ts.type == BT_CLASS && CLASS_DATA (cm)->attr.allocatable
> >> > +  && expr->ts.type != BT_CLASS)))
> >> >  {
> >> >/* Take care about non-array allocatable components here.  The
> >> > alloc_* routine below is motivated by the alloc_scalar_allocatable_for_
> >> > @@ -7074,6 +7098,14 @@ gfc_trans_subcomponent_assign (tree dest,
> >> > gfc_component * cm, gfc_expr * expr, tmp = gfc_build_memcpy_call (tmp,
> >> > se.expr, size); gfc_add_expr_to_block (&block, tmp);
> >> > }
> >> > +  else if (cm->ts.type == BT_CLASS && expr->ts.type == BT_CLASS)
> >> > +   {
> >> > + tmp = gfc_copy_class_to_class (se.expr, dest, integer_one_node,
> >> > +  CLASS_DATA
> >> > (cm)->attr.unlimited_polymorphic);
> >> > + gfc_add_expr_to_block (&block, tmp);
> >> > + gfc_add_modify (&block, gfc_class_vptr_get (dest),
> >> > + gfc_class_vptr_get (se.expr));
> >> > +   }
> >> >else
> >> > gfc_add_modify (&block, tmp,
> >> > fold_convert (TREE_TYPE (tmp), se.expr));
> >> But this hunk is canceled by the one before, isn't it?
> >> I mean, If the condition here is true, the condition before was false?
> >
> > You are absolutely right. The second hunk is dead code and removed in the
> > attached patch. That must have been the first attempt to address the issue
> > and later on I did not perceive that it was useless. Sorry for that.
> >
> > Regards,
> > Andre
> > --
> > Andre Vehreschild * Email: vehre ad gmx dot de
> 
> 
> 


-- 
Andre Vehreschild * Email: vehre ad gmx dot de 
Index: gcc/fortran/ChangeLog
===
--- gcc/fortran/ChangeLog	(Revision 225927)
+++ gcc/fortran/ChangeLog	(Arbeitskopie)
@@ -1,3 +1,15 @@
+2015-07-17  Andre Vehreschild  
+
+	PR fortran/66035
+	* trans-expr.c (alloc_scalar_allocatable_for_subcomponent_assignment):
+	Compute the size to allocate for class and derived type objects
+	correctly.
+	(gfc_trans_subcomponent_assign): Only allocate memory for a
+	component when the object to assign is not an allocatable class
+	object (the memory is already present for allocatable class objects).
+	Furthermore use copy_class_to_class for assigning the rhs to the
+	component (may happen for dummy class objects on the rhs).
+
 2015-07-17  Mikael Morin  
 	Dominique d'Humieres  
 
Index: gcc/fortran/trans-expr.c
===
--- gcc/fortran/trans-expr.c	(Revision 225927)
+++ gcc/fortran/trans-expr.c	(Arbeitskopie)
@@ -6969,6 +6969,29 @@
    TREE_TYPE (tmp), tmp,
    fold_convert (TREE_TYPE (tmp), size));
 }
+  else if (cm->ts.type == BT_CLASS)
+{
+  gcc_assert (expr2->ts.type == BT_CLASS || expr2->ts.type == BT_DERIVED);
+  if (expr2->ts.type == BT_DERIVED)
+	{
+	  tmp = gfc_get_symbol_decl (expr2->ts.u.derived);
+	  size = TYPE_SIZE_UNIT (tmp);
+	}
+  else
+	{
+	  gfc_expr *e2vtab;
+	  gfc_se se;
+	  e2vtab = gfc_find_and_cut_at_last_class_ref (expr2);
+	  gfc_add_vptr_component (e2vtab);
+	  gfc_add_size_component (e2vtab);
+	  gfc_init_se (&se, NULL);
+	  gfc_conv_expr (&se, e2vtab);
+	  gfc_add_block_to_block (block, &se.pre);
+	  size = fold_convert (size_type_node, se.expr);
+	  gfc_free_expr (e2vtab);
+	}
+  size_in_bytes = size;
+}
   else
 {
   /* Otherwise use the length in bytes of the rhs.  */
@@ -7096,7 +7119,8 @@
   gfc_add_expr_to_block (&block, tmp);
 }
   else if (init && (cm->attr.allocatable
-	   || (cm->ts.type == BT_CLASS && CLASS_DATA (cm)->attr.allocatable)))
+	   || (cm->ts.type == BT_CLASS && CLASS_DATA (cm)->attr.allocatable
+	   && expr->ts.type != BT_CLASS)))
 {
   /* Take care about non-array allocatable components here.  The alloc_*
 	 routine below is motivated by the alloc_scalar_allocatable_for_
Index: gcc/testsuite/ChangeLog
===
--- gcc/testsuite/ChangeLog	(Revision 225927)
+++ gcc/testsuite/ChangeLog	(Arbeitskopie)
@@ -1,3 +1,8 @@
+2015-07-17  Andre Vehreschild  
+
+	PR fortran/66035
+	* gfortran.dg

[PATCH] Move A - (A & B) -> ~B & A

2015-07-17 Thread Marek Polacek
This moves one pattern from fold-const.c into match.pd.
Since no test was testing that pattern, I added a new test.
As a follow up I'll move the "(A & ~B) - (A & B) into (A ^ B) - B"
pattern.

Bootstrapped/regtested on x86_64-linux, ok for trunk?

2015-07-17  Marek Polacek  

* fold-const.c (fold_binary_loc): Move A - (A & B) into ~B & A ...
* match.pd: ... here.

* gcc.dg/fold-minus-7.c: New test.

diff --git gcc/fold-const.c gcc/fold-const.c
index 93dd29d..fa321f4 100644
--- gcc/fold-const.c
+++ gcc/fold-const.c
@@ -9777,30 +9777,6 @@ fold_binary_loc (location_t loc,
 
   if (! FLOAT_TYPE_P (type))
{
- /* Fold A - (A & B) into ~B & A.  */
- if (!TREE_SIDE_EFFECTS (arg0)
- && TREE_CODE (arg1) == BIT_AND_EXPR)
-   {
- if (operand_equal_p (arg0, TREE_OPERAND (arg1, 1), 0))
-   {
- tree arg10 = fold_convert_loc (loc, type,
-TREE_OPERAND (arg1, 0));
- return fold_build2_loc (loc, BIT_AND_EXPR, type,
- fold_build1_loc (loc, BIT_NOT_EXPR,
-  type, arg10),
- fold_convert_loc (loc, type, arg0));
-   }
- if (operand_equal_p (arg0, TREE_OPERAND (arg1, 0), 0))
-   {
- tree arg11 = fold_convert_loc (loc,
-type, TREE_OPERAND (arg1, 1));
- return fold_build2_loc (loc, BIT_AND_EXPR, type,
- fold_build1_loc (loc, BIT_NOT_EXPR,
-  type, arg11),
- fold_convert_loc (loc, type, arg0));
-   }
-   }
-
  /* Fold (A & ~B) - (A & B) into (A ^ B) - B, where B is
 any power of 2 minus 1.  */
  if (TREE_CODE (arg0) == BIT_AND_EXPR
diff --git gcc/match.pd gcc/match.pd
index c335ada..066d5de 100644
--- gcc/match.pd
+++ gcc/match.pd
@@ -662,6 +662,10 @@ along with GCC; see the file COPYING3.  If not see
  (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2))
  (bit_xor (bit_and (bit_xor @0 @1) @2) @0))
 
+/* Fold A - (A & B) into ~B & A.  */
+(simplify
+ (minus (convert? @0) (convert? (bit_and:c @0 @1)))
+ (convert (bit_and (bit_not @1) @0)))
 
 /* Associate (p +p off1) +p off2 as (p +p (off1 + off2)).  */
 (simplify
diff --git gcc/testsuite/gcc.dg/fold-minus-7.c 
gcc/testsuite/gcc.dg/fold-minus-7.c
index e69de29..7a49faa 100644
--- gcc/testsuite/gcc.dg/fold-minus-7.c
+++ gcc/testsuite/gcc.dg/fold-minus-7.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-cddce1" } */
+
+int
+f1 (int a, int b)
+{
+  int tem = a & b;
+  return a - tem;
+}
+
+int
+f2 (int a, int b)
+{
+  int tem = b & a;
+  return a - tem;
+}
+
+int
+f3 (unsigned int a, int b)
+{
+  return a - (a & b);
+}
+
+int
+f4 (int a, unsigned int b)
+{
+  return a - (a & b);
+}
+
+int
+f5 (int a, int b)
+{
+  return a - (unsigned) (b & a);
+}
+
+/* { dg-final { scan-tree-dump-not " - " "cddce1" } } */

Marek


Re: constify target offload data

2015-07-17 Thread Ilya Verbin
On Thu, Jul 16, 2015 at 16:08:47 -0400, Nathan Sidwell wrote:
> Jakub, Ilya,
> this patch against trunk constifies the offload target data.  I'm
> having difficulty building an intelmic toolchain, so the changes
> there aren't tested. Ilya, if you could check them, that'd be great.

Works fine with one change:


diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp 
b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
index 136fb99..baa4945 100644
--- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
+++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
@@ -61,7 +61,7 @@ typedef std::vector AddrVect;
 typedef std::vector DevAddrVect;
 
 /* Addresses for all images and all devices.  */
-typedef std::map<void *, DevAddrVect> ImgDevAddrMap;
+typedef std::map<const void *, DevAddrVect> ImgDevAddrMap;
 
 /* Image descriptor needed by __offload_[un]register_image.  */
 struct TargetImageDesc {


  -- Ilya


Re: constify target offload data

2015-07-17 Thread Jakub Jelinek
On Fri, Jul 17, 2015 at 02:01:12PM +0300, Ilya Verbin wrote:
> On Thu, Jul 16, 2015 at 16:08:47 -0400, Nathan Sidwell wrote:
> > Jakub, Ilya,
> > this patch against trunk constifies the offload target data.  I'm
> > having difficulty building an intelmic toolchain, so the changes
> > there aren't tested. Ilya, if you could check them, that'd be great.
> 
> Works fine with one change:
> 
> 
> diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp 
> b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
> index 136fb99..baa4945 100644
> --- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
> +++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
> @@ -61,7 +61,7 @@ typedef std::vector AddrVect;
>  typedef std::vector DevAddrVect;
>  
>  /* Addresses for all images and all devices.  */
> -typedef std::map ImgDevAddrMap;
> +typedef std::map ImgDevAddrMap;
>  
>  /* Image descriptor needed by __offload_[un]register_image.  */
>  struct TargetImageDesc {

The patch is ok for trunk with that change then.

Jakub


[PATCH] gcc/: Fix building with isl-0.15.0; includes

2015-07-17 Thread Bernhard Reutner-Fischer
In addition to Mike's "[PATCH] gcc: fix building w/isl-0.15" in
https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01162.html
I seem to also need a couple of more includes for isl-0.15.0:

2015-07-15  Bernhard Reutner-Fischer  

* graphite-blocking.c, graphite-dependences.c,
graphite-interchange.c, graphite-isl-ast-to-gimple.c,
graphite-optimize-isl.c, graphite-poly.c,
graphite-scop-detection.c, graphite-sese-to-poly.c,
graphite.c: Add missing isl includes.
---
 gcc/graphite-blocking.c  |1 +
 gcc/graphite-dependences.c   |1 +
 gcc/graphite-interchange.c   |1 +
 gcc/graphite-isl-ast-to-gimple.c |2 ++
 gcc/graphite-optimize-isl.c  |2 ++
 gcc/graphite-poly.c  |1 +
 gcc/graphite-scop-detection.c|1 +
 gcc/graphite-sese-to-poly.c  |1 +
 gcc/graphite.c   |1 +
 9 files changed, 11 insertions(+)

diff --git a/gcc/graphite-blocking.c b/gcc/graphite-blocking.c
index 73d7c59..172bdcd 100644
--- a/gcc/graphite-blocking.c
+++ b/gcc/graphite-blocking.c
@@ -27,6 +27,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/gcc/graphite-dependences.c b/gcc/graphite-dependences.c
index 9a0986d..df52e54 100644
--- a/gcc/graphite-dependences.c
+++ b/gcc/graphite-dependences.c
@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/gcc/graphite-interchange.c b/gcc/graphite-interchange.c
index aee51a8..bcf7f3b 100644
--- a/gcc/graphite-interchange.c
+++ b/gcc/graphite-interchange.c
@@ -27,6 +27,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/gcc/graphite-isl-ast-to-gimple.c b/gcc/graphite-isl-ast-to-gimple.c
index b32781a..7aafee2 100644
--- a/gcc/graphite-isl-ast-to-gimple.c
+++ b/gcc/graphite-isl-ast-to-gimple.c
@@ -24,7 +24,9 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/gcc/graphite-optimize-isl.c b/gcc/graphite-optimize-isl.c
index 388e25c..2b55783 100644
--- a/gcc/graphite-optimize-isl.c
+++ b/gcc/graphite-optimize-isl.c
@@ -24,7 +24,9 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/gcc/graphite-poly.c b/gcc/graphite-poly.c
index bcd08d8..e3ddd57 100644
--- a/gcc/graphite-poly.c
+++ b/gcc/graphite-poly.c
@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/gcc/graphite-scop-detection.c b/gcc/graphite-scop-detection.c
index 47e0da0..22fdde3 100644
--- a/gcc/graphite-scop-detection.c
+++ b/gcc/graphite-scop-detection.c
@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/gcc/graphite-sese-to-poly.c b/gcc/graphite-sese-to-poly.c
index 5064247..7b10289 100644
--- a/gcc/graphite-sese-to-poly.c
+++ b/gcc/graphite-sese-to-poly.c
@@ -24,6 +24,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/gcc/graphite.c b/gcc/graphite.c
index a81ef6a..32f405a 100644
--- a/gcc/graphite.c
+++ b/gcc/graphite.c
@@ -38,6 +38,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Workaround for GMP 5.1.3 bug, see PR56019.  */
 #include 
 
+#include 
 #include 
 #include 
 #include 
-- 
1.7.10.4



Re: [PATCH] enable loop fusion with ISL scheduler

2015-07-17 Thread Bernhard Reutner-Fischer
On 17 July 2015 at 06:32, Tobias Grosser  wrote:
> On 07/17/2015 12:35 AM, Sebastian Pop wrote:
>>
>> gcc/ChangeLog:
>>
>> 2015-07-16  Aditya Kumar  
>>  Sebastian Pop  
>>
>>  * common.opt (floop-fuse): New.
>>  * doc/invoke.texi (floop-fuse): Documented.
>>  * graphite-optimize-isl.c (optimize_isl): Use
>>  ISL_SCHEDULE_FUSE_MAX when using flag_loop_fuse.
>>  * graphite-poly.c (apply_poly_transforms): Call optimize_isl when
>>  using flag_loop_fuse.
>>  * graphite.c (gate_graphite_transforms): Enable graphite with
>>  flag_loop_fuse.
>
>
> LGTM.

AFAICS this won't work with isl-0.15.0 where this spot changed. See
https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01162.html and ff.

Apart from that it looks identical in spirit to what came out of
starting to think about the fusion part of https://gcc.gnu.org/PR66741
;)
Didn't have time to pursue this yet due to RL interference though, so
if you beat me to it..
Thanks,

>
> Tobias
>
>> gcc/testsuite/ChangeLog:
>>
>> 2015-07-16  Aditya Kumar  
>>  Sebastian Pop  
>>
>>
>>  * gcc.dg/graphite/fuse-1.c: New test.
>>  * gcc.dg/graphite/fuse-2.c: New test.
>> ---
>>   gcc/common.opt |  4 
>>   gcc/doc/invoke.texi| 23 +++-
>>   gcc/graphite-optimize-isl.c|  5 -
>>   gcc/graphite-poly.c|  2 +-
>>   gcc/graphite.c |  3 ++-
>>   gcc/testsuite/gcc.dg/graphite/fuse-1.c | 32 
>>   gcc/testsuite/gcc.dg/graphite/fuse-2.c | 38
>> ++
>>   7 files changed, 103 insertions(+), 4 deletions(-)
>>   create mode 100644 gcc/testsuite/gcc.dg/graphite/fuse-1.c
>>   create mode 100644 gcc/testsuite/gcc.dg/graphite/fuse-2.c
>>
>> diff --git a/gcc/common.opt b/gcc/common.opt
>> index dd49ae3..200ecc1 100644
>> --- a/gcc/common.opt
>> +++ b/gcc/common.opt
>> @@ -1365,6 +1365,10 @@ floop-nest-optimize
>>   Common Report Var(flag_loop_optimize_isl) Optimization
>>   Enable the ISL based loop nest optimizer
>>
>> +floop-fuse
>> +Common Report Var(flag_loop_fuse) Optimization
>> +Enable loop fusion
>> +
>>   fstrict-volatile-bitfields
>>   Common Report Var(flag_strict_volatile_bitfields) Init(-1) Optimization
>>   Force bitfield accesses to match their type width
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index b99ab1c..7cc8bb9 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -409,7 +409,7 @@ Objective-C and Objective-C++ Dialects}.
>>   -fivopts -fkeep-inline-functions -fkeep-static-consts @gol
>>   -flive-range-shrinkage @gol
>>   -floop-block -floop-interchange -floop-strip-mine @gol
>> --floop-unroll-and-jam -floop-nest-optimize @gol
>> +-floop-unroll-and-jam -floop-nest-optimize -floop-fuse @gol
>>   -floop-parallelize-all -flra-remat -flto -flto-compression-level @gol
>>   -flto-partition=@var{alg} -flto-report -flto-report-wpa
>> -fmerge-all-constants @gol
>>   -fmerge-constants -fmodulo-sched -fmodulo-sched-allow-regmoves @gol
>> @@ -8796,6 +8796,27 @@ optimizer based on the Pluto optimization
>> algorithms.  It calculates a loop
>>   structure optimized for data-locality and parallelism.  This option
>>   is experimental.
>>
>> +@item -floop-fuse
>> +@opindex floop-fuse
>> +Enable loop fusion.  This option is experimental.
>> +
>> +For example, given a loop like:
>> +@smallexample
>> +DO I = 1, N
>> +  A(I) = A(I) + B(I)
>> +ENDDO
>> +DO I = 1, N
>> +  A(I) = A(I) + C(I)
>> +ENDDO
>> +@end smallexample
>> +@noindent
>> +loop fusion transforms the loop as if it were written:
>> +@smallexample
>> +DO I = 1, N
>> +  A(I) = A(I) + B(I) + C(I)
>> +ENDDO
>> +@end smallexample
>> +
>>   @item -floop-unroll-and-jam
>>   @opindex floop-unroll-and-jam
>>   Enable unroll and jam for the ISL based loop nest optimizer.  The unroll
>> diff --git a/gcc/graphite-optimize-isl.c b/gcc/graphite-optimize-isl.c
>> index 624cc87..c016461 100644
>> --- a/gcc/graphite-optimize-isl.c
>> +++ b/gcc/graphite-optimize-isl.c
>> @@ -599,7 +599,10 @@ optimize_isl (scop_p scop)
>>
>> isl_options_set_schedule_max_constant_term (scop->ctx,
>> CONSTANT_BOUND);
>> isl_options_set_schedule_maximize_band_depth (scop->ctx, 1);
>> -  isl_options_set_schedule_fuse (scop->ctx, ISL_SCHEDULE_FUSE_MIN);
>> +  if (flag_loop_fuse)
>> +isl_options_set_schedule_fuse (scop->ctx, ISL_SCHEDULE_FUSE_MAX);
>> +  else
>> +isl_options_set_schedule_fuse (scop->ctx, ISL_SCHEDULE_FUSE_MIN);
>> isl_options_set_on_error (scop->ctx, ISL_ON_ERROR_CONTINUE);
>>
>>   #ifdef HAVE_ISL_SCHED_CONSTRAINTS_COMPUTE_SCHEDULE
>> diff --git a/gcc/graphite-poly.c b/gcc/graphite-poly.c
>> index 4407dc5..4808fbe 100644
>> --- a/gcc/graphite-poly.c
>> +++ b/gcc/graphite-poly.c
>> @@ -272,7 +272,7 @@ apply_poly_transforms (scop_p scop)
>>
>> /* This pass needs to be run at the final stage, as it does not
>>

Re: [PATCH][combine][1/2] Try to simplify before substituting

2015-07-17 Thread Kyrill Tkachov


On 16/07/15 19:28, Segher Boessenkool wrote:

On Thu, Jul 16, 2015 at 07:17:54PM +0100, Kyrill Tkachov wrote:

If you always want to simplify first, does it work to move this whole big
block behind the simplify just following it?  Or do you want to simplify
after the transform as well?

You mean move this hunk outside the "if (BINARY_P (x)...)" block it's in?
I think it would work, but I'm not sure if it would affect other cases.
I was also conscious that simplify_rtx might not be a cheap function to call
so frequently (or is it? I didn't profile it), so I tried to avoid calling
it unless I need it for the transformation in question here.

I mean move the whole "if (BINARY_P ..." block to after the existing
simplify calls, to just before the "First see if we can apply" comment,
and not do a new simplify_rtx call at all.  Does that work?


Yes, and here's the patch.
It just moves the simplification block.
The effect on codegen in SPEC2006 on aarch64 looks sane in the same
way as the original patch I posted (i.e. many redundant zero_extends eliminated)
and together with patch 2/2 this helps in the -abs testcase.

I'm bootstrapping this on aarch64, arm and x86.
Any other testing would be appreciated.

Is this version ok if testing comes clean?

Thanks,
Kyrill

2015-07-17  Kyrylo Tkachov  

* combine.c (combine_simplify_rtx): Move simplification step
before various transformations/substitutions.


commit d7c7cd4acad9a34b3e991fd589125cc165f25c1f
Author: Kyrylo Tkachov 
Date:   Thu Jul 16 19:42:58 2015 +0100

[combine] Alternative approach

diff --git a/gcc/combine.c b/gcc/combine.c
index 574f874..2f806ab 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -5489,6 +5489,51 @@ combine_simplify_rtx (rtx x, machine_mode op0_mode, int in_dest,
   SUBST (XEXP (x, 1), temp);
 }
 
+  /* Try to fold this expression in case we have constants that weren't
+ present before.  */
+  temp = 0;
+  switch (GET_RTX_CLASS (code))
+{
+case RTX_UNARY:
+  if (op0_mode == VOIDmode)
+	op0_mode = GET_MODE (XEXP (x, 0));
+  temp = simplify_unary_operation (code, mode, XEXP (x, 0), op0_mode);
+  break;
+case RTX_COMPARE:
+case RTX_COMM_COMPARE:
+  {
+	machine_mode cmp_mode = GET_MODE (XEXP (x, 0));
+	if (cmp_mode == VOIDmode)
+	  {
+	cmp_mode = GET_MODE (XEXP (x, 1));
+	if (cmp_mode == VOIDmode)
+	  cmp_mode = op0_mode;
+	  }
+	temp = simplify_relational_operation (code, mode, cmp_mode,
+	  XEXP (x, 0), XEXP (x, 1));
+  }
+  break;
+case RTX_COMM_ARITH:
+case RTX_BIN_ARITH:
+  temp = simplify_binary_operation (code, mode, XEXP (x, 0), XEXP (x, 1));
+  break;
+case RTX_BITFIELD_OPS:
+case RTX_TERNARY:
+  temp = simplify_ternary_operation (code, mode, op0_mode, XEXP (x, 0),
+	 XEXP (x, 1), XEXP (x, 2));
+  break;
+default:
+  break;
+}
+
+  if (temp)
+{
+  x = temp;
+  code = GET_CODE (temp);
+  op0_mode = VOIDmode;
+  mode = GET_MODE (temp);
+}
+
   /* If this is a simple operation applied to an IF_THEN_ELSE, try
  applying it to the arms of the IF_THEN_ELSE.  This often simplifies
  things.  Check for cases where both arms are testing the same
@@ -5588,51 +5633,6 @@ combine_simplify_rtx (rtx x, machine_mode op0_mode, int in_dest,
 	}
 }
 
-  /* Try to fold this expression in case we have constants that weren't
- present before.  */
-  temp = 0;
-  switch (GET_RTX_CLASS (code))
-{
-case RTX_UNARY:
-  if (op0_mode == VOIDmode)
-	op0_mode = GET_MODE (XEXP (x, 0));
-  temp = simplify_unary_operation (code, mode, XEXP (x, 0), op0_mode);
-  break;
-case RTX_COMPARE:
-case RTX_COMM_COMPARE:
-  {
-	machine_mode cmp_mode = GET_MODE (XEXP (x, 0));
-	if (cmp_mode == VOIDmode)
-	  {
-	cmp_mode = GET_MODE (XEXP (x, 1));
-	if (cmp_mode == VOIDmode)
-	  cmp_mode = op0_mode;
-	  }
-	temp = simplify_relational_operation (code, mode, cmp_mode,
-	  XEXP (x, 0), XEXP (x, 1));
-  }
-  break;
-case RTX_COMM_ARITH:
-case RTX_BIN_ARITH:
-  temp = simplify_binary_operation (code, mode, XEXP (x, 0), XEXP (x, 1));
-  break;
-case RTX_BITFIELD_OPS:
-case RTX_TERNARY:
-  temp = simplify_ternary_operation (code, mode, op0_mode, XEXP (x, 0),
-	 XEXP (x, 1), XEXP (x, 2));
-  break;
-default:
-  break;
-}
-
-  if (temp)
-{
-  x = temp;
-  code = GET_CODE (temp);
-  op0_mode = VOIDmode;
-  mode = GET_MODE (temp);
-}
-
   /* First see if we can apply the inverse distributive law.  */
   if (code == PLUS || code == MINUS
   || code == AND || code == IOR || code == XOR)


Re: [PATCH][combine][1/2] Try to simplify before substituting

2015-07-17 Thread Pinski, Andrew




> On Jul 17, 2015, at 7:36 PM, Kyrill Tkachov  wrote:
> 
> 
>> On 16/07/15 19:28, Segher Boessenkool wrote:
>> On Thu, Jul 16, 2015 at 07:17:54PM +0100, Kyrill Tkachov wrote:
>>>> If you always want to simplify first, does it work to move this whole big
>>>> block behind the simplify just following it?  Or do you want to simplify
>>>> after the transform as well?
>>> You mean move this hunk outside the "if (BINARY_P (x)...)" block it's in?
>>> I think it would work, but I'm not sure if it would affect other cases.
>>> I was also conscious that simplify_rtx might not be a cheap function to call
>>> so frequently (or is it? I didn't profile it), so I tried to avoid calling
>>> it unless I need it for the transformation in question here.
>> I mean move the whole "if (BINARY_P ..." block to after the existing
>> simplify calls, to just before the "First see if we can apply" comment,
>> and not do a new simplify_rtx call at all.  Does that work?
> 
> Yes, and here's the patch.
> It just moves the simplification block.
> The effect on codegen in SPEC2006 on aarch64 looks sane in the same
> way as the original patch I posted (i.e. many redundant zero_extends 
> eliminated)
> and together with patch 2/2 this helps in the -abs testcase.
> 
> I'm bootstrapping this on aarch64, arm and x86.
> Any other testing would be appreciated.
> 
> Is this version ok if testing comes clean?

This combined with the other patch looks much better than the original target 
specific hack. And it goes to show that doing it in a non-target specific way 
can improve other cases you did not see before. 

Thanks,
Andrew

> 
> Thanks,
> Kyrill
> 
> 2015-07-17  Kyrylo Tkachov  
> 
>* combine.c (combine_simplify_rtx): Move simplification step
>before various transformations/substitutions.
> 
> 
> 


Re: [PATCH, i386]: Improve FP moves through integer registers

2015-07-17 Thread H.J. Lu
On Fri, Jul 17, 2015 at 12:10 AM, Uros Bizjak  wrote:
> If the target only has integer registers, we don't have to go through
> delicately tuned alternatives, balanced between x87 regs, SSE regs and
> integer regs. Attached patch introduces simple and effective
> alternatives in this case.
>
> Testcases will be provided later by HJ.
>
> 2015-07-17  Uros Bizjak  
>
> PR target/66824
> * config/i386/i386.h (TARGET_HARD_SF_REGS): New define.
> (TARGET_HARD_DF_REGS): Ditto.
> (TARGET_HARD_XF_REGS): Ditto.
> * config/i386/i386.md (*movxf_internal): Add alternatives 9 and 10.
> Enable alternatives 9 and 10 only for !TARGET_HARD_XF_REG target.
> (*movdf_internal): Add alternatives 22, 23, 24 and 25. Enable
> alternatives 22, 23, 24 and 25 only for !TARGET_HARD_DF_REG target.
> (*movsf_internal): Add alternatives 16 and 17. Enable
> alternatives 16 and 17 only for !TARGET_HARD_SF_REG target.
>
> Bootstrapped and regression tested on x86_64-linux-gnu {,-m32} and
> committed to mainline.
>

I checked in this testcase.

Thanks.

-- 
H.J.
---
Index: ChangeLog
===
--- ChangeLog (revision 225928)
+++ ChangeLog (working copy)
@@ -1,3 +1,8 @@
+2015-07-17  H.J. Lu  
+
+ PR target/66824
+ * gcc.target/i386/pr66824.c: New test.
+
 2015-07-17  Andre Vehreschild  

  PR fortran/66035
Index: gcc.target/i386/pr66824.c
===
--- gcc.target/i386/pr66824.c (revision 0)
+++ gcc.target/i386/pr66824.c (working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -mno-sse -mno-mmx -mno-80387" } */
+/* { dg-final { scan-assembler-not "\.LC\[0-9\]" } } */
+
+double foo (float);
+
+double
+f1 (void)
+{
+  return foo (1.0);
+}
+
+double
+f2 (void)
+{
+  return foo (0.0);
+}
+
+void
+f3 (float *x, float t)
+{
+  *x = 0.0 + t;
+}
+
+float
+f4 (void)
+{
+  return 1.0;
+}


Re: [Patch 0/4] PowerPC64 Linux split stack support

2015-07-17 Thread Lynn A. Boger

I have tested this and it fixes the problem.

On 07/01/2015 09:15 AM, Andreas Schwab wrote:

"Lynn A. Boger"  writes:


It's not clear to me what distro/gcc/glibc versions and type of build
causes the error.  I have not been able to reproduce the
original problem.

The failure mode is quite obvious: go_langhook_init_options_struct is
called before the options are parsed, so -m32 hasn't been acted upon and
supports_split_stack falsely returns true.

Andreas.






Re: [PATCH] Move A - (A & B) -> ~B & A

2015-07-17 Thread Richard Biener
On July 17, 2015 12:28:46 PM GMT+02:00, Marek Polacek  
wrote:
>This moves one pattern from fold-const.c into match.pd.
>Since no test was testing that pattern, I added a new test.
>As a follow up I'll move the "(A & ~B) - (A & B) into (A ^ B) - B"
>pattern.
>
>Bootstrapped/regtested on x86_64-linux, ok for trunk?
>
>2015-07-17  Marek Polacek  
>
>   * fold-const.c (fold_binary_loc): Move A - (A & B) into ~B & A ...
>   * match.pd: ... here.
>
>   * gcc.dg/fold-minus-7.c: New test.
>
>diff --git gcc/fold-const.c gcc/fold-const.c
>index 93dd29d..fa321f4 100644
>--- gcc/fold-const.c
>+++ gcc/fold-const.c
>@@ -9777,30 +9777,6 @@ fold_binary_loc (location_t loc,
> 
>   if (! FLOAT_TYPE_P (type))
>   {
>-/* Fold A - (A & B) into ~B & A.  */
>-if (!TREE_SIDE_EFFECTS (arg0)
>-&& TREE_CODE (arg1) == BIT_AND_EXPR)
>-  {
>-if (operand_equal_p (arg0, TREE_OPERAND (arg1, 1), 0))
>-  {
>-tree arg10 = fold_convert_loc (loc, type,
>-   TREE_OPERAND (arg1, 0));
>-return fold_build2_loc (loc, BIT_AND_EXPR, type,
>-fold_build1_loc (loc, BIT_NOT_EXPR,
>- type, arg10),
>-fold_convert_loc (loc, type, arg0));
>-  }
>-if (operand_equal_p (arg0, TREE_OPERAND (arg1, 0), 0))
>-  {
>-tree arg11 = fold_convert_loc (loc,
>-   type, TREE_OPERAND (arg1, 1));
>-return fold_build2_loc (loc, BIT_AND_EXPR, type,
>-fold_build1_loc (loc, BIT_NOT_EXPR,
>- type, arg11),
>-fold_convert_loc (loc, type, arg0));
>-  }
>-  }
>-
> /* Fold (A & ~B) - (A & B) into (A ^ B) - B, where B is
>any power of 2 minus 1.  */
> if (TREE_CODE (arg0) == BIT_AND_EXPR
>diff --git gcc/match.pd gcc/match.pd
>index c335ada..066d5de 100644
>--- gcc/match.pd
>+++ gcc/match.pd
>@@ -662,6 +662,10 @@ along with GCC; see the file COPYING3.  If not see
>  (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2))
>  (bit_xor (bit_and (bit_xor @0 @1) @2) @0))
> 
>+/* Fold A - (A & B) into ~B & A.  */
>+(simplify
>+ (minus (convert? @0) (convert? (bit_and:c @0 @1)))

I think you want :s on the bit_and and its convert?

Are you sure you don't need to check for a nop conversion here?

Richard.

>+ (convert (bit_and (bit_not @1) @0)))
> 
> /* Associate (p +p off1) +p off2 as (p +p (off1 + off2)).  */
> (simplify
>diff --git gcc/testsuite/gcc.dg/fold-minus-7.c
>gcc/testsuite/gcc.dg/fold-minus-7.c
>index e69de29..7a49faa 100644
>--- gcc/testsuite/gcc.dg/fold-minus-7.c
>+++ gcc/testsuite/gcc.dg/fold-minus-7.c
>@@ -0,0 +1,36 @@
>+/* { dg-do compile } */
>+/* { dg-options "-O -fdump-tree-cddce1" } */
>+
>+int
>+f1 (int a, int b)
>+{
>+  int tem = a & b;
>+  return a - tem;
>+}
>+
>+int
>+f2 (int a, int b)
>+{
>+  int tem = b & a;
>+  return a - tem;
>+}
>+
>+int
>+f3 (unsigned int a, int b)
>+{
>+  return a - (a & b);
>+}
>+
>+int
>+f4 (int a, unsigned int b)
>+{
>+  return a - (a & b);
>+}
>+
>+int
>+f5 (int a, int b)
>+{
>+  return a - (unsigned) (b & a);
>+}
>+
>+/* { dg-final { scan-tree-dump-not " - " "cddce1" } } */
>
>   Marek




Re: [PATCH][doc][13/14] Document AArch64 target attributes and pragmas

2015-07-17 Thread Kyrill Tkachov

Hi Sandra,

On 17/07/15 04:21, Sandra Loosemore wrote:

On 07/16/2015 09:21 AM, Kyrill Tkachov wrote:

Hi all,

This patch adds the documentation for the AArch64 target attributes and
pragmas.

Ok for trunk?

The content looks OK, but I have a bunch of nit-picky comments about
grammar, typos, markup, etc


Thanks for the detailed feedback!
Here's an updated version.

Thanks,
Kyrill

2015-07-17  Kyrylo Tkachov  

 * doc/extend.texi (AArch64 Function Attributes): New node.
 (AArch64 Pragmas): Likewise.



+The following target-specific function attributes are available for
+the AArch64 target and for the most part mirror the behavior of similar
+command line options, but on a per-function basis:

s/command line option/command-line option/g

It would be good to add a cross-reference to the section where the
command-line options are documented.  I recommend splitting the
introductory sentence into two, like:

The following target-specific function attributes are available for the
AArch64 target.  For the most part, these options mirror the behavior of
similar command-line options (@pxref{AArch64 Options}), but on a
per-function basis.


+
+@table @code
+@item general-regs-only
+@cindex @code{general-regs-only} function attribute, AArch64
+Indicates that no floating point or AdvancedSIMD registers should be

s/floating point/floating-point/
s/AdvancedSIMD/Advanced SIMD/


+used when generating code for this function.  If the function explicitly
+uses floating point code, then the compiler will give an error.  This is

s/floating point code/floating-point code/
s/will give/gives/


+the same behavior as that of the command line option
+@code{-mgeneral-regs-only}.

Please use @option markup instead of @code on option names throughout
this patch.


+@item cmodel=
+@cindex @code{cmodel=} function attribute, AArch64
+Indicates that code should be generated for a particular code model for
+this function.  The behaviour and permissible arguments are the same as

s/behaviour/behavior/

(We prefer to consistently use American spellings throughout the GCC
documentation.)


+@item strict-align
+@cindex @code{strit-align} function attribute, AArch64

s/strit-align/strict-align/


+The above target attributes can be specified as follows:
+
+@smallexample
+__attribute__((target("<attr-string>")))
+int
+f (int a)
+@{
+  return a + 5;
+@}
+@end smallexample
+
+where @code{<attr-string>} is one of the attribute strings specified above.

s/<attr-string>/@var{attr-string}/g


+In this example @code{target("+crc+nocrypto")} will enable the @code{crc}
+extension and disable the @code{crypto} extension for the function @code{foo}

s/will enable/enables/
s/disable/disables/


+is valid and will compile function @code{foo} for ARMv8-A with @code{crc}
+and @code{crypto} extensions and tune it for @code{cortex-a53}.

s/will compile/compiles/
s/tune/tunes/


+@code{-mcpu=} optio or the @code{cpu=} attribute conflicts with the

s/optio/option/


@@ -18159,6 +18299,19 @@ for further explanation.
  * Loop-Specific Pragmas::
  @end menu

+@node AArch64 Pragmas
+@subsection AArch64 Pragmas
+
+The pragmas defined by the AArch64 target correspond to the AArch64
+target function attributes.  They can be specified as below:
+@smallexample
+#pragma GCC target("<string>")
+@end smallexample
+
+where @code{<string>} can be any string accepted as an AArch64 target
+attribute.  @xref{AArch64 Function Attributes} for more details
+on the permissible values of @code{<string>}.

s/<string>/@var{string}/g

-Sandra




commit 39b05898be3e1d91fed37db4192dd10c373c1418
Author: Kyrylo Tkachov 
Date:   Fri May 22 12:06:10 2015 +0100

[doc][13/N] Document AArch64 target attributes and pragmas

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index b18d8fb..180d9e9 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -2191,6 +2191,7 @@ GCC plugins may provide their own attributes.
 
 @menu
 * Common Function Attributes::
+* AArch64 Function Attributes::
 * ARC Function Attributes::
 * ARM Function Attributes::
 * AVR Function Attributes::
@@ -3322,6 +3323,145 @@ easier to pack regions.
 
 @c This is the end of the target-independent attribute table
 
+@node AArch64 Function Attributes
+@subsection AArch64 Function Attributes
+
+The following target-specific function attributes are available for the
+AArch64 target.  For the most part, these options mirror the behavior of
+similar command-line options (@pxref{AArch64 Options}), but on a
+per-function basis.
+
+@table @code
+@item general-regs-only
+@cindex @code{general-regs-only} function attribute, AArch64
+Indicates that no floating-point or Advanced SIMD registers should be
+used when generating code for this function.  If the function explicitly
+uses floating-point code, then the compiler gives an error.  This is
+the same behavior as that of the command line option
+@option{-mgeneral-regs-only}.
+
+@item fix-cortex-a53-835769
+@cindex @code{fix-cortex-a53-835769} function attribute, AArch64
+Indicates that the workaround for the Cortex-A53 errat

Re: [gomp4] Remove device-specific filtering during parsing for OpenACC

2015-07-17 Thread Thomas Schwinge
Hi Julian!

On Thu, 16 Jul 2015 16:32:12 +0100, Julian Brown  
wrote:
> This patch removes the device-specific filtering (for NVidia PTX) from
> the parsing stages of the host compiler (for the device_type clause --
> separately for C, C++ and Fortran) in favour of fully parsing the
> device_type clauses, but not actually implementing anything for them
> (device_type support is a feature that we're not planning to implement
> just yet: the existing "support" is something of a red herring).
> 
> With this patch, the parsed device_type clauses will be ready at OMP
> lowering time whenever we choose to do something with them (e.g.
> transforming them into a representation that can be streamed out and
> re-read by the appropriate offload compiler). The representation is
> more-or-less the same for all supported languages

Thanks!

> modulo clause ordering.

Is that something that a) doesn't need to be/already has been addressed
(with your patch), or b) still needs to be addressed?


> I've altered the dtype-*.* tests to account for the new behaviour (and
> to not use e.g. mixed-case "nVidia" or "acc_device_nvidia" names, which
> are contrary to the recommendations in the spec).

OpenACC 2.0a indeed seems to suggest that device_type arguments are
case-sensitive -- contrary to the ACC_DEVICE_TYPE environment variable,
which probably is where the idea came from to parse them
case-insensitive.

As to the latter "invalid" names, I thought the idea has been to verify
that the clauses following such device_types clauses are indeed ignored
in the later processing.  (Obviously, there should've been comments
indicating that, as otherwise that's very confusing -- as we've just seen
-- due to the similarity to the runtime library's acc_device_* device
type values.)


> OK to apply, or any comments?

Your commit r225927 appears to have caused:

[-PASS:-]{+FAIL: libgomp.fortran/declare-simd-2.f90   -O0  (internal 
compiler error)+}
{+FAIL:+} libgomp.fortran/declare-simd-2.f90   -O0  (test for excess errors)
[-PASS:-]{+UNRESOLVED:+} libgomp.fortran/declare-simd-2.f90   -O0  
[-execution test-]
[-PASS:-]{+compilation failed to produce executable+}
[same for other optimization levels]

[...]/source-gcc/libgomp/testsuite/libgomp.fortran/declare-simd-3.f90:17:0: 
internal compiler error: Segmentation fault
0xc39b6f crash_signal
[...]/source-gcc/gcc/toplev.c:352
0x7043a8 gfc_trans_omp_clauses
[...]/source-gcc/gcc/fortran/trans-openmp.c:2671
0x7049a8 gfc_trans_omp_declare_simd(gfc_namespace*)
[...]/source-gcc/gcc/fortran/trans-openmp.c:4589
0x6b8542 gfc_get_extern_function_decl(gfc_symbol*)
[...]/source-gcc/gcc/fortran/trans-decl.c:2025
0x6b878d gfc_get_extern_function_decl(gfc_symbol*)
[...]/source-gcc/gcc/fortran/trans-decl.c:1820
0x6ce952 conv_function_val
[...]/source-gcc/gcc/fortran/trans-expr.c:3601
0x6ce952 gfc_conv_procedure_call(gfc_se*, gfc_symbol*, gfc_actual_arglist*, 
gfc_expr*, vec*)
[...]/source-gcc/gcc/fortran/trans-expr.c:5873
0x6cf4c2 gfc_conv_expr(gfc_se*, gfc_expr*)
[...]/source-gcc/gcc/fortran/trans-expr.c:7391
0x6d71d0 gfc_trans_assignment_1
[...]/source-gcc/gcc/fortran/trans-expr.c:9127
0x692465 trans_code
[...]/source-gcc/gcc/fortran/trans.c:1674
0x6fa457 gfc_trans_omp_code
[...]/source-gcc/gcc/fortran/trans-openmp.c:2711
0x705410 gfc_trans_omp_do
[...]/source-gcc/gcc/fortran/trans-openmp.c:3459
0x707f9f gfc_trans_omp_directive(gfc_code*)
[...]/source-gcc/gcc/fortran/trans-openmp.c:4521
0x6922b7 trans_code
[...]/source-gcc/gcc/fortran/trans.c:1924
0x6c0660 gfc_generate_function_code(gfc_namespace*)
[...]/source-gcc/gcc/fortran/trans-decl.c:6231
0x64d630 translate_all_program_units
[...]/source-gcc/gcc/fortran/parse.c:5523
0x64d630 gfc_parse_file()
[...]/source-gcc/gcc/fortran/parse.c:5728
0x68ef12 gfc_be_parse_file
[...]/source-gcc/gcc/fortran/f95-lang.c:214


> --- a/gcc/c/c-parser.c
> +++ b/gcc/c/c-parser.c
> @@ -12439,10 +12439,7 @@ c_parser_oacc_all_clauses (c_parser *parser, 
> omp_clause_mask mask,
>c_parser_skip_to_pragma_eol (parser);
>  
>if (finish_p)
> -{
> -  clauses = c_oacc_filter_device_types (clauses);
> -  return c_finish_omp_clauses (clauses, true);
> -}
> +return c_finish_omp_clauses (clauses, true);
>  
>return clauses;
>  }

In combination with the equivalent change to
gcc/cp/parser.c:cp_parser_oacc_all_clauses,
gcc/c-family/c-omp.c:c_oacc_filter_device_types, and transitively also
the struct identifier_hasher and c_oacc_extract_device_id function
preceding it, are now unused.  (Not an exhaustive list; have not checked
which other auxiliary functions etc. Cesar has added in his device_type
changes.)  Does it make any sense to keep these for later, or

Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Ulrich Weigand
On 07/09/2015 11:43 PM, Martin Liška wrote:

> This final version which I agreed with Richard Sandiford.
> Hope this can be finally installed to trunk?
> 
> Patch can bootstrap and survive regression tests on x86_64-linux-gnu.

Unfortunately, this still crashes on my SPU toolchain build machine,
for pretty much the same reason outlined here:
https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00868.html

However, the host compiler no longer miscompiles these lines:

  empty_shared_hash = new shared_hash_def;
  empty_shared_hash->refcount = 1;

But that is simply because that "new" now goes to the default
heap-based allocator from the standard library.  There is a
"shared_hash_def_pool", but that's apparently not being used
for anything -- this probably was not intended?


But now the following lines are miscompiled:

  elt_list *el = elt_list_pool.allocate ();
  el->next = next;
  el->elt = elt;

from new_elt_list in cselib.c.  Again, the "allocate" call ends
simply with a cast:

  header = m_returned_free_list;
  m_returned_free_list = header->next;

  return (void *)(header);

and type-based aliasing now states the access to "header->next"
in allocate must not alias the access to "el->next" in new_elt_list,
but clearly it does.

(Since there is no C++ operator new involved at all anymore,
this clearly violates even the C aliasing rules ...)

I really think the allocate routine needs to be more careful to
avoid violating aliasing, e.g. by using memcpy or union-based
type-punning to access its free list info.

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  ulrich.weig...@de.ibm.com



[gomp4.1] Handle new form of #pragma omp declare target

2015-07-17 Thread Jakub Jelinek
Hi!

As the testcases show, #pragma omp declare target has now a new form (well,
two; with some issues on it pending), where it is used just as a single
declarative directive rather than a pair of them and allows marking
vars and functions by name as "omp declare target" vars/functions (which the
middle-end etc. already handles), but also "omp declare target link", which
is a deferred var, that is not initially mapped (on devices without shared
memory with host), but has to be mapped explicitly.

This patch only marks them with the new attribute; the actual middle-end
support still needs to be implemented.

I believe OpenACC has something similar, but no idea if it is already
implemented.

Anyway, I think the implementation should be that in some pass running on
the ACCEL_COMPILER side (guarded by separate address space aka non-HSA)
we actually replace the variables with pointers to variables, then need
to somehow also mark those in the offloading tables, so that the library
registers them (the locations of the pointers to the vars), but also marks
them for special treatment, and then when actually trying to map them
(or their parts, guess that needs to be discussed) we allocate them or
whatever is requested and store the device pointer into the corresponding
variable.

Ilya, Thomas, thoughts on this?

2015-07-17  Jakub Jelinek  

* tree-core.h (enum omp_clause_code): Add OMP_CLAUSE_TO_DECLARE
and OMP_CLAUSE_LINK.
* tree.c (omp_clause_num_ops, omp_clause_code_name): Add entries for
OMP_CLAUSE_{TO_DECLARE,LINK}.
(walk_tree_1): Handle OMP_CLAUSE_{TO_DECLARE,LINK}.
* tree-nested.c (convert_nonlocal_omp_clauses,
convert_local_omp_clauses): Likewise.
* tree-pretty-print.c (dump_omp_clause): Likewise.
c-family/
* c-pragma.h (enum pragma_omp_clause): Add PRAGMA_OMP_CLAUSE_LINK.
c/
* c-parser.c (c_parser_omp_clause_name): Handle link clause.
(c_parser_omp_variable_list): Formatting fix.
(c_parser_omp_all_clauses): Handle PRAGMA_OMP_CLAUSE_LINK.
For PRAGMA_OMP_CLAUSE_TO, parse it as OMP_CLAUSE_TO_DECLARE
rather than OMP_CLAUSE_TO if it is a declare target directive clause.
(OMP_DECLARE_TARGET_CLAUSE_MASK): Define.
(c_parser_omp_declare_target): Parse directive with clauses forms.
* c-typeck.c (c_finish_omp_clauses): Handle
OMP_CLAUSE_{TO_DECLARE,LINK}.
cp/
* parser.c (cp_parser_omp_clause_name): Handle link clause.
(cp_parser_omp_var_list_no_open): Formatting fix.
(cp_parser_omp_all_clauses): Handle PRAGMA_OMP_CLAUSE_LINK.
For PRAGMA_OMP_CLAUSE_TO, parse it as OMP_CLAUSE_TO_DECLARE
rather than OMP_CLAUSE_TO if it is a declare target directive clause.
(OMP_DECLARE_TARGET_CLAUSE_MASK): Define.
(cp_parser_omp_declare_target): Parse directive with clauses forms.
* semantics.c (finish_omp_clauses): Handle
OMP_CLAUSE_{TO_DECLARE,LINK}.
testsuite/
* c-c++-common/gomp/declare-target-1.c: New test.
* c-c++-common/gomp/declare-target-2.c: New test.

--- gcc/tree-core.h.jj  2015-07-15 13:02:31.0 +0200
+++ gcc/tree-core.h 2015-07-17 09:30:44.944431669 +0200
@@ -256,6 +256,13 @@ enum omp_clause_code {
   /* OpenMP clause: uniform (argument-list).  */
   OMP_CLAUSE_UNIFORM,
 
+  /* OpenMP clause: to (extended-list).
+ Only when it appears in declare target.  */
+  OMP_CLAUSE_TO_DECLARE,
+
+  /* OpenMP clause: link (variable-list).  */
+  OMP_CLAUSE_LINK,
+
   /* OpenMP clause: from (variable-list).  */
   OMP_CLAUSE_FROM,
 
--- gcc/tree.c.jj   2015-07-14 14:49:57.0 +0200
+++ gcc/tree.c  2015-07-17 09:33:51.270692623 +0200
@@ -288,6 +288,8 @@ unsigned const char omp_clause_num_ops[]
   2, /* OMP_CLAUSE_ALIGNED  */
   1, /* OMP_CLAUSE_DEPEND  */
   1, /* OMP_CLAUSE_UNIFORM  */
+  1, /* OMP_CLAUSE_TO_DECLARE  */
+  1, /* OMP_CLAUSE_LINK  */
   2, /* OMP_CLAUSE_FROM  */
   2, /* OMP_CLAUSE_TO  */
   2, /* OMP_CLAUSE_MAP  */
@@ -357,6 +359,8 @@ const char * const omp_clause_code_name[
   "aligned",
   "depend",
   "uniform",
+  "to",
+  "link",
   "from",
   "to",
   "map",
@@ -11392,6 +11396,8 @@ walk_tree_1 (tree *tp, walk_tree_fn func
case OMP_CLAUSE_GRAINSIZE:
case OMP_CLAUSE_NUM_TASKS:
case OMP_CLAUSE_HINT:
+   case OMP_CLAUSE_TO_DECLARE:
+   case OMP_CLAUSE_LINK:
case OMP_CLAUSE_USE_DEVICE_PTR:
case OMP_CLAUSE_IS_DEVICE_PTR:
case OMP_CLAUSE__LOOPTEMP_:
--- gcc/tree-nested.c.jj2015-07-14 14:49:57.0 +0200
+++ gcc/tree-nested.c   2015-07-17 09:35:11.905507270 +0200
@@ -1098,6 +1098,8 @@ convert_nonlocal_omp_clauses (tree *pcla
case OMP_CLAUSE_FIRSTPRIVATE:
case OMP_CLAUSE_COPYPRIVATE:
case OMP_CLAUSE_SHARED:
+   case OMP_CLAUSE_TO_DECLARE:
+   case OMP_CLAUSE_LINK:
case OMP_CLAUSE_USE_DEVICE_PTR:
case OMP_CLAUSE_IS_DEVICE_PTR:
do_decl_clause:
@@ -17

Re: Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Richard Biener
On July 17, 2015 3:11:51 PM GMT+02:00, Ulrich Weigand  
wrote:
>On 07/09/2015 11:43 PM, Martin Liška wrote:
>
>> This final version which I agreed with Richard Sandiford.
>> Hope this can be finally installed to trunk?
>> 
>> Patch can bootstrap and survive regression tests on x86_64-linux-gnu.
>
>Unfortunately, this still crashes on my SPU toolchain build machine,
>for pretty much the same reason outlined here:
>https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00868.html
>
>However, the host compiler no longer miscompiles these lines:
>
>  empty_shared_hash = new shared_hash_def;
>  empty_shared_hash->refcount = 1;
>
>But that is simply because that "new" now goes to the default
>heap-based allocator from the standard library.  There is a
>"shared_hash_def_pool", but that's apparently not being used
>for anything -- this probably was not intended?
>
>
>But now the following lines are miscompiled:
>
>  elt_list *el = elt_list_pool.allocate ();
>  el->next = next;
>  el->elt = elt;
>
>from new_elt_list in cselib.c.  Again, the "allocate" call ends
>simply with a cast:
>
>  header = m_returned_free_list;
>  m_returned_free_list = header->next;
>
>  return (void *)(header);
>
>and type-based aliasing now states the access to "header->next"
>in allocate must not alias the access to "el->next" in new_elt_list,
>but clearly it does.
>
>(Since there is no C++ operator new involved at all anymore,
>this clearly violates even the C aliasing rules ...)
>
>I really think the allocate routine needs to be more careful to
>avoid violating aliasing, e.g. by using memcpy or union-based
>type-punning to access its free list info.

As far as I understand, the object allocator delegates construction to callers,
and thus in the above case cselib
would be responsible for calling placement new on the return value from
allocate.

Richard.

>Bye,
>Ulrich




Re: [gomp4] Remove device-specific filtering during parsing for OpenACC

2015-07-17 Thread Julian Brown
On Fri, 17 Jul 2015 14:57:14 +0200
Thomas Schwinge  wrote:

> Hi Julian!
> 
> On Thu, 16 Jul 2015 16:32:12 +0100, Julian Brown
>  wrote:
> > This patch removes the device-specific filtering (for NVidia PTX)
> > from the parsing stages of the host compiler (for the device_type
> > clause -- separately for C, C++ and Fortran) in favour of fully
> > parsing the device_type clauses, but not actually implementing
> > anything for them (device_type support is a feature that we're not
> > planning to implement just yet: the existing "support" is something
> > of a red herring).
> > 
> > With this patch, the parsed device_type clauses will be ready at OMP
> > lowering time whenever we choose to do something with them (e.g.
> > transforming them into a representation that can be streamed out and
> > re-read by the appropriate offload compiler). The representation is
> > more-or-less the same for all supported languages
> 
> Thanks!
> 
> > modulo clause ordering.
> 
> Is that something that a) doesn't need to be/already has been
> addressed (with your patch), or b) still needs to be addressed?

It's something that doesn't matter, I think: clauses are chained
together like this:

  num_gangs
  num_workers
  ...
  |
  device_type(foo)
  \__num_gangs(OMP_CLAUSE_DEVICE_TYPE_CLAUSES)
  |  num_workers
  |  ...
  device_type(bar)
  \__num_gangs
  |  num_workers
  |  ...
  V
  (OMP_CLAUSE_CHAIN)

"foo" and "bar" are OMP_CLAUSE_DEVICE_TYPE_DEVICES -- tree lists. The
Fortran front-end will emit num_gangs, num_workers etc. clauses in a
fixed order (irrespective of their order in the source program), but the
C and C++ frontends will emit them in the (reverse of the) order
encountered.

There isn't really a consumer for this information yet, but when there
is, it will just have to not care about that (which should be
straightforward, I think).

> > I've altered the dtype-*.* tests to account for the new behaviour
> > (and to not use e.g. mixed-case "nVidia" or "acc_device_nvidia"
> > names, which are contrary to the recommendations in the spec).
> 
> OpenACC 2.0a indeed seems to suggest that device_type arguments are
> case-sensitive -- contrary to the ACC_DEVICE_TYPE environment
> variable, which probably is where the idea came from to parse them
> case-insensitive.
> 
> As to the latter "invalid" names, I thought the idea has been to
> verify that the clauses following such device_types clauses are
> indeed ignored in the later processing.  (Obviously, there should've
> been comments indicating that, as otherwise that's very confusing --
> as we've just seen -- due to the similarity to the runtime library's
> acc_device_* device type values.)

Yes, and there are still some tests for that functionality. I figured
there wasn't much point in "over-testing" it, especially since none of
this code does that much yet.

> > OK to apply, or any comments?
> 
> Your commit r225927 appears to have caused:
> 
> [-PASS:-]{+FAIL: libgomp.fortran/declare-simd-2.f90   -O0
> (internal compiler error)+} {+FAIL:+}
> libgomp.fortran/declare-simd-2.f90   -O0  (test for excess errors)
> [-PASS:-]{+UNRESOLVED:+} libgomp.fortran/declare-simd-2.f90   -O0
> [-execution test-] [-PASS:-]{+compilation failed to produce
> executable+} [same for other optimization levels]
> 
> 
> [...]/source-gcc/libgomp/testsuite/libgomp.fortran/declare-simd-3.f90:17:0:
> internal compiler error: Segmentation fault 0xc39b6f crash_signal
> [...]/source-gcc/gcc/toplev.c:352
> 0x7043a8 gfc_trans_omp_clauses
> [...]/source-gcc/gcc/fortran/trans-openmp.c:2671
> 0x7049a8 gfc_trans_omp_declare_simd(gfc_namespace*)
> [...]/source-gcc/gcc/fortran/trans-openmp.c:4589
> 0x6b8542 gfc_get_extern_function_decl(gfc_symbol*)
> [...]/source-gcc/gcc/fortran/trans-decl.c:2025
> 0x6b878d gfc_get_extern_function_decl(gfc_symbol*)
> [...]/source-gcc/gcc/fortran/trans-decl.c:1820
> 0x6ce952 conv_function_val
> [...]/source-gcc/gcc/fortran/trans-expr.c:3601
> 0x6ce952 gfc_conv_procedure_call(gfc_se*, gfc_symbol*,
> gfc_actual_arglist*, gfc_expr*, vec*)
> [...]/source-gcc/gcc/fortran/trans-expr.c:5873 0x6cf4c2
> gfc_conv_expr(gfc_se*, gfc_expr*)
> [...]/source-gcc/gcc/fortran/trans-expr.c:7391 0x6d71d0
> gfc_trans_assignment_1 [...]/source-gcc/gcc/fortran/trans-expr.c:9127
> 0x692465 trans_code
> [...]/source-gcc/gcc/fortran/trans.c:1674
> 0x6fa457 gfc_trans_omp_code
> [...]/source-gcc/gcc/fortran/trans-openmp.c:2711
> 0x705410 gfc_trans_omp_do
> [...]/source-gcc/gcc/fortran/trans-openmp.c:3459
> 0x707f9f gfc_trans_omp_directive(gfc_code*)
> [...]/source-gcc/gcc/fortran/trans-openmp.c:4521
> 0x6922b7 trans_code
> [...]/source-gcc/gcc/fortran/trans.c:1924
> 0x6c0660 gfc_generate_function_code(gfc_namespace*)
> [...]/source-gcc/gcc/fortran/trans-decl.c:6231
> 0x64d630 translate_all_program_units

Re: Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Ulrich Weigand
Richard Biener wrote:
> On July 17, 2015 3:11:51 PM GMT+02:00, Ulrich Weigand  
> wrote:
> >(Since there is no C++ operator new involved at all anymore,
> >this clearly violates even the C aliasing rules ...)
> >
> >I really think the allocate routine needs to be more careful to
> >avoid violating aliasing, e.g. by using memcpy or union-based
> >type-punning to access its free list info.
> 
> As far as I understand the object allocator delegates construction to callers 
> and thus in the above case cselib
> Would be responsible for calling placement new on the return value from
> Allocate.

Ah, it looks like I was wrong above: the code uses the *object*
allocator, so it should go through a placement new here:
  inline T *
  allocate () ATTRIBUTE_MALLOC
  {
return ::new (m_allocator.allocate ()) T ();
  }

It's still being miscompiled at least by my GCC 4.1 host compiler ...

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  ulrich.weig...@de.ibm.com



Re: PATCH (v2) trunk GCCJIT: adding gcc_jit_context_new_rvalue_from_long_long, etc...

2015-07-17 Thread David Malcolm
On Thu, 2015-07-16 at 11:00 +0200, Basile Starynkevitch wrote:
> On 07/15/2015 21:16, David Malcolm wrote:
> > Perhaps, but note that nothing in a regular gcc bootstrap uses
> > libgccjit, so you *might* still have a latent linking error that
> shows
> > up only at run time.   Running the jit testsuite is the best way to
> be
> > sure.
> >
> >> And I'm testing that on
> >> x86-64/Linux where the patch is almost useless.
> >>
> >> Thanks for your other comments. I'm trying to understand them and I
> am
> >> working on that.
> >>
> >> Cheers
> >>
> 
> Here (attached gcc-jitlonglong-r225860.diff)
> is an improved version of my patch against trunk r225860.
> Thanks to David Malcolm for the kind help.

Thanks.   Comments inline below throughout.

> ### gcc/jit/ ChangeLog entry
> 
> 2015-07-16  Basile Starynkevitch  
> 
>  * jit-playback.c: Mention that it is in C++.
>  (new_rvalue_from_const ): New.
> 
>  * jit-recording.c: Mention that it is in C++.
>  (recording::memento_of_new_rvalue_from_const ): New
>  instantiated template.
>  (memento_of_new_rvalue_from_const  long>::make_debug_string):
>  New specialized function.
>  (memento_of_new_rvalue_from_const ::get_wide_int): New
>  specialized function.
>  (recording::memento_of_new_rvalue_from_const   long>::write_reproducer): Likewise.
> 
>  * libgccjit.c: Mention that it is in C++.
>  (gcc_jit_context_new_rvalue_from_long_long): New function.
>  (gcc_jit_context_new_rvalue_from_int32): New function.
>  (gcc_jit_context_new_rvalue_from_int64): New function.
>  (gcc_jit_context_new_rvalue_from_intptr): New function.
> 
>  * libgccjit.h: #include <stdint.h>
>  (gcc_jit_context_new_rvalue_from_long_long): New declaration.
>  In the declarations of the functions below, a short comment
>  explains that they are convenience functions.
>  (gcc_jit_context_new_rvalue_from_int32): New declaration.
>  (gcc_jit_context_new_rvalue_from_int64): New declaration.
>  (gcc_jit_context_new_rvalue_from_intptr): New declaration.
> 
>  * libgccjit.map: Add LIBGCCJIT_ABI_4 for new functions
>  e.g. gcc_jit_context_new_rvalue_from_long_long, 
> 
> 
> ## gcc/testsuite/ChangeLog entry
>  * test-constants.c (make_test_of_long_long_constant): New
> function.
>  (make_tests_of_long_long_constants): New.
>  (verify_long_long_constants): New.
>  (create_code): Call make_tests_of_long_long_constants.
>  (verify_code): Call verify_long_long_constants.
> 
> 
> I have mixed feelings about adding the 
> gcc_jit_context_new_rvalue_from_int32 
> gcc_jit_context_new_rvalue_from_int64 & 
> gcc_jit_context_new_rvalue_from_intptr functions.
> On one hand, their name is very suggestive, and most programmers know 
> about <stdint.h>. I should confess that I discovered only recently
> that 
> long long is guaranteed by C99 standard to be at least 64 bits (I 
> thought that the standard just required that long long is at least as 
> big as long).
> On the other hand, we are adding more functions to the ABI, and
> indeed 
> the gcc_jit_context_new_rvalue_from_long_long is in principle enough. 
> Perhaps we should simply document that for int32_t, int64_t, intptr_t 
> types, the GCCJIT user should test the sizeof intptr_t and call the 
> appropriate function?
> 
> BTW some bytecodes or VMs (in particular the JVM) are hardcoding the 
> size of some integers, so dealing explicitly with int32_t & int64_t 
> definitely makes sense.


I'd prefer to just have:

  gcc_jit_context_new_rvalue_from_long_long
  gcc_jit_context_new_rvalue_from_unsigned_long_long

and not bother with the int32_t, int64_t, intptr_t.

I think if we're adding "long long", we need "unsigned long long", given
that there are values of the latter that aren't yet expressible without
resorting to hacks.

Note that in:
   gcc_jit_context_new_rvalue_from_SOME_TYPE
"SOME_TYPE" refers to a *host* type, whereas the gcc_jit_type parameter
expresses the *target* type.

So arguably we don't need lots of support for different host types: if
you need to populate an int32_t constant, you can do:

gcc_jit_type *t_int_32_t =
  gcc_jit_context_get_int_type (ctxt,
sizeof (int32_t), /* int num_bytes */
1); /* int is_signed*/


to get the target type (assuming sizeof (int32_t) on the host is the
same as that on the target, which for these types is of course the
case).

To build an rvalue for a specific int constant we can then simply use a
big enough host type, say your new long long entrypoint:

gcc_jit_rvalue *rval_const = 
  gcc_jit_context_new_rvalue_from_long_long (ctxt,
 t_int32_t,
 INT32_MAX);

given that INT32_MAX will fit in a host "long long".

and indeed if you're using the C++ bindings you can get at integer types
via some template "magic" like this:

  gccjit::type t_int_32_t = ctxt.get_int_type  (

Re: Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Martin Liška
On 07/17/2015 03:44 PM, Ulrich Weigand wrote:
> Richard Biener wrote:
>> On July 17, 2015 3:11:51 PM GMT+02:00, Ulrich Weigand  
>> wrote:
>>> (Since there is no C++ operator new involved at all anymore,
>>> this clearly violates even the C aliasing rules ...)
>>>
>>> I really think the allocate routine needs to be more careful to
>>> avoid violating aliasing, e.g. by using memcpy or union-based
>>> type-punning to access its free list info.
>>
>> As far as I understand the object allocator delegates construction to 
>> callers and thus in the above case cselib
>> Would be responsible for calling placement new on the return value from
>> Allocate.
> 
> Ah, it looks like I was wrong above: the code uses the *object*
> allocator, so it should go through a placement new here:
>   inline T *
>   allocate () ATTRIBUTE_MALLOC
>   {
> return ::new (m_allocator.allocate ()) T ();
>   }
> 
> It's still being miscompiled at least by my GCC 4.1 host compiler ...
> 
> Bye,
> Ulrich
> 

Hi.

I was just about to write to you that it really does use a placement new :)
The first example, which bypasses the pool allocator, is of course a bug; I'll fix it.

The question is: why does the aliasing oracle still wrongly alias these pointers?
Another option (suggested by Martin Jambor) would be to move the ::allocate
implementation into the alloc-pool.c file.

Thoughts?
Martin



Re: [Fortran, Patch] Passing function pointer to co_reduce

2015-07-17 Thread Mikael Morin
Le 17/07/2015 11:02, Mikael Morin a écrit :
> Le 16/07/2015 16:34, Damian Rouson a écrit :
>> Alternatively, if it’s easy, please feel free to add the directives and 
>> commit.  
>>

> Never mind, I'll take care of it all.
> 
This is what I have committed:
https://gcc.gnu.org/r225930 (trunk)
https://gcc.gnu.org/r225932 (5 branch)

Mikael
Index: gcc/testsuite/gfortran.dg/co_reduce_1.f90
===
--- gcc/testsuite/gfortran.dg/co_reduce_1.f90	(révision 0)
+++ gcc/testsuite/gfortran.dg/co_reduce_1.f90	(révision 225930)
@@ -0,0 +1,33 @@
+! { dg-do compile }
+! { dg-additional-options "-fdump-tree-original -fcoarray=lib" }
+!
+! Check that we don't take twice the address of procedure simple_reduction
+! in the generated code.
+!
+! Contributed by Alessandro Fanfarillo 
+
+program simple_reduce
+  implicit none
+
+  integer :: me
+
+  me = this_image()
+
+  sync all
+
+  call co_reduce(me,simple_reduction)
+
+  write(*,*) this_image(),me
+
+contains
+  
+  pure function simple_reduction(a,b)
+integer,intent(in) :: a,b
+integer :: simple_reduction
+
+simple_reduction = a * b
+  end function simple_reduction
+
+end program simple_reduce
+
+! { dg-final { scan-tree-dump "_gfortran_caf_co_reduce \\(&desc\\.\\d+,\\s*simple_reduction," "original" } }
Index: gcc/testsuite/ChangeLog
===
--- gcc/testsuite/ChangeLog	(révision 225929)
+++ gcc/testsuite/ChangeLog	(révision 225930)
@@ -1,3 +1,7 @@
+2015-07-17  Alessandro Fanfarillo  
+
+	* gfortran.dg/co_reduce_1.f90: New file.
+
 2015-07-17  H.J. Lu  
 
 	PR target/66824
Index: gcc/fortran/ChangeLog
===
--- gcc/fortran/ChangeLog	(révision 225929)
+++ gcc/fortran/ChangeLog	(révision 225930)
@@ -1,3 +1,8 @@
+2015-07-17  Alessandro Fanfarillo  
+
+	* trans-intrinsic.c (conv_co_collective): Remove redundant address
+	operator in the generated code.
+
 2015-07-17  Andre Vehreschild  
 
 	PR fortran/66035
Index: gcc/fortran/trans-intrinsic.c
===
--- gcc/fortran/trans-intrinsic.c	(révision 225929)
+++ gcc/fortran/trans-intrinsic.c	(révision 225930)
@@ -8804,7 +8804,7 @@
 	}
   opr_flags = build_int_cst (integer_type_node, opr_flag_int);
   gfc_conv_expr (&argse, opr_expr);
-  opr = gfc_build_addr_expr (NULL_TREE, argse.expr);
+  opr = argse.expr;
   fndecl = build_call_expr_loc (input_location, fndecl, 8, array, opr, opr_flags,
 image_index, stat, errmsg, strlen, errmsg_len);
 }


[gomp4.1] Fix linear-2.{c,C} testcases

2015-07-17 Thread Jakub Jelinek
Hi!

These tests had a thinko: the computation was performed on the offloaded copy
of the variable `a', but then tested on the host side, without #pragma omp target
update or similar.
Fixed thusly.

I'm still seeing
FAIL: libgomp.c/for-5.c (internal compiler error)
FAIL: libgomp.c/for-5.c (test for excess errors)
FAIL: libgomp.c++/for-13.C (internal compiler error)
FAIL: libgomp.c++/for-13.C (test for excess errors)
which is some LTO ICE.

2015-07-17  Jakub Jelinek  

* testsuite/libgomp.c/linear-2.c (verify): New function.
(TEST): Use it.
* testsuite/libgomp.c++/linear-2.C (verify): New function.
(TEST): Use it.

--- libgomp/testsuite/libgomp.c/linear-2.c.jj   2015-06-18 15:16:18.0 
+0200
+++ libgomp/testsuite/libgomp.c/linear-2.c  2015-07-17 15:23:45.0 
+0200
@@ -200,17 +200,28 @@ f18 (long long int i, long long int k, s
 }
 }
 
+void
+verify (void)
+{
+  int err;
+  #pragma omp target map(from:err)
+  {
+err = 0;
+for (int i = 0; i < 256; i++)
+  if (a[i] != (((i & 3) == 0 && i >= 8
+   && i < 8 + 48 * 4)
+  ? ((i - 8) / 4) + 16 : 0))
+   err = 1;
+__builtin_memset (a, 0, sizeof (a));
+  }
+  if (err)
+__builtin_abort ();
+}
+
 int
 main ()
 {
-#define TEST(x) \
-  x;   \
-  for (int i = 0; i < 256; i++)\
-if (a[i] != (((i & 3) == 0 && i >= 8   \
- && i < 8 + 48 * 4)\
-? ((i - 8) / 4) + 16 : 0)) \
-  __builtin_abort ();  \
-  __builtin_memset (a, 0, sizeof (a))
+#define TEST(x) x; verify ()
   TEST (f1 (8));
   TEST (f2 (8, 3));
   TEST (f3 (8LL, 4LL));
--- libgomp/testsuite/libgomp.c++/linear-2.C.jj 2015-06-18 15:16:18.0 
+0200
+++ libgomp/testsuite/libgomp.c++/linear-2.C2015-07-17 15:26:20.190269857 
+0200
@@ -204,17 +204,28 @@ f18 (long long int i, long long int k, s
 }
 }
 
+void
+verify (void)
+{
+  int err;
+  #pragma omp target map(from:err)
+  {
+err = 0;
+for (int i = 0; i < 256; i++)
+  if (a[i] != (((i & 3) == 0 && i >= 8
+   && i < 8 + 48 * 4)
+  ? ((i - 8) / 4) + 16 : 0))
+   err = 1;
+__builtin_memset (a, 0, sizeof (a));
+  }
+  if (err)
+__builtin_abort ();
+}
+
 int
 main ()
 {
-#define TEST(x) \
-  x;   \
-  for (int i = 0; i < 256; i++)\
-if (a[i] != (((i & 3) == 0 && i >= 8   \
- && i < 8 + 48 * 4)\
-? ((i - 8) / 4) + 16 : 0)) \
-  __builtin_abort ();  \
-  __builtin_memset (a, 0, sizeof (a))
+#define TEST(x) x; verify ()
   int vi = 8;
   TEST (f1 (vi));
   TEST (f2 (8, 3));

Jakub


Re: [PATCH][AArch64] Improve csinc/csneg/csinv opportunities on immediates

2015-07-17 Thread Kyrill Tkachov


On 10/07/15 14:45, Kyrill Tkachov wrote:

On 10/07/15 10:00, pins...@gmail.com wrote:




On Jul 10, 2015, at 1:47 AM, Kyrill Tkachov  wrote:

Hi Andrew,


On 10/07/15 09:40, pins...@gmail.com wrote:




On Jul 10, 2015, at 1:34 AM, Kyrill Tkachov  wrote:

Hi all,

Currently when evaluating expressions like (a ? 24 : 25) we will move 24 and 25 
into
registers and perform a csel on them.  This misses the opportunity to instead 
move just 24
into a register and then perform a csinc, saving us an instruction and a 
register use.
Similarly for csneg and csinv.

This patch implements that idea by allowing such pairs of immediates in 
*cmov_insn
and adding an early splitter that performs the necessary transformation.

The testcase included in the patch demonstrates the kind of opportunities that 
are now picked up.

With this patch I see about 9.6% more csinc instructions being generated for 
SPEC2006
and the generated code looks objectively better (i.e. fewer mov-immediates and 
slightly
lower register pressure).

Bootstrapped and tested on aarch64.

Ok for trunk?

I think this is the wrong place for this optimization. It should happen in 
expr.c and we should produce cond_expr on the gimple level.

I had considered it, but I wasn't sure how general the conditional 
increment/negate/inverse operations
are to warrant a midend implementation. Do you mean the 
expand_cond_expr_using_cmove function in expr.c?

Yes and we can expand it to even have a target hook on how to expand them if 
needed.

I played around in that part and it seems that by the time it gets to expansion 
the midend
doesn't have a cond_expr of the two immediates, it's a PHI node with the 
immediates already expanded.
I have not been able to get it to match a cond_expr of two immediates there, 
although that could be
because I'm unfamiliar with that part of the codebase.


So by the time we reach expansion time we don't have a COND_EXPR of two 
immediates, so I tried getting
the code in expr.c to do the right thing, but it didn't work out.
This patch catches this opportunity at the RTL level and could catch such cases 
if they were to be
generated by any of the pre-combine RTL passes. Or do you reckon looking for 
these patterns in RTL
ifcvt is the way to go? I think it would be somewhat messy to express the 
CSNEG, CSINV opportunities
there as we don't have optabs for conditional negate and invert, but 
conditional increment would work,
though in the aarch64 case we can only do a conditional increment by 1 rather than a 
general conditional add.

Kyrill




Kyrill


There is already a standard pattern for conditional add, so the a ? const1 : 
const2 case can be handled in a generic way without much trouble. We should 
handle it better in RTL ifcvt too (that should be an easier patch). The neg 
and not cases are very target-specific but can be handled by a target hook that 
expands directly to them.


   
I have patches to do both but I have not got around to cleaning them up. If anyone wants them, I can send a link to my current gcc 5.1 sources with them included.

Any chance you can post them on gcc-patches even as a rough idea of what needs 
to be done?

I posted my expr patch a few years ago but I never got around to addressing 
rth's comments. This was the generic increment patch. Basically aarch64 should 
be implementing that pattern too.


The main reason why this should be handled in gimple is that ifcvt on the rtl 
level is not cheap and does not catch all of the cases the simple expansion of 
phi-opt does. I can dig that patch up and I will be doing that next week 
anyways.

Thanks,
Andrew


Thanks,
Kyrill

   
Thanks,

Andrew


Thanks,
Kyrill

2015-07-10  Kyrylo Tkachov  

 * config/aarch64/aarch64.md (*cmov_insn): Move stricter
 check for operands 3 and 4 to pattern predicate.  Allow immediates
 that can be expressed as csinc/csneg/csinv.  New define_split.
 (*csinv3_insn): Rename to...
 (csinv3_insn): ... This.
 * config/aarch64/aarch64.h (AARCH64_IMMS_OK_FOR_CSNEG): New macro.
 (AARCH64_IMMS_OK_FOR_CSINC): Likewise.
 (AARCH64_IMMS_OK_FOR_CSINV): Likewise.
 * config/aarch64/aarch64.c (aarch64_imms_ok_for_cond_op_1):
 New function.
 (aarch64_imms_ok_for_cond_op): Likewise.
 * config/aarch64/aarch64-protos.h (aarch64_imms_ok_for_cond_op_1):
 Declare prototype.
 (aarch64_imms_ok_for_cond_op): Likewise.

2015-07-10  Kyrylo Tkachov  

 * gcc.target/aarch64/cond-op-imm_1.c: New test.





Re: constify target offload data

2015-07-17 Thread Nathan Sidwell

On 07/17/15 07:05, Jakub Jelinek wrote:

On Fri, Jul 17, 2015 at 02:01:12PM +0300, Ilya Verbin wrote:

On Thu, Jul 16, 2015 at 16:08:47 -0400, Nathan Sidwell wrote:

Jakub, Ilya,
this patch against trunk constifies the offload target data.  I'm
having difficulty building an intelmic toolchain, so the changes
there aren't tested. Ilya, if you could check them, that'd be great.


Works fine with one change:


diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp 
b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
index 136fb99..baa4945 100644
--- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
+++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
@@ -61,7 +61,7 @@ typedef std::vector AddrVect;
  typedef std::vector DevAddrVect;

  /* Addresses for all images and all devices.  */
-typedef std::map ImgDevAddrMap;
+typedef std::map ImgDevAddrMap;

  /* Image descriptor needed by __offload_[un]register_image.  */
  struct TargetImageDesc {


The patch is ok for trunk with that change then.


Thanks!

Have you had a chance to review the host-side const patch?

nathan



Re: PATCH (v2) trunk GCCJIT: adding gcc_jit_context_new_rvalue_from_long_long, etc...

2015-07-17 Thread David Malcolm
On Fri, 2015-07-17 at 09:40 -0400, David Malcolm wrote:
> On Thu, 2015-07-16 at 11:00 +0200, Basile Starynkevitch wrote:

(snip)

> > + 
> > +/* Specialization of making an rvalue from a const, for host <long long>.  */
> > +
> > +template <>
> > +rvalue *
> > +context::
> > +new_rvalue_from_const  (type *type,
> > +  long long value)
> > +{
> > +  // FIXME: type-checking, or coercion?
> > +  tree inner_type = type->as_tree ();
> > +  if (INTEGRAL_TYPE_P (inner_type))
> > +{
> > +  tree inner = build_int_cst (inner_type, value);
> > +  return new rvalue (this, inner);
> > +}
> > +  else
> > +{
> > +  REAL_VALUE_TYPE real_value;
> > +  real_from_integer (&real_value, VOIDmode, value, SIGNED);
> > +  tree inner = build_real (inner_type, real_value);
> > +  return new rvalue (this, inner);
> > +}
> > +}
> 
> This is probably out-of-scope for this patch, but seeing this made me
> wonder if we ought to add a validation to the various
> "new_rvalue_from_TYPE" APIs to ensure that the host value will fit in
> the target type, and emit an error on the context if the given host
> value won't fit.

Filed as PR jit/66913 for now.  The validation would happen in
libgccjit.c.

(snip)




Re: [PATCH][AArch64][6/14] Implement TARGET_OPTION_SAVE/TARGET_OPTION_RESTORE

2015-07-17 Thread Kyrill Tkachov



This is a slight respin of this patch, handling the -moverride string more 
gracefully.
We need to explicitly save and restore it in TARGET_OPTION_SAVE otherwise the 
option gen machinery
gets confused about its type and during its printing uses the wrong format code 
for the pointer, leading to a warning that may trigger during bootstrap.

Otherwise it is the same as the previous version.

Bootstrapped and tested on aarch64.
I'd like to propose this version instead of the original.

Ok?

Thanks,
Kyrill

2015-07-17  Kyrylo Tkachov  

* config/aarch64/aarch64.opt (explicit_tune_core): New TargetVariable.
(explicit_arch): Likewise.
(x_aarch64_isa_flags): Likewise.
(mgeneral-regs-only): Mark as Save.
(mfix-cortex-a53-835769): Likewise.
(mcmodel=): Likewise.
(mstrict-align): Likewise.
(momit-leaf-frame-pointer): Likewise.
(mtls-dialect): Likewise.
(master=): Likewise.
* config/aarch64/aarch64.h (ASM_DECLARE_FUNCTION_NAME): Define.
(aarch64_isa_flags): Remove extern declaration.
* config/aarch64/aarch64.c (aarch64_validate_mcpu): Return a bool
to indicate success or failure.
(aarch64_validate_march): Likewise.
(aarch64_validate_mtune): Likewise.
(aarch64_isa_flags): Delete.
(aarch64_override_options_internal): Access opts->x_aarch64_isa_flags
instead of aarch64_isa_flags.
(aarch64_get_tune_cpu): New function.
(aarch64_get_arch): Likewise.
(aarch64_override_options): Use above and set up explicit_tune_core
and explicit_arch.
(aarch64_print_extension): Move earlier in file.  Add isa_flags
argument and use that instead of the global aarch64_isa_flags.
(aarch64_option_save): New function.
(aarch64_option_restore): Likewise.
(aarch64_option_print): Likewise.
(aarch64_declare_function_name): Likewise.
(aarch64_start_file): Delete.
(TARGET_ASM_FILE_START): Do not define.
(TARGET_OPTION_RESTORE, TARGET_OPTION_PRINT): Define.
* config/aarch64/aarch64-protos.h (aarch64_declare_function_name):
Declare prototype.


On 16/07/15 16:20, Kyrill Tkachov wrote:

Hi all,

This is one of the main patches in the series.
The backend compilation state can be described by the options in aarch64.opt 
marked as Save.
This causes the options-save.c machinery to save and restore them when asked 
to, and the
TARGET_OPTION_SAVE and TARGET_OPTION_RESTORE should handle all the extra stuff 
that's required
to reinitialise the backend.

This patch marks the options that we want to support for SWITCHABLE_TARGET as 
Save and adds 3
extra variables: explicit_tune_core, explicit_arch and x_aarch64_isa_flags.
These 3 variables are used to store the explicit core to tune for (as specified 
by -mcpu or -mtune),
the explicitly specified architecture (as specified by -mcpu or -march) and the 
architecture
features (as specified by the extension string to -march,-mcpu or derived from 
them).

The aarch64_isa_flags definition is moved from aarch64.c into aarch64.opt and 
marked as a TargetVariable.
This means that the auto-generated machinery in options-save.c will 
automatically save and restore it for us.

The patch defines the TARGET_OPTION_RESTORE hook to extract the selected_tune 
and selected_arch from the
explicit_tune_core and explicit_arch variables and restore the backend 
compilation state using the
aarch64_override_options_internal machinery that we refactored earlier.

A TARGET_OPTION_PRINT implementation is added to print out the explicit_arch 
and explicit_tune_core options,
as well as aarch64_isa_flags.

As preparation for SWITCHABLE_TARGETS this patch also changes the output 
assembly format a bit.
Since we want to potentially handle multiple values of aarch64_isa_flags within 
a file in the future, we don't
want to just print out a global .arch or .cpu directive in the beginning of the 
assembly file.
Instead, we want to print out the .arch directive on a per-function basis. This 
is accomplished by
defining the ASM_DECLARE_FUNCTION_NAME hook and printing out selected_arch and 
aarch64_isa_flags there.
As an added bonus we can print out the tuning name in the comments and since we 
added a proper ident
field to the processor struct that we store in explicit_tune_core, we can print 
out the full tune name
in an assembly comment.

For example, compiling with -mcpu=cortex-a57.cortex-a53 we now get:

  .file   "sha1_1.c"
  .text
  .align  2
  .p2align 4,,15
  .global foo
  .arch armv8-a+fp+simd+crc
  //.tune cortex-a57.cortex-a53
  .type   foo, %function
foo:
  add w0, w0, 5
  ret
  .size   foo, .-foo
  .ident  "GCC: (unknown) 6.0.0 20150522 (experimental)"

instead of:
  .cpu cortex-a57+fp+simd+crc
  .file   "sha1_1.c"
  .text
  .align  2
  .p2align 4,,15
  .global foo
  .type   foo, %function
foo:
  add w0, w0, 5
  ret
  

Re: [Fortran, Patch] Passing function pointer to co_reduce

2015-07-17 Thread Damian Rouson
Thank you!

Damian

> On Jul 17, 2015, at 9:50 AM, Mikael Morin  wrote:
> 
> Le 17/07/2015 11:02, Mikael Morin a écrit :
>> Le 16/07/2015 16:34, Damian Rouson a écrit :
>>> Alternatively, if it’s easy, please feel free to add the directives and 
>>> commit.  
>>> 
> 
>> Never mind, I'll take care of it all.
>> 
> This is what I have committed:
> https://gcc.gnu.org/r225930 (trunk)
> https://gcc.gnu.org/r225932 (5 branch)
> 
> Mikael
> 



Re: [Fortran, Patch] Passing function pointer to co_reduce

2015-07-17 Thread Alessandro Fanfarillo
Thanks a lot!

2015-07-17 15:50 GMT+02:00 Mikael Morin :
> Le 17/07/2015 11:02, Mikael Morin a écrit :
>> Le 16/07/2015 16:34, Damian Rouson a écrit :
>>> Alternatively, if it’s easy, please feel free to add the directives and 
>>> commit.
>>>
>
>> Never mind, I'll take care of it all.
>>
> This is what I have committed:
> https://gcc.gnu.org/r225930 (trunk)
> https://gcc.gnu.org/r225932 (5 branch)
>
> Mikael


Re: Constify host-side offload data`

2015-07-17 Thread Jakub Jelinek
On Thu, Jul 16, 2015 at 08:20:53AM -0400, Nathan Sidwell wrote:
> On 07/16/15 07:41, Ilya Verbin wrote:
> >On Wed, Jul 15, 2015 at 20:56:50 -0400, Nathan Sidwell wrote:
> >>Index: gcc/config/nvptx/mkoffload.c
> >>===
> >>-  fprintf (out, "extern void *__OFFLOAD_TABLE__[];\n\n");
> >>+  fprintf (out, "extern const void *conat __OFFLOAD_TABLE__[];\n\n");
> >
> >Here is a typo.
> 
> Thanks, caught that myself too.  testing shows the patch ok for x86-linux/ptx

Ok for trunk with that change.

Jakub


Re: [PATCH] fix in-tree-binutils builds

2015-07-17 Thread H.J. Lu
On Wed, Jul 15, 2015 at 9:47 AM, Mike Stump  wrote:
> On Jul 15, 2015, at 9:07 AM, H.J. Lu  wrote:
>> On Wed, Jul 15, 2015 at 1:03 AM, Jan Beulich  wrote:
>>>
>>> - $gcc_cv_as_gas_srcdir/configure.in \
>>> + $gcc_cv_as_gas_srcdir/configure.[ai][cn] \
>>>  $gcc_cv_as_gas_srcdir/Makefile.in ; do
>>>   gcc_cv_gas_version=`sed -n -e 's/^[[ 
>>> ]]*VERSION=[[^0-9A-Za-z_]]*\([[0-9]]*\.[[0-9]]*.*\)/VERSION=\1/p' < $f`
>>
>> How portable is [ai][cn]?
>
> Should be portable enough.

Are there any objections to this patch?

-- 
H.J.


Re: [gomp4.1] Fix linear-2.{c,C} testcases

2015-07-17 Thread Ilya Verbin
On Fri, Jul 17, 2015 at 15:54:13 +0200, Jakub Jelinek wrote:
> These tests had a thinko, computation performed on the offloaded copy of the
> a variable, but then tested on the host side, without #pragma omp target
> update or similar.
> Fixed thusly.

In my testing linear-2.C still causes SIGSEGV on target in f1:

   0x76fc3872 <_Z2f1IiEvRT_._omp_fn.29(void)>:  push   %rbp
   0x76fc3873 <_Z2f1IiEvRT_._omp_fn.29(void)+1>:mov%rsp,%rbp
   0x76fc3876 <_Z2f1IiEvRT_._omp_fn.29(void)+4>:push   %rbx
   0x76fc3877 <_Z2f1IiEvRT_._omp_fn.29(void)+5>:sub$0x48,%rsp
   0x76fc387b <_Z2f1IiEvRT_._omp_fn.29(void)+9>:mov%rdi,-0x48(%rbp)
   0x76fc387f <_Z2f1IiEvRT_._omp_fn.29(void)+13>:   lea-0x34(%rbp),%rax
   0x76fc3883 <_Z2f1IiEvRT_._omp_fn.29(void)+17>:   mov%rax,-0x18(%rbp)
   0x76fc3887 <_Z2f1IiEvRT_._omp_fn.29(void)+21>:   mov-0x48(%rbp),%rax
   0x76fc388b <_Z2f1IiEvRT_._omp_fn.29(void)+25>:   mov(%rax),%rax
   0x76fc388e <_Z2f1IiEvRT_._omp_fn.29(void)+28>:   mov(%rax),%rax
=> 0x76fc3891 <_Z2f1IiEvRT_._omp_fn.29(void)+31>:   mov(%rax),%edx

(gdb) x $rax
0x7fff537fc1ec: Cannot access memory at address 0x7fff537fc1ec

Probably something wasn't mapped.


> I'm still seeing
> FAIL: libgomp.c/for-5.c (internal compiler error)
> FAIL: libgomp.c/for-5.c (test for excess errors)
> FAIL: libgomp.c++/for-13.C (internal compiler error)
> FAIL: libgomp.c++/for-13.C (test for excess errors)
> which is some LTO ICE.

I've never seen such ICEs before...

  -- Ilya


Re: fixup gomp register/unregister prototypes

2015-07-17 Thread Jakub Jelinek
On Wed, Jul 15, 2015 at 03:04:30PM -0400, Nathan Sidwell wrote:
> I'm almost tempted to commit as obvious.  I noticed that the callers of
> these functions from code generated by mkoffload declare the 2nd arg as an
> int, because they have no visibility of the enum.  I thought it wise to make
> the definitions match.

I think it doesn't really matter that much, such an enum on all targets we care
about is passed the same way as int.

But if you think it is important, I don't really care either way.

> 2015-07-15  Nathan Sidwell  
> 
>   * target.c (GOMP_offload_register): Use int for device type arg.
>   (GOMP_offload_unregister): Likewise.

Jakub


Re: [gomp4.1] Fix linear-2.{c,C} testcases

2015-07-17 Thread Jakub Jelinek
On Fri, Jul 17, 2015 at 05:43:09PM +0300, Ilya Verbin wrote:
> On Fri, Jul 17, 2015 at 15:54:13 +0200, Jakub Jelinek wrote:
> > These tests had a thinko, computation performed on the offloaded copy of the
> > a variable, but then tested on the host side, without #pragma omp target
> > update or similar.
> > Fixed thusly.
> 
> In my testing linear-2.C still causes SIGSEGV on target in f1:
> 
>0x76fc3872 <_Z2f1IiEvRT_._omp_fn.29(void)>:  push   %rbp
>0x76fc3873 <_Z2f1IiEvRT_._omp_fn.29(void)+1>:mov%rsp,%rbp
>0x76fc3876 <_Z2f1IiEvRT_._omp_fn.29(void)+4>:push   %rbx
>0x76fc3877 <_Z2f1IiEvRT_._omp_fn.29(void)+5>:sub$0x48,%rsp
>0x76fc387b <_Z2f1IiEvRT_._omp_fn.29(void)+9>:mov
> %rdi,-0x48(%rbp)
>0x76fc387f <_Z2f1IiEvRT_._omp_fn.29(void)+13>:   lea
> -0x34(%rbp),%rax
>0x76fc3883 <_Z2f1IiEvRT_._omp_fn.29(void)+17>:   mov
> %rax,-0x18(%rbp)
>0x76fc3887 <_Z2f1IiEvRT_._omp_fn.29(void)+21>:   mov
> -0x48(%rbp),%rax
>0x76fc388b <_Z2f1IiEvRT_._omp_fn.29(void)+25>:   mov(%rax),%rax
>0x76fc388e <_Z2f1IiEvRT_._omp_fn.29(void)+28>:   mov(%rax),%rax
> => 0x76fc3891 <_Z2f1IiEvRT_._omp_fn.29(void)+31>:   mov(%rax),%edx
> 
> (gdb) x $rax
> 0x7fff537fc1ec: Cannot access memory at address 0x7fff537fc1ec
> 
> Probably something wasn't mapped.

Yeah, I've noticed too.  Most likely this is about the implicit
map(tofrom:i) where i is int &.

I think it boils down to:
#pragma omp declare target
void use (int &);
#pragma omp end declare target

void
foo (int &p)
{
  #pragma omp target map(tofrom:p)
  {
use (p);
  }
}
where we map the reference (i.e. the pointer), rather than what it points
to plus pointer translate the reference to the new copy.
Guess I'll ask on omp-lang.

Jakub


Re: Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Richard Biener
On July 17, 2015 3:50:19 PM GMT+02:00, "Martin Liška"  wrote:
>On 07/17/2015 03:44 PM, Ulrich Weigand wrote:
>> Richard Biener wrote:
>>> On July 17, 2015 3:11:51 PM GMT+02:00, Ulrich Weigand
> wrote:
 (Since there is no C++ operator new involved at all anymore,
 this clearly violates even the C aliasing rules ...)

 I really think the allocate routine needs to be more careful to
 avoid violating aliasing, e.g. by using memcpy or union-based
 type-punning to access its free list info.
>>>
>>> As far as I understand the object allocator delegates construction
>to callers and thus in the above case cselib
>>> Would be responsible for calling placement new on the return value
>from
>>> Allocate.
>> 
>> Ah, it looks like I was wrong above: the code uses the *object*
>> allocator, so it should go through a placement new here:
>>   inline T *
>>   allocate () ATTRIBUTE_MALLOC
>>   {
>> return ::new (m_allocator.allocate ()) T ();
>>   }
>> 
>> It's still being miscompiled at least by my GCC 4.1 host compiler ...
>> 
>> Bye,
>> Ulrich
>> 
>
>Hi.
>
>I've just wanted to write you that it really utilizes a placement new
>:)
>The first example that bypasses pool allocator is of course a bug, I'll
>fix.
>
>Question is why aliasing oracle still wrongly aliases these pointers?
>Another option (suggested by Martin Jambor) would be to place
>::allocate implementation
>to alloc-pool.c file.

Note that all compilers up to 4.4 have aliasing issues with placement new.
A fix is to move the placement new out-of-line.

Richard.

>Thoughts?
>Martin




Re: [PATCH] Move A - (A & B) -> ~B & A

2015-07-17 Thread Marek Polacek
On Fri, Jul 17, 2015 at 02:23:50PM +0200, Richard Biener wrote:
> >+/* Fold A - (A & B) into ~B & A.  */
> >+(simplify
> >+ (minus (convert? @0) (convert? (bit_and:c @0 @1)))
> 
> I think you want :s on the bit_and and its convert?

Ok, I keep forgetting about this :(.
 
> Are you sure you don't need to check for a nop conversion here?

I thought I didn't need it here, and couldn't find a testcase which
would break without tree_nop_conversion_p checks, but that doesn't mean
much.  So for extra safety I put the checks there.

Bootstrapped/regtested on x86_64-linux, ok for trunk?

2015-07-17  Marek Polacek  

* fold-const.c (fold_binary_loc): Move A - (A & B) into ~B & A ...
* match.pd: ... here.

* gcc.dg/fold-minus-7.c: New test.

diff --git gcc/fold-const.c gcc/fold-const.c
index 93dd29d..fa321f4 100644
--- gcc/fold-const.c
+++ gcc/fold-const.c
@@ -9777,30 +9777,6 @@ fold_binary_loc (location_t loc,
 
   if (! FLOAT_TYPE_P (type))
{
- /* Fold A - (A & B) into ~B & A.  */
- if (!TREE_SIDE_EFFECTS (arg0)
- && TREE_CODE (arg1) == BIT_AND_EXPR)
-   {
- if (operand_equal_p (arg0, TREE_OPERAND (arg1, 1), 0))
-   {
- tree arg10 = fold_convert_loc (loc, type,
-TREE_OPERAND (arg1, 0));
- return fold_build2_loc (loc, BIT_AND_EXPR, type,
- fold_build1_loc (loc, BIT_NOT_EXPR,
-  type, arg10),
- fold_convert_loc (loc, type, arg0));
-   }
- if (operand_equal_p (arg0, TREE_OPERAND (arg1, 0), 0))
-   {
- tree arg11 = fold_convert_loc (loc,
-type, TREE_OPERAND (arg1, 1));
- return fold_build2_loc (loc, BIT_AND_EXPR, type,
- fold_build1_loc (loc, BIT_NOT_EXPR,
-  type, arg11),
- fold_convert_loc (loc, type, arg0));
-   }
-   }
-
  /* Fold (A & ~B) - (A & B) into (A ^ B) - B, where B is
 any power of 2 minus 1.  */
  if (TREE_CODE (arg0) == BIT_AND_EXPR
diff --git gcc/match.pd gcc/match.pd
index c335ada..700a692 100644
--- gcc/match.pd
+++ gcc/match.pd
@@ -662,6 +662,12 @@ along with GCC; see the file COPYING3.  If not see
  (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2))
  (bit_xor (bit_and (bit_xor @0 @1) @2) @0))
 
+/* Fold A - (A & B) into ~B & A.  */
+(simplify
+ (minus (convert? @0) (convert?:s (bit_and:cs @0 @1)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
+  && tree_nop_conversion_p (type, TREE_TYPE (@1)))
+  (convert (bit_and (bit_not @1) @0
 
 /* Associate (p +p off1) +p off2 as (p +p (off1 + off2)).  */
 (simplify
diff --git gcc/testsuite/gcc.dg/fold-minus-7.c 
gcc/testsuite/gcc.dg/fold-minus-7.c
index e69de29..7a49faa 100644
--- gcc/testsuite/gcc.dg/fold-minus-7.c
+++ gcc/testsuite/gcc.dg/fold-minus-7.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-cddce1" } */
+
+int
+f1 (int a, int b)
+{
+  int tem = a & b;
+  return a - tem;
+}
+
+int
+f2 (int a, int b)
+{
+  int tem = b & a;
+  return a - tem;
+}
+
+int
+f3 (unsigned int a, int b)
+{
+  return a - (a & b);
+}
+
+int
+f4 (int a, unsigned int b)
+{
+  return a - (a & b);
+}
+
+int
+f5 (int a, int b)
+{
+  return a - (unsigned) (b & a);
+}
+
+/* { dg-final { scan-tree-dump-not " - " "cddce1" } } */

Marek


[gomp4] New nvptx pattern and internal builtin

2015-07-17 Thread Bernd Schmidt
I've made this change at the request of Cesar who says it's needed for 
his reductions work. It makes a new instruction to represent shfl.down, 
a thread communication instruction, and some builtin functions for 
internal use to access it.


Committed on gomp-4_0-branch.


Bernd
Index: gcc/ChangeLog.gomp
===
--- gcc/ChangeLog.gomp	(revision 225936)
+++ gcc/ChangeLog.gomp	(working copy)
@@ -1,3 +1,17 @@
+2015-07-17  Bernd Schmidt  
+
+	* config/nvptx/nvptx.c (enum nvptx_builtins,
+	struct builtin_description): New.
+	(nvptx_builtin_decls, bdesc_2arg): New static variables.
+	(def_builtin): New macro.
+	(nvptx_builtin_decl, nvptx_init_builtins, nvptx_expand_binop_builtin,
+	nvptx_expand_builtin): New static functions.
+	(TARGET_INIT_BUILTINS, TARGET_EXPAND_BUILTIN, TARGET_BUILTIN_DECL):
+	Define.
+	* config/nvptx/nvptx.md (UNSPECV_SHFL_DOWN): New constant.
+	(thread_shuffle_down): New pattern.
+	(thread_shuffle_downdi): New expander.
+
 2015-07-17  Julian Brown  
 
 	* gimplify.c (gimplify_scan_omp_clauses): Handle
Index: gcc/config/nvptx/nvptx.c
===
--- gcc/config/nvptx/nvptx.c	(revision 225936)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -3058,6 +3058,139 @@ nvptx_file_end (void)
 }
 }
 
+/* Codes for all the NVPTX builtins.  */
+enum nvptx_builtins
+{
+  NVPTX_BUILTIN_SHUFFLE_DOWN,
+  NVPTX_BUILTIN_SHUFFLE_DOWNF,
+  NVPTX_BUILTIN_SHUFFLE_DOWNLL,
+
+  NVPTX_BUILTIN_MAX
+};
+
+
+static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
+
+/* Return the NVPTX builtin for CODE.  */
+static tree
+nvptx_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
+{
+  if (code >= NVPTX_BUILTIN_MAX)
+return error_mark_node;
+
+  return nvptx_builtin_decls[code];
+}
+
+#define def_builtin(NAME, TYPE, CODE)	\
+do {	\
+  tree bdecl;\
+  bdecl = add_builtin_function ((NAME), (TYPE), (CODE), BUILT_IN_MD,	\
+NULL, NULL_TREE);			\
+  nvptx_builtin_decls[CODE] = bdecl;	\
+} while (0)
+
+/* Set up all builtin functions for this target.  */
+static void
+nvptx_init_builtins (void)
+{ 
+  tree uint_ftype_uint_int
+= build_function_type_list (unsigned_type_node, unsigned_type_node,
+integer_type_node, NULL_TREE);
+  tree ull_ftype_ull_int
+= build_function_type_list (long_long_unsigned_type_node,
+long_long_unsigned_type_node,
+integer_type_node, NULL_TREE);
+  tree float_ftype_float_int
+= build_function_type_list (float_type_node, float_type_node,
+integer_type_node, NULL_TREE);
+  def_builtin ("__builtin_nvptx_shuffle_down", uint_ftype_uint_int,
+	   NVPTX_BUILTIN_SHUFFLE_DOWN);
+  def_builtin ("__builtin_nvptx_shuffle_downf", float_ftype_float_int,
+	   NVPTX_BUILTIN_SHUFFLE_DOWNF);
+  def_builtin ("__builtin_nvptx_shuffle_downll", ull_ftype_ull_int,
+	   NVPTX_BUILTIN_SHUFFLE_DOWNLL);
+}
+
+/* Subroutine of nvptx_expand_builtin to take care of binop insns.  MACFLAG is -1
+   if this is a normal binary op, or one of the MACFLAG_xxx constants.  */
+
+static rtx
+nvptx_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  rtx op1 = expand_expr (arg1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  machine_mode op0mode = GET_MODE (op0);
+  machine_mode op1mode = GET_MODE (op1);
+  machine_mode tmode = insn_data[icode].operand[0].mode;
+  machine_mode mode0 = insn_data[icode].operand[1].mode;
+  machine_mode mode1 = insn_data[icode].operand[2].mode;
+  rtx ret = target;
+
+  if (! target
+  || GET_MODE (target) != tmode
+  || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+target = gen_reg_rtx (tmode);
+
+  gcc_assert ((op0mode == mode0 || op0mode == VOIDmode)
+	  && (op1mode == mode1 || op1mode == VOIDmode));
+
+  if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
+op0 = copy_to_mode_reg (mode0, op0);
+  if (! (*insn_data[icode].operand[2].predicate) (op1, mode1))
+op1 = copy_to_mode_reg (mode1, op1);
+
+  pat = GEN_FCN (icode) (target, op0, op1);
+
+  if (! pat)
+return 0;
+
+  emit_insn (pat);
+
+  return ret;
+}
+
+
+struct builtin_description
+{
+  const enum insn_code icode;
+  const char *const name;
+  const enum nvptx_builtins code;
+};
+
+static const struct builtin_description bdesc_2arg[] =
+{
+  { CODE_FOR_thread_shuffle_downsi, "__builtin_nvptx_shuffle_down", NVPTX_BUILTIN_SHUFFLE_DOWN },
+  { CODE_FOR_thread_shuffle_downsf, "__builtin_nvptx_shuffle_downf", NVPTX_BUILTIN_SHUFFLE_DOWNF },
+  { CODE_FOR_thread_shuffle_downdi, "__builtin_nvptx_shuffle_downll", NVPTX_BUILTIN_SHUFFLE_DOWNLL }
+};
+
+/* Expand an expression EXP that calls a built-in function,
+   with result going to TARGET if that's convenient
+   (and in mode MODE if that's convenient).
+   SUBTARGET may be used as the tar

[PATCH v2][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math

2015-07-17 Thread Benedikt Huber
This second revision of the patch:
 * incorporated the review
 * added vector types
 * added commandline options
   -mrecip -mlow-precision-recip-sqrt
 * separated functional and assembly parse tests

Ok for check in.

Benedikt Huber (1):
  2015-07-14  Benedikt Huber  
Philipp Tomsich 

 gcc/ChangeLog  |  17 
 gcc/config/aarch64/aarch64-builtins.c  | 103 
 gcc/config/aarch64/aarch64-protos.h|   2 +
 gcc/config/aarch64/aarch64-simd.md |  27 ++
 gcc/config/aarch64/aarch64.c   |  58 +++
 gcc/config/aarch64/aarch64.md  |   3 +
 gcc/config/aarch64/aarch64.opt |   8 ++
 gcc/doc/invoke.texi|  20 
 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c |  63 
 gcc/testsuite/gcc.target/aarch64/rsqrt.c   | 107 +
 10 files changed, 408 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c

-- 
1.9.1



[PATCH] 2015-07-14 Benedikt Huber Philipp Tomsich

2015-07-17 Thread Benedikt Huber
* config/aarch64/aarch64-builtins.c: Builtins
for rsqrt and rsqrtf.
* config/aarch64/aarch64-protos.h: Declare.
* config/aarch64/aarch64-simd.md: Matching expressions
for frsqrte and frsqrts.
* config/aarch64/aarch64.c: New functions. Emit rsqrt
estimation code in fast math mode.
* config/aarch64/aarch64.md: Added enum entries.
* config/aarch64/aarch64.opt: Added options -mrecip and
-mlow-precision-recip-sqrt.
* testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
for frsqrte and frsqrts
* testsuite/gcc.target/aarch64/rsqrt.c: Functional tests
for rsqrt.

Signed-off-by: Philipp Tomsich 
---
 gcc/ChangeLog  |  17 
 gcc/config/aarch64/aarch64-builtins.c  | 103 
 gcc/config/aarch64/aarch64-protos.h|   2 +
 gcc/config/aarch64/aarch64-simd.md |  27 ++
 gcc/config/aarch64/aarch64.c   |  58 +++
 gcc/config/aarch64/aarch64.md  |   3 +
 gcc/config/aarch64/aarch64.opt |   8 ++
 gcc/doc/invoke.texi|  20 
 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c |  63 
 gcc/testsuite/gcc.target/aarch64/rsqrt.c   | 107 +
 10 files changed, 408 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3432adb..f4b7407 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@
+2015-07-14  Benedikt Huber  
+   Philipp Tomsich  
+
+   * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
+   rsqrtf.
+   * config/aarch64/aarch64-protos.h: Declare.
+   * config/aarch64/aarch64-simd.md: Matching expressions for
+   frsqrte and frsqrts.
+   * config/aarch64/aarch64.c: New functions. Emit rsqrt
+   estimation code in fast math mode.
+   * config/aarch64/aarch64.md: Added enum entries.
+   * config/aarch64/aarch64.opt: Added options -mrecip and
+   -mlow-precision-recip-sqrt.
+   * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
+   for frsqrte and frsqrts
+   * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
+
 2015-07-08  Jiong Wang  
 
* config/aarch64/aarch64.c (aarch64_unspec_may_trap_p): New function.
diff --git a/gcc/config/aarch64/aarch64-builtins.c 
b/gcc/config/aarch64/aarch64-builtins.c
index b6c89b9..adcea07 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -335,6 +335,11 @@ enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -824,6 +829,42 @@ aarch64_init_crc32_builtins ()
 }
 
 void
+aarch64_add_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  ftype = build_function_type_list (double_type_node, double_type_node, 
NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
+ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
+
+  ftype = build_function_type_list (float_type_node, float_type_node, 
NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
+ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
+
+  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
+ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
+
+  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
+ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
+
+  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
+ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;
+}
+
+void
 aarch64_init_builtins (void)
 {
   tree ftype_set_fpr
@@ -848,6 +889,7 @@ aarch64_init_builtins (void)
 aarch64_init_simd_bu

Re: fixup gomp register/unregister prototypes

2015-07-17 Thread Nathan Sidwell

On 07/17/15 10:44, Jakub Jelinek wrote:

On Wed, Jul 15, 2015 at 03:04:30PM -0400, Nathan Sidwell wrote:

I'm almost tempted to commit as obvious.  I noticed that the callers of
these functions from code generated by mkoffload declare the 2nd arg as an
int, because they have no visibility of the enum.  I thought it wise to make
the definitions match.


I think it doesn't really matter that much, such an enum on all targets we care
about is passed the same way as int.

But if you think it is important, I don't really care either way.


Having been bitten by fshort-enums on ARM, I like to be careful :)

nathan


Re: [gomp4.1] Handle new form of #pragma omp declare target

2015-07-17 Thread James Norris

Jakub,

On 07/17/2015 08:05 AM, Jakub Jelinek wrote:

Hi!

...

I believe OpenACC has something similar, but no idea if it is already
implemented.


Yes, it is implemented in gomp-4_0-branch.

While the purposes of 'omp declare target' and 'acc declare' are 
similar, the data movement, via the clauses, provided with the latter 
makes it very different from the former.


The data movement requires that data be moved at the entry and
exit of an 'associated region'. An associated region means either
a function, subroutine, entire program or Fortran module. I chose
to implement this in the front-ends.

For discussion purposes, I'll use the C front-end: c_parser_oacc_declare 
and finish_oacc_declare.


As far as the syntax goes, OpenMP is a lot easier to deal with than
OpenACC. The handling of that syntax is reflected in c_parser_oacc_declare.
Here also is the handling of the numerous data movement clauses.
One in particular requires special handling: create. This can
be seen toward the end of the function. There is a libgomp
component GOACC_register_static (oacc-parallel.c) that is used
in conjunction with the create clause.

The creation and deletion of the 'associated region' is done
in finish_oacc_declare. Where the directive was found determines
the handling required, i.e., global variable scope
versus local variable scope. In addition, if there is data
movement from target -> host, this must be handled appropriately.



...

Ilya, Thomas, thoughts on this?



Jim answering at the behest of Thomas

If the above explanation is not sufficient please yell. It may
make more sense to carve out the code in question and document
it more thoroughly for discussion purposes. Also the implementation 
approach in the front-ends may be entirely wrong. There may be an 
approach to do it in the 'middle'. However, my lack of experience in the 
middle may have caused me to go down the wrong path.


Jim





Re: [PATCH] Move A - (A & B) -> ~B & A

2015-07-17 Thread Richard Biener
On July 17, 2015 5:23:21 PM GMT+02:00, Marek Polacek  wrote:
>On Fri, Jul 17, 2015 at 02:23:50PM +0200, Richard Biener wrote:
>> >+/* Fold A - (A & B) into ~B & A.  */
>> >+(simplify
>> >+ (minus (convert? @0) (convert? (bit_and:c @0 @1)))
>> 
>> I think you want :s on the bit_and and its convert?
>
>Ok, I keep forgetting about this :(.
> 
>> Are you sure you don't need to check for a nop conversion here?
>
>I thought I didn't need it here, and couldn't find a testcase which
>would break without tree_nop_conversion_p checks, but that doesn't mean
>much.  So for extra safety I put the checks there.
>
>Bootstrapped/regtested on x86_64-linux, ok for trunk?

OK 

Thanks
Richard

>2015-07-17  Marek Polacek  
>
>   * fold-const.c (fold_binary_loc): Move A - (A & B) into ~B & A ...
>   * match.pd: ... here.
>
>   * gcc.dg/fold-minus-7.c: New test.
>
>diff --git gcc/fold-const.c gcc/fold-const.c
>index 93dd29d..fa321f4 100644
>--- gcc/fold-const.c
>+++ gcc/fold-const.c
>@@ -9777,30 +9777,6 @@ fold_binary_loc (location_t loc,
> 
>   if (! FLOAT_TYPE_P (type))
>   {
>-/* Fold A - (A & B) into ~B & A.  */
>-if (!TREE_SIDE_EFFECTS (arg0)
>-&& TREE_CODE (arg1) == BIT_AND_EXPR)
>-  {
>-if (operand_equal_p (arg0, TREE_OPERAND (arg1, 1), 0))
>-  {
>-tree arg10 = fold_convert_loc (loc, type,
>-   TREE_OPERAND (arg1, 0));
>-return fold_build2_loc (loc, BIT_AND_EXPR, type,
>-fold_build1_loc (loc, BIT_NOT_EXPR,
>- type, arg10),
>-fold_convert_loc (loc, type, arg0));
>-  }
>-if (operand_equal_p (arg0, TREE_OPERAND (arg1, 0), 0))
>-  {
>-tree arg11 = fold_convert_loc (loc,
>-   type, TREE_OPERAND (arg1, 1));
>-return fold_build2_loc (loc, BIT_AND_EXPR, type,
>-fold_build1_loc (loc, BIT_NOT_EXPR,
>- type, arg11),
>-fold_convert_loc (loc, type, arg0));
>-  }
>-  }
>-
> /* Fold (A & ~B) - (A & B) into (A ^ B) - B, where B is
>any power of 2 minus 1.  */
> if (TREE_CODE (arg0) == BIT_AND_EXPR
>diff --git gcc/match.pd gcc/match.pd
>index c335ada..700a692 100644
>--- gcc/match.pd
>+++ gcc/match.pd
>@@ -662,6 +662,12 @@ along with GCC; see the file COPYING3.  If not see
>  (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2))
>  (bit_xor (bit_and (bit_xor @0 @1) @2) @0))
> 
>+/* Fold A - (A & B) into ~B & A.  */
>+(simplify
>+ (minus (convert? @0) (convert?:s (bit_and:cs @0 @1)))
>+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
>+  && tree_nop_conversion_p (type, TREE_TYPE (@1)))
>+  (convert (bit_and (bit_not @1) @0
> 
> /* Associate (p +p off1) +p off2 as (p +p (off1 + off2)).  */
> (simplify
>diff --git gcc/testsuite/gcc.dg/fold-minus-7.c
>gcc/testsuite/gcc.dg/fold-minus-7.c
>index e69de29..7a49faa 100644
>--- gcc/testsuite/gcc.dg/fold-minus-7.c
>+++ gcc/testsuite/gcc.dg/fold-minus-7.c
>@@ -0,0 +1,36 @@
>+/* { dg-do compile } */
>+/* { dg-options "-O -fdump-tree-cddce1" } */
>+
>+int
>+f1 (int a, int b)
>+{
>+  int tem = a & b;
>+  return a - tem;
>+}
>+
>+int
>+f2 (int a, int b)
>+{
>+  int tem = b & a;
>+  return a - tem;
>+}
>+
>+int
>+f3 (unsigned int a, int b)
>+{
>+  return a - (a & b);
>+}
>+
>+int
>+f4 (int a, unsigned int b)
>+{
>+  return a - (a & b);
>+}
>+
>+int
>+f5 (int a, int b)
>+{
>+  return a - (unsigned) (b & a);
>+}
>+
>+/* { dg-final { scan-tree-dump-not " - " "cddce1" } } */
>
>   Marek




Re: fixup gomp register/unregister prototypes

2015-07-17 Thread Jakub Jelinek
On Fri, Jul 17, 2015 at 11:44:47AM -0400, Nathan Sidwell wrote:
> On 07/17/15 10:44, Jakub Jelinek wrote:
> >On Wed, Jul 15, 2015 at 03:04:30PM -0400, Nathan Sidwell wrote:
> >>I'm almost tempted to commit as obvious.  I noticed that the callers of
> >>these functions from code generated by mkoffload declare the 2nd arg as an
> >>int, because they have no visibility of the enum.  I thought it wise to make
> >>the definitions match.
> >
> >I think it doesn't really matter that much, such an enum on all targets we 
> >care
> >about is passed the same way as int.
> >
> >But if you think it is important, I don't really care either way.
> 
> Having been bitten by fshort-enums on ARM, I like to be careful :)

That is an ABI changing option, though perhaps somebody might build libgomp
for that ABI.  Ok then.

Jakub


[PATCH 0/4] S390 -march=native related fixes

2015-07-17 Thread Dominik Vogt
The following series of patches contains fixes, and cleanup work
related to -march=native (and -mtune=native) for S390.

* 0001 Fix cross compilation for S390 targets that was broken with
   the original -march=native patch.

* 0002 Fix handling of -march=z9_ec in old code.  This bug has been
   in the code since z9-ec support was added.

* 0003 -march=native now detects various cpu features and passes
   that as options from the driver to the compiler.

* 0004 Non-mandatory cleanup of the S390 code handling the various
   cpu types.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany



[PATCH 1/4] S390 -march=native related fixes

2015-07-17 Thread Dominik Vogt
> The following series of patches contains fixes, and cleanup work
> related to -march=native (and -mtune=native) for S390.
> 
> * 0001 Fix cross compilation for S390 targets that was broken with
>the original -march=native patch.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany
gcc/ChangeLog:

* config/s390/s390.h: S390: Do not define EXTRA_SPEC_FUNCTIONS when
cross compiling.
>From 7ce17cd9b53959a70a9d30cb33acf62ef43ea418 Mon Sep 17 00:00:00 2001
From: Dominik Vogt 
Date: Thu, 9 Jul 2015 07:31:42 +0100
Subject: [PATCH 1/4] S390: Do not define EXTRA_SPEC_FUNCTIONS when cross
 compiling.

---
 gcc/config/s390/s390.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h
index 85a0d1a..f18b973 100644
--- a/gcc/config/s390/s390.h
+++ b/gcc/config/s390/s390.h
@@ -126,6 +126,7 @@ enum processor_flags
   { "arch", "%{!march=*:-march=%(VALUE)}" },			\
   { "tune", "%{!mtune=*:-mtune=%(VALUE)}" }
 
+#ifdef __s390__
 extern const char *s390_host_detect_local_cpu (int argc, const char **argv);
 # define EXTRA_SPEC_FUNCTIONS \
   { "local_cpu_detect", s390_host_detect_local_cpu },
@@ -133,6 +134,9 @@ extern const char *s390_host_detect_local_cpu (int argc, const char **argv);
 # define MARCH_MTUNE_NATIVE_SPECS\
   " %{march=native:%

[PATCH 2/4] S390 -march=native related fixes

2015-07-17 Thread Dominik Vogt
> The following series of patches contains fixes, and cleanup work
> related to -march=native (and -mtune=native) for S390.
> 
> * 0002 Fix handling of -march=z9_ec in old code.  This bug has been
>in the code since z9-ec support was added.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany
gcc/ChangeLog

* config/s390/s390.c (s390_issue_rate): Handle PROCESSOR_2094_Z9_EC.
(s390_option_override): Likewise.
(s390_adjust_priority): Likewise.
>From d7fda8d5dbb006bffef36323921efa081bfdfa57 Mon Sep 17 00:00:00 2001
From: Dominik Vogt 
Date: Tue, 7 Jul 2015 07:57:26 +0100
Subject: [PATCH 2/4] S390: Add handling of PROCESSOR_2094_Z9_EC.

---
 gcc/config/s390/s390.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 861dfb2..27e1c35 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -7270,12 +7270,7 @@ s390_adjust_priority (rtx_insn *insn, int priority)
   if (! INSN_P (insn))
 return priority;
 
-  if (s390_tune != PROCESSOR_2084_Z990
-  && s390_tune != PROCESSOR_2094_Z9_109
-  && s390_tune != PROCESSOR_2097_Z10
-  && s390_tune != PROCESSOR_2817_Z196
-  && s390_tune != PROCESSOR_2827_ZEC12
-  && s390_tune != PROCESSOR_2964_Z13)
+  if (s390_tune <= PROCESSOR_2064_Z900)
 return priority;
 
   switch (s390_safe_attr_type (insn))
@@ -7304,6 +7299,7 @@ s390_issue_rate (void)
 {
 case PROCESSOR_2084_Z990:
 case PROCESSOR_2094_Z9_109:
+case PROCESSOR_2094_Z9_EC:
 case PROCESSOR_2817_Z196:
   return 3;
 case PROCESSOR_2097_Z10:
@@ -13521,6 +13517,7 @@ s390_option_override (void)
   s390_cost = &z990_cost;
   break;
 case PROCESSOR_2094_Z9_109:
+case PROCESSOR_2094_Z9_EC:
   s390_cost = &z9_109_cost;
   break;
 case PROCESSOR_2097_Z10:
-- 
2.3.0



[PATCH 3/4] S390 -march=native related fixes

2015-07-17 Thread Dominik Vogt
On Fri, Jul 17, 2015 at 05:00:50PM +0100, Dominik Vogt wrote:
> The following series of patches contains fixes, and cleanup work
> related to -march=native (and -mtune=native) for S390.
> 
> * 0003 -march=native now detects various cpu features and passes
>that as options from the driver to the compiler.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany
gcc/ChangeLog

* config/s390/driver-native.c (s390_host_detect_local_cpu): Handle
processor capabilities with -march=native.
* config/s390/s390.h (MARCH_MTUNE_NATIVE_SPECS): Likewise.
(DRIVER_SELF_SPECS): Likewise.  Join specs for 31 and 64 bit.
* (S390_TARGET_BITS_STRING): Macro to simplify specs.  Use
CC1_CPU_SPEC, EXTRA_SPECS and CC1_SPEC instead.
(CC1_CPU_SPEC): New macro.
(EXTRA_SPECS): New macro.
(CC1_SPEC): New macro.
>From 0b6de7baee54ce964ff4d0be1380dd03ab69bb93 Mon Sep 17 00:00:00 2001
From: Dominik Vogt 
Date: Mon, 6 Jul 2015 16:28:32 +0100
Subject: [PATCH 3/4] S390: Handle processor capabilities with -march=native.

---
 gcc/config/s390/driver-native.c | 143 
 gcc/config/s390/s390.h  |  32 +
 2 files changed, 133 insertions(+), 42 deletions(-)

diff --git a/gcc/config/s390/driver-native.c b/gcc/config/s390/driver-native.c
index 88c76bd..ebce79f 100644
--- a/gcc/config/s390/driver-native.c
+++ b/gcc/config/s390/driver-native.c
@@ -42,6 +42,17 @@ s390_host_detect_local_cpu (int argc, const char **argv)
   char buf[256];
   FILE *f;
   bool arch;
+  const char *options = "";
+  unsigned int has_features;
+  unsigned int has_processor;
+  unsigned int is_cpu_z9_109 = 0;
+  unsigned int has_highgprs = 0;
+  unsigned int has_dfp = 0;
+  unsigned int has_te = 0;
+  unsigned int has_vx = 0;
+  unsigned int has_opt_te = 0;
+  unsigned int has_opt_vx = 0;
+  unsigned int has_opt_esa_zarch = 0;
 
   if (argc < 1)
 return NULL;
@@ -49,43 +60,119 @@ s390_host_detect_local_cpu (int argc, const char **argv)
   arch = strcmp (argv[0], "arch") == 0;
   if (!arch && strcmp (argv[0], "tune"))
 return NULL;
+  if (argc == 4)
+{
+  /* The extra arguments are only used for "arch" mode.  */
+  has_opt_esa_zarch = atoi (argv[1]);
+  has_opt_te = atoi (argv[2]);
+  has_opt_vx = atoi (argv[3]);
+}
 
   f = fopen ("/proc/cpuinfo", "r");
   if (f == NULL)
 return NULL;
 
-  while (fgets (buf, sizeof (buf), f) != NULL)
-if (strncmp (buf, "processor", sizeof ("processor") - 1) == 0)
-  {
-	if (strstr (buf, "machine = 9672") != NULL)
-	  cpu = "g5";
-	else if (strstr (buf, "machine = 2064") != NULL
-		 || strstr (buf, "machine = 2066") != NULL)
-	  cpu = "z900";
-	else if (strstr (buf, "machine = 2084") != NULL
-		 || strstr (buf, "machine = 2086") != NULL)
-	  cpu = "z990";
-	else if (strstr (buf, "machine = 2094") != NULL
-		 || strstr (buf, "machine = 2096") != NULL)
-	  cpu = "z9-109";
-	else if (strstr (buf, "machine = 2097") != NULL
-		 || strstr (buf, "machine = 2098") != NULL)
-	  cpu = "z10";
-	else if (strstr (buf, "machine = 2817") != NULL
-		 || strstr (buf, "machine = 2818") != NULL)
-	  cpu = "z196";
-	else if (strstr (buf, "machine = 2827") != NULL
-		 || strstr (buf, "machine = 2828") != NULL)
-	  cpu = "zEC12";
-	else if (strstr (buf, "machine = 2964") != NULL)
-	  cpu = "z13";
-	break;
-  }
+  for (has_features = 0, has_processor = 0;
+   (has_features == 0 || has_processor == 0)
+	 && fgets (buf, sizeof (buf), f) != NULL; )
+{
+  if (has_processor == 0 && strncmp (buf, "processor", 9) == 0)
+	{
+	  const char *p;
+	  long machine_id;
+
+	  p = strstr (buf, "machine = ");
+	  if (p == NULL)
+	continue;
+	  p += 10;
+	  has_processor = 1;
+	  machine_id = strtol (p, NULL, 16);
+	  switch (machine_id)
+	{
+	case 0x9672:
+	  cpu = "g5";
+	  break;
+	case 0x2064:
+	case 0x2066:
+	  cpu = "z900";
+	  break;
+	case 0x2084:
+	case 0x2086:
+	  cpu = "z990";
+	  break;
+	case 0x2094:
+	case 0x2096:
+	  cpu = "z9-109";
+	  is_cpu_z9_109 = 1;
+	  break;
+	case 0x2097:
+	case 0x2098:
+	  cpu = "z10";
+	  break;
+	case 0x2817:
+	case 0x2818:
+	  cpu = "z196";
+	  break;
+	case 0x2827:
+	case 0x2828:
+	  cpu = "zEC12";
+	  break;
+	case 0x2964:
+	  cpu = "z13";
+	  break;
+	}
+	}
+  if (has_features == 0 && strncmp (buf, "features", 8) == 0)
+	{
+	  const char *p;
+
+	  p = strchr (buf, ':');
+	  if (p == NULL)
+	continue;
+	  p++;
+	  while (*p != 0)
+	{
+	  int i;
+
+	  while (ISSPACE (*p))
+		p++;
+	  for (i = 0; !ISSPACE (p[i]) && p[i] != 0; i++)
+		;
+	  if (i == 3 && strncmp (p, "dfp", 3) == 0)
+		has_dfp = 1;
+	  else if (i == 2 && strncmp (p, "te", 2) == 0)
+		has_te = 1;
+	  else if (i == 2 && strncmp (p, "vx", 2) == 0)
+		has_vx = 1;
+	  else if (i == 8 && strncmp (p, "highgprs", 8) == 0)
+		

[PATCH 4/4] S390 -march=native related fixes

2015-07-17 Thread Dominik Vogt
On Fri, Jul 17, 2015 at 05:00:50PM +0100, Dominik Vogt wrote:
> The following series of patches contains fixes, and cleanup work
> related to -march=native (and -mtune=native) for S390.
> 
> * 0004 Non-mandatory cleanup of the S390 code handling the various
>cpu types.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany
gcc/ChangeLog

* config/s390/s390.c (s390_reorg): Clean up handling of processors with
-mtune=
(s390_issue_rate): Likewise.
(s390_sched_reorder): Likewise.
(s390_sched_variable_issue): Likewise.
(s390_loop_unroll_adjust): Likewise.
(s390_option_override):  Likewise.
>From 1d52b5534a728c9625b64bb702f3adcb2fc35ed0 Mon Sep 17 00:00:00 2001
From: Dominik Vogt 
Date: Wed, 8 Jul 2015 13:20:41 +0100
Subject: [PATCH 4/4] S390: Clean up handling of various PROCESSOR_* values
 with -mtune=.

---
 gcc/config/s390/s390.c | 32 +---
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 27e1c35..1764243 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -7304,11 +7304,15 @@ s390_issue_rate (void)
   return 3;
 case PROCESSOR_2097_Z10:
   return 2;
+case PROCESSOR_9672_G5:
+case PROCESSOR_9672_G6:
+case PROCESSOR_2064_Z900:
   /* Starting with EC12 we use the sched_reorder hook to take care
 	 of instruction dispatch constraints.  The algorithm only
 	 picks the best instruction and assumes only a single
 	 instruction gets issued per cycle.  */
 case PROCESSOR_2827_ZEC12:
+case PROCESSOR_2964_Z13:
 default:
   return 1;
 }
@@ -12914,10 +12918,7 @@ s390_reorg (void)
   s390_optimize_prologue ();
 
   /* Walk over the insns and do some >=z10 specific changes.  */
-  if (s390_tune == PROCESSOR_2097_Z10
-  || s390_tune == PROCESSOR_2817_Z196
-  || s390_tune == PROCESSOR_2827_ZEC12
-  || s390_tune == PROCESSOR_2964_Z13)
+  if (s390_tune >= PROCESSOR_2097_Z10)
 {
   rtx_insn *insn;
   bool insn_added_p = false;
@@ -13168,12 +13169,12 @@ static int
 s390_sched_reorder (FILE *file, int verbose,
 		rtx_insn **ready, int *nreadyp, int clock ATTRIBUTE_UNUSED)
 {
-  if (s390_tune == PROCESSOR_2097_Z10)
-if (reload_completed && *nreadyp > 1)
-  s390_z10_prevent_earlyload_conflicts (ready, nreadyp);
+  if (s390_tune == PROCESSOR_2097_Z10
+  && reload_completed
+  && *nreadyp > 1)
+s390_z10_prevent_earlyload_conflicts (ready, nreadyp);
 
-  if ((s390_tune == PROCESSOR_2827_ZEC12
-   || s390_tune == PROCESSOR_2964_Z13)
+  if (s390_tune >= PROCESSOR_2827_ZEC12
   && reload_completed
   && *nreadyp > 1)
 {
@@ -13256,8 +13257,7 @@ s390_sched_variable_issue (FILE *file, int verbose, rtx_insn *insn, int more)
 {
   last_scheduled_insn = insn;
 
-  if ((s390_tune == PROCESSOR_2827_ZEC12
-   || s390_tune == PROCESSOR_2964_Z13)
+  if (s390_tune >= PROCESSOR_2827_ZEC12
   && reload_completed
   && recog_memoized (insn) >= 0)
 {
@@ -13335,10 +13335,7 @@ s390_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
   unsigned i;
   unsigned mem_count = 0;
 
-  if (s390_tune != PROCESSOR_2097_Z10
-  && s390_tune != PROCESSOR_2817_Z196
-  && s390_tune != PROCESSOR_2827_ZEC12
-  && s390_tune != PROCESSOR_2964_Z13)
+  if (s390_tune < PROCESSOR_2097_Z10)
 return nunroll;
 
   /* Count the number of memory references within the loop body.  */
@@ -13553,10 +13550,7 @@ s390_option_override (void)
 target_flags |= MASK_LONG_DOUBLE_128;
 #endif
 
-  if (s390_tune == PROCESSOR_2097_Z10
-  || s390_tune == PROCESSOR_2817_Z196
-  || s390_tune == PROCESSOR_2827_ZEC12
-  || s390_tune == PROCESSOR_2964_Z13)
+  if (s390_tune >= PROCESSOR_2097_Z10)
 {
   maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 100,
 			 global_options.x_param_values,
-- 
2.3.0



Re: [PATCH] [graphite] fix pr61929

2015-07-17 Thread Tobias Grosser

On 07/17/2015 06:03 PM, Aditya Kumar wrote:

Hi Tobias,
I have updated the change log to reflect which changes were required for the
bootstrap problem.


This fixes bootstrap of GCC with BOOT_CFLAGS="-g -O2 -fgraphite-identity
-floop-nest-optimize -floop-block -floop-interchange -floop-strip-mine".
It passes regstrap on amd64-linux. A previous change
(https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=213816), replaced
isl_int with isl_val
because isl_int would be deprecated. Since isl_val has stricter checks, it
exposed the bug. In the test case (isl_set_max_val) would return infinity
which would remain unchecked. We now check if the value returned is an
integer or not, and bail out if it isn't. The other problem was that we were
allowing
all kinds of data-refs in a scop. Now we discard a scop if it has any
data-refs other than (ARRAY_REF, MEM_REF, COMPONENT_REF).


"any data ref" - no S

Otherwise LGTM.

Tobias


-Aditya


2015-07-17  Aditya Kumar  
Sebastian Pop  

PR middle-end/61929
* graphite-dependences.c (add_pdr_constraints): Renamed
pdr->extent to pdr->subscript_sizes.
* graphite-interchange.c (build_linearized_memory_access): Add
back all gcc_assert's that the "isl_int to isl_val conversion"
patch has removed (fixed the bootstrap).  Refactored.
(pdr_stride_in_loop): Renamed pdr->extent to pdr->subscript_sizes.
* graphite-poly.c (new_poly_dr): Same.
(free_poly_dr): Same.
* graphite-poly.h (struct poly_dr): Same.
* graphite-scop-detection.c (stmt_has_simple_data_refs_p): Ignore
all data references other than ARRAY_REF and MEM_REF  (fixed the
bootstrap).
* graphite-scop-detection.h: Fix space.
* graphite-sese-to-poly.c (build_pbb_scattering_polyhedrons): Add
back all gcc_assert's removed by a previous patch.
(wrap): Remove the_isl_ctx global variable that the same patch has
added.
(build_loop_iteration_domains): Same.
(add_param_constraints): Same.
(pdr_add_data_dimensions): Same.  Refactored.
(build_poly_dr): Renamed extent to subscript_sizes.

testsuite/
PR middle-end/61929
* gcc.dg/graphite/pr61929.c: New.
---
  gcc/graphite-dependences.c  |  4 +--
  gcc/graphite-interchange.c  | 55
+
  gcc/graphite-poly.c |  6 ++--
  gcc/graphite-poly.h |  2 +-
  gcc/graphite-scop-detection.c   | 22 +
  gcc/graphite-scop-detection.h   |  2 +-
  gcc/graphite-sese-to-poly.c | 54

  gcc/testsuite/gcc.dg/graphite/pr61929.c | 19 
  8 files changed, 97 insertions(+), 67 deletions(-)  create mode 100644
gcc/testsuite/gcc.dg/graphite/pr61929.c

diff --git a/gcc/graphite-dependences.c b/gcc/graphite-dependences.c index
50fe73e..af18ecb 100644
--- a/gcc/graphite-dependences.c
+++ b/gcc/graphite-dependences.c
@@ -88,13 +88,13 @@ constrain_domain (isl_map *map, isl_set *s)
return isl_map_intersect_domain (map, s);  }

-/* Constrain pdr->accesses with pdr->extent and pbb->domain.  */
+/* Constrain pdr->accesses with pdr->subscript_sizes and pbb->domain.
+*/

  static isl_map *
  add_pdr_constraints (poly_dr_p pdr, poly_bb_p pbb)  {
isl_map *x = isl_map_intersect_range (isl_map_copy (pdr->accesses),
-   isl_set_copy (pdr->extent));
+   isl_set_copy
(pdr->subscript_sizes));
x = constrain_domain (x, isl_set_copy (pbb->domain));
return x;
  }
diff --git a/gcc/graphite-interchange.c b/gcc/graphite-interchange.c index
aee51a8..03c2c63 100644
--- a/gcc/graphite-interchange.c
+++ b/gcc/graphite-interchange.c
@@ -79,37 +79,40 @@ extern "C" {
  static isl_constraint *
  build_linearized_memory_access (isl_map *map, poly_dr_p pdr)  {
-  isl_constraint *res;
isl_local_space *ls = isl_local_space_from_space (isl_map_get_space
(map));
-  unsigned offset, nsubs;
-  int i;
-  isl_ctx *ctx;
+  isl_constraint *res = isl_equality_alloc (ls);  isl_val *size =
+ isl_val_int_from_ui (isl_map_get_ctx (map), 1);

-  isl_val *size, *subsize, *size1;
-
-  res = isl_equality_alloc (ls);
-  ctx = isl_local_space_get_ctx (ls);
-  size = isl_val_int_from_ui (ctx, 1);
-
-  nsubs = isl_set_dim (pdr->extent, isl_dim_set);
+  unsigned nsubs = isl_set_dim (pdr->subscript_sizes, isl_dim_set);
/* -1 for the already included L dimension.  */
-  offset = isl_map_dim (map, isl_dim_out) - 1 - nsubs;
+  unsigned offset = isl_map_dim (map, isl_dim_out) - 1 - nsubs;
res = isl_constraint_set_coefficient_si (res, isl_dim_out, offset +
nsubs, -1);
-  /* Go through all subscripts from last to first.  First dimension
+  /* Go through all subscripts from last to first.  The dimension "i=0"
   is the alias set, ignore it.  */
-  for (i = nsubs - 1; i >= 1; i--)
+  for (int i = nsubs - 1; i >= 1; i-

[patch] Document that std::thread::native_handle() and similar are not portable or stable

2015-07-17 Thread Jonathan Wakely

This just adds a caveat to the docs that the semantics of
native_handles should not be relied on.

Committed to trunk.
commit 46e6c523b7d2d38cc3b31b77a19639f4b2634671
Author: Jonathan Wakely 
Date:   Fri Jul 17 17:07:30 2015 +0100

	* doc/xml/manual/status_cxx2011.xml: Add caveats for native_handle.
	* doc/html/manual/status.html: Regenerate.

diff --git a/libstdc++-v3/doc/xml/manual/status_cxx2011.xml b/libstdc++-v3/doc/xml/manual/status_cxx2011.xml
index 2b99b9d..cf99956 100644
--- a/libstdc++-v3/doc/xml/manual/status_cxx2011.xml
+++ b/libstdc++-v3/doc/xml/manual/status_cxx2011.xml
@@ -2670,7 +2670,10 @@ particular release.
   30.2.3 [thread.req.native]/1
   native_handle_type and
   native_handle are provided. The handle types
-  are defined in terms of the Gthreads abstraction layer.
+  are defined in terms of the Gthreads abstraction layer, although this
+  is subject to change at any time. Any use of
+  native_handle is inherently non-portable and
+  not guaranteed to work between major releases of GCC.
   
  thread: The native handle type is
 a typedef for __gthread_t i.e. pthread_t


Re: [PATCH 3/4] S390 -march=native related fixes

2015-07-17 Thread Dominik Vogt
> > The following series of patches contains fixes, and cleanup work
> > related to -march=native (and -mtune=native) for S390.
> > 
> > * 0003 -march=native now detects various cpu features and passes
> >that as options from the driver to the compiler.

Sorry, some old code sneaked into the patch.  Updated version
without the dead code attached.  The ChangeLog is still the same.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany
>From b7942651a85761dcd9cf1f1c0c01074b9c327e43 Mon Sep 17 00:00:00 2001
From: Dominik Vogt 
Date: Mon, 6 Jul 2015 16:28:32 +0100
Subject: [PATCH 3/4] S390: Handle processor capabilities with -march=native.

---
 gcc/config/s390/driver-native.c | 130 +++-
 gcc/config/s390/s390.h  |  32 +-
 2 files changed, 120 insertions(+), 42 deletions(-)

diff --git a/gcc/config/s390/driver-native.c b/gcc/config/s390/driver-native.c
index 88c76bd..ff897cc 100644
--- a/gcc/config/s390/driver-native.c
+++ b/gcc/config/s390/driver-native.c
@@ -42,6 +42,14 @@ s390_host_detect_local_cpu (int argc, const char **argv)
   char buf[256];
   FILE *f;
   bool arch;
+  const char *options = "";
+  unsigned int has_features;
+  unsigned int has_processor;
+  unsigned int is_cpu_z9_109 = 0;
+  unsigned int has_highgprs = 0;
+  unsigned int has_dfp = 0;
+  unsigned int has_te = 0;
+  unsigned int has_vx = 0;
 
   if (argc < 1)
 return NULL;
@@ -54,38 +62,104 @@ s390_host_detect_local_cpu (int argc, const char **argv)
   if (f == NULL)
 return NULL;
 
-  while (fgets (buf, sizeof (buf), f) != NULL)
-if (strncmp (buf, "processor", sizeof ("processor") - 1) == 0)
-  {
-	if (strstr (buf, "machine = 9672") != NULL)
-	  cpu = "g5";
-	else if (strstr (buf, "machine = 2064") != NULL
-		 || strstr (buf, "machine = 2066") != NULL)
-	  cpu = "z900";
-	else if (strstr (buf, "machine = 2084") != NULL
-		 || strstr (buf, "machine = 2086") != NULL)
-	  cpu = "z990";
-	else if (strstr (buf, "machine = 2094") != NULL
-		 || strstr (buf, "machine = 2096") != NULL)
-	  cpu = "z9-109";
-	else if (strstr (buf, "machine = 2097") != NULL
-		 || strstr (buf, "machine = 2098") != NULL)
-	  cpu = "z10";
-	else if (strstr (buf, "machine = 2817") != NULL
-		 || strstr (buf, "machine = 2818") != NULL)
-	  cpu = "z196";
-	else if (strstr (buf, "machine = 2827") != NULL
-		 || strstr (buf, "machine = 2828") != NULL)
-	  cpu = "zEC12";
-	else if (strstr (buf, "machine = 2964") != NULL)
-	  cpu = "z13";
-	break;
-  }
+  for (has_features = 0, has_processor = 0;
+   (has_features == 0 || has_processor == 0)
+	 && fgets (buf, sizeof (buf), f) != NULL; )
+{
+  if (has_processor == 0 && strncmp (buf, "processor", 9) == 0)
+	{
+	  const char *p;
+	  long machine_id;
+
+	  p = strstr (buf, "machine = ");
+	  if (p == NULL)
+	continue;
+	  p += 10;
+	  has_processor = 1;
+	  machine_id = strtol (p, NULL, 16);
+	  switch (machine_id)
+	{
+	case 0x9672:
+	  cpu = "g5";
+	  break;
+	case 0x2064:
+	case 0x2066:
+	  cpu = "z900";
+	  break;
+	case 0x2084:
+	case 0x2086:
+	  cpu = "z990";
+	  break;
+	case 0x2094:
+	case 0x2096:
+	  cpu = "z9-109";
+	  is_cpu_z9_109 = 1;
+	  break;
+	case 0x2097:
+	case 0x2098:
+	  cpu = "z10";
+	  break;
+	case 0x2817:
+	case 0x2818:
+	  cpu = "z196";
+	  break;
+	case 0x2827:
+	case 0x2828:
+	  cpu = "zEC12";
+	  break;
+	case 0x2964:
+	  cpu = "z13";
+	  break;
+	}
+	}
+  if (has_features == 0 && strncmp (buf, "features", 8) == 0)
+	{
+	  const char *p;
+
+	  p = strchr (buf, ':');
+	  if (p == NULL)
+	continue;
+	  p++;
+	  while (*p != 0)
+	{
+	  int i;
+
+	  while (ISSPACE (*p))
+		p++;
+	  for (i = 0; !ISSPACE (p[i]) && p[i] != 0; i++)
+		;
+	  if (i == 3 && strncmp (p, "dfp", 3) == 0)
+		has_dfp = 1;
+	  else if (i == 2 && strncmp (p, "te", 2) == 0)
+		has_te = 1;
+	  else if (i == 2 && strncmp (p, "vx", 2) == 0)
+		has_vx = 1;
+	  else if (i == 8 && strncmp (p, "highgprs", 8) == 0)
+		has_highgprs = 1;
+	  p += i;
+	}
+	  has_features = 1;
+	}
+}
 
   fclose (f);
 
   if (cpu == NULL)
 return NULL;
 
-  return concat ("-m", argv[0], "=", cpu, NULL);
+  if (arch)
+{
+  const char *opt_htm = "";
+  const char *opt_vx = "";
+  const char *opt_esa_zarch = "";
+
+  opt_htm = (has_te) ? " -mhtm" : " -mno-htm";
+  opt_vx = (has_vx) ? " -mvx" : " -mno-vx";
+  opt_esa_zarch = (has_highgprs) ? " -mzarch" : " -mesa";
+  options = concat (options, opt_htm, opt_vx, opt_esa_zarch, NULL);
+}
+  if (has_dfp && is_cpu_z9_109)
+cpu = "z9-ec";
+
+  return concat ("-m", argv[0], "=", cpu, options, NULL);
 }
diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h
index f18b973..cfe23d4 100644
--- a/gcc/config/s390/s390.h
+++ b/gcc/config/s390/s390.h
@@ -131,26 +131,30 @@ extern const char *s390_host_detect

Re: [gomp4.1] Initial support for some OpenMP 4.1 construct parsing

2015-07-17 Thread Ilya Verbin
On Thu, Jun 25, 2015 at 22:10:58 +0200, Jakub Jelinek wrote:
> On Thu, Jun 25, 2015 at 10:45:29PM +0300, Ilya Verbin wrote:
> > So, as I understood, three tasks will be generated almost simultaneously in
> > foo1: one on host and two on target.
> > Target task 1 will be executed immediately.
> > Host task will wait for task 1 to be completed on target.
> > (Or it is not possible to mix "omp target" and "omp task" dependencies?)
> > And task 2 will wait on target for task 1.
> 
> My understanding is that you don't create any extra tasks,
> but rather you pointer translate the host address from the start of the
> variable (or array section; thus the depend clause argument) into
> target address, and check if it can be offloaded right away (no need
> to wait for dependencies).  If yes, you just offload it, with nowait
> without waiting in the caller till it finishes.  If not, you arrange
> that when some other offloaded job finishes that provides the dependency,
> your scheduled job is executed.
> So, the task on the target is the implicit one, what executes the
> body of the target region.
> In tasking (task.c) dependencies are only honored for sibling tasks,
> whether the different target implicit tasks are sibling is questionable and
> supposedly should be clarified, but I can't imagine they aren't meant to.
> So, you don't really need to care about the task.c dependencies, target.c
> could have its own ones if it is easier to write it that way.
> Supposedly for nowait you want to spawn or queue the job and return right
> away, and for queued job stick it into some data structure (supposedly
> inside of libgomp on the host) that when the library is (asynchronously)
> notified that some offloaded job finished you check the data structures
> and spawn something different.  Or have the data structures on the offloaded
> device instead?
> 
> In any case, I'd look what the Mentor folks are doing for OpenACC async
> offloading, what libmicoffload allows you to do and figure out something
> from that.

One big question is who will maintain the list of scheduled job, its
dependencies, etc. - libgomp or each target plugin?


OpenACC has async queues:
#pragma acc parallel async(2) wait(1)

But it's not possible to have 2 waits like:
#pragma acc parallel async(3) wait(1) wait(2)

(GOMP_OFFLOAD_openacc_async_wait_async has only one argument with the number of
queue to wait)

Thomas, please correct me if I'm wrong.

In this regard, OpenMP is more complicated, since it allows e.g.:
#pragma omp target nowait depend(in: a, b) depend(out: c, d)

Currently I'm trying to figure out what liboffloadmic can do.


BTW, do you plan to remove GOMP_MAP_POINTER mappings from array sections?
The enter/exit patch for libgomp depends on this change.

  -- Ilya


Re: [gomp4.1] Initial support for some OpenMP 4.1 construct parsing

2015-07-17 Thread Jakub Jelinek
On Fri, Jul 17, 2015 at 07:31:36PM +0300, Ilya Verbin wrote:
> One big question is who will maintain the list of scheduled job, its
> dependencies, etc. - libgomp or each target plugin?
> 
> 
> OpenACC has async queues:
> #pragma acc parallel async(2) wait(1)
> 
> But it's not possible to have 2 waits like:
> #pragma acc parallel async(3) wait(1) wait(2)
> 
> (GOMP_OFFLOAD_openacc_async_wait_async has only one argument with the number 
> of
> queue to wait)
> 
> Thomas, please correct me if I'm wrong.
> 
> In this regard, OpenMP is more complicated, since it allows e.g.:
> #pragma omp target nowait depend(in: a, b) depend(out: c, d)

If it is each plugin, then supposedly it should use (if possible) some
common libgomp routine to maintain the queues, duplicating the dependency
graph handling code in each plugin might be too ugly.

> Currently I'm trying to figure out what liboffloadmic can do.
> 
> 
> BTW, do you plan to remove GOMP_MAP_POINTER mappings from array sections?
> The enter/exit patch for libgomp depends on this change.

My current plan (for Monday and onwards) is to first implement firstprivate
on target construct, once that works hack on the GOMP_MAP_POINTER
replacement, and then rewrite the gimplification rules for target construct
for the new 2.15.5 rules (so that this one does not really break all the
target tests we need the first two working somehow).

Jakub


Re: [PATCH] Fix partial template specialization syntax in wide-int.h

2015-07-17 Thread Mike Stump
On Jul 17, 2015, at 2:28 AM, Mikhail Maltsev  wrote:
> The following code (reduced from wide-int.h) is rejected by Intel C++ Compiler
> (EDG-based):

So, could you test this with the top of the tree compiler and file a bug report 
against g++ for it, if it seems to not work right.  If that bug report is 
rejected, then I’d say file a bug report against clang and EDG.

> I think that the warning is correct, and "template <>" should not be used 
> here.
> The attached patch should fix this issue. Bootstrapped and regtested on
> x86_64-linux. OK for trunk?

Ok.  Does this need to go into the gcc-5 release branch as well?  If so, ok 
there too.  Thanks.

Re: Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Ulrich Weigand
Richard Biener wrote:
> On July 17, 2015 3:50:19 PM GMT+02:00, "Martin Liška"  wrote:
> >Question is why aliasing oracle still wrongly aliases these pointers?
> >Another option (suggested by Martin Jambor) would be to place
> >::allocate implementation
> >to alloc-pool.c file.
> 
> Note that all compilers up to 4.4 have aliasing issues with placement new.
> A fix is to move the placement new out-of-line.

Yes, that's what I just noticed as well.  In fact, my particular problem
already disappears with 4.3, presumably due to the fix for PR 29286.

So do we now consider host compilers < 4.3 (4?) unsupported for building
mainline GCC, or should we try to work around the issue (e.g. by moving
the allocator out-of-line or using some other aliasing barrier)?

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  ulrich.weig...@de.ibm.com



Re: Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Richard Biener
On July 17, 2015 6:54:32 PM GMT+02:00, Ulrich Weigand  
wrote:
>Richard Biener wrote:
>> On July 17, 2015 3:50:19 PM GMT+02:00, "Martin Liška"
> wrote:
>> >Question is why aliasing oracle still wrongly aliases these
>pointers?
>> >Another option (suggested by Martin Jambor) would be to place
>> >::allocate implementation
>> >to alloc-pool.c file.
>> 
>> Note that all compilers up to 4.4 have aliasing issues with placement
>new.
>> A fix is to move the placement new out-of-line.
>
>Yes, that's what I just noticed as well.  In fact, my particular
>problem
>already disappears with 4.3, presumably due to the fix for PR 29286.
>
>So do we now consider host compilers < 4.3 (4?) unsupported for
>building
>mainline GCC, or should we try to work around the issue (e.g. by moving
>the allocator out-of-line or using some other aliasing barrier)?

Why is this an issue for stage1 which runs w/o optimization?  For cross 
compiling we already suggest using known good compilers.

>Bye,
>Ulrich




[PATCH] PR target/66906: Replicate static chain on the stack

2015-07-17 Thread H.J. Lu
If we put static chain on the stack, we need to replicate it on the stack
so that static chain can be reached via (argp - 2) slot.  This is needed
for nested function with stack realignment.

OK for trunk if there are no regressions?

H.J.
---
gcc/

PR target/66906
* config/i386/i386.c (ix86_expand_prologue): Replicate static
chain on the stack.

gcc/testsuite/

PR target/66906
* gcc.target/i386/pr66906.c: New test.
---
 gcc/config/i386/i386.c  | 18 -
 gcc/testsuite/gcc.target/i386/pr66906.c | 45 +
 2 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr66906.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 0551a75..3803dde 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -11495,6 +11495,7 @@ ix86_expand_prologue (void)
   HOST_WIDE_INT allocate;
   bool int_registers_saved;
   bool sse_registers_saved;
+  rtx static_chain = NULL_RTX;
 
   ix86_finalize_stack_realign_flags ();
 
@@ -11593,7 +11594,8 @@ ix86_expand_prologue (void)
  call.  This insn will be skipped by the trampoline.  */
   else if (ix86_static_chain_on_stack)
 {
-  insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
+  static_chain = ix86_static_chain (cfun->decl, false);
+  insn = emit_insn (gen_push (static_chain));
   emit_insn (gen_blockage ());
 
   /* We don't want to interpret this push insn as a register save,
@@ -11645,6 +11647,20 @@ ix86_expand_prologue (void)
 we've started over with a new frame.  */
   m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
   m->fs.realigned = true;
+
+  if (static_chain)
+   {
+ /* Replicate static chain on the stack so that static chain
+can be reached via (argp - 2) slot.  This is needed for
+nested function with stack realignment.  */
+ t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
+ t = gen_rtx_SET (stack_pointer_rtx, t);
+ insn = emit_insn (t);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ emit_move_insn (gen_rtx_MEM (Pmode, stack_pointer_rtx),
+ static_chain);
+ m->fs.sp_offset += UNITS_PER_WORD;
+   }
 }
 
   int_registers_saved = (frame.nregs == 0);
diff --git a/gcc/testsuite/gcc.target/i386/pr66906.c 
b/gcc/testsuite/gcc.target/i386/pr66906.c
new file mode 100644
index 000..969e183
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr66906.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target ia32 } } */
+/* { dg-options "-O0 -mregparm=3" } */
+
+typedef int ptrdiff_t;
+extern void abort (void);
+int
+check_int (int *i, int align)
+{
+  *i = 20;
+  if ((((ptrdiff_t) i) & (align - 1)) != 0)
+abort ();
+  return *i;
+}
+void
+check (void *p, int align)
+{
+  if ((((ptrdiff_t) p) & (align - 1)) != 0)
+abort ();
+}
+typedef int aligned __attribute__((aligned(64)));
+void
+foo (void)
+{
+  aligned j;
+  void bar ()
+{
+  aligned i;
+  if (check_int (&i, __alignof__(i)) != i)
+   abort ();
+  if (check_int (&j, __alignof__(j)) != j)
+   abort ();
+  j = -20;
+}
+  bar ();
+  if (j != -20)
+abort ();
+  if (check_int (&j, __alignof__(j)) != j)
+abort ();
+}
+int
+main()
+{
+  foo ();
+  return 0;
+}
-- 
2.4.3



Re: Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Martin Liška
On 07/17/2015 05:03 PM, Richard Biener wrote:
> On July 17, 2015 3:50:19 PM GMT+02:00, "Martin Liška"  wrote:
>> On 07/17/2015 03:44 PM, Ulrich Weigand wrote:
>>> Richard Biener wrote:
 On July 17, 2015 3:11:51 PM GMT+02:00, Ulrich Weigand
>>  wrote:
> (Since there is no C++ operator new involved at all anymore,
> this clearly violates even the C aliasing rules ...)
>
> I really think the allocate routine needs to be more careful to
> avoid violating aliasing, e.g. by using memcpy or union-based
> type-punning to access its free list info.

 As far as I understand the object allocator delegates construction
>> to callers and thus in the above case cselib
 Would be responsible for calling placement new on the return value
>> from
 Allocate.
>>>
>>> Ah, it looks like I was wrong above: the code uses the *object*
>>> allocator, so it should go through a placement new here:
>>>   inline T *
>>>   allocate () ATTRIBUTE_MALLOC
>>>   {
>>> return ::new (m_allocator.allocate ()) T ();
>>>   }
>>>
>>> It's still being miscompiled at least by my GCC 4.1 host compiler ...
>>>
>>> Bye,
>>> Ulrich
>>>
>>
>> Hi.
>>
>> I've just wanted to write you that it really utilizes a placement new
>> :)
>> The first example that bypasses pool allocator is of course a bug, I'll
>> fix.
>>
>> Question is why aliasing oracle still wrongly aliases these pointers?
>> Another option (suggested by Martin Jambor) would be to place
>> ::allocate implementation
>> to alloc-pool.c file.
> 
> Note that all compilers up to 4.4 have aliasing issues with placement new.
> A fix is to move the placement new out-of-line.
> 
> Richard.

Hi Richi.

Should I place the placement new to alloc-pool.c or should I leave it as it is?

Thanks for the decision,
Martin

> 
>> Thoughts?
>> Martin
> 
> 



[gomp4] OpenACC vector and worker reductions

2015-07-17 Thread Cesar Philippidis
This patch adds support for OpenACC vector and worker reductions in a
target-independent fashion. It adds quite a bit of machinery to
accomplish that goal. For starters, three internal functions,
GOACC_REDUCTION_INIT, GOACC_REDUCTION and GOACC_REDUCTION_WRITEBACK,
have been introduced. It's probably easiest to explain all of the
changes with an example. Given an acc loop reduction as follows

  red = ...

  #pragma acc loop reduction (+:red) vector
  for (...)
red++;

the OpenMP way to lower this reduction would be to introduce a new
private variable for 'red', which I'll call red.private. That private
reduction variable gets initialized with some value depending on the
reduction operation. All of the references to the original reduction
variable inside the loop get replaced with the private copy. Immediately
after the loop exits, the original reduction variable is atomically
updated with the private copy.

The code ends up looking something as follows:

  red = ...
  red.private = 0;   // initialize red.private
  #pragma omp for (...)
red.private++;
  #pragma omp continue
red += red.private // this is an atomic operation
  #pragma omp end

Conceptually, this loop may be decomposed into three sections. The first
section is the reduction initializer, the second is the loop, and the
third is the reduction finalizer.

This gets a little more complicated in OpenACC. For starters, there are
three levels of parallelism that may be associated with a single acc
loop. When transferring from one level of parallelism to another, some
targets (e.g. nvptx) may require variable state propagation and
predication due to the constraints of static thread scheduling. Nathan
solved that problem, at least from a high-level, by surrounding acc
loops with GOACC_FORK and GOACC_JOIN function markers.

Furthermore, certain targets have hardware limitations preventing
general atomic operations from being utilized. Specifically, spinlocks
may not be used by threads inside the same warp for nvptx targets. In
gcc 6.0, warps correspond to vectors, which currently contain 32
threads. That said, spinlocks are usable on nvptx targets if only one
thread within a warp is using it. This patch solves this problem by
breaking up the reduction finalizer into two steps -- a parallel
reduction (a call to GOACC_REDUCTION) and a write-back to the original
variable. In OpenACC, the original loop gets lowered into the following
form:

  red = ...
  red.private = GOACC_REDUCTION_INIT (0)
  GOACC_FORK ()
  #pragma omp for (...)
red.private++;
  #pragma omp continue
red.private = GOACC_REDUCTION (gwv_mask, op, red.private)
GOACC_WRITEBACK ()
red += red.private // this is an atomic operation
  #pragma omp end
  GOACC_JOIN ()

First of all, the call to GOACC_REDUCTION_INIT is necessary to ensure
that red.private has a value to propagate to all of the threads
associated with that loop. Without it, in situations where there are
more threads than loop iterations, the threads that didn't enter the
body of the loop would not contain a proper initial value, so the
reduction finalizer would be generating bogus results.

Both GOACC_REDUCTION and GOACC_WRITEBACK get evaluated inside the target
compiler by a new fold_oacc_reductions pass. That pass uses
targetm.goacc.fold_reduction to fold GOACC_REDUCTION in a
target-specific way. That pass also removes the GOACC_WRITEBACK marker
and moves the nearest GOACC_JOIN call into its place if necessary
(worker-only loops are special). This is guaranteed to work because
OpenACC loops are single-entry, single-exit and there is only one
GOACC_WRITEBACK marker per acc loop (there is one GOACC_REDUCTION per
reduction though). Moving the GOACC_JOIN up allows the reduction
write-back to operate in the corresponding 'single' mode. E.g. since this
example executes the body in vector-partitioned mode, the original
reduction variable must be updated in vector-single mode.

There's one more quirk that I encountered while working on this patch.
All dummy arguments to Fortran subroutines are passed by reference. That
causes problems for loop state propagation, because only the pointer
gets propagated, and not the value being pointed to. To get around this,
I taught the gimplifier to introduce a new local copy of the reduction
variable. Now the reduction clause has five operands associated with it,
with the fifth one being the new private reduction variable.

In addition to the above machinery, this patch also implements the
fold_reduction hook on nvptx targets to use a tree-reduction for vector
loops. All other reductions on nvptx targets use atomics.

I hopefully ironed out all of the bugs in this patch, but I am rerunning
the entire regression testsuite again. Any comments are welcome. Is this
reduction scheme too nvptx-specific?

I'll post the test cases in a follow up patch because the patch would be
too big for the mailing list otherwise.

Thanks,
Cesar
2015-07-17  Cesar Philippidis  

	gcc/
	* config/nvptx/nvptx.c (

[gomp4] OpenACC reduction tests

2015-07-17 Thread Cesar Philippidis
This patch updates the libgomp OpenACC reduction test cases to check
worker, vector and combined gang worker vector reductions. I tried to
use some macros to simplify the c test cases a bit. I probably could
have made them more generic with an additional header file/macro, but
then that makes it too confusing to debug. The Fortran tests are a bit
of a lost cause, unless someone knows how to use the preprocessor with
!$acc loops.

Cesar
2015-07-17  Cesar Philippidis  

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction.h: New file.
	* testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Update tests
	with worker, vector and combined reductions.
	* testsuite/libgomp.oacc-c-c++-common/reduction-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/reduction-3.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/reduction-4.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/reduction-5.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/reduction-6.c: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-1.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-2.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-3.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-4.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-5.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-6.f90: Likewise.


diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-1.c
index bb81759..8738927 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-1.c
@@ -3,44 +3,54 @@
 /* Integer reductions.  */
 
 #include 
-#include 
-
-#define ng 32
-
-#define DO_PRAGMA(x) _Pragma (#x)
-
-#define check_reduction_op(type, op, init, b)	\
-  {		\
-type res, vres;\
-res = (init);\
-DO_PRAGMA (acc parallel num_gangs (ng) copy (res)) \
-DO_PRAGMA (acc loop gang reduction (op:res))	\
-for (i = 0; i < n; i++)			\
-  res = res op (b);\
-		\
-vres = (init);\
-for (i = 0; i < n; i++)			\
-  vres = vres op (b);			\
-		\
-if (res != vres)\
-  abort ();	\
-  }
+#include "reduction.h"
+
+const int ng = 8;
+const int nw = 4;
+const int vl = 32;
 
 static void
-test_reductions_int (void)
+test_reductions (void)
 {
-  const int n = 1000;
+  const int n = 100;
   int i;
   int array[n];
 
   for (i = 0; i < n; i++)
-array[i] = i;
-
-  check_reduction_op (int, +, 0, array[i]);
-  check_reduction_op (int, *, 1, array[i]);
-  check_reduction_op (int, &, -1, array[i]);
-  check_reduction_op (int, |, 0, array[i]);
-  check_reduction_op (int, ^, 0, array[i]);
+array[i] = i+1;
+
+  /* Gang reductions.  */
+  check_reduction_op (int, +, 0, array[i], num_gangs (ng), gang);
+  check_reduction_op (int, *, 1, array[i], num_gangs (ng), gang);
+  check_reduction_op (int, &, -1, array[i], num_gangs (ng), gang);
+  check_reduction_op (int, |, 0, array[i], num_gangs (ng), gang);
+  check_reduction_op (int, ^, 0, array[i], num_gangs (ng), gang);
+
+  /* Worker reductions.  */
+  check_reduction_op (int, +, 0, array[i], num_workers (nw), worker);
+  check_reduction_op (int, *, 1, array[i], num_workers (nw), worker);
+  check_reduction_op (int, &, -1, array[i], num_workers (nw), worker);
+  check_reduction_op (int, |, 0, array[i], num_workers (nw), worker);
+  check_reduction_op (int, ^, 0, array[i], num_workers (nw), worker);
+
+  /* Vector reductions.  */
+  check_reduction_op (int, +, 0, array[i], vector_length (vl), vector);
+  check_reduction_op (int, *, 1, array[i], vector_length (vl), vector);
+  check_reduction_op (int, &, -1, array[i], vector_length (vl), vector);
+  check_reduction_op (int, |, 0, array[i], vector_length (vl), vector);
+  check_reduction_op (int, ^, 0, array[i], vector_length (vl), vector);
+
+  /* Combined reductions.  */
+  check_reduction_op (int, +, 0, array[i], num_gangs (ng) num_workers (nw)
+		  vector_length (vl), gang worker vector);
+  check_reduction_op (int, *, 1, array[i], num_gangs (ng) num_workers (nw)
+		  vector_length (vl), gang worker vector);
+  check_reduction_op (int, &, -1, array[i], num_gangs (ng) num_workers (nw)
+		  vector_length (vl), gang worker vector);
+  check_reduction_op (int, |, 0, array[i], num_gangs (ng) num_workers (nw)
+		  vector_length (vl), gang worker vector);
+  check_reduction_op (int, ^, 0, array[i], num_gangs (ng) num_workers (nw)
+		  vector_length (vl), gang worker vector);
 }
 
 static void
@@ -55,32 +65,31 @@ test_reductions_bool (void)
 array[i] = i;
 
   cmp_val = 5;
-#if 0
-  // TODO
-  check_reduction_op (bool, &&, true, (cmp_val > array[i]));
-  check_reduction_op (bool, ||, false, (cmp_val > array[i]));
-#endif
-}
 
-#define check_reduction_macro(type, op, init, b)	\
-  {			\
-type res, vres;	\
-res = (init);	\
-DO_PRAGMA (acc parallel num_gangs (ng) copy(res))	\
-DO_PRAGMA (acc loop gang reducti

ping: [PATCH v3 1/4] libcc1: Introduce GCC_FE_VERSION_1

2015-07-17 Thread Jan Kratochvil
Hi,

I was asked now about this [PATCH v3 1..4] series for a possible inclusion of
its GDB patches counterpart into gdb-7.10.


Jan


RE: [PATCH, MIPS] Add -march=interaptiv

2015-07-17 Thread Moore, Catherine


> -Original Message-
> From: Robert Suchanek [mailto:robert.sucha...@imgtec.com]
> Sent: Thursday, July 16, 2015 10:17 AM
> Subject: [PATCH, MIPS] Add -march=interaptiv
> 
> As in the title, the attached patch adds -march=interaptiv defined to 24kf2_1,
> mapped to -mips32r2 and -mdsp.
> 
> OK to apply?
> 
> 
> gcc/
>   * config/mips/mips-cpus.def (interaptiv): Define.
>   * config/mips/mips-tables.opt: Regenerate.
>   * config/mips/mips.h (MIPS_ISA_LEVEL_SPEC): Map -
> march=interaptiv to
>   -mips32r2.
>   (BASE_DRIVER_SELF_SPECS): Likewise but map to -mdsp.
>   * doc/invoke.texi (-march=@var{arch}): Add interaptiv.
> ---

Yes, this looks OK.


[gomp4, committed] Fix if-clause handling for kernels

2015-07-17 Thread Tom de Vries

Hi,

this patch fixes a problem with if-clauses on kernels, reported by Jim.

It makes sure the condition testing code is generated after the 
definition of the condition variable.


Bootstrapped on x86_64, committed to gomp-4_0-branch.

Thanks,
- Tom
Fix if-clause handling for kernels

2015-07-17  Tom de Vries  

	* omp-low.c (expand_omp_target): Fix location of insertion of if-clause
	handling code.

	* c-c++-common/goacc/if-clause-2.c: Compile with -Wall.
---
 gcc/omp-low.c  | 3 ++-
 gcc/testsuite/c-c++-common/goacc/if-clause-2.c | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 0419dcd..9b062e6 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -9643,7 +9643,8 @@ expand_omp_target (struct omp_region *region)
   tree tmp_var;
 
   tmp_var = create_tmp_var (TREE_TYPE (device));
-  if (offloaded)
+  if (offloaded
+	  && do_splitoff)
 	e = split_block_after_labels (new_bb);
   else
 	{
diff --git a/gcc/testsuite/c-c++-common/goacc/if-clause-2.c b/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
index 5ab8459..bf7d6ed 100644
--- a/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-Wall" } */
+
 void
 f (short c)
 {
-- 
1.9.1



Re: [Fortran, Patch] Passing function pointer to co_reduce

2015-07-17 Thread Steve Kargl
On Fri, Jul 17, 2015 at 03:50:26PM +0200, Mikael Morin wrote:
> Le 17/07/2015 11:02, Mikael Morin a écrit :
> > Le 16/07/2015 16:34, Damian Rouson a écrit :
> >> Alternatively, if it's easy, please feel free to add the directives and 
> >> commit.  
> >>
> 
> > Never mind, I'll take care of it all.
> > 
> This is what I have committed:
> https://gcc.gnu.org/r225930 (trunk)
> https://gcc.gnu.org/r225932 (5 branch)
> 

Hi Mikael,

It seems that this commit has changed the scanning of 
other testcases.

FAIL: gfortran.dg/coarray_collectives_16.f90   -O   scan-tree-dump-times 
original "_gfortran_caf_co_reduce \\(&desc.., &fr, 4, _gfortran_caf_num_images 
\\(0, -1\\), &stat1, errmesg1, 0, 6\\);" 1
FAIL: gfortran.dg/coarray_collectives_16.f90   -O   scan-tree-dump-times 
original "_gfortran_caf_co_reduce \\(&val2, &gz, 0, 4, &stat2, errmesg2, 0, 
7\\);" 1
FAIL: gfortran.dg/coarray_collectives_16.f90   -O   scan-tree-dump-times 
original "_gfortran_caf_co_reduce \\(&desc.., &hc, 1, res, &stat3, errmesg3, 
99, 8\\);"

-- 
Steve


[gomp4] cleanup firstprivate test case

2015-07-17 Thread Cesar Philippidis
Tom noticed that one of my firstprivate test cases in libgomp had an omp
pragma. That pragma shouldn't be there. I probably forgot to remove that
pragma when I integrated that test into the libgomp test suite. This
patch corrects that test.

I applied this patch to gomp-4_0-branch.

Cesar
2015-07-17  Cesar Philippidis  

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/firstprivate-2.c: Remove
	omp pragma.


diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-2.c
index e5fc6a0..69abb23 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-2.c
@@ -16,7 +16,6 @@ main()
   for (i = 0; i < n; i++)
 b[i] = -1;
 
-  #pragma omp parallel for firstprivate (a)
   #pragma acc parallel num_gangs (n) firstprivate (a)
   #pragma acc loop gang
   for (i = 0; i < n; i++)


Re: [gomp4, committed] Fix if-clause handling for kernels

2015-07-17 Thread Jakub Jelinek
On Fri, Jul 17, 2015 at 08:36:42PM +0200, Tom de Vries wrote:
> diff --git a/gcc/omp-low.c b/gcc/omp-low.c
> index 0419dcd..9b062e6 100644
> --- a/gcc/omp-low.c
> +++ b/gcc/omp-low.c
> @@ -9643,7 +9643,8 @@ expand_omp_target (struct omp_region *region)
>tree tmp_var;
>  
>tmp_var = create_tmp_var (TREE_TYPE (device));
> -  if (offloaded)
> +  if (offloaded
> +   && do_splitoff)

Such short conditions belong on a single line IMHO.

Jakub


Re: [gomp4] Remove device-specific filtering during parsing for OpenACC

2015-07-17 Thread Julian Brown
On Fri, 17 Jul 2015 14:57:14 +0200
Thomas Schwinge  wrote:

> Your commit r225927 appears to have caused:
> 
> [-PASS:-]{+FAIL: libgomp.fortran/declare-simd-2.f90   -O0
> (internal compiler error)+} {+FAIL:+}
> libgomp.fortran/declare-simd-2.f90   -O0  (test for excess errors)
> [-PASS:-]{+UNRESOLVED:+} libgomp.fortran/declare-simd-2.f90   -O0
> [-execution test-] [-PASS:-]{+compilation failed to produce
> executable+} [same for other optimization levels]

This is fixed by the attached. I will apply shortly.

Thanks,

Julian

ChangeLog

gcc/fortran/
* trans-openmp.c (gfc_trans_omp_clauses): Add NULL check for
clauses.

commit 7171ab9066e6b4bb84c317d1892a3a0a77cf63ae
Author: Julian Brown 
Date:   Fri Jul 17 11:46:56 2015 -0700

Add NULL check for clauses in gfc_trans_omp_clauses.

diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index 20a1e65..378dd3b 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -2668,6 +2668,9 @@ gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses,
   tree omp_clauses = gfc_trans_omp_clauses_1 (block, clauses, where,
 	  declare_simd);
 
+  if (clauses == NULL)
+return NULL_TREE;
+
   for (; clauses->device_types; clauses = clauses->dtype_clauses)
 {
   tree c, following_clauses = NULL_TREE, dev_list = NULL_TREE;


Re: [PATCH][AArch64] Improve csinc/csneg/csinv opportunities on immediates

2015-07-17 Thread pinskia




> On Jul 17, 2015, at 9:58 PM, Kyrill Tkachov  wrote:
> 
> 
>> On 10/07/15 14:45, Kyrill Tkachov wrote:
>>> On 10/07/15 10:00, pins...@gmail.com wrote:
>>> 
>>> 
 On Jul 10, 2015, at 1:47 AM, Kyrill Tkachov  wrote:
 
 Hi Andrew,
 
> On 10/07/15 09:40, pins...@gmail.com wrote:
> 
> 
> 
>> On Jul 10, 2015, at 1:34 AM, Kyrill Tkachov  
>> wrote:
>> 
>> Hi all,
>> 
>> Currently when evaluating expressions like (a ? 24 : 25) we will move 24 
>> and 25 into
>> registers and perform a csel on them.  This misses the opportunity to 
>> instead move just 24
>> into a register and then perform a csinc, saving us an instruction and a 
>> register use.
>> Similarly for csneg and csinv.
>> 
>> This patch implements that idea by allowing such pairs of immediates in 
>> *cmov_insn
>> and adding an early splitter that performs the necessary transformation.
>> 
>> The testcase included in the patch demonstrates the kind of 
>> opportunities that are now picked up.
>> 
>> With this patch I see about 9.6% more csinc instructions being generated 
>> for SPEC2006
>> and the generated code looks objectively better (i.e. fewer 
>> mov-immediates and slightly
>> lower register pressure).
>> 
>> Bootstrapped and tested on aarch64.
>> 
>> Ok for trunk?
> I think this is the wrong place for this optimization. It should happen 
> in expr.c and we should produce cond_expr on the gimple level.
 I had considered it, but I wasn't sure how general the conditional 
 increment/negate/inverse operations
 are to warrant a midend implementation. Do you mean the 
 expand_cond_expr_using_cmove function in expr.c?
>>> Yes and we can expand it to even have a target hook on how to expand them 
>>> if needed.
>> I played around in that part and it seems that by the time it gets to 
>> expansion the midend
>> doesn't have a cond_expr of the two immediates, it's a PHI node with the 
>> immediates already expanded.
>> I have not been able to get it to match a cond_expr of two immediates there, 
>> although that could be
>> because I'm unfamiliar with that part of the codebase.
> 
> So by the time we reach expansion time we don't have a COND_EXPR of two 
> immediates, so I tried getting
> the code in expr.c to do the right thing, but it didn't work out.
> This patch catches this opportunity at the RTL level and could catch such 
> cases if they were to be
> generated by any of the pre-combine RTL passes. Or do you reckon looking for 
> these patterns in RTL
> ifcvt is the way to go? I think it would be somewhat messy to express the 
> CSNEG, CSINV opportunities
> there as we don't have optabs for conditional negate and invert, but 
> conditional increment would work,
> though in the aarch64 case we can only do a conditional by 1 rather than a 
> general conditional add.

Right, as I said, I have a patch to phiopt to produce the cond_expr when it is
useful. That is, it creates the cond_expr before we even get to RTL.

Thanks,
Andrew


> 
> Kyrill
> 
> 
>> 
>> Kyrill
>> 
>>> There is already a standard pattern for condition add so the a ? Const1 : 
>>> const2 can be handled in a generic way without much trouble. We should 
>>> handle it better in rtl  ifcvt too (that should be an easier patch). The 
>>> neg and not cases are very target specific but can be handled by a target 
>>> hook and expand it directly to it.
>>> 
>>> 
>   I have patches to do both but I have not got around to cleaning them 
> up. If anyone wants them, I can send a link to my current gcc 5.1 sources 
> with them included.
 Any chance you can post them on gcc-patches even as a rough idea of what 
 needs to be done?
>>> I posted my expr patch a few years ago but I never got around to rth's 
>>> comments. This was the generic increment patch. Basically aarch64 should be 
>>> implementing that pattern too.
>>> 
>>> 
>>> The main reason why this should be handled in gimple is that ifcvt on the 
>>> rtl level is not cheap and does not catch all of the cases the simple 
>>> expansion of phi-opt does. I can dig that patch up and I will be doing that 
>>> next week anyways.
>>> 
>>> Thanks,
>>> Andrew
>>> 
 Thanks,
 Kyrill
 
>   Thanks,
> Andrew
> 
>> Thanks,
>> Kyrill
>> 
>> 2015-07-10  Kyrylo Tkachov  
>> 
>> * config/aarch64/aarch64.md (*cmov_insn): Move stricter
>> check for operands 3 and 4 to pattern predicate.  Allow immediates
>> that can be expressed as csinc/csneg/csinv.  New define_split.
>> (*csinv3_insn): Rename to...
>> (csinv3_insn): ... This.
>> * config/aarch64/aarch64.h (AARCH64_IMMS_OK_FOR_CSNEG): New macro.
>> (AARCH64_IMMS_OK_FOR_CSINC): Likewise.
>> (AARCH64_IMMS_OK_FOR_CSINV): Likewise.
>> * config/aarch64/aarch64.c (aarch64_imms_ok_for_cond_op_1):
>> New fun

Re: [PATCH] PR target/66906: Replicate static chain on the stack

2015-07-17 Thread Uros Bizjak
On Fri, Jul 17, 2015 at 7:49 PM, H.J. Lu  wrote:
> If we put static chain on the stack, we need to replicate it on the stack
> so that static chain can be reached via (argp - 2) slot.  This is needed
> for nested function with stack realignment.
>
> OK for trunk if there are no regressions?
>
> H.J.
> ---
> gcc/
>
> PR target/66906
> * config/i386/i386.c (ix86_expand_prologue): Replicate static
> chain on the stack.
>
> gcc/testsuite/
>
> PR target/66906
> * gcc.target/i386/pr66906.c: New test.
> ---
>  gcc/config/i386/i386.c  | 18 -
>  gcc/testsuite/gcc.target/i386/pr66906.c | 45 
> +
>  2 files changed, 62 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr66906.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 0551a75..3803dde 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -11495,6 +11495,7 @@ ix86_expand_prologue (void)
>HOST_WIDE_INT allocate;
>bool int_registers_saved;
>bool sse_registers_saved;
> +  rtx static_chain = NULL_RTX;
>
>ix86_finalize_stack_realign_flags ();
>
> @@ -11593,7 +11594,8 @@ ix86_expand_prologue (void)
>   call.  This insn will be skipped by the trampoline.  */
>else if (ix86_static_chain_on_stack)
>  {
> -  insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
> +  static_chain = ix86_static_chain (cfun->decl, false);
> +  insn = emit_insn (gen_push (static_chain));
>emit_insn (gen_blockage ());
>
>/* We don't want to interpret this push insn as a register save,
> @@ -11645,6 +11647,20 @@ ix86_expand_prologue (void)
>  we've started over with a new frame.  */
>m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
>m->fs.realigned = true;
> +
> +  if (static_chain)
> +   {
> + /* Replicate static chain on the stack so that static chain
> +can be reached via (argp - 2) slot.  This is needed for
> +nested function with stack realignment.  */
> + t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
> + t = gen_rtx_SET (stack_pointer_rtx, t);
> + insn = emit_insn (t);
> + RTX_FRAME_RELATED_P (insn) = 1;
> + emit_move_insn (gen_rtx_MEM (Pmode, stack_pointer_rtx),
> + static_chain);
> + m->fs.sp_offset += UNITS_PER_WORD;

All above can be just:

 if (static_chain)
{
  /* ... */
  insn = emit_insn (gen_push (static_chain));
  RTX_FRAME_RELATED_P (insn) = 1;
}

Please check if the above code survives bootstrap and regression test on x86_32.

Uros.


Re: [Fortran, Patch] Passing function pointer to co_reduce

2015-07-17 Thread Mikael Morin
Le 17/07/2015 20:40, Steve Kargl a écrit :
> On Fri, Jul 17, 2015 at 03:50:26PM +0200, Mikael Morin wrote:
>> Le 17/07/2015 11:02, Mikael Morin a écrit :
>>> Le 16/07/2015 16:34, Damian Rouson a écrit :
 Alternatively, if it's easy, please feel free to add the directives and 
 commit.  

>>
>>> Never mind, I'll take care of it all.
>>>
>> This is what I have committed:
>> https://gcc.gnu.org/r225930 (trunk)
>> https://gcc.gnu.org/r225932 (5 branch)
>>
> 
> Hi Mikael,
> 
> It seems that this commit has changed the scanning of 
> other testcases.
> 
> FAIL: gfortran.dg/coarray_collectives_16.f90   -O   scan-tree-dump-times 
> original "_gfortran_caf_co_reduce \\(&desc.., &fr, 4, 
> _gfortran_caf_num_images \\(0, -1\\), &stat1, errmesg1, 0, 6\\);" 1
> FAIL: gfortran.dg/coarray_collectives_16.f90   -O   scan-tree-dump-times 
> original "_gfortran_caf_co_reduce \\(&val2, &gz, 0, 4, &stat2, errmesg2, 0, 
> 7\\);" 1
> FAIL: gfortran.dg/coarray_collectives_16.f90   -O   scan-tree-dump-times 
> original "_gfortran_caf_co_reduce \\(&desc.., &hc, 1, res, &stat3, errmesg3, 
> 99, 8\\);"
> 
Hmm,

now you bring the topic, I realize I haven't tested the patch against
the testsuite (except the new file).
Will fix.

Mikael


Re: [C++/66443] virtual base of abstract class

2015-07-17 Thread Jason Merrill

On 07/08/2015 10:50 AM, Nathan Sidwell wrote:

On 06/30/15 19:21, Nathan Sidwell wrote:

On 06/30/15 00:19, Jason Merrill wrote:

On 06/29/2015 06:57 PM, Nathan Sidwell wrote:

* method.c (synthesized_method_walk): Skip virtual bases of
abstract classes in C++14 mode.


Let's not limit this to C++14 mode; most DRs apply to earlier
standards as well.


curiously opening it up leads to some test failures related to
determining the exception specifier for implicit ctors and dtors.  Not
had time to investigate that yet ...


If C++98 mode is problematic, we can limit this to C++11 and up.

Jason



Re: [PATCH][doc][13/14] Document AArch64 target attributes and pragmas

2015-07-17 Thread Sandra Loosemore

On 07/17/2015 06:37 AM, Kyrill Tkachov wrote:

Hi Sandra,

On 17/07/15 04:21, Sandra Loosemore wrote:

On 07/16/2015 09:21 AM, Kyrill Tkachov wrote:

Hi all,

This patch adds the documentation for the AArch64 target attributes and
pragmas.

Ok for trunk?

The content looks OK, but I have a bunch of nit-picky comments about
grammar, typos, markup, etc


Thanks for the detailed feedback!
Here's an updated version.


There are still a few things I previously noted, but that you missed in 
preparing this version.



+the same behavior as that of the command line option


s/command line option/command-line option/g

https://gcc.gnu.org/codingconventions.html#Spelling


+where @var{@var{attr-string}} is one of the attribute strings specified above.


@var{@var{...}} should be @code{@var{...}}, I think.


+Note that CPU tuning options and attributes such as the @option{-mcpu=},
+@option{-mtune=} do not inhibit inlining unless the CPU specified by the
+@option{-mcpu=} option or the @option{cpu=} attribute conflicts with the


s/@option{cpu=} attribute/@code{cpu=} attribute/


+@smallexample
+#pragma GCC target("<string>")


s/<string>/@var{string}/


+@end smallexample
+
+where @code{@var{string}} can be any string accepted as an AArch64 target
+attribute.  @xref{AArch64 Function Attributes}, for more details
+on the permissible values of @code{<string>}.


Here too.

OK with those things fixed.

-Sandra



revised and updated new-if-converter patch… [PATCH] fix PR46029: reimplement if conversion of loads and stores

2015-07-17 Thread Abe

Dear all,

Relative to the previous submission of this same patch, the below corrects some 
minor spacing and/or indentation issues,
misc. other formatting fixes, and makes the disabled vectorization tests be disabled via 
"xfail" rather than by adding spaces to
deliberately cause the relevant scanned-for text to not be found by DejaGNU so 
as to prevent the DejaGNU line from being interpreted.

The below is also based on a Git checkout that was rebased to the latest 
upstream check-in from today,
so it should merge cleanly with trunk as of today.

Regards,

Abe








From 89e115118dcc49d6839db2e9d7ae6c330789c235 Mon Sep 17 00:00:00 2001
Subject: [PATCH] fix PR46029: reimplement if conversion of loads and stores

In the current implementation of if conversion, loads and stores are
if-converted in a thread-unsafe way:

  * loads were always executed, even when they should not have been.
Some source code could be rendered invalid due to null pointers
that were OK in the original program because they were never
dereferenced.

  * writes were if-converted via load/maybe-modify/store, which
renders some code multithreading-unsafe.

This patch reimplements if-conversion of loads and stores in a safe
way using a scratchpad allocated by the compiler on the stack:

  * loads are done through an indirection, reading either the correct
data from the correct source [if the condition is true] or reading
from the scratchpad and later ignoring this read result [if the
condition is false].

  * writes are also done through an indirection, writing either to the
correct destination [if the condition is true] or to the
scratchpad [if the condition is false].

Vectorization of "if-cvt-stores-vect-ifcvt-18.c" disabled because the
old if-conversion resulted in unsafe code that could fail under
multithreading even though the as-written code _was_ thread-safe.

Passed regression testing and bootstrap on amd64-linux.

2015-06-12  Sebastian Pop  
Abe Skolnik  

PR tree-optimization/46029
* tree-data-ref.c (struct data_ref_loc_d): Moved...
(get_references_in_stmt): Exported.
* tree-data-ref.h (struct data_ref_loc_d): ... here.
(get_references_in_stmt): Declared.

* doc/invoke.texi (-ftree-loop-if-convert-stores): Update description.
* tree-if-conv.c (struct ifc_dr): Removed.
(IFC_DR): Removed.
(DR_WRITTEN_AT_LEAST_ONCE): Removed.
(DR_RW_UNCONDITIONALLY): Removed.
(memrefs_read_or_written_unconditionally): Removed.
(write_memrefs_written_at_least_once): Removed.
(ifcvt_could_trap_p): Does not take refs parameter anymore.
(ifcvt_memrefs_wont_trap): Removed.
(has_non_addressable_refs): New.
(if_convertible_gimple_assign_stmt_p): Call has_non_addressable_refs.
Removed use of refs.
(if_convertible_stmt_p): Removed use of refs.
(if_convertible_gimple_assign_stmt_p): Same.
(if_convertible_loop_p_1): Removed use of refs.  Remove initialization
of dr->aux, DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY.
(insert_address_of): New.
(create_scratchpad): New.
(create_indirect_cond_expr): New.
(predicate_mem_writes): Call create_indirect_cond_expr.  Take an extra
parameter for scratch_pad.
(combine_blocks): Same.
(tree_if_conversion): Same.

testsuite/
* g++.dg/tree-ssa/ifc-pr46029.C: New.
* gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel.
* gcc.dg/tree-ssa/ifc-8.c: New.
* gcc.dg/tree-ssa/ifc-9.c: New.
* gcc.dg/tree-ssa/ifc-10.c: New.
* gcc.dg/tree-ssa/ifc-11.c: New.
* gcc.dg/tree-ssa/ifc-12.c: New.
* gcc.dg/vect/if-cvt-stores-vect-ifcvt-18.c: Disabled.
* gcc.dg/vect/if-cvt-stores-vect-ifcvt-19.c: New.
---
 gcc/common.opt |   2 +-
 gcc/doc/invoke.texi|  29 +-
 gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C|  76 
 gcc/testsuite/gcc.dg/tree-ssa/cunroll-10.c |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/ifc-10.c |  17 +
 gcc/testsuite/gcc.dg/tree-ssa/ifc-11.c |  16 +
 gcc/testsuite/gcc.dg/tree-ssa/ifc-12.c |  13 +
 gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c  |  19 +-
 gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c  |  29 ++
 gcc/testsuite/gcc.dg/tree-ssa/ifc-9.c  |  17 +
 .../gcc.dg/vect/if-cvt-stores-vect-ifcvt-18.c  |  10 +-
 .../gcc.dg/vect/if-cvt-stores-vect-ifcvt-19.c  |  46 +++
 gcc/testsuite/gcc.dg/vect/vect-mask-loadstore-1.c  |   1 +
 gcc/testsuite/gcc.target/i386/avx2-gather-6.c  |   2 +-
 .../gcc.target/i386/avx2-vect-aggressive-1.c   |   2 +-
 .../gcc.target/i386/avx2-vect-aggressive.c |   3 +-
 gcc/tree-data-ref.c|  13 +-
 gcc/tree-data-ref.h|  14 

Re: [C++/66443] virtual base of abstract class

2015-07-17 Thread Nathan Sidwell

On 07/17/15 15:42, Jason Merrill wrote:

On 07/08/2015 10:50 AM, Nathan Sidwell wrote:

On 06/30/15 19:21, Nathan Sidwell wrote:

On 06/30/15 00:19, Jason Merrill wrote:

On 06/29/2015 06:57 PM, Nathan Sidwell wrote:

* method.c (synthesized_method_walk): Skip virtual bases of
abstract classes in C++14 mode.


Let's not limit this to C++14 mode; most DRs apply to earlier
standards as well.


curiously opening it up leads to some test failures related to
determining the exception specifier for implicit ctors and dtors.  Not
had time to investigate that yet ...


If C++98 mode is problematic, we can limit this to C++11 and up.


I'm not yet sure.  The failure mode I saw surprised me, and suggests there's 
something wrong with the patch.  Sadly, I've got interrupted by other stuff.


nathan


Re: [PATCH] PR target/66906: Replicate static chain on the stack

2015-07-17 Thread H.J. Lu
On Fri, Jul 17, 2015 at 12:35 PM, Uros Bizjak  wrote:
> On Fri, Jul 17, 2015 at 7:49 PM, H.J. Lu  wrote:
>> If we put static chain on the stack, we need to replicate it on the stack
>> so that static chain can be reached via (argp - 2) slot.  This is needed
>> for nested function with stack realignment.
>>
>> OK for trunk if there are no regressions?
>>
>> H.J.
>> ---
>> gcc/
>>
>> PR target/66906
>> * config/i386/i386.c (ix86_expand_prologue): Replicate static
>> chain on the stack.
>>
>> gcc/testsuite/
>>
>> PR target/66906
>> * gcc.target/i386/pr66906.c: New test.
>> ---
>>  gcc/config/i386/i386.c  | 18 -
>>  gcc/testsuite/gcc.target/i386/pr66906.c | 45 
>> +
>>  2 files changed, 62 insertions(+), 1 deletion(-)
>>  create mode 100644 gcc/testsuite/gcc.target/i386/pr66906.c
>>
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index 0551a75..3803dde 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -11495,6 +11495,7 @@ ix86_expand_prologue (void)
>>HOST_WIDE_INT allocate;
>>bool int_registers_saved;
>>bool sse_registers_saved;
>> +  rtx static_chain = NULL_RTX;
>>
>>ix86_finalize_stack_realign_flags ();
>>
>> @@ -11593,7 +11594,8 @@ ix86_expand_prologue (void)
>>   call.  This insn will be skipped by the trampoline.  */
>>else if (ix86_static_chain_on_stack)
>>  {
>> -  insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
>> +  static_chain = ix86_static_chain (cfun->decl, false);
>> +  insn = emit_insn (gen_push (static_chain));
>>emit_insn (gen_blockage ());
>>
>>/* We don't want to interpret this push insn as a register save,
>> @@ -11645,6 +11647,20 @@ ix86_expand_prologue (void)
>>  we've started over with a new frame.  */
>>m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
>>m->fs.realigned = true;
>> +
>> +  if (static_chain)
>> +   {
>> + /* Replicate static chain on the stack so that static chain
>> +can be reached via (argp - 2) slot.  This is needed for
>> +nested function with stack realignment.  */
>> + t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
>> + t = gen_rtx_SET (stack_pointer_rtx, t);
>> + insn = emit_insn (t);
>> + RTX_FRAME_RELATED_P (insn) = 1;
>> + emit_move_insn (gen_rtx_MEM (Pmode, stack_pointer_rtx),
>> + static_chain);
>> + m->fs.sp_offset += UNITS_PER_WORD;
>
> All above can be just:
>
>  if (static_chain)
> {
>   /* ... */
>   insn = emit_insn (gen_push (static_chain));
>   RTX_FRAME_RELATED_P (insn) = 1;
> }
>
> Please check if the above code survives bootstrap and regression test on 
> x86_32.

I am testing this on both ia32 and x86-64.  I will check it in if there are no
regressions.

Thanks.


-- 
H.J.
From 86be1d677bbef8bb23f0dfdffc3185b8afbcb080 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Fri, 17 Jul 2015 10:44:40 -0700
Subject: [PATCH] Replicate static chain on the stack

If we put static chain on the stack, we need to replicate it on the stack
when stack is realigned with DRAP so that static chain can be reached via
(argp - 2) slot.

gcc/

	PR target/66906
	* config/i386/i386.c (ix86_expand_prologue): Replicate static
	chain on the stack.

gcc/testsuite/

	PR target/66906
	* gcc.target/i386/pr66906.c: New test.
---
 gcc/config/i386/i386.c  | 13 +-
 gcc/testsuite/gcc.target/i386/pr66906.c | 45 +
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr66906.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 55e1e2d..01a1cb9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -11495,6 +11495,7 @@ ix86_expand_prologue (void)
   HOST_WIDE_INT allocate;
   bool int_registers_saved;
   bool sse_registers_saved;
+  rtx static_chain = NULL_RTX;
 
   ix86_finalize_stack_realign_flags ();
 
@@ -11593,7 +11594,8 @@ ix86_expand_prologue (void)
  call.  This insn will be skipped by the trampoline.  */
   else if (ix86_static_chain_on_stack)
 {
-  insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
+  static_chain = ix86_static_chain (cfun->decl, false);
+  insn = emit_insn (gen_push (static_chain));
   emit_insn (gen_blockage ());
 
   /* We don't want to interpret this push insn as a register save,
@@ -11645,6 +11647,15 @@ ix86_expand_prologue (void)
 	 we've started over with a new frame.  */
   m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
   m->fs.realigned = true;
+
+  if (static_chain)
+	{
+	  /* Replicate static chain on the stack so that static chain
+	 can be reached via (argp - 2) slot.  This is needed for
+	 nested function with stack realignment.  */
+	  insn = emit_insn (gen_push (static_chain))

Another benefit of the new if converter: better performance for half hammocks when running the generated code on a modern high-speed CPU with write-back caching, relative to the code produced by the o

2015-07-17 Thread Abe

Dear all,

Another benefit of the new if converter that perhaps I neglected to 
mention/explain...

TLDR: for some source code which can reasonably be expected to exist in "real-world 
code",
when the false/true values of the condition bits of a given "if" in a given 
loop are very
well-clustered, the code produced by the new converter runs _much_ faster for 
the same
inputs than the code produced by the old converter when write-back caching is 
in effect.

The long explanation follows.



In the case of a loop with a "half-hammock" that looks something like this:

  if (C[index])  A[index] = foo(bar);
  // important: no else here

... or problem-wise-equivalently:

  if (C[index])  ; // empty "then" section
  else   B[index] = foo(bar);

... the latter of which is semantically equivalent to:

  if (! C[index])  B[index] = foo(bar);
  // important: no else here


... the old if converter does something that may massively damage performance.

Basically, the old if converter converts...

  if (C[index])  A[index] = foo(bar);
  // important: no else here

... to the equivalent of:

  __compiler_temp = foo(bar);
  A[index] = C[index] ? __compiler_temp : A[index];


For now, let`s assume the preceding conversion is valid even in the face of 
multithreading,
since multithreading bugs introduced by an "optimization" are a whole other
ball of wax than what this message is all about; for now, let`s assume that
all of A[] is thread-local and no nasty, sneaky pointer-passing has occurred.

The problem is this: the compiler cannot, in the general case, predict what the 
values of C[]
will be at runtime.  Therefore, it cannot [again, in the general case] arrive at 
a conclusion
"this is definitely worth it" or "this is definitely _not_ worth it".  All the 
compiler can do
statically without profiling information is to say "I guess a probability of 
50% on the
elements of C[] being equivalent to true", which -- under an assumption of 
vectorization --
means that the vectorization factor is going to make the transformation 
worthwhile.

However: what if the values of C[] are mostly equivalent to false, not to true? 
 For such
cases, the old if converter yielded code that may cause a big performance 
degradation due to
the if conversion, even in the presence of vectorization.  If we assume that 
the CPU hardware
is not checking to see whether writes to an address change the contents, then 
each execution
of "A[index] = C[index] ? foo(bar) : A[index];" is causing a write to occur 
*_no matter
what the value of "C[index]" is/was_*.  Now, instead of reading the whole A[] 
and writing
a tiny fraction of it, the program is reading all of A[] and also (re)writing 
at least
almost all of A[] (possibly writing all of it even when the probability of the
relevant elements of C[] is _not_ 100%, due to cache-line granularity of writes:
when every cache line from A[] is modified, all of A[] will be rewritten).

The preceding problem could be somewhat ameliorated by profiling, providing 
that the data
you run through your program while profiling is a good representation of the 
data run
through the same program by "real executions", but there is no need for that 
extra work
or extra qualification given the existence of the new if converter.  Plus, the 
profiling
approach to "fixing" this problem with the old converter would only result in a 
binary
decision -- "do convert this if" vs. "don`t convert this if" -- which in cases 
where the
decision is to do/retain the conversion, the converted code is going to rewrite 
the whole
array.  The new converter`s conversion, OTOH, can produce better performance 
than the
conversion from the old converter in cases where the elements of C[] in our 
example are
very clustered: in other words, the _overall_ probability can still be close 
[or =] to the
hardest-to-deal-with challenge of 50%, but there is a large degree of 
clustering of the
"true" values and the "false" values.  For example, all the "false" values come 
first in C[].
In this case, if a profiling-based approach using the old converter decides to 
do/keep
the conversion, then there are still lots of wasted writes that the new 
converter
would avoid, assuming at least one level of write-back cache in the relevant 
data path.

The key factor to understand for understanding how/why the new converter`s 
resulting code
is better than that of the old converter is this: the new converter uses a 
scratchpad
to "throw away" useless writes.  This not only fixes problems with speculative 
writes
through a null pointer that the pre-conversion code never actually does, it 
also fixes
the above-described potential performance problem, at least on architectures 
with
write-back data cache, which AFAIK covers most/all modern high-speed CPUs.
The new converter converts something like (same example as one of the above):

  if (C[index])  A[index] = foo(bar);
  // important: no else here

... into something like:

  /* the type of the scalar goes

Re: [PATCH, i386]: Fix PR 66866, incorrect load address on manual vector shuffle

2015-07-17 Thread Uros Bizjak
On Thu, Jul 16, 2015 at 12:17 AM, Uros Bizjak  wrote:
> Attached patch fixes PR 66866. The problem was in ix86_expand_pinsr,
> where we didn't account for non-lowpart source register and just
> blindly took a SUBREG of it.

Attached patch is the version for release branches. As suggested by
Richi in the PR, the patch simply rejects non-lowpart source subregs.

2015-07-17  Uros Bizjak  

PR target/66866
* config/i386/i386.c (ix86_expand_pinsr): Reject non-lowpart
source subregs.

testsuite/ChangeLog:

2015-07-17  Uros Bizjak  

PR target/66866
* g++.dg/pr66866.C: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}, will
be committed to active release branches.

Uros.
Index: config/i386/i386.c
===
--- config/i386/i386.c  (revision 225932)
+++ config/i386/i386.c  (working copy)
@@ -50335,6 +50335,14 @@ ix86_expand_pinsr (rtx *operands)
   unsigned int size = INTVAL (operands[1]);
   unsigned int pos = INTVAL (operands[2]);
 
+  if (GET_CODE (src) == SUBREG)
+{
+  /* Reject non-lowpart subregs.  */
+  if (SUBREG_BYTE (src) != 0)
+   return false;
+  src = SUBREG_REG (src);
+}
+
   if (GET_CODE (dst) == SUBREG)
 {
   pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
@@ -50341,9 +50349,6 @@ ix86_expand_pinsr (rtx *operands)
   dst = SUBREG_REG (dst);
 }
 
-  if (GET_CODE (src) == SUBREG)
-src = SUBREG_REG (src);
-
   switch (GET_MODE (dst))
 {
 case V16QImode:
Index: testsuite/g++.dg/pr66866.C
===
--- testsuite/g++.dg/pr66866.C  (revision 0)
+++ testsuite/g++.dg/pr66866.C  (working copy)
@@ -0,0 +1,29 @@
+// { dg-do run { target i?86-*-* x86_64-*-* } }
+// { dg-require-effective-target sse2_runtime }
+// { dg-options "-O -msse2" }
+
+extern "C" void abort (void);
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef short A __attribute__((__may_alias__));
+
+__m128i __attribute__((noinline))
+shuf(const __m128i v)
+{
+  __m128i r;
+
+  reinterpret_cast<A *>(&r)[5] = reinterpret_cast<const A *>(&v)[4];
+  return r;
+}
+
+int main()
+{
+  __attribute__((aligned(16))) short mem[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+
+  *reinterpret_cast<__m128i *>(mem) = shuf (*reinterpret_cast<__m128i *>(mem));
+
+  if (mem[5] != 4)
+abort ();
+
+  return 0;
+}


Re: [Fortran, Patch] Passing function pointer to co_reduce

2015-07-17 Thread Mikael Morin
Le 17/07/2015 21:39, Mikael Morin a écrit :
>> It seems that this commit has changed the scanning of 
>> other testcases.
>>
> Will fix.
> 
Done:
https://gcc.gnu.org/r225965 (trunk)
https://gcc.gnu.org/r225969 (5 branch)

Mikael


Index: gcc/testsuite/gfortran.dg/coarray_collectives_16.f90
===
--- gcc/testsuite/gfortran.dg/coarray_collectives_16.f90	(révision 225964)
+++ gcc/testsuite/gfortran.dg/coarray_collectives_16.f90	(révision 225965)
@@ -33,6 +33,6 @@ contains
   end function hc
 end program test
 
-! { dg-final { scan-tree-dump-times "_gfortran_caf_co_reduce \\(&desc.., &fr, 4, _gfortran_caf_num_images \\(0, -1\\), &stat1, errmesg1, 0, 6\\);" 1 "original" } }
-! { dg-final { scan-tree-dump-times "_gfortran_caf_co_reduce \\(&val2, &gz, 0, 4, &stat2, errmesg2, 0, 7\\);" 1 "original" } }
-! { dg-final { scan-tree-dump-times "_gfortran_caf_co_reduce \\(&desc.., &hc, 1, res, &stat3, errmesg3, 99, 8\\);" 1 "original" } }
+! { dg-final { scan-tree-dump-times "_gfortran_caf_co_reduce \\(&desc.., fr, 4, _gfortran_caf_num_images \\(0, -1\\), &stat1, errmesg1, 0, 6\\);" 1 "original" } }
+! { dg-final { scan-tree-dump-times "_gfortran_caf_co_reduce \\(&val2, gz, 0, 4, &stat2, errmesg2, 0, 7\\);" 1 "original" } }
+! { dg-final { scan-tree-dump-times "_gfortran_caf_co_reduce \\(&desc.., hc, 1, res, &stat3, errmesg3, 99, 8\\);" 1 "original" } }
Index: gcc/testsuite/ChangeLog
===
--- gcc/testsuite/ChangeLog	(révision 225964)
+++ gcc/testsuite/ChangeLog	(révision 225965)
@@ -1,3 +1,8 @@
+2015-07-17  Mikael Morin  
+
+	* gfortran.dg/coarray_collectives_16.f90: Fix pattern
+	as follow-up to r225930.
+
 2015-07-17  Paul Thomas  
 
 	PR fortran/52846



Re: [PATCH][combine][1/2] Try to simplify before substituting

2015-07-17 Thread Jeff Law

On 07/17/2015 05:36 AM, Kyrill Tkachov wrote:


On 16/07/15 19:28, Segher Boessenkool wrote:

On Thu, Jul 16, 2015 at 07:17:54PM +0100, Kyrill Tkachov wrote:

If you always want to simplify first, does it work to move this
whole big
block behind the simplify just following it?  Or do you want to
simplify
after the transform as well?

You mean move this hunk outside the "if (BINARY_P (x)...)" block it's
in?
I think it would work, but I'm not sure if it would affect other cases.
I was also conscious that simplify_rtx might not be a cheap function
to call
so frequently (or is it? I didn't profile it), so I tried to avoid
calling
it unless I need for the transformation in question here.

I mean move the whole "if (BINARY_P ..." block to after the existing
simplify calls, to just before the "First see if we can apply" comment,
and not do a new simplify_rtx call at all.  Does that work?


Yes, and here's the patch.
It just moves the simplification block.
The effect on codegen in SPEC2006 on aarch64 looks sane in the same
way as the original patch I posted (i.e. many redundant zero_extends
eliminated)
and together with patch 2/2 this helps in the -abs testcase.

I'm bootstrapping this on aarch64, arm and x86.
Any other testing would be appreciated.

Is this version ok if testing comes clean?

Thanks,
Kyrill

2015-07-17  Kyrylo Tkachov  

 * combine.c (combine_simplify_rtx): Move simplification step
 before various transformations/substitutions.

OK.
jeff


Re: [PATCH] fix pr46851 and pr60340: remove unmaintained omega dependence test

2015-07-17 Thread Jeff Law

On 07/17/2015 02:40 PM, Sebastian Pop wrote:

Regstrapped on amd64-linux.
Ok for trunk?

Thanks,
Sebastian

2015-07-17  Sebastian Pop  

PR middle-end/46851
PR middle-end/60340
* Makefile.in: Removed omega.o.
* common.opt: Remove flag fcheck-data-deps.
* doc/invoke.texi: Remove documentation for fcheck-data-deps and
its associated params: omega-max-vars, omega-max-geqs,
omega-max-eqs, omega-max-wild-cards, omega-hash-table-size,
omega-max-keys, omega-eliminate-redundant-constraints.
* doc/loop.texi: Remove all the section on Omega.
* graphite-blocking.c: Include missing params.h: it used to be
included through tree-data-ref.h and omega.h.
* graphite-isl-ast-to-gimple.c: Same.
* graphite-optimize-isl.c: Same.
* graphite-sese-to-poly.c: Same.
* graphite.c: Same.
* omega.c: Remove.
* omega.h: Remove.
* params.def: Removed PARAM_OMEGA_MAX_VARS, PARAM_OMEGA_MAX_GEQS,
PARAM_OMEGA_MAX_EQS, PARAM_OMEGA_MAX_WILD_CARDS,
PARAM_OMEGA_HASH_TABLE_SIZE, PARAM_OMEGA_MAX_KEYS, and
PARAM_OMEGA_ELIMINATE_REDUNDANT_CONSTRAINTS.
* passes.def: Remove pass_check_data_deps.
* tree-data-ref.c (dump_affine_function): Declare DEBUG_FUNCTION.
(dump_conflict_function): Same.
(dump_subscript): Same.
(print_direction_vector): Same.
(print_dir_vectors): Same.
(print_lambda_vector): Same.
(print_dist_vectors): Same.
(dump_data_dependence_relation): Same.
(dump_data_dependence_relations): Same.
(dump_dist_dir_vectors): Same.
(dump_ddrs): Same.
(init_omega_eq_with_af): Removed.
(omega_extract_distance_vectors): Removed.
(omega_setup_subscript): Removed.
(init_omega_for_ddr_1): Removed.
(init_omega_for_ddr): Removed.
(ddr_consistent_p): Removed.
(compute_affine_dependence): Do not use omega to check data
dependences.
(compute_data_dependences_for_bb): Removed.
(analyze_all_data_dependences): Removed.
(tree_check_data_deps): Removed.
* tree-data-ref.h: Do not include omega.h.
(compute_data_dependences_for_bb): Removed.
(tree_check_data_deps): Removed.
* tree-ssa-loop.c (pass_check_data_deps): Removed.
(make_pass_check_data_deps): Removed.
* tree-ssa-phiopt.c: Include params.h.
* tree-vect-data-refs.c: Same.
* tree-vect-slp.c: Same.
My only concern would be whether or not to keep the -fcheck-data-deps
option as a deprecated option -- OK with that change.


Jeff



Re: Still crashes due to aliasing violation (Re: [RFC, PATCH] Split pool_allocator and create a new object_allocator)

2015-07-17 Thread Ulrich Weigand
On July 17, 2015 6:54:32 PM GMT+02:00, Ulrich Weigand  
wrote:
> >So do we now consider host compilers < 4.3 (4?) unsupported for
> >building
> >mainline GCC, or should we try to work around the issue (e.g. by moving
> >the allocator out-of-line or using some other aliasing barrier)?
> 
> Why is this an issue for stage1 which runs w/o optimization?

Well, this is the SPU compiler on a Cell system, which is technically
a cross compiler from PowerPC (even though the resulting binaries run
natively on the machine).

> For cross compiling we already suggest using known good compilers.

The documentation says:

  To build a cross compiler, we recommend first building and installing
  a native compiler. You can then use the native GCC compiler to build
  the cross compiler. The installed native compiler needs to be GCC
  version 2.95 or later. 

So building with a native GCC 4.1 seems to have been officially
supported until now as far as I can tell (unless you're building Ada).


Now, I could certainly live with a statement that cross compilers can
only be built with a native GCC 4.3 or newer; but that should be IMO
a deliberate decision and be widely announced (maybe even verified
by a configure check?), so that others don't run into the problem;
the nature of its symptoms makes the problem difficult to diagnose.


Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  ulrich.weig...@de.ibm.com



Re: [C/C++ PATCH] Implement -Wshift-overflow (PR c++/55095) (take 3)

2015-07-17 Thread Jeff Law

On 07/17/2015 02:09 AM, Marek Polacek wrote:

Ping^6.

On Fri, Jul 10, 2015 at 03:23:43PM +0200, Marek Polacek wrote:

Ping^5.

On Fri, Jul 03, 2015 at 09:42:39AM +0200, Marek Polacek wrote:

Ping^4.

On Fri, Jun 26, 2015 at 10:08:51AM +0200, Marek Polacek wrote:

I'm pinging the C++ parts.

On Fri, Jun 19, 2015 at 12:44:36PM +0200, Marek Polacek wrote:

Ping.

On Fri, Jun 12, 2015 at 11:07:29AM +0200, Marek Polacek wrote:

Ping.

On Fri, Jun 05, 2015 at 10:55:08AM +0200, Marek Polacek wrote:

On Thu, Jun 04, 2015 at 09:04:19PM +, Joseph Myers wrote:

The C changes are OK.


Jason, do you want to approve the C++ parts?

I'll approve the C++ parts given how simple they are :-)

jeff


Re: [PATCH][simplify-rtx][2/2] Simplify - (y ? -x : x) -> (!y ? -x : x)

2015-07-17 Thread Jeff Law

On 07/16/2015 09:25 AM, Kyrill Tkachov wrote:

Hi all,

In this second patch I add the transformation mentioned in the subject
to simplify-rtx.c.
In combination with the first patch to combine, combine_simplify_rtx now
picks it up in the
testcase and does the right thing by not emitting an extra negate after
the conditional negate
operation.

Bootstrapped and tested on aarch64, arm, x86_64.

Ok for trunk?

Thanks,
Kyrill


2015-07-16  Kyrylo Tkachov 

 * simplify-rtx.c (simplify_unary_operation_1, NEG case):
 (neg (x ? (neg y) : y)) -> !x ? (neg y) : y.

OK.
jeff



[PATCH] Rename parameters which are within scop

2015-07-17 Thread Aditya Kumar
---
 gcc/graphite-isl-ast-to-gimple.c | 153 +++
 1 file changed, 122 insertions(+), 31 deletions(-)

diff --git a/gcc/graphite-isl-ast-to-gimple.c b/gcc/graphite-isl-ast-to-gimple.c
index b32781a..3e2c1fa 100644
--- a/gcc/graphite-isl-ast-to-gimple.c
+++ b/gcc/graphite-isl-ast-to-gimple.c
@@ -124,9 +124,84 @@ void ivs_params_clear (ivs_params &ip)
 }
 }
 
-static tree
-gcc_expression_from_isl_expression (tree type, __isl_take isl_ast_expr *,
-   ivs_params &ip);
+class translate_isl_ast_to_gimple
+{
+public:
+  translate_isl_ast_to_gimple (sese r)
+   : region (r)
+  { }
+
+  edge translate_isl_ast (loop_p context_loop, __isl_keep isl_ast_node *node,
+ edge next_e, ivs_params &ip);
+
+  edge translate_isl_ast_node_for (loop_p context_loop,
+  __isl_keep isl_ast_node *node,
+  edge next_e, ivs_params &ip);
+
+  edge translate_isl_ast_for_loop (loop_p context_loop,
+  __isl_keep isl_ast_node *node_for,
+  edge next_e,
+  tree type, tree lb, tree ub,
+  ivs_params &ip);
+
+  edge translate_isl_ast_node_if (loop_p context_loop,
+ __isl_keep isl_ast_node *node,
+ edge next_e, ivs_params &ip);
+
+  edge translate_isl_ast_node_user (__isl_keep isl_ast_node *node,
+   edge next_e, ivs_params &ip);
+
+  edge translate_isl_ast_node_block (loop_p context_loop,
+__isl_keep isl_ast_node *node,
+edge next_e, ivs_params &ip);
+
+  tree unary_op_to_tree (tree type, __isl_take isl_ast_expr *expr,
+ivs_params &ip);
+
+  tree binary_op_to_tree (tree type, __isl_take isl_ast_expr *expr,
+ ivs_params &ip);
+
+  tree ternary_op_to_tree (tree type, __isl_take isl_ast_expr *expr,
+  ivs_params &ip);
+
+  tree nary_op_to_tree (tree type, __isl_take isl_ast_expr *expr,
+   ivs_params &ip);
+
+  tree gcc_expression_from_isl_expression (tree type,
+  __isl_take isl_ast_expr *,
+  ivs_params &ip);
+
+  tree gcc_expression_from_isl_ast_expr_id (tree type,
+   __isl_keep isl_ast_expr *expr_id,
+   ivs_params &ip);
+
+  tree gcc_expression_from_isl_expr_int (tree type,
+__isl_take isl_ast_expr *expr);
+
+  tree gcc_expression_from_isl_expr_op (tree type,
+   __isl_take isl_ast_expr *expr,
+   ivs_params &ip);
+
+  struct loop *graphite_create_new_loop (edge entry_edge,
+__isl_keep isl_ast_node *node_for,
+loop_p outer, tree type,
+tree lb, tree ub, ivs_params &ip);
+
+  edge graphite_create_new_guard (edge entry_edge,
+ __isl_take isl_ast_expr *if_cond,
+ ivs_params &ip);
+
+  edge graphite_create_new_loop_guard (edge entry_edge,
+  __isl_keep isl_ast_node *node_for,
+  tree *type,
+  tree *lb, tree *ub, ivs_params &ip);
+
+  void build_iv_mapping (vec<tree> iv_map, gimple_bb_p gbb,
+__isl_keep isl_ast_expr *user_expr, ivs_params &ip,
+sese region);
+private:
+  sese region;
+};
 
 /* Return the tree variable that corresponds to the given isl ast identifier
expression (an isl_ast_expr of type isl_ast_expr_id).
@@ -136,7 +211,8 @@ gcc_expression_from_isl_expression (tree type, __isl_take 
isl_ast_expr *,
converting type sizes may be problematic when we switch to smaller
types.  */
 
-static tree
+tree
+translate_isl_ast_to_gimple::
 gcc_expression_from_isl_ast_expr_id (tree type,
 __isl_keep isl_ast_expr *expr_id,
 ivs_params &ip)
@@ -147,7 +223,7 @@ gcc_expression_from_isl_ast_expr_id (tree type,
   res = ip.find (tmp_isl_id);
   isl_id_free (tmp_isl_id);
   gcc_assert (res != ip.end () &&
-  "Could not map isl_id to tree expression");
+ "Could not map isl_id to tree expression");
   isl_ast_expr_free (expr_id);
   return fold_convert (type, res->second);
 }
@@ -155,7 +231,8 @@ gcc_expression_from_isl_ast_expr_id (tree type,
 /* Converts an isl_ast_expr_int expression E to a GCC expression tree of
type TYPE.  */
 
-static tree
+tree
+translate_isl_ast_to_gimple::
 gcc_expression_from_isl_expr_int

Re: [PATCH 3/4] S390 -march=native related fixes

2015-07-17 Thread Ulrich Weigand
Dominik Vogt wrote:

> +  opt_esa_zarch = (has_highgprs) ? " -mzarch" : " -mesa";

This will force -mesa on old machines *even in -m64 mode*,
which is wrong and will cause compilation to fail.


> -/* Defaulting rules.  */
>  #ifdef DEFAULT_TARGET_64BIT
> -#define DRIVER_SELF_SPECS\

This completely removes use of DRIVER_SELF_SPECS for defaulting,
which I introduced back here:
https://gcc.gnu.org/ml/gcc-patches/2003-06/msg03369.html

The reason for using DRIVER_SELF_SPECS as described there is to
make sure we use compatible flags for compiler, assembler and
linker in all cases, even if some of those flags result from
defaulting rules.

If we don't do that, we rely on those components agreeing exactly
in how to default for unspecified options; for example, we want
to give the correct -march flag to the assembler as an additional
verification to detect compiler bugs where the compiler erroneously
generates an incorrect instruction for that architecture.

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  ulrich.weig...@de.ibm.com



[PATCH, i386]: Cleanup FPmode splitters a bit

2015-07-17 Thread Uros Bizjak
No functional changes.

2015-07-18  Uros Bizjak  

* config/i386/i386.md (pushsf splitter): Pass curr_insn to
find_constant_src.  FAIL if find_constant_src returns NULL_RTX.
(mem->fpreg splitters): Ditto.
(general_operand->nonimmediate_operand splitter): Use explicit modes.
Disable DFmode for TARGET_64BIT.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
Index: config/i386/i386.md
===
--- config/i386/i386.md (revision 225919)
+++ config/i386/i386.md (working copy)
@@ -2987,10 +2987,15 @@
 (define_split
   [(set (match_operand:SF 0 "push_operand")
(match_operand:SF 1 "memory_operand"))]
-  "reload_completed
-   && (operands[2] = find_constant_src (insn))"
-  [(set (match_dup 0) (match_dup 2))])
+  "reload_completed"
+  [(set (match_dup 0) (match_dup 2))]
+{
+  operands[2] = find_constant_src (curr_insn);
 
+  if (operands[2] == NULL_RTX)
+FAIL;
+})
+
 (define_split
   [(set (match_operand 0 "push_operand")
(match_operand 1 "general_operand"))]
@@ -3083,6 +3088,14 @@
   ]
   (const_string "TI")))])
 
+(define_split
+  [(set (match_operand:TF 0 "nonimmediate_operand")
+(match_operand:TF 1 "general_operand"))]
+  "reload_completed
+   && !(ANY_FP_REG_P (operands[0]) || ANY_FP_REG_P (operands[1]))"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
 ;; Possible store forwarding (partial memory) stall
 ;; in alternatives 4, 6, 7 and 8.
 (define_insn "*movxf_internal"
@@ -3149,6 +3162,14 @@
   ]
   (const_string "*")))])

+(define_split
+  [(set (match_operand:XF 0 "nonimmediate_operand")
+(match_operand:XF 1 "general_operand"))]
+  "reload_completed
+   && !(ANY_FP_REG_P (operands[0]) || ANY_FP_REG_P (operands[1]))"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
 ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
 (define_insn "*movdf_internal"
   [(set (match_operand:DF 0 "nonimmediate_operand"
@@ -3344,6 +3365,14 @@
   ]
   (const_string "*")))])
 
+(define_split
+  [(set (match_operand:DF 0 "nonimmediate_operand")
+(match_operand:DF 1 "general_operand"))]
+  "!TARGET_64BIT && reload_completed
+   && !(ANY_FP_REG_P (operands[0]) || ANY_FP_REG_P (operands[1]))"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
 (define_insn "*movsf_internal"
   [(set (match_operand:SF 0 "nonimmediate_operand"
  "=Yf*f,m   ,Yf*f,?r ,?m,v,v,v,m,?r,?Yi,!*y,!*y,!m,!r ,!*Ym,r  ,m")
@@ -3487,15 +3516,16 @@
&& (GET_MODE (operands[0]) == TFmode
|| GET_MODE (operands[0]) == XFmode
|| GET_MODE (operands[0]) == DFmode
-   || GET_MODE (operands[0]) == SFmode)
-   && (operands[2] = find_constant_src (insn))"
+   || GET_MODE (operands[0]) == SFmode)"
   [(set (match_dup 0) (match_dup 2))]
 {
-  rtx c = operands[2];
-  int r = REGNO (operands[0]);
+  operands[2] = find_constant_src (curr_insn);
 
-  if ((SSE_REGNO_P (r) && !standard_sse_constant_p (c))
-  || (STACK_REGNO_P (r) && standard_80387_constant_p (c) < 1))
+  if (operands[2] == NULL_RTX
+  || (SSE_REGNO_P (REGNO (operands[0]))
+ && !standard_sse_constant_p (operands[2]))
+  || (STACK_REGNO_P (REGNO (operands[0]))
+  && standard_80387_constant_p (operands[2]) < 1))
 FAIL;
 })
 
@@ -3505,15 +3535,16 @@
   "reload_completed
&& (GET_MODE (operands[0]) == TFmode
|| GET_MODE (operands[0]) == XFmode
-   || GET_MODE (operands[0]) == DFmode)
-   && (operands[2] = find_constant_src (insn))"
+   || GET_MODE (operands[0]) == DFmode)"
   [(set (match_dup 0) (match_dup 2))]
 {
-  rtx c = operands[2];
-  int r = REGNO (operands[0]);
+  operands[2] = find_constant_src (curr_insn);
 
-  if ((SSE_REGNO_P (r) && !standard_sse_constant_p (c))
-  || (STACK_REGNO_P (r) && standard_80387_constant_p (c) < 1))
+  if (operands[2] == NULL_RTX
+  || (SSE_REGNO_P (REGNO (operands[0]))
+ && !standard_sse_constant_p (operands[2]))
+  || (STACK_REGNO_P (REGNO (operands[0]))
+  && standard_80387_constant_p (operands[2]) < 1))
 FAIL;
 })
 
@@ -3537,17 +3568,6 @@
 operands[1] = CONST1_RTX (mode);
 })
 
-(define_split
-  [(set (match_operand 0 "nonimmediate_operand")
-(match_operand 1 "general_operand"))]
-  "reload_completed
-   && (GET_MODE (operands[0]) == TFmode
-   || GET_MODE (operands[0]) == XFmode
-   || GET_MODE (operands[0]) == DFmode)
-   && !(ANY_FP_REG_P (operands[0]) || ANY_FP_REG_P (operands[1]))"
-  [(const_int 0)]
-  "ix86_split_long_move (operands); DONE;")
-
 (define_insn "swapxf"
   [(set (match_operand:XF 0 "register_operand" "+f")
(match_operand:XF 1 "register_operand" "+f"))


  1   2   >