[PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Tue, Dec 31, 2013 at 11:30:12AM +0100, Richard Biener wrote:
> Meanwhile your patch is ok.

As discussed in the PR, the patch wasn't sufficient, __cxa_pure_virtual
can appear in the vtable and it has pretty much the same properties
as __builtin_unreachable (void return value, no arguments, noreturn,
DCE or cfg cleanup being able to remove anything after it because of
noreturn).

Additionally, the patch attempts to punt on invalid type changes
(ODR violations?, apparently none appear during bootstrap/regtest as
verified by additional logging added there) or inplace changes of
gimple_call_flags that would require some cleanups caller isn't expected to
do (again, doesn't happen during bootstrap/regtest).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  

PR tree-optimization/59622
* gimple-fold.c: Include calls.h.
(gimple_fold_call): Fix a typo in message.  Handle
__cxa_pure_virtual similarly to __builtin_unreachable.  For
inplace punt if gimple_call_flags would change.  Verify
that lhs and argument types are compatible.

* g++.dg/opt/pr59622-2.C: New test.

--- gcc/gimple-fold.c.jj2013-12-31 12:56:59.0 +0100
+++ gcc/gimple-fold.c   2014-01-02 18:33:51.207921734 +0100
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.
 #include "gimple-pretty-print.h"
 #include "tree-ssa-address.h"
 #include "langhooks.h"
+#include "calls.h"
 
 /* Return true when DECL can be referenced from current unit.
FROM_DECL (if non-null) specify constructor of variable DECL was taken from.
@@ -1167,7 +1168,7 @@ gimple_fold_call (gimple_stmt_iterator *
  (OBJ_TYPE_REF_EXPR 
(callee)
{
  fprintf (dump_file,
-  "Type inheritnace inconsistent devirtualization of ");
+  "Type inheritance inconsistent devirtualization of ");
  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
  fprintf (dump_file, " to ");
  print_generic_expr (dump_file, callee, TDF_SLIM);
@@ -1184,18 +1185,74 @@ gimple_fold_call (gimple_stmt_iterator *
= possible_polymorphic_call_targets (callee, &final);
  if (final && targets.length () <= 1)
{
+ tree fndecl;
+ int call_flags = gimple_call_flags (stmt), new_flags;
  if (targets.length () == 1)
+   fndecl = targets[0]->decl;
+ else
+   fndecl = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
+ new_flags = flags_from_decl_or_type (fndecl);
+ if (inplace)
{
- gimple_call_set_fndecl (stmt, targets[0]->decl);
- changed = true;
+ /* For inplace, avoid changes of call flags that
+might require cfg cleanups, EH cleanups, removal
+of vdef/vuses etc.  */
+ if ((call_flags ^ new_flags)
+ & (ECF_NORETURN | ECF_NOTHROW | ECF_NOVOPS))
+   final = false;
+ else if ((call_flags & ECF_NOVOPS) == 0)
+   {
+ if ((call_flags ^ new_flags) & ECF_CONST)
+   final = false;
+ else if (((call_flags & (ECF_PURE | ECF_CONST
+  | ECF_NORETURN)) == 0)
+  ^ ((new_flags & (ECF_PURE | ECF_CONST
+   | ECF_NORETURN)) == 0))
+   final = false;
+   }
}
- else if (!inplace)
+ if (final)
{
- tree fndecl = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
- gimple new_stmt = gimple_build_call (fndecl, 0);
- gimple_set_location (new_stmt, gimple_location (stmt));
- gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
- return true;
+ tree fntype = gimple_call_fntype (stmt);
+ tree dtype = TREE_TYPE (fndecl);
+ if (gimple_call_lhs (stmt)
+ && !useless_type_conversion_p (TREE_TYPE (fntype),
+TREE_TYPE (dtype)))
+   final = false;
+ else
+   {
+ tree t1, t2;
+ for (t1 = TYPE_ARG_TYPES (fntype),
+  t2 = TYPE_ARG_TYPES (dtype);
+  t1 && t2;
+  t1 = TREE_CHAIN (t1), t2 = TREE_CHAIN (t2))
+   if (!useless_type_conversion_p (TREE_VALUE (t2),
+   TREE_VALUE (t1)))
+ break;
+ if (t1 || t2)
+   final = false;
+   }
+
+   

[PATCH] Don't count asm goto for 4 branch limit optimization (PR target/59625)

2014-01-03 Thread Jakub Jelinek
Hi!

Linus complained about padding inserted in between asm goto.  In 4.8
even -mtune=generic performs ix86_avoid_jump_mispredicts, in 4.9 only e.g.
-mtune=atom and others, but not generic.
The issue is that assuming every asm goto must contain a jump is too
pessimistic, often asm goto (may) transfer control to the label(s) through
other means (e.g. on the side data structure with labels etc.), plus we
conservatively assume asm goto is 0 bytes long, so we were adding the
padding pretty much always when multiple asm goto stmts appeared
consecutively.

This patch just ignores asm goto from the optimization.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 4.8
after a while?

2014-01-03  Jakub Jelinek  

PR target/59625
* config/i386/i386.c (ix86_avoid_jump_mispredicts): Don't consider
asm goto as jump.

* gcc.target/i386/pr59625.c: New test.

--- gcc/config/i386/i386.c.jj   2014-01-01 00:45:15.0 +0100
+++ gcc/config/i386/i386.c  2014-01-02 14:44:07.980841529 +0100
@@ -38852,7 +38852,8 @@ ix86_avoid_jump_mispredicts (void)
  while (nbytes + max_skip >= 16)
{
  start = NEXT_INSN (start);
- if (JUMP_P (start) || CALL_P (start))
+ if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
+ || CALL_P (start))
njumps--, isjump = 1;
  else
isjump = 0;
@@ -38867,7 +38868,8 @@ ix86_avoid_jump_mispredicts (void)
   if (dump_file)
fprintf (dump_file, "Insn %i estimated to %i bytes\n",
 INSN_UID (insn), min_size);
-  if (JUMP_P (insn) || CALL_P (insn))
+  if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
+ || CALL_P (insn))
njumps++;
   else
continue;
@@ -38875,7 +38877,8 @@ ix86_avoid_jump_mispredicts (void)
   while (njumps > 3)
{
  start = NEXT_INSN (start);
- if (JUMP_P (start) || CALL_P (start))
+ if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
+ || CALL_P (start))
njumps--, isjump = 1;
  else
isjump = 0;
--- gcc/testsuite/gcc.target/i386/pr59625.c.jj  2014-01-02 17:17:45.611680569 
+0100
+++ gcc/testsuite/gcc.target/i386/pr59625.c 2014-01-02 17:17:37.0 
+0100
@@ -0,0 +1,36 @@
+/* PR target/59625 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=atom" } */
+
+int
+foo (void)
+{
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  asm goto ("" : : : : lab);
+  return 0;
+lab:
+  return 1;
+}
+
+/* Verify we don't consider asm goto as a jump for four jumps limit
+   optimization.  asm goto doesn't have to contain a jump at all,
+   the branching to labels can happen through different means.  */
+/* { dg-final { scan-assembler-not 
"p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align"
 } } */

Jakub


[PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Jakub Jelinek
Hi!

I've noticed that especially with the AVX512F introduction we use
GET_MODE_SIZE (mode) quite heavily in the i386 *.md files, and
the problem with that is GET_MODE_SIZE isn't a compile time constant,
needs to read mode_size array (non-const) at runtime.

This patch attempts to decrease the enormous .text/.rodata growth from AVX512F
introduction a little bit and improve generated code, by making
GET_MODE_SIZE (mode) compile time constants, thus all the
GET_MODE_SIZE (mode) == 64 and similar tests can fold into
false/true.  The patch saves ~ 110KB of .text (both x86_64 and i686)
and ~ 50KB of .rodata (x86_64) or ~ 27KB of .rodata (i686).

The disadvantage is that the mode attribute duplicates insn-modes.c,
but I guess the listed modes aren't going to change their sizes ever
on this target, and if some mode isn't listed and used in some mode
iterator, one would get syntax errors from insn-*.[ch] and so it wouldn't be
hard to add it.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  

* config/i386/i386.md (MODE_SIZE): New mode attribute.
(push splitter): Use  instead of
GET_MODE_SIZE (mode).
(lea splitter): Use  instead of GET_MODE_SIZE (mode).
(mov -1, reg peephole2): Likewise.
* config/i386/sse.md (*mov_internal,
_storeu,
_storedqu, _andnot3,
*3, *andnot3,
3): Likewise.
* config/i386/subst.md (mask_mode512bit_condition,
sd_mask_mode512bit_condition): Likewise.

--- gcc/config/i386/i386.md.jj  2013-12-27 19:24:33.0 +0100
+++ gcc/config/i386/i386.md 2014-01-02 20:08:16.911851795 +0100
@@ -914,6 +914,20 @@ (define_mode_iterator SWIM248 [(HI "TARG
 (define_mode_iterator DWI [(DI "!TARGET_64BIT")
   (TI "TARGET_64BIT")])
 
+;; GET_MODE_SIZE for selected modes.  As GET_MODE_SIZE is not
+;; compile time constant, it is faster to use  than
+;; GET_MODE_SIZE (mode).  For XFmode which depends on
+;; command line options just use GET_MODE_SIZE macro.
+(define_mode_attr MODE_SIZE [(QI "1") (HI "2") (SI "4") (DI "8") (TI "16")
+(SF "4") (DF "8") (XF "GET_MODE_SIZE (XFmode)")
+(V16QI "16") (V32QI "32") (V64QI "64")
+(V8HI "16") (V16HI "32") (V32HI "64")
+(V4SI "16") (V8SI "32") (V16SI "64")
+(V2DI "16") (V4DI "32") (V8DI "64")
+(V1TI "16") (V2TI "32") (V4TI "64")
+(V2DF "16") (V4DF "32") (V8DF "64")
+(V4SF "16") (V8SF "32") (V16SF "64")])
+
 ;; Double word integer modes as mode attribute.
 (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI")])
 (define_mode_attr dwi [(QI "hi") (HI "si") (SI "di") (DI "ti")])
@@ -2734,7 +2748,7 @@ (define_split
   "reload_completed"
   [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
(set (mem:SF (reg:P SP_REG)) (match_dup 1))]
-  "operands[2] = GEN_INT (-GET_MODE_SIZE (mode));")
+  "operands[2] = GEN_INT (-);")
 
 (define_split
   [(set (match_operand:SF 0 "push_operand")
@@ -5770,7 +5784,7 @@ (define_split
   enum machine_mode mode = mode;
   rtx pat;
 
-  if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
+  if ( < GET_MODE_SIZE (SImode))
 { 
   mode = SImode; 
   operands[0] = gen_lowpart (mode, operands[0]);
@@ -17403,7 +17417,7 @@ (define_peephole2
   [(parallel [(set (match_dup 0) (const_int -1))
  (clobber (reg:CC FLAGS_REG))])]
 {
-  if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
+  if ( < GET_MODE_SIZE (SImode))
 operands[0] = gen_lowpart (SImode, operands[0]);
 })
 
--- gcc/config/i386/sse.md.jj   2014-01-01 00:52:56.0 +0100
+++ gcc/config/i386/sse.md  2014-01-02 20:11:49.723916127 +0100
@@ -669,7 +669,7 @@ (define_insn "*mov_internal"
   /* There is no evex-encoded vmov* for sizes smaller than 64-bytes
 in avx512f, so we need to use workarounds, to access sse registers
 16-31, which are evex-only.  */
-  if (TARGET_AVX512F && GET_MODE_SIZE (mode) < 64
+  if (TARGET_AVX512F &&  < 64
  && ((REG_P (operands[0])
   && EXT_REX_SSE_REGNO_P (REGNO (operands[0])))
  || (REG_P (operands[1])
@@ -677,18 +677,18 @@ (define_insn "*mov_internal"
{
  if (memory_operand (operands[0], mode))
{
- if (GET_MODE_SIZE (mode) == 32)
+ if ( == 32)
return "vextract64x4\t{$0x0, %g1, %0|%0, %g1, 
0x0}";
- else if (GET_MODE_SIZE (mode) == 16)
+ else if ( == 16)
return "vextract32x4\t{$0x0, %g1, %0|%0, %g1, 
0x0}";
  else
gcc_unreachable ();
}
  else if (memory_operand (operands[1], mode))
{
- if (GET_MODE_SIZE (mode) == 32)
+ if ( == 32)
return "vbroadcast64x4\t{%1, %g0|%g0

[PATCH] i?86 unaligned/aligned load improvement for AVX512F

2014-01-03 Thread Jakub Jelinek
Hi!

This is an attempt to port my recent
http://gcc.gnu.org/viewcvs?rev=204219&root=gcc&view=rev
http://gcc.gnu.org/viewcvs?rev=205663&root=gcc&view=rev
http://gcc.gnu.org/viewcvs?rev=206090&root=gcc&view=rev
changes also to AVX512F.  The motivation is to get:

#include 

__m512i
foo (void *x, void *y)
{
  __m512i a = _mm512_loadu_si512 (x);
  __m512i b = _mm512_loadu_si512 (y);
  return _mm512_add_epi32 (a, b);
}

use one of the unaligned memories directly as operand to the vpaddd
instruction.  The first hunk is needed so that we don't regress on say:

#include 

__m512i z;

__m512i
foo (void *x, void *y, int k)
{
  __m512i a = _mm512_mask_loadu_epi32 (z, k, x);
  __m512i b = _mm512_mask_loadu_epi32 (z, k, y);
  return _mm512_add_epi32 (a, b);
}

__m512i
bar (void *x, void *y, int k)
{
  __m512i a = _mm512_maskz_loadu_epi32 (k, x);
  __m512i b = _mm512_maskz_loadu_epi32 (k, y);
  return _mm512_add_epi32 (a, b);
}

Does it matter which of vmovdqu32 vs. vmovdqu64 is used if no
masking/zeroing is performed (i.e. vmovdqu32 (%rax), %zmm0 vs.
vmovdqu64 (%rax), %zmm0) for performance reasons (i.e. isn't there some
reinterpretation penalty)?

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  

* config/i386/sse.md (avx512f_load_mask): Emit vmovup{s,d}
or vmovdqu* for misaligned_operand.
(_loadu,
_loaddqu): Handle .
* config/i386/i386.c (ix86_expand_special_args_builtin): Set
aligned_mem for AVX512F masked aligned load and store builtins and for
non-temporal moves.

* gcc.target/i386/avx512f-vmovdqu32-1.c: Allow vmovdqu64 instead of
vmovdqu32.

--- gcc/config/i386/sse.md.jj   2014-01-02 20:11:49.0 +0100
+++ gcc/config/i386/sse.md  2014-01-02 21:59:06.706161064 +0100
@@ -786,8 +786,12 @@ (define_insn "avx512f_load_mask"
 {
 case MODE_V8DF:
 case MODE_V16SF:
+  if (misaligned_operand (operands[1], mode))
+   return "vmovu\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
   return "vmova\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
 default:
+  if (misaligned_operand (operands[1], mode))
+   return "vmovdqu\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
   return "vmovdqa\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
 }
 }
@@ -936,11 +940,14 @@ (define_expand "_loadumode)
-  /* FIXME: Revisit after AVX512F merge is completed.  */
-  && !)
+  && misaligned_operand (operands[1], mode))
 {
-  emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1]));
+  rtx src = operands[1];
+  if ()
+   src = gen_rtx_VEC_MERGE (mode, operands[1],
+operands[2 * ],
+operands[3 * ]);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], src));
   DONE;
 }
 })
@@ -1046,11 +1053,14 @@ (define_expand "_loadd
  false, still emit UNSPEC_LOADU insn to honor user's request for
  misaligned load.  */
   if (TARGET_AVX
-  && misaligned_operand (operands[1], mode)
-  /* FIXME: Revisit after AVX512F merge is completed.  */
-  && !)
+  && misaligned_operand (operands[1], mode))
 {
-  emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1]));
+  rtx src = operands[1];
+  if ()
+   src = gen_rtx_VEC_MERGE (mode, operands[1],
+operands[2 * ],
+operands[3 * ]);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], src));
   DONE;
 }
 })
--- gcc/config/i386/i386.c.jj   2014-01-02 14:44:07.0 +0100
+++ gcc/config/i386/i386.c  2014-01-02 21:48:23.204400654 +0100
@@ -34407,6 +34408,9 @@ ix86_expand_special_args_builtin (const
case CODE_FOR_sse2_movntidi:
case CODE_FOR_sse_movntq:
case CODE_FOR_sse2_movntisi:
+   case CODE_FOR_avx512f_movntv16sf:
+   case CODE_FOR_avx512f_movntv8df:
+   case CODE_FOR_avx512f_movntv8di:
  aligned_mem = true;
  break;
default:
@@ -34431,6 +34435,24 @@ ix86_expand_special_args_builtin (const
   klass = load;
   memory = 0;
   break;
+case VOID_FTYPE_PV8DF_V8DF_QI:
+case VOID_FTYPE_PV16SF_V16SF_HI:
+case VOID_FTYPE_PV8DI_V8DI_QI:
+case VOID_FTYPE_PV16SI_V16SI_HI:
+  switch (icode)
+   {
+   /* These builtins and instructions require the memory
+  to be properly aligned.  */
+   case CODE_FOR_avx512f_storev16sf_mask:
+   case CODE_FOR_avx512f_storev16si_mask:
+   case CODE_FOR_avx512f_storev8df_mask:
+   case CODE_FOR_avx512f_storev8di_mask:
+ aligned_mem = true;
+ break;
+   default:
+ break;
+   }
+  /* FALLTHRU */
 case VOID_FTYPE_PV8SF_V8SI_V8SF:
 case VOID_FTYPE_PV4DF_V4DI_V4DF:
 case VOID_FTYPE_PV4SF_V4SI_V4SF:
@@ -34439,10 +34461,6 @@ ix86_expand_special_args_builtin (const
 case VOID_FTYPE_PV4DI_V4DI_V4DI:
 case VOID_FTYPE_PV4SI_V4SI_V4SI:
 case VOID_FTYPE_PV2DI_V2DI_V2DI:
-

Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 9:48 AM, Jakub Jelinek  wrote:

> I've noticed that especially with the AVX512F introduction we use
> GET_MODE_SIZE (mode) quite heavily in the i386 *.md files, and
> the problem with that is GET_MODE_SIZE isn't a compile time constant,
> needs to read mode_size array (non-const) at runtime.
>
> This patch attempts to decrease the enormous .text/.rodata growth from AVX512F
> introduction a little bit and improve generated code, by making
> GET_MODE_SIZE (mode) compile time constants, thus all the
> GET_MODE_SIZE (mode) == 64 and similar tests can fold into
> false/true.  The patch saves ~ 110KB of .text (both x86_64 and i686)
> and ~ 50KB of .rodata (x86_64) or ~ 27KB of .rodata (i686).
>
> The disadvantage is that the mode attribute duplicates insn-modes.c,
> but I guess the listed modes aren't going to change their sizes ever
> on this target, and if some mode isn't listed and used in some mode
> iterator, one would get syntax errors from insn-*.[ch] and so it wouldn't be
> hard to add it.

This should be tolerable, at least for the gains shown above. OTOH, we
already have some attributes (e.g. ssescalarnum, ssescalarsize) that
use the same approach and the sky didn't fall down.

> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2014-01-03  Jakub Jelinek  
>
> * config/i386/i386.md (MODE_SIZE): New mode attribute.
> (push splitter): Use  instead of
> GET_MODE_SIZE (mode).
> (lea splitter): Use  instead of GET_MODE_SIZE (mode).
> (mov -1, reg peephole2): Likewise.
> * config/i386/sse.md (*mov_internal,
> _storeu,
> _storedqu, _andnot3,
> *3, *andnot3,
> 3): Likewise.
> * config/i386/subst.md (mask_mode512bit_condition,
> sd_mask_mode512bit_condition): Likewise.

OK for mainline.

Thanks,
Uros.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
Jakub Jelinek  wrote:
>On Tue, Dec 31, 2013 at 11:30:12AM +0100, Richard Biener wrote:
>> Meanwhile your patch is ok.
>
>As discussed in the PR, the patch wasn't sufficient, __cxa_pure_virtual
>can appear in the vtable and it has pretty much the same properties
>as __builtin_unreachable (void return value, no arguments, noreturn,
>DCE or cfg cleanup being able to remove anything after it because of
>noreturn).
>
>Additionally, the patch attempts to punt on invalid type changes
>(ODR violations?, apparently none appear during bootstrap/regtest as
>verified by additional logging added there) or inplace changes of
>gimple_call_flags that would require some cleanups caller isn't
>expected to
>do (again, doesn't happen during bootstrap/regtest).
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

I'd rather do nothing in the !inplace case, it's rare enough to not care.

Also there is nothing wrong with wrong types et al - we have a perfect 
mechanism for dealing with this. Change the fndecl but not the fntype.

I'd like to see the code simplified with keeping the above in mind.

Thanks,
Richard.

>2014-01-03  Jakub Jelinek  
>
>   PR tree-optimization/59622
>   * gimple-fold.c: Include calls.h.
>   (gimple_fold_call): Fix a typo in message.  Handle
>   __cxa_pure_virtual similarly to __builtin_unreachable.  For
>   inplace punt if gimple_call_flags would change.  Verify
>   that lhs and argument types are compatible.
>
>   * g++.dg/opt/pr59622-2.C: New test.
>
>--- gcc/gimple-fold.c.jj   2013-12-31 12:56:59.0 +0100
>+++ gcc/gimple-fold.c  2014-01-02 18:33:51.207921734 +0100
>@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.
> #include "gimple-pretty-print.h"
> #include "tree-ssa-address.h"
> #include "langhooks.h"
>+#include "calls.h"
> 
> /* Return true when DECL can be referenced from current unit.
>FROM_DECL (if non-null) specify constructor of variable DECL was taken
>from.
>@@ -1167,7 +1168,7 @@ gimple_fold_call (gimple_stmt_iterator *
>(OBJ_TYPE_REF_EXPR (callee)
>   {
> fprintf (dump_file,
>- "Type inheritnace inconsistent devirtualization of ");
>+ "Type inheritance inconsistent devirtualization of ");
> print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
> fprintf (dump_file, " to ");
> print_generic_expr (dump_file, callee, TDF_SLIM);
>@@ -1184,18 +1185,74 @@ gimple_fold_call (gimple_stmt_iterator *
>   = possible_polymorphic_call_targets (callee, &final);
> if (final && targets.length () <= 1)
>   {
>+tree fndecl;
>+int call_flags = gimple_call_flags (stmt), new_flags;
> if (targets.length () == 1)
>+  fndecl = targets[0]->decl;
>+else
>+  fndecl = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
>+new_flags = flags_from_decl_or_type (fndecl);
>+if (inplace)
>   {
>-gimple_call_set_fndecl (stmt, targets[0]->decl);
>-changed = true;
>+/* For inplace, avoid changes of call flags that
>+   might require cfg cleanups, EH cleanups, removal
>+   of vdef/vuses etc.  */
>+if ((call_flags ^ new_flags)
>+& (ECF_NORETURN | ECF_NOTHROW | ECF_NOVOPS))
>+  final = false;
>+else if ((call_flags & ECF_NOVOPS) == 0)
>+  {
>+if ((call_flags ^ new_flags) & ECF_CONST)
>+  final = false;
>+else if (((call_flags & (ECF_PURE | ECF_CONST
>+ | ECF_NORETURN)) == 0)
>+ ^ ((new_flags & (ECF_PURE | ECF_CONST
>+  | ECF_NORETURN)) == 0))
>+  final = false;
>+  }
>   }
>-else if (!inplace)
>+if (final)
>   {
>-tree fndecl = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
>-gimple new_stmt = gimple_build_call (fndecl, 0);
>-gimple_set_location (new_stmt, gimple_location (stmt));
>-gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>-return true;
>+tree fntype = gimple_call_fntype (stmt);
>+tree dtype = TREE_TYPE (fndecl);
>+if (gimple_call_lhs (stmt)
>+&& !useless_type_conversion_p (TREE_TYPE (fntype),
>+   TREE_TYPE (dtype)))
>+  final = false;
>+else
>+  {
>+tree t1, t2;
>+for (t1 = TYPE_ARG_TYPES (fntype),
>+ t2 = TYPE_ARG_TYPES (dtype);
>+ t1 && t2;
>+  

Re: [Patch, Fortran] PR 59023: [4.9 regression] ICE in gfc_search_interface with BIND(C)

2014-01-03 Thread Janus Weil
In addition this patch fixes PR 59662.

Also: Ping!

Cheers,
Janus



2013/12/31 Janus Weil :
> Hi all,
>
> ... and the reg-bashing continues: Here is a small patch to fix a
> bind(c) problem. It essentially prevents 'resolve_global_procedure' to
> be applied to bind(c) procedures.
>
> Regtested on x86_64-unknown-linux-gnu. Ok for trunk?
>
> Cheers,
> Janus
>
>
>
> 2013-12-31  Janus Weil  
>
> PR fortran/59023
> * resolve.c (resolve_global_procedure): Don't apply to c-binding
> procedures.
> (gfc_verify_binding_labels): Remove duplicate line.
>
> 2013-12-31  Janus Weil  
>
> PR fortran/59023
> * gfortran.dg/bind_c_procs_2.f90: New.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 10:26:44AM +0100, Richard Biener wrote:
> Jakub Jelinek  wrote:
> >On Tue, Dec 31, 2013 at 11:30:12AM +0100, Richard Biener wrote:
> >> Meanwhile your patch is ok.
> >
> >As discussed in the PR, the patch wasn't sufficient, __cxa_pure_virtual
> >can appear in the vtable and it has pretty much the same properties
> >as __builtin_unreachable (void return value, no arguments, noreturn,
> >DCE or cfg cleanup being able to remove anything after it because of
> >noreturn).
> >
> >Additionally, the patch attempts to punt on invalid type changes
> >(ODR violations?, apparently none appear during bootstrap/regtest as
> >verified by additional logging added there) or inplace changes of
> >gimple_call_flags that would require some cleanups caller isn't
> >expected to
> >do (again, doesn't happen during bootstrap/regtest).
> >
> >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> I'd rather do nothing in the !inplace case, it's rare enough to not care.

Ok.

> Also there is nothing wrong with wrong types et al - we have a perfect 
> mechanism for dealing with this. Change the fndecl but not the fntype.

Well, see PR59630.  The question is if having to handle it everywhere
is worth it.

Jakub


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
On Fri, 3 Jan 2014, Jakub Jelinek wrote:

> On Fri, Jan 03, 2014 at 10:26:44AM +0100, Richard Biener wrote:
> > Jakub Jelinek  wrote:
> > >On Tue, Dec 31, 2013 at 11:30:12AM +0100, Richard Biener wrote:
> > >> Meanwhile your patch is ok.
> > >
> > >As discussed in the PR, the patch wasn't sufficient, __cxa_pure_virtual
> > >can appear in the vtable and it has pretty much the same properties
> > >as __builtin_unreachable (void return value, no arguments, noreturn,
> > >DCE or cfg cleanup being able to remove anything after it because of
> > >noreturn).
> > >
> > >Additionally, the patch attempts to punt on invalid type changes
> > >(ODR violations?, apparently none appear during bootstrap/regtest as
> > >verified by additional logging added there) or inplace changes of
> > >gimple_call_flags that would require some cleanups caller isn't
> > >expected to
> > >do (again, doesn't happen during bootstrap/regtest).
> > >
> > >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> > 
> > I'd rather do nothing in the !inplace case, it's rare enough to not care.
> 
> Ok.
> 
> > Also there is nothing wrong with wrong types et al - we have a perfect 
> > mechanism for dealing with this. Change the fndecl but not the fntype.
> 
> Well, see PR59630.  The question is if having to handle it everywhere
> is worth it.

Well, this case happens because we go back to GENERIC which doesn't
have this feature.  So "everywhere" is somewhat a broad stmt.
It's easy to guard the builtins.c folding with a compatibility
check of fntype and fndecl type.

Richard.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 11:01:13AM +0100, Richard Biener wrote:
> > Well, see PR59630.  The question is if having to handle it everywhere
> > is worth it.
> 
> Well, this case happens because we go back to GENERIC which doesn't
> have this feature.  So "everywhere" is somewhat a broad stmt.
> It's easy to guard the builtins.c folding with a compatibility
> check of fntype and fndecl type.

Well, clearly the inliner has similar issue, I doubt e.g. IPA cloning
handles it right, there is just one gimple_call_fntype call in all of ipa*.c
(which surprises me) and that ignores it if there is a decl:
  tree type = (e->callee
   ? TREE_TYPE (e->callee->decl)
   : gimple_call_fntype (e->call_stmt));
so if there is a mismatch between TREE_TYPE (e->callee->decl) and
gimple_call_fntype, it will happily look at the decl type.
So I'd say it is just a matter of adding more (invalid) testcases.

Jakub


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
On Fri, 3 Jan 2014, Jakub Jelinek wrote:

> On Fri, Jan 03, 2014 at 11:01:13AM +0100, Richard Biener wrote:
> > > Well, see PR59630.  The question is if having to handle it everywhere
> > > is worth it.
> > 
> > Well, this case happens because we go back to GENERIC which doesn't
> > have this feature.  So "everywhere" is somewhat a broad stmt.
> > It's easy to guard the builtins.c folding with a compatibility
> > check of fntype and fndecl type.
> 
> Well, clearly the inliner has similar issue, I doubt e.g. IPA cloning
> handles it right, there is just one gimple_call_fntype call in all of ipa*.c
> (which surprises me) and that ignores it if there is a decl:
>   tree type = (e->callee
>? TREE_TYPE (e->callee->decl)
>: gimple_call_fntype (e->call_stmt));
> so if there is a mismatch between TREE_TYPE (e->callee->decl) and
> gimple_call_fntype, it will happily look at the decl type.
> So I'd say it is just a matter of adding more (invalid) testcases.

Of course we have to fix all the bugs.  The inliner has a
compatibility check in some of its can-inline predicates.
ipa_get_callee_param_type looks correct if it really wants
to get at the param type of the _callee_ (not the type
the caller uses!)

static tree 
ipa_get_callee_param_type (struct cgraph_edge *e, int i)
{   
  int n;
  tree type = (e->callee
   ? TREE_TYPE (e->callee->decl)
   : gimple_call_fntype (e->call_stmt));
  tree t = TYPE_ARG_TYPES (type);  

so if there is a decl then use its type signature, otherwise
(indirect calls) use the caller signature (and hope it matches
the callee...).  That it later falls back to looking at
DECL_ARGUMENTS is odd (probably a FE issue where we have a
fndecl with a bogus type?)

Richard.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 11:15:32AM +0100, Richard Biener wrote:
> so if there is a decl then use its type signature, otherwise
> (indirect calls) use the caller signature (and hope it matches
> the callee...).  That it later falls back to looking at
> DECL_ARGUMENTS is odd (probably a FE issue where we have a
> fndecl with a bogus type?)

For K&R C non-prototyped functions
foo (x, y)
  int x, y;
{
  return x + y;
}
I think TYPE_ARG_TYPES is NULL (?) and DECL_ARGUMENTS is the only way to get
at the types.

Jakub


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
On Fri, 3 Jan 2014, Jakub Jelinek wrote:

> On Fri, Jan 03, 2014 at 11:15:32AM +0100, Richard Biener wrote:
> > so if there is a decl then use its type signature, otherwise
> > (indirect calls) use the caller signature (and hope it matches
> > the callee...).  That it later falls back to looking at
> > DECL_ARGUMENTS is odd (probably a FE issue where we have a
> > fndecl with a bogus type?)
> 
> For K&R C non-prototyped functions
> foo (x, y)
>   int x, y;
> {
>   return x + y;
> }
> I think TYPE_ARG_TYPES is NULL (?) and DECL_ARGUMENTS is the only way to get
> at the types.

Ah, indeed.  A C speciality that shouldn't survive GENERICization if
that's really the case.

I wonder how

int main()
{
  return (*(&foo))(0, 0);
}

works though.  Or

 __auto_type x = foo;

(substitute for that new C __auto thing).

The former probably because we fold away the indirection very early.

Richard.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
On Fri, 3 Jan 2014, Richard Biener wrote:

> On Fri, 3 Jan 2014, Jakub Jelinek wrote:
> 
> > On Fri, Jan 03, 2014 at 11:15:32AM +0100, Richard Biener wrote:
> > > so if there is a decl then use its type signature, otherwise
> > > (indirect calls) use the caller signature (and hope it matches
> > > the callee...).  That it later falls back to looking at
> > > DECL_ARGUMENTS is odd (probably a FE issue where we have a
> > > fndecl with a bogus type?)
> > 
> > For K&R C non-prototyped functions
> > foo (x, y)
> >   int x, y;
> > {
> >   return x + y;
> > }
> > I think TYPE_ARG_TYPES is NULL (?) and DECL_ARGUMENTS is the only way to get
> > at the types.
> 
> Ah, indeed.  A C speciality that shouldn't survive GENERICization if
> that's really the case.
> 
> I wonder how
> 
> int main()
> {
>   return (*(&foo))(0, 0);
> }
> 
> works though.  Or
> 
>  __auto_type x = foo;
> 
> (substitute for that new C __auto thing).
> 
> The former probably because we fold away the indirection very early.

foo (x, y)
   int x, y;
{
return x + y;
}

int main()
{
  __auto_type f = foo;
  return f(0,0);
}

works just fine for me but shows

  int (*) () f;   
  int _4;   

  :   
  f_1 = foo;
  _4 = f_1 (0, 0);

so I suppose the fntype is not NULL but int (*)().

Richard.


[PATCH] Fix slpeel_update_phi_nodes_for_guard1 ICE (PR tree-optimization/59519)

2014-01-03 Thread Jakub Jelinek
Hi!

Since the r205959 SCEV changes for peeled chrec, apparently we can end up
with multiple PHIs on the to be vectorized loop that have the same arguments
(both on preheader and latch edges).  slpeel_update_phi_nodes_for_guard1
doesn't like that, it asserts that for the argument from the latch edge
we set_current_def only once, when it is shared by more than one PHI,
we would actually be trying to set it more than once.

What we create is just multiple PHIs on the *new_exit_bb that look like
_NN = PHI 
but as they necessarily have the same value, IMHO it shouldn't matter
which one of those we record through set_current_def.

I've tried to trace where we'd call get_current_def on the loop_arg
besides the get_current_def call changed in the patch, but saw none even
on
struct S { int f0; } d;
int a[8] = { 0 }, b, c, e, f;

void
foo (void)
{
  for (; e < 1; e++)
{
  for (b = 0; b < 7; b++)
{
  c |= (a[b + 1] != 0);
  if (d.f0)
break;
}
  f += b;
}
}
where the value is actually used after the loop, in all cases the generated
IL looks sane.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  

PR tree-optimization/59519
* tree-vect-loop-manip.c (slpeel_update_phi_nodes_for_guard1): Don't
ICE if get_current_def (current_new_name) is already non-NULL, as long
as it is a phi result of some other phi in *new_exit_bb that has
the same argument.

* gcc.dg/vect/pr59519-1.c: New test.
* gcc.dg/vect/pr59519-2.c: New test.

--- gcc/tree-vect-loop-manip.c.jj   2013-12-10 12:43:21.0 +0100
+++ gcc/tree-vect-loop-manip.c  2014-01-02 16:29:21.983645769 +0100
@@ -483,7 +483,18 @@ slpeel_update_phi_nodes_for_guard1 (edge
  if (!current_new_name)
continue;
 }
-  gcc_assert (get_current_def (current_new_name) == NULL_TREE);
+  tree new_name = get_current_def (current_new_name);
+  /* Because of peeled_chrec optimization it is possible that we have
+set this earlier.  Verify the PHI has the same value.  */
+  if (new_name)
+   {
+ gimple phi = SSA_NAME_DEF_STMT (new_name);
+ gcc_assert (gimple_code (phi) == GIMPLE_PHI
+ && gimple_bb (phi) == *new_exit_bb
+ && (PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop))
+ == loop_arg));
+ continue;
+   }
 
   set_current_def (current_new_name, PHI_RESULT (new_phi));
 }
--- gcc/testsuite/gcc.dg/vect/pr59519-1.c.jj2014-01-02 16:39:07.743647669 
+0100
+++ gcc/testsuite/gcc.dg/vect/pr59519-1.c   2014-01-02 16:40:22.414276947 
+0100
@@ -0,0 +1,19 @@
+/* PR tree-optimization/59519 */
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+
+int a, b, c, d;
+
+void
+foo (void)
+{
+  for (; d; d++)
+for (b = 0; b < 14; b++)
+  {
+   c |= 1;
+   if (a)
+ break;
+  }
+}
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- gcc/testsuite/gcc.dg/vect/pr59519-2.c.jj2014-01-02 16:39:10.726629480 
+0100
+++ gcc/testsuite/gcc.dg/vect/pr59519-2.c   2014-01-02 16:40:26.213249520 
+0100
@@ -0,0 +1,20 @@
+/* PR tree-optimization/59519 */
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+
+struct S { int f0; } d;
+int a[8] = { 0 }, b, c, e;
+
+void
+foo (void)
+{
+  for (; e < 1; e++)
+for (b = 0; b < 7; b++)
+  {
+   c |= (a[b + 1] != 0);
+   if (d.f0)
+ break;
+  }
+}
+
+/* { dg-final { cleanup-tree-dump "vect" } } */

Jakub


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 11:24:53AM +0100, Richard Biener wrote:
> works just fine for me but shows
> 
>   int (*) () f; 
>   
>   int _4; 
>   
>   
>   
>   : 
>   
>   f_1 = foo;  
>   
>   _4 = f_1 (0, 0);
> 
> so I suppose the fntype is not NULL but int (*)().

Sure, I said TYPE_ARG_TYPES is NULL, which is the () there.

Jakub


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Thu, Jan 2, 2014 at 11:18 PM, Eric Botcazou  wrote:
>> Note that it has unexpected side-effects: previously, in 32-bit mode,
>> 256-bit aggregate objects would have been given 256-bit alignment; now,
>> they will fall back to default alignment, for example 32-bit only.
>
> In case this wasn't clear enough, just compile in 32-bit mode:
>
> int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8};

It looks to me that we don't need to adjust anything with max_align.
Using following patch:

--cut here--
Index: i386.c
===
--- i386.c  (revision 206311)
+++ i386.c  (working copy)
@@ -26465,6 +26465,7 @@
 int
 ix86_data_alignment (tree type, int align, bool opt)
 {
+#if 0
   int max_align = optimize_size ? BITS_PER_WORD
: MIN (512, MAX_OFILE_ALIGNMENT);

@@ -26476,6 +26477,7 @@
  || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
   && align < max_align)
 align = max_align;
+#endif

   /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
  to 16byte boundary.  */
--cut here--

and following testcase:

-- cut here--
float a[8] = { 1, 2, 3, 4, 5, 6, 7, 8};

extern float z[8];

void t (void)
{
  int i;

  for (i = 0; i < 8; i++)
z[i] = z[i] + a[i];
}
--cut here--

When compiled with -m32 -mavx, we get:

.align 32
.type   a, @object
.size   a, 32
a:

so, the alignment was already raised elsewhere. We get .align 16 for
-msse -m32 when vectorizing.

without -msse (and consequently without vectorizing), we get for -m32:

.align 4
.type   a, @object
.size   a, 32
a:

which corresponds to 32bit ABI rules (we still get .align16 for 64bit ABI).

What bothers me in this testcase is (unrelated) alignment of z[8]
array. Even for 64bit targets, we get:

#(insn:TI 6 5 8 2 (set (reg:V4SF 21 xmm0 [orig:90 vect__4.5 ] [90])
#(unspec:V4SF [
#(mem/c:V4SF (reg/f:DI 0 ax [89]) [2 MEM[(float
*)&z]+0 S16 A32])
#] UNSPEC_LOADU)) al.c:10 1195 {*sse_loadups}
# (nil))
movups  (%rax), %xmm0   # 6 *sse_loadups[length = 3]

ABI guarantees 16 byte alignment of z[8], but we fail to communicate
this to the compiler.

Uros.


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Eric Botcazou
> When compiled with -m32 -mavx, we get:
> 
> .align 32
> .type   a, @object
> .size   a, 32
> a:
> 
> so, the alignment was already raised elsewhere. We get .align 16 for
> -msse -m32 when vectorizing.
> 
> without -msse (and consequently without vectorizing), we get for -m32:
> 
> .align 4
> .type   a, @object
> .size   a, 32
> a:
> 
> which corresponds to 32bit ABI rules (we still get .align16 for 64bit ABI).

Yes, but the issue is that it was 32 before so the alignment decrease is weird.

-- 
Eric Botcazou


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 12:20 PM, Eric Botcazou  wrote:
>> When compiled with -m32 -mavx, we get:
>>
>> .align 32
>> .type   a, @object
>> .size   a, 32
>> a:
>>
>> so, the alignment was already raised elsewhere. We get .align 16 for
>> -msse -m32 when vectorizing.
>>
>> without -msse (and consequently without vectorizing), we get for -m32:
>>
>> .align 4
>> .type   a, @object
>> .size   a, 32
>> a:
>>
>> which corresponds to 32bit ABI rules (we still get .align16 for 64bit ABI).
>
> Yes, but the issue is that it was 32 before so the alignment decrease is 
> weird.

Yes, but this change is benign, and as shown, I don't think we need
this functionality at all. The data from other TUs is accessed with a
4 byte alignment (on 32bit targets, and unfortunately also on 64bit
targets), and the alignment of local data is increased elsewhere.

I am testing a patch that removes "max_align" part from ix86_data_alignment.

Uros.


Re: Rb tree node recycling patch

2014-01-03 Thread Christopher Jefferson
Personally, I consider the testing insufficient, although the testing
was already insufficient! Note that this is my opinion, don't assume I
talk for all of libstdc++!

After I broke std::nth_element in 4.8.2 (woops), I am of the view that
the libstdc++ testsuite is defficient -- many code paths are never
tested. The nth_element breakage should really have been picked up,
about 1/3 of all random invocations of nth_element failed!

You could be "inspired" by the code I wrote for algorithms, it is in
testsuite/util/testsuite_containergen.h. The basic idea is I make sure
to generate special cases (size 0 containers, containers containing
only a single value repeated), and then a range of containers of
various different sizes.

I realise suggesting this is probably as much work as your patch, and
you shouldn't assume this is required, but I think it would improve
the testsuite. If you do go down this route, then be sure to reduce
your test's size for simulators. You can look at for example at my new
tests which feature:

// { dg-options "-std=gnu++11" }
// { dg-options "-std=gnu++11 -DSIMULATOR_TEST" { target simulator } }

Which enables the macro SIMULATOR_TEST on simulators, where I do much
less testing (else the tester takes too long).


On 27 December 2013 18:30, François Dumont  wrote:
> Hi
>
> Here is a patch to add recycling of Rb tree nodes when possible.
>
> I replaced the _Rb_tree::_M_move_assign with a move assignment operator
> to benefit from:
> - move of elements when the whole data structure cannot be moved
> - faster data structure cloning rather than full regeneration of the tree
> when _M_move_assign was failing
>
> Note that this patch contains also a cleanup of a useless template
> parameter _Is_pod_comparator on _Rb_tree_impl. If you want to apply it
> quickly for 4.9 do not hesitate.
>
> I haven't done any specific test for this feature, existing ones looks
> good enough to me. If you want me to add some I will when back from
> vacation. I am mostly submitting this patch to show you that I worked on it
> and you do not need to do it yourself.
>
> 2013-12-27  François Dumont 
>
> * include/bits/stl_tree.h (_Rb_tree_reuse_or_alloc_node<>): New.
> (_Rb_tree_alloc_node<>): Likewise.
> (_Rb_tree<>::_M_clone_node): Made template to take a node
> generator.
> (_Rb_tree_impl<>): Remove unused _Is_pod_comparator template
> value.
> (_Rb_tree<>::_M_move_assign): Replace by...
> (_Rb_tree<>::operator(_Rb_tree&&)): ...this.
> (_Rb_tree_impl<>::_M_reset): New.
> (_Rb_tree<>::_M_insert_): Add node generator parameter.
> (_Rb_tree<>::_M_copy): Add overload taking a node generator.
> (_Rb_tree<>::_M_insert_unique_): Add node generator parameter.
> (_Rb_tree<>::_M_insert_equal_): Add node generator parameter.
> (_Rb_tree<>::_M_assign_unique): New.
> (_Rb_tree<>::_M_assign_equal): New.
> (_Rb_tree<>): Adapt to use _Rb_tree_impl<>::_M_reset and reuse
> nodes as much as possible.
> * include/bits/stl_set.h (set<>::operator=(set<>&&): Adapt to use
> _Rb_tree move assignment operator.
> (set<>::operator=(initializer_list<>)): Adapt to use
> _Rb_tree<>::_M_assign_unique.
> * include/bits/stl_multiset.h
> (multiset<>::operator=(multiset<>&&)): Adapt to use
> _Rb_tree move assignment operator.
> (multiset<>::operator=(initializer_list<>)): Adapt to use
> _Rb_tree<>::_M_assign_equal.
> * include/bits/stl_map.h (map<>::operator=(map<>&&): Adapt to use
> _Rb_tree move assignment operator.
> (map<>::operator=(initializer_list<>)): Adapt to use
> _Rb_tree<>::_M_assign_unique.
> * include/bits/stl_multimap.h
> (multimap<>::operator=(multimap<>&&)): Adapt to use
> _Rb_tree move assignment operator.
> (multimap<>::operator=(initializer_list<>)): Adapt to use
> _Rb_tree<>::_M_assign_equal.
>
> Tested under Linux x86_64.
>
> Happy new year.
>
> François
>


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 12:25:00PM +0100, Uros Bizjak wrote:
> I am testing a patch that removes "max_align" part from ix86_data_alignment.

That looks like unnecessary pessimization.  Note the hunk in question is
guarded with opt, which means it is an optimization rather than ABI issue,
it can increase alignment, but the compiler can only assume the increased
alignment if the symbol is not public or if it is public, but can't be
preempted by another TU's definition.  Even in that case it can be
worthwhile to increase the alignment, say if doing versioning for alignment,
or say just doing some AVX/AVX2/AVX512F version of memcpy/memset that can
handle it faster if it is sufficiently aligned by testing it at runtime.

So, IMHO we shouldn't drop it, just improve it.
Perhaps:

  int max_align = optimize_size ? BITS_PER_WORD
: MIN (512, MAX_OFILE_ALIGNMENT);

  if (opt
  && AGGREGATE_TYPE_P (type)
  && TYPE_SIZE (type)
  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
  && align < max_align)
{
  int this_align = 256;
  for (this_align = 256; this_align <= max_align; this_align *= 2)
if (TREE_INT_CST_LOW (TYPE_SIZE (type)) < (unsigned) this_align
&& !TREE_INT_CST_HIGH (TYPE_SIZE (type)))
  break;
else if (align < this_align)
  align = this_align;
}

which will handle both the 256 bit alignment for >= 256 bit objects,
512 bit alignment for >= 512 bit objects and will be prepared for the
future.

128 bit I think doesn't need to be handled, DATA_ALIGNMENT has been
using 256-bit test already since its introduction in 1998:
http://gcc.gnu.org/ml/gcc-bugs/1998-03/msg00011.html

Jakub


Re: [PATCH] Don't count asm goto for 4 branch limit optimization (PR target/59625)

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 9:37 AM, Jakub Jelinek  wrote:

> Linus complained about padding inserted in between asm goto.  In 4.8
> even -mtune=generic performs ix86_avoid_jump_mispredicts, in 4.9 only e.g.
> -mtune=atom and others, but not generic.
> The issue is that assuming every asm goto must contain a jump is too
> pessimistic, often asm goto (may) transfer control to the label(s) through
> other means (e.g. on the side data structure with labels etc.), plus we
> conservatively assume asm goto is 0 bytes long, so we were adding the
> paddings pretty much always when multiple asm goto stmts appeared
> consecutively.
>
> This patch just ignores asm goto from the optimization.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 4.8
> after a while?
>
> 2014-01-03  Jakub Jelinek  
>
> PR target/59625
> * config/i386/i386.c (ix86_avoid_jump_mispredicts): Don't consider
> asm goto as jump.
>
> * gcc.target/i386/pr59625.c: New test.

OK everywhere, but please add a short comment about asm goto handling

> +/* Verify we don't consider asm goto as a jump for four jumps limit
> +   optimization.  asm goto doesn't have to contain a jump at all,
> +   the branching to labels can happen through different means.  */
> +/* { dg-final { scan-assembler-not 
> "p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align"
>  } } */

This RE can probably use match count, "{n}", as e.g. in g++.dg/abi/mangle33.C

Thanks,
Uros.


[PATCH] Fix typo in docs

2014-01-03 Thread Marek Polacek
__builtin_{FILE,FUNCTION} had wrong return types, thus fixed.
Tested by make html and links -g.  Ok for trunk?

2014-01-03  Marek Polacek  

PR other/59661
* doc/extend.texi: Fix the return value of __builtin_FUNCTION and
__builtin_FILE.

--- gcc/doc/extend.texi.mp  2014-01-03 12:32:13.366420582 +0100
+++ gcc/doc/extend.texi 2014-01-03 13:02:03.723889259 +0100
@@ -8728,12 +8728,12 @@ This function is the equivalent to the p
 macro and returns the line number of the invocation of the built-in.
 @end deftypefn
 
-@deftypefn {Built-in Function} int __builtin_FUNCTION ()
+@deftypefn {Built-in Function} {const char *} __builtin_FUNCTION ()
 This function is the equivalent to the preprocessor @code{__FUNCTION__}
 macro and returns the function name the invocation of the built-in is in.
 @end deftypefn
 
-@deftypefn {Built-in Function} int __builtin_FILE ()
+@deftypefn {Built-in Function} {const char *} __builtin_FILE ()
 This function is the equivalent to the preprocessor @code{__FILE__}
 macro and returns the file name the invocation of the built-in is in.
 @end deftypefn

Marek


Re: [PATCH] Fix typo in docs

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 01:03:24PM +0100, Marek Polacek wrote:
> __builtin_{FILE,FUNCTION} had wrong return types, thus fixed.
> Tested by make html and links -g.  Ok for trunk?

Ok, thanks.

> 2014-01-03  Marek Polacek  
> 
>   PR other/59661
>   * doc/extend.texi: Fix the return value of __builtin_FUNCTION and
>   __builtin_FILE.
> 
> --- gcc/doc/extend.texi.mp2014-01-03 12:32:13.366420582 +0100
> +++ gcc/doc/extend.texi   2014-01-03 13:02:03.723889259 +0100
> @@ -8728,12 +8728,12 @@ This function is the equivalent to the p
>  macro and returns the line number of the invocation of the built-in.
>  @end deftypefn
>  
> -@deftypefn {Built-in Function} int __builtin_FUNCTION ()
> +@deftypefn {Built-in Function} {const char *} __builtin_FUNCTION ()
>  This function is the equivalent to the preprocessor @code{__FUNCTION__}
>  macro and returns the function name the invocation of the built-in is in.
>  @end deftypefn
>  
> -@deftypefn {Built-in Function} int __builtin_FILE ()
> +@deftypefn {Built-in Function} {const char *} __builtin_FILE ()
>  This function is the equivalent to the preprocessor @code{__FILE__}
>  macro and returns the file name the invocation of the built-in is in.
>  @end deftypefn

Jakub


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 12:59 PM, Jakub Jelinek  wrote:
> On Fri, Jan 03, 2014 at 12:25:00PM +0100, Uros Bizjak wrote:
>> I am testing a patch that removes "max_align" part from ix86_data_alignment.
>
> That looks like unnecessary pessimization.  Note the hunk in question is
> guarded with opt, which means it is an optimization rather than ABI issue,
> it can increase alignment, but the compiler can only assume the increased
> alignment if the symbol is not public or if it is public, but can't be
> preempted by another TU's definition.  Even in that case it can be
> worthwhile to increase the alignment, say if doing versioning for alignment,
> or say just doing some AVX/AVX2/AVX512F version of memcpy/memset that can
> handle it faster if it is sufficiently aligned by testing it at runtime.
>
> So, IMHO we shouldn't drop it, just improve it.
> Perhaps:
>
>   int max_align = optimize_size ? BITS_PER_WORD
> : MIN (512, MAX_OFILE_ALIGNMENT);
>
>   if (opt
>   && AGGREGATE_TYPE_P (type)
>   && TYPE_SIZE (type)
>   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
>   && align < max_align)
> {
>   int this_align = 256;
>   for (this_align = 256; this_align <= max_align; this_align *= 2)
> if (TREE_INT_CST_LOW (TYPE_SIZE (type)) < (unsigned) this_align
> && !TREE_INT_CST_HIGH (TYPE_SIZE (type)))
>   break;
> else if (align < this_align)
>   align = this_align;
> }
>
> which will handle both the 256 bit alignment for >= 256 bit objects,
> 512 bit alignment for >= 512 bit objects and will be prepared for the
> future.
>
> 128 bit I think doesn't need to be handled, DATA_ALIGNMENT has been
> using 256-bit test already since it's introduction in 1998:
> http://gcc.gnu.org/ml/gcc-bugs/1998-03/msg00011.html

Thanks for the pointer, there is indeed the recommendation in
optimization manual [1], section 3.6.4, where it is said:

--quote--
Misaligned data access can incur significant performance penalties.
This is particularly true for cache line
splits. The size of a cache line is 64 bytes in the Pentium 4 and
other recent Intel processors, including
processors based on Intel Core microarchitecture.
An access to data unaligned on 64-byte boundary leads to two memory
accesses and requires several
μops to be executed (instead of one). Accesses that span 64-byte
boundaries are likely to incur a large
performance penalty, the cost of each stall generally are greater on
machines with longer pipelines.

...

A 64-byte or greater data structure or array should be aligned so that
its base address is a multiple of 64.
Sorting data in decreasing size order is one heuristic for assisting
with natural alignment. As long as 16-
byte boundaries (and cache lines) are never crossed, natural alignment
is not strictly necessary (though
it is an easy way to enforce this).
--/quote--

So, this part has nothing to do with AVX512, but with cache line
width. And we do have a --param "l1-cache-line-size=64", detected with
-march=native that could come handy here.

This part should be rewritten (and commented) with the information
above in mind.

[1] 
http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-optimization-manual.html

Uros.


Extend -fstack-protector-strong to cover calls with return slot

2014-01-03 Thread Florian Weimer
This patch fixes a loophole in the -fstack-protector-strong protection. 
 If a function call uses the return slot optimization, the caller needs 
stack protector instrumentation because the return slot is addressable.


Bootstrapped and regression-tested on x86_64-redhat-linux-gnu, with 
C/C++/Java enabled.  Okay for trunk?


--
Florian Weimer / Red Hat Product Security Team
gcc/

2014-01-03  Florian Weimer  

	* cfgexpand.c (call_with_return_slot_opt_p): New function.
	(expand_used_vars): Emit stack protector instrumentation in strong
	mode if call_with_return_slot_opt_p.

gcc/testsuite/

2014-01-03  Florian Weimer  

	* gcc.dg/fstack-protector-strong.c: Add coverage for named return
	values.
	* g++.dg/fstack-protector-strong.C: Likewise.

Index: gcc/cfgexpand.c
===
--- gcc/cfgexpand.c	(revision 206311)
+++ gcc/cfgexpand.c	(working copy)
@@ -1599,6 +1599,22 @@
   return 0;
 }
 
+/* Check if the basic block has a call which uses a return slot.  */
+
+static bool
+call_with_return_slot_opt_p (basic_block bb)
+{
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+   !gsi_end_p (gsi); gsi_next (&gsi))
+{
+  gimple stmt = gsi_stmt (gsi);
+  if (gimple_code (stmt) == GIMPLE_CALL
+	  && gimple_call_return_slot_opt_p (stmt))
+	return true;
+}
+  return false;
+}
+
 /* Expand all variables used in the function.  */
 
 static rtx
@@ -1669,22 +1685,35 @@
   pointer_map_destroy (ssa_name_decls);
 
   if (flag_stack_protect == SPCT_FLAG_STRONG)
-FOR_EACH_LOCAL_DECL (cfun, i, var)
-  if (!is_global_var (var))
+{
+  FOR_EACH_LOCAL_DECL (cfun, i, var)
+	if (!is_global_var (var))
+	  {
+	tree var_type = TREE_TYPE (var);
+	/* Examine local referenced variables that have their
+	   addresses taken, contain an array, or are arrays.  */
+	if (TREE_CODE (var) == VAR_DECL
+		&& (TREE_CODE (var_type) == ARRAY_TYPE
+		|| TREE_ADDRESSABLE (var)
+		|| (RECORD_OR_UNION_TYPE_P (var_type)
+			&& record_or_union_type_has_array_p (var_type
+	  {
+		gen_stack_protect_signal = true;
+		break;
+	  }
+	  }
+  /* The return slot introduces addressable local variables.  */
+  if (!gen_stack_protect_signal)
 	{
-	  tree var_type = TREE_TYPE (var);
-	  /* Examine local referenced variables that have their addresses taken,
-	 contain an array, or are arrays.  */
-	  if (TREE_CODE (var) == VAR_DECL
-	  && (TREE_CODE (var_type) == ARRAY_TYPE
-		  || TREE_ADDRESSABLE (var)
-		  || (RECORD_OR_UNION_TYPE_P (var_type)
-		  && record_or_union_type_has_array_p (var_type
+	  basic_block bb;
+	  FOR_ALL_BB_FN (bb, cfun)
 	{
-	  gen_stack_protect_signal = true;
-	  break;
+	  gen_stack_protect_signal = call_with_return_slot_opt_p (bb);
+	  if (gen_stack_protect_signal)
+		break;
 	}
 	}
+}
 
   /* At this point all variables on the local_decls with TREE_USED
  set are not associated with any block scope.  Lay them out.  */
Index: gcc/testsuite/g++.dg/fstack-protector-strong.C
===
--- gcc/testsuite/g++.dg/fstack-protector-strong.C	(revision 206311)
+++ gcc/testsuite/g++.dg/fstack-protector-strong.C	(working copy)
@@ -32,4 +32,39 @@
   return global_func (a);
 }
 
-/* { dg-final { scan-assembler-times "stack_chk_fail" 2 } } */
+/* Frame addressed exposed through return slot. */
+
+struct B
+{
+  /* Discourage passing this struct in registers. */
+  int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10;
+};
+
+B global_func ();
+void noop ();
+
+int foo3 ()
+{
+  return global_func ().a1;
+}
+
+int foo4 ()
+{
+  try {
+noop ();
+return 0;
+  } catch (...) {
+return global_func ().a1;
+  }
+}
+
+int foo5 ()
+{
+  try {
+return global_func ().a1;
+  } catch (...) {
+return 0;
+  }
+}
+
+/* { dg-final { scan-assembler-times "stack_chk_fail" 5 } } */
Index: gcc/testsuite/gcc.dg/fstack-protector-strong.c
===
--- gcc/testsuite/gcc.dg/fstack-protector-strong.c	(revision 206311)
+++ gcc/testsuite/gcc.dg/fstack-protector-strong.c	(working copy)
@@ -131,4 +131,17 @@
   return bb.three;
 }
 
-/* { dg-final { scan-assembler-times "stack_chk_fail" 10 } } */
+struct B
+{
+  /* Discourage passing this struct in registers. */
+  int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10;
+};
+
+struct B global3 (void);
+
+int foo11 ()
+{
+  return global3 ().a1;
+}
+
+/* { dg-final { scan-assembler-times "stack_chk_fail" 11 } } */


Re: [PATCH][ARM] Add new cores to t-aprofile

2014-01-03 Thread Gerald Pfeifer
Kyrill Tkachov  wrote:
>This patch adds the recently introduced cores to the t-aprofile
>multilib 
>machinery. The values added are cortex-a15.cortex-a7, cortex-a12,
>cortex-a57 and 
>cortex-a57.cortex-a53.

Are you going to add this to the release notes (gcc-4.9/changes.html) or will 
this be part of an ARM mega patch later? For the record, I prefer incremental 
changes, but whatever works. 

Gerald



Re: Update GCC 4.9 changes.html

2014-01-03 Thread Gerald Pfeifer
"H.J. Lu"  wrote:
>This patch adds Intel microarchitecture changes to GCC 4.9
>changes.html.
>OK to install?

Okay, modulo the fix Bernhard noted. 
Thanks,
Gerald



Re: [ARM] add armv7ve support

2014-01-03 Thread Gerald Pfeifer
Renlin Li  wrote:
>Hi all,
>
>This patch will add armv7ve support to gcc. Armv7ve is basically a 
>armv7-a architecture profile with Virtualization Extensions.

Mind adding this to the release notes?

Gerald



Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 1:27 PM, Uros Bizjak  wrote:

>>> I am testing a patch that removes "max_align" part from ix86_data_alignment.
>>
>> That looks like unnecessary pessimization.  Note the hunk in question is
>> guarded with opt, which means it is an optimization rather than ABI issue,
>> it can increase alignment, but the compiler can only assume the increased
>> alignment if the symbol is not public or if it is public, but can't be
>> preempted by another TU's definition.  Even in that case it can be
>> worthwhile to increase the alignment, say if doing versioning for alignment,
>> or say just doing some AVX/AVX2/AVX512F version of memcpy/memset that can
>> handle it faster if it is sufficiently aligned by testing it at runtime.
>>
>> So, IMHO we shouldn't drop it, just improve it.
>> Perhaps:
>>
>>   int max_align = optimize_size ? BITS_PER_WORD
>> : MIN (512, MAX_OFILE_ALIGNMENT);
>>
>>   if (opt
>>   && AGGREGATE_TYPE_P (type)
>>   && TYPE_SIZE (type)
>>   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
>>   && align < max_align)
>> {
>>   int this_align = 256;
>>   for (this_align = 256; this_align <= max_align; this_align *= 2)
>> if (TREE_INT_CST_LOW (TYPE_SIZE (type)) < (unsigned) this_align
>> && !TREE_INT_CST_HIGH (TYPE_SIZE (type)))
>>   break;
>> else if (align < this_align)
>>   align = this_align;
>> }
>>
>> which will handle both the 256 bit alignment for >= 256 bit objects,
>> 512 bit alignment for >= 512 bit objects and will be prepared for the
>> future.
>>
>> 128 bit I think doesn't need to be handled, DATA_ALIGNMENT has been
>> using 256-bit test already since it's introduction in 1998:
>> http://gcc.gnu.org/ml/gcc-bugs/1998-03/msg00011.html
>
> Thanks for the pointer, there is indeed the recommendation in
> optimization manual [1], section 3.6.4, where it is said:
>
> --quote--
> Misaligned data access can incur significant performance penalties.
> This is particularly true for cache line
> splits. The size of a cache line is 64 bytes in the Pentium 4 and
> other recent Intel processors, including
> processors based on Intel Core microarchitecture.
> An access to data unaligned on 64-byte boundary leads to two memory
> accesses and requires several
> μops to be executed (instead of one). Accesses that span 64-byte
> boundaries are likely to incur a large
> performance penalty, the cost of each stall generally are greater on
> machines with longer pipelines.
>
> ...
>
> A 64-byte or greater data structure or array should be aligned so that
> its base address is a multiple of 64.
> Sorting data in decreasing size order is one heuristic for assisting
> with natural alignment. As long as 16-
> byte boundaries (and cache lines) are never crossed, natural alignment
> is not strictly necessary (though
> it is an easy way to enforce this).
> --/quote--
>
> So, this part has nothing to do with AVX512, but with cache line
> width. And we do have a --param "l1-cache-line-size=64", detected with
> -march=native that could come handy here.
>
> This part should be rewritten (and commented) with the information
> above in mind.

Like in the patch below. Please note, that the block_tune setting for
the nocona is wrong, -march=native on my trusted old P4 returns:

--param "l1-cache-size=16" --param "l1-cache-line-size=64" --param
"l2-cache-size=2048" "-mtune=nocona"

which is consistent with the above quote from manual.

2014-01-02  Uros Bizjak  

* config/i386/i386.c (ix86_data_alignment): Calculate max_align
from prefetch_block tune setting.
(nocona_cost): Correct size of prefetch block to 64.

The patch was bootstrapped on x86_64-pc-linux-gnu and is currently in
regression testing. If there are no comments, I will commit it to
mainline and release branches after a couple of days.

Uros.
Index: i386.c
===
--- i386.c  (revision 206311)
+++ i386.c  (working copy)
@@ -1568,7 +1568,7 @@ struct processor_costs nocona_cost = {
   8,   /* MMX or SSE register to integer */
   8,   /* size of l1 cache.  */
   1024,/* size of l2 cache.  */
-  128, /* size of prefetch block */
+  64,  /* size of prefetch block */
   8,   /* number of parallel prefetches */
   1,   /* Branch cost */
   COSTS_N_INSNS (6),   /* cost of FADD and FSUB insns.  */
@@ -26465,9 +26465,21 @@ ix86_constant_alignment (tree exp, int align)
 int
 ix86_data_alignment (tree type, int align, bool opt)
 {
-  int max_align = optimize_size ? BITS_PER_WORD
-   : MIN (512, MAX_OFILE_ALIGNMENT);
+  /* Misaligned data access can incur significant performance penalties.
+ This is parti

Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 02:35:36PM +0100, Uros Bizjak wrote:
> Like in the patch below. Please note, that the block_tune setting for
> the nocona is wrong, -march=native on my trusted old P4 returns:
> 
> --param "l1-cache-size=16" --param "l1-cache-line-size=64" --param
> "l2-cache-size=2048" "-mtune=nocona"
> 
> which is consistent with the above quote from manual.
> 
> 2014-01-02  Uros Bizjak  
> 
> * config/i386/i386.c (ix86_data_alignment): Calculate max_align
> from prefetch_block tune setting.
> (nocona_cost): Correct size of prefetch block to 64.
> 
> The patch was bootstrapped on x86_64-pc-linux-gnu and is currently in
> regression testing. If there are no comments, I will commit it to
> mainline and release branches after a couple of days.

That still has the effect of not aligning (for most tunings) 32 to 63 bytes
long aggregates to 32 bytes, while previously they were aligned.  Forcing
aligning 32 byte long aggregates to 64 bytes would be overkill, 32 byte
alignment is just fine for those (and ensures it never crosses 64 byte
boundary), for 33 to 63 bytes perhaps using 64 bytes alignment wouldn't
be that bad, just wouldn't match what we have done before.

Jakub


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 2:43 PM, Jakub Jelinek  wrote:
> On Fri, Jan 03, 2014 at 02:35:36PM +0100, Uros Bizjak wrote:
>> Like in the patch below. Please note, that the block_tune setting for
>> the nocona is wrong, -march=native on my trusted old P4 returns:
>>
>> --param "l1-cache-size=16" --param "l1-cache-line-size=64" --param
>> "l2-cache-size=2048" "-mtune=nocona"
>>
>> which is consistent with the above quote from manual.
>>
>> 2014-01-02  Uros Bizjak  
>>
>> * config/i386/i386.c (ix86_data_alignment): Calculate max_align
>> from prefetch_block tune setting.
>> (nocona_cost): Correct size of prefetch block to 64.
>>
>> The patch was bootstrapped on x86_64-pc-linux-gnu and is currently in
>> regression testing. If there are no comments, I will commit it to
>> mainline and release branches after a couple of days.
>
> That still has the effect of not aligning (for most tunings) 32 to 63 bytes
> long aggregates to 32 bytes, while previously they were aligned.  Forcing
> aligning 32 byte long aggregates to 64 bytes would be overkill, 32 byte
> alignment is just fine for those (and ensures it never crosses 64 byte
> boundary), for 33 to 63 bytes perhaps using 64 bytes alignment wouldn't
> be that bad, just wouldn't match what we have done before.

Please note that previous value was based on earlier (pre P4)
recommendation and it was appropriate for older chips with 32byte
cache line. The value should be updated long ago, when 64bit cache
lines were introduced, but was probably missed due to usage of magic
value without comment.

Ah, I see. My patch deals only with structures, larger than cache
line. As recommended in "As long as 16-byte boundaries (and cache
lines) are never crossed, natural alignment is not strictly necessary
(though it is an easy way to enforce this)." part of the manual, we
should align smaller structures to 16 or 32 bytes.

Yes, I agree. Can you please merge your patch together with the proposed patch?

Thanks,
Uros.


[PATCH] m68k: properly handle double word push for all registers

2014-01-03 Thread Andreas Schwab
handle_move_double in m68k.c has a special case to handle movd
N(sp),-(sp), but this case can happen with any (address) register,
especially if a stack like variable is forced into a register (this
happened in gforth).  Bootstrapped on m68k-suse-linux and installed on
trunk and gcc-4.8 branch.

Andreas.

* config/m68k/m68k.c (handle_move_double): Handle pushes with
overlapping registers also for registers other than the stack
pointer.
---
 gcc/config/m68k/m68k.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/m68k/m68k.c b/gcc/config/m68k/m68k.c
index 25b8580..f20d071 100644
--- a/gcc/config/m68k/m68k.c
+++ b/gcc/config/m68k/m68k.c
@@ -3328,12 +3328,12 @@ handle_move_double (rtx operands[2],
latehalf[1] = adjust_address (operands[1], SImode, 0);
 }
 
-  /* If insn is effectively movd N(sp),-(sp) then we will do the
- high word first.  We should use the adjusted operand 1 (which is N+4(sp))
- for the low word as well, to compensate for the first decrement of sp.  */
+  /* If insn is effectively movd N(REG),-(REG) then we will do the high
+ word first.  We should use the adjusted operand 1 (which is N+4(REG))
+ for the low word as well, to compensate for the first decrement of
+ REG.  */
   if (optype0 == PUSHOP
-  && REGNO (XEXP (XEXP (operands[0], 0), 0)) == STACK_POINTER_REGNUM
-  && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
+  && reg_overlap_mentioned_p (XEXP (XEXP (operands[0], 0), 0), 
operands[1]))
 operands[1] = middlehalf[1] = latehalf[1];
 
   /* For (set (reg:DI N) (mem:DI ... (reg:SI N) ...)),
-- 
1.8.5.2

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."


[PING] [PATCH] fixed pr59651 & new test case

2014-01-03 Thread Bingfeng Mei
Updated: the patch passes aarch64 tests. Added missing ChangeLog entry for 
testsuite. OK to commit? 

Bingfeng
-Original Message-
From: Bingfeng Mei 
Sent: 02 January 2014 15:02
To: gcc-patches@gcc.gnu.org
Cc: 'tbela...@arm.com'
Subject: [PATCH] fixed pr59651 & new test case

Hi, 
This patch fixes pr59651. The original regression for pr52943 only appears on 
AArch64 target. I constructed a similar test that also exposes bug on x86-64. 
The problem is that calculation of address range in alias versioning for loops 
with negative step is wrong during vectorization. For example, for a loop that 
access int a[3] -> a[1], the old calculated address range is [a, a+12). It 
should be [a+4, a+16) instead.

Bootstrapped and passed testsuite on x86-64. Get confirmed by original reporter 
that regression is fixed on AArch64. OK to commit?

Thanks,
Bingfeng


patch_pr59651
Description: patch_pr59651


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 03:02:51PM +0100, Uros Bizjak wrote:
> Please note that previous value was based on earlier (pre P4)
> recommendation and it was appropriate for older chips with 32byte
> cache line. The value should be updated long ago, when 64bit cache
> lines were introduced, but was probably missed due to usage of magic
> value without comment.
> 
> Ah, I see. My patch deals only with structures, larger than cache
> line. As recommended in "As long as 16-byte boundaries (and cache
> lines) are never crossed, natural alignment is not strictly necessary
> (though it is an easy way to enforce this)." part of the manual, we
> should align smaller structures to 16 or 32 bytes.
> 
> Yes, I agree. Can you please merge your patch together with the proposed 
> patch?

How do we want to treat the 33-63 resp. 17-31 bytes long aggregates though?
32 byte long and 16 byte long aggregates can surely be aligned just to 32
resp. 16 bytes and never crosses 64 byte boundary then and doesn't waste
space in paddings unnecessarily (still opt thing, ABI can override),
but do we want to waste some extra bytes to ensure that 17-31 resp. 33-63
bytes long objects don't cross 64 byte boundaries by aligning those to 32
resp. 64 bytes, or do align them to 16 resp. 32 bytes instead?

Jakub


Re: [PING] [PATCH] fixed pr59651 & new test case

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 02:07:03PM +, Bingfeng Mei wrote:
> This patch fixes pr59651.  The original regression for pr52943 only
> appears on AArch64 target.  I constructed a similar test that also exposes
> bug on x86-64.  The problem is that calculation of address range in alias
> versioning for loops with negative step is wrong during vectorization. 
> For example, for a loop that access int a[3] -> a[1], the old calculated
> address range is [a, a+12).  It should be [a+4, a+16) instead.

--- tree-vect-loop-manip.c  (revision 206279)
+++ tree-vect-loop-manip.c  (working copy)
@@ -2241,12 +2241,26 @@ vect_create_cond_for_alias_checks (loop_
   tree seg_a_min = addr_base_a;
   tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
   if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
-   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
+   {
+	  seg_a_min
+	    = fold_build_pointer_plus (seg_a_max,
+				       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))));
+	  seg_a_max
+	    = fold_build_pointer_plus (addr_base_a,
+				       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))));
+   }

Too long lines, can you create a temporary tree var for TYPE_SIZE_UNIT (...); 
value?
Can you add a comment why you do this?

--- testsuite/gcc.dg/torture/pr59651.c  (revision 0)
+++ testsuite/gcc.dg/torture/pr59651.c  (revision 0)

Perhaps better would be to put this test into gcc.dg/vect/pr59651.c
(with the required /* { dg-final { cleanup-tree-dump "vect" } } */ ),
so that it is also tested say on i686-linux with -msse2 etc.
Or duplicate between those two places (the body of the test can be
included in one of those two from the other place using relative path).

@@ -0,0 +1,20 @@
+/* { dg-do run } */
+
+extern void abort (void);
+int a[] = { 0, 0, 0, 0, 0, 0, 0, 6 };
+
+int b;
+int
+main ()
+{
+  for (;;)
+{
+  b = 7;
+  for (; b; b -= 1)

Sounds like C-reduce weirdness, just write for (b = 7; b; --b)

+   a[b] = a[7] > 1;
+  break;
+}
+  if (a[1] != 0)
+abort ();
+  return 0;
+}

Ok with those changes.

Jakub


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 3:13 PM, Jakub Jelinek  wrote:
> On Fri, Jan 03, 2014 at 03:02:51PM +0100, Uros Bizjak wrote:
>> Please note that previous value was based on earlier (pre P4)
>> recommendation and it was appropriate for older chips with 32byte
>> cache line. The value should be updated long ago, when 64bit cache
>> lines were introduced, but was probably missed due to usage of magic
>> value without comment.
>>
>> Ah, I see. My patch deals only with structures, larger than cache
>> line. As recommended in "As long as 16-byte boundaries (and cache
>> lines) are never crossed, natural alignment is not strictly necessary
>> (though it is an easy way to enforce this)." part of the manual, we
>> should align smaller structures to 16 or 32 bytes.
>>
>> Yes, I agree. Can you please merge your patch together with the proposed 
>> patch?
>
> How do we want to treat the 33-63 resp. 17-31 bytes long aggregates though?
> 32 byte long and 16 byte long aggregates can surely be aligned just to 32
> resp. 16 bytes and never crosses 64 byte boundary then and doesn't waste
> space in paddings unnecessarily (still opt thing, ABI can override),
> but do we want to waste some extra bytes to ensure that 17-31 resp. 33-63
> bytes long objects don't cross 64 byte boundaries by aligning those to 32
> resp. 64 bytes, or do align them to 16 resp. 32 bytes instead?

Looking at "significant performance penalties" part of the above
recommendation, I'd say to align it to 32/64 byte boundaries.
Hopefully, the linker is able to put other data in the hole?

Uros.


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 03:35:11PM +0100, Uros Bizjak wrote:
> On Fri, Jan 3, 2014 at 3:13 PM, Jakub Jelinek  wrote:
> > On Fri, Jan 03, 2014 at 03:02:51PM +0100, Uros Bizjak wrote:
> >> Please note that previous value was based on earlier (pre P4)
> >> recommendation and it was appropriate for older chips with 32byte
> >> cache line. The value should be updated long ago, when 64bit cache
> >> lines were introduced, but was probably missed due to usage of magic
> >> value without comment.
> >>
> >> Ah, I see. My patch deals only with structures, larger than cache
> >> line. As recommended in "As long as 16-byte boundaries (and cache
> >> lines) are never crossed, natural alignment is not strictly necessary
> >> (though it is an easy way to enforce this)." part of the manual, we
> >> should align smaller structures to 16 or 32 bytes.
> >>
> >> Yes, I agree. Can you please merge your patch together with the proposed 
> >> patch?
> >
> > How do we want to treat the 33-63 resp. 17-31 bytes long aggregates though?
> > 32 byte long and 16 byte long aggregates can surely be aligned just to 32
> > resp. 16 bytes and never crosses 64 byte boundary then and doesn't waste
> > space in paddings unnecessarily (still opt thing, ABI can override),
> > but do we want to waste some extra bytes to ensure that 17-31 resp. 33-63
> > bytes long objects don't cross 64 byte boundaries by aligning those to 32
> > resp. 64 bytes, or do align them to 16 resp. 32 bytes instead?
> 
> Looking at "significant performance penalties" part of the above
> recommendation, I'd say to align it to 32/64 byte boundaries.
> Hopefully, the linker is able to put other data in the hole?

Unless -fdata-sections linker doesn't affect this, unless it is about the
very first or very last object in the TU in the particular section.
GCC itself would need to (supposedly unless -fno-toplevel-reorder) attempt
to sort the varpool constants that are going to be emitted prior to emitting
them (compute what section each decl would be emitted to, and within each
section start with putting variable with biggest alignment first and then
try to pack them nicely).  Kind of similar to what is done for
-fsection-anchors, just don't emit everything as a single block, just sort
them in the varpool queue.

Jakub


Re: [ARM] Fix incorrect restore of FP registers in nested APCS frame

2014-01-03 Thread Richard Earnshaw
On 22/12/13 17:42, Eric Botcazou wrote:
> Hi,
> 
> this is the last issue with nested APCS frames according to our testing.  
> When 
> the IP register needs to be preserved on entry and r3 isn't free and there are
> no arguments to push, the prologue creates a slot above the frame, so various 
> internal offsets need to be adjusted.  One has been missed, leading to:
> 
>   sub ip, fp, #20
>   fldmfdd ip!, {d8}
>   sub sp, fp, #16
>   ldmfd   sp, {r3, fp, sp, pc}
> 
> in the epilogue of the nested frame.  That's wrong because the difference 
> between the 2 immediates must be equal to the size of the saved FP registers.
> 
> Tested on ARM/VxWorks (where it fixes several ACATS tests at -O2) and 
> ARM/EABI, OK for the mainline?
> 
> 
> 2013-12-22  Eric Botcazou  
> 
>   * config/arm/arm.c (arm_get_frame_offsets): Revamp long lines.
>   (arm_expand_epilogue_apcs_frame): Take into account the number of bytes
>   used to save the static chain register in the computation of the offset
>   from which the FP registers need to be restored.
> 
> 
> 2013-12-22  Eric Botcazou  
> 
>   * gcc.target/arm/neon-nested-apcs.c: New test.
> 
> 

OK, modulo the 2 nits below.

>/* In Thumb mode this is incorrect, but never used.  */
> -  offsets->frame = offsets->saved_args + (frame_pointer_needed ? 4 : 0) +
> -   arm_compute_static_chain_stack_bytes();
> +  offsets->frame
> += offsets->saved_args
> + + arm_compute_static_chain_stack_bytes ()
> + + (frame_pointer_needed ? 4 : 0);

There should be brackets around the entire expression when split across
mulitple lines, so that auto-indentation is correctly preserved: ie

  offsets->frame
= (expression_in_backets_when
   - over_mulitple_lines);

>/* Find the offset of the floating-point save area in the frame.  */
> -  floats_from_frame = offsets->saved_args - offsets->frame;
> +  floats_from_frame
> += offsets->saved_args
> + + arm_compute_static_chain_stack_bytes ()
> + - offsets->frame;
>  

Same here.

R.



[C++] PR58950: warn for unused __builtin_shuffle result

2014-01-03 Thread Marc Glisse

Hello,

this is the piece of patch you commented on in the PR.

Bootstrap+testsuite on x86_64-unknown-linux-gnu.

2014-01-03  Marc Glisse  

PR c++/58950
gcc/cp/
* cvt.c (convert_to_void): Handle VEC_PERM_EXPR and VEC_COND_EXPR.
gcc/testsuite/
* g++.dg/pr58950.C: New file.

--
Marc GlisseIndex: gcc/cp/cvt.c
===
--- gcc/cp/cvt.c(revision 206313)
+++ gcc/cp/cvt.c(working copy)
@@ -1396,21 +1396,23 @@ convert_to_void (tree expr, impl_conv_vo
  code = TREE_CODE (e);
  tclass = TREE_CODE_CLASS (code);
  if ((tclass == tcc_comparison
   || tclass == tcc_unary
   || (tclass == tcc_binary
   && !(code == MODIFY_EXPR
|| code == INIT_EXPR
|| code == PREDECREMENT_EXPR
|| code == PREINCREMENT_EXPR
|| code == POSTDECREMENT_EXPR
-   || code == POSTINCREMENT_EXPR)))
+   || code == POSTINCREMENT_EXPR))
+  || code == VEC_PERM_EXPR
+  || code == VEC_COND_EXPR)
   && (complain & tf_warning))
warning_at (loc, OPT_Wunused_value, "value computed is not 
used");
}
}
   expr = build1 (CONVERT_EXPR, void_type_node, expr);
 }
   if (! TREE_SIDE_EFFECTS (expr))
 expr = void_zero_node;
   return expr;
 }
Index: gcc/testsuite/g++.dg/pr58950.C
===
--- gcc/testsuite/g++.dg/pr58950.C  (revision 0)
+++ gcc/testsuite/g++.dg/pr58950.C  (working copy)
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-Wall" } */
+
+void f(){
+  int i __attribute__((vector_size(2*sizeof(int)))) = { 2, 3 };
+  __builtin_shuffle (i, i); /* { dg-warning "value computed is not used" } */
+  ++i?1:0; /* { dg-warning "value computed is not used" } */
+}

Property changes on: gcc/testsuite/g++.dg/pr58950.C
___
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Revision URL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property


[COMMITTED] [PATCH] fixed pr59651 & new test case

2014-01-03 Thread Bingfeng Mei
Jakub, thanks. Committed with suggested changes.

Bingfeng

-Original Message-
From: Jakub Jelinek [mailto:ja...@redhat.com] 
Sent: 03 January 2014 14:26
To: Bingfeng Mei
Cc: gcc-patches@gcc.gnu.org; tbela...@arm.com
Subject: Re: [PING] [PATCH] fixed pr59651 & new test case

On Fri, Jan 03, 2014 at 02:07:03PM +, Bingfeng Mei wrote:
> This patch fixes pr59651.  The original regression for pr52943 only
> appears on AArch64 target.  I constructed a similar test that also exposes
> bug on x86-64.  The problem is that calculation of address range in alias
> versioning for loops with negative step is wrong during vectorization. 
> For example, for a loop that access int a[3] -> a[1], the old calculated
> address range is [a, a+12).  It should be [a+4, a+16) instead.

--- tree-vect-loop-manip.c  (revision 206279)
+++ tree-vect-loop-manip.c  (working copy)
@@ -2241,12 +2241,26 @@ vect_create_cond_for_alias_checks (loop_
   tree seg_a_min = addr_base_a;
   tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
   if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
-   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
+   {
+	  seg_a_min
+	    = fold_build_pointer_plus (seg_a_max,
+				       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))));
+	  seg_a_max
+	    = fold_build_pointer_plus (addr_base_a,
+				       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))));
+   }

Too long lines, can you create a temporary tree var for TYPE_SIZE_UNIT (...); 
value?
Can you add a comment why you do this?

--- testsuite/gcc.dg/torture/pr59651.c  (revision 0)
+++ testsuite/gcc.dg/torture/pr59651.c  (revision 0)

Perhaps better would be to put this test into gcc.dg/vect/pr59651.c
(with the required /* { dg-final { cleanup-tree-dump "vect" } } */ ),
so that it is also tested say on i686-linux with -msse2 etc.
Or duplicate between those two places (the body of the test can be
included in one of those two from the other place using relative path).

@@ -0,0 +1,20 @@
+/* { dg-do run } */
+
+extern void abort (void);
+int a[] = { 0, 0, 0, 0, 0, 0, 0, 6 };
+
+int b;
+int
+main ()
+{
+  for (;;)
+{
+  b = 7;
+  for (; b; b -= 1)

Sounds like C-reduce weirdness, just write for (b = 7; b; --b)

+   a[b] = a[7] > 1;
+  break;
+}
+  if (a[1] != 0)
+abort ();
+  return 0;
+}

Ok with those changes.

Jakub


[PING][GOMP4][PATCH] SIMD-enabled functions (formerly Elemental functions) for C++

2014-01-03 Thread Iyer, Balaji V
Hello Everyone,
Did anyone get a chance to look into this?

Thanks,

Balaji V. Iyer.

> -Original Message-
> From: Iyer, Balaji V
> Sent: Monday, December 23, 2013 11:51 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Jakub Jelinek
> Subject: [PING][GOMP4][PATCH] SIMD-enabled functions (formerly
> Elemental functions) for C++
> 
> Ping!
> 
> -Balaji V. Iyer.
> 
> > -Original Message-
> > From: Iyer, Balaji V
> > Sent: Thursday, December 19, 2013 1:12 PM
> > To: Jakub Jelinek
> > Cc: 'Aldy Hernandez (al...@redhat.com)'; 'gcc-patches@gcc.gnu.org'
> > Subject: RE: [GOMP4][PATCH] SIMD-enabled functions (formerly
> Elemental
> > functions) for C++
> >
> > Hi Jakub,
> > Attached, please find a fixed patch. I have answered your questions
> > below. Is this OK for trunk?
> >
> > Here are the ChangeLog entries:
> > Gcc/cp/ChangeLog
> > 2013-12-19  Balaji V. Iyer  
> >
> > * parser.c (cp_parser_direct_declarator): When Cilk Plus is enabled
> > see if there is an attribute after function decl.  If so, then
> > parse them now.
> > (cp_parser_late_return_type_opt): Handle parsing of Cilk Plus SIMD
> > enabled function late parsing.
> > (cp_parser_gnu_attribute_list): Parse all the tokens for the vector
> > attribute for a SIMD-enabled function.
> > (cp_parser_omp_all_clauses): Skip parsing to the end of pragma when
> > the function is used by SIMD-enabled function (indicated by NULL
> > pragma token).   Added 3 new clauses: PRAGMA_CILK_CLAUSE_MASK,
> > PRAGMA_CILK_CLAUSE_NOMASK and
> > PRAGMA_CILK_CLAUSE_VECTORLENGTH
> > (cp_parser_cilk_simd_vectorlength): Modified this function to handle
> > vectorlength clause in SIMD-enabled function and #pragma SIMD's
> > vectorlength clause.  Added a new bool parameter to differentiate
> > between the two.
> > (cp_parser_cilk_simd_fn_vector_attrs): New function.
> > (is_cilkplus_vector_p): Likewise.
> > (cp_parser_late_parsing_elem_fn_info): Likewise.
> > (cp_parser_omp_clause_name): Added a check for "mask," "nomask"
> > and "vectorlength" clauses when Cilk Plus is enabled.
> > (cp_parser_omp_clause_linear): Added a new parameter of type bool
> > and emit a sorry message when step size is a parameter.
> > * parser.h (cp_parser::cilk_simd_fn_info): New field.
> >
> > Testsuite/ChangeLog
> > 2013-12-19  Balaji V. Iyer  
> >
> > * g++.dg/cilk-plus/cilk-plus.exp: Called the C/C++ common tests for
> > SIMD enabled function.
> > * g++.dg/cilk-plus/ef_test.C: New test.
> > * c-c++-common/cilk-plus/vlength_errors.c: Added new dg-error tags
> > to differentiate C error messages from C++ ones.
> >
> > Thanks,
> >
> > Balaji V. Iyer.
> >
> > > -Original Message-
> > > From: Jakub Jelinek [mailto:ja...@redhat.com]
> > > Sent: Thursday, December 19, 2013 2:23 AM
> > > To: Iyer, Balaji V
> > > Cc: 'Aldy Hernandez (al...@redhat.com)'; 'gcc-patches@gcc.gnu.org'
> > > Subject: Re: [GOMP4][PATCH] SIMD-enabled functions (formerly
> > Elemental
> > > functions) for C++
> > >
> > > On Wed, Dec 18, 2013 at 11:36:04PM +, Iyer, Balaji V wrote:
> > > > --- a/gcc/cp/decl2.c
> > > > +++ b/gcc/cp/decl2.c
> > > > @@ -1124,6 +1124,10 @@ is_late_template_attribute (tree attr, tree
> > decl)
> > > >&& is_attribute_p ("omp declare simd", name))
> > > >  return true;
> > > >
> > > > +  /* Ditto as above for Cilk Plus SIMD-enabled function attributes.
> > > > + */  if (flag_enable_cilkplus && is_attribute_p ("cilk simd
> > > > + function",
> > > name))
> > > > +return true;
> > >
> > > Why?  It doesn't have any argument, why it should be processed late?
> > >
> >
> > Fixed.
> >
> > > > @@ -17097,6 +17102,14 @@ cp_parser_direct_declarator (cp_parser*
> > > > parser,
> > > >
> > > >   attrs = cp_parser_std_attribute_spec_seq (parser);
> > > >
> > > > + /* In here, we handle cases where attribute is used 
> > > > after
> > > > +the function declaration.  For example:
> > > > +void func (int x) __attribute__((vector(..)));  */
> > > > + if (flag_enable_cilkplus
> > > > + && cp_lexer_next_token_is_keyword (parser->lexer,
> > > > +RID_ATTRIBUTE))
> > > > +   attrs = chainon (cp_parser_gnu_attributes_opt 
> > > > (parser),
> > > > +attrs);
> > > >   late_return = (cp_parser_late_return_type_opt
> > > >  (parser, declarator,
> > > >   memfn ? cv_quals : -1));
> > >
> > > Doesn't this change the grammar (for all attributes, not just Cilk+
> > > specific
> > > ones) just based on whether -fcilkplus has been specified or not?
> > >
> >
> > OK. Fixed this by making it p

Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Joseph S. Myers
On Fri, 3 Jan 2014, Jakub Jelinek wrote:

> I've noticed that especially with the AVX512F introduction we use
> GET_MODE_SIZE (mode) quite heavily in the i386 *.md files, and
> the problem with that is GET_MODE_SIZE isn't a compile time constant,
> needs to read mode_size array (non-const) at runtime.

It would seem better to me for genmodes to generate appropriate macro / 
inline function definitions that can be used by GET_MODE_SIZE (along the 
lines of (__builtin_constant_p (MODE) && !mode_size_variable (MODE) ? 
get_mode_size_constant (MODE) : (unsigned short) mode_size[MODE]), where 
mode_size_variable and get_mode_size_constant are always_inline functions 
generated by genmodes) - that way all targets are covered automatically, 
without needing such duplication of mode sizes.  (Of course such 
optimizations wouldn't apply for a first-stage compiler bootstrapped by a 
compiler not supporting __builtin_constant_p, but lack of optimization in 
the first stage of a bootstrap is not a particular concern.)

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: wide-int, tree-ssa

2014-01-03 Thread Kenneth Zadeck

On 11/26/2013 07:34 AM, Richard Biener wrote:

--- a/gcc/tree-ssa-ccp.c
+++ b/gcc/tree-ssa-ccp.c
@@ -98,6 +98,15 @@ along with GCC; see the file COPYING3.  If not see
 array CONST_VAL[i].VALUE.  That is fed into substitute_and_fold for
 final substitution and folding.

+   This algorithm uses wide-ints at the max precision of the target.
+   This means that, with one uninteresting exception, variables with
+   UNSIGNED types never go to VARYING because the bits above the
+   precision of the type of the variable are always zero.  The
+   uninteresting case is a variable of UNSIGNED type that has the
+   maximum precision of the target.  Such variables can go to VARYING,
but this causes no loss of information since these variables will
+   never be extended.
+

I don't understand this.  In CCP a SSA name drops to varying when
it's not constant.  How is that related to signedness or precision?!


Richi,

This is a bogosity that is inherited from the double-int code. Consider 
an unsigned int in double-int (or for that matter in widest-int).   It 
has a rep of a bunch of leading zeros followed by 32 bits of real 
stuff.Even if you know nothing about the value, it still has the 
upper bits of zero that you know are constant zeros.   Of course this is 
not true for signed numbers because the upper bits are smeared with the 
sign bits.


This was what I thought you were talking about when you argued, many 
months ago, that infinite precision was more natural and 
that I would have trouble with a fixed precision based on the size of 
the type or mode; it is the reason that tree-ssa-ccp uses widest-int 
rather than wide-int.   I do actually believe that wide-int is more 
natural here.   However this bogosity does buy you a few constants.


My first cut at this pass used wide-int rather than widest-int, but 
there are in fact constants that the pass finds that depend on this 
being done this way, so to satisfy the bit-for-bit same code rule for 
values smaller than TImode, I left this the way that it was but 
decided to document it.


The proper way to do this is in fact to use wide-int, not widest-int but 
that would require major surgery to the code that converts values from 
one type to another.I plan to do this during the next stage 1.   At 
that time, i will also enhance the code to expand the sign bit if it is 
known for signed types.


Kenny


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 3:02 PM, Uros Bizjak  wrote:

>>> Like in the patch below. Please note, that the block_tune setting for
>>> the nocona is wrong, -march=native on my trusted old P4 returns:
>>>
>>> --param "l1-cache-size=16" --param "l1-cache-line-size=64" --param
>>> "l2-cache-size=2048" "-mtune=nocona"
>>>
>>> which is consistent with the above quote from manual.
>>>
>>> 2014-01-02  Uros Bizjak  
>>>
>>> * config/i386/i386.c (ix86_data_alignment): Calculate max_align
>>> from prefetch_block tune setting.
>>> (nocona_cost): Correct size of prefetch block to 64.
>>>
>>> The patch was bootstrapped on x86_64-pc-linux-gnu and is currently in
>>> regression testing. If there are no comments, I will commit it to
>>> mainline and release branches after a couple of days.
>>
>> That still has the effect of not aligning (for most tunings) 32 to 63 bytes
>> long aggregates to 32 bytes, while previously they were aligned.  Forcing
>> aligning 32 byte long aggregates to 64 bytes would be overkill, 32 byte
>> alignment is just fine for those (and ensures it never crosses 64 byte
>> boundary), for 33 to 63 bytes perhaps using 64 bytes alignment wouldn't
>> be that bad, just wouldn't match what we have done before.
>
> Please note that previous value was based on earlier (pre P4)
> recommendation and it was appropriate for older chips with 32byte
> cache line. The value should be updated long ago, when 64bit cache
> lines were introduced, but was probably missed due to usage of magic
> value without comment.
>
> Ah, I see. My patch deals only with structures, larger than cache
> line. As recommended in "As long as 16-byte boundaries (and cache
> lines) are never crossed, natural alignment is not strictly necessary
> (though it is an easy way to enforce this)." part of the manual, we
> should align smaller structures to 16 or 32 bytes.
>
> Yes, I agree. Can you please merge your patch together with the proposed 
> patch?

On a second thought, the crossing of 16-byte boundaries is mentioned
for the data *access* (the instruction itself) if it is not naturally
aligned (please see example 3-40 and fig 3-2), which is *NOT* in our
case.

So, we don't have to align 32 byte structures in any way for newer
processors, since this optimization applies to 64+ byte (larger or
equal to cache line size) structures only. Older processors are
handled correctly, modulo nocona, where its cache line size value has
to be corrected.

Following that, my original patch implements this optimization in the
correct way.

Uros.


[C PATCH] Don't pedwarn for C99/C11 enum bit-fields (PR c/57773)

2014-01-03 Thread Marek Polacek
As Paul Eggert says in the PR, we shouldn't warn for enum bit-fields
in C99/C11 mode.  C11 6.7.2.1 (5) says "A bit-field shall have a type
that is a qualified or unqualified version of _Bool, signed int,
unsigned int, or some other implementation-defined type.", so ISTM
that enum bit-fields should be fine.  OTOH, I would warn in ISO C
mode.  It's true that no constraint is violated, but in C89 in 3.5.2.1
Semantics there's: "A bit-field may have type int, unsigned int, or
signed int." so it seems desirable to warn in this case.

Regtested/bootstrapped on x86_64-linux, ok for trunk?

2014-01-03  Marek Polacek  

PR c/57773
c/
* c-decl.c (check_bitfield_type_and_width): Warn for enum bit-fields
only in ISO C.
testsuite/
* gcc.dg/pr57773.c: New test.

--- gcc/c/c-decl.c.mp   2014-01-03 13:50:37.041997222 +0100
+++ gcc/c/c-decl.c  2014-01-03 14:46:29.115816235 +0100
@@ -4840,7 +4840,8 @@ check_bitfield_type_and_width (tree *typ
   if (!in_system_header_at (input_location)
   && type_mv != integer_type_node
   && type_mv != unsigned_type_node
-  && type_mv != boolean_type_node)
+  && type_mv != boolean_type_node
+  && !flag_isoc99)
 pedwarn (input_location, OPT_Wpedantic,
 "type of bit-field %qs is a GCC extension", name);
 
--- gcc/testsuite/gcc.dg/pr57773.c.mp   2014-01-03 14:50:51.097902818 +0100
+++ gcc/testsuite/gcc.dg/pr57773.c  2014-01-03 14:49:05.727462306 +0100
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c99 -Wpedantic" } */
+
+enum e { zero };
+struct { enum e field: 2; } s;

Marek


Re: [C PATCH] Don't pedwarn for C99/C11 enum bit-fields (PR c/57773)

2014-01-03 Thread Joseph S. Myers
On Fri, 3 Jan 2014, Marek Polacek wrote:

> As Paul Eggert says in the PR, we shouldn't warn for enum bit-fields
> in C99/C11 mode.  C11 6.7.2.1 (5) says "A bit-field shall have a type
> that is a qualified or unqualified version of _Bool, signed int,
> unsigned int, or some other implementation-defined type.", so ISTM
> that enum bit-fields should be fine.  OTOH, I would warn in ISO C
> mode.  It's true that no constraint is violated, but in C89 in 3.5.2.1
> Semantics there's: "A bit-field may have type int, unsigned int, or
> signed int." so it seems desirable to warn in this case.
> 
> Regtested/bootstrapped on x86_64-linux, ok for trunk?

Implementation-defined behavior is documented in implement-c.texi, so this 
patch is incomplete as it doesn't update that file where it says:

No other types are permitted in strictly conforming mode.
@c Would it be better to restrict the pedwarn for other types to C90
@c mode and document the other types for C99/C11 mode?

(And this isn't just about enums, but other integer types as well, so the 
test should cover those.)

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Paolo Carlini

Hi,

On 01/02/2014 10:46 PM, Jason Merrill wrote:

On 12/27/2013 07:02 AM, Paolo Carlini wrote:

the same arguments. Conservatively but still more neatly than my first
try, we could maybe use same_type_ignoring_top_level_qualifiers_p in the
definition of the DERIVED_FROM_P macro?


Sure, let's do that.  And add something about incomplete types to the 
pre-function comment for lookup_base.

Great. Thus I successfully tested on x86_64-linux the below.

Thanks,
Paolo.


/cp
2014-01-03  Paolo Carlini  

* cp-tree.h (DERIVED_FROM_P): True when PARENT and TYPE are the same
class-type (even if incomplete).
* search.c (lookup_base): Extend comment.

/testsuite
2014-01-03  Paolo Carlini  

* g++.dg/ext/is_base_of_incomplete-2.C: New.
Index: cp/cp-tree.h
===
--- cp/cp-tree.h(revision 206313)
+++ cp/cp-tree.h(working copy)
@@ -1324,7 +1324,9 @@ enum languages { lang_c, lang_cplusplus, lang_java
 /* Nonzero iff TYPE is derived from PARENT. Ignores accessibility and
ambiguity issues.  */
 #define DERIVED_FROM_P(PARENT, TYPE) \
-  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE)
+  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE \
+   || ((TYPE) && NON_UNION_CLASS_TYPE_P (TYPE) \
+   && same_type_ignoring_top_level_qualifiers_p ((PARENT), (TYPE
 
 /* Gives the visibility specification for a class type.  */
 #define CLASSTYPE_VISIBILITY(TYPE) \
Index: cp/search.c
===
--- cp/search.c (revision 206313)
+++ cp/search.c (working copy)
@@ -177,8 +177,8 @@ accessible_base_p (tree t, tree base, bool conside
discovered.
 
If the base is inaccessible, or ambiguous, then error_mark_node is
-   returned.  If the tf_error bit of COMPLAIN is not set, no error
-   is issued.  */
+   returned.  If the tf_error bit of COMPLAIN is not set, no error is
+   issued.  If the base is incomplete, then NULL_TREE is returned.  */
 
 tree
 lookup_base (tree t, tree base, base_access access,
Index: testsuite/g++.dg/ext/is_base_of_incomplete-2.C
===
--- testsuite/g++.dg/ext/is_base_of_incomplete-2.C  (revision 0)
+++ testsuite/g++.dg/ext/is_base_of_incomplete-2.C  (working copy)
@@ -0,0 +1,5 @@
+struct T;
+
+int check1[__is_base_of(T, T) ? 1 : -1];
+int check2[__is_base_of(T, const T) ? 1 : -1];
+int check3[__is_base_of(volatile T, T) ? 1 : -1];


Re: Rb tree node recycling patch

2014-01-03 Thread François Dumont

Thanks for your feedback.

I considered my patch enough tested because it impacts basic 
operations on the Rb tree containers. There are also a number of tests 
that are normally dedicated to exception safety but that has the 
advantage of challenging numerous containers operations in a generic way 
and with some randomness. I have already enhanced those tests, adding 
checks on memory leaks. It is the biggest issue that could occur with 
this patch. I will add some tests to this framework to cover the newly 
introduced C++11 allocator aware methods.


To really know how bad or good is the testsuite we should also work 
on test coverage computation but that's a big subject, perhaps for a GSOC...


François


On 01/03/2014 12:57 PM, Christopher Jefferson wrote:

Personally, I consider the testing insufficient, although the testing
was already insufficient! Note that this is my opinion, don't assume I
talk for all of libstdc++!

After I broke std::nth_element in 4.8.2 (woops), I am of the view that
the libstdc++ testsuite is deficient -- many code paths are never
tested. The nth_element breakage should really have been picked up,
about 1/3 of all random invocations of nth_element failed!

You could be "inspired" by the code I wrote for algorithms, it is in
testsuite/util/testsuite_containergen.h. The basic idea is I make sure
to generate special cases (size 0 containers, containers containing
only a single value repeated), and then a range of containers of
various different sizes.

I realise suggesting this is probably as much work as your patch, and
you shouldn't assume this is required, but I think it would improve
the testsuite. If you do go down this route, then be sure to reduce
your test's size for simulators. You can look at for example at my new
tests which feature:

// { dg-options "-std=gnu++11" }
// { dg-options "-std=gnu++11 -DSIMULATOR_TEST" { target simulator } }

Which enables the macro SIMULATOR_TEST on simulators, where I do much
less testing (else the tester takes too long).


On 27 December 2013 18:30, François Dumont  wrote:

Hi

 Here is a patch to add recycling of Rb tree nodes when possible.

 I replaced the _Rb_tree::_M_move_assign with a move assignment operator
to benefit from:
- move of elements when the whole data structure cannot be moved
- faster data structure cloning rather than full regeneration of the tree
when _M_move_assign was failing

 Note that this patch contains also a cleanup of a useless template
parameter _Is_pod_comparator on _Rb_tree_impl. If you want to apply it
quickly for 4.9 do not hesitate.

 I haven't done any specific test for this feature, existing ones looks
good enough to me. If you want me to add some I will when back from
vacation. I am mostly submitting this patch to show you that I worked on it
and you do not need to do it yourself.

2013-12-27  François Dumont 

 * include/bits/stl_tree.h (_Rb_tree_reuse_or_alloc_node<>): New.
 (_Rb_tree_alloc_node<>): Likewise.
 (_Rb_tree<>::_M_clone_node): Made template to take a node
 generator.
 (_Rb_tree_impl<>): Remove unused _Is_pod_comparator template
 value.
 (_Rb_tree<>::_M_move_assign): Replace by...
 (_Rb_tree<>::operator(_Rb_tree&&)): ...this.
 (_Rb_tree_impl<>::_M_reset): New.
 (_Rb_tree<>::_M_insert_): Add node generator parameter.
 (_Rb_tree<>::_M_copy): Add overload taking a node generator.
 (_Rb_tree<>::_M_insert_unique_): Add node generator parameter.
 (_Rb_tree<>::_M_insert_equal_): Add node generator parameter.
 (_Rb_tree<>::_M_assign_unique): New.
 (_Rb_tree<>::_M_assign_equal): New.
 (_Rb_tree<>): Adapt to use _Rb_tree_impl<>::_M_reset and reuse
 nodes as much as possible.
 * include/bits/stl_set.h (set<>::operator=(set<>&&): Adapt to use
 _Rb_tree move assignment operator.
 (set<>::operator=(initializer_list<>)): Adapt to use
 _Rb_tree<>::_M_assign_unique.
 * include/bits/stl_multiset.h
 (multiset<>::operator=(multiset<>&&)): Adapt to use
 _Rb_tree move assignment operator.
 (multiset<>::operator=(initializer_list<>)): Adapt to use
 _Rb_tree<>::_M_assign_equal.
 * include/bits/stl_map.h (map<>::operator=(map<>&&): Adapt to use
 _Rb_tree move assignment operator.
 (map<>::operator=(initializer_list<>)): Adapt to use
 _Rb_tree<>::_M_assign_unique.
 * include/bits/stl_multimap.h
 (multimap<>::operator=(multimap<>&&)): Adapt to use
 _Rb_tree move assignment operator.
 (multimap<>::operator=(initializer_list<>)): Adapt to use
 _Rb_tree<>::_M_assign_equal.

Tested under Linux x86_64.

Happy new year.

François





Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Jason Merrill

On 01/03/2014 12:29 PM, Paolo Carlini wrote:

-  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE)
+  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE \
+   || ((TYPE) && NON_UNION_CLASS_TYPE_P (TYPE) \
+   && same_type_ignoring_top_level_qualifiers_p ((PARENT), (TYPE


Let's only check one or the other, depending on whether TYPE is incomplete.


+   issued.  If the base in incomplete, then NULL_TREE is returned.  */


"If T is incomplete"

Jason



Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Jason Merrill

On 01/03/2014 01:47 PM, Jason Merrill wrote:

+   issued.  If the base in incomplete, then NULL_TREE is returned.  */


"If T is incomplete"


"...even if BASE is the same type."

Jason




Re: [C++] PR58950: warn for unused __builtin_shuffle result

2014-01-03 Thread Jason Merrill

OK.

Jason


Re: Extend -fstack-protector-strong to cover calls with return slot

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 02:15:51PM +0100, Florian Weimer wrote:
> This patch fixes a loophole in the -fstack-protector-strong
> protection.  If a function call uses the return slot optimization,
> the caller needs stack protector instrumentation because the return
> slot is addressable.
> 
> Bootstrapped and regression-tested on x86_64-redhat-linux-gnu, with
> C/C++/Java enabled.  Okay for trunk?
> 
> -- 
> Florian Weimer / Red Hat Product Security Team

> 2014-01-03  Florian Weimer  
> 
>   * cfgexpand.c (call_with_return_slot_opt_p): New function.
>   (expand_used_vars): Emit stack protector instrumentation in strong
>   mode if call_with_return_slot_opt_p.
> 
> gcc/testsuite/
> 
> 2014-01-03  Florian Weimer  
> 
>   * gcc.dg/fstack-protector-strong.c: Add coverage for named return
>   values.
>   * g++.dg/fstack-protector-strong.C: Likewise.
> 
> Index: gcc/cfgexpand.c
> ===
> --- gcc/cfgexpand.c   (revision 206311)
> +++ gcc/cfgexpand.c   (working copy)
> @@ -1599,6 +1599,22 @@
>return 0;
>  }
>  
> +/* Check if the basic block has a call which uses a return slot.  */
> +
> +static bool
> +call_with_return_slot_opt_p (basic_block bb)
> +{
> +  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
> +   !gsi_end_p (gsi); gsi_next (&gsi))
> +{
> +  gimple stmt = gsi_stmt (gsi);
> +  if (gimple_code (stmt) == GIMPLE_CALL

That would be is_gimple_call (stmt) instead.

Also, I'd think the function is misnamed, given that it checks if there
are any calls with return_slot_opt_p in a bb.  I think it would be
better to move FOR_ALL_BB_FN (bb, cfun) loop also into the
function and call it any_call_...

Lastly, I wonder if gimple_call_return_slot_opt_p is really what you are
after, why does NRV matter here?  Isn't what you are looking for instead
whether the called function returns value through invisible reference,
because then you'll always have some (aggregate) addressable object
in the caller's frame and supposedly you are after making sure that the
callee doesn't overflow the return object.

So, looking at tree-nrv.c, that check would be roughly:
  if (is_gimple_call (stmt)
  && gimple_call_lhs (stmt)
  && aggregate_value_p (TREE_TYPE (gimple_call_lhs (stmt)),
gimple_call_fndecl (stmt)))

Jakub


[patch] libbacktrace: add support for --disable-werror

2014-01-03 Thread Mike Frysinger
In the same vein as the other dirs, add a --disable-werror option to the
libbacktrace dir to disable the explicit -Werror usage.

2014-01-03  Mike Frysinger  

* configure.ac: Add --enable-werror.
(WARN_FLAGS): Use it.
* configure: Regenerate.

--- a/libbacktrace/configure.ac
+++ a/libbacktrace/configure.ac
@@ -132,8 +132,13 @@ ACX_PROG_CC_WARNING_OPTS([-W -Wall -Wwri
  -Wmissing-format-attribute -Wcast-qual],
  [WARN_FLAGS])
 
+AC_ARG_ENABLE(werror, [AS_HELP_STRING([--enable-werror],
+  [turns on -Werror @<:@default=yes@:>@])])
+
 if test -n "${with_target_subdir}"; then
-  WARN_FLAGS="$WARN_FLAGS -Werror"
+  if test "x$enable_werror" != "xno"; then
+WARN_FLAGS="$WARN_FLAGS -Werror"
+  fi
 fi
 
 AC_SUBST(WARN_FLAGS)
--- a/libbacktrace/configure
+++ a/libbacktrace/configure
@@ -730,6 +730,7 @@ enable_fast_install
 with_gnu_ld
 enable_libtool_lock
 enable_multilib
+enable_werror
 with_system_libunwind
 '
   ac_precious_vars='build_alias
@@ -1370,6 +1371,7 @@ Optional Features:
   optimize for fast installation [default=yes]
   --disable-libtool-lock  avoid locking (might break parallel builds)
   --enable-multilib   build many library versions (default)
+  --enable-werror turns on -Werror [default=yes]
 
 Optional Packages:
   --with-PACKAGE[=ARG]use PACKAGE [ARG=yes]
@@ -11614,8 +11616,16 @@ fi
 CFLAGS="$save_CFLAGS"
 
 
+# Check whether --enable-werror was given.
+if test "${enable_werror+set}" = set; then :
+  enableval=$enable_werror;
+fi
+
+
 if test -n "${with_target_subdir}"; then
-  WARN_FLAGS="$WARN_FLAGS -Werror"
+  if test "x$enable_werror" != "xno"; then
+WARN_FLAGS="$WARN_FLAGS -Werror"
+  fi
 fi
 
 


signature.asc
Description: This is a digitally signed message part.


Re: wide-int, rtl

2014-01-03 Thread Mike Stump
On Jan 2, 2014, at 2:26 PM, Eric Botcazou  wrote:
>> So, I'd like to ping the original patch and Kenny's patch to resolve the
>> issues you found.  If you have any other concerns or thoughts, let us
>> know.
> 
> Almost OK, but remove the strange quotes in the comment for the INTEGER_CST 
> case of expand_expr_real_1

Ok, thanks.  I've resolved it this way:

Index: expr.c
===
--- expr.c  (revision 206323)
+++ expr.c  (working copy)
@@ -9477,11 +9477,10 @@ expand_expr_real_1 (tree exp, rtx target
   return decl_rtl;
 
 case INTEGER_CST:
-  /* "Given that TYPE_PRECISION (type) is not always equal to
+  /* Given that TYPE_PRECISION (type) is not always equal to
  GET_MODE_PRECISION (TYPE_MODE (type)), we need to extend from
  the former to the latter according to the signedness of the
- type". */
-
+ type. */
   temp = immed_wide_int_const (wide_int::from
   (exp,
GET_MODE_PRECISION (TYPE_MODE (type)),



Re: How to generate AVX512 instructions now (just to look at them).

2014-01-03 Thread Jakub Jelinek
Hi!

On Fri, Jan 03, 2014 at 08:58:30PM +0100, Toon Moene wrote:
> I don't doubt that would work, what I'm interested in, is (cat verintlin.f):

Well, you need gather loads for that and there you hit PR target/59617.

Completely untested patch that lets your testcase be vectorized
using 64-byte vectors, for vectorizable_mask_load_store it still punts,
but I guess the steps there are first to teach it about non-gather MASK_LOAD
and MASK_STORE, which aren't handled for the AVX512F modes either
(I think V8DI/V8DF/V16SI/V16SF modes should be possible to handle right now)
and then move on to handle the gathers similarly.

2014-01-03  Jakub Jelinek  

PR target/59617
* config/i386/i386.c (ix86_vectorize_builtin_gather): Uncomment
AVX512F gather builtins.
* tree-vect-stmts.c (vectorizable_mask_load_store): For now punt
on gather decls with INTEGER_TYPE masktype.
(vectorizable_load): For INTEGER_TYPE masktype, put the INTEGER_CST
directly into the builtin rather than hoisting it before loop.

--- gcc/config/i386/i386.c.jj   2014-01-03 13:19:14.0 +0100
+++ gcc/config/i386/i386.c  2014-01-03 21:12:23.630145609 +0100
@@ -36527,9 +36527,6 @@ ix86_vectorize_builtin_gather (const_tre
 case V8SImode:
   code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
   break;
-#if 0
-/*  FIXME: Commented until vectorizer can work with (mask_type != src_type)
-   PR59617.   */
 case V8DFmode:
   if (TARGET_AVX512F)
code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
@@ -36554,7 +36551,6 @@ ix86_vectorize_builtin_gather (const_tre
   else
return NULL_TREE;
   break;
-#endif
 default:
   return NULL_TREE;
 }
--- gcc/tree-vect-stmts.c.jj2014-01-03 11:41:01.0 +0100
+++ gcc/tree-vect-stmts.c   2014-01-03 21:29:47.595911084 +0100
@@ -1813,6 +1813,17 @@ vectorizable_mask_load_store (gimple stm
 "gather index use not simple.");
  return false;
}
+
+  tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
+  tree masktype
+   = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist;
+  if (TREE_CODE (masktype) == INTEGER_TYPE)
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"masked gather with integer mask not supported.");
+ return false;
+   }
 }
   else if (tree_int_cst_compare (nested_in_vect_loop
 ? STMT_VINFO_DR_STEP (stmt_info)
@@ -5761,6 +5772,7 @@ vectorizable_load (gimple stmt, gimple_s
{
  mask = build_int_cst (TREE_TYPE (masktype), -1);
  mask = build_vector_from_val (masktype, mask);
+ mask = vect_init_vector (stmt, mask, masktype, NULL);
}
   else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
{
@@ -5771,10 +5783,10 @@ vectorizable_load (gimple stmt, gimple_s
  real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
  mask = build_real (TREE_TYPE (masktype), r);
  mask = build_vector_from_val (masktype, mask);
+ mask = vect_init_vector (stmt, mask, masktype, NULL);
}
   else
gcc_unreachable ();
-  mask = vect_init_vector (stmt, mask, masktype, NULL);
 
   scale = build_int_cst (scaletype, gather_scale);
 


Jakub


Re: Extend -fstack-protector-strong to cover calls with return slot

2014-01-03 Thread Florian Weimer

On 01/03/2014 07:57 PM, Jakub Jelinek wrote:


+/* Check if the basic block has a call which uses a return slot.  */
+
+static bool
+call_with_return_slot_opt_p (basic_block bb)
+{
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+   !gsi_end_p (gsi); gsi_next (&gsi))
+{
+  gimple stmt = gsi_stmt (gsi);
+  if (gimple_code (stmt) == GIMPLE_CALL


That would be is_gimple_call (stmt) instead.


Ah, it's not used consistently everywhere, and I got it from one of the 
leftover places.



Also, I'd think the function is misnamed, given that it checks if there
are any calls with return_slot_opt_p in a bb.  I think it would be
better to move FOR_ALL_BB_FN (bb, cfun) loop also into the
function and call it any_call_...


I should probably move both loops (the one for declarations and the one 
for basic blocks) into its own function.



Lastly, I wonder if gimple_call_return_slot_opt_p is really what you are
after, why does NRV matter here?


The C code we generate does not construct the returned value in place 
(presumably because the partial write would be visible with threads, 
longjmp etc.), unlike the C++ code.


That's why I'm interested in instrumenting NRV-able calls only.  But 
gimple_call_return_slot_opt_p doesn't actually give me that.  The GIMPLE 
from the C test case looks like this (before and after applying your 
proposal):


foo11 ()
{
  int D.2265;
  struct B D.2266;

  D.2266 = global3 (); [return slot optimization]
  D.2265 = D.2266.a1;
  return D.2265;
}

In both cases, SSP instrumentation is applied:

.type   foo11, @function
foo11:
.LFB24:
.cfi_startproc
subq$56, %rsp
.cfi_def_cfa_offset 64
movq%rsp, %rdi
movq%fs:40, %rax
movq%rax, 40(%rsp)
xorl%eax, %eax
callglobal3
movq40(%rsp), %rdx
xorq%fs:40, %rdx
movl(%rsp), %eax
jne .L50
addq$56, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.L50:
.cfi_restore_state
call__stack_chk_fail
.cfi_endproc


Isn't what you are looking for instead
whether the called function returns value through invisible reference,
because then you'll always have some (aggregate) addressable object
in the caller's frame and supposedly you are after making sure that the
callee doesn't overflow the return object.

So, looking at tree-nrv.c, that check would be roughly:
   if (is_gimple_call (stmt)
   && gimple_call_lhs (stmt)
   && aggregate_value_p (TREE_TYPE (gimple_call_lhs (stmt)),
 gimple_call_fndecl (stmt)))


When I do that, I get SSP instrumentation even when the struct is small 
enough to be returned in registers.  gimple_call_return_slot_opt_p 
returns false in this case.  So gimple_call_return_slot_opt_p appears to 
be misnamed (it's an ABI thing, not really an optimization), but it's 
closer to what I want.


--
Florian Weimer / Red Hat Product Security Team


Re: [PATCH] i?86 unaligned/aligned load improvement for AVX512F

2014-01-03 Thread Kirill Yukhin
Hello,
On 03 Jan 09:59, Jakub Jelinek wrote:
> Does it matter which of vmovdqu32 vs. vmovdqu64 is used if no
> masking/zeroing is performed (i.e. vmovdqu32 (%rax), %zmm0 vs.
> vmovdqu64 (%rax), %zmm0) for performance reasons (i.e. isn't there some
> reinterpretation penalty)?
No, there should be no penalty (at least from today point of view).
So, I like your patch!

--
Thanks, K


Re: [Patch, Fortran, OOP] PR 59547: Problem with using tbp specification function in multiple class procedures

2014-01-03 Thread Mikael Morin
Le 22/12/2013 11:28, Janus Weil a écrit :
> Hi all,
> 
> here is a patch for a rejects-valid problem with type-bound
> procedures, which is due to the fact that the PURE attribute is being
> propagated too late. (I'm not sure if this problem could show up also
> with other attributes, so for now I'm only treating PURE.)
> 
> Regtested on x86_64-unknown-linux-gnu. Ok for trunk?
> 
OK.

Mikael


Re: [Patch] libgcov.c re-factoring

2014-01-03 Thread Teresa Johnson
On Sun, Dec 22, 2013 at 10:27 AM, Jan Hubicka  wrote:
>> On Tue, Dec 17, 2013 at 7:48 AM, Teresa Johnson  wrote:
>> > On Mon, Dec 16, 2013 at 2:48 PM, Xinliang David Li  
>> > wrote:
>> >> Ok -- gcov_write_counter and gcov_write_tag_length are qualified as
>> >> low level primitives for basic gcov format and probably should be kept
>> >> in gcov-io.c.
>> >>
>> >> gcov_rewrite is petty much libgcov runtime implementation details so I
>> >> think it should be moved out. gcov_write_summary is not related to
>> >> gcov low level format either, neither is gcov_seek.  Ok for them to be
>> >> moved?
>> >
>> > After looking at these some more, with the idea that gcov-io.c should
>> > encapsulate the lower level IO routines, then I think all of these
>> > (including gcov_rewrite) should remain in gcov-io.c. I think
>> > gcov_write_summary belongs there since all of the other gcov_write_*
>
> Yep, I think gcov_write_summary and read summary should not be split in 
> between
> two directories.  Similary for gcov_seek/rewrite I can see either the whole
> low-level I/O being abstracted away to -driver.c but currently -driver.c seems
> to contain higher level stuff that you apparently want to fork for kernel
> implementation instead and the actual i/o seems to remain in gcov-io.c
>
>> +GCOV_LINKAGE struct gcov_var gcov_var;
>
> If gcov_var is not used by gcov-io.c only, why its declaration remains in 
> gcov-io.h?

Good point - moved to gcov-io.c.

>> Index: libgcc/libgcov.h
>> ===
>> --- libgcc/libgcov.h(revision 0)
>> +++ libgcc/libgcov.h(revision 0)
>> @@ -0,0 +1,225 @@
>> +/* Header file for libgcov-*.c.
>> +   Contributed by Rong Xu .
>> +   Copyright (C) 2013 Free Software Foundation, Inc.
> I believe when the code was created by moving it from elsehwre, the copyright 
> should say
> original date of gcov-io.h.

Fixed the copyright header.

>> +
>> +#include "tconfig.h"
>> +#include "tsystem.h"
>> +#include "coretypes.h"
>> +#include "tm.h"
>> +#include "libgcc_tm.h"
> 
> I would really like someone working on header restructuring to comment on
> those.
> I am not 100% sure what our best practices currently are in respect of
> including headers within headers and specially after good amount of
> defines like gcov-io.h gets included.

Ok, I have left it the same as the prior patch for now. Note that
gcov-io.h was already including tconfig.h, which has now been moved to
libgcov.h. The other includes were pulled in from the various
libgcov-*.c files that were including them.

New patch (bootstrapped and regression tested) is included below.

>
>> +
>> +#include "gcov-io.h"
>
> Otherwise the patch  is OK (if header restructuring is fine and after
> moving gcov_var structure definition into gcov-io.c if possible).

Ok, hopefully someone can comment on the header file issue.

Thanks,
Teresa

>
> Honza

2014-01-03  Rong Xu  

* gcc/gcov-io.c (gcov_var): Move from gcov-io.h.
(gcov_position): Ditto.
(gcov_is_error): Ditto.
(gcov_rewrite): Ditto.
* gcc/gcov-io.h: Refactor. Move gcov_var to gcov-io.h, and libgcov
only part to libgcc/libgcov.h.
* libgcc/libgcov-driver.c: Use libgcov.h.
(buffer_fn_data): Use xmalloc instead of malloc.
(gcov_exit_merge_gcda): Ditto.
* libgcc/libgcov-driver-system.c (allocate_filename_struct): Ditto.
* libgcc/libgcov.h: New common header files for libgcov-*.h.
* libgcc/libgcov-interface.c: Use libgcov.h
* libgcc/libgcov-merge.c: Ditto.
* libgcc/libgcov-profiler.c: Ditto.
* libgcc/Makefile.in: Add dependence to libgcov.h

Index: gcc/gcov-io.c
===
--- gcc/gcov-io.c   (revision 206100)
+++ gcc/gcov-io.c   (working copy)
@@ -36,6 +36,61 @@ static const gcov_unsigned_t *gcov_read_words (uns
 static void gcov_allocate (unsigned);
 #endif

+/* Optimum number of gcov_unsigned_t's read from or written to disk.  */
+#define GCOV_BLOCK_SIZE (1 << 10)
+
+GCOV_LINKAGE struct gcov_var
+{
+  FILE *file;
+  gcov_position_t start;   /* Position of first byte of block */
+  unsigned offset; /* Read/write position within the block.  */
+  unsigned length; /* Read limit in the block.  */
+  unsigned overread;   /* Number of words overread.  */
+  int error;   /* < 0 overflow, > 0 disk error.  */
+  int mode;/* < 0 writing, > 0 reading */
+#if IN_LIBGCOV
+  /* Holds one block plus 4 bytes, thus all coverage reads & writes
+ fit within this buffer and we always can transfer GCOV_BLOCK_SIZE
+ to and from the disk. libgcov never backtracks and only writes 4
+ or 8 byte objects.  */
+  gcov_unsigned_t buffer[GCOV_BLOCK_SIZE + 1];
+#else
+  int endian;  /* Swap endianness.  */
+  /* Holds a variable length block, as the compiler can write
+ strings and n

Re: [Patch] libgcov.c re-factoring

2014-01-03 Thread Joseph S. Myers
On Fri, 3 Jan 2014, Teresa Johnson wrote:

> Index: libgcc/libgcov.h
> ===
> --- libgcc/libgcov.h(revision 0)
> +++ libgcc/libgcov.h(revision 0)
> @@ -0,0 +1,224 @@
> +/* Header file for libgcov-*.c.
> +   Copyright (C) 1996-2013 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it under
> +   the terms of the GNU General Public License as published by the Free
> +   Software Foundation; either version 3, or (at your option) any later
> +   version.
> +
> +   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +   WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +   for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   .  */

Any libgcc source file (.c/.h etc., not makefiles) should have the runtime 
license exception.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Paolo Carlini

Hi,

On 01/03/2014 07:47 PM, Jason Merrill wrote:

On 01/03/2014 12:29 PM, Paolo Carlini wrote:

-  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE)
+  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE \
+   || ((TYPE) && NON_UNION_CLASS_TYPE_P (TYPE) \
+   && same_type_ignoring_top_level_qualifiers_p ((PARENT), 
(TYPE
Let's only check one or the other, depending on whether TYPE is 
incomplete.

The dispatching seems tricky, though, because lookup_base has:

   if (!TYPE_P (t))
{
  t_binfo = t;
  t = BINFO_TYPE (t);
}
  else
{
  t = complete_type (TYPE_MAIN_VARIANT (t));
  t_binfo = TYPE_BINFO (t);
}

thus TYPE_P (t) isn't necessarily true and, more importantly, otherwise 
the function actively tries to complete the type. Do you think it can work?


Thanks,
Paolo.

PS: sorry about the sloppy comment in my last try ;)


Re: [Patch] libgcov.c re-factoring

2014-01-03 Thread Teresa Johnson
On Fri, Jan 3, 2014 at 2:49 PM, Joseph S. Myers  wrote:
> On Fri, 3 Jan 2014, Teresa Johnson wrote:
>
>> Index: libgcc/libgcov.h
>> ===
>> --- libgcc/libgcov.h(revision 0)
>> +++ libgcc/libgcov.h(revision 0)
>> @@ -0,0 +1,224 @@
>> +/* Header file for libgcov-*.c.
>> +   Copyright (C) 1996-2013 Free Software Foundation, Inc.
>> +
>> +   This file is part of GCC.
>> +
>> +   GCC is free software; you can redistribute it and/or modify it under
>> +   the terms of the GNU General Public License as published by the Free
>> +   Software Foundation; either version 3, or (at your option) any later
>> +   version.
>> +
>> +   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
>> +   WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> +   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
>> +   for more details.
>> +
>> +   You should have received a copy of the GNU General Public License
>> +   along with GCC; see the file COPYING3.  If not see
>> +   .  */
>
> Any libgcc source file (.c/.h etc., not makefiles) should have the runtime
> license exception.

Ok, thanks. I have changed this last paragraph to the following:

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   .  */

Teresa

>
> --
> Joseph S. Myers
> jos...@codesourcery.com



-- 
Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Paolo Carlini
... something like the attached appears to work. Not sure at the moment 
if it could be simplified.


Thanks,
Paolo.


Index: cp/cp-tree.h
===
--- cp/cp-tree.h(revision 206318)
+++ cp/cp-tree.h(working copy)
@@ -1324,7 +1324,12 @@ enum languages { lang_c, lang_cplusplus, lang_java
 /* Nonzero iff TYPE is derived from PARENT. Ignores accessibility and
ambiguity issues.  */
 #define DERIVED_FROM_P(PARENT, TYPE) \
-  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE)
+  (((TYPE) && (!TYPE_P (TYPE)  \
+  || TYPE_BINFO (complete_type \
+ (TYPE_MAIN_VARIANT (TYPE) \
+   ? lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE \
+   : ((TYPE) && NON_UNION_CLASS_TYPE_P (TYPE)  \
+  && same_type_ignoring_top_level_qualifiers_p ((PARENT), (TYPE
 
 /* Gives the visibility specification for a class type.  */
 #define CLASSTYPE_VISIBILITY(TYPE) \
Index: cp/search.c
===
--- cp/search.c (revision 206318)
+++ cp/search.c (working copy)
@@ -177,8 +177,9 @@ accessible_base_p (tree t, tree base, bool conside
discovered.
 
If the base is inaccessible, or ambiguous, then error_mark_node is
-   returned.  If the tf_error bit of COMPLAIN is not set, no error
-   is issued.  */
+   returned.  If the tf_error bit of COMPLAIN is not set, no error is
+   issued.  If T cannot be completed, then NULL_TREE is returned even
+   if BASE is the same type.  */
 
 tree
 lookup_base (tree t, tree base, base_access access,
Index: testsuite/g++.dg/ext/is_base_of_incomplete-2.C
===
--- testsuite/g++.dg/ext/is_base_of_incomplete-2.C  (revision 0)
+++ testsuite/g++.dg/ext/is_base_of_incomplete-2.C  (working copy)
@@ -0,0 +1,5 @@
+struct T;
+
+int check1[__is_base_of(T, T) ? 1 : -1];
+int check2[__is_base_of(T, const T) ? 1 : -1];
+int check3[__is_base_of(volatile T, T) ? 1 : -1];


Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 03:39:11PM +, Joseph S. Myers wrote:
> On Fri, 3 Jan 2014, Jakub Jelinek wrote:
> 
> > I've noticed that especially with the AVX512F introduction we use
> > GET_MODE_SIZE (mode) quite heavily in the i386 *.md files, and
> > the problem with that is GET_MODE_SIZE isn't a compile time constant,
> > needs to read mode_size array (non-const) at runtime.
> 
> It would seem better to me for genmodes to generate appropriate macro / 
> inline function definitions that can be used by GET_MODE_SIZE (along the 
> lines of (__builtin_constant_p (MODE) && !mode_size_variable (MODE) ? 
> get_mode_size_constant (MODE) : (unsigned short) mode_size[MODE]), where 
> mode_size_variable and get_mode_size_constant are always_inline functions 
> generated by genmodes) - that way all targets are covered automatically, 
> without needing such duplication of mode sizes.  (Of course such 
> optimizations wouldn't apply for a first-stage compiler bootstrapped by a 
> compiler not supporting __builtin_constant_p, but lack of optimization in 
> the first stage of a bootstrap is not a particular concern.)

That is certainly doable (as attached), but strangely if the patch (that I've
already committed) is reverted and this one applied, the .text savings are
much smaller.

Here are .text and .rodata readelf -WS lines from x86_64 (first 4 pairs) and
i686 (last 4 pairs) builds, always vanilla trunk before r206312, that plus
r206312 patch, without r206312 but with attached patch, with both r206312
and attached patch.  So, for .text size the best is both patches, but
for .rodata patches just r206312.  I'll try to look at details why this is so
next week.

  [12] .text PROGBITS004f4b00 0f4b00 1131704 00  AX 
 0   0 16
  [14] .rodata   PROGBITS01626240 1226240 4093b4 00   A 
 0   0 64
  [12] .text PROGBITS004f20a0 0f20a0 11156e4 00  AX 
 0   0 16
  [14] .rodata   PROGBITS016077c0 12077c0 3fcbb4 00   A 
 0   0 64
  [12] .text PROGBITS004f4c60 0f4c60 112b8b4 00  AX 
 0   0 16
  [14] .rodata   PROGBITS01620540 1220540 40b134 00   A 
 0   0 64
  [12] .text PROGBITS004f2200 0f2200 1113eb4 00  AX 
 0   0 16
  [14] .rodata   PROGBITS016060c0 12060c0 3fea74 00   A 
 0   0 64
  [12] .text PROGBITS0811d750 0d5750 12b4464 00  AX  0   0 
16
  [14] .rodata   PROGBITS093d1c00 1389c00 2d8094 00   A  0   0 
64
  [12] .text PROGBITS0811b150 0d3150 12996a4 00  AX  0   0 
16
  [14] .rodata   PROGBITS093b4840 136c840 2d1354 00   A  0   0 
64
  [12] .text PROGBITS0811d840 0d5840 12aa1e4 00  AX  0   0 
16
  [14] .rodata   PROGBITS093c7a40 137fa40 2d8b94 00   A  0   0 
64
  [12] .text PROGBITS0811b240 0d3240 1292914 00  AX  0   0 
16
  [14] .rodata   PROGBITS093adb80 1365b80 2d1f94 00   A  0   0 
64

2014-01-04  Jakub Jelinek  

* genmodes.c (struct mode_data): Add need_bytesize_adj field.
(blank_mode): Initialize it.
(emit_mode_size_inline, emit_mode_nunits_inline,
emit_mode_inner_inline): New functions.
(emit_insn_modes_h): Call them and surround their output with
#if GCC_VERSION >= 4001 ... #endif.
* machmode.h (GET_MODE_SIZE, GET_MODE_NUNITS, GET_MODE_INNER):
For GCC_VERSION >= 4001 use mode_*_inline routines instead of
mode_* arrays if the argument is __builtin_constant_p.
* lower-subreg.c (dump_choices): Make sure GET_MODE_SIZE argument
is enum machine_mode.
fortran/
* trans-types.c (gfc_init_kinds): Make sure GET_MODE_BITSIZE
argument is enum machine_mode.

--- gcc/lower-subreg.c.jj   2013-12-10 18:18:39.077943292 +0100
+++ gcc/lower-subreg.c  2014-01-03 18:35:00.510418999 +0100
@@ -1371,7 +1371,7 @@ dump_choices (bool speed_p, const char *
   fprintf (dump_file, "Choices when optimizing for %s:\n", description);
 
   for (i = 0; i < MAX_MACHINE_MODE; i++)
-if (GET_MODE_SIZE (i) > UNITS_PER_WORD)
+if (GET_MODE_SIZE ((enum machine_mode) i) > UNITS_PER_WORD)
   fprintf (dump_file, "  %s mode %s for copy lowering.\n",
   choices[speed_p].move_modes_to_split[i]
   ? "Splitting"
--- gcc/fortran/trans-types.c.jj2013-11-21 22:24:18.790939654 +0100
+++ gcc/fortran/trans-types.c   2014-01-03 18:35:00.534418997 +0100
@@ -373,7 +373,7 @@ gfc_init_kinds (void)
   /* The middle end doesn't support constants larger than 2*HWI.
 Perhaps the target hook shouldn't have accepted these either,
 but just to be safe...  */
-  bitsize = GET_MODE_BITSIZE (mode);
+  bitsize = GET_MODE_BITSIZE ((enum machine_mode) mode);
   if (bitsize > 2*HOST_BITS_PER_WIDE_INT)
continue;
 
--- gcc/machmode.h.jj   2013-11-29 18:22:15.6715

Re: wide-int, tree

2014-01-03 Thread Mike Stump
I'm wondering if all the outstanding issues you raised with "tree" have been 
addressed.  If there are any that remain, let us know.

If they have been, is the original patch (as modified of course by approved 
later work) Ok?

Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Andrew Pinski
On Fri, Jan 3, 2014 at 3:37 PM, Jakub Jelinek  wrote:
> On Fri, Jan 03, 2014 at 03:39:11PM +, Joseph S. Myers wrote:
>> On Fri, 3 Jan 2014, Jakub Jelinek wrote:
>>
>> > I've noticed that especially with the AVX512F introduction we use
>> > GET_MODE_SIZE (mode) quite heavily in the i386 *.md files, and
>> > the problem with that is GET_MODE_SIZE isn't a compile time constant,
>> > needs to read mode_size array (non-const) at runtime.
>>
>> It would seem better to me for genmodes to generate appropriate macro /
>> inline function definitions that can be used by GET_MODE_SIZE (along the
>> lines of (__builtin_constant_p (MODE) && !mode_size_variable (MODE) ?
>> get_mode_size_constant (MODE) : (unsigned short) mode_size[MODE]), where
>> mode_size_variable and get_mode_size_constant are always_inline functions
>> generated by genmodes) - that way all targets are covered automatically,
>> without needing such duplication of mode sizes.  (Of course such
>> optimizations wouldn't apply for a first-stage compiler bootstrapped by a
>> compiler not supporting __builtin_constant_p, but lack of optimization in
>> the first stage of a bootstrap is not a particular concern.)
>
> That is certainly doable (as attached), but strangely if the patch (that I've
> already committed) is reverted and this one applied, the .text savings are
> much smaller.
>
> Here are .text and .rodata readelf -WS lines from x86_64 (first 4 pairs) and
> i686 (last 4 pairs) builds, always vanilla trunk before r206312, that plus
> r206312 patch, without r206312 but with attached patch, with both r206312
> and attached patch.  So, for .text size the best is both patches, but
> for .rodata patches just r206312.  I'll try to look at details why this is so
> next week.
>
>   [12] .text PROGBITS004f4b00 0f4b00 1131704 00  
> AX  0   0 16
>   [14] .rodata   PROGBITS01626240 1226240 4093b4 00   
> A  0   0 64
>   [12] .text PROGBITS004f20a0 0f20a0 11156e4 00  
> AX  0   0 16
>   [14] .rodata   PROGBITS016077c0 12077c0 3fcbb4 00   
> A  0   0 64
>   [12] .text PROGBITS004f4c60 0f4c60 112b8b4 00  
> AX  0   0 16
>   [14] .rodata   PROGBITS01620540 1220540 40b134 00   
> A  0   0 64
>   [12] .text PROGBITS004f2200 0f2200 1113eb4 00  
> AX  0   0 16
>   [14] .rodata   PROGBITS016060c0 12060c0 3fea74 00   
> A  0   0 64
>   [12] .text PROGBITS0811d750 0d5750 12b4464 00  AX  0   
> 0 16
>   [14] .rodata   PROGBITS093d1c00 1389c00 2d8094 00   A  0   
> 0 64
>   [12] .text PROGBITS0811b150 0d3150 12996a4 00  AX  0   
> 0 16
>   [14] .rodata   PROGBITS093b4840 136c840 2d1354 00   A  0   
> 0 64
>   [12] .text PROGBITS0811d840 0d5840 12aa1e4 00  AX  0   
> 0 16
>   [14] .rodata   PROGBITS093c7a40 137fa40 2d8b94 00   A  0   
> 0 64
>   [12] .text PROGBITS0811b240 0d3240 1292914 00  AX  0   
> 0 16
>   [14] .rodata   PROGBITS093adb80 1365b80 2d1f94 00   A  0   
> 0 64
>
> 2014-01-04  Jakub Jelinek  
>
> * genmodes.c (struct mode_data): Add need_bytesize_adj field.
> (blank_mode): Initialize it.
> (emit_mode_size_inline, emit_mode_nunits_inline,
> emit_mode_inner_inline): New functions.
> (emit_insn_modes_h): Call them and surround their output with
> #if GCC_VERSION >= 4001 ... #endif.
> * machmode.h (GET_MODE_SIZE, GET_MODE_NUNITS, GET_MODE_INNER):
> For GCC_VERSION >= 4001 use mode_*_inline routines instead of
> mode_* arrays if the argument is __builtin_constant_p.
> * lower-subreg.c (dump_choices): Make sure GET_MODE_SIZE argument
> is enum machine_mode.
> fortran/
> * trans-types.c (gfc_init_kinds): Make sure GET_MODE_BITSIZE
> argument is enum machine_mode.

It turns out Jorn filed a bug about this exact issue (back in 2008).
See bug 36109.


Thanks,
Andrew Pinski


>
> --- gcc/lower-subreg.c.jj   2013-12-10 18:18:39.077943292 +0100
> +++ gcc/lower-subreg.c  2014-01-03 18:35:00.510418999 +0100
> @@ -1371,7 +1371,7 @@ dump_choices (bool speed_p, const char *
>fprintf (dump_file, "Choices when optimizing for %s:\n", description);
>
>for (i = 0; i < MAX_MACHINE_MODE; i++)
> -if (GET_MODE_SIZE (i) > UNITS_PER_WORD)
> +if (GET_MODE_SIZE ((enum machine_mode) i) > UNITS_PER_WORD)
>fprintf (dump_file, "  %s mode %s for copy lowering.\n",
>choices[speed_p].move_modes_to_split[i]
>? "Splitting"
> --- gcc/fortran/trans-types.c.jj2013-11-21 22:24:18.790939654 +0100
> +++ gcc/fortran/trans-types.c   2014-01-03 18:35:00.534418997 +0100
> @@ -373,7 +373,7 @@ gfc_init_kinds (void)
>/* The middle end doesn't support constants larger than 2*HWI.
>  

Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Andrew Pinski
On Fri, Jan 3, 2014 at 6:27 PM, Andrew Pinski  wrote:
> On Fri, Jan 3, 2014 at 3:37 PM, Jakub Jelinek  wrote:
>> On Fri, Jan 03, 2014 at 03:39:11PM +, Joseph S. Myers wrote:
>>> On Fri, 3 Jan 2014, Jakub Jelinek wrote:
>>>
>>> > I've noticed that especially with the AVX512F introduction we use
>>> > GET_MODE_SIZE (mode) quite heavily in the i386 *.md files, and
>>> > the problem with that is GET_MODE_SIZE isn't a compile time constant,
>>> > needs to read mode_size array (non-const) at runtime.
>>>
>>> It would seem better to me for genmodes to generate appropriate macro /
>>> inline function definitions that can be used by GET_MODE_SIZE (along the
>>> lines of (__builtin_constant_p (MODE) && !mode_size_variable (MODE) ?
>>> get_mode_size_constant (MODE) : (unsigned short) mode_size[MODE]), where
>>> mode_size_variable and get_mode_size_constant are always_inline functions
>>> generated by genmodes) - that way all targets are covered automatically,
>>> without needing such duplication of mode sizes.  (Of course such
>>> optimizations wouldn't apply for a first-stage compiler bootstrapped by a
>>> compiler not supporting __builtin_constant_p, but lack of optimization in
>>> the first stage of a bootstrap is not a particular concern.)
>>
>> That is certainly doable (as attached), but strangely if the patch (that I've
>> already committed) is reverted and this one applied, the .text savings are
>> much smaller.
>>
>> Here are .text and .rodata readelf -WS lines from x86_64 (first 4 pairs) and
>> i686 (last 4 pairs) builds, always vanilla trunk before r206312, that plus
>> r206312 patch, without r206312 but with attached patch, with both r206312
>> and attached patch.  So, for .text size the best is both patches, but
>> for .rodata patches just r206312.  I'll try to look at details why this is so
>> next week.
>>
>>   [12] .text PROGBITS004f4b00 0f4b00 1131704 00  
>> AX  0   0 16
>>   [14] .rodata   PROGBITS01626240 1226240 4093b4 00  
>>  A  0   0 64
>>   [12] .text PROGBITS004f20a0 0f20a0 11156e4 00  
>> AX  0   0 16
>>   [14] .rodata   PROGBITS016077c0 12077c0 3fcbb4 00  
>>  A  0   0 64
>>   [12] .text PROGBITS004f4c60 0f4c60 112b8b4 00  
>> AX  0   0 16
>>   [14] .rodata   PROGBITS01620540 1220540 40b134 00  
>>  A  0   0 64
>>   [12] .text PROGBITS004f2200 0f2200 1113eb4 00  
>> AX  0   0 16
>>   [14] .rodata   PROGBITS016060c0 12060c0 3fea74 00  
>>  A  0   0 64
>>   [12] .text PROGBITS0811d750 0d5750 12b4464 00  AX  0   
>> 0 16
>>   [14] .rodata   PROGBITS093d1c00 1389c00 2d8094 00   A  0   
>> 0 64
>>   [12] .text PROGBITS0811b150 0d3150 12996a4 00  AX  0   
>> 0 16
>>   [14] .rodata   PROGBITS093b4840 136c840 2d1354 00   A  0   
>> 0 64
>>   [12] .text PROGBITS0811d840 0d5840 12aa1e4 00  AX  0   
>> 0 16
>>   [14] .rodata   PROGBITS093c7a40 137fa40 2d8b94 00   A  0   
>> 0 64
>>   [12] .text PROGBITS0811b240 0d3240 1292914 00  AX  0   
>> 0 16
>>   [14] .rodata   PROGBITS093adb80 1365b80 2d1f94 00   A  0   
>> 0 64
>>
>> 2014-01-04  Jakub Jelinek  
>>
>> * genmodes.c (struct mode_data): Add need_bytesize_adj field.
>> (blank_mode): Initialize it.
>> (emit_mode_size_inline, emit_mode_nunits_inline,
>> emit_mode_inner_inline): New functions.
>> (emit_insn_modes_h): Call them and surround their output with
>> #if GCC_VERSION >= 4001 ... #endif.
>> * machmode.h (GET_MODE_SIZE, GET_MODE_NUNITS, GET_MODE_INNER):
>> For GCC_VERSION >= 4001 use mode_*_inline routines instead of
>> mode_* arrays if the argument is __builtin_constant_p.
>> * lower-subreg.c (dump_choices): Make sure GET_MODE_SIZE argument
>> is enum machine_mode.
>> fortran/
>> * trans-types.c (gfc_init_kinds): Make sure GET_MODE_BITSIZE
>> argument is enum machine_mode.
>
> It turns out Jorn filed a bug about this exact issue (back in 2008).
> See bug 36109.

Also Kazu filed a similar thing about TREE_CODE_LENGTH and
TREE_CODE_CLASS, see bug 14840; I attached a patch but I never got
around to seeing if it improves compile time speed either.  It
definitely showed up in fold-const.c code at one point.

Thanks,
Andrew Pinski

>
>
> Thanks,
> Andrew Pinski
>
>
>>
>> --- gcc/lower-subreg.c.jj   2013-12-10 18:18:39.077943292 +0100
>> +++ gcc/lower-subreg.c  2014-01-03 18:35:00.510418999 +0100
>> @@ -1371,7 +1371,7 @@ dump_choices (bool speed_p, const char *
>>fprintf (dump_file, "Choices when optimizing for %s:\n", description);
>>
>>for (i = 0; i < MAX_MACHINE_MODE; i++)
>> -if (GET_MODE_SIZE (i) > UNITS_PER_WORD)
>> +if (GET_MODE_SIZE ((enum machine_mode) i) > UNITS_PER_WORD)

[Patch] Regex bracket matcher cache optimization

2014-01-03 Thread Tim Shen
The data structure _BracketMatcher (storing a regex bracket expression like
[123a-z]) could be quite slow, mainly because of regex_traits. So a result
cache for a small range of inputs (char) sounds reasonable. It iterates over
all 256 inputs and calculates their results at regex compile time.

Booted and tested under -m64 and -m32.

Thanks!


-- 
Regards,
Tim Shen
commit de1e43989be9c9dffebf488e118549cfe2987b9e
Author: tim 
Date:   Fri Jan 3 11:43:12 2014 -0500

2014-01-03  Tim Shen  

	* include/bits/regex_compiler.h (_AnyMatcher<>::_AnyMatcher(),
	_AnyMatcher<>::operator(), _AnyMatcher<>::_M_apply(),
	_CharMatcher<>::_CharMatcher(), _CharMatcher<>::_M_translate(),
	_BracketMatcher<>::_BracketMatcher(), _BracketMatcher<>::operator(),
	_BracketMatcher<>::_M_add_char(),
	_BracketMatcher<>::_M_add_collating_element(),
	_BracketMatcher<>::_M_add_equivalence_class(),
	_BracketMatcher<>::_M_add_character_class(),
	_BracketMatcher<>::_M_make_range(), _BracketMatcher<>::_M_ready(),
	_BracketMatcher<>::_M_apply(), _BracketMatcher<>::_M_make_cache()):
	Fix _AnyMatcher behavior of POSIX style and move _M_flags
	to template parameter; Add cache for _BracketMatcher. Adjust
	declarations from here...
	* include/bits/regex.h (basic_regex<>::imbue()): ...to here. Also,
	imbuing a regex will trigger a recompilation to rebuild the cache.
	* include/bits/regex_compiler.tcc (_Compiler<>::_M_atom(),
	_Compiler<>::_M_bracket_expression()): Adjust matchers' caller for
	different template bool parameters.
	* include/bits/regex_executor.h: Remove unnecessary declarations.
	* include/std/regex: Adjust including orders.
	* testsuite/28_regex/traits/char/user_defined.cc: New.
	* testsuite/28_regex/traits/wchar_t/user_defined.cc: New.

diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h
index 4e091e0..ae8e1f5 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -30,6 +30,15 @@
 
 namespace std _GLIBCXX_VISIBILITY(default)
 {
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+  template
+class basic_regex;
+
+  template
+class match_results;
+
+_GLIBCXX_END_NAMESPACE_VERSION
+
 namespace __detail
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
@@ -48,6 +57,56 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 		  const basic_regex<_CharT, _TraitsT>& __re,
 		  regex_constants::match_flag_type __flags);
 
+  template
+class _Executor;
+
+  template
+struct __has_contiguous_iter : std::false_type { };
+
+  template
+struct __has_contiguous_iter>
+: std::true_type  // string storage is contiguous
+{ };
+
+  template
+struct __has_contiguous_iter>
+: std::true_type  // vector storage is contiguous
+{ };
+
+  template
+struct __has_contiguous_iter>
+: std::false_type // vector storage is not contiguous
+{ };
+
+  template
+struct __is_contiguous_normal_iter : std::false_type { };
+
+  template
+struct
+__is_contiguous_normal_iter<__gnu_cxx::__normal_iterator<_Tp, _Cont>>
+: __has_contiguous_iter<_Cont>::type
+{ };
+
+  template
+using __enable_if_contiguous_normal_iter
+  = typename enable_if< __is_contiguous_normal_iter<_Iter>::value,
+			std::shared_ptr<_NFA<_TraitsT>> >::type;
+
+  template
+using __disable_if_contiguous_normal_iter
+  = typename enable_if< !__is_contiguous_normal_iter<_Iter>::value,
+			std::shared_ptr<_NFA<_TraitsT>> >::type;
+
+  template
+__disable_if_contiguous_normal_iter<_FwdIter, _TraitsT>
+__compile_nfa(_FwdIter __first, _FwdIter __last, const _TraitsT& __traits,
+		  regex_constants::syntax_option_type __flags);
+
+  template
+__enable_if_contiguous_normal_iter<_Iter, _TraitsT>
+__compile_nfa(_Iter __first, _Iter __last, const _TraitsT& __traits,
+		  regex_constants::syntax_option_type __flags);
+
 _GLIBCXX_END_NAMESPACE_VERSION
 }
 
@@ -501,6 +560,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	basic_regex(_FwdIter __first, _FwdIter __last,
 		flag_type __f = ECMAScript)
 	: _M_flags(__f),
+	  _M_original_str(__first, __last),
 	  _M_automaton(__detail::__compile_nfa(__first, __last, _M_traits,
 	   _M_flags))
 	{ }
@@ -637,6 +697,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	   flag_type __flags = ECMAScript)
 	{
 	  _M_flags = __flags;
+	  _M_original_str.assign(__s.begin(), __s.end());
 	  _M_automaton = __detail::__compile_nfa(__s.begin(), __s.end(),
 		 _M_traits, _M_flags);
 	  return *this;
@@ -701,7 +762,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*/
   locale_type
   imbue(locale_type __loc)
-  { return _M_traits.imbue(__loc); }
+  {
+	auto __ret = _M_traits.imbue(__loc);
+	this->assign(_M_original_str, _M_flags);
+	return __ret;
+  }
 
   /**
* @brief Gets the locale currently imbued in the regular expression
@@ -744,9 +809,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template
 	friend class __detail::_Executor;
 
-  flag_type _M_flags;
-   

Re: Fix IBM long double division inaccuracy (glibc bug 15396)

2014-01-03 Thread Alan Modra
On Thu, Jan 02, 2014 at 09:46:56PM +, Joseph S. Myers wrote:
> (Note that there remain other bugs in the IBM long double code, some
> causing glibc test failures, at least (a) invalid results in rounding
> modes other than FE_TONEAREST, (b) spurious overflow and underflow
> exceptions, mainly but not entirely where discontiguous mantissa bits
> are involved.)

Thing is, the algorithms in rs6000/ibm-ldouble.c require round to
nearest to generate correct results.  Quoting from the PowerPC64 ABI:

This "Extended precision" differs from the IEEE 754 Standard in the 
following ways:
 
 * The software support is restricted to round-to-nearest mode.  
   Programs that use extended precision must ensure that this rounding 
   mode is in effect when extended-precision calculations are performed.
 * Does not fully support the IEEE special numbers NaN and INF.  These
   values are encoded in the high-order double value only.  The 
   low-order value is not significant.  
 * Does not support the IEEE status flags for overflow, underflow, and 
   other conditions.  These flags have no meaning in this format.


-- 
Alan Modra
Australia Development Lab, IBM


Re: [patch] libbacktrace: add support for --disable-werror

2014-01-03 Thread Ian Lance Taylor
On Fri, Jan 3, 2014 at 11:19 AM, Mike Frysinger  wrote:
> In the same vein as the other dirs, add a --disable-werror option to the
> libbacktrace dir to disable the explicit -Werror usage.

As far as I can see most of the target directories do not do this.
And libbacktrace is only turning on -Werror when it is a target
directory.  I'm not necessarily opposed to this, but I question the
premise: this is not like the other dirs, this is something new.

Ian



> 2014-01-03  Mike Frysinger  
>
> * configure.ac: Add --enable-werror.
> (WARN_FLAGS): Use it.
> * configure: Regenerate.
>
> --- a/libbacktrace/configure.ac
> +++ a/libbacktrace/configure.ac
> @@ -132,8 +132,13 @@ ACX_PROG_CC_WARNING_OPTS([-W -Wall -Wwri
>   -Wmissing-format-attribute -Wcast-qual],
>   [WARN_FLAGS])
>
> +AC_ARG_ENABLE(werror, [AS_HELP_STRING([--enable-werror],
> +  [turns on -Werror 
> @<:@default=yes@:>@])])
> +
>  if test -n "${with_target_subdir}"; then
> -  WARN_FLAGS="$WARN_FLAGS -Werror"
> +  if test "x$enable_werror" != "xno"; then
> +WARN_FLAGS="$WARN_FLAGS -Werror"
> +  fi
>  fi
>
>  AC_SUBST(WARN_FLAGS)
> --- a/libbacktrace/configure
> +++ a/libbacktrace/configure
> @@ -730,6 +730,7 @@ enable_fast_install
>  with_gnu_ld
>  enable_libtool_lock
>  enable_multilib
> +enable_werror
>  with_system_libunwind
>  '
>ac_precious_vars='build_alias
> @@ -1370,6 +1371,7 @@ Optional Features:
>optimize for fast installation [default=yes]
>--disable-libtool-lock  avoid locking (might break parallel builds)
>--enable-multilib   build many library versions (default)
> +  --enable-werror turns on -Werror [default=yes]
>
>  Optional Packages:
>--with-PACKAGE[=ARG]use PACKAGE [ARG=yes]
> @@ -11614,8 +11616,16 @@ fi
>  CFLAGS="$save_CFLAGS"
>
>
> +# Check whether --enable-werror was given.
> +if test "${enable_werror+set}" = set; then :
> +  enableval=$enable_werror;
> +fi
> +
> +
>  if test -n "${with_target_subdir}"; then
> -  WARN_FLAGS="$WARN_FLAGS -Werror"
> +  if test "x$enable_werror" != "xno"; then
> +WARN_FLAGS="$WARN_FLAGS -Werror"
> +  fi
>  fi
>
>