[google] Handle incompatible cg options more generally in LIPO (issue6476057)

2012-08-23 Thread David Li
Index: coverage.c
===
--- coverage.c  (revision 190369)
+++ coverage.c  (working copy)
@@ -261,6 +261,56 @@ str_eq (const void *p1, const void *p2)
   return !strcmp (s1, s2);
 }
 
+/* Command line option descriptor.  */
+
+struct opt_desc
+{
+  const char *opt_str;
+  const char *opt_neg_str;
+  bool default_val;  /* TODO better handling of default  */
+};
+
+static struct opt_desc force_matching_cg_opts[] =
+  {
+{ "-fexceptions", "-fno-exceptions", true },
+{ "-fsized-delete", "-fno-sized-delete", false },
+{ NULL, NULL, false }
+  };
+
+/* A helper function to check if OPTION_STRING is one of the codegen
+   options specified in FORCE_MATCHING_CG_OPTS. If yes, set the
+   corresponding entry in CG_OPT_VAL to the value of the option specified
+   in OPTION_STRING.  */
+
+static void
+check_cg_opts (bool *cg_opt_val, const char *option_str)
+{
+  unsigned int i;
+  for (i = 0; force_matching_cg_opts[i].opt_str; i++)
+{
+  if (!strcmp (force_matching_cg_opts[i].opt_str, option_str))
+cg_opt_val[i] = true;
+  else if (!strcmp (force_matching_cg_opts[i].opt_neg_str, option_str))
+cg_opt_val[i] = false;
+}
+}
+
+/* A helper function to check if CG_OPTS1 and CG_OPTS2 are identical. It returns
+   true if yes, false otherwise.  */
+
+static bool
+has_incompatible_cg_opts (bool *cg_opts1, bool *cg_opts2, unsigned num_cg_opts)
+{
+  unsigned i;
+
+  for (i = 0; i < num_cg_opts; i++)
+{
+  if (cg_opts1[i] != cg_opts2[i])
+return true;
+}
+
+  return false;
+}
 
 /* Returns true if the command-line arguments stored in the given module-infos
are incompatible.  */
@@ -276,8 +326,6 @@ incompatible_cl_args (struct gcov_module
   unsigned int num_non_warning_opts1 = 0, num_non_warning_opts2 = 0;
   bool warning_mismatch = false;
   bool non_warning_mismatch = false;
-  bool with_fexceptions1 = true;
-  bool with_fexceptions2 = true;
   htab_t option_tab1, option_tab2;
   unsigned int start_index1 = mod_info1->num_quote_paths +
 mod_info1->num_bracket_paths + mod_info1->num_cpp_defines +
@@ -286,6 +334,22 @@ incompatible_cl_args (struct gcov_module
 mod_info2->num_bracket_paths + mod_info2->num_cpp_defines +
 mod_info2->num_cpp_includes;
 
+  bool *cg_opts1, *cg_opts2, has_any_incompatible_cg_opts;
+  unsigned int num_cg_opts = 0;
+
+  for (i = 0; force_matching_cg_opts[i].opt_str; i++)
+num_cg_opts++;
+
+  cg_opts1 = XCNEWVEC (bool, num_cg_opts);
+  cg_opts2 = XCNEWVEC (bool, num_cg_opts);
+
+  /* Initialize the array to default value  */
+  for (i = 0; force_matching_cg_opts[i].opt_str; i++)
+{
+  cg_opts1[i] = force_matching_cg_opts[i].default_val;
+  cg_opts2[i] = force_matching_cg_opts[i].default_val;
+}
+
   option_tab1 = htab_create (10, str_hash, str_eq, NULL);
   option_tab2 = htab_create (10, str_hash, str_eq, NULL);
 
@@ -297,12 +361,9 @@ incompatible_cl_args (struct gcov_module
 else
   {
 void **slot;
-char* option_string = mod_info1->string_array[start_index1 + i];
+char *option_string = mod_info1->string_array[start_index1 + i];
 
-if (!strcmp ("-fexceptions", option_string))
-  with_fexceptions1 = true;
-else if (!strcmp ("-fno-exceptions", option_string))
-  with_fexceptions1 = false;
+check_cg_opts (cg_opts1, option_string);
 
 slot = htab_find_slot (option_tab1, option_string, INSERT);
 if (!*slot)
@@ -319,12 +380,10 @@ incompatible_cl_args (struct gcov_module
 else
   {
 void **slot;
-char* option_string = mod_info2->string_array[start_index2 + i];
+char *option_string = mod_info2->string_array[start_index2 + i];
+
+check_cg_opts (cg_opts2, option_string);
 
-if (!strcmp ("-fexceptions", option_string))
-  with_fexceptions2 = true;
-else if (!strcmp ("-fno-exceptions", option_string))
-  with_fexceptions2 = false;
 slot = htab_find_slot (option_tab2, option_string, INSERT);
 if (!*slot)
   {
@@ -354,7 +413,7 @@ incompatible_cl_args (struct gcov_module
 warning (OPT_Wripa_opt_mismatch, "command line arguments mismatch for %s "
 "and %s", mod_info1->source_filename, mod_info2->source_filename);
 
-   if (warn_ripa_opt_mismatch && non_warning_mismatch 
+   if (warn_ripa_opt_mismatch && non_warning_mismatch
&& (flag_opt_info >= OPT_INFO_MED))
  {
inform (UNKNOWN_LOCATION, "Options for %s", mod_info1->source_filename);
@@ -365,14 +424,19 @@ incompatible_cl_args (struct gcov_module
  inform (UNKNOWN_LOCATION, non_warning_opts2[i]);
  }
 
-  XDELETEVEC (warning_opts1);
-  XDELETEVEC (warning_opts2);
-  XDELETEVEC (non_warning_opts1);
-  XDELETEVEC (non_warning_opts2);
-  htab_delete (option_tab1);
-  htab_delete (option_tab2);
-  return ((flag_ripa_disallow_opt_mismatch && non_warning_mismatch)
-  || (with_fexce

[google] fix a static promotion bug -- not handling aliases in LIPO mode (issue4879042)

2011-08-11 Thread David Li
The following patch will be applied to google branches.

2011-08-11  David Li  

* l-ipo.c (process_module_scope_static_func): Promote
aliases of static symbol in LIPO mode.

Index: l-ipo.c
===
--- l-ipo.c (revision 177629)
+++ l-ipo.c (working copy)
@@ -1850,6 +1850,7 @@ static void
 process_module_scope_static_func (struct cgraph_node *cnode)
 {
   tree decl = cnode->decl;
+  bool addr_taken;
 
   if (TREE_PUBLIC (decl)
   || !TREE_STATIC (decl)
@@ -1863,7 +1864,18 @@ process_module_scope_static_func (struct
 
   /* Can be local -- the promotion pass need to be done after
  callgraph build when address taken bit is set.  */
-  if (!cnode->address_taken)
+  addr_taken = cnode->address_taken;
+  if (!addr_taken && cnode->same_body)
+{
+  struct cgraph_node *alias = cnode->same_body;
+  while (alias)
+{
+ if (alias->address_taken)
+   addr_taken = true;
+  alias = alias->next;
+}
+}
+  if (!addr_taken)
 {
   tree assemb_id = create_unique_name (decl, cgraph_get_module_id (decl));
 
@@ -1875,10 +1887,32 @@ process_module_scope_static_func (struct
 
   if (cgraph_is_auxiliary (decl))
 {
+  unsigned mod_id;
+
   gcc_assert (cgraph_get_module_id (decl) != primary_module_id);
+  mod_id = cgraph_get_module_id (decl);
   /* Promote static function to global.  */
-  if (cgraph_get_module_id (decl))
-promote_static_var_func (cgraph_get_module_id (decl), decl, 1);
+  if (mod_id)
+{
+  promote_static_var_func (mod_id, decl, 1);
+
+  /* Process aliases  */
+  if (cnode->same_body)
+{
+  struct cgraph_node *alias = cnode->same_body;
+  while (alias)
+{
+  if (!alias->thunk.thunk_p)
+{
+  tree alias_decl = alias->decl;
+  /* Should assert  */
+  if (cgraph_get_module_id (alias_decl) == mod_id)
+promote_static_var_func (mod_id, alias_decl, 1);
+}
+   alias = alias->next;
+}
+}
+}
 }
   else
 {
@@ -1888,6 +1922,23 @@ process_module_scope_static_func (struct
 {
   promote_static_var_func (cgraph_get_module_id (decl), decl, 0);
   cgraph_mark_if_needed (decl);
+
+  /* Process aliases  */
+  if (cnode->same_body)
+{
+  struct cgraph_node *alias = cnode->same_body;
+  while (alias)
+{
+  if (!alias->thunk.thunk_p)
+{
+  tree alias_decl = alias->decl;
+  /* Should assert  */
+  if (cgraph_get_module_id (alias_decl) == 
cgraph_get_module_id (decl))
+promote_static_var_func (cgraph_get_module_id (decl), 
alias_decl, 0);
+}
+   alias = alias->next;
+}
+}
 }
 }
 }

--
This patch is available for review at http://codereview.appspot.com/4879042


[google]: test case fixes (issue4553055)

2011-05-22 Thread David Li
Due to the inliner change (in google/main), a couple of test cases under
gcc.dg/matrix start to fail (ICE in matrix reorg). The problem can be
reproduced in gcc46 with an increased inline limit (but not in trunk -- the
problem is either fixed or the inline behavior is still different). The following
workaround will be checked in to google/main. Along with the patch is a change
in LIPO. When forming the name for local labels in a function, to avoid conflict,
both the module id and funcdef_no are used. The new change adds one bit to
allow more functions (256k funcs per module) and a total of 16k modules -- we
saw cases where funcdef_no was greater than 128k in a module, leading to
overflow and conflict.

2011-05-21  David Li  

* testsuite/gcc.dg/matrix/transpose-1.c: Do not inline mem_init.
* testsuite/gcc.dg/matrix/transpose-2.c: Ditto.
* testsuite/gcc.dg/matrix/transpose-3.c: Ditto.
* testsuite/gcc.dg/matrix/transpose-4.c: Ditto.
* testsuite/gcc.dg/matrix/transpose-5.c: Ditto.
* testsuite/gcc.dg/matrix/transpose-6.c: Ditto.

Index: function.h
===
--- function.h  (revision 174024)
+++ function.h  (working copy)
@@ -673,7 +673,7 @@ struct GTY(()) function {
   GEN_FUNC_GLOBAL_ID (FUNC_DECL_MODULE_ID (func), FUNC_DECL_FUNC_ID (func))
 /* 32 bit wide unique id used for asm label (limit: 30k modules,
128k funcs per module.  */
-#define FUNC_LABEL_ID(func) ((FUNC_DECL_MODULE_ID (func) << 17) +\
+#define FUNC_LABEL_ID(func) ((FUNC_DECL_MODULE_ID (func) << 18) +\
  (func)->funcdef_no)
 
 /* Add the decl D to the local_decls list of FUN.  */
Index: testsuite/gcc.dg/matrix/transpose-1.c
===
--- testsuite/gcc.dg/matrix/transpose-1.c   (revision 174024)
+++ testsuite/gcc.dg/matrix/transpose-1.c   (working copy)
@@ -52,7 +52,7 @@ main (int argc, char **argv)
 /*--*/
 /* Dynamic memory allocations and initializations   */
 
-void
+__attribute__((noinline)) void
 mem_init (void)
 {
 
@@ -95,4 +95,3 @@ mem_init (void)
 /* { dg-final-use { scan-ipa-dump-times "Flattened 3 dimensions" 1 
"matrix-reorg"  } } */
 /* { dg-final-use { scan-ipa-dump-times "Transposed" 3 "matrix-reorg"  } } */
 /* { dg-final-use { cleanup-ipa-dump "matrix-reorg" } } */
-
Index: testsuite/gcc.dg/matrix/transpose-2.c
===
--- testsuite/gcc.dg/matrix/transpose-2.c   (revision 174024)
+++ testsuite/gcc.dg/matrix/transpose-2.c   (working copy)
@@ -50,7 +50,7 @@ main (int argc, char **argv)
 /*--*/
 /* Dynamic memory allocations and initializations   */
 
-void
+__attribute__((noinline)) void
 mem_init (void)
 {
 
Index: testsuite/gcc.dg/matrix/transpose-3.c
===
--- testsuite/gcc.dg/matrix/transpose-3.c   (revision 174024)
+++ testsuite/gcc.dg/matrix/transpose-3.c   (working copy)
@@ -54,7 +54,7 @@ main (int argc, char **argv)
 /*--*/
 /* Dynamic memory allocations and initializations   */
 
-void
+__attribute__((noinline)) void
 mem_init (void)
 {
 
Index: testsuite/gcc.dg/matrix/transpose-4.c
===
--- testsuite/gcc.dg/matrix/transpose-4.c   (revision 174024)
+++ testsuite/gcc.dg/matrix/transpose-4.c   (working copy)
@@ -52,7 +52,7 @@ main (int argc, char **argv)
 /*--*/
 /* Dynamic memory allocations and initializations   */
 
-void
+__attribute__((noinline)) void
 mem_init (void)
 {
 
Index: testsuite/gcc.dg/matrix/transpose-5.c
===
--- testsuite/gcc.dg/matrix/transpose-5.c   (revision 174024)
+++ testsuite/gcc.dg/matrix/transpose-5.c   (working copy)
@@ -49,7 +49,7 @@ main (int argc, char **argv)
 /*--*/
 /* Dynamic memory allocations and initializations   */
 
-void
+__attribute__((noinline)) void
 mem_init (void)
 {
 
Index: testsuite/gcc.dg/matrix/transpose-6.c
===
--- testsuite/gcc.dg/matrix/transpose-6.c   (revision 174024)
+++ testsuite/gcc.dg/matrix/transpose-6.c   (working copy)
@@ -49,7 +49,7 @@ main (int argc, char **argv)
 /*--*/
 /* Dynamic memory allocations and in

[google] static function promotion improvement patch for LIPO (issue4517117)

2011-05-30 Thread David Li
The following patch will be committed to google/main. It improves 
performance of internal benchmarks significantly.

This is a patch that improves static function promotion in LIPO mode.
1) Do not promote non-address-taken static functions -- this greatly
reduces the number of promotions and allows more DFE after inlining. This
also makes inline size estimation more consistent across profile-gen
and profile-use.
2) Delay static promotion until just before tree-profiling, after early
inlining is done. This is to ensure consistent size estimation.
3) Emit static init functions from aux modules. Those functions
will be eliminated later (they are not called from the global dtor/ctor) --
their existence is important to make sure address-taken (etc.) attributes
for called dtors/ctors are consistent between profile-gen and profile-use.

2011-05-30  David Li  

* cgraphunit.c (cgraph_optimize):  Remove call to static
promotion function.
* cp/decl2.c (cp_process_pending_declarations):  Do not
remove body of __static_init functions for aux modules.
* ipa-inline.c (leaf_node_p):  Filter indirect callsite
to make sure profile gen and profile use consistency
(cgraph_decide_inlining_incrementally):  Remove LIPO
specific inline rule used to workaround size estimation
problem for static functions.
* tree-profile.c (tree_profiling):  Do static promotion here.
* l-ipo.c (cgraph_is_aux_decl_external):  Handle non-promoted
static function.
(create_unique_name): New function.
(promote_static_var_func): Do not promote non addr taken statics.

Index: cgraphunit.c
===
--- cgraphunit.c(revision 174088)
+++ cgraphunit.c(working copy)
@@ -2030,9 +2030,6 @@ cgraph_optimize (void)
 {
   cgraph_init_gid_map ();
   cgraph_add_fake_indirect_call_edges ();
- /* Perform static promotion before IPA passes to avoid
-needed static functions being deleted.  */
-  cgraph_process_module_scope_statics ();
 }
 
   /* Don't run the IPA passes if there was any error or sorry messages.  */
Index: cp/decl2.c
===
--- cp/decl2.c  (revision 174088)
+++ cp/decl2.c  (working copy)
@@ -3901,10 +3901,11 @@ cp_process_pending_declarations (locatio
  to be created for auxiliary modules -- they are created to keep
  funcdef_no consistent between profile use and profile gen.  */
   for (i = 0; VEC_iterate (tree, ssdf_decls, i, fndecl); ++i)
- {
-   TREE_STATIC (fndecl) = 0;
-   DECL_INITIAL (fndecl) = 0;
- }
+/* Such ssdf_decls are not called from the GLOBAL ctor/dtor; mark
+  them reachable to avoid being eliminated too early before
+  gimplification.  */
+cgraph_mark_reachable_node (cgraph_node (fndecl));
+
   ssdf_decls = NULL;
   return;
 }
Index: ipa-inline.c
===
--- ipa-inline.c(revision 174088)
+++ ipa-inline.c(working copy)
@@ -1624,8 +1624,10 @@ static bool
 leaf_node_p (struct cgraph_node *n)
 {
   struct cgraph_edge *e;
+  /* The following is buggy -- indirect call is not considered.  */
   for (e = n->callees; e; e = e->next_callee)
-if (!is_inexpensive_builtin (e->callee->decl))
+if (e->call_stmt /* Only exisit in profile use pass in LIPO */
+   && !is_inexpensive_builtin (e->callee->decl))
   return false;
   return true;
 }
@@ -1640,8 +1642,6 @@ cgraph_decide_inlining_incrementally (st
   struct cgraph_edge *e;
   bool inlined = false;
   cgraph_inline_failed_t failed_reason;
-  bool after_tree_profile =
-(DECL_STRUCT_FUNCTION (node->decl))->after_tree_profile;
 
 #ifdef ENABLE_CHECKING
   verify_cgraph_node (node);
@@ -1718,20 +1718,7 @@ cgraph_decide_inlining_incrementally (st
  || !e->inline_failed
  || e->callee->local.disregard_inline_limits)
continue;
- /* Don't do cross-module inlining before profile-use, so that we have
-a consistent CFG between profile-gen and profile-use passes.  */
- if (!after_tree_profile
- && L_IPO_COMP_MODE
- && !cgraph_is_inline_body_available_in_module (
- e->callee->decl, cgraph_get_module_id (e->caller->decl)))
-   {
- e->inline_failed = CIF_NO_INTERMODULE_INLINE;
- if (dump_file)
-   fprintf (dump_file, "Not inlining considering inlining %s: 
%s\n",
-cgraph_node_name (e->callee),
-"Inter-module inlining disabled");
- continue;
-   }
+
  if (dump_file)
fprintf (dump_file, "Considering inline candidate %s.\n

[google] Backport r174536,174537,174762,174698 to google/main (issue4568059)

2011-06-07 Thread David Li
2011-06-07  David Li  
Backport trunk r174536,174537,174762,174698


Index: doc/invoke.texi
===
--- doc/invoke.texi (revision 174725)
+++ doc/invoke.texi (working copy)
@@ -5217,11 +5217,12 @@ appended with a sequential number starti
 Disable rtl pass @var{pass}.  @var{pass} is the pass name.  If the same pass is
 statically invoked in the compiler multiple times, the pass name should be
 appended with a sequential number starting from 1.  @var{range-list} is a comma
-seperated list of function ranges.  Each range is a number pair seperated by a 
colon.
-The range is inclusive in both ends.  If the range is trivial, the number pair 
can be
-simplified a a single number.  If the function's cgraph node's @var{uid} is 
falling
-within one of the specified ranges, the @var{pass} is disabled for that 
function.
-The @var{uid} is shown in the function header of a dump file.
+seperated list of function ranges or assembler names.  Each range is a number
+pair seperated by a colon.  The range is inclusive in both ends.  If the range
+is trivial, the number pair can be simplified as a single number.  If the
+function's cgraph node's @var{uid} is falling within one of the specified 
ranges,
+the @var{pass} is disabled for that function.  The @var{uid} is shown in the
+function header of a dump file.
 
 @item -fdisable-tree-@var{pass}
 @item -fdisable-tree-@var{pass}=@var{range-list}
@@ -5251,7 +5252,8 @@ of option arguments.
-fenable-tree-cunroll=1
 # disable gcse2 for functions at the following ranges [1,1],
 # [300,400], and [400,1000]
-   -fdisable-rtl-gcse2=1:100,300,400:1000
+# disable gcse2 for functions foo and foo2
+   -fdisable-rtl-gcse2=foo,foo2
 # disable early inlining
-fdisable-tree-einline
 # disable ipa inlining
Index: tree-pretty-print.c
===
--- tree-pretty-print.c (revision 174725)
+++ tree-pretty-print.c (working copy)
@@ -3015,3 +3015,40 @@ pp_base_tree_identifier (pretty_printer 
 pp_append_text (pp, IDENTIFIER_POINTER (id),
IDENTIFIER_POINTER (id) + IDENTIFIER_LENGTH (id));
 }
+
+/* A helper function that is used to dump function information before the
+   function dump.  */
+
+void
+dump_function_header (FILE *dump_file, tree fdecl, int flags)
+{
+  const char *dname, *aname;
+  struct cgraph_node *node = cgraph_get_node (fdecl);
+  struct function *fun = DECL_STRUCT_FUNCTION (fdecl);
+
+  dname = lang_hooks.decl_printable_name (fdecl, 2);
+
+  if (DECL_ASSEMBLER_NAME_SET_P (fdecl))
+aname = (IDENTIFIER_POINTER
+ (DECL_ASSEMBLER_NAME (fdecl)));
+  else
+aname = "";
+
+  fprintf (dump_file, "\n;; Function %s (%s, funcdef_no=%d",
+  dname, aname, fun->funcdef_no);
+  if (!(flags & TDF_NOUID))
+fprintf (dump_file, ", decl_uid=%d", DECL_UID (fdecl));
+  if (node)
+{
+  fprintf (dump_file, ", cgraph_uid=%d)%s\n\n", node->uid,
+   node->frequency == NODE_FREQUENCY_HOT
+   ? " (hot)"
+   : node->frequency == NODE_FREQUENCY_UNLIKELY_EXECUTED
+   ? " (unlikely executed)"
+   : node->frequency == NODE_FREQUENCY_EXECUTED_ONCE
+   ? " (executed once)"
+   : "");
+}
+  else
+fprintf (dump_file, ")\n\n");
+}
Index: tree-pretty-print.h
===
--- tree-pretty-print.h (revision 174725)
+++ tree-pretty-print.h (working copy)
@@ -50,6 +50,7 @@ extern void debug_generic_expr (tree);
 extern void debug_generic_stmt (tree);
 extern void debug_tree_chain (tree);
 extern void percent_K_format (text_info *);
+extern void dump_function_header (FILE *, tree, int);
 
 /* In toplev.c  */
 extern bool default_tree_printer (pretty_printer *, text_info *, const char *,
Index: ChangeLog.google-main
===
--- ChangeLog.google-main   (revision 174725)
+++ ChangeLog.google-main   (working copy)
@@ -1,3 +1,25 @@
+2011-06-07  David Li  
+
+   Backport trunk r174536,174537,174762,174698
+
+   * passes.c (enable_disable_pass): Handle assembler name.
+   (is_pass_explicitly_enabled_or_disabled): Ditto.
+   * predict.c : Change pass name
+   * ipa.c: Ditto.
+   * dce.c: Ditto.
+   * tree-profile.c: Ditto.
+   * except.c: Ditto.
+   * tree-pretty-print.c (dump_function_header): New function.
+   * final.c (rest_of_clean_state): Use header dumper.
+   * tree-cfg.c (gimple_dump_cfg): Use header dumper.
+   * passes.c (pass_init_dump_file): Use header dumper.
+   * tree-pretty-print.c (dump_function_header): Add flags.
+   Don't dump decl_uid with nouid.
+   * tree-pretty-print.h (dump_function_header): Adjust.

[google] Test case cleanup after pass name change (issue4528128)

2011-06-07 Thread David Li
Fix lipo test cases with new pass names.

2011-06-07  David Li  

* testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_1.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-1_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-2_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-3_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-4_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-5_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-7_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/indir-call-prof-single_0.c: Pass name 
fix.
* testsuite/gcc.dg/tree-prof/lipo/stringop-1_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_0.c: Pass name 
fix.
* testsuite/gcc.dg/tree-prof/lipo/stringop-2_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/update-loopch_0.c: Pass name fix.
* testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_1.c: Pass name 
fix.
* testsuite/g++.dg/tree-prof/lipo/vcall1_0.C: Pass name fix.
* testsuite/g++.dg/tree-prof/lipo/indir-call-prof_0.C: Pass name fix.
* testsuite/g++.dg/tree-prof/lipo/vcall1_1.C: Pass name fix.
* testsuite/g++.dg/tree-prof/lipo/vcall1_2.C: Pass name fix.

Index: testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c
===
--- testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c (revision 174725)
+++ testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c (working copy)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-tree_profile_ipa" } */
+/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile" } */
 
 extern void setp (int (**pp) (void), int i);
 
@@ -17,7 +17,7 @@ main (void)
   return 0;
 }
 
-/* { dg-final-use { scan-ipa-dump "Indirect call -> direct call.* a1" 
"tree_profile_ipa"} } */
+/* { dg-final-use { scan-ipa-dump "Indirect call -> direct call.* a1" 
"profile"} } */
 /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
 /* { dg-final-use { cleanup-tree-dump "optimized" } } */
-/* { dg-final-use { cleanup-ipa-dump "tree_profile_ipa" } } */
+/* { dg-final-use { cleanup-ipa-dump "profile" } } */
Index: testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_1.c
===
--- testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_1.c (revision 174725)
+++ testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_1.c (working copy)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-tree_profile_ipa" } */
+/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile" } */
 
 int a1 (void)
 {
Index: testsuite/gcc.dg/tree-prof/lipo/val-prof-1_0.c
===
--- testsuite/gcc.dg/tree-prof/lipo/val-prof-1_0.c  (revision 174725)
+++ testsuite/gcc.dg/tree-prof/lipo/val-prof-1_0.c  (working copy)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-tree_profile_ipa" } */
+/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile" } */
 int a[1000];
 int b = 256;
 int c = 257;
@@ -15,8 +15,8 @@ main ()
 }
   return 0;
 }
-/* { dg-final-use { scan-ipa-dump "Div.mod by constant n_\[0-9\]*=257 
transformation on insn" "tree_profile_ipa"} } */
+/* { dg-final-use { scan-ipa-dump "Div.mod by constant n_\[0-9\]*=257 
transformation on insn" "profile"} } */
 /* { dg-final-use { scan-tree-dump "if \\(n_\[0-9\]* != 257\\)" "optimized"} } 
*/
 /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
 /* { dg-final-use { cleanup-tree-dump "optimized" } } */
-/* { dg-final-use { cleanup-ipa-dump "tree_profile_ipa" } } */
+/* { dg-final-use { cleanup-ipa-dump "profile" } } */
Index: testsuite/gcc.dg/tree-prof/lipo/val-prof-2_0.c
===
--- testsuite/gcc.dg/tree-prof/lipo/val-prof-2_0.c  (revision 174725)
+++ testsuite/gcc.dg/tree-prof/lipo/val-prof-2_0.c  (working copy)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-tree_profile_ipa" } */
+/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile" } */
 unsigned int a[1000];
 unsigned int b = 256;
 unsigned int c = 1024;
@@ -23,10 +23,10 @@ main ()
 }
   return 0;
 }
-/* { dg-final-use { scan-ipa-dump "Mod power of 2 transformation on insn" 
"tree_profile_ipa" } } */
+/* { dg-final-use { scan-ipa-dump "Mod power of 2 transformation on insn" 
"profile"

[google] add module id in function header dump for lipo (issue4579046)

2011-06-07 Thread David Li
This is a trivial change to allow the module id to be displayed in the
function header in LIPO mode.

2011-06-07  David Li  

* tree-pretty-print.c   (revision 174779)
(dump_function_header): dump module id.

Index: tree-pretty-print.c
===
--- tree-pretty-print.c (revision 174779)
+++ tree-pretty-print.c (working copy)
@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3.  
 #include "tree-pass.h"
 #include "value-prof.h"
 #include "predict.h"
+#include "l-ipo.h"
 
 /* Local functions, macros and variables.  */
 static const char *op_symbol (const_tree);
@@ -3034,8 +3035,13 @@ dump_function_header (FILE *dump_file, t
   else
 aname = "";
 
-  fprintf (dump_file, "\n;; Function %s (%s, funcdef_no=%d",
-  dname, aname, fun->funcdef_no);
+  if (L_IPO_COMP_MODE)
+fprintf (dump_file, "\n;; Function %s (%s, funcdef_no=%d:%d",
+ dname, aname, FUNC_DECL_MODULE_ID (fun),
+ FUNC_DECL_FUNC_ID (fun));
+  else
+fprintf (dump_file, "\n;; Function %s (%s, funcdef_no=%d",
+ dname, aname, fun->funcdef_no);
   if (!(flags & TDF_NOUID))
 fprintf (dump_file, ", decl_uid=%d", DECL_UID (fdecl));
   if (node)
Index: Makefile.in
===
--- Makefile.in (revision 174779)
+++ Makefile.in (working copy)
@@ -2787,7 +2787,7 @@ tree-nomudflap.o : $(CONFIG_H) $(SYSTEM_
 tree-pretty-print.o : tree-pretty-print.c $(CONFIG_H) $(SYSTEM_H) \
$(TREE_H) $(DIAGNOSTIC_H) $(HASHTAB_H) $(TREE_FLOW_H) \
$(TM_H) coretypes.h tree-iterator.h $(SCEV_H) langhooks.h \
-   $(TREE_PASS_H) value-prof.h output.h tree-pretty-print.h
+   $(TREE_PASS_H) value-prof.h output.h tree-pretty-print.h l-ipo.h
 tree-diagnostic.o : tree-diagnostic.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
$(TREE_H) $(DIAGNOSTIC_H) tree-diagnostic.h langhooks.h $(LANGHOOKS_DEF_H)
 fold-const.o : fold-const.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \

--
This patch is available for review at http://codereview.appspot.com/4579046


[google] cprop pass's gate function cleanup (issue4572055)

2011-06-09 Thread David Li
The patch fixed the breakage due to the r174848 backport to google/main --
the gate function for cprop is in a different file from trunk.

2011-06-09  David Li  

* gcse.c (gate_rtl_cprop): Gate cleanup
(execute_rtl_cprop): Gate cleanup

Index: gcse.c
===
--- gcse.c  (revision 174852)
+++ gcse.c  (working copy)
@@ -5309,15 +5309,17 @@ one_cprop_pass (void)
 static bool
 gate_rtl_cprop (void)
 {
-  return optimize > 0 && flag_gcse
-&& !cfun->calls_setjmp
-&& dbg_cnt (cprop);
+  return optimize > 0 && flag_gcse;
 }
 
 static unsigned int
 execute_rtl_cprop (void)
 {
   int changed;
+  if (cfun->calls_setjmp
+  || !dbg_cnt (cprop))
+return 0;
+
   delete_unreachable_blocks ();
   df_set_flags (DF_LR_RUN_DCE);
   df_analyze ();

--
This patch is available for review at http://codereview.appspot.com/4572055


[google] Fix a bug leading to inconsistent comdat group in LIPO mode (issue4616041)

2011-06-14 Thread David Li
The patch will be committed to google/main to fix a problem in LIPO mode
that leads to a 'reference to discarded comdat section' ld warning. The problem
is caused by inconsistent comdat groups between the primary and aux modules
because thunks were skipped in the aux module.

2011-06-14   David Li  

* cp/semantics.c (emit_associated_thunks):
Do not omit thunk emission for aux modules.

Index: cp/semantics.c
===
--- cp/semantics.c  (revision 174851)
+++ cp/semantics.c  (working copy)
@@ -3415,8 +3415,7 @@ emit_associated_thunks (tree fn)
  enabling you to output all the thunks with the function itself.  */
   if (DECL_VIRTUAL_P (fn)
   /* Do not emit thunks for extern template instantiations.  */
-  && ! DECL_REALLY_EXTERN (fn)
-  && ! cgraph_is_auxiliary (fn))
+  && ! DECL_REALLY_EXTERN (fn))
 {
   tree thunk;
 

--
This patch is available for review at http://codereview.appspot.com/4616041


[google] Fix lipo regression test failures after merge from trunk (issue4806053)

2011-07-26 Thread David Li
The patch is committed to google/main to fix lipo test regressions after trunk 
merge.

2011-07-26  David Li  

* value-prof.c (gimple_value_profile_transformations): Remove redundant 
code.
* cgraphunit.c (cgraph_mark_functions_to_output): Fix assertion in lipo 
mode.
* ipa-inline.c (early_inliner): Check fake edge.
* l-ipo.c (pop_module_scope): Process alias node.
(cgraph_unify_type_alias_sets): Skip empty function.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-2_0.c: update test case

Index: value-prof.c
===
--- value-prof.c(revision 176763)
+++ value-prof.c(working copy)
@@ -613,18 +613,7 @@ gimple_value_profile_transformations (vo
 }
 
   if (changed)
-{
-  counts_to_freqs ();
-  /* Value profile transformations may change inline parameters
- a lot (e.g., indirect call promotion introduces new direct calls).
- The update is also needed to avoid compiler ICE -- when MULTI
- target icall promotion happens, the caller's size may become
- negative when the promoted direct calls get promoted.  */
-  /* Guard this for LIPO for now.  */
-  if (L_IPO_COMP_MODE)
-compute_inline_parameters (cgraph_get_node (current_function_decl),
-  false);
-}
+counts_to_freqs ();
 
   return changed;
 }
Index: cgraphunit.c
===
--- cgraphunit.c(revision 176763)
+++ cgraphunit.c(working copy)
@@ -1531,7 +1531,8 @@ cgraph_mark_functions_to_output (void)
  gcc_assert (node->global.inlined_to
  || !gimple_has_body_p (decl)
  || node->in_other_partition
- || DECL_EXTERNAL (decl));
+ || DECL_EXTERNAL (decl)
+  || cgraph_is_auxiliary (node->decl));
 
}
 
Index: testsuite/gcc.dg/tree-prof/lipo/val-prof-2_0.c
===
--- testsuite/gcc.dg/tree-prof/lipo/val-prof-2_0.c  (revision 176763)
+++ testsuite/gcc.dg/tree-prof/lipo/val-prof-2_0.c  (working copy)
@@ -26,7 +26,7 @@ main ()
 /* { dg-final-use { scan-ipa-dump "Mod power of 2 transformation on insn" 
"profile" } } */
 /* This is part of code checking that n is power of 2, so we are sure that the 
transformation
didn't get optimized out.  */
-/* { dg-final-use { scan-tree-dump "n_\[0-9\]* \\+ 0x" "optimized"} } */
+/* { dg-final-use { scan-tree-dump "n_\[0-9\]* \\+ (4294967295|0x0*)" 
"optimized"} } */
 /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
 /* { dg-final-use { cleanup-tree-dump "optimized" } } */
 /* { dg-final-use { cleanup-ipa-dump "profile" } } */
Index: ipa-inline.c
===
--- ipa-inline.c(revision 176763)
+++ ipa-inline.c(working copy)
@@ -2008,6 +2008,9 @@ early_inliner (void)
  for (edge = node->callees; edge; edge = edge->next_callee)
{
  struct inline_edge_summary *es = inline_edge_summary (edge);
+
+ if (!edge->call_stmt)
+   continue;
  es->call_stmt_size
= estimate_num_insns (edge->call_stmt, &eni_size_weights);
  es->call_stmt_time
Index: l-ipo.c
===
--- l-ipo.c (revision 176763)
+++ l-ipo.c (working copy)
@@ -390,6 +390,7 @@ pop_module_scope (void)
 primary_module_last_loc = input_location;
 
   at_eof = 1;
+  cgraph_process_same_body_aliases ();
   lang_hooks.l_ipo.process_pending_decls (input_location);
   lang_hooks.l_ipo.clear_deferred_fns ();
   at_eof = 0;
@@ -1067,7 +1068,8 @@ cgraph_unify_type_alias_sets (void)
 {
   push_cfun (DECL_STRUCT_FUNCTION (node->decl));
   current_function_decl = node->decl;
-  cgraph_collect_type_referenced ();
+  if (gimple_has_body_p (current_function_decl))
+cgraph_collect_type_referenced ();
   current_function_decl = NULL;
   pop_cfun ();
 }

--
This patch is available for review at http://codereview.appspot.com/4806053


[google] Do not declare pmu and sampling rate related vars for profile-use build (issue4832042)

2011-07-28 Thread David Li
The following trivial patch will be applied to google branches.
Bootstrapped and tested on x86-64/linux

2011-07-28  David Li  

* coverage.c (coverage_init): Remove checking of profile-use
flags.

Index: coverage.c
===
--- coverage.c  (revision 176765)
+++ coverage.c  (working copy)
@@ -1952,9 +1952,10 @@ coverage_init (const char *filename, con
 static bool
 profiling_enabled_p (void)
 {
-  return flag_pmu_profile_generate || profile_arc_flag ||
-  flag_profile_generate_sampling || flag_test_coverage ||
-  flag_branch_probabilities || flag_profile_reusedist;
+  return flag_pmu_profile_generate
+   || profile_arc_flag
+   || flag_profile_generate_sampling
+   || flag_profile_reusedist;
 }
 
 /* Construct variables for PMU profiling.

--
This patch is available for review at http://codereview.appspot.com/4832042


[google] LIPO regression tests and bug fixes (issue4444076)

2011-04-28 Thread David Li
This patch added regression tests for LIPO in google/main and fixes a couple of 
bugs found
in app testing:

1) duplicate assembler labels in .s file
2) missing icall profiling for indirectly called static functions
3) assertion in type 'merging'
4) gcov-dump bug.

Bootstrap compiler, regression test, and SPEC06 LIPO build.

Will commit to google/main.

Thanks,

David

2011-04-28  David Li  

* testsuite/gcc.dg/tree-prof/lipo/inliner-1.c   (revision 0): New test.
* testsuite/gcc.dg/tree-prof/lipo/gdb_cmd   (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/bb-reorg.c(revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/stringop-1.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/pr34999.c (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/stringop-2.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/update-loopch.c   (revision 0):
* 
testsuite/gcc.dg/tree-prof/lipo/indir-call-prof.c.040i.tree_profile_ipa   
(revision 0):
Ditto.
* testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1.c (revision 0): 
Ditto.
* testsuite/gcc.dg/tree-prof/lipo/update-tailcall.c (revision 0): 
Ditto.
* testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1a.c
(revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/lipo.exp  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/tracer-1.c(revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/indir-call-prof.c.145t.optimized  
(revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/indir-call-prof.c (revision 0): 
Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-1.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/wcoverage-mismatch.c  (revision 0): 
Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-2.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/pr45354.c (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-3.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-4.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-5.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-6.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-7.c  (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/pr47187.c (revision 0): Ditto.
* testsuite/gcc.dg/tree-prof/lipo/update-cunroll-2.c(revision 0): 
Ditto.
* testsuite/g++.dg/tree-prof/lipo/inline_mismatch_args.C
(revision 0): Ditto.
* testsuite/g++.dg/tree-prof/lipo/indir-call-prof-2.C   (revision 0): 
Ditto.
* testsuite/g++.dg/tree-prof/lipo/indir-call-prof.C (revision 0): 
Ditto.
* testsuite/g++.dg/tree-prof/lipo/partition1.C  (revision 0): Ditto.
* testsuite/g++.dg/tree-prof/lipo/partition2.C  (revision 0): Ditto.
* testsuite/g++.dg/tree-prof/lipo/partition3.C  (revision 0): Ditto.
* testsuite/g++.dg/tree-prof/lipo/lipo.exp  (revision 0): Ditto.
* final.c   (revision 173136) (profile_function): Use FUNC_LABEL_ID.
* dwarf2out.c   (revision 173136) (dwarf2out_vms_end_prologue): Ditto.
(dwarf2out_vms_begin_epilogue): Ditto.
(dwarf2out_vms_debug_main_pointer): Ditto.
* cgraphunit.c  (revision 173136) (cgraph_finalize_compilation_unit):
Remove eq type alias set computation.
* tree-profile.c(revision 173136) (gimple_gen_ic_func_profiler):
(gimple_gen_ic_func_topn_profiler): Do not skip any functions.
(tree_profiling): Add type alias merging.
* l-ipo.c   (revision 173136) (restore_post_parsing_states): Use
max funcdef_no.
(pop_module_scope): Use max funcdef_no.
* gcov-dump.c   (revision 173136) (tag_function): Fix a bug in function
read.

Index: final.c
===
--- final.c (revision 173136)
+++ final.c (working copy)
@@ -1623,7 +1623,7 @@ profile_function (FILE *file ATTRIBUTE_U
   int align = MIN (BIGGEST_ALIGNMENT, LONG_TYPE_SIZE);
   switch_to_section (data_section);
   ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
-  targetm.asm_out.internal_label (file, "LP", current_function_funcdef_no);
+  targetm.asm_out.internal_label (file, "LP", FUNC_LABEL_ID (cfun));
   assemble_integer (const0_rtx, LONG_TYPE_SIZE / BITS_PER_UNIT, align, 1);
 }
 
@@ -1636,7 +1636,7 @@ profile_function (FILE *file ATTRIBUTE_U
 ASM_OUTPUT_REG_PUSH (file, REGNO (chain));
 #endif
 
-  FUNCTION_PROFILER (file, current_function_funcdef_no);
+  FUNCTION_PROFILER (file, FUNC_LABEL_ID (cfun));
 
 #ifdef ASM_OUTPUT_REG_PUSH
   if (chain && REG_P (chain))
Index: cgraphunit.c
===
--- cgraphunit.c

[google] Multiple source LIPO test cases and bug fixes (issue4441084)

2011-05-01 Thread David Li
Hi, the following patch will be committed to google/main. The patch added 
multiple
source file support for FDO and a couple of multi-source test cases for LIPO. It
also includes a couple of bug fixes related to missing assembler name binding 
cleanup.

regression test and SPEC with LIPO.

David

2011-05-01  David Li  

* testsuite/lib/profopt.exp (proc): Multiple source file support.
* testsuite/gcc.dg/tree-prof/lipo/lipo.exp (load_lib):
Multiple source file support.
* testsuite/g++.dg/tree-prof/lipo/lipo.exp: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c: New test.
* testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_1.c: New test.
* testsuite/gcc.dg/tree-prof/lipo/lipo_inline1_0.c: New test.
* testsuite/gcc.dg/tree-prof/lipo/lipo_inline1_1.c: New test.
* testsuite/gcc.dg/tree-prof/lipo/lipo_inline1_2.c: New test.
* testsuite/g++.dg/tree-prof/lipo/vcall1_0.c: New test.
* testsuite/g++.dg/tree-prof/lipo/vcall1_1.c: New test.
* testsuite/g++.dg/tree-prof/lipo/vcall1_2.c: New test.
* testsuite/gcc.dg/tree-prof/lipo/inliner-1.c: Rename.
* testsuite/gcc.dg/tree-prof/lipo/stringop-1.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/update-loopch.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/stringop-2.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/update-tailcall.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1a.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/tracer-1.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/indir-call-prof.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-1.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-2.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-3.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-4.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-5.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-6.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/val-prof-7.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/pr47187.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/update-cunroll-2.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/bb-reorg.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/pr34999.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_0.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_1.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/wcoverage-mismatch.c: Ditto.
* testsuite/gcc.dg/tree-prof/lipo/pr45354.c: Ditto.
* testsuite/g++.dg/tree-prof/lipo/inline_mismatch_args.C: Ditto.
* testsuite/g++.dg/tree-prof/lipo/indir-call-prof-2.C: Ditto.
* testsuite/g++.dg/tree-prof/lipo/indir-call-prof.C: Ditto.
* testsuite/g++.dg/tree-prof/lipo/partition1.C: Ditto.
* testsuite/g++.dg/tree-prof/lipo/partition2.C: Ditto.
* testsuite/g++.dg/tree-prof/lipo/partition3.C: Ditto.
* cp/cp-objcp-common.c (cmp_templ_arg): Use assembler name for
decl comparison.
* l-ipo.c (clear_module_scope_bindings): Clear name binding for
assembler name.
(pop_module_scope): Clear name bindings for the last module.
(lipo_cmp_type): Handle template arg type.

Index: testsuite/lib/profopt.exp
===
--- testsuite/lib/profopt.exp   (revision 173148)
+++ testsuite/lib/profopt.exp   (working copy)
@@ -239,6 +239,27 @@ proc profopt-execute { src } {
 set executable $tmpdir/[file tail [file rootname $src].x]
 set basename [file tail $testcase]
 set base [file rootname $basename]
+set dir [file dirname $src]
+# multiple file test base
+set mbase [file rootname $basename]
+regsub "_0" $mbase "" mbase
+regsub "/" $mbase "" mbase
+set src_list $src
+set i 1
+set done 0
+while { !$done } {
+   set names [glob -nocomplain -types f -- "${dir}/${mbase}_${i}.*"]
+   if { [llength ${names}] > 1 } {
+   warning "profopt-execute: more than one file matched 
${dir}/${mbase}_${i}.*"
+   }
+   if { [llength ${names}] == 1 } {
+   lappend src_list [lindex ${names} 0]
+   incr i
+   } else {
+   set num_srcs ${i}
+   set done 1
+   }
+}
 
 set count 0
 foreach option $prof_option_list {
@@ -279,7 +300,8 @@ proc profopt-execute { src } {
set options "$extra_options"
lappend options "additional_flags=$option $extra_flags $profile_option"
set optstr "$option $profile_option"
-   set comp_output [${tool}_target_compile "$src" "$execname1" executable 
$options]
+   verbose "Hey $src_list $execname1 executable $options"
+   set comp_

[google] backport r17317,r17347,r17342 to google/main (issue4430080)

2011-05-03 Thread David Li
Backport patches r17342, r173177, and r173147 from trunk to google/main + some 
minor cleanups for problems found in test.

Tested: bootstrap + regression test + SPEC06 LIPO testing.

Backport r173177
2011-05-03  David Li  

* tree-profile.c (init_ic_make_global_vars): Set
tls attribute on ic vars.
* coverage.c (coverage_end_function): Initialize
function_list with zero.

Backport r173147
2011-05-03  David Li  

* tree.c (crc32_string): Use crc32_byte.
(crc32_byte): New function.
* tree.h (crc32_byte): New function.
* gcov.c (read_graph_file): Handle new cfg_cksum.
(read_count_file): Ditto.
* profile.c (instrument_values): Ditto.
(get_exec_counts): Ditto.
(read_profile_edge_counts): Ditto.
(compute_branch_probabilities): Ditto.
(compute_value_histograms): Ditto.
(branch_prob): Ditto.
(end_branch_prob): Ditto.
* coverage.c (read_counts_file): Ditto.
(get_coverage_counts): Ditto.
(tree_coverage_counter_addr): Ditto.
(coverage_checksum_string): Ditto.
(coverage_begin_output): Ditto.
(coverage_end_function): Ditto.
(build_fn_info_type): Ditto.
(build_fn_info_value): Ditto.
* libgcov.c (gcov_exit): Ditto.
* gcov-dump.c (tag_function): Ditto.
(compute_checksum): Remove.

2011-05-03  David Li  

* l-ipo.c (promote_static_var_or_func): Keep initializer
for externalized aux module variables.
(process_module_scope_static_var): Keep initializer
for promoted static vars to allow ccp.

Backport r17342
2011-05-03  Xinliang David Li  

* gcc.dg/tree-ssa/integer-addr.c: New test.
* gcc.dg/tree-ssa/alias_bug.c: New test.

Index: tree.c
===
--- tree.c  (revision 173345)
+++ tree.c  (working copy)
@@ -8489,14 +8489,12 @@ dump_tree_statistics (void)
 
 #define FILE_FUNCTION_FORMAT "_GLOBAL__%s_%s"
 
-/* Generate a crc32 of a string.  */
+/* Generate a crc32 of a byte.  */
 
 unsigned
-crc32_string (unsigned chksum, const char *string)
+crc32_byte (unsigned chksum, char byte)
 {
-  do
-{
-  unsigned value = *string << 24;
+  unsigned value = (unsigned) byte << 24;
   unsigned ix;
 
   for (ix = 8; ix--; value <<= 1)
@@ -8507,6 +8505,18 @@ crc32_string (unsigned chksum, const cha
  chksum <<= 1;
  chksum ^= feedback;
}
+  return chksum;
+}
+
+
+/* Generate a crc32 of a string.  */
+
+unsigned
+crc32_string (unsigned chksum, const char *string)
+{
+  do
+{
+  chksum = crc32_byte (chksum, *string);
 }
   while (*string++);
   return chksum;
@@ -8530,8 +8540,10 @@ clean_symbol_name (char *p)
   *p = '_';
 }
 
-/* Generate a name for a special-purpose function function.
+/* Generate a name for a special-purpose function.
The generated name may need to be unique across the whole link.
+   Changes to this function may also require corresponding changes to
+   xstrdup_mask_random.
TYPE is some string to identify the purpose of this function to the
linker or collect2; it must start with an uppercase letter,
one of:
Index: tree.h
===
--- tree.h  (revision 173345)
+++ tree.h  (working copy)
@@ -4948,6 +4948,7 @@ inlined_function_outer_scope_p (const_tr
 
 /* In tree.c */
 extern unsigned crc32_string (unsigned, const char *);
+extern unsigned crc32_byte (unsigned, char);
 extern void clean_symbol_name (char *);
 extern tree get_file_function_name (const char *);
 extern tree get_callee_fndecl (const_tree);
Index: gcov.c
===
--- gcov.c  (revision 173345)
+++ gcov.c  (working copy)
@@ -54,6 +54,13 @@ along with Gcov; see the file COPYING3. 
some places we make use of the knowledge of how profile.c works to
select particular algorithms here.  */
 
+/* The code validates that the profile information read in corresponds
+   to the code currently being compiled.  Rather than checking for
+   identical files, the code below computes a checksum on the CFG
+   (based on the order of basic blocks and the arcs in the CFG).  If
+   the CFG checksum in the gcda file match the CFG checksum for the
+   code currently being compiled, the profile data will be used.  */
+
 /* This is the size of the buffer used to read in source file lines.  */
 
 #define STRING_SIZE 200
@@ -161,7 +168,8 @@ typedef struct function_info
   /* Name of function.  */
   char *name;
   unsigned ident;
-  unsigned checksum;
+  unsigned lineno_checksum;
+  unsigned cfg_checksum;
 
   /* Array of basic blocks.  */
   block_t *blocks;
@@ -809,12 +817,14 @@ read_graph_file (void)
   if (tag == GCOV_TAG_FUNCTION)
{
  char *function_name;
- unsigned ident

[google] improves option mismatch handling for LIPO (issue4479045)

2011-05-04 Thread David Li
This patch improves cross module option mismatch handling in LIPO mode -- will 
be committed to google/main.

1) Remove duplicates in the option list before comparison;
2) Force module incompatibility when two modules disagree in -fexceptions 
setting. In LIPO mode, when option mismatch is discovered between the primary 
and aux module, a warning message is emitted, but the modules will be 
considered incompatible when -fripa-disallow-opt-mismatch is specified. With 
this change, exception option mismatch will force the primary module to reject 
the aux module.

Tested: SPEC with LIPO.


2011-05-04  David Li  

* coverage.c (incompatible_cl_args): Better handling
of option mismatch.

Index: coverage.c
===
--- coverage.c  (revision 173353)
+++ coverage.c  (working copy)
@@ -213,6 +213,27 @@ is_last_module (unsigned mod_id)
   return (mod_id == module_infos[num_in_fnames - 1]->ident);
 }
 
+/* String hash function  */
+
+static hashval_t
+str_hash (const void *p)
+{
+  const char *s = (const char *)p;
+  return htab_hash_string (s);
+}
+
+/* String equal function  */
+
+static int
+str_eq (const void *p1, const void *p2)
+{
+  const char *s1 = (const char *)p1;
+  const char *s2 = (const char *)p2;
+
+  return !strcmp (s1, s2);
+}
+
+
 /* Returns true if the command-line arguments stored in the given module-infos
are incompatible.  */
 static bool
@@ -227,6 +248,9 @@ incompatible_cl_args (struct gcov_module
   unsigned int num_non_warning_opts1 = 0, num_non_warning_opts2 = 0;
   bool warning_mismatch = false;
   bool non_warning_mismatch = false;
+  bool with_fexceptions1 = true;
+  bool with_fexceptions2 = true;
+  htab_t option_tab1, option_tab2;
   unsigned int start_index1 = mod_info1->num_quote_paths +
 mod_info1->num_bracket_paths + mod_info1->num_cpp_defines +
 mod_info1->num_cpp_includes;
@@ -234,22 +258,52 @@ incompatible_cl_args (struct gcov_module
 mod_info2->num_bracket_paths + mod_info2->num_cpp_defines +
 mod_info2->num_cpp_includes;
 
+  option_tab1 = htab_create (10, str_hash, str_eq, NULL);
+  option_tab2 = htab_create (10, str_hash, str_eq, NULL);
+
   /* First, separate the warning and non-warning options.  */
   for (i = 0; i < mod_info1->num_cl_args; i++)
 if (mod_info1->string_array[start_index1 + i][1] == 'W')
   warning_opts1[num_warning_opts1++] =
mod_info1->string_array[start_index1 + i];
 else
-  non_warning_opts1[num_non_warning_opts1++] =
-   mod_info1->string_array[start_index1 + i];
+  {
+void **slot;
+char* option_string = mod_info1->string_array[start_index1 + i];
+
+if (!strcmp ("-fexceptions", option_string))
+  with_fexceptions1 = true;
+else if (!strcmp ("-fno-exceptions", option_string))
+  with_fexceptions1 = false;
+
+slot = htab_find_slot (option_tab1, option_string, INSERT);
+if (!*slot)
+  {
+*slot = option_string;
+non_warning_opts1[num_non_warning_opts1++] = option_string;
+  }
+  }
 
   for (i = 0; i < mod_info2->num_cl_args; i++)
 if (mod_info2->string_array[start_index2 + i][1] == 'W')
   warning_opts2[num_warning_opts2++] =
mod_info2->string_array[start_index2 + i];
 else
-  non_warning_opts2[num_non_warning_opts2++] =
-   mod_info2->string_array[start_index2 + i];
+  {
+void **slot;
+char* option_string = mod_info2->string_array[start_index2 + i];
+
+if (!strcmp ("-fexceptions", option_string))
+  with_fexceptions2 = true;
+else if (!strcmp ("-fno-exceptions", option_string))
+  with_fexceptions2 = false;
+slot = htab_find_slot (option_tab2, option_string, INSERT);
+if (!*slot)
+  {
+*slot = option_string;
+non_warning_opts2[num_non_warning_opts2++] = option_string;
+  }
+  }
 
   /* Compare warning options. If these mismatch, we emit a warning.  */
   if (num_warning_opts1 != num_warning_opts2)
@@ -272,11 +326,24 @@ incompatible_cl_args (struct gcov_module
 warning (OPT_Wripa_opt_mismatch, "command line arguments mismatch for %s "
 "and %s", mod_info1->source_filename, mod_info2->source_filename);
 
+   if (warn_ripa_opt_mismatch && non_warning_mismatch && flag_ripa_verbose)
+ {
+   inform (UNKNOWN_LOCATION, "Options for %s", mod_info1->source_filename);
+   for (i = 0; i < num_non_warning_opts1; i++)
+ inform (UNKNOWN_LOCATION, non_warning_opts1[i]);
+   inform (UNKNOWN_LOCATION, "Options for %s", mod_info2->source_filename);
+   for (i = 0; i < num_non_warning_opts2; i++)
+ inform (UNKNOWN_LOCATION, non_warning_opts2[i]);
+ }
+
   

[google] revert 173158 (-fstrict-enum-precisions) (issue4503041)

2011-05-06 Thread David Li
The following patch reverted r173158 from google/main -- -fstrict-enums 
provides a better
implementation. The test cases are kept with slight modification.

Bootstrap and tested with related test cases.

Ok for google/main?

2011-05-06  David Li  

Revert r173158.

Index: tree-vrp.c
===
--- tree-vrp.c  (revision 173415)
+++ tree-vrp.c  (working copy)
@@ -5553,9 +5553,7 @@ stmt_interesting_for_vrp (gimple stmt)
  && ((is_gimple_call (stmt)
   && gimple_call_fndecl (stmt) != NULL_TREE
   && DECL_IS_BUILTIN (gimple_call_fndecl (stmt)))
- || !gimple_vuse (stmt))
-  && (flag_strict_enum_precision
-  || TREE_CODE (TREE_TYPE (lhs)) != ENUMERAL_TYPE))
+ || !gimple_vuse (stmt)))
return true;
 }
   else if (gimple_code (stmt) == GIMPLE_COND
Index: doc/invoke.texi
===
--- doc/invoke.texi (revision 173415)
+++ doc/invoke.texi (working copy)
@@ -395,8 +395,8 @@ Objective-C and Objective-C++ Dialects}.
 -fsel-sched-pipelining -fsel-sched-pipelining-outer-loops @gol
 -fsignaling-nans -fsingle-precision-constant -fsplit-ivs-in-unroller @gol
 -fsplit-wide-types -fstack-protector -fstack-protector-all @gol
--fstrict-aliasing -fstrict-overflow -fno-strict-enum-precision -fthread-jumps
--ftracer -ftree-bit-ccp @gol
+-fstrict-aliasing -fstrict-overflow -fthread-jumps -ftracer @gol
+-ftree-bit-ccp @gol
 -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-copy-prop @gol
 -ftree-copyrename -ftree-dce -ftree-dominator-opts -ftree-dse @gol
 -ftree-forwprop -ftree-fre -ftree-loop-if-convert @gol
@@ -2075,11 +2075,6 @@ represented in the minimum number of bit
 enumerators).  This assumption may not be valid if the program uses a
 cast to convert an arbitrary integer value to the enumeration type.
 
-@item -fno-strict-enum-precision
-@opindex fno-strict-enum-precision
-Do not perform optimizations of switch() statements based on the
-precision of enum types.
-
 @item -ftemplate-depth=@var{n}
 @opindex ftemplate-depth
 Set the maximum instantiation depth for template classes to @var{n}.
Index: testsuite/g++.dg/other/no-strict-enum-precision-3.C
===
--- testsuite/g++.dg/other/no-strict-enum-precision-3.C (revision 173415)
+++ testsuite/g++.dg/other/no-strict-enum-precision-3.C (working copy)
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-strict-enum-precision" } */
+/* { dg-options "-O2 -fno-strict-enums" } */
 
 extern "C" void abort (void);
 
Index: testsuite/g++.dg/other/no-strict-enum-precision-1.C
===
--- testsuite/g++.dg/other/no-strict-enum-precision-1.C (revision 173415)
+++ testsuite/g++.dg/other/no-strict-enum-precision-1.C (working copy)
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-fno-strict-enum-precision" } */
+/* { dg-options "-fno-strict-enums" } */
 
 extern "C" void abort (void);
 
Index: testsuite/g++.dg/other/no-strict-enum-precision-2.C
===
--- testsuite/g++.dg/other/no-strict-enum-precision-2.C (revision 173415)
+++ testsuite/g++.dg/other/no-strict-enum-precision-2.C (working copy)
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-strict-enum-precision" } */
+/* { dg-options "-O2 -fno-strict-enums" } */
 
 extern "C" void abort (void);
 
Index: gimplify.c
===
--- gimplify.c  (revision 173415)
+++ gimplify.c  (working copy)
@@ -1602,8 +1602,6 @@ gimplify_switch_expr (tree *expr_p, gimp
type = TREE_TYPE (SWITCH_COND (switch_expr));
  if (len
  && INTEGRAL_TYPE_P (type)
-  && (flag_strict_enum_precision
-  || TREE_CODE (type) != ENUMERAL_TYPE)
  && TYPE_MIN_VALUE (type)
  && TYPE_MAX_VALUE (type)
  && tree_int_cst_equal (CASE_LOW (VEC_index (tree, labels, 0)),

--
This patch is available for review at http://codereview.appspot.com/4503041


New options to disable/enable any pass for any functions (issue4550056)

2011-05-18 Thread David Li

In gcc, not all passes have user level control to turn it on/off, and
there is no way to flip on/off the pass for a subset of functions. I
implemented a generic option handling scheme in gcc to allow
disabling/enabling any gcc pass for any specified function(s).  The
new options will be very useful for things like performance
experiments and bug triaging (gcc has dbgcnt mechanism, but not all
passes have the counter).

The option syntax is very similar to -fdump- options. The following
are some examples:

-fdisable-tree-ccp1<--- disable ccp1 for all functions
-fenable-tree-cunroll=1   <--- enable complete unroll for the function
   whose cgraphnode uid is 1
-fdisable-rtl-gcse2=1:100,300,400:1000   <-- disable gcse2 for
   functions at the following
ranges [1,1], [300,400], and 
[400,1000]
-fdisable-tree-einline --> disable early inlining for all callers
-fdisable-ipa-inline --> disable ipa inlining

In the gcc dumps, the uid numbers are displayed in the function header.

The options are intended to be used internally by gcc developers.

Ok for trunk ? (There is a little LIPO specific change that can be removed).

David

2011-05-18  David Li  

* final.c (rest_of_clean_state): Call function header dumper.
* opts-global.c (handle_common_deferred_options): Handle new options.
* tree-cfg.c (gimple_dump_cfg): Call function header dumper.
* passes.c (register_one_dump_file): Call register_pass_name.
(pass_init_dump_file): Call function header dumper.
(execute_one_pass): Check explicit enable/disable flag.
(passr_hash): New function.
(passr_eq): 
(register_pass_name):
(get_pass_by_name):
(pass_hash):
(pass_eq):
(enable_disable_pass):
(is_pass_explicitly_enabled_or_disabled):
(is_pass_explicitly_enabled):
(is_pass_explicitly_disabled):


Index: tree-pass.h
===
--- tree-pass.h (revision 173635)
+++ tree-pass.h (working copy)
@@ -644,4 +644,12 @@ extern bool first_pass_instance;
 /* Declare for plugins.  */
 extern void do_per_function_toporder (void (*) (void *), void *);
 
+extern void enable_disable_pass (const char *, bool);
+extern bool is_pass_explicitly_disabled (struct opt_pass *, tree);
+extern bool is_pass_explicitly_enabled (struct opt_pass *, tree);
+extern void register_pass_name (struct opt_pass *, const char *);
+extern struct opt_pass *get_pass_by_name (const char *);
+struct function;
+extern void pass_dump_function_header (FILE *, tree, struct function *);
+
 #endif /* GCC_TREE_PASS_H */
Index: final.c
===
--- final.c (revision 173635)
+++ final.c (working copy)
@@ -4456,19 +4456,7 @@ rest_of_clean_state (void)
}
   else
{
- const char *aname;
- struct cgraph_node *node = cgraph_node (current_function_decl);
-
- aname = (IDENTIFIER_POINTER
-  (DECL_ASSEMBLER_NAME (current_function_decl)));
- fprintf (final_output, "\n;; Function (%s) %s\n\n", aname,
-node->frequency == NODE_FREQUENCY_HOT
-? " (hot)"
-: node->frequency == NODE_FREQUENCY_UNLIKELY_EXECUTED
-? " (unlikely executed)"
-: node->frequency == NODE_FREQUENCY_EXECUTED_ONCE
-? " (executed once)"
-: "");
+ pass_dump_function_header (final_output, current_function_decl, cfun);
 
  flag_dump_noaddr = flag_dump_unnumbered = 1;
  if (flag_compare_debug_opt || flag_compare_debug)
Index: common.opt
===
--- common.opt  (revision 173635)
+++ common.opt  (working copy)
@@ -1018,6 +1018,14 @@ fdiagnostics-show-option
 Common Var(flag_diagnostics_show_option) Init(1)
 Amend appropriate diagnostic messages with the command line option that 
controls them
 
+fdisable-
+Common Joined RejectNegative Var(common_deferred_options) Defer
+-fdisable-[tree|rtl|ipa]-=range1+range2 disables an optimization pass
+
+fenable-
+Common Joined RejectNegative Var(common_deferred_options) Defer
+-fenable-[tree|rtl|ipa]-=range1+range2 enables an optimization pass
+
 fdump-
 Common Joined RejectNegative Var(common_deferred_options) Defer
 -fdump-  Dump various compiler internals to a file
Index: opts-global.c
===
--- opts-global.c   (revision 173635)
+++ opts-global.c   (working copy)
@@ -411,6 +411,12 @@ handle_common_deferred_options (void)
error ("unrecognized command line option %<-fdump-%s%>", opt->arg);
  break;
 
+   case OPT_fenable_:
+  

Re: [Google] Refine hot caller heuristic

2013-08-20 Thread Xinliang David Li
Do you need to guard the jump function access with check if
(ipa_node_params_vector.exists ())?

Ideally, useful_cold_callee should be folded into the inline hints
estimation.  Question about the heuristic: why filtering out
PASS_THROUGH parameter cases completely? Passing 'this' parameter in
many cases can result in good PRE opportunities.  Why not skip the
unknown type?

David

On Tue, Aug 20, 2013 at 12:26 PM, Easwaran Raman  wrote:
> The current hot caller heuristic simply promotes edges whose caller is
> hot. This patch does the following:
> * Turn it off for applications with large footprint since the size
> increase hurts them
> * Be more selective by considering arguments to callee when the
> heuristic is enabled.
>
> This performs well on internal benchmarks. Ok for google/4_8 branch if
> all tests pass?
>
> - Easwaran


opt-info message change for vectorizer

2013-08-22 Thread Xinliang David Li
N_LINE (loc),
+ gimple_decl_printable_name (current_function_decl, 1));
+  else
+fprintf (dfile, "\n%s:%d: note: ", LOCATION_FILE (loc),
+ LOCATION_LINE (loc));
+}
   else if (current_function_decl)
-fprintf (dfile, "\n%s:%d: note: ",
+fprintf (dfile, "\n%s:%d:%s: note: ",
  DECL_SOURCE_FILE (current_function_decl),
- DECL_SOURCE_LINE (current_function_decl));
+ DECL_SOURCE_LINE (current_function_decl),
+ gimple_decl_printable_name (current_function_decl, 1));
 }
 }
 
Index: ChangeLog
===
--- ChangeLog   (revision 201752)
+++ ChangeLog   (working copy)
@@ -1,3 +1,13 @@
+2013-08-22  Xinliang David Li  
+
+   * tree-vect-loop-manip.c (vect_do_peeling_for_alignment):
+   Emit alignment peeling message with default -fopt-info.
+   (vect_loop_versioning): Emit loop version info message.
+   * tree-vectorizer.c (vectorize_loops): Minor message
+   change.
+   (execute_vect_slp): Ditto.
+   * dumpfile.c (dump_loc): Add function name in the dump.
+
 2013-08-14  Xinliang David Li  
 
* config/i386/i386.c (ix86_option_override_internal):


Re: [GOOGLE] Update AutoFDO annotation

2013-08-26 Thread Xinliang David Li
Can you add missing documentation on functions like ...:get_count_info
-- documenting return value etc.  Also it might be better to avoid
using 'set' as the local variable name. Change it to something more
specific.

thanks,

David

On Thu, Aug 22, 2013 at 3:56 PM, Dehao Chen  wrote:
> This patch has 2 changes:
>
> 1. Now that we have discriminator for inlined callsite, we don't need
> special handling for callsite location any more.
> 2. If a source line is mapped to multiple BBs, only the first BB will
> be annotated.
> 3. Before actual annotation, mark every BB/edge as not annotated.
>
> Bootstrapped and passed regression test.
>
> OK for google branch?
>
> Thanks,
> Dehao
>
> Index: gcc/auto-profile.c
> ===
> --- gcc/auto-profile.c (revision 201927)
> +++ gcc/auto-profile.c (working copy)
> @@ -24,6 +24,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #include "config.h"
>  #include "system.h"
> @@ -100,6 +101,10 @@ typedef std::map icall_target
> (execution_count, value_profile_histogram).  */
>  typedef std::pair count_info;
>
> +/* Set of inline_stack. Used to track if the profile is already used to
> +   annotate the program.  */
> +typedef std::set location_set;
> +
>  struct string_compare
>  {
>bool operator() (const char *a, const char *b) const
> @@ -202,7 +207,8 @@ class autofdo_source_profile {
>const function_instance *get_function_instance_by_decl (tree decl) const;
>/* Find profile info for a given gimple STMT. If found, store the profile
>   info in INFO, and return true; otherwise return false.  */
> -  bool get_count_info (gimple stmt, count_info *info) const;
> +  bool get_count_info (gimple stmt, count_info *info,
> +   const location_set *set) const;
>/* Find total count of the callee of EDGE.  */
>gcov_type get_callsite_total_count (struct cgraph_edge *edge) const;
>
> @@ -284,17 +290,13 @@ static const char *get_original_name (const char *
>
>  /* Return the combined location, which is a 32bit integer in which
> higher 16 bits stores the line offset of LOC to the start lineno
> -   of DECL, The lower 16 bits stores the discrimnator of LOC if
> -   USE_DISCR is true, otherwise 0.  */
> +   of DECL, The lower 16 bits stores the discrimnator.  */
>
>  static unsigned
> -get_combined_location (location_t loc, tree decl, bool use_discr)
> +get_combined_location (location_t loc, tree decl)
>  {
> -  if (use_discr)
> -return ((LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16)
> -   | get_discriminator_from_locus (loc);
> -  else
> -return (LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16;
> +  return ((LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16)
> + | get_discriminator_from_locus (loc);
>  }
>
>  /* Return the function decl of a given lexical BLOCK.  */
> @@ -316,7 +318,7 @@ get_function_decl_from_block (tree block)
>  }
>
>  static void
> -get_inline_stack (gimple stmt, bool use_discr, inline_stack *stack)
> +get_inline_stack (gimple stmt, inline_stack *stack)
>  {
>location_t locus = gimple_location (stmt);
>if (LOCATION_LOCUS (locus) == UNKNOWN_LOCATION)
> @@ -337,14 +339,13 @@ static void
>
>tree decl = get_function_decl_from_block (block);
>stack->push_back (std::make_pair (
> -  decl, get_combined_location (locus, decl, level == 0 && use_discr)));
> +  decl, get_combined_location (locus, decl)));
>locus = tmp_locus;
>level++;
>  }
>stack->push_back (std::make_pair (
>current_function_decl,
> -  get_combined_location (locus, current_function_decl,
> - level == 0 && use_discr)));
> +  get_combined_location (locus, current_function_decl)));
>  }
>
>
> @@ -523,14 +524,16 @@ const function_instance *autofdo_source_profile::g
>return ret == map_.end() ? NULL : ret->second;
>  }
>
> -bool autofdo_source_profile::get_count_info (gimple stmt,
> - count_info *info) const
> +bool autofdo_source_profile::get_count_info (gimple stmt, count_info *info,
> + const location_set *set) const
>  {
>if (LOCATION_LOCUS (gimple_location (stmt)) == cfun->function_end_locus)
>  return false;
>
>inline_stack stack;
> -  get_inline_stack (stmt, true, &stack);
> +  get_inline_stack (stmt, &stack);
> +  if (set && set->find(stack) != set->end())
> +return false;
>if (stack.size () == 0)
>  return false;
>const function_instance *s = get_function_instance_by_inline_stack (stack);
> @@ -544,7 +547,7 @@ gcov_type autofdo_source_profile::get_callsite_tot
>  {
>inline_stack stack;
>stack.push_back (std::make_pair(edge->callee->symbol.decl, 0));
> -  get_inline_stack (edge->call_stmt, false, &stack);
> +  get_inline_stack (edge->call_stmt, &stack);
>
>const function_instance *s = get_function_instance_by_inline_stack (stack);
>if (s == NULL)
> @@ -821,7 +824,7 @@ afdo_vpt (gimple stmt, const icall_target

Re: [GOOGLE] Update AutoFDO annotation

2013-08-27 Thread Xinliang David Li
Ok.

David

On Tue, Aug 27, 2013 at 7:36 AM, Dehao Chen  wrote:
> Patch updated.
>
> Thanks,
> Dehao
>
> On Mon, Aug 26, 2013 at 4:11 PM, Xinliang David Li  wrote:
>> Can you add missing documentation on functions like ...:get_count_info
>> -- documenting return value etc.  Also it might be better to avoid
>> using 'set' as the local variable name. Change it to something more
>> specific.
>>
>> thanks,
>>
>> David
>>
>> On Thu, Aug 22, 2013 at 3:56 PM, Dehao Chen  wrote:
>>> This patch has 2 changes:
>>>
>>> 1. Now that we have discriminator for inlined callsite, we don't need
>>> special handling for callsite location any more.
>>> 2. If a source line is mapped to multiple BBs, only the first BB will
>>> be annotated.
>>> 3. Before actual annotation, mark every BB/edge as not annotated.
>>>
>>> Bootstrapped and passed regression test.
>>>
>>> OK for google branch?
>>>
>>> Thanks,
>>> Dehao
>>>
>>> Index: gcc/auto-profile.c
>>> ===
>>> --- gcc/auto-profile.c (revision 201927)
>>> +++ gcc/auto-profile.c (working copy)
>>> @@ -24,6 +24,7 @@ along with GCC; see the file COPYING3.  If not see
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>
>>>  #include "config.h"
>>>  #include "system.h"
>>> @@ -100,6 +101,10 @@ typedef std::map icall_target
>>> (execution_count, value_profile_histogram).  */
>>>  typedef std::pair count_info;
>>>
>>> +/* Set of inline_stack. Used to track if the profile is already used to
>>> +   annotate the program.  */
>>> +typedef std::set location_set;
>>> +
>>>  struct string_compare
>>>  {
>>>bool operator() (const char *a, const char *b) const
>>> @@ -202,7 +207,8 @@ class autofdo_source_profile {
>>>const function_instance *get_function_instance_by_decl (tree decl) const;
>>>/* Find profile info for a given gimple STMT. If found, store the profile
>>>   info in INFO, and return true; otherwise return false.  */
>>> -  bool get_count_info (gimple stmt, count_info *info) const;
>>> +  bool get_count_info (gimple stmt, count_info *info,
>>> +   const location_set *set) const;
>>>/* Find total count of the callee of EDGE.  */
>>>gcov_type get_callsite_total_count (struct cgraph_edge *edge) const;
>>>
>>> @@ -284,17 +290,13 @@ static const char *get_original_name (const char *
>>>
>>>  /* Return the combined location, which is a 32bit integer in which
>>> higher 16 bits stores the line offset of LOC to the start lineno
>>> -   of DECL, The lower 16 bits stores the discrimnator of LOC if
>>> -   USE_DISCR is true, otherwise 0.  */
>>> +   of DECL, The lower 16 bits stores the discrimnator.  */
>>>
>>>  static unsigned
>>> -get_combined_location (location_t loc, tree decl, bool use_discr)
>>> +get_combined_location (location_t loc, tree decl)
>>>  {
>>> -  if (use_discr)
>>> -return ((LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16)
>>> -   | get_discriminator_from_locus (loc);
>>> -  else
>>> -return (LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16;
>>> +  return ((LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16)
>>> + | get_discriminator_from_locus (loc);
>>>  }
>>>
>>>  /* Return the function decl of a given lexical BLOCK.  */
>>> @@ -316,7 +318,7 @@ get_function_decl_from_block (tree block)
>>>  }
>>>
>>>  static void
>>> -get_inline_stack (gimple stmt, bool use_discr, inline_stack *stack)
>>> +get_inline_stack (gimple stmt, inline_stack *stack)
>>>  {
>>>location_t locus = gimple_location (stmt);
>>>if (LOCATION_LOCUS (locus) == UNKNOWN_LOCATION)
>>> @@ -337,14 +339,13 @@ static void
>>>
>>>tree decl = get_function_decl_from_block (block);
>>>stack->push_back (std::make_pair (
>>> -  decl, get_combined_location (locus, decl, level == 0 && use_discr)));
>>> +  decl, get_combined_location (locus, decl)));
>>>locus = tmp_locus;
>>>level++;
>>>  }
>>>stack->push_back (std::make_pair (
>>>current_function_decl,
>>> -  get_combined_location (lo

Re: [PATCH] Convert more passes to new dump framework

2013-08-27 Thread Xinliang David Li
+ Honza

On Tue, Aug 27, 2013 at 10:56 AM, Teresa Johnson  wrote:
> Ping #3.
>
> Thanks,
> Teresa
>
> On Mon, Aug 19, 2013 at 11:33 AM, Teresa Johnson  wrote:
>> Ping.
>> Thanks,
>> Teresa
>>
>> On Mon, Aug 12, 2013 at 6:54 AM, Teresa Johnson  wrote:
>>> On Tue, Aug 6, 2013 at 10:23 PM, Teresa Johnson  
>>> wrote:
 On Tue, Aug 6, 2013 at 9:29 AM, Teresa Johnson  
 wrote:
> On Tue, Aug 6, 2013 at 9:01 AM, Martin Jambor  wrote:
>> Hi,
>>
>> On Tue, Aug 06, 2013 at 07:14:42AM -0700, Teresa Johnson wrote:
>>> On Tue, Aug 6, 2013 at 5:37 AM, Martin Jambor  wrote:
>>> > On Mon, Aug 05, 2013 at 10:37:00PM -0700, Teresa Johnson wrote:
>>> >> This patch ports messages to the new dump framework,
>>> >
>>> > It would be great this new framework was documented somewhere.  I lost
>>> > track of what was agreed it would be and from the uses in the
>>> > vectorizer I was never quite sure how to utilize it in other passes.
>>>
>>> Cc'ing Sharad who implemented this - Sharad, is this documented on a
>>> wiki or elsewhere?
>>
>> Thanks
>>
>>>
>>> >
>>> > I'd also like to point out two other minor things inline:
>>> >
>>> > [...]
>>> >
>>> >> 2013-08-06  Teresa Johnson  
>>> >> Dehao Chen  
>>> >>
>>> >> * dumpfile.c (dump_loc): Add column number to output, make 
>>> >> newlines
>>> >> consistent.
>>> >> * dumpfile.h (OPTGROUP_OTHER): Add and enable under 
>>> >> OPTGROUP_ALL.
>>> >> * ipa-inline-transform.c (clone_inlined_nodes):
>>> >> (cgraph_node_opt_info): New function.
>>> >> (cgraph_node_call_chain): Ditto.
>>> >> (dump_inline_decision): Ditto.
>>> >> (inline_call): Invoke dump_inline_decision.
>>> >> * doc/invoke.texi: Document optall -fopt-info flag.
>>> >> * profile.c (read_profile_edge_counts): Use new dump 
>>> >> framework.
>>> >> (compute_branch_probabilities): Ditto.
>>> >> * passes.c (pass_manager::register_one_dump_file): Use 
>>> >> OPTGROUP_OTHER
>>> >> when pass not in any opt group.
>>> >> * value-prof.c (check_counter): Use new dump framework.
>>> >> (find_func_by_funcdef_no): Ditto.
>>> >> (check_ic_target): Ditto.
>>> >> * coverage.c (get_coverage_counts): Ditto.
>>> >> (coverage_init): Setup new dump framework.
>>> >> * ipa-inline.c (inline_small_functions): Set 
>>> >> is_in_ipa_inline.
>>> >> * ipa-inline.h (is_in_ipa_inline): Declare.
>>> >>
>>> >> * testsuite/gcc.dg/pr40209.c: Use -fopt-info.
>>> >> * testsuite/gcc.dg/pr26570.c: Ditto.
>>> >> * testsuite/gcc.dg/pr32773.c: Ditto.
>>> >> * testsuite/g++.dg/tree-ssa/dom-invalid.C (struct C): Ditto.
>>> >>
>>> >
>>> > [...]
>>> >
>>> >> Index: ipa-inline-transform.c
>>> >> ===
>>> >> --- ipa-inline-transform.c  (revision 201461)
>>> >> +++ ipa-inline-transform.c  (working copy)
>>> >> @@ -192,6 +192,108 @@ clone_inlined_nodes (struct cgraph_edge *e, 
>>> >> bool d
>>> >>  }
>>> >>
>>> >>
>>> >> +#define MAX_INT_LENGTH 20
>>> >> +
>>> >> +/* Return NODE's name and profile count, if available.  */
>>> >> +
>>> >> +static const char *
>>> >> +cgraph_node_opt_info (struct cgraph_node *node)
>>> >> +{
>>> >> +  char *buf;
>>> >> +  size_t buf_size;
>>> >> +  const char *bfd_name = lang_hooks.dwarf_name (node->symbol.decl, 
>>> >> 0);
>>> >> +
>>> >> +  if (!bfd_name)
>>> >> +bfd_name = "unknown";
>>> >> +
>>> >> +  buf_size = strlen (bfd_name) + 1;
>>> >> +  if (profile_info)
>>> >> +buf_size += (MAX_INT_LENGTH + 3);
>>> >> +
>>> >> +  buf = (char *) xmalloc (buf_size);
>>> >> +
>>> >> +  strcpy (buf, bfd_name);
>>> >> +
>>> >> +  if (profile_info)
>>> >> +sprintf (buf, "%s ("HOST_WIDEST_INT_PRINT_DEC")", buf, 
>>> >> node->count);
>>> >> +  return buf;
>>> >> +}
>>> >
>>> > I'm not sure if output of this function is aimed only at the user or
>>> > if it is supposed to be used by gcc developers as well.  If the
>>> > latter, an incredibly useful thing is to also dump node->symbol.order
>>> > too.  We usually dump it after "/" sign separating it from node name.
>>> > It is invaluable when examining decisions in C++ code where you can
>>> > have lots of clones of a node (and also because existing dumps print
>>> > it, it is easy to combine them).
>>>
>>> The output is useful for both power users doing performance tuning of
>>> their application, and by gcc developers. Adding the id is not so
>>> useful for the former, but I agree that it i

Re: opt-info message change for vectorizer

2013-08-27 Thread Xinliang David Li
Does this one look ok?

thanks,

David

On Thu, Aug 22, 2013 at 4:20 PM, Xinliang David Li  wrote:
> Hi, In this patch, loop alignment peeling and loop versioning
> transformation will be reported via -fopt-info by default. This will
> help vectorizer size tuning.
>
> It also enhances the opt-info dump to include current function name as
> context. Otherwise, we may see duplicate messages from inline/cloned
> instances.
>
> An example opt report:
>
>
>
> b.c:16:A::foo: note: Loop is vectorized
>
> b.c:16:A::foo: note: Loop is versioned to remove aliases for vectorization
>
> b.c:16:A::foo: note: Loop is peeled to enhance alignment for vectorization
>
> b.c:16:A::foo: note: Completely unroll loop 6 times
>
> b.c:12:A::foo: note: Completely unroll loop 6 times
>
>
> Ok after testing?
>
> thanks,
>
> David


Re: opt-info message change for vectorizer

2013-08-27 Thread Xinliang David Li
yes -- the long unmangled names can be annoying -- that is why I chose
to dump the short form of the function names -- combined with line
numbers, it should be enough to get the full context.

David

On Tue, Aug 27, 2013 at 11:36 AM, Teresa Johnson  wrote:
> My only concern is whether the dump messages will get too long with
> the full function name on the same line. The infrastructure that emits
> inform() notes ensures that the function name is printed before each
> block of messages related to that function (via an "In function foo:"
> type message), but I had found before that the new dump infrastructure
> doesn't do that. OTOH, your approach will make it much easier to grep
> the output of a large build. Personally, I use grep on this type of
> output enough to make the longer lines worth it. Either that or the
> new dump infrastructure needs to be fixed to emit the function name
> before each block of messages, a la inform().
>
> Thanks,
> Teresa
>
> On Tue, Aug 27, 2013 at 11:22 AM, Xinliang David Li  
> wrote:
>> Does this one look ok?
>>
>> thanks,
>>
>> David
>>
>> On Thu, Aug 22, 2013 at 4:20 PM, Xinliang David Li  
>> wrote:
>>> Hi, In this patch, loop alignment peeling and loop versioning
>>> transformation will be reported via -fopt-info by default. This will
>>> help vectorizer size tuning.
>>>
>>> It also enhances the opt-info dump to include current function name as
>>> context. Otherwise, we may see duplicate messages from inline/cloned
>>> instances.
>>>
>>> An example opt report:
>>>
>>>
>>>
>>> b.c:16:A::foo: note: Loop is vectorized
>>>
>>> b.c:16:A::foo: note: Loop is versioned to remove aliases for vectorization
>>>
>>> b.c:16:A::foo: note: Loop is peeled to enhance alignment for vectorization
>>>
>>> b.c:16:A::foo: note: Completely unroll loop 6 times
>>>
>>> b.c:12:A::foo: note: Completely unroll loop 6 times
>>>
>>>
>>> Ok after testing?
>>>
>>> thanks,
>>>
>>> David
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: opt-info message change for vectorizer

2013-08-27 Thread Xinliang David Li
If this is the convention, we should probably have another patch to
fix all the existing opt-info messages.

thanks,

David

On Tue, Aug 27, 2013 at 1:23 PM, Mike Stump  wrote:
> On Aug 27, 2013, at 11:22 AM, Xinliang David Li  wrote:
>> Does this one look ok?
>
> We don't capitalize text after error:, warning: or note:.
>
>> thanks,
>>
>> David
>>
>> On Thu, Aug 22, 2013 at 4:20 PM, Xinliang David Li  
>> wrote:
>>> Hi, In this patch, loop alignment peeling and loop versioning
>>> transformation will be reported via -fopt-info by default. This will
>>> help vectorizer size tuning.
>>>
>>> It also enhances the opt-info dump to include current function name as
>>> context. Otherwise, we may see duplicate messages from inline/cloned
>>> instances.
>>>
>>> An example opt report:
>>>
>>>
>>>
>>> b.c:16:A::foo: note: Loop is vectorized
>>>
>>> b.c:16:A::foo: note: Loop is versioned to remove aliases for vectorization
>>>
>>> b.c:16:A::foo: note: Loop is peeled to enhance alignment for vectorization
>>>
>>> b.c:16:A::foo: note: Completely unroll loop 6 times
>>>
>>> b.c:12:A::foo: note: Completely unroll loop 6 times
>>>
>>>
>>> Ok after testing?
>>>
>>> thanks,
>>>
>>> David
>


Re: [PATCH][2/n] 2nd try: Re-organize -fvect-cost-model, enable basic vectorization at -O2

2013-08-27 Thread Xinliang David Li
Richard, I have some comments about the patch.

>   -ftree-vectorizer-verbose=This switch is deprecated. Use 
> -fopt-info instead.
>
>   ftree-slp-vectorize
> ! Common Report Var(flag_tree_slp_vectorize) Optimization
>   Enable basic block vectorization (SLP) on trees

The code dealing with the interactions between -ftree-vectorize, O3,
etc are complicated and hard to understand. Is it better to change the
meaning of -ftree-vectorize to mean -floop-vectorize only, and make it
independent of -fslp-vectorize?


>
> + fvect-cost-model=
> + Common Joined RejectNegative Enum(vect_cost_model) 
> Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT)
> + Specifies the cost model for vectorization
> +
> + Enum
> + Name(vect_cost_model) Type(enum vect_cost_model) UnknownError(unknown 
> vectorizer cost model %qs)
> +
> + EnumValue
> + Enum(vect_cost_model) String(unlimited) Value(VECT_COST_MODEL_UNLIMITED)
> +
> + EnumValue
> + Enum(vect_cost_model) String(dynamic) Value(VECT_COST_MODEL_DYNAMIC)
> +
> + EnumValue
> + Enum(vect_cost_model) String(cheap) Value(VECT_COST_MODEL_CHEAP)

Introducing cheap model is a great change.

> +

> *** 173,179 
>   {
> struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>
> !   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
>   return false;
>
> if (dump_enabled_p ())
> --- 173,180 
>   {
> struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>
> !   if (loop_vinfo->cost_model == VECT_COST_MODEL_CHEAP
> !   || (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) 
> == 0)
>   return false;
>

When the cost_model == cheap, the alignment peeling should also be
disabled -- there will still be loops that are beneficial to be
vectorized without peeling -- at perhaps reduced net runtime gain.



>   struct gimple_opt_pass pass_slp_vectorize =
> --- 206,220 
>   static bool
>   gate_vect_slp (void)
>   {
> !   /* Apply SLP either according to whether the user specified whether to
> !  run SLP or not, or according to whether the user specified whether
> !  to do vectorization or not.  */
> !   if (global_options_set.x_flag_tree_slp_vectorize)
> ! return flag_tree_slp_vectorize != 0;
> !   if (global_options_set.x_flag_tree_vectorize)
> ! return flag_tree_vectorize != 0;
> !   /* And if vectorization was enabled by default run SLP only at -O3.  */
> !   return flag_tree_vectorize != 0 && optimize == 3;
>   }

The logic can be greatly simplified if slp vectorizer is controlled
independently -- easier for user to understand too.


> ! @item -fvect-cost-model=@var{model}
>   @opindex fvect-cost-model
> ! Alter the cost model used for vectorization.  The @var{model} argument
> ! should be one of @code{unlimited}, @code{dynamic} or @code{cheap}.
> ! With the @code{unlimited} model the vectorized code-path is assumed
> ! to be profitable while with the @code{dynamic} model a runtime check
> ! will guard the vectorized code-path to enable it only for iteration
> ! counts that will likely execute faster than when executing the original
> ! scalar loop.  The @code{cheap} model will disable vectorization of
> ! loops where doing so would be cost prohibitive for example due to
> ! required runtime checks for data dependence or alignment but otherwise
> ! is equal to the @code{dynamic} model.
> ! The default cost model depends on other optimization flags and is
> ! either @code{dynamic} or @code{cheap}.
>

Vectorizer in theory will only vectorize a loop with net runtime gain,
so the 'cost' here should only mean code size and compile time cost.

Cheap Model: with this model, the compiler will vectorize loops that
are considered beneficial for runtime performance with minimal code
size increase and compile time cost;
Unlimited Model: compiler will vectorize loops to maximize runtime
gain without considering compile time cost and impact to code size;


thanks,

David


Re: [PATCH] Convert more passes to new dump framework

2013-08-28 Thread Xinliang David Li
On Wed, Aug 28, 2013 at 7:09 AM, Teresa Johnson  wrote:
> On Wed, Aug 28, 2013 at 4:01 AM, Richard Biener
>  wrote:
>> On Wed, Aug 7, 2013 at 7:23 AM, Teresa Johnson  wrote:
>>> On Tue, Aug 6, 2013 at 9:29 AM, Teresa Johnson  wrote:
 On Tue, Aug 6, 2013 at 9:01 AM, Martin Jambor  wrote:
> Hi,
>
> On Tue, Aug 06, 2013 at 07:14:42AM -0700, Teresa Johnson wrote:
>> On Tue, Aug 6, 2013 at 5:37 AM, Martin Jambor  wrote:
>> > On Mon, Aug 05, 2013 at 10:37:00PM -0700, Teresa Johnson wrote:
>> >> This patch ports messages to the new dump framework,
>> >
>> > It would be great this new framework was documented somewhere.  I lost
>> > track of what was agreed it would be and from the uses in the
>> > vectorizer I was never quite sure how to utilize it in other passes.
>>
>> Cc'ing Sharad who implemented this - Sharad, is this documented on a
>> wiki or elsewhere?
>
> Thanks
>
>>
>> >
>> > I'd also like to point out two other minor things inline:
>> >
>> > [...]
>> >
>> >> 2013-08-06  Teresa Johnson  
>> >> Dehao Chen  
>> >>
>> >> * dumpfile.c (dump_loc): Add column number to output, make 
>> >> newlines
>> >> consistent.
>> >> * dumpfile.h (OPTGROUP_OTHER): Add and enable under 
>> >> OPTGROUP_ALL.
>> >> * ipa-inline-transform.c (clone_inlined_nodes):
>> >> (cgraph_node_opt_info): New function.
>> >> (cgraph_node_call_chain): Ditto.
>> >> (dump_inline_decision): Ditto.
>> >> (inline_call): Invoke dump_inline_decision.
>> >> * doc/invoke.texi: Document optall -fopt-info flag.
>> >> * profile.c (read_profile_edge_counts): Use new dump 
>> >> framework.
>> >> (compute_branch_probabilities): Ditto.
>> >> * passes.c (pass_manager::register_one_dump_file): Use 
>> >> OPTGROUP_OTHER
>> >> when pass not in any opt group.
>> >> * value-prof.c (check_counter): Use new dump framework.
>> >> (find_func_by_funcdef_no): Ditto.
>> >> (check_ic_target): Ditto.
>> >> * coverage.c (get_coverage_counts): Ditto.
>> >> (coverage_init): Setup new dump framework.
>> >> * ipa-inline.c (inline_small_functions): Set is_in_ipa_inline.
>> >> * ipa-inline.h (is_in_ipa_inline): Declare.
>> >>
>> >> * testsuite/gcc.dg/pr40209.c: Use -fopt-info.
>> >> * testsuite/gcc.dg/pr26570.c: Ditto.
>> >> * testsuite/gcc.dg/pr32773.c: Ditto.
>> >> * testsuite/g++.dg/tree-ssa/dom-invalid.C (struct C): Ditto.
>> >>
>> >
>> > [...]
>> >
>> >> Index: ipa-inline-transform.c
>> >> ===
>> >> --- ipa-inline-transform.c  (revision 201461)
>> >> +++ ipa-inline-transform.c  (working copy)
>> >> @@ -192,6 +192,108 @@ clone_inlined_nodes (struct cgraph_edge *e, 
>> >> bool d
>> >>  }
>> >>
>> >>
>> >> +#define MAX_INT_LENGTH 20
>> >> +
>> >> +/* Return NODE's name and profile count, if available.  */
>> >> +
>> >> +static const char *
>> >> +cgraph_node_opt_info (struct cgraph_node *node)
>> >> +{
>> >> +  char *buf;
>> >> +  size_t buf_size;
>> >> +  const char *bfd_name = lang_hooks.dwarf_name (node->symbol.decl, 
>> >> 0);
>> >> +
>> >> +  if (!bfd_name)
>> >> +bfd_name = "unknown";
>> >> +
>> >> +  buf_size = strlen (bfd_name) + 1;
>> >> +  if (profile_info)
>> >> +buf_size += (MAX_INT_LENGTH + 3);
>> >> +
>> >> +  buf = (char *) xmalloc (buf_size);
>> >> +
>> >> +  strcpy (buf, bfd_name);
>> >> +
>> >> +  if (profile_info)
>> >> +sprintf (buf, "%s ("HOST_WIDEST_INT_PRINT_DEC")", buf, 
>> >> node->count);
>> >> +  return buf;
>> >> +}
>> >
>> > I'm not sure if output of this function is aimed only at the user or
>> > if it is supposed to be used by gcc developers as well.  If the
>> > latter, an incredibly useful thing is to also dump node->symbol.order
>> > too.  We usually dump it after "/" sign separating it from node name.
>> > It is invaluable when examining decisions in C++ code where you can
>> > have lots of clones of a node (and also because existing dumps print
>> > it, it is easy to combine them).
>>
>> The output is useful for both power users doing performance tuning of
>> their application, and by gcc developers. Adding the id is not so
>> useful for the former, but I agree that it is very useful for compiler
>> developers. In fact, in the google branch version we emit more verbose
>> information (the lipo module id and the funcdef_no) to help uniquely
>> identify the routines and to aid in post-processing by humans and
>> tools. S

Re: [PATCH][2/n] 2nd try: Re-organize -fvect-cost-model, enable basic vectorization at -O2

2013-08-28 Thread Xinliang David Li
On Wed, Aug 28, 2013 at 12:59 AM, Richard Biener  wrote:
> On Tue, 27 Aug 2013, Xinliang David Li wrote:
>
>> Richard, I have some comments about the patch.
>>
>> >   -ftree-vectorizer-verbose=This switch is deprecated. Use 
>> > -fopt-info instead.
>> >
>> >   ftree-slp-vectorize
>> > ! Common Report Var(flag_tree_slp_vectorize) Optimization
>> >   Enable basic block vectorization (SLP) on trees
>>
>> The code dealing with the interactions between -ftree-vectorize, O3,
>> etc are complicated and hard to understand. Is it better to change the
>> meaning of -ftree-vectorize to mean -floop-vectorize only, and make it
>> independent of -fslp-vectorize?
>
> Yeah, but that would be an independent change.  Also people expect
> to be able to enable all of the vectorizer with -ftree-vectorize.
> So rather we introduce -floop-vectorize?

I think that will be good and simplify the logic too --
ftree-vectorize turns on both loop and slp if they are not explicitly
specified.


>
>> > + fvect-cost-model=
>> > + Common Joined RejectNegative Enum(vect_cost_model) 
>> > Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT)
>> > + Specifies the cost model for vectorization
>> > +
>> > + Enum
>> > + Name(vect_cost_model) Type(enum vect_cost_model) UnknownError(unknown 
>> > vectorizer cost model %qs)
>> > +
>> > + EnumValue
>> > + Enum(vect_cost_model) String(unlimited) Value(VECT_COST_MODEL_UNLIMITED)
>> > +
>> > + EnumValue
>> > + Enum(vect_cost_model) String(dynamic) Value(VECT_COST_MODEL_DYNAMIC)
>> > +
>> > + EnumValue
>> > + Enum(vect_cost_model) String(cheap) Value(VECT_COST_MODEL_CHEAP)
>>
>> Introducing cheap model is a great change.
>>
>> > +
>>
>> > *** 173,179 
>> >   {
>> > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>> >
>> > !   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) 
>> > == 0)
>> >   return false;
>> >
>> > if (dump_enabled_p ())
>> > --- 173,180 
>> >   {
>> > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>> >
>> > !   if (loop_vinfo->cost_model == VECT_COST_MODEL_CHEAP
>> > !   || (unsigned) PARAM_VALUE 
>> > (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
>> >   return false;
>> >
>>
>> When the cost_model == cheap, the alignment peeling should also be
>> disabled -- there will still be loops that are beneficial to be
>> vectorized without peeling -- at perhaps reduced net runtime gain.
>
> IIRC there are targets that cannot vectorize unaligned accesses, so
> in the end the cost model needs to be more target-controlled.
>
> The above was just a start for experimenting, of course.
>
>> >   struct gimple_opt_pass pass_slp_vectorize =
>> > --- 206,220 
>> >   static bool
>> >   gate_vect_slp (void)
>> >   {
>> > !   /* Apply SLP either according to whether the user specified whether to
>> > !  run SLP or not, or according to whether the user specified whether
>> > !  to do vectorization or not.  */
>> > !   if (global_options_set.x_flag_tree_slp_vectorize)
>> > ! return flag_tree_slp_vectorize != 0;
>> > !   if (global_options_set.x_flag_tree_vectorize)
>> > ! return flag_tree_vectorize != 0;
>> > !   /* And if vectorization was enabled by default run SLP only at -O3.  */
>> > !   return flag_tree_vectorize != 0 && optimize == 3;
>> >   }
>>
>> The logic can be greatly simplified if slp vectorizer is controlled
>> independently -- easier for user to understand too.
>
> It should work with separating out -floop-vectorize, too I guess.  But
> yes, as I wanted to preserve behavior of adding -ftree-vectorize to
> -O2 the above necessarily became quite complicated ;)

With floop-vectorize, ftree-vectorize becomes a simple shorthand/alias
to 'floop-vectorize + fslp-vectorize', and O3, O2 does not need to
look at ftree-vectorize (which does even need a flag variable).

>
>> > ! @item -fvect-cost-model=@var{model}
>> >   @opindex fvect-cost-model
>> > ! Alter the cost model used for vectorization.  The @var{model} argument
>> > ! should be one of @code{unlimited}, @code{dynamic} or @code{cheap}.
>> > ! With the @code{unlimited} model the vectorized code-path is assumed
>> > ! to be profitable while with the @code{dynamic} model a runtime check
>> > ! will guard the vecto

Re: opt-info message change for vectorizer

2013-08-28 Thread Xinliang David Li
Fixed as requested. I don't like the extra newline either, but I will
leave that to Teresa.

basic3.c:8:foo: note: loop vectorized

basic3.c:8:foo: note: loop versioned for vectorization because of
possible aliasing

basic3.c:8:foo: note: loop peeled for vectorization to enhance alignment

basic3.c:8:foo: note: loop with 7 iterations completely unrolled

basic3.c:5:foo: note: loop with 7 iterations completely unrolled


Is this version ok after testing?

thanks,

David

On Wed, Aug 28, 2013 at 2:45 AM, Richard Biener
 wrote:
> On Tue, Aug 27, 2013 at 10:30 PM, Xinliang David Li  
> wrote:
>> If this is the convention, we should probably have another patch to
>> fix all the existing opt-info messages.
>
> Yes please.
>
> Also ...
>
>
>>>>> b.c:16:A::foo: note: Loop is vectorized
>
> "loop vectorized"
>
>>>>>
>>>>> b.c:16:A::foo: note: Loop is versioned to remove aliases for vectorization
>
> "loop versioned for vectorization because of possible aliasing"
>
>>>>> b.c:16:A::foo: note: Loop is peeled to enhance alignment for vectorization
>
> "loop peeled for vectorization to enhance alignment"
>
>>>>> b.c:16:A::foo: note: Completely unroll loop 6 times
>
> maybe "loop with 6 iterations completely unrolled"
>
>>>>>
>>>>> b.c:12:A::foo: note: Completely unroll loop 6 times
>>>>>
>
> I hate the excessive vertical spacing as well.
>
> Richard.
>
>>>>> Ok after testing?
>>>>>
>>>>> thanks,
>>>>>
>>>>> David
>>>
Index: tree-vectorizer.c
===
--- tree-vectorizer.c   (revision 201751)
+++ tree-vectorizer.c   (working copy)
@@ -119,7 +119,7 @@ vectorize_loops (void)
 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC
&& dump_enabled_p ())
   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
-   "Vectorized loop\n");
+   "loop vectorized\n");
vect_transform_loop (loop_vinfo);
num_vectorized_loops++;
   }
@@ -180,7 +180,7 @@ execute_vect_slp (void)
   vect_slp_transform_bb (bb);
   if (dump_enabled_p ())
 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
-"Vectorized basic-block\n");
+"Basic block is vectorized\n");
 }
 }
 
Index: loop-unroll.c
===
--- loop-unroll.c   (revision 201751)
+++ loop-unroll.c   (working copy)
@@ -225,7 +225,7 @@ report_unroll_peel (struct loop *loop, l
   && !loop->lpt_decision.times)
 {
   dump_printf_loc (report_flags, locus,
-   "Turned loop into non-loop; it never loops.\n");
+   "loop turned into non-loop; it never loops.\n");
   return;
 }
 
@@ -236,13 +236,16 @@ report_unroll_peel (struct loop *loop, l
   else if (loop->header->count)
 niters = expected_loop_iterations (loop);
 
-  dump_printf_loc (report_flags, locus,
-   "%s loop %d times",
-   (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
-?  "Completely unroll"
-: (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
-   ? "Peel" : "Unroll")),
-   loop->lpt_decision.times);
+  if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
+dump_printf_loc (report_flags, locus,
+ "loop with %d iterations completely unrolled",
+loop->lpt_decision.times + 1);
+  else
+dump_printf_loc (report_flags, locus,
+ "loop %s %d times",
+ (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
+   ? "peeled" : "unrolled"),
+ loop->lpt_decision.times);
   if (profile_info)
 dump_printf (report_flags,
  " (header execution count %d",
Index: dumpfile.c
===
--- dumpfile.c  (revision 201751)
+++ dumpfile.c  (working copy)
@@ -24,6 +24,7 @@ along with GCC; see the file COPYING3.
 #include "dumpfile.h"
 #include "gimple-pretty-print.h"
 #include "tree.h"
+#include "gimple.h"
 
 /* If non-NULL, return one past-the-end of the matching SUBPART of
the WHOLE string.  */
@@ -261,12 +262,20 @@ dump_loc (int dump_kind, FILE *dfile, so
   if (dump_kind)
 {
   if (LOCATION_LOCUS (loc) > 

Eliminate vectorizer analysis side effects

2013-08-29 Thread Xinliang David Li
I was debugging a runtime failure of SPEC06 xalancbmk built with LIPO.
Using -fdisable- option pinpoints the problem in slp vectorize
pass on a particular function. dbgcnt support is added to track
down the individual BB, but it  fails even when the dbg count is set
to 0.

It turns out that no BB was actually vectorized for that function, but
turning on/off slp-vectorize does make a difference in generated code
-- the only difference between the good and bad case is stack layout.
 The problem is  in the alignment analysis phase -- which
speculatively changes the base declaration's alignment regardless
whether the vectorization transformation will be performed or not
later.

The attached patch fixes the problem. Testing is ok. Ok for trunk?

thanks,

David
Index: ChangeLog
===
--- ChangeLog   (revision 202088)
+++ ChangeLog   (working copy)
@@ -1,5 +1,17 @@
 2013-08-29  Xinliang David Li  
 
+   * tree-vect-data-refs.c (vect_compute_data_ref_alignment):
+   Delay base decl alignment adjustment.
+   * tree-vectorizer.c (ensure_base_alignment): New function.
+   (vectorize_loops): Add dbg_cnt support. Perform alignment
+   adjustment.
+   (execute_vect_slp): Ditto.
+   * dbgcnt.def: New debug counter.
+   * tree-data-ref.h: New fields.
+   * Makefile: New dependency.
+
+2013-08-29  Xinliang David Li  
+
* loop-unroll.c (report_unroll_peel): Minor message
change.
* tree-vect-loop-manip.c (vect_do_peeling_for_alignment):
Index: tree-vect-data-refs.c
===
--- tree-vect-data-refs.c   (revision 202088)
+++ tree-vect-data-refs.c   (working copy)
@@ -763,15 +763,10 @@ vect_compute_data_ref_alignment (struct
   dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 }
 
-  DECL_ALIGN (base) = TYPE_ALIGN (vectype);
-  DECL_USER_ALIGN (base) = 1;
+  DR_BASE_DECL (dr) = base;
+  DR_BASE_MISALIGNED (dr) = true;
 }
 
-  /* At this point we assume that the base is aligned.  */
-  gcc_assert (base_aligned
- || (TREE_CODE (base) == VAR_DECL
- && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
-
   /* If this is a backward running DR then first access in the larger
  vectype actually is N-1 elements before the address in the DR.
  Adjust misalign accordingly.  */
Index: dbgcnt.def
===
--- dbgcnt.def  (revision 202088)
+++ dbgcnt.def  (working copy)
@@ -172,6 +172,8 @@ DEBUG_COUNTER (pre_insn)
 DEBUG_COUNTER (treepre_insert)
 DEBUG_COUNTER (tree_sra)
 DEBUG_COUNTER (eipa_sra)
+DEBUG_COUNTER (vect_loop)
+DEBUG_COUNTER (vect_slp)
 DEBUG_COUNTER (sched2_func)
 DEBUG_COUNTER (sched_block)
 DEBUG_COUNTER (sched_func)
Index: tree-vectorizer.c
===
--- tree-vectorizer.c   (revision 202088)
+++ tree-vectorizer.c   (working copy)
@@ -68,6 +68,7 @@ along with GCC; see the file COPYING3.
 #include "tree-pass.h"
 #include "hash-table.h"
 #include "tree-ssa-propagate.h"
+#include "dbgcnt.h"
 
 /* Loop or bb location.  */
 LOC vect_location;
@@ -279,6 +280,37 @@ note_simd_array_uses (hash_table  datarefs;
+  struct data_reference *dr;
+  unsigned int i;
+
+ if (loop_vinfo)
+datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
+  else
+datarefs = BB_VINFO_DATAREFS (bb_vinfo);
+
+  FOR_EACH_VEC_ELT (datarefs, i, dr)
+{
+  tree base_decl = DR_BASE_DECL (dr);
+  if (base_decl && DR_BASE_MISALIGNED (dr))
+{
+  gimple stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  DECL_ALIGN (base_decl) = TYPE_ALIGN (vectype);
+  DECL_USER_ALIGN (base_decl) = 1;
+  DR_BASE_MISALIGNED (dr) = false;
+}
+}
+}
+
+
 /* Function vectorize_loops.
 
Entry point to loop vectorization phase.  */
@@ -331,10 +363,14 @@ vectorize_loops (void)
if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
  continue;
 
+if (!dbg_cnt (vect_loop))
+ break;
+
 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC
&& dump_enabled_p ())
   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
"loop vectorized\n");
+ensure_base_alignment (loop_vinfo, NULL);
vect_transform_loop (loop_vinfo);
num_vectorized_loops++;
/* Now that the loop has been vectorized, allow it to be unrolled
@@ -431,6 +467,7 @@ static unsigned int
 execute_vect_slp (void)
 {
   basic_block bb;
+  bb_vec_info bb_vinfo;
 
   init_stmt_vec_info_vec ();
 
@@ -438,8 +475,12 @@ execute_vect_slp (void)
 {
   vect_location = find_bb_location (bb);
 
-  if (vect_slp_analyze_

Re: [Google] Refine hot caller heuristic

2013-08-29 Thread Xinliang David Li
Ok.  Do consider generalize it with 1) more inline hints (how the
parameters are used in callee); and 2) more parameter type (such as
addr_k, non_null_k, range_k etc you have proposed before) in the
future.

thanks,

David

On Thu, Aug 29, 2013 at 11:24 AM, Easwaran Raman  wrote:
> On Tue, Aug 20, 2013 at 9:35 PM, Xinliang David Li  wrote:
>> Do you need to guard the jump function access with check if
>> (ipa_node_params_vector.exists ())?
> I believe it is not necessary since, for example, ipa_analyze_node
> calls ipa_check_create_node_params that calls create. But I see it is
> used in ipa-inline-analysis.c everywhere. So I have added a check and
> conservatively return false.
>
>>
>> Ideally, useful_cold_callee should be folded into the inline hints
>> estimation.  Question about the heuristic: why filtering out
>> PASS_THROUGH parameter cases completely? Passing 'this' parameter in
>> many cases can result in good PRE opportunities.  Why not skip the
>> unknown type?
>
> The rationale is it is useful to inline bar into foo in the snippet below:
>
> void foo ()
> {
>   A a;
>   bar(&a);
>   ...
> }
>
> Capturing this requires UNKNOWN and KNOWN_TYPE jump functions. I have
> changed the check accordingly. I have attached the new patch.
>
> - Easwaran
>
>> David
>>
>> On Tue, Aug 20, 2013 at 12:26 PM, Easwaran Raman  wrote:
>>> The current hot caller heuristic simply promotes edges whose caller is
>>> hot. This patch does the following:
>>> * Turn it off for applications with large footprint since the size
>>> increase hurts them
>>> * Be more selective by considering arguments to callee when the
>>> heuristic is enabled.
>>>
>>> This performs well on internal benchmarks. Ok for google/4_8 branch if
>>> all tests pass?
>>>
>>> - Easwaran


Re: [PATCH] Convert more passes to new dump framework

2013-08-30 Thread Xinliang David Li
Except that in this form, the dump will be extremely large and not
suitable for very large applications. Besides, we might also want to
use the same machinery (dump_printf_loc etc) for dump file dumping.
The current behavior of using '-details' to turn on opt-info-all
messages for dump files are not desirable.  How about the following:

1) add a new dump_kind modifier so that when that modifier is
specified, the messages won't goto the alt_dumpfile (controlled by
-fopt-info), but only to primary dump file. With this, the inline
messages can be dumped via:

   dump_printf_loc (OPT_OPTIMIZED_LOCATIONS | OPT_DUMP_FILE_ONLY, .)


2) add more flags in -fdump- support:

   -fdump-ipa-inline-opt   --> turn on opt-info messages only
   -fdump-ipa-inline-optall --> turn on opt-info-all messages
   -fdump-tree-pre-ir --> turn on GIMPLE dump only
   -fdump-tree-pre-details --> turn on everything (ir, optall, trace)

With this, developers can really just use


-fdump-ipa-inline-opt=stderr for inline messages.

thanks,

David

On Fri, Aug 30, 2013 at 1:30 AM, Richard Biener
 wrote:
> On Thu, Aug 29, 2013 at 5:15 PM, Teresa Johnson  wrote:
>> On Thu, Aug 29, 2013 at 3:04 AM, Richard Biener
>>  wrote:
>> New patch below that removes this global variable, and also outputs
>> the node->symbol.order (in square brackets after the function name so
>> as to not clutter it). Inline messages with profile data look look:
>>
>> test.c:8:3: note: foobar [0] (9000) inlined into foo [2] (1000)
>> with call count 9000 (via inline instance bar [3] (9000))
>
> Ick.  This looks both redundant and cluttered.  This is supposed to be
> understandable by GCC users, not only GCC developers.

 The main part that is only useful/understandable to gcc developers is
 the node->symbol.order in square brackes, requested by Martin. One
 possibility is that I could put that part under a param, disabled by
 default. We have something similar on the google branches that emits
 LIPO module info in the message, enabled via a param.
>>>
>>> But we have _dump files_ for that.  That's the developer-consumed
>>> form of opt-info.  -fopt-info is purely user sugar and for usual translation
>>> units it shouldn't exceed a single terminal full of output.
>>
>> But as a developer I don't want to have to parse lots of dump files
>> for a summary of the major optimizations performed (e.g. inlining,
>> unrolling) for an application, unless I am diving into the reasons for
>> why or why not one of those optimizations occurred in a particular
>> location. I really do want a summary emitted to stderr so that it is
>> easily searchable/summarizable for the app as a whole.
>>
>> For example, some of the apps I am interested in have thousands of
>> input files, and trying to collect and parse dump files for each and
>> every one is overwhelming (it probably would be even if my input files
>> numbered in the hundreds). What has been very useful is having these
>> high level summary messages of inlines and unrolls emitted to stderr
>> by -fopt-info. Then it is easy to search and sort by hotness to get a
>> feel for things like what inlines are missing when moving to a new
>> compiler, or compiling a new version of the source, for example. Then
>> you know which files to focus on and collect dump files for.
>
> I thought we can direct dump files to stderr now?  So, just use
> -fdump-tree-all=stderr
>
> and grep its contents.
>
>>>
 I'd argue that the other information (the profile counts, emitted only
 when using -fprofile-use, and the inline call chains) are useful if
 you want to understand whether and how critical inlines are occurring.
 I think this is the type of information that users focused on
 optimizations, as well as gcc developers, want when they use
 -fopt-info. Otherwise it is difficult to make sense of the inline
 information.
>>>
>>> Well, I doubt that inline information is interesting to users unless we are
>>> able to aggressively filter it to what users are interested in.  Which IMHO
>>> isn't possible - users are interested in "I have not inlined this even 
>>> though
>>> inlining would severely improve performance" which would indicate a bug
>>> in the heuristics we can reliably detect and thus it wouldn't be there.
>>
>> I have interacted with users who are aware of optimizations such as
>> inlining and unrolling and want to look at that information to
>> diagnose performance differences when refactoring code or using a new
>> compiler version. I also think inlining (especially cross-module) is
>> one example of an optimization that is still being tuned, and user
>> reports of performance issues related to that have been useful.
>>
>> I really think that the two groups of people who will find -fopt-info
>> useful are gcc developers and savvy performance-hungry users. For the
>> former group the additional info is extremely useful. For the latter
>> gro

Re: [PATCH] Convert more passes to new dump framework

2013-08-30 Thread Xinliang David Li
On Fri, Aug 30, 2013 at 12:51 PM, Teresa Johnson  wrote:
> On Fri, Aug 30, 2013 at 9:27 AM, Xinliang David Li  wrote:
>> Except that in this form, the dump will be extremely large and not
>> suitable for very large applications.
>
> Yes. I did some measurements for both a fairly large source file that
> is heavily optimized with LIPO and for a simple toy example that has
> some inlining. For the large source file, the output from
> -fdump-ipa-inline=stderr was almost 100x the line count of the
> -fopt-info output. For the toy source file it was 43x. The size of the
> -details output was 250x and 100x, respectively. Which is untenable
> for a large app.
>
> The issue I am having here is that I want a more verbose message, not
> a more voluminous set of messages. Using either -fopt-info-all or
> -fdump-ipa-inline to provoke the more verbose inline message will give
> me a much greater volume of output.
>
> One compromise could be to emit the more verbose inliner message under
> a param (and a more concise "foo inlined into bar" by default with
> -fopt-info). Or we could do some variant of what David talks about
> below.

something like --param=verbose-opt-info=1


>
>> Besides, we might also want to
>> use the same machinery (dump_printf_loc etc) for dump file dumping.
>> The current behavior of using '-details' to turn on opt-info-all
>> messages for dump files are not desirable.
>
> Interestingly, this doesn't even work. When I do
> -fdump-ipa-inline-details=stderr (with my patch containing the inliner
> messages) I am not getting those inliner messages emitted to stderr.
> Even though in dumpfile.c "details" is set to (TDF_DETAILS |
> MSG_OPTIMIZED_LOCATIONS | MSG_MISSED_OPTIMIZATION | MSG_NOTE). I'm not
> sure why, but will need to debug this.

It works for vectorizer pass.

>
>> How about the following:
>>
>> 1) add a new dump_kind modifier so that when that modifier is
>> specified, the messages won't goto the alt_dumpfile (controlled by
>> -fopt-info), but only to primary dump file. With this, the inline
>> messages can be dumped via:
>>
>>dump_printf_loc (OPT_OPTIMIZED_LOCATIONS | OPT_DUMP_FILE_ONLY, .)
>
> (you mean (MSG_OPTIMIZED_LOCATIONS | OPT_DUMP_FILE_ONLY) )
>

Yes.

> Typically OR-ing together flags like this indicates dump under any of
> those conditions. But we could implement special handling for
> OPT_DUMP_FILE_ONLY, which in the above case would mean dump only to
> the primary dump file, and only under the other conditions specified
> in the flag (here under "-optimized")
>
>>
>>
>> 2) add more flags in -fdump- support:
>>
>>-fdump-ipa-inline-opt   --> turn on opt-info messages only
>>-fdump-ipa-inline-optall --> turn on opt-info-all messages
>
> According to the documentation (see the -fdump-tree- documentation on
> http://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html#Debugging-Options),
> the above are already supposed to be there (-optimized, -missed, -note
> and -optall). However, specifying any of these gives a warning like:
>cc1: warning: ignoring unknown option ‘optimized’ in
> ‘-fdump-ipa-inline’ [enabled by default]
> Probably because none is listed in the dump_options[] array in dumpfile.c.
>
> However, I don't think there is currently a way to use -fdump- options
> and *only* get one of these, as much of the current dump output is
> emitted whenever there is a dump_file defined. Until everything is
> migrated to the new framework it may be difficult to get this to work.
>
>>-fdump-tree-pre-ir --> turn on GIMPLE dump only
>>-fdump-tree-pre-details --> turn on everything (ir, optall, trace)
>>
>> With this, developers can really just use
>>
>>
>> -fdump-ipa-inline-opt=stderr for inline messages.
>
> Yes, if we can figure out a good way to get this to work (i.e. only
> emit the optimized messages and not the rest of the dump messages).
> And unfortunately to get them all you need to specify
> "-fdump-ipa-all-optimized -fdump-tree-all-optimized
> -fdump-rtl-all-optimized" instead of just -fopt-info. Unless we can
> add -fdump-all-all-optimized.

Having general support requires cleanup of all the old style  if
(dump_file) fprintf (dump_file, ...) instances to be:

  if (dump_enabled_p ())
dump_printf (dump_kind );


However, it might be easier to do this filtering for IR dump only (in
execute_function_dump) -- do not dump IR if any of the MSG_ is
specified unless IR flag (a new flag) is also specified.

David


>
> Teresa
>
>>
>> thanks,
>>
>> David
>>
>> On Fri, Aug 30, 2013 at 1

Re: Eliminate vectorizer analysis side effects

2013-08-30 Thread Xinliang David Li
On Fri, Aug 30, 2013 at 1:23 AM, Richard Biener
 wrote:
> On Fri, Aug 30, 2013 at 1:28 AM, Xinliang David Li  wrote:
>> I was debugging a runtime failure of SPEC06 xalancbmk built with LIPO.
>> Using -fdisable- option pinpoints the problem in slp vectorize
>> pass on a particular function. dbgcnt support is added to track
>> down the individual BB, but it  fails even when the dbg count is set
>> to 0.
>>
>> It turns out that no BB was actually vectorized for that function, but
>> turning on/off slp-vectorize does make a difference in generated code
>> -- the only difference between the good and bad case is stack layout.
>>  The problem is  in the alignment analysis phase -- which
>> speculatively changes the base declaration's alignment regardless
>> whether the vectorization transformation will be performed or not
>> later.
>>
>> The attached patch fixes the problem. Testing is ok. Ok for trunk?
>
> Not in this form.  I'd rather not put extra fields in the data-refs this way.
> (As for the xalancbmk runtime problem - doesn't this patch just paper
> over the real issue?)

I believe it is stack-limit related. This program has some recursive
call chains that can generate a call frame ~9k deep. The vectorizer
side effect causes the affected function in the call frame to grow
~100 byte in stack size. Since this function appears lots of times in
the callstack, the overall stack consumption increase a lot. Combined
with the aggressive cross module inlining, it ends up blowing up the
stack limit.


>
> For BB SLP you still adjust DR bases that do not take part in
> vectorization - the DR vector contains all refs in the BB, not just
> those in the SLP instances we are going to vectorize.  So the
> commit of the re-aligning decision should be done from where
> we vectorize the DR, in vectorizable_load/store in its transform
> phase.
>
> If we decide to integrate the fields into the data-ref then the
> analysis and commit parts should go into the data-ref machinery
> as well.  Otherwise the vectorizer should use data-ref->aux or some
> other way to hang off its private data.
>

Good point.

> Other than that, modifying alignment of variables that are not
> vectorized is indeed a bug worth fixing.

The new version of the patch is attached. Ok for trunk after testing?

thanks,

David

>
> Richard.
>
>> thanks,
>>
>> David
Index: ChangeLog
===
--- ChangeLog   (revision 202088)
+++ ChangeLog   (working copy)
@@ -1,5 +1,25 @@
 2013-08-29  Xinliang David Li  
 
+   * tree-vect-slp.c (destroy_bb_vec_info): Data ref cleanup.
+   * tree-vect-loop.c (destroy_bb_vec_info): Ditto.
+   * tree-vect-data-refs.c (vect_compute_data_ref_alignment):
+   Delay base decl alignment adjustment.
+   * tree-vectorizer.c (destroy_datarefs): New function.
+   * tree-vectorizer.h: New data structure.
+   (set_dr_misalignment): New function.
+   (dr_misalignment): Ditto.
+   * tree-vect-stmts.c (vectorizable_store_1): Name change.
+   (vectorizable_load_1): Ditto.
+   (vectorizable_store): New function.
+   (vectorizable_load): Ditto.
+   (ensure_base_align): Ditto.
+   (vectorize_loops): Add dbg_cnt support.
+   (execute_vect_slp): Ditto.
+   * dbgcnt.def: New debug counter.
+   * Makefile: New dependency.
+
+2013-08-29  Xinliang David Li  
+
* loop-unroll.c (report_unroll_peel): Minor message
change.
* tree-vect-loop-manip.c (vect_do_peeling_for_alignment):
Index: tree-vect-loop.c
===
--- tree-vect-loop.c(revision 202088)
+++ tree-vect-loop.c(working copy)
@@ -957,7 +957,7 @@ destroy_loop_vec_info (loop_vec_info loo
 }
 
   free (LOOP_VINFO_BBS (loop_vinfo));
-  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
+  destroy_datarefs (loop_vinfo, NULL);
   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
Index: dbgcnt.def
===
--- dbgcnt.def  (revision 202088)
+++ dbgcnt.def  (working copy)
@@ -172,6 +172,8 @@ DEBUG_COUNTER (pre_insn)
 DEBUG_COUNTER (treepre_insert)
 DEBUG_COUNTER (tree_sra)
 DEBUG_COUNTER (eipa_sra)
+DEBUG_COUNTER (vect_loop)
+DEBUG_COUNTER (vect_slp)
 DEBUG_COUNTER (sched2_func)
 DEBUG_COUNTER (sched_block)
 DEBUG_COUNTER (sched_func)
Index: Makefile.in
===
--- Makefile.in (revision 202088)
+++ Makefile.in (working copy)
@@ -2645,7 +2645,7 @@ tree-vect-data-refs.o: tree-vect-data-re
 tree-vectorizer.o: tree-vectorizer.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \

Re: Type inheritance graph analysis & speculative devirtualization, part 7/7 (speculative devirtualizatoin)

2013-09-01 Thread Xinliang David Li
Missing test cases?

Have you tested the optimization with SPEC2k and SPEC06? There are a
couple of benchmarks benefit greatly from devirtualization, such as
eon, povray etc.   I believe astar will probably improve with this
optimization at O2 (it has hot virtual functions that are not
overridden at all). For eon, the assumption at O2 for speculative
devirt may not work well.

thanks,

David

On Sun, Sep 1, 2013 at 6:57 AM, Jan Hubicka  wrote:
> Hi,
> this patch implement speculative devirtualization.  It is a trivial pass that
> asks for targets of every polymorphic call in a program and if the list
> contains one likely target, it produces an speculative call. No context
> sensitive analysis is done at the moment.  This call may or may not survive
> into final program depending if we do something useful about the direct call.
>
> The pass is currently disabled for LTO because
> http://gcc.gnu.org/ml/gcc-patches/2013-08/msg01007.html is needed to properly
> build type inheritance graph.
>
> With LTO it is supposed to be effective on a premise that most types are not
> escaping LTO unit and thus we see all possible targets.  Without LTO it makes
> stronger assumption that you usually call only targets defined in current 
> unit,
> if any.
>
> Path is suprisingly effective on Firefox:
> 105306 polymorphic calls, 0 devirtualized, 34258 speculatively devirtualized, 
> 4056 cold
> 66875 have multiple targets, 0 overwritable, 0 already speculated (0 agree, 0 
> disagree), 0 not defined
>
> So about 32% of calls are devirtualized.  By random checking, these can be
> tracked to real occurences of code where virtual is used in a silly way.
> I plan to introduce warning for that (I have code for that already since it
> makes it easier to analyze what changes are really made and why).
>
> Martin Liska rebuilt with FDO based indirect call resolution.  Here we get:
> 23928 indirect calls trained.
> 12837 (53.65%) have common target.
> 342 (1.43%) targets was not found.
> 8378 (35.01%) speculations seems useless.
> 4117 (17.21%) speculations produced.
>
> I compared the overlap that is devirtualized by both techniques.  There is
> almost 100% match, except that FDO code is not dealing well with thunks and
> those 342 calls that seem to go out of libxul into plugins.  I will fix the
> thunk issue later.
>
> I also tested QT, where the numbers are smaller - only about 20% of 
> devirtualized
> calls, but largely things seem similar.
>
> For non-LTO build, the devirtualization also seems sane, there seems to be
> about 8% of miss rate on GCC bootstrap that seems acceptable. I tracked most 
> of
> those down into randomly included headers that do define derived types of a
> given class that are unused in the current unit.  I think we can track this by
> computing reachable functions in current unit and looking for vtables actually
> used by construction.  One of such actually triggers undefined reference in
> build of libstdc++ and therefore I added the check disabling devirtualization
> to DECL_EXTERNAL for now.  It is because the libstdc++ header seems to have
> explicit instantiation of a template that is never linked with.
>
> I currently enabled the pass by default at -O2.  Based on the experience about
> miss rate, we may want to disable it for -O2 non-LTO if it shows to be too 
> risky
> on some codebases.
>
> Bootstrapped/regtested x86_64-linux and ppc64-linux, also tested with lto 
> bootstrap
> with the LTO ODR code and tested on Firefox and QT builds. Will commit the 
> patch
> later today.
>
> Comments are welcome.
> Honza
>
> * common.opt (fdevirtualize-speculatively): New function.
> * invoke.texi (fdevirtualize-speculatively): Document.
> * ipa-devirt.c: Include ipa-inline.h
> (likely_target_p): New function.
> (ipa_devirt): New function.
> (gate_ipa_devirt): New function.
> (pass_data_ipa_devirt): New static var.
> (pass_ipa_devirt): Likewise.
> (make_pass_ipa_devirt): New function.
> * opts.c (default_options): Add OPT_fdevirtualize_speculatively.
> (common_handle_option): Disable devirtualization when
> value range profiling is available.
> * passes.def (pass_ipa_devirt): Add.
> * timevar.def (TV_IPA_DEVIRT): New timevar.
> * tree-pass.h (make_pass_ipa_devirt):
>
> Index: common.opt
> ===
> --- common.opt  (revision 202136)
> +++ common.opt  (working copy)
> @@ -1007,6 +1007,10 @@ fdevirtualize
>  Common Report Var(flag_devirtualize) Optimization
>  Try to convert virtual calls to direct ones.
>
> +fdevirtualize-speculatively
> +Common Report Var(flag_devirtualize_speculatively) Optimization
> +Perform speculative devirtualization
> +
>  fdiagnostics-show-location=
>  Common Joined RejectNegative Enum(diagnostic_prefixing_rule)
>  -fdiagnostics-show-location=[once|every-line]  How often to emit source 
> location at the begi

Re: [PATCH] Fixing improper conversion from sin() to sinf() in optimization mode.

2013-09-03 Thread Xinliang David Li
>From Joseph:

"The
conversion is not safe for sqrt if the two types are double and long
double and long double is x86 extended, for example."

This is not reflected in the patch.

David


On Tue, Sep 3, 2013 at 2:27 PM, Joseph S. Myers  wrote:
> On Tue, 3 Sep 2013, Cong Hou wrote:
>
>> +  CASE_MATHFN (SQRT)
>> +/* sqrtl(double) cannot be safely converted to sqrt(double). */
>> +if (fcode == BUILT_IN_SQRTL &&
>> +(TYPE_MODE (type) == TYPE_MODE (double_type_node)) &&
>> +!flag_unsafe_math_optimizations)
>> +  break;
>
> Please reread my previous messages on this subject and try again, with
> regard to both the patch itself and the accompanying analysis.
>
> --
> Joseph S. Myers
> jos...@codesourcery.com


Re: [PATCH] Fixing improper conversion from sin() to sinf() in optimization mode.

2013-09-04 Thread Xinliang David Li
On Wed, Sep 4, 2013 at 1:53 PM, Cong Hou  wrote:
> I have made a new patch according to your comments. I found several
> references saying that the precision 2p+2 is OK for the sqrt
> conversion (one here:
> http://www.cs.berkeley.edu/~fateman/generic/algorithms.pdf). The new
> patch is pasted as below.
>
> Thank you for all the suggestions, Joseph!
>
>
> Cong
>
>
> Index: gcc/testsuite/gcc.c-torture/execute/20030125-1.c
> ===
> --- gcc/testsuite/gcc.c-torture/execute/20030125-1.c (revision 201891)
> +++ gcc/testsuite/gcc.c-torture/execute/20030125-1.c (working copy)
> @@ -44,11 +44,11 @@ __attribute__ ((noinline))
>  double
>  sin(double a)
>  {
> - abort ();
> + return a;
>  }
>  __attribute__ ((noinline))
>  float
>  sinf(float a)
>  {
> - return a;
> + abort ();
>  }
> Index: gcc/convert.c
> ===
> --- gcc/convert.c (revision 201891)
> +++ gcc/convert.c (working copy)
> @@ -135,16 +135,34 @@ convert_to_real (tree type, tree expr)
>CASE_MATHFN (COS)
>CASE_MATHFN (ERF)
>CASE_MATHFN (ERFC)
> -  CASE_MATHFN (FABS)
>CASE_MATHFN (LOG)
>CASE_MATHFN (LOG10)
>CASE_MATHFN (LOG2)
>CASE_MATHFN (LOG1P)
> -  CASE_MATHFN (LOGB)
>CASE_MATHFN (SIN)
> -  CASE_MATHFN (SQRT)
>CASE_MATHFN (TAN)
>CASE_MATHFN (TANH)
> +  CASE_MATHFN (SQRT)
> +
> +/* The above functions (except sqrt) are not safe to do
> this conversion. */
> +if (!flag_unsafe_math_optimizations)
> +{
> +  /* sqrtl?(T1) could be safely converted into sqrtf?(T2) only if
> +   * p1 >= p2*2+2, where p1 and p2 are precisions of T1 and T2. 
> */

Two spaces after T2.

Perhaps making the comment clearer?

  it is safe to do the following:
float f1 = sqrt ((double) f2);
 -->
float f1 = sqrtf (f2);

 But conditionally safe for the following
double d1 = sqrtl ((long double) d2);
  -->
double d1 = sqrt (d2);

 depending on the precision of the long double type on
the target. ...< Add your
 reference here.>


David

> +  if ((fcode == BUILT_IN_SQRT || fcode == BUILT_IN_SQRTL))
> +  {

Fix indentation.

> +int p1 = REAL_MODE_FORMAT (TYPE_MODE (type))->p;
> +int p2 = (fcode == BUILT_IN_SQRTL) ?
> +REAL_MODE_FORMAT (TYPE_MODE (long_double_type_node))->p :
> +REAL_MODE_FORMAT (TYPE_MODE (double_type_node))->p;
> +if (p2 < p1 * 2 + 2)
> +  break;
> +  }
> +  else
> +break;
> +}
> +  CASE_MATHFN (FABS)
> +  CASE_MATHFN (LOGB)
>  #undef CASE_MATHFN
>  {
>tree arg0 = strip_float_extensions (CALL_EXPR_ARG (expr, 0));
>
> On Tue, Sep 3, 2013 at 3:38 PM, Joseph S. Myers  
> wrote:
>> On Tue, 3 Sep 2013, Cong Hou wrote:
>>
>>> Could you please tell me how to check the precision of long double in
>>> GCC on different platforms?
>>
>> REAL_MODE_FORMAT (TYPE_MODE (long_double_type_node))->p
>>
>> (but you should be referring to the relevant types - "type", the type
>> being converted to, "itype", the type of the function being called in the
>> source code, "TREE_TYPE (arg0)", the type of the argument after extensions
>> have been removed, and "newtype", computed from those - so you should have
>> expressions like the above with two or more of those four types, but not
>> with long_double_type_node directly).
>>
>> The patch submission will need to include a proper analysis to justify to
>> the reader why the particular inequality with particular types from those
>> four is correct in all cases where the relevant code may be executed.
>>
>> --
>> Joseph S. Myers
>> jos...@codesourcery.com


Re: [PATCH] Fixing improper conversion from sin() to sinf() in optimization mode.

2013-09-04 Thread Xinliang David Li
On Wed, Sep 4, 2013 at 1:59 PM, Joseph S. Myers  wrote:
> On Wed, 4 Sep 2013, Cong Hou wrote:
>
>> I have made a new patch according to your comments. I found several
>> references saying that the precision 2p+2 is OK for the sqrt
>> conversion (one here:
>> http://www.cs.berkeley.edu/~fateman/generic/algorithms.pdf). The new
>> patch is pasted as below.
>
> This patch submission still fails to pay attention to various of my
> comments.
>

If you can provide inlined comments in the patch, that will be more
useful and productive.

thanks,

David


> --
> Joseph S. Myers
> jos...@codesourcery.com


Re: [PATCH] Fixing improper conversion from sin() to sinf() in optimization mode.

2013-09-09 Thread Xinliang David Li
> http://stackoverflow.com/questions/9235456/determining-floating-point-square-root
> + */
> +  if (fcode == BUILT_IN_SQRT || fcode == BUILT_IN_SQRTL)
> + {
> +  int p1 = REAL_MODE_FORMAT (TYPE_MODE (itype))->p;
> +  int p2 = REAL_MODE_FORMAT (TYPE_MODE (newtype))->p;
> +  if (p1 < p2 * 2 + 2 && !flag_unsafe_math_optimizations)
> +break;
> + }
> +
>/* Be careful about integer to fp conversions.
>   These may overflow still.  */
>if (FLOAT_TYPE_P (TREE_TYPE (arg0))
> Index: gcc/testsuite/gcc.c-torture/execute/20030125-1.c
> =======
> --- gcc/testsuite/gcc.c-torture/execute/20030125-1.c (revision 201891)
> +++ gcc/testsuite/gcc.c-torture/execute/20030125-1.c (working copy)
> @@ -44,11 +44,11 @@ __attribute__ ((noinline))
>  double
>  sin(double a)
>  {
> - abort ();
> + return a;
>  }
>  __attribute__ ((noinline))
>  float
>  sinf(float a)
>  {
> - return a;
> + abort ();
>  }
>
> On Wed, Sep 4, 2013 at 3:26 PM, Joseph S. Myers  
> wrote:
>> On Wed, 4 Sep 2013, Xinliang David Li wrote:
>>
>>> > This patch submission still fails to pay attention to various of my
>>> > comments.
>>> >
>>>
>>> If you can provide inlined comments in the patch, that will be more
>>> useful and productive.
>>
>> I have explained things several times in this thread already.  I see no
>> point in repeating things when what I would say has already been said and
>> ignored.  As far as I can tell, this latest patch submission has taken one
>> line from the message it is in response to, and largely ignored the
>> following two paragraphs (including where I explicitly say that said line
>> should not appear literally in the source code at all).  But, repeating
>> what I said before yet again:
>>
>>   (but you should be referring to the relevant types
>>
>> The patch does not do properly that.  It refers explicitly to
>> long_double_type_node and double_type_node.
>>
>>   - "type", the type
>>   being converted to, "itype", the type of the function being called in the
>>   source code, "TREE_TYPE (arg0)", the type of the argument after extensions
>>   have been removed, and "newtype", computed from those
>>
>> The patch only engages with "type".  I suspect "newtype" is what it really
>> means there when using "type".  When it uses long_double_type_node and
>> double_type_node, those should be "itype".
>>
>>   - so you should have
>>   expressions like the above with two or more of those four types, but not
>>   with long_double_type_node directly).
>>
>> See above.  The patch uses long_double_type_node directly, contrary to
>> what I explicitly said.  You are free to disagree with something I say in
>> a review - but in that case you need to reply specifically to the review
>> comment explaining your rationale for disagreeing with it.  Just ignoring
>> the comment and not mentioning the disagreement wastes the time of
>> reviewers.
>>
>>   The patch submission will need to include a proper analysis to justify to
>>   the reader why the particular inequality with particular types from those
>>   four is correct in all cases where the relevant code may be executed.
>>
>> The comments only refer to "T1" and "T2" without explaining how they
>> relate to the original expression (three types) or the variables within
>> GCC dealing with various types (four types, because newtype gets
>> involved).  I said back in
>> <http://gcc.gnu.org/ml/gcc-patches/2013-09/msg00161.html> and
>> <http://gcc.gnu.org/ml/gcc-patches/2013-08/msg01384.html> that three types
>> are involved - when I say "the patch submission needs to include its own
>> analysis for the full generality of three types", again, it's
>> inappropriate for a patch to omit such an analysis without justification.
>> The patch submission needs to include an analysis that properly explains
>> the transformation involved and the conditions under which it is safe.
>> Maybe starting along the lines of:
>>
>> We have an expression of the form (T1) sqrtT2 ((T2) exprT3), where exprT3
>> has type T3 (TREE_TYPE (ARG0)), T2 is the type of the floating-point
>> square root function being used (ITYPE), T1 is TYPE and all these types
>> are binary floating-point types.  We wish to optimize if possible to an
>> expression of the form (T1) sqrtT4 ((T4) exprT3), where T4 (NEWTYPE) is
>> narrower than T2.  (Then explain the choice of T4 and the conditions under
>> which the transformation is safe, with appropriate references.)
>>
>> I suggest that for the next patch submission you (the patch submitter)
>> make sure you spend at least an hour properly understanding the issues and
>> all the previous messages in the thread and writing up the detailed,
>> coherent explanation of the transformation and analysis of the issues that
>> I asked for some time ago, as if for a reader who hasn't read any of this
>> thread or looked at this transformation in GCC before.  I've certainly
>> spent longer than that on review in this thread.  It's normal for a patch
>> involving anything at all tricky to take hours to write up (and also
>> normal for one to discover, in the course of writing the detailed coherent
>> analysis for people who aren't familiar with the code and the issues
>> involved, that there's in fact something wrong with the patch and it needs
>> revisiting before submission).
>>
>> --
>> Joseph S. Myers
>> jos...@codesourcery.com


Re: Dump framework newline cleanup

2013-09-09 Thread Xinliang David Li
looks fine to me.

In the long run, I wonder if the machinery in diagnostic messages can
be reused for opt-info dumping -- i.e., support different streams. It
has many nice features including %qD specifier for printing tree
decls.

David

On Mon, Sep 9, 2013 at 12:01 PM, Teresa Johnson  wrote:
> I've attached a patch that implements the cleanup of newline emission
> by the new dump framework as discussed here:
>
> http://gcc.gnu.org/ml/gcc-patches/2013-08/msg01779.html
>
> Essentially, I have removed the leading newline emission from
> dump_loc, and updated dump_printf_loc invocations to emit a trailing
> newline as necessary. This will remove unnecessary vertical space in
> the dump output.
>
> I did not do any other cleanup of the existing vectorization messages
> - there are IMO a lot of messages being emitted by the vectorizer
> under MSG_NOTE (and probably MSG_MISSED_OPTIMIZATION) that should only
> be emitted to the dump file under -fdump-tree-... and not emitted
> under -fopt-info-all. The ones that stay under -fopt-info-all need
> some formatting/style cleanup. Leaving that for follow-on work.
>
> Bootstrapped and tested on x86-64-unknown-linux-gnu. Ok for trunk?
>
> Thanks,
> Teresa
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


[google] record system paths (isystem) in module infos (LIPO mode)

2013-09-10 Thread Xinliang David Li
The following patch enables GCC to record system include paths in
module infos. This allows more precise parsing context to be
established when compiling auxiliary modules -- FE behaves slightly
differently when parsing system headers.  There are also a couple of
other cleanups in the patch.

David
Index: libgcc/dyn-ipa.c
===
--- libgcc/dyn-ipa.c(revision 202477)
+++ libgcc/dyn-ipa.c(working copy)
@@ -2082,9 +2082,10 @@ gcov_write_module_info (const struct gco
   len = filename_len + src_filename_len;
   len += 2; /* each name string is led by a length.  */
 
-  num_strings = module_info->num_quote_paths + module_info->num_bracket_paths +
-module_info->num_cpp_defines + module_info->num_cpp_includes +
-module_info->num_cl_args;
+  num_strings = module_info->num_quote_paths + module_info->num_bracket_paths
++ module_info->num_system_paths
++ module_info->num_cpp_defines + module_info->num_cpp_includes
++ module_info->num_cl_args;
   for (i = 0; i < num_strings; i++)
 {
   gcov_unsigned_t string_len
@@ -2094,7 +2095,7 @@ gcov_write_module_info (const struct gco
   len += 1; /* Each string is led by a length.  */
 }
 
-  len += 9; /* 9 more fields */
+  len += 10; /* 9 more fields */
 
   gcov_write_tag_length (GCOV_TAG_MODULE_INFO, len);
   gcov_write_unsigned (module_info->ident);
@@ -2105,6 +2106,7 @@ gcov_write_module_info (const struct gco
   gcov_write_unsigned (module_info->lang);
   gcov_write_unsigned (module_info->num_quote_paths);
   gcov_write_unsigned (module_info->num_bracket_paths);
+  gcov_write_unsigned (module_info->num_system_paths);
   gcov_write_unsigned (module_info->num_cpp_defines);
   gcov_write_unsigned (module_info->num_cpp_includes);
   gcov_write_unsigned (module_info->num_cl_args);
Index: gcc/gcov-io.c
===
--- gcc/gcov-io.c   (revision 202477)
+++ gcc/gcov-io.c   (working copy)
@@ -594,10 +594,11 @@ gcov_read_module_info (struct gcov_modul
   mod_info->lang  = gcov_read_unsigned ();
   mod_info->num_quote_paths = gcov_read_unsigned ();
   mod_info->num_bracket_paths = gcov_read_unsigned ();
+  mod_info->num_system_paths = gcov_read_unsigned ();
   mod_info->num_cpp_defines = gcov_read_unsigned ();
   mod_info->num_cpp_includes = gcov_read_unsigned ();
   mod_info->num_cl_args = gcov_read_unsigned ();
-  len -= 9;
+  len -= 10;
 
   filename_len = gcov_read_unsigned ();
   mod_info->da_filename = (char *) xmalloc (filename_len *
@@ -613,9 +614,10 @@ gcov_read_module_info (struct gcov_modul
 ((gcov_unsigned_t *) mod_info->source_filename)[i] = gcov_read_unsigned ();
   len -= (src_filename_len + 1);
 
-  num_strings = mod_info->num_quote_paths + mod_info->num_bracket_paths +
-mod_info->num_cpp_defines + mod_info->num_cpp_includes +
-mod_info->num_cl_args;
+  num_strings = mod_info->num_quote_paths + mod_info->num_bracket_paths
++ mod_info->num_system_paths
++ mod_info->num_cpp_defines + mod_info->num_cpp_includes
++ mod_info->num_cl_args;
   for (j = 0; j < num_strings; j++)
{
  gcov_unsigned_t string_len = gcov_read_unsigned ();
Index: gcc/gcov-io.h
===
--- gcc/gcov-io.h   (revision 202477)
+++ gcc/gcov-io.h   (working copy)
@@ -537,6 +537,7 @@ struct gcov_module_info
   char *source_filename;
   gcov_unsigned_t num_quote_paths;
   gcov_unsigned_t num_bracket_paths;
+  gcov_unsigned_t num_system_paths;
   gcov_unsigned_t num_cpp_defines;
   gcov_unsigned_t num_cpp_includes;
   gcov_unsigned_t num_cl_args;
Index: gcc/cp/cp-tree.h
===
--- gcc/cp/cp-tree.h(revision 202477)
+++ gcc/cp/cp-tree.h(working copy)
@@ -5512,6 +5512,7 @@ extern tree get_template_argument_pack_e
 extern tree get_function_template_decl (const_tree);
 extern tree resolve_nondeduced_context (tree);
 extern hashval_t iterative_hash_template_arg (tree arg, hashval_t val);
+extern void clear_pending_templates (void);
 
 /* in repo.c */
 extern void init_repo  (void);
Index: gcc/cp/decl2.c
===
--- gcc/cp/decl2.c  (revision 202477)
+++ gcc/cp/decl2.c  (working copy)
@@ -3890,10 +3890,12 @@ void
 cp_clear_deferred_fns (void)
 {
   vec_free (deferred_fns);
+  deferred_fns = NULL;
   keyed_classes = NULL;
   vec_free (no_linkage_decls);
   no_linkage_decls = NULL;
   cp_clear_constexpr_hashtable ();
+  clear_pending_templates ();
 }
 
 /* Collect declarations from all namespaces relevant to SOURCE_FILE.  */
@@ -4225,18 +4227,19 @@ cp_process_pending_declarations (locatio
 
   if (L_IPO_IS_AUXILIARY_MODULE)
 {
+  tree fndecl;
+  int i;
+
   gcc_assert (flag_dyn_ipa && L_IPO_COMP_MODE);
 
   /* Do some cleanup -- we do not really need static i

Re: [PATCH] [vectorizer] Fixing a bug in tree-vect-patterns.c in GCC vectorizer.

2013-09-11 Thread Xinliang David Li
Can you add a test case to the regression suite?

When the type of arguments are unsigned short/unsigned int, GCC does
not vectorize the loop anymore -- this is worth a separate bug to
track. punpcklwd instruction can be used to do zero extension of the
short type.

David

On Wed, Sep 11, 2013 at 6:16 PM, Cong Hou  wrote:
> Hi
>
> There is a bug in the function vect_recog_dot_prod_pattern() in
> tree-vect-patterns.c. This function checks if a loop is of dot
> production pattern. Specifically, according to the comment of this
> function:
>
> /*
>  Try to find the following pattern:
>
>  type x_t, y_t;
>  TYPE1 prod;
>  TYPE2 sum = init;
>loop:
>  sum_0 = phi 
>  S1  x_t = ...
>  S2  y_t = ...
>  S3  x_T = (TYPE1) x_t;
>  S4  y_T = (TYPE1) y_t;
>  S5  prod = x_T * y_T;
>  [S6  prod = (TYPE2) prod;  #optional]
>  S7  sum_1 = prod + sum_0;
>
>where 'TYPE1' is exactly double the size of type 'type', and
> 'TYPE2' is the same size of 'TYPE1' or bigger. This is a special case
> of a reduction computation.
> */
>
> This function should check if x_t and y_t have the same type (type)
> which has the half size of TYPE1. The corresponding code is shown
> below:
>
>   oprnd0 = gimple_assign_rhs1 (stmt);
>   oprnd1 = gimple_assign_rhs2 (stmt);
>   if (!types_compatible_p (TREE_TYPE (oprnd0), prod_type) ||
> !types_compatible_p (TREE_TYPE (oprnd1), prod_type))
> return NULL;
>   if (!type_conversion_p (oprnd0, stmt, true, &half_type0,
> &def_stmt, &promotion) || !promotion)
> return NULL;
>   oprnd00 = gimple_assign_rhs1 (def_stmt);
>
> /*==V  see here! */
>   if (!type_conversion_p (oprnd0, stmt, true, &half_type1,
> &def_stmt, &promotion) || !promotion)
> return NULL;
>   oprnd01 = gimple_assign_rhs1 (def_stmt);
>   if (!types_compatible_p (half_type0, half_type1))
> return NULL;
>   if (TYPE_PRECISION (prod_type) != TYPE_PRECISION (half_type0) * 2)
> return NULL;
>
> Here the function uses x_T (oprnd0) to check the type of y_t, which is
> incorrect. The fix is simple: just replace it by oprnd1.
>
> The failed test case for this bug is shown below:
>
> int foo(short *a, int *b, int n) {
>   int sum = 0;
>   for (int i = 0; i < n; ++i)
> sum += a[i] * b[i];
>   return sum;
> }
>
>
> thanks,
> Cong
>
>
> Index: gcc/tree-vect-patterns.c
> ===
> --- gcc/tree-vect-patterns.c (revision 200988)
> +++ gcc/tree-vect-patterns.c (working copy)
> @@ -397,7 +397,7 @@ vect_recog_dot_prod_pattern (vec
>|| !promotion)
>  return NULL;
>oprnd00 = gimple_assign_rhs1 (def_stmt);
> -  if (!type_conversion_p (oprnd0, stmt, true, &half_type1, &def_stmt,
> +  if (!type_conversion_p (oprnd1, stmt, true, &half_type1, &def_stmt,
>  &promotion)
>|| !promotion)
>  return NULL;


Re: [Google] Fix test failure after porting __gcov_get_profile_prefix from google/4_7

2013-09-12 Thread Xinliang David Li
On Thu, Sep 12, 2013 at 1:06 PM, Teresa Johnson  wrote:
> After porting r198033 from google/4_7 to google/4_8 a test case failed
> with an assert when trying to take the strlen of profile_data_prefix.
>
> In most cases this is either set from the directory specified to
> -fprofile-generate=, or to getpwd when a directory is not specified.
> However, the exception is when no directory is specified for
> -fprofile-generate and -auxbase-strip option is used with the absolute
> pathname. In that case the code does not set profile_data_prefix since
> the filenames already have the full path.
>
> In the code that sets __gcov_get_profile_prefix, the fix is to simply
> check if profile_data_prefix is still NULL, and if so just set via
> getpwd.

Why setting it to getpwd() val? Should it be set to null instead?

David

>
> Passes regression tests and failure I reproduced. Ok for google branches?
>
> Thanks,
> Teresa
>
> 2013-09-12  Teresa Johnson  
>
> * tree-profile.c (tree_init_instrumentation): Handle the case
> where profile_data_prefix is NULL.
>
> Index: tree-profile.c
> ===
> --- tree-profile.c (revision 202500)
> +++ tree-profile.c (working copy)
> @@ -470,8 +470,11 @@ tree_init_instrumentation (void)
>DECL_ASSEMBLER_NAME (gcov_profile_prefix_decl));
>TREE_STATIC (gcov_profile_prefix_decl) = 1;
>
> -  prefix_len = strlen (profile_data_prefix);
> -  prefix_string = build_string (prefix_len + 1, profile_data_prefix);
> +  const char *prefix = profile_data_prefix;
> +  if (!prefix)
> +prefix = getpwd ();
> +  prefix_len = strlen (prefix);
> +  prefix_string = build_string (prefix_len + 1, prefix);
>TREE_TYPE (prefix_string) = build_array_type
>(char_type_node, build_index_type
> (build_int_cst (NULL_TREE, prefix_len)));
>
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


New GCC options for loop vectorization

2013-09-12 Thread Xinliang David Li
Currently -ftree-vectorize turns on both loop and slp vectorizations,
but there is no simple way to turn on loop vectorization alone. The
logic for default O3 setting is also complicated.

In this patch, two new options are introduced:

1) -ftree-loop-vectorize

This option is used to turn on loop vectorization only. option
-ftree-slp-vectorize also becomes a first class citizen, and no funny
business of Init(2) is needed.  With this change, -ftree-vectorize
becomes a simple alias to -ftree-loop-vectorize +
-ftree-slp-vectorize.

For instance, to turn on only slp vectorize at O3, the old way is:

 -O3 -fno-tree-vectorize -ftree-slp-vectorize

With the new change it becomes:

-O3 -fno-tree-loop-vectorize


To turn on only loop vectorize at O2, the old way is

-O2 -ftree-vectorize -fno-tree-slp-vectorize

The new way is

-O2 -ftree-loop-vectorize



2) -ftree-vect-loop-peeling

This option is used to turn on/off loop peeling for alignment.  In the
long run, this should be folded into the cheap cost model proposed by
Richard.  This option is also useful in scenarios where peeling can
introduce runtime problems:
http://gcc.gnu.org/ml/gcc/2005-12/msg00390.html  which happens to be
common in practice.



Patch attached. Compiler boostrapped. Ok after testing?


thanks,

David
Index: omp-low.c
===
--- omp-low.c   (revision 202481)
+++ omp-low.c   (working copy)
@@ -2305,8 +2305,8 @@ omp_max_vf (void)
 {
   if (!optimize
   || optimize_debug
-  || (!flag_tree_vectorize
- && global_options_set.x_flag_tree_vectorize))
+  || (!flag_tree_loop_vectorize
+ && global_options_set.x_flag_tree_loop_vectorize))
 return 1;
 
   int vs = targetm.vectorize.autovectorize_vector_sizes ();
@@ -5684,10 +5684,10 @@ expand_omp_simd (struct omp_region *regi
  loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
  cfun->has_simduid_loops = true;
}
-  /* If not -fno-tree-vectorize, hint that we want to vectorize
+  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
 the loop.  */
-  if ((flag_tree_vectorize
-  || !global_options_set.x_flag_tree_vectorize)
+  if ((flag_tree_loop_vectorize
+  || !global_options_set.x_flag_tree_loop_vectorize)
  && loop->safelen > 1)
{
  loop->force_vect = true;
Index: ChangeLog
===
--- ChangeLog   (revision 202481)
+++ ChangeLog   (working copy)
@@ -1,3 +1,24 @@
+2013-09-12  Xinliang David Li  
+
+   * tree-if-conv.c (main_tree_if_conversion): Check new flag.
+   * omp-low.c (omp_max_vf): Ditto.
+   (expand_omp_simd): Ditto.
+   * tree-vectorizer.c (vectorize_loops): Ditto.
+   (gate_vect_slp): Ditto.
+   (gate_increase_alignment): Ditto.
+   * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Ditto.
+   * tree-ssa-pre.c (inhibit_phi_insertion): Ditto.
+   * tree-ssa-loop.c (gate_tree_vectorize): Ditto.
+   (gate_tree_vectorize): Name change.
+   (tree_vectorize): Ditto.
+   (pass_vectorize::gate): Call new function.
+   (pass_vectorize::execute): Ditto.
+   opts.c: O3 default setting change.
+   (finish_options): Check new flag.
+   * doc/invoke.texi: Document new flags.
+   * common.opt: New flags.
+
+
 2013-09-10  Richard Earnshaw  
 
PR target/58361
Index: doc/invoke.texi
===
--- doc/invoke.texi (revision 202481)
+++ doc/invoke.texi (working copy)
@@ -419,10 +419,12 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-loop-if-convert-stores -ftree-loop-im @gol
 -ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns @gol
 -ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize @gol
+-ftree-loop-vectorize @gol
 -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol
 -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
--ftree-switch-conversion -ftree-tail-merge @gol
--ftree-ter -ftree-vect-loop-version -ftree-vectorize -ftree-vrp @gol
+-ftree-switch-conversion -ftree-tail-merge -ftree-ter @gol
+-ftree-vect-loop-version -ftree-vect-loop-peeling -ftree-vectorize @gol
+-ftree-vrp @gol
 -funit-at-a-time -funroll-all-loops -funroll-loops @gol
 -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
 -fvariable-expansion-in-unroller -fvect-cost-model -fvpt -fweb @gol
@@ -6748,8 +6750,8 @@ invoking @option{-O2} on programs that u
 Optimize yet more.  @option{-O3} turns on all optimizations specified
 by @option{-O2} and also turns on the @option{-finline-functions},
 @option{-funswitch-loops}, @option{-fpredictive-commoning},
-@option{-fgcse-after-reload}, @option{-ftree-vectorize},
-@option{-fvect-cost-model},
+@option{-fgcse-after-reload}, @option{-ftree-loop-vectorize},
+@option{-ft

Re: [Google] Fix test failure after porting __gcov_get_profile_prefix from google/4_7

2013-09-12 Thread Xinliang David Li
When absolute path is specified for the object file, no prefix will be
prepended to the gcda path. If you record the cwd as in the
_gcov_profile_prefix variable, at profile dump time, the prefix will
be wrong -- as it is never used.

David

On Thu, Sep 12, 2013 at 2:07 PM, Teresa Johnson  wrote:
> On Thu, Sep 12, 2013 at 1:20 PM, Xinliang David Li  wrote:
>> On Thu, Sep 12, 2013 at 1:06 PM, Teresa Johnson  wrote:
>>> After porting r198033 from google/4_7 to google/4_8 a test case failed
>>> with an assert when trying to take the strlen of profile_data_prefix.
>>>
>>> In most cases this is either set from the directory specified to
>>> -fprofile-generate=, or to getpwd when a directory is not specified.
>>> However, the exception is when no directory is specified for
>>> -fprofile-generate and -auxbase-strip option is used with the absolute
>>> pathname. In that case the code does not set profile_data_prefix since
>>> the filenames already have the full path.
>>>
>>> In the code that sets __gcov_get_profile_prefix, the fix is to simply
>>> check if profile_data_prefix is still NULL, and if so just set via
>>> getpwd.
>>
>> Why setting it to getpwd() val? Should it be set to null instead?
>
> The specified behavior when no path is given to -fprofile-generate (or
> -fprofile-dir) is to use the current directory.
>
> The case where this was happening was an lto test case, where lto1 was
> first run in WPA (-fwpa) mode and was emitting the ltrans output to a
> /tmp/ path (-fltrans-output-list=/tmp/cciR1m1o.ltrans.out). Then lto1
> was run again in LTRANS mode (-fltrans) with -auxbase-strip
> /tmp/cciR1m1o.ltrans0.ltrans.o, triggering the problem.
>
> Teresa
>
>>
>> David
>>
>>>
>>> Passes regression tests and failure I reproduced. Ok for google branches?
>>>
>>> Thanks,
>>> Teresa
>>>
>>> 2013-09-12  Teresa Johnson  
>>>
>>> * tree-profile.c (tree_init_instrumentation): Handle the case
>>> where profile_data_prefix is NULL.
>>>
>>> Index: tree-profile.c
>>> ===
>>> --- tree-profile.c (revision 202500)
>>> +++ tree-profile.c (working copy)
>>> @@ -470,8 +470,11 @@ tree_init_instrumentation (void)
>>>DECL_ASSEMBLER_NAME (gcov_profile_prefix_decl));
>>>TREE_STATIC (gcov_profile_prefix_decl) = 1;
>>>
>>> -  prefix_len = strlen (profile_data_prefix);
>>> -  prefix_string = build_string (prefix_len + 1, profile_data_prefix);
>>> +  const char *prefix = profile_data_prefix;
>>> +  if (!prefix)
>>> +prefix = getpwd ();
>>> +  prefix_len = strlen (prefix);
>>> +  prefix_string = build_string (prefix_len + 1, prefix);
>>>TREE_TYPE (prefix_string) = build_array_type
>>>(char_type_node, build_index_type
>>> (build_int_cst (NULL_TREE, prefix_len)));
>>>
>>>
>>> --
>>> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: [Google] Fix test failure after porting __gcov_get_profile_prefix from google/4_7

2013-09-13 Thread Xinliang David Li
Ok.

David

On Fri, Sep 13, 2013 at 7:21 AM, Teresa Johnson  wrote:
> Testing passes, is the below patch ok for google/4_8?
> Thanks, Teresa
>
> On Thu, Sep 12, 2013 at 10:18 PM, Teresa Johnson  wrote:
>> On Thu, Sep 12, 2013 at 2:32 PM, Xinliang David Li  
>> wrote:
>>> When absolute path is specified for the object file, no prefix will be
>>> prepended to the gcda path. If you record the cwd as in the
>>> _gcov_profile_prefix variable, at profile dump time, the prefix will
>>> be wrong -- as it is never used.
>>
>> Yes I think I agree with you now.
>>
>> Basically, for non-lto compilations, you get the following:
>> -fprofile-generate={path}
>>-> no auxbase-strip and profile_data_prefix={path}
>> -fprofile-generate -o relative/path/to/file.o
>>-> no auxbase-strip and profile_data_prefix=getpwd()
>> -fprofile-generate -o /absolute/path/to/file.o
>>-> auxbase-strip /absolute/path/to/file.o and profile_data_prefix=NULL
>>
>> But with -flto and -fprofile-generate -o relative/path/to/file.o
>>-> auxbase-strip /tmp/file.ltrans.out and profile_data_prefix=NULL
>>
>> In the LTO case the gcda files will go into cwd, but not in the case
>> just above where the absolute path is given to the object file.
>> However, for our purposes we rely on the path being specified to
>> -fprofile-generate={path} in places where we query
>> __gcov_profile_prefix in order to find the dump directory. Therefore,
>> I think it is best to simply record a NULL string as the
>> profile_data_prefix value in all cases where profile_data_prefix=NULL.
>>
>> Here is the patch I am regression testing:
>>
>> Index: tree-profile.c
>> ===
>> --- tree-profile.c (revision 202500)
>> +++ tree-profile.c (working copy)
>> @@ -470,8 +470,15 @@ tree_init_instrumentation (void)
>>DECL_ASSEMBLER_NAME (gcov_profile_prefix_decl));
>>TREE_STATIC (gcov_profile_prefix_decl) = 1;
>>
>> -  prefix_len = strlen (profile_data_prefix);
>> -  prefix_string = build_string (prefix_len + 1, profile_data_prefix);
>> +  const char null_prefix[] = "\0";
>> +  const char *prefix = null_prefix;
>> +  prefix_len = 0;
>> +  if (profile_data_prefix)
>> +{
>> +  prefix_len = strlen (profile_data_prefix);
>> +  prefix = profile_data_prefix;
>> +}
>> +  prefix_string = build_string (prefix_len + 1, prefix);
>>TREE_TYPE (prefix_string) = build_array_type
>>(char_type_node, build_index_type
>> (build_int_cst (NULL_TREE, prefix_len)));
>>
>>>
>>> David
>>>
>>> On Thu, Sep 12, 2013 at 2:07 PM, Teresa Johnson  
>>> wrote:
>>>> On Thu, Sep 12, 2013 at 1:20 PM, Xinliang David Li  
>>>> wrote:
>>>>> On Thu, Sep 12, 2013 at 1:06 PM, Teresa Johnson  
>>>>> wrote:
>>>>>> After porting r198033 from google/4_7 to google/4_8 a test case failed
>>>>>> with an assert when trying to take the strlen of profile_data_prefix.
>>>>>>
>>>>>> In most cases this is either set from the directory specified to
>>>>>> -fprofile-generate=, or to getpwd when a directory is not specified.
>>>>>> However, the exception is when no directory is specified for
>>>>>> -fprofile-generate and -auxbase-strip option is used with the absolute
>>>>>> pathname. In that case the code does not set profile_data_prefix since
>>>>>> the filenames already have the full path.
>>>>>>
>>>>>> In the code that sets __gcov_get_profile_prefix, the fix is to simply
>>>>>> check if profile_data_prefix is still NULL, and if so just set via
>>>>>> getpwd.
>>>>>
>>>>> Why setting it to getpwd() val? Should it be set to null instead?
>>>>
>>>> The specified behavior when no path is given to -fprofile-generate (or
>>>> -fprofile-dir) is to use the current directory.
>>>>
>>>> The case where this was happening was an lto test case, where lto1 was
>>>> first run in WPA (-fwpa) mode and was emitting the ltrans output to a
>>>> /tmp/ path (-fltrans-output-list=/tmp/cciR1m1o.ltrans.out). Then lto1
>>>> was run again in LTRANS mode (-fltrans) with -auxbase-strip
>>>> /tmp/cciR1m1o.ltrans0.ltrans.o, triggering the problem.
&

Re: New GCC options for loop vectorization

2013-09-13 Thread Xinliang David Li
On Fri, Sep 13, 2013 at 1:30 AM, Richard Biener
 wrote:
> On Thu, Sep 12, 2013 at 10:31 PM, Xinliang David Li  
> wrote:
>> Currently -ftree-vectorize turns on both loop and slp vectorizations,
>> but there is no simple way to turn on loop vectorization alone. The
>> logic for default O3 setting is also complicated.
>>
>> In this patch, two new options are introduced:
>>
>> 1) -ftree-loop-vectorize
>>
>> This option is used to turn on loop vectorization only. option
>> -ftree-slp-vectorize also becomes a first class citizen, and no funny
>> business of Init(2) is needed.  With this change, -ftree-vectorize
>> becomes a simple alias to -ftree-loop-vectorize +
>> -ftree-slp-vectorize.
>>
>> For instance, to turn on only slp vectorize at O3, the old way is:
>>
>>  -O3 -fno-tree-vectorize -ftree-slp-vectorize
>>
>> With the new change it becomes:
>>
>> -O3 -fno-tree-loop-vectorize
>>
>>
>> To turn on only loop vectorize at O2, the old way is
>>
>> -O2 -ftree-vectorize -fno-tree-slp-vectorize
>>
>> The new way is
>>
>> -O2 -ftree-loop-vectorize
>>
>>
>>
>> 2) -ftree-vect-loop-peeling
>>
>> This option is used to turn on/off loop peeling for alignment.  In the
>> long run, this should be folded into the cheap cost model proposed by
>> Richard.  This option is also useful in scenarios where peeling can
>> introduce runtime problems:
>> http://gcc.gnu.org/ml/gcc/2005-12/msg00390.html  which happens to be
>> common in practice.
>>
>>
>>
>> Patch attached. Compiler boostrapped. Ok after testing?
>
> I'd like you to split 1) and 2), mainly because I agree on 1) but not on 2).

Ok. Can you also comment on 2) ?

>
> I've stopped a quick try doing 1) myself because
>
> @@ -1691,6 +1695,12 @@ common_handle_option (struct gcc_options
>  opts->x_flag_ipa_reference = false;
>break;
>
> +case OPT_ftree_vectorize:
> +  if (!opts_set->x_flag_tree_loop_vectorize)
> + opts->x_flag_tree_loop_vectorize = value;
> +  if (!opts_set->x_flag_tree_slp_vectorize)
> + opts->x_flag_tree_slp_vectorize = value;
> +  break;
>
> doesn't look obviously correct.  Does that handle
>
>   -ftree-vectorize -fno-tree-loop-vectorize -ftree-vectorize
>
> or
>
>   -ftree-loop-vectorize -fno-tree-vectorize
>
> properly?  Currently at least
>
>   -ftree-slp-vectorize -fno-tree-vectorize
>
> doesn't "work".


Right -- same is true for -fprofile-use option. FDO enables some
passes, but can not re-enable them if they are flipped off before.

>
> That said, the option machinery doesn't handle an option being an alias
> for two other options, so it's mechanism to contract positives/negatives
> doesn't work here and the override hooks do not work reliably for
> repeated options.
>
> Or am I wrong here?  Should we care at all?  Joseph?

We should probably just document the behavior. Even better, we should
deprecate the old option.

thanks,

David

>
> Thanks,
> Richard.
>
>>
>> thanks,
>>
>> David


Re: New GCC options for loop vectorization

2013-09-13 Thread Xinliang David Li
New patch attached.

1) the peeling part is removed
2) the new patch implements the last-one-wins logic. -ftree-vectorize
behaves like a true alias. -fno-tree-vectorize can override previous
-ftree-xxx-vectorize.

Ok for trunk after testing?

thanks,

David

On Fri, Sep 13, 2013 at 8:16 AM, Xinliang David Li  wrote:
> On Fri, Sep 13, 2013 at 1:30 AM, Richard Biener
>  wrote:
>> On Thu, Sep 12, 2013 at 10:31 PM, Xinliang David Li  
>> wrote:
>>> Currently -ftree-vectorize turns on both loop and slp vectorizations,
>>> but there is no simple way to turn on loop vectorization alone. The
>>> logic for default O3 setting is also complicated.
>>>
>>> In this patch, two new options are introduced:
>>>
>>> 1) -ftree-loop-vectorize
>>>
>>> This option is used to turn on loop vectorization only. option
>>> -ftree-slp-vectorize also becomes a first class citizen, and no funny
>>> business of Init(2) is needed.  With this change, -ftree-vectorize
>>> becomes a simple alias to -ftree-loop-vectorize +
>>> -ftree-slp-vectorize.
>>>
>>> For instance, to turn on only slp vectorize at O3, the old way is:
>>>
>>>  -O3 -fno-tree-vectorize -ftree-slp-vectorize
>>>
>>> With the new change it becomes:
>>>
>>> -O3 -fno-tree-loop-vectorize
>>>
>>>
>>> To turn on only loop vectorize at O2, the old way is
>>>
>>> -O2 -ftree-vectorize -fno-tree-slp-vectorize
>>>
>>> The new way is
>>>
>>> -O2 -ftree-loop-vectorize
>>>
>>>
>>>
>>> 2) -ftree-vect-loop-peeling
>>>
>>> This option is used to turn on/off loop peeling for alignment.  In the
>>> long run, this should be folded into the cheap cost model proposed by
>>> Richard.  This option is also useful in scenarios where peeling can
>>> introduce runtime problems:
>>> http://gcc.gnu.org/ml/gcc/2005-12/msg00390.html  which happens to be
>>> common in practice.
>>>
>>>
>>>
>>> Patch attached. Compiler bootstrapped. Ok after testing?
>>
>> I'd like you to split 1) and 2), mainly because I agree on 1) but not on 2).
>
> Ok. Can you also comment on 2) ?
>
>>
>> I've stopped a quick try doing 1) myself because
>>
>> @@ -1691,6 +1695,12 @@ common_handle_option (struct gcc_options
>>  opts->x_flag_ipa_reference = false;
>>break;
>>
>> +case OPT_ftree_vectorize:
>> +  if (!opts_set->x_flag_tree_loop_vectorize)
>> + opts->x_flag_tree_loop_vectorize = value;
>> +  if (!opts_set->x_flag_tree_slp_vectorize)
>> + opts->x_flag_tree_slp_vectorize = value;
>> +  break;
>>
>> doesn't look obviously correct.  Does that handle
>>
>>   -ftree-vectorize -fno-tree-loop-vectorize -ftree-vectorize
>>
>> or
>>
>>   -ftree-loop-vectorize -fno-tree-vectorize
>>
>> properly?  Currently at least
>>
>>   -ftree-slp-vectorize -fno-tree-vectorize
>>
>> doesn't "work".
>
>
> Right -- same is true for -fprofile-use option. FDO enables some
> passes, but can not re-enable them if they are flipped off before.
>
>>
>> That said, the option machinery doesn't handle an option being an alias
>> for two other options, so it's mechanism to contract positives/negatives
>> doesn't work here and the override hooks do not work reliably for
>> repeated options.
>>
>> Or am I wrong here?  Should we care at all?  Joseph?
>
> We should probably just document the behavior. Even better, we should
> deprecate the old option.
>
> thanks,
>
> David
>
>>
>> Thanks,
>> Richard.
>>
>>>
>>> thanks,
>>>
>>> David
Index: ChangeLog
===
--- ChangeLog   (revision 202540)
+++ ChangeLog   (working copy)
@@ -1,3 +1,22 @@
+2013-09-12  Xinliang David Li  
+
+   * tree-if-conv.c (main_tree_if_conversion): Check new flag.
+   * omp-low.c (omp_max_vf): Ditto.
+   (expand_omp_simd): Ditto.
+   * tree-vectorizer.c (vectorize_loops): Ditto.
+   (gate_vect_slp): Ditto.
+   (gate_increase_alignment): Ditto.
+   * tree-ssa-pre.c (inhibit_phi_insertion): Ditto.
+   * tree-ssa-loop.c (gate_tree_vectorize): Ditto.
+   (gate_tree_vectorize): Name change.
+   (tree_vectorize): Ditto.
+   (pass_vectorize::gate): Call new function.
+   (pass_vectorize::execute): Ditto.
+   opts.c: O3 default setting change.
+   (finis

Re: New GCC options for loop vectorization

2013-09-13 Thread Xinliang David Li
Ok -- then my updated patch is wrong then. The implementation in the
first version matches the requirement.

thanks,

David


On Fri, Sep 13, 2013 at 9:45 AM, Joseph S. Myers
 wrote:
> On Fri, 13 Sep 2013, Richard Biener wrote:
>
>> @@ -1691,6 +1695,12 @@ common_handle_option (struct gcc_options
>>  opts->x_flag_ipa_reference = false;
>>break;
>>
>> +case OPT_ftree_vectorize:
>> +  if (!opts_set->x_flag_tree_loop_vectorize)
>> + opts->x_flag_tree_loop_vectorize = value;
>> +  if (!opts_set->x_flag_tree_slp_vectorize)
>> + opts->x_flag_tree_slp_vectorize = value;
>> +  break;
>>
>> doesn't look obviously correct.  Does that handle
>
> It looks right to me.  The general principle is that the more specific
> option takes precedence over the less specific one, whatever the order on
> the command line.
>
>>   -ftree-vectorize -fno-tree-loop-vectorize -ftree-vectorize
>
> Should mean -ftree-slp-vectorize.
>
>>   -ftree-loop-vectorize -fno-tree-vectorize
>
> Should mean -ftree-loop-vectorize.
>
>>   -ftree-slp-vectorize -fno-tree-vectorize
>
> Should mean -ftree-slp-vectorize.
>
> --
> Joseph S. Myers
> jos...@codesourcery.com


Re: New GCC options for loop vectorization

2013-09-13 Thread Xinliang David Li
Updated patch implementing the logic that more specific option wins.

Ok for trunk?

thanks,

David

On Fri, Sep 13, 2013 at 9:48 AM, Xinliang David Li  wrote:
> Ok -- then my updated patch is wrong then. The implementation in the
> first version matches the requirement.
>
> thanks,
>
> David
>
>
> On Fri, Sep 13, 2013 at 9:45 AM, Joseph S. Myers
>  wrote:
>> On Fri, 13 Sep 2013, Richard Biener wrote:
>>
>>> @@ -1691,6 +1695,12 @@ common_handle_option (struct gcc_options
>>>  opts->x_flag_ipa_reference = false;
>>>break;
>>>
>>> +case OPT_ftree_vectorize:
>>> +  if (!opts_set->x_flag_tree_loop_vectorize)
>>> + opts->x_flag_tree_loop_vectorize = value;
>>> +  if (!opts_set->x_flag_tree_slp_vectorize)
>>> + opts->x_flag_tree_slp_vectorize = value;
>>> +  break;
>>>
>>> doesn't look obviously correct.  Does that handle
>>
>> It looks right to me.  The general principle is that the more specific
>> option takes precedence over the less specific one, whatever the order on
>> the command line.
>>
>>>   -ftree-vectorize -fno-tree-loop-vectorize -ftree-vectorize
>>
>> Should mean -ftree-slp-vectorize.
>>
>>>   -ftree-loop-vectorize -fno-tree-vectorize
>>
>> Should mean -ftree-loop-vectorize.
>>
>>>   -ftree-slp-vectorize -fno-tree-vectorize
>>
>> Should mean -ftree-slp-vectorize.
>>
>> --
>> Joseph S. Myers
>> jos...@codesourcery.com
Index: omp-low.c
===
--- omp-low.c   (revision 202540)
+++ omp-low.c   (working copy)
@@ -2305,8 +2305,8 @@ omp_max_vf (void)
 {
   if (!optimize
   || optimize_debug
-  || (!flag_tree_vectorize
- && global_options_set.x_flag_tree_vectorize))
+  || (!flag_tree_loop_vectorize
+ && global_options_set.x_flag_tree_loop_vectorize))
 return 1;
 
   int vs = targetm.vectorize.autovectorize_vector_sizes ();
@@ -5684,10 +5684,10 @@ expand_omp_simd (struct omp_region *regi
  loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
  cfun->has_simduid_loops = true;
}
-  /* If not -fno-tree-vectorize, hint that we want to vectorize
+  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
 the loop.  */
-  if ((flag_tree_vectorize
-  || !global_options_set.x_flag_tree_vectorize)
+  if ((flag_tree_loop_vectorize
+  || !global_options_set.x_flag_tree_loop_vectorize)
  && loop->safelen > 1)
{
  loop->force_vect = true;
Index: ChangeLog
===
--- ChangeLog   (revision 202540)
+++ ChangeLog   (working copy)
@@ -1,3 +1,22 @@
+2013-09-12  Xinliang David Li  
+
+   * tree-if-conv.c (main_tree_if_conversion): Check new flag.
+   * omp-low.c (omp_max_vf): Ditto.
+   (expand_omp_simd): Ditto.
+   * tree-vectorizer.c (vectorize_loops): Ditto.
+   (gate_vect_slp): Ditto.
+   (gate_increase_alignment): Ditto.
+   * tree-ssa-pre.c (inhibit_phi_insertion): Ditto.
+   * tree-ssa-loop.c (gate_tree_vectorize): Ditto.
+   (gate_tree_vectorize): Name change.
+   (tree_vectorize): Ditto.
+   (pass_vectorize::gate): Call new function.
+   (pass_vectorize::execute): Ditto.
+   opts.c: O3 default setting change.
+   (finish_options): Check new flag.
+   * doc/invoke.texi: Document new flags.
+   * common.opt: New flags.
+
 2013-09-12  Vladimir Makarov  
 
PR middle-end/58335
Index: opts.c
===
--- opts.c  (revision 202540)
+++ opts.c  (working copy)
@@ -498,7 +498,8 @@ static const struct default_options defa
 { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 
},
 { OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 },
 { OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 },
-{ OPT_LEVELS_3_PLUS, OPT_ftree_vectorize, NULL, 1 },
+{ OPT_LEVELS_3_PLUS, OPT_ftree_loop_vectorize, NULL, 1 },
+{ OPT_LEVELS_3_PLUS, OPT_ftree_slp_vectorize, NULL, 1 },
 { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model, NULL, 1 },
 { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 },
 { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 },
@@ -826,7 +827,8 @@ finish_options (struct gcc_options *opts
 
   /* Set PARAM_MAX_STORES_TO_SINK to 0 if either vectorization or if-conversion
  is disabled.  */
-  if (!opts->x_flag_tree_vectorize || !opts->x_flag_tree_loop_if_convert)
+  if ((!opts->x_flag_tree_loop_vectorize && !opts->x_flag_tree_slp_vectorize)
+   || !opts->x_flag_tree_loop_if_convert)
 maybe_set_param_va

tree if convert pass control

2013-09-13 Thread Xinliang David Li
tree if conversion is an enabler pass for vectorization, so by
default, it is only turned on when vectorization is on, but may also
depend on the optimization level. Currently, the logic to handle this
is in the gate function which become hard to understand and extend.

The proposed patch move the logic from the gate function to
'finish_option' which is much clearer. The downside of this patch is
that function specific optimization node needs to be created for some
cases during omp-lowering.

Comments?


thanks,

David
Index: opts.c
===
--- opts.c  (revision 202229)
+++ opts.c  (working copy)
@@ -824,6 +824,12 @@ finish_options (struct gcc_options *opts
}
 }
 
+  if (!opts_set->x_flag_tree_loop_if_convert)
+opts->x_flag_tree_loop_if_convert = opts->x_flag_tree_vectorize;
+
+  if (!opts_set->x_flag_tree_loop_if_convert_stores)
+opts->x_flag_tree_loop_if_convert_stores = opts->x_flag_tree_vectorize;
+
   /* Set PARAM_MAX_STORES_TO_SINK to 0 if either vectorization or if-conversion
  is disabled.  */
   if (!opts->x_flag_tree_vectorize || !opts->x_flag_tree_loop_if_convert)
Index: omp-low.c
===
--- omp-low.c   (revision 202229)
+++ omp-low.c   (working copy)
@@ -5690,8 +5690,36 @@ expand_omp_simd (struct omp_region *regi
   || !global_options_set.x_flag_tree_vectorize)
  && loop->safelen > 1)
{
+  bool need_to_turn_on_if_cvt = false;
+  bool need_to_turn_on_if_cvt_stores = false;
+
  loop->force_vect = true;
  cfun->has_force_vect_loops = true;
+
+  /* Now check if it is needed to turn on tree-if-convert.  */
+
+  if (!global_options_set.x_flag_tree_loop_if_convert
+  && !flag_tree_loop_if_convert)
+need_to_turn_on_if_cvt = true;
+
+  if (!global_options_set.x_flag_tree_loop_if_convert_stores
+  && !flag_tree_loop_if_convert_stores)
+need_to_turn_on_if_cvt_stores = true;
+
+  if (need_to_turn_on_if_cvt || need_to_turn_on_if_cvt_stores)
+{
+  if (need_to_turn_on_if_cvt)
+global_options.x_flag_tree_loop_if_convert = 1;
+  if (need_to_turn_on_if_cvt_stores)
+global_options.x_flag_tree_loop_if_convert_stores = 1;
+  DECL_FUNCTION_SPECIFIC_OPTIMIZATION (current_function_decl)
+  = build_optimization_node ();
+  /* restore  */
+  if (need_to_turn_on_if_cvt)
+global_options.x_flag_tree_loop_if_convert = 0;
+  if (need_to_turn_on_if_cvt_stores)
+global_options.x_flag_tree_loop_if_convert_stores = 0;
+}
}
 }
 }
Index: ChangeLog
===
--- ChangeLog   (revision 202229)
+++ ChangeLog   (working copy)
@@ -1,3 +1,12 @@
+2013-09-13  Xinliang David Li  
+   
+   * omp-low.c (expand_omp_simd): Build optimization cl
+   node if necessary.
+   * opts.c (finish_option): Enable/Disable tree if-cvt
+   if not explicitly set.
+   * tree-if-conv.c (gate_tree_if_conversion): Simplify.
+
+
 2013-08-29  Xinliang David Li  
 
* tree-vect-slp.c (destroy_bb_vec_info): Data ref cleanup.
Index: common.opt
===
--- common.opt  (revision 202229)
+++ common.opt  (working copy)
@@ -1308,7 +1308,7 @@ EnumValue
 Enum(stack_reuse_level) String(none) Value(SR_NONE)
 
 ftree-loop-if-convert
-Common Report Var(flag_tree_loop_if_convert) Init(-1) Optimization
+Common Report Var(flag_tree_loop_if_convert) Optimization
 Convert conditional jumps in innermost loops to branchless equivalents
 
 ftree-loop-if-convert-stores
Index: tree-if-conv.c
===
--- tree-if-conv.c  (revision 202229)
+++ tree-if-conv.c  (working copy)
@@ -1815,10 +1815,8 @@ main_tree_if_conversion (void)
 static bool
 gate_tree_if_conversion (void)
 {
-  return (((flag_tree_vectorize || cfun->has_force_vect_loops)
-  && flag_tree_loop_if_convert != 0)
- || flag_tree_loop_if_convert == 1
- || flag_tree_loop_if_convert_stores == 1);
+  return (flag_tree_loop_if_convert
+ || flag_tree_loop_if_convert_stores);
 }
 
 namespace {


Re: Dump framework newline cleanup

2013-09-16 Thread Xinliang David Li
I noticed there are a couple of dump_printf_loc instances in
coverage.c not ended with newline. They should be fixed.

David

On Tue, Sep 10, 2013 at 6:32 AM, Teresa Johnson  wrote:
> On Mon, Sep 9, 2013 at 9:55 PM, Xinliang David Li  wrote:
>> looks fine to me.
>>
>> In the long run, I wonder if the machinery in diagnostic messages can
>> be reused for opt-info dumping -- i.e., support different streams. It
>> has many nice features including %qD specifier for printing tree
>> decls.
>
> Yes, this would have some advantages such as getting the function name 
> emitted.
>
> Teresa
>
>>
>> David
>>
>> On Mon, Sep 9, 2013 at 12:01 PM, Teresa Johnson  wrote:
>>> I've attached a patch that implements the cleanup of newline emission
>>> by the new dump framework as discussed here:
>>>
>>> http://gcc.gnu.org/ml/gcc-patches/2013-08/msg01779.html
>>>
>>> Essentially, I have removed the leading newline emission from
>>> dump_loc, and updated dump_printf_loc invocations to emit a trailing
>>> newline as necessary. This will remove unnecessary vertical space in
>>> the dump output.
>>>
>>> I did not do any other cleanup of the existing vectorization messages
>>> - there are IMO a lot of messages being emitted by the vectorizer
>>> under MSG_NOTE (and probably MSG_MISSED_OPTIMIZATION) that should only
>>> be emitted to the dump file under -fdump-tree-... and not emitted
>>> under -fopt-info-all. The ones that stay under -fopt-info-all need
>>> some formatting/style cleanup. Leaving that for follow-on work.
>>>
>>> Bootstrapped and tested on x86-64-unknown-linux-gnu. Ok for trunk?
>>>
>>> Thanks,
>>> Teresa
>>>
>>> --
>>> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: Dump framework newline cleanup

2013-09-16 Thread Xinliang David Li
Looks like there is one missing spot:

@@ -349,7 +349,7 @@ get_coverage_counts (unsigned counter, u
  (flag_guess_branch_prob
   ? "file %s not found, execution counts estimated"
   : "file %s not found, execution counts assumed to "
-"be zero"),
+"be zero\n"),
  da_file_name);
   return NULL;


I found this when testing interaction of -fprofile-use and
-fno-tree-vectorize without a profile.

thanks,

David


On Mon, Sep 16, 2013 at 11:06 AM, Teresa Johnson  wrote:
> On Mon, Sep 16, 2013 at 10:57 AM, Xinliang David Li  
> wrote:
>> I noticed there are a couple of dump_printf_loc instances in
>> coverage.c not ended with newline. They should be fixed.
>
> I committed this change this morning as r202628. I believe I fixed all
> the dump_printf_loc calls (just double-checked). Can you let me know
> if you see anymore after you update to this revision?
>
> Thanks,
> Teresa
>
>>
>> David
>>
>> On Tue, Sep 10, 2013 at 6:32 AM, Teresa Johnson  wrote:
>>> On Mon, Sep 9, 2013 at 9:55 PM, Xinliang David Li  
>>> wrote:
>>>> looks fine to me.
>>>>
>>>> In the long run, I wonder if the machinery in diagnostic messages can
>>>> be reused for opt-info dumping -- i.e., support different streams. It
>>>> has many nice features including %qD specifier for printing tree
>>>> decls.
>>>
>>> Yes, this would have some advantages such as getting the function name 
>>> emitted.
>>>
>>> Teresa
>>>
>>>>
>>>> David
>>>>
>>>> On Mon, Sep 9, 2013 at 12:01 PM, Teresa Johnson  
>>>> wrote:
>>>>> I've attached a patch that implements the cleanup of newline emission
>>>>> by the new dump framework as discussed here:
>>>>>
>>>>> http://gcc.gnu.org/ml/gcc-patches/2013-08/msg01779.html
>>>>>
>>>>> Essentially, I have removed the leading newline emission from
>>>>> dump_loc, and updated dump_printf_loc invocations to emit a trailing
>>>>> newline as necessary. This will remove unnecessary vertical space in
>>>>> the dump output.
>>>>>
>>>>> I did not do any other cleanup of the existing vectorization messages
>>>>> - there are IMO a lot of messages being emitted by the vectorizer
>>>>> under MSG_NOTE (and probably MSG_MISSED_OPTIMIZATION) that should only
>>>>> be emitted to the dump file under -fdump-tree-... and not emitted
>>>>> under -fopt-info-all. The ones that stay under -fopt-info-all need
>>>>> some formatting/style cleanup. Leaving that for follow-on work.
>>>>>
>>>>> Bootstrapped and tested on x86-64-unknown-linux-gnu. Ok for trunk?
>>>>>
>>>>> Thanks,
>>>>> Teresa
>>>>>
>>>>> --
>>>>> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413
>>>
>>>
>>>
>>> --
>>> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: New GCC options for loop vectorization

2013-09-16 Thread Xinliang David Li
I incorporated all the comments and committed the change (also fixed a
test failure with --help=optimizers).

thanks,

David

On Mon, Sep 16, 2013 at 3:07 AM, Richard Biener
 wrote:
> On Fri, Sep 13, 2013 at 6:56 PM, Xinliang David Li  wrote:
>> Updated patch implementing the logic that more specific option wins.
>>
>> Ok for trunk?
>
> @@ -2305,8 +2305,8 @@ omp_max_vf (void)
>  {
>if (!optimize
>|| optimize_debug
> -  || (!flag_tree_vectorize
> -  && global_options_set.x_flag_tree_vectorize))
> +  || (!flag_tree_loop_vectorize
> +  && global_options_set.x_flag_tree_loop_vectorize))
>  return 1;
>
> Not sure what is the intent here, but it looks like
> -fno-tree-vectorize will no longer disable this.  So it would
> need to check (global_options_set.x_flag_tree_vectorize ||
> global_options_set.x_flag_tree_loop_vectorize)?  Jakub?
>
>int vs = targetm.vectorize.autovectorize_vector_sizes ();
> @@ -5684,10 +5684,10 @@ expand_omp_simd (struct omp_region *regi
>loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
>cfun->has_simduid_loops = true;
>   }
> -  /* If not -fno-tree-vectorize, hint that we want to vectorize
> +  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
>   the loop.  */
> -  if ((flag_tree_vectorize
> -   || !global_options_set.x_flag_tree_vectorize)
> +  if ((flag_tree_loop_vectorize
> +   || !global_options_set.x_flag_tree_loop_vectorize)
>&& loop->safelen > 1)
>   {
>loop->force_vect = true;
>
> similar.
>
> -  if (!opts_set->x_flag_tree_vectorize)
> - opts->x_flag_tree_vectorize = value;
> +  if (!opts_set->x_flag_tree_loop_vectorize)
> + opts->x_flag_tree_loop_vectorize = value;
> +  if (!opts_set->x_flag_tree_slp_vectorize)
> + opts->x_flag_tree_slp_vectorize = value;
>
> similar - if I use -fprofile-use -fno-tree-vecotorize you override this 
> choice.
> This case should be wrapped in if (!opts_set->x_flag_tree_vectorize)
>
>  @item -ftree-vectorize
>  @opindex ftree-vectorize
> +Perform vectorization on trees. This flag enables
> @option{-ftree-loop-vectorize}
> +and @option{-ftree-slp-vectorize} if neither option is explicitly specified.
>
> "if neither option is explicitely specified" doesn't correctly document
> -ftree-loop-vectorize -ftree-vectorize behavior, no? (-ftree-slp-vectorize
> is still enabled here)
>
> I'm not a native speaker so I cannot suggest a clearer wording here
> but maybe just say "if not explicitely specified".
>
> Ok with the -fprofile-use change I suggested and whatever resolution Jakub
> suggests and the doc adjustment.
>
> Thanks,
> Richard.
>
>> thanks,
>>
>> David
>>
>> On Fri, Sep 13, 2013 at 9:48 AM, Xinliang David Li  
>> wrote:
>>> Ok -- then my updated patch is wrong then. The implementation in the
>>> first version matches the requirement.
>>>
>>> thanks,
>>>
>>> David
>>>
>>>
>>> On Fri, Sep 13, 2013 at 9:45 AM, Joseph S. Myers
>>>  wrote:
>>>> On Fri, 13 Sep 2013, Richard Biener wrote:
>>>>
>>>>> @@ -1691,6 +1695,12 @@ common_handle_option (struct gcc_options
>>>>>  opts->x_flag_ipa_reference = false;
>>>>>break;
>>>>>
>>>>> +case OPT_ftree_vectorize:
>>>>> +  if (!opts_set->x_flag_tree_loop_vectorize)
>>>>> + opts->x_flag_tree_loop_vectorize = value;
>>>>> +  if (!opts_set->x_flag_tree_slp_vectorize)
>>>>> + opts->x_flag_tree_slp_vectorize = value;
>>>>> +  break;
>>>>>
>>>>> doesn't look obviously correct.  Does that handle
>>>>
>>>> It looks right to me.  The general principle is that the more specific
>>>> option takes precedence over the less specific one, whatever the order on
>>>> the command line.
>>>>
>>>>>   -ftree-vectorize -fno-tree-loop-vectorize -ftree-vectorize
>>>>
>>>> Should mean -ftree-slp-vectorize.
>>>>
>>>>>   -ftree-loop-vectorize -fno-tree-vectorize
>>>>
>>>> Should mean -ftree-loop-vectorize.
>>>>
>>>>>   -ftree-slp-vectorize -fno-tree-vectorize
>>>>
>>>> Should mean -ftree-slp-vectorize.
>>>>
>>>> --
>>>> Joseph S. Myers
>>>> jos...@codesourcery.com


Re: New GCC options for loop vectorization

2013-09-16 Thread Xinliang David Li
On Mon, Sep 16, 2013 at 3:13 AM, Richard Biener
 wrote:
> On Fri, Sep 13, 2013 at 5:16 PM, Xinliang David Li  wrote:
>> On Fri, Sep 13, 2013 at 1:30 AM, Richard Biener
>>  wrote:
>>> On Thu, Sep 12, 2013 at 10:31 PM, Xinliang David Li  
>>> wrote:
>>>> Currently -ftree-vectorize turns on both loop and slp vectorizations,
>>>> but there is no simple way to turn on loop vectorization alone. The
>>>> logic for default O3 setting is also complicated.
>>>>
>>>> In this patch, two new options are introduced:
>>>>
>>>> 1) -ftree-loop-vectorize
>>>>
>>>> This option is used to turn on loop vectorization only. option
>>>> -ftree-slp-vectorize also becomes a first class citizen, and no funny
>>>> business of Init(2) is needed.  With this change, -ftree-vectorize
>>>> becomes a simple alias to -ftree-loop-vectorize +
>>>> -ftree-slp-vectorize.
>>>>
>>>> For instance, to turn on only slp vectorize at O3, the old way is:
>>>>
>>>>  -O3 -fno-tree-vectorize -ftree-slp-vectorize
>>>>
>>>> With the new change it becomes:
>>>>
>>>> -O3 -fno-tree-loop-vectorize
>>>>
>>>>
>>>> To turn on only loop vectorize at O2, the old way is
>>>>
>>>> -O2 -ftree-vectorize -fno-tree-slp-vectorize
>>>>
>>>> The new way is
>>>>
>>>> -O2 -ftree-loop-vectorize
>>>>
>>>>
>>>>
>>>> 2) -ftree-vect-loop-peeling
>>>>
>>>> This option is used to turn on/off loop peeling for alignment.  In the
>>>> long run, this should be folded into the cheap cost model proposed by
>>>> Richard.  This option is also useful in scenarios where peeling can
>>>> introduce runtime problems:
>>>> http://gcc.gnu.org/ml/gcc/2005-12/msg00390.html  which happens to be
>>>> common in practice.
>>>>
>>>>
>>>>
>>>> Patch attached. Compiler bootstrapped. Ok after testing?
>>>
>>> I'd like you to split 1) and 2), mainly because I agree on 1) but not on 2).
>>
>> Ok. Can you also comment on 2) ?
>
> I think we want to decide how granular we want to control the vectorizer
> and using which mechanism.  My cost-model re-org makes
> ftree-vect-loop-version a no-op (basically removes it), so 2) looks like
> a step backwards in this context.

Using cost model to do a coarse grain control/configuration is
certainly something we want, but having a fine grain control is still
useful.

>
> So, can you summarize what pieces (including versioning) of the vectorizer
> you'd want to be able to disable separately?

Loop peeling seems to be the main one. There is also a correctness
issue related. For instance, the following code is common in practice,
but loop peeling wrongly assumes initial base-alignment and generates
aligned mov instruction after peeling, leading to SEGV.  Peeling is
not something we can blindly turned on -- even when it is on, there
should be a way to turn it off explicitly:

char a[1];

void foo(int n)
{
  int* b = (int*)(a+n);
  int i = 0;
  for (; i < 1000; ++i)
b[i] = 1;
}

int main(int argn, char** argv)
{
  foo(argn);
}



>  Just disabling peeling for
> alignment may get you into the versioning for alignment path (and thus
> an unvectorized loop at runtime).

This is not true for target supporting mis-aligned access. I have not
seen a case where alignment-driven loop versioning happens on x86.

>Also it's know that the alignment peeling
> code needs some serious TLC (it's outcome depends on the order of DRs,
> the cost model it uses leaves to be desired as we cannot distinguish
> between unaligned load and store costs).

Yet another reason to turn it off as it is not effective anyways?


thanks,

David

>
> Richard.
>
>>>
>>> I've stopped a quick try doing 1) myself because
>>>
>>> @@ -1691,6 +1695,12 @@ common_handle_option (struct gcc_options
>>>  opts->x_flag_ipa_reference = false;
>>>break;
>>>
>>> +case OPT_ftree_vectorize:
>>> +  if (!opts_set->x_flag_tree_loop_vectorize)
>>> + opts->x_flag_tree_loop_vectorize = value;
>>> +  if (!opts_set->x_flag_tree_slp_vectorize)
>>> + opts->x_flag_tree_slp_vectorize = value;
>>> +  break;
>>>
>>> doesn't look obviously correct.  Does that handle
>>>
>>>   -ftree-vectorize -fno-tree-loop-vectorize -ftree-vectorize
>>>
>>> or
>>>
>>>   -ftree-loop-vectorize -fno-tree-vectorize
>>>
>>> properly?  Currently at least
>>>
>>>   -ftree-slp-vectorize -fno-tree-vectorize
>>>
>>> doesn't "work".
>>
>>
>> Right -- same is true for -fprofile-use option. FDO enables some
>> passes, but can not re-enable them if they are flipped off before.
>>
>>>
>>> That said, the option machinery doesn't handle an option being an alias
>>> for two other options, so it's mechanism to contract positives/negatives
>>> doesn't work here and the override hooks do not work reliably for
>>> repeated options.
>>>
>>> Or am I wrong here?  Should we care at all?  Joseph?
>>
>> We should probably just document the behavior. Even better, we should
>> deprecate the old option.
>>
>> thanks,
>>
>> David
>>
>>>
>>> Thanks,
>>> Richard.
>>>
>>>>
>>>> thanks,
>>>>
>>>> David


Re: tree if convert pass control

2013-09-16 Thread Xinliang David Li
Ok -- abandon the patch.

On Mon, Sep 16, 2013 at 2:04 AM, Richard Biener
 wrote:
> On Sat, Sep 14, 2013 at 8:10 AM, Xinliang David Li  wrote:
>> tree if conversion is an enabler pass for vectorization, so by
>> default, it is only turned on when vectorization is on, but may also
>> depend on the optimization level. Currently, the logic to handle this
>> is in the gate function which become hard to understand and extend.
>>
>> The proposed patch move the logic from the gate function to
>> 'finish_option' which is much clearer. The downside of this patch is
>> that function specific optimization node needs to be created for some
>> cases during omp-lowering.
>
> Something I don't like.  What's the issue with checking the
> has_force_vect_loops flag?
>
> How's the argument that the gate is hard to extend?  Wouldn't
> extending it complicate it again and thus make it hard to understand again?

The gating logic is:

1) the pass is a standalone pass that can be turned on independently
of loop vectorizer;
2) the pass can be implicitly turned on by loop vectorizer, and
possibly depending on the optimization level.

The logic is not complicated, but it would be simpler to handle in
the option processing if there was no need for the 'force_vect_loop'
business.

>
> That said, given that doing things in finish_options () is discouraged
> the patch looks like a step backwards.

Doing it in finish_options can avoid duplicating the handling in
multiple places.

>
> So, can you explain the underlying rationale?
>
> Btw, if we think of if-conversion as tied to loop vectorization then we can
> guard it by it and make a new container pass like
>
>   NEXT_PASS (pass_loop_vectorizer);
>   PUSH_INSERT_PASSES_WITHIN (pass_loop_vectorizer)
>   NEXT_PASS (pass_if_conversion);
>   NEXT_PASS (pass_vectorize);
>   NEXT_PASS (pass_dce_loop);
>   POP_INSERT_PASSES ()
>
> and guard pass_loop_vectorizer by flag_tree_loop_vectorize, defaulting
> if-conversion to be enabled (but allow disabling it manually).
>

This will make it hard to turn on the pass independently.

Anyway, there seems no need for the patch.

thanks,

David


> Or take the step and move it under control of the vectorizer itself.
>
> Richard.
>
>> Comments?
>>
>>
>> thanks,
>>
>> David


Re: New GCC options for loop vectorization

2013-09-17 Thread Xinliang David Li
On Tue, Sep 17, 2013 at 1:20 AM, Richard Biener
 wrote:
> On Mon, Sep 16, 2013 at 10:24 PM, Xinliang David Li  
> wrote:
>> On Mon, Sep 16, 2013 at 3:13 AM, Richard Biener
>>  wrote:
>>> On Fri, Sep 13, 2013 at 5:16 PM, Xinliang David Li  
>>> wrote:
>>>> On Fri, Sep 13, 2013 at 1:30 AM, Richard Biener
>>>>  wrote:
>>>>> On Thu, Sep 12, 2013 at 10:31 PM, Xinliang David Li  
>>>>> wrote:
>>>>>> Currently -ftree-vectorize turns on both loop and slp vectorizations,
>>>>>> but there is no simple way to turn on loop vectorization alone. The
>>>>>> logic for default O3 setting is also complicated.
>>>>>>
>>>>>> In this patch, two new options are introduced:
>>>>>>
>>>>>> 1) -ftree-loop-vectorize
>>>>>>
>>>>>> This option is used to turn on loop vectorization only. option
>>>>>> -ftree-slp-vectorize also becomes a first class citizen, and no funny
>>>>>> business of Init(2) is needed.  With this change, -ftree-vectorize
>>>>>> becomes a simple alias to -ftree-loop-vectorize +
>>>>>> -ftree-slp-vectorize.
>>>>>>
>>>>>> For instance, to turn on only slp vectorize at O3, the old way is:
>>>>>>
>>>>>>  -O3 -fno-tree-vectorize -ftree-slp-vectorize
>>>>>>
>>>>>> With the new change it becomes:
>>>>>>
>>>>>> -O3 -fno-tree-loop-vectorize
>>>>>>
>>>>>>
>>>>>> To turn on only loop vectorize at O2, the old way is
>>>>>>
>>>>>> -O2 -ftree-vectorize -fno-tree-slp-vectorize
>>>>>>
>>>>>> The new way is
>>>>>>
>>>>>> -O2 -ftree-loop-vectorize
>>>>>>
>>>>>>
>>>>>>
>>>>>> 2) -ftree-vect-loop-peeling
>>>>>>
>>>>>> This option is used to turn on/off loop peeling for alignment.  In the
>>>>>> long run, this should be folded into the cheap cost model proposed by
>>>>>> Richard.  This option is also useful in scenarios where peeling can
>>>>>> introduce runtime problems:
>>>>>> http://gcc.gnu.org/ml/gcc/2005-12/msg00390.html  which happens to be
>>>>>> common in practice.
>>>>>>
>>>>>>
>>>>>>
>>>>>> Patch attached. Compiler bootstrapped. Ok after testing?
>>>>>
>>>>> I'd like you to split 1) and 2), mainly because I agree on 1) but not on 
>>>>> 2).
>>>>
>>>> Ok. Can you also comment on 2) ?
>>>
>>> I think we want to decide how granular we want to control the vectorizer
>>> and using which mechanism.  My cost-model re-org makes
>>> ftree-vect-loop-version a no-op (basically removes it), so 2) looks like
>>> a step backwards in this context.
>>
>> Using cost model to do a coarse grain control/configuration is
>> certainly something we want, but having a fine grain control is still
>> useful.
>>
>>>
>>> So, can you summarize what pieces (including versioning) of the vectorizer
>>> you'd want to be able to disable separately?
>>
>> Loop peeling seems to be the main one. There is also a correctness
>> issue related. For instance, the following code is common in practice,
>> but loop peeling wrongly assumes initial base-alignment and generates
>> aligned mov instruction after peeling, leading to SEGV.  Peeling is
>> not something we can blindly turn on -- even when it is on, there
>> should be a way to turn it off explicitly:
>>
>> char a[1];
>>
>> void foo(int n)
>> {
>>   int* b = (int*)(a+n);
>>   int i = 0;
>>   for (; i < 1000; ++i)
>> b[i] = 1;
>> }
>>
>> int main(int argn, char** argv)
>> {
>>   foo(argn);
>> }
>
> But that's just a bug that should be fixed (looking into it).

This kind of code is not uncommon for certain applications (e.g., group
varint decoding).  Besides, code like this may be built with
-fno-strict-aliasing.


>
>>>  Just disabling peeling for
>>> alignment may get you into the versioning for alignment path (and thus
>>> an unvectorized loop at runtime).
>>
>> This is not true for target supporting mis-aligned access. I have not
>> se

Re: New GCC options for loop vectorization

2013-09-17 Thread Xinliang David Li
On Tue, Sep 17, 2013 at 8:45 AM, Jakub Jelinek  wrote:
> On Tue, Sep 17, 2013 at 08:37:57AM -0700, Xinliang David Li wrote:
>> >> char a[1];
>> >>
>> >> void foo(int n)
>> >> {
>> >>   int* b = (int*)(a+n);
>> >>   int i = 0;
>> >>   for (; i < 1000; ++i)
>> >> b[i] = 1;
>> >> }
>> >>
>> >> int main(int argn, char** argv)
>> >> {
>> >>   foo(argn);
>> >> }
>> >
>> > But that's just a bug that should be fixed (looking into it).
>>
>> This kind of code is not uncommon for certain applications (e.g, group
>> varint decoding).  Besides, the code like this may be built with
>
> That is irrelevant to the fact that it is invalid.
>
>> -fno-strict-aliasing.
>
> It isn't invalid because of aliasing violations, but because of unaligned
> access without saying that it is unaligned (say accessing through
> aligned(1) type, or packed struct or similar, or doing memcpy).
> On various architectures unaligned accesses don't cause faults, so it
> may appear to work, and even on i?86/x86_64 often appears to work, as
> long as you aren't trying to vectorize code (which doesn't change anything
> on the fact that it is undefined behavior).

ok, undefined behavior it is.  By the way, ICC does loop versioning on
this case and therefore has no problem. Clang/LLVM vectorizes it with
neither peeling nor versioning, and it works fine too. For legacy code
like this, GCC is less tolerant.

thanks,

David

>
> Jakub


Re: [GOOGLE] AutoFDO should honor system paths in the profile

2013-09-17 Thread Xinliang David Li
ok.

David

On Tue, Sep 17, 2013 at 4:53 PM, Dehao Chen  wrote:
> This patch makes AutoFDO honor system paths stored in the profile.
>
> Bootstrapped and passed regression tests.
>
> OK for google-4_8 branch?
>
> Thanks,
> Dehao
>
> Index: gcc/auto-profile.c
> ===
> --- gcc/auto-profile.c (revision 202672)
> +++ gcc/auto-profile.c (working copy)
> @@ -616,11 +616,11 @@ bool autofdo_module_profile::read ()
>  {
>char *name = xstrdup (gcov_read_string ());
>unsigned total_num = 0;
> -  unsigned num_array[6];
> +  unsigned num_array[7];
>unsigned exported = gcov_read_unsigned ();
>unsigned lang = gcov_read_unsigned ();
>unsigned ggc_memory = gcov_read_unsigned ();
> -  for (unsigned j = 0; j < 6; j++)
> +  for (unsigned j = 0; j < 7; j++)
>   {
>num_array[j] = gcov_read_unsigned ();
>total_num += num_array[j];
> @@ -638,9 +638,10 @@ bool autofdo_module_profile::read ()
>module->ggc_memory = ggc_memory;
>module->num_quote_paths = num_array[1];
>module->num_bracket_paths = num_array[2];
> -  module->num_cpp_defines = num_array[3];
> -  module->num_cpp_includes = num_array[4];
> -  module->num_cl_args = num_array[5];
> +  module->num_system_paths = num_array[3];
> +  module->num_cpp_defines = num_array[4];
> +  module->num_cpp_includes = num_array[5];
> +  module->num_cl_args = num_array[6];
>module->source_filename = name;
>module->is_primary = strcmp (name, in_fnames[0]) == 0;
>module->flags = module->is_primary ? exported : 1;


Re: [GOOGLE] disable slp for AutoFDO

2013-09-18 Thread Xinliang David Li
Ok.

David

On Wed, Sep 18, 2013 at 10:21 AM, Dehao Chen  wrote:
> This patch disables SLP for AutoFDO.
>
> Bootstrapped and passed unittests.
>
> OK for google-4_8?
>
> Thanks,
> Dehao
>
> Index: gcc/opts.c
> ===
> --- gcc/opts.c (revision 202709)
> +++ gcc/opts.c (working copy)
> @@ -1661,9 +1661,6 @@ common_handle_option (struct gcc_options *opts,
>if (!opts_set->x_flag_tree_loop_vectorize
>&& !opts_set->x_flag_tree_vectorize)
>   opts->x_flag_tree_loop_vectorize = value;
> -  if (!opts_set->x_flag_tree_slp_vectorize
> -  && !opts_set->x_flag_tree_vectorize)
> - opts->x_flag_tree_slp_vectorize = value;
>if (!opts_set->x_flag_vect_cost_model)
>   opts->x_flag_vect_cost_model = value;
>if (!opts_set->x_flag_tree_loop_distribute_patterns)


Re: New GCC options for loop vectorization

2013-09-18 Thread Xinliang David Li
On Tue, Sep 17, 2013 at 1:20 AM, Richard Biener
 wrote:
> On Mon, Sep 16, 2013 at 10:24 PM, Xinliang David Li  
> wrote:
>> On Mon, Sep 16, 2013 at 3:13 AM, Richard Biener
>>  wrote:
>>> On Fri, Sep 13, 2013 at 5:16 PM, Xinliang David Li  
>>> wrote:
>>>> On Fri, Sep 13, 2013 at 1:30 AM, Richard Biener
>>>>  wrote:
>>>>> On Thu, Sep 12, 2013 at 10:31 PM, Xinliang David Li  
>>>>> wrote:
>>>>>> Currently -ftree-vectorize turns on both loop and slp vectorizations,
>>>>>> but there is no simple way to turn on loop vectorization alone. The
>>>>>> logic for default O3 setting is also complicated.
>>>>>>
>>>>>> In this patch, two new options are introduced:
>>>>>>
>>>>>> 1) -ftree-loop-vectorize
>>>>>>
>>>>>> This option is used to turn on loop vectorization only. option
>>>>>> -ftree-slp-vectorize also becomes a first class citizen, and no funny
>>>>>> business of Init(2) is needed.  With this change, -ftree-vectorize
>>>>>> becomes a simple alias to -ftree-loop-vectorize +
>>>>>> -ftree-slp-vectorize.
>>>>>>
>>>>>> For instance, to turn on only slp vectorize at O3, the old way is:
>>>>>>
>>>>>>  -O3 -fno-tree-vectorize -ftree-slp-vectorize
>>>>>>
>>>>>> With the new change it becomes:
>>>>>>
>>>>>> -O3 -fno-loop-vectorize
>>>>>>
>>>>>>
>>>>>> To turn on only loop vectorize at O2, the old way is
>>>>>>
>>>>>> -O2 -ftree-vectorize -fno-slp-vectorize
>>>>>>
>>>>>> The new way is
>>>>>>
>>>>>> -O2 -ftree-loop-vectorize
>>>>>>
>>>>>>
>>>>>>
>>>>>> 2) -ftree-vect-loop-peeling
>>>>>>
>>>>>> This option is used to turn on/off loop peeling for alignment.  In the
>>>>>> long run, this should be folded into the cheap cost model proposed by
>>>>>> Richard.  This option is also useful in scenarios where peeling can
>>>>>> introduce runtime problems:
>>>>>> http://gcc.gnu.org/ml/gcc/2005-12/msg00390.html  which happens to be
>>>>>> common in practice.
>>>>>>
>>>>>>
>>>>>>
>>>>>> Patch attached. Compiler boostrapped. Ok after testing?
>>>>>
>>>>> I'd like you to split 1) and 2), mainly because I agree on 1) but not on 
>>>>> 2).
>>>>
>>>> Ok. Can you also comment on 2) ?
>>>
>>> I think we want to decide how granular we want to control the vectorizer
>>> and using which mechanism.  My cost-model re-org makes
>>> ftree-vect-loop-version a no-op (basically removes it), so 2) looks like
>>> a step backwards in this context.
>>
>> Using cost model to do a coarse grain control/configuration is
>> certainly something we want, but having a fine grain control is still
>> useful.
>>
>>>
>>> So, can you summarize what pieces (including versioning) of the vectorizer
>>> you'd want to be able to disable separately?
>>
>> Loop peeling seems to be the main one. There is also a correctness
>> issue related. For instance, the following code is common in practice,
>> but loop peeling wrongly assumes initial base-alignment and generates
>> aligned mov instruction after peeling, leading to SEGV.  Peeling is
>> not something we can blindly turned on -- even when it is on, there
>> should be a way to turn it off explicitly:
>>
>> char a[1];
>>
>> void foo(int n)
>> {
>>   int* b = (int*)(a+n);
>>   int i = 0;
>>   for (; i < 1000; ++i)
>> b[i] = 1;
>> }
>>
>> int main(int argn, char** argv)
>> {
>>   foo(argn);
>> }
>
> But that's just a bug that should be fixed (looking into it).
>
>>>  Just disabling peeling for
>>> alignment may get you into the versioning for alignment path (and thus
>>> an unvectorized loop at runtime).
>>
>> This is not true for target supporting mis-aligned access. I have not
>> seen a case where alignment driver loop version happens on x86.
>>
>>>Also it's know that the alignment peeling
>>> code needs some s

Re: [GOOGLE] Patch to fix AutoFDO LIPO performance regression

2013-09-18 Thread Xinliang David Li
On Wed, Sep 18, 2013 at 4:51 PM, Dehao Chen  wrote:
> This patch fixup the call graph edge targets during AutoFDO pass, so
> that when rebuilding call graph edges, it can find the correct callee.
>
> Bootstrapped and passed regression test. Benchmark tests on-going.
>
> Ok for google-4_8 branch?
>
> Thanks,
> Dehao
>
> Index: gcc/Makefile.in
> ===
> --- gcc/Makefile.in (revision 202725)
> +++ gcc/Makefile.in (working copy)
> @@ -2960,7 +2960,7 @@ coverage.o : coverage.c $(GCOV_IO_H) $(CONFIG_H) $
>  auto-profile.o : auto-profile.c $(CONFIG_H) $(SYSTEM_H) $(FLAGS_H) \
> $(BASIC_BLOCK_H) $(DIAGNOSTIC_CORE_H) $(GCOV_IO_H) $(INPUT_H) profile.h \
> $(LANGHOOKS_H) $(OPTS_H) $(TREE_PASS_H) $(CGRAPH_H) $(GIMPLE_H)
> value-prof.h \
> -   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) $(AUTO_PROFILE_H)
> +   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) l-ipo.h $(AUTO_PROFILE_H)
>  cselib.o : cselib.c $(CONFIG_H) $(SYSTEM_H) coretypes.h dumpfile.h
> $(TM_H) $(RTL_H) \
> $(REGS_H) hard-reg-set.h $(FLAGS_H) insn-config.h $(RECOG_H) \
> $(EMIT_RTL_H) $(DIAGNOSTIC_CORE_H) $(FUNCTION_H) \
> Index: gcc/auto-profile.c
> ===
> --- gcc/auto-profile.c (revision 202725)
> +++ gcc/auto-profile.c (working copy)
> @@ -46,6 +46,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "value-prof.h"
>  #include "coverage.h"
>  #include "params.h"
> +#include "l-ipo.h"
>  #include "auto-profile.h"
>
>  /* The following routines implements AutoFDO optimization.
> @@ -1290,6 +1291,13 @@ auto_profile (void)
>init_node_map ();
>profile_info = autofdo::afdo_profile_info;
>
> +  cgraph_pre_profiling_inlining_done = true;
> +  cgraph_process_module_scope_statics ();
> +  /* Now perform link to allow cross module inlining.  */
> +  cgraph_do_link ();
> +  varpool_do_link ();
> +  cgraph_unify_type_alias_sets ();
> +
>FOR_EACH_FUNCTION (node)
>  {
>if (!gimple_has_body_p (node->symbol.decl))
> @@ -1301,6 +1309,21 @@ auto_profile (void)
>
>push_cfun (DECL_STRUCT_FUNCTION (node->symbol.decl));
>
> +  if (L_IPO_COMP_MODE)
> +{
> +  basic_block bb;
> +  FOR_EACH_BB (bb)
> +{
> +  gimple_stmt_iterator gsi;
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> + {
> +  gimple stmt = gsi_stmt (gsi);
> +  if (is_gimple_call (stmt))
> +lipo_fixup_cgraph_edge_call_target (stmt);
> + }
> +}
> + }
> +

Need this:


  if (execute_fixup_cfg () & TODO_cleanup_cfg)
  cleanup_tree_cfg ();


as in tree-profiling. Changing call stmt targets can lead to CFG changes.



David

>autofdo::afdo_annotate_cfg ();
>compute_function_frequency ();
>update_ssa (TODO_update_ssa);
> @@ -1309,13 +1332,6 @@ auto_profile (void)
>pop_cfun ();
>  }
>
> -  cgraph_pre_profiling_inlining_done = true;
> -  cgraph_process_module_scope_statics ();
> -  /* Now perform link to allow cross module inlining.  */
> -  cgraph_do_link ();
> -  varpool_do_link ();
> -  cgraph_unify_type_alias_sets ();
> -
>return TODO_rebuild_cgraph_edges;
>  }


Re: [GOOGLE] Patch to fix AutoFDO LIPO performance regression

2013-09-19 Thread Xinliang David Li
ok.

David

On Thu, Sep 19, 2013 at 10:10 AM, Dehao Chen  wrote:
> Thanks, patch updated:
>
> Index: gcc/Makefile.in
> ===
> --- gcc/Makefile.in (revision 202725)
> +++ gcc/Makefile.in (working copy)
> @@ -2960,7 +2960,7 @@ coverage.o : coverage.c $(GCOV_IO_H) $(CONFIG_H) $
>  auto-profile.o : auto-profile.c $(CONFIG_H) $(SYSTEM_H) $(FLAGS_H) \
> $(BASIC_BLOCK_H) $(DIAGNOSTIC_CORE_H) $(GCOV_IO_H) $(INPUT_H) profile.h \
> $(LANGHOOKS_H) $(OPTS_H) $(TREE_PASS_H) $(CGRAPH_H) $(GIMPLE_H)
> value-prof.h \
> -   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) $(AUTO_PROFILE_H)
> +   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) l-ipo.h $(AUTO_PROFILE_H)
>  cselib.o : cselib.c $(CONFIG_H) $(SYSTEM_H) coretypes.h dumpfile.h
> $(TM_H) $(RTL_H) \
> $(REGS_H) hard-reg-set.h $(FLAGS_H) insn-config.h $(RECOG_H) \
> $(EMIT_RTL_H) $(DIAGNOSTIC_CORE_H) $(FUNCTION_H) \
> Index: gcc/auto-profile.c
> ===
> --- gcc/auto-profile.c (revision 202725)
> +++ gcc/auto-profile.c (working copy)
> @@ -46,6 +46,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "value-prof.h"
>  #include "coverage.h"
>  #include "params.h"
> +#include "l-ipo.h"
>  #include "auto-profile.h"
>
>  /* The following routines implements AutoFDO optimization.
> @@ -1290,6 +1291,13 @@ auto_profile (void)
>init_node_map ();
>profile_info = autofdo::afdo_profile_info;
>
> +  cgraph_pre_profiling_inlining_done = true;
> +  cgraph_process_module_scope_statics ();
> +  /* Now perform link to allow cross module inlining.  */
> +  cgraph_do_link ();
> +  varpool_do_link ();
> +  cgraph_unify_type_alias_sets ();
> +
>FOR_EACH_FUNCTION (node)
>  {
>if (!gimple_has_body_p (node->symbol.decl))
> @@ -1301,21 +1309,33 @@ auto_profile (void)
>
>push_cfun (DECL_STRUCT_FUNCTION (node->symbol.decl));
>
> +  if (L_IPO_COMP_MODE)
> +{
> +  basic_block bb;
> +  FOR_EACH_BB (bb)
> +{
> +  gimple_stmt_iterator gsi;
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> + {
> +  gimple stmt = gsi_stmt (gsi);
> +  if (is_gimple_call (stmt))
> +lipo_fixup_cgraph_edge_call_target (stmt);
> + }
> +}
> + }
> +
>autofdo::afdo_annotate_cfg ();
>compute_function_frequency ();
>update_ssa (TODO_update_ssa);
>
> +  /* Local pure-const may imply need to fixup the cfg.  */
> +  if (execute_fixup_cfg () & TODO_cleanup_cfg)
> + cleanup_tree_cfg ();
> +
>current_function_decl = NULL;
>pop_cfun ();
>  }
>
> -  cgraph_pre_profiling_inlining_done = true;
> -  cgraph_process_module_scope_statics ();
> -  /* Now perform link to allow cross module inlining.  */
> -  cgraph_do_link ();
> -  varpool_do_link ();
> -  cgraph_unify_type_alias_sets ();
> -
>return TODO_rebuild_cgraph_edges;
>  }
>
> On Wed, Sep 18, 2013 at 5:16 PM, Xinliang David Li  wrote:
>> On Wed, Sep 18, 2013 at 4:51 PM, Dehao Chen  wrote:
>>> This patch fixup the call graph edge targets during AutoFDO pass, so
>>> that when rebuilding call graph edges, it can find the correct callee.
>>>
>>> Bootstrapped and passed regression test. Benchmark tests on-going.
>>>
>>> Ok for google-4_8 branch?
>>>
>>> Thanks,
>>> Dehao
>>>
>>> Index: gcc/Makefile.in
>>> ===
>>> --- gcc/Makefile.in (revision 202725)
>>> +++ gcc/Makefile.in (working copy)
>>> @@ -2960,7 +2960,7 @@ coverage.o : coverage.c $(GCOV_IO_H) $(CONFIG_H) $
>>>  auto-profile.o : auto-profile.c $(CONFIG_H) $(SYSTEM_H) $(FLAGS_H) \
>>> $(BASIC_BLOCK_H) $(DIAGNOSTIC_CORE_H) $(GCOV_IO_H) $(INPUT_H) profile.h 
>>> \
>>> $(LANGHOOKS_H) $(OPTS_H) $(TREE_PASS_H) $(CGRAPH_H) $(GIMPLE_H)
>>> value-prof.h \
>>> -   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) $(AUTO_PROFILE_H)
>>> +   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) l-ipo.h 
>>> $(AUTO_PROFILE_H)
>>>  cselib.o : cselib.c $(CONFIG_H) $(SYSTEM_H) coretypes.h dumpfile.h
>>> $(TM_H) $(RTL_H) \
>>> $(REGS_H) hard-reg-set.h $(FLAGS_H) insn-config.h $(RECOG_H) \
>>> $(EMIT_RTL_H) $(DIAGNOSTIC_CORE_H) $(FUNCTION_H) \
>>> Index: gcc/auto-profile.c
>>> ===
>>> --- gcc/auto-profile.c (revision 202725)
>

Re: [GOOGLE] Sets cgraph_node count during annotation

2013-09-19 Thread Xinliang David Li
Looks good.

David

On Thu, Sep 19, 2013 at 1:15 PM, Dehao Chen  wrote:
> This patch sets cgraph_node count during AutoFDO annotation, otherwise
> execute_fixup_cfg will clear all the BB counts.
>
> bootstrapped and passed regression test.
>
> OK for google-4_8 branch?
>
> Thanks,
> Dehao
>
> Index: gcc/auto-profile.c
> ===
> --- gcc/auto-profile.c (revision 202753)
> +++ gcc/auto-profile.c (working copy)
> @@ -1234,6 +1234,7 @@ afdo_annotate_cfg (void)
>
>if (s == NULL)
>  return;
> +  cgraph_get_node (current_function_decl)->count = s->head_count ();
>ENTRY_BLOCK_PTR->count = s->head_count ();
>gcov_type max_count = ENTRY_BLOCK_PTR->count;


Re: [GOOGLE] Patch to fix AutoFDO LIPO performance regression

2013-09-19 Thread Xinliang David Li
I did not catch this in the last review. The cleanup CFG should be
done before afdo_annotate_cfg and just after call statement fixup.
Otherwise the cfg cleanup will zero out all profile counts. After
moving this up, you don't need the follow up patch which sets the
cgraph node's count -- that should be done in the
rebuild_cgraph_edges.

David

On Thu, Sep 19, 2013 at 10:10 AM, Dehao Chen  wrote:
> Thanks, patch updated:
>
> Index: gcc/Makefile.in
> ===
> --- gcc/Makefile.in (revision 202725)
> +++ gcc/Makefile.in (working copy)
> @@ -2960,7 +2960,7 @@ coverage.o : coverage.c $(GCOV_IO_H) $(CONFIG_H) $
>  auto-profile.o : auto-profile.c $(CONFIG_H) $(SYSTEM_H) $(FLAGS_H) \
> $(BASIC_BLOCK_H) $(DIAGNOSTIC_CORE_H) $(GCOV_IO_H) $(INPUT_H) profile.h \
> $(LANGHOOKS_H) $(OPTS_H) $(TREE_PASS_H) $(CGRAPH_H) $(GIMPLE_H)
> value-prof.h \
> -   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) $(AUTO_PROFILE_H)
> +   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) l-ipo.h $(AUTO_PROFILE_H)
>  cselib.o : cselib.c $(CONFIG_H) $(SYSTEM_H) coretypes.h dumpfile.h
> $(TM_H) $(RTL_H) \
> $(REGS_H) hard-reg-set.h $(FLAGS_H) insn-config.h $(RECOG_H) \
> $(EMIT_RTL_H) $(DIAGNOSTIC_CORE_H) $(FUNCTION_H) \
> Index: gcc/auto-profile.c
> ===
> --- gcc/auto-profile.c (revision 202725)
> +++ gcc/auto-profile.c (working copy)
> @@ -46,6 +46,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "value-prof.h"
>  #include "coverage.h"
>  #include "params.h"
> +#include "l-ipo.h"
>  #include "auto-profile.h"
>
>  /* The following routines implements AutoFDO optimization.
> @@ -1290,6 +1291,13 @@ auto_profile (void)
>init_node_map ();
>profile_info = autofdo::afdo_profile_info;
>
> +  cgraph_pre_profiling_inlining_done = true;
> +  cgraph_process_module_scope_statics ();
> +  /* Now perform link to allow cross module inlining.  */
> +  cgraph_do_link ();
> +  varpool_do_link ();
> +  cgraph_unify_type_alias_sets ();
> +
>FOR_EACH_FUNCTION (node)
>  {
>if (!gimple_has_body_p (node->symbol.decl))
> @@ -1301,21 +1309,33 @@ auto_profile (void)
>
>push_cfun (DECL_STRUCT_FUNCTION (node->symbol.decl));
>
> +  if (L_IPO_COMP_MODE)
> +{
> +  basic_block bb;
> +  FOR_EACH_BB (bb)
> +{
> +  gimple_stmt_iterator gsi;
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> + {
> +  gimple stmt = gsi_stmt (gsi);
> +  if (is_gimple_call (stmt))
> +lipo_fixup_cgraph_edge_call_target (stmt);
> + }
> +}
> + }
> +
>autofdo::afdo_annotate_cfg ();
>compute_function_frequency ();
>update_ssa (TODO_update_ssa);
>
> +  /* Local pure-const may imply need to fixup the cfg.  */
> +  if (execute_fixup_cfg () & TODO_cleanup_cfg)
> + cleanup_tree_cfg ();
> +
>current_function_decl = NULL;
>pop_cfun ();
>  }
>
> -  cgraph_pre_profiling_inlining_done = true;
> -  cgraph_process_module_scope_statics ();
> -  /* Now perform link to allow cross module inlining.  */
> -  cgraph_do_link ();
> -  varpool_do_link ();
> -  cgraph_unify_type_alias_sets ();
> -
>return TODO_rebuild_cgraph_edges;
>  }
>
> On Wed, Sep 18, 2013 at 5:16 PM, Xinliang David Li  wrote:
>> On Wed, Sep 18, 2013 at 4:51 PM, Dehao Chen  wrote:
>>> This patch fixup the call graph edge targets during AutoFDO pass, so
>>> that when rebuilding call graph edges, it can find the correct callee.
>>>
>>> Bootstrapped and passed regression test. Benchmark tests on-going.
>>>
>>> Ok for google-4_8 branch?
>>>
>>> Thanks,
>>> Dehao
>>>
>>> Index: gcc/Makefile.in
>>> ===
>>> --- gcc/Makefile.in (revision 202725)
>>> +++ gcc/Makefile.in (working copy)
>>> @@ -2960,7 +2960,7 @@ coverage.o : coverage.c $(GCOV_IO_H) $(CONFIG_H) $
>>>  auto-profile.o : auto-profile.c $(CONFIG_H) $(SYSTEM_H) $(FLAGS_H) \
>>> $(BASIC_BLOCK_H) $(DIAGNOSTIC_CORE_H) $(GCOV_IO_H) $(INPUT_H) profile.h 
>>> \
>>> $(LANGHOOKS_H) $(OPTS_H) $(TREE_PASS_H) $(CGRAPH_H) $(GIMPLE_H)
>>> value-prof.h \
>>> -   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) $(AUTO_PROFILE_H)
>>> +   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) l-ipo.h 
>>> $(AUTO_PROFILE_H)
>>>  cselib.o : cselib.c $(CONFIG_H) $(SYSTEM_H) coretypes.h dumpfile.h
>>> $(TM_H) $(RT

Re: [GOOGLE] Patch to fix AutoFDO LIPO performance regression

2013-09-20 Thread Xinliang David Li
Yes -- in current form, it is still needed. As you explained, the
linking & stmt fixup will need to be done after profile annotation as
autofdo uses assembler name for icall target matching.

David

On Fri, Sep 20, 2013 at 3:29 PM, Dehao Chen  wrote:
> Now that both statement fixup and cfg cleanup are moved after
> annotation. So setting of cgraph node's count is still needed, right?
>
> Thanks,
> Dehao
>
> On Thu, Sep 19, 2013 at 9:28 PM, Xinliang David Li  wrote:
>> I did not catch this in the last review. The cleanup CFG should be
>> done before afdo_annotate_cfg and just after call statement fixup.
>> Otherwise the cfg cleanup will zero out all profile counts. After
>> moving this up, you don't need the follow up patch which sets the
>> cgraph node's count -- that should be done in the
>> rebuild_cgraph_edges.
>>
>> David
>>
>> On Thu, Sep 19, 2013 at 10:10 AM, Dehao Chen  wrote:
>>> Thanks, patch updated:
>>>
>>> Index: gcc/Makefile.in
>>> ===
>>> --- gcc/Makefile.in (revision 202725)
>>> +++ gcc/Makefile.in (working copy)
>>> @@ -2960,7 +2960,7 @@ coverage.o : coverage.c $(GCOV_IO_H) $(CONFIG_H) $
>>>  auto-profile.o : auto-profile.c $(CONFIG_H) $(SYSTEM_H) $(FLAGS_H) \
>>> $(BASIC_BLOCK_H) $(DIAGNOSTIC_CORE_H) $(GCOV_IO_H) $(INPUT_H) profile.h 
>>> \
>>> $(LANGHOOKS_H) $(OPTS_H) $(TREE_PASS_H) $(CGRAPH_H) $(GIMPLE_H)
>>> value-prof.h \
>>> -   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) $(AUTO_PROFILE_H)
>>> +   $(COVERAGE_H) coretypes.h $(TREE_H) $(PARAMS_H) l-ipo.h 
>>> $(AUTO_PROFILE_H)
>>>  cselib.o : cselib.c $(CONFIG_H) $(SYSTEM_H) coretypes.h dumpfile.h
>>> $(TM_H) $(RTL_H) \
>>> $(REGS_H) hard-reg-set.h $(FLAGS_H) insn-config.h $(RECOG_H) \
>>> $(EMIT_RTL_H) $(DIAGNOSTIC_CORE_H) $(FUNCTION_H) \
>>> Index: gcc/auto-profile.c
>>> ===
>>> --- gcc/auto-profile.c (revision 202725)
>>> +++ gcc/auto-profile.c (working copy)
>>> @@ -46,6 +46,7 @@ along with GCC; see the file COPYING3.  If not see
>>>  #include "value-prof.h"
>>>  #include "coverage.h"
>>>  #include "params.h"
>>> +#include "l-ipo.h"
>>>  #include "auto-profile.h"
>>>
>>>  /* The following routines implements AutoFDO optimization.
>>> @@ -1290,6 +1291,13 @@ auto_profile (void)
>>>init_node_map ();
>>>profile_info = autofdo::afdo_profile_info;
>>>
>>> +  cgraph_pre_profiling_inlining_done = true;
>>> +  cgraph_process_module_scope_statics ();
>>> +  /* Now perform link to allow cross module inlining.  */
>>> +  cgraph_do_link ();
>>> +  varpool_do_link ();
>>> +  cgraph_unify_type_alias_sets ();
>>> +
>>>FOR_EACH_FUNCTION (node)
>>>  {
>>>if (!gimple_has_body_p (node->symbol.decl))
>>> @@ -1301,21 +1309,33 @@ auto_profile (void)
>>>
>>>push_cfun (DECL_STRUCT_FUNCTION (node->symbol.decl));
>>>
>>> +  if (L_IPO_COMP_MODE)
>>> +{
>>> +  basic_block bb;
>>> +  FOR_EACH_BB (bb)
>>> +{
>>> +  gimple_stmt_iterator gsi;
>>> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>>> + {
>>> +  gimple stmt = gsi_stmt (gsi);
>>> +  if (is_gimple_call (stmt))
>>> +lipo_fixup_cgraph_edge_call_target (stmt);
>>> + }
>>> +}
>>> + }
>>> +
>>>autofdo::afdo_annotate_cfg ();
>>>compute_function_frequency ();
>>>update_ssa (TODO_update_ssa);
>>>
>>> +  /* Local pure-const may imply need to fixup the cfg.  */
>>> +  if (execute_fixup_cfg () & TODO_cleanup_cfg)
>>> + cleanup_tree_cfg ();
>>> +
>>>current_function_decl = NULL;
>>>pop_cfun ();
>>>  }
>>>
>>> -  cgraph_pre_profiling_inlining_done = true;
>>> -  cgraph_process_module_scope_statics ();
>>> -  /* Now perform link to allow cross module inlining.  */
>>> -  cgraph_do_link ();
>>> -  varpool_do_link ();
>>> -  cgraph_unify_type_alias_sets ();
>>> -
>>>return TODO_rebuild_cgraph_edges;
>>>  }
>>>
>>> On Wed, Sep 18, 2013 at 5:16 PM, Xinliang David Li  
>>> wrote:
>>>

Re: Revisit Core tunning flags

2013-09-21 Thread Xinliang David Li
On Sat, Sep 21, 2013 at 12:54 PM, Jan Hubicka  wrote:
> Hi,
> this is upated version of patch discussed at
> http://gcc.gnu.org/ml/gcc-patches/2012-12/msg00841.html
>
> It makes CORE tuning to more follow the optimization guidelines.
> In particular it removes some tuning flags for features I implemented years
> back specifically for K7/K8 chips that ended up in Core tunning becuase
> it was based on generic. Incrementally I plan to drop some of these from
> generic, too.
>
> Compared to previous version of patch I left out INC_DEC change, even
> though Core I7+ should resolve dependencies on partial flags correctly.
> Optimization manual still seems to suggest to not use this:
>
> Assembly/Compiler Coding Rule 33. (M impact, H generality)
> INC and DEC instructions should be replaced with ADD or SUB instructions,
> because ADD and SUB overwrite all flags, whereas INC and DEC do not, therefore
> creating false dependencies on earlier instructions that set the flags.
>
> Other change dropped is use_vector_fp_converts that seems to improve
> Core performance.

I did not see this in your patch, but Wei has this tuning in this patch:

http://gcc.gnu.org/ml/gcc-patches/2013-09/msg00884.html

thanks,

David


>
> I benchmarked the patch on SPEC2k and earlier it was benchmarked on 2k6
> and the performance difference seems in noise.  It causes about 0.3% code
> size reduction.  Main motivation for the patch is to drop some codegen
> oddities that do not make sense on modern chips.
>
> Bootstrapped/regtested x86_64-linux, will commit it shortly.
> Honza
>
> * x86-tune.def (partial_reg_stall): Disable for CoreI7 and newer.
> (sse_typeless_stores): Enable for core
> (sse_load0_by_pxor): Likewise.
> (four_jump_limit): Disable for core.
> (pad_returns): Likewise.
> (avoid_vector_decode): Likewise.
> (fuse_cmp_and_branch): Enable for cores.
> * i386.c (x86_accumulate_outgoing_args): Disable for cores.
> Index: x86-tune.def
> ===
> *** x86-tune.def(revision 202812)
> --- x86-tune.def(working copy)
> *** DEF_TUNE (X86_TUNE_MOVX, "movx",
> *** 52,58 
>  and can happen in caller/callee saving sequences.  */
>   DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
>   DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
> !   m_CORE_ALL | m_GENERIC)
>   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
>* on 16-bit immediate moves into memory on Core2 and Corei7.  */
>   DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
> --- 52,58 
>  and can happen in caller/callee saving sequences.  */
>   DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
>   DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
> !   m_CORE2 | m_GENERIC)
>   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
>* on 16-bit immediate moves into memory on Core2 and Corei7.  */
>   DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
> *** DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INS
> *** 125,132 
>  maintain just lower part of scalar values in proper format leaving the
>  upper part undefined.  */
>   DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
> ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", 
> m_AMD_MULTIPLE)
> ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | 
> m_P4_NOCONA)
>   DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
> m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | 
> m_GENERIC)
>   DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
> --- 125,134 
>  maintain just lower part of scalar values in proper format leaving the
>  upper part undefined.  */
>   DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
> ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
> ! m_AMD_MULTIPLE | m_CORE_ALL)
> ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
> ! m_PPRO | m_P4_NOCONA | m_CORE_ALL)
>   DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
> m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | 
> m_GENERIC)
>   DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
> *** DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSION
> *** 144,150 
>   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
>  than 4 branch instructions in the 16 byte window.  */
>   DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
> !   m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM| m_AMD_MULTIPLE
> | m_GENERIC)
>   DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
> m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
> --- 146,152 
>   /* X86_

Re: Revisit Core tunning flags

2013-09-21 Thread Xinliang David Li
On Sat, Sep 21, 2013 at 3:51 PM, Xinliang David Li  wrote:
> On Sat, Sep 21, 2013 at 12:54 PM, Jan Hubicka  wrote:
>> Hi,
>> this is upated version of patch discussed at
>> http://gcc.gnu.org/ml/gcc-patches/2012-12/msg00841.html
>>
>> It makes CORE tuning to more follow the optimization guidelines.
>> In particular it removes some tuning flags for features I implemented years
>> back specifically for K7/K8 chips that ended up in Core tunning becuase
>> it was based on generic. Incrementally I plan to drop some of these from
>> generic, too.
>>
>> Compared to previous version of patch I left out INC_DEC change, even
>> though Core I7+ should resolve dependencies on partial flags correctly.
>> Optimization manual still seems to suggest to not use this:
>>
>> Assembly/Compiler Coding Rule 33. (M impact, H generality)
>> INC and DEC instructions should be replaced with ADD or SUB instructions,
>> because ADD and SUB overwrite all flags, whereas INC and DEC do not, 
>> therefore
>> creating false dependencies on earlier instructions that set the flags.
>>
>> Other change dropped is use_vector_fp_converts that seems to improve
>> Core perofrmance.
>
> I did not see this in your patch, but Wei has this tuning in this patch:
>

Sorry, I meant to ask why dropping this part?

David

> http://gcc.gnu.org/ml/gcc-patches/2013-09/msg00884.html
>
> thanks,
>
> David
>
>
>>
>> I benchmarked the patch on SPEC2k and earlier it was benchmarked on 2k6
>> and the performance difference seems in noise.  It causes about 0.3% code
>> size reduction.  Main motivation for the patch is to drop some codegen
>> oddities that do not make sense on modern chips.
>>
>> Bootstrapped/regtested x86_64-linux, will commit it shortly.
>> Honza
>>
>> * x86-tune.def (partial_reg_stall): Disable for CoreI7 and newer.
>> (sse_typeless_stores): Enable for core
>> (sse_load0_by_pxor): Likewise.
>> (four_jump_limit): Disable for core.
>> (pad_returns): Likewise.
>> (avoid_vector_decode): Likewise.
>> (fuse_cmp_and_branch): Enable for cores.
>> * i386.c (x86_accumulate_outgoing_args): Disable for cores.
>> Index: x86-tune.def
>> ===
>> *** x86-tune.def(revision 202812)
>> --- x86-tune.def(working copy)
>> *** DEF_TUNE (X86_TUNE_MOVX, "movx",
>> *** 52,58 
>>  and can happen in caller/callee saving sequences.  */
>>   DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
>>   DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
>> !   m_CORE_ALL | m_GENERIC)
>>   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
>>* on 16-bit immediate moves into memory on Core2 and Corei7.  */
>>   DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
>> --- 52,58 
>>  and can happen in caller/callee saving sequences.  */
>>   DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
>>   DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
>> !   m_CORE2 | m_GENERIC)
>>   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
>>* on 16-bit immediate moves into memory on Core2 and Corei7.  */
>>   DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
>> *** DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INS
>> *** 125,132 
>>  maintain just lower part of scalar values in proper format leaving the
>>  upper part undefined.  */
>>   DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
>> ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", 
>> m_AMD_MULTIPLE)
>> ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | 
>> m_P4_NOCONA)
>>   DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
>> m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | 
>> m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
>> --- 125,134 
>>  maintain just lower part of scalar values in proper format leaving the
>>  upper part undefined.  */
>>   DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
>> ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
>> ! m_AMD_MULTIPLE | m_CORE_AL

Re: New GCC options for loop vectorization

2013-09-23 Thread Xinliang David Li
Thanks. I modified the patch so that the max allowed peel iterations
can be specified via a parameter. Testing on going. Ok for trunk ?

David

On Mon, Sep 23, 2013 at 4:31 AM, Richard Biener
 wrote:
> On Wed, Sep 18, 2013 at 10:21 PM, Xinliang David Li  
> wrote:
>> On Tue, Sep 17, 2013 at 1:20 AM, Richard Biener
>>  wrote:
>>> On Mon, Sep 16, 2013 at 10:24 PM, Xinliang David Li  
>>> wrote:
>>>> On Mon, Sep 16, 2013 at 3:13 AM, Richard Biener
>>>>  wrote:
>>>>> On Fri, Sep 13, 2013 at 5:16 PM, Xinliang David Li  
>>>>> wrote:
>>>>>> On Fri, Sep 13, 2013 at 1:30 AM, Richard Biener
>>>>>>  wrote:
>>>>>>> On Thu, Sep 12, 2013 at 10:31 PM, Xinliang David Li 
>>>>>>>  wrote:
>>>>>>>> Currently -ftree-vectorize turns on both loop and slp vectorizations,
>>>>>>>> but there is no simple way to turn on loop vectorization alone. The
>>>>>>>> logic for default O3 setting is also complicated.
>>>>>>>>
>>>>>>>> In this patch, two new options are introduced:
>>>>>>>>
>>>>>>>> 1) -ftree-loop-vectorize
>>>>>>>>
>>>>>>>> This option is used to turn on loop vectorization only. option
>>>>>>>> -ftree-slp-vectorize also becomes a first class citizen, and no funny
>>>>>>>> business of Init(2) is needed.  With this change, -ftree-vectorize
>>>>>>>> becomes a simple alias to -ftree-loop-vectorize +
>>>>>>>> -ftree-slp-vectorize.
>>>>>>>>
>>>>>>>> For instance, to turn on only slp vectorize at O3, the old way is:
>>>>>>>>
>>>>>>>>  -O3 -fno-tree-vectorize -ftree-slp-vectorize
>>>>>>>>
>>>>>>>> With the new change it becomes:
>>>>>>>>
>>>>>>>> -O3 -fno-loop-vectorize
>>>>>>>>
>>>>>>>>
>>>>>>>> To turn on only loop vectorize at O2, the old way is
>>>>>>>>
>>>>>>>> -O2 -ftree-vectorize -fno-slp-vectorize
>>>>>>>>
>>>>>>>> The new way is
>>>>>>>>
>>>>>>>> -O2 -ftree-loop-vectorize
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> 2) -ftree-vect-loop-peeling
>>>>>>>>
>>>>>>>> This option is used to turn on/off loop peeling for alignment.  In the
>>>>>>>> long run, this should be folded into the cheap cost model proposed by
>>>>>>>> Richard.  This option is also useful in scenarios where peeling can
>>>>>>>> introduce runtime problems:
>>>>>>>> http://gcc.gnu.org/ml/gcc/2005-12/msg00390.html  which happens to be
>>>>>>>> common in practice.
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Patch attached. Compiler boostrapped. Ok after testing?
>>>>>>>
>>>>>>> I'd like you to split 1) and 2), mainly because I agree on 1) but not 
>>>>>>> on 2).
>>>>>>
>>>>>> Ok. Can you also comment on 2) ?
>>>>>
>>>>> I think we want to decide how granular we want to control the vectorizer
>>>>> and using which mechanism.  My cost-model re-org makes
>>>>> ftree-vect-loop-version a no-op (basically removes it), so 2) looks like
>>>>> a step backwards in this context.
>>>>
>>>> Using cost model to do a coarse grain control/configuration is
>>>> certainly something we want, but having a fine grain control is still
>>>> useful.
>>>>
>>>>>
>>>>> So, can you summarize what pieces (including versioning) of the vectorizer
>>>>> you'd want to be able to disable separately?
>>>>
>>>> Loop peeling seems to be the main one. There is also a correctness
>>>> issue related. For instance, the following code is common in practice,
>>>> but loop peeling wrongly assumes initial base-alignment and generates
>>>> aligned mov instruction after peeling, leading to SEGV.  Peeling is
>>>> not something 

Re: [PATCH] Bug fix: *var and MEM[(const int *)var] (var has int* type) are not treated as the same data ref.

2013-09-23 Thread Xinliang David Li
Basically GCC reports that the loop is vectorized, but the vectorized
loop is never executed because of the bogus alias check ' a + 16 < a'
generated. In trunk, the vectorized version is eliminated, but it
remains as dead code with gcc 4.8.

David

On Mon, Sep 23, 2013 at 5:26 PM, Cong Hou  wrote:
> (I have also created this issue in bug reports:
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58513)
>
>
> First look at the code below:
>
>
> int op(const int* a, const int* b)
> { return *a+*b; }
>
> void foo(int*a, int b)
> {
>   int i;
>   for (i = 0; i < 10; ++i)
> a[i] = op(a+i, &b);
> }
>
>
> GCC will generate the following GIMPLE for this loop after inlining op():
>
>
>   :
>   # i_15 = PHI 
>   # ivtmp_23 = PHI 
>   _4 = (long unsigned int) i_15;
>   _5 = _4 * 4;
>   _7 = a_6(D) + _5;
>   _10 = MEM[(const int *)_7];
>   _11 = _10 + b_12(D);
>   *_7 = _11;
>   i_9 = i_15 + 1;
>   ivtmp_22 = ivtmp_23 - 1;
>   if (ivtmp_22 != 0)
> goto ;
>   else
> goto ;
>
>
>
> Here each element of the array a is loaded by MEM[(const int *)_7] and
> stored by *_7, which are the only two data refs in the loop body. The
> GCC vectorizer needs to check the possible aliasing between data refs
> with potential data dependence. Here those two data refs are actually
> the same one, but GCC could not recognize this fact. As a result, the
> aliasing checking predicate will always return false at runtime (GCC
> 4.9 could eliminate this generated branch at the end of the
> vectorization pass).
>
> The reason why GCC thinks that MEM[(const int *)_7] and *_7 are two
> different data refs is that there is a possible defect in the function
> operand_equal_p(), which is used to compare two data refs. The current
> implementation uses == to compare the types of the second argument of
> MEM_REF operator, which is too strict. Using types_compatible_p()
> instead can fix the issue above. I also included a test case for this
> bug fix. Bootstrapping and "make check" are both passed.
>
>
> thanks,
> Cong
>
>
>
> Index: gcc/testsuite/gcc.dg/alias-14.c
> ===
> --- gcc/testsuite/gcc.dg/alias-14.c (revision 0)
> +++ gcc/testsuite/gcc.dg/alias-14.c (revision 0)
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +int op (const int* x, const int* y)
> +{
> +  return *x + *y;
> +}
> +
> +/* After inlining op() the type of the data ref is converted from int* into
> +   const int&, resulting in two data refs MEM[(const int *)DR] and *DR for 
> read
> +   and write, where DR represents the address of a[i] here.  They are still
> +   the same data ref and no alias exists in the loop.  The vectorizer should
> +   succesffuly vectorize this loop.  */
> +
> +void foo(int* a, int b)
> +{
> +  int i;
> +  for (i = 0; i < 10; ++i)
> +a[i] = op(a + i, &b);
> +}
> +
> +
> +/* { dg-final { scan-assembler-times "paddd" 1 { target x86_64-*-* } } } */
> +
> Index: gcc/fold-const.c
> ===
> --- gcc/fold-const.c (revision 202662)
> +++ gcc/fold-const.c (working copy)
> @@ -2693,8 +2693,9 @@ operand_equal_p (const_tree arg0, const_
> && operand_equal_p (TYPE_SIZE (TREE_TYPE (arg0)),
> TYPE_SIZE (TREE_TYPE (arg1)), flags)))
>&& types_compatible_p (TREE_TYPE (arg0), TREE_TYPE (arg1))
> -  && (TYPE_MAIN_VARIANT (TREE_TYPE (TREE_OPERAND (arg0, 1)))
> -  == TYPE_MAIN_VARIANT (TREE_TYPE (TREE_OPERAND (arg1, 1
> +  && types_compatible_p (
> +   TYPE_MAIN_VARIANT (TREE_TYPE (TREE_OPERAND (arg0, 1))),
> +   TYPE_MAIN_VARIANT (TREE_TYPE (TREE_OPERAND (arg1, 1
>&& OP_SAME (0) && OP_SAME (1));
>
>   case ARRAY_REF:


Re: [GOOGLE] Disable aggressive loop peeling to prevent code bloat.

2013-09-25 Thread Xinliang David Li
I wish there is better heuristic in the future. For now it is ok.

David

On Wed, Sep 25, 2013 at 2:48 PM, Dehao Chen  wrote:
> This patch disables aggressive loop peeling when profile is available.
> This prevents extensive code bloat which leads to increased i-cache
> misses.
>
> Bootstrapped and passed regression tests.
>
> OK for google-4_8?
>
> Thanks,
> Dehao
>
> Index: gcc/loop-unroll.c
> ===
> --- gcc/loop-unroll.c (revision 202926)
> +++ gcc/loop-unroll.c (working copy)
> @@ -1574,8 +1574,7 @@ decide_peel_simple (struct loop *loop, int flags)
>   peeling it is not the case.  Also a function call inside loop is
>   also branch from branch prediction POV (and probably better reason
>   to not unroll/peel).  */
> -  if (desc->num_branches > 1
> -  && profile_status != PROFILE_READ)
> +  if (desc->num_branches > 1)
>  {
>if (dump_file)
>   fprintf (dump_file, ";; Not peeling, contains branches\n");


cost model patch

2013-09-25 Thread Xinliang David Li
I took the liberty to pick up Richard's original fvect-cost-model
patch and made some modification.

What has not changed:
1) option -ftree-vect-loop-version is removed;
2) three cost models are introduced: cheap, dynamic, and unlimited;
3) unless explicitly specified, cheap model is the default at O2 (e.g.
when -ftree-loop-vectorize is used with -O2), and dynamic mode is the
default for O3 and FDO
4) alignment based versioning is disabled with cheap model.

What has changed:
1) peeling is also disabled with cheap model;
2) alias check condition limit is reduced with cheap model, but not
completely suppressed. Runtime alias check is a pretty important
enabler.
3) tree if conversion changes are not included.

Does this patch look reasonable?

thanks,

David
Index: tree-vectorizer.h
===
--- tree-vectorizer.h   (revision 202926)
+++ tree-vectorizer.h   (working copy)
@@ -880,6 +880,14 @@ known_alignment_for_access_p (struct dat
   return (DR_MISALIGNMENT (data_ref_info) != -1);
 }
 
+
+/* Return true if the vect cost model is unlimited.  */
+static inline bool
+unlimited_cost_model ()
+{
+  return flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED;
+}
+
 /* Source location */
 extern LOC vect_location;
 
Index: flag-types.h
===
--- flag-types.h(revision 202926)
+++ flag-types.h(working copy)
@@ -191,6 +191,15 @@ enum fp_contract_mode {
   FP_CONTRACT_FAST = 2
 };
 
+/* Vectorizer cost-model.  */
+enum vect_cost_model {
+  VECT_COST_MODEL_UNLIMITED = 0,
+  VECT_COST_MODEL_CHEAP = 1,
+  VECT_COST_MODEL_DYNAMIC = 2,
+  VECT_COST_MODEL_DEFAULT = 3
+};
+
+
 /* Different instrumentation modes.  */
 enum sanitize_code {
   /* AddressSanitizer.  */
Index: targhooks.c
===
--- targhooks.c (revision 202926)
+++ targhooks.c (working copy)
@@ -1057,20 +1057,17 @@ default_add_stmt_cost (void *data, int c
   unsigned *cost = (unsigned *) data;
   unsigned retval = 0;
 
-  if (flag_vect_cost_model)
-{
-  tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
-  int stmt_cost = default_builtin_vectorization_cost (kind, vectype,
+  tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
+  int stmt_cost = default_builtin_vectorization_cost (kind, vectype,
  misalign);
-  /* Statements in an inner loop relative to the loop being
-vectorized are weighted more heavily.  The value here is
-arbitrary and could potentially be improved with analysis.  */
-  if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
-   count *= 50;  /* FIXME.  */
+   /* Statements in an inner loop relative to the loop being
+  vectorized are weighted more heavily.  The value here is
+  arbitrary and could potentially be improved with analysis.  */
+  if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
+count *= 50;  /* FIXME.  */
 
-  retval = (unsigned) (count * stmt_cost);
-  cost[where] += retval;
-}
+  retval = (unsigned) (count * stmt_cost);
+  cost[where] += retval;
 
   return retval;
 }
Index: common.opt
===
--- common.opt  (revision 202926)
+++ common.opt  (working copy)
@@ -2278,13 +2278,33 @@ ftree-slp-vectorize
 Common Report Var(flag_tree_slp_vectorize) Optimization
 Enable basic block vectorization (SLP) on trees
 
+fvect-cost-model=
+Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) 
Init(VECT_COST_MODEL_DEFAULT)
+Specifies the cost model for vectorization
+ 
+Enum
+Name(vect_cost_model) Type(enum vect_cost_model) UnknownError(unknown 
vectorizer cost model %qs)
+
+EnumValue
+Enum(vect_cost_model) String(unlimited) Value(VECT_COST_MODEL_UNLIMITED)
+
+EnumValue
+Enum(vect_cost_model) String(dynamic) Value(VECT_COST_MODEL_DYNAMIC)
+
+EnumValue
+Enum(vect_cost_model) String(cheap) Value(VECT_COST_MODEL_CHEAP)
+
 fvect-cost-model
-Common Report Var(flag_vect_cost_model) Optimization
-Enable use of cost model in vectorization
+Common RejectNegative Alias(fvect-cost-model=,dynamic)
+Enables the dynamic vectorizer cost model.  Preserved for backward 
compatibility.
+
+fno-vect-cost-model
+Common RejectNegative Alias(fvect-cost-model=,unlimited)
+Enables the unlimited vectorizer cost model.  Preserved for backward 
compatibility.
 
 ftree-vect-loop-version
-Common Report Var(flag_tree_vect_loop_version) Init(1) Optimization
-Enable loop versioning when doing loop vectorization on trees
+Common Ignore
+Does nothing. Preserved for backward compatibility.
 
 ftree-scev-cprop
 Common Report Var(flag_tree_scev_cprop) Init(1) Optimization
Index: opts.c
===
--- opts.c  (revision 202926)
+++ opts.c  (working copy)
@@ 

Re: [GOOGLE] max-lipo-group parameter for AutoFDO

2013-09-26 Thread Xinliang David Li
Looks ok.

David

On Thu, Sep 26, 2013 at 9:00 AM, Dehao Chen  wrote:
> This patch fix the bug when setting max-lipo-group in AutoFDO:
>
> Bootstrapped and passed regression test.
>
> OK for google branches?
>
> Thanks,
> Dehao
>
> Index: gcc/auto-profile.c
> ===
> --- gcc/auto-profile.c (revision 202926)
> +++ gcc/auto-profile.c (working copy)
> @@ -746,7 +746,7 @@ read_aux_modules (void)
>  "assembler statements", *iter);
>continue;
>   }
> -  if (max_group != 0 && curr_module == max_group)
> +  if (max_group != 0 && curr_module >= max_group)
>   {
>if (flag_opt_info)
>  inform (0, "Not importing %s: maximum group size reached", *iter);


Re: [GOOGLE] Fix an ICE in lipo_cmp_type

2013-09-26 Thread Xinliang David Li
yes.

David

On Thu, Sep 26, 2013 at 9:26 AM, Dehao Chen  wrote:
> This fixes an ICE when lipo_cmp_type handles NULL_PTR_TYPE.
>
> Bootstrapped and regression test on going?
>
> OK for google branches?
>
> Thanks,
> Dehao
>
> Index: gcc/l-ipo.c
> ===
> --- gcc/l-ipo.c (revision 202926)
> +++ gcc/l-ipo.c (working copy)
> @@ -713,6 +713,7 @@ lipo_cmp_type (tree t1, tree t2)
>&& lipo_cmp_type (TREE_TYPE (t1), TREE_TYPE (t2)));
>  case VOID_TYPE:
>  case BOOLEAN_TYPE:
> +case NULLPTR_TYPE:
>return 1;
>  case TEMPLATE_TYPE_PARM:
>return 1;


Re: cost model patch

2013-09-26 Thread Xinliang David Li
On Thu, Sep 26, 2013 at 7:37 AM, Richard Biener
 wrote:
> On Thu, Sep 26, 2013 at 1:10 AM, Xinliang David Li  wrote:
>> I took the liberty to pick up Richard's original fvect-cost-model
>> patch and made some modification.
>>
>> What has not changed:
>> 1) option -ftree-vect-loop-version is removed;
>> 2) three cost models are introduced: cheap, dynamic, and unlimited;
>> 3) unless explicitly specified, cheap model is the default at O2 (e.g.
>> when -ftree-loop-vectorize is used with -O2), and dynamic mode is the
>> default for O3 and FDO
>> 4) alignment based versioning is disabled with cheap model.
>>
>> What has changed:
>> 1) peeling is also disabled with cheap model;
>> 2) alias check condition limit is reduced with cheap model, but not
>> completely suppressed. Runtime alias check is a pretty important
>> enabler.
>> 3) tree if conversion changes are not included.
>>
>> Does this patch look reasonable?
>
> In principle yes.  Note that it changes the behavior of -O2 -ftree-vectorize
> as -ftree-vectorize does not imply changing the default cost model.  I am
> fine with that, but eventually this will have some testsuite fallout.  This
> reorg would also need documenting in changes.html to make people
> aware of this.


Here is the proposed change:


Index: htdocs/gcc-4.9/changes.html
===
RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-4.9/changes.html,v
retrieving revision 1.26
diff -u -r1.26 changes.html
--- htdocs/gcc-4.9/changes.html 26 Aug 2013 14:16:31 - 1.26
+++ htdocs/gcc-4.9/changes.html 26 Sep 2013 18:02:33 -
@@ -37,6 +37,7 @@
   
 AddressSanitizer, a fast memory error detector, is now
available on ARM.
 
+GCC introduces a new cost model for the vectorizer, called
the 'cheap' model. The new cost model is intended to minimize compile
time, code size, and potential negative runtime impact introduced when
the vectorizer is turned on, at the expense of not getting the maximum
potential runtime speedup. The 'cheap' model will be the default when
vectorizer is turned on at -O2. To override this, use
option -fvect-cost-model=[cheap|dynamic|unlimited].
   

 New Languages and Language specific improvements


>
> With completely disabling alingment peeling and alignment versioning
> you cut out targets that have no way of performing unaligned accesses.
> From looking at vect_no_align this are mips, sparc, ia64 and some arm.
> A compromise for them would be to allow peeling a single iteration
> and some alignment checks (like up to two?).
>

Possibly. I think target owners can choose to do target specific
tunings as follow up.


> Reducing the number of allowed alias-checks is ok, but I'd reduce it
> more than to 6 (was that an arbitrary number or is that the result of
> some benchmarking?)
>

yes -- we found that it is not uncommon to have a loop with 2 or 3
distinct source address and 1 or 2 target address.

There are also tuning opportunities. For instance, in cases where
source address are derived from the same base, a consolidated alias
check (against the whole access range instead of just checking cross
1-unrolled iteration dependence) can be done.

> I suppose all of the params could use some benchmarking to select
> a sweet spot in code size vs. runtime.

Agree.


>
> I suppose the patch is ok as-is (if it actually works) if you provide
> a changelog and propose an entry for changes.html.  We can
> tune the params for the cheap model as followup.

Ok. I will do more testing and check in the patch with proper
ChangeLog. The changes.html change will be done separately.

thanks,

David


>
> Thanks for picking this up,
> Richard.
>
>> thanks,
>>
>> David


Re: [google gcc-4_8] alternate hirate for builtin_expert

2013-09-26 Thread Xinliang David Li
This patch improves linux kernel performance with a large workload, so
it is good to first submit this to trunk and backport it.

thanks,

David

On Thu, Sep 26, 2013 at 3:27 PM, Jan Hubicka  wrote:
>> Hi,
>>
>> Current default probability for builtin_expect is 0.9996.
>> This makes the freq of unlikely bb very low (4), which
>> suppresses the inlining of any calls within those bb.
>>
>> We used FDO data to measure the branch probably for
>> the branch annotated with builtin_expect.
>>  For google
>> internal benchmarks, the weight average
>> (the profile count value as the weight) is 0.9081.
>>
>> Linux kernel is another program that is heavily annotated
>> with builtin-expert. We measured its weight average as 0.8717,
>>   using google search as
>> the workload.
>
> This is interesting.  I was always unsure if programmers use builtin_expect
> more often to mark an impossible paths (as those leading to crash) or just
> unlikely paths.  Your data seems to suggest the second.
>
> We probably also ought to get analyze_brprob working again. It was always
> useful to get such a data.
>>
>>
>> This patch sets the alternate hirate probability for
>> builtin_expert
>> to 90%. With the alternate hirate, we measured performance
>>   improvement for google
>> benchmarks and Linux kernel.
>>
>>
>>   -Rong
>> 2013-09-26  Rong Xu  
>>
>>   * params.def (DEFPARAM): New.
>>   * params.def: New.
>>   * predict.c (tree_predict_by_opcode): Alternate
>> probablity hirate for builtin_expect.
>
> This also seems resonable for mainline.  Please add a comment
> to the parameter explaining how the value was determined.
> Please also add invoke.texi documentation.
>
> For patches that seems resonable for mainline in FDO/IPA area,
> i would apprechiate if you just added me to CC, so I do not lose
> track of them.
> Honza


Re: [google gcc-4_8] alternate hirate for builtin_expert

2013-09-26 Thread Xinliang David Li
it might worth extend __builtin_expect to take more parameters:
1) argument to specify actual probability: __builtin_expect (x, 10, 0.6)
2) a more general way of doing this is to allow specifying multiple
values, and the probability is determined by # of occurances:
__builtin_expect (x, 10, 10, 20) --> tells compiler x is expected to
be 10 66% of the time, and 33% of time with value twenty.
3) a special value can be reserved to indicate if the branch is
predictable or not.

David

On Thu, Sep 26, 2013 at 3:27 PM, Jan Hubicka  wrote:
>> Hi,
>>
>> Current default probability for builtin_expect is 0.9996.
>> This makes the freq of unlikely bb very low (4), which
>> suppresses the inlining of any calls within those bb.
>>
>> We used FDO data to measure the branch probably for
>> the branch annotated with builtin_expect.
>>  For google
>> internal benchmarks, the weight average
>> (the profile count value as the weight) is 0.9081.
>>
>> Linux kernel is another program that is heavily annotated
>> with builtin-expert. We measured its weight average as 0.8717,
>>   using google search as
>> the workload.
>
> This is interesting.  I was always unsure if programmers use builtin_expect
> more often to mark an impossible paths (as those leading to crash) or just
> unlikely paths.  Your data seems to suggest the second.
>
> We probably also ought to get analyze_brprob working again. It was always
> useful to get such a data.
>>
>>
>> This patch sets the alternate hirate probability for
>> builtin_expert
>> to 90%. With the alternate hirate, we measured performance
>>   improvement for google
>> benchmarks and Linux kernel.
>>
>>
>>   -Rong
>> 2013-09-26  Rong Xu  
>>
>>   * params.def (DEFPARAM): New.
>>   * params.def: New.
>>   * predict.c (tree_predict_by_opcode): Alternate
>> probablity hirate for builtin_expect.
>
> This also seems resonable for mainline.  Please add a comment
> to the parameter explaining how the value was determined.
> Please also add invoke.texi documentation.
>
> For patches that seems resonable for mainline in FDO/IPA area,
> i would apprechiate if you just added me to CC, so I do not lose
> track of them.
> Honza


Re: Add value range support into memcpy/memset expansion

2013-09-27 Thread Xinliang David Li
Nice extension. Test cases would be great to have.

thanks,

David

On Fri, Sep 27, 2013 at 7:50 AM, Jan Hubicka  wrote:
> Hi,
> this patch makes it possible to access value range info from setmem/movstr 
> that
> I plan to use in i386 memcpy/memset expansion code.  It is all quite
> straighforward except that I need to deal with cases where max size does not
> fit in HOST_WIDE_INT where I use maximal value as a marker.  It is then
> translated as NULL pointer to the expander that is bit inconsistent with other
> places that use -1 as marker of unknown value.
>
> I also think we lose some cases because of TER replacing out the SSA_NAME by
> something else, but it seems to work in quite many cases. This can be probably
> tracked incrementally by disabling TER here or finally getting away from
> expanding calls via the generic route.
>
> Bootstrapped/regtested x86_64-linux, OK?
>
> Honza
>
> * doc/md.texi (setmem, movstr): Update documentation.
> * builtins.c (determine_block_size): New function.
> (expand_builtin_memcpy): Use it and pass it to
> emit_block_move_hints.
> (expand_builtin_memset_args): Use it and pass it to
> set_storage_via_setmem.
> * expr.c (emit_block_move_via_movmem): Add min_size/max_size 
> parameters;
> update call to expander.
> (emit_block_move_hints): Add min_size/max_size parameters.
> (clear_storage_hints): Likewise.
> (set_storage_via_setmem): Likewise.
> (clear_storage): Update.
> * expr.h (emit_block_move_hints, clear_storage_hints,
> set_storage_via_setmem): Update prototype.
>
> Index: doc/md.texi
> ===
> --- doc/md.texi (revision 202968)
> +++ doc/md.texi (working copy)
> @@ -5198,6 +5198,9 @@ destination and source strings are opera
>  the expansion of this pattern should store in operand 0 the address in
>  which the @code{NUL} terminator was stored in the destination string.
>
> +This pattern also has several optional operands that are the same as in
> +@code{setmem}.
> +
>  @cindex @code{setmem@var{m}} instruction pattern
>  @item @samp{setmem@var{m}}
>  Block set instruction.  The destination string is the first operand,
> @@ -5217,6 +5220,8 @@ respectively.  The expected alignment di
>  in a way that the blocks are not required to be aligned according to it in
>  all cases. This expected alignment is also in bytes, just like operand 4.
>  Expected size, when unknown, is set to @code{(const_int -1)}.
> +Operand 7 is the minimal size of the block and operand 8 is the
> +maximal size of the block (NULL if it can not be represented as CONST_INT).
>
>  The use for multiple @code{setmem@var{m}} is as for @code{movmem@var{m}}.
>
> Index: builtins.c
> ===
> --- builtins.c  (revision 202968)
> +++ builtins.c  (working copy)
> @@ -3070,6 +3070,51 @@ builtin_memcpy_read_str (void *data, HOS
>return c_readstr (str + offset, mode);
>  }
>
> +/* LEN specify length of the block of memcpy/memset operation.
> +   Figure out its range and put it into MIN_SIZE/MAX_SIZE.  */
> +
> +static void
> +determine_block_size (tree len, rtx len_rtx,
> + unsigned HOST_WIDE_INT *min_size,
> + unsigned HOST_WIDE_INT *max_size)
> +{
> +  if (CONST_INT_P (len_rtx))
> +{
> +  *min_size = *max_size = UINTVAL (len_rtx);
> +  return;
> +}
> +  else
> +{
> +  double_int min, max;
> +  if (TREE_CODE (len) == SSA_NAME
> + && get_range_info (len, &min, &max) == VR_RANGE)
> +   {
> + if (min.fits_uhwi ())
> +   *min_size = min.to_uhwi ();
> + else
> +   *min_size = 0;
> + if (max.fits_uhwi ())
> +   *max_size = max.to_uhwi ();
> + else
> +   *max_size = (HOST_WIDE_INT)-1;
> +   }
> +  else
> +   {
> + if (host_integerp (TYPE_MIN_VALUE (TREE_TYPE (len)), 1))
> +   *min_size = tree_low_cst (TYPE_MIN_VALUE (TREE_TYPE (len)), 1);
> + else
> +   *min_size = 0;
> + if (host_integerp (TYPE_MAX_VALUE (TREE_TYPE (len)), 1))
> +   *max_size = tree_low_cst (TYPE_MAX_VALUE (TREE_TYPE (len)), 1);
> + else
> +   *max_size = GET_MODE_MASK (GET_MODE (len_rtx));
> +   }
> +}
> +  gcc_checking_assert (*max_size <=
> +  (unsigned HOST_WIDE_INT)
> + GET_MODE_MASK (GET_MODE (len_rtx)));
> +}
> +
>  /* Expand a call EXP to the memcpy builtin.
> Return NULL_RTX if we failed, the caller should emit a normal call,
> otherwise try to get the result in TARGET, if convenient (and in
> @@ -3092,6 +3137,8 @@ expand_builtin_memcpy (tree exp, rtx tar
>rtx dest_mem, src_mem, dest_addr, len_rtx;
>HOST_WIDE_INT expected_size = -1;
>unsigned int expected_align = 0;
> +  unsigned HOST_WIDE_INT min_size;
> +

Re: cost model patch

2013-09-27 Thread Xinliang David Li
Please review the changes.html change and suggest better wordings if possible:

ndex: htdocs/gcc-4.9/changes.html
===
RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-4.9/changes.html,v
retrieving revision 1.26
diff -u -r1.26 changes.html
--- htdocs/gcc-4.9/changes.html 26 Aug 2013 14:16:31 - 1.26
+++ htdocs/gcc-4.9/changes.html 26 Sep 2013 18:02:33 -
@@ -37,6 +37,7 @@
   
 AddressSanitizer, a fast memory error detector, is now
available on ARM.
 
+GCC introduces a new cost model for the vectorizer, called
the 'cheap' model. The new cost model is intended to minimize compile
time, code size, and potential negative runtime impact introduced when
the vectorizer is turned on, at the expense of not getting the maximum
potential runtime speedup. The 'cheap' model will be the default when
the vectorizer is turned on at -O2. To override this, use the
option -fvect-cost-model=[cheap|dynamic|unlimited].
   

 New Languages and Language specific improvements

thanks,

David


On Thu, Sep 26, 2013 at 11:09 AM, Xinliang David Li  wrote:
> On Thu, Sep 26, 2013 at 7:37 AM, Richard Biener
>  wrote:
>> On Thu, Sep 26, 2013 at 1:10 AM, Xinliang David Li  
>> wrote:
>>> I took the liberty to pick up Richard's original fvect-cost-model
>>> patch and made some modification.
>>>
>>> What has not changed:
>>> 1) option -ftree-vect-loop-version is removed;
>>> 2) three cost models are introduced: cheap, dynamic, and unlimited;
>>> 3) unless explicitly specified, cheap model is the default at O2 (e.g.
>>> when -ftree-loop-vectorize is used with -O2), and dynamic mode is the
>>> default for O3 and FDO
>>> 4) alignment based versioning is disabled with cheap model.
>>>
>>> What has changed:
>>> 1) peeling is also disabled with cheap model;
>>> 2) alias check condition limit is reduced with cheap model, but not
>>> completely suppressed. Runtime alias check is a pretty important
>>> enabler.
>>> 3) tree if conversion changes are not included.
>>>
>>> Does this patch look reasonable?
>>
>> In principle yes.  Note that it changes the behavior of -O2 -ftree-vectorize
>> as -ftree-vectorize does not imply changing the default cost model.  I am
>> fine with that, but eventually this will have some testsuite fallout.  This
>> reorg would also need documenting in changes.html to make people
>> aware of this.
>
>
> Here is the proposed change:
>
>
> Index: htdocs/gcc-4.9/changes.html
> ===
> RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-4.9/changes.html,v
> retrieving revision 1.26
> diff -u -r1.26 changes.html
> --- htdocs/gcc-4.9/changes.html 26 Aug 2013 14:16:31 - 1.26
> +++ htdocs/gcc-4.9/changes.html 26 Sep 2013 18:02:33 -
> @@ -37,6 +37,7 @@
>
>  AddressSanitizer, a fast memory error detector, is now
> available on ARM.
>  
> +GCC introduces a new cost model for vectorizer, called
> 'cheap' model. The new cost model is intended to minimize compile
> time, code size, and potential negative runtime impact introduced when
> vectorizer is turned on at the expense of not getting the maximum
> potential runtime speedup. The 'cheap' model will be the default when
> vectorizer is turned on at -O2. To override this, use
> option -fvect-cost-model=[cheap|dynamic|unlimited].
>
>
>  New Languages and Language specific improvements
>
>
>>
>> With completely disabling alingment peeling and alignment versioning
>> you cut out targets that have no way of performing unaligned accesses.
>> From looking at vect_no_align these are mips, sparc, ia64 and some arm.
>> A compromise for them would be to allow peeling a single iteration
>> and some alignment checks (like up to two?).
>>
>
> Possibly. I think target owners can choose to do target specific
> tunings as follow up.
>
>
>> Reducing the number of allowed alias-checks is ok, but I'd reduce it
>> more than to 6 (was that an arbitrary number or is that the result of
>> some benchmarking?)
>>
>
> yes -- we found that it is not uncommon to have a loop with 2 or 3
> distinct source address and 1 or 2 target address.
>
> There are also tuning opportunities. For instance, in cases where
> source address are derived from the same base, a consolidated alias
> check (against the whole access range instead of just checking cross
> 1-unrolled iteration dependence) can be done.
>
>> I suppose all of the params could use some benchmarking to select
>> a sweet spot in code size vs. runtime.
>
> Agree.
>
>
>>
>> I suppose the patch is ok as-is (if it actually works) if you provide
>> a changelog and propose an entry for changes.html.  We can
>> tune the params for the cheap model as followup.
>
> Ok. I will do more testing and check in the patch with proper
> ChangeLog. The changes.html change will be done separately.
>
> thanks,
>
> David
>
>
>>
>> Thanks for picking this up,
>> Richard.
>>
>>> thanks,
>>>
>>> David


Re: [google/4_8] Disable -g/-gmlt during LIPO instrumentation

2013-09-27 Thread Xinliang David Li
On Fri, Sep 27, 2013 at 11:50 AM, Teresa Johnson  wrote:
> David and Rong,
>
> The following patch will disable -g/-gmlt when instrumenting for LIPO
> since they will affect the recorded ggc_memory used in the module
> grouping decision. Added -fripa-allow-debug to override this behavior.
>
> Passes regression tests and simple tests on the new functionality.
>
> Ok for google/4_8?
>
> Teresa
>
> 2013-09-27  Teresa Johnson  
>
> * opts.c (finish_options): Suppress -g/-gmlt when we are
> building for LIPO instrumentation.
> * common.opt (fripa-allow-debug): New option.
>
> Index: opts.c
> ===
> --- opts.c  (revision 202976)
> +++ opts.c  (working copy)
> @@ -799,7 +799,7 @@ finish_options (struct gcc_options *opts, struct g
>  #endif
>if (!opts->x_flag_fat_lto_objects && !HAVE_LTO_PLUGIN)
>  error_at (loc, "-fno-fat-lto-objects are supported only with
> linker plugin.");
> -}
> +}


Unrelated format change?

Otherwise looks ok.

thanks,

David


>if ((opts->x_flag_lto_partition_balanced != 0) +
> (opts->x_flag_lto_partition_1to1 != 0)
> + (opts->x_flag_lto_partition_none != 0) >= 1)
>  {
> @@ -852,6 +852,26 @@ finish_options (struct gcc_options *opts, struct g
>/* Turn on -ffunction-sections when -freorder-functions=* is used.  */
>if (opts->x_flag_reorder_functions > 1)
>  opts->x_flag_function_sections = 1;
> +
> +  /* LIPO module grouping depends on the memory consumed by the profile-gen
> + parsing phase, recorded in a per-module ggc_memory field of the module
> + info struct. This will be higher when debug generation is on via
> + -g/-gmlt, which causes the FE to generate debug structures that will
> + increase the ggc_total_memory. This could in theory cause the module
> + groups to be slightly more conservative. Disable -g/-gmlt under
> + -fripa -fprofile-generate, but provide an option to override this
> + in case we actually need to debug an instrumented binary.  */
> +  if (opts->x_profile_arc_flag
> +  && opts->x_flag_dyn_ipa
> +  && opts->x_debug_info_level != DINFO_LEVEL_NONE
> +  && !opts->x_flag_dyn_ipa_allow_debug)
> +{
> +  inform (loc,
> + "Debug generation via -g option disabled under -fripa "
> +  "-fprofile-generate (use -fripa-allow-debug to override)");
> +  set_debug_level (NO_DEBUG, DEFAULT_GDB_EXTENSIONS, "0", opts, opts_set,
> +   loc);
> +}
>  }
>
>  #define LEFT_COLUMN27
> Index: common.opt
> ===
> --- common.opt  (revision 202976)
> +++ common.opt  (working copy)
> @@ -1155,6 +1155,10 @@ fripa
>  Common Report Var(flag_dyn_ipa)
>  Perform Dynamic Inter-Procedural Analysis.
>
> +fripa-allow-debug
> +Common Report Var(flag_dyn_ipa_allow_debug) Init(0)
> +Allow -g enablement for -fripa -fprofile-generate compiles.
> +
>  fripa-disallow-asm-modules
>  Common Report Var(flag_ripa_disallow_asm_modules)
>  Don't import an auxiliary module if it contains asm statements
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: [google/4_8] Disable -g/-gmlt during LIPO instrumentation

2013-09-27 Thread Xinliang David Li
ok.

David

On Fri, Sep 27, 2013 at 1:03 PM, Teresa Johnson  wrote:
> On Fri, Sep 27, 2013 at 12:01 PM, Xinliang David Li  
> wrote:
>> On Fri, Sep 27, 2013 at 11:50 AM, Teresa Johnson  
>> wrote:
>>> David and Rong,
>>>
>>> The following patch will disable -g/-gmlt when instrumenting for LIPO
>>> since they will affect the recorded ggc_memory used in the module
>>> grouping decision. Added -fripa-allow-debug to override this behavior.
>>>
>>> Passes regression tests and simple tests on the new functionality.
>>>
>>> Ok for google/4_8?
>>>
>>> Teresa
>>>
>>> 2013-09-27  Teresa Johnson  
>>>
>>> * opts.c (finish_options): Suppress -g/-gmlt when we are
>>> building for LIPO instrumentation.
>>> * common.opt (fripa-allow-debug): New option.
>>>
>>> Index: opts.c
>>> ===
>>> --- opts.c  (revision 202976)
>>> +++ opts.c  (working copy)
>>> @@ -799,7 +799,7 @@ finish_options (struct gcc_options *opts, struct g
>>>  #endif
>>>if (!opts->x_flag_fat_lto_objects && !HAVE_LTO_PLUGIN)
>>>  error_at (loc, "-fno-fat-lto-objects are supported only with
>>> linker plugin.");
>>> -}
>>> +}
>>
>>
>> Unrelated format change?
>
> Well, related in the sense that it messed up my editor's
> auto-indentation logic when making the change below. Ok to include the
> formatting fix if I mention it in the commit log?
>
> Teresa
>
>>
>> Otherwise looks ok.
>>
>> thanks,
>>
>> David
>>
>>
>>>if ((opts->x_flag_lto_partition_balanced != 0) +
>>> (opts->x_flag_lto_partition_1to1 != 0)
>>> + (opts->x_flag_lto_partition_none != 0) >= 1)
>>>  {
>>> @@ -852,6 +852,26 @@ finish_options (struct gcc_options *opts, struct g
>>>/* Turn on -ffunction-sections when -freorder-functions=* is used.  */
>>>if (opts->x_flag_reorder_functions > 1)
>>>  opts->x_flag_function_sections = 1;
>>> +
>>> +  /* LIPO module grouping depends on the memory consumed by the profile-gen
>>> + parsing phase, recorded in a per-module ggc_memory field of the module
>>> + info struct. This will be higher when debug generation is on via
>>> + -g/-gmlt, which causes the FE to generate debug structures that will
>>> + increase the ggc_total_memory. This could in theory cause the module
>>> + groups to be slightly more conservative. Disable -g/-gmlt under
>>> + -fripa -fprofile-generate, but provide an option to override this
>>> + in case we actually need to debug an instrumented binary.  */
>>> +  if (opts->x_profile_arc_flag
>>> +  && opts->x_flag_dyn_ipa
>>> +  && opts->x_debug_info_level != DINFO_LEVEL_NONE
>>> +  && !opts->x_flag_dyn_ipa_allow_debug)
>>> +{
>>> +  inform (loc,
>>> + "Debug generation via -g option disabled under -fripa "
>>> +  "-fprofile-generate (use -fripa-allow-debug to override)");
>>> +  set_debug_level (NO_DEBUG, DEFAULT_GDB_EXTENSIONS, "0", opts, 
>>> opts_set,
>>> +   loc);
>>> +}
>>>  }
>>>
>>>  #define LEFT_COLUMN27
>>> Index: common.opt
>>> ===
>>> --- common.opt  (revision 202976)
>>> +++ common.opt  (working copy)
>>> @@ -1155,6 +1155,10 @@ fripa
>>>  Common Report Var(flag_dyn_ipa)
>>>  Perform Dynamic Inter-Procedural Analysis.
>>>
>>> +fripa-allow-debug
>>> +Common Report Var(flag_dyn_ipa_allow_debug) Init(0)
>>> +Allow -g enablement for -fripa -fprofile-generate compiles.
>>> +
>>>  fripa-disallow-asm-modules
>>>  Common Report Var(flag_ripa_disallow_asm_modules)
>>>  Don't import an auxiliary module if it contains asm statements
>>>
>>> --
>>> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: [google/4_8] Disable -g/-gmlt during LIPO instrumentation

2013-09-27 Thread Xinliang David Li
The key is that grouping results should not depend on the presence
of -g flags.  The downside of the patch is that it may slightly
underestimate the memory pressure at profile-use time, but that should
not be a big problem.

David

On Fri, Sep 27, 2013 at 1:18 PM, Rong Xu  wrote:
> I don't quite understand here. We use the profile-generate memory
> consumption to estimate the profile use memory consumption.
> we still have -g/-gmlt in profile-use compilation. Will this change
> effectively underestimate the memory use in the use phase?
>
> -Rong
>
> On Fri, Sep 27, 2013 at 11:50 AM, Teresa Johnson  wrote:
>> David and Rong,
>>
>> The following patch will disable -g/-gmlt when instrumenting for LIPO
>> since they will affect the recorded ggc_memory used in the module
>> grouping decision. Added -fripa-allow-debug to override this behavior.
>>
>> Passes regression tests and simple tests on the new functionality.
>>
>> Ok for google/4_8?
>>
>> Teresa
>>
>> 2013-09-27  Teresa Johnson  
>>
>> * opts.c (finish_options): Suppress -g/-gmlt when we are
>> building for LIPO instrumentation.
>> * common.opt (fripa-allow-debug): New option.
>>
>> Index: opts.c
>> ===
>> --- opts.c  (revision 202976)
>> +++ opts.c  (working copy)
>> @@ -799,7 +799,7 @@ finish_options (struct gcc_options *opts, struct g
>>  #endif
>>if (!opts->x_flag_fat_lto_objects && !HAVE_LTO_PLUGIN)
>>  error_at (loc, "-fno-fat-lto-objects are supported only with
>> linker plugin.");
>> -}
>> +}
>>if ((opts->x_flag_lto_partition_balanced != 0) +
>> (opts->x_flag_lto_partition_1to1 != 0)
>> + (opts->x_flag_lto_partition_none != 0) >= 1)
>>  {
>> @@ -852,6 +852,26 @@ finish_options (struct gcc_options *opts, struct g
>>/* Turn on -ffunction-sections when -freorder-functions=* is used.  */
>>if (opts->x_flag_reorder_functions > 1)
>>  opts->x_flag_function_sections = 1;
>> +
>> +  /* LIPO module grouping depends on the memory consumed by the profile-gen
>> + parsing phase, recorded in a per-module ggc_memory field of the module
>> + info struct. This will be higher when debug generation is on via
>> + -g/-gmlt, which causes the FE to generate debug structures that will
>> + increase the ggc_total_memory. This could in theory cause the module
>> + groups to be slightly more conservative. Disable -g/-gmlt under
>> + -fripa -fprofile-generate, but provide an option to override this
>> + in case we actually need to debug an instrumented binary.  */
>> +  if (opts->x_profile_arc_flag
>> +  && opts->x_flag_dyn_ipa
>> +  && opts->x_debug_info_level != DINFO_LEVEL_NONE
>> +  && !opts->x_flag_dyn_ipa_allow_debug)
>> +{
>> +  inform (loc,
>> + "Debug generation via -g option disabled under -fripa "
>> +  "-fprofile-generate (use -fripa-allow-debug to override)");
>> +  set_debug_level (NO_DEBUG, DEFAULT_GDB_EXTENSIONS, "0", opts, 
>> opts_set,
>> +   loc);
>> +}
>>  }
>>
>>  #define LEFT_COLUMN27
>> Index: common.opt
>> ===
>> --- common.opt  (revision 202976)
>> +++ common.opt  (working copy)
>> @@ -1155,6 +1155,10 @@ fripa
>>  Common Report Var(flag_dyn_ipa)
>>  Perform Dynamic Inter-Procedural Analysis.
>>
>> +fripa-allow-debug
>> +Common Report Var(flag_dyn_ipa_allow_debug) Init(0)
>> +Allow -g enablement for -fripa -fprofile-generate compiles.
>> +
>>  fripa-disallow-asm-modules
>>  Common Report Var(flag_ripa_disallow_asm_modules)
>>  Don't import an auxiliary module if it contains asm statements
>>
>> --
>> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: [Google] Adjust comdat-sharing-probability param for -Os

2013-09-27 Thread Xinliang David Li
d.growth -= (info->size
* (100 - PARAM_VALUE (PARAM_COMDAT_SHARING_PROBABILITY))
+ 50) / 100;

What is the purpose of '50' here?

The patch is fine for Google branch.

Other tunings to think about -- I think the sharing probability should
not be a fixed value -- but depending on the function's charateristics
-- such as size, number of callsites etc. For instance, for small leaf
comdat functions, the sharing probability will be small.

David


On Fri, Sep 27, 2013 at 2:57 PM, Easwaran Raman  wrote:
> This patch increases comdat-sharing-probability to 80 under -Os. This
> reduces the amount of inlining and helps internal benchmarks.
> Unfortunately, this causes slight regression on spec 2006. Ok for
> google branches if all tests pass?
>
> - Easwaran


Re: [PATCH] Relax the requirement of reduction pattern in GCC vectorizer.

2013-09-28 Thread Xinliang David Li
You can also add a test case of this form:

int foo( int t, int n, int *dst)
{
   int j = 0;
   int s = 1;
   t++;
   for (j = 0; j < n; j++)
 {
 dst[j] = t;
 s *= t;
 }

   return s;
}

where without the fix the loop vectorization is missed.

David

On Fri, Sep 27, 2013 at 6:28 PM, Cong Hou  wrote:
> The current GCC vectorizer requires the following pattern as a simple
> reduction computation:
>
>loop_header:
>  a1 = phi < a0, a2 >
>  a3 = ...
>  a2 = operation (a3, a1)
>
> But a3 can also be defined outside of the loop. For example, the
> following loop can benefit from vectorization but the GCC vectorizer
> fails to vectorize it:
>
>
> int foo(int v)
> {
>   int s = 1;
>   ++v;
>   for (int i = 0; i < 10; ++i)
> s *= v;
>   return s;
> }
>
>
> This patch relaxes the original requirement by also considering the
> following pattern:
>
>
>a3 = ...
>loop_header:
>  a1 = phi < a0, a2 >
>  a2 = operation (a3, a1)
>
>
> A test case is also added. The patch is tested on x86-64.
>
>
> thanks,
> Cong
>
> 
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 39c786e..45c1667 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,9 @@
> +2013-09-27  Cong Hou  
> +
> + * tree-vect-loop.c: Relax the requirement of the reduction
> + pattern so that one operand of the reduction operation can
> + come from outside of the loop.
> +
>  2013-09-25  Tom Tromey  
>
>   * Makefile.in (PARTITION_H, LTO_SYMTAB_H, COMMON_TARGET_DEF_H)
> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
> index 09644d2..90496a2 100644
> --- a/gcc/testsuite/ChangeLog
> +++ b/gcc/testsuite/ChangeLog
> @@ -1,3 +1,7 @@
> +2013-09-27  Cong Hou  
> +
> + * gcc.dg/vect/vect-reduc-pattern-3.c: New test.
> +
>  2013-09-25  Marek Polacek  
>
>   PR sanitizer/58413
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 2871ba1..3c51c3b 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -2091,6 +2091,13 @@ vect_is_slp_reduction (loop_vec_info loop_info,
> gimple phi, gimple first_stmt)
>   a3 = ...
>   a2 = operation (a3, a1)
>
> +   or
> +
> +   a3 = ...
> +   loop_header:
> + a1 = phi < a0, a2 >
> + a2 = operation (a3, a1)
> +
> such that:
> 1. operation is commutative and associative and it is safe to
>change the order of the computation (if CHECK_REDUCTION is true)
> @@ -2451,6 +2458,7 @@ vect_is_simple_reduction_1 (loop_vec_info
> loop_info, gimple phi,
>if (def2 && def2 == phi
>&& (code == COND_EXPR
>|| !def1 || gimple_nop_p (def1)
> +  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
>|| (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
>&& (is_gimple_assign (def1)
>|| is_gimple_call (def1)
> @@ -2469,6 +2477,7 @@ vect_is_simple_reduction_1 (loop_vec_info
> loop_info, gimple phi,
>if (def1 && def1 == phi
>&& (code == COND_EXPR
>|| !def2 || gimple_nop_p (def2)
> +  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
>|| (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
>&& (is_gimple_assign (def2)
>|| is_gimple_call (def2)
> diff --git gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-3.c
> gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-3.c
> new file mode 100644
> index 000..06a9416
> --- /dev/null
> +++ gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-3.c
> @@ -0,0 +1,41 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include 
> +#include "tree-vect.h"
> +
> +#define N 10
> +#define RES 1024
> +
> +/* A reduction pattern in which there is no data ref in
> +   the loop and one operand is defined outside of the loop.  */
> +
> +__attribute__ ((noinline)) int
> +foo (int v)
> +{
> +  int i;
> +  int result = 1;
> +
> +  ++v;
> +  for (i = 0; i < N; i++)
> +result *= v;
> +
> +  return result;
> +}
> +
> +int
> +main (void)
> +{
> +  int res;
> +
> +  check_vect ();
> +
> +  res = foo (1);
> +  if (res != RES)
> +abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> +


Re: Add value range support into memcpy/memset expansion

2013-09-28 Thread Xinliang David Li
On Sat, Sep 28, 2013 at 3:05 PM, Jan Hubicka  wrote:
>> > Nice extension. Test cases would be great to have.
>> Fore those you need i386 changes to actually use the info.  I will post that
>> after some cleanup and additional testing.
>
> Hi,
> since I already caught your attention, here is the target specific part for
> comments.
>
> this patch implements memcpy/memset prologues and epilogues as suggested by
> Ondrej Bilka.  His glibc implementation uses an IMO very smart trick with a single
> misaligned move to copy first N and last N bytes of the block.  The remainder
> of the block is then copied by the usual loop that gets aligned to the proper
> address.
>
> This leads to partial memory stall, but that is handled well by modern x86
> chips.
>
> For example in the following testcase:
> char *a;
> char *b;
> t1()
> {
>   memcpy (a,b,140);
> }
>
> We now produce:
> movqb(%rip), %rsi
> movqa(%rip), %rcx
> movq(%rsi), %rax <- first 8 bytes are moved
> leaq8(%rcx), %rdi
> andq$-8, %rdi   <- dest is aligned
> movq%rax, (%rcx)
> movq132(%rsi), %rax  <- last 8 bytes are moved
> movq%rax, 132(%rcx)
> subq%rdi, %rcx  <- alignment is subtracted from count

> subq%rcx, %rsi  <- source is aligned

This (source aligned) is not always true, but nevertheless the
sequence is very tight.

> addl$140, %ecx  <- normal copying of 8 byte chunks
> shrl$3, %ecx
> rep; movsq
> ret

> Of course it is quite common to know only upper bound on the block.  In this 
> case
> we need to generate prologue for first few bytes:
> char *p,*q;
> t(unsigned int a)
> {
>   if (a<100)
> memcpy(q,p,a);
>
> }
> t:
> .LFB0:
> .cfi_startproc
> cmpl$99, %edi
> jbe .L15
> .L7:
> rep; ret
> .p2align 4,,10
> .p2align 3
> .L15:
> cmpl$8, %edi
> movqq(%rip), %rdx
> movqp(%rip), %rsi
> jae .L3
> testb   $4, %dil
> jne .L16
> testl   %edi, %edi
> je  .L7
> movzbl  (%rsi), %eax
> testb   $2, %dil
> movb%al, (%rdx)
> je  .L7
> movl%edi, %edi
> movzwl  -2(%rsi,%rdi), %eax
> movw%ax, -2(%rdx,%rdi)
> ret
> .p2align 4,,10
> .p2align 3
> .L3:
> movq(%rsi), %rax
> movq%rax, (%rdx)
> movl%edi, %eax
> movq-8(%rsi,%rax), %rcx
> movq%rcx, -8(%rdx,%rax)
> leaq8(%rdx), %rax
> andq$-8, %rax
> subq%rax, %rdx
> addl%edx, %edi
> subq%rdx, %rsi
> shrl$3, %edi
> movl%edi, %ecx
> movq%rax, %rdi
> rep; movsq
> ret
> .p2align 4,,10
> .p2align 3
> .L16:
> movl(%rsi), %eax
> movl%edi, %edi
> movl%eax, (%rdx)
> movl-4(%rsi,%rdi), %eax
> movl%eax, -4(%rdx,%rdi)
> ret
> .cfi_endproc
> .LFE0:
>
> Mainline would output a libcall here (because size is unknown to it) and with
> inlining all stringops it winds up 210 bytes of code instead of 142 bytes
> above.
>
> Unforutnately the following testcase:
> char *p,*q;
> t(int a)
> {
>   if (a<100)
> memcpy(q,p,a);
>
> }
> Won't get inlined.  This is because A is known to be smaller than 100 that
> results in anti range after conversion to size_t.  This anti range allows very
> large values (above INT_MAX) and thus we do not know the block size.
> I am not sure if the sane range can be recovered somehow.  If not, maybe
> this is common enough to add support for "probable" upper bound parameter to
> the template.

Do we know if there is real code that intentionally does that other
than security flaws as result of improperly done range check?

I think by default GCC should assume the memcpy size range is (0, 100)
here with perhaps an option to override it.

thanks,

David

>
> Use of value ranges makes it harder to choose proper algorithm since the 
> average
> size is no longer known.  For the moment I take simple average of lower and 
> upper
> bound, but this is wrong.
>
> Libcall starts to win only for pretty large blocks (over 4GB definitely) so 
> it makes
> sense to inline functions with range 0...4096 even though the cost tables 
> tells
> to expand libcall for everything bigger than 140 bytes:  if blocks are small 
> we will
> get noticeable win and if blocks are big, we won't lose much.
>
> I am considering assigning value ranges to the algorithms, too, for more sane
> choices in decide_alg.
>
> I also think the misaligned move trick can/should be performed by
> move_by_pieces and we ought to consider sane use of SSE - current vector_loop
> with unrolling factor of 4 seems bit extreme.  At least buldozer is happy with
> 2 and I would expect SSE moves to be especially useful fo

Re: cost model patch

2013-09-30 Thread Xinliang David Li
Yes, that will do.  Can you do it for me? I can't  do testing easily
on arm myself.

thanks,

David




On Mon, Sep 30, 2013 at 3:29 AM, Kyrill Tkachov  wrote:
> Hi Richard, David,
>
>> In principle yes.  Note that it changes the behavior of -O2
>> -ftree-vectorize
>> as -ftree-vectorize does not imply changing the default cost model.  I am
>> fine with that, but eventually this will have some testsuite fallout.
>
> Indeed I am observing a regression with this patch on arm-none-eabi in
> gcc.dg/tree-ssa/gen-vect-26.c.
>
> Seems that the cheap vectoriser model doesn't do unaligned stores (as
> expected I think?). Is adding -fvect-cost-model=dynamic to the test options
> the correct approach?
>
>
> Thanks,
> Kyrill
>
>


Re: [PATCH] Improving uniform_vector_p() function.

2013-10-01 Thread Xinliang David Li
On Tue, Oct 1, 2013 at 10:31 AM, Cong Hou  wrote:
> The current uniform_vector_p() function only returns non-NULL when the
> vector is directly a uniform vector. For example, for the following
> gimple code:
>
> vect_cst_.15_91 = {_9, _9, _9, _9, _9, _9, _9, _9};
>
>
> The current implementation can only detect that {_9, _9, _9, _9, _9,
> _9, _9, _9} is a uniform vector, but fails to recognize
> vect_cst_.15_91 is also one. This simple patch searches through
> assignment chains to find more uniform vectors.
>
>
> thanks,
> Cong
>
>
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 45c1667..b42f8a9 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,9 @@
> +2013-10-01  Cong Hou  
> +
> +   * tree.c: Improve the function uniform_vector_p() so that a
> +   vector assigned with a uniform vector is also treated as a
> +   uniform vector.
> +
> diff --git a/gcc/tree.c b/gcc/tree.c
> index 1c881e4..1d6d894 100644
> --- a/gcc/tree.c
> +++ b/gcc/tree.c
> @@ -10297,6 +10297,17 @@ uniform_vector_p (const_tree vec)
>return first;
>  }
>
> +  if (TREE_CODE (vec) == SSA_NAME)
> +{
> +  gimple def = SSA_NAME_DEF_STMT (vec);
> +  if (gimple_code (def) == GIMPLE_ASSIGN)


do  this:

 if (is_gimple_assign (def) && gimple_assign_copy_p (def))

> +{
> +  tree rhs = gimple_op (def, 1);
> +  if (VECTOR_TYPE_P (TREE_TYPE (rhs)))
> +return uniform_vector_p (rhs);
> +}
> +}
> +
>return NULL_TREE;
>  }

Do you have a test case showing what missed optimization this fix can enable ?

David


Re: [PATCH] Improving uniform_vector_p() function.

2013-10-01 Thread Xinliang David Li
On Tue, Oct 1, 2013 at 2:37 PM, Xinliang David Li  wrote:
> On Tue, Oct 1, 2013 at 10:31 AM, Cong Hou  wrote:
>> The current uniform_vector_p() function only returns non-NULL when the
>> vector is directly a uniform vector. For example, for the following
>> gimple code:
>>
>> vect_cst_.15_91 = {_9, _9, _9, _9, _9, _9, _9, _9};
>>
>>
>> The current implementation can only detect that {_9, _9, _9, _9, _9,
>> _9, _9, _9} is a uniform vector, but fails to recognize
>> vect_cst_.15_91 is also one. This simple patch searches through
>> assignment chains to find more uniform vectors.
>>
>>
>> thanks,
>> Cong
>>
>>
>>
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index 45c1667..b42f8a9 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,9 @@
>> +2013-10-01  Cong Hou  
>> +
>> +   * tree.c: Improve the function uniform_vector_p() so that a
>> +   vector assigned with a uniform vector is also treated as a
>> +   uniform vector.
>> +
>> diff --git a/gcc/tree.c b/gcc/tree.c
>> index 1c881e4..1d6d894 100644
>> --- a/gcc/tree.c
>> +++ b/gcc/tree.c
>> @@ -10297,6 +10297,17 @@ uniform_vector_p (const_tree vec)
>>return first;
>>  }
>>
>> +  if (TREE_CODE (vec) == SSA_NAME)
>> +{
>> +  gimple def = SSA_NAME_DEF_STMT (vec);
>> +  if (gimple_code (def) == GIMPLE_ASSIGN)
>
>
> do  this:
>
>  if (is_gimple_assign (def) && gimple_assign_copy_p (def))


Wrong comment from me. Should be

 if (gimple_assign_single_p (def))
  ..

David

>
>> +{
>> +  tree rhs = gimple_op (def, 1);
>> +  if (VECTOR_TYPE_P (TREE_TYPE (rhs)))
>> +return uniform_vector_p (rhs);
>> +}
>> +}
>> +
>>return NULL_TREE;
>>  }
>
> Do you have a test case showing what missed optimization this fix can enable ?
>
> David


Re: [google 4.7] fix line number checksum mismatch in lipo-use (issue6566044)

2012-09-28 Thread Xinliang David Li
ok (for google-47 and google/main)

thanks,

David

On Fri, Sep 28, 2012 at 10:22 AM, Rong Xu  wrote:
> Comments are inlined.
> Attached is the new patch.
>
> Thanks,
>
> -Rong
>
> On Tue, Sep 25, 2012 at 2:25 PM, Xinliang David Li  wrote:
>> On Mon, Sep 24, 2012 at 2:42 PM, Rong Xu  wrote:
>>> Hi,
>>>
>>> This is for google branches only.
>>> It fixes the line number checksum mismatch during LIPO-use build.
>>>
>>> Tested with SPEC and google internal benchmarks.
>>>
>>> Thanks,
>>>
>>> -Rong
>>>
>>> 2012-09-24  Rong Xu  
>>>
>>> * gcc/coverage.c (coverage_checksum_string): strip out LIPO
>>> specific string.
>>> (crc32_string_1): New function.
>>> * gcc/cp/decl2.c (start_static_storage_duration_function):
>>> generate LIPO specific string.
>>>
>>> Index: gcc/coverage.c
>>> ===
>>> --- gcc/coverage.c  (revision 191679)
>>> +++ gcc/coverage.c  (working copy)
>>> @@ -903,6 +903,27 @@
>>>  }
>>>
>>>
>>> +/* Generate a crc32 of a string with specified STR_ELN when it's not 0.
>>
>> STR_ELN --> STR_LEN
>
> Fixed.
>
>>
>>> +   Non-zero STR_LEN should only be seen in LIPO mode.  */
>>
>> Empty line needed.
>
> Fixed.
>
>>
>>> +static unsigned
>>> +crc32_string_1 (unsigned chksum, const char *string, unsigned str_len)
>>> +{
>>> +  char *dup;
>>> +
>>> +  if (!L_IPO_COMP_MODE || str_len == 0)
>>> +return crc32_string (chksum, string);
>>> +
>>> +  gcc_assert (str_len > 0 && str_len < strlen(string));
>>> +  dup = xstrdup (string);
>>> +  dup[str_len] = 0;
>>> +  chksum = crc32_string (chksum, dup);
>>> +  free (dup);
>>> +
>>> +  return chksum;
>>> +
>>
>> Remove extra lines after return.
>
> Fixed.
>
>>
>>> +
>>> +}
>>> +
>>>  /* Generate a checksum for a string.  CHKSUM is the current
>>> checksum.  */
>>>
>>> @@ -911,7 +932,26 @@
>>>  {
>>>int i;
>>>char *dup = NULL;
>>> +  unsigned lipo_orig_str_len = 0;
>>>
>>> +  /* Strip out the ending "_cmo_[0-9]*" string from function
>>> + name. Otherwise we will have lineno checksum mismatch.  */
>>> +  if (L_IPO_COMP_MODE)
>>> +{
>>> +  int len;
>>> +
>>> +  i = len = strlen (string);
>>> +  while (i--)
>>> +if ((string[i] < '0' || string[i] > '9'))
>>> +  break;
>>> +  if ((i > 5) && (i != len - 1))
>>
>>  i >= 5?
>
> This should not matter because we are expecting a non-empty sub-string
> before "_cmo_". If there not sub-string before "_cmo_", the original
> code will do nothing (which I think it's correct in the case of user
> defined name.)
>
>>
>>> +{
>>> +  if (!strncmp (string + i - 4, "_cmo_", 5))
>>
>> _cmo_ or .cmo. ?
>>
>>> +lipo_orig_str_len = i - 4;
>>> +}
>>> +
>>> +}
>>> +
>>>/* Look for everything that looks if it were produced by
>>>   get_file_function_name and zero out the second part
>>>   that may result from flag_random_seed.  This is not critical
>>> @@ -957,7 +997,7 @@
>>> }
>>>  }
>>>
>>> -  chksum = crc32_string (chksum, string);
>>> +  chksum = crc32_string_1 (chksum, string, lipo_orig_str_len);
>>>if (dup)
>>>  free (dup);
>>>
>>> Index: gcc/cp/decl2.c
>>> ===
>>> --- gcc/cp/decl2.c  (revision 191679)
>>> +++ gcc/cp/decl2.c  (working copy)
>>> @@ -2911,7 +2911,7 @@
>>>   SSDF_IDENTIFIER_.  */
>>>sprintf (id, "%s_%u", SSDF_IDENTIFIER, count);
>>>if (L_IPO_IS_AUXILIARY_MODULE)
>>> -sprintf (id, "%s_%u", id, current_module_id);
>>> +sprintf (id, "%s_cmo_%u", id, current_module_id);
>>
>> _cmo_ or .cmo. for consistency?
>
> Changed all "_cmo_" to ".cmo.".
>
>>
>> David
>>
>>>
>>>type = build_function_type_list (void_type_node,
>>>integer_type_node, integer_type_node,
>>>
>>> --
>>> This patch is available for review at http://codereview.appspot.com/6566044


Re: [PATCH] Fix powerpc breakage, was: Add option for dumping to stderr (issue6190057)

2012-10-01 Thread Xinliang David Li
On Mon, Oct 1, 2012 at 2:37 PM, Michael Meissner
 wrote:
> I tracked down some of the other code that previously used REPORT_DETAILS, and
> MSG_NOTE is the new way to do the same thing.  This bootstraps and no
> unexpected errors occur during make check.  Is it ok to install?
>
> 2012-10-01  Michael Meissner  
>
> * config/rs6000/rs6000.c (toplevel): Include dumpfile.h.
> (rs6000_density_test): Rework to accommodate 09-30 change by Sharad
> Singhai.
>
> * config/rs6000/t-rs6000 (rs6000.o): Add dumpfile.h dependency.
>
> Index: gcc/config/rs6000/rs6000.c
> ===
> --- gcc/config/rs6000/rs6000.c  (revision 191932)
> +++ gcc/config/rs6000/rs6000.c  (working copy)
> @@ -58,6 +58,7 @@
>  #include "tm-constrs.h"
>  #include "opts.h"
>  #include "tree-vectorizer.h"
> +#include "dumpfile.h"
>  #if TARGET_XCOFF
>  #include "xcoffout.h"  /* get declarations of xcoff_*_section_name */
>  #endif
> @@ -3518,11 +3519,11 @@ rs6000_density_test (rs6000_cost_data *d
>&& vec_cost + not_vec_cost > DENSITY_SIZE_THRESHOLD)
>  {
>data->cost[vect_body] = vec_cost * (100 + DENSITY_PENALTY) / 100;
> -  if (vect_print_dump_info (REPORT_DETAILS))
> -   fprintf (vect_dump,
> -"density %d%%, cost %d exceeds threshold, penalizing "
> -"loop body cost by %d%%", density_pct,
> -vec_cost + not_vec_cost, DENSITY_PENALTY);
> +  if (dump_kind_p (MSG_NOTE))

Is this check needed? Seems redundant.

David


> +   dump_printf_loc (MSG_NOTE, vect_location,
> +"density %d%%, cost %d exceeds threshold, penalizing 
> "
> +"loop body cost by %d%%", density_pct,
> +vec_cost + not_vec_cost, DENSITY_PENALTY);
>  }
>  }
>
> Index: gcc/config/rs6000/t-rs6000
> ===
> --- gcc/config/rs6000/t-rs6000  (revision 191932)
> +++ gcc/config/rs6000/t-rs6000  (working copy)
> @@ -26,7 +26,7 @@ rs6000.o: $(CONFIG_H) $(SYSTEM_H) corety
>$(OBSTACK_H) $(TREE_H) $(EXPR_H) $(OPTABS_H) except.h function.h \
>output.h dbxout.h $(BASIC_BLOCK_H) toplev.h $(GGC_H) $(HASHTAB_H) \
>$(TM_P_H) $(TARGET_H) $(TARGET_DEF_H) langhooks.h reload.h gt-rs6000.h \
> -  cfgloop.h $(OPTS_H) $(COMMON_TARGET_H)
> +  cfgloop.h $(OPTS_H) $(COMMON_TARGET_H) dumpfile.h
>
>  rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.c \
>  $(srcdir)/config/rs6000/rs6000-protos.h \
>
> --
> Michael Meissner, IBM
> 5 Technology Place Drive, M/S 2757, Westford, MA 01886-3141, USA
> meiss...@linux.vnet.ibm.com fax +1 (978) 399-6899
>


Re: [PATCH] Fix powerpc breakage, was: Add option for dumping to stderr (issue6190057)

2012-10-01 Thread Xinliang David Li
On Mon, Oct 1, 2012 at 4:05 PM, Sharad Singhai  wrote:
> Thanks for tracking down and fixing the powerpc port.
>
> The "dump_kind_p ()" check is redundant but canonical form here. I
> think blocks of dump code guarded by "if dump_kind_p (...)" might be
> easier to read/maintain.
>

I find it confusing to be honest. The redundant check serves no purpose.

David

> Sharad
> Sharad
>
>
> On Mon, Oct 1, 2012 at 3:45 PM, Xinliang David Li  wrote:
>> On Mon, Oct 1, 2012 at 2:37 PM, Michael Meissner
>>  wrote:
>>> I tracked down some of the other code that previously used REPORT_DETAILS, 
>>> and
>>> MSG_NOTE is the new way to do the same thing.  This bootstraps and no
>>> unexpected errors occur during make check.  Is it ok to install?
>>>
>>> 2012-10-01  Michael Meissner  
>>>
>>> * config/rs6000/rs6000.c (toplevel): Include dumpfile.h.
>>> (rs6000_density_test): Rework to accommodate 09-30 change by Sharad
>>> Singhai.
>>>
>>> * config/rs6000/t-rs6000 (rs6000.o): Add dumpfile.h dependency.
>>>
>>> Index: gcc/config/rs6000/rs6000.c
>>> ===
>>> --- gcc/config/rs6000/rs6000.c  (revision 191932)
>>> +++ gcc/config/rs6000/rs6000.c  (working copy)
>>> @@ -58,6 +58,7 @@
>>>  #include "tm-constrs.h"
>>>  #include "opts.h"
>>>  #include "tree-vectorizer.h"
>>> +#include "dumpfile.h"
>>>  #if TARGET_XCOFF
>>>  #include "xcoffout.h"  /* get declarations of xcoff_*_section_name */
>>>  #endif
>>> @@ -3518,11 +3519,11 @@ rs6000_density_test (rs6000_cost_data *d
>>>&& vec_cost + not_vec_cost > DENSITY_SIZE_THRESHOLD)
>>>  {
>>>data->cost[vect_body] = vec_cost * (100 + DENSITY_PENALTY) / 100;
>>> -  if (vect_print_dump_info (REPORT_DETAILS))
>>> -   fprintf (vect_dump,
>>> -"density %d%%, cost %d exceeds threshold, penalizing "
>>> -"loop body cost by %d%%", density_pct,
>>> -vec_cost + not_vec_cost, DENSITY_PENALTY);
>>> +  if (dump_kind_p (MSG_NOTE))
>>
>> Is this check needed? Seems redundant.
>>
>> David
>>
>>
>>> +   dump_printf_loc (MSG_NOTE, vect_location,
>>> +"density %d%%, cost %d exceeds threshold, 
>>> penalizing "
>>> +"loop body cost by %d%%", density_pct,
>>> +vec_cost + not_vec_cost, DENSITY_PENALTY);
>>>  }
>>>  }
>>>
>>> Index: gcc/config/rs6000/t-rs6000
>>> ===
>>> --- gcc/config/rs6000/t-rs6000  (revision 191932)
>>> +++ gcc/config/rs6000/t-rs6000  (working copy)
>>> @@ -26,7 +26,7 @@ rs6000.o: $(CONFIG_H) $(SYSTEM_H) corety
>>>$(OBSTACK_H) $(TREE_H) $(EXPR_H) $(OPTABS_H) except.h function.h \
>>>output.h dbxout.h $(BASIC_BLOCK_H) toplev.h $(GGC_H) $(HASHTAB_H) \
>>>$(TM_P_H) $(TARGET_H) $(TARGET_DEF_H) langhooks.h reload.h gt-rs6000.h \
>>> -  cfgloop.h $(OPTS_H) $(COMMON_TARGET_H)
>>> +  cfgloop.h $(OPTS_H) $(COMMON_TARGET_H) dumpfile.h
>>>
>>>  rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.c \
>>>  $(srcdir)/config/rs6000/rs6000-protos.h \
>>>
>>> --
>>> Michael Meissner, IBM
>>> 5 Technology Place Drive, M/S 2757, Westford, MA 01886-3141, USA
>>> meiss...@linux.vnet.ibm.com fax +1 (978) 399-6899
>>>


Re: Propagate profile counts during switch expansion

2012-10-03 Thread Xinliang David Li
What is the status of the switch expansion GIMPLE rewrite? If it is not
planned for 4.8, it will be desirable to include this fix in trunk.
It also helps set up a good baseline to test against for regressions.


thanks,

David

On Tue, Oct 2, 2012 at 6:09 PM, Easwaran Raman  wrote:
> Hi,
>  This patch propagates the profile counts during RTL expansion. In
> many cases, there is no way to determine the exact count of an edge
> generated during the expansion. So this patch uses some simple
> heuristics to estimate the edge counts but ensures that the counts of
> the basic blocks corresponding to the cases are (nearly the) same as
> at the gimple level.
>
> Bootstrapped and profile-bootstrapped on an x86_64/linux machine. OK for 
> trunk?
>
> - Easwaran
>
> --
> 2012-10-02   Easwaran Raman  
>
> * cfgbuild.c (gen_probabilities_from_existing_counts): New function.
> (compute_outgoing_frequencies): If at least one successor of a
> BB has non-zero profile count, use the counts to compute
> probabilities.
> * expr.c (do_tablejump): Add a REG_BR_PROB note on the
> jump to default label.
> (try_tablejump): Add a parameter to specify the probability
> of jumping to the default label.
> * expr.h (try_tablejump): Add a new parameter.
> * stmt.c (case_node): Add new fields COUNT and SUBTREE_COUNT.
> (do_jump_if_equal): Pass probability for REG_BR_PROB note.
> (add_case_node): Pass execution count of the case node and use
> it to initialize COUNT field.
> (emit_case_decision_tree): Pass default_count to emit_case_nodes.
> (get_outgoing_edge_counts): New function.
> (add_prob_note_to_last_insn): Likewise.
> (case_probability): New macro.
> (emit_case_dispatch_table): Compute probability of jumping to default
> label and apply note to the jump.
> (expand_case): Compute and propagate default edge count to
> emit_case_dispatch_table.
> (expand_sjlj_dispatch_table): Update calls to add_case_node and
> emit_case_dispatch_table.
> (balance_case_nodes): Update subtree_counts.
> (emit_case_nodes): Compute edge probabilities and add note.
>
> gcc/testsuite/ChangeLog:
> 2012-10-02   Easwaran Raman  
> * gcc.dg/tree-prof/switch-case-1.c: New test case.
> * gcc.dg/tree-prof/switch-case-2.c: New test case.
>
> Index: gcc/testsuite/gcc.dg/tree-prof/switch-case-1.c
> ===
> --- gcc/testsuite/gcc.dg/tree-prof/switch-case-1.c (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-prof/switch-case-1.c (revision 0)
> @@ -0,0 +1,40 @@
> +/* { dg-options "-O2 -fdump-rtl-expand-all" } */
> +int g;
> +
> +__attribute__((noinline)) void foo (int  n)
> +{
> +  switch (n)
> +{
> +case 1:
> +  g++; break;
> +case 2:
> +  g += 2; break;
> +case 3:
> +  g += 1; break;
> +case 4:
> +  g += 3; break;
> +case 5:
> +  g += 4; break;
> +case 6:
> +  g += 5; break;
> +case 7:
> +  g += 6; break;
> +case 8:
> +  g += 7; break;
> +case 9:
> +  g += 8; break;
> +default:
> +  g += 8; break;
> +   }
> +}
> +
> +int main ()
> +{
> + int i;
> + for (i = 0; i < 1; i++)
> +   foo ((i * i) % 5);
> + return 0;
> +}
> +/* { dg-final-use { scan-rtl-dump-times ";; basic block\[^\\n\]*count
> 4000" 2 "expand"} } */
> +/* { dg-final-use { scan-rtl-dump-times ";; basic block\[^\\n\]*count
> 2000" 1 "expand"} } */
> +/* { dg-final-use { cleanup-rtl-dump "expand" } } */
> Index: gcc/testsuite/gcc.dg/tree-prof/switch-case-2.c
> ===
> --- gcc/testsuite/gcc.dg/tree-prof/switch-case-2.c (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-prof/switch-case-2.c (revision 0)
> @@ -0,0 +1,40 @@
> +/* { dg-options "-O2 -fdump-rtl-expand-all" } */
> +int g;
> +
> +__attribute__((noinline)) void foo (int  n)
> +{
> +  switch (n)
> +{
> +case 99:
> +  g += 2; break;
> +case 1:
> +  g++; break;
> +case 100:
> +  g += 1; break;
> +case 4:
> +  g += 3; break;
> +case 5:
> +  g += 4; break;
> +case 6:
> +  g += 5; break;
> +case 7:
> +  g += 6; break;
> +case 8:
> +  g += 7; break;
> +case 9:
> +  g += 8; break;
> +default:
> +  g += 8; break;
> +   }
> +}
> +
> +int main ()
> +{
> + int i;
> + for (i = 0; i < 1; i++)
> +   foo ((i * i) % 5);
> + return 0;
> +}
> +/* { dg-final-use { scan-rtl-dump-times ";; basic block\[^\\n\]*count
> 4000" 2 "expand"} } */
> +/* { dg-final-use { scan-rtl-dump-times ";; basic block\[^\\n\]*count
> 2000" 1 "expand"} } */
> +/* { dg-final-use { cleanup-rtl-dump "expand" } } */
> Index: gcc/expr.c
> ===
> --- gcc/expr.c (revision 191879)
> +++ gcc/expr.c (working copy)
> @@ -154,7 +154,7 @@ static rtx do_store_flag (sepops, rtx, enum machin
>  #ifdef PUSH_ROUNDING
>  static void emit_single_push_insn (enum machine_mode, rtx, tree);
>  #endif
> -static void do_tablejump (rtx, enum machine_mode, rtx, rtx, rtx);
> +stati

Re: Propagate profile counts during switch expansion

2012-10-03 Thread Xinliang David Li
thanks for the update!

David

On Wed, Oct 3, 2012 at 10:37 AM, Steven Bosscher  wrote:
> On Wed, Oct 3, 2012 at 6:12 PM, Xinliang David Li  wrote:
>> What is the status of the switch expansion GIMPLE rewrite? If it is not
>> planned for 4.8, it will be desirable to include this fix in trunk.
>
> I could work on it for GCC 4.8 (there's not a lot of work left to be
> done for it now) but we haven't really decided yet where the pass
> should be scheduled and I also would like to wait a bit to see how the
> SJLJ changes work out.  So I talked about this with Easwaran, I think
> his patch should be included into the trunk now. I'll adapt it for the
> move to GIMPLE.
>
>> It also helps set up a good base line to test against regression.
>
> Agreed.
>
> Ciao!
> Steven


Re: User directed Function Multiversioning via Function Overloading (issue5752064)

2012-10-05 Thread Xinliang David Li
Hi Jason, Sri has addressed the comments you had on FE part. Can you
take a look if it is ok?   Stage-1 is going to be closed soon, and we
hope to get this major feature in 4.8.

thanks,

David



On Tue, Sep 18, 2012 at 9:29 AM, Sriraman Tallam  wrote:
> Ping.
>
> On Fri, Aug 24, 2012 at 5:34 PM, Sriraman Tallam  wrote:
>> Hi Jason,
>>
>>I have created a new patch to use target hooks for all the
>> functionality and make the front-end just call the target hooks at the
>> appropriate places. This is more like what you suggested in a previous
>> mail. In particular, target hooks address the following questions:
>>
>> * Determine if two function decls with the same signature are versions.
>> * Determine the new assembler name of a function version.
>> * Generate the dispatcher function for a set of function versions.
>> * Compare versions to see if one has a higher priority over the other.
>>
>> Patch attached and also available for review at:
>>
>> http://codereview.appspot.com/5752064/
>>
>> Hope this is more along the lines of what you had in mind, please let
>> me know what you think.
>>
>> Thanks,
>> -Sri.
>>
>>
>> On Mon, Jul 30, 2012 at 12:01 PM, Sriraman Tallam  
>> wrote:
>>> On Thu, Jul 19, 2012 at 1:39 PM, Jason Merrill  wrote:

 On 07/10/2012 03:14 PM, Sriraman Tallam wrote:
>
> I am using the questions you asked previously
> to explain how I solved each of them. When working on this patch, these
> are the exact questions I had and tried to address it.
>
> * Does this attribute affect a function signature?
>
> The function signature should be changed when there is more than one
> definition/declaration of foo distinguished by unique target attributes.

 >[...]

 I agree.  I was trying to suggest that these questions are what the front 
 end needs to care about, not about versioning specifically.  If these 
 questions are turned into target hooks, all of the logic specific to 
 versioning can be contained in the target.

 My only question intended to be answered by humans is, do people think 
 moving the versioning logic behind more generic target hooks is worthwhile?
>>>
>>> I have  some comments related
>>>
>>> For the example below,
>>>
>>> // Default version.
>>> int foo ()
>>> {
>>>   .
>>> }
>>>
>>> // Version  XXX feature supported by Target ABC.
>>> int foo __attribute__ ((target ("XXX")))
>>> {
>>>
>>> }
>>>
>>> How should the second version of foo be treated for targets where
>>> feature XXX is not supported? Right now, I am working on having my
>>> patch completely ignore such function versions when compiled for
>>> targets that do not understand the attribute. I could move this check
>>> into a generic target hook so that a function definition that does not
>>> make sense for the current target is ignored.
>>>
>>> Also, currently the patch uses target hooks to do the following:
>>>
>>> - Find if a particular version can be called directly, rather than go
>>> through the dispatcher.
>>> - Determine what the dispatcher body should be.
>>> - Determining the order in which function versions must be dispatched.
>>>
>>> I do not have a strong opinion on whether the entire logic should be
>>> based on target hooks.
>>>
>>> Thanks,
>>> -Sri.
>>>



 Jason


Re: [google] AutoFDO implementation

2012-10-05 Thread Xinliang David Li
thanks. That will be helpful.

David


On Fri, Oct 5, 2012 at 2:09 PM, Dehao Chen  wrote:
> Sure, I'll add a detailed documentation in a gcc wiki page.
>
> Dehao
>
> On Fri, Oct 5, 2012 at 2:01 PM, Xinliang David Li  wrote:
>> Dehao, the file auto-profile.c has some high level description of
>> aFDO, but I think it is too sparse. Can you write up a gcc wiki page
>> and point the details to that page in auto-profile.c?
>>
>> The documentation should focus more on the differences (mainly the
>> profile-use phase) between sample based FDO and instrumentation based
>> FDO. The description there should explain various autoFDO specific
>> tunings in cgraph build, ipa-inline, cloning, introduction of
>> total_count and rationale etc. The main source of difference comes
>> from differences in the points of profiling, but some small examples
>> would help.
>>
>> Most of the changes guarded by flag_auto_profile need some comments.
>>
>> thanks,
>>
>> David
>>
>> On Fri, Sep 28, 2012 at 5:22 PM, Dehao Chen  wrote:
>>> Hi,
>>>
>>> This patch implements the fine-graind AutoFDO optimizations for GCC.
>>> It uses linux perf to collect sample profiles, and uses debug info to
>>> represent the profile. In GCC, it uses the profile to annotate CFG to
>>> drive FDO. This can bring 50% to 110% of the speedup derived by
>>> traditional instrumentation based FDO. (Average is between 70% to 80%
>>> for many CPU intensive applications). Comparing with traditional FDO,
>>> AutoFDO does not require instrumentation. It just need to have an
>>> optimized binary with debug info to collect the profile.
>>>
>>> This patch has passed bootstrap and gcc regression tests as well as
>>> tested with crosstool. Okay for google branches?
>>>
>>> If people in up-stream find this feature interesting, I'll spend some
>>> time to port this to trunk and try to opensource the tool to generate
>>> profile data file.
>>>
>>> Dehao
>>>
>>> The patch can also be viewed from:
>>>
>>> http://codereview.appspot.com/6567079
>>>
>>> gcc/ChangeLog.google-4_7:
>>> 2012-09-28  Dehao Chen  
>>>
>>> * cgraphbuild.c (build_cgraph_edges): Handle AutoFDO profile.
>>> (rebuild_cgraph_edges): Likewise.
>>> * cgraph.c (cgraph_clone_node): Likewise.
>>> (clone_function_name): Likewise.
>>> * cgraph.h (cgraph_node): New field.
>>> * tree-pass.h (pass_ipa_auto_profile): New pass.
>>> * cfghooks.c (make_forwarder_block): Handle AutoFDO profile.
>>> * ipa-inline-transform.c (clone_inlined_nodes): Likewise.
>>> * toplev.c (compile_file): Likewise.
>>> (process_options): Likewise.
>>> * debug.h (auto_profile_debug_hooks): New.
>>> * cgraphunit.c (cgraph_finalize_compilation_unit): Handle AutoFDO
>>> profile.
>>> (cgraph_copy_node_for_versioning): Likewise.
>>> * regs.h (REG_FREQ_FROM_BB): Likewise.
>>> * gcov-io.h: (GCOV_TAG_AFDO_FILE_NAMES): New.
>>> (GCOV_TAG_AFDO_FUNCTION): New.
>>> (GCOV_TAG_AFDO_MODULE_GROUPING): New.
>>> * ira-int.h (REG_FREQ_FROM_EDGE_FREQ): Handle AutoFDO profile.
>>> * ipa-inline.c (edge_hot_enough_p): Likewise.
>>> (edge_badness): Likewise.
>>> (inline_small_functions): Likewise.
>>> * dwarf2out.c (auto_profile_debug_hooks): New.
>>> * opts.c (common_handle_option): Handle AutoFDO profile.
>>> * timevar.def (TV_IPA_AUTOFDO): New.
>>> * predict.c (compute_function_frequency): Handle AutoFDO profile.
>>> (rebuild_frequencies): Handle AutoFDO profile.
>>> * auto-profile.c (struct gcov_callsite_pos): New.
>>> (struct gcov_callsite): New.
>>> (struct gcov_stack): New.
>>> (struct gcov_function): New.
>>> (struct afdo_bfd_name): New.
>>> (struct afdo_module): New.
>>> (afdo_get_filename): New.
>>> (afdo_get_original_name_size): New.
>>> (afdo_get_bfd_name): New.
>>> (afdo_read_bfd_names): New.
>>> (afdo_stack_hash): New.
>>> (afdo_stack_eq): New.
>>> (afdo_function_hash): New.
>>> (afdo_function_eq): New.
>>> (afdo_bfd_name_hash): New.
>>> (afdo_bfd_name_eq): New.
>>> (afdo_bfd_name_del): New.
>>> (afdo_module_hash): New.
>>> (afdo_module_eq): New.
>>> (afdo_module_num_strings): New.
>>> (afdo_add_module): New.
>>> (read_aux_modules): New.
>>> (get_inline_stack_size_by_stmt): New.
>>> (get_inline_stack_size_by_edge): New.

Re: [google] Emit relative addresses to function patch sections instead of absolute addresses. (issue6572065)

2012-10-06 Thread Xinliang David Li
xray feature is not in trunk yet.

David

On Fri, Oct 5, 2012 at 3:53 PM, Diego Novillo  wrote:
> Harshit, why didn't you propose this patch for trunk?  Why should we
> make it a google-local patch?
>
>
> Diego.
>
> On Fri, Sep 28, 2012 at 5:24 AM, Harshit Chopra  wrote:
>> commit fc3a55ccec9bc770c79f8a221f5abd397befc8f6
>> Author: Harshit Chopra 
>> Date:   Thu Sep 20 17:49:59 2012 -0700
>>
>> Instead of emitting absolute addresses to the function patch sections, 
>> emit relative addresses. Absolute addresses might require relocation, which 
>> is time consuming and fraught with other issues.
>>
>> M   gcc/config/i386/i386.c
>>
>> Tested:
>>   Ran make check-gcc and manually confirmed that the affected tests pass.
>>
>> ChangeLog:
>>
>> 2012-09-28  Harshit Chopra  
>>
>> * gcc/config/i386/i386.c 
>> (ix86_output_function_nops_prologue_epilogue): Emit relative address to 
>> function patch sections.
>>
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index f72b0b5..8c9334f 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -11098,7 +11098,7 @@ ix86_output_function_nops_prologue_epilogue (FILE 
>> *file,
>> $LFPEL0:
>>   
>>   0x90 (repeated num_actual_nops times)
>> - .quad $LFPESL0
>> + .quad $LFPESL0 - .
>>   followed by section 'section_name' which contains the address
>>   of instruction at 'label'.
>> */
>> @@ -0,7 +0,10 @@ ix86_output_function_nops_prologue_epilogue (FILE 
>> *file,
>>  asm_fprintf (file, ASM_BYTE"0x90\n");
>>
>>fprintf (file, ASM_QUAD);
>> +  /* Output "section_label - ." for the relative address of the entry in
>> + the section 'section_name'.  */
>>assemble_name_raw (file, section_label);
>> +  fprintf (file, " - .");
>>fprintf (file, "\n");
>>
>>/* Emit the backpointer section. For functions belonging to comdat group,
>> @@ -11144,7 +11147,7 @@ ix86_output_function_nops_prologue_epilogue (FILE 
>> *file,
>>   .quad $LFPEL0
>> */
>>ASM_OUTPUT_INTERNAL_LABEL (file, section_label);
>> -  fprintf(file, ASM_QUAD"\t");
>> +  fprintf(file, ASM_QUAD);
>>assemble_name_raw (file, label);
>>fprintf (file, "\n");
>>
>>
>> --
>> This patch is available for review at http://codereview.appspot.com/6572065


Re: [google] Emit relative addresses to function patch sections instead of absolute addresses. (issue6572065)

2012-10-06 Thread Xinliang David Li
Ok for google branches.

Please consider resend the original xray patch to trunk (gcc-4_8) You
need to make the runtime bits available publicly though.

thanks,

David

On Fri, Sep 28, 2012 at 2:24 AM, Harshit Chopra  wrote:
> commit fc3a55ccec9bc770c79f8a221f5abd397befc8f6
> Author: Harshit Chopra 
> Date:   Thu Sep 20 17:49:59 2012 -0700
>
> Instead of emitting absolute addresses to the function patch sections, 
> emit relative addresses. Absolute addresses might require relocation, which 
> is time consuming and fraught with other issues.
>
> M   gcc/config/i386/i386.c
>
> Tested:
>   Ran make check-gcc and manually confirmed that the affected tests pass.
>
> ChangeLog:
>
> 2012-09-28  Harshit Chopra  
>
> * gcc/config/i386/i386.c 
> (ix86_output_function_nops_prologue_epilogue): Emit relative address to 
> function patch sections.
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index f72b0b5..8c9334f 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -11098,7 +11098,7 @@ ix86_output_function_nops_prologue_epilogue (FILE 
> *file,
> $LFPEL0:
>   
>   0x90 (repeated num_actual_nops times)
> - .quad $LFPESL0
> + .quad $LFPESL0 - .
>   followed by section 'section_name' which contains the address
>   of instruction at 'label'.
> */
> @@ -0,7 +0,10 @@ ix86_output_function_nops_prologue_epilogue (FILE 
> *file,
>  asm_fprintf (file, ASM_BYTE"0x90\n");
>
>fprintf (file, ASM_QUAD);
> +  /* Output "section_label - ." for the relative address of the entry in
> + the section 'section_name'.  */
>assemble_name_raw (file, section_label);
> +  fprintf (file, " - .");
>fprintf (file, "\n");
>
>/* Emit the backpointer section. For functions belonging to comdat group,
> @@ -11144,7 +11147,7 @@ ix86_output_function_nops_prologue_epilogue (FILE 
> *file,
>   .quad $LFPEL0
> */
>ASM_OUTPUT_INTERNAL_LABEL (file, section_label);
> -  fprintf(file, ASM_QUAD"\t");
> +  fprintf(file, ASM_QUAD);
>assemble_name_raw (file, label);
>fprintf (file, "\n");
>
>
> --
> This patch is available for review at http://codereview.appspot.com/6572065


Re: [asan] New transitional branch to port ASAN to trunk

2012-10-10 Thread Xinliang David Li
Is there an agreed way for file naming?

David

On Wed, Oct 10, 2012 at 1:48 PM, Steven Bosscher  wrote:
> On Wed, Oct 10, 2012 at 10:20 PM, Diego Novillo wrote:
>> * tree-asan.c: New file.
>> * tree-asan.h: New file.
>
> Nit: do we still need the "tree-" prefix? IMHO not.
>
> Ciao!
> Steven


Re: [asan] New transitional branch to port ASAN to trunk

2012-10-10 Thread Xinliang David Li
On Wed, Oct 10, 2012 at 2:08 PM, Steven Bosscher  wrote:
> On Wed, Oct 10, 2012 at 11:00 PM, Xinliang David Li wrote:
>> Is there an agreed way for file naming?
>
> It was not my intent to start a bike shed discussion. This was just
> something I've been wondering for some time. But AFAIC it's up to
> Diego&co to do what they think is right :-)

Ok. This is more relevant to GCC's flat source structure, which is
a different problem.

David

>
> Ciao!
> Steven


Re: Use conditional casting with symtab_node

2012-10-10 Thread Xinliang David Li
In a different thread, I proposed the following alternative to 'try_xxx':

template <typename T> T* symbol::cast_to(symbol* p) {
   if (p->is<T>())
  return static_cast<T*>(p);
   return 0;
 }

cast:

template <typename T> T& symbol::as(symbol* p) {
   assert(p->is<T>())
   return static_cast<T&>(*p);

 }

David

On Wed, Sep 19, 2012 at 2:17 AM, Richard Guenther
 wrote:
> On Wed, Sep 19, 2012 at 9:29 AM, Eric Botcazou  wrote:
>>>
>>> The language syntax would bind the conditional into the intializer, as in
>>>
>>>   if (varpool_node *vnode = (node->try_variable ()
>>>  && vnode->finalized))
>>> varpool_analyze_node (vnode);
>>>
>>> which does not type-match.
>>>
>>> So, if you want the type saftey and performance, the cascade is really
>>> unavoidable.
>>
>> Just write:
>>
>>   varpool_node *vnode;
>>
>>   if ((vnode = node->try_variable ()) && vnode->finalized)
>> varpool_analyze_node (vnode);
>>
>> This has been the standard style for the past 2 decades and trading it for
>> cascading if's is really a bad idea.
>
> Indeed.  Btw, can we not provide a specialization for dynamic_cast <>?
> This ->try_... looks awkward to me compared to the more familiar
>
>   vnode = dynamic_cast <varpool_node *> (node)
>
> but yeah - dynamic_cast is not a template ... (but maybe there is some
> standard library piece that mimics it?).
>
> Richard.
>
>> --
>> Eric Botcazou


  1   2   3   4   5   6   7   8   9   10   >