> Hi,
> 
> I've implemented hierarchical discriminators for AutoFDO.
> This improves AutoFDO profile accuracy in the following ways:
> - Loop iterations are now uniquely identifiable in profile data.
> - The profile can distinguish which iteration of an unrolled loop executed hotly, and so on.
> 
> The discriminator in AutoFDO is extended from 16 bits to 32 bits
> with three fields:
> 
>   - Base (12 bits): Traditional same-line disambiguation
>   - Pass1 (12 bits): Optimization context (loop versioning, inlining)
>   - Pass2 (8 bits): Code duplication (loop unrolling, peeling)
> 
> The inline context tracking (pass1 discriminators for inlining) is NOT added
> to this patch, as initial testing did not show performance improvements.
> 
> We could enable hierarchical discriminators conditionally via a compiler
> parameter or a specific compiler option if that is preferred.
> 
> Bootstrapped and regression tested. Initial testing on Spec2017 with AutoFDO 
> shows some good improvements. I am rerunning the full suite and will update 
> the results.
> 
> Is this OK?


> diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
> index cf7a2191336..3eae14aad53 100644
> --- a/gcc/auto-profile.cc
> +++ b/gcc/auto-profile.cc
> @@ -178,9 +178,10 @@ private:

> +  /* Callsite, represented as (decl_lineno, callee_function_name_index).
> +     decl_lineno is now 64-bit to support hierarchical discriminators:
> +     upper 32 bits: line offset, lower 32 bits: hierarchical discriminator.  
> */

I am not sure the "is now 64-bit to support" part of the comment is going to
make sense in the future.  I would just document the encoding without
referring to the past.  Similarly later in the patch.  In a few years those
will no longer be new...
> +  typedef std::pair<uint64_t, unsigned> callsite;
>  
>    /* Map from callsite to callee function_instance.  */
>    typedef std::map<callsite, function_instance *> callsite_map;
> @@ -480,8 +483,10 @@ private:
>    {
>    }
>  
> -  /* Map from source location (decl_lineno) to profile (count_info).  */
> -  typedef std::map<unsigned, count_info> position_count_map;
> +  /* Map from source location (decl_lineno) to profile (count_info).
> +     Key is 64-bit to support hierarchical discriminators:
> +     upper 32 bits: line offset, lower 32 bits: hierarchical discriminator.  
> */

Maybe you can give it a typedef name, like combined_loc_t,
and move the comment to one place?
> +  typedef std::map<uint64_t, count_info> position_count_map;
>  
>    /* function_instance name index in the string_table.  */
>    unsigned name_;
> @@ -495,7 +500,9 @@ private:
>    /* Map from callsite location to callee function_instance.  */
>    callsite_map callsites;
>  
> -  /* Map from source location to count_info.  */
> +  /* Map from source location to count_info.
> +     During profile reading, entries are aggregated by (line_offset, base)
> +     to strip pass1 and pass2 discriminators.  */

I am also not sure we want to have pass1/2.  Our debug statements are
closer to pseudo-probes.  Perhaps say "to strip hierarchical discriminators".
>    position_count_map pos_counts;
>  
>    /* True if function was removed from indir target list.  */
> @@ -659,29 +666,65 @@ get_original_name (const char *name, bool alloc = true)
>    return ret;
>  }
>  
> -/* Return the combined location, which is a 32bit integer in which
> -   higher 16 bits stores the line offset of LOC to the start lineno
> -   of DECL, The lower 16 bits stores the discriminator.  */
> +/* Extract line offset from a 64-bit combined location.  */
>  
> -static unsigned
> +static inline int
> +get_line_offset_from_combined_loc (uint64_t combined_loc)
> +{
> +  return (int)(combined_loc >> 32);
> +}
> +
> +/* Extract 32-bit hierarchical discriminator from a 64-bit combined
> +   location.  */
> +
> +static inline unsigned int
> +get_discriminator_from_combined_loc (uint64_t combined_loc)
> +{
> +  return (unsigned int)(combined_loc & 0xFFFFFFFFULL);
> +}
> +
> +/* Extract base discriminator (bits 0-11) from a 32-bit hierarchical
> +   discriminator.  */
> +
> +static inline unsigned int
> +get_base_discriminator (unsigned int discriminator)
> +{
> +  return discriminator & DISCR_BASE_MASK;
> +}
> +
> +/* Return the combined location, which is a 64-bit integer in which
> +   upper 32 bits stores the line offset of LOC to the start lineno
> +   of DECL, and the lower 32 bits stores the hierarchical discriminator.
> +   This supports the new hierarchical discriminator layout:
> +   Base (12 bits) | Pass1 (12 bits) | Pass2 (8 bits).  */

This seems to simply combine the relative offset (32-bit signed) and the
discriminator (32-bit unsigned) without caring about passes.  So I think
the comment does not need to speak about new hierarchical discriminators.
> +
> +static uint64_t
>  get_combined_location (location_t loc, tree decl)
>  {
>    bool warned = false;
> -  /* TODO: allow more bits for line and less bits for discriminator.  */
> -  if ((LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) >= (1<<15)
> -      || (LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) <= -(1<<15))
> -    warned = warning_at (loc, OPT_Wauto_profile,
> -                      "auto-profile cannot encode offset %i "
> -                      "that exceeds 16 bytes",
> -                      LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl));
> -  if (warned)
> -    inform (DECL_SOURCE_LOCATION (decl), "location offset is related to");
> -  if ((unsigned)get_discriminator_from_loc (loc) >= (1u << 16))
> -    warning_at (loc, OPT_Wauto_profile,
> -             "auto-profile cannot encode discriminators "
> -             "that exceeds 16 bytes");
> -  return ((unsigned)(LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16)
> -      | get_discriminator_from_loc (loc);
> +  int line_offset = LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl);
> +
> +  /* Check if line offset fits in 32 bits (signed) within the 64-bit
> +     combined location format.  */
> +  if (line_offset >= (1LL << 31) || line_offset <= -(1LL << 31))
> +    {
> +      warned = warning_at (loc, OPT_Wauto_profile,
> +                        "auto-profile cannot encode line offset %i "
> +                        "in 64-bit combined location format",
> +                        line_offset);
> +      if (warned)
> +     inform (DECL_SOURCE_LOCATION (decl), "location offset is related to");
> +      /* Clamp to valid range.  */
> +      if (line_offset >= (1LL << 31))
> +     line_offset = (1LL << 31) - 1;
> +      else
> +     line_offset = -(1LL << 31);
> +    }
> +
> +  unsigned int discriminator = get_discriminator_from_loc (loc);
> +
> +  /* The discriminator now supports 32 bits for hierarchical layout.  */
> +  return (((uint64_t)(unsigned int)line_offset) << 32) | discriminator;
>  }
>  
>  /* Return the function decl of a given lexical BLOCK.  */
> @@ -695,15 +738,15 @@ get_function_decl_from_block (tree block)
>    return BLOCK_ABSTRACT_ORIGIN (block);
>  }
>  
> -/* Dump LOC to F.  */
> +/* Dump line offset and discriminator to F.  */
>  
>  static void
> -dump_afdo_loc (FILE *f, unsigned loc)
> +dump_afdo_loc (FILE *f, int line_offset, unsigned int discriminator)
>  {
> -  if (loc & 65535)
> -    fprintf (f, "%i.%i", loc >> 16, loc & 65535);
> +  if (discriminator)
> +    fprintf (f, "%i.%u", line_offset, discriminator);
>    else
> -    fprintf (f, "%i", loc >> 16);
> +    fprintf (f, "%i", line_offset);
>  }
>  
>  /* Return assembler name as in symbol table and DW_AT_linkage_name.  */
> @@ -737,7 +780,7 @@ dump_inline_stack (FILE *f, inline_stack *stack)
>        fprintf (f, "%s%s:",
>              first ? "" : "; ",
>              raw_symbol_name (p.decl));
> -      dump_afdo_loc (f, p.afdo_loc);
> +      dump_afdo_loc (f, p.line_offset, p.discriminator);
>        first = false;
>      }
>    fprintf (f, "\n");
> @@ -764,12 +807,15 @@ get_inline_stack (location_t locus, inline_stack *stack,
>              continue;
>  
>            tree decl = get_function_decl_from_block (block);
> -          stack->safe_push (
> -           {decl, get_combined_location (locus, decl), locus});
> +       int line_offset = LOCATION_LINE (locus) - DECL_SOURCE_LINE (decl);
> +       unsigned int discriminator = get_discriminator_from_loc (locus);
> +       stack->safe_push ({decl, line_offset, discriminator, locus});
>            locus = tmp_locus;
>          }
>      }
> -  stack->safe_push ({fn, get_combined_location (locus, fn), locus});
> +  int line_offset = LOCATION_LINE (locus) - DECL_SOURCE_LINE (fn);
> +  unsigned int discriminator = get_discriminator_from_loc (locus);
> +  stack->safe_push ({fn, line_offset, discriminator, locus});
>  }
>  
>  /* Same as get_inline_stack for a given node which may be
> @@ -802,7 +848,7 @@ get_inline_stack_in_node (location_t locus, inline_stack 
> *stack,
>     LOC to the start lineno of DECL, The lower 16 bits stores the
>     discriminator.  */
>  
> -static unsigned
> +static uint64_t
>  get_relative_location_for_locus (tree fn, tree block, location_t locus)
>  {
>    if (LOCATION_LOCUS (locus) == UNKNOWN_LOCATION)
> @@ -818,7 +864,7 @@ get_relative_location_for_locus (tree fn, tree block, 
> location_t locus)
>  
>  /* Return combined location of STMT in function FN.  */
>  
> -static unsigned
> +static uint64_t
>  get_relative_location_for_stmt (tree fn, gimple *stmt)
>  {
>    return get_relative_location_for_locus
> @@ -826,6 +872,16 @@ get_relative_location_for_stmt (tree fn, gimple *stmt)
>          gimple_location (stmt));
>  }
>  
> +/* Create a lookup key from line_offset and discriminator.
> +   Strips pass1 and pass2 from discriminator, keeping only base.  */
Probably just "... line_offset and base discriminator", again not
speaking about pass1/2.
> +
> +static uint64_t
> +make_profile_lookup_key (int line_offset, unsigned int discriminator)
Perhaps profile_lookup_key is shorter and gives the same information...
> @@ -1919,16 +2003,32 @@ function_instance::debug () const
>    dump (stderr);
>  }
>  
> -/* Return profile info for LOC in INFO.  */
> +/* Return profile info for LOC in INFO.
> +
> +   For hierarchical discriminators, we aggregate counts across all
> +   pass1/pass2 values that share the same line offset and base
It is called base discriminator earlier, so I would use it here too.
> +   discriminator.  This is necessary because during profile-guided
> +   optimization, the code structure may differ from the training run
> +   (e.g., different unrolling decisions), but we still want to use the
> +   profile data for the same logical source location.  */
>  
>  bool
>  function_instance::get_count_info (location_t loc, count_info *info) const
>  {
> -  position_count_map::const_iterator iter = pos_counts.find (loc);
> -  if (iter == pos_counts.end ())
> -    return false;
> -  *info = iter->second;
> -  return true;
> +  /* Direct lookup using combined location which contains (line_offset, 
> base).
> +     Profile data was aggregated during reading by (line_offset, base),
> +     stripping only pass1 and pass2 discriminators.
> +     At afdo_offline pass, discriminators only have base component
> +     (pass1=0, pass2=0).  */
Is multiplicity handled on the autofdo tool side?
> +
> +  position_count_map::const_iterator it = pos_counts.find (loc);
> +  if (it != pos_counts.end ())
> +    {
> +      *info = it->second;
> +      return true;
> +    }
> +
> +  return false;
>  }
>  
>  /* Read the inlined indirect call target profile for STMT and store it in

The changes to auto-profile look OK to me with the comments above.
I will comment on the other changes in the next email.

Honza
> diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
> index dda2fb661d8..91c3e88d567 100644
> --- a/gcc/cfgloopmanip.cc
> +++ b/gcc/cfgloopmanip.cc
> @@ -34,6 +34,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "sreal.h"
>  #include "tree-cfg.h"
>  #include "tree-pass.h"
> +#include "hierarchical_discriminator.h"
>  
>  static void copy_loops_to (class loop **, int,
>                          class loop *);
> @@ -1422,6 +1423,47 @@ duplicate_loop_body_to_header_edge (class loop *loop, 
> edge e,
>           new_bbs[i]->aux = (void *)(size_t)(j + 1);
>         }
>  
> +      /* Assign hierarchical discriminators to distinguish loop iterations.  
> */
> +      if (flags & DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR)
> +     {
> +       /* Only handle GIMPLE mode for now.  */
> +       if (current_ir_type () == IR_GIMPLE)
> +         {
> +           unsigned int iter = j + 1;
> +
> +           for (i = 0; i < n; i++)
> +             {
> +               for (gimple_stmt_iterator gsi = gsi_start_bb (new_bbs[i]);
> +                    !gsi_end_p (gsi); gsi_next (&gsi))
> +                 {
> +                   gimple *stmt = gsi_stmt (gsi);
> +                   location_t loc = gimple_location (stmt);
> +
> +                   if (loc != UNKNOWN_LOCATION && !is_gimple_debug (stmt))
> +                     {
> +                       unsigned int base, pass1, old_pass2;
> +                       get_discriminator_components_from_loc (loc, &base,
> +                                                              &pass1,
> +                                                              &old_pass2);
> +
> +                       /* Add iteration count to existing pass2 value,
> +                          capping at 255.  */
> +                       unsigned int pass2 = old_pass2 + iter;
> +                       if (pass2 > 255)
> +                         pass2 = 255;
> +
> +                       location_t new_loc
> +                         = location_with_discriminator_components (loc,
> +                                                                   base,
> +                                                                   pass1,
> +                                                                   pass2);
> +                       gimple_set_location (stmt, new_loc);
> +                     }
> +                 }
> +             }
> +         }
> +     }
> +
>        /* Note whether the blocks and edges belong to an irreducible loop.  */
>        if (add_irreducible_flag)
>       {
> diff --git a/gcc/cfgloopmanip.h b/gcc/cfgloopmanip.h
> index 42def2fe40d..d3d1a73bdea 100644
> --- a/gcc/cfgloopmanip.h
> +++ b/gcc/cfgloopmanip.h
> @@ -34,6 +34,10 @@ enum
>                                          a complete peeling.  */
>  #define DLTHE_FLAG_FLAT_PROFILE 8    /* Profile is flat; do not reduce
>                                          count by unroll factor.  */
> +#define DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR 16 /* Assign hierarchical
> +                                                   discriminators to
> +                                                   distinguish loop
> +                                                   iterations.  */
>  extern edge mfb_kj_edge;
>  
>  extern bool remove_path (edge, bool * = NULL, bitmap = NULL);
> diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc
> index 5c9b2fb77ff..b4fb5575b67 100644
> --- a/gcc/gimple-loop-versioning.cc
> +++ b/gcc/gimple-loop-versioning.cc
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-into-ssa.h"
>  #include "gimple-range.h"
>  #include "tree-cfg.h"
> +#include "hierarchical_discriminator.h"
>  
>  namespace {
>  
> @@ -1699,6 +1700,13 @@ loop_versioning::version_loop (class loop *loop)
>        return false;
>      }
>  
> +  /* Assign hierarchical discriminators to distinguish loop versions.
> +     This allows AutoFDO to distinguish profile data from different
> +     versions.  */
> +  assign_discriminators_to_loop (li.optimized_loop,
> +                               DISCRIMINATOR_LOOP_VERSION_ALIGNED);
> +  assign_discriminators_to_loop (loop, DISCRIMINATOR_LOOP_VERSION_UNALIGNED);
> +
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
>                    "versioned this loop for when certain strides are 1\n");
> diff --git a/gcc/hierarchical_discriminator.cc 
> b/gcc/hierarchical_discriminator.cc
> new file mode 100644
> index 00000000000..ddd19718f0f
> --- /dev/null
> +++ b/gcc/hierarchical_discriminator.cc
> @@ -0,0 +1,97 @@
> +/* Copyright The GNU Toolchain Authors
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "tree.h"
> +#include "gimple.h"
> +#include "tree-pass.h"
> +#include "ssa.h"
> +#include "gimple-iterator.h"
> +#include "tree-cfg.h"
> +#include "cfgloop.h"
> +#include "hierarchical_discriminator.h"
> +#include "cfghooks.h"
> +
> +/* Assign discriminators to all statements in a basic block.  This
> +   function updates the pass1 and/or pass2 discriminator components for
> +   all statements in the given basic block, while preserving the base
> +   discriminator.  */
> +
> +void
> +assign_discriminators_to_bb (basic_block bb,
> +                           unsigned int pass1_value,
> +                           unsigned int pass2_value,
> +                           bool update_pass1,
> +                           bool update_pass2)
> +{
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple *stmt = gsi_stmt (gsi);
> +      location_t loc = gimple_location (stmt);
> +
> +      if (loc == UNKNOWN_LOCATION || is_gimple_debug (stmt))
> +     continue;
> +
> +      /* Get existing discriminator components.  */
> +      unsigned int base, pass1, pass2;
> +      get_discriminator_components_from_loc (loc, &base, &pass1, &pass2);
> +
> +      /* Update requested components.  */
> +      if (update_pass1)
> +     pass1 = pass1_value;
> +      if (update_pass2)
> +     pass2 = pass2_value;
> +
> +      /* Set new location.  */
> +      location_t new_loc = location_with_discriminator_components (loc, base,
> +                                                                pass1,
> +                                                                pass2);
> +      gimple_set_location (stmt, new_loc);
> +    }
> +}
> +
> +/* Assign pass1 discriminators to all basic blocks in a loop.  This
> +   function is used by loop versioning passes to assign a unique version
> +   ID to all statements in a loop version.  The version_id should be a
> +   unique value (1, 2, 3, ...) for each version of the loop.  */
> +
> +void
> +assign_discriminators_to_loop (class loop *loop, unsigned int version_id)
> +{
> +  basic_block *bbs;
> +  unsigned int i;
> +
> +  /* Validate version_id is in valid range for pass1 (1-4095).  */
> +  gcc_assert (version_id > 0 && version_id <= DISCR_PASS1_MAX);
> +
> +  /* Get all basic blocks in the loop.  */
> +  bbs = get_loop_body (loop);
> +
> +  /* Assign pass1 discriminator to all blocks in the loop.  */
> +  for (i = 0; i < loop->num_nodes; i++)
> +    assign_discriminators_to_bb (bbs[i], version_id, 0, true, false);
> +
> +  free (bbs);
> +}
> +
> +
> diff --git a/gcc/hierarchical_discriminator.h 
> b/gcc/hierarchical_discriminator.h
> new file mode 100644
> index 00000000000..dd3cb1b0ae7
> --- /dev/null
> +++ b/gcc/hierarchical_discriminator.h
> @@ -0,0 +1,75 @@
> +/* Copyright The GNU Toolchain Authors
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#ifndef GCC_HIERARCHICAL_DISCRIMINATOR_H
> +#define GCC_HIERARCHICAL_DISCRIMINATOR_H
> +
> +#include "gimple.h"
> +#include "tree.h"
> +#include "basic-block.h"
> +#include "input.h"
> +
> +/* Hierarchical discriminator layout (32 bits total):
> +   - Base: bits 0-11 (12 bits, 0-4095)
> +   - Pass1: bits 12-23 (12 bits, 0-4095)
> +   - Pass2: bits 24-31 (8 bits, 0-255)
> +
> +   Base discriminator: Used by front-end and early passes to distinguish
> +                    different statements on the same source line.
> +
> +   Pass1 discriminator: Used by middle-end optimizations to distinguish
> +                     different versions/contexts of the same code:
> +                     - Loop versioning (vectorized vs scalar)
> +                     - Inlining contexts (different callsites)
> +                     - Function cloning
> +
> +   Pass2 discriminator: Used by late optimizations to distinguish
> +                     iterations or variants:
> +                     - Loop unrolling iterations
> +                     - Vectorization variants
> + */
> +
> +/* Loop versioning discriminators.  */
> +#define DISCRIMINATOR_LOOP_VERSION_VECTORIZED  1  /* Vectorized version.  */
> +#define DISCRIMINATOR_LOOP_VERSION_SCALAR      2  /* Scalar version.  */
> +#define DISCRIMINATOR_LOOP_VERSION_ALIGNED     3  /* Aligned version.  */
> +#define DISCRIMINATOR_LOOP_VERSION_UNALIGNED   4  /* Unaligned version.  */
> +
> +/* Loop transformation discriminators.  */
> +#define DISCRIMINATOR_LOOP_UNROLLED         5  /* Unrolled loop.  */
> +#define DISCRIMINATOR_LOOP_PEELED           6  /* Peeled loop.  */
> +
> +/* Helper function to assign discriminators to all statements in a basic
> +   block.  This preserves the base discriminator and only updates the
> +   requested components.  */
> +extern void assign_discriminators_to_bb (basic_block bb,
> +                                       unsigned int pass1_value,
> +                                       unsigned int pass2_value,
> +                                       bool update_pass1,
> +                                       bool update_pass2);
> +
> +/* Helper function to assign pass1 discriminators to all basic blocks in
> +   a loop.  This is used by loop versioning passes to distinguish
> +   different versions of the same loop.  */
> +extern void assign_discriminators_to_loop (class loop *loop,
> +                                         unsigned int version_id);
> +
> +#endif /* GCC_HIERARCHICAL_DISCRIMINATOR_H.  */
> diff --git a/gcc/input.cc b/gcc/input.cc
> index aad98394711..7d8cd31e304 100644
> --- a/gcc/input.cc
> +++ b/gcc/input.cc
> @@ -1074,6 +1074,40 @@ get_discriminator_from_loc (location_t locus)
>    return get_discriminator_from_loc (line_table, locus);
>  }
>  
> +/* Create a location with hierarchical discriminator components.  */
> +
> +location_t
> +location_with_discriminator_components (location_t locus,
> +                                     unsigned int base,
> +                                     unsigned int pass1,
> +                                     unsigned int pass2)
> +{
> +  gcc_assert (base <= DISCR_BASE_MAX);
> +  gcc_assert (pass1 <= DISCR_PASS1_MAX);
> +  gcc_assert (pass2 <= DISCR_PASS2_MAX);
> +  unsigned int discriminator = (base << DISCR_BASE_SHIFT)
> +    | (pass1 << DISCR_PASS1_SHIFT)
> +    | (pass2 << DISCR_PASS2_SHIFT);
> +  return location_with_discriminator (locus, discriminator);
> +}
> +
> +/* Get hierarchical discriminator components from a location.  */
> +
> +void
> +get_discriminator_components_from_loc (location_t locus,
> +                                    unsigned int *base,
> +                                    unsigned int *pass1,
> +                                    unsigned int *pass2)
> +{
> +  unsigned int discriminator = get_discriminator_from_loc (locus);
> +  if (base)
> +    *base = discriminator & DISCR_BASE_MASK;
> +  if (pass1)
> +    *pass1 = (discriminator >> DISCR_PASS1_SHIFT) & DISCR_PASS1_MASK;
> +  if (pass2)
> +    *pass2 = (discriminator >> DISCR_PASS2_SHIFT) & DISCR_PASS2_MASK;
> +}
> +
>  #if CHECKING_P
>  
>  namespace selftest {
> diff --git a/gcc/input.h b/gcc/input.h
> index 4d2d7741592..af30f314be5 100644
> --- a/gcc/input.h
> +++ b/gcc/input.h
> @@ -89,6 +89,42 @@ extern location_t location_with_discriminator (location_t, 
> int);
>  extern bool has_discriminator (location_t);
>  extern int get_discriminator_from_loc (location_t);
>  
> +/* Hierarchical discriminator support for AutoFDO.
> +Layout: Base (12 bits) | Pass1 (12 bits) | Pass2 (8 bits)
> +- Base: Traditional same-line disambiguation
> +- Pass1: Optimization context (e.g., inline callsite hash)
> +- Pass2: Code duplication (e.g., loop unroll iteration)  */
> +
> +/* Discriminator bit layout constants.  */
> +#define DISCR_BASE_BITS 12
> +#define DISCR_PASS1_BITS 12
> +#define DISCR_PASS2_BITS 8
> +
> +#define DISCR_BASE_MASK ((1u << DISCR_BASE_BITS) - 1)
> +#define DISCR_PASS1_MASK ((1u << DISCR_PASS1_BITS) - 1)
> +#define DISCR_PASS2_MASK ((1u << DISCR_PASS2_BITS) - 1)
> +
> +#define DISCR_BASE_SHIFT 0
> +#define DISCR_PASS1_SHIFT DISCR_BASE_BITS
> +#define DISCR_PASS2_SHIFT (DISCR_BASE_BITS + DISCR_PASS1_BITS)
> +
> +/* Maximum values for each discriminator field.  */
> +#define DISCR_BASE_MAX DISCR_BASE_MASK
> +#define DISCR_PASS1_MAX DISCR_PASS1_MASK
> +#define DISCR_PASS2_MAX DISCR_PASS2_MASK
> +
> +/* Create location with hierarchical discriminator.  */
> +extern location_t location_with_discriminator_components (location_t,
> +                                                       unsigned int base,
> +                                                       unsigned int pass1,
> +                                                       unsigned int pass2);
> +
> +/* Get discriminator components from location.  */
> +extern void get_discriminator_components_from_loc (location_t,
> +                                                unsigned int *base,
> +                                                unsigned int *pass1,
> +                                                unsigned int *pass2);
> +
>  #define LOCATION_FILE(LOC) ((expand_location (LOC)).file)
>  #define LOCATION_LINE(LOC) ((expand_location (LOC)).line)
>  #define LOCATION_COLUMN(LOC)((expand_location (LOC)).column)
> diff --git a/gcc/testsuite/gcc.dg/hierarchical-discriminator-unroll.c 
> b/gcc/testsuite/gcc.dg/hierarchical-discriminator-unroll.c
> new file mode 100644
> index 00000000000..9690d664197
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/hierarchical-discriminator-unroll.c
> @@ -0,0 +1,35 @@
> +/* Test that loop unrolling assigns pass2 discriminators for iterations.
> +   { dg-do compile }
> +   { dg-options "-S -O2 -g  -fno-tree-vectorize" } */
> +
> +int a[100];
> +int
> +test_unroll (void)
> +{
> +  int sum = 0;
> +  int i;
> +  
> +  /* Small fixed-count loop that should be completely unrolled */
> +  #pragma GCC unroll 4
> +  for (i = 0; i < 4; i++)
> +    {
> +      /* Each unrolled iteration should get pass2=1,2,3,4 */
> +      asm ("nop");
> +      sum += a[i] * 2; 
> +    }
> +  
> +  return sum;
> +}
> +
> +/* Expected discriminators from the assembly (hierarchical format: 
> base:pass1:pass2):
> +   - discriminator 16777216 (0x1000000) = base:0, pass1:0, pass2:1 - first 
> unrolled iteration
> +   - discriminator 33554432 (0x2000000) = base:0, pass1:0, pass2:2 - second 
> unrolled iteration
> +   - discriminator 50331648 (0x3000000) = base:0, pass1:0, pass2:3 - third 
> unrolled iteration
> +   - discriminator 67108864 (0x4000000) = base:0, pass1:0, pass2:4 - fourth 
> unrolled iteration
> +   Note: pass2 values represent the iteration number in the unrolled loop
> +*/
> +
> +/* { dg-final { scan-assembler "\\.loc 1 17 7 is_stmt 0 discriminator 
> 16777216" } } */
> +/* { dg-final { scan-assembler "\\.loc 1 17 7 is_stmt 0 discriminator 
> 33554432" } } */
> +/* { dg-final { scan-assembler "\\.loc 1 17 7 is_stmt 0 discriminator 
> 50331648" } } */
> +/* { dg-final { scan-assembler "\\.loc 1 17 7 is_stmt 0 discriminator 
> 67108864" } } */
> diff --git a/gcc/testsuite/gcc.dg/hierarchical-discriminator-vect-version.c 
> b/gcc/testsuite/gcc.dg/hierarchical-discriminator-vect-version.c
> new file mode 100644
> index 00000000000..d9b5ebb584c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/hierarchical-discriminator-vect-version.c
> @@ -0,0 +1,28 @@
> +/* Test that loop versioning for vectorization assigns pass1 discriminators.
> +   { dg-do compile }
> +   { dg-options "-O3 -g -ftree-vectorize" }
> +   { dg-require-effective-target vect_int }
> +    */
> +
> +void
> +test_vectorize (int *a, int *b, int *c, int n)
> +//test_vectorize (int * __restrict__ a, int * __restrict__ b, int * 
> __restrict__ c, int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    {
> +      a[i] = b[i] + c[i];  /* This should get pass1=1 (vectorized) and 
> pass1=2 (scalar) */
> +    }
> +}
> +
> +/* Check that .loc directives with discriminators are present.
> +   Format: .loc file line column discriminator
> +   Expected discriminators from the assembly (hierarchical format: 
> base:pass1:pass2):
> +   - discriminator 4096 (0x1000) = base:0, pass1:1, pass2:0 - vectorized 
> version
> +   - discriminator 8192 (0x2000) = base:0, pass1:2, pass2:0 - scalar version
> +   - discriminator 16781312 (0x1001000) = base:0, pass1:1, pass2:1 - 
> scalar remainder first iteration
> +   - discriminator 33558528 (0x2002000) = base:0, pass1:2, pass2:2 - 
> scalar remainder second iteration
> +*/
> +
> +/* { dg-final { scan-assembler "\\.loc 1 14 15 is_stmt 0 discriminator 4096" 
> } } */
> +/* { dg-final { scan-assembler "\\.loc 1 14 19 is_stmt 0 discriminator 8192" 
> } } */
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index ca6295c7de2..fe774454bf5 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -65,6 +65,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-ssa-sccvn.h"
>  #include "tree-vectorizer.h" /* For find_loop_location */
>  #include "dbgcnt.h"
> +#include "hierarchical_discriminator.h"
>  
>  /* Specifies types of loops that may be unrolled.  */
>  
> @@ -980,7 +981,8 @@ try_unroll_loop_completely (class loop *loop,
>        if (!gimple_duplicate_loop_body_to_header_edge (
>           loop, loop_preheader_edge (loop), n_unroll, wont_exit, exit,
>           &edges_to_remove,
> -         DLTHE_FLAG_UPDATE_FREQ | DLTHE_FLAG_COMPLETTE_PEEL))
> +         DLTHE_FLAG_UPDATE_FREQ | DLTHE_FLAG_COMPLETTE_PEEL
> +         | DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR))
>       {
>            free_original_copy_tables ();
>         if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1222,7 +1224,8 @@ try_peel_loop (class loop *loop,
>  
>    if (!gimple_duplicate_loop_body_to_header_edge (
>       loop, loop_preheader_edge (loop), npeel, wont_exit, exit,
> -     &edges_to_remove, DLTHE_FLAG_UPDATE_FREQ))
> +     &edges_to_remove,
> +     DLTHE_FLAG_UPDATE_FREQ | DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR))
>      {
>        free_original_copy_tables ();
>        return false;
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 20141dbc2e5..6de9fcd5746 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -53,6 +53,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "langhooks.h"
>  #include "tree-vector-builder.h"
>  #include "optabs-tree.h"
> +#include "hierarchical_discriminator.h"
>  
>  /*************************************************************************
>    Simple Loop Peeling Utilities
> @@ -4359,6 +4360,14 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
>        gcc_assert (nloop);
>        nloop = get_loop_copy (loop);
>  
> +      /* Assign hierarchical discriminators to distinguish loop versions.
> +      This allows AutoFDO to distinguish profile data from different
> +      versions.  */
> +      assign_discriminators_to_loop (loop,
> +                                   DISCRIMINATOR_LOOP_VERSION_VECTORIZED);
> +      assign_discriminators_to_loop (nloop,
> +                                   DISCRIMINATOR_LOOP_VERSION_SCALAR);
> +
>        /* For cycle vectorization with SLP we rely on the PHI arguments
>        appearing in the same order as the SLP node operands which for the
>        loop PHI nodes means the preheader edge dest index needs to remain
> -- 
> 2.34.1
> 

Reply via email to