On Sun, Apr 20, 2025 at 11:26 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> Don't assume that stack slots can only be accessed by stack or frame
> registers.  We first find all registers defined by stack or frame
> registers.  Then check memory accesses by such registers, including
> stack and frame registers.

I've been thinking some more about this issue. The code below searches
for registers that depend on the stack (and frame) pointer, and
then also searches for registers that depend on those
registers. I think that the second step is overkill; the core of the
problem (as shown in PR109780, comment 34 [1]) is the expansion of
__builtin_memset(), which creates a temporary that refers to the virtual
stack var.

The current DF infrastructure doesn't handle cases where a register
that refers to the stack is later killed, e.g.:

leaq    -4(%rsp), %rdx
movl   $2, (%rdx)       <- some random code that uses %rdx address correctly
...
mov $x, %rdx            <- load of "x" address kills previous temporary; "x" is aligned
vmovdqa %xmm1, (%rdx)  <- this should not increase stack alignment

and the proposed patch will increase stack alignment unnecessarily.
This issue will be even worse when registers that depend on %rdx are
also taken into account.

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109780#c34

Uros.

>
> gcc/
>
>         PR target/109780
>         PR target/109093
>         * config/i386/i386.cc (stack_access_data): New.
>         (ix86_update_stack_alignment): Likewise.
>         (ix86_find_all_reg_use_1): Likewise.
>         (ix86_find_all_reg_use): Likewise.
>         (ix86_find_max_used_stack_alignment): Also check memory accesses
>         from registers defined by stack or frame registers.
>
> gcc/testsuite/
>
>         PR target/109780
>         PR target/109093
>         * g++.target/i386/pr109780-1.C: New test.
>         * gcc.target/i386/pr109093-1.c: Likewise.
>         * gcc.target/i386/pr109780-1.c: Likewise.
>         * gcc.target/i386/pr109780-2.c: Likewise.
>         * gcc.target/i386/pr109780-3.c: Likewise.
>
> Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
> ---
>  gcc/config/i386/i386.cc                    | 174 ++++++++++++++++++---
>  gcc/testsuite/g++.target/i386/pr109780-1.C |  72 +++++++++
>  gcc/testsuite/gcc.target/i386/pr109093-1.c |  33 ++++
>  gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 ++
>  gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 +++
>  gcc/testsuite/gcc.target/i386/pr109780-3.c |  46 ++++++
>  6 files changed, 339 insertions(+), 21 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109093-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-3.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 28603c2943e..9e4e76857e6 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -8473,6 +8473,103 @@ output_probe_stack_range (rtx reg, rtx end)
>    return "";
>  }
>
> +/* Data passed to ix86_update_stack_alignment.  */
> +struct stack_access_data
> +{
> +  /* The stack access register.  */
> +  const_rtx reg;
> +  /* Pointer to stack alignment.  */
> +  unsigned int *stack_alignment;
> +};
> +
> +/* Update the maximum stack slot alignment from memory alignment in
> +   PAT.  */
> +
> +static void
> +ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
> +{
> +  /* This insn may reference stack slot.  Update the maximum stack slot
> +     alignment if the memory is referenced by the stack access register.
> +   */
> +  stack_access_data *p = (stack_access_data *) data;
> +  subrtx_iterator::array_type array;
> +  FOR_EACH_SUBRTX (iter, array, pat, ALL)
> +    {
> +      auto op = *iter;
> +      if (GET_CODE (op) == ZERO_EXTEND)
> +       op = XEXP (op, 0);
> +      if (MEM_P (op) && reg_mentioned_p (p->reg, op))
> +       {
> +         unsigned int alignment = MEM_ALIGN (op);
> +         if (alignment > *p->stack_alignment)
> +           *p->stack_alignment = alignment;
> +         break;
> +       }
> +    }
> +}
> +
> +/* Helper function for ix86_find_all_reg_use.  */
> +
> +static void
> +ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET &stack_slot_access,
> +                        auto_bitmap &worklist)
> +{
> +  rtx dest = SET_DEST (set);
> +  if (!REG_P (dest))
> +    return;
> +
> +  rtx src = SET_SRC (set);
> +
> +  if (GET_CODE (src) == ZERO_EXTEND)
> +    src = XEXP (src, 0);
> +
> +  if (MEM_P (src) || CONST_SCALAR_INT_P (src))
> +    return;
> +
> +  if (TEST_HARD_REG_BIT (stack_slot_access, REGNO (dest)))
> +    return;
> +
> +  /* Add this register to stack_slot_access.  */
> +  add_to_hard_reg_set (&stack_slot_access, Pmode, REGNO (dest));
> +  bitmap_set_bit (worklist, REGNO (dest));
> +}
> +
> +/* Find all registers defined with REG.  */
> +
> +static void
> +ix86_find_all_reg_use (HARD_REG_SET &stack_slot_access,
> +                      unsigned int reg, auto_bitmap &worklist)
> +{
> +  for (df_ref ref = DF_REG_USE_CHAIN (reg);
> +       ref != NULL;
> +       ref = DF_REF_NEXT_REG (ref))
> +    {
> +      if (DF_REF_IS_ARTIFICIAL (ref))
> +       continue;
> +
> +      rtx_insn *insn = DF_REF_INSN (ref);
> +
> +      if (!NONJUMP_INSN_P (insn))
> +       continue;
> +
> +      rtx set = single_set (insn);
> +      if (set)
> +       ix86_find_all_reg_use_1 (set, stack_slot_access, worklist);
> +
> +      rtx pat = PATTERN (insn);
> +      if (GET_CODE (pat) != PARALLEL)
> +       continue;
> +
> +      for (int i = 0; i < XVECLEN (pat, 0); i++)
> +       {
> +         rtx exp = XVECEXP (pat, 0, i);
> +
> +         if (GET_CODE (exp) == SET)
> +           ix86_find_all_reg_use_1 (exp, stack_slot_access, worklist);
> +       }
> +    }
> +}
> +
>  /* Set stack_frame_required to false if stack frame isn't required.
>     Update STACK_ALIGNMENT to the largest alignment, in bits, of stack
>     slot used if stack frame is required and CHECK_STACK_SLOT is true.  */
> @@ -8491,10 +8588,6 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
>    add_to_hard_reg_set (&set_up_by_prologue, Pmode,
>                        HARD_FRAME_POINTER_REGNUM);
>
> -  /* The preferred stack alignment is the minimum stack alignment.  */
> -  if (stack_alignment > crtl->preferred_stack_boundary)
> -    stack_alignment = crtl->preferred_stack_boundary;
> -
>    bool require_stack_frame = false;
>
>    FOR_EACH_BB_FN (bb, cfun)
> @@ -8506,27 +8599,66 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
>                                        set_up_by_prologue))
>           {
>             require_stack_frame = true;
> -
> -           if (check_stack_slot)
> -             {
> -               /* Find the maximum stack alignment.  */
> -               subrtx_iterator::array_type array;
> -               FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
> -                 if (MEM_P (*iter)
> -                     && (reg_mentioned_p (stack_pointer_rtx,
> -                                          *iter)
> -                         || reg_mentioned_p (frame_pointer_rtx,
> -                                             *iter)))
> -                   {
> -                     unsigned int alignment = MEM_ALIGN (*iter);
> -                     if (alignment > stack_alignment)
> -                       stack_alignment = alignment;
> -                   }
> -             }
> +           break;
>           }
>      }
>
>    cfun->machine->stack_frame_required = require_stack_frame;
> +
> +  /* Stop if we don't need to check stack slot.  */
> +  if (!check_stack_slot)
> +    return;
> +
> +  /* The preferred stack alignment is the minimum stack alignment.  */
> +  if (stack_alignment > crtl->preferred_stack_boundary)
> +    stack_alignment = crtl->preferred_stack_boundary;
> +
> +  HARD_REG_SET stack_slot_access;
> +  CLEAR_HARD_REG_SET (stack_slot_access);
> +
> +  /* Stack slot can be accessed by stack pointer, frame pointer or
> +     registers defined by stack pointer or frame pointer.  */
> +  auto_bitmap worklist;
> +
> +  add_to_hard_reg_set (&stack_slot_access, Pmode,
> +                      STACK_POINTER_REGNUM);
> +  bitmap_set_bit (worklist, STACK_POINTER_REGNUM);
> +
> +  if (frame_pointer_needed)
> +    {
> +      add_to_hard_reg_set (&stack_slot_access, Pmode,
> +                          HARD_FRAME_POINTER_REGNUM);
> +      bitmap_set_bit (worklist, HARD_FRAME_POINTER_REGNUM);
> +    }
> +
> +  unsigned int reg;
> +
> +  do
> +    {
> +      reg = bitmap_clear_first_set_bit (worklist);
> +      ix86_find_all_reg_use (stack_slot_access, reg, worklist);
> +    }
> +  while (!bitmap_empty_p (worklist));
> +
> +  hard_reg_set_iterator hrsi;
> +  stack_access_data data = { nullptr, &stack_alignment };
> +
> +  EXECUTE_IF_SET_IN_HARD_REG_SET (stack_slot_access, 0, reg, hrsi)
> +    for (df_ref ref = DF_REG_USE_CHAIN (reg);
> +        ref != NULL;
> +        ref = DF_REF_NEXT_REG (ref))
> +      {
> +       if (DF_REF_IS_ARTIFICIAL (ref))
> +         continue;
> +
> +       rtx_insn *insn = DF_REF_INSN (ref);
> +
> +       if (!NONJUMP_INSN_P (insn))
> +         continue;
> +
> +       data.reg = DF_REF_REG (ref);
> +       note_stores (insn, ix86_update_stack_alignment, &data);
> +      }
>  }
>
>  /* Finalize stack_realign_needed and frame_pointer_needed flags, which
> diff --git a/gcc/testsuite/g++.target/i386/pr109780-1.C 
> b/gcc/testsuite/g++.target/i386/pr109780-1.C
> new file mode 100644
> index 00000000000..7e3eabdec94
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr109780-1.C
> @@ -0,0 +1,72 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target c++17 } */
> +/* { dg-options "-O2 -mavx2 -mtune=haswell" } */
> +
> +template <typename _Tp> struct remove_reference {
> +  using type = __remove_reference(_Tp);
> +};
> +template <typename T> struct MaybeStorageBase {
> +  T val;
> +  struct Union {
> +    ~Union();
> +  } mStorage;
> +};
> +template <typename T> struct MaybeStorage : MaybeStorageBase<T> {
> +  char mIsSome;
> +};
> +template <typename T, typename U = typename remove_reference<T>::type>
> +constexpr MaybeStorage<U> Some(T &&);
> +template <typename T, typename U> constexpr MaybeStorage<U> Some(T &&aValue) {
> +  return {aValue};
> +}
> +template <class> struct Span {
> +  int operator[](long idx) {
> +    int *__trans_tmp_4;
> +    if (__builtin_expect(idx, 0))
> +      *(int *)__null = false;
> +    __trans_tmp_4 = storage_.data();
> +    return __trans_tmp_4[idx];
> +  }
> +  struct {
> +    int *data() { return data_; }
> +    int *data_;
> +  } storage_;
> +};
> +struct Variant {
> +  template <typename RefT> Variant(RefT) {}
> +};
> +long from_i, from___trans_tmp_9;
> +namespace js::intl {
> +struct DecimalNumber {
> +  Variant string_;
> +  unsigned long significandStart_;
> +  unsigned long significandEnd_;
> +  bool zero_ = false;
> +  bool negative_;
> +  template <typename CharT> DecimalNumber(CharT string) : string_(string) {}
> +  template <typename CharT>
> +  static MaybeStorage<DecimalNumber> from(Span<const CharT>);
> +  void from();
> +};
> +} // namespace js::intl
> +void js::intl::DecimalNumber::from() {
> +  Span<const char16_t> __trans_tmp_3;
> +  from(__trans_tmp_3);
> +}
> +template <typename CharT>
> +MaybeStorage<js::intl::DecimalNumber>
> +js::intl::DecimalNumber::from(Span<const CharT> chars) {
> +  DecimalNumber number(chars);
> +  if (auto ch = chars[from_i]) {
> +    from_i++;
> +    number.negative_ = ch == '-';
> +  }
> +  while (from___trans_tmp_9 && chars[from_i])
> +    ;
> +  if (chars[from_i])
> +    while (chars[from_i - 1])
> +      number.zero_ = true;
> +  return Some(number);
> +}
> +
> +/* { dg-final { scan-assembler-not "and\[lq\]?\[^\\n\]*-32,\[^\\n\]*sp" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr109093-1.c 
> b/gcc/testsuite/gcc.target/i386/pr109093-1.c
> new file mode 100644
> index 00000000000..58a7b006c8a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr109093-1.c
> @@ -0,0 +1,33 @@
> +/* { dg-do run  { target avx2_runtime } }  */
> +/* { dg-options "-O2 -mavx2 -mtune=znver1 -ftrivial-auto-var-init=zero -fno-stack-protector" } */
> +
> +int a, b, c, d;
> +char e, f = 1;
> +short g, h, i;
> +
> +__attribute__ ((weak))
> +void
> +run (void)
> +{
> +  short j;
> +
> +  for (; g >= 0; --g)
> +    {
> +      int *k[10];
> +
> +      for (d = 0; d < 10; d++)
> +       k[d] = &b;
> +
> +      c = *k[1];
> +
> +      for (; a;)
> +       j = i - c / f || (e ^= h);
> +    }
> +}
> +
> +int
> +main (void)
> +{
> +  run ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr109780-1.c 
> b/gcc/testsuite/gcc.target/i386/pr109780-1.c
> new file mode 100644
> index 00000000000..6b06947f2a5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr109780-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=skylake" } */
> +
> +char perm[64];
> +
> +void
> +__attribute__((noipa))
> +foo (int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    perm[i] = i;
> +}
> +
> +/* { dg-final { scan-assembler-not "and\[lq\]?\[^\\n\]*-32,\[^\\n\]*sp" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr109780-2.c 
> b/gcc/testsuite/gcc.target/i386/pr109780-2.c
> new file mode 100644
> index 00000000000..152da06c6ad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr109780-2.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=skylake" } */
> +
> +#define N 9
> +
> +void
> +f (double x, double y, double *res)
> +{
> +  y = -y;
> +  for (int i = 0; i < N; ++i)
> +    {
> +      double tmp = y;
> +      y = x;
> +      x = tmp;
> +      res[i] = i;
> +    }
> +  res[N] = y * y;
> +  res[N + 1] = x;
> +}
> +
> +/* { dg-final { scan-assembler-not "and\[lq\]?\[^\\n\]*-32,\[^\\n\]*sp" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr109780-3.c 
> b/gcc/testsuite/gcc.target/i386/pr109780-3.c
> new file mode 100644
> index 00000000000..a3a770a80e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr109780-3.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run  { target avx2_runtime } }  */
> +/* { dg-options "-O2 -mavx2 -mtune=znver1 -fno-stack-protector -fno-stack-clash-protection" } */
> +
> +char a;
> +static int b, c, f;
> +char *d = &a;
> +static char *e = &a;
> +
> +__attribute__ ((weak))
> +void
> +g (int h, int i)
> +{
> +  int j = 1;
> +  for (; c != -3; c = c - 1)
> +    {
> +      int k[10];
> +      f = 0;
> +      for (; f < 10; f++)
> +        k[f] = 0;
> +      *d = k[1];
> +      if (i < *d)
> +        {
> +          *e = h;
> +          for (; j < 9; j++)
> +            {
> +              b = 1;
> +              for (; b < 7; b++)
> +                ;
> +            }
> +        }
> +    }
> +}
> +
> +__attribute__ ((weak))
> +void
> +run (void)
> +{
> +  g (1, 1);
> +}
> +
> +int
> +main (void)
> +{
> +  run ();
> +  return 0;
> +}
> --
> 2.49.0
>

Reply via email to