https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117248

            Bug ID: 117248
           Summary: gcc/libgcc/libgcc2.h:232:25: internal compiler error:
                    Arithmetic exception
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: danglin at gcc dot gnu.org
                CC: rsandifo at gcc dot gnu.org
  Target Milestone: ---
              Host: hppa64-hp-hpux11.11
            Target: hppa64-hp-hpux11.11
             Build: hppa64-hp-hpux11.11

While working on changes to support LRA, the following bug was
introduced.  It only occurs when LRA is used.  It doesn't occur with
legacy reload.

It occurs building libgcc in stage 2.

/home/dave/gnu/gcc/objdir64/./gcc/xgcc -B/home/dave/gnu/gcc/objdir64/./gcc/ \
  -B/opt/gnu64/gcc/gcc-15/hppa64-hp-hpux11.11/bin/ \
  -B/opt/gnu64/gcc/gcc-15/hppa64-hp-hpux11.11/lib/ \
  -isystem /opt/gnu64/gcc/gcc-15/hppa64-hp-hpux11.11/include \
  -isystem /opt/gnu64/gcc/gcc-15/hppa64-hp-hpux11.11/sys-include \
  -fno-checking -O2 -g -O2 -O2 -g -DIN_GCC -W -Wall -Wno-error=narrowing \
  -Wwrite-strings -Wcast-qual -Wstrict-prototypes -Wmissing-prototypes \
  -Wold-style-definition -isystem ./include -frandom-seed=fixed-seed \
  -Dpa64=1 -DELF=1 -mlong-calls -g -DIN_LIBGCC2 -fbuilding-libgcc \
  -fno-stack-protector -frandom-seed=fixed-seed -Dpa64=1 -DELF=1 \
  -mlong-calls -I. -I. -I../.././gcc -I../../../gcc/libgcc \
  -I../../../gcc/libgcc/. -I../../../gcc/libgcc/../gcc \
  -I../../../gcc/libgcc/../include -DHAVE_CC_TLS -DUSE_EMUTLS \
  -o _umoddi3_di.o -MT _umoddi3_di.o -MD -MP -MF _umoddi3_di.dep \
  -DLIBGCC2_UNITS_PER_WORD=4 -DL_umoddi3 -c ../../../gcc/libgcc/libgcc2.c \
  -fexceptions -fnon-call-exceptions -fvisibility=hidden -DHIDE_EXPORTS
In file included from ../../../gcc/libgcc/libgcc2.c:56:
../../../gcc/libgcc/libgcc2.c: In function '__umoddi3':
../../../gcc/libgcc/libgcc2.h:232:25: internal compiler error: Arithmetic exception
  232 | #define __NDW(a,b)      __ ## a ## di ## b
      |                         ^~
../../../gcc/libgcc/libgcc2.h:290:25: note: in expansion of macro '__NDW'
  290 | #define __umoddi3       __NDW(umod,3)
      |                         ^~~~~
../../../gcc/libgcc/libgcc2.c:1286:1: note: in expansion of macro '__umoddi3'
 1286 | __umoddi3 (UDWtype u, UDWtype v)
      | ^~~~~~~~~
libbacktrace could not find executable to open
Please submit a full bug report, with preprocessed source (by using
-freport-bug).
See <https://gcc.gnu.org/bugs/> for instructions.
make[3]: *** [../../../gcc/libgcc/config/pa/t-dimode:23: _umoddi3_di.o] Error 1
make[3]: *** Waiting for unfinished jobs....
make[3]: Leaving directory '/home/dave/gnu/gcc/objdir64/hppa64-hp-hpux11.11/libgcc'
make[2]: *** [Makefile:18835: all-stage2-target-libgcc] Error 2

Program received signal SIGFPE, Arithmetic exception.
0x40000000025cede8 in $$divU () at ../../gcc/gcc/gimple-expr.h:66
66        return (type1 == type2
(gdb) info break
Num     Type           Disp Enb Address            What
1       breakpoint     keep y   <MULTIPLE>
        breakpoint already hit 27704 times
        ignore next 281 hits
1.1                         y     0x40000000014f4e24 in
vect_slp_function(function*) at ../../gcc/gcc/gimple-iterator.h:168
1.2                         y     0x800003ffbfc349a0 <$$divU>
1.3                         y     0x800003ffbfe3f9cc <$$divU>
(gdb) bt
#0  0x40000000025cede8 in $$divU () at ../../gcc/gcc/gimple-expr.h:66
#1  0x80000001004c1f58 in ?? ()
#2  0x40000000014d007c in vectorizable_slp_permutation_1 (
    vinfo=0x8000000100496ff0, gsi=0x0, node=0x80000001005aa3d8, perm=...,
    children=..., dump_p=false) at ../../gcc/gcc/tree-vect-slp.cc:10435
#3  0x40000000014e8928 in vectorizable_slp_permutation (
    vinfo=0x800003ffbf480540, gsi=0x0, node=0x80000001004886e8,
    cost_vec=0x80000001005aa400) at ../../gcc/gcc/dumpfile.h:534
#4  vect_slp_analyze_node_operations_1 (vinfo=0x800003ffbf480540,
    node=0x80000001004886e8, node_instance=0x2, cost_vec=0x80000001005aa400)
    at ../../gcc/gcc/tree-vect-slp.cc:7433
#5  vect_slp_analyze_node_operations (vinfo=0x800003ffbf480540,
    node=<optimized out>, node_instance=0x2, visited_set=...,
    visited_vec=..., cost_vec=0x80000001005aa400)
    at ../../gcc/gcc/tree-vect-slp.cc:7656
#6  0x40000000014e80c8 in vect_slp_analyze_node_operations (
    vinfo=0x800003ffbf480540, node=<optimized out>, node_instance=0x2,
    visited_set=..., visited_vec=..., cost_vec=0x80000001005aa400)
    at ../../gcc/gcc/tree-vect-slp.cc:7633
#7  0x40000000014e80c8 in vect_slp_analyze_node_operations (
    vinfo=0x800003ffbf480540, node=<optimized out>, node_instance=0x2,
    visited_set=..., visited_vec=..., cost_vec=0x80000001005aa400)
    at ../../gcc/gcc/tree-vect-slp.cc:7633
---Type <return> to continue, or q <return> to quit---
#8  0x40000000014ea594 in vect_slp_analyze_operations (vinfo=0x0)
    at ../../gcc/gcc/tree-vect-slp.cc:8051
#9  0x40000000014f26a0 in vect_slp_analyze_bb_1 (bb_vinfo=0x80000001004966f8,
    n_stmts=<optimized out>, fatal=<optimized out>,
    dataref_groups=0x800003ffbfff1960) at ../../gcc/gcc/tree-vect-slp.cc:9082
#10 vect_slp_region (bbs=..., datarefs=...,
    dataref_groups=0x800003ffbfff1960, n_stmts=<optimized out>,
    orig_loop=<optimized out>) at ../../gcc/gcc/tree-vect-slp.cc:9129
#11 vect_slp_bbs (bbs=..., orig_loop=<optimized out>)
    at ../../gcc/gcc/tree-vect-slp.cc:9352
#12 0x40000000014f4978 in vect_slp_function (fun=0x800003ffbf480540)
    at ../../gcc/gcc/tree-vect-slp.cc:9474
#13 0x40000000014fbbd8 in (anonymous namespace)::pass_slp_vectorize::execute (
    this=<optimized out>, fun=0x80000001004966f8)
    at ../../gcc/gcc/tree-vectorizer.cc:1538
#14 0x4000000000f4bc6c in execute_one_pass (pass=0x80000001004966f8)
    at ../../gcc/gcc/passes.cc:2659
#15 0x4000000000f4c9ac in execute_pass_list_1 (pass=0x80000001004966f8)
    at ../../gcc/gcc/passes.cc:2768
#16 0x4000000000f4c9c8 in execute_pass_list_1 (pass=0x80000001004966f8)
    at ../../gcc/gcc/passes.cc:2769
#17 0x4000000000f4c9c8 in execute_pass_list_1 (pass=0x80000001004966f8)
    at ../../gcc/gcc/passes.cc:2769
---Type <return> to continue, or q <return> to quit---
#18 0x4000000000f4ca40 in execute_pass_list (fn=<optimized out>,
    pass=<optimized out>) at ../../gcc/gcc/passes.cc:2779
#19 0x4000000000a21900 in expand (this=0x80000001004966f8)
    at ../../gcc/gcc/context.h:48
#20 cgraph_node::expand (this=0x80000001004966f8)
    at ../../gcc/gcc/cgraphunit.cc:1798
#21 0x4000000000a244ac in expand_all_functions ()
    at ../../gcc/gcc/cgraphunit.cc:2028
#22 symbol_table::compile (this=0x80000001005aa3d8)
    at ../../gcc/gcc/cgraphunit.cc:2404
#23 0x4000000000a27fb0 in compile (this=0x0)
    at ../../gcc/gcc/cgraphunit.cc:2589
#24 symbol_table::finalize_compilation_unit (this=0x0)
    at ../../gcc/gcc/cgraphunit.cc:2589
#25 0x40000000010d1664 in compile_file () at ../../gcc/gcc/toplev.cc:478
#26 0x80000001003e0cf8 in ?? ()
Backtrace stopped: previous frame identical to this frame (corrupt stack?)

It would seem tree-vect-slp.cc is miscompiled.

The bug was introduced by the following commit:
bash-5.1$ git bisect good
2abd04d01bc4e18158c785e75c91576b836f3ba6 is the first bad commit
commit 2abd04d01bc4e18158c785e75c91576b836f3ba6
Author: Richard Sandiford <richard.sandif...@arm.com>
Date:   Mon Oct 7 13:03:04 2024 +0100

    vect: Restructure repeating_p case for SLP permutations [PR116583]

    The repeating_p case previously handled the specific situation
    in which the inputs have N lanes and the output has N lanes,
    where N divides the number of vector elements.  In that case,
    every output uses the same permute vector.

    The code was therefore structured so that the outer loop only
    constructed one permute vector, with an inner loop generating
    as many VEC_PERM_EXPRs from it as required.

    However, the main patch for PR116583 adds support for cycling
    through N permute vectors, rather than just having one.
    The current structure doesn't really handle that case well.
    (We'd need to interleave the results after generating them,
    which sounds a bit fragile.)

    This patch instead makes the transform phase calculate each output
    vector's permutation explicitly, like for the !repeating_p path.
    As a bonus, it gets rid of one use of SLP_TREE_NUMBER_OF_VEC_STMTS.

    This arguably undermines one of the justifications for using repeating_p
    for constant-length vectors: that the repeating_p path involved less
    work than the !repeating_p path.  That justification does still hold for
    the analysis phase, though, and that should be the more time-sensitive
    part.  And the other justification -- to get more coverage of the code --
    still applies.  So I'd prefer that we continue to use repeating_p for
    constant-length vectors unless that causes a known missed optimisation.

    gcc/
            PR tree-optimization/116583
            * tree-vect-slp.cc (vectorizable_slp_permutation_1): Remove
            the noutputs_per_mask inner loop and instead generate a
            separate permute vector for each output.

 gcc/tree-vect-slp.cc | 75 ++++++++++++++++++++++++++++------------------------
 1 file changed, 41 insertions(+), 34 deletions(-)

X-Git-Url:
https://gcc.gnu.org/git/?p=gcc.git;a=blobdiff_plain;f=gcc%2Ftree-vect-slp.cc;h=1c986a65252102d732a5b96c41f169fdaffc9ea9;hp=97a471ad9108a18cd27910169025f92821728d57;hb=2abd04d01bc4e18158c785e75c91576b836f3ba6;hpb=1732298d51028ae50a802e538df5d7249556255d

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 97a471ad9108..1c986a652521 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10257,26 +10257,33 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
       return 1;
     }

-  /* REPEATING_P is true if every output vector is guaranteed to use the
-     same permute vector.  We can handle that case for both variable-length
-     and constant-length vectors, but we only handle other cases for
-     constant-length vectors.
+  /* Set REPEATING_P to true if every output uses the same permute vector
+     and if we can generate the vectors in a vector-length agnostic way.
+
+     When REPEATING_P is true, NOUTPUTS holds the total number of outputs
+     that we actually need to generate.  */
+  uint64_t noutputs = 0;
+  loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
+  if (!linfo
+      || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
+                              * SLP_TREE_LANES (node), nunits, &noutputs))
+    repeating_p = false;
+
+  /* We can handle the conditions described for REPEATING_P above for
+     both variable- and constant-length vectors.  The fallback requires
+     us to generate every element of every permute vector explicitly,
+     which is only possible for constant-length permute vectors.

      Set:

      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
-       mask vector that we want to build.
+       mask vectors that we want to build.

      - NCOPIES to the number of copies of PERM that we need in order
-       to build the necessary permute mask vectors.
-
-     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
-       for each permute mask vector.  This is only relevant when GSI is
-       nonnull.  */
+       to build the necessary permute mask vectors.  */
   uint64_t npatterns;
   unsigned nelts_per_pattern;
   uint64_t ncopies;
-  unsigned noutputs_per_mask;
   if (repeating_p)
     {
       /* We need a single permute mask vector that has the form:
@@ -10288,7 +10295,6 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
         that we use for permutes requires 3n elements.  */
       npatterns = SLP_TREE_LANES (node);
       nelts_per_pattern = ncopies = 3;
-      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
     }
   else
     {
@@ -10298,10 +10304,8 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
          || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
        return -1;
       nelts_per_pattern = ncopies = 1;
-      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
-       if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
-         return -1;
-      noutputs_per_mask = 1;
+      if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
+       return -1;
     }
   unsigned olanes = ncopies * SLP_TREE_LANES (node);
   gcc_assert (repeating_p || multiple_p (olanes, nunits));
@@ -10378,16 +10382,24 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   mask.quick_grow (count);
   vec_perm_indices indices;
   unsigned nperms = 0;
-  for (unsigned i = 0; i < vperm.length (); ++i)
-    {
-      mask_element = vperm[i].second;
+  /* When REPEATING_P is true, we only have one unique permute vector
+     to check during analysis, but we need to generate NOUTPUTS vectors
+     during transformation.  */
+  unsigned total_nelts = olanes;
+  if (repeating_p && gsi)
+    total_nelts *= noutputs;
+  for (unsigned i = 0; i < total_nelts; ++i)
+    {
+      unsigned vi = i / olanes;
+      unsigned ei = i % olanes;
+      mask_element = vperm[ei].second;
       if (first_vec.first == -1U
-         || first_vec == vperm[i].first)
-       first_vec = vperm[i].first;
+         || first_vec == vperm[ei].first)
+       first_vec = vperm[ei].first;
       else if (second_vec.first == -1U
-              || second_vec == vperm[i].first)
+              || second_vec == vperm[ei].first)
        {
-         second_vec = vperm[i].first;
+         second_vec = vperm[ei].first;
          mask_element += nunits;
        }
       else
@@ -10451,17 +10463,12 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
              if (!identity_p)
                mask_vec = vect_gen_perm_mask_checked (vectype, indices);

-             for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
-               {
-                 tree first_def
-                   = vect_get_slp_vect_def (first_node,
-                                            first_vec.second + vi);
-                 tree second_def
-                   = vect_get_slp_vect_def (second_node,
-                                            second_vec.second + vi);
-                 vect_add_slp_permutation (vinfo, gsi, node, first_def,
-                                           second_def, mask_vec, mask[0]);
-               }
+             tree first_def
+               = vect_get_slp_vect_def (first_node, first_vec.second + vi);
+             tree second_def
+               = vect_get_slp_vect_def (second_node, second_vec.second + vi);
+             vect_add_slp_permutation (vinfo, gsi, node, first_def,
+                                       second_def, mask_vec, mask[0]);
            }

Maybe there is an optimization issue with the olanes variable in the
computation of vi.  This could cause a divide-by-zero fault.

Reply via email to