All,

I have run into a(nother) problem with reload with
-freorder-blocks-and-partition.

Attached is my WIP patch (some of this has been sent up to gcc-patches
for review), and also the profile information (tarred up) I have
gathered in the train session.

I get a segfault when executing crafty, which seems to come from
incorrect code generation in iterate.c.

The following shows some of the RTL dump after IRA

(insn 3087 1471 3088 163 (clobber (reg:DI 682 [ D.7985 ])) -1
     (nil))
(insn 3088 3087 3085 163 (set (subreg:SI (reg:DI 682 [ D.7985 ]) 0)
        (sign_extend:SI (mem/c:QI (reg/f:SI 1417) [0
transposition_id+0 S1 A8]))) 735 {*thumb2_extendqisi_v6}
     (expr_list:REG_DEAD (reg/f:SI 1417)
        (nil)))
...
(insn 3089 1477 1478 163 (set (subreg:SI (reg:DI 682 [ D.7985 ]) 4)
        (ashiftrt:SI (subreg:SI (reg:DI 682 [ D.7985 ]) 0)
            (const_int 31 [0x1f]))) 130 {*arm_shiftsi3}
     (nil))
...
(insn 2898 2622 2899 203 (set (reg:SI 1677)
        (mem/u/c:SI (symbol_ref/u:SI ("*.LC60") [flags 0x2]) [2 S4
A32])) 635 {*thumb2_movsi_vfp}
     (insn_list:REG_LABEL_OPERAND 1525 (expr_list:REG_EQUIV (label_ref:SI 1525)
            (nil))))
(insn 2899 2898 3491 203 (set (reg:SI 1678)
        (ior:SI (reg:SI 1677)
            (const_int 1 [0x1]))) 98 {*iorsi3_insn}
     (expr_list:REG_DEAD (reg:SI 1677)
        (nil)))
(insn 3491 2899 3492 203 (set (reg:DI 1734 [orig:682 D.7985 ] [682])
        (reg:DI 682 [ D.7985 ])) 636 {*movdi_vfp}
     (nil))
...
(jump_insn 2900 3499 2625 203 (set (pc)
        (reg:SI 1678)) 727 {*thumb2_indirect_jump}
     (expr_list:REG_DEAD (reg:SI 1678)
        (expr_list:REG_CROSSING_JUMP (nil)
            (nil))))
...

Insns 2898, 2899, and 2900 form the standard Thumb-2 indirect jump
sequence.  Insn 3491 is a move that has been generated as part of
emit_moves in IRA for the loop it belongs to (effectively copying r682
into r1734).

Now, although the compiler believes that r1678 is live throughout insn 3491,
after reload this part of the RTL dump looks like:

(insn 2898 2622 2899 212 (set (reg:SI 4 r4 [1677])
        (mem/u/c:SI (symbol_ref/u:SI ("*.LC60") [flags 0x2]) [2 S4
A32])) 635 {*thumb2_movsi_vfp}
     (insn_list:REG_LABEL_OPERAND 1525 (expr_list:REG_EQUIV (label_ref:SI 1525)
            (nil))))
(insn 2899 2898 3491 212 (set (reg:SI 4 r4 [1678])
        (ior:SI (reg:SI 4 r4 [1677])
            (const_int 1 [0x1]))) 98 {*iorsi3_insn}
     (nil))
(insn 3491 2899 3493 212 (set (reg:DI 4 r4 [orig:682 D.7985 ] [682])
        (mem/c:DI (plus:SI (reg/f:SI 13 sp)
                (const_int 24 [0x18])) [9 %sfp+-672 S8 A64])) 636 {*movdi_vfp}
     (nil))
...
(jump_insn 2900 3497 2625 212 (set (pc)
        (reg:SI 4 r4 [1678])) 727 {*thumb2_indirect_jump}
     (expr_list:REG_CROSSING_JUMP (nil)
        (nil)))

That is, all of r682, r1678, and r1734 have been assigned to hard
register r4.  This is incorrect, as insn 2900 needs to use the value of
r1678 set by insn 2899.

Looking at the logs, it seems to me that r1734 is assigned r4 because its
original is r682, and that is assigned r4.

The reload dump says the following about the liveness of the registers
for various insns:
insn=3087, live_throughout: ..., dead_or_set: 682
insn=3088, live_throughout: ..., dead_or_set: 682
insn=3089, live_throughout: ..., dead_or_set: 682
insn=2898, live_throughout: ..., 682, ..., dead_or_set: 1677
insn=2899, live_throughout: ..., 682, ..., dead_or_set: 1677, 1678
insn=3491, live_throughout: ..., 682, 1678, ..., dead_or_set: 1734
insn=2900, live_throughout: ..., 682, 1734, ..., dead_or_set: 1678

This suggests to me that the compiler should know assigning the same
hard-register to r682 and r1678 is incorrect as they have overlapping
live-ranges, and are not duplicates of each other.

The compiler is configured as follows:

Target: arm-none-linux-gnueabi
Configured with:
/work/sources/gcc-fsf-enable-hot-cold-partitioning/configure
--target=arm-none-linux-gnueabi
--prefix=/work/builds/gcc-fsf-enable-hot-cold-partitioning-arm-none-linux-gnueabi/tools
--with-sysroot=/work/builds/gcc-fsf-enable-hot-cold-partitioning-arm-none-linux-gnueabi/sysroot
--disable-libssp --disable-libgomp --disable-libmudflap
--enable-languages=c,c++,fortran --with-cpu=cortex-a9 --with-fpu=neon
--with-float=softfp --enable-build-with-cxx : (reconfigured)
/work/sources/gcc-fsf-enable-hot-cold-partitioning/configure
--target=arm-none-linux-gnueabi
--prefix=/work/builds/gcc-fsf-enable-hot-cold-partitioning-arm-none-linux-gnueabi/tools
--with-sysroot=/work/builds/gcc-fsf-enable-hot-cold-partitioning-arm-none-linux-gnueabi/sysroot
--disable-libssp --disable-libgomp --disable-libmudflap
--enable-languages=c,c++,fortran --with-cpu=cortex-a9 --with-fpu=neon
--with-float=softfp --enable-build-with-cxx
Thread model: posix
gcc version 4.8.0 20120821 (experimental) (GCC)

The gcc command line looks like:

./xgcc -B`pwd` -march=armv7-a -mtune=cortex-a9 -mthumb -mfpu=neon
-mvectorize-with-neon-quad -mfloat-abi=softfp
-fprofile-use=.../186.crafty -freorder-blocks-and-partition
-fno-common  -fdump-noaddr -O3 -dp -save-temps iterate.c -o iterate.o

Does anyone have any hints as to where I should go looking?

Thanks,

Matt

-- 
Matthew Gretton-Dann
Linaro Toolchain Working Group
matthew.gretton-d...@linaro.org
diff --git a/gcc/cfgrtl.c b/gcc/cfgrtl.c
index c62b5bc..341ea9e 100644
--- a/gcc/cfgrtl.c
+++ b/gcc/cfgrtl.c
@@ -1572,6 +1572,11 @@ rtl_tidy_fallthru_edge (edge e)
     if (INSN_P (q))
       return;
 
+  /* If the two blocks are in different partitions we do not want to mark
+     this as a fallthru edge.  */
+  if (BB_PARTITION (b) != BB_PARTITION (c))
+    return;
+
   /* Remove what will soon cease being the jump insn from the source block.
      If block B consisted only of this single jump, turn it into a deleted
      note.  */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 2805b7c..21d2213 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1989,16 +1989,6 @@ arm_option_override (void)
   else
     max_insns_skipped = current_tune->max_insns_skipped;
 
-  /* Hot/Cold partitioning is not currently supported, since we can't
-     handle literal pool placement in that case.  */
-  if (flag_reorder_blocks_and_partition)
-    {
-      inform (input_location,
-             "-freorder-blocks-and-partition not supported on this architecture");
-      flag_reorder_blocks_and_partition = 0;
-      flag_reorder_blocks = 1;
-    }
-
   if (flag_pic)
     /* Hoisting PIC address calculations more aggressively provides a small,
        but measurable, size reduction for PIC code.  Therefore, we decrease
@@ -13479,9 +13469,19 @@ arm_reorg (void)
   minipool_pad = 0;
 
   /* Scan all the insns and record the operands that will need fixing.  */
-  for (insn = next_nonnote_insn (insn); insn; insn = next_nonnote_insn (insn))
+  for (insn = next_nondebug_insn (insn); insn; insn = next_nondebug_insn (insn))
     {
-      if (GET_CODE (insn) == BARRIER)
+      if (NOTE_P (insn))
+       {
+         if (NOTE_KIND (insn) == NOTE_INSN_SWITCH_TEXT_SECTIONS)
+           {
+             /* Given we can't address a range greater than 4MB go ahead
+                and increase the address sky high to force all pools before
+                this note to be dumped.  */
+             address += 0x400000;
+           }
+       }
+      else if (GET_CODE (insn) == BARRIER)
        push_minipool_barrier (insn, address);
       else if (INSN_P (insn))
        {
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 8acde0e..9ccc283 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -1922,20 +1922,25 @@ enum arm_auto_incmodes
                                 || (TARGET_THUMB1                      \
                                     && (optimize_size || flag_pic)))
 
+/* TODO: When partitioning we cannot use tbh/tbb or other short forms as the
+   branches may be in different sections.  We could improve this by looking
+   at the labels in BODY and determining whether we cross HOT/COLD boundaries.  */
 #define CASE_VECTOR_SHORTEN_MODE(min, max, body)                       \
-  (TARGET_THUMB1                                                       \
-   ? (min >= 0 && max < 512                                            \
-      ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode)       \
-      : min >= -256 && max < 256                                       \
-      ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 0, QImode)       \
-      : min >= 0 && max < 8192                                         \
-      ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, HImode)       \
-      : min >= -4096 && max < 4096                                     \
-      ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 0, HImode)       \
-      : SImode)                                                        \
-   : ((min < 0 || max >= 0x20000 || !TARGET_THUMB2) ? SImode           \
-      : (max >= 0x200) ? HImode                                        \
-      : QImode))
+  (flag_reorder_blocks_and_partition                                   \
+   ? SImode                                                            \
+   : (TARGET_THUMB1                                                    \
+     ? (min >= 0 && max < 512                                          \
+       ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode)      \
+       : min >= -256 && max < 256                                      \
+       ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 0, QImode)      \
+       : min >= 0 && max < 8192                                        \
+       ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, HImode)      \
+       : min >= -4096 && max < 4096                                    \
+       ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 0, HImode)      \
+       : SImode)                                                       \
+     : ((min < 0 || max >= 0x20000 || !TARGET_THUMB2) ? SImode         \
+       : (max >= 0x200) ? HImode                                       \
+       : QImode)))
 
 /* signed 'char' is most compatible, but RISC OS wants it unsigned.
    unsigned is probably best, but may break some code.  */
diff --git a/gcc/postreload-gcse.c b/gcc/postreload-gcse.c
index b9e9f25..300d7df 100644
--- a/gcc/postreload-gcse.c
+++ b/gcc/postreload-gcse.c
@@ -1051,6 +1051,13 @@ eliminate_partially_redundant_load (basic_block bb, rtx insn,
          /* Adding a load on a critical edge will cause a split.  */
          if (EDGE_CRITICAL_P (pred))
            critical_edge_split = true;
+
+         /* If the destination register is used at the BB end we can not
+            insert the load.  */
+         if (reg_used_between_p (dest, PREV_INSN (BB_END (pred_bb)),
+                                 next_pred_bb_end))
+           goto cleanup;
+
          not_ok_count += pred->count;
          unoccr = (struct unoccr *) obstack_alloc (&unoccr_obstack,
                                                    sizeof (struct unoccr));

Attachment: 186.crafty.tar.xz
Description: Binary data

_______________________________________________
linaro-toolchain mailing list
linaro-toolchain@lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-toolchain

Reply via email to