Here's the patch for sms-and-memory-dependencies.  The idea is to bypass
the sched-deps.c {output,read,anti,true}_dependence tests altogether --
which is easy to do thanks to the note_mem_dep hook -- and instead handle
them in ddg.c.  The ddg.c tests then use RTL loop iv analysis to try
to get longer distances on the memory dependencies.  (Note that other
memory-related dependencies, such as those between volatile MEMs and
other volatile instructions, are still handled by sched-deps.c.)

Dependencies are now always created in pairs, so there's no need for
get_sched_window to set an upper bound when processing incoming MEM_DEPs,
or a lower bound when processing outgoing MEM_DEPs; we can rely on the
partnering edge to do that instead.

Richard


gcc/
        * Makefile.in (ddg.o): Depend on $(TREE_PASS_H).
        * ddg.h (REG_OR_MEM_DEP, REG_AND_MEM_DEP): Delete.
        (ddg_mem_ref): New structure.
        (ddg): Add loads and stores array.
        (create_ddg): Add a loop argument.
        (add_edges_to_ddg): Declare.
        (MAX_DDG_DISTANCE): New macro.
        * ddg.c: Include tree-pass.h.
        (mem_ref_p, mark_mem_use, mark_mem_use_1, mem_read_insn_p)
        (mark_mem_store, mem_write_insn_p, rtx_mem_access_p)
        (mem_access_insn_p): Delete.
        (create_mem_ref): New function.
        (graph_and_node): New structure.
        (record_loads, record_stores): New functions.
        (create_ddg_dep_from_intra_loop_link): Treat all dependencies
        as register dependencies.
        (walk_mems_2, walk_mems_1, insns_may_alias_p, add_intra_loop_mem_dep)
        (add_inter_loop_mem_dep): Delete.
        (build_intra_loop_deps): Ignore memory dependencies created by
        sched-deps.c.  Don't handle memory dependencies here.
        (measure_mem_distance, add_memory_dep): New functions.
        (FOR_EACH_LATER_MEM_REF): New macro.
        (build_memory_deps): New function.
        (create_ddg): Take the loop as argument.  Don't count loads and
        stores here.  Call iv_analysis_loop_init.  Pass all loads to
        record_loads and all stores to record_stores.  Move edge
        creation to...
        (add_edges_to_ddg): ...this new function.  Also call
        build_memory_deps.
        * modulo-sched.c (sat_mulpp, sat_addsp, sat_subsp): New functions.
        (schedule_reg_moves): Only handle register dependencies.
        (sms_schedule): Update call to create_ddg.  Call iv_analysis_done
        after creating all ddgs.  Only set issue_rate if there are ddgs.
        Only call setup_sched_infos and haifa_sched_init if there are ddgs.
        Call add_edges_to_ddg before processing each ddg.
        (get_sched_window): Use saturating arithmetic.  Do not add an
        implicit upper bound for incoming MEM_DEPs, or an implicit lower
        bound for outgoing MEM_DEPs.  Rework calculation of final window.
        (calculate_must_precede_follow, compute_split_row): Use saturating
        arithmetic.

Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in     2011-12-30 13:13:45.077544981 +0000
+++ gcc/Makefile.in     2011-12-30 13:24:57.330195801 +0000
@@ -3316,7 +3316,7 @@ ddg.o : ddg.c $(DDG_H) $(CONFIG_H) $(SYS
    $(DIAGNOSTIC_CORE_H) $(RTL_H) $(TM_P_H) $(REGS_H) $(FUNCTION_H) \
    $(FLAGS_H) insn-config.h $(INSN_ATTR_H) $(EXCEPT_H) $(RECOG_H) \
    $(SCHED_INT_H) $(CFGLAYOUT_H) $(CFGLOOP_H) $(EXPR_H) $(BITMAP_H) \
-   hard-reg-set.h sbitmap.h $(TM_H)
+   hard-reg-set.h sbitmap.h $(TM_H) $(TREE_PASS_H)
 modulo-sched.o : modulo-sched.c $(DDG_H) $(CONFIG_H) $(CONFIG_H) $(SYSTEM_H) \
    coretypes.h $(TARGET_H) $(DIAGNOSTIC_CORE_H) $(RTL_H) $(TM_P_H) $(REGS_H) 
$(FUNCTION_H) \
    $(FLAGS_H) insn-config.h $(INSN_ATTR_H) $(EXCEPT_H) $(RECOG_H) \
Index: gcc/ddg.h
===================================================================
--- gcc/ddg.h   2011-12-30 13:13:45.077544981 +0000
+++ gcc/ddg.h   2011-12-30 13:24:57.324195831 +0000
@@ -35,8 +35,7 @@ typedef struct ddg_scc *ddg_scc_ptr;
 typedef struct ddg_all_sccs *ddg_all_sccs_ptr;
 
 typedef enum {TRUE_DEP, OUTPUT_DEP, ANTI_DEP} dep_type;
-typedef enum {REG_OR_MEM_DEP, REG_DEP, MEM_DEP, REG_AND_MEM_DEP}
-            dep_data_type;
+typedef enum {REG_DEP, MEM_DEP} dep_data_type;
 
 /* The following two macros enables direct access to the successors and
    predecessors bitmaps held in each ddg_node.  Do not make changes to
@@ -44,6 +43,28 @@ typedef enum {REG_OR_MEM_DEP, REG_DEP, M
 #define NODE_SUCCESSORS(x)  ((x)->successors)
 #define NODE_PREDECESSORS(x)  ((x)->predecessors)
 
+/* A structure that represents a memory read or write in the DDG;
+   context decides which.  */
+struct ddg_mem_ref {
+  /* The previous reference of the same type (read or write) in the DDG.  */
+  struct ddg_mem_ref *prev;
+
+  /* The DDG node that contains the memory reference.  */
+  ddg_node_ptr node;
+
+  /* The memory reference itself.  */
+  rtx mem;
+
+  /* If the address is a known induction variable, its value in iteration
+     I is given by:
+
+         BASE + OFFSET + I * STEP
+
+     In other cases BASE is null.  */
+  rtx base;
+  HOST_WIDE_INT offset, step;
+};
+
 /* A structure that represents a node in the DDG.  */
 struct ddg_node
 {
@@ -117,6 +138,11 @@ struct ddg
   /* Number of instructions in the basic block.  */
   int num_nodes;
 
+  /* The loads and stores in the BB, from the end of the block to
+     the beginning.  */
+  struct ddg_mem_ref *loads;
+  struct ddg_mem_ref *stores;
+
   /* Number of load/store instructions in the BB - statistics.  */
   int num_loads;
   int num_stores;
@@ -167,7 +193,9 @@ struct ddg_all_sccs
 };
 
 
-ddg_ptr create_ddg (basic_block, int closing_branch_deps);
+struct loop;
+ddg_ptr create_ddg (struct loop *, basic_block, int closing_branch_deps);
+void add_edges_to_ddg (ddg_ptr);
 void free_ddg (ddg_ptr);
 
 void print_ddg (FILE *, ddg_ptr);
@@ -188,4 +216,7 @@ int longest_simple_path (ddg_ptr, int fr
 
 bool autoinc_var_is_used_p (rtx, rtx);
 
+/* The maximum allowable distance on a DDG edge.  */
+#define MAX_DDG_DISTANCE INT_MAX
+
 #endif /* GCC_DDG_H */
Index: gcc/ddg.c
===================================================================
--- gcc/ddg.c   2011-12-30 13:13:45.077544981 +0000
+++ gcc/ddg.c   2011-12-30 13:36:35.005498271 +0000
@@ -43,6 +43,7 @@ Software Foundation; either version 3, o
 #include "expr.h"
 #include "bitmap.h"
 #include "ddg.h"
+#include "tree-pass.h"
 
 #ifdef INSN_SCHEDULING
 
@@ -61,88 +62,102 @@ static ddg_edge_ptr create_ddg_edge (ddg
                                     dep_data_type, int, int);
 static void add_edge_to_ddg (ddg_ptr g, ddg_edge_ptr);
 
-/* Auxiliary variable for mem_read_insn_p/mem_write_insn_p.  */
-static bool mem_ref_p;
+/* Create a memory reference record for MEM, which occurs in NODE.
+   PREV is the previous reference of the same type.  */
+static struct ddg_mem_ref *
+create_mem_ref (struct ddg_mem_ref *prev, ddg_node_ptr node, rtx mem)
+{
+  struct ddg_mem_ref *entry;
+  enum machine_mode pmode;
+  struct rtx_iv iv;
+  rtx x;
+
+  entry = XCNEW (struct ddg_mem_ref);
+  entry->prev = prev;
+  entry->node = node;
+  entry->mem = mem;
+
+  pmode = targetm.addr_space.address_mode (MEM_ADDR_SPACE (mem));
+  if (iv_analyze_expr (node->insn, XEXP (mem, 0), pmode, &iv)
+      && iv.extend == UNKNOWN
+      && CONST_INT_P (iv.step))
+    {
+      x = iv.base;
+      if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
+       {
+         entry->base = XEXP (x, 0);
+         entry->offset = INTVAL (XEXP (x, 1));
+       }
+      else
+       {
+         entry->base = x;
+         entry->offset = 0;
+       }
+      entry->step = INTVAL (iv.step);
+    }
 
-/* Auxiliary function for mem_read_insn_p.  */
-static int
-mark_mem_use (rtx *x, void *data ATTRIBUTE_UNUSED)
-{
-  if (MEM_P (*x))
-    mem_ref_p = true;
-  return 0;
+  if (dump_file)
+    {
+      fprintf (dump_file, "Found memory reference in insn %d:\n",
+              INSN_UID (node->insn));
+      print_rtl (dump_file, mem);
+      if (entry->base)
+       {
+         fprintf (dump_file, "\nwith base:");
+         print_rtl (dump_file, entry->base);
+         fprintf (dump_file, "\noffset " HOST_WIDE_INT_PRINT_DEC
+                  " and step " HOST_WIDE_INT_PRINT_DEC "\n\n",
+                  entry->offset, entry->step);
+       }
+      else
+       fprintf (dump_file, "\nwhich isn't a recognized iv\n\n");
+    }
+  return entry;
 }
 
-/* Auxiliary function for mem_read_insn_p.  */
-static void
-mark_mem_use_1 (rtx *x, void *data)
-{
-  for_each_rtx (x, mark_mem_use, data);
-}
+/* A structure for pairing a node and the graph to which it belongs.  */
+struct graph_and_node {
+  ddg_ptr g;
+  ddg_node_ptr node;
+};
 
-/* Returns nonzero if INSN reads from memory.  */
-static bool
-mem_read_insn_p (rtx insn)
+/* A for_each_rtx callback.  Record all loads in an instruction.
+   DATA points to a graph_and_node.  */
+static int
+record_loads_1 (rtx *loc, void *data)
 {
-  mem_ref_p = false;
-  note_uses (&PATTERN (insn), mark_mem_use_1, NULL);
-  return mem_ref_p;
-}
+  struct graph_and_node *gn;
 
-static void
-mark_mem_store (rtx loc, const_rtx setter ATTRIBUTE_UNUSED, void *data 
ATTRIBUTE_UNUSED)
-{
-  if (MEM_P (loc))
-    mem_ref_p = true;
+  if (MEM_P (*loc))
+    {
+      gn = (struct graph_and_node *) data;
+      gn->g->loads = create_mem_ref (gn->g->loads, gn->node, *loc);
+      gn->g->num_loads++;
+    }
+  return 0;
 }
 
-/* Returns nonzero if INSN writes to memory.  */
-static bool
-mem_write_insn_p (rtx insn)
+/* A note_uses callback.  Record all loads in an instruction.
+   DATA points to a graph_and_node.  */
+static void
+record_loads (rtx *loc, void *data)
 {
-  mem_ref_p = false;
-  note_stores (PATTERN (insn), mark_mem_store, NULL);
-  return mem_ref_p;
+  for_each_rtx (loc, record_loads_1, data);
 }
 
-/* Returns nonzero if X has access to memory.  */
-static bool
-rtx_mem_access_p (rtx x)
+/* A note_stores callback.  Record all stores in an instruction.
+   DATA points to a graph_and_node.  */
+static void
+record_stores (rtx x, const_rtx setter ATTRIBUTE_UNUSED, void *data)
 {
-  int i, j;
-  const char *fmt;
-  enum rtx_code code;
-
-  if (x == 0)
-    return false;
+  struct graph_and_node *gn;
 
   if (MEM_P (x))
-    return true;
-
-  code = GET_CODE (x);
-  fmt = GET_RTX_FORMAT (code);
-  for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
     {
-      if (fmt[i] == 'e')
-       {
-         if (rtx_mem_access_p (XEXP (x, i)))
-            return true;
-        }
-      else if (fmt[i] == 'E')
-       for (j = 0; j < XVECLEN (x, i); j++)
-         {
-           if (rtx_mem_access_p (XVECEXP (x, i, j)))
-              return true;
-          }
+      gn = (struct graph_and_node *) data;
+      gn->g->stores = create_mem_ref (gn->g->stores, gn->node, x);
+      gn->g->num_stores++;
     }
-  return false;
-}
-
-/* Returns nonzero if INSN reads to or writes from memory.  */
-static bool
-mem_access_insn_p (rtx insn)
-{
-  return rtx_mem_access_p (PATTERN (insn));
 }
 
 /* Return true if DEF_INSN contains address being auto-inc or auto-dec
@@ -175,9 +190,6 @@ create_ddg_dep_from_intra_loop_link (ddg
   ddg_edge_ptr e;
   int latency, distance = 0;
   dep_type t = TRUE_DEP;
-  dep_data_type dt = (mem_access_insn_p (src_node->insn)
-                     && mem_access_insn_p (dest_node->insn) ? MEM_DEP
-                                                            : REG_DEP);
   gcc_assert (src_node->cuid < dest_node->cuid);
   gcc_assert (link);
 
@@ -201,7 +213,7 @@ create_ddg_dep_from_intra_loop_link (ddg
      TODO: support the removal of all anti-deps edges, i.e. including those
      whose register has multiple defs in the loop.  */
   if (flag_modulo_sched_allow_regmoves 
-      && (t == ANTI_DEP && dt == REG_DEP)
+      && t == ANTI_DEP
       && !autoinc_var_is_used_p (dest_node->insn, src_node->insn))
     {
       rtx set;
@@ -224,7 +236,7 @@ create_ddg_dep_from_intra_loop_link (ddg
     }
 
    latency = dep_cost (link);
-   e = create_ddg_edge (src_node, dest_node, t, dt, latency, distance);
+   e = create_ddg_edge (src_node, dest_node, t, REG_DEP, latency, distance);
    add_edge_to_ddg (g, e);
 }
 
@@ -380,107 +392,6 @@ build_inter_loop_deps (ddg_ptr g)
   }
 }
 
-
-static int
-walk_mems_2 (rtx *x, rtx mem)
-{
-  if (MEM_P (*x))
-    {
-      if (may_alias_p (*x, mem))
-        return 1;
-
-      return -1;
-    }
-  return 0;
-}
-
-static int
-walk_mems_1 (rtx *x, rtx *pat)
-{
-  if (MEM_P (*x))
-    {
-      /* Visit all MEMs in *PAT and check indepedence.  */
-      if (for_each_rtx (pat, (rtx_function) walk_mems_2, *x))
-        /* Indicate that dependence was determined and stop traversal.  */
-        return 1;
-
-      return -1;
-    }
-  return 0;
-}
-
-/* Return 1 if two specified instructions have mem expr with conflict alias 
sets*/
-static int
-insns_may_alias_p (rtx insn1, rtx insn2)
-{
-  /* For each pair of MEMs in INSN1 and INSN2 check their independence.  */
-  return  for_each_rtx (&PATTERN (insn1), (rtx_function) walk_mems_1,
-                        &PATTERN (insn2));
-}
-
-/* Given two nodes, analyze their RTL insns and add intra-loop mem deps
-   to ddg G.  */
-static void
-add_intra_loop_mem_dep (ddg_ptr g, ddg_node_ptr from, ddg_node_ptr to)
-{
-
-  if ((from->cuid == to->cuid)
-      || !insns_may_alias_p (from->insn, to->insn))
-    /* Do not create edge if memory references have disjoint alias sets
-       or 'to' and 'from' are the same instruction.  */
-    return;
-
-  if (mem_write_insn_p (from->insn))
-    {
-      if (mem_read_insn_p (to->insn))
-       create_ddg_dep_no_link (g, from, to,
-                               DEBUG_INSN_P (to->insn)
-                               ? ANTI_DEP : TRUE_DEP, MEM_DEP, 0);
-      else
-       create_ddg_dep_no_link (g, from, to,
-                               DEBUG_INSN_P (to->insn)
-                               ? ANTI_DEP : OUTPUT_DEP, MEM_DEP, 0);
-    }
-  else if (!mem_read_insn_p (to->insn))
-    create_ddg_dep_no_link (g, from, to, ANTI_DEP, MEM_DEP, 0);
-}
-
-/* Given two nodes, analyze their RTL insns and add inter-loop mem deps
-   to ddg G.  */
-static void
-add_inter_loop_mem_dep (ddg_ptr g, ddg_node_ptr from, ddg_node_ptr to)
-{
-  if (!insns_may_alias_p (from->insn, to->insn))
-    /* Do not create edge if memory references have disjoint alias sets.  */
-    return;
-
-  if (mem_write_insn_p (from->insn))
-    {
-      if (mem_read_insn_p (to->insn))
-       create_ddg_dep_no_link (g, from, to,
-                               DEBUG_INSN_P (to->insn)
-                               ? ANTI_DEP : TRUE_DEP, MEM_DEP, 1);
-      else if (from->cuid != to->cuid)
-       create_ddg_dep_no_link (g, from, to,
-                               DEBUG_INSN_P (to->insn)
-                               ? ANTI_DEP : OUTPUT_DEP, MEM_DEP, 1);
-    }
-  else
-    {
-      if (mem_read_insn_p (to->insn))
-       return;
-      else if (from->cuid != to->cuid)
-       {
-         create_ddg_dep_no_link (g, from, to, ANTI_DEP, MEM_DEP, 1);
-         if (DEBUG_INSN_P (from->insn) || DEBUG_INSN_P (to->insn))
-           create_ddg_dep_no_link (g, to, from, ANTI_DEP, MEM_DEP, 1);
-         else
-           create_ddg_dep_no_link (g, to, from, TRUE_DEP, MEM_DEP, 1);
-       }
-    }
-
-}
-
 /* Perform intra-block Data Dependency analysis and connect the nodes in
    the DDG.  We assume the loop has a single basic block.  */
 static void
@@ -493,6 +404,9 @@ build_intra_loop_deps (ddg_ptr g)
 
   /* Build the dependence information, using the sched_analyze function.  */
   init_deps_global ();
+  /* Ignore the usual dependencies between two MEM rtxes.  We still rely
+     on sched_analyze to handle memory barriers and the like.  */
+  sched_deps_info->note_mem_dep = 0;
   init_deps (&tmp_deps, false);
 
   /* Do the intra-block data dependence analysis for the given block.  */
@@ -519,37 +433,6 @@ build_intra_loop_deps (ddg_ptr g)
 
          create_ddg_dep_from_intra_loop_link (g, src_node, dest_node, dep);
        }
-
-      /* If this insn modifies memory, add an edge to all insns that access
-        memory.  */
-      if (mem_access_insn_p (dest_node->insn))
-       {
-         int j;
-
-         for (j = 0; j <= i; j++)
-           {
-             ddg_node_ptr j_node = &g->nodes[j];
-             if (DEBUG_INSN_P (j_node->insn))
-               continue;
-             if (mem_access_insn_p (j_node->insn))
-               {
-                 /* Don't bother calculating inter-loop dep if an intra-loop 
dep
-                    already exists.  */
-                 if (! TEST_BIT (dest_node->successors, j))
-                   add_inter_loop_mem_dep (g, dest_node, j_node);
-                 /* If -fmodulo-sched-allow-regmoves
-                    is set certain anti-dep edges are not created.
-                    It might be that these anti-dep edges are on the
-                    path from one memory instruction to another such that
-                    removing these edges could cause a violation of the
-                    memory dependencies.  Thus we add intra edges between
-                    every two memory instructions in this case.  */
-                 if (flag_modulo_sched_allow_regmoves
-                     && !TEST_BIT (dest_node->predecessors, j))
-                   add_intra_loop_mem_dep (g, j_node, dest_node);
-               }
-            }
-        }
     }
 
   /* Free the INSN_LISTs.  */
@@ -560,13 +443,187 @@ build_intra_loop_deps (ddg_ptr g)
   sched_free_deps (head, tail, false);
 }
 
+/* Given a "source" memory reference from iteration 0 and a "target"
+   memory reference from iteration BASE_DISTANCE, return the first
+   N >= BASE_DISTANCE such that the source reference in iteration 0
+   overlaps the target reference in iteration N.
+
+   FROM_OFFSET is the offset of the source reference from an unspecified
+   base, while TO_OFFSET is the offset of the target reference from that
+   same base.  FROM_SIZE and TO_SIZE are the sizes of the two references
+   in bytes.
+
+   PMODE is the mode of both addresses, and STEP is the amount that
+   will be added to each address by one loop iteration.  */
+static int
+measure_mem_distance (enum machine_mode pmode,
+                     unsigned HOST_WIDE_INT from_offset,
+                     unsigned HOST_WIDE_INT from_size,
+                     unsigned HOST_WIDE_INT to_offset,
+                     unsigned HOST_WIDE_INT to_size,
+                     unsigned HOST_WIDE_INT base_distance,
+                     HOST_WIDE_INT step)
+{
+  unsigned HOST_WIDE_INT extra, from2to, to2from;
+
+  from2to = (to_offset - from_offset) & GET_MODE_MASK (pmode);
+  to2from = (from_offset - to_offset) & GET_MODE_MASK (pmode);
+  if (from2to < from_size || to2from < to_size)
+    /* The source reference in iteration 0 overlaps the target reference
+       in iteration BASE_DISTANCE.  The check is written this way to cope
+       with cases where offset + size overflows.  */
+    return base_distance;
+
+  /* N > BASE_DISTANCE.  To cope more easily with cases where the round-up
+     divisions:
+
+         (to2from - (to_size - 1) + (step - 1)) / step
+         (from2to - (from_size - 1) + (step - 1)) / step
+
+     would overflow, bump BASE_DISTANCE and subtract STEP from each
+     dividend to compensate.  */
+  base_distance++;
+  if (step > 0)
+    extra = (to2from - to_size) / (unsigned HOST_WIDE_INT) step;
+  else
+    extra = (from2to - from_size) / (unsigned HOST_WIDE_INT) -step;
+  if (extra > MAX_DDG_DISTANCE || base_distance + extra > MAX_DDG_DISTANCE)
+    return MAX_DDG_DISTANCE;
+  return base_distance + extra;
+}
+
+/* If FROM and TO might alias, record memory dependencies:
+
+         FROM--(FORWARD_TYPE)-->TO
+     and TO--(BACKWARD_TYPE)-->FROM
+
+   FROM comes before TO in the original loop, and both belong to G.
+   FORWARD_DISTANCE is the minimum distance of the FROM--->TO dependence.  */
+static void
+add_memory_dep (ddg_ptr g, struct ddg_mem_ref *from,
+               struct ddg_mem_ref *to, dep_type forward_type,
+               dep_type backward_type, int forward_distance)
+{
+  HOST_WIDE_INT step;
+  unsigned HOST_WIDE_INT from_size, to_size, to_disp, abs_step, future_offset;
+  enum machine_mode pmode;
+  int backward_distance;
+
+  gcc_checking_assert (from->node->cuid < to->node->cuid);
+
+  if (!may_alias_p (from->mem, to->mem))
+    return;
+
+  /* In the worst case, the TO---->FROM edge has a distance of 1.  */
+  backward_distance = 1;
+
+  /* See if we can get more accurate distances.  Look for cases where
+     the addresses of FROM and TO are ivs with the same base and step.  */
+  if (from->base
+      && to->base
+      && from->step
+      && from->step == to->step
+      && !MEM_VOLATILE_P (from->mem)
+      && !MEM_VOLATILE_P (to->mem)
+      && MEM_SIZE_KNOWN_P (from->mem)
+      && MEM_SIZE_KNOWN_P (to->mem)
+      && MEM_ADDR_SPACE (from->mem) == MEM_ADDR_SPACE (to->mem)
+      && rtx_equal_p (from->base, to->base))
+    {
+      step = to->step;
+      abs_step = (step < 0 ? -step : step);
+      from_size = MEM_SIZE (from->mem);
+      to_size = MEM_SIZE (to->mem);
+
+      /* If the step is a power of two, or the negative of a power of two,
+        see whether we can prove that the references never overlap.  */
+      if (abs_step == (abs_step & -abs_step))
+       {
+         to_disp = (to->offset - from->offset) % abs_step;
+         if (from_size <= to_disp && to_disp + to_size <= abs_step)
+           return;
+       }
+
+      pmode = targetm.addr_space.address_mode (MEM_ADDR_SPACE (to->mem));
+      future_offset = to->offset + forward_distance * step;
+      forward_distance = measure_mem_distance (pmode,
+                                              from->offset, from_size,
+                                              future_offset, to_size,
+                                              forward_distance, step);
+      future_offset = from->offset + backward_distance * step;
+      backward_distance = measure_mem_distance (pmode,
+                                               to->offset, to_size,
+                                               future_offset, from_size,
+                                               backward_distance, step);
+    }
+
+  if (DEBUG_INSN_P (from->node->insn) || DEBUG_INSN_P (to->node->insn))
+    {
+      forward_type = ANTI_DEP;
+      backward_type = ANTI_DEP;
+    }
+  create_ddg_dep_no_link (g, from->node, to->node, forward_type, MEM_DEP,
+                         forward_distance);
+  create_ddg_dep_no_link (g, to->node, from->node, backward_type, MEM_DEP,
+                         backward_distance);
+}
+
+/* Make REF2 iterate over all entries in ddg_mem_ref list LIST
+   that come later than ddg_mem_ref REF1.  */
+#define FOR_EACH_LATER_MEM_REF(REF2, REF1, LIST) \
+  for (REF2 = (LIST); \
+       REF2 && REF2->node->cuid > REF1->node->cuid; \
+       REF2 = REF2->prev)
+
+/* Check for dependencies between pairs of memory rtxes.  */
+static void
+build_memory_deps (ddg_ptr g)
+{
+  struct ddg_mem_ref *ref1, *ref2;
+  int distance;
+
+  for (ref1 = g->loads; ref1; ref1 = ref1->prev)
+    {
+      /* LOAD--->LOAD.  */
+      if (MEM_VOLATILE_P (ref1->mem))
+       FOR_EACH_LATER_MEM_REF (ref2, ref1, g->loads)
+         if (MEM_VOLATILE_P (ref2->mem))
+           add_memory_dep (g, ref1, ref2, ANTI_DEP, ANTI_DEP, 0);
+
+      /* LOAD--->STORE.  */
+      FOR_EACH_LATER_MEM_REF (ref2, ref1, g->stores)
+       {
+         distance = anti_dependence (ref1->mem, ref2->mem) ? 0 : 1;
+         add_memory_dep (g, ref1, ref2, ANTI_DEP, TRUE_DEP, distance);
+       }
+    }
+
+  for (ref1 = g->stores; ref1; ref1 = ref1->prev)
+    {
+      /* STORE--->LOAD.  */
+      FOR_EACH_LATER_MEM_REF (ref2, ref1, g->loads)
+       {
+         distance = true_dependence (ref1->mem, VOIDmode,
+                                     ref2->mem, rtx_varies_p) ? 0 : 1;
+         add_memory_dep (g, ref1, ref2, TRUE_DEP, ANTI_DEP, distance);
+       }
+
+      /* STORE--->STORE.  */
+      FOR_EACH_LATER_MEM_REF (ref2, ref1, g->stores)
+       {
+         distance = output_dependence (ref1->mem, ref2->mem) ? 0 : 1;
+         add_memory_dep (g, ref1, ref2, OUTPUT_DEP, OUTPUT_DEP, distance);
+       }
+    }
+}
 
-/* Given a basic block, create its DDG and return a pointer to a variable
-   of ddg type that represents it.
+/* Given a basic block, create the nodes of its DDG and return a pointer
+   to a variable of ddg type that represents it.
    Initialize the ddg structure fields to the appropriate values.  */
 ddg_ptr
-create_ddg (basic_block bb, int closing_branch_deps)
+create_ddg (struct loop *loop, basic_block bb, int closing_branch_deps)
 {
+  struct graph_and_node gn;
   ddg_ptr g;
   rtx insn, first_note;
   int i;
@@ -586,13 +643,6 @@ create_ddg (basic_block bb, int closing_
 
       if (DEBUG_INSN_P (insn))
        g->num_debug++;
-      else
-       {
-         if (mem_read_insn_p (insn))
-           g->num_loads++;
-         if (mem_write_insn_p (insn))
-           g->num_stores++;
-       }
       num_nodes++;
     }
 
@@ -603,6 +653,8 @@ create_ddg (basic_block bb, int closing_
       return NULL;
     }
 
+  iv_analysis_loop_init (loop);
+
   /* Allocate the nodes array, and initialize the nodes.  */
   g->num_nodes = num_nodes;
   g->nodes = (ddg_node_ptr) xcalloc (num_nodes, sizeof (struct ddg_node));
@@ -637,18 +689,31 @@ create_ddg (basic_block bb, int closing_
       g->nodes[i].predecessors = sbitmap_alloc (num_nodes);
       sbitmap_zero (g->nodes[i].predecessors);
       g->nodes[i].first_note = (first_note ? first_note : insn);
-      g->nodes[i++].insn = insn;
+      g->nodes[i].insn = insn;
       first_note = NULL_RTX;
+
+      gn.g = g;
+      gn.node = &g->nodes[i];
+      note_uses (&PATTERN (insn), record_loads, &gn);
+      note_stores (PATTERN (insn), record_stores, &gn);
+
+      i++;
     }
 
   /* We must have found a branch in DDG.  */
   gcc_assert (g->closing_branch);
+  return g;
+}
 
+/* Add the edges to a DDG that was previously created by create_ddg.
+   This function relies on scheduler dependencies.  */
 
-  /* Build the data dependency graph.  */
+void
+add_edges_to_ddg (ddg_ptr g)
+{
   build_intra_loop_deps (g);
   build_inter_loop_deps (g);
-  return g;
+  build_memory_deps (g);
 }
 
 /* Free all the memory allocated for the DDG.  */
Index: gcc/modulo-sched.c
===================================================================
--- gcc/modulo-sched.c  2011-12-30 13:13:45.077544981 +0000
+++ gcc/modulo-sched.c  2011-12-30 13:24:57.327195816 +0000
@@ -345,6 +345,38 @@ ps_num_consecutive_stages (partial_sched
     return ps_reg_move (ps, id)->num_consecutive_stages;
 }
 
+/* Perform a saturating multiplication of nonnegative values A and B.  */
+
+static inline int
+sat_mulpp (unsigned int a, unsigned int b)
+{
+  if ((unsigned int) INT_MAX / b <= a)
+    return INT_MAX;
+  else
+    return a * b;
+}
+
+/* Perform a saturating addition of signed value A and nonnegative value B.  */
+
+static inline int
+sat_addsp (int a, int b)
+{
+  if (INT_MAX - b <= a)
+    return INT_MAX;
+  return a + b;
+}
+
+/* Perform a saturating subtraction of signed value A and nonnegative
+   value B.  */
+
+static inline int
+sat_subsp (int a, int b)
+{
+  if (INT_MIN + b >= a)
+    return INT_MIN;
+  return a - b;
+}
+
 /* Given HEAD and TAIL which are the first and last insns in a loop;
    return the register which controls the loop.  Return zero if it has
    more than one occurrence in the loop besides the control part or the
@@ -709,7 +741,9 @@ schedule_reg_moves (partial_schedule_ptr
         ranges started at u (excluding self-loops).  */
       distances[0] = distances[1] = false;
       for (e = u->out; e; e = e->next_out)
-       if (e->type == TRUE_DEP && e->dest != e->src)
+       if (e->data_type == REG_DEP
+           && e->type == TRUE_DEP
+           && e->dest != e->src)
          {
            int nreg_moves4e = (SCHED_TIME (e->dest->cuid)
                                - SCHED_TIME (e->src->cuid)) / ii;
@@ -781,7 +815,9 @@ schedule_reg_moves (partial_schedule_ptr
         copy of this register, depending on the time the use is scheduled.
         Record which uses require which move results.  */
       for (e = u->out; e; e = e->next_out)
-       if (e->type == TRUE_DEP && e->dest != e->src)
+       if (e->data_type == REG_DEP
+           && e->type == TRUE_DEP
+           && e->dest != e->src)
          {
            int dest_copy = (SCHED_TIME (e->dest->cuid)
                             - SCHED_TIME (e->src->cuid)) / ii;
@@ -1355,6 +1391,7 @@ sms_schedule (void)
   basic_block condition_bb = NULL;
   edge latch_edge;
   gcov_type trip_count = 0;
+  int num_ddgs;
 
   loop_optimizer_init (LOOPS_HAVE_PREHEADERS
                       | LOOPS_HAVE_RECORDED_EXITS);
@@ -1364,34 +1401,19 @@ sms_schedule (void)
       return;  /* There are no loops to schedule.  */
     }
 
-  /* Initialize issue_rate.  */
-  if (targetm.sched.issue_rate)
-    {
-      int temp = reload_completed;
-
-      reload_completed = 1;
-      issue_rate = targetm.sched.issue_rate ();
-      reload_completed = temp;
-    }
-  else
-    issue_rate = 1;
-
-  /* Initialize the scheduler.  */
-  setup_sched_infos ();
-  haifa_sched_init ();
-
   /* Allocate memory to hold the DDG array one entry for each loop.
      We use loop->num as index into this array.  */
   g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
 
   if (dump_file)
-  {
-    fprintf (dump_file, "\n\nSMS analysis phase\n");
-    fprintf (dump_file, "===================\n\n");
-  }
+    {
+      fprintf (dump_file, "\n\nSMS loop discovery phase\n");
+      fprintf (dump_file, "========================\n\n");
+    }
 
   /* Build DDGs for all the relevant loops and hold them in G_ARR
      indexed by the loop index.  */
+  num_ddgs = 0;
   FOR_EACH_LOOP (li, loop, 0)
     {
       rtx head, tail;
@@ -1512,7 +1534,7 @@ sms_schedule (void)
          instructions. The branch is rotated to be in row ii-1 at the
          end of the scheduling procedure to make sure it's the last
          instruction in the iteration.  */
-      if (! (g = create_ddg (bb, 1)))
+      if (! (g = create_ddg (loop, bb, 1)))
         {
           if (dump_file)
            fprintf (dump_file, "SMS create_ddg failed\n");
@@ -1523,12 +1545,38 @@ sms_schedule (void)
       if (dump_file)
         fprintf (dump_file, "...OK\n");
 
+      num_ddgs++;
+    }
+  iv_analysis_done ();
+
+  if (num_ddgs == 0)
+    {
+      if (dump_file)
+       fprintf (dump_file, "No suitable loops\n");
+      goto done;
     }
+
+  /* Initialize issue_rate.  */
+  if (targetm.sched.issue_rate)
+    {
+      int temp = reload_completed;
+
+      reload_completed = 1;
+      issue_rate = targetm.sched.issue_rate ();
+      reload_completed = temp;
+    }
+  else
+    issue_rate = 1;
+
+  /* Initialize the scheduler.  */
+  setup_sched_infos ();
+  haifa_sched_init ();
+
   if (dump_file)
-  {
-    fprintf (dump_file, "\nSMS transformation phase\n");
-    fprintf (dump_file, "=========================\n\n");
-  }
+    {
+      fprintf (dump_file, "\nSMS transformation phase\n");
+      fprintf (dump_file, "=========================\n\n");
+    }
 
   /* We don't want to perform SMS on new loops - created by versioning.  */
   FOR_EACH_LOOP (li, loop, 0)
@@ -1542,6 +1590,8 @@ sms_schedule (void)
       if (! (g = g_arr[loop->num]))
         continue;
 
+      add_edges_to_ddg (g);
+
       if (dump_file)
       {
          rtx insn = BB_END (loop->header);
@@ -1754,10 +1804,12 @@ sms_schedule (void)
       free_ddg (g);
     }
 
-  free (g_arr);
-
   /* Release scheduler data, needed until now because of DFA.  */
   haifa_sched_finish ();
+
+ done:
+  free (g_arr);
+
   loop_optimizer_finalize ();
 }
 
@@ -1844,6 +1896,7 @@ #define DFA_HISTORY SMS_DFA_HISTORY
 /* A threshold for the number of repeated unsuccessful attempts to insert
    an empty row, before we flush the partial schedule and start over.  */
 #define MAX_SPLIT_NUM 10
+
 /* Given the partial schedule PS, this function calculates and returns the
    cycles in which we can schedule the node with the given index I.
    NOTE: Here we do the backtracking in SMS, in some special cases. We have
@@ -1896,7 +1949,7 @@ get_sched_window (partial_schedule_ptr p
       fprintf (dump_file, "=========== =========== =========== ==========="
               " =====\n");
     }
-  /* Calculate early_start and limit end.  Both bounds are inclusive.  */
+  /* Calculate early_start and limit start.  Both bounds are inclusive.  */
   if (psp_not_empty)
     for (e = u_node->in; e != 0; e = e->next_in)
       {
@@ -1905,26 +1958,36 @@ get_sched_window (partial_schedule_ptr p
        if (TEST_BIT (sched_nodes, v))
          {
            int p_st = SCHED_TIME (v);
-           int earliest = p_st + e->latency - (e->distance * ii);
-           int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
+           int earliest = sat_subsp (sat_addsp (p_st, e->latency),
+                                     sat_mulpp (e->distance, ii));
 
+           if (e->data_type == MEM_DEP)
+             {
+               start = MAX (start, earliest);
+               if (dump_file)
+                 fprintf (dump_file, "%11d %11s %11s %11s",
+                          earliest, "", "", "");
+             }
+           else
+             {
+               early_start = MAX (early_start, earliest);
+               if (dump_file)
+                 fprintf (dump_file, "%11s %11d %11s %11s",
+                          "", earliest, "", "");
+             }
            if (dump_file)
              {
-               fprintf (dump_file, "%11s %11d %11s %11d %5d",
-                        "", earliest, "", latest, p_st);
+               fprintf (dump_file, " %5d", p_st);
                print_ddg_edge (dump_file, e);
                fprintf (dump_file, "\n");
              }
 
-           early_start = MAX (early_start, earliest);
-           end = MIN (end, latest);
-
            if (e->type == TRUE_DEP && e->data_type == REG_DEP)
              count_preds++;
          }
       }
 
-  /* Calculate late_start and limit start.  Both bounds are inclusive.  */
+  /* Calculate late_start and limit end.  Both bounds are inclusive.  */
   if (pss_not_empty)
     for (e = u_node->out; e != 0; e = e->next_out)
       {
@@ -1933,20 +1996,30 @@ get_sched_window (partial_schedule_ptr p
        if (TEST_BIT (sched_nodes, v))
          {
            int s_st = SCHED_TIME (v);
-           int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
-           int latest = s_st - e->latency + (e->distance * ii);
+           int latest = sat_addsp (sat_subsp (s_st, e->latency),
+                                   sat_mulpp (e->distance, ii));
 
+           if (e->data_type == MEM_DEP)
+             {
+               end = MIN (end, latest);
+               if (dump_file)
+                 fprintf (dump_file, "%11s %11s %11s %11d",
+                          "", "", "", latest);
+             }
+           else
+             {
+               late_start = MIN (late_start, latest);
+               if (dump_file)
+                 fprintf (dump_file, "%11s %11s %11d %11s",
+                          "", "", latest, "");
+             }
            if (dump_file)
              {
-               fprintf (dump_file, "%11d %11s %11d %11s %5d",
-                        earliest, "", latest, "", s_st);
+               fprintf (dump_file, " %5d", s_st);
                print_ddg_edge (dump_file, e);
                fprintf (dump_file, "\n");
              }
 
-           start = MAX (start, earliest);
-           late_start = MIN (late_start, latest);
-
            if (e->type == TRUE_DEP && e->data_type == REG_DEP)
              count_succs++;
          }
@@ -1963,14 +2036,22 @@ get_sched_window (partial_schedule_ptr p
 
   /* Get a target scheduling window no bigger than ii.  */
   if (early_start == INT_MIN && late_start == INT_MAX)
-    early_start = NODE_ASAP (u_node);
-  else if (early_start == INT_MIN)
-    early_start = late_start - (ii - 1);
-  late_start = MIN (late_start, early_start + (ii - 1));
-
-  /* Apply memory dependence limits.  */
-  start = MAX (start, early_start);
-  end = MIN (end, late_start);
+    {
+      /* The default window (as given in the paper) is based on
+        the node's ASAP value, but shift or shrink it as necessary
+        in order to honor memory dependencies.  */
+      early_start = MIN (NODE_ASAP (u_node), end - (ii - 1));
+      start = MAX (early_start, start);
+    }
+  else
+    {
+      end = MIN (end, late_start);
+      if (early_start == INT_MIN)
+       start = MAX (start, end - (ii - 1));
+      else
+       start = MAX (start, early_start);
+    }
+  end = MIN (end, start + (ii - 1));
 
   if (dump_file && (psp_not_empty || pss_not_empty))
     fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
@@ -2060,8 +2141,8 @@ calculate_must_precede_follow (ddg_node_
       SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window  */
   for (e = u_node->in; e != 0; e = e->next_in)
     if (TEST_BIT (sched_nodes, e->src->cuid)
-       && ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) ==
-             first_cycle_in_window))
+       && (sat_subsp (SCHED_TIME (e->src->cuid), sat_mulpp (e->distance, ii))
+           == first_cycle_in_window))
       {
        if (dump_file)
          fprintf (dump_file, "%d ", e->src->cuid);
@@ -2371,7 +2452,8 @@ compute_split_row (sbitmap sched_nodes, 
       int v = e->src->cuid;
 
       if (TEST_BIT (sched_nodes, v)
-         && (low == SCHED_TIME (v) + e->latency - (e->distance * ii)))
+         && low == sat_subsp (sat_addsp (SCHED_TIME (v), e->latency),
+                              sat_mulpp (e->distance, ii)))
        if (SCHED_TIME (v) > lower)
          {
            crit_pred = v;

_______________________________________________
linaro-toolchain mailing list
linaro-toolchain@lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-toolchain

Reply via email to