Here's the patch for sms-and-memory-dependencies. The idea is to bypass the sched-deps.c {output,read,anti,true}_dependence tests altogether -- which is easy to do thanks to the note_mem_dep hook -- and to handle memory dependencies in ddg.c instead. The ddg.c tests then use RTL loop iv analysis to try to get longer distances on the memory dependencies. (Note that other memory-related dependencies, such as those between volatile MEMs and other volatile instructions, are still handled by sched-deps.c.)
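For concreteness, here is a much-simplified sketch of the overlap-distance idea (the real code is measure_mem_distance in the patch; this toy version assumes a positive step and no address wrap-around, and every name in it is made up for the illustration):

/* Illustration only: the two references have addresses
   BASE + OFF1 + I * STEP and BASE + OFF2 + I * STEP in iteration I,
   and are SIZE1 and SIZE2 bytes wide.  Return the smallest N >= 0 such
   that the first reference in iteration 0 overlaps the second reference
   in iteration N, or -1 if they never overlap.  Assumes STEP > 0 and
   no address wrap-around.  */
static long
min_overlap_distance (long off1, long size1, long off2, long size2, long step)
{
  long delta = off2 - off1;
  long n;

  /* Overlap already in iteration 0?  */
  if (delta < size1 && delta > -size2)
    return 0;

  /* The second reference starts beyond the end of the first and both
     advance by STEP each iteration, so the gap never closes.  */
  if (delta >= size1)
    return -1;

  /* DELTA <= -SIZE2: the second reference starts behind the first and
     moves N * STEP bytes closer by iteration N; find the first N for
     which the two ranges meet, then check it hasn't jumped past.  */
  n = (-delta - size2) / step + 1;
  return delta + n * step < size1 ? n : -1;
}

The patch's measure_mem_distance does the same kind of calculation, but in modular address arithmetic and with saturation at MAX_DDG_DISTANCE.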
Memory dependencies are now always created in pairs, so there's no need for get_sched_window to set an implicit upper bound when processing incoming MEM_DEPs, or an implicit lower bound when processing outgoing MEM_DEPs; we can rely on the partnering edge to do that instead.
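To make that concrete, here is a toy sketch of the window bookkeeping (not the real get_sched_window, which also separates REG_DEPs from MEM_DEPs and uses the new saturating helpers; all names below are made up): every incoming edge from a scheduled node supplies a lower bound on the node's start time and every outgoing edge an upper bound, so once each memory dependence has a partner edge in the opposite direction, both sides of the window are bounded without any artificial ii-1 limit.

#include <limits.h>

/* Toy sketch: compute the legal window [*START, *END] for scheduling a
   node U from its edges to already-scheduled nodes.  IN_TIME, IN_LATENCY
   and IN_DISTANCE describe U's incoming edges, the OUT_* arrays its
   outgoing edges; II is the initiation interval.  */
static void
window_from_edges (int n_in, const int *in_time, const int *in_latency,
		   const int *in_distance,
		   int n_out, const int *out_time, const int *out_latency,
		   const int *out_distance,
		   int ii, int *start, int *end)
{
  int i;

  *start = INT_MIN;
  *end = INT_MAX;

  /* An edge V ---> U with latency L and distance D means
     time (U) >= time (V) + L - D * II.  */
  for (i = 0; i < n_in; i++)
    {
      int earliest = in_time[i] + in_latency[i] - in_distance[i] * ii;
      if (earliest > *start)
	*start = earliest;
    }

  /* An edge U ---> V gives the symmetric upper bound
     time (U) <= time (V) - L + D * II.  */
  for (i = 0; i < n_out; i++)
    {
      int latest = out_time[i] - out_latency[i] + out_distance[i] * ii;
      if (latest < *end)
	*end = latest;
    }
}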
Richard
gcc/
	* Makefile.in (ddg.o): Depend on $(TREE_PASS_H).
	* ddg.h (REG_OR_MEM_DEP, REG_AND_MEM_DEP): Delete.
	(ddg_mem_ref): New structure.
	(ddg): Add loads and stores array.
	(create_ddg): Add a loop argument.
	(add_edges_to_ddg): Declare.
	(MAX_DDG_DISTANCE): New macro.
	* ddg.c: Include tree-pass.h.
	(mem_ref_p, mark_mem_use, mark_mem_use_1, mem_read_insn_p)
	(mark_mem_store, mem_write_insn_p, rtx_mem_access_p)
	(mem_access_insn_p): Delete.
	(create_mem_ref): New function.
	(graph_and_node): New structure.
	(record_loads, record_stores): New functions.
	(create_ddg_dep_from_intra_loop_link): Treat all dependencies
	as register dependencies.
	(walk_mems_2, walk_mems_1, insns_may_alias_p, add_intra_loop_mem_dep)
	(add_inter_loop_mem_dep): Delete.
	(build_intra_loop_deps): Ignore memory dependencies created by
	sched-deps.c.  Don't handle memory dependencies here.
	(measure_mem_distance, add_memory_dep): New functions.
	(FOR_EACH_LATER_MEM_REF): New macro.
	(build_memory_deps): New function.
	(create_ddg): Take the loop as argument.  Don't count loads and
	stores here.  Call iv_analysis_loop_init.  Pass all loads to
	record_loads and all stores to record_stores.  Move edge
	creation to...
	(add_edges_to_ddg): ...this new function.  Also call
	build_memory_deps.
	* modulo-sched.c (sat_mulpp, sat_addsp, sat_subsp): New functions.
	(schedule_reg_moves): Only handle register dependencies.
	(sms_schedule): Update call to create_ddg.  Call iv_analysis_done
	after creating all ddgs.  Only set issue_rate if there are ddgs.
	Only call setup_sched_infos and haifa_sched_init if there are
	ddgs.  Call add_edges_to_ddg before processing each ddg.
	(get_sched_window): Use saturating arithmetic.  Do not add an
	implicit upper bound for incoming MEM_DEPs, or an implicit lower
	bound for outgoing MEM_DEPs.  Rework calculation of final window.
	(calculate_must_precede_follow, compute_split_row): Use
	saturating arithmetic.
Index: gcc/Makefile.in =================================================================== --- gcc/Makefile.in 2011-12-30 13:13:45.077544981 +0000 +++ gcc/Makefile.in 2011-12-30 13:24:57.330195801 +0000 @@ -3316,7 +3316,7 @@ ddg.o : ddg.c $(DDG_H) $(CONFIG_H) $(SYS $(DIAGNOSTIC_CORE_H) $(RTL_H) $(TM_P_H) $(REGS_H) $(FUNCTION_H) \ $(FLAGS_H) insn-config.h $(INSN_ATTR_H) $(EXCEPT_H) $(RECOG_H) \ $(SCHED_INT_H) $(CFGLAYOUT_H) $(CFGLOOP_H) $(EXPR_H) $(BITMAP_H) \ - hard-reg-set.h sbitmap.h $(TM_H) + hard-reg-set.h sbitmap.h $(TM_H) $(TREE_PASS_H) modulo-sched.o : modulo-sched.c $(DDG_H) $(CONFIG_H) $(CONFIG_H) $(SYSTEM_H) \ coretypes.h $(TARGET_H) $(DIAGNOSTIC_CORE_H) $(RTL_H) $(TM_P_H) $(REGS_H) $(FUNCTION_H) \ $(FLAGS_H) insn-config.h $(INSN_ATTR_H) $(EXCEPT_H) $(RECOG_H) \ Index: gcc/ddg.h =================================================================== --- gcc/ddg.h 2011-12-30 13:13:45.077544981 +0000 +++ gcc/ddg.h 2011-12-30 13:24:57.324195831 +0000 @@ -35,8 +35,7 @@ typedef struct ddg_scc *ddg_scc_ptr; typedef struct ddg_all_sccs *ddg_all_sccs_ptr;
typedef enum {TRUE_DEP, OUTPUT_DEP, ANTI_DEP} dep_type; -typedef enum {REG_OR_MEM_DEP, REG_DEP, MEM_DEP, REG_AND_MEM_DEP} - dep_data_type; +typedef enum {REG_DEP, MEM_DEP} dep_data_type;
/* The following two macros enables direct access to the successors and predecessors bitmaps held in each ddg_node. Do not make changes to @@ -44,6 +43,28 @@ typedef enum {REG_OR_MEM_DEP, REG_DEP, M #define NODE_SUCCESSORS(x) ((x)->successors) #define NODE_PREDECESSORS(x) ((x)->predecessors)
+/* A structure that represents a memory read or write in the DDG; + context decides which. */ +struct ddg_mem_ref { + /* The previous reference of the same type (read or write) in the DDG. */ + struct ddg_mem_ref *prev; + + /* The DDG node that contains the memory reference. */ + ddg_node_ptr node; + + /* The memory reference itself. */ + rtx mem; + + /* If the address is a known induction variable, its value in iteration + I is given by: + + BASE + OFFSET + I * STEP + + In other cases BASE is null. */ + rtx base; + HOST_WIDE_INT offset, step; +}; + /* A structure that represents a node in the DDG. */ struct ddg_node { @@ -117,6 +138,11 @@ struct ddg /* Number of instructions in the basic block. */ int num_nodes;
+ /* The loads and stores in the BB, from the end of the block to + the beginning. */ + struct ddg_mem_ref *loads; + struct ddg_mem_ref *stores; + /* Number of load/store instructions in the BB - statistics. */ int num_loads; int num_stores; @@ -167,7 +193,9 @@ struct ddg_all_sccs };
-ddg_ptr create_ddg (basic_block, int closing_branch_deps); +struct loop; +ddg_ptr create_ddg (struct loop *, basic_block, int closing_branch_deps); +void add_edges_to_ddg (ddg_ptr); void free_ddg (ddg_ptr);
void print_ddg (FILE *, ddg_ptr); @@ -188,4 +216,7 @@ int longest_simple_path (ddg_ptr, int fr
bool autoinc_var_is_used_p (rtx, rtx);
+/* The maximum allowable distance on a DDG edge. */ +#define MAX_DDG_DISTANCE INT_MAX + #endif /* GCC_DDG_H */ Index: gcc/ddg.c =================================================================== --- gcc/ddg.c 2011-12-30 13:13:45.077544981 +0000 +++ gcc/ddg.c 2011-12-30 13:36:35.005498271 +0000 @@ -43,6 +43,7 @@ Software Foundation; either version 3, o #include "expr.h" #include "bitmap.h" #include "ddg.h" +#include "tree-pass.h"
#ifdef INSN_SCHEDULING
@@ -61,88 +62,102 @@ static ddg_edge_ptr create_ddg_edge (ddg dep_data_type, int, int); static void add_edge_to_ddg (ddg_ptr g, ddg_edge_ptr); -/* Auxiliary variable for mem_read_insn_p/mem_write_insn_p. */ -static bool mem_ref_p; +/* Create a memory reference record for MEM, which occurs in NODE. + PREV is the previous reference of the same type. */ +static struct ddg_mem_ref * +create_mem_ref (struct ddg_mem_ref *prev, ddg_node_ptr node, rtx mem) +{ + struct ddg_mem_ref *entry; + enum machine_mode pmode; + struct rtx_iv iv; + rtx x; + + entry = XCNEW (struct ddg_mem_ref); + entry->prev = prev; + entry->node = node; + entry->mem = mem; + + pmode = targetm.addr_space.address_mode (MEM_ADDR_SPACE (mem)); + if (iv_analyze_expr (node->insn, XEXP (mem, 0), pmode, &iv) + && iv.extend == UNKNOWN + && CONST_INT_P (iv.step)) + { + x = iv.base; + if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) + { + entry->base = XEXP (x, 0); + entry->offset = INTVAL (XEXP (x, 1)); + } + else + { + entry->base = x; + entry->offset = 0; + } + entry->step = INTVAL (iv.step); + }
-/* Auxiliary function for mem_read_insn_p. */ -static int -mark_mem_use (rtx *x, void *data ATTRIBUTE_UNUSED) -{ - if (MEM_P (*x)) - mem_ref_p = true; - return 0; + if (dump_file) + { + fprintf (dump_file, "Found memory reference in insn %d:\n", + INSN_UID (node->insn)); + print_rtl (dump_file, mem); + if (entry->base) + { + fprintf (dump_file, "\nwith base:"); + print_rtl (dump_file, entry->base); + fprintf (dump_file, "\noffset " HOST_WIDE_INT_PRINT_DEC + " and step " HOST_WIDE_INT_PRINT_DEC "\n\n", + entry->offset, entry->step); + } + else + fprintf (dump_file, "\nwhich isn't a recognized iv\n\n"); + } + return entry; }
-/* Auxiliary function for mem_read_insn_p. */ -static void -mark_mem_use_1 (rtx *x, void *data) -{ - for_each_rtx (x, mark_mem_use, data); -} +/* A structure for pairing a node and the graph to which it belongs. */ +struct graph_and_node { + ddg_ptr g; + ddg_node_ptr node; +};
-/* Returns nonzero if INSN reads from memory. */ -static bool -mem_read_insn_p (rtx insn) +/* A for_each_rtx callback. Record all loads in an instruction. + DATA points to a graph_and_node. */ +static int +record_loads_1 (rtx *loc, void *data) { - mem_ref_p = false; - note_uses (&PATTERN (insn), mark_mem_use_1, NULL); - return mem_ref_p; -} + struct graph_and_node *gn;
-static void -mark_mem_store (rtx loc, const_rtx setter ATTRIBUTE_UNUSED, void *data ATTRIBUTE_UNUSED) -{ - if (MEM_P (loc)) - mem_ref_p = true; + if (MEM_P (*loc)) + { + gn = (struct graph_and_node *) data; + gn->g->loads = create_mem_ref (gn->g->loads, gn->node, *loc); + gn->g->num_loads++; + } + return 0; }
-/* Returns nonzero if INSN writes to memory. */ -static bool -mem_write_insn_p (rtx insn) +/* A note_uses callback. Record all loads in an instruction. + DATA points to a graph_and_node. */ +static void +record_loads (rtx *loc, void *data) { - mem_ref_p = false; - note_stores (PATTERN (insn), mark_mem_store, NULL); - return mem_ref_p; + for_each_rtx (loc, record_loads_1, data); }
-/* Returns nonzero if X has access to memory. */ -static bool -rtx_mem_access_p (rtx x) +/* A note_stores callback. Record all stores in an instruction. + DATA points to a graph_and_node. */ +static void +record_stores (rtx x, const_rtx setter ATTRIBUTE_UNUSED, void *data) { - int i, j; - const char *fmt; - enum rtx_code code; - - if (x == 0) - return false; + struct graph_and_node *gn;
if (MEM_P (x)) - return true; - - code = GET_CODE (x); - fmt = GET_RTX_FORMAT (code); - for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--) { - if (fmt[i] == 'e') - { - if (rtx_mem_access_p (XEXP (x, i))) - return true; - } - else if (fmt[i] == 'E') - for (j = 0; j < XVECLEN (x, i); j++) - { - if (rtx_mem_access_p (XVECEXP (x, i, j))) - return true; - } + gn = (struct graph_and_node *) data; + gn->g->stores = create_mem_ref (gn->g->stores, gn->node, x); + gn->g->num_stores++; } - return false; -} - -/* Returns nonzero if INSN reads to or writes from memory. */ -static bool -mem_access_insn_p (rtx insn) -{ - return rtx_mem_access_p (PATTERN (insn)); }
/* Return true if DEF_INSN contains address being auto-inc or auto-dec @@ -175,9 +190,6 @@ create_ddg_dep_from_intra_loop_link (ddg ddg_edge_ptr e; int latency, distance = 0; dep_type t = TRUE_DEP; - dep_data_type dt = (mem_access_insn_p (src_node->insn) - && mem_access_insn_p (dest_node->insn) ? MEM_DEP - : REG_DEP); gcc_assert (src_node->cuid < dest_node->cuid); gcc_assert (link);
@@ -201,7 +213,7 @@ create_ddg_dep_from_intra_loop_link (ddg TODO: support the removal of all anti-deps edges, i.e. including those whose register has multiple defs in the loop. */ if (flag_modulo_sched_allow_regmoves - && (t == ANTI_DEP && dt == REG_DEP) + && t == ANTI_DEP && !autoinc_var_is_used_p (dest_node->insn, src_node->insn)) { rtx set; @@ -224,7 +236,7 @@ create_ddg_dep_from_intra_loop_link (ddg }
latency = dep_cost (link); - e = create_ddg_edge (src_node, dest_node, t, dt, latency, distance); + e = create_ddg_edge (src_node, dest_node, t, REG_DEP, latency, distance); add_edge_to_ddg (g, e); }
@@ -380,107 +392,6 @@ build_inter_loop_deps (ddg_ptr g) } }
- -static int -walk_mems_2 (rtx *x, rtx mem) -{ - if (MEM_P (*x)) - { - if (may_alias_p (*x, mem)) - return 1; - - return -1; - } - return 0; -} - -static int -walk_mems_1 (rtx *x, rtx *pat) -{ - if (MEM_P (*x)) - { - /* Visit all MEMs in *PAT and check indepedence. */ - if (for_each_rtx (pat, (rtx_function) walk_mems_2, *x)) - /* Indicate that dependence was determined and stop traversal. */ - return 1; - - return -1; - } - return 0; -} - -/* Return 1 if two specified instructions have mem expr with conflict alias sets*/ -static int -insns_may_alias_p (rtx insn1, rtx insn2) -{ - /* For each pair of MEMs in INSN1 and INSN2 check their independence. */ - return for_each_rtx (&PATTERN (insn1), (rtx_function) walk_mems_1, - &PATTERN (insn2)); -} - -/* Given two nodes, analyze their RTL insns and add intra-loop mem deps - to ddg G. */ -static void -add_intra_loop_mem_dep (ddg_ptr g, ddg_node_ptr from, ddg_node_ptr to) -{ - - if ((from->cuid == to->cuid) - || !insns_may_alias_p (from->insn, to->insn)) - /* Do not create edge if memory references have disjoint alias sets - or 'to' and 'from' are the same instruction. */ - return; - - if (mem_write_insn_p (from->insn)) - { - if (mem_read_insn_p (to->insn)) - create_ddg_dep_no_link (g, from, to, - DEBUG_INSN_P (to->insn) - ? ANTI_DEP : TRUE_DEP, MEM_DEP, 0); - else - create_ddg_dep_no_link (g, from, to, - DEBUG_INSN_P (to->insn) - ? ANTI_DEP : OUTPUT_DEP, MEM_DEP, 0); - } - else if (!mem_read_insn_p (to->insn)) - create_ddg_dep_no_link (g, from, to, ANTI_DEP, MEM_DEP, 0); -} - -/* Given two nodes, analyze their RTL insns and add inter-loop mem deps - to ddg G. */ -static void -add_inter_loop_mem_dep (ddg_ptr g, ddg_node_ptr from, ddg_node_ptr to) -{ - if (!insns_may_alias_p (from->insn, to->insn)) - /* Do not create edge if memory references have disjoint alias sets. */ - return; - - if (mem_write_insn_p (from->insn)) - { - if (mem_read_insn_p (to->insn)) - create_ddg_dep_no_link (g, from, to, - DEBUG_INSN_P (to->insn) - ? ANTI_DEP : TRUE_DEP, MEM_DEP, 1); - else if (from->cuid != to->cuid) - create_ddg_dep_no_link (g, from, to, - DEBUG_INSN_P (to->insn) - ? ANTI_DEP : OUTPUT_DEP, MEM_DEP, 1); - } - else - { - if (mem_read_insn_p (to->insn)) - return; - else if (from->cuid != to->cuid) - { - create_ddg_dep_no_link (g, from, to, ANTI_DEP, MEM_DEP, 1); - if (DEBUG_INSN_P (from->insn) || DEBUG_INSN_P (to->insn)) - create_ddg_dep_no_link (g, to, from, ANTI_DEP, MEM_DEP, 1); - else - create_ddg_dep_no_link (g, to, from, TRUE_DEP, MEM_DEP, 1); - } - } - -} - /* Perform intra-block Data Dependency analysis and connect the nodes in the DDG. We assume the loop has a single basic block. */ static void @@ -493,6 +404,9 @@ build_intra_loop_deps (ddg_ptr g)
/* Build the dependence information, using the sched_analyze function. */ init_deps_global (); + /* Ignore the usual dependencies between two MEM rtxes. We still rely + on sched_analyze to handle memory barriers and the like. */ + sched_deps_info->note_mem_dep = 0; init_deps (&tmp_deps, false);
/* Do the intra-block data dependence analysis for the given block. */ @@ -519,37 +433,6 @@ build_intra_loop_deps (ddg_ptr g)
create_ddg_dep_from_intra_loop_link (g, src_node, dest_node, dep); } - - /* If this insn modifies memory, add an edge to all insns that access - memory. */ - if (mem_access_insn_p (dest_node->insn)) - { - int j; - - for (j = 0; j <= i; j++) - { - ddg_node_ptr j_node = &g->nodes[j]; - if (DEBUG_INSN_P (j_node->insn)) - continue; - if (mem_access_insn_p (j_node->insn)) - { - /* Don't bother calculating inter-loop dep if an intra-loop dep - already exists. */ - if (! TEST_BIT (dest_node->successors, j)) - add_inter_loop_mem_dep (g, dest_node, j_node); - /* If -fmodulo-sched-allow-regmoves - is set certain anti-dep edges are not created. - It might be that these anti-dep edges are on the - path from one memory instruction to another such that - removing these edges could cause a violation of the - memory dependencies. Thus we add intra edges between - every two memory instructions in this case. */ - if (flag_modulo_sched_allow_regmoves - && !TEST_BIT (dest_node->predecessors, j)) - add_intra_loop_mem_dep (g, j_node, dest_node); - } - } - } }
/* Free the INSN_LISTs. */ @@ -560,13 +443,187 @@ build_intra_loop_deps (ddg_ptr g) sched_free_deps (head, tail, false); }
+/* Given a "source" memory reference from iteration 0 and a "target" + memory reference from iteration BASE_DISTANCE, return the first + N >= BASE_DISTANCE such that the source reference in iteration 0 + overlaps the target reference in iteration N. + + FROM_OFFSET is the offset of the source reference from an unspecified + base, while TO_OFFSET is the offset of the target reference from that + same base. FROM_SIZE and TO_SIZE are the sizes of the two references + in bytes. + + PMODE is the mode of both addresses, and STEP is the amount that + will be added to each address by one loop iteration. */ +static int +measure_mem_distance (enum machine_mode pmode, + unsigned HOST_WIDE_INT from_offset, + unsigned HOST_WIDE_INT from_size, + unsigned HOST_WIDE_INT to_offset, + unsigned HOST_WIDE_INT to_size, + unsigned HOST_WIDE_INT base_distance, + HOST_WIDE_INT step) +{ + unsigned HOST_WIDE_INT extra, from2to, to2from; + + from2to = (to_offset - from_offset) & GET_MODE_MASK (pmode); + to2from = (from_offset - to_offset) & GET_MODE_MASK (pmode); + if (from2to < from_size || to2from < to_size) + /* The source reference in iteration 0 overlaps the target reference + in iteration BASE_DISTANCE. The check is written this way to cope + with cases where offset + size overflows. */ + return base_distance; + + /* N > BASE_DISTANCE. To cope more easily with cases where the round-up + divisions: + + (to2from - (to_size - 1) + (step - 1)) / step + (from2to - (from_size - 1) + (step - 1)) / step + + would overflow, bump BASE_DISTANCE and subtract STEP from each + dividend to compensate. */ + base_distance++; + if (step > 0) + extra = (to2from - to_size) / (unsigned HOST_WIDE_INT) step; + else + extra = (from2to - from_size) / (unsigned HOST_WIDE_INT) -step; + if (extra > MAX_DDG_DISTANCE || base_distance + extra > MAX_DDG_DISTANCE) + return MAX_DDG_DISTANCE; + return base_distance + extra; +} + +/* If FROM and TO might alias, record memory dependencies: + + FROM--(FORWARD_TYPE)-->TO + and TO--(BACKWARD_TYPE)-->FROM + + FROM comes before TO in the original loop, and both belong to G. + FORWARD_DISTANCE is the minimum distance of the FROM--->TO dependence. */ +static void +add_memory_dep (ddg_ptr g, struct ddg_mem_ref *from, + struct ddg_mem_ref *to, dep_type forward_type, + dep_type backward_type, int forward_distance) +{ + HOST_WIDE_INT step; + unsigned HOST_WIDE_INT from_size, to_size, to_disp, abs_step, future_offset; + enum machine_mode pmode; + int backward_distance; + + gcc_checking_assert (from->node->cuid < to->node->cuid); + + if (!may_alias_p (from->mem, to->mem)) + return; + + /* In the worst case, the TO---->FROM edge has a distance of 1. */ + backward_distance = 1; + + /* See if we can get more accurate distances. Look for cases where + the addresses of FROM and TO are ivs with the same base and step. */ + if (from->base + && to->base + && from->step + && from->step == to->step + && !MEM_VOLATILE_P (from->mem) + && !MEM_VOLATILE_P (to->mem) + && MEM_SIZE_KNOWN_P (from->mem) + && MEM_SIZE_KNOWN_P (to->mem) + && MEM_ADDR_SPACE (from->mem) == MEM_ADDR_SPACE (to->mem) + && rtx_equal_p (from->base, to->base)) + { + step = to->step; + abs_step = (step < 0 ? -step : step); + from_size = MEM_SIZE (from->mem); + to_size = MEM_SIZE (to->mem); + + /* If the step is a power of two, or the negative of a power of two, + see whether we can prove that the references never overlap. 
*/ + if (abs_step == (abs_step & -abs_step)) + { + to_disp = (to->offset - from->offset) % abs_step; + if (from_size <= to_disp && to_disp + to_size <= abs_step) + return; + } + + pmode = targetm.addr_space.address_mode (MEM_ADDR_SPACE (to->mem)); + future_offset = to->offset + forward_distance * step; + forward_distance = measure_mem_distance (pmode, + from->offset, from_size, + future_offset, to_size, + forward_distance, step); + future_offset = from->offset + backward_distance * step; + backward_distance = measure_mem_distance (pmode, + to->offset, to_size, + future_offset, from_size, + backward_distance, step); + } + + if (DEBUG_INSN_P (from->node->insn) || DEBUG_INSN_P (to->node->insn)) + { + forward_type = ANTI_DEP; + backward_type = ANTI_DEP; + } + create_ddg_dep_no_link (g, from->node, to->node, forward_type, MEM_DEP, + forward_distance); + create_ddg_dep_no_link (g, to->node, from->node, backward_type, MEM_DEP, + backward_distance); +} + +/* Make REF2 iterate over all entries in ddg_mem_ref list LIST + that come later than ddg_mem_ref REF1. */ +#define FOR_EACH_LATER_MEM_REF(REF2, REF1, LIST) \ + for (REF2 = (LIST); \ + REF2 && REF2->node->cuid > REF1->node->cuid; \ + REF2 = REF2->prev) + +/* Check for dependencies between pairs of memory rtxes. */ +static void +build_memory_deps (ddg_ptr g) +{ + struct ddg_mem_ref *ref1, *ref2; + int distance; + + for (ref1 = g->loads; ref1; ref1 = ref1->prev) + { + /* LOAD--->LOAD. */ + if (MEM_VOLATILE_P (ref1->mem)) + FOR_EACH_LATER_MEM_REF (ref2, ref1, g->loads) + if (MEM_VOLATILE_P (ref2->mem)) + add_memory_dep (g, ref1, ref2, ANTI_DEP, ANTI_DEP, 0); + + /* LOAD--->STORE. */ + FOR_EACH_LATER_MEM_REF (ref2, ref1, g->stores) + { + distance = anti_dependence (ref1->mem, ref2->mem) ? 0 : 1; + add_memory_dep (g, ref1, ref2, ANTI_DEP, TRUE_DEP, distance); + } + } + + for (ref1 = g->stores; ref1; ref1 = ref1->prev) + { + /* STORE--->LOAD. */ + FOR_EACH_LATER_MEM_REF (ref2, ref1, g->loads) + { + distance = true_dependence (ref1->mem, VOIDmode, + ref2->mem, rtx_varies_p) ? 0 : 1; + add_memory_dep (g, ref1, ref2, TRUE_DEP, ANTI_DEP, distance); + } + + /* STORE--->STORE. */ + FOR_EACH_LATER_MEM_REF (ref2, ref1, g->stores) + { + distance = output_dependence (ref1->mem, ref2->mem) ? 0 : 1; + add_memory_dep (g, ref1, ref2, OUTPUT_DEP, OUTPUT_DEP, distance); + } + } +}
-/* Given a basic block, create its DDG and return a pointer to a variable - of ddg type that represents it. +/* Given a basic block, create the nodes of its DDG and return a pointer + to a variable of ddg type that represents it. Initialize the ddg structure fields to the appropriate values. */ ddg_ptr -create_ddg (basic_block bb, int closing_branch_deps) +create_ddg (struct loop *loop, basic_block bb, int closing_branch_deps) { + struct graph_and_node gn; ddg_ptr g; rtx insn, first_note; int i; @@ -586,13 +643,6 @@ create_ddg (basic_block bb, int closing_
if (DEBUG_INSN_P (insn)) g->num_debug++; - else - { - if (mem_read_insn_p (insn)) - g->num_loads++; - if (mem_write_insn_p (insn)) - g->num_stores++; - } num_nodes++; }
@@ -603,6 +653,8 @@ create_ddg (basic_block bb, int closing_ return NULL; }
+ iv_analysis_loop_init (loop); + /* Allocate the nodes array, and initialize the nodes. */ g->num_nodes = num_nodes; g->nodes = (ddg_node_ptr) xcalloc (num_nodes, sizeof (struct ddg_node)); @@ -637,18 +689,31 @@ create_ddg (basic_block bb, int closing_ g->nodes[i].predecessors = sbitmap_alloc (num_nodes); sbitmap_zero (g->nodes[i].predecessors); g->nodes[i].first_note = (first_note ? first_note : insn); - g->nodes[i++].insn = insn; + g->nodes[i].insn = insn; first_note = NULL_RTX; + + gn.g = g; + gn.node = &g->nodes[i]; + note_uses (&PATTERN (insn), record_loads, &gn); + note_stores (PATTERN (insn), record_stores, &gn); + + i++; }
/* We must have found a branch in DDG. */ gcc_assert (g->closing_branch); + return g; +}
+/* Add the edges to a DDG that was previously created by create_ddg. + This function relies on scheduler dependencies. */
- /* Build the data dependency graph. */ +void +add_edges_to_ddg (ddg_ptr g) +{ build_intra_loop_deps (g); build_inter_loop_deps (g); - return g; + build_memory_deps (g); }
/* Free all the memory allocated for the DDG. */ Index: gcc/modulo-sched.c =================================================================== --- gcc/modulo-sched.c 2011-12-30 13:13:45.077544981 +0000 +++ gcc/modulo-sched.c 2011-12-30 13:24:57.327195816 +0000 @@ -345,6 +345,38 @@ ps_num_consecutive_stages (partial_sched return ps_reg_move (ps, id)->num_consecutive_stages; }
+/* Perform a saturating multiplication of nonnegative values A and B. */ + +static inline int +sat_mulpp (unsigned int a, unsigned int b) +{ + if ((unsigned int) INT_MAX / b <= a) + return INT_MAX; + else + return a * b; +} + +/* Perform a saturating addition of signed value A and nonnegative value B. */ + +static inline int +sat_addsp (int a, int b) +{ + if (INT_MAX - b <= a) + return INT_MAX; + return a + b; +} + +/* Perform a saturating subtraction of signed value A and nonnegative + value B. */ + +static inline int +sat_subsp (int a, int b) +{ + if (INT_MIN + b >= a) + return INT_MIN; + return a - b; +} + /* Given HEAD and TAIL which are the first and last insns in a loop; return the register which controls the loop. Return zero if it has more than one occurrence in the loop besides the control part or the @@ -709,7 +741,9 @@ schedule_reg_moves (partial_schedule_ptr ranges started at u (excluding self-loops). */ distances[0] = distances[1] = false; for (e = u->out; e; e = e->next_out) - if (e->type == TRUE_DEP && e->dest != e->src) + if (e->data_type == REG_DEP + && e->type == TRUE_DEP + && e->dest != e->src) { int nreg_moves4e = (SCHED_TIME (e->dest->cuid) - SCHED_TIME (e->src->cuid)) / ii; @@ -781,7 +815,9 @@ schedule_reg_moves (partial_schedule_ptr copy of this register, depending on the time the use is scheduled. Record which uses require which move results. */ for (e = u->out; e; e = e->next_out) - if (e->type == TRUE_DEP && e->dest != e->src) + if (e->data_type == REG_DEP + && e->type == TRUE_DEP + && e->dest != e->src) { int dest_copy = (SCHED_TIME (e->dest->cuid) - SCHED_TIME (e->src->cuid)) / ii; @@ -1355,6 +1391,7 @@ sms_schedule (void) basic_block condition_bb = NULL; edge latch_edge; gcov_type trip_count = 0; + int num_ddgs;
loop_optimizer_init (LOOPS_HAVE_PREHEADERS | LOOPS_HAVE_RECORDED_EXITS); @@ -1364,34 +1401,19 @@ sms_schedule (void) return; /* There are no loops to schedule. */ }
- /* Initialize issue_rate. */ - if (targetm.sched.issue_rate) - { - int temp = reload_completed; - - reload_completed = 1; - issue_rate = targetm.sched.issue_rate (); - reload_completed = temp; - } - else - issue_rate = 1; - - /* Initialize the scheduler. */ - setup_sched_infos (); - haifa_sched_init (); - /* Allocate memory to hold the DDG array one entry for each loop. We use loop->num as index into this array. */ g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
if (dump_file) - { - fprintf (dump_file, "\n\nSMS analysis phase\n"); - fprintf (dump_file, "===================\n\n"); - } + { + fprintf (dump_file, "\n\nSMS loop discovery phase\n"); + fprintf (dump_file, "========================\n\n"); + }
/* Build DDGs for all the relevant loops and hold them in G_ARR indexed by the loop index. */ + num_ddgs = 0; FOR_EACH_LOOP (li, loop, 0) { rtx head, tail; @@ -1512,7 +1534,7 @@ sms_schedule (void) instructions. The branch is rotated to be in row ii-1 at the end of the scheduling procedure to make sure it's the last instruction in the iteration. */ - if (! (g = create_ddg (bb, 1))) + if (! (g = create_ddg (loop, bb, 1))) { if (dump_file) fprintf (dump_file, "SMS create_ddg failed\n"); @@ -1523,12 +1545,38 @@ sms_schedule (void) if (dump_file) fprintf (dump_file, "...OK\n");
+ num_ddgs++; + } + iv_analysis_done (); + + if (num_ddgs == 0) + { + if (dump_file) + fprintf (dump_file, "No suitable loops\n"); + goto done; } + + /* Initialize issue_rate. */ + if (targetm.sched.issue_rate) + { + int temp = reload_completed; + + reload_completed = 1; + issue_rate = targetm.sched.issue_rate (); + reload_completed = temp; + } + else + issue_rate = 1; + + /* Initialize the scheduler. */ + setup_sched_infos (); + haifa_sched_init (); + if (dump_file) - { - fprintf (dump_file, "\nSMS transformation phase\n"); - fprintf (dump_file, "=========================\n\n"); - } + { + fprintf (dump_file, "\nSMS transformation phase\n"); + fprintf (dump_file, "=========================\n\n"); + }
/* We don't want to perform SMS on new loops - created by versioning. */ FOR_EACH_LOOP (li, loop, 0) @@ -1542,6 +1590,8 @@ sms_schedule (void) if (! (g = g_arr[loop->num])) continue;
+ add_edges_to_ddg (g); + if (dump_file) { rtx insn = BB_END (loop->header); @@ -1754,10 +1804,12 @@ sms_schedule (void) free_ddg (g); }
- free (g_arr); - /* Release scheduler data, needed until now because of DFA. */ haifa_sched_finish (); + + done: + free (g_arr); + loop_optimizer_finalize (); }
@@ -1844,6 +1896,7 @@ #define DFA_HISTORY SMS_DFA_HISTORY /* A threshold for the number of repeated unsuccessful attempts to insert an empty row, before we flush the partial schedule and start over. */ #define MAX_SPLIT_NUM 10 + /* Given the partial schedule PS, this function calculates and returns the cycles in which we can schedule the node with the given index I. NOTE: Here we do the backtracking in SMS, in some special cases. We have @@ -1896,7 +1949,7 @@ get_sched_window (partial_schedule_ptr p fprintf (dump_file, "=========== =========== =========== ===========" " =====\n"); } - /* Calculate early_start and limit end. Both bounds are inclusive. */ + /* Calculate early_start and limit start. Both bounds are inclusive. */ if (psp_not_empty) for (e = u_node->in; e != 0; e = e->next_in) { @@ -1905,26 +1958,36 @@ get_sched_window (partial_schedule_ptr p if (TEST_BIT (sched_nodes, v)) { int p_st = SCHED_TIME (v); - int earliest = p_st + e->latency - (e->distance * ii); - int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX); + int earliest = sat_subsp (sat_addsp (p_st, e->latency), + sat_mulpp (e->distance, ii));
+ if (e->data_type == MEM_DEP) + { + start = MAX (start, earliest); + if (dump_file) + fprintf (dump_file, "%11d %11s %11s %11s", + earliest, "", "", ""); + } + else + { + early_start = MAX (early_start, earliest); + if (dump_file) + fprintf (dump_file, "%11s %11d %11s %11s", + "", earliest, "", ""); + } if (dump_file) { - fprintf (dump_file, "%11s %11d %11s %11d %5d", - "", earliest, "", latest, p_st); + fprintf (dump_file, " %5d", p_st); print_ddg_edge (dump_file, e); fprintf (dump_file, "\n"); }
- early_start = MAX (early_start, earliest); - end = MIN (end, latest); - if (e->type == TRUE_DEP && e->data_type == REG_DEP) count_preds++; } }
- /* Calculate late_start and limit start. Both bounds are inclusive. */ + /* Calculate late_start and limit end. Both bounds are inclusive. */ if (pss_not_empty) for (e = u_node->out; e != 0; e = e->next_out) { @@ -1933,20 +1996,30 @@ get_sched_window (partial_schedule_ptr p if (TEST_BIT (sched_nodes, v)) { int s_st = SCHED_TIME (v); - int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN); - int latest = s_st - e->latency + (e->distance * ii); + int latest = sat_addsp (sat_subsp (s_st, e->latency), + sat_mulpp (e->distance, ii));
+ if (e->data_type == MEM_DEP) + { + end = MIN (end, latest); + if (dump_file) + fprintf (dump_file, "%11s %11s %11s %11d", + "", "", "", latest); + } + else + { + late_start = MIN (late_start, latest); + if (dump_file) + fprintf (dump_file, "%11s %11s %11d %11s", + "", "", latest, ""); + } if (dump_file) { - fprintf (dump_file, "%11d %11s %11d %11s %5d", - earliest, "", latest, "", s_st); + fprintf (dump_file, " %5d", s_st); print_ddg_edge (dump_file, e); fprintf (dump_file, "\n"); }
- start = MAX (start, earliest); - late_start = MIN (late_start, latest); - if (e->type == TRUE_DEP && e->data_type == REG_DEP) count_succs++; } @@ -1963,14 +2036,22 @@ get_sched_window (partial_schedule_ptr p
/* Get a target scheduling window no bigger than ii. */ if (early_start == INT_MIN && late_start == INT_MAX) - early_start = NODE_ASAP (u_node); - else if (early_start == INT_MIN) - early_start = late_start - (ii - 1); - late_start = MIN (late_start, early_start + (ii - 1)); - - /* Apply memory dependence limits. */ - start = MAX (start, early_start); - end = MIN (end, late_start); + { + /* The default window (as given in the paper) is based on + the node's ASAP value, but shift or shrink it as necessary + in order to honor memory dependencies. */ + early_start = MIN (NODE_ASAP (u_node), end - (ii - 1)); + start = MAX (early_start, start); + } + else + { + end = MIN (end, late_start); + if (early_start == INT_MIN) + start = MAX (start, end - (ii - 1)); + else + start = MAX (start, early_start); + } + end = MIN (end, start + (ii - 1));
if (dump_file && (psp_not_empty || pss_not_empty)) fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n", @@ -2060,8 +2141,8 @@ calculate_must_precede_follow (ddg_node_ SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window */ for (e = u_node->in; e != 0; e = e->next_in) if (TEST_BIT (sched_nodes, e->src->cuid) - && ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) == - first_cycle_in_window)) + && (sat_subsp (SCHED_TIME (e->src->cuid), sat_mulpp (e->distance, ii)) + == first_cycle_in_window)) { if (dump_file) fprintf (dump_file, "%d ", e->src->cuid); @@ -2371,7 +2452,8 @@ compute_split_row (sbitmap sched_nodes, int v = e->src->cuid;
if (TEST_BIT (sched_nodes, v) - && (low == SCHED_TIME (v) + e->latency - (e->distance * ii))) + && low == sat_subsp (sat_addsp (SCHED_TIME (v), e->latency), + sat_mulpp (e->distance, ii))) if (SCHED_TIME (v) > lower) { crit_pred = v;