Le 19/04/2015 17:58, Thomas Koenig a écrit : > Hello world, > > here is the first installation of the matmul inlining patch. > > This patch calculates c=MATMUL(a,b) using DO loops where there is no > dependency between a and c/b and c loops, taking care of realloc on > assignment and bounds checking (using the same error messages that the > library does), and does not cause any regressions in the test suite. > > There are several directions this should be extended at a later date: > > - Remove unneeded bounds checking for the individual array accesses > - Add handling of TRANSPOSE of the arguments > - Add handling of temporaries for arguments, where needed > > However, I think the patch is useful as it is now, and can go > into trunk. > > So: OK for trunk? > Hello,
This is impressive. I have a few comments, but in general it's mostly good. I couldn't tell whether subreferences array(:,:)%subref%comp are correctly handled, either positively or negatively. Tests for it are more than welcome in any case. ;-) An interesting case is non-default lbound. The lbound intrinsic is supposed to return 1 in the case of array subobjects, which may have interesting effects. So, test with non-default lbound as well. I think strides are properly handled, but would feel more comfortable with tests about them. To sum up, tests missing for the following: array(4,:,:) array(3:5,:) array(3:10:2,:) array(:,:)%comp with both lbound == 1 and lbound != 1. One test with lhs-rhs dependency would be good as well. More specific comments below. Mikael > Index: fortran/array.c > =================================================================== > --- fortran/array.c (Revision 222218) > +++ fortran/array.c (Arbeitskopie) > @@ -338,6 +338,9 @@ gfc_resolve_array_spec (gfc_array_spec *as, int ch > if (as == NULL) > return true; > > + if (as->resolved) > + return true; > + Why this? [...] > Index: fortran/frontend-passes.c > =================================================================== > --- fortran/frontend-passes.c (Revision 222218) > +++ fortran/frontend-passes.c (Arbeitskopie) > @@ -43,7 +44,11 @@ static void doloop_warn (gfc_namespace *); > static void optimize_reduction (gfc_namespace *); > static int callback_reduction (gfc_expr **, int *, void *); > static void realloc_strings (gfc_namespace *); > -static gfc_expr *create_var (gfc_expr *); > +static gfc_expr *create_var (gfc_expr *, const char *vname=NULL); > +static int optimize_matmul_assign (gfc_code **, int *, void *); The function doesn't really "optimize", so name it inline_matmul_assign instead. Same for the comments about "optimizing MATMUL". [...] 
> @@ -524,29 +542,11 @@ constant_string_length (gfc_expr *e) > > } > > -/* Returns a new expression (a variable) to be used in place of the old one, > - with an assignment statement before the current statement to set > - the value of the variable. Creates a new BLOCK for the statement if > - that hasn't already been done and puts the statement, plus the > - newly created variables, in that block. Special cases: If the > - expression is constant or a temporary which has already > - been created, just copy it. */ > - > -static gfc_expr* > -create_var (gfc_expr * e) Keep a comment here. > +static gfc_namespace* > +insert_block () > { > - char name[GFC_MAX_SYMBOL_LEN +1]; > - static int num = 1; > - gfc_symtree *symtree; > - gfc_symbol *symbol; > - gfc_expr *result; > - gfc_code *n; > gfc_namespace *ns; > - int i; > > - if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e)) > - return gfc_copy_expr (e); > - > /* If the block hasn't already been created, do so. */ > if (inserted_block == NULL) > { > @@ -1939,7 +1977,1049 @@ doloop_warn (gfc_namespace *ns) > gfc_code_walker (&ns->code, doloop_code, do_function, NULL); > } > > +/* This selction deals with inlining calls to MATMUL. */ section > > +/* Auxiliary function to build and simplify an array inquiry function. > + dim is zero-based. */ > + > +static gfc_expr * > +get_array_inq_function (gfc_expr *e, int dim, gfc_isym_id id) It's better if the id is the first argument, so that the function id and its arguments come in their natural order. [...] > +/* Builds a logical expression. */ > + > +static gfc_expr* > +build_logical_expr (gfc_expr *e1, gfc_expr *e2, gfc_intrinsic_op op) Same here, op first. [...] > + > +/* Return an operation of one two gfc_expr (one if e2 is NULL). This assumes > + compatible typespecs. */ > + > +static gfc_expr * > +get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2) Here it's good already. :-) [...] > +/* Insert code to issue a runtime error if the expressions are not equal. 
*/ > + > +static gfc_code * > +runtime_error_ne (gfc_expr *e1, gfc_expr *e2, const char *msg) > +{ > + gfc_expr *cond; > + gfc_code *if_1, *if_2; > + gfc_code *c; > + // const char *name; Any reason... > + gfc_actual_arglist *a1, *a2, *a3; > + > + gcc_assert (e1->where.lb); > + /* Build the call to runtime_error. */ > + c = XCNEW (gfc_code); > + c->op = EXEC_CALL; > + c->loc = e1->where; > + // name = gfc_get_string (PREFIX ("runtime_error")); > + // c->resolved_sym = gfc_get_intrinsic_sub_symbol (name); ... to keep these? [...] > + > +/* Handle matrix reallocation. Caller is responsible to insert into > + the code tree. > + > + For the two-dimensional case, build > + > + if (allocated(c)) then > + if (size(c,1) /= size(a,1) .or. size(c,2) /= size(b,2)) then > + deallocate(c) > + allocate (c(size(a,1), size(b,2))) > + end if > + else > + allocate (c(size(a,1),size(b,2))) > + end if > + > + and for the other cases correspondingly. > +*/ > + > +static gfc_code * > +matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b, > + enum matrix_case m_case) > +{ > + > + gfc_expr *allocated, *alloc_expr; > + gfc_code *if_alloc_1, *if_alloc_2, *if_size_1, *if_size_2; > + gfc_code *else_alloc; > + gfc_code *deallocate, *allocate1, *allocate_else; > + gfc_ref *ref; > + gfc_array_ref *ar; > + gfc_expr *cond, *ne1, *ne2; > + > + alloc_expr = gfc_copy_expr (c); > + > + ref = alloc_expr->ref; > + > + while (ref) > + { > + if (ref->type == REF_ARRAY && ref->u.ar.type != AR_ELEMENT) > + break; > + > + ref = ref->next; > + > + } > + ar = &ref->u.ar; You can probably use gfc_find_array_ref here. [...] > + > +/* Function to return a scalarized expression. It is assumed that indices are > + zero based to make generation of DO loops easier. A zero as index will > + access the first element along a dimension. Single element references will > + be skipped. A NULL as an expression will be replaced by a full reference. 
> + This assumes that the index loops have gfc_index_integer_kind, and that all > + references have been frozen. */ > + > +static gfc_expr* > +scalarized_expr (gfc_expr *e_in, gfc_expr **index, int count_index) I suggest using a variable argument list for index (no strong opinion, your decision). > +{ > + gfc_ref *ref; > + gfc_array_ref *ar; > + int i; > + int rank; > + gfc_expr *e; > + int i_index; > + bool was_fullref; > + > + e = gfc_copy_expr(e_in); > + > + rank = e->rank; > + > + ref = e->ref; > + > + while (ref) > + { > + if (ref->type == REF_ARRAY > + && (ref->u.ar.type == AR_FULL || ref->u.ar.type == AR_SECTION)) > + break; > + ref = ref->next; > + } > + ar = &ref->u.ar; gfc_find_array_ref again. > + > + /* We scalarize count_index variables, reducing the rank by count_index. > */ > + > + e->rank = rank - count_index; > + > + was_fullref = ar->type == AR_FULL; > + > + if (e->rank == 0) > + ar->type = AR_ELEMENT; > + else > + ar->type = AR_SECTION; > + > + /* Loop over the indices. For each index, create the expression > + index * stride + lbound(e, dim). 
*/ > + > + i_index = 0; > + for (i=0; i < ar->dimen; i++) > + { > + if (was_fullref || ar->dimen_type[i] == DIMEN_RANGE) > + { > + if (index[i_index] != NULL) > + { > + gfc_expr *lbound, *nindex; > + gfc_expr *loopvar; > + > + loopvar = gfc_copy_expr (index[i_index]); > + > + if (ar->stride[i]) > + { > + gfc_expr *tmp; > + > + tmp = gfc_copy_expr(ar->stride[i]); > + if (tmp->ts.kind != gfc_index_integer_kind) > + { > + gfc_typespec ts; > + gfc_clear_ts (&ts); > + ts.type = BT_INTEGER; > + ts.kind = gfc_index_integer_kind; > + gfc_convert_type (tmp, &ts, 2); > + } > + nindex = get_operand (INTRINSIC_TIMES, loopvar, tmp); > + } > + else > + nindex = loopvar; > + > + if (ar->start[i]) > + { > + lbound = gfc_copy_expr (ar->start[i]); > + if (lbound->ts.kind != gfc_index_integer_kind) > + { > + gfc_typespec ts; > + gfc_clear_ts (&ts); > + ts.type = BT_INTEGER; > + ts.kind = gfc_index_integer_kind; > + gfc_convert_type (lbound, &ts, 2); > + > + } > + } > + else > + lbound = get_array_inq_function (e_in, i+1, GFC_ISYM_LBOUND); I think you are assuming that e_in is a full array ref without subreference. What if e_in is foo(3, :, :) or bar(:,:)%comp (think about non-default lbound)? > + > + ar->dimen_type[i] = DIMEN_ELEMENT; > + ar->start[i] = get_operand (INTRINSIC_PLUS, nindex, lbound); Use gfc_replace_expr here (or gfc_free_expr) ... > + ar->end[i] = NULL; > + ar->stride[i] = NULL; ... and gfc_free_expr here. > + gfc_simplify_expr (ar->start[i], 0); > + } > + else if (was_fullref) > + { > + ar->dimen_type[i] = DIMEN_RANGE; > + ar->start[i] = NULL; > + ar->end[i] = NULL; > + ar->stride[i] = NULL; > + } Is this reachable ? > + i_index ++; > + } > + } > + return e; > +} > + > + > +/* Optimize assignments of the form c = matmul(a,b). > + Handle only the cases currently where b and c are rank-two arrays. 
> + > + This basically translates the code to > + > + BLOCK > + integer i,j,k > + c = 0 > + do j=0, size(b,2)-1 > + do k=0, size(a, 2)-1 > + do i=0, size(a, 1)-1 > + c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) = > + c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) + > + a(i * stride(a,1) + lbound(a,1), k * stride(a,2) + lbound(a,2)) * > + b(k * stride(b,1) + lbound(b,1), j * stride(b,2) + lbound(b,2)) > + end do > + end do > + end do > + END BLOCK > + > +*/ > + > +static int > +optimize_matmul_assign (gfc_code **c, int *walk_subtrees, > + void *data ATTRIBUTE_UNUSED) > +{ [...] > + > + current_code = &ns->code; > + > + /* Freeze the references, keeping track of how many temporary variables > were > + created. */ > + n_vars = 0; > + freeze_references (matrix_a); > + freeze_references (matrix_b); > + freeze_references (expr1); > + > + if (n_vars == 0) > + next_code_point = current_code; > + else > + { > + next_code_point = &ns->code; > + for (i=0; i<n_vars; i++) > + next_code_point = &(*next_code_point)->next; > + } I'm not fond of this n_vars stuff. Is next_code_point different from current_code->next? Can freeze_references take next_code_point as argument so that it can update it directly instead, maybe? [...] > Index: fortran/options.c > =================================================================== > --- fortran/options.c (Revision 222218) > +++ fortran/options.c (Arbeitskopie) > @@ -378,6 +378,11 @@ gfc_post_options (const char **pfilename) > if (!flag_automatic) > flag_max_stack_var_size = 0; > > + /* If we call BLAS directly, only inline up to the BLAS limit. */ This deserves a note in the documentation. The new flag in general deserves documentation. > + > + if (flag_external_blas && flag_inline_matmul_limit < 0) > + flag_inline_matmul_limit = flag_blas_matmul_limit; Hum, shouldn't we do something for flag_inline_matmul_limit > 0 as well? 
> + > /* Optimization implies front end optimization, unless the user > specified it directly. */ > > Index: fortran/simplify.c > =================================================================== > --- fortran/simplify.c (Revision 222218) > +++ fortran/simplify.c (Arbeitskopie) > @@ -3445,6 +3445,32 @@ simplify_bound (gfc_expr *array, gfc_expr *dim, gf > > done: > > + > + if (!upper && as && as->type == AS_ASSUMED_SHAPE && dim > + && dim->expr_type == EXPR_CONSTANT && ref->u.ar.type != AR_SECTION) > + { > + if (!(array->symtree && array->symtree->n.sym > + && (array->symtree->n.sym->attr.allocatable > + || array->symtree->n.sym->attr.pointer))) > + { > + unsigned long int ndim; > + gfc_expr *lower, *res; > + > + ndim = mpz_get_si (dim->value.integer) - 1; > + lower = as->lower[ndim]; > + if (lower->expr_type == EXPR_CONSTANT) > + { > + res = gfc_copy_expr (lower); > + if (kind) > + { > + int nkind = mpz_get_si (kind->value.integer); > + res->ts.kind = nkind; > + } > + return res; > + } > + } > + } > + > if (as && (as->type == AS_DEFERRED || as->type == AS_ASSUMED_SHAPE > || as->type == AS_ASSUMED_RANK)) > return NULL; Can you submit that part separately with a testcase? > ! { dg-do run } > ! { dg-options "-ffrontend-optimize -fdump-tree-original" } I believe dg-additional-options is the preferred directive, if you want to let the testsuite test various optimizations. > ! PR 37131 - check basic functionality of inlined matmul, making > ! sure that the library is not called, with and without reallocation. 
> > program main > real, dimension(3,2) :: a > real, dimension(2,4) :: b > real, dimension(3,4) :: c > real, dimension(3,4) :: cres > real, dimension(:,:), allocatable :: calloc > integer :: a1 = size(a,1), a2 = size(a,2) > integer :: b1 = size(b,1), b2 = size(b,2) > integer :: c1 = size(c,1), c2 = size(c,2) > > data a / 2., -3., 5., -7., 11., -13./ > data b /17., -23., 29., -31., 37., -39., 41., -47./ > data cres /195., -304., 384., 275., -428., 548., 347., -540., 692., > 411., -640., 816./ > c = matmul(a,b) > if (sum(c-cres)>1e-4) call abort > > calloc = matmul(a,b) > if (sum(calloc-cres)>1e-4) call abort You should use sum(abs(calloc-cres)), and as you are using quite small integers only, I suppose you can compare against 0 directly. Same everywhere else.