Le 19/04/2015 17:58, Thomas Koenig a écrit :
> Hello world,
>
> here is the first installation of the matmul inlining patch.
>
> This patch calculates c=MATMUL(a,b) using DO loops where there is no
> dependency between a and c/b and c loops, taking care of realloc on
> assignment and bounds checking (using the same error messages that the
> library does), and does not cause any regressions in the test suite.
>
> There are several directions this should be extended at a later date:
>
> - Remove unneeded bounds checking for the individual array accesses
> - Add handling of TRANSPOSE of the arguments
> - Add handling of temporaries for arguments, where needed
>
> However, I think the patch is useful as it is now, and can go
> into trunk.
>
> So: OK for trunk?
>
Hello,
This is impressive.
I have a few comments, but in general it's mostly good.
I couldn't tell whether subreferences array(:,:)%subref%comp are
correctly handled, either positively or negatively. Tests for it are
more than welcome in any case. ;-)
An interesting case is non-default lbound. The lbound intrinsic is
supposed to return 1 in the case of array subobjects, which may have
interesting effects.
So, test with non-default lbound as well.
I think strides are properly handled, but would feel more comfortable
with tests about them.
To sum up, tests are missing for the following:
array(4,:,:)
array(3:5,:)
array(3:10:2,:)
array(:,:)%comp
with both lbound == 1 and lbound != 1.
One test with lhs-rhs dependency would be good as well.
More specific comments below.
Mikael
> Index: fortran/array.c
> ===================================================================
> --- fortran/array.c (Revision 222218)
> +++ fortran/array.c (Arbeitskopie)
> @@ -338,6 +338,9 @@ gfc_resolve_array_spec (gfc_array_spec *as, int ch
> if (as == NULL)
> return true;
>
> + if (as->resolved)
> + return true;
> +
Why this?
[...]
> Index: fortran/frontend-passes.c
> ===================================================================
> --- fortran/frontend-passes.c (Revision 222218)
> +++ fortran/frontend-passes.c (Arbeitskopie)
> @@ -43,7 +44,11 @@ static void doloop_warn (gfc_namespace *);
> static void optimize_reduction (gfc_namespace *);
> static int callback_reduction (gfc_expr **, int *, void *);
> static void realloc_strings (gfc_namespace *);
> -static gfc_expr *create_var (gfc_expr *);
> +static gfc_expr *create_var (gfc_expr *, const char *vname=NULL);
> +static int optimize_matmul_assign (gfc_code **, int *, void *);
The function doesn't really "optimize", so name it inline_matmul_assign
instead.
Same for the comments about "optimizing MATMUL".
[...]
> @@ -524,29 +542,11 @@ constant_string_length (gfc_expr *e)
>
> }
>
> -/* Returns a new expression (a variable) to be used in place of the old one,
> - with an assignment statement before the current statement to set
> - the value of the variable. Creates a new BLOCK for the statement if
> - that hasn't already been done and puts the statement, plus the
> - newly created variables, in that block. Special cases: If the
> - expression is constant or a temporary which has already
> - been created, just copy it. */
> -
> -static gfc_expr*
> -create_var (gfc_expr * e)
Keep a comment here.
> +static gfc_namespace*
> +insert_block ()
> {
> - char name[GFC_MAX_SYMBOL_LEN +1];
> - static int num = 1;
> - gfc_symtree *symtree;
> - gfc_symbol *symbol;
> - gfc_expr *result;
> - gfc_code *n;
> gfc_namespace *ns;
> - int i;
>
> - if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e))
> - return gfc_copy_expr (e);
> -
> /* If the block hasn't already been created, do so. */
> if (inserted_block == NULL)
> {
> @@ -1939,7 +1977,1049 @@ doloop_warn (gfc_namespace *ns)
> gfc_code_walker (&ns->code, doloop_code, do_function, NULL);
> }
>
> +/* This selction deals with inlining calls to MATMUL. */
section
>
> +/* Auxiliary function to build and simplify an array inquiry function.
> + dim is zero-based. */
> +
> +static gfc_expr *
> +get_array_inq_function (gfc_expr *e, int dim, gfc_isym_id id)
It's better if the id is the first argument, so that the function id and
its arguments come in their natural order.
[...]
> +/* Builds a logical expression. */
> +
> +static gfc_expr*
> +build_logical_expr (gfc_expr *e1, gfc_expr *e2, gfc_intrinsic_op op)
Same here, op first.
[...]
> +
> +/* Return an operation of one two gfc_expr (one if e2 is NULL). This assumes
> + compatible typespecs. */
> +
> +static gfc_expr *
> +get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
Here it's good already. :-)
[...]
> +/* Insert code to issue a runtime error if the expressions are not equal. */
> +
> +static gfc_code *
> +runtime_error_ne (gfc_expr *e1, gfc_expr *e2, const char *msg)
> +{
> + gfc_expr *cond;
> + gfc_code *if_1, *if_2;
> + gfc_code *c;
> + // const char *name;
Any reason...
> + gfc_actual_arglist *a1, *a2, *a3;
> +
> + gcc_assert (e1->where.lb);
> + /* Build the call to runtime_error. */
> + c = XCNEW (gfc_code);
> + c->op = EXEC_CALL;
> + c->loc = e1->where;
> + // name = gfc_get_string (PREFIX ("runtime_error"));
> + // c->resolved_sym = gfc_get_intrinsic_sub_symbol (name);
... to keep these?
[...]
> +
> +/* Handle matrix reallocation. Caller is responsible to insert into
> + the code tree.
> +
> + For the two-dimensional case, build
> +
> + if (allocated(c)) then
> + if (size(c,1) /= size(a,1) .or. size(c,2) /= size(b,2)) then
> + deallocate(c)
> + allocate (c(size(a,1), size(b,2)))
> + end if
> + else
> + allocate (c(size(a,1),size(b,2)))
> + end if
> +
> + and for the other cases correspondingly.
> +*/
> +
> +static gfc_code *
> +matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b,
> + enum matrix_case m_case)
> +{
> +
> + gfc_expr *allocated, *alloc_expr;
> + gfc_code *if_alloc_1, *if_alloc_2, *if_size_1, *if_size_2;
> + gfc_code *else_alloc;
> + gfc_code *deallocate, *allocate1, *allocate_else;
> + gfc_ref *ref;
> + gfc_array_ref *ar;
> + gfc_expr *cond, *ne1, *ne2;
> +
> + alloc_expr = gfc_copy_expr (c);
> +
> + ref = alloc_expr->ref;
> +
> + while (ref)
> + {
> + if (ref->type == REF_ARRAY && ref->u.ar.type != AR_ELEMENT)
> + break;
> +
> + ref = ref->next;
> +
> + }
> + ar = &ref->u.ar;
You can probably use gfc_find_array_ref here.
[...]
> +
> +/* Function to return a scalarized expression. It is assumed that indices are
> + zero based to make generation of DO loops easier. A zero as index will
> + access the first element along a dimension. Single element references will
> + be skipped. A NULL as an expression will be replaced by a full reference.
> + This assumes that the index loops have gfc_index_integer_kind, and that all
> + references have been frozen. */
> +
> +static gfc_expr*
> +scalarized_expr (gfc_expr *e_in, gfc_expr **index, int count_index)
I suggest using a variable argument list for index (no strong opinion,
your decision).
> +{
> + gfc_ref *ref;
> + gfc_array_ref *ar;
> + int i;
> + int rank;
> + gfc_expr *e;
> + int i_index;
> + bool was_fullref;
> +
> + e = gfc_copy_expr(e_in);
> +
> + rank = e->rank;
> +
> + ref = e->ref;
> +
> + while (ref)
> + {
> + if (ref->type == REF_ARRAY
> + && (ref->u.ar.type == AR_FULL || ref->u.ar.type == AR_SECTION))
> + break;
> + ref = ref->next;
> + }
> + ar = &ref->u.ar;
gfc_find_array_ref again.
> +
> + /* We scalarize count_index variables, reducing the rank by count_index.
> */
> +
> + e->rank = rank - count_index;
> +
> + was_fullref = ar->type == AR_FULL;
> +
> + if (e->rank == 0)
> + ar->type = AR_ELEMENT;
> + else
> + ar->type = AR_SECTION;
> +
> + /* Loop over the indices. For each index, create the expression
> + index * stride + lbound(e, dim). */
> +
> + i_index = 0;
> + for (i=0; i < ar->dimen; i++)
> + {
> + if (was_fullref || ar->dimen_type[i] == DIMEN_RANGE)
> + {
> + if (index[i_index] != NULL)
> + {
> + gfc_expr *lbound, *nindex;
> + gfc_expr *loopvar;
> +
> + loopvar = gfc_copy_expr (index[i_index]);
> +
> + if (ar->stride[i])
> + {
> + gfc_expr *tmp;
> +
> + tmp = gfc_copy_expr(ar->stride[i]);
> + if (tmp->ts.kind != gfc_index_integer_kind)
> + {
> + gfc_typespec ts;
> + gfc_clear_ts (&ts);
> + ts.type = BT_INTEGER;
> + ts.kind = gfc_index_integer_kind;
> + gfc_convert_type (tmp, &ts, 2);
> + }
> + nindex = get_operand (INTRINSIC_TIMES, loopvar, tmp);
> + }
> + else
> + nindex = loopvar;
> +
> + if (ar->start[i])
> + {
> + lbound = gfc_copy_expr (ar->start[i]);
> + if (lbound->ts.kind != gfc_index_integer_kind)
> + {
> + gfc_typespec ts;
> + gfc_clear_ts (&ts);
> + ts.type = BT_INTEGER;
> + ts.kind = gfc_index_integer_kind;
> + gfc_convert_type (lbound, &ts, 2);
> +
> + }
> + }
> + else
> + lbound = get_array_inq_function (e_in, i+1, GFC_ISYM_LBOUND);
I think you are assuming that e_in is a full array ref without
subreference. What if e_in is foo(3, :, :) or bar(:,:)%comp (think
about non-default lbound)?
> +
> + ar->dimen_type[i] = DIMEN_ELEMENT;
> + ar->start[i] = get_operand (INTRINSIC_PLUS, nindex, lbound);
Use gfc_replace_expr here (or gfc_free_expr) ...
> + ar->end[i] = NULL;
> + ar->stride[i] = NULL;
... and gfc_free_expr here.
> + gfc_simplify_expr (ar->start[i], 0);
> + }
> + else if (was_fullref)
> + {
> + ar->dimen_type[i] = DIMEN_RANGE;
> + ar->start[i] = NULL;
> + ar->end[i] = NULL;
> + ar->stride[i] = NULL;
> + }
Is this reachable?
> + i_index ++;
> + }
> + }
> + return e;
> +}
> +
> +
> +/* Optimize assignments of the form c = matmul(a,b).
> + Handle only the cases currently where b and c are rank-two arrays.
> +
> + This basically translates the code to
> +
> + BLOCK
> + integer i,j,k
> + c = 0
> + do j=0, size(b,2)-1
> + do k=0, size(a, 2)-1
> + do i=0, size(a, 1)-1
> + c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) =
> + c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) +
> + a(i * stride(a,1) + lbound(a,1), k * stride(a,2) + lbound(a,2)) *
> + b(k * stride(b,1) + lbound(b,1), j * stride(b,2) + lbound(b,2))
> + end do
> + end do
> + end do
> + END BLOCK
> +
> +*/
> +
> +static int
> +optimize_matmul_assign (gfc_code **c, int *walk_subtrees,
> + void *data ATTRIBUTE_UNUSED)
> +{
[...]
> +
> + current_code = &ns->code;
> +
> + /* Freeze the references, keeping track of how many temporary variables
> were
> + created. */
> + n_vars = 0;
> + freeze_references (matrix_a);
> + freeze_references (matrix_b);
> + freeze_references (expr1);
> +
> + if (n_vars == 0)
> + next_code_point = current_code;
> + else
> + {
> + next_code_point = &ns->code;
> + for (i=0; i<n_vars; i++)
> + next_code_point = &(*next_code_point)->next;
> + }
I'm not fond of this n_vars stuff.
Is next_code_point different from current_code->next?
Can freeze_references take next_code_point as argument so that it can
update it directly instead, maybe?
[...]
> Index: fortran/options.c
> ===================================================================
> --- fortran/options.c (Revision 222218)
> +++ fortran/options.c (Arbeitskopie)
> @@ -378,6 +378,11 @@ gfc_post_options (const char **pfilename)
> if (!flag_automatic)
> flag_max_stack_var_size = 0;
>
> + /* If we call BLAS directly, only inline up to the BLAS limit. */
This deserves a note in the documentation.
The new flag in general deserves documentation.
> +
> + if (flag_external_blas && flag_inline_matmul_limit < 0)
> + flag_inline_matmul_limit = flag_blas_matmul_limit;
Hum, shouldn't we do something for flag_inline_matmul_limit > 0 as well?
> +
> /* Optimization implies front end optimization, unless the user
> specified it directly. */
>
> Index: fortran/simplify.c
> ===================================================================
> --- fortran/simplify.c (Revision 222218)
> +++ fortran/simplify.c (Arbeitskopie)
> @@ -3445,6 +3445,32 @@ simplify_bound (gfc_expr *array, gfc_expr *dim, gf
>
> done:
>
> +
> + if (!upper && as && as->type == AS_ASSUMED_SHAPE && dim
> + && dim->expr_type == EXPR_CONSTANT && ref->u.ar.type != AR_SECTION)
> + {
> + if (!(array->symtree && array->symtree->n.sym
> + && (array->symtree->n.sym->attr.allocatable
> + || array->symtree->n.sym->attr.pointer)))
> + {
> + unsigned long int ndim;
> + gfc_expr *lower, *res;
> +
> + ndim = mpz_get_si (dim->value.integer) - 1;
> + lower = as->lower[ndim];
> + if (lower->expr_type == EXPR_CONSTANT)
> + {
> + res = gfc_copy_expr (lower);
> + if (kind)
> + {
> + int nkind = mpz_get_si (kind->value.integer);
> + res->ts.kind = nkind;
> + }
> + return res;
> + }
> + }
> + }
> +
> if (as && (as->type == AS_DEFERRED || as->type == AS_ASSUMED_SHAPE
> || as->type == AS_ASSUMED_RANK))
> return NULL;
Can you submit that part separately with a testcase?
> ! { dg-do run }
> ! { dg-options "-ffrontend-optimize -fdump-tree-original" }
I believe dg-additional-options is the preferred directive, if you want
to let the testsuite test various optimizations.
> ! PR 37131 - check basic functionality of inlined matmul, making
> ! sure that the library is not called, with and without reallocation.
>
> program main
> real, dimension(3,2) :: a
> real, dimension(2,4) :: b
> real, dimension(3,4) :: c
> real, dimension(3,4) :: cres
> real, dimension(:,:), allocatable :: calloc
> integer :: a1 = size(a,1), a2 = size(a,2)
> integer :: b1 = size(b,1), b2 = size(b,2)
> integer :: c1 = size(c,1), c2 = size(c,2)
>
> data a / 2., -3., 5., -7., 11., -13./
> data b /17., -23., 29., -31., 37., -39., 41., -47./
> data cres /195., -304., 384., 275., -428., 548., 347., -540., 692.,
> 411., -640., 816./
> c = matmul(a,b)
> if (sum(c-cres)>1e-4) call abort
>
> calloc = matmul(a,b)
> if (sum(calloc-cres)>1e-4) call abort
You should use sum(abs(calloc-cres)), and as you are using quite small
integers only, I suppose you can compare against 0 directly.
Same everywhere else.