On Fri, 21 Jun 2019 at 08:57, Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> The following patch adds exclusive scan support for simd.  It is similar to
> the inclusive scan; we just need to swap the input and scan phases and
> use a slightly different pattern at the start of the scan phase, so that it
> computes what we need.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk.
>
> 2019-06-21  Jakub Jelinek  <ja...@redhat.com>
>
>         * omp-low.c (lower_rec_simd_input_clauses): Add rvar2 argument,
>         create another "omp scan inscan exclusive" array if
>         !ctx->scan_inclusive.
>         (lower_rec_input_clauses): Handle exclusive scan inscan reductions.
>         (lower_omp_scan): Likewise.
>         * tree-vectorizer.h (struct _stmt_vec_info): Use 3-bit instead of
>         2-bit bitfield for simd_lane_access_p member.
>         * tree-vect-data-refs.c (vect_analyze_data_refs): Also handle
>         aux == (void *)-4 as simd lane access.
>         * tree-vect-stmts.c (check_scan_store): Handle exclusive scan.  Update
>         comment with permutations to show the canonical permutation order.
>         (vectorizable_scan_store): Handle exclusive scan.
>         (vectorizable_store): Call vectorizable_scan_store even for
>         STMT_VINFO_SIMD_LANE_ACCESS_P > 3.
>
>         * gcc.dg/vect/vect-simd-12.c: New test.
>         * gcc.dg/vect/vect-simd-13.c: New test.
>         * gcc.dg/vect/vect-simd-14.c: New test.
>         * gcc.dg/vect/vect-simd-15.c: New test.
>         * gcc.target/i386/sse2-vect-simd-12.c: New test.
>         * gcc.target/i386/sse2-vect-simd-13.c: New test.
>         * gcc.target/i386/sse2-vect-simd-14.c: New test.
>         * gcc.target/i386/sse2-vect-simd-15.c: New test.
>         * gcc.target/i386/avx2-vect-simd-12.c: New test.
>         * gcc.target/i386/avx2-vect-simd-13.c: New test.
>         * gcc.target/i386/avx2-vect-simd-14.c: New test.
>         * gcc.target/i386/avx2-vect-simd-15.c: New test.
>         * gcc.target/i386/avx512f-vect-simd-12.c: New test.
>         * gcc.target/i386/avx512f-vect-simd-13.c: New test.
>         * gcc.target/i386/avx512f-vect-simd-14.c: New test.
>         * gcc.target/i386/avx512bw-vect-simd-15.c: New test.
>         * g++.dg/vect/simd-6.cc: New test.
>         * g++.dg/vect/simd-7.cc: New test.
>         * g++.dg/vect/simd-8.cc: New test.
>         * g++.dg/vect/simd-9.cc: New test.
>         * c-c++-common/gomp/scan-2.c: Don't expect any diagnostics.
>
> --- gcc/omp-low.c.jj    2019-06-20 13:26:29.085150770 +0200
> +++ gcc/omp-low.c       2019-06-20 15:46:25.964253058 +0200
> @@ -3692,7 +3692,8 @@ struct omplow_simd_context {
>  static bool
>  lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
>                               omplow_simd_context *sctx, tree &ivar,
> -                             tree &lvar, tree *rvar = NULL)
> +                             tree &lvar, tree *rvar = NULL,
> +                             tree *rvar2 = NULL)
>  {
>    if (known_eq (sctx->max_vf, 0U))
>      {
> @@ -3767,6 +3768,25 @@ lower_rec_simd_input_clauses (tree new_v
>           *rvar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar,
>                           sctx->lastlane, NULL_TREE, NULL_TREE);
>           TREE_THIS_NOTRAP (*rvar) = 1;
> +
> +         if (!ctx->scan_inclusive)
> +           {
> +             /* And for exclusive scan yet another one, which will
> +                hold the value during the scan phase.  */
> +             tree savar = create_tmp_var_raw (atype);
> +             if (TREE_ADDRESSABLE (new_var))
> +               TREE_ADDRESSABLE (savar) = 1;
> +             DECL_ATTRIBUTES (savar)
> +               = tree_cons (get_identifier ("omp simd array"), NULL,
> +                            tree_cons (get_identifier ("omp simd inscan "
> +                                                       "exclusive"), NULL,
> +                                       DECL_ATTRIBUTES (savar)));
> +             gimple_add_tmp_var (savar);
> +             ctx->cb.decl_map->put (iavar, savar);
> +             *rvar2 = build4 (ARRAY_REF, TREE_TYPE (new_var), savar,
> +                              sctx->idx, NULL_TREE, NULL_TREE);
> +             TREE_THIS_NOTRAP (*rvar2) = 1;
> +           }
>         }
>        ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar, sctx->idx,
>                      NULL_TREE, NULL_TREE);
> @@ -5185,14 +5205,15 @@ lower_rec_input_clauses (tree clauses, g
>                       new_vard = TREE_OPERAND (new_var, 0);
>                       gcc_assert (DECL_P (new_vard));
>                     }
> -                 tree rvar = NULL_TREE, *rvarp = NULL;
> +                 tree rvar = NULL_TREE, *rvarp = NULL, rvar2 = NULL_TREE;
>                   if (is_simd
>                       && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
>                       && OMP_CLAUSE_REDUCTION_INSCAN (c))
>                     rvarp = &rvar;
>                   if (is_simd
>                       && lower_rec_simd_input_clauses (new_var, ctx, &sctx,
> -                                                      ivar, lvar, rvarp))
> +                                                      ivar, lvar, rvarp,
> +                                                      &rvar2))
>                     {
>                       if (new_vard == new_var)
>                         {
> @@ -5220,6 +5241,14 @@ lower_rec_input_clauses (tree clauses, g
>                                     (c, ivar2, build_outer_var_ref (var, ctx));
>                               gimplify_and_add (x, &llist[0]);
>
> +                             if (rvar2)
> +                               {
> +                                 x = lang_hooks.decls.omp_clause_default_ctor
> +                                       (c, unshare_expr (rvar2),
> +                                        build_outer_var_ref (var, ctx));
> +                                 gimplify_and_add (x, &llist[0]);
> +                               }
> +
>                               /* For types that need construction, add another
>                                  private var which will be default constructed
>                                  and optionally initialized with
> @@ -5229,7 +5258,9 @@ lower_rec_input_clauses (tree clauses, g
>                                  iteration.  */
>                               tree nv = create_tmp_var_raw (TREE_TYPE (ivar));
>                               gimple_add_tmp_var (nv);
> -                             ctx->cb.decl_map->put (TREE_OPERAND (ivar, 0),
> +                             ctx->cb.decl_map->put (TREE_OPERAND (rvar2
> +                                                                  ? rvar2
> +                                                                  : ivar, 0),
>                                                      nv);
>                               x = lang_hooks.decls.omp_clause_default_ctor
>                                     (c, nv, build_outer_var_ref (var, ctx));
> @@ -5296,6 +5327,18 @@ lower_rec_input_clauses (tree clauses, g
>                               gimplify_stmt (&dtor, &tseq);
>                               gimple_seq_add_seq (&llist[1], tseq);
>                             }
> +
> +                         if (rvar2)
> +                           {
> +                             x = lang_hooks.decls.omp_clause_dtor (c, rvar2);
> +                             if (x)
> +                               {
> +                                 tseq = NULL;
> +                                 dtor = x;
> +                                 gimplify_stmt (&dtor, &tseq);
> +                                 gimple_seq_add_seq (&llist[1], tseq);
> +                               }
> +                           }
>                           break;
>                         }
>                       if (x)
> @@ -5390,6 +5433,24 @@ lower_rec_input_clauses (tree clauses, g
>                               gimple_seq_add_seq (ilist, tseq);
>                             }
>                           OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c) = NULL;
> +                         if (!ctx->scan_inclusive)
> +                           {
> +                             tree nv2
> +                               = create_tmp_var_raw (TREE_TYPE (new_var));
> +                             gimple_add_tmp_var (nv2);
> +                             ctx->cb.decl_map->put (nv, nv2);
> +                             x = lang_hooks.decls.omp_clause_default_ctor
> +                                   (c, nv2, build_outer_var_ref (var, ctx));
> +                             gimplify_and_add (x, ilist);
> +                             x = lang_hooks.decls.omp_clause_dtor (c, nv2);
> +                             if (x)
> +                               {
> +                                 tseq = NULL;
> +                                 dtor = x;
> +                                 gimplify_stmt (&dtor, &tseq);
> +                                 gimple_seq_add_seq (dlist, tseq);
> +                               }
> +                           }
>                           x = lang_hooks.decls.omp_clause_dtor (c, nv);
>                           if (x)
>                             {
> @@ -5399,6 +5460,21 @@ lower_rec_input_clauses (tree clauses, g
>                               gimple_seq_add_seq (dlist, tseq);
>                             }
>                         }
> +                     else if (!ctx->scan_inclusive
> +                              && TREE_ADDRESSABLE (TREE_TYPE (new_var)))
> +                       {
> +                         tree nv2 = create_tmp_var_raw (TREE_TYPE (new_var));
> +                         gimple_add_tmp_var (nv2);
> +                         ctx->cb.decl_map->put (new_vard, nv2);
> +                         x = lang_hooks.decls.omp_clause_dtor (c, nv2);
> +                         if (x)
> +                           {
> +                             tseq = NULL;
> +                             dtor = x;
> +                             gimplify_stmt (&dtor, &tseq);
> +                             gimple_seq_add_seq (dlist, tseq);
> +                           }
> +                       }
>                       DECL_HAS_VALUE_EXPR_P (placeholder) = 0;
>                       goto do_dtor;
>                     }
> @@ -5487,14 +5563,15 @@ lower_rec_input_clauses (tree clauses, g
>                       new_vard = TREE_OPERAND (new_var, 0);
>                       gcc_assert (DECL_P (new_vard));
>                     }
> -                 tree rvar = NULL_TREE, *rvarp = NULL;
> +                 tree rvar = NULL_TREE, *rvarp = NULL, rvar2 = NULL_TREE;
>                   if (is_simd
>                       && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
>                       && OMP_CLAUSE_REDUCTION_INSCAN (c))
>                     rvarp = &rvar;
>                   if (is_simd
>                       && lower_rec_simd_input_clauses (new_var, ctx, &sctx,
> -                                                      ivar, lvar, rvarp))
> +                                                      ivar, lvar, rvarp,
> +                                                      &rvar2))
>                     {
>                       if (new_vard != new_var)
>                         {
> @@ -8573,18 +8650,40 @@ lower_omp_scan (gimple_stmt_iterator *gs
>    gimple_seq before = NULL;
>    omp_context *octx = ctx->outer;
>    gcc_assert (octx);
> +  if (!octx->scan_inclusive && !has_clauses)
> +    {
> +      gimple_stmt_iterator gsi2 = *gsi_p;
> +      gsi_next (&gsi2);
> +      gimple *stmt2 = gsi_stmt (gsi2);
> +      /* For exclusive scan, swap GIMPLE_OMP_SCAN without clauses
> +        with following GIMPLE_OMP_SCAN with clauses, so that input_phase,
> +        the one with exclusive clause(s), comes first.  */
> +      if (stmt2
> +         && gimple_code (stmt2) == GIMPLE_OMP_SCAN
> +         && gimple_omp_scan_clauses (as_a <gomp_scan *> (stmt2)) != NULL)
> +       {
> +         gsi_remove (gsi_p, false);
> +         gsi_insert_after (gsi_p, stmt, GSI_SAME_STMT);
> +         ctx = maybe_lookup_ctx (stmt2);
> +         gcc_assert (ctx);
> +         lower_omp_scan (gsi_p, ctx);
> +         return;
> +       }
> +    }
> +
>    bool input_phase = has_clauses ^ octx->scan_inclusive;
>    if (gimple_code (octx->stmt) == GIMPLE_OMP_FOR
>        && (gimple_omp_for_kind (octx->stmt) & GF_OMP_FOR_SIMD)
> -      && !gimple_omp_for_combined_into_p (octx->stmt)
> -      && octx->scan_inclusive)
> +      && !gimple_omp_for_combined_into_p (octx->stmt))
>      {
>        if (tree c = omp_find_clause (gimple_omp_for_clauses (octx->stmt),
>                                     OMP_CLAUSE__SIMDUID_))
>         {
>           tree uid = OMP_CLAUSE__SIMDUID__DECL (c);
>           lane = create_tmp_var (unsigned_type_node);
> -         tree t = build_int_cst (integer_type_node, 1 + !input_phase);
> +         tree t = build_int_cst (integer_type_node,
> +                                 input_phase ? 1
> +                                 : octx->scan_inclusive ? 2 : 3);
>           gimple *g
>             = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, 2, uid, t);
>           gimple_call_set_lhs (g, lane);
> @@ -8601,6 +8700,8 @@ lower_omp_scan (gimple_stmt_iterator *gs
>             tree val = new_var;
>             tree var2 = NULL_TREE;
>             tree var3 = NULL_TREE;
> +           tree var4 = NULL_TREE;
> +           tree lane0 = NULL_TREE;
>             tree new_vard = new_var;
>             if (omp_is_reference (var))
>               {
> @@ -8623,16 +8724,26 @@ lower_omp_scan (gimple_stmt_iterator *gs
>                                           DECL_ATTRIBUTES (v)))
>                       {
>                         val = unshare_expr (val);
> +                       lane0 = TREE_OPERAND (val, 1);
>                         TREE_OPERAND (val, 1) = lane;
>                         var2 = lookup_decl (v, octx);
> +                       if (!octx->scan_inclusive)
> +                         var4 = lookup_decl (var2, octx);
>                         if (input_phase
>                             && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
> -                         var3 = maybe_lookup_decl (var2, octx);
> +                         var3 = maybe_lookup_decl (var4 ? var4 : var2, octx);
>                         if (!input_phase)
>                           {
>                             var2 = build4 (ARRAY_REF, TREE_TYPE (val),
>                                            var2, lane, NULL_TREE, NULL_TREE);
>                             TREE_THIS_NOTRAP (var2) = 1;
> +                           if (!octx->scan_inclusive)
> +                             {
> +                               var4 = build4 (ARRAY_REF, TREE_TYPE (val),
> +                                              var4, lane, NULL_TREE,
> +                                              NULL_TREE);
> +                               TREE_THIS_NOTRAP (var4) = 1;
> +                             }
>                           }
>                         else
>                           var2 = val;
> @@ -8643,12 +8754,28 @@ lower_omp_scan (gimple_stmt_iterator *gs
>             else
>               {
>                 var2 = build_outer_var_ref (var, octx);
> -               if (input_phase && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
> +               if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
>                   {
>                     var3 = maybe_lookup_decl (new_vard, octx);
> -                   if (var3 == new_vard)
> +                   if (var3 == new_vard || var3 == NULL_TREE)
>                       var3 = NULL_TREE;
> +                   else if (!octx->scan_inclusive && !input_phase)
> +                     {
> +                       var4 = maybe_lookup_decl (var3, octx);
> +                       if (var4 == var3 || var4 == NULL_TREE)
> +                         {
> +                           if (TREE_ADDRESSABLE (TREE_TYPE (new_var)))
> +                             {
> +                               var4 = var3;
> +                               var3 = NULL_TREE;
> +                             }
> +                           else
> +                             var4 = NULL_TREE;
> +                         }
> +                     }
>                   }
> +               if (!octx->scan_inclusive && !input_phase && var4 == NULL_TREE)
> +                 var4 = create_tmp_var (TREE_TYPE (val));
>               }
>             if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
>               {
> @@ -8689,9 +8816,17 @@ lower_omp_scan (gimple_stmt_iterator *gs
>                   }
>                 else
>                   {
> +                   tree x;
> +                   if (!octx->scan_inclusive)
> +                     {
> +                       tree v4 = unshare_expr (var4);
> +                       tree v2 = unshare_expr (var2);
> +                       x = lang_hooks.decls.omp_clause_assign_op (c, v4, v2);
> +                       gimplify_and_add (x, &before);
> +                     }
>                     gimple_seq tseq = OMP_CLAUSE_REDUCTION_GIMPLE_MERGE (c);
> -                   tree x = (DECL_HAS_VALUE_EXPR_P (new_vard)
> -                             ? DECL_VALUE_EXPR (new_vard) : NULL_TREE);
> +                   x = (DECL_HAS_VALUE_EXPR_P (new_vard)
> +                        ? DECL_VALUE_EXPR (new_vard) : NULL_TREE);
>                     tree vexpr = val;
>                     if (x && omp_is_reference (var))
>                       vexpr = build_fold_addr_expr_loc (clause_loc, val);
> @@ -8706,8 +8841,18 @@ lower_omp_scan (gimple_stmt_iterator *gs
>                       SET_DECL_VALUE_EXPR (new_vard, x);
>                     SET_DECL_VALUE_EXPR (placeholder, NULL_TREE);
>                     DECL_HAS_VALUE_EXPR_P (placeholder) = 0;
> -                   x = lang_hooks.decls.omp_clause_assign_op (c, val, var2);
> -                   gimplify_and_add (x, &before);
> +                   if (octx->scan_inclusive)
> +                     {
> +                       x = lang_hooks.decls.omp_clause_assign_op (c, val,
> +                                                                  var2);
> +                       gimplify_and_add (x, &before);
> +                     }
> +                   else if (lane0 == NULL_TREE)
> +                     {
> +                       x = lang_hooks.decls.omp_clause_assign_op (c, val,
> +                                                                  var4);
> +                       gimplify_and_add (x, &before);
> +                     }
>                   }
>               }
>             else
> @@ -8728,10 +8873,29 @@ lower_omp_scan (gimple_stmt_iterator *gs
>
>                     tree x = build2 (code, TREE_TYPE (var2),
>                                      unshare_expr (var2), unshare_expr (val));
> -                   gimplify_assign (unshare_expr (var2), x, &before);
> -                   gimplify_assign (val, var2, &before);
> +                   if (octx->scan_inclusive)
> +                     {
> +                       gimplify_assign (unshare_expr (var2), x, &before);
> +                       gimplify_assign (val, var2, &before);
> +                     }
> +                   else
> +                     {
> +                       gimplify_assign (unshare_expr (var4),
> +                                        unshare_expr (var2), &before);
> +                       gimplify_assign (var2, x, &before);
> +                       if (lane0 == NULL_TREE)
> +                         gimplify_assign (val, var4, &before);
> +                     }
>                   }
>               }
> +           if (!octx->scan_inclusive && !input_phase && lane0)
> +             {
> +               tree vexpr = unshare_expr (var4);
> +               TREE_OPERAND (vexpr, 1) = lane0;
> +               if (omp_is_reference (var))
> +                 vexpr = build_fold_addr_expr_loc (clause_loc, vexpr);
> +               SET_DECL_VALUE_EXPR (new_vard, vexpr);
> +             }
>           }
>      }
>    else if (has_clauses)
> --- gcc/tree-vectorizer.h.jj    2019-06-20 13:26:29.078150879 +0200
> +++ gcc/tree-vectorizer.h       2019-06-20 14:18:04.241075200 +0200
> @@ -917,7 +917,7 @@ struct _stmt_vec_info {
>    bool strided_p;
>
>    /* For both loads and stores.  */
> -  unsigned simd_lane_access_p : 2;
> +  unsigned simd_lane_access_p : 3;
>
>    /* Classifies how the load or store is going to be implemented
>       for loop vectorization.  */
> --- gcc/tree-vect-data-refs.c.jj        2019-06-20 13:55:35.421150589 +0200
> +++ gcc/tree-vect-data-refs.c   2019-06-20 14:18:04.240075216 +0200
> @@ -4223,7 +4223,8 @@ vect_analyze_data_refs (vec_info *vinfo,
>        /* See if this was detected as SIMD lane access.  */
>        if (dr->aux == (void *)-1
>           || dr->aux == (void *)-2
> -         || dr->aux == (void *)-3)
> +         || dr->aux == (void *)-3
> +         || dr->aux == (void *)-4)
>         {
>           if (nested_in_vect_loop_p (loop, stmt_info))
>             return opt_result::failure_at (stmt_info->stmt,
> --- gcc/tree-vect-stmts.c.jj    2019-06-20 13:26:29.084150785 +0200
> +++ gcc/tree-vect-stmts.c       2019-06-20 14:18:04.239075231 +0200
> @@ -6512,7 +6512,37 @@ check_scan_store (stmt_vec_info stmt_inf
>       kinds are there in order to allow optimizing the initializer store
>       and combiner sequence, e.g. if it is originally some C++ish user
>       defined reduction, but allow the vectorizer to pattern recognize it
> -     and turn into the appropriate vectorized scan.  */
> +     and turn into the appropriate vectorized scan.
> +
> +     For exclusive scan, this is slightly different:
> +     #pragma omp simd reduction(inscan,+:r)
> +     for (...)
> +       {
> +        use (r);
> +        #pragma omp scan exclusive (r)
> +        r += something ();
> +       }
> +     shall have body with:
> +       // Initialization for input phase, store the reduction initializer:
> +       _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
> +       _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
> +       D.2042[_21] = 0;
> +       // Actual input phase:
> +       ...
> +       r.0_5 = D.2042[_20];
> +       _6 = _4 + r.0_5;
> +       D.2042[_20] = _6;
> +       // Initialization for scan phase:
> +       _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 3);
> +       _26 = D.2043[_25];
> +       D.2044[_25] = _26;
> +       _27 = D.2042[_25];
> +       _28 = _26 + _27;
> +       D.2043[_25] = _28;
> +       // Actual scan phase:
> +       ...
> +       r.1_8 = D.2044[_20];
> +       ...  */
>
>    if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 2)
>      {
> @@ -6553,26 +6583,52 @@ check_scan_store (stmt_vec_info stmt_inf
>    if (TREE_CODE (rhs) != SSA_NAME)
>      goto fail;
>
> -  use_operand_p use_p;
> -  imm_use_iterator iter;
>    gimple *other_store_stmt = NULL;
> -  FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
> +  tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
> +  bool inscan_var_store
> +    = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
> +
> +  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
>      {
> -      gimple *use_stmt = USE_STMT (use_p);
> -      if (use_stmt == stmt || is_gimple_debug (use_stmt))
> -       continue;
> -      if (gimple_bb (use_stmt) != gimple_bb (stmt)
> -         || !gimple_store_p (use_stmt)
> -         || other_store_stmt)
> -       goto fail;
> -      other_store_stmt = use_stmt;
> +      if (!inscan_var_store)
> +       {
> +         use_operand_p use_p;
> +         imm_use_iterator iter;
> +         FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
> +           {
> +             gimple *use_stmt = USE_STMT (use_p);
> +             if (use_stmt == stmt || is_gimple_debug (use_stmt))
> +               continue;
> +             if (gimple_bb (use_stmt) != gimple_bb (stmt)
> +                 || !is_gimple_assign (use_stmt)
> +                 || gimple_assign_rhs_class (use_stmt) != GIMPLE_BINARY_RHS
> +                 || other_store_stmt
> +                 || TREE_CODE (gimple_assign_lhs (use_stmt)) != SSA_NAME)
> +               goto fail;
> +             other_store_stmt = use_stmt;
> +           }
> +         if (other_store_stmt == NULL)
> +           goto fail;
> +         rhs = gimple_assign_lhs (other_store_stmt);
> +         if (!single_imm_use (rhs, &use_p, &other_store_stmt))
> +           goto fail;
> +       }
>      }
> -  if (other_store_stmt == NULL)
> -    goto fail;
> -  stmt_vec_info other_store_stmt_info
> -    = loop_vinfo->lookup_stmt (other_store_stmt);
> -  if (other_store_stmt_info == NULL
> -      || STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info) != 3)
> +  else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3)
> +    {
> +      use_operand_p use_p;
> +      imm_use_iterator iter;
> +      FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
> +       {
> +         gimple *use_stmt = USE_STMT (use_p);
> +         if (use_stmt == stmt || is_gimple_debug (use_stmt))
> +           continue;
> +         if (other_store_stmt)
> +           goto fail;
> +         other_store_stmt = use_stmt;
> +       }
> +    }
> +  else
>      goto fail;
>
>    gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
> @@ -6599,8 +6655,7 @@ check_scan_store (stmt_vec_info stmt_inf
>
>    tree rhs1 = gimple_assign_rhs1 (def_stmt);
>    tree rhs2 = gimple_assign_rhs2 (def_stmt);
> -  if (TREE_CODE (rhs1) != SSA_NAME
> -      || TREE_CODE (rhs2) != SSA_NAME)
> +  if (TREE_CODE (rhs1) != SSA_NAME || TREE_CODE (rhs2) != SSA_NAME)
>      goto fail;
>
>    gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
> @@ -6615,22 +6670,83 @@ check_scan_store (stmt_vec_info stmt_inf
>    stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
>    if (load1_stmt_info == NULL
>        || load2_stmt_info == NULL
> -      || STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info) != 3
> -      || STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info) != 3)
> +      || (STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info)
> +         != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))
> +      || (STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info)
> +         != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
>      goto fail;
>
> -  if (scan_operand_equal_p (gimple_assign_lhs (stmt),
> +  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && inscan_var_store)
> +    {
> +      dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
> +      if (TREE_CODE (DR_BASE_ADDRESS (load1_dr_info->dr)) != ADDR_EXPR
> +         || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0)))
> +       goto fail;
> +      tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
> +      tree lrhs;
> +      if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
> +       lrhs = rhs1;
> +      else
> +       lrhs = rhs2;
> +      use_operand_p use_p;
> +      imm_use_iterator iter;
> +      FOR_EACH_IMM_USE_FAST (use_p, iter, lrhs)
> +       {
> +         gimple *use_stmt = USE_STMT (use_p);
> +         if (use_stmt == def_stmt || is_gimple_debug (use_stmt))
> +           continue;
> +         if (other_store_stmt)
> +           goto fail;
> +         other_store_stmt = use_stmt;
> +       }
> +    }
> +
> +  if (other_store_stmt == NULL)
> +    goto fail;
> +  if (gimple_bb (other_store_stmt) != gimple_bb (stmt)
> +      || !gimple_store_p (other_store_stmt))
> +    goto fail;
> +
> +  stmt_vec_info other_store_stmt_info
> +    = loop_vinfo->lookup_stmt (other_store_stmt);
> +  if (other_store_stmt_info == NULL
> +      || (STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info)
> +         != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
> +    goto fail;
> +
> +  gimple *stmt1 = stmt;
> +  gimple *stmt2 = other_store_stmt;
> +  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
> +    std::swap (stmt1, stmt2);
> +  if (scan_operand_equal_p (gimple_assign_lhs (stmt1),
>                             gimple_assign_rhs1 (load2_stmt)))
>      {
>        std::swap (rhs1, rhs2);
>        std::swap (load1_stmt, load2_stmt);
>        std::swap (load1_stmt_info, load2_stmt_info);
>      }
> -  if (!scan_operand_equal_p (gimple_assign_lhs (stmt),
> -                            gimple_assign_rhs1 (load1_stmt))
> -      || !scan_operand_equal_p (gimple_assign_lhs (other_store_stmt),
> +  if (!scan_operand_equal_p (gimple_assign_lhs (stmt1),
> +                            gimple_assign_rhs1 (load1_stmt)))
> +    goto fail;
> +
> +  tree var3 = NULL_TREE;
> +  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3
> +      && !scan_operand_equal_p (gimple_assign_lhs (stmt2),
>                                 gimple_assign_rhs1 (load2_stmt)))
>      goto fail;
> +  else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
> +    {
> +      dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
> +      if (TREE_CODE (DR_BASE_ADDRESS (load2_dr_info->dr)) != ADDR_EXPR
> +         || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0)))
> +       goto fail;
> +      var3 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
> +      if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var3))
> +         || lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var3))
> +         || lookup_attribute ("omp simd inscan exclusive",
> +                              DECL_ATTRIBUTES (var3)))
> +       goto fail;
> +    }
>
>    dr_vec_info *other_dr_info = STMT_VINFO_DR_INFO (other_store_stmt_info);
>    if (TREE_CODE (DR_BASE_ADDRESS (other_dr_info->dr)) != ADDR_EXPR
> @@ -6648,6 +6764,14 @@ check_scan_store (stmt_vec_info stmt_inf
>    if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
>      std::swap (var1, var2);
>
> +  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
> +    {
> +      if (!lookup_attribute ("omp simd inscan exclusive",
> +                            DECL_ATTRIBUTES (var1)))
> +       goto fail;
> +      var1 = var3;
> +    }
> +
>    if (loop_vinfo->scan_map == NULL)
>      goto fail;
>    tree *init = loop_vinfo->scan_map->get (var1);
> @@ -6655,6 +6779,7 @@ check_scan_store (stmt_vec_info stmt_inf
>      goto fail;
>
>    /* The IL is as expected, now check if we can actually vectorize it.
> +     Inclusive scan:
>         _26 = D.2043[_25];
>         _27 = D.2042[_25];
>         _28 = _26 + _27;
> @@ -6664,21 +6789,49 @@ check_scan_store (stmt_vec_info stmt_inf
>       from the D.2042[_21] = 0; store):
>         _30 = MEM <vector(8) int> [(int *)&D.2043];
>         _31 = MEM <vector(8) int> [(int *)&D.2042];
> -       _32 = VEC_PERM_EXPR <_31, _40, { 8, 0, 1, 2, 3, 4, 5, 6 }>;
> +       _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
>         _33 = _31 + _32;
>         // _33 = { _31[0], _31[0]+_31[1], _31[1]+_31[2], ..., _31[6]+_31[7] };
> -       _34 = VEC_PERM_EXPR <_33, _40, { 8, 9, 0, 1, 2, 3, 4, 5 }>;
> +       _34 = VEC_PERM_EXPR <_40, _33, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
>         _35 = _33 + _34;
>         // _35 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
>         //         _31[1]+.._31[4], ... _31[4]+.._31[7] };
> -       _36 = VEC_PERM_EXPR <_35, _40, { 8, 9, 10, 11, 0, 1, 2, 3 }>;
> +       _36 = VEC_PERM_EXPR <_40, _35, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
>         _37 = _35 + _36;
>         // _37 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
>         //         _31[0]+.._31[4], ... _31[0]+.._31[7] };
>         _38 = _30 + _37;
>         _39 = VEC_PERM_EXPR <_38, _38, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
>         MEM <vector(8) int> [(int *)&D.2043] = _39;
> -       MEM <vector(8) int> [(int *)&D.2042] = _38;  */
> +       MEM <vector(8) int> [(int *)&D.2042] = _38;
> +     Exclusive scan:
> +       _26 = D.2043[_25];
> +       D.2044[_25] = _26;
> +       _27 = D.2042[_25];
> +       _28 = _26 + _27;
> +       D.2043[_25] = _28;
> +     should be vectorized as (where _40 is the vectorized rhs
> +     from the D.2042[_21] = 0; store):
> +       _30 = MEM <vector(8) int> [(int *)&D.2043];
> +       _31 = MEM <vector(8) int> [(int *)&D.2042];
> +       _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
> +       _33 = VEC_PERM_EXPR <_40, _32, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
> +       _34 = _32 + _33;
> +       // _34 = { 0, _31[0], _31[0]+_31[1], _31[1]+_31[2], _31[2]+_31[3],
> +       //         _31[3]+_31[4], ... _31[5]+_31[6] };
> +       _35 = VEC_PERM_EXPR <_40, _34, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
> +       _36 = _34 + _35;
> +       // _36 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
> +       //         _31[1]+.._31[4], ... _31[3]+.._31[6] };
> +       _37 = VEC_PERM_EXPR <_40, _36, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
> +       _38 = _36 + _37;
> +       // _38 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
> +       //         _31[0]+.._31[4], ... _31[0]+.._31[6] };
> +       _39 = _30 + _38;
> +       _50 = _31 + _39;
> +       _51 = VEC_PERM_EXPR <_50, _50, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
> +       MEM <vector(8) int> [(int *)&D.2044] = _39;
> +       MEM <vector(8) int> [(int *)&D.2043] = _51;  */
>    enum machine_mode vec_mode = TYPE_MODE (vectype);
>    optab optab = optab_for_tree_code (code, vectype, optab_default);
>    if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
> @@ -6715,6 +6868,24 @@ vectorizable_scan_store (stmt_vec_info s
>    tree rhs = gimple_assign_rhs1 (stmt);
>    gcc_assert (TREE_CODE (rhs) == SSA_NAME);
>
> +  tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
> +  bool inscan_var_store
> +    = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
> +
> +  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
> +    {
> +      use_operand_p use_p;
> +      imm_use_iterator iter;
> +      FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
> +       {
> +         gimple *use_stmt = USE_STMT (use_p);
> +         if (use_stmt == stmt || is_gimple_debug (use_stmt))
> +           continue;
> +         rhs = gimple_assign_lhs (use_stmt);
> +         break;
> +       }
> +    }
> +
>    gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
>    enum tree_code code = gimple_assign_rhs_code (def_stmt);
>    if (code == POINTER_PLUS_EXPR)
> @@ -6737,15 +6908,12 @@ vectorizable_scan_store (stmt_vec_info s
>      {
>        std::swap (rhs1, rhs2);
>        std::swap (var1, var2);
> +      std::swap (load1_dr_info, load2_dr_info);
>      }
>
>    tree *init = loop_vinfo->scan_map->get (var1);
>    gcc_assert (init);
>
> -  tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
> -  bool inscan_var_store
> -    = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
> -
>    unsigned HOST_WIDE_INT nunits;
>    if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
>      gcc_unreachable ();
> @@ -6789,29 +6957,50 @@ vectorizable_scan_store (stmt_vec_info s
>    tree vec_oprnd1 = NULL_TREE;
>    tree vec_oprnd2 = NULL_TREE;
>    tree vec_oprnd3 = NULL_TREE;
> -  tree dataref_ptr = unshare_expr (DR_BASE_ADDRESS (dr_info->dr));
> +  tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
>    tree dataref_offset = build_int_cst (ref_type, 0);
>    tree bump = vect_get_data_ptr_increment (dr_info, vectype, VMAT_CONTIGUOUS);
> +  tree ldataref_ptr = NULL_TREE;
>    tree orig = NULL_TREE;
> +  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
> +    ldataref_ptr = DR_BASE_ADDRESS (load1_dr_info->dr);
>    for (int j = 0; j < ncopies; j++)
>      {
>        stmt_vec_info new_stmt_info;
>        if (j == 0)
>         {
>           vec_oprnd1 = vect_get_vec_def_for_operand (*init, stmt_info);
> -         vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info);
> +         if (ldataref_ptr == NULL)
> +           vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info);
>           vec_oprnd3 = vect_get_vec_def_for_operand (rhs2, stmt_info);
>           orig = vec_oprnd3;
>         }
>        else
>         {
>           vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
> -         vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2);
> +         if (ldataref_ptr == NULL)
> +           vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2);
>           vec_oprnd3 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd3);
>           if (!inscan_var_store)
>             dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
>         }
>
> +      if (ldataref_ptr)
> +       {
> +         vec_oprnd2 = make_ssa_name (vectype);
> +         tree data_ref = fold_build2 (MEM_REF, vectype,
> +                                      unshare_expr (ldataref_ptr),
> +                                      dataref_offset);
> +         vect_copy_ref_info (data_ref, DR_REF (load1_dr_info->dr));
> +         gimple *g = gimple_build_assign (vec_oprnd2, data_ref);
> +         new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
> +         if (prev_stmt_info == NULL)
> +           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
> +         else
> +           STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
> +         prev_stmt_info = new_stmt_info;
> +       }
> +
>        tree v = vec_oprnd2;
>        for (int i = 0; i < units_log2; ++i)
>         {
> @@ -6848,6 +7037,17 @@ vectorizable_scan_store (stmt_vec_info s
>               new_temp = new_temp2;
>             }
>
> +         /* For exclusive scan, perform the perms[i] permutation once
> +            more.  */
> +         if (i == 0
> +             && STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4
> +             && v == vec_oprnd2)
> +           {
> +             v = new_temp;
> +             --i;
> +             continue;
> +           }
> +
>           tree new_temp2 = make_ssa_name (vectype);
>           g = gimple_build_assign (new_temp2, code, v, new_temp);
>           new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
> @@ -6863,16 +7063,30 @@ vectorizable_scan_store (stmt_vec_info s
>        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
>        prev_stmt_info = new_stmt_info;
>
> +      tree last_perm_arg = new_temp;
> +      /* For exclusive scan, new_temp computed above is the exclusive scan
> +        prefix sum.  Turn it into inclusive prefix sum for the broadcast
> +        of the last element into orig.  */
> +      if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
> +       {
> +         last_perm_arg = make_ssa_name (vectype);
> +         g = gimple_build_assign (last_perm_arg, code, new_temp, vec_oprnd2);
> +         new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
> +         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
> +         prev_stmt_info = new_stmt_info;
> +       }
> +
>        orig = make_ssa_name (vectype);
> -      g = gimple_build_assign (orig, VEC_PERM_EXPR, new_temp, new_temp,
> -                              perms[units_log2]);
> +      g = gimple_build_assign (orig, VEC_PERM_EXPR, last_perm_arg,
> +                              last_perm_arg, perms[units_log2]);
>        new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
>        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
>        prev_stmt_info = new_stmt_info;
>
>        if (!inscan_var_store)
>         {
> -         tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
> +         tree data_ref = fold_build2 (MEM_REF, vectype,
> +                                      unshare_expr (dataref_ptr),
>                                        dataref_offset);
>           vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
>           g = gimple_build_assign (data_ref, new_temp);
> @@ -6888,7 +7102,8 @@ vectorizable_scan_store (stmt_vec_info s
>         if (j != 0)
>           dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
>
> -       tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
> +       tree data_ref = fold_build2 (MEM_REF, vectype,
> +                                    unshare_expr (dataref_ptr),
>                                      dataref_offset);
>         vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
>         gimple *g = gimple_build_assign (data_ref, orig);
> @@ -7325,7 +7540,7 @@ vectorizable_store (stmt_vec_info stmt_i
>         }
>        return true;
>      }
> -  else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3)
> +  else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3)
>      return vectorizable_scan_store (stmt_info, gsi, vec_stmt, ncopies);
>
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> --- gcc/testsuite/gcc.dg/vect/vect-simd-12.c.jj 2019-06-20 15:08:50.260400440 
> +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-12.c    2019-06-20 15:08:24.332805239 
> +0200
> @@ -0,0 +1,122 @@
> +/* { dg-require-effective-target size32plus } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { 
> target i?86-*-* x86_64-*-* } } } */
> +
> +#ifndef main
> +#include "tree-vect.h"
> +#endif
> +
> +int r, a[1024], b[1024];
> +
> +__attribute__((noipa)) void
> +foo (int *a, int *b)
> +{
> +  #pragma omp simd reduction (inscan, +:r)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r += a[i];
> +    }
> +}
> +
> +__attribute__((noipa)) int
> +bar (void)
> +{
> +  int s = 0;
> +  #pragma omp simd reduction (inscan, +:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s += 2 * a[i];
> +    }
> +  return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (int *a, int *b)
> +{
> +  #pragma omp simd reduction (inscan, +:r) if (simd: 0)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r += a[i];
> +    }
> +}
> +
> +__attribute__((noipa)) int
> +qux (void)
> +{
> +  int s = 0;
> +  #pragma omp simd reduction (inscan, +:s) simdlen (1)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s += 2 * a[i];
> +    }
> +  return s;
> +}
> +
> +int
> +main ()
> +{
> +  int s = 0;
> +#ifndef main
> +  check_vect ();
> +#endif
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[i] = i;
> +      b[i] = -1;
> +      asm ("" : "+g" (i));
> +    }
> +  foo (a, b);
> +  if (r != 1024 * 1023 / 2)
> +    abort ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = 25;
> +      s += i;
> +    }
> +  if (bar () != 1024 * 1023)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -1;
> +      s += 2 * i;
> +    }
> +  r = 0;
> +  baz (a, b);
> +  if (r != 1024 * 1023 / 2)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -25;
> +      s += i;
> +    }
> +  if (qux () != 1024 * 1023)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      s += 2 * i;
> +    }
> +  return 0;
> +}
> --- gcc/testsuite/gcc.dg/vect/vect-simd-13.c.jj 2019-06-20 15:47:23.580359715 
> +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-13.c    2019-06-20 15:13:23.500134387 
> +0200
> @@ -0,0 +1,124 @@
> +/* { dg-require-effective-target size32plus } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { 
> target i?86-*-* x86_64-*-* } } } */
> +
> +#ifndef main
> +#include "tree-vect.h"
> +#endif
> +
> +int r, a[1024], b[1024];
> +
> +#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0)
> +
> +__attribute__((noipa)) void
> +foo (int *a, int *b)
> +{
> +  #pragma omp simd reduction (inscan, foo:r)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r += a[i];
> +    }
> +}
> +
> +__attribute__((noipa)) int
> +bar (void)
> +{
> +  int s = 0;
> +  #pragma omp simd reduction (inscan, foo:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s += 2 * a[i];
> +    }
> +  return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (int *a, int *b)
> +{
> +  #pragma omp simd reduction (inscan, foo:r) if (simd: 0)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r += a[i];
> +    }
> +}
> +
> +__attribute__((noipa)) int
> +qux (void)
> +{
> +  int s = 0;
> +  #pragma omp simd reduction (inscan, foo:s) simdlen (1)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s += 2 * a[i];
> +    }
> +  return s;
> +}
> +
> +int
> +main ()
> +{
> +  int s = 0;
> +#ifndef main
> +  check_vect ();
> +#endif
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[i] = i;
> +      b[i] = -1;
> +      asm ("" : "+g" (i));
> +    }
> +  foo (a, b);
> +  if (r != 1024 * 1023 / 2)
> +    abort ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = 25;
> +      s += i;
> +    }
> +  if (bar () != 1024 * 1023)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -1;
> +      s += 2 * i;
> +    }
> +  r = 0;
> +  baz (a, b);
> +  if (r != 1024 * 1023 / 2)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -25;
> +      s += i;
> +    }
> +  if (qux () != 1024 * 1023)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      s += 2 * i;
> +    }
> +  return 0;
> +}
> --- gcc/testsuite/gcc.dg/vect/vect-simd-14.c.jj 2019-06-20 15:48:30.536321539 
> +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-14.c    2019-06-20 15:54:39.291617792 
> +0200
> @@ -0,0 +1,94 @@
> +/* { dg-require-effective-target size32plus } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { 
> target i?86-*-* x86_64-*-* } } } */
> +
> +#ifndef main
> +#include "tree-vect.h"
> +#endif
> +
> +float r = 1.0f, a[1024], b[1024];
> +
> +__attribute__((noipa)) void
> +foo (float *a, float *b)
> +{
> +  #pragma omp simd reduction (inscan, *:r)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r *= a[i];
> +    }
> +}
> +
> +__attribute__((noipa)) float
> +bar (void)
> +{
> +  float s = -__builtin_inff ();
> +  #pragma omp simd reduction (inscan, max:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s = s > a[i] ? s : a[i];
> +    }
> +  return s;
> +}
> +
> +int
> +main ()
> +{
> +  float s = 1.0f;
> +#ifndef main
> +  check_vect ();
> +#endif
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (i < 80)
> +       a[i] = (i & 1) ? 0.25f : 0.5f;
> +      else if (i < 200)
> +       a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f;
> +      else if (i < 280)
> +       a[i] = (i & 1) ? 0.25f : 0.5f;
> +      else if (i < 380)
> +       a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f;
> +      else
> +       switch (i % 6)
> +         {
> +         case 0: a[i] = 0.25f; break;
> +         case 1: a[i] = 2.0f; break;
> +         case 2: a[i] = -1.0f; break;
> +         case 3: a[i] = -4.0f; break;
> +         case 4: a[i] = 0.5f; break;
> +         case 5: a[i] = 1.0f; break;
> +         default: a[i] = 0.0f; break;
> +         }
> +      b[i] = -19.0f;
> +      asm ("" : "+g" (i));
> +    }
> +  foo (a, b);
> +  if (r * 16384.0f != 0.125f)
> +    abort ();
> +  float m = -175.25f;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -231.75f;
> +      s *= a[i];
> +      a[i] = m - ((i % 3) == 1 ? 2.0f : (i % 3) == 2 ? 4.0f : 0.0f);
> +      m += 0.75f;
> +    }
> +  if (bar () != 592.0f)
> +    abort ();
> +  s = -__builtin_inff ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      if (s < a[i])
> +       s = a[i];
> +    }
> +  return 0;
> +}


Hi,
I've noticed that this new test (gcc.dg/vect/vect-simd-14.c)
fails at execution time on arm targets.

It does pass on aarch64.

Christophe

> --- gcc/testsuite/gcc.dg/vect/vect-simd-15.c.jj 2019-06-20 15:50:34.483399705 
> +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-15.c    2019-06-20 15:52:09.976919050 
> +0200
> @@ -0,0 +1,186 @@
> +/* { dg-require-effective-target size32plus } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { 
> target i?86-*-* x86_64-*-* } } } */
> +
> +#ifndef main
> +#include "tree-vect.h"
> +#endif
> +
> +int r, a[1024], b[1024];
> +unsigned short r2, b2[1024];
> +unsigned char r3, b3[1024];
> +
> +__attribute__((noipa)) void
> +foo (int *a, int *b, unsigned short *b2, unsigned char *b3)
> +{
> +  #pragma omp simd reduction (inscan, +:r, r2, r3)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      {
> +       b[i] = r;
> +       b2[i] = r2;
> +       b3[i] = r3;
> +      }
> +      #pragma omp scan exclusive(r, r2, r3)
> +      { r += a[i]; r2 += a[i]; r3 += a[i]; }
> +    }
> +}
> +
> +__attribute__((noipa)) int
> +bar (unsigned short *s2p, unsigned char *s3p)
> +{
> +  int s = 0;
> +  unsigned short s2 = 0;
> +  unsigned char s3 = 0;
> +  #pragma omp simd reduction (inscan, +:s, s2, s3)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      { b[i] = s; b2[i] = s2; b3[i] = s3; }
> +      #pragma omp scan exclusive(s, s2, s3)
> +      {
> +       s += 2 * a[i];
> +       s2 += 2 * a[i];
> +       s3 += 2 * a[i];
> +      }
> +    }
> +  *s2p = s2;
> +  *s3p = s3;
> +  return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (int *a, int *b, unsigned short *b2, unsigned char *b3)
> +{
> +  #pragma omp simd reduction (inscan, +:r, r2, r3) if (simd: 0)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      {
> +       b[i] = r;
> +       b2[i] = r2;
> +       b3[i] = r3;
> +      }
> +      #pragma omp scan exclusive(r, r2, r3)
> +      {
> +       r += a[i];
> +       r2 += a[i];
> +       r3 += a[i];
> +      }
> +    }
> +}
> +
> +__attribute__((noipa)) int
> +qux (unsigned short *s2p, unsigned char *s3p)
> +{
> +  int s = 0;
> +  unsigned short s2 = 0;
> +  unsigned char s3 = 0;
> +  #pragma omp simd reduction (inscan, +:s, s2, s3) simdlen (1)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      { b[i] = s; b2[i] = s2; b3[i] = s3; }
> +      #pragma omp scan exclusive(s, s2, s3)
> +      { s += 2 * a[i]; s2 += 2 * a[i]; s3 += 2 * a[i]; }
> +    }
> +  *s2p = s2;
> +  *s3p = s3;
> +  return s;
> +}
> +
> +int
> +main ()
> +{
> +  int s = 0;
> +  unsigned short s2;
> +  unsigned char s3;
> +#ifndef main
> +  check_vect ();
> +#endif
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[i] = i;
> +      b[i] = -1;
> +      b2[i] = -1;
> +      b3[i] = -1;
> +      asm ("" : "+g" (i));
> +    }
> +  foo (a, b, b2, b3);
> +  if (r != 1024 * 1023 / 2
> +      || r2 != (unsigned short) r
> +      || r3 != (unsigned char) r)
> +    abort ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s
> +         || b2[i] != (unsigned short) s
> +         || b3[i] != (unsigned char) s)
> +       abort ();
> +      else
> +       {
> +         b[i] = 25;
> +         b2[i] = 24;
> +         b3[i] = 26;
> +       }
> +      s += i;
> +    }
> +  if (bar (&s2, &s3) != 1024 * 1023)
> +    abort ();
> +  if (s2 != (unsigned short) (1024 * 1023)
> +      || s3 != (unsigned char) (1024 * 1023))
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s
> +         || b2[i] != (unsigned short) s
> +         || b3[i] != (unsigned char) s)
> +       abort ();
> +      else
> +       {
> +         b[i] = -1;
> +         b2[i] = -1;
> +         b3[i] = -1;
> +       }
> +      s += 2 * i;
> +    }
> +  r = 0;
> +  r2 = 0;
> +  r3 = 0;
> +  baz (a, b, b2, b3);
> +  if (r != 1024 * 1023 / 2
> +      || r2 != (unsigned short) r
> +      || r3 != (unsigned char) r)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s
> +         || b2[i] != (unsigned short) s
> +         || b3[i] != (unsigned char) s)
> +       abort ();
> +      else
> +       {
> +         b[i] = 25;
> +         b2[i] = 24;
> +         b3[i] = 26;
> +       }
> +      s += i;
> +    }
> +  s2 = 0;
> +  s3 = 0;
> +  if (qux (&s2, &s3) != 1024 * 1023)
> +    abort ();
> +  if (s2 != (unsigned short) (1024 * 1023)
> +      || s3 != (unsigned char) (1024 * 1023))
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s
> +         || b2[i] != (unsigned short) s
> +         || b3[i] != (unsigned char) s)
> +       abort ();
> +      s += 2 * i;
> +    }
> +  return 0;
> +}
> --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-12.c.jj        2019-06-20 
> 15:58:35.276983324 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-12.c   2019-06-20 
> 15:58:35.274983355 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 
> -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target sse2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "sse2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-12.c"
> +
> +static void
> +sse2_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-13.c.jj        2019-06-20 
> 15:58:35.283983216 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-13.c   2019-06-20 
> 15:58:35.281983247 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 
> -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target sse2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "sse2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-13.c"
> +
> +static void
> +sse2_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-14.c.jj        2019-06-20 
> 15:58:35.288983139 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-14.c   2019-06-20 
> 15:58:35.287983154 +0200
> @@ -0,0 +1,15 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 
> -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target sse2 } */
> +
> +#include "sse2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-14.c"
> +
> +static void
> +sse2_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-15.c.jj        2019-06-20 
> 15:58:35.293983061 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-15.c   2019-06-20 
> 15:58:35.292983077 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 
> -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target sse2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "sse2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-15.c"
> +
> +static void
> +sse2_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-12.c.jj        2019-06-20 
> 15:58:35.299982969 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-12.c   2019-06-20 
> 15:58:35.297982999 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "avx2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-12.c"
> +
> +static void
> +avx2_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-13.c.jj        2019-06-20 
> 15:58:35.305982876 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-13.c   2019-06-20 
> 15:58:35.303982907 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "avx2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-13.c"
> +
> +static void
> +avx2_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-14.c.jj        2019-06-20 
> 15:58:35.310982799 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-14.c   2019-06-20 
> 15:58:35.309982815 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "avx2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-14.c"
> +
> +static void
> +avx2_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-15.c.jj        2019-06-20 
> 15:58:35.316982707 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-15.c   2019-06-20 
> 15:58:35.314982738 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "avx2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-15.c"
> +
> +static void
> +avx2_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-12.c.jj     2019-06-20 
> 15:58:35.323982599 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-12.c        2019-06-20 
> 15:58:35.321982630 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 
> -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx512f } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "avx512f-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-12.c"
> +
> +static void
> +avx512f_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-13.c.jj     2019-06-20 
> 15:58:35.328982522 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-13.c        2019-06-20 
> 15:58:35.326982553 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 
> -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx512f } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "avx512f-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-13.c"
> +
> +static void
> +avx512f_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-14.c.jj     2019-06-20 
> 15:58:35.333982445 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-14.c        2019-06-20 
> 15:58:35.332982461 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 
> -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx512f } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "avx512f-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-14.c"
> +
> +static void
> +avx512f_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-15.c.jj    2019-06-20 
> 15:58:35.347982230 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-15.c       2019-06-20 
> 15:58:35.346982245 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx512bw -mprefer-vector-width=512 
> -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx512bw } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } 
> */
> +
> +#include "avx512bw-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-15.c"
> +
> +static void
> +avx512bw_test (void)
> +{
> +  do_main ();
> +}
> --- gcc/testsuite/g++.dg/vect/simd-6.cc.jj      2019-06-20 16:00:34.800142524 
> +0200
> +++ gcc/testsuite/g++.dg/vect/simd-6.cc 2019-06-20 16:07:41.722559826 +0200
> @@ -0,0 +1,161 @@
> +// { dg-require-effective-target size32plus }
> +// { dg-additional-options "-fopenmp-simd" }
> +// { dg-additional-options "-mavx" { target avx_runtime } }
> +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { 
> xfail *-*-* } } }
> +
> +#include "../../gcc.dg/vect/tree-vect.h"
> +
> +template <typename T>
> +struct S {
> +  inline S ();
> +  inline ~S ();
> +  inline S (const S &);
> +  inline S & operator= (const S &);
> +  T s;
> +};
> +
> +template <typename T>
> +S<T>::S () : s (0)
> +{
> +}
> +
> +template <typename T>
> +S<T>::~S ()
> +{
> +}
> +
> +template <typename T>
> +S<T>::S (const S &x)
> +{
> +  s = x.s;
> +}
> +
> +template <typename T>
> +S<T> &
> +S<T>::operator= (const S &x)
> +{
> +  s = x.s;
> +  return *this;
> +}
> +
> +template <typename T>
> +static inline void
> +ini (S<T> &x)
> +{
> +  x.s = 0;
> +}
> +
> +S<int> r, a[1024], b[1024];
> +
> +#pragma omp declare reduction (+: S<int>: omp_out.s += omp_in.s)
> +#pragma omp declare reduction (plus: S<int>: omp_out.s += omp_in.s) initializer (ini (omp_priv))
> +
> +template <typename T>
> +__attribute__((noipa)) void
> +foo (S<T> *a, S<T> *b)
> +{
> +  #pragma omp simd reduction (inscan, +:r)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r.s += a[i].s;
> +    }
> +}
> +
> +template <typename T>
> +__attribute__((noipa)) S<T>
> +bar (void)
> +{
> +  S<T> s;
> +  #pragma omp simd reduction (inscan, plus:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s.s += 2 * a[i].s;
> +    }
> +  return S<T> (s);
> +}
> +
> +__attribute__((noipa)) void
> +baz (S<int> *a, S<int> *b)
> +{
> +  #pragma omp simd reduction (inscan, +:r) simdlen(1)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r.s += a[i].s;
> +    }
> +}
> +
> +__attribute__((noipa)) S<int>
> +qux (void)
> +{
> +  S<int> s;
> +  #pragma omp simd if (0) reduction (inscan, plus:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s.s += 2 * a[i].s;
> +    }
> +  return S<int> (s);
> +}
> +
> +int
> +main ()
> +{
> +  S<int> s;
> +  check_vect ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[i].s = i;
> +      b[i].s = -1;
> +      asm ("" : "+g" (i));
> +    }
> +  foo (a, b);
> +  if (r.s != 1024 * 1023 / 2)
> +    abort ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i].s != s.s)
> +       abort ();
> +      else
> +       b[i].s = 25;
> +      s.s += i;
> +    }
> +  if (bar<int> ().s != 1024 * 1023)
> +    abort ();
> +  s.s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i].s != s.s)
> +       abort ();
> +      s.s += 2 * i;
> +    }
> +  r.s = 0;
> +  baz (a, b);
> +  if (r.s != 1024 * 1023 / 2)
> +    abort ();
> +  s.s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i].s != s.s)
> +       abort ();
> +      else
> +       b[i].s = 25;
> +      s.s += i;
> +    }
> +  if (qux ().s != 1024 * 1023)
> +    abort ();
> +  s.s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i].s != s.s)
> +       abort ();
> +      s.s += 2 * i;
> +    }
> +  return 0;
> +}
> --- gcc/testsuite/g++.dg/vect/simd-7.cc.jj      2019-06-20 16:00:51.095891542 +0200
> +++ gcc/testsuite/g++.dg/vect/simd-7.cc 2019-06-20 16:12:50.222747875 +0200
> @@ -0,0 +1,124 @@
> +// { dg-require-effective-target size32plus }
> +// { dg-additional-options "-fopenmp-simd" }
> +// { dg-additional-options "-mavx" { target avx_runtime } }
> +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
> +
> +#include "../../gcc.dg/vect/tree-vect.h"
> +
> +int r, a[1024], b[1024], q;
> +
> +template <typename T, typename U>
> +__attribute__((noipa)) void
> +foo (T a, T b, U r)
> +{
> +  #pragma omp simd reduction (inscan, +:r)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r += a[i];
> +    }
> +}
> +
> +template <typename T>
> +__attribute__((noipa)) T
> +bar (void)
> +{
> +  T &s = q;
> +  q = 0;
> +  #pragma omp simd reduction (inscan, +:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s += 2 * a[i];
> +    }
> +  return s;
> +}
> +
> +template <typename T>
> +__attribute__((noipa)) void
> +baz (T *a, T *b, T &r)
> +{
> +  #pragma omp simd reduction (inscan, +:r) if (simd: 0)
> +  for (T i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r += a[i];
> +    }
> +}
> +
> +template <typename T>
> +__attribute__((noipa)) int
> +qux (void)
> +{
> +  T s = q;
> +  q = 0;
> +  #pragma omp simd reduction (inscan, +:s) simdlen (1)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s += 2 * a[i];
> +    }
> +  return s;
> +}
> +
> +int
> +main ()
> +{
> +  int s = 0;
> +  check_vect ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[i] = i;
> +      b[i] = -1;
> +      asm ("" : "+g" (i));
> +    }
> +  foo<int *, int &> (a, b, r);
> +  if (r != 1024 * 1023 / 2)
> +    abort ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = 25;
> +      s += i;
> +    }
> +  if (bar<int> () != 1024 * 1023)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -1;
> +      s += 2 * i;
> +    }
> +  r = 0;
> +  baz<int> (a, b, r);
> +  if (r != 1024 * 1023 / 2)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -25;
> +      s += i;
> +    }
> +  if (qux<int &> () != 1024 * 1023)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      s += 2 * i;
> +    }
> +  return 0;
> +}
> --- gcc/testsuite/g++.dg/vect/simd-8.cc.jj      2019-06-20 16:00:54.154844430 +0200
> +++ gcc/testsuite/g++.dg/vect/simd-8.cc 2019-06-20 16:15:37.994133891 +0200
> @@ -0,0 +1,122 @@
> +// { dg-require-effective-target size32plus }
> +// { dg-additional-options "-fopenmp-simd" }
> +// { dg-additional-options "-mavx" { target avx_runtime } }
> +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } }
> +
> +#include "../../gcc.dg/vect/tree-vect.h"
> +
> +int r, a[1024], b[1024], q;
> +
> +#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0)
> +
> +__attribute__((noipa)) void
> +foo (int *a, int *b, int &r)
> +{
> +  #pragma omp simd reduction (inscan, foo:r)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r += a[i];
> +    }
> +}
> +
> +__attribute__((noipa)) int
> +bar (void)
> +{
> +  int &s = q;
> +  q = 0;
> +  #pragma omp simd reduction (inscan, foo:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s += 2 * a[i];
> +    }
> +  return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (int *a, int *b, int &r)
> +{
> +  #pragma omp simd reduction (inscan, foo:r) if (simd: 0)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r += a[i];
> +    }
> +}
> +
> +__attribute__((noipa)) int
> +qux (void)
> +{
> +  int &s = q;
> +  q = 0;
> +  #pragma omp simd reduction (inscan, foo:s) simdlen (1)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s += 2 * a[i];
> +    }
> +  return s;
> +}
> +
> +int
> +main ()
> +{
> +  int s = 0;
> +  check_vect ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[i] = i;
> +      b[i] = -1;
> +      asm ("" : "+g" (i));
> +    }
> +  foo (a, b, r);
> +  if (r != 1024 * 1023 / 2)
> +    abort ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = 25;
> +      s += i;
> +    }
> +  if (bar () != 1024 * 1023)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -1;
> +      s += 2 * i;
> +    }
> +  r = 0;
> +  baz (a, b, r);
> +  if (r != 1024 * 1023 / 2)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      else
> +       b[i] = -25;
> +      s += i;
> +    }
> +  if (qux () != 1024 * 1023)
> +    abort ();
> +  s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i] != s)
> +       abort ();
> +      s += 2 * i;
> +    }
> +  return 0;
> +}
> --- gcc/testsuite/g++.dg/vect/simd-9.cc.jj      2019-06-20 16:00:57.197797566 +0200
> +++ gcc/testsuite/g++.dg/vect/simd-9.cc 2019-06-20 16:17:27.484427949 +0200
> @@ -0,0 +1,153 @@
> +// { dg-require-effective-target size32plus }
> +// { dg-additional-options "-fopenmp-simd" }
> +// { dg-additional-options "-mavx" { target avx_runtime } }
> +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { xfail *-*-* } } }
> +
> +#include "../../gcc.dg/vect/tree-vect.h"
> +
> +struct S {
> +  inline S ();
> +  inline ~S ();
> +  inline S (const S &);
> +  inline S & operator= (const S &);
> +  int s;
> +};
> +
> +S::S () : s (0)
> +{
> +}
> +
> +S::~S ()
> +{
> +}
> +
> +S::S (const S &x)
> +{
> +  s = x.s;
> +}
> +
> +S &
> +S::operator= (const S &x)
> +{
> +  s = x.s;
> +  return *this;
> +}
> +
> +static inline void
> +ini (S &x)
> +{
> +  x.s = 0;
> +}
> +
> +S r, a[1024], b[1024];
> +
> +#pragma omp declare reduction (+: S: omp_out.s += omp_in.s)
> +#pragma omp declare reduction (plus: S: omp_out.s += omp_in.s) initializer (ini (omp_priv))
> +
> +__attribute__((noipa)) void
> +foo (S *a, S *b, S &r)
> +{
> +  #pragma omp simd reduction (inscan, +:r)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r.s += a[i].s;
> +    }
> +}
> +
> +__attribute__((noipa)) S
> +bar (void)
> +{
> +  S s;
> +  #pragma omp simd reduction (inscan, plus:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s.s += 2 * a[i].s;
> +    }
> +  return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (S *a, S *b, S &r)
> +{
> +  #pragma omp simd reduction (inscan, +:r) simdlen(1)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = r;
> +      #pragma omp scan exclusive(r)
> +      r.s += a[i].s;
> +    }
> +}
> +
> +__attribute__((noipa)) S
> +qux (void)
> +{
> +  S s;
> +  #pragma omp simd if (0) reduction (inscan, plus:s)
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      b[i] = s;
> +      #pragma omp scan exclusive(s)
> +      s.s += 2 * a[i].s;
> +    }
> +  return s;
> +}
> +
> +int
> +main ()
> +{
> +  S s;
> +  check_vect ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[i].s = i;
> +      b[i].s = -1;
> +      asm ("" : "+g" (i));
> +    }
> +  foo (a, b, r);
> +  if (r.s != 1024 * 1023 / 2)
> +    abort ();
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i].s != s.s)
> +       abort ();
> +      else
> +       b[i].s = 25;
> +      s.s += i;
> +    }
> +  if (bar ().s != 1024 * 1023)
> +    abort ();
> +  s.s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i].s != s.s)
> +       abort ();
> +      s.s += 2 * i;
> +    }
> +  r.s = 0;
> +  baz (a, b, r);
> +  if (r.s != 1024 * 1023 / 2)
> +    abort ();
> +  s.s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i].s != s.s)
> +       abort ();
> +      else
> +       b[i].s = 25;
> +      s.s += i;
> +    }
> +  if (qux ().s != 1024 * 1023)
> +    abort ();
> +  s.s = 0;
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      if (b[i].s != s.s)
> +       abort ();
> +      s.s += 2 * i;
> +    }
> +  return 0;
> +}
> --- gcc/testsuite/c-c++-common/gomp/scan-2.c.jj 2019-06-10 14:18:17.461525669 +0200
> +++ gcc/testsuite/c-c++-common/gomp/scan-2.c    2019-06-20 23:54:03.615422149 +0200
> @@ -8,7 +8,7 @@ f1 (int *c, int *d)
>    for (i = 0; i < 64; i++)
>      {
>        d[i] = a;
> -      #pragma omp scan exclusive (a)           /* { dg-message "sorry, unimplemented: '#pragma omp scan' not supported yet" } */
> +      #pragma omp scan exclusive (a)
>        a += c[i];
>      }
>  }
>
>         Jakub

Reply via email to