Hi Jakub,
For 401.bzip2 it looks perfect. This loop is vectorized:
.L6:
        vmovdqa (%rsi,%rax), %ymm0
        addl    $1, %ecx
        vpsrad  $8, %ymm0, %ymm0
        vpsrld  $31, %ymm0, %ymm1
        vpaddd  %ymm1, %ymm0, %ymm0
        vpsrad  $1, %ymm0, %ymm0
        vpaddd  %ymm2, %ymm0, %ymm0
        vpslld  $8, %ymm0, %ymm0
        vmovdqa %ymm0, (%rsi,%rax)
        addq    $32, %rax
        cmpl    %r8d, %ecx
        jne     .L6
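
By the way, the vpsrld $31 / vpaddd / vpsrad $1 triple is the usual
round-toward-zero signed division by two; in scalar C terms it computes
(just a sketch, assuming 32-bit int):

        /* x / 2 for signed x: add the sign bit as a bias so the
           arithmetic right shift rounds toward zero rather than
           toward negative infinity.  */
        int
        div2 (int x)
        {
          return (x + (int) ((unsigned) x >> 31)) >> 1;
        }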

Unfortunately, I have no performance simulator for AVX2, but I believe
it will benefit, since in the absence of that optimization we get the
following scalar loop (same options):
.L3:
        movl    (%rax), %edx
        sarl    $8, %edx
        movl    %edx, %ecx
        shrl    $31, %ecx
        addl    %ecx, %edx
        sarl    %edx
        addl    $1, %edx
        sall    $8, %edx
        movl    %edx, (%rax)
        addq    $4, %rax
        cmpq    %rsi, %rax
        jne     .L3
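
Both listings come from the same source loop, which is essentially the
shape your f1 test below exercises (sketched here, with 32-bit int
elements assumed):

        for (i = 1; i <= x; i++)
          {
            j = a[i] >> 8;
            j = 1 + (j / 2);
            a[i] = j << 8;
          }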

Thanks, K

On Wed, Dec 14, 2011 at 4:25 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> On Tue, Dec 13, 2011 at 05:57:40PM +0400, Kirill Yukhin wrote:
>> > Let me hack up a quick pattern recognizer for this...
>
> Here it is, untested so far.
> On the testcase doing 2000000 f1+f2+f3+f4 calls in the loop with -O3 -mavx
> on Sandybridge (so, vectorized just with 16 byte vectors) gives:
> vanilla                                 0m34.571s
> the tree-vect* parts of this patch only 0m9.013s
> the whole patch                         0m8.824s
> The i386 parts are just a small optimization, I guess it could be
> done in the vectorizer too (but then we'd have to check whether the
> arithmetic/logical right shifts are supported and check costs?), or
> perhaps in the generic vcond expander (again, we'd need to check some
> costs).
>
> Can you see if it gives a measurable speedup on SPEC bzip2?
>
> 2011-12-14  Jakub Jelinek  <ja...@redhat.com>
>
>        * tree-vectorizer.h (NUM_PATTERNS): Bump to 10.
>        * tree-vect-patterns.c (vect_recog_sdivmod_pow2_pattern): New
>        function.
>        (vect_vect_recog_func_ptrs): Add it.
>
>        * config/i386/sse.md (vcond<V_256:mode><VI_256:mode>,
>        vcond<V_128:mode><VI124_128:mode>, vcond<VI8F_128:mode>v2di):
>        Use general_operand instead of nonimmediate_operand for
>        operand 5 and no predicate for operands 1 and 2.
>        * config/i386/i386.c (ix86_expand_int_vcond): Optimize
>        x < 0 ? -1 : 0 and x < 0 ? 1 : 0 into vector arithmetic
>        resp. logical shift.
>
>        * gcc.dg/vect/vect-sdivmod-1.c: New test.
>
> --- gcc/tree-vectorizer.h.jj    2011-12-14 08:11:03.592010101 +0100
> +++ gcc/tree-vectorizer.h       2011-12-14 08:28:57.006693371 +0100
> @@ -929,7 +929,7 @@ extern void vect_slp_transform_bb (basic
>    Additional pattern recognition functions can (and will) be added
>    in the future.  */
>  typedef gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
> -#define NUM_PATTERNS 9
> +#define NUM_PATTERNS 10
>  void vect_pattern_recog (loop_vec_info);
>
>  /* In tree-vectorizer.c.  */
> --- gcc/tree-vect-patterns.c.jj 2011-12-02 01:52:27.918883940 +0100
> +++ gcc/tree-vect-patterns.c    2011-12-14 11:50:10.439763740 +0100
> @@ -53,6 +53,8 @@ static gimple vect_recog_widen_shift_pat
>                                        tree *, tree *);
>  static gimple vect_recog_vector_vector_shift_pattern (VEC (gimple, heap) **,
>                                                      tree *, tree *);
> +static gimple vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **,
> +                                              tree *, tree *);
>  static gimple vect_recog_mixed_size_cond_pattern (VEC (gimple, heap) **,
>                                                  tree *, tree *);
>  static gimple vect_recog_bool_pattern (VEC (gimple, heap) **, tree *, tree *);
> @@ -64,6 +66,7 @@ static vect_recog_func_ptr vect_vect_rec
>        vect_recog_over_widening_pattern,
>        vect_recog_widen_shift_pattern,
>        vect_recog_vector_vector_shift_pattern,
> +       vect_recog_sdivmod_pow2_pattern,
>        vect_recog_mixed_size_cond_pattern,
>        vect_recog_bool_pattern};
>
> @@ -1573,6 +1576,211 @@ vect_recog_vector_vector_shift_pattern (
>   return pattern_stmt;
>  }
>
> +/* Detect a signed division by power of two constant that wouldn't be
> +   otherwise vectorized:
> +
> +   type a_t, b_t;
> +
> +   S1 a_t = b_t / N;
> +
> +  where type 'type' is a signed integral type and N is a constant positive
> +  power of two.
> +
> +  Similarly handle signed modulo by power of two constant:
> +
> +   S4 a_t = b_t % N;
> +
> +  Input/Output:
> +
> +  * STMTS: Contains a stmt from which the pattern search begins,
> +    i.e. the division stmt.  S1 is replaced by:
> +  S3  y_t = b_t < 0 ? N - 1 : 0;
> +  S2  x_t = b_t + y_t;
> +  S1' a_t = x_t >> log2 (N);
> +
> +    S4 is replaced by (where *_T temporaries have unsigned type):
> +  S9  y_T = b_t < 0 ? -1U : 0U;
> +  S8  z_T = y_T >> (sizeof (type_t) * CHAR_BIT - log2 (N));
> +  S7  z_t = (type) z_T;
> +  S6  w_t = b_t + z_t;
> +  S5  x_t = w_t & (N - 1);
> +  S4' a_t = x_t - z_t;
> +
> +  Output:
> +
> +  * TYPE_IN: The type of the input arguments to the pattern.
> +
> +  * TYPE_OUT: The type of the output of this pattern.
> +
> +  * Return value: A new stmt that will be used to replace the division
> +    S1 or modulo S4 stmt.  */
> +
> +static gimple
> +vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **stmts,
> +                                tree *type_in, tree *type_out)
> +{
> +  gimple last_stmt = VEC_pop (gimple, *stmts);
> +  gimple_stmt_iterator gsi;
> +  tree oprnd0, oprnd1, vectype, itype, cond;
> +  gimple pattern_stmt, def_stmt;
> +  enum tree_code rhs_code;
> +  stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
> +  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
> +  optab optab;
> +
> +  if (!is_gimple_assign (last_stmt))
> +    return NULL;
> +
> +  rhs_code = gimple_assign_rhs_code (last_stmt);
> +  switch (rhs_code)
> +    {
> +    case TRUNC_DIV_EXPR:
> +    case TRUNC_MOD_EXPR:
> +      break;
> +    default:
> +      return NULL;
> +    }
> +
> +  if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
> +    return NULL;
> +
> +  oprnd0 = gimple_assign_rhs1 (last_stmt);
> +  oprnd1 = gimple_assign_rhs2 (last_stmt);
> +  itype = TREE_TYPE (oprnd0);
> +  if (TREE_CODE (oprnd0) != SSA_NAME
> +      || TREE_CODE (oprnd1) != INTEGER_CST
> +      || TREE_CODE (itype) != INTEGER_TYPE
> +      || TYPE_UNSIGNED (itype)
> +      || TYPE_PRECISION (itype) != GET_MODE_PRECISION (TYPE_MODE (itype))
> +      || !integer_pow2p (oprnd1)
> +      || tree_int_cst_sgn (oprnd1) != 1)
> +    return NULL;
> +
> +  vectype = get_vectype_for_scalar_type (itype);
> +  if (vectype == NULL_TREE)
> +    return NULL;
> +
> +  /* If the target can handle vectorized division or modulo natively,
> +     don't attempt to optimize this.  */
> +  optab = optab_for_tree_code (rhs_code, vectype, optab_default);
> +  if (optab != NULL)
> +    {
> +      enum machine_mode vec_mode = TYPE_MODE (vectype);
> +      int icode = (int) optab_handler (optab, vec_mode);
> +      if (icode != CODE_FOR_nothing
> +         || GET_MODE_SIZE (vec_mode) == UNITS_PER_WORD)
> +       return NULL;
> +    }
> +
> +  /* Pattern detected.  */
> +  if (vect_print_dump_info (REPORT_DETAILS))
> +    fprintf (vect_dump, "vect_recog_sdivmod_pow2_pattern: detected: ");
> +
> +  cond = build2 (LT_EXPR, boolean_type_node, oprnd0, build_int_cst (itype, 0));
> +  gsi = gsi_for_stmt (last_stmt);
> +  if (rhs_code == TRUNC_DIV_EXPR)
> +    {
> +      tree var = vect_recog_temp_ssa_var (itype, NULL);
> +      def_stmt
> +       = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
> +                                        fold_build2 (MINUS_EXPR, itype,
> +                                                     oprnd1,
> +                                                     build_int_cst (itype,
> +                                                                    1)),
> +                                        build_int_cst (itype, 0));
> +      gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +      set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
> +                                                      NULL));
> +      var = vect_recog_temp_ssa_var (itype, NULL);
> +      def_stmt
> +       = gimple_build_assign_with_ops (PLUS_EXPR, var, oprnd0,
> +                                       gimple_assign_lhs (def_stmt));
> +      STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
> +
> +      pattern_stmt
> +       = gimple_build_assign_with_ops (RSHIFT_EXPR,
> +                                       vect_recog_temp_ssa_var (itype, NULL),
> +                                       var,
> +                                       build_int_cst (itype,
> +                                                      tree_log2 (oprnd1)));
> +    }
> +  else
> +    {
> +      tree signmask;
> +      tree utype = build_nonstandard_integer_type (TYPE_PRECISION (itype), 1);
> +      tree shift = build_int_cst (utype, GET_MODE_BITSIZE (TYPE_MODE (itype))
> +                                        - tree_log2 (oprnd1));
> +      if (compare_tree_int (oprnd1, 2) == 0)
> +       {
> +         signmask = vect_recog_temp_ssa_var (itype, NULL);
> +         def_stmt
> +           = gimple_build_assign_with_ops3 (COND_EXPR, signmask, cond,
> +                                            build_int_cst (itype, 1),
> +                                            build_int_cst (itype, 0));
> +         gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +         set_vinfo_for_stmt (def_stmt,
> +                             new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> +       }
> +      else
> +       {
> +         tree var = vect_recog_temp_ssa_var (utype, NULL);
> +         def_stmt
> +           = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
> +                                            build_int_cst (utype, -1),
> +                                            build_int_cst (utype, 0));
> +         gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +         set_vinfo_for_stmt (def_stmt,
> +                             new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> +         var = vect_recog_temp_ssa_var (utype, NULL);
> +         def_stmt
> +           = gimple_build_assign_with_ops (RSHIFT_EXPR, var,
> +                                           gimple_assign_lhs (def_stmt),
> +                                           shift);
> +         gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +         set_vinfo_for_stmt (def_stmt,
> +                             new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> +         signmask = vect_recog_temp_ssa_var (itype, NULL);
> +         def_stmt
> +           = gimple_build_assign_with_ops (NOP_EXPR, signmask, var,
> +                                           NULL_TREE);
> +         gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +         set_vinfo_for_stmt (def_stmt,
> +                             new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> +       }
> +      def_stmt
> +       = gimple_build_assign_with_ops (PLUS_EXPR,
> +                                       vect_recog_temp_ssa_var (itype, NULL),
> +                                       oprnd0, signmask);
> +      gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +      set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
> +                                                      NULL));
> +      def_stmt
> +       = gimple_build_assign_with_ops (BIT_AND_EXPR,
> +                                       vect_recog_temp_ssa_var (itype, NULL),
> +                                       gimple_assign_lhs (def_stmt),
> +                                       fold_build2 (MINUS_EXPR, itype,
> +                                                    oprnd1,
> +                                                    build_int_cst (itype,
> +                                                                   1)));
> +      STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
> +
> +      pattern_stmt
> +       = gimple_build_assign_with_ops (MINUS_EXPR,
> +                                       vect_recog_temp_ssa_var (itype, NULL),
> +                                       gimple_assign_lhs (def_stmt),
> +                                       signmask);
> +    }
> +
> +  if (vect_print_dump_info (REPORT_DETAILS))
> +    print_gimple_stmt (vect_dump, pattern_stmt, 0, TDF_SLIM);
> +
> +  VEC_safe_push (gimple, heap, *stmts, last_stmt);
> +
> +  *type_in = vectype;
> +  *type_out = vectype;
> +  return pattern_stmt;
> +}
> +
>  /* Function vect_recog_mixed_size_cond_pattern
>
>    Try to find the following pattern:
> --- gcc/config/i386/sse.md.jj   2011-12-03 12:22:33.000000000 +0100
> +++ gcc/config/i386/sse.md      2011-12-14 13:00:47.695788256 +0100
> @@ -6340,9 +6340,9 @@ (define_expand "vcond<V_256:mode><VI_256
>        (if_then_else:V_256
>          (match_operator 3 ""
>            [(match_operand:VI_256 4 "nonimmediate_operand" "")
> -            (match_operand:VI_256 5 "nonimmediate_operand" "")])
> -         (match_operand:V_256 1 "general_operand" "")
> -         (match_operand:V_256 2 "general_operand" "")))]
> +            (match_operand:VI_256 5 "general_operand" "")])
> +         (match_operand:V_256 1 "" "")
> +         (match_operand:V_256 2 "" "")))]
>   "TARGET_AVX2
>    && (GET_MODE_NUNITS (<V_256:MODE>mode)
>        == GET_MODE_NUNITS (<VI_256:MODE>mode))"
> @@ -6357,9 +6357,9 @@ (define_expand "vcond<V_128:mode><VI124_
>        (if_then_else:V_128
>          (match_operator 3 ""
>            [(match_operand:VI124_128 4 "nonimmediate_operand" "")
> -            (match_operand:VI124_128 5 "nonimmediate_operand" "")])
> -         (match_operand:V_128 1 "general_operand" "")
> -         (match_operand:V_128 2 "general_operand" "")))]
> +            (match_operand:VI124_128 5 "general_operand" "")])
> +         (match_operand:V_128 1 "" "")
> +         (match_operand:V_128 2 "" "")))]
>   "TARGET_SSE2
>    && (GET_MODE_NUNITS (<V_128:MODE>mode)
>        == GET_MODE_NUNITS (<VI124_128:MODE>mode))"
> @@ -6374,9 +6374,9 @@ (define_expand "vcond<VI8F_128:mode>v2di
>        (if_then_else:VI8F_128
>          (match_operator 3 ""
>            [(match_operand:V2DI 4 "nonimmediate_operand" "")
> -            (match_operand:V2DI 5 "nonimmediate_operand" "")])
> -         (match_operand:VI8F_128 1 "general_operand" "")
> -         (match_operand:VI8F_128 2 "general_operand" "")))]
> +            (match_operand:V2DI 5 "general_operand" "")])
> +         (match_operand:VI8F_128 1 "" "")
> +         (match_operand:VI8F_128 2 "" "")))]
>   "TARGET_SSE4_2"
>  {
>   bool ok = ix86_expand_int_vcond (operands);
> --- gcc/config/i386/i386.c.jj   2011-12-14 08:11:01.000000000 +0100
> +++ gcc/config/i386/i386.c      2011-12-14 13:01:30.207538465 +0100
> @@ -19434,6 +19434,45 @@ ix86_expand_int_vcond (rtx operands[])
>   cop0 = operands[4];
>   cop1 = operands[5];
>
> +  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
> +     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
> +  if ((code == LT || code == GE)
> +      && data_mode == mode
> +      && cop1 == CONST0_RTX (mode)
> +      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
> +      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
> +      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
> +      && (GET_MODE_SIZE (data_mode) == 16
> +         || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
> +    {
> +      rtx negop = operands[2 - (code == LT)];
> +      int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
> +      if (negop == CONST1_RTX (data_mode))
> +       {
> +         rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
> +                                        operands[0], 1, OPTAB_DIRECT);
> +         if (res != operands[0])
> +           emit_move_insn (operands[0], res);
> +         return true;
> +       }
> +      else if (GET_MODE_INNER (data_mode) != DImode
> +              && vector_all_ones_operand (negop, data_mode))
> +       {
> +         rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
> +                                        operands[0], 0, OPTAB_DIRECT);
> +         if (res != operands[0])
> +           emit_move_insn (operands[0], res);
> +         return true;
> +       }
> +    }
> +
> +  if (!nonimmediate_operand (cop1, mode))
> +    cop1 = force_reg (mode, cop1);
> +  if (!general_operand (operands[1], data_mode))
> +    operands[1] = force_reg (data_mode, operands[1]);
> +  if (!general_operand (operands[2], data_mode))
> +    operands[2] = force_reg (data_mode, operands[2]);
> +
>   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
>   if (TARGET_XOP
>       && (mode == V16QImode || mode == V8HImode
> --- gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c.jj       2011-12-14 13:05:36.240133846 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c  2011-12-14 13:08:10.288228745 +0100
> @@ -0,0 +1,98 @@
> +#include "tree-vect.h"
> +
> +extern void abort (void);
> +int a[4096];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (int x)
> +{
> +  int i, j;
> +  for (i = 1; i <= x; i++)
> +    {
> +      j = a[i] >> 8;
> +      j = 1 + (j / 2);
> +      a[i] = j << 8;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (int x)
> +{
> +  int i, j;
> +  for (i = 1; i <= x; i++)
> +    {
> +      j = a[i] >> 8;
> +      j = 1 + (j / 16);
> +      a[i] = j << 8;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (int x)
> +{
> +  int i, j;
> +  for (i = 1; i <= x; i++)
> +    {
> +      j = a[i] >> 8;
> +      j = 1 + (j % 2);
> +      a[i] = j << 8;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (int x)
> +{
> +  int i, j;
> +  for (i = 1; i <= x; i++)
> +    {
> +      j = a[i] >> 8;
> +      j = 1 + (j % 16);
> +      a[i] = j << 8;
> +    }
> +}
> +
> +int
> +main ()
> +{
> +  int i;
> +  check_vect ();
> +  for (i = 0; i < 4096; i++)
> +    {
> +      asm ("");
> +      a[i] = (i - 2048) << 8;
> +    }
> +  f1 (4095);
> +  if (a[0] != (-2048 << 8))
> +    abort ();
> +  for (i = 1; i < 4096; i++)
> +    if (a[i] != ((1 + ((i - 2048) / 2)) << 8))
> +      abort ();
> +    else
> +      a[i] = (i - 2048) << 8;
> +  f2 (4095);
> +  if (a[0] != (-2048 << 8))
> +    abort ();
> +  for (i = 1; i < 4096; i++)
> +    if (a[i] != ((1 + ((i - 2048) / 16)) << 8))
> +      abort ();
> +    else
> +      a[i] = (i - 2048) << 8;
> +  f3 (4095);
> +  if (a[0] != (-2048 << 8))
> +    abort ();
> +  for (i = 1; i < 4096; i++)
> +    if (a[i] != ((1 + ((i - 2048) % 2)) << 8))
> +      abort ();
> +    else
> +      a[i] = (i - 2048) << 8;
> +  f4 (4095);
> +  if (a[0] != (-2048 << 8))
> +    abort ();
> +  for (i = 1; i < 4096; i++)
> +    if (a[i] != ((1 + ((i - 2048) % 16)) << 8))
> +      abort ();
> +  return 0;
> +}
> +
> /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" { target vect_condition } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
>        Jakub
