On 17.07.2017 10:53, Georg-Johann Lay wrote:
Hi, while testing a patch to fix PR81444, I came across a new FAIL due
to the patch in i386.c/pr71321.c

PR81444 is about wrong modes used by expmed.c as it computes costs for
widening operations like widening mul.  It uses  GET_MODE_WIDER_MODE
for the wider mode where is should use GET_MODE_2XWIDER_MODE (the
internals specify a mode 2 times as wide for the wider mode).

Applying the patch from below then leads to the following change in
code generated for PR71321.c:

Without patch:



cvt_to_2digit_ascii:
     movzbl    %dil, %eax    # 7    *zero_extendqihi2/1    [length = 4]
     leal    (%rax,%rax,4), %edx    # 46    *leasi    [length = 3]
     leal    (%rax,%rdx,8), %edx    # 47    *leasi    [length = 3]
     leal    (%rdx,%rdx,4), %edx    # 48    *leasi    [length = 3]
     shrw    $11, %dx    # 16    *lshrhi3_1    [length = 4]
     leal    (%rdx,%rdx,4), %eax    # 49    *leasi    [length = 3]
     movzbl    %dl, %edx    # 35    *zero_extendqisi2/1    [length = 3]
     addl    %eax, %eax    # 50    *ashlsi3_1/1    [length = 2]
     subl    %eax, %edi    # 51    *subsi_1/1    [length = 2]
     movzbl    %dil, %eax    # 23    *zero_extendqisi2/1    [length = 4]
     sall    $8, %eax    # 24    *ashlsi3_1/1    [length = 3]
     orl    %edx, %eax    # 36    *iorsi_1/1    [length = 2]
     addl    $667696, %eax    # 37    *addsi_1/1    [length = 5]
     ret    # 55    simple_return_internal    [length = 1]


With patch applied:

cvt_to_2digit_ascii:
     movl    $-51, %edx    # 7    *movqi_internal/2    [length = 5]
     movl    %edx, %eax    # 35    *movqi_internal/1    [length = 2]
     mulb    %dil    # 8    *umulqihi3_1    [length = 3]
     movl    %eax, %edx    # 36    *movhi_internal/1    [length = 2]

(Following code is the same)

     shrw    $11, %dx    # 10    *lshrhi3_1    [length = 4]
     leal    (%rdx,%rdx,4), %eax    # 37    *leasi    [length = 3]
     movzbl    %dl, %edx    # 23    *zero_extendqisi2/1    [length = 3]
     addl    %eax, %eax    # 38    *ashlsi3_1/1    [length = 2]
     subl    %eax, %edi    # 39    *subsi_1/1    [length = 2]
     movzbl    %dil, %eax    # 17    *zero_extendqisi2/1    [length = 4]
     sall    $8, %eax    # 18    *ashlsi3_1/1    [length = 3]
     orl    %edx, %eax    # 24    *iorsi_1/1    [length = 2]
     addl    $667696, %eax    # 25    *addsi_1/1    [length = 5]
     ret    # 43    simple_return_internal    [length = 1]

Is this actually a performance regression or did the code improve actually? (compiled with -O2 on x86_64).

What would be the proposed fix:  Adjust the test case or is this some
problem in the i386.c cost computation?


...some more information. Looking into what ix86_rtx_costs is coming
up with. The unchanged version sees the strange modes an computes,
for example:

cost 4 (speed=1, outer=set) =
(zero_extend:HI (reg:XI 87))

cost 4 (speed=1, outer=set) =
(zero_extend:HI (reg:XI 87))

cost 24 (speed=1, outer=set) =
(mult:HI (zero_extend:HI (reg:XI 87))
    (zero_extend:HI (reg:XI 87)))

cost 4 (speed=1, outer=truncate) =
(lshiftrt:HI (mult:HI (zero_extend:HI (reg:XI 87))
        (zero_extend:HI (reg:XI 87)))
    (const_int 8 [0x8]))

cost 0 (speed=1, outer=lshiftrt) =
(const_int 8 [0x8])

cost 4 (speed=1, outer=lshiftrt) =
(zero_extend:HI (reg:XI 87))

cost 4 (speed=1, outer=lshiftrt) =
(zero_extend:HI (reg:XI 87))

cost 24 (speed=1, outer=lshiftrt) =
(mult:HI (zero_extend:HI (reg:XI 87))
    (zero_extend:HI (reg:XI 87)))


The patched version doesn't go into the sub-expressions and comes up
with smaller costs of 12 / 12 instead of the 24 / 24 from above:

cost 12 (speed=1, outer=set) =
(mult:HI (zero_extend:HI (reg:QI 87))
    (zero_extend:HI (reg:QI 87)))

cost 4 (speed=1, outer=truncate) =
(lshiftrt:HI (mult:HI (zero_extend:HI (reg:QI 87))
        (zero_extend:HI (reg:QI 87)))
    (const_int 8 [0x8]))

cost 0 (speed=1, outer=lshiftrt) =
(const_int 8 [0x8])

cost 12 (speed=1, outer=lshiftrt) =
(mult:HI (zero_extend:HI (reg:QI 87))
    (zero_extend:HI (reg:QI 87)))

Hence, with the proper expressions, costs for widening mul are cheaper
than with the XImode.  XImode is artefact of clobbering mode of
all->reg in expmed.c::init_expmed_one_conv().


Johann



Index: expmed.c
===================================================================
--- expmed.c    (revision 250090)
+++ expmed.c    (working copy)
@@ -119,6 +119,7 @@ init_expmed_one_conv (struct init_expmed
  {
    int to_size, from_size;
    rtx which;
+  machine_mode orig_mode = GET_MODE (all->reg);

    to_size = GET_MODE_PRECISION (to_mode);
    from_size = GET_MODE_PRECISION (from_mode);
@@ -140,6 +141,8 @@ init_expmed_one_conv (struct init_expmed
    PUT_MODE (all->reg, from_mode);
    set_convert_cost (to_mode, from_mode, speed,
              set_src_cost (which, to_mode, speed));
+  // Unclobber the mode, callers still use it.
+  PUT_MODE (all->reg, orig_mode);
  }

  static void
@@ -210,7 +213,7 @@ init_expmed_one_mode (struct init_expmed
      }
    if (GET_MODE_CLASS (mode) == MODE_INT)
      {
-      machine_mode  wider_mode = GET_MODE_WIDER_MODE (mode);
+      machine_mode wider_mode = GET_MODE_2XWIDER_MODE (mode);
        if (wider_mode != VOIDmode)
      {
        PUT_MODE (all->zext, wider_mode);


Reply via email to