Hi,

On 2021/6/3 21:09, Bill Schmidt wrote:
> On 6/2/21 7:46 PM, Xionghu Luo wrote:
>> Hi,
>>
>> On 2021/6/3 06:20, Segher Boessenkool wrote:
>>> On Wed, Jun 02, 2021 at 03:19:32AM -0500, Xionghu Luo wrote:
>>>> On P8LE, extra rot64+rot64 load or store instructions are generated
>>>> in float128 to vector __int128 conversion.
>>>>
>>>> This patch teaches pass swaps to also handle such patterns to remove
>>>> extra swap instructions.
>>> Did you check if this is already handled by simplify-rtx if the mode had
>>> been TImode (not V1TImode)?  If not, why do you not handle it there?
>> I tried to do it in combine or peephole, the later pass split2
>> or split3 will still split it to rotate + rotate again as we have split
>> after reload, and this pattern is quite P8LE specific, so put it in pass
>> swap.  The simplify-rtx could simplify
>> r124:KF#0=r123:KF#0<-<0x40<-<0x40 to r124:KF#0=r123:KF#0 for register
>> operations already.
>>
>>
>> vsx.md:
>>
>> ;; The post-reload split requires that we re-permute the source
>> ;; register in case it is still live.
>> (define_split
>>    [(set (match_operand:VSX_LE_128 0 "memory_operand")
>>          (match_operand:VSX_LE_128 1 "vsx_register_operand"))]
>>    "!BYTES_BIG_ENDIAN && TARGET_VSX && reload_completed && !TARGET_P9_VECTOR
>>     && !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)"
>>    [(const_int 0)]
>> {
>>    rs6000_emit_le_vsx_permute (operands[1], operands[1], <MODE>mode);
>>    rs6000_emit_le_vsx_permute (operands[0], operands[1], <MODE>mode);
>>    rs6000_emit_le_vsx_permute (operands[1], operands[1], <MODE>mode);
>>    DONE;
>> })
> 
> Note also that swap optimization can handle more general cases than 
> simplify-rtx.  In my view it's best to have it covered in both places.
> 

But this split pattern only applies after reload, which is much later
than the swap optimization pass, so the swap pass cannot remove these
swap operations as expected.  Below is an example that matches the above
pattern in pass split2; it may not be the most appropriate example, as
there is a function call between the load and the store.

extern vector __int128 foo1 (__float128 a);

int foo2 ()
{
  __binary128 f128 = {3.1415926535897932384626433832795028841971693993751058Q};
  vector __int128 ret = foo1 (f128);
  return ret[0];
}


295r.split (*see insn 35, 36, 37*):

...
Splitting with gen_split_558 (vsx.md:1079)
...

(insn 33 12 34 2 (set (reg/f:DI 9 %r9 [121])
        (high:DI (unspec:DI [
                    (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])
                    (reg:DI 2 %r2)
                ] UNSPEC_TOCREL))) "pr100085.c":279:25 715 {*largetoc_high}
     (nil))
(insn 34 33 6 2 (set (reg/f:DI 9 %r9 [121])
        (lo_sum:DI (reg/f:DI 9 %r9 [121])
            (unspec:DI [
                    (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])
                    (reg:DI 2 %r2)
                ] UNSPEC_TOCREL))) "pr100085.c":279:25 717 {*largetoc_low}
     (expr_list:REG_EQUAL (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])
        (nil)))
(insn 6 34 8 2 (set (reg:V1TI 66 %v2 [123])
        (rotate:V1TI (mem/c:V1TI (reg/f:DI 9 %r9 [121]) [1 f128+0 S16 A128])
            (const_int 64 [0x40]))) "pr100085.c":279:25 1113 
{*vsx_le_permute_v1ti}
     (nil))
(insn 8 6 9 2 (set (reg:V1TI 66 %v2)
        (rotate:V1TI (reg:V1TI 66 %v2 [123])
            (const_int 64 [0x40]))) "pr100085.c":279:25 1113 
{*vsx_le_permute_v1ti}
     (nil))
(call_insn 9 8 32 2 (parallel [
            (set (reg:V1TI 66 %v2)
                (call (mem:SI (symbol_ref:DI ("foo1") [flags 0x41] <function_decl 0x7ffff4fb6f00 foo1>) [0 foo1 S4 A8])
                    (const_int 0 [0])))
            (use (const_int 0 [0]))
            (clobber (reg:DI 96 lr))
        ]) "pr100085.c":279:25 735 {*call_value_nonlocal_aixdi}
     (expr_list:REG_CALL_DECL (symbol_ref:DI ("foo1") [flags 0x41]  
<function_decl 0x7ffff4fb6f00 foo1>)
        (nil))
    (expr_list (use (reg:DI 2 %r2))
        (expr_list:KF (use (reg:KF 66 %v2))
            (nil))))
(insn 32 9 35 2 (set (reg:DI 9 %r9 [138])
        (plus:DI (reg/f:DI 1 %r1)
            (const_int 32 [0x20]))) "pr100085.c":279:25 66 {*adddi3}
     (nil))
(insn 35 32 36 2 (set (reg:V1TI 66 %v2)
        (rotate:V1TI (reg:V1TI 66 %v2)
            (const_int 64 [0x40]))) "pr100085.c":279:25 1113 
{*vsx_le_permute_v1ti}
     (nil))
(insn 36 35 37 2 (set (mem/c:V1TI (reg:DI 9 %r9 [138]) [2 %sfp+32 S16 A128])
        (rotate:V1TI (reg:V1TI 66 %v2)
            (const_int 64 [0x40]))) "pr100085.c":279:25 1113 
{*vsx_le_permute_v1ti}
     (nil))
(insn 37 36 28 2 (set (reg:V1TI 66 %v2)
        (rotate:V1TI (reg:V1TI 66 %v2)
            (const_int 64 [0x40]))) "pr100085.c":279:25 1113 
{*vsx_le_permute_v1ti}
     (nil))
(insn 28 37 17 2 (set (reg:DI 3 %r3 [133])
        (mem/c:DI (plus:DI (reg/f:DI 1 %r1)
                (const_int 32 [0x20])) [2 %sfp+32 S8 A128])) 
"pr100085.c":279:25 636 {*movdi_internal64}
     (nil))
(insn 17 28 18 2 (set (reg/i:DI 3 %r3)
        (sign_extend:DI (reg:SI 3 %r3 [129]))) "pr100085.c":281:1 31 
{extendsidi2}
     (nil))
(insn 18 17 30 2 (use (reg/i:DI 3 %r3)) "pr100085.c":281:1 -1
     (nil))

-- 
Thanks,
Xionghu

Reply via email to