https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96401

--- Comment #1 from Tom de Vries <vries at gcc dot gnu.org> ---
(In reply to Tom de Vries from comment #0)
> In other words, we may emit instead:
> ...
>         .reg.u32 %r22;
>                 ld.u32  %r22, [%frame];
>                 st.u16  [%frame+4], %r22;
> ...

So, why don't we?

Using -dP we see the respective insns:
...
//(insn 5 2 6 2
//    (set (reg:SI 22 [ v$0_1 ])
//         (mem/v/c:SI (reg/f:DI 2 %frame) [1 v+0 S4 A128]))
//     "test.c":7:6 6 {*movsi_insn}
//     (nil))
                ld.u32  %r22, [%frame]; // 5    [c=4]  *movsi_insn/1

//(insn 6 5 9 2
//    (set (reg:HI 24 [ v$0_1 ])
//         (subreg:HI (reg:SI 22 [ v$0_1 ]) 0))
//     "test.c":7:6 5 {*movhi_insn}
//     (expr_list:REG_DEAD (reg:SI 22 [ v$0_1 ])
//     (nil)))
                cvt.u16.u32     %r24, %r22;     // 6    [c=12]  *movhi_insn/0

//(insn 9 6 12 2
//    (set (mem/v/c:HI (plus:DI (reg/f:DI 2 %frame)
//                     (const_int 4 [0x4])) [2 v2+0 S2 A32])
//         (reg:HI 24 [ v$0_1 ]))
//     "test.c":7:6 5 {*movhi_insn}
//     (expr_list:REG_DEAD (reg:HI 24 [ v$0_1 ])
//     (nil)))
                st.u16  [%frame+4], %r24;       // 9    [c=4]  *movhi_insn/2
...

I went to investigate why combine doesn't combine insns 6 and 9, that is, why
it doesn't generate:
...
//(insn 9 6 12 2
//    (set (mem/v/c:HI (plus:DI (reg/f:DI 2 %frame)
//                     (const_int 4 [0x4])) [2 v2+0 S2 A32])
//         (subreg:HI (reg:SI 22 [ v$0_1 ]) 0))
//     "test.c":7:6 5 {*movhi_insn}
//     (expr_list:REG_DEAD (reg:SI 22 [ v$0_1 ])
//     (nil)))
...

Part of the required changes is to make the movhi_insn store alternative work
for a subreg source operand:
...
@@ -229,8 +234,8 @@

 (define_insn "*mov<mode>_insn"
   [(set (match_operand:QHSDIM 0 "nonimmediate_operand" "=R,R,m")
-       (match_operand:QHSDIM 1 "general_operand" "Ri,m,R"))]
-  "!MEM_P (operands[0]) || REG_P (operands[1])"
+       (match_operand:QHSDIM 1 "general_operand" "Ri,m,Q"))]
+  "!MEM_P (operands[0]) || REG_P (operands[1]) || SUBREG_P (operands[1])"
 {
   if (which_alternative == 1)
     return "%.\\tld%A1%u1\\t%0, %1;";
...
which required me to define:
...
+(define_constraint "Q"
+  "A pseudo register or subreg."
+  (ior (match_code "reg")
+      (match_code "subreg")))
+
...
[ Note that this constraint is an oddity, like the R constraint: it's not a
register constraint. ]

After debugging I found that I needed this as well:
...
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index d2f321fcbcc..2234edad53b 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -6444,7 +6444,7 @@ nvptx_data_alignment (const_tree type, unsigned int basic_align)
 static bool
 nvptx_modes_tieable_p (machine_mode, machine_mode)
 {
-  return false;
+  return true;
 }

 /* Implement TARGET_HARD_REGNO_NREGS.  */
...
due to this bit in combine.c:subst():
...
                  /* In general, don't install a subreg involving two
                     modes not tieable.  It can worsen register
                     allocation, and can even make invalid reload
                     insns, since the reg inside may need to be copied
                     from in the outside mode, and that may be invalid
                     if it is an fp reg copied in integer mode.

      ...

Using these changes, I get the desired:
...
        .reg.u32 %r22;
                ld.u32  %r22, [%frame];
                st.u16  [%frame+4], %r22;
...

Reply via email to