https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96428
--- Comment #4 from Tom de Vries <vries at gcc dot gnu.org> ---
For the record, this is not the leanest solution.
This patch generates:
...
cvt.u64.u64 %r74, %r65.x;
cvt.u64.u64 %r75, %r65.y;
mov.b64 {%r76,%r77}, %r74;
shfl.idx.b32 %r76, %r76, 0, 31;
shfl.idx.b32 %r77, %r77, 0, 31;
mov.b64 %r74, {%r76,%r77};
mov.b64 {%r78,%r79}, %r75;
shfl.idx.b32 %r78, %r78, 0, 31;
shfl.idx.b32 %r79, %r79, 0, 31;
mov.b64 %r75, {%r78,%r79};
cvt.u64.u64 %r65.x, %r74;
cvt.u64.u64 %r65.y, %r75;
...
but using this follow-up patch:
...
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index cf53a921e5b..84df8e1ca4a 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -1821,15 +1821,9 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx,
nvptx_shuffle_kind kind)
rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
- rtx tmp0 = gen_reg_rtx (DImode);
- rtx tmp1 = gen_reg_rtx (DImode);
start_sequence ();
- emit_insn (gen_movdi (tmp0, src0));
- emit_insn (gen_movdi (tmp1, src1));
- emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
- emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
- emit_insn (gen_movdi (dst0, tmp0));
- emit_insn (gen_movdi (dst1, tmp1));
+ emit_insn (nvptx_gen_shuffle (dst0, src0, idx, kind));
+ emit_insn (nvptx_gen_shuffle (dst1, src1, idx, kind));
res = get_insns ();
end_sequence ();
}
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index c23edcf34bf..6e81ad449b3 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -176,6 +176,11 @@
"A pseudo register."
(match_code "reg"))
+(define_constraint "Q"
+ "A pseudo register."
+ (ior (match_code "reg")
+ (match_code "subreg")))
+
(define_constraint "Ia"
"Any integer constant."
(and (match_code "const_int") (match_test "true")))
@@ -1513,21 +1518,23 @@
;; extract parts of a 64 bit object into 2 32-bit ints
(define_insn "unpack<mode>si2"
[(set (match_operand:SI 0 "nvptx_register_operand" "=R")
- (unspec:SI [(match_operand:BITD 2 "nvptx_register_operand" "R")
+ (unspec:SI [(match_operand:BITD 2 "register_operand" "Q")
(const_int 0)] UNSPEC_BIT_CONV))
(set (match_operand:SI 1 "nvptx_register_operand" "=R")
(unspec:SI [(match_dup 2) (const_int 1)] UNSPEC_BIT_CONV))]
""
- "%.\\tmov.b64\\t{%0,%1}, %2;")
+ "%.\\tmov.b64\\t{%0,%1}, %2;"
+ [(set_attr "subregs_ok" "true")])
;; pack 2 32-bit ints into a 64 bit object
(define_insn "packsi<mode>2"
- [(set (match_operand:BITD 0 "nvptx_register_operand" "=R")
+ [(set (match_operand:BITD 0 "register_operand" "=Q")
(unspec:BITD [(match_operand:SI 1 "nvptx_register_operand" "R")
(match_operand:SI 2 "nvptx_register_operand" "R")]
UNSPEC_BIT_CONV))]
""
- "%.\\tmov.b64\\t%0, {%1,%2};")
+ "%.\\tmov.b64\\t%0, {%1,%2};"
+ [(set_attr "subregs_ok" "true")])
;; Atomic insns.
...
we have instead:
...
mov.b64 {%r74,%r75}, %r65.x;
shfl.idx.b32 %r74, %r74, 0, 31;
shfl.idx.b32 %r75, %r75, 0, 31;
mov.b64 %r65.x, {%r74,%r75};
...
But for an ICE fix, I'd rather keep things simple.