Hi Andrew,
Andrew Stubbs wrote:
+static bool
+gcn_v_cmp_insn_p (attr_type type)
+{
+ return type == TYPE_VOPC || type == TYPE_VOP3A;
}
There are many vop3a encoded instructions. I don't understand how this
uniquely identifies v_cmp instructions?
It doesn't - how about an attribute variant?
+ /* CDNA3: VALU writes VGPR/VCC: v_readlane, v_readfirstlane,
v_cmp,
+ v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by:
+ - VALU reads SGPR as constant requires 1 waite state
+ - VALU reads SGPR as carry-in requires no waite state
+ - v_readlane/v_writelane reads SGPR as lane select requires
4 wait
+ states. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && prev_insn->unit == UNIT_VECTOR
+ && (get_attr_laneselect (prev_insn->insn) == LANESELECT_READ
+ || gcn_v_cmp_insn_p (prev_insn->type)
+ || prev_insn->type == TYPE_VOP2
+ || prev_insn->type == TYPE_VOP3B)
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_REGS]))
Is it necessary to check all those attributes? "prev_insn->unit ==
UNIT_VECTOR" together with the register dependency check is enough to
establish "VALU reads SGPR". Is this attempting to rule out the
carry-in instructions?
The depregs is to check whether the write followed by read affects the
same SGPR register. – I have not tried to handle the kind of register
usage so far (carry in vs. constant; lane-select vs other use).
Otherwise, idea is to handle 'VALU writes' by the listed insn, only. I
am not sure whether restricting it to the subset of "v_readlane,
v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale" makes
sense or not – as that list covers a large portion of the VALU write insns.
+ if (get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1)
+ nops_rqd = 1 - prev_insn->age;
This is safe, but I don't think it actually determines if the value is
use *as* laneselect (not for write, anyway). I might revisit this
stuff at some point, but it's fine for now.
True, albeit that's already preexisting for all lane-select checks (one
for all CDNA and 3 additional cases for CDNA, one of which is the one
above).
Attached is an updated patch (as RFC – or is it by chance already okay?).
Tobias
gcn: Add more s_nop for MI300
Implement another case where the CDNA3 ISA documentation requires s_nop,
add a comment why another case does not need to be handled. And add one
case where an s_nop is required by MI300A hardware but seems to be not
mentioned in the CDNA3 ISA documentation.
gcc/ChangeLog:
* gcn.md (define_attr "vcmp"): Add with values vcmp/vcmpx/no.
(*movbi, cstoredi4.., cstore<mode>4): Set it.
* gcn-valu.md (vec_cmp<mode>...): Likewise.
* config/gcn/gcn.cc (gcn_cmpx_insn_p): Remove.
(gcn_md_reorg): Add two new conditions for MI300.
gcc/config/gcn/gcn-valu.md | 4 +++
gcc/config/gcn/gcn.cc | 83 +++++++++++++++++++++++++---------------------
gcc/config/gcn/gcn.md | 9 +++++
3 files changed, 58 insertions(+), 38 deletions(-)
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 09943293293..a34d2e30c97 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -3938,6 +3938,7 @@ (define_insn "vec_cmp<mode>di"
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -3992,6 +3993,7 @@ (define_insn "vec_cmp<mode>di_exec"
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -4050,6 +4052,7 @@ (define_insn "vec_cmp<mode>di_dup"
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
@@ -4073,6 +4076,7 @@ (define_insn "vec_cmp<mode>di_dup_exec"
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 75b0936dce8..fe265dded13 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5792,42 +5792,6 @@ gcn_libc_has_function (enum function_class fn_class,
/* }}} */
/* {{{ md_reorg pass. */
-/* Identify V_CMPX from the "type" attribute;
- note: this will also match 'v_cmp %E1 vcc'. */
-
-static bool
-gcn_cmpx_insn_p (attr_type type)
-{
- switch (type)
- {
- case TYPE_VOPC:
- return true;
- case TYPE_MUBUF:
- case TYPE_MTBUF:
- case TYPE_FLAT:
- case TYPE_VOP3P_MAI:
- case TYPE_UNKNOWN:
- case TYPE_SOP1:
- case TYPE_SOP2:
- case TYPE_SOPK:
- case TYPE_SOPC:
- case TYPE_SOPP:
- case TYPE_SMEM:
- case TYPE_DS:
- case TYPE_VOP2:
- case TYPE_VOP1:
- case TYPE_VOP3A:
- case TYPE_VOP3B:
- case TYPE_VOP_SDWA:
- case TYPE_VOP_DPP:
- case TYPE_MULT:
- case TYPE_VMULT:
- return false;
- }
- gcc_unreachable ();
- return false;
-}
-
/* Identify VMEM instructions from their "type" attribute. */
static bool
@@ -6356,19 +6320,62 @@ gcn_md_reorg (void)
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
nops_rqd = ivccwait - prev_insn->age;
+ /* NOTE: The following condition for adding wait state exists, but
+ GCC does not access the special registers using their SGPR#.
+ Thus, no action is required here. The following wait-state
+ condition exists at least for VEGA/gfx900+ to CDNA3:
+ Mixed use of VCC: alias vs. SGPR# - v_readlane,
+ v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale
+ followed by VALU reads VCC as constant requires 1 wait state.
+ (As carry-in, it requires none.)
+ [VCC can be accessed by name or logical SGPR that holds it.] */
+
+ /* Testing indicates that CDNA3 requires an s_nop between
+ e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'.
+ Thus: add it between v_cmp writing VCC and VALU read of VCC. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && iunit == UNIT_VECTOR
+ && (hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG]))
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMP)
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU writes VGPR/VCC: v_readlane, v_readfirstlane, v_cmp,
+ v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by:
+ - VALU reads SGPR as constant requires 1 waite state
+ - VALU reads SGPR as carry-in requires no waite state
+ - v_readlane/v_writelane reads SGPR as lane select requires 4 wait
+ states. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && prev_insn->unit == UNIT_VECTOR
+ && (get_attr_laneselect (prev_insn->insn) == LANESELECT_READ
+ || get_attr_vcmp (prev_insn->insn) == VCMP_VCMP
+ || prev_insn->type == TYPE_VOP2
+ || prev_insn->type == TYPE_VOP3B)
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_REGS]))
+ {
+ if (get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1)
+ nops_rqd = 1 - prev_insn->age;
+ }
+
/* CDNA3: v_cmpx followed by
- V_readlane, v_readfirstlane, v_writelane requires 4 wait states
- VALU reads EXEC as constant requires 2 wait states
- other VALU requires no wait state */
if (TARGET_CDNA3_NOPS
&& (prev_insn->age + nops_rqd) < 4
- && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
&& get_attr_laneselect (insn) != LANESELECT_NO)
nops_rqd = 4 - prev_insn->age;
else if (TARGET_CDNA3_NOPS
&& (prev_insn->age + nops_rqd) < 2
&& iunit == UNIT_VECTOR
- && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
&& TEST_HARD_REG_BIT (ireads, EXECZ_REG))
nops_rqd = 2 - prev_insn->age;
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 9172db08b2e..67a45edba36 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -324,6 +324,11 @@ (define_attr "flatmemaccess"
"store,storex34,load,atomic,atomicwait,cmpswapx2,no"
(const_string "no"))
+; Identify v_cmp and v_cmpx instructions for "Manually Inserted Wait State"
+; handling.
+
+(define_attr "vcmp" "vcmp,vcmpx,no" (const_string "no"))
+
; Identify instructions that require "Manually Inserted Wait State" if
; a previous instruction writes to VCC. The number gives the number of NOPs.
@@ -575,6 +580,7 @@ (define_insn "*movbi"
[(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
flat,flat,flat,flat")
(set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store")
+ (set_attr "vcmp" "*,*,*,*,vcmp,*,*,*,*,*,*,*,*,*,*")
(set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
(set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
(set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")
@@ -1098,6 +1104,7 @@ (define_insn "cstoredi4_vec_and_scalar"
s_cmp%D1\t%2, %3
v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "sopc,vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_insn "cstoredi4_vector"
@@ -1108,6 +1115,7 @@ (define_insn "cstoredi4_vector"
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranchdi4"
@@ -1134,6 +1142,7 @@ (define_insn "cstore<mode>4"
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranch<mode>4"