Module: Mesa
Branch: main
Commit: f2bb7b185d846b1f157a3ee2f2c5d06840a39f0f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f2bb7b185d846b1f157a3ee2f2c5d06840a39f0f
Author: Daniel Schürmann <[email protected]>
Date:   Wed Nov 1 16:24:15 2023 +0100

aco: delete instruction selection for boolean subgroup operations

These are now lowered in NIR.

Reviewed-by: Georg Lehmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/218>

---

 src/amd/compiler/aco_instruction_selection.cpp | 199 +++----------------------
 1 file changed, 21 insertions(+), 178 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 9472d2e5193..c40f05ceb96 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -7633,143 +7633,6 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
    }
 }
 
-void
-emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src, Temp dst)
-{
-   Builder bld(ctx->program, ctx->block);
-   assert(dst.regClass() == bld.lm);
-
-   if (cluster_size == 1) {
-      bld.copy(Definition(dst), src);
-   }
-   if (op == nir_op_iand && cluster_size == 4) {
-      /* subgroupClusteredAnd(val, 4) -> ~wqm(~val & exec) */
-      Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
-      tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
-      bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc),
-               bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
-   } else if (op == nir_op_ior && cluster_size == 4) {
-      /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
-      bld.sop1(
-         Builder::s_wqm, Definition(dst), bld.def(s1, scc),
-         bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
-   } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
-      /* subgroupAnd(val) -> (~val & exec) == 0 */
-      Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
-      tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
-               .def(1)
-               .getTemp();
-      Temp cond = bool_to_vector_condition(ctx, tmp);
-      bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
-   } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
-      /* subgroupOr(val) -> (val & exec) != 0 */
-      Temp tmp =
-         bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
-            .def(1)
-            .getTemp();
-      bool_to_vector_condition(ctx, tmp, dst);
-   } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
-      /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
-      Temp tmp =
-         bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
-      tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
-      tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
-               .def(1)
-               .getTemp();
-      bool_to_vector_condition(ctx, tmp, dst);
-   } else {
-      /* subgroupClustered{And,Or,Xor}(val, n):
-       * lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
-       * cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1)
-       * subgroupClusteredAnd():
-       *    return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
-       * subgroupClusteredOr():
-       *    return ((val & exec) >> cluster_offset) & cluster_mask != 0
-       * subgroupClusteredXor():
-       *    return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
-       */
-      Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
-      Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
-                                     Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
-
-      Temp tmp;
-      if (op == nir_op_iand)
-         tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
-                        Operand(exec, bld.lm));
-      else
-         tmp =
-            bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
-
-      uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
-
-      if (ctx->program->gfx_level <= GFX7)
-         tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
-      else if (ctx->program->wave_size == 64)
-         tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
-      else
-         tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
-      tmp = emit_extract_vector(ctx, tmp, 0, v1);
-      if (cluster_mask != 0xffffffff)
-         tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
-
-      if (op == nir_op_iand) {
-         bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::c32(cluster_mask), tmp);
-      } else if (op == nir_op_ior) {
-         bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), tmp);
-      } else if (op == nir_op_ixor) {
-         tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
-                        bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
-         bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), tmp);
-      }
-   }
-}
-
-void
-emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src, Temp dst)
-{
-   Builder bld(ctx->program, ctx->block);
-   assert(src.regClass() == bld.lm);
-
-   /* subgroupExclusiveAnd(val) -> mbcnt(~val & exec) == 0
-    * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
-    * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
-    */
-   if (op == nir_op_iand)
-      src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
-
-   Temp tmp =
-      bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
-
-   Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
-
-   if (op == nir_op_iand)
-      bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::zero(), mbcnt);
-   else if (op == nir_op_ior)
-      bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), mbcnt);
-   else if (op == nir_op_ixor)
-      bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(),
-               bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
-}
-
-void
-emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src, Temp dst)
-{
-   Builder bld(ctx->program, ctx->block);
-
-   /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
-    * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
-    * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
-    */
-   Temp tmp = bld.tmp(bld.lm);
-   emit_boolean_exclusive_scan(ctx, op, src, tmp);
-   if (op == nir_op_iand)
-      bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, src);
-   else if (op == nir_op_ior)
-      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), tmp, src);
-   else if (op == nir_op_ixor)
-      bld.sop2(Builder::s_xor, Definition(dst), bld.def(s1, scc), tmp, src);
-}
-
 ReduceOp
 get_reduce_op(nir_op op, unsigned bit_size)
 {
@@ -8606,9 +8469,10 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
          instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
       cluster_size = util_next_power_of_two(
         MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
+      const unsigned bit_size = instr->src[0].ssa->bit_size;
+      assert(bit_size != 1);
 
-      if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
-          instr->def.bit_size != 1) {
+      if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size) {
          /* We use divergence analysis to assign the regclass, so check if it's
           * working as expected */
          ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
@@ -8624,47 +8488,26 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
          }
       }
 
-      if (instr->def.bit_size == 1) {
-         if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
-            op = nir_op_iand;
-         else if (op == nir_op_iadd)
-            op = nir_op_ixor;
-         else if (op == nir_op_umax || op == nir_op_imax)
-            op = nir_op_ior;
-         assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
-
-         switch (instr->intrinsic) {
-         case nir_intrinsic_reduce: emit_boolean_reduce(ctx, op, cluster_size, src, dst); break;
-         case nir_intrinsic_exclusive_scan: emit_boolean_exclusive_scan(ctx, op, src, dst); break;
-         case nir_intrinsic_inclusive_scan: emit_boolean_inclusive_scan(ctx, op, src, dst); break;
-         default: assert(false);
-         }
-      } else if (cluster_size == 1) {
-         bld.copy(Definition(dst), src);
-      } else {
-         unsigned bit_size = instr->src[0].ssa->bit_size;
-
-         src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
+      src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
+      ReduceOp reduce_op = get_reduce_op(op, bit_size);
 
-         ReduceOp reduce_op = get_reduce_op(op, bit_size);
-
-         aco_opcode aco_op;
-         switch (instr->intrinsic) {
-         case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
-         case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
-         case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
-         default: unreachable("unknown reduce intrinsic");
-         }
+      aco_opcode aco_op;
+      switch (instr->intrinsic) {
+      case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
+      case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
+      case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
+      default: unreachable("unknown reduce intrinsic");
+      }
+
+      /* Avoid whole wave shift. */
+      const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
+                                               (op == nir_op_iadd || op == nir_op_ixor) &&
+                                               dst.type() == RegType::vgpr;
+      if (use_inclusive_for_exclusive)
+         inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
+      else
+         emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
 
-         /* Avoid whole wave shift. */
-         const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
-                                                  (op == nir_op_iadd || op == nir_op_ixor) &&
-                                                  dst.type() == RegType::vgpr;
-         if (use_inclusive_for_exclusive)
-            inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
-         else
-            emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
-      }
       set_wqm(ctx);
       break;
    }
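For reference, the whole-wave and clustered boolean reductions that the deleted emit_boolean_reduce() produced follow from the lane-mask identities recorded in its comments: subgroupAnd(val) -> (~val & exec) == 0, subgroupOr(val) -> (val & exec) != 0, subgroupXor(val) -> popcount(val & exec) & 1. The sketch below is an editor's illustration of that math over a plain 64-bit ballot mask (wave64 assumed; the function names and the __builtin_popcountll helper are illustrative, not part of Mesa), not the NIR lowering that replaces this code:

#include <cstdint>

/* 'ballot' holds each lane's boolean value, 'exec' the set of active lanes. */

/* subgroupAnd(val): no active lane may be false, i.e. (~val & exec) == 0. */
bool subgroup_and(uint64_t ballot, uint64_t exec) { return (~ballot & exec) == 0; }

/* subgroupOr(val): some active lane is true, i.e. (val & exec) != 0. */
bool subgroup_or(uint64_t ballot, uint64_t exec) { return (ballot & exec) != 0; }

/* subgroupXor(val): parity of the active true lanes (s_bcnt1_i32_b64 + s_and_b32 above). */
bool subgroup_xor(uint64_t ballot, uint64_t exec)
{
   return __builtin_popcountll(ballot & exec) & 1;
}

/* subgroupClusteredAnd(val, n) as observed by 'lane_id' (n a power of two):
 * inactive lanes are treated as true by folding in ~exec, as the deleted
 * code did with s_orn2. */
bool subgroup_clustered_and(uint64_t ballot, uint64_t exec, unsigned lane_id, unsigned n)
{
   unsigned cluster_offset = ~(n - 1) & lane_id;
   uint64_t cluster_mask = n == 64 ? ~0ull : (1ull << n) - 1;
   return (((ballot | ~exec) >> cluster_offset) & cluster_mask) == cluster_mask;
}

The clustered Or and Xor variants differ only in masking with exec instead of folding in ~exec and in the final comparison.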

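The boolean scan identities from the deleted emit_boolean_exclusive_scan()/emit_boolean_inclusive_scan() can be modelled the same way; mbcnt here stands for what the v_mbcnt_lo/hi pair computes, the number of set bits strictly below the current lane. Again an editor's sketch under the same wave64 assumption, with illustrative names only:

#include <cstdint>

/* Number of set bits of 'mask' strictly below 'lane'. */
static unsigned mbcnt(uint64_t mask, unsigned lane)
{
   uint64_t below = lane ? ~0ull >> (64 - lane) : 0;
   return __builtin_popcountll(mask & below);
}

/* subgroupExclusiveAnd(val) -> mbcnt(~val & exec) == 0 */
bool excl_and(uint64_t ballot, uint64_t exec, unsigned lane) { return mbcnt(~ballot & exec, lane) == 0; }
/* subgroupExclusiveOr(val)  -> mbcnt(val & exec) != 0 */
bool excl_or(uint64_t ballot, uint64_t exec, unsigned lane) { return mbcnt(ballot & exec, lane) != 0; }
/* subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 */
bool excl_xor(uint64_t ballot, uint64_t exec, unsigned lane) { return mbcnt(ballot & exec, lane) & 1; }

/* Inclusive scans combine the exclusive result with the lane's own value,
 * as emit_boolean_inclusive_scan() did with s_and/s_or/s_xor. */
bool incl_and(uint64_t b, uint64_t e, unsigned l) { return excl_and(b, e, l) && ((b >> l) & 1); }
bool incl_or(uint64_t b, uint64_t e, unsigned l)  { return excl_or(b, e, l)  || ((b >> l) & 1); }
bool incl_xor(uint64_t b, uint64_t e, unsigned l) { return excl_xor(b, e, l) != bool((b >> l) & 1); }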