Francisco Jerez <[email protected]> writes: > Iago Toral <[email protected]> writes: > >> On Sat, 2018-12-29 at 12:39 -0800, Francisco Jerez wrote: >>> This legalization pass is meant to handle situations where the source >>> or destination regioning controls of an instruction are unsupported >>> by >>> the hardware and need to be lowered away into separate instructions. >>> This should be more reliable and future-proof than the current >>> approach of handling CHV/BXT restrictions manually all over the >>> visitor. The same mechanism is leveraged to lower unsupported type >>> conversions easily, which obsoletes the lower_conversions pass. >>> --- >>> src/intel/Makefile.sources | 1 + >>> src/intel/compiler/brw_fs.cpp | 5 +- >>> src/intel/compiler/brw_fs.h | 21 +- >>> src/intel/compiler/brw_fs_lower_regioning.cpp | 382 >>> ++++++++++++++++++ >>> src/intel/compiler/brw_ir_fs.h | 10 + >>> src/intel/compiler/meson.build | 1 + >>> 6 files changed, 401 insertions(+), 19 deletions(-) >>> create mode 100644 src/intel/compiler/brw_fs_lower_regioning.cpp >>> >>> diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources >>> index 5e7d32293b7..6b9874d2b80 100644 >>> --- a/src/intel/Makefile.sources >>> +++ b/src/intel/Makefile.sources >>> @@ -64,6 +64,7 @@ COMPILER_FILES = \ >>> compiler/brw_fs_live_variables.h \ >>> compiler/brw_fs_lower_conversions.cpp \ >>> compiler/brw_fs_lower_pack.cpp \ >>> + compiler/brw_fs_lower_regioning.cpp \ >>> compiler/brw_fs_nir.cpp \ >>> compiler/brw_fs_reg_allocate.cpp \ >>> compiler/brw_fs_register_coalesce.cpp \ >>> diff --git a/src/intel/compiler/brw_fs.cpp >>> b/src/intel/compiler/brw_fs.cpp >>> index 889509badab..caa7a798332 100644 >>> --- a/src/intel/compiler/brw_fs.cpp >>> +++ b/src/intel/compiler/brw_fs.cpp >>> @@ -6471,7 +6471,10 @@ fs_visitor::optimize() >>> OPT(dead_code_eliminate); >>> } >>> >>> - if (OPT(lower_conversions)) { >>> + progress = false; >>> + OPT(lower_conversions); >>> + OPT(lower_regioning); >>> + if (progress) { >>> OPT(opt_copy_propagation); >>> OPT(dead_code_eliminate); >>> OPT(lower_simd_width); >>> diff --git a/src/intel/compiler/brw_fs.h >>> b/src/intel/compiler/brw_fs.h >>> index dc36ecc21ac..36825754931 100644 >>> --- a/src/intel/compiler/brw_fs.h >>> +++ b/src/intel/compiler/brw_fs.h >>> @@ -164,6 +164,7 @@ public: >>> void lower_uniform_pull_constant_loads(); >>> bool lower_load_payload(); >>> bool lower_pack(); >>> + bool lower_regioning(); >>> bool lower_conversions(); >>> bool lower_logical_sends(); >>> bool lower_integer_multiplication(); >>> @@ -536,24 +537,8 @@ namespace brw { >>> } >>> } >>> >>> - /** >>> - * Remove any modifiers from the \p i-th source region of the >>> instruction, >>> - * including negate, abs and any implicit type conversion to the >>> execution >>> - * type. Instead any source modifiers will be implemented as a >>> separate >>> - * MOV instruction prior to the original instruction. >>> - */ >>> - inline bool >>> - lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >>> *inst, unsigned i) >>> - { >>> - assert(inst->components_read(i) == 1); >>> - const fs_builder ibld(v, block, inst); >>> - const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); >>> - >>> - ibld.MOV(tmp, inst->src[i]); >>> - inst->src[i] = tmp; >>> - >>> - return true; >>> - } >>> + bool >>> + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >>> *inst, unsigned i); >>> } >>> >>> void shuffle_from_32bit_read(const brw::fs_builder &bld, >>> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp >>> b/src/intel/compiler/brw_fs_lower_regioning.cpp >>> new file mode 100644 >>> index 00000000000..9578622401d >>> --- /dev/null >>> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp >>> @@ -0,0 +1,382 @@ >>> +/* >>> + * Copyright © 2018 Intel Corporation >>> + * >>> + * Permission is hereby granted, free of charge, to any person >>> obtaining a >>> + * copy of this software and associated documentation files (the >>> "Software"), >>> + * to deal in the Software without restriction, including without >>> limitation >>> + * the rights to use, copy, modify, merge, publish, distribute, >>> sublicense, >>> + * and/or sell copies of the Software, and to permit persons to whom >>> the >>> + * Software is furnished to do so, subject to the following >>> conditions: >>> + * >>> + * The above copyright notice and this permission notice (including >>> the next >>> + * paragraph) shall be included in all copies or substantial >>> portions of the >>> + * Software. >>> + * >>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >>> EXPRESS OR >>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >>> MERCHANTABILITY, >>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >>> EVENT SHALL >>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES >>> OR OTHER >>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, >>> ARISING >>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR >>> OTHER DEALINGS >>> + * IN THE SOFTWARE. >>> + */ >>> + >>> +#include "brw_fs.h" >>> +#include "brw_cfg.h" >>> +#include "brw_fs_builder.h" >>> + >>> +using namespace brw; >>> + >>> +namespace { >>> + /* From the SKL PRM Vol 2a, "Move": >>> + * >>> + * "A mov with the same source and destination type, no source >>> modifier, >>> + * and no saturation is a raw move. A packed byte destination >>> region (B >>> + * or UB type with HorzStride == 1 and ExecSize > 1) can only be >>> written >>> + * using raw move." >>> + */ >>> + bool >>> + is_byte_raw_mov(const fs_inst *inst) >>> + { >>> + return type_sz(inst->dst.type) == 1 && >>> + inst->opcode == BRW_OPCODE_MOV && >>> + inst->src[0].type == inst->dst.type && >>> + !inst->saturate && >>> + !inst->src[0].negate && >>> + !inst->src[0].abs; >>> + } >>> + >>> + /* >>> + * Return an acceptable byte stride for the destination of an >>> instruction >>> + * that requires it to have some particular alignment. >>> + */ >>> + unsigned >>> + required_dst_byte_stride(const fs_inst *inst) >>> + { >>> + if (type_sz(inst->dst.type) < get_exec_type_size(inst) && >>> + !is_byte_raw_mov(inst)) { >>> + return get_exec_type_size(inst); >>> + } else { >>> + unsigned stride = inst->dst.stride * type_sz(inst- >>> >dst.type); >>> + >>> + for (unsigned i = 0; i < inst->sources; i++) { >>> + if (!is_uniform(inst->src[i])) >>> + stride = MAX2(stride, inst->src[i].stride * >>> + type_sz(inst->src[i].type)); >>> + } >>> + >>> + return stride; >>> + } >>> + } >>> + >>> + /* >>> + * Return an acceptable byte sub-register offset for the >>> destination of an >>> + * instruction that requires it to be aligned to the sub-register >>> offset of >>> + * the sources. >>> + */ >>> + unsigned >>> + required_dst_byte_offset(const fs_inst *inst) >>> + { >>> + for (unsigned i = 0; i < inst->sources; i++) { >>> + if (!is_uniform(inst->src[i])) >>> + if (reg_offset(inst->src[i]) % REG_SIZE != >>> + reg_offset(inst->dst) % REG_SIZE) >>> + return 0; >>> + } >>> + >>> + return reg_offset(inst->dst) % REG_SIZE; >>> + } >>> + >>> + /* >>> + * Return whether the instruction has an unsupported channel bit >>> layout >>> + * specified for the i-th source region. >>> + */ >>> + bool >>> + has_invalid_src_region(const gen_device_info *devinfo, const >>> fs_inst *inst, >>> + unsigned i) >>> + { >>> + if (is_unordered(inst)) { >>> + return false; >>> + } else { >>> + const unsigned dst_byte_stride = inst->dst.stride * >>> type_sz(inst->dst.type); >>> + const unsigned src_byte_stride = inst->src[i].stride * >>> + type_sz(inst->src[i].type); >>> + const unsigned dst_byte_offset = reg_offset(inst->dst) % >>> REG_SIZE; >>> + const unsigned src_byte_offset = reg_offset(inst->src[i]) % >>> REG_SIZE; >>> + >>> + return has_dst_aligned_region_restriction(devinfo, inst) && >>> + !is_uniform(inst->src[i]) && >>> + (src_byte_stride != dst_byte_stride || >>> + src_byte_offset != dst_byte_offset); >>> + } >>> + } >>> + >>> + /* >>> + * Return whether the instruction has an unsupported channel bit >>> layout >>> + * specified for the destination region. >>> + */ >>> + bool >>> + has_invalid_dst_region(const gen_device_info *devinfo, >>> + const fs_inst *inst) >>> + { >>> + if (is_unordered(inst)) { >>> + return false; >>> + } else { >>> + const brw_reg_type exec_type = get_exec_type(inst); >>> + const unsigned dst_byte_offset = reg_offset(inst->dst) % >>> REG_SIZE; >>> + const unsigned dst_byte_stride = inst->dst.stride * >>> type_sz(inst->dst.type); >>> + const bool is_narrowing_conversion = !is_byte_raw_mov(inst) >>> && >>> + type_sz(inst->dst.type) < type_sz(exec_type); >>> + >>> + return (has_dst_aligned_region_restriction(devinfo, inst) >>> && >>> + (required_dst_byte_stride(inst) != dst_byte_stride >>> || >>> + required_dst_byte_offset(inst) != >>> dst_byte_offset)) || >>> + (is_narrowing_conversion && >>> + required_dst_byte_stride(inst) != dst_byte_stride); >>> + } >>> + } >>> + >>> + /* >>> + * Return whether the instruction has unsupported source >>> modifiers >>> + * specified for the i-th source region. >>> + */ >>> + bool >>> + has_invalid_src_modifiers(const gen_device_info *devinfo, const >>> fs_inst *inst, >>> + unsigned i) >>> + { >>> + return !inst->can_do_source_mods(devinfo) && >>> + (inst->src[i].negate || inst->src[i].abs); >>> + } >>> + >>> + /* >>> + * Return whether the instruction has an unsupported type >>> conversion >>> + * specified for the destination. >>> + */ >>> + bool >>> + has_invalid_conversion(const gen_device_info *devinfo, const >>> fs_inst *inst) >>> + { >>> + switch (inst->opcode) { >>> + case BRW_OPCODE_MOV: >>> + return false; >>> + case BRW_OPCODE_SEL: >>> + return inst->dst.type != get_exec_type(inst); >>> + case SHADER_OPCODE_BROADCAST: >>> + case SHADER_OPCODE_MOV_INDIRECT: >>> + /* The source and destination types of these may are hard- >>> coded to >>> + * integer at codegen time due to hardware limitations of >>> 64-bit >>> + * types. >>> + */ >>> + return ((devinfo->gen == 7 && !devinfo->is_haswell) || >>> + devinfo->is_cherryview || >>> gen_device_info_is_9lp(devinfo)) && >>> + type_sz(inst->src[0].type) > 4 && >>> + inst->dst.type != inst->src[0].type; >>> + default: >>> + /* FIXME: We assume the opcodes don't explicitly mentioned >>> before >>> + * just work fine with arbitrary conversions. >>> + */ >>> + return false; >>> + } >>> + } >>> + >>> + bool >>> + lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst); >>> +} >>> + >>> +namespace brw { >>> + /** >>> + * Remove any modifiers from the \p i-th source region of the >>> instruction, >>> + * including negate, abs and any implicit type conversion to the >>> execution >>> + * type. Instead any source modifiers will be implemented as a >>> separate >>> + * MOV instruction prior to the original instruction. >>> + */ >>> + bool >>> + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst >>> *inst, unsigned i) >>> + { >>> + assert(inst->components_read(i) == 1); >>> + const fs_builder ibld(v, block, inst); >>> + const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); >>> + >>> + lower_instruction(v, block, ibld.MOV(tmp, inst->src[i])); >>> + inst->src[i] = tmp; >>> + >>> + return true; >>> + } >>> +} >>> + >>> +namespace { >>> + /** >>> + * Remove any modifiers from the destination region of the >>> instruction, >>> + * including saturate, conditional mod and any implicit type >>> conversion >>> + * from the execution type. Instead any destination modifiers >>> will be >>> + * implemented as a separate MOV instruction after the original >>> + * instruction. >>> + */ >>> + bool >>> + lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst >>> *inst) >>> + { >>> + const fs_builder ibld(v, block, inst); >>> + const brw_reg_type type = get_exec_type(inst); >>> + /* Not strictly necessary, but if possible use a temporary >>> with the same >>> + * channel alignment as the current destination in order to >>> avoid >>> + * violating the restrictions enforced later on by >>> lower_src_region() >>> + * and lower_dst_region(), which would introduce additional >>> copy >>> + * instructions into the program unnecessarily. >>> + */ >>> + const unsigned stride = >>> + type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) >>> ? 1 : >>> + type_sz(inst->dst.type) * inst->dst.stride / type_sz(type); >>> + const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), >>> stride); >>> + >>> + /* Emit a MOV taking care of all the destination modifiers. */ >>> + fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp); >>> + mov->saturate = inst->saturate; >>> + mov->conditional_mod = inst->conditional_mod; >>> + if (inst->opcode != BRW_OPCODE_SEL) { >>> + mov->predicate = inst->predicate; >>> + mov->predicate_inverse = inst->predicate_inverse; >>> + } >>> + mov->flag_subreg = inst->flag_subreg; >>> + lower_instruction(v, block, mov); >>> + >>> + /* Point the original instruction at the temporary, and clean >>> up any >>> + * destination modifiers. >>> + */ >>> + assert(inst->size_written == inst->dst.component_size(inst- >>> >exec_size)); >>> + inst->dst = tmp; >>> + inst->size_written = inst->dst.component_size(inst- >>> >exec_size); >>> + inst->saturate = false; >>> + inst->conditional_mod = BRW_CONDITIONAL_NONE; >> >> I think this is not correct if if inst is a SEL instruction. Here is an >> example that is causing a regresion for me: >> >> With the original lower_conversions pass I have: >> >> sel.l(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B >> mov(8) vgrf16+0.0<2>:B, vgrf15+0.0:W >> >> Now, with this pass this is turned into: >> >> sel(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B >> mov.l.f0.0(8) vgrf16+0.0<2>:B, vgrf15+0.0:W >> >> So I guess the SEL instruction no longer works since it is not >> predicated and we have removed the conditional modifier as well. >> >> Maybe only move the conditional modifier to the MOV when the >> instruction is not a SEL like we do for the predicate? >> > > Yeah good point, the semantics of conditional mods are inconsistent for > the SEL instruction -- Fixed up locally in the same way predicates are > handled for SEL in the same function. >
It just occurred to me that there are a couple other ISA instructions
with non-standard semantics for the conditional mod that need to be
special-cased here as well. I'll reply with a fix in a minute.
>>> + return true;
>>> + }
>>> +
>>> + /**
>>> + * Remove any non-trivial shuffling of data from the \p i-th
>>> source region
>>> + * of the instruction. Instead implement the region as a series
>>> of integer
>>> + * copies into a temporary with the same channel layout as the
>>> destination.
>>> + */
>>> + bool
>>> + lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst,
>>> unsigned i)
>>> + {
>>> + assert(inst->components_read(i) == 1);
>>> + const fs_builder ibld(v, block, inst);
>>> + const unsigned stride = type_sz(inst->dst.type) * inst-
>>> >dst.stride /
>>> + type_sz(inst->src[i].type);
>>> + assert(stride > 0);
>>> + const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type,
>>> stride),
>>> + stride);
>>> +
>>> + /* Emit a series of 32-bit integer copies with any source
>>> modifiers
>>> + * cleaned up (because their semantics are dependent on the
>>> type).
>>> + */
>>> + const brw_reg_type raw_type =
>>> brw_int_type(MIN2(type_sz(tmp.type), 4),
>>> + false);
>>> + const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
>>> + fs_reg raw_src = inst->src[i];
>>> + raw_src.negate = false;
>>> + raw_src.abs = false;
>>> +
>>> + for (unsigned j = 0; j < n; j++)
>>> + ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src,
>>> raw_type, j));
>>> +
>>> + /* Point the original instruction at the temporary, making
>>> sure to keep
>>> + * any source modifiers in the instruction.
>>> + */
>>> + fs_reg lower_src = tmp;
>>> + lower_src.negate = inst->src[i].negate;
>>> + lower_src.abs = inst->src[i].abs;
>>> + inst->src[i] = lower_src;
>>> +
>>> + return true;
>>> + }
>>> +
>>> + /**
>>> + * Remove any non-trivial shuffling of data from the destination
>>> region of
>>> + * the instruction. Instead implement the region as a series of
>>> integer
>>> + * copies from a temporary with a channel layout compatible with
>>> the
>>> + * sources.
>>> + */
>>> + bool
>>> + lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
>>> + {
>>> + const fs_builder ibld(v, block, inst);
>>> + const unsigned stride = required_dst_byte_stride(inst) /
>>> + type_sz(inst->dst.type);
>>> + assert(stride > 0);
>>> + const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type,
>>> stride),
>>> + stride);
>>> +
>>> + /* Emit a series of 32-bit integer copies from the temporary
>>> into the
>>> + * original destination.
>>> + */
>>> + const brw_reg_type raw_type =
>>> brw_int_type(MIN2(type_sz(tmp.type), 4),
>>> + false);
>>> + const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
>>> +
>>> + if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
>>> + /* Note that in general we cannot simply predicate the
>>> copies on the
>>> + * same flag register as the original instruction, since it
>>> may have
>>> + * been overwritten by the instruction itself. Instead
>>> initialize
>>> + * the temporary with the previous contents of the
>>> destination
>>> + * register.
>>> + */
>>> + for (unsigned j = 0; j < n; j++)
>>> + ibld.MOV(subscript(tmp, raw_type, j),
>>> + subscript(inst->dst, raw_type, j));
>>> + }
>>> +
>>> + for (unsigned j = 0; j < n; j++)
>>> + ibld.at(block, inst->next).MOV(subscript(inst->dst,
>>> raw_type, j),
>>> + subscript(tmp, raw_type,
>>> j));
>>> +
>>> + /* Point the original instruction at the temporary, making
>>> sure to keep
>>> + * any destination modifiers in the instruction.
>>> + */
>>> + assert(inst->size_written == inst->dst.component_size(inst-
>>> >exec_size));
>>> + inst->dst = tmp;
>>> + inst->size_written = inst->dst.component_size(inst-
>>> >exec_size);
>>> +
>>> + return true;
>>> + }
>>> +
>>> + /**
>>> + * Legalize the source and destination regioning controls of the
>>> specified
>>> + * instruction.
>>> + */
>>> + bool
>>> + lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
>>> + {
>>> + const gen_device_info *devinfo = v->devinfo;
>>> + bool progress = false;
>>> +
>>> + if (has_invalid_conversion(devinfo, inst))
>>> + progress |= lower_dst_modifiers(v, block, inst);
>>> +
>>> + if (has_invalid_dst_region(devinfo, inst))
>>> + progress |= lower_dst_region(v, block, inst);
>>> +
>>> + for (unsigned i = 0; i < inst->sources; i++) {
>>> + if (has_invalid_src_modifiers(devinfo, inst, i))
>>> + progress |= lower_src_modifiers(v, block, inst, i);
>>> +
>>> + if (has_invalid_src_region(devinfo, inst, i))
>>> + progress |= lower_src_region(v, block, inst, i);
>>> + }
>>> +
>>> + return progress;
>>> + }
>>> +}
>>> +
>>> +bool
>>> +fs_visitor::lower_regioning()
>>> +{
>>> + bool progress = false;
>>> +
>>> + foreach_block_and_inst(block, fs_inst, inst, cfg)
>>> + progress |= lower_instruction(this, block, inst);
>>> +
>>> + if (progress)
>>> + invalidate_live_intervals();
>>> +
>>> + return progress;
>>> +}
>>> diff --git a/src/intel/compiler/brw_ir_fs.h
>>> b/src/intel/compiler/brw_ir_fs.h
>>> index 5bb92e4cc86..3c23fb375e4 100644
>>> --- a/src/intel/compiler/brw_ir_fs.h
>>> +++ b/src/intel/compiler/brw_ir_fs.h
>>> @@ -486,6 +486,16 @@ get_exec_type_size(const fs_inst *inst)
>>> return type_sz(get_exec_type(inst));
>>> }
>>>
>>> +/**
>>> + * Return whether the instruction isn't an ALU instruction and
>>> cannot be
>>> + * assumed to complete in-order.
>>> + */
>>> +static inline bool
>>> +is_unordered(const fs_inst *inst)
>>> +{
>>> + return inst->mlen || inst->is_send_from_grf() || inst->is_math();
>>> +}
>>> +
>>> /**
>>> * Return whether the following regioning restriction applies to the
>>> specified
>>> * instruction. From the Cherryview PRM Vol 7. "Register Region
>>> diff --git a/src/intel/compiler/meson.build
>>> b/src/intel/compiler/meson.build
>>> index 69ce2eab4cf..4af134b418e 100644
>>> --- a/src/intel/compiler/meson.build
>>> +++ b/src/intel/compiler/meson.build
>>> @@ -57,6 +57,7 @@ libintel_compiler_files = files(
>>> 'brw_fs_live_variables.h',
>>> 'brw_fs_lower_conversions.cpp',
>>> 'brw_fs_lower_pack.cpp',
>>> + 'brw_fs_lower_regioning.cpp',
>>> 'brw_fs_nir.cpp',
>>> 'brw_fs_reg_allocate.cpp',
>>> 'brw_fs_register_coalesce.cpp',
> _______________________________________________
> mesa-dev mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
