Jose Maria Casanova Crespo <[email protected]> writes: > We use the information of the registers read/write patterns > to improve variable liveness analysis avoiding extending the > liveness range of a variable to the beginning of the block so > it always reaches the beginning of the shader. > > This optimization analyses inside each block if a partial write > defines completely the bytes used by a following instruction > in the block. So we are not in the case of the use of an undefined > value in the block. > > This avoids almost all the spilling that happens with 8bit/16bit > storage tests, without any compilation performance impact for shader-db > execution that is compensated by spilling reductions. > > At this moment we don't extend the logic to intra-block calculations > of livein/liveout to not hurt performance on the general case because of > not taking advance of BITWORD operations. > > The execution time for running dEQP-VK.*8bit_storage.* tests is reduced > from 7m27.966s to 13.015s. > > shader-bd on SKL shows improvements reducing spilling on > deus-ex-mankind-divided and dophin without increasing execution time. > > total instructions in shared programs: 14867218 -> 14863959 (-0.02%) > instructions in affected programs: 121570 -> 118311 (-2.68%) > helped: 38 > HURT: 0 > > total cycles in shared programs: 537923248 -> 537720965 (-0.04%) > cycles in affected programs: 63154229 -> 62951946 (-0.32%) > helped: 61 > HURT: 26 > > total loops in shared programs: 4828 -> 4828 (0.00%) > loops in affected programs: 0 -> 0 > helped: 0 > HURT: 0 > > total spills in shared programs: 7790 -> 7375 (-5.33%) > spills in affected programs: 2824 -> 2409 (-14.70%) > helped: 35 > HURT: 0 > > total fills in shared programs: 10557 -> 10024 (-5.05%) > fills in affected programs: 3752 -> 3219 (-14.21%) > helped: 38 > HURT: 0 > > v2: - Use functions dst_write_pattern and src_read_pattern > introduced in previous patch at v2. > - Avoid calculating read_pattern if defpartial is 0 > > Cc: Francisco Jerez <[email protected]> > --- > src/intel/compiler/brw_fs_live_variables.cpp | 61 ++++++++++++-------- > src/intel/compiler/brw_fs_live_variables.h | 13 ++++- > 2 files changed, 47 insertions(+), 27 deletions(-) > > diff --git a/src/intel/compiler/brw_fs_live_variables.cpp > b/src/intel/compiler/brw_fs_live_variables.cpp > index 059f076fa51..d3559e3114f 100644 > --- a/src/intel/compiler/brw_fs_live_variables.cpp > +++ b/src/intel/compiler/brw_fs_live_variables.cpp > @@ -54,9 +54,9 @@ using namespace brw; > > void > fs_live_variables::setup_one_read(struct block_data *bd, fs_inst *inst, > - int ip, const fs_reg ®) > + int ip, int src, int reg_offset) > { > - int var = var_from_reg(reg); > + int var = var_from_reg(inst->src[src]) + reg_offset; > assert(var < num_vars); > > start[var] = MIN2(start[var], ip); > @@ -64,31 +64,48 @@ fs_live_variables::setup_one_read(struct block_data *bd, > fs_inst *inst, > > /* The use[] bitset marks when the block makes use of a variable (VGRF > * channel) without having completely defined that variable within the > - * block. > + * block. We take into account that a partial write could have defined > + * completely the read bytes in the block. > */ > - if (!BITSET_TEST(bd->def, var)) > - BITSET_SET(bd->use, var); > + if (!BITSET_TEST(bd->def, var)) { > + if (!bd->defpartial[var]) { > + BITSET_SET(bd->use, var);
Is there any measurable benefit from not executing the functionally
equivalent else block in this case?
> + } else {
> + unsigned read_pattern = inst->src_read_pattern(src, reg_offset);
> + if ((bd->defpartial[var] & read_pattern) != read_pattern)
> + BITSET_SET(bd->use, var);
> + }
> + }
> }
>
> void
> fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
> - int ip, const fs_reg ®)
> + int ip, int reg_offset)
> {
> - int var = var_from_reg(reg);
> + int var = var_from_reg(inst->dst) + reg_offset;
> assert(var < num_vars);
>
> start[var] = MIN2(start[var], ip);
> end[var] = MAX2(end[var], ip);
>
> /* The def[] bitset marks when an initialization in a block completely
> - * screens off previous updates of that variable (VGRF channel).
> + * screens off previous updates of that variable (VGRF channel). If
> + * we have a partial write now we store the write pattern so next
> + * reads in the block can check if what they read was completelly screened
"completely"
> + * of by this partial write.
"off"
> */
> - if (inst->dst.file == VGRF) {
> - if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
> + assert(inst->dst.file == VGRF);
> + if(!BITSET_TEST(bd->use, var)) {
> + if (!inst->is_partial_write()) {
> BITSET_SET(bd->def, var);
> -
> - BITSET_SET(bd->defout, var);
> + bd->defpartial[var] = ~0u;
Same question as above. Is there any measurable benefit from not
executing the functionally equivalent else block here when the if
condition evaluates to true?
> + } else {
> + bd->defpartial[var] |= inst->dst_write_pattern(reg_offset);
> + if (bd->defpartial[var] == ~0u)
> + BITSET_SET(bd->def, var);
> + }
> }
> + BITSET_SET(bd->defout, var);
> }
>
> /**
> @@ -115,14 +132,9 @@ fs_live_variables::setup_def_use()
> foreach_inst_in_block(fs_inst, inst, block) {
> /* Set use[] for this instruction */
> for (unsigned int i = 0; i < inst->sources; i++) {
> - fs_reg reg = inst->src[i];
> -
> - if (reg.file != VGRF)
> - continue;
> -
> - for (unsigned j = 0; j < regs_read(inst, i); j++) {
> - setup_one_read(bd, inst, ip, reg);
> - reg.offset += REG_SIZE;
> + if (inst->src[i].file == VGRF) {
> + for (unsigned j = 0; j < regs_read(inst, i); j++)
> + setup_one_read(bd, inst, ip, i, j);
> }
> }
>
> @@ -130,11 +142,8 @@ fs_live_variables::setup_def_use()
>
> /* Set def[] for this instruction */
> if (inst->dst.file == VGRF) {
> - fs_reg reg = inst->dst;
> - for (unsigned j = 0; j < regs_written(inst); j++) {
> - setup_one_write(bd, inst, ip, reg);
> - reg.offset += REG_SIZE;
> - }
> + for (unsigned j = 0; j < regs_written(inst); j++)
> + setup_one_write(bd, inst, ip, j);
> }
>
> if (!inst->predicate && inst->exec_size >= 8)
> @@ -281,6 +290,7 @@ fs_live_variables::fs_live_variables(fs_visitor *v, const
> cfg_t *cfg)
> bitset_words = BITSET_WORDS(num_vars);
> for (int i = 0; i < cfg->num_blocks; i++) {
> block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
> + block_data[i].defpartial = rzalloc_array(mem_ctx, unsigned, num_vars);
> block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
> block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD,
> bitset_words);
> block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD,
> bitset_words);
> @@ -319,6 +329,7 @@ fs_visitor::invalidate_live_intervals()
> void
> fs_visitor::calculate_live_intervals()
> {
> +
Stray whitespace.
> if (this->live_intervals)
> return;
>
> diff --git a/src/intel/compiler/brw_fs_live_variables.h
> b/src/intel/compiler/brw_fs_live_variables.h
> index 9e95e443170..5842b24f3e5 100644
> --- a/src/intel/compiler/brw_fs_live_variables.h
> +++ b/src/intel/compiler/brw_fs_live_variables.h
> @@ -44,6 +44,15 @@ struct block_data {
> */
> BITSET_WORD *def;
>
> + /**
> + * Which bytes of a variable are defined before being used in the block.
> + * Each bit of the 32-bit integer represents one of the register 32 bytes.
> + *
> + * As above, "defined" means unconditionally defined but at byte level.
> + */
> +
> + unsigned *defpartial;
> +
> /**
> * Which variables are used before being defined in the block.
> */
> @@ -115,9 +124,9 @@ public:
> protected:
> void setup_def_use();
> void setup_one_read(struct block_data *bd, fs_inst *inst, int ip,
> - const fs_reg ®);
> + int src, int reg_offset);
> void setup_one_write(struct block_data *bd, fs_inst *inst, int ip,
> - const fs_reg ®);
> + int reg_offset);
> void compute_live_variables();
> void compute_start_end();
>
> --
> 2.17.1
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
