fs: Define new shader opcodes to set rounding modes

Francisco Jerez Tue, 29 Aug 2017 12:34:18 -0700

Francisco Jerez <[email protected]> writes:

> Chema Casanova <[email protected]> writes:
>
>> El 25/08/17 a las 20:09, Francisco Jerez escribió:
>>> Alejandro Piñeiro <[email protected]> writes:
>>>
>>>> Although it is possible to emit them directly as AND/OR on brw_fs_nir,
>>>> having specific opcodes makes it easier to remove duplicate settings
>>>> later.
>>>>
>>>> Signed-off-by:  Alejandro Piñeiro <[email protected]>
>>>> Signed-off-by:  Jose Maria Casanova Crespo <[email protected]>
>>>> ---
>>>>  src/intel/compiler/brw_eu.h             |  3 +++
>>>>  src/intel/compiler/brw_eu_defines.h     |  9 +++++++++
>>>>  src/intel/compiler/brw_eu_emit.c        | 19 +++++++++++++++++++
>>>>  src/intel/compiler/brw_fs_generator.cpp |  8 ++++++++
>>>>  src/intel/compiler/brw_shader.cpp       |  5 +++++
>>>>  5 files changed, 44 insertions(+)
>>>>
>>>> diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
>>>> index a3a9c63239d..0a7f8020398 100644
>>>> --- a/src/intel/compiler/brw_eu.h
>>>> +++ b/src/intel/compiler/brw_eu.h
>>>> @@ -500,6 +500,9 @@ brw_broadcast(struct brw_codegen *p,
>>>>                struct brw_reg src,
>>>>                struct brw_reg idx);
>>>>  
>>>> +void
>>>> +brw_rounding_mode(struct brw_codegen *p,
>>>> +                  enum brw_rnd_mode mode);
>>>>  /***********************************************************************
>>>>   * brw_eu_util.c:
>>>>   */
>>>> diff --git a/src/intel/compiler/brw_eu_defines.h 
>>>> b/src/intel/compiler/brw_eu_defines.h
>>>> index 1af835d47ed..50435df2fcf 100644
>>>> --- a/src/intel/compiler/brw_eu_defines.h
>>>> +++ b/src/intel/compiler/brw_eu_defines.h
>>>> @@ -388,6 +388,9 @@ enum opcode {
>>>>     SHADER_OPCODE_TYPED_SURFACE_WRITE,
>>>>     SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
>>>>  
>>>> +   SHADER_OPCODE_RND_MODE_RTE,
>>>> +   SHADER_OPCODE_RND_MODE_RTZ,
>>>> +
>>> We don't need an opcode for each possible rounding mode (there's also RU
>>> and RD).  How about you add a single SHADER_OPCODE_RND_MODE opcode
>>> taking an immediate with the right rounding mode?
>> I like the proposal. It is better having a unique opcode for setting the
>> rounding mode. Changed for v2 of this patch.
>>> Also, you should be marking the rounding mode opcodes as
>>> has_side_effects(), because otherwise you're giving the scheduler the
>>> freedom of moving your rounding mode update instruction past the
>>> instruction you wanted it to have an effect on...
>> Good point; we had already realized that it was missing while debugging
>> the latency problem of the control register. It was hiding the problem by
>> re-scheduling the cr0 modification to the beginning of the shader.
>>>>     SHADER_OPCODE_MEMORY_FENCE,
>>>>  
>>>>     SHADER_OPCODE_GEN4_SCRATCH_READ,
>>>> @@ -1233,4 +1236,10 @@ enum brw_message_target {
>>>>  /* R0 */
>>>>  # define GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT                27
>>>>  
>>>> +enum PACKED brw_rnd_mode {
>>>> +   BRW_RND_MODE_UNSPECIFIED,
>>>> +   BRW_RND_MODE_RTE,
>>>> +   BRW_RND_MODE_RTZ,
>>> Since you're introducing a back-end-specific rounding mode enum already,
>>> why not use the hardware values right away so you avoid hard-coding
>>> magic constants below.
>> In the end we removed MODE_UNSPECIFIED, as it isn't really needed, and
>> added Round Up and Round Down to the enum, assigning the values given in
>> the PRM. We also renamed RTE to RTNE, the acronym used in the PRM, to
>> stay coherent with the nir conversion modifiers.
>>
>> +enum PACKED brw_rnd_mode {
>> +   BRW_RND_MODE_RTNE = 0,  /* Round to Nearest or Even */
>> +   BRW_RND_MODE_RU = 1,    /* Round Up, toward +inf */
>> +   BRW_RND_MODE_RD = 2,    /* Round Down, toward -inf */
>> +   BRW_RND_MODE_RTZ = 3    /* Round Toward Zero */
>> +};
>>
>> I have a question about how to avoid the magic constants before closing
>> v2 of this patch. One approach would be to keep the same code structure
>> and take advantage of the encoding of the rounding field. This way we
>> just use a formula and expect the C compiler's optimizer to work out
>> that the immediate value is a constant.
>>
>
> Hm, I'm not particularly worried about the C++ compiler behavior, it's
> most likely immaterial.
>
>> switch (mode) {
>>     case BRW_RND_MODE_RTZ:
>> -      inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
>> brw_imm_ud(0x00000030u));
>> +      inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
>> brw_imm_ud(((unsigned int) mode << 4)));
>>        break;
>> -   case BRW_RND_MODE_RTE:
>> -      inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
>> brw_imm_ud(0xffffffcfu));
>> +   case BRW_RND_MODE_RTNE:
>> +      inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
>> brw_imm_ud(((unsigned int) mode << 4) | ~0x00000030u));
>>        break;
>>     default:
>>
>
> I don't think this is particularly advantageous since you still need to
> special-case every possible rounding mode.
>
>> Another approach could be to implement a general solution for all
>> supported rounding modes by the hw including Round Up and Down using
>> bitwise operations.
>>
>> /**
>>  * Changes the floating point rounding mode by updating the control
>>  * register field at bits 4-5 of cr0.0 (mask 0x30). This function supports
>>  * changing to RTNE (00), RU (01), RD (10) and RTZ (11) rounding using
>>  * bitwise operations. Only RTNE and RTZ rounding are enabled at nir.
>>  */
>>
>> void
>> brw_rounding_mode(struct brw_codegen *p,
>>                   enum brw_rnd_mode mode)
>> {
>>
>>    const unsigned int mask = 0x0000000030u;
>>    const unsigned int enable_bits =  ((unsigned int) mode) << 4;
>>    const unsigned int disable_bits = enable_bits | ~mask;
>>
>>    /* Used by RTNE rounding to set field to 00 */
>>    if (disable_bits != ~0) {
>>       brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(disable_bits));
>>     }
>>    /* Used by RTZ rounding to set field to 11 */
>>     if (enable_bits) {
>>       brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(enable_bits));
>>     }
>> }
>>
>
> Looks good (except for the SWITCH thread control flag still missing),
> though you could simplify it slightly:
>
> | void
> | brw_rounding_mode(struct brw_codegen *p, enum brw_rnd_mode mode)
> | {
> |    const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
> |
> |    if (bits != BRW_RND_MODE_MASK) {


Hmm, for consistency with the SHIFT macro maybe name this
BRW_CR0_RND_MODE_MASK instead?

> |       brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
> |                                brw_imm_ud(~BRW_RND_MODE_MASK));
> |       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
> |    }
> |
> |    if (bits) {
> |       brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
> |                               brw_imm_ud(bits));
> |       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
> |    }
> | }
>
>
>> Any preference ?
>>
>> Thanks for the review.
>>
>> Chema
>>
>>>
>>>> +};
>>>> +
>>>>  #endif /* BRW_EU_DEFINES_H */
>>>> diff --git a/src/intel/compiler/brw_eu_emit.c 
>>>> b/src/intel/compiler/brw_eu_emit.c
>>>> index 0b0d67a5c56..07ad3d9384b 100644
>>>> --- a/src/intel/compiler/brw_eu_emit.c
>>>> +++ b/src/intel/compiler/brw_eu_emit.c
>>>> @@ -3723,3 +3723,22 @@ brw_WAIT(struct brw_codegen *p)
>>>>     brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
>>>>     brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
>>>>  }
>>>> +
>>>> +void
>>>> +brw_rounding_mode(struct brw_codegen *p,
>>>> +                  enum brw_rnd_mode mode)
>>>> +{
>>>> +   switch (mode) {
>>>> +   case BRW_RND_MODE_UNSPECIFIED:
>>>> +      /* nothing to do here */
>>>> +      break;
>>>> +   case BRW_RND_MODE_RTZ:
>>>> +      brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(0x00000030u));
>>>> +      break;
>>>> +   case BRW_RND_MODE_RTE:
>>>> +      brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), brw_imm_ud(0xffffffcfu));
>>> This has undefined behavior because the ALU instructions you use to set
>>> cr0 have non-zero latency, so the rounding mode change won't take effect
>>> till ~8 cycles after the instruction is issued.  Any instructions issued
>>> in that window will pick up the wrong rounding mode.  This is likely one
>>> of the reasons for your observations off-list regarding some shaders
>>> using the right or wrong rounding mode non-deterministically depending
>>> on the scheduler's behaviour.
>>>
>>> Here's a spec quote from the SKL PRM suggesting a workaround you should
>>> probably include in this commit:
>>>
>>> | Implementation Restriction on Register Access: When the control
>>> | register is used as an explicit source and/or destination, hardware
>>> | does not ensure execution pipeline coherency. Software must set the
>>> | thread control field to ‘switch’ for an instruction that uses control
>>> | register as an explicit operand. This is important as the control
>>> | register is an implicit source for most instructions. For example,
>>> | fields like FPMode and Accumulator Disable control the arithmetic
>>> | and/or logic instructions. Therefore, if the instruction updating the
>>> | control register doesn’t set ‘switch’, subsequent instructions may
>>> | have undefined results.
>>>
>>>
>>>> +      break;
>>>> +   default:
>>>> +      unreachable("Not reached");
>>>> +   }
>>>> +}
>>>> diff --git a/src/intel/compiler/brw_fs_generator.cpp 
>>>> b/src/intel/compiler/brw_fs_generator.cpp
>>>> index 2ade486705b..e0bd191ea7e 100644
>>>> --- a/src/intel/compiler/brw_fs_generator.cpp
>>>> +++ b/src/intel/compiler/brw_fs_generator.cpp
>>>> @@ -2139,6 +2139,14 @@ fs_generator::generate_code(const cfg_t *cfg, int 
>>>> dispatch_width)
>>>>           brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
>>>>           break;
>>>>  
>>>> +      case SHADER_OPCODE_RND_MODE_RTE:
>>>> +         brw_rounding_mode(p, BRW_RND_MODE_RTE);
>>>> +         break;
>>>> +
>>>> +      case SHADER_OPCODE_RND_MODE_RTZ:
>>>> +         brw_rounding_mode(p, BRW_RND_MODE_RTZ);
>>>> +         break;
>>>> +
>>>>        default:
>>>>           unreachable("Unsupported opcode");
>>>>  
>>>> diff --git a/src/intel/compiler/brw_shader.cpp 
>>>> b/src/intel/compiler/brw_shader.cpp
>>>> index c62b8ba6140..f22e204e262 100644
>>>> --- a/src/intel/compiler/brw_shader.cpp
>>>> +++ b/src/intel/compiler/brw_shader.cpp
>>>> @@ -486,6 +486,11 @@ brw_instruction_name(const struct gen_device_info 
>>>> *devinfo, enum opcode op)
>>>>        return "tes_add_indirect_urb_offset";
>>>>     case TES_OPCODE_GET_PRIMITIVE_ID:
>>>>        return "tes_get_primitive_id";
>>>> +
>>>> +   case SHADER_OPCODE_RND_MODE_RTE:
>>>> +      return "round_mode_rte";
>>>> +   case SHADER_OPCODE_RND_MODE_RTZ:
>>>> +      return "round_mode_rtz";
>>>>     }
>>>>  
>>>>     unreachable("not reached");
>>>> -- 
>>>> 2.11.0
>>>>
>>>> _______________________________________________
>>>> mesa-dev mailing list
>>>> [email protected]
>>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>>>
>>>>
>>>> _______________________________________________
>>>> mesa-dev mailing list
>>>> [email protected]
>>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

signature.asc
Description: PGP signature

_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 20/47] i965/fs: Define new shader opcodes to set rounding modes

Reply via email to