Avoid constant register reloads while emitting IBs by using a local write pointer and only updating the size at the end of each helper.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 105 +++++++++++++++-----------
 1 file changed, 63 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index 9e8715b4739d..bf3049200fcd 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -718,16 +718,18 @@ static void cik_sdma_vm_copy_pte(struct amdgpu_ib *ib,
 				 uint64_t pe, uint64_t src,
 				 unsigned count)
 {
+	u32 *ptr = &ib->ptr[ib->length_dw];
 	unsigned bytes = count * 8;
 
-	ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_COPY,
-					       SDMA_WRITE_SUB_OPCODE_LINEAR, 0);
-	ib->ptr[ib->length_dw++] = bytes;
-	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
-	ib->ptr[ib->length_dw++] = lower_32_bits(src);
-	ib->ptr[ib->length_dw++] = upper_32_bits(src);
-	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
-	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
+	*ptr++ = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_WRITE_SUB_OPCODE_LINEAR, 0);
+	*ptr++ = bytes;
+	*ptr++ = 0; /* src/dst endian swap */
+	*ptr++ = lower_32_bits(src);
+	*ptr++ = upper_32_bits(src);
+	*ptr++ = lower_32_bits(pe);
+	*ptr++ = upper_32_bits(pe);
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -745,18 +747,21 @@ static void cik_sdma_vm_write_pte(struct amdgpu_ib *ib, uint64_t pe,
 				  uint64_t value, unsigned count,
 				  uint32_t incr)
 {
+	u32 *ptr = &ib->ptr[ib->length_dw];
 	unsigned ndw = count * 2;
 
-	ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_WRITE,
-					       SDMA_WRITE_SUB_OPCODE_LINEAR, 0);
-	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
-	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
-	ib->ptr[ib->length_dw++] = ndw;
+	*ptr++ = SDMA_PACKET(SDMA_OPCODE_WRITE,
+			     SDMA_WRITE_SUB_OPCODE_LINEAR, 0);
+	*ptr++ = lower_32_bits(pe);
+	*ptr++ = upper_32_bits(pe);
+	*ptr++ = ndw;
 	for (; ndw > 0; ndw -= 2) {
-		ib->ptr[ib->length_dw++] = lower_32_bits(value);
-		ib->ptr[ib->length_dw++] = upper_32_bits(value);
+		*ptr++ = lower_32_bits(value);
+		*ptr++ = upper_32_bits(value);
 		value += incr;
 	}
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -775,17 +780,21 @@ static void cik_sdma_vm_set_pte_pde(struct amdgpu_ib *ib, uint64_t pe,
 				    uint64_t addr, unsigned count,
 				    uint32_t incr, uint64_t flags)
 {
+	u32 *ptr = &ib->ptr[ib->length_dw];
+
 	/* for physically contiguous pages (vram) */
-	ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_GENERATE_PTE_PDE, 0, 0);
-	ib->ptr[ib->length_dw++] = lower_32_bits(pe); /* dst addr */
-	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
-	ib->ptr[ib->length_dw++] = lower_32_bits(flags); /* mask */
-	ib->ptr[ib->length_dw++] = upper_32_bits(flags);
-	ib->ptr[ib->length_dw++] = lower_32_bits(addr); /* value */
-	ib->ptr[ib->length_dw++] = upper_32_bits(addr);
-	ib->ptr[ib->length_dw++] = incr; /* increment size */
-	ib->ptr[ib->length_dw++] = 0;
-	ib->ptr[ib->length_dw++] = count; /* number of entries */
+	*ptr++ = SDMA_PACKET(SDMA_OPCODE_GENERATE_PTE_PDE, 0, 0);
+	*ptr++ = lower_32_bits(pe); /* dst addr */
+	*ptr++ = upper_32_bits(pe);
+	*ptr++ = lower_32_bits(flags); /* mask */
+	*ptr++ = upper_32_bits(flags);
+	*ptr++ = lower_32_bits(addr); /* value */
+	*ptr++ = upper_32_bits(addr);
+	*ptr++ = incr; /* increment size */
+	*ptr++ = 0;
+	*ptr++ = count; /* number of entries */
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -798,18 +807,22 @@ static void cik_sdma_vm_set_pte_pde(struct amdgpu_ib *ib, uint64_t pe,
 static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
 	struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
+	u32 *ptr = &ib->ptr[ib->length_dw];
 	u32 pad_count;
 	int i;
 
 	pad_count = (-ib->length_dw) & 7;
+	if (!pad_count)
+		return;
+
 	for (i = 0; i < pad_count; i++)
 		if (sdma && sdma->burst_nop && (i == 0))
-			ib->ptr[ib->length_dw++] =
-				SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0) |
-				SDMA_NOP_COUNT(pad_count - 1);
+			*ptr++ = SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0) |
+				 SDMA_NOP_COUNT(pad_count - 1);
 		else
-			ib->ptr[ib->length_dw++] =
-				SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0);
+			*ptr++ = SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0);
+
+	ib->length_dw += pad_count;
 }
 
 /**
@@ -1290,13 +1303,17 @@ static void cik_sdma_emit_copy_buffer(struct amdgpu_ib *ib,
 				      uint32_t byte_count,
 				      uint32_t copy_flags)
 {
-	ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0);
-	ib->ptr[ib->length_dw++] = byte_count;
-	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
-	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset);
-	ib->ptr[ib->length_dw++] = upper_32_bits(src_offset);
-	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
-	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
+	u32 *ptr = &ib->ptr[ib->length_dw];
+
+	*ptr++ = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0);
+	*ptr++ = byte_count;
+	*ptr++ = 0; /* src/dst endian swap */
+	*ptr++ = lower_32_bits(src_offset);
+	*ptr++ = upper_32_bits(src_offset);
+	*ptr++ = lower_32_bits(dst_offset);
+	*ptr++ = upper_32_bits(dst_offset);
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -1314,11 +1331,15 @@ static void cik_sdma_emit_fill_buffer(struct amdgpu_ib *ib,
 				      uint64_t dst_offset,
 				      uint32_t byte_count)
 {
-	ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_CONSTANT_FILL, 0, 0);
-	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
-	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset);
-	ib->ptr[ib->length_dw++] = src_data;
-	ib->ptr[ib->length_dw++] = byte_count;
+	u32 *ptr = &ib->ptr[ib->length_dw];
+
+	*ptr++ = SDMA_PACKET(SDMA_OPCODE_CONSTANT_FILL, 0, 0);
+	*ptr++ = lower_32_bits(dst_offset);
+	*ptr++ = upper_32_bits(dst_offset);
+	*ptr++ = src_data;
+	*ptr++ = byte_count;
+
+	ib->length_dw = ptr - ib->ptr;
 }
 
 static const struct amdgpu_buffer_funcs cik_sdma_buffer_funcs = {
-- 
2.48.0