Avoid constant register reloads while emitting IBs by using a local write
pointer and only updating the size at the end of each helper.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@igalia.com>
---
 drivers/gpu/drm/amd/amdgpu/si_dma.c | 84 +++++++++++++++++------------
 1 file changed, 51 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c b/drivers/gpu/drm/amd/amdgpu/si_dma.c
index 7f18e4875287..9e26c7598d74 100644
--- a/drivers/gpu/drm/amd/amdgpu/si_dma.c
+++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c
@@ -323,14 +323,16 @@ static void si_dma_vm_copy_pte(struct amdgpu_ib *ib,
                               uint64_t pe, uint64_t src,
                               unsigned count)
 {
+       u32 *ptr = &ib->ptr[ib->length_dw];
        unsigned bytes = count * 8;
 
-       ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_COPY,
-                                             1, 0, 0, bytes);
-       ib->ptr[ib->length_dw++] = lower_32_bits(pe);
-       ib->ptr[ib->length_dw++] = lower_32_bits(src);
-       ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
-       ib->ptr[ib->length_dw++] = upper_32_bits(src) & 0xff;
+       *ptr++ = DMA_PACKET(DMA_PACKET_COPY, 1, 0, 0, bytes);
+       *ptr++ = lower_32_bits(pe);
+       *ptr++ = lower_32_bits(src);
+       *ptr++ = upper_32_bits(pe) & 0xff;
+       *ptr++ = upper_32_bits(src) & 0xff;
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -348,16 +350,19 @@ static void si_dma_vm_write_pte(struct amdgpu_ib *ib, uint64_t pe,
                                uint64_t value, unsigned count,
                                uint32_t incr)
 {
+       u32 *ptr = &ib->ptr[ib->length_dw];
        unsigned ndw = count * 2;
 
-       ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
-       ib->ptr[ib->length_dw++] = lower_32_bits(pe);
-       ib->ptr[ib->length_dw++] = upper_32_bits(pe);
+       *ptr++ = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
+       *ptr++ = lower_32_bits(pe);
+       *ptr++ = upper_32_bits(pe);
        for (; ndw > 0; ndw -= 2) {
-               ib->ptr[ib->length_dw++] = lower_32_bits(value);
-               ib->ptr[ib->length_dw++] = upper_32_bits(value);
+               *ptr++ = lower_32_bits(value);
+               *ptr++ = upper_32_bits(value);
                value += incr;
        }
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -377,6 +382,7 @@ static void si_dma_vm_set_pte_pde(struct amdgpu_ib *ib,
                                     uint64_t addr, unsigned count,
                                     uint32_t incr, uint64_t flags)
 {
+       u32 *ptr = &ib->ptr[ib->length_dw];
        uint64_t value;
        unsigned ndw;
 
@@ -391,19 +397,21 @@ static void si_dma_vm_set_pte_pde(struct amdgpu_ib *ib,
                        value = 0;
 
                /* for physically contiguous pages (vram) */
-               ib->ptr[ib->length_dw++] = DMA_PTE_PDE_PACKET(ndw);
-               ib->ptr[ib->length_dw++] = pe; /* dst addr */
-               ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
-               ib->ptr[ib->length_dw++] = lower_32_bits(flags); /* mask */
-               ib->ptr[ib->length_dw++] = upper_32_bits(flags);
-               ib->ptr[ib->length_dw++] = value; /* value */
-               ib->ptr[ib->length_dw++] = upper_32_bits(value);
-               ib->ptr[ib->length_dw++] = incr; /* increment size */
-               ib->ptr[ib->length_dw++] = 0;
+               *ptr++ = DMA_PTE_PDE_PACKET(ndw);
+               *ptr++ = pe; /* dst addr */
+               *ptr++ = upper_32_bits(pe) & 0xff;
+               *ptr++ = lower_32_bits(flags); /* mask */
+               *ptr++ = upper_32_bits(flags);
+               *ptr++ = value; /* value */
+               *ptr++ = upper_32_bits(value);
+               *ptr++ = incr; /* increment size */
+               *ptr++ = 0;
                pe += ndw * 4;
                addr += (ndw / 2) * incr;
                count -= ndw / 2;
        }
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -415,8 +423,12 @@ static void si_dma_vm_set_pte_pde(struct amdgpu_ib *ib,
  */
 static void si_dma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 {
-       while (ib->length_dw & 0x7)
-               ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0);
+       int pad = 8 - (ib->length_dw & 0x7); /* dwords up to the next multiple of 8 (8 if aligned) */
+
+       if (pad && pad < 8) { /* already aligned: nothing to pad */
+               memset32(&ib->ptr[ib->length_dw], DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0), pad); /* fill from the current end of the IB, not its start */
+               ib->length_dw += pad;
+       }
 }
 
 /**
@@ -783,12 +795,15 @@ static void si_dma_emit_copy_buffer(struct amdgpu_ib *ib,
                                       uint32_t byte_count,
                                       uint32_t copy_flags)
 {
-       ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_COPY,
-                                             1, 0, 0, byte_count);
-       ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
-       ib->ptr[ib->length_dw++] = lower_32_bits(src_offset);
-       ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset) & 0xff;
-       ib->ptr[ib->length_dw++] = upper_32_bits(src_offset) & 0xff;
+       u32 *ptr = &ib->ptr[ib->length_dw];
+
+       *ptr++ = DMA_PACKET(DMA_PACKET_COPY, 1, 0, 0, byte_count);
+       *ptr++ = lower_32_bits(dst_offset);
+       *ptr++ = lower_32_bits(src_offset);
+       *ptr++ = upper_32_bits(dst_offset) & 0xff;
+       *ptr++ = upper_32_bits(src_offset) & 0xff;
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 /**
@@ -806,11 +821,14 @@ static void si_dma_emit_fill_buffer(struct amdgpu_ib *ib,
                                       uint64_t dst_offset,
                                       uint32_t byte_count)
 {
-       ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_CONSTANT_FILL,
-                                             0, 0, 0, byte_count / 4);
-       ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
-       ib->ptr[ib->length_dw++] = src_data;
-       ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset) << 16;
+       u32 *ptr = &ib->ptr[ib->length_dw];
+
+       *ptr++ = DMA_PACKET(DMA_PACKET_CONSTANT_FILL, 0, 0, 0, byte_count / 4);
+       *ptr++ = lower_32_bits(dst_offset);
+       *ptr++ = src_data;
+       *ptr++ = upper_32_bits(dst_offset) << 16;
+
+       ib->length_dw = ptr - ib->ptr;
 }
 
 
-- 
2.48.0

Reply via email to