For the series: Tested-by: Edmondo Tommasina <edmondo.tommas...@gmail.com>
The Talos Principle benchmark seems to be about 0.5% faster. Thanks edmondo On Fri, Jan 27, 2017 at 12:02 PM, Marek Olšák <mar...@gmail.com> wrote: > From: Marek Olšák <marek.ol...@amd.com> > > also remove the BIND flags > --- > src/gallium/drivers/r600/r600_state_common.c | 14 +++++++++----- > src/gallium/drivers/radeon/r600_buffer_common.c | 6 ++++-- > src/gallium/drivers/radeon/r600_pipe_common.c | 15 +++++++-------- > src/gallium/drivers/radeon/r600_pipe_common.h | 1 - > src/gallium/drivers/radeonsi/si_compute.c | 8 ++++---- > src/gallium/drivers/radeonsi/si_descriptors.c | 7 ++++--- > src/gallium/drivers/radeonsi/si_state_draw.c | 6 ++++-- > 7 files changed, 32 insertions(+), 25 deletions(-) > > diff --git a/src/gallium/drivers/r600/r600_state_common.c > b/src/gallium/drivers/r600/r600_state_common.c > index c8502e9..9ff2364 100644 > --- a/src/gallium/drivers/r600/r600_state_common.c > +++ b/src/gallium/drivers/r600/r600_state_common.c > @@ -1086,24 +1086,27 @@ static void r600_set_constant_buffer(struct > pipe_context *ctx, uint shader, uint > > if (!(tmpPtr = malloc(size))) { > R600_ERR("Failed to allocate BE swap > buffer.\n"); > return; > } > > for (i = 0; i < size / 4; ++i) { > tmpPtr[i] = util_cpu_to_le32(((uint32_t > *)ptr)[i]); > } > > - u_upload_data(rctx->b.uploader, 0, size, 256, tmpPtr, > &cb->buffer_offset, &cb->buffer); > + u_upload_data(ctx->stream_uploader, 0, size, 256, > + tmpPtr, &cb->buffer_offset, > &cb->buffer); > free(tmpPtr); > } else { > - u_upload_data(rctx->b.uploader, 0, > input->buffer_size, 256, ptr, &cb->buffer_offset, &cb->buffer); > + u_upload_data(ctx->stream_uploader, 0, > + input->buffer_size, 256, ptr, > + &cb->buffer_offset, &cb->buffer); > } > /* account it in gtt */ > rctx->b.gtt += input->buffer_size; > } else { > /* Setup the hw buffer. */ > cb->buffer_offset = input->buffer_offset; > pipe_resource_reference(&cb->buffer, input->buffer); > r600_context_add_resource_size(ctx, input->buffer); > } > > @@ -1729,42 +1732,43 @@ static void r600_draw_vbo(struct pipe_context *ctx, > const struct pipe_draw_info > data += info.indirect_offset / > sizeof(unsigned); > start = data[2] * ib.index_size; > count = data[0]; > } > else { > start = 0; > count = 0; > } > } > > - u_upload_alloc(rctx->b.uploader, start, count * 2, > 256, > - &out_offset, &out_buffer, &ptr); > + u_upload_alloc(ctx->stream_uploader, start, count * 2, > + 256, &out_offset, &out_buffer, &ptr); > > util_shorten_ubyte_elts_to_userptr( > &rctx->b.b, &ib, 0, ib.offset > + start, count, ptr); > > pipe_resource_reference(&ib.buffer, NULL); > ib.user_buffer = NULL; > ib.buffer = out_buffer; > ib.offset = out_offset; > ib.index_size = 2; > } > > /* Upload the index buffer. > * The upload is skipped for small index counts on > little-endian machines > * and the indices are emitted via PKT3_DRAW_INDEX_IMMD. > * Indirect draws never use immediate indices. > * Note: Instanced rendering in combination with immediate > indices hangs. */ > if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect || > info.instance_count > 1 || > info.count*ib.index_size > > 20)) { > - u_upload_data(rctx->b.uploader, 0, info.count * > ib.index_size, 256, > + u_upload_data(ctx->stream_uploader, 0, > + info.count * ib.index_size, 256, > ib.user_buffer, &ib.offset, &ib.buffer); > ib.user_buffer = NULL; > } > } else { > info.index_bias = info.start; > } > > /* Set the index offset and primitive restart. */ > if (rctx->vgt_state.vgt_multi_prim_ib_reset_en != > info.primitive_restart || > rctx->vgt_state.vgt_multi_prim_ib_reset_indx != > info.restart_index || > diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c > b/src/gallium/drivers/radeon/r600_buffer_common.c > index da6f020..40a644b 100644 > --- a/src/gallium/drivers/radeon/r600_buffer_common.c > +++ b/src/gallium/drivers/radeon/r600_buffer_common.c > @@ -361,22 +361,24 @@ static void *r600_buffer_transfer_map(struct > pipe_context *ctx, > r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { > assert(usage & PIPE_TRANSFER_WRITE); > > /* Check if mapping this buffer would cause waiting for the > GPU. */ > if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, > RADEON_USAGE_READWRITE) || > !rctx->ws->buffer_wait(rbuffer->buf, 0, > RADEON_USAGE_READWRITE)) { > /* Do a wait-free write-only transfer using a > temporary buffer. */ > unsigned offset; > struct r600_resource *staging = NULL; > > - u_upload_alloc(rctx->uploader, 0, box->width + > (box->x % R600_MAP_BUFFER_ALIGNMENT), > - 256, &offset, (struct > pipe_resource**)&staging, (void**)&data); > + u_upload_alloc(ctx->stream_uploader, 0, > + box->width + (box->x % > R600_MAP_BUFFER_ALIGNMENT), > + 256, &offset, (struct > pipe_resource**)&staging, > + (void**)&data); > > if (staging) { > data += box->x % R600_MAP_BUFFER_ALIGNMENT; > return r600_buffer_get_transfer(ctx, > resource, level, usage, box, > ptransfer, > data, staging, offset); > } > } else { > /* At this point, the buffer is always idle (we > checked it above). */ > usage |= PIPE_TRANSFER_UNSYNCHRONIZED; > } > diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c > b/src/gallium/drivers/radeon/r600_pipe_common.c > index 1da7228..a8cba9d 100644 > --- a/src/gallium/drivers/radeon/r600_pipe_common.c > +++ b/src/gallium/drivers/radeon/r600_pipe_common.c > @@ -186,21 +186,22 @@ void r600_draw_rectangle(struct blitter_context > *blitter, > viewport.scale[1] = 1.0f; > viewport.scale[2] = 1.0f; > viewport.translate[0] = 0.0f; > viewport.translate[1] = 0.0f; > viewport.translate[2] = 0.0f; > rctx->b.set_viewport_states(&rctx->b, 0, 1, &viewport); > > /* Upload vertices. The hw rectangle has only 3 vertices, > * I guess the 4th one is derived from the first 3. > * The vertex specification should match u_blitter's vertex element > state. */ > - u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, > &buf, (void**)&vb); > + u_upload_alloc(rctx->b.stream_uploader, 0, sizeof(float) * 24, 256, > + &offset, &buf, (void**)&vb); > if (!buf) > return; > > vb[0] = x1; > vb[1] = y1; > vb[2] = depth; > vb[3] = 1; > > vb[8] = x1; > vb[9] = y2; > @@ -594,26 +595,24 @@ bool r600_common_context_init(struct > r600_common_context *rctx, > r600_streamout_init(rctx); > r600_query_init(rctx); > cayman_init_msaa(&rctx->b); > > rctx->allocator_zeroed_memory = > u_suballocator_create(&rctx->b, rscreen->info.gart_page_size, > 0, PIPE_USAGE_DEFAULT, true); > if (!rctx->allocator_zeroed_memory) > return false; > > - rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, > - PIPE_BIND_INDEX_BUFFER | > - PIPE_BIND_CONSTANT_BUFFER, > PIPE_USAGE_STREAM); > - if (!rctx->uploader) > + rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024, > + 0, PIPE_USAGE_STREAM); > + if (!rctx->b.stream_uploader) > return false; > - rctx->b.stream_uploader = rctx->uploader; > > rctx->ctx = rctx->ws->ctx_create(rctx->ws); > if (!rctx->ctx) > return false; > > if (rscreen->info.has_sdma && !(rscreen->debug_flags & > DBG_NO_ASYNC_DMA)) { > rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, > r600_flush_dma_ring, > rctx); > rctx->dma.flush = r600_flush_dma_ring; > @@ -641,22 +640,22 @@ void r600_common_context_cleanup(struct > r600_common_context *rctx) > if (rctx->query_result_shader) > rctx->b.delete_compute_state(&rctx->b, > rctx->query_result_shader); > > if (rctx->gfx.cs) > rctx->ws->cs_destroy(rctx->gfx.cs); > if (rctx->dma.cs) > rctx->ws->cs_destroy(rctx->dma.cs); > if (rctx->ctx) > rctx->ws->ctx_destroy(rctx->ctx); > > - if (rctx->uploader) { > - u_upload_destroy(rctx->uploader); > + if (rctx->b.stream_uploader) { > + u_upload_destroy(rctx->b.stream_uploader); > } > > slab_destroy_child(&rctx->pool_transfers); > > if (rctx->allocator_zeroed_memory) { > u_suballocator_destroy(rctx->allocator_zeroed_memory); > } > rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL); > rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL); > } > diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h > b/src/gallium/drivers/radeon/r600_pipe_common.h > index fafe6c1..f228755 100644 > --- a/src/gallium/drivers/radeon/r600_pipe_common.h > +++ b/src/gallium/drivers/radeon/r600_pipe_common.h > @@ -547,21 +547,20 @@ struct r600_common_context { > struct r600_ring gfx; > struct r600_ring dma; > struct pipe_fence_handle *last_gfx_fence; > struct pipe_fence_handle *last_sdma_fence; > unsigned num_gfx_cs_flushes; > unsigned initial_gfx_cs_size; > unsigned gpu_reset_counter; > unsigned last_dirty_tex_counter; > unsigned last_compressed_colortex_counter; > > - struct u_upload_mgr *uploader; > struct u_suballocator *allocator_zeroed_memory; > struct slab_child_pool pool_transfers; > > /* Current unaccounted memory usage. */ > uint64_t vram; > uint64_t gtt; > > /* States. */ > struct r600_streamout streamout; > struct r600_scissors scissors; > diff --git a/src/gallium/drivers/radeonsi/si_compute.c > b/src/gallium/drivers/radeonsi/si_compute.c > index d05c488..aae651c 100644 > --- a/src/gallium/drivers/radeonsi/si_compute.c > +++ b/src/gallium/drivers/radeonsi/si_compute.c > @@ -496,23 +496,23 @@ static void si_setup_user_sgprs_co_v2(struct si_context > *sctx, > > dispatch.grid_size_x = info->grid[0] * info->block[0]; > dispatch.grid_size_y = info->grid[1] * info->block[1]; > dispatch.grid_size_z = info->grid[2] * info->block[2]; > > dispatch.private_segment_size = program->private_size; > dispatch.group_segment_size = program->local_size; > > dispatch.kernarg_address = kernel_args_va; > > - u_upload_data(sctx->b.uploader, 0, sizeof(dispatch), 256, > - &dispatch, &dispatch_offset, > - (struct pipe_resource**)&dispatch_buf); > + u_upload_data(sctx->b.b.stream_uploader, 0, sizeof(dispatch), > + 256, &dispatch, &dispatch_offset, > + (struct pipe_resource**)&dispatch_buf); > > if (!dispatch_buf) { > fprintf(stderr, "Error: Failed to allocate dispatch " > "packet."); > } > radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, > dispatch_buf, > RADEON_USAGE_READ, > RADEON_PRIO_CONST_BUFFER); > > dispatch_va = dispatch_buf->gpu_address + dispatch_offset; > > @@ -558,21 +558,21 @@ static void si_upload_compute_input(struct si_context > *sctx, > unsigned num_work_size_bytes = program->use_code_object_v2 ? 0 : 36; > uint32_t kernel_args_offset = 0; > uint32_t *kernel_args; > void *kernel_args_ptr; > uint64_t kernel_args_va; > unsigned i; > > /* The extra num_work_size_bytes are for work group / work item size > information */ > kernel_args_size = program->input_size + num_work_size_bytes; > > - u_upload_alloc(sctx->b.uploader, 0, kernel_args_size, 256, > + u_upload_alloc(sctx->b.b.stream_uploader, 0, kernel_args_size, 256, > &kernel_args_offset, > (struct pipe_resource**)&input_buffer, > &kernel_args_ptr); > > kernel_args = (uint32_t*)kernel_args_ptr; > kernel_args_va = input_buffer->gpu_address + kernel_args_offset; > > if (!code_object) { > for (i = 0; i < 3; i++) { > kernel_args[i] = info->grid[i]; > kernel_args[i + 3] = info->grid[i] * info->block[i]; > diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c > b/src/gallium/drivers/radeonsi/si_descriptors.c > index 4c1120a..4a5407a 100644 > --- a/src/gallium/drivers/radeonsi/si_descriptors.c > +++ b/src/gallium/drivers/radeonsi/si_descriptors.c > @@ -227,21 +227,21 @@ static bool si_upload_descriptors(struct si_context > *sctx, > radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4); > radeon_emit_array(sctx->ce_ib, list + begin, count); > } > > if (!si_ce_upload(sctx, desc->ce_offset, list_size, > &desc->buffer_offset, > &desc->buffer)) > return false; > } else { > void *ptr; > > - u_upload_alloc(sctx->b.uploader, 0, list_size, 256, > + u_upload_alloc(sctx->b.b.stream_uploader, 0, list_size, 256, > &desc->buffer_offset, > (struct pipe_resource**)&desc->buffer, &ptr); > if (!desc->buffer) > return false; /* skip the draw call */ > > util_memcpy_cpu_to_le32(ptr, desc->list, list_size); > desc->gpu_list = ptr; > > radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, > desc->buffer, > RADEON_USAGE_READ, > RADEON_PRIO_DESCRIPTORS); > @@ -953,21 +953,22 @@ bool si_upload_vertex_buffer_descriptors(struct > si_context *sctx) > if (!sctx->vertex_buffers_dirty || !count || !velems) > return true; > > unsigned fix_size3 = velems->fix_size3; > unsigned first_vb_use_mask = velems->first_vb_use_mask; > > /* Vertex buffer descriptors are the only ones which are uploaded > * directly through a staging buffer and don't go through > * the fine-grained upload path. > */ > - u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, > &desc->buffer_offset, > + u_upload_alloc(sctx->b.b.stream_uploader, 0, count * 16, 256, > + &desc->buffer_offset, > (struct pipe_resource**)&desc->buffer, (void**)&ptr); > if (!desc->buffer) > return false; > > radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, > desc->buffer, RADEON_USAGE_READ, > RADEON_PRIO_DESCRIPTORS); > > assert(count <= SI_NUM_VERTEX_BUFFERS); > > @@ -1059,21 +1060,21 @@ static struct si_descriptors * > si_const_buffer_descriptors(struct si_context *sctx, unsigned shader) > { > return &sctx->descriptors[si_const_buffer_descriptors_idx(shader)]; > } > > void si_upload_const_buffer(struct si_context *sctx, struct r600_resource > **rbuffer, > const uint8_t *ptr, unsigned size, uint32_t > *const_offset) > { > void *tmp; > > - u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset, > + u_upload_alloc(sctx->b.b.stream_uploader, 0, size, 256, const_offset, > (struct pipe_resource**)rbuffer, &tmp); > if (*rbuffer) > util_memcpy_cpu_to_le32(tmp, ptr, size); > } > > static void si_set_constant_buffer(struct si_context *sctx, > struct si_buffer_resources *buffers, > unsigned descriptors_idx, > uint slot, const struct > pipe_constant_buffer *input) > { > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c > b/src/gallium/drivers/radeonsi/si_state_draw.c > index 1dd6dcc..01340af 100644 > --- a/src/gallium/drivers/radeonsi/si_state_draw.c > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c > @@ -1049,21 +1049,22 @@ void si_draw_vbo(struct pipe_context *ctx, const > struct pipe_draw_info *info) > /* Translate or upload, if needed. */ > /* 8-bit indices are supported on VI. */ > if (sctx->b.chip_class <= CIK && ib.index_size == 1) { > struct pipe_resource *out_buffer = NULL; > unsigned out_offset, start, count, start_offset; > void *ptr; > > si_get_draw_start_count(sctx, info, &start, &count); > start_offset = start * ib.index_size; > > - u_upload_alloc(sctx->b.uploader, start_offset, count > * 2, 256, > + u_upload_alloc(ctx->stream_uploader, start_offset, > + count * 2, 256, > &out_offset, &out_buffer, &ptr); > if (!out_buffer) { > pipe_resource_reference(&ib.buffer, NULL); > return; > } > > util_shorten_ubyte_elts_to_userptr(&sctx->b.b, &ib, 0, > ib.offset + > start_offset, > count, ptr); > > @@ -1072,21 +1073,22 @@ void si_draw_vbo(struct pipe_context *ctx, const > struct pipe_draw_info *info) > ib.buffer = out_buffer; > /* info->start will be added by the drawing code */ > ib.offset = out_offset - start_offset; > ib.index_size = 2; > } else if (ib.user_buffer && !ib.buffer) { > unsigned start, count, start_offset; > > si_get_draw_start_count(sctx, info, &start, &count); > start_offset = start * ib.index_size; > > - u_upload_data(sctx->b.uploader, start_offset, count * > ib.index_size, > + u_upload_data(ctx->stream_uploader, start_offset, > + count * ib.index_size, > 256, (char*)ib.user_buffer + > start_offset, > &ib.offset, &ib.buffer); > if (!ib.buffer) > return; > /* info->start will be added by the drawing code */ > ib.offset -= start_offset; > } > } > > /* VI reads index buffers through TC L2. */ > -- > 2.7.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev