On Tue, Mar 26, 2013 at 6:22 AM, Christian König <deathsim...@vodafone.de> wrote: > Am 25.03.2013 18:15, schrieb j.gli...@gmail.com: > >> From: Jerome Glisse <jgli...@redhat.com> >> >> Same as on r600, trace cs execution by writting cs offset after each >> states, this allow to pin point lockup inside command stream and >> narrow down the scope of lockup investigation. >> >> v2: Use WRITE_DATA packet instead of WRITE_MEM >> >> Signed-off-by: Jerome Glisse <jgli...@redhat.com> >> --- >> src/gallium/drivers/radeonsi/r600_hw_context.c | 61 >> ++++++++++++++++++++++++++ >> src/gallium/drivers/radeonsi/radeonsi_pipe.c | 22 ++++++++++ >> src/gallium/drivers/radeonsi/radeonsi_pipe.h | 12 +++++ >> src/gallium/drivers/radeonsi/radeonsi_pm4.c | 12 +++++ >> src/gallium/drivers/radeonsi/si_state_draw.c | 7 ++- >> src/gallium/drivers/radeonsi/sid.h | 14 ++++++ >> 6 files changed, 127 insertions(+), 1 deletion(-) >> >> diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c >> b/src/gallium/drivers/radeonsi/r600_hw_context.c >> index bd348f9..967f093 100644 >> --- a/src/gallium/drivers/radeonsi/r600_hw_context.c >> +++ b/src/gallium/drivers/radeonsi/r600_hw_context.c >> @@ -142,6 +142,12 @@ void si_need_cs_space(struct r600_context *ctx, >> unsigned num_dw, >> /* Save 16 dwords for the fence mechanism. */ >> num_dw += 16; >> +#if R600_TRACE_CS >> + if (ctx->screen->trace_bo) { >> + num_dw += R600_TRACE_CS_DWORDS; >> + } >> +#endif >> + >> /* Flush if there's not enough space. 
*/ >> if (num_dw > RADEON_MAX_CMDBUF_DWORDS) { >> radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC); >> @@ -206,9 +212,41 @@ void si_context_flush(struct r600_context *ctx, >> unsigned flags) >> /* force to keep tiling flags */ >> flags |= RADEON_FLUSH_KEEP_TILING_FLAGS; >> +#if R600_TRACE_CS >> + if (ctx->screen->trace_bo) { >> + struct r600_screen *rscreen = ctx->screen; >> + unsigned i; >> + >> + for (i = 0; i < cs->cdw; i++) { >> + fprintf(stderr, "[%4d] [%5d] 0x%08x\n", >> rscreen->cs_count, i, cs->buf[i]); >> + } >> + rscreen->cs_count++; >> + } >> +#endif >> + >> /* Flush the CS. */ >> ctx->ws->cs_flush(ctx->cs, flags); >> +#if R600_TRACE_CS >> + if (ctx->screen->trace_bo) { >> + struct r600_screen *rscreen = ctx->screen; >> + unsigned i; >> + >> + for (i = 0; i < 10; i++) { >> + usleep(5); >> + if >> (!ctx->ws->buffer_is_busy(rscreen->trace_bo->buf, RADEON_USAGE_READWRITE)) { >> + break; >> + } >> + } >> + if (i == 10) { >> + fprintf(stderr, "timeout on cs lockup likely >> happen at cs %d dw %d\n", >> + rscreen->trace_ptr[1], >> rscreen->trace_ptr[0]); >> + } else { >> + fprintf(stderr, "cs %d executed in %dms\n", >> rscreen->trace_ptr[1], i * 5); >> + } >> + } >> +#endif >> + >> ctx->pm4_dirty_cdwords = 0; >> ctx->flags = 0; >> @@ -665,3 +703,26 @@ void r600_context_draw_opaque_count(struct >> r600_context *ctx, struct r600_so_tar >> cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, >> RADEON_USAGE_READ); >> } >> + >> +#if R600_TRACE_CS >> +void r600_trace_emit(struct r600_context *rctx) >> +{ >> + struct r600_screen *rscreen = rctx->screen; >> + struct radeon_winsys_cs *cs = rctx->cs; >> + uint64_t va; >> + uint32_t reloc; >> + >> + va = r600_resource_va(&rscreen->screen, (void*)rscreen->trace_bo); >> + reloc = r600_context_bo_reloc(rctx, rscreen->trace_bo, >> RADEON_USAGE_READWRITE); >> + cs->buf[cs->cdw++] = PKT3(PKT3_WRITE_DATA, 4, 0); >> + cs->buf[cs->cdw++] = >> PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) | >> + 
PKT3_WRITE_DATA_WR_CONFIRM | >> + >> PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME); >> + cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; >> + cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFFFFFFFUL; >> + cs->buf[cs->cdw++] = cs->cdw; >> + cs->buf[cs->cdw++] = rscreen->cs_count; >> + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); >> + cs->buf[cs->cdw++] = reloc; > > > The NOP packet here is superfluous, also I don't really like how this is > implemented after all. > > May I just use this patch as base of a cleaner implementation? > > Christian.
Yeah, the NOP is a leftover. What don't you like? This is only a build-time debug option, and it proved very useful (at least to me) on r600g. Cheers, Jerome > >> +} >> +#endif >> diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c >> b/src/gallium/drivers/radeonsi/radeonsi_pipe.c >> index c5dac29..a370d7e 100644 >> --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c >> +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c >> @@ -525,6 +525,14 @@ static void r600_destroy_screen(struct pipe_screen* >> pscreen) >> rscreen->ws->buffer_unmap(rscreen->fences.bo->cs_buf); >> si_resource_reference(&rscreen->fences.bo, NULL); >> } >> + >> +#if R600_TRACE_CS >> + if (rscreen->trace_bo) { >> + rscreen->ws->buffer_unmap(rscreen->trace_bo->cs_buf); >> + pipe_resource_reference((struct >> pipe_resource**)&rscreen->trace_bo, NULL); >> + } >> +#endif >> + >> pipe_mutex_destroy(rscreen->fences.mutex); >> rscreen->ws->destroy(rscreen->ws); >> @@ -727,5 +735,19 @@ struct pipe_screen *radeonsi_screen_create(struct >> radeon_winsys *ws) >> LIST_INITHEAD(&rscreen->fences.blocks); >> pipe_mutex_init(rscreen->fences.mutex); >> +#if R600_TRACE_CS >> + rscreen->cs_count = 0; >> + if (rscreen->info.drm_minor >= 28) { >> + rscreen->trace_bo = (struct >> si_resource*)pipe_buffer_create(&rscreen->screen, >> + >> PIPE_BIND_CUSTOM, >> + >> PIPE_USAGE_STAGING, >> + >> 4096); >> + if (rscreen->trace_bo) { >> + rscreen->trace_ptr = >> rscreen->ws->buffer_map(rscreen->trace_bo->cs_buf, NULL, >> + >> PIPE_TRANSFER_UNSYNCHRONIZED); >> + } >> + } >> +#endif >> + >> return &rscreen->screen; >> } >> diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h >> b/src/gallium/drivers/radeonsi/radeonsi_pipe.h >> index d0f04f4..7943563 100644 >> --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h >> +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h >> @@ -47,6 +47,9 @@ >> #define R600_BIG_ENDIAN 0 >> #endif >> +#define R600_TRACE_CS 0 >> +#define R600_TRACE_CS_DWORDS 8 >> + >> struct r600_pipe_fences { >> struct
si_resource *bo; >> unsigned *data; >> @@ -67,6 +70,11 @@ struct r600_screen { >> struct r600_tiling_info tiling_info; >> struct util_slab_mempool pool_buffers; >> struct r600_pipe_fences fences; >> +#if R600_TRACE_CS >> + struct si_resource *trace_bo; >> + uint32_t *trace_ptr; >> + unsigned cs_count; >> +#endif >> }; >> struct si_pipe_sampler_view { >> @@ -226,6 +234,10 @@ void r600_translate_index_buffer(struct r600_context >> *r600, >> struct pipe_index_buffer *ib, >> unsigned count); >> +#if R600_TRACE_CS >> +void r600_trace_emit(struct r600_context *rctx); >> +#endif >> + >> /* >> * common helpers >> */ >> diff --git a/src/gallium/drivers/radeonsi/radeonsi_pm4.c >> b/src/gallium/drivers/radeonsi/radeonsi_pm4.c >> index 79a2521..8e01738 100644 >> --- a/src/gallium/drivers/radeonsi/radeonsi_pm4.c >> +++ b/src/gallium/drivers/radeonsi/radeonsi_pm4.c >> @@ -199,6 +199,12 @@ unsigned si_pm4_dirty_dw(struct r600_context *rctx) >> continue; >> count += state->ndw; >> +#if R600_TRACE_CS >> + /* for tracing each states */ >> + if (rctx->screen->trace_bo) { >> + count += R600_TRACE_CS_DWORDS; >> + } >> +#endif >> } >> return count; >> @@ -219,6 +225,12 @@ void si_pm4_emit(struct r600_context *rctx, struct >> si_pm4_state *state) >> } >> cs->cdw += state->ndw; >> + >> +#if R600_TRACE_CS >> + if (rctx->screen->trace_bo) { >> + r600_trace_emit(rctx); >> + } >> +#endif >> } >> void si_pm4_emit_dirty(struct r600_context *rctx) >> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c >> b/src/gallium/drivers/radeonsi/si_state_draw.c >> index a78751b..1e1d1cc 100644 >> --- a/src/gallium/drivers/radeonsi/si_state_draw.c >> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c >> @@ -579,6 +579,12 @@ void si_draw_vbo(struct pipe_context *ctx, const >> struct pipe_draw_info *info) >> si_pm4_emit_dirty(rctx); >> rctx->pm4_dirty_cdwords = 0; >> +#if R600_TRACE_CS >> + if (rctx->screen->trace_bo) { >> + r600_trace_emit(rctx); >> + } >> +#endif >> + >> #if 0 >> /* Enable stream out 
if needed. */ >> if (rctx->streamout_start) { >> @@ -587,7 +593,6 @@ void si_draw_vbo(struct pipe_context *ctx, const >> struct pipe_draw_info *info) >> } >> #endif >> - >> rctx->flags |= R600_CONTEXT_DST_CACHES_DIRTY; >> /* Set the depth buffer as dirty. */ >> diff --git a/src/gallium/drivers/radeonsi/sid.h >> b/src/gallium/drivers/radeonsi/sid.h >> index 57553a6..8528981 100644 >> --- a/src/gallium/drivers/radeonsi/sid.h >> +++ b/src/gallium/drivers/radeonsi/sid.h >> @@ -77,6 +77,20 @@ >> #define PKT3_DRAW_INDEX_IMMD 0x2E >> #define PKT3_NUM_INSTANCES 0x2F >> #define PKT3_STRMOUT_BUFFER_UPDATE 0x34 >> +#define PKT3_WRITE_DATA 0x37 >> +#define PKT3_WRITE_DATA_DST_SEL(x) ((x) << 8) >> +#define PKT3_WRITE_DATA_DST_SEL_REG 0 >> +#define PKT3_WRITE_DATA_DST_SEL_MEM_SYNC 1 >> +#define PKT3_WRITE_DATA_DST_SEL_TC_OR_L2 2 >> +#define PKT3_WRITE_DATA_DST_SEL_GDS 3 >> +#define PKT3_WRITE_DATA_DST_SEL_RESERVED_4 4 >> +#define PKT3_WRITE_DATA_DST_SEL_MEM_ASYNC 5 >> +#define PKT3_WR_ONE_ADDR (1 << 16) >> +#define PKT3_WRITE_DATA_WR_CONFIRM (1 << 20) >> +#define PKT3_WRITE_DATA_ENGINE_SEL(x) ((x) << 30) >> +#define PKT3_WRITE_DATA_ENGINE_SEL_ME 0 >> +#define PKT3_WRITE_DATA_ENGINE_SEL_PFP 1 >> +#define PKT3_WRITE_DATA_ENGINE_SEL_CE 2 >> #define PKT3_MEM_SEMAPHORE 0x39 >> #define PKT3_MPEG_INDEX 0x3A >> #define PKT3_WAIT_REG_MEM 0x3C > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev