Using mov_dpp is slightly faster than ds_bpermute, and the mov_dpp intrinsic appears to have been supported since at least LLVM 3.9.
Signed-off-by: Bas Nieuwenhuizen <ba...@google.com> --- src/amd/common/ac_llvm_build.c | 78 +++++++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 237e9291d41..62a00f214de 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -789,44 +789,74 @@ ac_build_ddxy(struct ac_llvm_context *ctx, LLVMValueRef lds, LLVMValueRef val) { - LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2]; + LLVMValueRef thread_id, tl, trbl, args[5]; LLVMValueRef result; - thread_id = ac_get_thread_id(ctx); - - tl_tid = LLVMBuildAnd(ctx->builder, thread_id, - LLVMConstInt(ctx->i32, mask, false), ""); + /* bpermute is VI+, mov_dpp is VI+ too */ + if (has_ds_bpermute) { + uint32_t tl_ctrl = 0, trbl_ctrl = 0; - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid, - LLVMConstInt(ctx->i32, idx, false), ""); + for (unsigned i = 0; i < 4; ++i) { + tl_ctrl |= (i & mask) << (2 * i); + trbl_ctrl |= ((i & mask) + idx) << (2 * i); + } - if (has_ds_bpermute) { - args[0] = LLVMBuildMul(ctx->builder, tl_tid, - LLVMConstInt(ctx->i32, 4, false), ""); - args[1] = val; + args[0] = val; + args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false); + args[2] = LLVMConstInt(ctx->i32, 0xf, false); + args[3] = LLVMConstInt(ctx->i32, 0xf, false); + args[4] = LLVMConstInt(ctx->i1, 1, false); tl = ac_build_intrinsic(ctx, - "llvm.amdgcn.ds.bpermute", ctx->i32, - args, 2, + "llvm.amdgcn.mov.dpp.i32", ctx->i32, + args, 5, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); - args[0] = LLVMBuildMul(ctx->builder, trbl_tid, - LLVMConstInt(ctx->i32, 4, false), ""); + args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false); trbl = ac_build_intrinsic(ctx, - "llvm.amdgcn.ds.bpermute", ctx->i32, - args, 2, + "llvm.amdgcn.mov.dpp.i32", ctx->i32, + args, 5, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); } else { - LLVMValueRef store_ptr, load_ptr0, load_ptr1; + LLVMValueRef tl_tid, trbl_tid; + + 
thread_id = ac_get_thread_id(ctx); + + tl_tid = LLVMBuildAnd(ctx->builder, thread_id, + LLVMConstInt(ctx->i32, mask, false), ""); + + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid, + LLVMConstInt(ctx->i32, idx, false), ""); + + if (has_ds_bpermute) { + args[0] = LLVMBuildMul(ctx->builder, tl_tid, + LLVMConstInt(ctx->i32, 4, false), ""); + args[1] = val; + tl = ac_build_intrinsic(ctx, + "llvm.amdgcn.ds.bpermute", ctx->i32, + args, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + + args[0] = LLVMBuildMul(ctx->builder, trbl_tid, + LLVMConstInt(ctx->i32, 4, false), ""); + trbl = ac_build_intrinsic(ctx, + "llvm.amdgcn.ds.bpermute", ctx->i32, + args, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + } else { + LLVMValueRef store_ptr, load_ptr0, load_ptr1; - store_ptr = ac_build_gep0(ctx, lds, thread_id); - load_ptr0 = ac_build_gep0(ctx, lds, tl_tid); - load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid); + store_ptr = ac_build_gep0(ctx, lds, thread_id); + load_ptr0 = ac_build_gep0(ctx, lds, tl_tid); + load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid); - LLVMBuildStore(ctx->builder, val, store_ptr); - tl = LLVMBuildLoad(ctx->builder, load_ptr0, ""); - trbl = LLVMBuildLoad(ctx->builder, load_ptr1, ""); + LLVMBuildStore(ctx->builder, val, store_ptr); + tl = LLVMBuildLoad(ctx->builder, load_ptr0, ""); + trbl = LLVMBuildLoad(ctx->builder, load_ptr1, ""); + } } tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, ""); -- 2.13.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev