From: Luo Xionghu <[email protected]>

Add 'atomic_int, atomic_uint' type support for the operations fetch_add, fetch_sub, fetch_or, fetch_xor, fetch_and, exchange, fetch_min, and fetch_max.
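For illustration only (this example kernel is not part of the patch; the buffers are assumed to be initialized by the host), the new builtins can be called directly from OpenCL 2.0 C:

    /* hypothetical kernel exercising the new 32-bit atomic builtins */
    kernel void count_and_track_max(global const int *in, int key,
                                    global atomic_int *counter,  /* assumed initialized to 0 */
                                    global atomic_int *max_seen) /* assumed initialized to INT_MIN */
    {
      int gid = (int)get_global_id(0);
      if (in[gid] == key)
        atomic_fetch_add(counter, 1);       /* maps to __gen_ocl_atomic_fetch_add32, i.e. atomicrmw add */
      atomic_fetch_max(max_seen, in[gid]);  /* maps to __gen_ocl_atomic_fetch_imax32, i.e. atomicrmw max */
    }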
Also add 'atomic_int, atomic_uint, atomic_long, atomic_ulong' type support for the operations load, store, init, compare_exchange_strong, and compare_exchange_weak. These builtins are implemented in LLVM bitcode with native atomicrmw and cmpxchg LLVM IR, so that optimization passes can recognize all the atomic instructions and no optimization opportunities are missed.

v2: use DISABLE_ATOMIC_INT64 to disable atomic_long, since the hardware does not support it so far.

Signed-off-by: Luo Xionghu <[email protected]>
---
backend/src/ir/instruction.cpp | 5 +- backend/src/libocl/CMakeLists.txt | 2 +- backend/src/libocl/include/ocl_atom.h | 88 ++++++++++++- backend/src/libocl/include/ocl_types.h | 5 + backend/src/libocl/src/ocl_atom.cl | 226 +++++++++++++++++++++++++++++++++ backend/src/libocl/src/ocl_atomic.ll | 153 ++++++++++++++++++++++ backend/src/llvm/llvm_gen_backend.cpp | 102 +++++++++++++++ backend/src/llvm/llvm_to_gen.cpp | 2 +- 8 files changed, 578 insertions(+), 5 deletions(-) create mode 100644 backend/src/libocl/src/ocl_atomic.ll diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 464e483..d8640f9 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -1088,10 +1088,11 @@ namespace ir { { if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false)) return false; - if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false)) + const RegisterFamily family = getFamily(this->type); + if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false)) return false; for (uint32_t srcID = 0; srcID < srcNum-1u; ++srcID) - if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false)) + if (UNLIKELY(checkRegisterData(family, getSrc(fn, srcID+1u), fn, whyNot) == false)) return false; return true; diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt index 8bb4c1e..5f0b2e2 100644 --- a/backend/src/libocl/CMakeLists.txt +++ b/backend/src/libocl/CMakeLists.txt @@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M) ) ENDMACRO(ADD_LL_TO_BC_TARGET) -SET (OCL_LL_MODULES ocl_barrier ocl_clz) +SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_atomic) FOREACH(f ${OCL_LL_MODULES}) COPY_THE_LL(${f}) ADD_LL_TO_BC_TARGET(${f}) diff --git a/backend/src/libocl/include/ocl_atom.h b/backend/src/libocl/include/ocl_atom.h index d0f6b10..44c0c4b 100644 --- a/backend/src/libocl/include/ocl_atom.h +++ b/backend/src/libocl/include/ocl_atom.h @@ -98,5 +98,91 @@ OVERLOADABLE int atomic_cmpxchg (volatile __local int *p, int cmp, int val); #define atom_dec atomic_dec #define atom_cmpxchg atomic_cmpxchg - +//OpenCL 2.0 features +#define ATOMIC_GEN_FUNCTIONS(ATYPE, CTYPE, POSTFIX) \ +CTYPE __gen_ocl_atomic_exchange##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \ +CTYPE __gen_ocl_atomic_fetch_add##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \ +CTYPE __gen_ocl_atomic_fetch_sub##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \ +CTYPE __gen_ocl_atomic_fetch_or##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \ +CTYPE __gen_ocl_atomic_fetch_xor##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \ +CTYPE __gen_ocl_atomic_fetch_and##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \ +CTYPE __gen_ocl_atomic_fetch_imin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \ +CTYPE __gen_ocl_atomic_fetch_umin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \ +CTYPE __gen_ocl_atomic_fetch_imax##POSTFIX(volatile ATYPE *p, CTYPE val, int order,
int scope); \ +CTYPE __gen_ocl_atomic_fetch_umax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope);\ +CTYPE __gen_ocl_atomic_compare_exchange_strong##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int sucess, int failure, int scope); \ +CTYPE __gen_ocl_atomic_compare_exchange_weak##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int sucess, int failure, int scope); + +ATOMIC_GEN_FUNCTIONS(atomic_int, int, 32) +#ifndef DISABLE_ATOMIC_INT64 +ATOMIC_GEN_FUNCTIONS(atomic_long, long, 64) +#endif +float __gen_ocl_atomic_exchangef(volatile atomic_int *p, float val, int order, int scope); +float __gen_ocl_atomic_fetch_addf(volatile atomic_int *p, float val, int order, int scope); + +#undef ATOMIC_GEN_FUNCTIONS + +/* only used to initialize global address space */ +//#define ATOMIC_VAR_INIT(C value) +#define ATOMIC_VAR_INIT +#define ATOMIC_FLAG_INIT 0 + +//store +#define ATOMIC_FUNCTIONS(ATYPE, CTYPE, MTYPE1, MTYPE2) \ +OVERLOADABLE void atomic_init(volatile ATYPE *object, CTYPE desired); \ +OVERLOADABLE void atomic_store(volatile ATYPE *object, CTYPE desired); \ +OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order); \ +OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_load(volatile ATYPE *object); \ +OVERLOADABLE CTYPE atomic_load_explicit(volatile ATYPE *object, memory_order order); \ +OVERLOADABLE CTYPE atomic_load_explicit(volatile ATYPE *object, memory_order order, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_exchange(volatile ATYPE *object, CTYPE desired); \ +OVERLOADABLE CTYPE atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order); \ +OVERLOADABLE CTYPE atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope); \ +OVERLOADABLE bool atomic_compare_exchange_strong(volatile ATYPE *object, CTYPE *expected, CTYPE desired); \ +OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure); \ +OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope); \ +OVERLOADABLE bool atomic_compare_exchange_weak(volatile ATYPE *object, CTYPE *expected, CTYPE desired); \ +OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure); \ +OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_fetch_add(volatile ATYPE *object, MTYPE1 desired); \ +OVERLOADABLE CTYPE atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order); \ +OVERLOADABLE CTYPE atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_fetch_sub(volatile ATYPE *object, MTYPE1 desired); \ +OVERLOADABLE CTYPE atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order); \ +OVERLOADABLE CTYPE atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_fetch_or(volatile ATYPE *object, MTYPE2 desired); \ +OVERLOADABLE CTYPE 
atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \ +OVERLOADABLE CTYPE atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_fetch_xor(volatile ATYPE *object, MTYPE2 desired); \ +OVERLOADABLE CTYPE atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \ +OVERLOADABLE CTYPE atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_fetch_and(volatile ATYPE *object, MTYPE2 desired); \ +OVERLOADABLE CTYPE atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \ +OVERLOADABLE CTYPE atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_fetch_min(volatile ATYPE *object, MTYPE2 desired); \ +OVERLOADABLE CTYPE atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \ +OVERLOADABLE CTYPE atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \ +OVERLOADABLE CTYPE atomic_fetch_max(volatile ATYPE *object, MTYPE2 desired); \ +OVERLOADABLE CTYPE atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \ +OVERLOADABLE CTYPE atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); + +ATOMIC_FUNCTIONS(atomic_int, int, int, int) +ATOMIC_FUNCTIONS(atomic_uint, uint, uint, uint) +#ifndef DISABLE_ATOMIC_INT64 +ATOMIC_FUNCTIONS(atomic_long, long, long, long) +ATOMIC_FUNCTIONS(atomic_ulong, ulong, ulong, ulong) +#endif +ATOMIC_FUNCTIONS(atomic_float, float, float, float) +#undef ATOMIC_FUNCTIONS + + +OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object); +OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order); +OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope); +OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object); +OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order); +OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope); + +OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope); #endif /* __OCL_ATOM_H__ */ diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h index 736e4ce..334d0e9 100644 --- a/backend/src/libocl/include/ocl_types.h +++ b/backend/src/libocl/include/ocl_types.h @@ -20,6 +20,11 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define DISABLE_ATOMIC_INT64 +#ifndef DISABLE_ATOMIC_INT64 +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable +#endif #include "ocl_defines.h" #define NULL 0 diff --git a/backend/src/libocl/src/ocl_atom.cl b/backend/src/libocl/src/ocl_atom.cl index 0b6c671..e0af560 100644 --- a/backend/src/libocl/src/ocl_atom.cl +++ b/backend/src/libocl/src/ocl_atom.cl @@ -17,6 +17,7 @@ */ #include "ocl_atom.h" #include "ocl_as.h" +#include "ocl_sync.h" OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val); OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val); @@ -135,3 +136,228 @@ DECL_ATOMIC_OP(cmpxchg) 
#define atom_inc atomic_inc #define atom_dec atomic_dec #define atom_cmpxchg atomic_cmpxchg + +// OpenCL 2.0 features. +#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val) { \ + return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device); \ + } + +#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val) { \ + CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \ + bool ret = oldValue == *expected; \ + *expected = oldValue; \ + return ret; \ + } + +#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p) { \ + return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, memory_order_seq_cst, memory_scope_device); \ + } + +#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val) { \ + __gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device); \ + } + +#define DECL_ATOMIC_OP(NAME, PREFIX) \ + DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \ + DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \ + //DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \ + DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \ + +#define DECL_ATOMIC_COMPARE_EXCHANGE_OP(NAME, PREFIX) \ + DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \ + DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \ + //DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \ + DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \ + +#define DECL_ATOMIC_LOAD_OP(NAME, PREFIX) \ + DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \ + DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \ + //DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \ + DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \ + +#define DECL_ATOMIC_NO_RET_OP(NAME, PREFIX) \ + DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \ + DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \ + //DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \ + DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \ + +DECL_ATOMIC_OP(exchange, exchange) +DECL_ATOMIC_OP(fetch_add, fetch_add) +DECL_ATOMIC_OP(fetch_sub, fetch_sub) +DECL_ATOMIC_OP(fetch_and, fetch_and) +DECL_ATOMIC_OP(fetch_or, fetch_or) +DECL_ATOMIC_OP(fetch_xor, fetch_xor) +DECL_ATOMIC_LOAD_OP(load, fetch_add) +DECL_ATOMIC_NO_RET_OP(init, exchange) +DECL_ATOMIC_NO_RET_OP(store, exchange) +DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong, compare_exchange_strong) +DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak, compare_exchange_weak) +DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin32, atomic_int, atomic_int, int) +DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin32, atomic_uint, atomic_int, uint) +DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax32, atomic_int, atomic_int, int) +DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax32, atomic_uint, atomic_int, uint) +#ifndef DISABLE_ATOMIC_INT64 
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin64, atomic_long, atomic_long, long) +DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin64, atomic_ulong, atomic_long, ulong) +DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax64, atomic_long, atomic_long, long) +DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax64, atomic_ulong, atomic_long, ulong) +#endif +DECL_ATOMIC_OP_TYPE(exchange, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_NO_RET_TYPE(init, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_NO_RET_TYPE(store, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_LOAD_TYPE(load, fetch_addf, atomic_float, atomic_int, float) + +#undef DECL_ATOMIC_OP_TYPE +#undef DECL_ATOMIC_LOAD_TYPE +#undef DECL_ATOMIC_NO_RET_TYPE +#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE + +// with memory_order. + +#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \ + return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device); \ + } + +#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure) { \ + CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, memory_scope_device); \ + bool ret = oldValue == *expected; \ + *expected = oldValue; \ + return ret; \ + } + +#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order) { \ + return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, memory_scope_device); \ + } + +#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \ + __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device); \ + } + +DECL_ATOMIC_OP(exchange_explicit, exchange) +DECL_ATOMIC_OP(fetch_add_explicit, fetch_add) +DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub) +DECL_ATOMIC_OP(fetch_and_explicit, fetch_and) +DECL_ATOMIC_OP(fetch_or_explicit, fetch_or) +DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor) +DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add) +DECL_ATOMIC_NO_RET_OP(store_explicit, exchange) +DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong) +DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak) +DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int) +DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint) +DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int) +DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint) +#ifndef DISABLE_ATOMIC_INT64 +DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long) +DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong) +DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long) +DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong) +#endif +DECL_ATOMIC_OP_TYPE(exchange_explicit, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_NO_RET_TYPE(init_explicit, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_NO_RET_TYPE(store_explicit, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_LOAD_TYPE(load_explicit, fetch_addf, atomic_float, atomic_int, float) + 
+#undef DECL_ATOMIC_OP_TYPE +#undef DECL_ATOMIC_LOAD_TYPE +#undef DECL_ATOMIC_NO_RET_TYPE +#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE + +// with memory_order and memory_scope +#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \ + return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope); \ + } + +#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure, memory_scope scope) { \ + CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, scope); \ + bool ret = oldValue == *expected; \ + *expected = oldValue; \ + return ret; \ + } + +#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order, memory_scope scope) { \ + return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, scope); \ + } + +#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \ + OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \ + __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope); \ + } + +DECL_ATOMIC_OP(exchange_explicit, exchange) +DECL_ATOMIC_OP(fetch_add_explicit, fetch_add) +DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub) +DECL_ATOMIC_OP(fetch_and_explicit, fetch_and) +DECL_ATOMIC_OP(fetch_or_explicit, fetch_or) +DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor) +DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add) +DECL_ATOMIC_NO_RET_OP(store_explicit, exchange) +DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong) +DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak) +DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int) +DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint) +DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int) +DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint) +#ifndef DISABLE_ATOMIC_INT64 +DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long) +DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong) +DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long) +DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong) +#endif +DECL_ATOMIC_OP_TYPE(exchange_explicit, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_NO_RET_TYPE(init_explicit, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_NO_RET_TYPE(store_explicit, exchangef, atomic_float, atomic_int, float) +DECL_ATOMIC_LOAD_TYPE(load_explicit, fetch_addf, atomic_float, atomic_int, float) + +#undef DECL_ATOMIC_OP_TYPE +#undef DECL_ATOMIC_LOAD_TYPE +#undef DECL_ATOMIC_NO_RET_TYPE +#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE +#undef DECL_ATOMIC_OP +#undef DECL_ATOMIC_LOAD_OP +#undef DECL_ATOMIC_NO_RET_OP +#undef DECL_ATOMIC_COMPARE_EXCHANGE_OP + +OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object) { + atomic_int * temp = (atomic_int*)object; + return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); +} + +OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag 
*object, memory_order order) { + atomic_int * temp = (atomic_int*)object; + return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); +} + +OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){ + atomic_int * temp = (atomic_int*)object; + return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); +} + +OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object){ + atomic_int * temp = (atomic_int*)object; + __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device); +} + +OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order){ + atomic_int * temp = (atomic_int*)object; + __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device); +} + +OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){ + atomic_int * temp = (atomic_int*)object; + __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device); +} + +OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope){ +} diff --git a/backend/src/libocl/src/ocl_atomic.ll b/backend/src/libocl/src/ocl_atomic.ll new file mode 100644 index 0000000..6b789b3 --- /dev/null +++ b/backend/src/libocl/src/ocl_atomic.ll @@ -0,0 +1,153 @@ +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64" + +;32bit version. +define i32 @__gen_ocl_atomic_exchange32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_fetch_add32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_fetch_sub32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_fetch_or32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_fetch_xor32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_fetch_and32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_fetch_imin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_fetch_imax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + 
+define i32 @__gen_ocl_atomic_fetch_umin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_fetch_umax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %value seq_cst + ret i32 %0 +} + +define i32 @__gen_ocl_atomic_compare_exchange_strong32(i32 addrspace(4)* nocapture %ptr,i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline { +entry: + %0 = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst + %1 = extractvalue { i32, i1 } %0, 0 + ret i32 %1 +} + +define i32 @__gen_ocl_atomic_compare_exchange_weak32(i32 addrspace(4)* nocapture %ptr,i32 %compare, i32 %value, i32 %sucess, i32 %failure, i32 %scope) nounwind alwaysinline { +entry: + %0 = cmpxchg weak volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst + %1 = extractvalue { i32, i1 } %0, 0 + ret i32 %1 +} + +;64bit version + +define i64 @__gen_ocl_atomic_exchange64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_add64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_sub64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_or64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_xor64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_and64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_imin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_imax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_umin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 @__gen_ocl_atomic_fetch_umax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %value seq_cst + ret i64 %0 +} + +define i64 
@__gen_ocl_atomic_compare_exchange_strong64(i64 addrspace(4)* nocapture %ptr,i64 %compare, i64 %value, i32 %sucess, i32 %failure, i32 %scope) nounwind alwaysinline { +entry: + %0 = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst + %1 = extractvalue { i64, i1 } %0, 0 + ret i64 %1 +} + +define i64 @__gen_ocl_atomic_compare_exchange_weak64(i64 addrspace(4)* nocapture %ptr,i64 %compare, i64 %value, i32 %sucess, i32 %failure, i32 %scope) nounwind alwaysinline { +entry: + %0 = cmpxchg weak volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst + %1 = extractvalue { i64, i1 } %0, 0 + ret i64 %1 +} diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index d5d02f5..fb0a72c 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -639,6 +639,8 @@ namespace gbe DECL_VISIT_FN(BranchInst, BranchInst); DECL_VISIT_FN(PHINode, PHINode); DECL_VISIT_FN(AllocaInst, AllocaInst); + DECL_VISIT_FN(AtomicRMWInst, AtomicRMWInst); + DECL_VISIT_FN(AtomicCmpXchgInst, AtomicCmpXchgInst); #undef DECL_VISIT_FN // Emit unary instructions from gen native function @@ -675,6 +677,7 @@ namespace gbe // handle load of dword/qword with unaligned address void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI); void visitInstruction(Instruction &I) {NOT_SUPPORTED;} + void emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple); private: ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u); template <typename T, typename P = T> @@ -3644,6 +3647,105 @@ namespace gbe ctx.ALU1(opcode, type, dst, src); } + void GenWriter::regAllocateAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + this->newRegister(&I); + } + + void GenWriter::emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple) { + ir::Register pointer = this->getRegister(llvmPtr); + ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace()); + // Get the function arguments + ir::Register ptr; + ir::Register btiReg; + unsigned SurfaceIndex = 0xff; + ir::AddressMode AM; + if (legacyMode) { + Value *bti = getBtiRegister(llvmPtr); + Value *ptrBase = getPointerBase(llvmPtr); + ir::Register baseReg = this->getRegister(ptrBase); + if (isa<ConstantInt>(bti)) { + AM = ir::AM_StaticBti; + SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue(); + addrSpace = btiToGen(SurfaceIndex); + } else { + AM = ir::AM_DynamicBti; + addrSpace = ir::MEM_MIXED; + btiReg = this->getRegister(bti); + } + const ir::RegisterFamily pointerFamily = ctx.getPointerFamily(); + ptr = ctx.reg(pointerFamily); + ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg); + } else { + AM = ir::AM_Stateless; + ptr = pointer; + } + + ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, SurfaceIndex); + } + + void GenWriter::emitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + // Get the function arguments + Value *llvmPtr = I.getPointerOperand(); + ir::AtomicOps opcode = ir::ATOMIC_OP_CMPXCHG; + uint32_t payloadNum = 0; + vector<ir::Register> payload; + const ir::Register dst = this->getRegister(&I); + + payload.push_back(this->getRegister(I.getCompareOperand())); + payloadNum++; + payload.push_back(this->getRegister(I.getNewValOperand())); + payloadNum++; + 
ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType()); + const ir::Tuple payloadTuple = payloadNum == 0 ? + ir::Tuple(0) : + ctx.arrayTuple(&payload[0], payloadNum); + this->emitAtomicInstHelper(opcode, type, dst, llvmPtr, payloadTuple); + } + + void GenWriter::regAllocateAtomicRMWInst(AtomicRMWInst &I) { + this->newRegister(&I); + } + + static INLINE ir::AtomicOps atomicOpsLLVMToGen(llvm::AtomicRMWInst::BinOp llvmOp) { + switch(llvmOp) { + case llvm::AtomicRMWInst::Xchg: return ir::ATOMIC_OP_XCHG; + case llvm::AtomicRMWInst::Add: return ir::ATOMIC_OP_ADD; + case llvm::AtomicRMWInst::Sub: return ir::ATOMIC_OP_SUB; + case llvm::AtomicRMWInst::And: return ir::ATOMIC_OP_AND; + case llvm::AtomicRMWInst::Or: return ir::ATOMIC_OP_OR; + case llvm::AtomicRMWInst::Xor: return ir::ATOMIC_OP_XOR; + case llvm::AtomicRMWInst::Max: return ir::ATOMIC_OP_IMAX; + case llvm::AtomicRMWInst::Min: return ir::ATOMIC_OP_IMIN; + case llvm::AtomicRMWInst::UMax: return ir::ATOMIC_OP_UMAX; + case llvm::AtomicRMWInst::UMin: return ir::ATOMIC_OP_UMIN; + case llvm::AtomicRMWInst::Nand: + case llvm::AtomicRMWInst::BAD_BINOP: break; + } + GBE_ASSERT(false); + return ir::ATOMIC_OP_INVALID; + } + + void GenWriter::emitAtomicRMWInst(AtomicRMWInst &I) { + // Get the function arguments + llvm::AtomicOrdering Order = I.getOrdering(); + llvm::AtomicRMWInst::BinOp llvmOpcode = I.getOperation(); + Value *llvmPtr = I.getOperand(0); + ir::AtomicOps opcode = atomicOpsLLVMToGen(llvmOpcode); + + const ir::Register dst = this->getRegister(&I); + + uint32_t payloadNum = 0; + vector<ir::Register> payload; + + payload.push_back(this->getRegister(I.getOperand(1))); + payloadNum++; + ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType()); + const ir::Tuple payloadTuple = payloadNum == 0 ? + ir::Tuple(0) : + ctx.arrayTuple(&payload[0], payloadNum); + this->emitAtomicInstHelper(opcode, type, dst, llvmPtr, payloadTuple); + } + void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) { CallSite::arg_iterator AI = CS.arg_begin(); CallSite::arg_iterator AE = CS.arg_end(); diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp index 0b3f0d2..11cb79f 100644 --- a/backend/src/llvm/llvm_to_gen.cpp +++ b/backend/src/llvm/llvm_to_gen.cpp @@ -201,7 +201,7 @@ namespace gbe // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. MPM.add(createInstructionCombiningPass()); - MPM.add(createJumpThreadingPass()); // Thread jumps + //MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); MPM.add(createDeadStoreEliminationPass()); // Delete dead stores MPM.add(createAggressiveDCEPass()); // Delete dead instructions -- 2.1.4 _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
