JonChesterfield updated this revision to Diff 382306.
JonChesterfield added a comment.
- rebase
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D112227/new/
https://reviews.llvm.org/D112227
Files:
clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
openmp/libomptarget/DeviceRTL/CMakeLists.txt
openmp/libomptarget/DeviceRTL/src/Configuration.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
Index: openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -68,8 +68,23 @@
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
- return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering) {
+ // __builtin_amdgcn_atomic_inc32 should expand to this switch when passed a
+ // runtime ordering, but does not do so yet. Work around that here.
+ switch (Ordering) {
+ default:
+ __builtin_unreachable();
+ case __ATOMIC_RELAXED:
+ return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELAXED, "");
+ case __ATOMIC_ACQUIRE:
+ return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQUIRE, "");
+ case __ATOMIC_RELEASE:
+ return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELEASE, "");
+ case __ATOMIC_ACQ_REL:
+ return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQ_REL, "");
+ case __ATOMIC_SEQ_CST:
+ return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_SEQ_CST, "");
+ }
}
uint32_t SHARED(namedBarrierTracker);
@@ -126,17 +141,64 @@
fence::team(__ATOMIC_RELEASE);
}
+// Sema checking of __builtin_amdgcn_fence is aggressive. The intention is to
+// patch clang so that the builtin is usable within a template environment and
+// so that a runtime memory order is expanded to this switch within clang/llvm.
+void fenceTeam(int Ordering) {
+ switch (Ordering) {
+ default:
+ __builtin_unreachable();
+ case __ATOMIC_ACQUIRE:
+ return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+ case __ATOMIC_RELEASE:
+ return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
+ case __ATOMIC_ACQ_REL:
+ return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup");
+ case __ATOMIC_SEQ_CST:
+ return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+ }
+}
+void fenceKernel(int Ordering) {
+ switch (Ordering) {
+ default:
+ __builtin_unreachable();
+ case __ATOMIC_ACQUIRE:
+ return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
+ case __ATOMIC_RELEASE:
+ return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent");
+ case __ATOMIC_ACQ_REL:
+ return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
+ case __ATOMIC_SEQ_CST:
+ return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
+ }
+}
+void fenceSystem(int Ordering) {
+ switch (Ordering) {
+ default:
+ __builtin_unreachable();
+ case __ATOMIC_ACQUIRE:
+ return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "");
+ case __ATOMIC_RELEASE:
+ return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "");
+ case __ATOMIC_ACQ_REL:
+ return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
+ case __ATOMIC_SEQ_CST:
+ return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
+ }
+}
+
void syncWarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp
}
void syncThreads() { __builtin_amdgcn_s_barrier(); }
-void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
-
-void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
-
-void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); }
+// TODO: No wavefront lane locks yet (possibly impossible); these stubs trap.
+void unsetLock(omp_lock_t *) { __builtin_trap(); }
+int testLock(omp_lock_t *) { __builtin_trap(); }
+void initLock(omp_lock_t *) { __builtin_trap(); }
+void destroyLock(omp_lock_t *) { __builtin_trap(); }
+void setLock(omp_lock_t *) { __builtin_trap(); }
#pragma omp end declare variant
///}
@@ -238,7 +300,7 @@
}
void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
- impl::atomicStore(Addr, V, Ordering);
+ impl::atomicStore(Addr, V, Ordering);
}
uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
Index: openmp/libomptarget/DeviceRTL/src/Configuration.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -20,14 +20,14 @@
#pragma omp declare target
-extern uint32_t __omp_rtl_debug_kind;
+// extern uint32_t __omp_rtl_debug_kind;
// TOOD: We want to change the name as soon as the old runtime is gone.
DeviceEnvironmentTy CONSTANT(omptarget_device_environment)
__attribute__((used));
uint32_t config::getDebugKind() {
- return __omp_rtl_debug_kind & omptarget_device_environment.DebugKind;
+ return /*__omp_rtl_debug_kind &*/ omptarget_device_environment.DebugKind;
}
uint32_t config::getNumDevices() {
Index: openmp/libomptarget/DeviceRTL/CMakeLists.txt
===================================================================
--- openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -91,6 +91,12 @@
endif()
endforeach()
+set(amdgpu_mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx90a gfx1010 gfx1030 gfx1031)
+if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
+ set(amdgpu_mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
+endif()
+
+
# Activate RTL message dumps if requested by the user.
set(LIBOMPTARGET_DEVICE_DEBUG FALSE CACHE BOOL
"Activate DeviceRTL debug messages.")
@@ -121,11 +127,9 @@
# Set flags for LLVM Bitcode compilation.
set(bc_flags -S -x c++ -std=c++17
${clang_opt_flags}
- -target nvptx64
-Xclang -emit-llvm-bc
-Xclang -aux-triple -Xclang ${aux_triple}
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
- -Xclang -target-feature -Xclang +ptx61
-I${include_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
@@ -137,23 +141,22 @@
list(APPEND bc_flags -DOMPTARGET_DEBUG=0)
endif()
-# Create target to build all Bitcode libraries.
-add_custom_target(omptarget-new-nvptx-bc)
-add_dependencies(omptarget-new-nvptx-bc opt llvm-link)
-# Generate a Bitcode library for all the compute capabilities the user requested
-foreach(sm ${nvptx_sm_list})
- # TODO: replace this with declare variant and isa selector.
- set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+macro(instantiate_DeviceRTL)
+ # Reads variables set by the caller: target_cpu, target_name, target_bc_flags.
+
set(bc_files "")
foreach(src ${src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
- set(outfile "${outfile}-sm_${sm}.bc")
+ set(outfile "${outfile}-${target_cpu}.bc")
add_custom_command(OUTPUT ${outfile}
- COMMAND ${CLANG_TOOL} ${bc_flags}
- ${cuda_flags} ${infile} -o ${outfile}
+ COMMAND ${CLANG_TOOL}
+ ${bc_flags}
+ -Xclang -target-cpu -Xclang ${target_cpu}
+ ${target_bc_flags}
+ ${infile} -o ${outfile}
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
COMMENT "Building LLVM bitcode ${outfile}"
@@ -173,7 +176,8 @@
list(APPEND bc_files ${outfile})
endforeach()
- set(bclib_name "libomptarget-new-nvptx-sm_${sm}.bc")
+ set(bclib_name "libomptarget-new-${target_name}-${target_cpu}.bc")
+
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
@@ -205,17 +209,44 @@
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
- set(bclib_target_name "omptarget-new-nvptx-sm_${sm}-bc")
+ set(bclib_target_name "omptarget-new-${target_name}-${target_cpu}-bc")
add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt)
- add_dependencies(omptarget-new-nvptx-bc ${bclib_target_name})
- add_dependencies(${bclib_target_name} opt llvm-link)
# Copy library to destination.
+ # Note: this copies the llvm-link'ed library, not the opt'ed one.
add_custom_command(TARGET ${bclib_target_name} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
${LIBOMPTARGET_LIBRARY_DIR})
# Install bitcode library under the lib destination folder.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+endmacro()
+
+# Generate a Bitcode library for all the compute capabilities the user requested
+foreach(sm ${nvptx_sm_list})
+ set(target_cpu sm_${sm})
+ set(target_name nvptx)
+
+ # TODO: replace CUDA_ARCH with declare variant and isa selector.
+ set(target_bc_flags
+ -target nvptx64
+ -Xclang -target-feature
+ -Xclang +ptx61
+ "-D__CUDA_ARCH__=${sm}0")
+
+ instantiate_DeviceRTL()
+endforeach()
+
+foreach(mcpu ${amdgpu_mcpus})
+ set(target_cpu ${mcpu})
+ set(target_name amdgpu)
+
+ set(target_bc_flags
+ -target amdgcn-amd-amdhsa
+ "-D__AMDGCN__"
+ -fvisibility=default
+ -nogpulib)
+
+ instantiate_DeviceRTL()
endforeach()
Index: clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -252,7 +252,7 @@
std::string BitcodeSuffix;
if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
options::OPT_fno_openmp_target_new_runtime, false))
- BitcodeSuffix = "new-amdgcn-" + GPUArch;
+ BitcodeSuffix = "new-amdgpu-" + GPUArch;
else
BitcodeSuffix = "amdgcn-" + GPUArch;
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits