yaxunl updated this revision to Diff 544812.
yaxunl marked an inline comment as done.
yaxunl added a reviewer: Anastasia.
yaxunl added a comment.
revised by comments
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D155213/new/
https://reviews.llvm.org/D155213
Files:
clang/include/clang/Basic/CodeGenOptions.def
clang/include/clang/Basic/LangOptions.def
clang/include/clang/Driver/Options.td
clang/lib/CodeGen/CGCall.cpp
clang/lib/CodeGen/Targets/AMDGPU.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
clang/test/Driver/hip-options.hip
clang/test/Driver/opencl.cl
Index: clang/test/Driver/opencl.cl
===================================================================
--- clang/test/Driver/opencl.cl
+++ clang/test/Driver/opencl.cl
@@ -17,6 +17,8 @@
// RUN: %clang -S -### -cl-denorms-are-zero %s 2>&1 | FileCheck --check-prefix=CHECK-DENORMS-ARE-ZERO %s
// RUN: %clang -S -### -cl-fp32-correctly-rounded-divide-sqrt %s 2>&1 | FileCheck --check-prefix=CHECK-ROUND-DIV %s
// RUN: %clang -S -### -cl-uniform-work-group-size %s 2>&1 | FileCheck --check-prefix=CHECK-UNIFORM-WG %s
+// RUN: %clang -S -### -foffload-uniform-block %s 2>&1 | FileCheck --check-prefix=CHECK-UNIFORM-WG %s
+// RUN: %clang -S -### -fno-offload-uniform-block -cl-uniform-work-group-size %s 2>&1 | FileCheck --check-prefix=CHECK-UNIFORM-WG %s
// RUN: not %clang -cl-std=c99 -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-C99 %s
// RUN: not %clang -cl-std=invalid -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
// RUN: %clang -S -### -target spir-unknown-unknown %s 2>&1 | FileCheck --check-prefix=CHECK-W-SPIR-COMPAT %s
@@ -44,7 +46,7 @@
// CHECK-DENORMS-ARE-ZERO-NOT: "-cl-denorms-are-zero"
// CHECK-ROUND-DIV: "-cc1" {{.*}} "-cl-fp32-correctly-rounded-divide-sqrt"
-// CHECK-UNIFORM-WG: "-cc1" {{.*}} "-cl-uniform-work-group-size"
+// CHECK-UNIFORM-WG: "-cc1" {{.*}} "-foffload-uniform-block"
// CHECK-C99: error: invalid value 'c99' in '-cl-std=c99'
// CHECK-INVALID: error: invalid value 'invalid' in '-cl-std=invalid'
Index: clang/test/Driver/hip-options.hip
===================================================================
--- clang/test/Driver/hip-options.hip
+++ clang/test/Driver/hip-options.hip
@@ -205,3 +205,27 @@
// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nostdinc -nostdlib -fgpu-approx-transcendentals \
// RUN: -x c++ %s 2>&1 | count 0
+/ Check -fno-offload-uniform-block is passed to clang -cc1 but
+// (default) -fno-offload-uniform-block is not.
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-offload-uniform-block \
+// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOUNIBLK %s
+
+// NOUNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fno-offload-uniform-block"
+// NOUNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fno-offload-uniform-block"
+
+// RUN: %clang -### -nogpuinc -nogpulib -foffload-uniform-block \
+// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s
+
+// UNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-foffload-uniform-block"
+// UNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-foffload-uniform-block"
+
+// RUN: %clang -### -nogpuinc -nogpulib \
+// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=DEFUNIBLK %s
+
+// DEFUNIBLK-NOT: "-f{{(no-)?}}offload-uniform-block"
+
+// Check no warnings for -f[no-]offload-uniform-block.
+
+// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-offload-uniform-block \
+// RUN: -foffload-uniform-block --cuda-gpu-arch=gfx906 %s 2>&1 | count 0
Index: clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
===================================================================
--- clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
+++ clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
@@ -1,6 +1,7 @@
// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL1.2 -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM
// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-NONUNIFORM
// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -cl-uniform-work-group-size -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -foffload-uniform-block -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM
kernel void ker() {};
// CHECK: define{{.*}}@ker() #0
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===================================================================
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -10,10 +10,18 @@
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
// RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -foffload-uniform-block \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fno-offload-uniform-block \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=NOUB %s
+
#include "Inputs/cuda.h"
__global__ void flat_work_group_size_default() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+// NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]]
}
__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
@@ -45,3 +53,5 @@
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
+
+// NOUB-NOT: "uniform-work-group-size"="true"
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -7264,6 +7264,9 @@
Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ);
}
+ Args.AddLastArg(CmdArgs, options::OPT_foffload_uniform_block,
+ options::OPT_fno_offload_uniform_block);
+
if (IsCudaDevice || IsHIPDevice) {
StringRef InlineThresh =
Args.getLastArgValue(options::OPT_fgpu_inline_threshold_EQ);
Index: clang/lib/CodeGen/Targets/AMDGPU.cpp
===================================================================
--- clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -401,13 +401,6 @@
if (FD)
setFunctionDeclAttributes(FD, F, M);
- const bool IsHIPKernel =
- M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>();
-
- // TODO: This should be moved to language specific attributes instead.
- if (IsHIPKernel)
- F->addFnAttr("uniform-work-group-size", "true");
-
if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
Index: clang/lib/CodeGen/CGCall.cpp
===================================================================
--- clang/lib/CodeGen/CGCall.cpp
+++ clang/lib/CodeGen/CGCall.cpp
@@ -2398,10 +2398,15 @@
// to the compiler that the global work-size be a multiple of
// the work-group size specified to clEnqueueNDRangeKernel
// (i.e. work groups are uniform).
- FuncAttrs.addAttribute("uniform-work-group-size",
- llvm::toStringRef(CodeGenOpts.UniformWGSize));
+ FuncAttrs.addAttribute(
+ "uniform-work-group-size",
+ llvm::toStringRef(getLangOpts().OffloadUniformBlock));
}
}
+
+ if (TargetDecl->hasAttr<CUDAGlobalAttr>() &&
+ getLangOpts().OffloadUniformBlock)
+ FuncAttrs.addAttribute("uniform-work-group-size", "true");
}
// Attach "no-builtins" attributes to:
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -912,6 +912,12 @@
def b : JoinedOrSeparate<["-"], "b">, Flags<[LinkerInput]>,
HelpText<"Pass -b <arg> to the linker on AIX">, MetaVarName<"<arg>">,
Group<Link_Group>;
+
+defm offload_uniform_block : BoolFOption<"offload-uniform-block",
+ LangOpts<"OffloadUniformBlock">, Default<"LangOpts->CUDA">,
+ PosFlag<SetTrue, [CC1Option], "Assume">, NegFlag<SetFalse, [CC1Option], "Don't assume">,
+ BothFlags<[], " that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)">>;
+
// OpenCL-only Options
def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. This option disables all optimizations. By default optimizations are enabled.">;
@@ -947,9 +953,8 @@
def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">,
MarshallingInfoFlag<CodeGenOpts<"OpenCLCorrectlyRoundedDivSqrt">>;
-def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group<opencl_Group>, Flags<[CC1Option]>,
- HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">,
- MarshallingInfoFlag<CodeGenOpts<"UniformWGSize">>;
+def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group<opencl_Group>, Flags<[CC1Option]>, Alias<foffload_uniform_block>,
+ HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">;
def cl_no_stdinc : Flag<["-"], "cl-no-stdinc">, Group<opencl_Group>,
HelpText<"OpenCL only. Disables all standard includes containing non-native compiler types and functions.">;
def cl_ext_EQ : CommaJoined<["-"], "cl-ext=">, Group<opencl_Group>, Flags<[CC1Option]>,
Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -278,6 +278,7 @@
ENUM_LANGOPT(SYCLVersion , SYCLMajorVersion, 2, SYCL_None, "Version of the SYCL standard used")
LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
+LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)")
LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
Index: clang/include/clang/Basic/CodeGenOptions.def
===================================================================
--- clang/include/clang/Basic/CodeGenOptions.def
+++ clang/include/clang/Basic/CodeGenOptions.def
@@ -202,7 +202,6 @@
/// float-to-int conversion instructions.
CODEGENOPT(StrictFloatCastOverflow, 1, 1)
-CODEGENOPT(UniformWGSize , 1, 0) ///< -cl-uniform-work-group-size
CODEGENOPT(NoZeroInitializedInBSS , 1, 0) ///< -fno-zero-initialized-in-bss.
/// Method of Objective-C dispatch to use.
ENUM_CODEGENOPT(ObjCDispatchMethod, ObjCDispatchMethodKind, 2, Legacy)
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits