gtbercea created this revision. gtbercea added reviewers: ABataev, caomhin. Herald added subscribers: cfe-commits, guansong.
Introduce a compiler flag for cases when the user knows that the collapsed loop counter can be safely represented using at most 32 bits. This will prevent the emission of expensive mathematical operations (such as the div operation) on the iteration variable using 64 bits where 32 bit operations are sufficient. Repository: rC Clang https://reviews.llvm.org/D55928 Files: docs/OpenMPSupport.rst include/clang/Basic/LangOptions.def include/clang/Driver/Options.td lib/Driver/ToolChains/Clang.cpp lib/Frontend/CompilerInvocation.cpp lib/Sema/SemaOpenMP.cpp test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp =================================================================== --- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,6 +1,7 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-max-32bit-collapse-width -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 @@ -9,11 +10,12 @@ #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l58}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l65}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -80,7 +82,7 @@ // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) @@ -214,16 +216,19 @@ // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void -// CHECK: define internal void [[OUTL4]]( +// CHECK-32: define internal void [[OUTL4]]( +// CHECK-64: define internal void [[OUTL4]]( // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define weak void @__omp_offloading_{{.*}}_l56(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{.*}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_l58(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{.*}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* dereferenceable{{.*}}) +// CHECK-DIV64: div i64 +// CHECK-DIV32-NO: div i64 -// CHECK: define weak void @__omp_offloading_{{.*}}_l63(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_l65(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{.*}}) Index: lib/Sema/SemaOpenMP.cpp =================================================================== --- lib/Sema/SemaOpenMP.cpp +++ lib/Sema/SemaOpenMP.cpp @@ -4066,7 +4066,7 @@ if (!TestIsLessOp.hasValue()) TestIsLessOp = IsConstPos || (IsUnsigned && !Subtract); if (UB && (IsConstZero || - (TestIsLessOp.getValue() ? + (TestIsLessOp.getValue() ? (IsConstNeg || (IsUnsigned && Subtract)) : (IsConstPos || (IsUnsigned && !Subtract))))) { SemaRef.Diag(NewStep->getExprLoc(), @@ -4241,7 +4241,7 @@ Op == OO_Less || Op == OO_Greater, CE->getSourceRange(), CE->getOperatorLoc()); break; - case OO_ExclaimEqual: + case OO_ExclaimEqual: return setUB(getInitLCDecl(CE->getArg(0)) == LCDecl ? CE->getArg(1) : CE->getArg(0), /*LessOp=*/llvm::None, @@ -4499,7 +4499,7 @@ ExprResult CondExpr = SemaRef.BuildBinOp(S, DefaultLoc, - TestIsLessOp.getValue() ? + TestIsLessOp.getValue() ? (TestIsStrictOp ? BO_LT : BO_LE) : (TestIsStrictOp ? BO_GT : BO_GE), NewLB.get(), NewUB.get()); @@ -5200,13 +5200,14 @@ // Choose either the 32-bit or 64-bit version. ExprResult LastIteration = LastIteration64; - if (LastIteration32.isUsable() && - C.getTypeSize(LastIteration32.get()->getType()) == 32 && - (AllCountsNeedLessThan32Bits || NestedLoopCount == 1 || - fitsInto( - /*Bits=*/32, - LastIteration32.get()->getType()->hasSignedIntegerRepresentation(), - LastIteration64.get(), SemaRef))) + if (SemaRef.getLangOpts().OpenMPMax32BitCollapseWidth || + (LastIteration32.isUsable() && + C.getTypeSize(LastIteration32.get()->getType()) == 32 && + (AllCountsNeedLessThan32Bits || NestedLoopCount == 1 || + fitsInto( + /*Bits=*/32, + LastIteration32.get()->getType()->hasSignedIntegerRepresentation(), + LastIteration64.get(), SemaRef)))) LastIteration = LastIteration32; QualType VType = LastIteration.get()->getType(); QualType RealVType = VType; Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -2843,6 +2843,11 @@ Opts.OpenMPCUDABlocksPerSM, Diags); } + // Prevent auto-widening the representation of loop counters during an + // OpenMP collapse clause. + Opts.OpenMPMax32BitCollapseWidth = + Args.hasArg(options::OPT_fopenmp_max_32bit_collapse_width) ? 1 : 0; + // Get the OpenMP target triples if any. if (Arg *A = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) { Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -4426,6 +4426,7 @@ Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ); Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_number_of_sm_EQ); Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ); + Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_max_32bit_collapse_width); // When in OpenMP offloading mode with NVPTX target, forward // cuda-mode flag Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -1574,6 +1574,8 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fopenmp_cuda_blocks_per_sm_EQ : Joined<["-"], "fopenmp-cuda-blocks-per-sm=">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fopenmp_max_32bit_collapse_width : Flag<["-"], "fopenmp-max-32bit-collapse-width">, Group<f_Group>, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>; def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>; def fno_escaping_block_tail_calls : Flag<["-"], "fno-escaping-block-tail-calls">, Group<f_Group>, Flags<[CC1Option]>; Index: include/clang/Basic/LangOptions.def =================================================================== --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -207,6 +207,7 @@ LANGOPT(OpenMPHostCXXExceptions , 1, 0, "C++ exceptions handling in the host code.") LANGOPT(OpenMPCUDANumSMs , 32, 0, "Number of SMs for CUDA devices.") LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.") +LANGOPT(OpenMPMax32BitCollapseWidth , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") Index: docs/OpenMPSupport.rst =================================================================== --- docs/OpenMPSupport.rst +++ docs/OpenMPSupport.rst @@ -109,6 +109,16 @@ between the threads and it is user responsibility to share the required data between the threads in the parallel regions. +Collapsed loop nest counter +--------------------------- + +When using the collapse clause on a loop nest the default behaviour is to +automatically extend the representation of the loop counter to 64 bits for +the cases where the sizes of the collapsed loops are not known at compile +time. To prevent this conservative choice and use at most 32 bits, +compile your program with the `-fopenmp-max-32bit-collapse-width`. + + Features not supported or with limited support for Cuda devices ---------------------------------------------------------------
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits