gtbercea updated this revision to Diff 180709.
gtbercea added a comment.
Invert accumulation direction.
Repository:
rC Clang
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D56413/new/
https://reviews.llvm.org/D56413
Files:
docs/OpenMPSupport.rst
include/clang/Basic/LangOptions.def
include/clang/Driver/Options.td
lib/Driver/ToolChains/Clang.cpp
lib/Frontend/CompilerInvocation.cpp
lib/Sema/SemaOpenMP.cpp
test/OpenMP/for_codegen.cpp
test/OpenMP/for_simd_codegen.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
test/OpenMP/parallel_for_simd_codegen.cpp
test/OpenMP/simd_codegen.cpp
Index: test/OpenMP/simd_codegen.cpp
===================================================================
--- test/OpenMP/simd_codegen.cpp
+++ test/OpenMP/simd_codegen.cpp
@@ -278,8 +278,11 @@
// CHECK-NEXT: [[I_2:%.+]] = trunc i64 [[I_1_ADD0]] to i32
// CHECK-NEXT: store i32 [[I_2]], i32* {{%.+}}{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
// CHECK: [[IV2:%.+]] = load i64, i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
-// CHECK-NEXT: [[J_1:%.+]] = srem i64 [[IV2]], 4
-// CHECK-NEXT: [[J_2:%.+]] = mul nsw i64 [[J_1]], 2
+// CHECK: [[IV2_1:%.+]] = load i64, i64* [[T1_OMP_IV]]{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
+// CHECK-NEXT: [[I_1_DIV1:%.+]] = sdiv i64 [[IV2_1]], 4
+// CHECK-NEXT: [[I_1_MUL1:%.+]] = mul nsw i64 [[I_1_DIV1]], 4
+// CHECK-NEXT: [[I_1_SUB0:%.+]] = sub nsw i64 [[IV2]], [[I_1_MUL1]]
+// CHECK-NEXT: [[J_2:%.+]] = mul nsw i64 [[I_1_SUB0]], 2
// CHECK-NEXT: [[J_2_ADD0:%.+]] = add nsw i64 0, [[J_2]]
// CHECK-NEXT: store i64 [[J_2_ADD0]], i64* {{%.+}}{{.*}}!llvm.mem.parallel_loop_access ![[T1_ID]]
// simd.for.inc:
@@ -393,22 +396,70 @@
// CHECK-NEXT: [[CALC_I_1_MUL1:%.+]] = mul i32 [[CALC_I_1]], 1
// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 1, [[CALC_I_1_MUL1]]
// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+
// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
-// CHECK-NEXT: [[CALC_J_1:%.+]] = udiv i32 [[IV1_2]], 20
-// CHECK-NEXT: [[CALC_J_2:%.+]] = urem i32 [[CALC_J_1]], 3
+// CHECK: [[IV1_2_1:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK-NEXT: [[CALC_J_1:%.+]] = udiv i32 [[IV1_2_1]], 60
+// CHECK-NEXT: [[MUL_1:%.+]] = mul i32 [[CALC_J_1]], 60
+// CHECK-NEXT: [[SUB_3:%.+]] = sub i32 [[IV1_2]], [[MUL_1]]
+// CHECK-NEXT: [[CALC_J_2:%.+]] = udiv i32 [[SUB_3]], 20
// CHECK-NEXT: [[CALC_J_2_MUL1:%.+]] = mul i32 [[CALC_J_2]], 1
// CHECK-NEXT: [[CALC_J_3:%.+]] = add i32 2, [[CALC_J_2_MUL1]]
// CHECK-NEXT: store i32 [[CALC_J_3]], i32* [[LC_J:.+]]
+
// CHECK: [[IV1_3:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
-// CHECK-NEXT: [[CALC_K_1:%.+]] = udiv i32 [[IV1_3]], 5
-// CHECK-NEXT: [[CALC_K_2:%.+]] = urem i32 [[CALC_K_1]], 4
-// CHECK-NEXT: [[CALC_K_2_MUL1:%.+]] = mul i32 [[CALC_K_2]], 1
-// CHECK-NEXT: [[CALC_K_3:%.+]] = add i32 3, [[CALC_K_2_MUL1]]
-// CHECK-NEXT: store i32 [[CALC_K_3]], i32* [[LC_K:.+]]
-// CHECK: [[IV1_4:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
-// CHECK-NEXT: [[CALC_L_1:%.+]] = urem i32 [[IV1_4]], 5
-// CHECK-NEXT: [[CALC_L_1_MUL1:%.+]] = mul i32 [[CALC_L_1]], 1
-// CHECK-NEXT: [[CALC_L_2:%.+]] = add i32 4, [[CALC_L_1_MUL1]]
+// CHECK: [[IV1_3_1:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK-NEXT: [[DIV_1:%.+]] = udiv i32 [[IV1_3_1]], 60
+// CHECK-NEXT: [[MUL_2:%.+]] = mul i32 [[DIV_1]], 60
+// CHECK-NEXT: [[ADD_3:%.+]] = sub i32 [[IV1_3]], [[MUL_2]]
+
+// CHECK: [[IV1_4:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_4_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_2:%.+]] = udiv i32 [[IV1_4_1]], 60
+// CHECK-NEXT: [[MUL_3:%.+]] = mul i32 [[DIV_2]], 60
+// CHECK-NEXT: [[SUB_6:%.+]] = sub i32 [[IV1_4]], [[MUL_3]]
+// CHECK-NEXT: [[DIV_3:%.+]] = udiv i32 [[SUB_6]], 20
+// CHECK-NEXT: [[MUL_4:%.+]] = mul i32 [[DIV_3]], 20
+// CHECK-NEXT: [[ADD_5:%.+]] = sub i32 [[ADD_3]], [[MUL_4]]
+// CHECK-NEXT: [[DIV_4:%.+]] = udiv i32 [[ADD_5]], 5
+// CHECK-NEXT: [[MUL_5:%.+]] = mul i32 [[DIV_4]], 1
+// CHECK-NEXT: [[ADD_6:%.+]] = add i32 3, [[MUL_5]]
+// CHECK-NEXT: store i32 [[ADD_6]], i32* [[LC_K:.+]]
+
+// CHECK: [[IV1_5:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK: [[IV1_5_1:%.+]] = load i32, i32* [[OMP_IV]]{{.+}}!llvm.mem.parallel_loop_access ![[COLL1_LOOP_ID]]
+// CHECK-NEXT: [[DIV_5:%.+]] = udiv i32 [[IV1_5_1]], 60
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[DIV_5]], 60
+// CHECK-NEXT: [[SUB_7:%.+]] = sub i32 [[IV1_5]], [[MUL_6]]
+
+// CHECK: [[IV1_6:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_6_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_6:%.+]] = udiv i32 [[IV1_6_1]], 60
+// CHECK-NEXT: [[MUL_7:%.+]] = mul i32 [[DIV_6]], 60
+// CHECK-NEXT: [[SUB_10:%.+]] = sub i32 [[IV1_6]], [[MUL_7]]
+// CHECK-NEXT: [[DIV_7:%.+]] = udiv i32 [[SUB_10]], 20
+// CHECK-NEXT: [[MUL_8:%.+]] = mul i32 [[DIV_7]], 20
+// CHECK-NEXT: [[SUB_11:%.+]] = sub i32 [[SUB_7]], [[MUL_8]]
+
+// CHECK: [[IV1_7:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_7_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_8:%.+]] = udiv i32 [[IV1_7_1]], 60
+// CHECK-NEXT: [[MUL_9:%.+]] = mul i32 [[DIV_8]], 60
+// CHECK-NEXT: [[SUB_12:%.+]] = sub i32 [[IV1_7]], [[MUL_9]]
+
+// CHECK: [[IV1_8:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_8_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_3:%.+]] = udiv i32 [[IV1_8_1]], 60
+// CHECK-NEXT: [[MUL_4:%.+]] = mul i32 [[DIV_3]], 60
+// CHECK-NEXT: [[SUB_7:%.+]] = sub i32 [[IV1_8]], [[MUL_4]]
+// CHECK-NEXT: [[DIV_4:%.+]] = udiv i32 [[SUB_7]], 20
+// CHECK-NEXT: [[MUL_5:%.+]] = mul i32 [[DIV_4]], 20
+// CHECK-NEXT: [[SUB_8:%.+]] = sub i32 [[SUB_12]], [[MUL_5]]
+// CHECK-NEXT: [[DIV_5:%.+]] = udiv i32 [[SUB_8]], 5
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[DIV_5]], 5
+// CHECK-NEXT: [[SUB_9:%.+]] = sub i32 [[SUB_11]], [[MUL_6]]
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[SUB_9]], 1
+// CHECK-NEXT: [[CALC_L_2:%.+]] = add i32 4, [[MUL_6]]
// CHECK-NEXT: [[CALC_L_3:%.+]] = trunc i32 [[CALC_L_2]] to i16
// CHECK-NEXT: store i16 [[CALC_L_3]], i16* [[LC_L:.+]]
// ... loop body ...
Index: test/OpenMP/parallel_for_simd_codegen.cpp
===================================================================
--- test/OpenMP/parallel_for_simd_codegen.cpp
+++ test/OpenMP/parallel_for_simd_codegen.cpp
@@ -513,22 +513,70 @@
// CHECK-NEXT: [[CALC_I_1_MUL1:%.+]] = mul i32 [[CALC_I_1]], 1
// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 1, [[CALC_I_1_MUL1]]
// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+
// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]
-// CHECK-NEXT: [[CALC_J_1:%.+]] = udiv i32 [[IV1_2]], 20
-// CHECK-NEXT: [[CALC_J_2:%.+]] = urem i32 [[CALC_J_1]], 3
+// CHECK: [[IV1_2_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_J_1:%.+]] = udiv i32 [[IV1_2_1]], 60
+// CHECK-NEXT: [[MUL_1:%.+]] = mul i32 [[CALC_J_1]], 60
+// CHECK-NEXT: [[SUB_3:%.+]] = sub i32 [[IV1_2]], [[MUL_1]]
+// CHECK-NEXT: [[CALC_J_2:%.+]] = udiv i32 [[SUB_3]], 20
// CHECK-NEXT: [[CALC_J_2_MUL1:%.+]] = mul i32 [[CALC_J_2]], 1
// CHECK-NEXT: [[CALC_J_3:%.+]] = add i32 2, [[CALC_J_2_MUL1]]
// CHECK-NEXT: store i32 [[CALC_J_3]], i32* [[LC_J:.+]]
+
// CHECK: [[IV1_3:%.+]] = load i32, i32* [[OMP_IV]]
-// CHECK-NEXT: [[CALC_K_1:%.+]] = udiv i32 [[IV1_3]], 5
-// CHECK-NEXT: [[CALC_K_2:%.+]] = urem i32 [[CALC_K_1]], 4
-// CHECK-NEXT: [[CALC_K_2_MUL1:%.+]] = mul i32 [[CALC_K_2]], 1
-// CHECK-NEXT: [[CALC_K_3:%.+]] = add i32 3, [[CALC_K_2_MUL1]]
-// CHECK-NEXT: store i32 [[CALC_K_3]], i32* [[LC_K:.+]]
+// CHECK: [[IV1_3_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_1:%.+]] = udiv i32 [[IV1_3_1]], 60
+// CHECK-NEXT: [[MUL_2:%.+]] = mul i32 [[DIV_1]], 60
+// CHECK-NEXT: [[ADD_3:%.+]] = sub i32 [[IV1_3]], [[MUL_2]]
+
// CHECK: [[IV1_4:%.+]] = load i32, i32* [[OMP_IV]]
-// CHECK-NEXT: [[CALC_L_1:%.+]] = urem i32 [[IV1_4]], 5
-// CHECK-NEXT: [[CALC_L_1_MUL1:%.+]] = mul i32 [[CALC_L_1]], 1
-// CHECK-NEXT: [[CALC_L_2:%.+]] = add i32 4, [[CALC_L_1_MUL1]]
+// CHECK: [[IV1_4_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_2:%.+]] = udiv i32 [[IV1_4_1]], 60
+// CHECK-NEXT: [[MUL_3:%.+]] = mul i32 [[DIV_2]], 60
+// CHECK-NEXT: [[SUB_6:%.+]] = sub i32 [[IV1_4]], [[MUL_3]]
+// CHECK-NEXT: [[DIV_3:%.+]] = udiv i32 [[SUB_6]], 20
+// CHECK-NEXT: [[MUL_4:%.+]] = mul i32 [[DIV_3]], 20
+// CHECK-NEXT: [[SUB_7:%.+]] = sub i32 [[ADD_3]], [[MUL_4]]
+// CHECK-NEXT: [[DIV_4:%.+]] = udiv i32 [[SUB_7]], 5
+// CHECK-NEXT: [[MUL_5:%.+]] = mul i32 [[DIV_4]], 1
+// CHECK-NEXT: [[ADD_6:%.+]] = add i32 3, [[MUL_5]]
+// CHECK-NEXT: store i32 [[ADD_6]], i32* [[LC_K:.+]]
+
+// CHECK: [[IV1_5:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_5_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_5:%.+]] = udiv i32 [[IV1_5_1]], 60
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[DIV_5]], 60
+// CHECK-NEXT: [[ADD_7:%.+]] = sub i32 [[IV1_5]], [[MUL_6]]
+
+// CHECK: [[IV1_6:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_6_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_6:%.+]] = udiv i32 [[IV1_6_1]], 60
+// CHECK-NEXT: [[MUL_7:%.+]] = mul i32 [[DIV_6]], 60
+// CHECK-NEXT: [[SUB_10:%.+]] = sub i32 [[IV1_6]], [[MUL_7]]
+// CHECK-NEXT: [[DIV_7:%.+]] = udiv i32 [[SUB_10]], 20
+// CHECK-NEXT: [[MUL_8:%.+]] = mul i32 [[DIV_7]], 20
+// CHECK-NEXT: [[ADD_9:%.+]] = sub i32 [[ADD_7]], [[MUL_8]]
+
+// CHECK: [[IV1_7:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_7_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_8:%.+]] = udiv i32 [[IV1_7_1]], 60
+// CHECK-NEXT: [[MUL_9:%.+]] = mul i32 [[DIV_8]], 60
+// CHECK-NEXT: [[ADD_10:%.+]] = sub i32 [[IV1_7]], [[MUL_9]]
+
+// CHECK: [[IV1_8:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_8_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_3:%.+]] = udiv i32 [[IV1_8_1]], 60
+// CHECK-NEXT: [[MUL_4:%.+]] = mul i32 [[DIV_3]], 60
+// CHECK-NEXT: [[SUB_7:%.+]] = sub i32 [[IV1_8]], [[MUL_4]]
+// CHECK-NEXT: [[DIV_4:%.+]] = udiv i32 [[SUB_7]], 20
+// CHECK-NEXT: [[MUL_5:%.+]] = mul i32 [[DIV_4]], 20
+// CHECK-NEXT: [[SUB_8:%.+]] = sub i32 [[ADD_10]], [[MUL_5]]
+// CHECK-NEXT: [[DIV_5:%.+]] = udiv i32 [[SUB_8]], 5
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[DIV_5]], 5
+// CHECK-NEXT: [[SUB_9:%.+]] = sub i32 [[ADD_9]], [[MUL_6]]
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[SUB_9]], 1
+// CHECK-NEXT: [[CALC_L_2:%.+]] = add i32 4, [[MUL_6]]
// CHECK-NEXT: [[CALC_L_3:%.+]] = trunc i32 [[CALC_L_2]] to i16
// CHECK-NEXT: store i16 [[CALC_L_3]], i16* [[LC_L:.+]]
// ... loop body ...
@@ -655,7 +703,10 @@
// CHECK-NEXT: [[I_2:%.+]] = trunc i64 [[I_1_ADD0]] to i32
// CHECK-NEXT: store i32 [[I_2]], i32*
// CHECK: [[IV2:%.+]] = load i64, i64* [[T1_OMP_IV]]
-// CHECK-NEXT: [[J_1:%.+]] = srem i64 [[IV2]], 4
+// CHECK: [[IV2_1:%.+]] = load i64, i64* [[T1_OMP_IV]]
+// CHECK-NEXT: [[DIV_1:%.+]] = sdiv i64 [[IV2_1]], 4
+// CHECK-NEXT: [[MUL_1:%.+]] = mul nsw i64 [[DIV_1]], 4
+// CHECK-NEXT: [[J_1:%.+]] = sub nsw i64 [[IV2]], [[MUL_1]]
// CHECK-NEXT: [[J_2:%.+]] = mul nsw i64 [[J_1]], 2
// CHECK-NEXT: [[J_2_ADD0:%.+]] = add nsw i64 0, [[J_2]]
// CHECK-NEXT: store i64 [[J_2_ADD0]], i64*
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -1,6 +1,7 @@
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
@@ -9,11 +10,12 @@
#define HEADER
// Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l58}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l65}}_exec_mode = weak constant i8 0
#define N 1000
#define M 10
@@ -80,7 +82,7 @@
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32(
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
@@ -214,16 +216,19 @@
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
-// CHECK: define internal void [[OUTL4]](
+// CHECK-32: define internal void [[OUTL4]](
+// CHECK-64: define internal void [[OUTL4]](
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
-// CHECK: define weak void @__omp_offloading_{{.*}}_l56(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{.*}})
+// CHECK: define weak void @__omp_offloading_{{.*}}_l58(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{.*}})
// CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}})
// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* dereferenceable{{.*}})
+// CHECK-DIV64: div i64
+// CHECK-DIV32-NO: div i64
-// CHECK: define weak void @__omp_offloading_{{.*}}_l63(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}})
+// CHECK: define weak void @__omp_offloading_{{.*}}_l65(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}})
// CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}})
// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{.*}})
Index: test/OpenMP/for_simd_codegen.cpp
===================================================================
--- test/OpenMP/for_simd_codegen.cpp
+++ test/OpenMP/for_simd_codegen.cpp
@@ -408,7 +408,10 @@
// CHECK-NEXT: [[I_2:%.+]] = trunc i64 [[I_1_ADD0]] to i32
// CHECK-NEXT: store i32 [[I_2]], i32*
// CHECK: [[IV2:%.+]] = load i64, i64* [[T1_OMP_IV]]
-// CHECK-NEXT: [[J_1:%.+]] = srem i64 [[IV2]], 4
+// CHECK-NEXT: [[IV2_1:%.+]] = load i64, i64* [[T1_OMP_IV]]
+// CHECK-NEXT: [[DIV_2:%.+]] = sdiv i64 [[IV2_1]], 4
+// CHECK-NEXT: [[MUL_2:%.+]] = mul nsw i64 [[DIV_2]], 4
+// CHECK-NEXT: [[J_1:%.+]] = sub nsw i64 [[IV2]], [[MUL_2]]
// CHECK-NEXT: [[J_2:%.+]] = mul nsw i64 [[J_1]], 2
// CHECK-NEXT: [[J_2_ADD0:%.+]] = add nsw i64 0, [[J_2]]
// CHECK-NEXT: store i64 [[J_2_ADD0]], i64*
@@ -556,22 +559,70 @@
// CHECK-NEXT: [[CALC_I_1_MUL1:%.+]] = mul i32 [[CALC_I_1]], 1
// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 1, [[CALC_I_1_MUL1]]
// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
+
// CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]
-// CHECK-NEXT: [[CALC_J_1:%.+]] = udiv i32 [[IV1_2]], 20
-// CHECK-NEXT: [[CALC_J_2:%.+]] = urem i32 [[CALC_J_1]], 3
+// CHECK: [[IV1_2_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[CALC_J_1:%.+]] = udiv i32 [[IV1_2_1]], 60
+// CHECK-NEXT: [[MUL_1:%.+]] = mul i32 [[CALC_J_1]], 60
+// CHECK-NEXT: [[SUB_3:%.+]] = sub i32 [[IV1_2]], [[MUL_1]]
+// CHECK-NEXT: [[CALC_J_2:%.+]] = udiv i32 [[SUB_3]], 20
// CHECK-NEXT: [[CALC_J_2_MUL1:%.+]] = mul i32 [[CALC_J_2]], 1
// CHECK-NEXT: [[CALC_J_3:%.+]] = add i32 2, [[CALC_J_2_MUL1]]
// CHECK-NEXT: store i32 [[CALC_J_3]], i32* [[LC_J:.+]]
+
// CHECK: [[IV1_3:%.+]] = load i32, i32* [[OMP_IV]]
-// CHECK-NEXT: [[CALC_K_1:%.+]] = udiv i32 [[IV1_3]], 5
-// CHECK-NEXT: [[CALC_K_2:%.+]] = urem i32 [[CALC_K_1]], 4
-// CHECK-NEXT: [[CALC_K_2_MUL1:%.+]] = mul i32 [[CALC_K_2]], 1
-// CHECK-NEXT: [[CALC_K_3:%.+]] = add i32 3, [[CALC_K_2_MUL1]]
-// CHECK-NEXT: store i32 [[CALC_K_3]], i32* [[LC_K:.+]]
+// CHECK: [[IV1_3_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_1:%.+]] = udiv i32 [[IV1_3_1]], 60
+// CHECK-NEXT: [[MUL_2:%.+]] = mul i32 [[DIV_1]], 60
+// CHECK-NEXT: [[ADD_3:%.+]] = sub i32 [[IV1_3]], [[MUL_2]]
+
// CHECK: [[IV1_4:%.+]] = load i32, i32* [[OMP_IV]]
-// CHECK-NEXT: [[CALC_L_1:%.+]] = urem i32 [[IV1_4]], 5
-// CHECK-NEXT: [[CALC_L_1_MUL1:%.+]] = mul i32 [[CALC_L_1]], 1
-// CHECK-NEXT: [[CALC_L_2:%.+]] = add i32 4, [[CALC_L_1_MUL1]]
+// CHECK: [[IV1_4_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_2:%.+]] = udiv i32 [[IV1_4_1]], 60
+// CHECK-NEXT: [[MUL_3:%.+]] = mul i32 [[DIV_2]], 60
+// CHECK-NEXT: [[SUB_6:%.+]] = sub i32 [[IV1_4]], [[MUL_3]]
+// CHECK-NEXT: [[DIV_3:%.+]] = udiv i32 [[SUB_6]], 20
+// CHECK-NEXT: [[MUL_4:%.+]] = mul i32 [[DIV_3]], 20
+// CHECK-NEXT: [[SUB_7:%.+]] = sub i32 [[ADD_3]], [[MUL_4]]
+// CHECK-NEXT: [[DIV_4:%.+]] = udiv i32 [[SUB_7]], 5
+// CHECK-NEXT: [[MUL_5:%.+]] = mul i32 [[DIV_4]], 1
+// CHECK-NEXT: [[ADD_6:%.+]] = add i32 3, [[MUL_5]]
+// CHECK-NEXT: store i32 [[ADD_6]], i32* [[LC_K:.+]]
+
+// CHECK: [[IV1_5:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_5_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_5:%.+]] = udiv i32 [[IV1_5_1]], 60
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[DIV_5]], 60
+// CHECK-NEXT: [[ADD_7:%.+]] = sub i32 [[IV1_5]], [[MUL_6]]
+
+// CHECK: [[IV1_6:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_6_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_6:%.+]] = udiv i32 [[IV1_6_1]], 60
+// CHECK-NEXT: [[MUL_7:%.+]] = mul i32 [[DIV_6]], 60
+// CHECK-NEXT: [[SUB_10:%.+]] = sub i32 [[IV1_6]], [[MUL_7]]
+// CHECK-NEXT: [[DIV_7:%.+]] = udiv i32 [[SUB_10]], 20
+// CHECK-NEXT: [[MUL_8:%.+]] = mul i32 [[DIV_7]], 20
+// CHECK-NEXT: [[ADD_9:%.+]] = sub i32 [[ADD_7]], [[MUL_8]]
+
+// CHECK: [[IV1_7:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_7_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_8:%.+]] = udiv i32 [[IV1_7_1]], 60
+// CHECK-NEXT: [[MUL_9:%.+]] = mul i32 [[DIV_8]], 60
+// CHECK-NEXT: [[ADD_10:%.+]] = sub i32 [[IV1_7]], [[MUL_9]]
+
+// CHECK: [[IV1_8:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK: [[IV1_8_1:%.+]] = load i32, i32* [[OMP_IV]]
+// CHECK-NEXT: [[DIV_3:%.+]] = udiv i32 [[IV1_8_1]], 60
+// CHECK-NEXT: [[MUL_4:%.+]] = mul i32 [[DIV_3]], 60
+// CHECK-NEXT: [[SUB_7:%.+]] = sub i32 [[IV1_8]], [[MUL_4]]
+// CHECK-NEXT: [[DIV_4:%.+]] = udiv i32 [[SUB_7]], 20
+// CHECK-NEXT: [[MUL_5:%.+]] = mul i32 [[DIV_4]], 20
+// CHECK-NEXT: [[SUB_8:%.+]] = sub i32 [[ADD_10]], [[MUL_5]]
+// CHECK-NEXT: [[DIV_5:%.+]] = udiv i32 [[SUB_8]], 5
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[DIV_5]], 5
+// CHECK-NEXT: [[SUB_9:%.+]] = sub i32 [[ADD_9]], [[MUL_6]]
+// CHECK-NEXT: [[MUL_6:%.+]] = mul i32 [[SUB_9]], 1
+// CHECK-NEXT: [[CALC_L_2:%.+]] = add i32 4, [[MUL_6]]
// CHECK-NEXT: [[CALC_L_3:%.+]] = trunc i32 [[CALC_L_2]] to i16
// CHECK-NEXT: store i16 [[CALC_L_3]], i16* [[LC_L:.+]]
// ... loop body ...
Index: test/OpenMP/for_codegen.cpp
===================================================================
--- test/OpenMP/for_codegen.cpp
+++ test/OpenMP/for_codegen.cpp
@@ -38,7 +38,6 @@
// LIFETIME: call void @llvm.lifetime.end
// LIFETIME: call void @llvm.lifetime.end
// LIFETIME: call void @llvm.lifetime.end
- // LIFETIME: call void @llvm.lifetime.end
#pragma omp for collapse(2)
for (int i = 0; i < 4; i++) {
for (int j = i; j < 4; j++) {
Index: lib/Sema/SemaOpenMP.cpp
===================================================================
--- lib/Sema/SemaOpenMP.cpp
+++ lib/Sema/SemaOpenMP.cpp
@@ -4066,7 +4066,7 @@
if (!TestIsLessOp.hasValue())
TestIsLessOp = IsConstPos || (IsUnsigned && !Subtract);
if (UB && (IsConstZero ||
- (TestIsLessOp.getValue() ?
+ (TestIsLessOp.getValue() ?
(IsConstNeg || (IsUnsigned && Subtract)) :
(IsConstPos || (IsUnsigned && !Subtract))))) {
SemaRef.Diag(NewStep->getExprLoc(),
@@ -4241,7 +4241,7 @@
Op == OO_Less || Op == OO_Greater, CE->getSourceRange(),
CE->getOperatorLoc());
break;
- case OO_ExclaimEqual:
+ case OO_ExclaimEqual:
return setUB(getInitLCDecl(CE->getArg(0)) == LCDecl ?
CE->getArg(1) : CE->getArg(0),
/*LessOp=*/llvm::None,
@@ -4499,7 +4499,7 @@
ExprResult CondExpr =
SemaRef.BuildBinOp(S, DefaultLoc,
- TestIsLessOp.getValue() ?
+ TestIsLessOp.getValue() ?
(TestIsStrictOp ? BO_LT : BO_LE) :
(TestIsStrictOp ? BO_GT : BO_GE),
NewLB.get(), NewUB.get());
@@ -5200,13 +5200,14 @@
// Choose either the 32-bit or 64-bit version.
ExprResult LastIteration = LastIteration64;
- if (LastIteration32.isUsable() &&
- C.getTypeSize(LastIteration32.get()->getType()) == 32 &&
- (AllCountsNeedLessThan32Bits || NestedLoopCount == 1 ||
- fitsInto(
- /*Bits=*/32,
- LastIteration32.get()->getType()->hasSignedIntegerRepresentation(),
- LastIteration64.get(), SemaRef)))
+ if (SemaRef.getLangOpts().OpenMPOptimisticCollapse ||
+ (LastIteration32.isUsable() &&
+ C.getTypeSize(LastIteration32.get()->getType()) == 32 &&
+ (AllCountsNeedLessThan32Bits || NestedLoopCount == 1 ||
+ fitsInto(
+ /*Bits=*/32,
+ LastIteration32.get()->getType()->hasSignedIntegerRepresentation(),
+ LastIteration64.get(), SemaRef))))
LastIteration = LastIteration32;
QualType VType = LastIteration.get()->getType();
QualType RealVType = VType;
@@ -5500,31 +5501,59 @@
Built.Updates.resize(NestedLoopCount);
Built.Finals.resize(NestedLoopCount);
{
- ExprResult Div;
- // Go from inner nested loop to outer.
- for (int Cnt = NestedLoopCount - 1; Cnt >= 0; --Cnt) {
+ // We implement the following algorithm for obtaining the
+ // original loop iteration variable values based on the
+ // value of the collapsed loop iteration variable IV.
+ //
+ // Let n+1 be the number of collapsed loops in the nest.
+ // Iteration variables (I0, I1, .... In)
+ // Iteration counts (N0, N1, ... Nn)
+ //
+ // Acc = IV;
+ //
+ // To compute Ik for loop k, 0 <= k <= n, generate:
+ // Prod = N(k+1) * N(k+2) * ... * Nn;
+ // Ik = Acc / Prod;
+ // Acc -= Ik * Prod;
+ //
+ ExprResult Acc = IV;
+ for (unsigned int Cnt = 0; Cnt < NestedLoopCount; ++Cnt) {
LoopIterationSpace &IS = IterSpaces[Cnt];
SourceLocation UpdLoc = IS.IncSrcRange.getBegin();
- // Build: Iter = (IV / Div) % IS.NumIters
- // where Div is product of previous iterations' IS.NumIters.
- ExprResult Iter;
- if (Div.isUsable()) {
- Iter =
- SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Div, IV.get(), Div.get());
- } else {
- Iter = IV;
- assert((Cnt == (int)NestedLoopCount - 1) &&
- "unusable div expected on first iteration only");
- }
-
- if (Cnt != 0 && Iter.isUsable())
- Iter = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Rem, Iter.get(),
- IS.NumIterations);
+ ExprResult Iter = IV;
+
+ // Compute prod
+ ExprResult Prod =
+ SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get();
+ for (unsigned int K = Cnt+1; K < NestedLoopCount; ++K)
+ Prod = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Mul, Prod.get(),
+ IterSpaces[K].NumIterations);
+
+ // Iter = Acc / Prod
+ // If there is at least one more inner loop to avoid
+ // multiplication by 1.
+ if (Cnt + 1 < NestedLoopCount)
+ Iter = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Div,
+ Acc.get(), Prod.get());
+ else
+ Iter = Acc;
if (!Iter.isUsable()) {
HasErrors = true;
break;
}
+ // Update Acc:
+ // Acc -= Iter * Prod
+ // Check if there is at least one more inner loop to avoid
+ // multiplication by 1.
+ if (Cnt + 1 < NestedLoopCount)
+ Prod = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Mul,
+ Iter.get(), Prod.get());
+ else
+ Prod = Iter;
+ Acc = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Sub,
+ Acc.get(), Prod.get());
+
// Build update: IS.CounterVar(Private) = IS.Start + Iter * IS.Step
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IS.CounterVar)->getDecl());
DeclRefExpr *CounterVar = buildDeclRefExpr(
@@ -5553,22 +5582,6 @@
break;
}
- // Build Div for the next iteration: Div <- Div * IS.NumIters
- if (Cnt != 0) {
- if (Div.isUnset())
- Div = IS.NumIterations;
- else
- Div = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Mul, Div.get(),
- IS.NumIterations);
-
- // Add parentheses (for debugging purposes only).
- if (Div.isUsable())
- Div = tryBuildCapture(SemaRef, Div.get(), Captures);
- if (!Div.isUsable()) {
- HasErrors = true;
- break;
- }
- }
if (!Update.isUsable() || !Final.isUsable()) {
HasErrors = true;
break;
@@ -12980,7 +12993,7 @@
continue;
}
assert(Count < OMPMapClause::NumberOfModifiers &&
- "Modifiers exceed the allowed number of map type modifiers");
+ "Modifiers exceed the allowed number of map type modifiers");
Modifiers[Count] = MapTypeModifiers[I];
ModifiersLoc[Count] = MapTypeModifiersLoc[I];
++Count;
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -2843,6 +2843,11 @@
Opts.OpenMPCUDABlocksPerSM, Diags);
}
+ // Prevent auto-widening the representation of loop counters during an
+ // OpenMP collapse clause.
+ Opts.OpenMPOptimisticCollapse =
+ Args.hasArg(options::OPT_fopenmp_optimistic_collapse) ? 1 : 0;
+
// Get the OpenMP target triples if any.
if (Arg *A = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) {
Index: lib/Driver/ToolChains/Clang.cpp
===================================================================
--- lib/Driver/ToolChains/Clang.cpp
+++ lib/Driver/ToolChains/Clang.cpp
@@ -4426,6 +4426,10 @@
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ);
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_number_of_sm_EQ);
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ);
+ if (Args.hasFlag(options::OPT_fopenmp_optimistic_collapse,
+ options::OPT_fno_openmp_optimistic_collapse,
+ /*Default=*/false))
+ CmdArgs.push_back("-fopenmp-optimistic-collapse");
// When in OpenMP offloading mode with NVPTX target, forward
// cuda-mode flag
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -1574,6 +1574,10 @@
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fopenmp_cuda_blocks_per_sm_EQ : Joined<["-"], "fopenmp-cuda-blocks-per-sm=">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
+def fopenmp_optimistic_collapse : Flag<["-"], "fopenmp-optimistic-collapse">, Group<f_Group>,
+ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
+def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group<f_Group>,
+ Flags<[NoArgumentUnused, HelpHidden]>;
def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>;
def fno_escaping_block_tail_calls : Flag<["-"], "fno-escaping-block-tail-calls">, Group<f_Group>, Flags<[CC1Option]>;
Index: include/clang/Basic/LangOptions.def
===================================================================
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -207,6 +207,7 @@
LANGOPT(OpenMPHostCXXExceptions , 1, 0, "C++ exceptions handling in the host code.")
LANGOPT(OpenMPCUDANumSMs , 32, 0, "Number of SMs for CUDA devices.")
LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.")
+LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(RenderScript , 1, 0, "RenderScript")
LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")
Index: docs/OpenMPSupport.rst
===================================================================
--- docs/OpenMPSupport.rst
+++ docs/OpenMPSupport.rst
@@ -109,6 +109,16 @@
between the threads and it is user responsibility to share the required data
between the threads in the parallel regions.
+Collapsed loop nest counter
+---------------------------
+
+When using the collapse clause on a loop nest the default behaviour is to
+automatically extend the representation of the loop counter to 64 bits for
+the cases where the sizes of the collapsed loops are not known at compile
+time. To prevent this conservative choice and use at most 32 bits,
+compile your program with the `-fopenmp-optimistic-collapse`.
+
+
Features not supported or with limited support for Cuda devices
---------------------------------------------------------------
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits