ivanrodriguez3753 created this revision.
ivanrodriguez3753 added a reviewer: OpenMP.
Herald added subscribers: pengfei, guansong, tpr, yaxunl.
Herald added a project: All.
ivanrodriguez3753 requested review of this revision.
Herald added a reviewer: jdoerfert.
Herald added subscribers: cfe-commits, jplehr, sstefan1.
Herald added a project: clang.

It seems that the OpenMP CodeGen is incorrectly generating a pointer for a size 
calculation on the combined entry of a partially mapped struct. Here is the 
reduced test case:

  scrubbed-user@scrubbed-server: cat reduced.cpp
  #include <omp.h>
  #include <cassert>
  #include <iostream>
  
  #define N 1000
  
  struct T {
    int dep_1[N];
    int dep_2[N];
  };
  
  using namespace std;
  int main() {
    #define SMALL 2
    T t;
    #pragma omp target map(tofrom: t.dep_1, t.dep_2[0:SMALL])
    {
      for (int i = 0; i < SMALL; i++) {
        t.dep_1[i] = 1;
        t.dep_2[i] = 1;
      }
    }
  
    for (int i = 0; i < SMALL; i++) {
      assert(t.dep_1[i] == 1);
      assert(t.dep_2[i] == 1);
    }
  }

Originally, we were mapping `t.dep_2[0:N]`, but I reduced to the smallest size 
that still breaks the runtime. We'll see why we need at least 2 in a second... 
Here is some output from the runtime library crashing

  scrubbed-user@scrubbed-server: 
/ptmp/scrubbed-user/llvm-project/build/bin/clang++ -I 
/ptmp/scrubbed-user/llvm-project/build/projects/openmp/runtime/src -L 
/ptmp/scrubbed-user/llvm-project/build/projects/openmp/libomptarget/ -fopenmp 
-fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa 
-march=gfx908 reduced.cpp -g
  scrubbed-user@scrubbed-server: LIBOMPTARGET_DEBUG=1 ./a.out # only including 
relevant output, run yourself for the full verbose debug messaging
  
  PluginInterface --> Entry point 0x0000000000000000 maps to 
__omp_offloading_4e_6ccfb3ae_main_l16 (0x000055b886d524d8)
  Libomptarget --> Entry  0: Base=0x00007ffd9cac727c, Begin=0x00007ffd9cac727c, 
Size=4004, Type=0x20, Name=unknown
  Libomptarget --> Entry  1: Base=0x00007ffd9cac727c, Begin=0x00007ffd9cac727c, 
Size=4000, Type=0x1000000000003, Name=unknown
  Libomptarget --> Entry  2: Base=0x00007ffd9cac727c, Begin=0x00007ffd9cac821c, 
Size=8, Type=0x1000000000003, Name=unknown
  
  a.out:237581 terminated with signal 6 at PC=7f409bf30c6b SP=7ffd9cac6a00.  
Backtrace:
  /lib64/libc.so.6(gsignal+0x10d)[0x7f409bf30c6b]
  /lib64/libc.so.6(abort+0x177)[0x7f409bf32305]
  
/ptmp/scrubbed-user/llvm-project/build/bin/../lib/libomptarget.so.18git(+0x7452c1)[0x7f409ca652c1]
  
/ptmp/scrubbed-user/llvm-project/build/bin/../lib/libomptarget.so.18git(+0x740fe6)[0x7f409ca60fe6]
  
/ptmp/scrubbed-user/llvm-project/build/bin/../lib/libomptarget.so.18git(__tgt_target_kernel+0xe5)[0x7f409ca60335]
  ./a.out(+0x3385)[0x55b8850d5385]
  /lib64/libc.so.6(__libc_start_main+0xef)[0x7f409bf1b24d]
  ./a.out(+0x312a)[0x55b8850d512a]

If my understanding is correct, the combined entry should have a size equal to 
the highest pointer minus the lowest pointer (in the most ideal scenario). I'm 
not sure if upstream clang uses a tight or loose bounding box for the combined 
entry, but in any case, it's wrong. It should be either 4008 or 8000, depending 
on whether we are being clever or not.

Running in GDB:

  scrubbed-user@scrubbed-server: gdb a.out
  (gdb) r
  Starting program: 
/cray/css/users/scrubbed-user/sandbox/test_cleanup/target_depend_lvalue_01/reduced/upstream_build/a.out
 
  Missing separate debuginfos, use: zypper install 
glibc-debuginfo-2.31-150300.46.1.x86_64
  [Thread debugging using libthread_db enabled]
  Using host libthread_db library "/lib64/libthread_db.so.1".
  [New Thread 0x7fffceb2c700 (LWP 247765)]
  [New Thread 0x7ffece1ff700 (LWP 247766)]
  [Thread 0x7ffece1ff700 (LWP 247766) exited]
  Libomptarget message: explicit extension not allowed: host address specified 
is 0x00007fffffff786c (8 bytes), but device allocation maps to host at 
0x00007fffffff68cc (4004 bytes)
  Libomptarget error: Call to getTargetPointer returned null pointer (device 
failure or illegal mapping).
  Libomptarget error: Call to targetDataBegin failed, abort target.
  Libomptarget error: Failed to process data before launching the kernel.
  Libomptarget error: Consult https://openmp.llvm.org/design/Runtimes.html for 
debugging options.
  reduced.cpp:16:3: Libomptarget fatal error 1: failure of target construct 
while offloading is mandatory
  
  Thread 1 "a.out" received signal SIGABRT, Aborted.
  0x00007ffff62ccc6b in raise () from /lib64/libc.so.6
  Missing separate debuginfos, use: zypper install 
comgr5.5.0-debuginfo-2.5.0.50500-sles153.63.x86_64 
hip-runtime-amd5.5.0-debuginfo-5.5.30201.50500-sles153.63.x86_64 
hsa-rocr5.5.0-debuginfo-1.8.0.50500-sles153.63.x86_64 
libatomic1-debuginfo-12.2.1+git416-150000.1.7.1.x86_64 
libdrm2-debuginfo-2.4.107-150400.1.8.x86_64 
libdrm_amdgpu1-debuginfo-2.4.107-150400.1.8.x86_64 
libefa1-debuginfo-38.1-150400.4.6.x86_64 
libelf1-debuginfo-0.185-150400.5.3.1.x86_64 
libfabric1-debuginfo-1.13.2-150400.1.73.x86_64 
libffi7-debuginfo-3.2.1.git259-10.8.x86_64 
libgcc_s1-debuginfo-12.2.1+git416-150000.1.7.1.x86_64 
libibverbs1-debuginfo-38.1-150400.4.6.x86_64 
libinfinipath4-debuginfo-3.3-5.3.1.x86_64 libjansson4-debuginfo-2.9-1.24.x86_64 
libncurses6-debuginfo-6.1-150000.5.12.1.x86_64 
libnl3-200-debuginfo-3.3.0-1.29.x86_64 
libnuma1-debuginfo-2.0.14.20.g4ee5e0c-150400.1.24.x86_64 
libpsm_infinipath1-debuginfo-3.3-5.3.1.x86_64 
librdmacm1-debuginfo-38.1-150400.4.6.x86_64 
libstdc++6-debuginfo-12.2.1+git416-150000.1.7.1.x86_64 
libuuid1-debuginfo-2.37.2-150400.8.14.1.x86_64 
libz1-debuginfo-1.2.11-150000.3.39.1.x86_64
  (gdb) info stack
  #0  0x00007ffff62ccc6b in raise () from /lib64/libc.so.6
  #1  0x00007ffff62ce305 in abort () from /lib64/libc.so.6
  #2  0x00007ffff6e012c1 in handleTargetOutcome (Success=false, 
Loc=0x55555555bc18) at 
/ptmp/scrubbed-user/llvm-project/openmp/libomptarget/src/omptarget.cpp:303
  #3  0x00007ffff6dfcfe6 in targetKernel<AsyncInfoTy> (Loc=0x55555555bc18, 
DeviceId=0, NumTeams=1, ThreadLimit=0, HostPtr=0x555555559320 
<.__omp_offloading_4e_6ccfb3ae_main_l16.region_id>, 
      KernelArgs=0x7fffffff67f8) at 
/ptmp/scrubbed-user/llvm-project/openmp/libomptarget/src/interface.cpp:308
  #4  0x00007ffff6dfc335 in __tgt_target_kernel (Loc=0x55555555bc18, 
DeviceId=-1, NumTeams=-1, ThreadLimit=0, HostPtr=0x555555559320 
<.__omp_offloading_4e_6ccfb3ae_main_l16.region_id>, 
      KernelArgs=0x7fffffff67f8) at 
/ptmp/scrubbed-user/llvm-project/openmp/libomptarget/src/interface.cpp:333
  #5  0x000055555555a8e8 in main () at reduced.cpp:16
  (gdb) s
  Single stepping until exit from function raise,
  which has no line number information.
  
  a.out:245985 terminated with signal 6 at PC=7ffff62ccc6b SP=7fffffff6050.  
Backtrace:
  /lib64/libc.so.6(gsignal+0x10d)[0x7ffff62ccc6b]
  /lib64/libc.so.6(abort+0x177)[0x7ffff62ce305]
  
/ptmp/scrubbed-user/llvm-project/build/bin/../lib/libomptarget.so.18git(+0x7452c1)[0x7ffff6e012c1]
  
/ptmp/scrubbed-user/llvm-project/build/bin/../lib/libomptarget.so.18git(+0x740fe6)[0x7ffff6dfcfe6]
  
/ptmp/scrubbed-user/llvm-project/build/bin/../lib/libomptarget.so.18git(__tgt_target_kernel+0xe5)[0x7ffff6dfc335]
  
/cray/css/users/scrubbed-user/sandbox/test_cleanup/target_depend_lvalue_01/reduced/upstream_build/a.out(+0x68e8)[0x55555555a8e8]
  /lib64/libc.so.6(__libc_start_main+0xef)[0x7ffff62b724d]
  
/cray/css/users/scrubbed-user/sandbox/test_cleanup/target_depend_lvalue_01/reduced/upstream_build/a.out(+0x668a)[0x55555555a68a]
  [Thread 0x7fffceb2c700 (LWP 247765) exited]
  [Inferior 1 (process 245985) exited with code 01]

Running again except changing the combined entry size to 4008, note the process 
exits normally

  (gdb) b 
/ptmp/scrubbed-user/llvm-project/openmp/libomptarget/src/interface.cpp:
  
  malformed linespec error: unexpected end of input
  (gdb) b 
/ptmp/scrubbed-user/llvm-project/openmp/libomptarget/src/interface.cpp:329
  Breakpoint 1 at 0x7ffff6dfc2d7: file 
/ptmp/scrubbed-user/llvm-project/openmp/libomptarget/src/interface.cpp, line 
329.
  (gdb) r
  Starting program: 
/cray/css/users/scrubbed-user/sandbox/test_cleanup/target_depend_lvalue_01/reduced/upstream_build/a.out
 
  [Thread debugging using libthread_db enabled]
  Using host libthread_db library "/lib64/libthread_db.so.1".
  [New Thread 0x7fffceb2c700 (LWP 261095)]
  [New Thread 0x7ffece1ff700 (LWP 261096)]
  [Thread 0x7ffece1ff700 (LWP 261096) exited]
  
  Thread 1 "a.out" hit Breakpoint 1, __tgt_target_kernel (Loc=0x55555555bc18, 
DeviceId=-1, NumTeams=-1, ThreadLimit=0, HostPtr=0x555555559320 
<.__omp_offloading_4e_6ccfb3ae_main_l16.region_id>, KernelArgs=0x7fffffff67f8) 
at /ptmp/scrubbed-user/llvm-project/openmp/libomptarget/src/interface.cpp:329
  329       if (KernelArgs->Flags.NoWait)
  (gdb) p KernelArgs->ArgSizes[0]
  $1 = 4004
  (gdb) set KernelArgs->ArgSizes[0]=4008
  (gdb) p KernelArgs->ArgSizes[0]
  $2 = 4008
  (gdb) c
  Continuing.
  [Thread 0x7fffceb2c700 (LWP 261095) exited]
  [Inferior 1 (process 259669) exited normally]

So, it looks like the frontend is generating a size incorrectly, since it works 
when we hack via gdb to give it the size we think it should be.

As an additional data point, Cray's compiler (which I have access to because I 
work here) is failing with a different but more or less equivalent error 
message from our OpenMP offloading runtime (`CRAY_ACC_DEBUG` is a user facing 
debug flag similar to upstream llvm's `LIBOMPTARGET_DEBUG`):

  scrubbed-user@scrubbed-server: cc -fopenmp ../reduced.cpp
  scrubbed-user@scrubbed-server: CRAY_ACC_DEBUG=2 ./a.out
  ACC: Version 5.0 of HIP already initialized, runtime version 50530201
  ACC: Get Device 0
  ACC: Set Thread Context
  ACC: Start transfer 3 items from reduced.cpp:16
  ACC:       allocate 'unknown' (4004 bytes)
  ACC:       member, copy to acc 't.dep_1' (4000 bytes)
  ACC: libcrayacc/acc_present.c:679 CRAY_ACC_ERROR - Host region (7ffe9957034c 
to 7ffe99570354) overlaps present region (7ffe9956f3ac to 7ffe99570350 index 0) 
but is not contained for 't.dep_2[0:2]' from reduced.cpp:16
  scrubbed-user@scrubbed-server: CRAY_ACC_DEBUG=3 ./a.out
  ACC: __tgt_register_requires: flags = NONE
  ACC: __tgt_register_lib
  ACC:   NumDeviceImages=1
  ACC:   Device Images:
  ACC:   Image location: 0x200c52 - 0x201fd2
  ACC:   Processing valid image
  ACC:   NumEntries=1
  ACC:   Image entries:
  ACC:   __omp_offloading_4e_6ccfb3ae_main_l16
  ACC:     {
  ACC:         addr=0x200ac0
  ACC:         size=0
  ACC:         flags=0
  ACC:     }
  ACC:   NumHostEntries=1
  ACC:   Host entries:
  ACC:   __omp_offloading_4e_6ccfb3ae_main_l16
  ACC:     {
  ACC:         addr=0x200ac0
  ACC:         size=0
  ACC:         flags=0
  ACC:     }
  ACC: __tgt_target_kernel(device_id=-1, host_ptr=0x200ac0, arg_num=3)
  ACC: __internal_tgt_target_teams(device_id=-1, host_ptr=0x200ac0, arg_num=3, 
num_teams=1, thread_limit=0)
  ACC: Version 5.0 of HIP already initialized, runtime version 50530201
  ACC: Get Device 0
  ACC: Compute level 9.0
  ACC: Device Name: 
  ACC: Number of cus 120
  ACC: Device name 
  ACC: AMD GCN arch name: gfx908:sramecc+:xnack-
  ACC: Max shared memory 65536
  ACC: Max thread blocks per cu 8
  ACC: Max concurrent kernels 8
  ACC: Async table size 8
  ACC: Total GPU memory 34342961152
  ACC: Available GPU memory 34309406720
  ACC: Set Thread Context
  ACC: Establish link bewteen libcrayacc and libcraymp
  ACC:   libcrayacc interface v6
  ACC:    libcraymp interface v6
  ACC:    loading module data
  ACC: __internal_tgt_target_teams(device_id=-1, host_ptr=0x200ac0, arg_num=3, 
num_teams=1, thread_limit=1)
  ACC:   [0] 0x7ffd3634543c base 0x7ffd3634543c begin 0x7ffd3634543c : 4004 
bytes type=0x20 (TARGET_PARAM) name (unknown)
  ACC:   [1] 0x7ffd3634543c base 0x7ffd3634543c begin 0x7ffd3634543c : 4000 
bytes type=0x1000000000003 (TO FROM MEMBER_OF) name (t.dep_1)
  ACC:   [2] 0x7ffd363463dc base 0x7ffd3634543c begin 0x7ffd363463dc : 8 bytes 
type=0x1000000000003 (TO FROM MEMBER_OF) name (t.dep_2[0:2])
  ACC: Start transfer 3 items from reduced.cpp:16
  ACC:   flags: NEED_POST_PHASE
  ACC: 
  ACC:   Transfer Phase
  ACC:   Trans 1
  ACC:       Simple transfer of 'unknown' (4004 bytes)
  ACC:            host ptr 7ffd3634543c
  ACC:            acc  ptr 0
  ACC:            flags: ALLOCATE ACQ_PRESENT REG_PRESENT
  ACC:            memory not found in present table
  ACC:            allocate (4004 bytes)
  ACC:              get new reusable memory, added entry
  ACC:            new allocated ptr (7fb81a200000)
  ACC:            add to present table index 0: host 7ffd3634543c to 
7ffd363463e0, acc 7fb81a200000
  ACC:            new acc ptr 7fb81a200000
  ACC: 
  ACC:   Trans 2
  ACC:   Trans 3
  ACC:   Post Transfer Phase
  ACC:   Trans 1
  ACC:   Trans 2
  ACC:       Simple transfer of 't.dep_1' (4000 bytes)
  ACC:            host ptr 7ffd3634543c
  ACC:            acc  ptr 0
  ACC:            flags: COPY_HOST_TO_ACC REG_PRESENT DIR_MEMBER_UPDATE
  ACC:            host region 7ffd3634543c to 7ffd363463dc found in present 
table index 0 (ref count 1)
  ACC:            copy host to acc (7ffd3634543c to 7fb81a200000)
  ACC:                internal copy host to acc (host 7ffd3634543c to acc 
7fb81a200000) size = 4000
  ACC: 
  ACC:   Trans 3
  ACC:       Simple transfer of 't.dep_2[0:2]' (8 bytes)
  ACC:            host ptr 7ffd363463dc
  ACC:            acc  ptr 0
  ACC:            flags: COPY_HOST_TO_ACC REG_PRESENT DIR_MEMBER_UPDATE
  ACC: libcrayacc/acc_present.c:679 CRAY_ACC_ERROR - Host region (7ffd363463dc 
to 7ffd363463e4) overlaps present region (7ffd3634543c to 7ffd363463e0 index 0) 
but is not contained for 't.dep_2[0:2]' from reduced.cpp:16
  ACC: __tgt_unregister_lib
  ACC: Start executing pending destructors

The same gdb trick works using the executable generated by Cray's compiler.

Let's change the reduced test case to map `t.dep_2[0:N]`, and compare to a 
working test case. This working test case is identical except it maps all of 
t.dep_2, with no slice. 
The following are snippets from `-S -emit-llvm`, from the broken and working 
cases respectively:

broken:

  define dso_local noundef i32 @main() #4 !dbg !929 {
  entry:
    %retval = alloca i32, align 4
    %t = alloca %struct.T, align 4
    %.offload_baseptrs = alloca [3 x ptr], align 8
    %.offload_ptrs = alloca [3 x ptr], align 8
    %.offload_mappers = alloca [3 x ptr], align 8
    %.offload_sizes = alloca [3 x i64], align 8
    %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
    %i = alloca i32, align 4
    store i32 0, ptr %retval, align 4
    call void @llvm.dbg.declare(metadata ptr %t, metadata !930, metadata 
!DIExpression()), !dbg !938
    %dep_1 = getelementptr inbounds %struct.T, ptr %t, i32 0, i32 0, !dbg !939
    %dep_2 = getelementptr inbounds %struct.T, ptr %t, i32 0, i32 1, !dbg !941
    %arrayidx = getelementptr inbounds [1000 x i32], ptr %dep_2, i64 0, i64 0, 
!dbg !942
    %0 = getelementptr i32, ptr %arrayidx, i32 1, !dbg !943
    %1 = ptrtoint ptr %0 to i64, !dbg !943
    %2 = ptrtoint ptr %dep_1 to i64, !dbg !943
    %3 = sub i64 %1, %2, !dbg !943
    %4 = sdiv exact i64 %3, ptrtoint (ptr getelementptr (i8, ptr null, i32 1) 
to i64), !dbg !943
    call void @llvm.memcpy.p0.p0.i64(ptr align 8 %.offload_sizes, ptr align 8 
@.offload_sizes, i64 24, i1 false)

working:

  define dso_local noundef i32 @main() #4 !dbg !929 {
  entry:
    %retval = alloca i32, align 4
    %t = alloca %struct.T, align 4
    %.offload_baseptrs = alloca [3 x ptr], align 8
    %.offload_ptrs = alloca [3 x ptr], align 8
    %.offload_mappers = alloca [3 x ptr], align 8
    %.offload_sizes = alloca [3 x i64], align 8
    %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
    %i = alloca i32, align 4
    store i32 0, ptr %retval, align 4
    call void @llvm.dbg.declare(metadata ptr %t, metadata !930, metadata 
!DIExpression()), !dbg !938
    %dep_1 = getelementptr inbounds %struct.T, ptr %t, i32 0, i32 0, !dbg !939
    %dep_2 = getelementptr inbounds %struct.T, ptr %t, i32 0, i32 1, !dbg !941
    %0 = getelementptr [1000 x i32], ptr %dep_2, i32 1, !dbg !942
    %1 = ptrtoint ptr %0 to i64, !dbg !942
    %2 = ptrtoint ptr %dep_1 to i64, !dbg !942
    %3 = sub i64 %1, %2, !dbg !942
    %4 = sdiv exact i64 %3, ptrtoint (ptr getelementptr (i8, ptr null, i32 1) 
to i64), !dbg !942
    call void @llvm.memcpy.p0.p0.i64(ptr align 8 %.offload_sizes, ptr align 8 
@.offload_sizes, i64 24, i1 false)

It's a little subtle but the key is in the high pointer used for the pointer 
difference. The type of `getelementptr` used for the broken test case is an 
i32, while in the working test case it is an [1000 x i32]. In the context of 
our test case, this explains the 4004 byte size (as opposed to 4008 or 8000, 
again depending on whether or not we're being clever with our bounding box).

In `CGOpenMPRuntime.cpp`, both test cases go through

  } else {
    LowestElem = LB =
        CGF.EmitOMPSharedLValue(I->getAssociatedExpression())
            .getAddress(CGF);
  }

in `generateInfoForComponentList`. `EmitOMPSharedLValue` seems like it'll 
handle an arbitrarily long list of components like `a.b.c.ptr->whatever`, but 
it will return the last component it generated. In our case, it is the array 
slice.

`LowestElem` is later copied over to `HighestElem`, and `PartialStruct` is 
updated. It really seems like `StructRangeInfoTy` is only meant to hold DIRECT 
struct members, because the high pointer is emitted with a hardcoded GEP 
instruction of offset 1, `CreateConstGEP1_32`, in `emitCombinedEntry`:

  // Size is (addr of {highest+1} element) - (addr of lowest element)
  llvm::Value *HB = HBAddr.getPointer();
  llvm::Value *HAddr = CGF.Builder.CreateConstGEP1_32(
      HBAddr.getElementType(), HB, /*Idx0=*/1);
  llvm::Value *CLAddr = CGF.Builder.CreatePointerCast(LB, CGF.VoidPtrTy);
  llvm::Value *CHAddr = CGF.Builder.CreatePointerCast(HAddr, CGF.VoidPtrTy);
  llvm::Value *Diff = CGF.Builder.CreatePtrDiff(CGF.Int8Ty, CHAddr, CLAddr);
  llvm::Value *Size = CGF.Builder.CreateIntCast(Diff, CGF.Int64Ty,
                                                /*isSigned=*/false);
  CombinedInfo.Sizes.push_back(Size);

This PR addresses that be going backwards in the component list until we get to 
the second to last component (as in a direct member of the struct in question). 
It fixes the broken test case but breaks quite a few tests. Here's 
`check-clang-openmp` before and after this PR:

  Unsupported:   12
  Passed     : 1354



  Unsupported:   12
  Passed     : 1334
  Failed     :   20

I'm very unexperienced with clang's frontend codegen and was hoping for some 
pointers, as well as opinions about the broken test case. At the very least, 
this serves as a bug report. Any misunderstandings on my part, or missing 
context?

Some question:

1. Can someone confirm or deny that `PartialStruct` is supposed to only hold 
direct members?
2. Can the while loop rely on the GEP instruction dyn_cast?
3. If this idea of a solution is appropriate, should it be implemented as I 
did, or when the pointer is created, as where I left the comment, or a change 
to PartialStruct to keep track of or differentiate between direct members and 
transitive members? Maybe we could instead use the HighestElem plus its offset, 
instead of the `CreateConstGEP1_32`?


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D158559

Files:
  clang/lib/CodeGen/CGOpenMPRuntime.cpp


Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -7435,6 +7435,9 @@
           LowestElem = LB =
               CGF.EmitOMPSharedLValue(I->getAssociatedExpression())
                   .getAddress(CGF);
+          // Seems like changing here doesn't reflect on the @.offload_sizes 
entry,
+          // while changing it in  emitCombinedEntry does update the sizes 
array.
+          // I also couldn't get the full test case to working when trying to 
change here
         }
 
         // If this component is a pointer inside the base struct then we don't
@@ -8382,8 +8385,29 @@
       CombinedInfo.Pointers.push_back(LB);
       // Size is (addr of {highest+1} element) - (addr of lowest element)
       llvm::Value *HB = HBAddr.getPointer();
-      llvm::Value *HAddr = CGF.Builder.CreateConstGEP1_32(
-          HBAddr.getElementType(), HB, /*Idx0=*/1);
+      llvm::Value *HAddr;
+
+      if(HBAddr.getElementType() == PartialStruct.Base.getElementType()) {
+        HAddr = CGF.Builder.CreateConstGEP1_32(HBAddr.getElementType(), HB, 
/*Idx0=*/1);
+      }
+      else
+      {
+        //fixup the last pointer if it's not a direct struct member
+        llvm::Instruction* Instr = &CGF.Builder.GetInsertBlock()->back();
+        // we want to stop at the GEP that uses the base pointer as its
+        // source. Can we safely assume that we can go off of the boolean 
result of the cast?
+        // In other words, are we certain that all the members were generated 
in the IR
+        // using a GEP instruction?
+        llvm::Instruction* cur_inst = Instr;
+        llvm::Instruction* last_inst; 
+        while(dyn_cast<llvm::GetElementPtrInst>(cur_inst)) {
+          last_inst = cur_inst;
+          cur_inst = 
cast<llvm::Instruction>(cast<llvm::GetElementPtrInst>(cur_inst)->getOperand(0));
+        }
+        HAddr = CGF.Builder.CreateConstGEP1_32(
+          cast<llvm::GetElementPtrInst>(last_inst)->getSourceElementType(), 
cast<llvm::Value>(last_inst), /*Idx0=*/1);
+      }      
+
       llvm::Value *CLAddr = CGF.Builder.CreatePointerCast(LB, CGF.VoidPtrTy);
       llvm::Value *CHAddr = CGF.Builder.CreatePointerCast(HAddr, 
CGF.VoidPtrTy);
       llvm::Value *Diff = CGF.Builder.CreatePtrDiff(CGF.Int8Ty, CHAddr, 
CLAddr);


Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -7435,6 +7435,9 @@
           LowestElem = LB =
               CGF.EmitOMPSharedLValue(I->getAssociatedExpression())
                   .getAddress(CGF);
+          // Seems like changing here doesn't reflect on the @.offload_sizes entry,
+          // while changing it in  emitCombinedEntry does update the sizes array.
+          // I also couldn't get the full test case to working when trying to change here
         }
 
         // If this component is a pointer inside the base struct then we don't
@@ -8382,8 +8385,29 @@
       CombinedInfo.Pointers.push_back(LB);
       // Size is (addr of {highest+1} element) - (addr of lowest element)
       llvm::Value *HB = HBAddr.getPointer();
-      llvm::Value *HAddr = CGF.Builder.CreateConstGEP1_32(
-          HBAddr.getElementType(), HB, /*Idx0=*/1);
+      llvm::Value *HAddr;
+
+      if(HBAddr.getElementType() == PartialStruct.Base.getElementType()) {
+        HAddr = CGF.Builder.CreateConstGEP1_32(HBAddr.getElementType(), HB, /*Idx0=*/1);
+      }
+      else
+      {
+        //fixup the last pointer if it's not a direct struct member
+        llvm::Instruction* Instr = &CGF.Builder.GetInsertBlock()->back();
+        // we want to stop at the GEP that uses the base pointer as its
+        // source. Can we safely assume that we can go off of the boolean result of the cast?
+        // In other words, are we certain that all the members were generated in the IR
+        // using a GEP instruction?
+        llvm::Instruction* cur_inst = Instr;
+        llvm::Instruction* last_inst; 
+        while(dyn_cast<llvm::GetElementPtrInst>(cur_inst)) {
+          last_inst = cur_inst;
+          cur_inst = cast<llvm::Instruction>(cast<llvm::GetElementPtrInst>(cur_inst)->getOperand(0));
+        }
+        HAddr = CGF.Builder.CreateConstGEP1_32(
+          cast<llvm::GetElementPtrInst>(last_inst)->getSourceElementType(), cast<llvm::Value>(last_inst), /*Idx0=*/1);
+      }      
+
       llvm::Value *CLAddr = CGF.Builder.CreatePointerCast(LB, CGF.VoidPtrTy);
       llvm::Value *CHAddr = CGF.Builder.CreatePointerCast(HAddr, CGF.VoidPtrTy);
       llvm::Value *Diff = CGF.Builder.CreatePtrDiff(CGF.Int8Ty, CHAddr, CLAddr);
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to