yaxunl created this revision.
yaxunl added a reviewer: tra.
Herald added subscribers: kerbowa, mgorny, nhaehnle, jvesely.

To support std::complex and some other standard C/C++ functions in HIP device 
code,
they need to be forced to be `__host__ __device__` functions by pragmas. This 
is done
by some clang standard C++ wrapper headers which are shared between cuda-clang 
and hip-Clang.

For these standard C++ wapper headers to work properly, specific include path 
order
has to be enforced:

1. clang C++ wrapper include path
2. standard C++ include path
3. clang include path

Also, these C++ wrapper headers require device version of some standard C/C++ 
functions
must be declared before including them. This needs to be done by including a 
default
header which declares or defines these device functions. The default header is 
always
included before any other headers are included by users.

This patch adds the the default header and include path for HIP.


https://reviews.llvm.org/D81176

Files:
  clang/include/clang/Basic/DiagnosticDriverKinds.td
  clang/include/clang/Driver/Options.td
  clang/include/clang/Driver/ToolChain.h
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/AMDGPU.cpp
  clang/lib/Driver/ToolChains/AMDGPU.h
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/lib/Driver/ToolChains/Gnu.cpp
  clang/lib/Driver/ToolChains/Gnu.h
  clang/lib/Driver/ToolChains/HIP.cpp
  clang/lib/Driver/ToolChains/HIP.h
  clang/lib/Driver/ToolChains/Linux.cpp
  clang/lib/Driver/ToolChains/Linux.h
  clang/lib/Driver/ToolChains/MSVC.cpp
  clang/lib/Driver/ToolChains/MSVC.h
  clang/lib/Driver/ToolChains/ROCm.h
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/__clang_cuda_math_forward_declares.h
  clang/lib/Headers/__clang_hip_libdevice_declares.h
  clang/lib/Headers/__clang_hip_math.h
  clang/lib/Headers/__clang_hip_runtime_wrapper.h
  clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/hip.bc
  clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ockl.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
  clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_off.bc
  clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_on.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_off.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_on.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1010.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1011.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1012.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_803.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_900.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_off.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_on.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
  
clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
  clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ocml.bc
  clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/opencl.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/hip.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ockl.bc
  
clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_on.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_off.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_on.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1010.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1011.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1012.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_803.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_900.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_on.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ocml.bc
  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/opencl.bc
  clang/test/Driver/Inputs/rocm/include/hip/hip_runtime.h
  clang/test/Driver/hip-device-libs.hip
  clang/test/Driver/hip-include-path.hip
  clang/test/Driver/rocm-detect.cl
  clang/test/Driver/rocm-detect.hip
  clang/test/Driver/rocm-device-libs.cl
  clang/test/Driver/rocm-not-found.cl

Index: clang/test/Driver/rocm-not-found.cl
===================================================================
--- clang/test/Driver/rocm-not-found.cl
+++ clang/test/Driver/rocm-not-found.cl
@@ -5,7 +5,7 @@
 
 // RUN: %clang -### --sysroot=%s/no-rocm-there -target amdgcn--amdhsa %s 2>&1 | FileCheck %s --check-prefix ERR
 // RUN: %clang -### --rocm-path=%s/no-rocm-there -target amdgcn--amdhsa %s 2>&1 | FileCheck %s --check-prefix ERR
-// ERR: cannot find ROCm installation. Provide its path via --rocm-path, or pass -nogpulib.
+// ERR: cannot find ROCm installation. Provide its path via --rocm-path, or pass -nogpulib and -nogpuinc to build without ROCm device library and HIP includes.
 
 // Accept nogpulib or nostdlib for OpenCL.
 // RUN: %clang -### -nogpulib --rocm-path=%s/no-rocm-there %s 2>&1 | FileCheck %s --check-prefix OK
Index: clang/test/Driver/rocm-device-libs.cl
===================================================================
--- clang/test/Driver/rocm-device-libs.cl
+++ clang/test/Driver/rocm-device-libs.cl
@@ -6,7 +6,7 @@
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
@@ -15,7 +15,7 @@
 // Make sure the different denormal default is respected for gfx8
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX803-DEFAULT,GFX803,WAVE64 %s
 
@@ -24,7 +24,7 @@
 // Make sure the non-canonical name works
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=fiji \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX803-DEFAULT,GFX803,WAVE64 %s
 
@@ -33,7 +33,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   -cl-denorms-are-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DAZ,GFX900,WAVE64 %s
 
@@ -41,7 +41,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
 // RUN:   -cl-denorms-are-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DAZ,GFX803,WAVE64 %s
 
@@ -50,7 +50,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
 // RUN:   -cl-finite-math-only \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-FINITE-ONLY,GFX803,WAVE64 %s
 
@@ -59,7 +59,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-fp32-correctly-rounded-divide-sqrt \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-CORRECT-SQRT,GFX803,WAVE64 %s
 
@@ -68,7 +68,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-fast-relaxed-math \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-FAST-RELAXED,GFX803,WAVE64 %s
 
@@ -77,45 +77,45 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-unsafe-math-optimizations \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-UNSAFE,GFX803,WAVE64 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010                    \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE32 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1011                    \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1011,WAVE32 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1012                    \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1012,WAVE32 %s
 
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010 -mwavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE64 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010 -mwavefrontsize64 -mno-wavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE32 %s
 
 // Ignore -mno-wavefrontsize64 without wave32 support
 // RUN: %clang -### -target amdgcn-amd-amdhsa       \
 // RUN:   -x cl -mcpu=gfx803  -mno-wavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs    \
+// RUN:   --rocm-path=%S/Inputs/rocm    \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX803,WAVE64 %s
 
@@ -124,12 +124,12 @@
 // Test --hip-device-lib-path format
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
-// RUN:   --hip-device-lib-path=%S/Inputs/rocm-device-libs/amdgcn/bitcode \
+// RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode \
 // RUN:   %S/opencl.cl \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
 // Test environment variable HIP_DEVICE_LIB_PATH
-// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm-device-libs/amdgcn/bitcode %clang -### -target amdgcn-amd-amdhsa \
+// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm/amdgcn/bitcode %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   %S/opencl.cl \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
Index: clang/test/Driver/rocm-detect.hip
===================================================================
--- clang/test/Driver/rocm-detect.hip
+++ clang/test/Driver/rocm-detect.hip
@@ -8,17 +8,17 @@
 // target not included in the test.
 
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 // Should not interpret -nostdlib as disabling offload libraries.
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 -nostdlib \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 -nogpulib \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902,NODEFAULTLIBS %s
 
 
Index: clang/test/Driver/rocm-detect.cl
===================================================================
--- clang/test/Driver/rocm-detect.cl
+++ clang/test/Driver/rocm-detect.cl
@@ -7,12 +7,12 @@
 // target not included in the test.
 
 // RUN: %clang -### -v -target amdgcn-amd-amdhsa -mcpu=gfx902 \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 
 // RUN: %clang -### -v -target amdgcn-amd-amdhsa -mcpu=gfx902 -nogpulib \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902,NODEFAULTLIBS %s
 
 
Index: clang/test/Driver/hip-include-path.hip
===================================================================
--- /dev/null
+++ clang/test/Driver/hip-include-path.hip
@@ -0,0 +1,31 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
+// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpulib %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=COMMON,WRAP,HIP %s
+
+// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
+// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nobuiltininc -nogpulib %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=COMMON,NOWRAP,HIP %s
+
+// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
+// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpuinc -nogpulib %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=COMMON,WRAP,NOHIP %s
+
+// COMMON-LABEL: clang{{.*}} -cc1 -triple amdgcn-amd-amdhsa
+// WRAP: clang/{{.*}}/include/cuda_wrappers
+// NOWRAP-NOT: clang/{{.*}}/include/cuda_wrappers
+// HIP: {{.*}}Inputs/rocm/include
+// NOHIP-NOT: {{.*}}Inputs/rocm/include
+// COMMON: {{.*}}include/c++
+// COMMON: clang/{{.*}}/include
+
+// COMMON-LABEL: clang{{.*}} -cc1 -triple x86_64-unknown-linux-gnu
+// WRAP: clang/{{.*}}/include/cuda_wrappers
+// NOWRAP-NOT: clang/{{.*}}/include/cuda_wrappers
+// HIP: {{.*}}Inputs/rocm/include
+// NOHIP-NOT: {{.*}}Inputs/rocm/include
+// COMMON: {{.*}}include/c++
+// COMMON: clang/{{.*}}/include
Index: clang/test/Driver/hip-device-libs.hip
===================================================================
--- clang/test/Driver/hip-device-libs.hip
+++ clang/test/Driver/hip-device-libs.hip
@@ -8,7 +8,7 @@
 // Test subtarget with flushing on by default.
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:  --cuda-gpu-arch=gfx803 \
-// RUN:  --rocm-path=%S/Inputs/rocm-device-libs   \
+// RUN:  --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -16,7 +16,7 @@
 // Test subtarget with flushing off by ddefault.
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:  --cuda-gpu-arch=gfx900 \
-// RUN:  --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:  --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -25,7 +25,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -34,7 +34,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -43,7 +43,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -52,7 +52,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -61,7 +61,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fcuda-flush-denormals-to-zero -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -69,7 +69,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fcuda-flush-denormals-to-zero -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs   \
+// RUN:   --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -77,7 +77,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fno-cuda-flush-denormals-to-zero -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs   \
+// RUN:   --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -85,7 +85,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fno-cuda-flush-denormals-to-zero -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
+// RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -93,13 +93,13 @@
 // Test --hip-device-lib-path flag
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
-// RUN:   --hip-device-lib-path=%S/Inputs/rocm-device-libs/amdgcn/bitcode   \
+// RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
 
 // Test environment variable HIP_DEVICE_LIB_PATH
-// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm-device-libs/amdgcn/bitcode \
+// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm/amdgcn/bitcode \
 // RUN:   %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
Index: clang/lib/Headers/__clang_hip_runtime_wrapper.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -0,0 +1,64 @@
+/*===---- __clang_hip_runtime_wrapper.h - HIP runtime support ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ */
+
+#ifndef __CLANG_HIP_RUNTIME_WRAPPER_H__
+#define __CLANG_HIP_RUNTIME_WRAPPER_H__
+
+#if __HIP__
+
+#include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+#if __HIP_ENABLE_DEVICE_MALLOC__
+extern "C" __device__ void *__hip_malloc(size_t);
+extern "C" __device__ void *__hip_free(void *ptr);
+static inline __device__ void *malloc(size_t size) {
+  return __hip_malloc(size);
+}
+static inline __device__ void *free(void *ptr) { return __hip_free(ptr); }
+#else
+static inline __device__ void *malloc(size_t size) {
+  __builtin_trap();
+  return nullptr;
+}
+static inline __device__ void *free(void *ptr) {
+  __builtin_trap();
+  return nullptr;
+}
+#endif
+
+#include <__clang_hip_libdevice_declares.h>
+#include <__clang_hip_math.h>
+
+#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#include <__clang_cuda_complex_builtins.h>
+#include <__clang_cuda_math_forward_declares.h>
+
+#include <algorithm>
+#include <complex>
+#include <new>
+#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+
+#define __CLANG_HIP_RUNTIME_WRPPER_INCLUDED__ 1
+
+#endif // __HIP__
+#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
Index: clang/lib/Headers/__clang_hip_math.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/__clang_hip_math.h
@@ -0,0 +1,1097 @@
+/*===---- __clang_hip_math.h - HIP math decls -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_HIP_MATH_H__
+#define __CLANG_HIP_MATH_H__
+
+#include <algorithm>
+#include <limits.h>
+#include <limits>
+#include <stdint.h>
+
+#pragma push_macro("__DEVICE__")
+#pragma push_macro("__RETURN_TYPE")
+
+// to be consistent with __clang_cuda_math_forward_declares
+#define __DEVICE__ static __device__
+#define __RETURN_TYPE bool
+
+__DEVICE__
+inline uint64_t __make_mantissa_base8(const char *tagp) {
+  uint64_t r = 0;
+  while (tagp) {
+    char tmp = *tagp;
+
+    if (tmp >= '0' && tmp <= '7')
+      r = (r * 8u) + tmp - '0';
+    else
+      return 0;
+
+    ++tagp;
+  }
+
+  return r;
+}
+
+__DEVICE__
+inline uint64_t __make_mantissa_base10(const char *tagp) {
+  uint64_t r = 0;
+  while (tagp) {
+    char tmp = *tagp;
+
+    if (tmp >= '0' && tmp <= '9')
+      r = (r * 10u) + tmp - '0';
+    else
+      return 0;
+
+    ++tagp;
+  }
+
+  return r;
+}
+
+__DEVICE__
+inline uint64_t __make_mantissa_base16(const char *tagp) {
+  uint64_t r = 0;
+  while (tagp) {
+    char tmp = *tagp;
+
+    if (tmp >= '0' && tmp <= '9')
+      r = (r * 16u) + tmp - '0';
+    else if (tmp >= 'a' && tmp <= 'f')
+      r = (r * 16u) + tmp - 'a' + 10;
+    else if (tmp >= 'A' && tmp <= 'F')
+      r = (r * 16u) + tmp - 'A' + 10;
+    else
+      return 0;
+
+    ++tagp;
+  }
+
+  return r;
+}
+
+__DEVICE__
+inline uint64_t __make_mantissa(const char *tagp) {
+  if (!tagp)
+    return 0u;
+
+  if (*tagp == '0') {
+    ++tagp;
+
+    if (*tagp == 'x' || *tagp == 'X')
+      return __make_mantissa_base16(tagp);
+    else
+      return __make_mantissa_base8(tagp);
+  }
+
+  return __make_mantissa_base10(tagp);
+}
+
+// BEGIN FLOAT
+__DEVICE__
+inline float abs(float x) { return __ocml_fabs_f32(x); }
+__DEVICE__
+inline float acosf(float x) { return __ocml_acos_f32(x); }
+__DEVICE__
+inline float acoshf(float x) { return __ocml_acosh_f32(x); }
+__DEVICE__
+inline float asinf(float x) { return __ocml_asin_f32(x); }
+__DEVICE__
+inline float asinhf(float x) { return __ocml_asinh_f32(x); }
+__DEVICE__
+inline float atan2f(float x, float y) { return __ocml_atan2_f32(x, y); }
+__DEVICE__
+inline float atanf(float x) { return __ocml_atan_f32(x); }
+__DEVICE__
+inline float atanhf(float x) { return __ocml_atanh_f32(x); }
+__DEVICE__
+inline float cbrtf(float x) { return __ocml_cbrt_f32(x); }
+__DEVICE__
+inline float ceilf(float x) { return __ocml_ceil_f32(x); }
+__DEVICE__
+inline float copysignf(float x, float y) { return __ocml_copysign_f32(x, y); }
+__DEVICE__
+inline float cosf(float x) { return __ocml_cos_f32(x); }
+__DEVICE__
+inline float coshf(float x) { return __ocml_cosh_f32(x); }
+__DEVICE__
+inline float cospif(float x) { return __ocml_cospi_f32(x); }
+__DEVICE__
+inline float cyl_bessel_i0f(float x) { return __ocml_i0_f32(x); }
+__DEVICE__
+inline float cyl_bessel_i1f(float x) { return __ocml_i1_f32(x); }
+__DEVICE__
+inline float erfcf(float x) { return __ocml_erfc_f32(x); }
+__DEVICE__
+inline float erfcinvf(float x) { return __ocml_erfcinv_f32(x); }
+__DEVICE__
+inline float erfcxf(float x) { return __ocml_erfcx_f32(x); }
+__DEVICE__
+inline float erff(float x) { return __ocml_erf_f32(x); }
+__DEVICE__
+inline float erfinvf(float x) { return __ocml_erfinv_f32(x); }
+__DEVICE__
+inline float exp10f(float x) { return __ocml_exp10_f32(x); }
+__DEVICE__
+inline float exp2f(float x) { return __ocml_exp2_f32(x); }
+__DEVICE__
+inline float expf(float x) { return __ocml_exp_f32(x); }
+__DEVICE__
+inline float expm1f(float x) { return __ocml_expm1_f32(x); }
+__DEVICE__
+inline float fabsf(float x) { return __ocml_fabs_f32(x); }
+__DEVICE__
+inline float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); }
+__DEVICE__
+inline float fdividef(float x, float y) { return x / y; }
+__DEVICE__
+inline float floorf(float x) { return __ocml_floor_f32(x); }
+__DEVICE__
+inline float fmaf(float x, float y, float z) { return __ocml_fma_f32(x, y, z); }
+__DEVICE__
+inline float fmaxf(float x, float y) { return __ocml_fmax_f32(x, y); }
+__DEVICE__
+inline float fminf(float x, float y) { return __ocml_fmin_f32(x, y); }
+__DEVICE__
+inline float fmodf(float x, float y) { return __ocml_fmod_f32(x, y); }
+__DEVICE__
+inline float frexpf(float x, int *nptr) {
+  int tmp;
+  float r = __ocml_frexp_f32(x, (__attribute__((address_space(5))) int *)&tmp);
+  *nptr = tmp;
+
+  return r;
+}
+__DEVICE__
+inline float hypotf(float x, float y) { return __ocml_hypot_f32(x, y); }
+__DEVICE__
+inline int ilogbf(float x) { return __ocml_ilogb_f32(x); }
+__DEVICE__
+inline __RETURN_TYPE isfinite(float x) { return __ocml_isfinite_f32(x); }
+__DEVICE__
+inline __RETURN_TYPE isinf(float x) { return __ocml_isinf_f32(x); }
+__DEVICE__
+inline __RETURN_TYPE isnan(float x) { return __ocml_isnan_f32(x); }
+__DEVICE__
+inline float j0f(float x) { return __ocml_j0_f32(x); }
+__DEVICE__
+inline float j1f(float x) { return __ocml_j1_f32(x); }
+__DEVICE__
+inline float jnf(int n, float x) { // TODO: we could use Ahmes multiplication
+                                   // and the Miller & Brown algorithm
+  //       for linear recurrences to get O(log n) steps, but it's unclear if
+  //       it'd be beneficial in this case.
+  if (n == 0)
+    return j0f(x);
+  if (n == 1)
+    return j1f(x);
+
+  float x0 = j0f(x);
+  float x1 = j1f(x);
+  for (int i = 1; i < n; ++i) {
+    float x2 = (2 * i) / x * x1 - x0;
+    x0 = x1;
+    x1 = x2;
+  }
+
+  return x1;
+}
+__DEVICE__
+inline float ldexpf(float x, int e) { return __ocml_ldexp_f32(x, e); }
+__DEVICE__
+inline float lgammaf(float x) { return __ocml_lgamma_f32(x); }
+__DEVICE__
+inline long long int llrintf(float x) { return __ocml_rint_f32(x); }
+__DEVICE__
+inline long long int llroundf(float x) { return __ocml_round_f32(x); }
+__DEVICE__
+inline float log10f(float x) { return __ocml_log10_f32(x); }
+__DEVICE__
+inline float log1pf(float x) { return __ocml_log1p_f32(x); }
+__DEVICE__
+inline float log2f(float x) { return __ocml_log2_f32(x); }
+__DEVICE__
+inline float logbf(float x) { return __ocml_logb_f32(x); }
+__DEVICE__
+inline float logf(float x) { return __ocml_log_f32(x); }
+__DEVICE__
+inline long int lrintf(float x) { return __ocml_rint_f32(x); }
+__DEVICE__
+inline long int lroundf(float x) { return __ocml_round_f32(x); }
+__DEVICE__
+inline float modff(float x, float *iptr) {
+  float tmp;
+  float r = __ocml_modf_f32(x, (__attribute__((address_space(5))) float *)&tmp);
+  *iptr = tmp;
+
+  return r;
+}
+__DEVICE__
+inline float nanf(const char *tagp) {
+  union {
+    float val;
+    struct ieee_float {
+      uint32_t mantissa : 22;
+      uint32_t quiet : 1;
+      uint32_t exponent : 8;
+      uint32_t sign : 1;
+    } bits;
+
+    static_assert(sizeof(float) == sizeof(ieee_float), "");
+  } tmp;
+
+  tmp.bits.sign = 0u;
+  tmp.bits.exponent = ~0u;
+  tmp.bits.quiet = 1u;
+  tmp.bits.mantissa = __make_mantissa(tagp);
+
+  return tmp.val;
+}
+__DEVICE__
+inline float nearbyintf(float x) { return __ocml_nearbyint_f32(x); }
+__DEVICE__
+inline float nextafterf(float x, float y) { return __ocml_nextafter_f32(x, y); }
+__DEVICE__
+inline float norm3df(float x, float y, float z) {
+  return __ocml_len3_f32(x, y, z);
+}
+__DEVICE__
+inline float norm4df(float x, float y, float z, float w) {
+  return __ocml_len4_f32(x, y, z, w);
+}
+__DEVICE__
+inline float normcdff(float x) { return __ocml_ncdf_f32(x); }
+__DEVICE__
+inline float normcdfinvf(float x) { return __ocml_ncdfinv_f32(x); }
+__DEVICE__
+inline float
+normf(int dim, const float *a) { // TODO: placeholder until OCML adds support.
+  float r = 0;
+  while (dim--) {
+    r += a[0] * a[0];
+    ++a;
+  }
+
+  return __ocml_sqrt_f32(r);
+}
+__DEVICE__
+inline float powf(float x, float y) { return __ocml_pow_f32(x, y); }
+__DEVICE__
+inline float rcbrtf(float x) { return __ocml_rcbrt_f32(x); }
+__DEVICE__
+inline float remainderf(float x, float y) { return __ocml_remainder_f32(x, y); }
+__DEVICE__
+inline float remquof(float x, float y, int *quo) {
+  int tmp;
+  float r =
+      __ocml_remquo_f32(x, y, (__attribute__((address_space(5))) int *)&tmp);
+  *quo = tmp;
+
+  return r;
+}
+__DEVICE__
+inline float rhypotf(float x, float y) { return __ocml_rhypot_f32(x, y); }
+__DEVICE__
+inline float rintf(float x) { return __ocml_rint_f32(x); }
+__DEVICE__
+inline float rnorm3df(float x, float y, float z) {
+  return __ocml_rlen3_f32(x, y, z);
+}
+
+__DEVICE__
+inline float rnorm4df(float x, float y, float z, float w) {
+  return __ocml_rlen4_f32(x, y, z, w);
+}
+__DEVICE__
+inline float
+rnormf(int dim, const float *a) { // TODO: placeholder until OCML adds support.
+  float r = 0;
+  while (dim--) {
+    r += a[0] * a[0];
+    ++a;
+  }
+
+  return __ocml_rsqrt_f32(r);
+}
+__DEVICE__
+inline float roundf(float x) { return __ocml_round_f32(x); }
+__DEVICE__
+inline float rsqrtf(float x) { return __ocml_rsqrt_f32(x); }
+__DEVICE__
+inline float scalblnf(float x, long int n) {
+  return (n < INT_MAX) ? __ocml_scalbn_f32(x, n) : __ocml_scalb_f32(x, n);
+}
+__DEVICE__
+inline float scalbnf(float x, int n) { return __ocml_scalbn_f32(x, n); }
+__DEVICE__
+inline __RETURN_TYPE signbit(float x) { return __ocml_signbit_f32(x); }
+__DEVICE__
+inline void sincosf(float x, float *sptr, float *cptr) {
+  float tmp;
+
+  *sptr = __ocml_sincos_f32(x, (__attribute__((address_space(5))) float *)&tmp);
+  *cptr = tmp;
+}
+__DEVICE__
+inline void sincospif(float x, float *sptr, float *cptr) {
+  float tmp;
+
+  *sptr =
+      __ocml_sincospi_f32(x, (__attribute__((address_space(5))) float *)&tmp);
+  *cptr = tmp;
+}
+__DEVICE__
+inline float sinf(float x) { return __ocml_sin_f32(x); }
+__DEVICE__
+inline float sinhf(float x) { return __ocml_sinh_f32(x); }
+__DEVICE__
+inline float sinpif(float x) { return __ocml_sinpi_f32(x); }
+__DEVICE__
+inline float sqrtf(float x) { return __ocml_sqrt_f32(x); }
+__DEVICE__
+inline float tanf(float x) { return __ocml_tan_f32(x); }
+__DEVICE__
+inline float tanhf(float x) { return __ocml_tanh_f32(x); }
+__DEVICE__
+inline float tgammaf(float x) { return __ocml_tgamma_f32(x); }
+__DEVICE__
+inline float truncf(float x) { return __ocml_trunc_f32(x); }
+__DEVICE__
+inline float y0f(float x) { return __ocml_y0_f32(x); }
+__DEVICE__
+inline float y1f(float x) { return __ocml_y1_f32(x); }
+__DEVICE__
+inline float ynf(int n, float x) { // TODO: we could use Ahmes multiplication
+                                   // and the Miller & Brown algorithm
+  //       for linear recurrences to get O(log n) steps, but it's unclear if
+  //       it'd be beneficial in this case. Placeholder until OCML adds
+  //       support.
+  if (n == 0)
+    return y0f(x);
+  if (n == 1)
+    return y1f(x);
+
+  float x0 = y0f(x);
+  float x1 = y1f(x);
+  for (int i = 1; i < n; ++i) {
+    float x2 = (2 * i) / x * x1 - x0;
+    x0 = x1;
+    x1 = x2;
+  }
+
+  return x1;
+}
+
+// BEGIN INTRINSICS
+__DEVICE__
+inline float __cosf(float x) { return __ocml_native_cos_f32(x); }
+__DEVICE__
+inline float __exp10f(float x) { return __ocml_native_exp10_f32(x); }
+__DEVICE__
+inline float __expf(float x) { return __ocml_native_exp_f32(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fadd_rd(float x, float y) { return __ocml_add_rtn_f32(x, y); }
+#endif
+__DEVICE__
+inline float __fadd_rn(float x, float y) { return x + y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fadd_ru(float x, float y) { return __ocml_add_rtp_f32(x, y); }
+__DEVICE__
+inline float __fadd_rz(float x, float y) { return __ocml_add_rtz_f32(x, y); }
+__DEVICE__
+inline float __fdiv_rd(float x, float y) { return __ocml_div_rtn_f32(x, y); }
+#endif
+__DEVICE__
+inline float __fdiv_rn(float x, float y) { return x / y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fdiv_ru(float x, float y) { return __ocml_div_rtp_f32(x, y); }
+__DEVICE__
+inline float __fdiv_rz(float x, float y) { return __ocml_div_rtz_f32(x, y); }
+#endif
+__DEVICE__
+inline float __fdividef(float x, float y) { return x / y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fmaf_rd(float x, float y, float z) {
+  return __ocml_fma_rtn_f32(x, y, z);
+}
+#endif
+__DEVICE__
+inline float __fmaf_rn(float x, float y, float z) {
+  return __ocml_fma_f32(x, y, z);
+}
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fmaf_ru(float x, float y, float z) {
+  return __ocml_fma_rtp_f32(x, y, z);
+}
+__DEVICE__
+inline float __fmaf_rz(float x, float y, float z) {
+  return __ocml_fma_rtz_f32(x, y, z);
+}
+__DEVICE__
+inline float __fmul_rd(float x, float y) { return __ocml_mul_rtn_f32(x, y); }
+#endif
+__DEVICE__
+inline float __fmul_rn(float x, float y) { return x * y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fmul_ru(float x, float y) { return __ocml_mul_rtp_f32(x, y); }
+__DEVICE__
+inline float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
+__DEVICE__
+inline float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); }
+#endif
+__DEVICE__
+inline float __frcp_rn(float x) { return __llvm_amdgcn_rcp_f32(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __frcp_ru(float x) { return __llvm_amdgcn_rcp_f32(x); }
+__DEVICE__
+inline float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); }
+#endif
+__DEVICE__
+inline float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fsqrt_rd(float x) { return __ocml_sqrt_rtn_f32(x); }
+#endif
+__DEVICE__
+inline float __fsqrt_rn(float x) { return __ocml_native_sqrt_f32(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fsqrt_ru(float x) { return __ocml_sqrt_rtp_f32(x); }
+__DEVICE__
+inline float __fsqrt_rz(float x) { return __ocml_sqrt_rtz_f32(x); }
+__DEVICE__
+inline float __fsub_rd(float x, float y) { return __ocml_sub_rtn_f32(x, y); }
+#endif
+__DEVICE__
+inline float __fsub_rn(float x, float y) { return x - y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline float __fsub_ru(float x, float y) { return __ocml_sub_rtp_f32(x, y); }
+__DEVICE__
+inline float __fsub_rz(float x, float y) { return __ocml_sub_rtz_f32(x, y); }
+#endif
+__DEVICE__
+inline float __log10f(float x) { return __ocml_native_log10_f32(x); }
+__DEVICE__
+inline float __log2f(float x) { return __ocml_native_log2_f32(x); }
+__DEVICE__
+inline float __logf(float x) { return __ocml_native_log_f32(x); }
+__DEVICE__
+inline float __powf(float x, float y) { return __ocml_pow_f32(x, y); }
+__DEVICE__
+inline float __saturatef(float x) { return (x < 0) ? 0 : ((x > 1) ? 1 : x); }
+__DEVICE__
+inline void __sincosf(float x, float *sptr, float *cptr) {
+  *sptr = __ocml_native_sin_f32(x);
+  *cptr = __ocml_native_cos_f32(x);
+}
+__DEVICE__
+inline float __sinf(float x) { return __ocml_native_sin_f32(x); }
+__DEVICE__
+inline float __tanf(float x) { return __ocml_tan_f32(x); }
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__DEVICE__
+inline double abs(double x) { return __ocml_fabs_f64(x); }
+__DEVICE__
+inline double acos(double x) { return __ocml_acos_f64(x); }
+__DEVICE__
+inline double acosh(double x) { return __ocml_acosh_f64(x); }
+__DEVICE__
+inline double asin(double x) { return __ocml_asin_f64(x); }
+__DEVICE__
+inline double asinh(double x) { return __ocml_asinh_f64(x); }
+__DEVICE__
+inline double atan(double x) { return __ocml_atan_f64(x); }
+__DEVICE__
+inline double atan2(double x, double y) { return __ocml_atan2_f64(x, y); }
+__DEVICE__
+inline double atanh(double x) { return __ocml_atanh_f64(x); }
+__DEVICE__
+inline double cbrt(double x) { return __ocml_cbrt_f64(x); }
+__DEVICE__
+inline double ceil(double x) { return __ocml_ceil_f64(x); }
+__DEVICE__
+inline double copysign(double x, double y) { return __ocml_copysign_f64(x, y); }
+__DEVICE__
+inline double cos(double x) { return __ocml_cos_f64(x); }
+__DEVICE__
+inline double cosh(double x) { return __ocml_cosh_f64(x); }
+__DEVICE__
+inline double cospi(double x) { return __ocml_cospi_f64(x); }
+__DEVICE__
+inline double cyl_bessel_i0(double x) { return __ocml_i0_f64(x); }
+__DEVICE__
+inline double cyl_bessel_i1(double x) { return __ocml_i1_f64(x); }
+__DEVICE__
+inline double erf(double x) { return __ocml_erf_f64(x); }
+__DEVICE__
+inline double erfc(double x) { return __ocml_erfc_f64(x); }
+__DEVICE__
+inline double erfcinv(double x) { return __ocml_erfcinv_f64(x); }
+__DEVICE__
+inline double erfcx(double x) { return __ocml_erfcx_f64(x); }
+__DEVICE__
+inline double erfinv(double x) { return __ocml_erfinv_f64(x); }
+__DEVICE__
+inline double exp(double x) { return __ocml_exp_f64(x); }
+__DEVICE__
+inline double exp10(double x) { return __ocml_exp10_f64(x); }
+__DEVICE__
+inline double exp2(double x) { return __ocml_exp2_f64(x); }
+__DEVICE__
+inline double expm1(double x) { return __ocml_expm1_f64(x); }
+__DEVICE__
+inline double fabs(double x) { return __ocml_fabs_f64(x); }
+__DEVICE__
+inline double fdim(double x, double y) { return __ocml_fdim_f64(x, y); }
+__DEVICE__
+inline double floor(double x) { return __ocml_floor_f64(x); }
+__DEVICE__
+inline double fma(double x, double y, double z) {
+  return __ocml_fma_f64(x, y, z);
+}
+__DEVICE__
+inline double fmax(double x, double y) { return __ocml_fmax_f64(x, y); }
+__DEVICE__
+inline double fmin(double x, double y) { return __ocml_fmin_f64(x, y); }
+__DEVICE__
+inline double fmod(double x, double y) { return __ocml_fmod_f64(x, y); }
+__DEVICE__
+inline double frexp(double x, int *nptr) {
+  int tmp;
+  double r = __ocml_frexp_f64(x, (__attribute__((address_space(5))) int *)&tmp);
+  *nptr = tmp;
+
+  return r;
+}
+__DEVICE__
+inline double hypot(double x, double y) { return __ocml_hypot_f64(x, y); }
+__DEVICE__
+inline int ilogb(double x) { return __ocml_ilogb_f64(x); }
+__DEVICE__
+inline __RETURN_TYPE isfinite(double x) { return __ocml_isfinite_f64(x); }
+__DEVICE__
+inline __RETURN_TYPE isinf(double x) { return __ocml_isinf_f64(x); }
+__DEVICE__
+inline __RETURN_TYPE isnan(double x) { return __ocml_isnan_f64(x); }
+__DEVICE__
+inline double j0(double x) { return __ocml_j0_f64(x); }
+__DEVICE__
+inline double j1(double x) { return __ocml_j1_f64(x); }
+__DEVICE__
+inline double jn(int n, double x) { // TODO: we could use Ahmes multiplication
+                                    // and the Miller & Brown algorithm
+  //       for linear recurrences to get O(log n) steps, but it's unclear if
+  //       it'd be beneficial in this case. Placeholder until OCML adds
+  //       support.
+  if (n == 0)
+    return j0f(x);
+  if (n == 1)
+    return j1f(x);
+
+  double x0 = j0f(x);
+  double x1 = j1f(x);
+  for (int i = 1; i < n; ++i) {
+    double x2 = (2 * i) / x * x1 - x0;
+    x0 = x1;
+    x1 = x2;
+  }
+
+  return x1;
+}
+__DEVICE__
+inline double ldexp(double x, int e) { return __ocml_ldexp_f64(x, e); }
+__DEVICE__
+inline double lgamma(double x) { return __ocml_lgamma_f64(x); }
+__DEVICE__
+inline long long int llrint(double x) { return __ocml_rint_f64(x); }
+__DEVICE__
+inline long long int llround(double x) { return __ocml_round_f64(x); }
+__DEVICE__
+inline double log(double x) { return __ocml_log_f64(x); }
+__DEVICE__
+inline double log10(double x) { return __ocml_log10_f64(x); }
+__DEVICE__
+inline double log1p(double x) { return __ocml_log1p_f64(x); }
+__DEVICE__
+inline double log2(double x) { return __ocml_log2_f64(x); }
+__DEVICE__
+inline double logb(double x) { return __ocml_logb_f64(x); }
+__DEVICE__
+inline long int lrint(double x) { return __ocml_rint_f64(x); }
+__DEVICE__
+inline long int lround(double x) { return __ocml_round_f64(x); }
+__DEVICE__
+inline double modf(double x, double *iptr) {
+  double tmp;
+  double r =
+      __ocml_modf_f64(x, (__attribute__((address_space(5))) double *)&tmp);
+  *iptr = tmp;
+
+  return r;
+}
+__DEVICE__
+inline double nan(const char *tagp) {
+#if !_WIN32
+  union {
+    double val;
+    struct ieee_double {
+      uint64_t mantissa : 51;
+      uint32_t quiet : 1;
+      uint32_t exponent : 11;
+      uint32_t sign : 1;
+    } bits;
+    static_assert(sizeof(double) == sizeof(ieee_double), "");
+  } tmp;
+
+  tmp.bits.sign = 0u;
+  tmp.bits.exponent = ~0u;
+  tmp.bits.quiet = 1u;
+  tmp.bits.mantissa = __make_mantissa(tagp);
+
+  return tmp.val;
+#else
+  static_assert(sizeof(uint64_t) == sizeof(double));
+  uint64_t val = __make_mantissa(tagp);
+  val |= 0xFFF << 51;
+  return *reinterpret_cast<double *>(&val);
+#endif
+}
+__DEVICE__
+inline double nearbyint(double x) { return __ocml_nearbyint_f64(x); }
+__DEVICE__
+inline double nextafter(double x, double y) {
+  return __ocml_nextafter_f64(x, y);
+}
+__DEVICE__
+inline double
+norm(int dim, const double *a) { // TODO: placeholder until OCML adds support.
+  double r = 0;
+  while (dim--) {
+    r += a[0] * a[0];
+    ++a;
+  }
+
+  return __ocml_sqrt_f64(r);
+}
+__DEVICE__
+inline double norm3d(double x, double y, double z) {
+  return __ocml_len3_f64(x, y, z);
+}
+__DEVICE__
+inline double norm4d(double x, double y, double z, double w) {
+  return __ocml_len4_f64(x, y, z, w);
+}
+__DEVICE__
+inline double normcdf(double x) { return __ocml_ncdf_f64(x); }
+__DEVICE__
+inline double normcdfinv(double x) { return __ocml_ncdfinv_f64(x); }
+__DEVICE__
+inline double pow(double x, double y) { return __ocml_pow_f64(x, y); }
+__DEVICE__
+inline double rcbrt(double x) { return __ocml_rcbrt_f64(x); }
+__DEVICE__
+inline double remainder(double x, double y) {
+  return __ocml_remainder_f64(x, y);
+}
+__DEVICE__
+inline double remquo(double x, double y, int *quo) {
+  int tmp;
+  double r =
+      __ocml_remquo_f64(x, y, (__attribute__((address_space(5))) int *)&tmp);
+  *quo = tmp;
+
+  return r;
+}
+__DEVICE__
+inline double rhypot(double x, double y) { return __ocml_rhypot_f64(x, y); }
+__DEVICE__
+inline double rint(double x) { return __ocml_rint_f64(x); }
+__DEVICE__
+inline double
+rnorm(int dim, const double *a) { // TODO: placeholder until OCML adds support.
+  double r = 0;
+  while (dim--) {
+    r += a[0] * a[0];
+    ++a;
+  }
+
+  return __ocml_rsqrt_f64(r);
+}
+__DEVICE__
+inline double rnorm3d(double x, double y, double z) {
+  return __ocml_rlen3_f64(x, y, z);
+}
+__DEVICE__
+inline double rnorm4d(double x, double y, double z, double w) {
+  return __ocml_rlen4_f64(x, y, z, w);
+}
+__DEVICE__
+inline double round(double x) { return __ocml_round_f64(x); }
+__DEVICE__
+inline double rsqrt(double x) { return __ocml_rsqrt_f64(x); }
+__DEVICE__
+inline double scalbln(double x, long int n) {
+  return (n < INT_MAX) ? __ocml_scalbn_f64(x, n) : __ocml_scalb_f64(x, n);
+}
+__DEVICE__
+inline double scalbn(double x, int n) { return __ocml_scalbn_f64(x, n); }
+__DEVICE__
+inline __RETURN_TYPE signbit(double x) { return __ocml_signbit_f64(x); }
+__DEVICE__
+inline double sin(double x) { return __ocml_sin_f64(x); }
+__DEVICE__
+inline void sincos(double x, double *sptr, double *cptr) {
+  double tmp;
+  *sptr =
+      __ocml_sincos_f64(x, (__attribute__((address_space(5))) double *)&tmp);
+  *cptr = tmp;
+}
+__DEVICE__
+inline void sincospi(double x, double *sptr, double *cptr) {
+  double tmp;
+  *sptr =
+      __ocml_sincospi_f64(x, (__attribute__((address_space(5))) double *)&tmp);
+  *cptr = tmp;
+}
+__DEVICE__
+inline double sinh(double x) { return __ocml_sinh_f64(x); }
+__DEVICE__
+inline double sinpi(double x) { return __ocml_sinpi_f64(x); }
+__DEVICE__
+inline double sqrt(double x) { return __ocml_sqrt_f64(x); }
+__DEVICE__
+inline double tan(double x) { return __ocml_tan_f64(x); }
+__DEVICE__
+inline double tanh(double x) { return __ocml_tanh_f64(x); }
+__DEVICE__
+inline double tgamma(double x) { return __ocml_tgamma_f64(x); }
+__DEVICE__
+inline double trunc(double x) { return __ocml_trunc_f64(x); }
+__DEVICE__
+inline double y0(double x) { return __ocml_y0_f64(x); }
+__DEVICE__
+inline double y1(double x) { return __ocml_y1_f64(x); }
+__DEVICE__
+inline double yn(int n, double x) { // TODO: we could use Ahmes multiplication
+                                    // and the Miller & Brown algorithm
+  //       for linear recurrences to get O(log n) steps, but it's unclear if
+  //       it'd be beneficial in this case. Placeholder until OCML adds
+  //       support.
+  if (n == 0)
+    return j0f(x);
+  if (n == 1)
+    return j1f(x);
+
+  double x0 = j0f(x);
+  double x1 = j1f(x);
+  for (int i = 1; i < n; ++i) {
+    double x2 = (2 * i) / x * x1 - x0;
+    x0 = x1;
+    x1 = x2;
+  }
+
+  return x1;
+}
+
+// BEGIN INTRINSICS
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dadd_rd(double x, double y) { return __ocml_add_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline double __dadd_rn(double x, double y) { return x + y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dadd_ru(double x, double y) { return __ocml_add_rtp_f64(x, y); }
+__DEVICE__
+inline double __dadd_rz(double x, double y) { return __ocml_add_rtz_f64(x, y); }
+__DEVICE__
+inline double __ddiv_rd(double x, double y) { return __ocml_div_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline double __ddiv_rn(double x, double y) { return x / y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __ddiv_ru(double x, double y) { return __ocml_div_rtp_f64(x, y); }
+__DEVICE__
+inline double __ddiv_rz(double x, double y) { return __ocml_div_rtz_f64(x, y); }
+__DEVICE__
+inline double __dmul_rd(double x, double y) { return __ocml_mul_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline double __dmul_rn(double x, double y) { return x * y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dmul_ru(double x, double y) { return __ocml_mul_rtp_f64(x, y); }
+__DEVICE__
+inline double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
+__DEVICE__
+inline double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
+#endif
+__DEVICE__
+inline double __drcp_rn(double x) { return __llvm_amdgcn_rcp_f64(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__DEVICE__
+inline double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__DEVICE__
+inline double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
+#endif
+__DEVICE__
+inline double __dsqrt_rn(double x) { return __ocml_sqrt_f64(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dsqrt_ru(double x) { return __ocml_sqrt_rtp_f64(x); }
+__DEVICE__
+inline double __dsqrt_rz(double x) { return __ocml_sqrt_rtz_f64(x); }
+__DEVICE__
+inline double __dsub_rd(double x, double y) { return __ocml_sub_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline double __dsub_rn(double x, double y) { return x - y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __dsub_ru(double x, double y) { return __ocml_sub_rtp_f64(x, y); }
+__DEVICE__
+inline double __dsub_rz(double x, double y) { return __ocml_sub_rtz_f64(x, y); }
+__DEVICE__
+inline double __fma_rd(double x, double y, double z) {
+  return __ocml_fma_rtn_f64(x, y, z);
+}
+#endif
+__DEVICE__
+inline double __fma_rn(double x, double y, double z) {
+  return __ocml_fma_f64(x, y, z);
+}
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline double __fma_ru(double x, double y, double z) {
+  return __ocml_fma_rtp_f64(x, y, z);
+}
+__DEVICE__
+inline double __fma_rz(double x, double y, double z) {
+  return __ocml_fma_rtz_f64(x, y, z);
+}
+#endif
+// END INTRINSICS
+// END DOUBLE
+
+// BEGIN INTEGER
+__DEVICE__
+inline int abs(int x) {
+  int sgn = x >> (sizeof(int) * CHAR_BIT - 1);
+  return (x ^ sgn) - sgn;
+}
+__DEVICE__
+inline long labs(long x) {
+  long sgn = x >> (sizeof(long) * CHAR_BIT - 1);
+  return (x ^ sgn) - sgn;
+}
+__DEVICE__
+inline long long llabs(long long x) {
+  long long sgn = x >> (sizeof(long long) * CHAR_BIT - 1);
+  return (x ^ sgn) - sgn;
+}
+
+#if defined(__cplusplus)
+__DEVICE__
+inline long abs(long x) { return labs(x); }
+__DEVICE__
+inline long long abs(long long x) { return llabs(x); }
+#endif
+// END INTEGER
+
+__DEVICE__
+inline _Float16 fma(_Float16 x, _Float16 y, _Float16 z) {
+  return __ocml_fma_f16(x, y, z);
+}
+
+__DEVICE__
+inline float fma(float x, float y, float z) { return fmaf(x, y, z); }
+
+#pragma push_macro("__DEF_FLOAT_FUN")
+#pragma push_macro("__DEF_FLOAT_FUN2")
+#pragma push_macro("__DEF_FLOAT_FUN2I")
+#pragma push_macro("__HIP_OVERLOAD")
+#pragma push_macro("__HIP_OVERLOAD2")
+
+// __hip_enable_if::type is a type function which returns __T if __B is true.
+template <bool __B, class __T = void> struct __hip_enable_if {};
+
+template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
+
+// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to
+// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with
+// floor(double).
+#define __HIP_OVERLOAD1(__retty, __fn)                                         \
+  template <typename __T>                                                      \
+  __DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer,    \
+                                      __retty>::type                           \
+  __fn(__T __x) {                                                              \
+    return ::__fn((double)__x);                                                \
+  }
+
+// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
+// or integer argument to avoid compilation error due to ambibuity. e.g.
+// max(5.0f, 6.0) is resolved with max(double, double).
+#define __HIP_OVERLOAD2(__retty, __fn)                                         \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__                                                                   \
+      typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&    \
+                                   std::numeric_limits<__T2>::is_specialized,  \
+                               __retty>::type                                  \
+      __fn(__T1 __x, __T2 __y) {                                               \
+    return __fn((double)__x, (double)__y);                                     \
+  }
+
+// Define cmath functions with float argument and returns float.
+#define __DEF_FUN1(retty, func)                                                \
+  __DEVICE__                                                                   \
+  inline float func(float x) { return func##f(x); }                            \
+  __HIP_OVERLOAD1(retty, func)
+
+// Define cmath functions with float argument and returns retty.
+#define __DEF_FUNI(retty, func)                                                \
+  __DEVICE__                                                                   \
+  inline retty func(float x) { return func##f(x); }                            \
+  __HIP_OVERLOAD1(retty, func)
+
+// define cmath functions with two float arguments.
+#define __DEF_FUN2(retty, func)                                                \
+  __DEVICE__                                                                   \
+  inline float func(float x, float y) { return func##f(x, y); }                \
+  __HIP_OVERLOAD2(retty, func)
+
+__DEF_FUN1(double, acos)
+__DEF_FUN1(double, acosh)
+__DEF_FUN1(double, asin)
+__DEF_FUN1(double, asinh)
+__DEF_FUN1(double, atan)
+__DEF_FUN2(double, atan2);
+__DEF_FUN1(double, atanh)
+__DEF_FUN1(double, cbrt)
+__DEF_FUN1(double, ceil)
+__DEF_FUN2(double, copysign);
+__DEF_FUN1(double, cos)
+__DEF_FUN1(double, cosh)
+__DEF_FUN1(double, erf)
+__DEF_FUN1(double, erfc)
+__DEF_FUN1(double, exp)
+__DEF_FUN1(double, exp2)
+__DEF_FUN1(double, expm1)
+__DEF_FUN1(double, fabs)
+__DEF_FUN2(double, fdim);
+__DEF_FUN1(double, floor)
+__DEF_FUN2(double, fmax);
+__DEF_FUN2(double, fmin);
+__DEF_FUN2(double, fmod);
+//__HIP_OVERLOAD1(int, fpclassify)
+__DEF_FUN2(double, hypot);
+__DEF_FUNI(int, ilogb)
+__HIP_OVERLOAD1(bool, isfinite)
+__HIP_OVERLOAD2(bool, isgreater);
+__HIP_OVERLOAD2(bool, isgreaterequal);
+__HIP_OVERLOAD1(bool, isinf);
+__HIP_OVERLOAD2(bool, isless);
+__HIP_OVERLOAD2(bool, islessequal);
+__HIP_OVERLOAD2(bool, islessgreater);
+__HIP_OVERLOAD1(bool, isnan);
+//__HIP_OVERLOAD1(bool, isnormal)
+__HIP_OVERLOAD2(bool, isunordered);
+__DEF_FUN1(double, lgamma)
+__DEF_FUN1(double, log)
+__DEF_FUN1(double, log10)
+__DEF_FUN1(double, log1p)
+__DEF_FUN1(double, log2)
+__DEF_FUN1(double, logb)
+__DEF_FUNI(long long, llrint)
+__DEF_FUNI(long long, llround)
+__DEF_FUNI(long, lrint)
+__DEF_FUNI(long, lround)
+__DEF_FUN1(double, nearbyint);
+__DEF_FUN2(double, nextafter);
+__DEF_FUN2(double, pow);
+__DEF_FUN2(double, remainder);
+__DEF_FUN1(double, rint);
+__DEF_FUN1(double, round);
+__HIP_OVERLOAD1(bool, signbit)
+__DEF_FUN1(double, sin)
+__DEF_FUN1(double, sinh)
+__DEF_FUN1(double, sqrt)
+__DEF_FUN1(double, tan)
+__DEF_FUN1(double, tanh)
+__DEF_FUN1(double, tgamma)
+__DEF_FUN1(double, trunc);
+
+// define cmath functions with a float and an integer argument.
+#define __DEF_FLOAT_FUN2I(func)                                                \
+  __DEVICE__                                                                   \
+  inline float func(float x, int y) { return func##f(x, y); }
+__DEF_FLOAT_FUN2I(scalbn)
+
+template <class T> __DEVICE__ inline T min(T arg1, T arg2) {
+  return (arg1 < arg2) ? arg1 : arg2;
+}
+
+template <class T> __DEVICE__ inline T max(T arg1, T arg2) {
+  return (arg1 > arg2) ? arg1 : arg2;
+}
+
+__DEVICE__ inline int min(int arg1, int arg2) {
+  return (arg1 < arg2) ? arg1 : arg2;
+}
+__DEVICE__ inline int max(int arg1, int arg2) {
+  return (arg1 > arg2) ? arg1 : arg2;
+}
+
+__DEVICE__
+inline float max(float x, float y) { return fmaxf(x, y); }
+
+__DEVICE__
+inline double max(double x, double y) { return fmax(x, y); }
+
+__DEVICE__
+inline float min(float x, float y) { return fminf(x, y); }
+
+__DEVICE__
+inline double min(double x, double y) { return fmin(x, y); }
+
+__HIP_OVERLOAD2(double, max)
+__HIP_OVERLOAD2(double, min)
+
+__host__ inline static int min(int arg1, int arg2) {
+  return std::min(arg1, arg2);
+}
+
+__host__ inline static int max(int arg1, int arg2) {
+  return std::max(arg1, arg2);
+}
+
+#pragma pop_macro("__DEF_FLOAT_FUN")
+#pragma pop_macro("__DEF_FLOAT_FUN2")
+#pragma pop_macro("__DEF_FLOAT_FUN2I")
+#pragma pop_macro("__HIP_OVERLOAD")
+#pragma pop_macro("__HIP_OVERLOAD2")
+#pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__RETURN_TYPE")
+
+#endif // __CLANG_HIP_MATH_H__
Index: clang/lib/Headers/__clang_hip_libdevice_declares.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/__clang_hip_libdevice_declares.h
@@ -0,0 +1,326 @@
+/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
+#define __CLANG_HIP_LIBDEVICE_DECLARES_H__
+
+extern "C" {
+
+// BEGIN FLOAT
+__device__ __attribute__((const)) float __ocml_acos_f32(float);
+__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
+__device__ __attribute__((const)) float __ocml_asin_f32(float);
+__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
+__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
+__device__ __attribute__((const)) float __ocml_atan_f32(float);
+__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
+__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_ceil_f32(float);
+__device__ __attribute__((const)) __device__ float __ocml_copysign_f32(float,
+                                                                       float);
+__device__ float __ocml_cos_f32(float);
+__device__ float __ocml_native_cos_f32(float);
+__device__ __attribute__((pure)) __device__ float __ocml_cosh_f32(float);
+__device__ float __ocml_cospi_f32(float);
+__device__ float __ocml_i0_f32(float);
+__device__ float __ocml_i1_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
+__device__ __attribute__((pure)) float __ocml_erf_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
+__device__ __attribute__((const)) float __ocml_fabs_f32(float);
+__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
+__device__ __attribute__((const)) float __ocml_floor_f32(float);
+__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
+__device__ __attribute__((const)) __device__ float __ocml_fmod_f32(float,
+                                                                   float);
+__device__ float __ocml_frexp_f32(float,
+                                  __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
+__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
+__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
+__device__ __attribute__((const)) int __ocml_isinf_f32(float);
+__device__ __attribute__((const)) int __ocml_isnan_f32(float);
+__device__ float __ocml_j0_f32(float);
+__device__ float __ocml_j1_f32(float);
+__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
+__device__ float __ocml_lgamma_f32(float);
+__device__ __attribute__((pure)) float __ocml_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
+__device__ __attribute__((pure)) float __ocml_log2_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
+__device__ __attribute__((const)) float __ocml_logb_f32(float);
+__device__ __attribute__((pure)) float __ocml_log_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
+__device__ float __ocml_modf_f32(float,
+                                 __attribute__((address_space(5))) float *);
+__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
+__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
+__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
+                                                        float);
+__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
+__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
+__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
+__device__ float __ocml_remquo_f32(float, float,
+                                   __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
+__device__ __attribute__((const)) float __ocml_rint_f32(float);
+__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
+                                                         float);
+__device__ __attribute__((const)) float __ocml_round_f32(float);
+__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
+__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
+__device__ __attribute__((const)) int __ocml_signbit_f32(float);
+__device__ float __ocml_sincos_f32(float,
+                                   __attribute__((address_space(5))) float *);
+__device__ float __ocml_sincospi_f32(float,
+                                     __attribute__((address_space(5))) float *);
+__device__ float __ocml_sin_f32(float);
+__device__ float __ocml_native_sin_f32(float);
+__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
+__device__ float __ocml_sinpi_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
+__device__ float __ocml_tan_f32(float);
+__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
+__device__ float __ocml_tgamma_f32(float);
+__device__ __attribute__((const)) float __ocml_trunc_f32(float);
+__device__ float __ocml_y0_f32(float);
+__device__ float __ocml_y1_f32(float);
+
+// BEGIN INTRINSICS
+__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
+
+__device__ __attribute__((const)) float
+__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__device__ __attribute__((const)) double __ocml_acos_f64(double);
+__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
+__device__ __attribute__((const)) double __ocml_asin_f64(double);
+__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
+__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
+__device__ __attribute__((const)) double __ocml_atan_f64(double);
+__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
+__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_ceil_f64(double);
+__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
+__device__ double __ocml_cos_f64(double);
+__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
+__device__ double __ocml_cospi_f64(double);
+__device__ double __ocml_i0_f64(double);
+__device__ double __ocml_i1_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
+__device__ __attribute__((pure)) double __ocml_erf_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp_f64(double);
+__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
+__device__ __attribute__((const)) double __ocml_fabs_f64(double);
+__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
+__device__ __attribute__((const)) double __ocml_floor_f64(double);
+__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
+__device__ double __ocml_frexp_f64(double,
+                                   __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
+__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
+__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
+__device__ __attribute__((const)) int __ocml_isinf_f64(double);
+__device__ __attribute__((const)) int __ocml_isnan_f64(double);
+__device__ double __ocml_j0_f64(double);
+__device__ double __ocml_j1_f64(double);
+__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
+__device__ double __ocml_lgamma_f64(double);
+__device__ __attribute__((pure)) double __ocml_log10_f64(double);
+__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
+__device__ __attribute__((pure)) double __ocml_log2_f64(double);
+__device__ __attribute__((const)) double __ocml_logb_f64(double);
+__device__ __attribute__((pure)) double __ocml_log_f64(double);
+__device__ double __ocml_modf_f64(double,
+                                  __attribute__((address_space(5))) double *);
+__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
+__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
+__device__ __attribute__((const)) double __ocml_len3_f64(double, double,
+                                                         double);
+__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
+                                                         double);
+__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
+__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
+__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
+__device__ double __ocml_remquo_f64(double, double,
+                                    __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
+__device__ __attribute__((const)) double __ocml_rint_f64(double);
+__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double,
+                                                          double);
+__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double,
+                                                          double, double);
+__device__ __attribute__((const)) double __ocml_round_f64(double);
+__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
+__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
+__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
+__device__ __attribute__((const)) int __ocml_signbit_f64(double);
+__device__ double __ocml_sincos_f64(double,
+                                    __attribute__((address_space(5))) double *);
+__device__ double
+__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
+__device__ double __ocml_sin_f64(double);
+__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
+__device__ double __ocml_sinpi_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
+__device__ double __ocml_tan_f64(double);
+__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
+__device__ double __ocml_tgamma_f64(double);
+__device__ __attribute__((const)) double __ocml_trunc_f64(double);
+__device__ double __ocml_y0_f64(double);
+__device__ double __ocml_y1_f64(double);
+
+// BEGIN INTRINSICS
+__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
+                                                            double);
+
+__device__ __attribute__((const)) double
+__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
+__device__ __attribute__((const)) double
+__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
+
+__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
+__device__ _Float16 __ocml_cos_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
+                                                          _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
+__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
+__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
+__device__ _Float16 __ocml_sin_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
+
+typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
+typedef short __2i16 __attribute__((ext_vector_type(2)));
+
+__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
+                                                     float c, bool s);
+__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
+__device__ __2f16 __ocml_cos_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
+__device__ __attribute__((const))
+__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
+__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
+__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
+__device__ inline __2f16
+__llvm_amdgcn_rcp_2f16(__2f16 x) // Not currently exposed by ROCDL.
+{
+  return __2f16{__llvm_amdgcn_rcp_f16(x.x), __llvm_amdgcn_rcp_f16(x.y)};
+}
+__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
+__device__ __2f16 __ocml_sin_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
+
+} // extern "C"
+
+#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__
Index: clang/lib/Headers/__clang_cuda_math_forward_declares.h
===================================================================
--- clang/lib/Headers/__clang_cuda_math_forward_declares.h
+++ clang/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -8,8 +8,8 @@
  */
 #ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
 #define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
-#ifndef __CUDA__
-#error "This file is for CUDA compilation only."
+#if !__CUDA__ && !__HIP__
+#error "This file is for CUDA/HIP compilation only."
 #endif
 
 // This file forward-declares of some math functions we (or the CUDA headers)
Index: clang/lib/Headers/CMakeLists.txt
===================================================================
--- clang/lib/Headers/CMakeLists.txt
+++ clang/lib/Headers/CMakeLists.txt
@@ -45,6 +45,9 @@
   __clang_cuda_libdevice_declares.h
   __clang_cuda_math_forward_declares.h
   __clang_cuda_runtime_wrapper.h
+  __clang_hip_libdevice_declares.h
+  __clang_hip_math.h
+  __clang_hip_runtime_wrapper.h
   cetintrin.h
   cet.h
   cldemoteintrin.h
Index: clang/lib/Driver/ToolChains/ROCm.h
===================================================================
--- /dev/null
+++ clang/lib/Driver/ToolChains/ROCm.h
@@ -0,0 +1,166 @@
+//===--- ROCm.h - ROCm installation detector --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H
+
+#include "clang/Basic/Cuda.h"
+#include "clang/Driver/Options.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+
+namespace clang {
+namespace driver {
+
+/// A class to find a viable ROCM installation
+/// TODO: Generalize to handle libclc.
+class RocmInstallationDetector {
+private:
+  struct ConditionalLibrary {
+    SmallString<0> On;
+    SmallString<0> Off;
+
+    bool isValid() const { return !On.empty() && !Off.empty(); }
+
+    StringRef get(bool Enabled) const {
+      assert(isValid());
+      return Enabled ? On : Off;
+    }
+  };
+
+  const Driver &D;
+  bool IsValid = false;
+  // RocmVersion Version = RocmVersion::UNKNOWN;
+  SmallString<0> InstallPath;
+  // SmallString<0> BinPath;
+  SmallString<0> LibPath;
+  SmallString<0> LibDevicePath;
+  SmallString<0> IncludePath;
+  llvm::StringMap<std::string> LibDeviceMap;
+
+  // Libraries that are always linked.
+  SmallString<0> OCML;
+  SmallString<0> OCKL;
+
+  // Libraries that are always linked depending on the language
+  SmallString<0> OpenCL;
+  SmallString<0> HIP;
+
+  // Libraries swapped based on compile flags.
+  ConditionalLibrary WavefrontSize64;
+  ConditionalLibrary FiniteOnly;
+  ConditionalLibrary UnsafeMath;
+  ConditionalLibrary DenormalsAreZero;
+  ConditionalLibrary CorrectlyRoundedSqrt;
+
+  bool allGenericLibsValid() const {
+    return !OCML.empty() && !OCKL.empty() && !OpenCL.empty() && !HIP.empty() &&
+           WavefrontSize64.isValid() && FiniteOnly.isValid() &&
+           UnsafeMath.isValid() && DenormalsAreZero.isValid() &&
+           CorrectlyRoundedSqrt.isValid();
+  }
+
+  // CUDA architectures for which we have raised an error in
+  // CheckRocmVersionSupportsArch.
+  mutable llvm::SmallSet<CudaArch, 4> ArchsWithBadVersion;
+
+  void scanLibDevicePath();
+
+public:
+  RocmInstallationDetector(const Driver &D, const llvm::Triple &HostTriple,
+                           const llvm::opt::ArgList &Args);
+
+  /// Add arguments needed to link default bitcode libraries.
+  void addCommonBitcodeLibCC1Args(const llvm::opt::ArgList &DriverArgs,
+                                  llvm::opt::ArgStringList &CC1Args,
+                                  StringRef LibDeviceFile, bool Wave64,
+                                  bool DAZ, bool FiniteOnly, bool UnsafeMathOpt,
+                                  bool FastRelaxedMath, bool CorrectSqrt) const;
+
+  /// Emit an error if Version does not support the given Arch.
+  ///
+  /// If either Version or Arch is unknown, does not emit an error.  Emits at
+  /// most one error per Arch.
+  void CheckRocmVersionSupportsArch(CudaArch Arch) const;
+
+  /// Check whether we detected a valid Rocm install.
+  bool isValid() const { return IsValid; }
+  /// Print information about the detected CUDA installation.
+  void print(raw_ostream &OS) const;
+
+  /// Get the detected Rocm install's version.
+  // RocmVersion version() const { return Version; }
+
+  /// Get the detected Rocm installation path.
+  StringRef getInstallPath() const { return InstallPath; }
+
+  /// Get the detected path to Rocm's bin directory.
+  // StringRef getBinPath() const { return BinPath; }
+
+  /// Get the detected Rocm Include path.
+  StringRef getIncludePath() const { return IncludePath; }
+
+  /// Get the detected Rocm library path.
+  StringRef getLibPath() const { return LibPath; }
+
+  /// Get the detected Rocm device library path.
+  StringRef getLibDevicePath() const { return LibDevicePath; }
+
+  StringRef getOCMLPath() const {
+    assert(!OCML.empty());
+    return OCML;
+  }
+
+  StringRef getOCKLPath() const {
+    assert(!OCKL.empty());
+    return OCKL;
+  }
+
+  StringRef getOpenCLPath() const {
+    assert(!OpenCL.empty());
+    return OpenCL;
+  }
+
+  StringRef getHIPPath() const {
+    assert(!HIP.empty());
+    return HIP;
+  }
+
+  StringRef getWavefrontSize64Path(bool Enabled) const {
+    return WavefrontSize64.get(Enabled);
+  }
+
+  StringRef getFiniteOnlyPath(bool Enabled) const {
+    return FiniteOnly.get(Enabled);
+  }
+
+  StringRef getUnsafeMathPath(bool Enabled) const {
+    return UnsafeMath.get(Enabled);
+  }
+
+  StringRef getDenormalsAreZeroPath(bool Enabled) const {
+    return DenormalsAreZero.get(Enabled);
+  }
+
+  StringRef getCorrectlyRoundedSqrtPath(bool Enabled) const {
+    return CorrectlyRoundedSqrt.get(Enabled);
+  }
+
+  /// Get libdevice file for given architecture
+  std::string getLibDeviceFile(StringRef Gpu) const {
+    return LibDeviceMap.lookup(Gpu);
+  }
+
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const;
+};
+
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H
Index: clang/lib/Driver/ToolChains/MSVC.h
===================================================================
--- clang/lib/Driver/ToolChains/MSVC.h
+++ clang/lib/Driver/ToolChains/MSVC.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MSVC_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MSVC_H
 
+#include "AMDGPU.h"
 #include "Cuda.h"
 #include "clang/Basic/DebugInfoOptions.h"
 #include "clang/Driver/Compilation.h"
@@ -125,6 +126,9 @@
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
 
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
+
   bool getWindowsSDKLibraryPath(std::string &path) const;
   /// Check if Universal CRT should be used if available
   bool getUniversalCRTLibraryPath(std::string &path) const;
@@ -155,6 +159,7 @@
   std::string VCToolChainPath;
   ToolsetLayout VSLayout = ToolsetLayout::OlderVS;
   CudaInstallationDetector CudaInstallation;
+  RocmInstallationDetector RocmInstallation;
 };
 
 } // end namespace toolchains
Index: clang/lib/Driver/ToolChains/MSVC.cpp
===================================================================
--- clang/lib/Driver/ToolChains/MSVC.cpp
+++ clang/lib/Driver/ToolChains/MSVC.cpp
@@ -739,7 +739,8 @@
 
 MSVCToolChain::MSVCToolChain(const Driver &D, const llvm::Triple &Triple,
                              const ArgList &Args)
-    : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args) {
+    : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args),
+      RocmInstallation(D, Triple, Args) {
   getProgramPaths().push_back(getDriver().getInstalledDir());
   if (getDriver().getInstalledDir() != getDriver().Dir)
     getProgramPaths().push_back(getDriver().Dir);
@@ -797,6 +798,11 @@
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
+void MSVCToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                      ArgStringList &CC1Args) const {
+  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
+}
+
 void MSVCToolChain::printVerboseInfo(raw_ostream &OS) const {
   CudaInstallation.print(OS);
 }
Index: clang/lib/Driver/ToolChains/Linux.h
===================================================================
--- clang/lib/Driver/ToolChains/Linux.h
+++ clang/lib/Driver/ToolChains/Linux.h
@@ -31,6 +31,8 @@
       llvm::opt::ArgStringList &CC1Args) const override;
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
   CXXStdlibType GetDefaultCXXStdlibType() const override;
Index: clang/lib/Driver/ToolChains/Linux.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Linux.cpp
+++ clang/lib/Driver/ToolChains/Linux.cpp
@@ -797,6 +797,11 @@
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
+void Linux::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                              ArgStringList &CC1Args) const {
+  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
+}
+
 void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
                                 ArgStringList &CC1Args) const {
   if (GCCInstallation.isValid()) {
Index: clang/lib/Driver/ToolChains/HIP.h
===================================================================
--- clang/lib/Driver/ToolChains/HIP.h
+++ clang/lib/Driver/ToolChains/HIP.h
@@ -107,6 +107,8 @@
       llvm::opt::ArgStringList &CC1Args) const override;
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
 
   SanitizerMask getSupportedSanitizers() const override;
 
Index: clang/lib/Driver/ToolChains/HIP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -427,6 +427,11 @@
   HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
 }
 
+void HIPToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                     ArgStringList &CC1Args) const {
+  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
+}
+
 SanitizerMask HIPToolChain::getSupportedSanitizers() const {
   // The HIPToolChain only supports sanitizers in the sense that it allows
   // sanitizer arguments on the command line if they are supported by the host
Index: clang/lib/Driver/ToolChains/Gnu.h
===================================================================
--- clang/lib/Driver/ToolChains/Gnu.h
+++ clang/lib/Driver/ToolChains/Gnu.h
@@ -10,6 +10,7 @@
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_GNU_H
 
 #include "Cuda.h"
+#include "ROCm.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 #include <set>
@@ -278,6 +279,7 @@
 protected:
   GCCInstallationDetector GCCInstallation;
   CudaInstallationDetector CudaInstallation;
+  RocmInstallationDetector RocmInstallation;
 
 public:
   Generic_GCC(const Driver &D, const llvm::Triple &Triple,
Index: clang/lib/Driver/ToolChains/Gnu.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Gnu.cpp
+++ clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2578,7 +2578,7 @@
 Generic_GCC::Generic_GCC(const Driver &D, const llvm::Triple &Triple,
                          const ArgList &Args)
     : ToolChain(D, Triple, Args), GCCInstallation(D),
-      CudaInstallation(D, Triple, Args) {
+      CudaInstallation(D, Triple, Args), RocmInstallation(D, Triple, Args) {
   getProgramPaths().push_back(getDriver().getInstalledDir());
   if (getDriver().getInstalledDir() != getDriver().Dir)
     getProgramPaths().push_back(getDriver().Dir);
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -241,7 +241,7 @@
     CC1Args.push_back(DriverArgs.MakeArgString(P));
   }
 
-  if (DriverArgs.hasArg(options::OPT_nocudainc))
+  if (DriverArgs.hasArg(options::OPT_nogpuinc))
     return;
 
   if (!isValid()) {
@@ -765,7 +765,7 @@
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                        ArgStringList &CC1Args) const {
   // Check our CUDA version if we're going to include the CUDA headers.
-  if (!DriverArgs.hasArg(options::OPT_nocudainc) &&
+  if (!DriverArgs.hasArg(options::OPT_nogpuinc) &&
       !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
     StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
     assert(!Arch.empty() && "Must have an explicit GPU arch.");
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -1202,12 +1202,14 @@
   Args.AddLastArg(CmdArgs, options::OPT_MP);
   Args.AddLastArg(CmdArgs, options::OPT_MV);
 
-  // Add offload include arguments specific for CUDA.  This must happen before
-  // we -I or -include anything else, because we must pick up the CUDA headers
-  // from the particular CUDA installation, rather than from e.g.
-  // /usr/local/include.
+  // Add offload include arguments specific for CUDA/HIP.  This must happen
+  // before we -I or -include anything else, because we must pick up the
+  // CUDA/HIP headers from the particular CUDA/ROCm installation, rather than
+  // from e.g. /usr/local/include.
   if (JA.isOffloading(Action::OFK_Cuda))
     getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
+  if (JA.isOffloading(Action::OFK_HIP))
+    getToolChain().AddHIPIncludeArgs(Args, CmdArgs);
 
   // If we are offloading to a target via OpenMP we need to include the
   // openmp_wrappers folder which contains alternative system headers.
Index: clang/lib/Driver/ToolChains/AMDGPU.h
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.h
+++ clang/lib/Driver/ToolChains/AMDGPU.h
@@ -10,6 +10,7 @@
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_AMDGPU_H
 
 #include "Gnu.h"
+#include "ROCm.h"
 #include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
@@ -21,148 +22,6 @@
 namespace clang {
 namespace driver {
 
-/// A class to find a viable ROCM installation
-/// TODO: Generalize to handle libclc.
-class RocmInstallationDetector {
-private:
-  struct ConditionalLibrary {
-    SmallString<0> On;
-    SmallString<0> Off;
-
-    bool isValid() const {
-      return !On.empty() && !Off.empty();
-    }
-
-    StringRef get(bool Enabled) const {
-      assert(isValid());
-      return Enabled ? On : Off;
-    }
-  };
-
-  const Driver &D;
-  bool IsValid = false;
-  //RocmVersion Version = RocmVersion::UNKNOWN;
-  SmallString<0> InstallPath;
-  //SmallString<0> BinPath;
-  SmallString<0> LibPath;
-  SmallString<0> LibDevicePath;
-  SmallString<0> IncludePath;
-  llvm::StringMap<std::string> LibDeviceMap;
-
-  // Libraries that are always linked.
-  SmallString<0> OCML;
-  SmallString<0> OCKL;
-
-  // Libraries that are always linked depending on the language
-  SmallString<0> OpenCL;
-  SmallString<0> HIP;
-
-  // Libraries swapped based on compile flags.
-  ConditionalLibrary WavefrontSize64;
-  ConditionalLibrary FiniteOnly;
-  ConditionalLibrary UnsafeMath;
-  ConditionalLibrary DenormalsAreZero;
-  ConditionalLibrary CorrectlyRoundedSqrt;
-
-  bool allGenericLibsValid() const {
-    return !OCML.empty() && !OCKL.empty() && !OpenCL.empty() && !HIP.empty() &&
-           WavefrontSize64.isValid() && FiniteOnly.isValid() &&
-           UnsafeMath.isValid() && DenormalsAreZero.isValid() &&
-           CorrectlyRoundedSqrt.isValid();
-  }
-
-  // CUDA architectures for which we have raised an error in
-  // CheckRocmVersionSupportsArch.
-  mutable llvm::SmallSet<CudaArch, 4> ArchsWithBadVersion;
-
-  void scanLibDevicePath();
-
-public:
-  RocmInstallationDetector(const Driver &D, const llvm::Triple &HostTriple,
-                           const llvm::opt::ArgList &Args);
-
-  /// Add arguments needed to link default bitcode libraries.
-  void addCommonBitcodeLibCC1Args(const llvm::opt::ArgList &DriverArgs,
-                                  llvm::opt::ArgStringList &CC1Args,
-                                  StringRef LibDeviceFile, bool Wave64,
-                                  bool DAZ, bool FiniteOnly, bool UnsafeMathOpt,
-                                  bool FastRelaxedMath, bool CorrectSqrt) const;
-
-  /// Emit an error if Version does not support the given Arch.
-  ///
-  /// If either Version or Arch is unknown, does not emit an error.  Emits at
-  /// most one error per Arch.
-  void CheckRocmVersionSupportsArch(CudaArch Arch) const;
-
-  /// Check whether we detected a valid Rocm install.
-  bool isValid() const { return IsValid; }
-  /// Print information about the detected CUDA installation.
-  void print(raw_ostream &OS) const;
-
-  /// Get the detected Rocm install's version.
-  // RocmVersion version() const { return Version; }
-
-  /// Get the detected Rocm installation path.
-  StringRef getInstallPath() const { return InstallPath; }
-
-  /// Get the detected path to Rocm's bin directory.
-  // StringRef getBinPath() const { return BinPath; }
-
-  /// Get the detected Rocm Include path.
-  StringRef getIncludePath() const { return IncludePath; }
-
-  /// Get the detected Rocm library path.
-  StringRef getLibPath() const { return LibPath; }
-
-  /// Get the detected Rocm device library path.
-  StringRef getLibDevicePath() const { return LibDevicePath; }
-
-  StringRef getOCMLPath() const {
-    assert(!OCML.empty());
-    return OCML;
-  }
-
-  StringRef getOCKLPath() const {
-    assert(!OCKL.empty());
-    return OCKL;
-  }
-
-  StringRef getOpenCLPath() const {
-    assert(!OpenCL.empty());
-    return OpenCL;
-  }
-
-  StringRef getHIPPath() const {
-    assert(!HIP.empty());
-    return HIP;
-  }
-
-  StringRef getWavefrontSize64Path(bool Enabled) const {
-    return WavefrontSize64.get(Enabled);
-  }
-
-  StringRef getFiniteOnlyPath(bool Enabled) const {
-    return FiniteOnly.get(Enabled);
-  }
-
-  StringRef getUnsafeMathPath(bool Enabled) const {
-    return UnsafeMath.get(Enabled);
-  }
-
-  StringRef getDenormalsAreZeroPath(bool Enabled) const {
-    return DenormalsAreZero.get(Enabled);
-  }
-
-  StringRef getCorrectlyRoundedSqrtPath(bool Enabled) const {
-    return CorrectlyRoundedSqrt.get(Enabled);
-  }
-
-  /// Get libdevice file for given architecture
-  std::string getLibDeviceFile(StringRef Gpu) const {
-    return LibDeviceMap.lookup(Gpu);
-  }
-};
-
 namespace tools {
 namespace amdgpu {
 
Index: clang/lib/Driver/ToolChains/AMDGPU.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -199,6 +199,40 @@
     OS << "Found ROCm installation: " << InstallPath << '\n';
 }
 
+void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                                 ArgStringList &CC1Args) const {
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    // HIP header includes standard library wrapper headers under clang
+    // cuda_wrappers directory. Since these wrapper headers include_next
+    // standard C++ headers, whereas libc++ headers include_next other clang
+    // headers. The include paths have to follow this order:
+    // - wrapper include path
+    // - standard C++ include path
+    // - other clang include path
+    // Since standard C++ and other clang include paths are added in other
+    // places after this function, here we only need to make sure wrapper
+    // include path is added.
+    SmallString<128> P(D.ResourceDir);
+    llvm::sys::path::append(P, "include");
+    llvm::sys::path::append(P, "cuda_wrappers");
+    CC1Args.push_back("-internal-isystem");
+    CC1Args.push_back(DriverArgs.MakeArgString(P));
+    CC1Args.push_back("-include");
+    CC1Args.push_back("__clang_hip_runtime_wrapper.h");
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nogpuinc))
+    return;
+
+  if (!isValid()) {
+    D.Diag(diag::err_drv_no_rocm_installation);
+    return;
+  }
+
+  CC1Args.push_back("-internal-isystem");
+  CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
+}
+
 void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                   const InputInfo &Output,
                                   const InputInfoList &Inputs,
Index: clang/lib/Driver/ToolChain.cpp
===================================================================
--- clang/lib/Driver/ToolChain.cpp
+++ clang/lib/Driver/ToolChain.cpp
@@ -991,6 +991,9 @@
 void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                    ArgStringList &CC1Args) const {}
 
+void ToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                  ArgStringList &CC1Args) const {}
+
 void ToolChain::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
                                     ArgStringList &CC1Args) const {}
 
Index: clang/include/clang/Driver/ToolChain.h
===================================================================
--- clang/include/clang/Driver/ToolChain.h
+++ clang/include/clang/Driver/ToolChain.h
@@ -617,6 +617,10 @@
   virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                   llvm::opt::ArgStringList &CC1Args) const;
 
+  /// Add arguments to use system-specific HIP includes.
+  virtual void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                                 llvm::opt::ArgStringList &CC1Args) const;
+
   /// Add arguments to use MCU GCC toolchain includes.
   virtual void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                    llvm::opt::ArgStringList &CC1Args) const;
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -2779,7 +2779,8 @@
 def no__dead__strip__inits__and__terms : Flag<["-"], "no_dead_strip_inits_and_terms">;
 def nobuiltininc : Flag<["-"], "nobuiltininc">, Flags<[CC1Option, CoreOption]>,
   HelpText<"Disable builtin #include directories">;
-def nocudainc : Flag<["-"], "nocudainc">;
+def nogpuinc : Flag<["-"], "nogpuinc">;
+def : Flag<["-"], "nocudainc">, Alias<nogpuinc>;
 def nogpulib : Flag<["-"], "nogpulib">,
   HelpText<"Do not link device library for CUDA/HIP device compilation">;
 def : Flag<["-"], "nocudalib">, Alias<nogpulib>;
Index: clang/include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -58,7 +58,7 @@
 
 def err_drv_no_rocm_installation : Error<
   "cannot find ROCm installation.  Provide its path via --rocm-path, or pass "
-  "-nogpulib.">;
+  "-nogpulib and -nogpuinc to build without ROCm device library and HIP includes.">;
 def err_drv_no_rocm_device_lib : Error<
   "cannot find device library for %0. Provide path to different ROCm installation "
   "via --rocm-path, or pass -nogpulib to build without linking default libraries.">;
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to