https://github.com/JonChesterfield updated 
https://github.com/llvm/llvm-project/pull/131164
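
For context, a minimal usage sketch (not part of the patch) of what the new
header provides once <gpuintrin.h> dispatches to it on the spirv64-amd-amdhsa
target; the kernel name, its arguments, and its body are hypothetical:

  #include <gpuintrin.h>
  #include <stdint.h>

  // Hypothetical element-wise kernel: each workitem scales one element.
  // The index helpers resolve to the wrappers added in spirvamdgpuintrin.h.
  __gpu_kernel void scale(float *out, const float *in, float f, uint32_t n) {
    uint32_t i =
        __gpu_num_threads_x() * __gpu_block_id_x() + __gpu_thread_id_x();
    if (i < n)
      out[i] = f * in[i];
  }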

>From 092024bbf31b0677e6efbb0e6fc0cee7606ecb08 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfi...@gmail.com>
Date: Tue, 18 Mar 2025 15:57:02 +0000
Subject: [PATCH] [Headers] Implement spirvamdgpuintrin.h

---
 clang/lib/Headers/amdgpuintrin.h      |   2 +-
 clang/lib/Headers/gpuintrin.h         |   4 +
 clang/lib/Headers/spirvamdgpuintrin.h | 191 ++++++++++++++++++++++
 clang/test/Headers/gpuintrin.c        | 223 ++++++++++++++++++++++++++
 4 files changed, 419 insertions(+), 1 deletion(-)
 create mode 100644 clang/lib/Headers/spirvamdgpuintrin.h

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index f7fb8e2814180..817cfeec896c4 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -1,4 +1,4 @@
-//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 0fb3916acac61..934490f51fb8e 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -59,7 +59,11 @@ _Pragma("omp end declare target");
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
+#if defined(__SPIRV64__)
+#include <spirvamdgpuintrin.h>
+#else
 #include <amdgpuintrin.h>
+#endif
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
 #endif
diff --git a/clang/lib/Headers/spirvamdgpuintrin.h b/clang/lib/Headers/spirvamdgpuintrin.h
new file mode 100644
index 0000000000000..1d123d39657a2
--- /dev/null
+++ b/clang/lib/Headers/spirvamdgpuintrin.h
@@ -0,0 +1,191 @@
+//===-- spirvamdgpuintrin.h - spirv amdgpu intrinsic functions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVAMDGPUINTRIN_H
+#define __SPIRVAMDGPUINTRIN_H
+
+#if !defined(__SPIRV64__) || !defined(__AMDGPU__)
+#error "This file is intended for the spirv64-amd-amdhsa target"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvamdgcnintrin.h> directly; include <gpuintrin.h> 
instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
+
+// Type aliases to the address spaces used by the SPIRV64 AMDGPU backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(1)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// The attribute to declare a function as a kernel is not available on SPIR-V.
+#define __gpu_kernel
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
+}
+
+// Returns a bitmask of threads in the current wavefront for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+}
+
+// Waits for all threads in the wavefront to converge; a no-op on AMDGPU.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  __builtin_amdgcn_wave_barrier();
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Returns true if the flat pointer points to AMDGPU 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
+      void [[clang::opencl_generic]] *)ptr));
+}
+
+// Returns true if the flat pointer points to AMDGPU 'private' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return __builtin_amdgcn_is_private((void [[clang::address_space(0)]] *)((
+      void [[clang::opencl_generic]] *)ptr));
+}
+
+// Terminates execution of the associated wavefront.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __builtin_amdgcn_endpgm();
+}
+
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  __builtin_amdgcn_s_sleep(2);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVAMDGPUINTRIN_H
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 9a15ce277ba87..574619cbdcf0b 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -9,6 +9,11 @@
 // RUN:   -target-feature +ptx62 \
 // RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=NVPTX
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include  \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple spirv64-amd-amdhsa -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SPIRVAMD
 
 #include <gpuintrin.h>
 
@@ -978,6 +983,224 @@ __gpu_kernel void foo() {
 // NVPTX-NEXT:    call void @llvm.nvvm.exit()
 // NVPTX-NEXT:    ret void
 //
+//
+// SPIRVAMD-LABEL: define spir_func void @foo(
+// SPIRVAMD-SAME: ) #[[ATTR0:[0-9]+]] {
+// SPIRVAMD-NEXT:  [[ENTRY:.*:]]
+// SPIRVAMD-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRVAMD-NEXT:    [[CALL1:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRVAMD-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRVAMD-NEXT:    [[CALL3:%.*]] = call spir_func i32 @__gpu_num_blocks(i32 noundef 0)
+// SPIRVAMD-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRVAMD-NEXT:    [[CALL5:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRVAMD-NEXT:    [[CALL6:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRVAMD-NEXT:    [[CALL7:%.*]] = call spir_func i32 @__gpu_block_id(i32 noundef 0)
+// SPIRVAMD-NEXT:    [[CALL8:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRVAMD-NEXT:    [[CALL9:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRVAMD-NEXT:    [[CALL10:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRVAMD-NEXT:    [[CALL11:%.*]] = call spir_func i32 @__gpu_num_threads(i32 noundef 0)
+// SPIRVAMD-NEXT:    [[CALL12:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRVAMD-NEXT:    [[CALL13:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRVAMD-NEXT:    [[CALL14:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRVAMD-NEXT:    [[CALL15:%.*]] = call spir_func i32 @__gpu_thread_id(i32 noundef 0)
+// SPIRVAMD-NEXT:    [[CALL16:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRVAMD-NEXT:    [[CALL17:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRVAMD-NEXT:    [[CALL18:%.*]] = call spir_func i64 @__gpu_lane_mask()
+// SPIRVAMD-NEXT:    [[CALL19:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1)
+// SPIRVAMD-NEXT:    [[CALL20:%.*]] = call spir_func i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1)
+// SPIRVAMD-NEXT:    [[CALL21:%.*]] = call spir_func i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true)
+// SPIRVAMD-NEXT:    call spir_func void @__gpu_sync_threads()
+// SPIRVAMD-NEXT:    call spir_func void @__gpu_sync_lane(i64 noundef -1)
+// SPIRVAMD-NEXT:    [[CALL22:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0)
+// SPIRVAMD-NEXT:    [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef -1)
+// SPIRVAMD-NEXT:    [[CALL24:%.*]] = call spir_func zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1)
+// SPIRVAMD-NEXT:    call spir_func void @__gpu_exit() #[[ATTR4:[0-9]+]]
+// SPIRVAMD-NEXT:    unreachable
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i32 @__gpu_num_blocks(
+// SPIRVAMD-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT:  [[ENTRY:.*:]]
+// SPIRVAMD-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRVAMD-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRVAMD-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRVAMD-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRVAMD-NEXT:    ]
+// SPIRVAMD:       [[SW_BB]]:
+// SPIRVAMD-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRVAMD-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN:.*]]
+// SPIRVAMD:       [[SW_BB1]]:
+// SPIRVAMD-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRVAMD-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN]]
+// SPIRVAMD:       [[SW_BB3]]:
+// SPIRVAMD-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRVAMD-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN]]
+// SPIRVAMD:       [[SW_DEFAULT]]:
+// SPIRVAMD-NEXT:    unreachable
+// SPIRVAMD:       [[RETURN]]:
+// SPIRVAMD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i32 @__gpu_block_id(
+// SPIRVAMD-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT:  [[ENTRY:.*:]]
+// SPIRVAMD-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRVAMD-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRVAMD-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRVAMD-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRVAMD-NEXT:    ]
+// SPIRVAMD:       [[SW_BB]]:
+// SPIRVAMD-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRVAMD-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN:.*]]
+// SPIRVAMD:       [[SW_BB1]]:
+// SPIRVAMD-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRVAMD-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN]]
+// SPIRVAMD:       [[SW_BB3]]:
+// SPIRVAMD-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRVAMD-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN]]
+// SPIRVAMD:       [[SW_DEFAULT]]:
+// SPIRVAMD-NEXT:    unreachable
+// SPIRVAMD:       [[RETURN]]:
+// SPIRVAMD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i32 @__gpu_num_threads(
+// SPIRVAMD-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT:  [[ENTRY:.*:]]
+// SPIRVAMD-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRVAMD-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRVAMD-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRVAMD-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRVAMD-NEXT:    ]
+// SPIRVAMD:       [[SW_BB]]:
+// SPIRVAMD-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRVAMD-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN:.*]]
+// SPIRVAMD:       [[SW_BB1]]:
+// SPIRVAMD-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRVAMD-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN]]
+// SPIRVAMD:       [[SW_BB3]]:
+// SPIRVAMD-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRVAMD-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN]]
+// SPIRVAMD:       [[SW_DEFAULT]]:
+// SPIRVAMD-NEXT:    unreachable
+// SPIRVAMD:       [[RETURN]]:
+// SPIRVAMD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i32 @__gpu_thread_id(
+// SPIRVAMD-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT:  [[ENTRY:.*:]]
+// SPIRVAMD-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRVAMD-NEXT:      i32 0, label %[[SW_BB:.*]]
+// SPIRVAMD-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// SPIRVAMD-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// SPIRVAMD-NEXT:    ]
+// SPIRVAMD:       [[SW_BB]]:
+// SPIRVAMD-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRVAMD-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN:.*]]
+// SPIRVAMD:       [[SW_BB1]]:
+// SPIRVAMD-NEXT:    [[CALL2:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRVAMD-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN]]
+// SPIRVAMD:       [[SW_BB3]]:
+// SPIRVAMD-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRVAMD-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    br label %[[RETURN]]
+// SPIRVAMD:       [[SW_DEFAULT]]:
+// SPIRVAMD-NEXT:    unreachable
+// SPIRVAMD:       [[RETURN]]:
+// SPIRVAMD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT:    ret i32 [[TMP1]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i64 @__gpu_read_first_lane_u64(
+// SPIRVAMD-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT:  [[ENTRY:.*:]]
+// SPIRVAMD-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRVAMD-NEXT:    [[__X_ADDR:%.*]] = alloca i64, align 8
+// SPIRVAMD-NEXT:    [[__HI:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    [[__LO:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT:    store i64 [[__X]], ptr [[__X_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// SPIRVAMD-NEXT:    [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// SPIRVAMD-NEXT:    store i32 [[CONV]], ptr [[__HI]], align 4
+// SPIRVAMD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// SPIRVAMD-NEXT:    [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// SPIRVAMD-NEXT:    store i32 [[CONV1]], ptr [[__LO]], align 4
+// SPIRVAMD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
+// SPIRVAMD-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]])
+// SPIRVAMD-NEXT:    [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// SPIRVAMD-NEXT:    [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// SPIRVAMD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
+// SPIRVAMD-NEXT:    [[CALL3:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]])
+// SPIRVAMD-NEXT:    [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// SPIRVAMD-NEXT:    [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// SPIRVAMD-NEXT:    [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// SPIRVAMD-NEXT:    ret i64 [[OR]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i64 @__gpu_first_lane_id(
+// SPIRVAMD-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT:  [[ENTRY:.*:]]
+// SPIRVAMD-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRVAMD-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
+// SPIRVAMD-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1
+// SPIRVAMD-NEXT:    [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0
+// SPIRVAMD-NEXT:    [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]]
+// SPIRVAMD-NEXT:    [[CAST:%.*]] = trunc i64 [[FFS]] to i32
+// SPIRVAMD-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CAST]], 1
+// SPIRVAMD-NEXT:    [[CONV:%.*]] = sext i32 [[SUB]] to i64
+// SPIRVAMD-NEXT:    ret i64 [[CONV]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func zeroext i1 @__gpu_is_first_in_lane(
+// SPIRVAMD-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT:  [[ENTRY:.*:]]
+// SPIRVAMD-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRVAMD-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRVAMD-NEXT:    [[CONV:%.*]] = zext i32 [[CALL]] to i64
+// SPIRVAMD-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT:    [[CALL1:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef [[TMP0]])
+// SPIRVAMD-NEXT:    [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]]
+// SPIRVAMD-NEXT:    ret i1 [[CMP]]
+//
 //.
 // AMDGPU: [[RNG3]] = !{i32 1, i32 0}
 // AMDGPU: [[META4]] = !{}
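
As a usage note (not part of the patch): __gpu_ballot in the new header ANDs
the hardware ballot with the caller's lane mask, so that, as on NVPTX, lanes
outside the mask do not contribute to the result. A small hypothetical
illustration, assuming every lane of the wavefront is active:

  uint64_t active = __gpu_lane_mask();             // lanes currently executing
  uint64_t mask = active & 0x5555555555555555ull;  // restrict the vote to even lanes
  // Bits are set only for lanes in mask whose predicate evaluated to true.
  uint64_t vote = __gpu_ballot(mask, __gpu_thread_id_x() % 2 == 0);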
