https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/131164
>From 092024bbf31b0677e6efbb0e6fc0cee7606ecb08 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfi...@gmail.com>
Date: Tue, 18 Mar 2025 15:57:02 +0000
Subject: [PATCH] [Headers] Implement spirvamdgpuintrin.h

---
 clang/lib/Headers/amdgpuintrin.h      |   2 +-
 clang/lib/Headers/gpuintrin.h         |   4 +
 clang/lib/Headers/spirvamdgpuintrin.h | 191 ++++++++++++++++++++++
 clang/test/Headers/gpuintrin.c        | 223 ++++++++++++++++++++++++++
 4 files changed, 419 insertions(+), 1 deletion(-)
 create mode 100644 clang/lib/Headers/spirvamdgpuintrin.h

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index f7fb8e2814180..817cfeec896c4 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -1,4 +1,4 @@
-//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 0fb3916acac61..934490f51fb8e 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -59,7 +59,11 @@ _Pragma("omp end declare target");
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
+#if defined(__SPIRV64__)
+#include <spirvamdgpuintrin.h>
+#else
 #include <amdgpuintrin.h>
+#endif
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
 #endif
diff --git a/clang/lib/Headers/spirvamdgpuintrin.h b/clang/lib/Headers/spirvamdgpuintrin.h
new file mode 100644
index 0000000000000..1d123d39657a2
--- /dev/null
+++ b/clang/lib/Headers/spirvamdgpuintrin.h
@@ -0,0 +1,191 @@
+//===-- spirvamdgpuintrin.h - spirv amdgpu intrinsic functions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVAMDGPUINTRIN_H
+#define __SPIRVAMDGPUINTRIN_H
+
+#if !defined(__SPIRV64__) || !defined(__AMDGPU__)
+#error "This file is intended for the spirv64-amd-amdhsa target"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvamdgpuintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
+
+// Type aliases to the address spaces used by the SPIRV64 AMDGPU backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(1)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// An attribute to declare a function as a kernel is not available on SPIR-V.
+#define __gpu_kernel
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+}
+
+// Wait for all threads in the wavefront to converge; this is a noop on AMDGPU.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  __builtin_amdgcn_wave_barrier();
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Returns true if the flat pointer points to AMDGPU 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
+      void [[clang::opencl_generic]] *)ptr));
+}
+
+// Returns true if the flat pointer points to AMDGPU 'private' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return __builtin_amdgcn_is_private((void [[clang::address_space(0)]] *)((
+      void [[clang::opencl_generic]] *)ptr));
+}
+
+// Terminates execution of the associated wavefront.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __builtin_amdgcn_endpgm();
+}
+
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  __builtin_amdgcn_s_sleep(2);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVAMDGPUINTRIN_H
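For context, here is a minimal sketch, not part of the patch, of the portable style of code this header is meant to support. It uses only the __gpu_* wrappers from <gpuintrin.h>, so the same source can build for amdgcn-amd-amdhsa, nvptx64-nvidia-cuda and, with this change, spirv64-amd-amdhsa (the new RUN line in the test below uses that triple). The helper names wave_sum and sum_wavefronts are invented for illustration, and the reduction assumes every lane of the wavefront is active with the input padded to a multiple of the wavefront size.

#include <gpuintrin.h>
#include <stdint.h>

// Sums 'x' across the wavefront with a butterfly of index shuffles, then
// broadcasts the total from the first lane to every lane.
static uint32_t wave_sum(uint32_t x) {
  uint64_t mask = __gpu_lane_mask();
  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2)
    x += __gpu_shuffle_idx_u32(mask, __gpu_lane_id() ^ step, x,
                               __gpu_num_lanes());
  return __gpu_read_first_lane_u32(mask, x);
}

// One workitem per element; __gpu_kernel expands to nothing on SPIR-V, so this
// is an ordinary function there.
__gpu_kernel void sum_wavefronts(const uint32_t *in, uint32_t *out) {
  uint32_t id =
      __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
  uint32_t total = wave_sum(in[id]);
  if (__gpu_lane_id() == 0)
    out[id / __gpu_num_lanes()] = total;
}

With the dispatch added to gpuintrin.h above, a translation unit like this picks up spirvamdgpuintrin.h whenever both __AMDGPU__ and __SPIRV64__ are defined, and amdgpuintrin.h otherwise.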
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 9a15ce277ba87..574619cbdcf0b 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -9,6 +9,11 @@
 // RUN:   -target-feature +ptx62 \
 // RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
 // RUN:   | FileCheck %s --check-prefix=NVPTX
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple spirv64-amd-amdhsa -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=SPIRVAMD
 
 #include <gpuintrin.h>
 
@@ -978,6 +983,224 @@ __gpu_kernel void foo() {
 // NVPTX-NEXT: call void @llvm.nvvm.exit()
 // NVPTX-NEXT: ret void
 //
+//
+// SPIRVAMD-LABEL: define spir_func void @foo(
+// SPIRVAMD-SAME: ) #[[ATTR0:[0-9]+]] {
+// SPIRVAMD-NEXT: [[ENTRY:.*:]]
+// SPIRVAMD-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRVAMD-NEXT: [[CALL1:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRVAMD-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRVAMD-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_num_blocks(i32 noundef 0)
+// SPIRVAMD-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRVAMD-NEXT: [[CALL5:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRVAMD-NEXT: [[CALL6:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRVAMD-NEXT: [[CALL7:%.*]] = call spir_func i32 @__gpu_block_id(i32 noundef 0)
+// SPIRVAMD-NEXT: [[CALL8:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRVAMD-NEXT: [[CALL9:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRVAMD-NEXT: [[CALL10:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRVAMD-NEXT: [[CALL11:%.*]] = call spir_func i32 @__gpu_num_threads(i32 noundef 0)
+// SPIRVAMD-NEXT: [[CALL12:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRVAMD-NEXT: [[CALL13:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRVAMD-NEXT: [[CALL14:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRVAMD-NEXT: [[CALL15:%.*]] = call spir_func i32 @__gpu_thread_id(i32 noundef 0)
+// SPIRVAMD-NEXT: [[CALL16:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRVAMD-NEXT: [[CALL17:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRVAMD-NEXT: [[CALL18:%.*]] = call spir_func i64 @__gpu_lane_mask()
+// SPIRVAMD-NEXT: [[CALL19:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1)
+// SPIRVAMD-NEXT: [[CALL20:%.*]] = call spir_func i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1)
+// SPIRVAMD-NEXT: [[CALL21:%.*]] = call spir_func i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true)
+// SPIRVAMD-NEXT: call spir_func void @__gpu_sync_threads()
+// SPIRVAMD-NEXT: call spir_func void @__gpu_sync_lane(i64 noundef -1)
+// SPIRVAMD-NEXT: [[CALL22:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0)
+// SPIRVAMD-NEXT: [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef -1)
+// SPIRVAMD-NEXT: [[CALL24:%.*]] = call spir_func zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1)
+// SPIRVAMD-NEXT: call spir_func void @__gpu_exit() #[[ATTR4:[0-9]+]]
+// SPIRVAMD-NEXT: unreachable
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i32 @__gpu_num_blocks(
+// SPIRVAMD-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT: [[ENTRY:.*:]]
+// SPIRVAMD-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRVAMD-NEXT: i32 0, label %[[SW_BB:.*]]
+// SPIRVAMD-NEXT: i32 1, label %[[SW_BB1:.*]]
+// SPIRVAMD-NEXT: i32 2, label %[[SW_BB3:.*]]
+// SPIRVAMD-NEXT: ]
+// SPIRVAMD: [[SW_BB]]:
+// SPIRVAMD-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x()
+// SPIRVAMD-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN:.*]]
+// SPIRVAMD: [[SW_BB1]]:
+// SPIRVAMD-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_y()
+// SPIRVAMD-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN]]
+// SPIRVAMD: [[SW_BB3]]:
+// SPIRVAMD-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_blocks_z()
+// SPIRVAMD-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN]]
+// SPIRVAMD: [[SW_DEFAULT]]:
+// SPIRVAMD-NEXT: unreachable
+// SPIRVAMD: [[RETURN]]:
+// SPIRVAMD-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: ret i32 [[TMP1]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i32 @__gpu_block_id(
+// SPIRVAMD-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT: [[ENTRY:.*:]]
+// SPIRVAMD-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRVAMD-NEXT: i32 0, label %[[SW_BB:.*]]
+// SPIRVAMD-NEXT: i32 1, label %[[SW_BB1:.*]]
+// SPIRVAMD-NEXT: i32 2, label %[[SW_BB3:.*]]
+// SPIRVAMD-NEXT: ]
+// SPIRVAMD: [[SW_BB]]:
+// SPIRVAMD-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_block_id_x()
+// SPIRVAMD-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN:.*]]
+// SPIRVAMD: [[SW_BB1]]:
+// SPIRVAMD-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_block_id_y()
+// SPIRVAMD-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN]]
+// SPIRVAMD: [[SW_BB3]]:
+// SPIRVAMD-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_z()
+// SPIRVAMD-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN]]
+// SPIRVAMD: [[SW_DEFAULT]]:
+// SPIRVAMD-NEXT: unreachable
+// SPIRVAMD: [[RETURN]]:
+// SPIRVAMD-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: ret i32 [[TMP1]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i32 @__gpu_num_threads(
+// SPIRVAMD-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT: [[ENTRY:.*:]]
+// SPIRVAMD-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRVAMD-NEXT: i32 0, label %[[SW_BB:.*]]
+// SPIRVAMD-NEXT: i32 1, label %[[SW_BB1:.*]]
+// SPIRVAMD-NEXT: i32 2, label %[[SW_BB3:.*]]
+// SPIRVAMD-NEXT: ]
+// SPIRVAMD: [[SW_BB]]:
+// SPIRVAMD-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_threads_x()
+// SPIRVAMD-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN:.*]]
+// SPIRVAMD: [[SW_BB1]]:
+// SPIRVAMD-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_threads_y()
+// SPIRVAMD-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN]]
+// SPIRVAMD: [[SW_BB3]]:
+// SPIRVAMD-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_threads_z()
+// SPIRVAMD-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN]]
+// SPIRVAMD: [[SW_DEFAULT]]:
+// SPIRVAMD-NEXT: unreachable
+// SPIRVAMD: [[RETURN]]:
+// SPIRVAMD-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: ret i32 [[TMP1]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i32 @__gpu_thread_id(
+// SPIRVAMD-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT: [[ENTRY:.*:]]
+// SPIRVAMD-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// SPIRVAMD-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// SPIRVAMD-NEXT: i32 0, label %[[SW_BB:.*]]
+// SPIRVAMD-NEXT: i32 1, label %[[SW_BB1:.*]]
+// SPIRVAMD-NEXT: i32 2, label %[[SW_BB3:.*]]
+// SPIRVAMD-NEXT: ]
+// SPIRVAMD: [[SW_BB]]:
+// SPIRVAMD-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_thread_id_x()
+// SPIRVAMD-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN:.*]]
+// SPIRVAMD: [[SW_BB1]]:
+// SPIRVAMD-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_thread_id_y()
+// SPIRVAMD-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN]]
+// SPIRVAMD: [[SW_BB3]]:
+// SPIRVAMD-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_thread_id_z()
+// SPIRVAMD-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: br label %[[RETURN]]
+// SPIRVAMD: [[SW_DEFAULT]]:
+// SPIRVAMD-NEXT: unreachable
+// SPIRVAMD: [[RETURN]]:
+// SPIRVAMD-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// SPIRVAMD-NEXT: ret i32 [[TMP1]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i64 @__gpu_read_first_lane_u64(
+// SPIRVAMD-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT: [[ENTRY:.*:]]
+// SPIRVAMD-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRVAMD-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
+// SPIRVAMD-NEXT: [[__HI:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: [[__LO:%.*]] = alloca i32, align 4
+// SPIRVAMD-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
+// SPIRVAMD-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRVAMD-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// SPIRVAMD-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// SPIRVAMD-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
+// SPIRVAMD-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// SPIRVAMD-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// SPIRVAMD-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// SPIRVAMD-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
+// SPIRVAMD-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
+// SPIRVAMD-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]])
+// SPIRVAMD-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// SPIRVAMD-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// SPIRVAMD-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
+// SPIRVAMD-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]])
+// SPIRVAMD-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// SPIRVAMD-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// SPIRVAMD-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// SPIRVAMD-NEXT: ret i64 [[OR]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func i64 @__gpu_first_lane_id(
+// SPIRVAMD-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT: [[ENTRY:.*:]]
+// SPIRVAMD-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRVAMD-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
+// SPIRVAMD-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1
+// SPIRVAMD-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0
+// SPIRVAMD-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]]
+// SPIRVAMD-NEXT: [[CAST:%.*]] = trunc i64 [[FFS]] to i32
+// SPIRVAMD-NEXT: [[SUB:%.*]] = sub nsw i32 [[CAST]], 1
+// SPIRVAMD-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64
+// SPIRVAMD-NEXT: ret i64 [[CONV]]
+//
+//
+// SPIRVAMD-LABEL: define internal spir_func zeroext i1 @__gpu_is_first_in_lane(
+// SPIRVAMD-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
+// SPIRVAMD-NEXT: [[ENTRY:.*:]]
+// SPIRVAMD-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRVAMD-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRVAMD-NEXT: [[CONV:%.*]] = zext i32 [[CALL]] to i64
+// SPIRVAMD-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRVAMD-NEXT: [[CALL1:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef [[TMP0]])
+// SPIRVAMD-NEXT: [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]]
+// SPIRVAMD-NEXT: ret i1 [[CMP]]
+//
 //.
 // AMDGPU: [[RNG3]] = !{i32 1, i32 0}
 // AMDGPU: [[META4]] = !{}

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits