https://github.com/frasercrmck created https://github.com/llvm/llvm-project/pull/141755
This commit moves the various vload and vstore builtins (including
vload_half, vloada_half, etc.) to the CLC library.

This is almost entirely a code move and makes no attempt to clean up or
optimize the definitions of these builtins; there is no change to any
target's builtin library, except that the vstore helper rounding functions
are now internalized. Cleanups can come in future work.

The new CLC declarations and new OpenCL wrappers show how these CLC
implementations could be defined more simply (illustrative sketches follow
the patch below). The builtins could probably also be vectorized in future
work; right now all of the 'half' versions of both vload and vstore are
essentially scalarized.

From 512f180952e8c514ac23aa05bb016285f3f7478c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fra...@codeplay.com>
Date: Wed, 28 May 2025 13:57:22 +0100
Subject: [PATCH] [libclc] Move vload & vstore to CLC library

This commit moves the various vload and vstore builtins (including
vload_half, vloada_half, etc.) to the CLC library.

This is almost entirely a code move and does not make any attempt to clean
up or optimize the definitions of these builtins. There is no change to
any of the targets' builtin libraries, except that the vstore helper
rounding functions are now internalized. Cleanups can come in future work.

The new CLC declarations and new OpenCL wrappers show how these CLC
implementations could be defined more simply. The builtins could probably
also be vectorized in future work; right now all of the 'half' versions
for both vload and vstore are essentially scalarized.
---
 .../clc/shared/clc_less_aligned_types.h       |  23 ++
 .../clc/shared/clc_less_aligned_types.inc     |  23 ++
 libclc/clc/include/clc/shared/clc_vload.h     |  20 ++
 libclc/clc/include/clc/shared/clc_vload.inc   |  64 +++++
 libclc/clc/include/clc/shared/clc_vstore.h    |  20 ++
 libclc/clc/include/clc/shared/clc_vstore.inc  |  70 +++++
 libclc/clc/lib/generic/SOURCES                |   2 +
 libclc/clc/lib/generic/shared/clc_vload.cl    | 130 +++++++++
 .../lib/generic/shared/clc_vload_half.inc}    |   0
 libclc/clc/lib/generic/shared/clc_vstore.cl   | 268 ++++++++++++++++++
 .../lib/generic/shared/clc_vstore_half.inc}   |   0
 libclc/opencl/lib/generic/shared/vload.cl     | 132 +--------
 libclc/opencl/lib/generic/shared/vload.inc    |  71 +++++
 libclc/opencl/lib/generic/shared/vstore.cl    | 251 +---------------
 libclc/opencl/lib/generic/shared/vstore.inc   |  77 +++++
 15 files changed, 776 insertions(+), 375 deletions(-)
 create mode 100644 libclc/clc/include/clc/shared/clc_less_aligned_types.h
 create mode 100644 libclc/clc/include/clc/shared/clc_less_aligned_types.inc
 create mode 100644 libclc/clc/include/clc/shared/clc_vload.h
 create mode 100644 libclc/clc/include/clc/shared/clc_vload.inc
 create mode 100644 libclc/clc/include/clc/shared/clc_vstore.h
 create mode 100644 libclc/clc/include/clc/shared/clc_vstore.inc
 create mode 100644 libclc/clc/lib/generic/shared/clc_vload.cl
 rename libclc/{opencl/lib/generic/shared/vload_half.inc => clc/lib/generic/shared/clc_vload_half.inc} (100%)
 create mode 100644 libclc/clc/lib/generic/shared/clc_vstore.cl
 rename libclc/{opencl/lib/generic/shared/vstore_half.inc => clc/lib/generic/shared/clc_vstore_half.inc} (100%)
 create mode 100644 libclc/opencl/lib/generic/shared/vload.inc
 create mode 100644 libclc/opencl/lib/generic/shared/vstore.inc

diff --git a/libclc/clc/include/clc/shared/clc_less_aligned_types.h b/libclc/clc/include/clc/shared/clc_less_aligned_types.h
new file mode 100644
index 0000000000000..73e436147ce0a
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_less_aligned_types.h
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines types to be used with (CLC) vstore and vload functions. These are
+// vector types whose alignment is that of their respective scalar types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
+#define __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
+
+#define __CLC_BODY <clc/shared/clc_less_aligned_types.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_less_aligned_types.inc>
+#include <clc/math/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
diff --git a/libclc/clc/include/clc/shared/clc_less_aligned_types.inc b/libclc/clc/include/clc/shared/clc_less_aligned_types.inc
new file mode 100644
index 0000000000000..45d69ea72fc0a
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_less_aligned_types.inc
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines types to be used with (CLC) vstore and vload functions. These are
+// vector types whose alignment is that of their respective scalar types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+typedef __CLC_GENTYPE __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE);
+
+#else
+
+typedef __CLC_GENTYPE __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+    __attribute__((aligned(sizeof(__CLC_SCALAR_GENTYPE))));
+
+#endif
diff --git a/libclc/clc/include/clc/shared/clc_vload.h b/libclc/clc/include/clc/shared/clc_vload.h
new file mode 100644
index 0000000000000..c3dbe0696cc19
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vload.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_VLOAD_H__
+#define __CLC_SHARED_CLC_VLOAD_H__
+
+#include <clc/shared/clc_less_aligned_types.h>
+
+#define __CLC_BODY <clc/shared/clc_vload.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_vload.inc>
+#include <clc/math/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_VLOAD_H__
diff --git a/libclc/clc/include/clc/shared/clc_vload.inc b/libclc/clc/include/clc/shared/clc_vload.inc
new file mode 100644
index 0000000000000..8f3b00ec04454
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vload.inc
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VLOAD_NAME __CLC_XCONCAT(__clc_vload, __CLC_VECSIZE)
+#define CLC_VLOAD_HALF_NAME __CLC_XCONCAT(__clc_vload_half, __CLC_VECSIZE)
+#define CLC_VLOADA_HALF_NAME __CLC_XCONCAT(__clc_vloada_half, __CLC_VECSIZE)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+
+#define CLC_VLOAD_DECL(ADDRSPACE) \
+  _CLC_OVERLOAD _CLC_DECL CLC_VLOAD_TY CLC_VLOAD_NAME( \
+      size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x);
+
+CLC_VLOAD_DECL(__private)
+CLC_VLOAD_DECL(__local)
+CLC_VLOAD_DECL(__constant)
+CLC_VLOAD_DECL(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VLOAD_DECL(__generic)
+#endif
+
+#undef CLC_VLOAD_DECL
+#undef CLC_VLOAD_TY
+
+#endif // __CLC_SCALAR
+
+// vload_half and vloada_half are available even if cl_khr_fp16 is unavailable.
+// Declare these functions when working on float types, which we know are
+// always available.
+#ifdef __CLC_FPSIZE
+#if __CLC_FPSIZE == 32
+
+#define CLC_VLOAD_HALF_DECL(ADDRSPACE) \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE CLC_VLOAD_HALF_NAME( \
+      size_t offset, const ADDRSPACE half *mem); \
+ \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE CLC_VLOADA_HALF_NAME( \
+      size_t offset, const ADDRSPACE half *mem);
+
+CLC_VLOAD_HALF_DECL(__private)
+CLC_VLOAD_HALF_DECL(__local)
+CLC_VLOAD_HALF_DECL(__constant)
+CLC_VLOAD_HALF_DECL(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VLOAD_HALF_DECL(__generic)
+#endif
+
+#undef CLC_VLOAD_HALF_DECL
+
+#endif
+#endif
+
+#undef CLC_VLOAD_NAME
+#undef CLC_VLOAD_HALF_NAME
+#undef CLC_VLOADA_HALF_NAME
diff --git a/libclc/clc/include/clc/shared/clc_vstore.h b/libclc/clc/include/clc/shared/clc_vstore.h
new file mode 100644
index 0000000000000..647dc7da1afbe
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vstore.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_VSTORE_H__
+#define __CLC_SHARED_CLC_VSTORE_H__
+
+#include <clc/shared/clc_less_aligned_types.h>
+
+#define __CLC_BODY <clc/shared/clc_vstore.inc>
+#include <clc/integer/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_vstore.inc>
+#include <clc/math/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_VSTORE_H__
diff --git a/libclc/clc/include/clc/shared/clc_vstore.inc b/libclc/clc/include/clc/shared/clc_vstore.inc
new file mode 100644
index 0000000000000..38d54b2f1b67f
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vstore.inc
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+#define CLC_VSTORE_NAME __CLC_XCONCAT(__clc_vstore, __CLC_VECSIZE)
+#define CLC_VSTORE_HALF_NAME(x) \
+  __CLC_XCONCAT(__CLC_XCONCAT(__clc_vstore_half, __CLC_VECSIZE), x)
+#define CLC_VSTOREA_HALF_NAME(x) \
+  __CLC_XCONCAT(__CLC_XCONCAT(__clc_vstorea_half, __CLC_VECSIZE), x)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VSTORE_DECL(ADDRSPACE) \
+  _CLC_OVERLOAD _CLC_DECL void CLC_VSTORE_NAME( \
+      CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p);
+
+CLC_VSTORE_DECL(__private)
+CLC_VSTORE_DECL(__local)
+CLC_VSTORE_DECL(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VSTORE_DECL(__generic)
+#endif
+
+#undef CLC_VSTORE_DECL
+
+#endif // __CLC_SCALAR
+
+// vstore_half and vstorea_half are available even if cl_khr_fp16 is
+// unavailable.
+#ifdef __CLC_FPSIZE
+#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
+
+#define CLC_VSTORE_HALF_DECL(ADDRSPACE, SUFFIX) \
+  _CLC_OVERLOAD _CLC_DECL void CLC_VSTORE_HALF_NAME(SUFFIX)( \
+      CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); \
+ \
+  _CLC_OVERLOAD _CLC_DECL void CLC_VSTOREA_HALF_NAME(SUFFIX)( \
+      CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p);
+
+#define CLC_VSTORE_HALF_DECL_ALL_MODES(ADDRSPACE) \
+  CLC_VSTORE_HALF_DECL(ADDRSPACE, ) \
+  CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtz) \
+  CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtn) \
+  CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtp) \
+  CLC_VSTORE_HALF_DECL(ADDRSPACE, _rte)
+
+CLC_VSTORE_HALF_DECL_ALL_MODES(__private)
+CLC_VSTORE_HALF_DECL_ALL_MODES(__local)
+CLC_VSTORE_HALF_DECL_ALL_MODES(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VSTORE_HALF_DECL_ALL_MODES(__generic)
+#endif
+
+#undef CLC_VSTORE_HALF_DECL
+#undef CLC_VSTORE_HALF_DECL_ALL_MODES
+
+#endif
+#endif
+
+#undef CLC_VSTORE_TY
+#undef CLC_VSTORE_NAME
+#undef CLC_VSTORE_HALF_NAME
+#undef CLC_VSTOREA_HALF_NAME
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index a8a906159e286..49c7ca636f240 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -147,3 +147,5 @@ relational/clc_signbit.cl
 shared/clc_clamp.cl
 shared/clc_max.cl
 shared/clc_min.cl
+shared/clc_vload.cl
+shared/clc_vstore.cl
diff --git a/libclc/clc/lib/generic/shared/clc_vload.cl b/libclc/clc/lib/generic/shared/clc_vload.cl
new file mode 100644
index 0000000000000..e4003e4a96736
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_vload.cl
@@ -0,0 +1,130 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+#include <clc/shared/clc_vload.h>
+
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 __clc_vload2( \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return *( \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##2 *)(&x[2 * offset])); \
+  } \
+ \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 __clc_vload3( \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    PRIM_TYPE##2 vec = \
+        *((const ADDR_SPACE less_aligned_##PRIM_TYPE##2 *)(&x[3 * offset])); \
+    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]); \
+  } \
+ \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 __clc_vload4( \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return *( \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##4 *)(&x[4 * offset])); \
+  } \
+ \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 __clc_vload8( \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return *( \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##8 *)(&x[8 * offset])); \
+  } \
+ \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 __clc_vload16( \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return *( \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##16 *)(&x[16 * offset])); \
+  }
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+#define VLOAD_VECTORIZE_GENERIC VLOAD_VECTORIZE
+#else
+// The generic address space isn't available, so make the macro do nothing
+#define VLOAD_VECTORIZE_GENERIC(X, Y)
+#endif
+
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
+  VLOAD_VECTORIZE_GENERIC(__CLC_SCALAR_GENTYPE, __generic)
+
+#define VLOAD_TYPES() \
+  VLOAD_ADDR_SPACES(char) \
+  VLOAD_ADDR_SPACES(uchar) \
+  VLOAD_ADDR_SPACES(short) \
+  VLOAD_ADDR_SPACES(ushort) \
+  VLOAD_ADDR_SPACES(int) \
+  VLOAD_ADDR_SPACES(uint) \
+  VLOAD_ADDR_SPACES(long) \
+  VLOAD_ADDR_SPACES(ulong) \
+  VLOAD_ADDR_SPACES(float)
+
+VLOAD_TYPES()
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+VLOAD_ADDR_SPACES(double)
+#endif
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+VLOAD_ADDR_SPACES(half)
+#endif
+
+/* vload_half are legal even without cl_khr_fp16 */
+/* no vload_half for double */
+#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
+#define VEC_LOAD2(val, AS) \
+  VEC_LOAD1(val.lo, AS) \
+  VEC_LOAD1(val.hi, AS)
+#define VEC_LOAD3(val, AS) \
+  VEC_LOAD1(val.s0, AS) \
+  VEC_LOAD1(val.s1, AS) \
+  VEC_LOAD1(val.s2, AS)
+#define VEC_LOAD4(val, AS) \
+  VEC_LOAD2(val.lo, AS) \
+  VEC_LOAD2(val.hi, AS)
+#define VEC_LOAD8(val, AS) \
+  VEC_LOAD4(val.lo, AS) \
+  VEC_LOAD4(val.hi, AS)
+#define VEC_LOAD16(val, AS) \
+  VEC_LOAD8(val.lo, AS) \
+  VEC_LOAD8(val.hi, AS)
+
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
+  _CLC_OVERLOAD _CLC_DEF TYPE __clc_vload_half##SUFFIX(size_t offset, \
+                                                       const AS half *mem) { \
+    offset *= VEC_SIZE; \
+    TYPE __tmp; \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
+  } \
+  _CLC_OVERLOAD _CLC_DEF TYPE __clc_vloada_half##SUFFIX(size_t offset, \
+                                                        const AS half *mem) { \
+    offset *= OFFSET_SIZE; \
+    TYPE __tmp; \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
+  }
+
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
+
+#define __CLC_BODY "clc_vload_half.inc"
"clc_vload_half.inc" +#include <clc/math/gentype.inc> +#undef FUNC +#undef __FUNC +#undef VEC_LOAD16 +#undef VEC_LOAD8 +#undef VEC_LOAD4 +#undef VEC_LOAD3 +#undef VEC_LOAD2 +#undef VEC_LOAD1 +#undef VLOAD_TYPES +#undef VLOAD_ADDR_SPACES +#undef VLOAD_VECTORIZE +#undef VLOAD_VECTORIZE_GENERIC diff --git a/libclc/opencl/lib/generic/shared/vload_half.inc b/libclc/clc/lib/generic/shared/clc_vload_half.inc similarity index 100% rename from libclc/opencl/lib/generic/shared/vload_half.inc rename to libclc/clc/lib/generic/shared/clc_vload_half.inc diff --git a/libclc/clc/lib/generic/shared/clc_vstore.cl b/libclc/clc/lib/generic/shared/clc_vstore.cl new file mode 100644 index 0000000000000..adde58aec9153 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_vstore.cl @@ -0,0 +1,268 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/float/definitions.h> +#include <clc/internal/clc.h> +#include <clc/math/clc_copysign.h> +#include <clc/math/clc_fabs.h> +#include <clc/math/clc_nextafter.h> +#include <clc/relational/clc_isinf.h> +#include <clc/relational/clc_isnan.h> +#include <clc/shared/clc_min.h> + +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable + +#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF void __clc_vstore2(PRIM_TYPE##2 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + *)(&mem[2 * offset])) = vec; \ + } \ + \ + _CLC_OVERLOAD _CLC_DEF void __clc_vstore3(PRIM_TYPE##3 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \ + mem[3 * offset + 2] = vec.s2; \ + } \ + \ + typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF void __clc_vstore4(PRIM_TYPE##4 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \ + *)(&mem[4 * offset])) = vec; \ + } \ + \ + typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF void __clc_vstore8(PRIM_TYPE##8 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \ + *)(&mem[8 * offset])) = vec; \ + } \ + \ + typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF void __clc_vstore16(PRIM_TYPE##16 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \ + *)(&mem[16 * offset])) = vec; \ + } + +#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED +#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE +#else +// The generic address space isn't available, so make the macro do nothing +#define VSTORE_VECTORIZE_GENERIC(X, Y) +#endif + +#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ + 
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
+  VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic)
+
+VSTORE_ADDR_SPACES(char)
+VSTORE_ADDR_SPACES(uchar)
+VSTORE_ADDR_SPACES(short)
+VSTORE_ADDR_SPACES(ushort)
+VSTORE_ADDR_SPACES(int)
+VSTORE_ADDR_SPACES(uint)
+VSTORE_ADDR_SPACES(long)
+VSTORE_ADDR_SPACES(ulong)
+VSTORE_ADDR_SPACES(float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+VSTORE_ADDR_SPACES(double)
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+VSTORE_ADDR_SPACES(half)
+#endif
+
+#define VEC_STORE1(val, ROUNDF, BUILTIN) BUILTIN(ROUNDF(val), &mem[offset++]);
+
+#define VEC_STORE2(val, ROUNDF, BUILTIN) \
+  VEC_STORE1(val.lo, ROUNDF, BUILTIN) \
+  VEC_STORE1(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE3(val, ROUNDF, BUILTIN) \
+  VEC_STORE1(val.s0, ROUNDF, BUILTIN) \
+  VEC_STORE1(val.s1, ROUNDF, BUILTIN) \
+  VEC_STORE1(val.s2, ROUNDF, BUILTIN)
+#define VEC_STORE4(val, ROUNDF, BUILTIN) \
+  VEC_STORE2(val.lo, ROUNDF, BUILTIN) \
+  VEC_STORE2(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE8(val, ROUNDF, BUILTIN) \
+  VEC_STORE4(val.lo, ROUNDF, BUILTIN) \
+  VEC_STORE4(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE16(val, ROUNDF, BUILTIN) \
+  VEC_STORE8(val.lo, ROUNDF, BUILTIN) \
+  VEC_STORE8(val.hi, ROUNDF, BUILTIN)
+
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, ROUNDF, BUILTIN) \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore_half##SUFFIX( \
+      TYPE vec, size_t offset, AS half *mem) { \
+    offset *= VEC_SIZE; \
+    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN) \
+  } \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstorea_half##SUFFIX( \
+      TYPE vec, size_t offset, AS half *mem) { \
+    offset *= OFFSET; \
+    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN) \
+  }
+
+_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
+_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
+  /* Remove lower 13 bits to make sure the number is rounded down */
+  int mask = 0xffffe000;
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask <<= __clc_min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (__clc_fabs(x) > 65504.0f && !__clc_isinf(x))
+    return __clc_copysign(65504.0f, x);
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  return __clc_as_float(__clc_as_uint(x) & mask);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
+  const float inf = __clc_copysign(INFINITY, x);
+  /* Set lower 13 bits */
+  int mask = (1 << 13) - 1;
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask = (1 << (13 + __clc_min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  const float next =
+      __clc_nextafter(__clc_as_float(__clc_as_uint(x) | mask), inf);
+  return ((__clc_as_uint(x) & mask) == 0) ? x : next;
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
+  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
+  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
+  /* Mantisa + implicit bit */
+  const uint mantissa = (__clc_as_uint(x) & 0x7fffff) | (1u << 23);
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  int shift = 13;
+  if (exp < -14) {
+    /* The default assumes lower 13 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += __clc_min(-(exp + 14), 15);
+  }
+  int mask = (1 << shift) - 1;
+  const uint grs = mantissa & mask;
+  const uint last = mantissa & (1 << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1 << (shift - 1))) ||
+                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
+}
+
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
+_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
+  /* Remove lower 42 bits to make sure the number is rounded down */
+  ulong mask = 0xfffffc0000000000UL;
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask <<= __clc_min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (__clc_fabs(x) > 65504.0 && !__clc_isinf(x))
+    return __clc_copysign(65504.0, x);
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  return __clc_as_double(__clc_as_ulong(x) & mask);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
+  const double inf = __clc_copysign((double)INFINITY, x);
+  /* Set lower 42 bits */
+  long mask = (1UL << 42UL) - 1UL;
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask = (1UL << (42UL + __clc_min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  const double next =
+      __clc_nextafter(__clc_as_double(__clc_as_ulong(x) | mask), inf);
+  return ((__clc_as_ulong(x) & mask) == 0) ? x : next;
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
+  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
+                                                           : __clc_rti(x);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
+  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
+                                                           : __clc_rtz(x);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
+  /* Mantisa + implicit bit */
+  const ulong mantissa = (__clc_as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  int shift = 42;
+  if (exp < -14) {
+    /* The default assumes lower 13 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += __clc_min(-(exp + 14), 15);
+  }
+  ulong mask = (1UL << shift) - 1UL;
+  const ulong grs = mantissa & mask;
+  const ulong last = mantissa & (1UL << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1UL << (shift - 1UL))) ||
+                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
+}
+#endif
+
+#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, __clc_noop, BUILTIN) \
+  __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtz, BUILTIN) \
+  __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtn, BUILTIN) \
+  __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtp, BUILTIN) \
+  __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, AS, __clc_rte, BUILTIN)
+
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) \
+  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)
+
+#define __CLC_BODY "clc_vstore_half.inc"
+#include <clc/math/gentype.inc>
+#undef FUNC
+#undef __XFUNC
+#undef __FUNC
+#undef VEC_LOAD16
+#undef VEC_LOAD8
+#undef VEC_LOAD4
+#undef VEC_LOAD3
+#undef VEC_LOAD2
+#undef VEC_LOAD1
+#undef DECLARE_HELPER
+#undef VSTORE_ADDR_SPACES
+#undef VSTORE_VECTORIZE
+#undef VSTORE_VECTORIZE_GENERIC
diff --git a/libclc/opencl/lib/generic/shared/vstore_half.inc b/libclc/clc/lib/generic/shared/clc_vstore_half.inc
similarity index 100%
rename from libclc/opencl/lib/generic/shared/vstore_half.inc
rename to libclc/clc/lib/generic/shared/clc_vstore_half.inc
diff --git a/libclc/opencl/lib/generic/shared/vload.cl b/libclc/opencl/lib/generic/shared/vload.cl
index 4bfb5a012ce1a..ad22839580132 100644
--- a/libclc/opencl/lib/generic/shared/vload.cl
+++ b/libclc/opencl/lib/generic/shared/vload.cl
@@ -7,134 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/opencl/clc.h>
+#include <clc/shared/clc_vload.h>
 
-#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-                  *)(&x[2 * offset])); \
-  } \
- \
-  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    PRIM_TYPE##2 vec = \
-        *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-               *)(&x[3 * offset])); \
-    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]); \
-  } \
- \
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
-                  *)(&x[4 * offset])); \
-  } \
- \
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
-                  *)(&x[8 * offset])); \
-  } \
- \
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16( \
-      size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
-                  *)(&x[16 * offset])); \
-  }
+#define __CLC_BODY "vload.inc"
+#include <clc/integer/gentype.inc>
 
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define VLOAD_VECTORIZE_GENERIC VLOAD_VECTORIZE
-#else
-// The generic address space isn't available, so make the macro do nothing
-#define VLOAD_VECTORIZE_GENERIC(X, Y)
-#endif
-
-#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
-  VLOAD_VECTORIZE_GENERIC(__CLC_SCALAR_GENTYPE, __generic)
-
-#define VLOAD_TYPES() \
-  VLOAD_ADDR_SPACES(char) \
-  VLOAD_ADDR_SPACES(uchar) \
-  VLOAD_ADDR_SPACES(short) \
-  VLOAD_ADDR_SPACES(ushort) \
-  VLOAD_ADDR_SPACES(int) \
-  VLOAD_ADDR_SPACES(uint) \
-  VLOAD_ADDR_SPACES(long) \
-  VLOAD_ADDR_SPACES(ulong) \
-  VLOAD_ADDR_SPACES(float)
-
-VLOAD_TYPES()
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VLOAD_ADDR_SPACES(double)
-#endif
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VLOAD_ADDR_SPACES(half)
-#endif
-
-/* vload_half are legal even without cl_khr_fp16 */
-/* no vload_half for double */
-#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
-#define VEC_LOAD2(val, AS) \
-  VEC_LOAD1(val.lo, AS) \
-  VEC_LOAD1(val.hi, AS)
-#define VEC_LOAD3(val, AS) \
-  VEC_LOAD1(val.s0, AS) \
-  VEC_LOAD1(val.s1, AS) \
-  VEC_LOAD1(val.s2, AS)
-#define VEC_LOAD4(val, AS) \
-  VEC_LOAD2(val.lo, AS) \
-  VEC_LOAD2(val.hi, AS)
-#define VEC_LOAD8(val, AS) \
-  VEC_LOAD4(val.lo, AS) \
-  VEC_LOAD4(val.hi, AS)
-#define VEC_LOAD16(val, AS) \
-  VEC_LOAD8(val.lo, AS) \
-  VEC_LOAD8(val.hi, AS)
-
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
-  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, \
-                                                 const AS half *mem) { \
-    offset *= VEC_SIZE; \
-    TYPE __tmp; \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
-  } \
-  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, \
-                                                  const AS half *mem) { \
-    offset *= OFFSET_SIZE; \
-    TYPE __tmp; \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
-  }
-
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
-
-#define __CLC_BODY "vload_half.inc"
+#define __CLC_BODY "vload.inc"
 #include <clc/math/gentype.inc>
-#undef FUNC
-#undef __FUNC
-#undef VEC_LOAD16
-#undef VEC_LOAD8
-#undef VEC_LOAD4
-#undef VEC_LOAD3
-#undef VEC_LOAD2
-#undef VEC_LOAD1
-#undef VLOAD_TYPES
-#undef VLOAD_ADDR_SPACES
-#undef VLOAD_VECTORIZE
-#undef VLOAD_VECTORIZE_GENERIC
diff --git a/libclc/opencl/lib/generic/shared/vload.inc b/libclc/opencl/lib/generic/shared/vload.inc
new file mode 100644
index 0000000000000..62cb040aad180
--- /dev/null
+++ b/libclc/opencl/lib/generic/shared/vload.inc
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VLOAD_NAME(x) __CLC_XCONCAT(__CLC_XCONCAT(x, vload), __CLC_VECSIZE)
+#define CLC_VLOAD_HALF_NAME(x) \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vload_half), __CLC_VECSIZE)
+#define CLC_VLOADA_HALF_NAME(x) \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vloada_half), __CLC_VECSIZE)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+
+#define VLOAD_DEF(ADDRSPACE) \
+  _CLC_OVERLOAD _CLC_DEF CLC_VLOAD_TY CLC_VLOAD_NAME()( \
+      size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x) { \
+    return CLC_VLOAD_NAME(__clc_)(offset, x); \
+  }
+
+VLOAD_DEF(__private)
+VLOAD_DEF(__local)
+VLOAD_DEF(__constant)
+VLOAD_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+VLOAD_DEF(__generic)
+#endif
+
+#undef VLOAD_DEF
+#undef CLC_VLOAD_TY
+
+#endif
+
+// vload_half and vloada_half are available even if cl_khr_fp16 is unavailable.
+// Declare these functions when working on float types, which we know are
+// always available.
+#ifdef __CLC_FPSIZE
+#if __CLC_FPSIZE == 32
+
+#define VLOAD_HALF_DEF(ADDRSPACE) \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE CLC_VLOAD_HALF_NAME()( \
+      size_t offset, const ADDRSPACE half *mem) { \
+    return CLC_VLOAD_HALF_NAME(__clc_)(offset, mem); \
+  } \
+ \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE CLC_VLOADA_HALF_NAME()( \
+      size_t offset, const ADDRSPACE half *mem) { \
+    return CLC_VLOADA_HALF_NAME(__clc_)(offset, mem); \
+  }
+
+VLOAD_HALF_DEF(__private)
+VLOAD_HALF_DEF(__local)
+VLOAD_HALF_DEF(__constant)
+VLOAD_HALF_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+VLOAD_HALF_DEF(__generic)
+#endif
+
+#undef VLOAD_HALF_DEF
+#endif
+#endif
+
+#undef CLC_VLOAD_NAME
+#undef CLC_VLOAD_HALF_NAME
+#undef CLC_VLOADA_HALF_NAME
diff --git a/libclc/opencl/lib/generic/shared/vstore.cl b/libclc/opencl/lib/generic/shared/vstore.cl
index fe4890defe846..145658f873dc5 100644
--- a/libclc/opencl/lib/generic/shared/vstore.cl
+++ b/libclc/opencl/lib/generic/shared/vstore.cl
@@ -7,253 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/opencl/clc.h>
+#include <clc/shared/clc_vstore.h>
 
-#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#define __CLC_BODY "vstore.inc"
+#include <clc/integer/gentype.inc>
 
-#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, \
-                                      ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-           *)(&mem[2 * offset])) = vec; \
-  } \
- \
-  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, \
-                                      ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-           *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \
-    mem[3 * offset + 2] = vec.s2; \
-  } \
- \
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, \
-                                      ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
-           *)(&mem[4 * offset])) = vec; \
-  } \
- \
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, \
-                                      ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
-           *)(&mem[8 * offset])) = vec; \
-  } \
- \
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, \
-                                       ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
-           *)(&mem[16 * offset])) = vec; \
-  }
-
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE
-#else
-// The generic address space isn't available, so make the macro do nothing
-#define VSTORE_VECTORIZE_GENERIC(X, Y)
-#endif
-
-#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
-  VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic)
-
-VSTORE_ADDR_SPACES(char)
-VSTORE_ADDR_SPACES(uchar)
-VSTORE_ADDR_SPACES(short)
-VSTORE_ADDR_SPACES(ushort)
-VSTORE_ADDR_SPACES(int)
-VSTORE_ADDR_SPACES(uint)
-VSTORE_ADDR_SPACES(long)
-VSTORE_ADDR_SPACES(ulong)
-VSTORE_ADDR_SPACES(float)
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VSTORE_ADDR_SPACES(double)
-#endif
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VSTORE_ADDR_SPACES(half)
-#endif
-
-#define VEC_STORE1(val, ROUNDF, BUILTIN) BUILTIN(ROUNDF(val), &mem[offset++]);
-
-#define VEC_STORE2(val, ROUNDF, BUILTIN) \
-  VEC_STORE1(val.lo, ROUNDF, BUILTIN) \
-  VEC_STORE1(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE3(val, ROUNDF, BUILTIN) \
-  VEC_STORE1(val.s0, ROUNDF, BUILTIN) \
-  VEC_STORE1(val.s1, ROUNDF, BUILTIN) \
-  VEC_STORE1(val.s2, ROUNDF, BUILTIN)
-#define VEC_STORE4(val, ROUNDF, BUILTIN) \
-  VEC_STORE2(val.lo, ROUNDF, BUILTIN) \
-  VEC_STORE2(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE8(val, ROUNDF, BUILTIN) \
-  VEC_STORE4(val.lo, ROUNDF, BUILTIN) \
-  VEC_STORE4(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE16(val, ROUNDF, BUILTIN) \
-  VEC_STORE8(val.lo, ROUNDF, BUILTIN) \
-  VEC_STORE8(val.hi, ROUNDF, BUILTIN)
-
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, ROUNDF, BUILTIN) \
-  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, \
-                                                  AS half *mem) { \
-    offset *= VEC_SIZE; \
-    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN) \
-  } \
-  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, \
-                                                   AS half *mem) { \
-    offset *= OFFSET; \
-    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN) \
-  }
-
-_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
-  /* Remove lower 13 bits to make sure the number is rounded down */
-  int mask = 0xffffe000;
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (fabs(x) > 65504.0f && !isinf(x))
-    return copysign(65504.0f, x);
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  return as_float(as_uint(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
-  const float inf = copysign(INFINITY, x);
-  /* Set lower 13 bits */
-  int mask = (1 << 13) - 1;
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  const float next = nextafter(as_float(as_uint(x) | mask), inf);
-  return ((as_uint(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
-  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
-  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
-  /* Mantisa + implicit bit */
-  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  int shift = 13;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += min(-(exp + 14), 15);
-  }
-  int mask = (1 << shift) - 1;
-  const uint grs = mantissa & mask;
-  const uint last = mantissa & (1 << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1 << (shift - 1))) ||
-                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-
-#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
-  /* Remove lower 42 bits to make sure the number is rounded down */
-  ulong mask = 0xfffffc0000000000UL;
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (fabs(x) > 65504.0 && !isinf(x))
-    return copysign(65504.0, x);
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  return as_double(as_ulong(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
-  const double inf = copysign((double)INFINITY, x);
-  /* Set lower 42 bits */
-  long mask = (1UL << 42UL) - 1UL;
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  const double next = nextafter(as_double(as_ulong(x) | mask), inf);
-  return ((as_ulong(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
-  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
-                                                     : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
-  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
-                                                     : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
-  /* Mantisa + implicit bit */
-  const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  int shift = 42;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += min(-(exp + 14), 15);
-  }
-  ulong mask = (1UL << shift) - 1UL;
-  const ulong grs = mantissa & mask;
-  const ulong last = mantissa & (1UL << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1UL << (shift - 1UL))) ||
-                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-#endif
-
-#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, __clc_noop, BUILTIN) \
-  __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtz, BUILTIN) \
-  __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtn, BUILTIN) \
-  __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtp, BUILTIN) \
-  __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, AS, __clc_rte, BUILTIN)
-
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) \
-  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)
-
-#define __CLC_BODY "vstore_half.inc"
+#define __CLC_BODY "vstore.inc"
 #include <clc/math/gentype.inc>
-#undef FUNC
-#undef __XFUNC
-#undef __FUNC
-#undef VEC_LOAD16
-#undef VEC_LOAD8
-#undef VEC_LOAD4
-#undef VEC_LOAD3
-#undef VEC_LOAD2
-#undef VEC_LOAD1
-#undef DECLARE_HELPER
-#undef VSTORE_ADDR_SPACES
-#undef VSTORE_VECTORIZE
-#undef VSTORE_VECTORIZE_GENERIC
diff --git a/libclc/opencl/lib/generic/shared/vstore.inc b/libclc/opencl/lib/generic/shared/vstore.inc
new file mode 100644
index 0000000000000..4bdce0719912d
--- /dev/null
+++ b/libclc/opencl/lib/generic/shared/vstore.inc
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+#define CLC_VSTORE_NAME(x) \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vstore), __CLC_VECSIZE)
+#define CLC_VSTORE_HALF_NAME(x, y) \
+  __CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(x, vstore_half), __CLC_VECSIZE), y)
+#define CLC_VSTOREA_HALF_NAME(x, y) \
+  __CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(x, vstorea_half), __CLC_VECSIZE), y)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VSTORE_DEF(ADDRSPACE) \
+  _CLC_OVERLOAD _CLC_DEF void CLC_VSTORE_NAME()( \
+      CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p) { \
+    return CLC_VSTORE_NAME(__clc_)(data, offset, p); \
+  }
+
+CLC_VSTORE_DEF(__private)
+CLC_VSTORE_DEF(__local)
+CLC_VSTORE_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VSTORE_DEF(__generic)
+#endif
+
+#undef CLC_VSTORE_DEF
+
+#endif // __CLC_SCALAR
+
+// vstore_half and vstorea_half are available even if cl_khr_fp16 is
+// unavailable.
+#ifdef __CLC_FPSIZE
+#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
+
+#define CLC_VSTORE_HALF_DEF(ADDRSPACE, SUFFIX) \
+  _CLC_OVERLOAD _CLC_DEF void CLC_VSTORE_HALF_NAME(, SUFFIX)( \
+      CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \
+    CLC_VSTORE_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \
+  } \
+ \
+  _CLC_OVERLOAD _CLC_DEF void CLC_VSTOREA_HALF_NAME(, SUFFIX)( \
+      CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \
+    CLC_VSTOREA_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \
+  }
+
+#define CLC_VSTORE_HALF_DEF_ALL_MODES(ADDRSPACE) \
+  CLC_VSTORE_HALF_DEF(ADDRSPACE, ) \
+  CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtz) \
+  CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtn) \
+  CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtp) \
+  CLC_VSTORE_HALF_DEF(ADDRSPACE, _rte)
+
+CLC_VSTORE_HALF_DEF_ALL_MODES(__private)
+CLC_VSTORE_HALF_DEF_ALL_MODES(__local)
+CLC_VSTORE_HALF_DEF_ALL_MODES(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VSTORE_HALF_DEF_ALL_MODES(__generic)
+#endif
+
+#undef CLC_VSTORE_HALF_DEF
+#undef CLC_VSTORE_HALF_DEF_ALL_MODES
+
+#endif
+#endif
+
+#undef CLC_VSTORE_TY
+#undef CLC_VSTORE_NAME
+#undef CLC_VSTORE_HALF_NAME
+#undef CLC_VSTOREA_HALF_NAME
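For readers unfamiliar with the "less aligned" types referenced above: the
vloadN/vstoreN builtins must access memory that is only guaranteed to be
aligned to the scalar element type, while OpenCL vector types are naturally
aligned to their full size. A minimal sketch of the trick for float4 in the
global address space (the function name example_vload4 is illustrative, not
part of the patch):

  /* A float4 whose required alignment is lowered to that of float. */
  typedef float4 less_aligned_float4 __attribute__((aligned(sizeof(float))));

  float4 example_vload4(size_t offset, const __global float *x) {
    /* The cast is legal because less_aligned_float4 only requires 4-byte
     * (float) alignment, which is all the spec guarantees for the pointer
     * passed to vload4. A plain float4* dereference here could assume
     * 16-byte alignment and miscompile. */
    return *(const __global less_aligned_float4 *)(&x[4 * offset]);
  }

The patch's clc_less_aligned_types.inc stamps out such typedefs for every
gentype via __CLC_XCONCAT.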
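Likewise, the OpenCL wrappers generated by vload.inc/vstore.inc are thin
forwarding shims over the CLC entry points. Roughly, the expansion for
vstore4 of float in the global address space comes out to the following
sketch (the generated code actually takes the less_aligned_float4 parameter
type and is stamped out per gentype, vector size, and address space via
__CLC_XCONCAT):

  _CLC_OVERLOAD _CLC_DEF void vstore4(float4 data, size_t offset,
                                      __global float *p) {
    /* Forward to the internal CLC implementation moved by this patch. */
    __clc_vstore4(data, offset, p);
  }

This is what keeps the user-facing OpenCL layer trivial while the real
logic, including the now-internalized rounding helpers, lives in the CLC
library.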