Author: Fraser Cormack Date: 2025-04-24T11:42:18+01:00 New Revision: 2edade28245b1fc2b7cb0b39804894f8fdcfb7ff
URL: https://github.com/llvm/llvm-project/commit/2edade28245b1fc2b7cb0b39804894f8fdcfb7ff DIFF: https://github.com/llvm/llvm-project/commit/2edade28245b1fc2b7cb0b39804894f8fdcfb7ff.diff LOG: [libclc][NFC] Clang-format vload/vstore code Added: Modified: libclc/generic/include/clc/shared/vload.h libclc/generic/include/clc/shared/vstore.h libclc/generic/lib/shared/vload.cl libclc/generic/lib/shared/vload_half.inc libclc/generic/lib/shared/vstore.cl libclc/generic/lib/shared/vstore_half.inc Removed: ################################################################################ diff --git a/libclc/generic/include/clc/shared/vload.h b/libclc/generic/include/clc/shared/vload.h index b2db5551d0903..a343d652933fd 100644 --- a/libclc/generic/include/clc/shared/vload.h +++ b/libclc/generic/include/clc/shared/vload.h @@ -6,23 +6,24 @@ // //===----------------------------------------------------------------------===// -#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \ - _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const ADDR_SPACE MEM_TYPE *x); +#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \ + _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH( \ + size_t offset, const ADDR_SPACE MEM_TYPE *x); -#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \ - _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \ - _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \ - _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \ - _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \ +#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \ + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \ + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \ + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \ + _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \ _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE) -#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \ - _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \ - _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \ - _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \ +#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \ + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \ + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \ + _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \ _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) -#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \ +#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \ _CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE) // Declare vector load prototypes @@ -40,12 +41,12 @@ _CLC_VECTOR_VLOAD_PRIM3(_half, half, float) _CLC_VECTOR_VLOAD_PRIM3(a_half, half, float) #ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64: enable - _CLC_VECTOR_VLOAD_PRIM1(double) +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +_CLC_VECTOR_VLOAD_PRIM1(double) #endif #ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16: enable - _CLC_VECTOR_VLOAD_PRIM1(half) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +_CLC_VECTOR_VLOAD_PRIM1(half) #endif // Scalar vload_half also needs to be declared diff --git a/libclc/generic/include/clc/shared/vstore.h b/libclc/generic/include/clc/shared/vstore.h index 93687e030eb41..6e98f0368c5c1 100644 --- a/libclc/generic/include/clc/shared/vstore.h +++ 
b/libclc/generic/include/clc/shared/vstore.h @@ -6,33 +6,34 @@ // //===----------------------------------------------------------------------===// -#define _CLC_VSTORE_DECL(SUFFIX, PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE, RND) \ - _CLC_OVERLOAD _CLC_DECL void vstore##SUFFIX##WIDTH##RND(VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out); +#define _CLC_VSTORE_DECL(SUFFIX, PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE, RND) \ + _CLC_OVERLOAD _CLC_DECL void vstore##SUFFIX##WIDTH##RND( \ + VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out); -#define _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE, RND) \ - _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE, RND) \ - _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE, RND) \ - _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE, RND) \ - _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE, RND) \ +#define _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE, RND) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE, RND) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE, RND) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE, RND) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE, RND) \ _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE, RND) -#define _CLC_VECTOR_VSTORE_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE, RND) \ - _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private, RND) \ - _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local, RND) \ +#define _CLC_VECTOR_VSTORE_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE, RND) \ + _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private, RND) \ + _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local, RND) \ _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global, RND) -#define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE) \ - _CLC_VECTOR_VSTORE_PRIM3(,PRIM_TYPE, PRIM_TYPE, ) +#define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE) \ + _CLC_VECTOR_VSTORE_PRIM3(, PRIM_TYPE, PRIM_TYPE, ) -#define _CLC_VECTOR_VSTORE_HALF_PRIM1(PRIM_TYPE, RND) \ - _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __private, RND) \ - _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __local, RND) \ - _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __global, RND) \ - _CLC_VECTOR_VSTORE_PRIM3(_half, half, PRIM_TYPE, RND) \ - _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __private, RND) \ - _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __local, RND) \ - _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __global, RND) \ - _CLC_VECTOR_VSTORE_PRIM3(a_half, half, PRIM_TYPE, RND) +#define _CLC_VECTOR_VSTORE_HALF_PRIM1(PRIM_TYPE, RND) \ + _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __private, RND) \ + _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __local, RND) \ + _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __global, RND) \ + _CLC_VECTOR_VSTORE_PRIM3(_half, half, PRIM_TYPE, RND) \ + _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __private, RND) \ + _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __local, RND) \ + _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __global, RND) \ + _CLC_VECTOR_VSTORE_PRIM3(a_half, half, PRIM_TYPE, RND) _CLC_VECTOR_VSTORE_PRIM1(char) _CLC_VECTOR_VSTORE_PRIM1(uchar) @@ -44,26 +45,25 @@ _CLC_VECTOR_VSTORE_PRIM1(long) _CLC_VECTOR_VSTORE_PRIM1(ulong) _CLC_VECTOR_VSTORE_PRIM1(float) -_CLC_VECTOR_VSTORE_HALF_PRIM1(float,) +_CLC_VECTOR_VSTORE_HALF_PRIM1(float, ) _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtz) _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtn) _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtp) 
_CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rte) #ifdef cl_khr_fp64 - _CLC_VECTOR_VSTORE_PRIM1(double) - _CLC_VECTOR_VSTORE_HALF_PRIM1(double,) - _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtz) - _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtn) - _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtp) - _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rte) +_CLC_VECTOR_VSTORE_PRIM1(double) +_CLC_VECTOR_VSTORE_HALF_PRIM1(double, ) +_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtz) +_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtn) +_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtp) +_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rte) #endif #ifdef cl_khr_fp16 - _CLC_VECTOR_VSTORE_PRIM1(half) +_CLC_VECTOR_VSTORE_PRIM1(half) #endif - #undef _CLC_VSTORE_DECL #undef _CLC_VECTOR_VSTORE_DECL #undef _CLC_VECTOR_VSTORE_PRIM3 diff --git a/libclc/generic/lib/shared/vload.cl b/libclc/generic/lib/shared/vload.cl index dcbae4f20929f..a0306c500d5cd 100644 --- a/libclc/generic/lib/shared/vload.cl +++ b/libclc/generic/lib/shared/vload.cl @@ -8,59 +8,75 @@ #include <clc/clc.h> -#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ - typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[2*offset])); \ - } \ -\ - typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - PRIM_TYPE##2 vec = *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[3*offset])); \ - return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset*3+2]); \ - } \ -\ - typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&x[4*offset])); \ - } \ -\ - typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&x[8*offset])); \ - } \ -\ - typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&x[16*offset])); \ - } \ +#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, \ + const ADDR_SPACE PRIM_TYPE *x) { \ + return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + *)(&x[2 * offset])); \ + } \ + \ + typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, \ + const ADDR_SPACE PRIM_TYPE *x) { \ + PRIM_TYPE##2 vec = \ + *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + *)(&x[3 * offset])); \ + return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]); \ + } \ + \ + typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + 
_CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, \ + const ADDR_SPACE PRIM_TYPE *x) { \ + return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \ + *)(&x[4 * offset])); \ + } \ + \ + typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, \ + const ADDR_SPACE PRIM_TYPE *x) { \ + return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \ + *)(&x[8 * offset])); \ + } \ + \ + typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16( \ + size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \ + *)(&x[16 * offset])); \ + } -#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \ - VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \ - VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \ - VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \ - VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \ +#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) -#define VLOAD_TYPES() \ - VLOAD_ADDR_SPACES(char) \ - VLOAD_ADDR_SPACES(uchar) \ - VLOAD_ADDR_SPACES(short) \ - VLOAD_ADDR_SPACES(ushort) \ - VLOAD_ADDR_SPACES(int) \ - VLOAD_ADDR_SPACES(uint) \ - VLOAD_ADDR_SPACES(long) \ - VLOAD_ADDR_SPACES(ulong) \ - VLOAD_ADDR_SPACES(float) \ +#define VLOAD_TYPES() \ + VLOAD_ADDR_SPACES(char) \ + VLOAD_ADDR_SPACES(uchar) \ + VLOAD_ADDR_SPACES(short) \ + VLOAD_ADDR_SPACES(ushort) \ + VLOAD_ADDR_SPACES(int) \ + VLOAD_ADDR_SPACES(uint) \ + VLOAD_ADDR_SPACES(long) \ + VLOAD_ADDR_SPACES(ulong) \ + VLOAD_ADDR_SPACES(float) VLOAD_TYPES() #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable - VLOAD_ADDR_SPACES(double) +VLOAD_ADDR_SPACES(double) #endif #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable - VLOAD_ADDR_SPACES(half) +VLOAD_ADDR_SPACES(half) #endif /* vload_half are legal even without cl_khr_fp16 */ @@ -71,43 +87,45 @@ float __clc_vload_half_float_helper__global(const __global half *); float __clc_vload_half_float_helper__local(const __local half *); float __clc_vload_half_float_helper__private(const __private half *); -#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS (&mem[offset++]); +#define VEC_LOAD1(val, AS) \ + val = __clc_vload_half_float_helper##AS(&mem[offset++]); #else #define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]); #endif -#define VEC_LOAD2(val, AS) \ - VEC_LOAD1(val.lo, AS) \ - VEC_LOAD1(val.hi, AS) -#define VEC_LOAD3(val, AS) \ - VEC_LOAD1(val.s0, AS) \ - VEC_LOAD1(val.s1, AS) \ - VEC_LOAD1(val.s2, AS) -#define VEC_LOAD4(val, AS) \ - VEC_LOAD2(val.lo, AS) \ - VEC_LOAD2(val.hi, AS) -#define VEC_LOAD8(val, AS) \ - VEC_LOAD4(val.lo, AS) \ - VEC_LOAD4(val.hi, AS) -#define VEC_LOAD16(val, AS) \ - VEC_LOAD8(val.lo, AS) \ - VEC_LOAD8(val.hi, AS) +#define VEC_LOAD2(val, AS) \ + VEC_LOAD1(val.lo, AS) \ + VEC_LOAD1(val.hi, AS) +#define VEC_LOAD3(val, AS) \ + VEC_LOAD1(val.s0, AS) \ + VEC_LOAD1(val.s1, AS) \ + VEC_LOAD1(val.s2, AS) +#define VEC_LOAD4(val, AS) \ + VEC_LOAD2(val.lo, AS) \ + VEC_LOAD2(val.hi, AS) +#define VEC_LOAD8(val, AS) \ + VEC_LOAD4(val.lo, AS) \ + VEC_LOAD4(val.hi, AS) +#define VEC_LOAD16(val, AS) \ + VEC_LOAD8(val.lo, AS) \ + 
VEC_LOAD8(val.hi, AS) -#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \ - _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \ - offset *= VEC_SIZE; \ - TYPE __tmp; \ - VEC_LOAD##VEC_SIZE(__tmp, AS) \ - return __tmp; \ - } \ - _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, const AS half *mem) { \ - offset *= OFFSET_SIZE; \ - TYPE __tmp; \ - VEC_LOAD##VEC_SIZE(__tmp, AS) \ - return __tmp; \ +#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \ + _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, \ + const AS half *mem) { \ + offset *= VEC_SIZE; \ + TYPE __tmp; \ + VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, \ + const AS half *mem) { \ + offset *= OFFSET_SIZE; \ + TYPE __tmp; \ + VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \ } -#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) +#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \ + __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) #define __CLC_BODY "vload_half.inc" #include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/shared/vload_half.inc b/libclc/generic/lib/shared/vload_half.inc index ff47969327bab..26716b9960018 100644 --- a/libclc/generic/lib/shared/vload_half.inc +++ b/libclc/generic/lib/shared/vload_half.inc @@ -11,21 +11,21 @@ #ifndef __CLC_SCALAR #if __CLC_VECSIZE == 3 -# define __CLC_OFFSET 4 +#define __CLC_OFFSET 4 #else -# define __CLC_OFFSET __CLC_VECSIZE +#define __CLC_OFFSET __CLC_VECSIZE #endif - FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private); - FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local); - FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global); - FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __constant); +FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private); +FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local); +FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global); +FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __constant); #undef __CLC_OFFSET #else - FUNC(, 1, 1, __CLC_GENTYPE, __private); - FUNC(, 1, 1, __CLC_GENTYPE, __local); - FUNC(, 1, 1, __CLC_GENTYPE, __global); - FUNC(, 1, 1, __CLC_GENTYPE, __constant); +FUNC(, 1, 1, __CLC_GENTYPE, __private); +FUNC(, 1, 1, __CLC_GENTYPE, __local); +FUNC(, 1, 1, __CLC_GENTYPE, __global); +FUNC(, 1, 1, __CLC_GENTYPE, __constant); #endif #endif diff --git a/libclc/generic/lib/shared/vstore.cl b/libclc/generic/lib/shared/vstore.cl index 0a105f5cd8c86..525f3d08bf0d8 100644 --- a/libclc/generic/lib/shared/vstore.cl +++ b/libclc/generic/lib/shared/vstore.cl @@ -10,36 +10,50 @@ #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable -#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ - typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[2*offset])) = vec; \ - } \ -\ - _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[3*offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \ - mem[3 * offset + 2] = vec.s2;\ - } \ -\ - typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 
__attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&mem[4*offset])) = vec; \ - } \ -\ - typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&mem[8*offset])) = vec; \ - } \ -\ - typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\ - _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&mem[16*offset])) = vec; \ - } \ +#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + *)(&mem[2 * offset])) = vec; \ + } \ + \ + _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ + *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \ + mem[3 * offset + 2] = vec.s2; \ + } \ + \ + typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \ + *)(&mem[4 * offset])) = vec; \ + } \ + \ + typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \ + *)(&mem[8 * offset])) = vec; \ + } \ + \ + typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \ + __attribute__((aligned(sizeof(PRIM_TYPE)))); \ + _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, \ + ADDR_SPACE PRIM_TYPE *mem) { \ + *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \ + *)(&mem[16 * offset])) = vec; \ + } -#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ - VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \ - VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ - VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \ +#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) VSTORE_ADDR_SPACES(char) VSTORE_ADDR_SPACES(uchar) @@ -51,26 +65,25 @@ VSTORE_ADDR_SPACES(long) VSTORE_ADDR_SPACES(ulong) VSTORE_ADDR_SPACES(float) - #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable - VSTORE_ADDR_SPACES(double) +VSTORE_ADDR_SPACES(double) #endif #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable - VSTORE_ADDR_SPACES(half) +VSTORE_ADDR_SPACES(half) #endif /* vstore_half are legal even without cl_khr_fp16 */ #if __clang_major__ < 6 -#define DECLARE_HELPER(STYPE, AS, builtin) void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *); +#define DECLARE_HELPER(STYPE, AS, 
builtin) \ + void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *); #else -#define DECLARE_HELPER(STYPE, AS, __builtin) \ -_CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) \ -{ \ - __builtin(s, d); \ -} +#define DECLARE_HELPER(STYPE, AS, __builtin) \ + _CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) { \ + __builtin(s, d); \ + } #endif DECLARE_HELPER(float, __private, __builtin_store_halff); @@ -83,176 +96,165 @@ DECLARE_HELPER(double, __global, __builtin_store_half); DECLARE_HELPER(double, __local, __builtin_store_half); #endif -#define VEC_STORE1(STYPE, AS, val, ROUNDF) __clc_vstore_half_##STYPE##_helper##AS (ROUNDF(val), &mem[offset++]); +#define VEC_STORE1(STYPE, AS, val, ROUNDF) \ + __clc_vstore_half_##STYPE##_helper##AS(ROUNDF(val), &mem[offset++]); -#define VEC_STORE2(STYPE, AS, val, ROUNDF) \ - VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \ - VEC_STORE1(STYPE, AS, val.hi, ROUNDF) -#define VEC_STORE3(STYPE, AS, val, ROUNDF) \ - VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \ - VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \ - VEC_STORE1(STYPE, AS, val.s2, ROUNDF) -#define VEC_STORE4(STYPE, AS, val, ROUNDF) \ - VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \ - VEC_STORE2(STYPE, AS, val.hi, ROUNDF) -#define VEC_STORE8(STYPE, AS, val, ROUNDF) \ - VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \ - VEC_STORE4(STYPE, AS, val.hi, ROUNDF) -#define VEC_STORE16(STYPE, AS, val, ROUNDF) \ - VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \ - VEC_STORE8(STYPE, AS, val.hi, ROUNDF) +#define VEC_STORE2(STYPE, AS, val, ROUNDF) \ + VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \ + VEC_STORE1(STYPE, AS, val.hi, ROUNDF) +#define VEC_STORE3(STYPE, AS, val, ROUNDF) \ + VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \ + VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \ + VEC_STORE1(STYPE, AS, val.s2, ROUNDF) +#define VEC_STORE4(STYPE, AS, val, ROUNDF) \ + VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \ + VEC_STORE2(STYPE, AS, val.hi, ROUNDF) +#define VEC_STORE8(STYPE, AS, val, ROUNDF) \ + VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \ + VEC_STORE4(STYPE, AS, val.hi, ROUNDF) +#define VEC_STORE16(STYPE, AS, val, ROUNDF) \ + VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \ + VEC_STORE8(STYPE, AS, val.hi, ROUNDF) -#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF) \ - _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \ - offset *= VEC_SIZE; \ - VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \ - } \ - _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \ - offset *= OFFSET; \ - VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \ +#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF) \ + _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, \ + AS half *mem) { \ + offset *= VEC_SIZE; \ + VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \ + } \ + _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, \ + AS half *mem) { \ + offset *= OFFSET; \ + VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \ } -_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) -{ - return x; +_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; } +_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) { + /* Remove lower 13 bits to make sure the number is rounded down */ + int mask = 0xffffe000; + const int exp = (as_uint(x) >> 23 & 0xff) - 127; + /* Denormals cannot be flushed, and they use different bit for rounding */ + if (exp < -14) + mask <<= min(-(exp + 14), 10); + /* RTZ does not produce Inf for large numbers */ + if (fabs(x) > 65504.0f && !isinf(x))
return copysign(65504.0f, x); + /* Handle nan corner case */ + if (isnan(x)) + return x; + return as_float(as_uint(x) & mask); } -_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) -{ - /* Remove lower 13 bits to make sure the number is rounded down */ - int mask = 0xffffe000; - const int exp = (as_uint(x) >> 23 & 0xff) - 127; - /* Denormals cannot be flushed, and they use different bit for rounding */ - if (exp < -14) - mask <<= min(-(exp + 14), 10); - /* RTZ does not produce Inf for large numbers */ - if (fabs(x) > 65504.0f && !isinf(x)) - return copysign(65504.0f, x); - /* Handle nan corner case */ - if (isnan(x)) - return x; - return as_float(as_uint(x) & mask); +_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) { + const float inf = copysign(INFINITY, x); + /* Set lower 13 bits */ + int mask = (1 << 13) - 1; + const int exp = (as_uint(x) >> 23 & 0xff) - 127; + /* Denormals cannot be flushed, and they use different bit for rounding */ + if (exp < -14) + mask = (1 << (13 + min(-(exp + 14), 10))) - 1; + /* Handle nan corner case */ + if (isnan(x)) + return x; + const float next = nextafter(as_float(as_uint(x) | mask), inf); + return ((as_uint(x) & mask) == 0) ? x : next; } -_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) -{ - const float inf = copysign(INFINITY, x); - /* Set lower 13 bits */ - int mask = (1 << 13) - 1; - const int exp = (as_uint(x) >> 23 & 0xff) - 127; - /* Denormals cannot be flushed, and they use different bit for rounding */ - if (exp < -14) - mask = (1 << (13 + min(-(exp + 14), 10))) - 1; - /* Handle nan corner case */ - if (isnan(x)) - return x; - const float next = nextafter(as_float(as_uint(x) | mask), inf); - return ((as_uint(x) & mask) == 0) ? x : next; +_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) { + return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x); } -_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) -{ - return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x); +_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) { + return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x); } -_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) -{ - return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x); -} -_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) -{ - /* Mantisa + implicit bit */ - const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23); - const int exp = (as_uint(x) >> 23 & 0xff) - 127; - int shift = 13; - if (exp < -14) { - /* The default assumes lower 13 bits are rounded, - * but it might be more for denormals. - * Shifting beyond last == 0b, and qr == 00b is not necessary */ - shift += min(-(exp + 14), 15); - } - int mask = (1 << shift) - 1; - const uint grs = mantissa & mask; - const uint last = mantissa & (1 << shift); - /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1. - * exp > 15 should round to inf. */ - bool roundup = (grs > (1 << (shift - 1))) || - (grs == (1 << (shift - 1)) && last != 0) || (exp > 15); - return roundup ? __clc_rti(x) : __clc_rtz(x); +_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) { + /* Mantisa + implicit bit */ + const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23); + const int exp = (as_uint(x) >> 23 & 0xff) - 127; + int shift = 13; + if (exp < -14) { + /* The default assumes lower 13 bits are rounded, + * but it might be more for denormals.
+ * Shifting beyond last == 0b, and qr == 00b is not necessary */ + shift += min(-(exp + 14), 15); + } + int mask = (1 << shift) - 1; + const uint grs = mantissa & mask; + const uint last = mantissa & (1 << shift); + /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1. + * exp > 15 should round to inf. */ + bool roundup = (grs > (1 << (shift - 1))) || + (grs == (1 << (shift - 1)) && last != 0) || (exp > 15); + return roundup ? __clc_rti(x) : __clc_rtz(x); } #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) -{ - return x; +_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; } +_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) { + /* Remove lower 42 bits to make sure the number is rounded down */ + ulong mask = 0xfffffc0000000000UL; + const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023; + /* Denormals cannot be flushed, and they use different bit for rounding */ + if (exp < -14) + mask <<= min(-(exp + 14), 10); + /* RTZ does not produce Inf for large numbers */ + if (fabs(x) > 65504.0 && !isinf(x)) + return copysign(65504.0, x); + /* Handle nan corner case */ + if (isnan(x)) + return x; + return as_double(as_ulong(x) & mask); } -_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) -{ - /* Remove lower 42 bits to make sure the number is rounded down */ - ulong mask = 0xfffffc0000000000UL; - const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023; - /* Denormals cannot be flushed, and they use different bit for rounding */ - if (exp < -14) - mask <<= min(-(exp + 14), 10); - /* RTZ does not produce Inf for large numbers */ - if (fabs(x) > 65504.0 && !isinf(x)) - return copysign(65504.0, x); - /* Handle nan corner case */ - if (isnan(x)) - return x; - return as_double(as_ulong(x) & mask); +_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) { + const double inf = copysign((double)INFINITY, x); + /* Set lower 42 bits */ + long mask = (1UL << 42UL) - 1UL; + const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023; + /* Denormals cannot be flushed, and they use different bit for rounding */ + if (exp < -14) + mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1; + /* Handle nan corner case */ + if (isnan(x)) + return x; + const double next = nextafter(as_double(as_ulong(x) | mask), inf); + return ((as_ulong(x) & mask) == 0) ? x : next; } -_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) -{ - const double inf = copysign((double)INFINITY, x); - /* Set lower 42 bits */ - long mask = (1UL << 42UL) - 1UL; - const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023; - /* Denormals cannot be flushed, and they use different bit for rounding */ - if (exp < -14) - mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1; - /* Handle nan corner case */ - if (isnan(x)) - return x; - const double next = nextafter(as_double(as_ulong(x) | mask), inf); - return ((as_ulong(x) & mask) == 0) ? x : next; +_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) { + return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x) + : __clc_rti(x); } -_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) -{ - return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x) : __clc_rti(x); +_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) { + return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x) + : __clc_rtz(x); } -_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) -{ - return ((as_ulong(x) & 0x8000000000000000UL) == 0) ?
__clc_rti(x) : __clc_rtz(x); -} -_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) -{ - /* Mantisa + implicit bit */ - const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52); - const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023; - int shift = 42; - if (exp < -14) { - /* The default assumes lower 13 bits are rounded, - * but it might be more for denormals. - * Shifting beyond last == 0b, and qr == 00b is not necessary */ - shift += min(-(exp + 14), 15); - } - ulong mask = (1UL << shift) - 1UL; - const ulong grs = mantissa & mask; - const ulong last = mantissa & (1UL << shift); - /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1. - * exp > 15 should round to inf. */ - bool roundup = (grs > (1UL << (shift - 1UL))) || - (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15); - return roundup ? __clc_rti(x) : __clc_rtz(x); +_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) { + /* Mantisa + implicit bit */ + const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52); + const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023; + int shift = 42; + if (exp < -14) { + /* The default assumes lower 13 bits are rounded, + * but it might be more for denormals. + * Shifting beyond last == 0b, and qr == 00b is not necessary */ + shift += min(-(exp + 14), 15); + } + ulong mask = (1UL << shift) - 1UL; + const ulong grs = mantissa & mask; + const ulong last = mantissa & (1UL << shift); + /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1. + * exp > 15 should round to inf. */ + bool roundup = (grs > (1UL << (shift - 1UL))) || + (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15); + return roundup ? __clc_rti(x) : __clc_rtz(x); } #endif -#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \ - __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \ - __FUNC(SUFFIX ## _rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \ - __FUNC(SUFFIX ## _rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \ - __FUNC(SUFFIX ## _rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \ - __FUNC(SUFFIX ## _rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte) +#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \ + __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \ + __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \ + __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \ + __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \ + __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte) -#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \ - __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) +#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \ + __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) #define __CLC_BODY "vstore_half.inc" #include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/shared/vstore_half.inc b/libclc/generic/lib/shared/vstore_half.inc index 7c3472804b861..138c19ae78b3f 100644 --- a/libclc/generic/lib/shared/vstore_half.inc +++ b/libclc/generic/lib/shared/vstore_half.inc @@ -11,19 +11,22 @@ #ifndef __CLC_SCALAR #if __CLC_VECSIZE == 3 -# define __CLC_OFFSET 4 +#define __CLC_OFFSET 4 #else -# define __CLC_OFFSET __CLC_VECSIZE +#define __CLC_OFFSET __CLC_VECSIZE #endif - FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private); - FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local); - FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, 
__global); +FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, + __CLC_SCALAR_GENTYPE, __private); +FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, + __CLC_SCALAR_GENTYPE, __local); +FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, + __CLC_SCALAR_GENTYPE, __global); #undef __CLC_OFFSET #else - FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private); - FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local); - FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global); +FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private); +FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local); +FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global); #endif #endif
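A note on the declaration macros reformatted above: each overload's name is built by token-pasting, so _CLC_VLOAD_DECL(, float, float4, 4, __global) declares vload4 for __global float pointers. Below is a minimal host-side C sketch of the same pattern; VLOAD_DECL and the float4 typedef are illustrative stand-ins, and the OpenCL address-space argument is dropped so it compiles as plain C:

#include <stddef.h>

/* Clang/GCC vector extension standing in for OpenCL's float4. */
typedef float float4 __attribute__((vector_size(16)));

/* Same token-pasting shape as libclc's _CLC_VLOAD_DECL: the ## operator
 * glues SUFFIX and WIDTH into the function name. */
#define VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH) \
  VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const MEM_TYPE *x);

VLOAD_DECL(, float, float4, 4)
/* expands to: float4 vload4(size_t offset, const float *x); */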
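The less_aligned_* typedefs in vload.cl and vstore.cl are the load/store workhorse: OpenCL's vloadN/vstoreN accept pointers aligned only to the scalar element, while dereferencing a plain vector pointer would let the compiler assume full vector alignment, so the typedef carries __attribute__((aligned(sizeof(PRIM_TYPE)))) to lower the assumed alignment. Here is a sketch of the same trick in host C under Clang; my_vload4 and the typedef name are illustrative, not libclc API:

#include <stddef.h>

typedef float float4 __attribute__((vector_size(16)));

/* Lower the type's assumed alignment from 16 bytes to sizeof(float),
 * mirroring the less_aligned_##ADDR_SPACE##PRIM_TYPE typedefs. */
typedef float4 less_aligned_float4 __attribute__((aligned(4)));

float4 my_vload4(size_t offset, const float *x) {
  /* Legal even when &x[4 * offset] is only 4-byte aligned; the compiler
   * emits an unaligned or element-wise load as required. */
  return *(const less_aligned_float4 *)(&x[4 * offset]);
}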
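Finally, the __clc_* helpers implement the explicit rounding modes for vstore_half: __clc_rtz truncates toward zero by masking the 13 excess mantissa bits, __clc_rti rounds toward infinity via nextafter, and __clc_rte picks between them using the IEEE round-to-nearest-even rule quoted in the comments (round up when the discarded bits exceed the halfway point, or tie exactly while the kept low bit is odd). Below is a host-side C replica of just the __clc_rte round-up predicate; rte_rounds_up is a hypothetical name, and the NaN/Inf special cases the library handles elsewhere are ignored here:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mirror of __clc_rte's decision for float -> half: 13 mantissa bits are
 * discarded, more for numbers that are denormal in half precision. */
static bool rte_rounds_up(float x) {
  uint32_t bits;
  memcpy(&bits, &x, sizeof bits); /* as_uint(x) */
  const uint32_t mantissa = (bits & 0x7fffff) | (1u << 23); /* implicit bit */
  const int exp = (int)(bits >> 23 & 0xff) - 127;
  int shift = 13;
  if (exp < -14) { /* denormal in half: extra bits are discarded */
    int extra = -(exp + 14);
    shift += extra < 15 ? extra : 15; /* min(-(exp + 14), 15) */
  }
  const uint32_t halfway = 1u << (shift - 1);
  const uint32_t grs = mantissa & ((1u << shift) - 1); /* dropped bits */
  const uint32_t last = mantissa & (1u << shift);      /* kept low bit */
  return grs > halfway || (grs == halfway && last != 0) || exp > 15;
}

int main(void) {
  /* 1 + 2^-11 sits exactly halfway between two half values; the kept
   * low bit is even, so the tie resolves downward: prints 0. */
  printf("%d\n", rte_rounds_up(1.00048828125f));
  /* 1 + 3 * 2^-11 is also a tie, but the kept low bit is odd: prints 1. */
  printf("%d\n", rte_rounds_up(1.00146484375f));
  return 0;
}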