Author: Fraser Cormack
Date: 2025-04-24T11:42:18+01:00
New Revision: 2edade28245b1fc2b7cb0b39804894f8fdcfb7ff
URL: https://github.com/llvm/llvm-project/commit/2edade28245b1fc2b7cb0b39804894f8fdcfb7ff
DIFF: https://github.com/llvm/llvm-project/commit/2edade28245b1fc2b7cb0b39804894f8fdcfb7ff.diff

LOG: [libclc][NFC] Clang-format vload/vstore code

Added: 
    

Modified: 
    libclc/generic/include/clc/shared/vload.h
    libclc/generic/include/clc/shared/vstore.h
    libclc/generic/lib/shared/vload.cl
    libclc/generic/lib/shared/vload_half.inc
    libclc/generic/lib/shared/vstore.cl
    libclc/generic/lib/shared/vstore_half.inc

Removed: 
    


################################################################################
diff --git a/libclc/generic/include/clc/shared/vload.h b/libclc/generic/include/clc/shared/vload.h
index b2db5551d0903..a343d652933fd 100644
--- a/libclc/generic/include/clc/shared/vload.h
+++ b/libclc/generic/include/clc/shared/vload.h
@@ -6,23 +6,24 @@
 //
 
//===----------------------------------------------------------------------===//
 
-#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
-  _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const 
ADDR_SPACE MEM_TYPE *x);
+#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE)         
\
+  _CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(                       
\
+      size_t offset, const ADDR_SPACE MEM_TYPE *x);
 
-#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
-  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
-  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
-  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
-  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
+#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE)        
\
+  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE)               
\
+  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE)               
\
+  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE)               
\
+  _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE)               
\
   _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
 
-#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \
-  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
-  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
-  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \
+#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE)                   
\
+  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private)               
\
+  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local)                 
\
+  _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant)              
\
   _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global)
 
-#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \
+#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE)                                     
\
   _CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE)
 
 // Declare vector load prototypes
@@ -40,12 +41,12 @@ _CLC_VECTOR_VLOAD_PRIM3(_half, half, float)
 _CLC_VECTOR_VLOAD_PRIM3(a_half, half, float)
 
 #ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64: enable
-  _CLC_VECTOR_VLOAD_PRIM1(double)
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+_CLC_VECTOR_VLOAD_PRIM1(double)
 #endif
 #ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16: enable
-  _CLC_VECTOR_VLOAD_PRIM1(half)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+_CLC_VECTOR_VLOAD_PRIM1(half)
 #endif
 
 // Scalar vload_half also needs to be declared

diff --git a/libclc/generic/include/clc/shared/vstore.h b/libclc/generic/include/clc/shared/vstore.h
index 93687e030eb41..6e98f0368c5c1 100644
--- a/libclc/generic/include/clc/shared/vstore.h
+++ b/libclc/generic/include/clc/shared/vstore.h
@@ -6,33 +6,34 @@
 //
 
//===----------------------------------------------------------------------===//
 
-#define _CLC_VSTORE_DECL(SUFFIX, PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE, RND) \
-  _CLC_OVERLOAD _CLC_DECL void vstore##SUFFIX##WIDTH##RND(VEC_TYPE vec, size_t 
offset, ADDR_SPACE PRIM_TYPE *out);
+#define _CLC_VSTORE_DECL(SUFFIX, PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE, RND)  
\
+  _CLC_OVERLOAD _CLC_DECL void vstore##SUFFIX##WIDTH##RND(                     
\
+      VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out);
 
-#define _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE, RND) \
-  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE, RND) \
-  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE, RND) \
-  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE, RND) \
-  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE, RND) \
+#define _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE, RND)  
\
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE, RND)         
\
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE, RND)         
\
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE, RND)         
\
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE, RND)         
\
   _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE, RND)
 
-#define _CLC_VECTOR_VSTORE_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE, RND) \
-  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private, RND) \
-  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local, RND) \
+#define _CLC_VECTOR_VSTORE_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE, RND)             
\
+  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private, RND)         
\
+  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local, RND)           
\
   _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global, RND)
 
-#define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE) \
-  _CLC_VECTOR_VSTORE_PRIM3(,PRIM_TYPE, PRIM_TYPE, )
+#define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE)                                    
\
+  _CLC_VECTOR_VSTORE_PRIM3(, PRIM_TYPE, PRIM_TYPE, )
 
-#define _CLC_VECTOR_VSTORE_HALF_PRIM1(PRIM_TYPE, RND) \
-       _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __private, RND) \
-       _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __local, RND) \
-       _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __global, RND) \
-       _CLC_VECTOR_VSTORE_PRIM3(_half, half, PRIM_TYPE, RND) \
-       _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __private, RND) \
-       _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __local, RND) \
-       _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __global, RND) \
-       _CLC_VECTOR_VSTORE_PRIM3(a_half, half, PRIM_TYPE, RND)
+#define _CLC_VECTOR_VSTORE_HALF_PRIM1(PRIM_TYPE, RND)                          
\
+  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __private, RND)                   
\
+  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __local, RND)                     
\
+  _CLC_VSTORE_DECL(_half, half, PRIM_TYPE, , __global, RND)                    
\
+  _CLC_VECTOR_VSTORE_PRIM3(_half, half, PRIM_TYPE, RND)                        
\
+  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __private, RND)                  
\
+  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __local, RND)                    
\
+  _CLC_VSTORE_DECL(a_half, half, PRIM_TYPE, , __global, RND)                   
\
+  _CLC_VECTOR_VSTORE_PRIM3(a_half, half, PRIM_TYPE, RND)
 
 _CLC_VECTOR_VSTORE_PRIM1(char)
 _CLC_VECTOR_VSTORE_PRIM1(uchar)
@@ -44,26 +45,25 @@ _CLC_VECTOR_VSTORE_PRIM1(long)
 _CLC_VECTOR_VSTORE_PRIM1(ulong)
 _CLC_VECTOR_VSTORE_PRIM1(float)
 
-_CLC_VECTOR_VSTORE_HALF_PRIM1(float,)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(float, )
 _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtz)
 _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtn)
 _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtp)
 _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rte)
 
 #ifdef cl_khr_fp64
-  _CLC_VECTOR_VSTORE_PRIM1(double)
-  _CLC_VECTOR_VSTORE_HALF_PRIM1(double,)
-  _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtz)
-  _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtn)
-  _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtp)
-  _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rte)
+_CLC_VECTOR_VSTORE_PRIM1(double)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, )
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtz)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtn)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtp)
+_CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rte)
 #endif
 
 #ifdef cl_khr_fp16
-  _CLC_VECTOR_VSTORE_PRIM1(half)
+_CLC_VECTOR_VSTORE_PRIM1(half)
 #endif
 
-
 #undef _CLC_VSTORE_DECL
 #undef _CLC_VECTOR_VSTORE_DECL
 #undef _CLC_VECTOR_VSTORE_PRIM3

diff --git a/libclc/generic/lib/shared/vload.cl b/libclc/generic/lib/shared/vload.cl
index dcbae4f20929f..a0306c500d5cd 100644
--- a/libclc/generic/lib/shared/vload.cl
+++ b/libclc/generic/lib/shared/vload.cl
@@ -8,59 +8,75 @@
 
 #include <clc/clc.h>
 
-#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE 
PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) 
(&x[2*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE 
PRIM_TYPE *x) { \
-    PRIM_TYPE##2 vec = *((const ADDR_SPACE 
less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[3*offset])); \
-    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset*3+2]); \
-  } \
-\
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE 
PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) 
(&x[4*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE 
PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) 
(&x[8*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE 
PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) 
(&x[16*offset])); \
-  } \
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                                 
\
+  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                 
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset,                    
\
+                                             const ADDR_SPACE PRIM_TYPE *x) {  
\
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2         
\
+                  *)(&x[2 * offset]));                                         
\
+  }                                                                            
\
+                                                                               
\
+  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3                 
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset,                    
\
+                                             const ADDR_SPACE PRIM_TYPE *x) {  
\
+    PRIM_TYPE##2 vec =                                                         
\
+        *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2            
\
+               *)(&x[3 * offset]));                                            
\
+    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]);                  
\
+  }                                                                            
\
+                                                                               
\
+  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                 
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset,                    
\
+                                             const ADDR_SPACE PRIM_TYPE *x) {  
\
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4         
\
+                  *)(&x[4 * offset]));                                         
\
+  }                                                                            
\
+                                                                               
\
+  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                 
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset,                    
\
+                                             const ADDR_SPACE PRIM_TYPE *x) {  
\
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8         
\
+                  *)(&x[8 * offset]));                                         
\
+  }                                                                            
\
+                                                                               
\
+  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16               
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(                                
\
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                          
\
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16        
\
+                  *)(&x[16 * offset]));                                        
\
+  }
 
-#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
-    VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
-    VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
-    VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
-    VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE)                                
\
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private)                             
\
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local)                               
\
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant)                            
\
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)
 
-#define VLOAD_TYPES() \
-    VLOAD_ADDR_SPACES(char) \
-    VLOAD_ADDR_SPACES(uchar) \
-    VLOAD_ADDR_SPACES(short) \
-    VLOAD_ADDR_SPACES(ushort) \
-    VLOAD_ADDR_SPACES(int) \
-    VLOAD_ADDR_SPACES(uint) \
-    VLOAD_ADDR_SPACES(long) \
-    VLOAD_ADDR_SPACES(ulong) \
-    VLOAD_ADDR_SPACES(float) \
+#define VLOAD_TYPES()                                                          
\
+  VLOAD_ADDR_SPACES(char)                                                      
\
+  VLOAD_ADDR_SPACES(uchar)                                                     
\
+  VLOAD_ADDR_SPACES(short)                                                     
\
+  VLOAD_ADDR_SPACES(ushort)                                                    
\
+  VLOAD_ADDR_SPACES(int)                                                       
\
+  VLOAD_ADDR_SPACES(uint)                                                      
\
+  VLOAD_ADDR_SPACES(long)                                                      
\
+  VLOAD_ADDR_SPACES(ulong)                                                     
\
+  VLOAD_ADDR_SPACES(float)
 
 VLOAD_TYPES()
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-    VLOAD_ADDR_SPACES(double)
+VLOAD_ADDR_SPACES(double)
 #endif
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-    VLOAD_ADDR_SPACES(half)
+VLOAD_ADDR_SPACES(half)
 #endif
 
 /* vload_half are legal even without cl_khr_fp16 */
@@ -71,43 +87,45 @@ float __clc_vload_half_float_helper__global(const __global half *);
 float __clc_vload_half_float_helper__local(const __local half *);
 float __clc_vload_half_float_helper__private(const __private half *);
 
-#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS 
(&mem[offset++]);
+#define VEC_LOAD1(val, AS)                                                     
\
+  val = __clc_vload_half_float_helper##AS(&mem[offset++]);
 #else
 #define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
 #endif
 
-#define VEC_LOAD2(val, AS) \
-       VEC_LOAD1(val.lo, AS) \
-       VEC_LOAD1(val.hi, AS)
-#define VEC_LOAD3(val, AS) \
-       VEC_LOAD1(val.s0, AS) \
-       VEC_LOAD1(val.s1, AS) \
-       VEC_LOAD1(val.s2, AS)
-#define VEC_LOAD4(val, AS) \
-       VEC_LOAD2(val.lo, AS) \
-       VEC_LOAD2(val.hi, AS)
-#define VEC_LOAD8(val, AS) \
-       VEC_LOAD4(val.lo, AS) \
-       VEC_LOAD4(val.hi, AS)
-#define VEC_LOAD16(val, AS) \
-       VEC_LOAD8(val.lo, AS) \
-       VEC_LOAD8(val.hi, AS)
+#define VEC_LOAD2(val, AS)                                                     
\
+  VEC_LOAD1(val.lo, AS)                                                        
\
+  VEC_LOAD1(val.hi, AS)
+#define VEC_LOAD3(val, AS)                                                     
\
+  VEC_LOAD1(val.s0, AS)                                                        
\
+  VEC_LOAD1(val.s1, AS)                                                        
\
+  VEC_LOAD1(val.s2, AS)
+#define VEC_LOAD4(val, AS)                                                     
\
+  VEC_LOAD2(val.lo, AS)                                                        
\
+  VEC_LOAD2(val.hi, AS)
+#define VEC_LOAD8(val, AS)                                                     
\
+  VEC_LOAD4(val.lo, AS)                                                        
\
+  VEC_LOAD4(val.hi, AS)
+#define VEC_LOAD16(val, AS)                                                    
\
+  VEC_LOAD8(val.lo, AS)                                                        
\
+  VEC_LOAD8(val.hi, AS)
 
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
-  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half 
*mem) { \
-    offset *= VEC_SIZE; \
-    TYPE __tmp; \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) \
-    return __tmp; \
-  } \
-  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, const AS half 
*mem) { \
-    offset *= OFFSET_SIZE; \
-    TYPE __tmp; \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) \
-    return __tmp; \
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                        
\
+  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset,                
\
+                                                 const AS half *mem) {         
\
+    offset *= VEC_SIZE;                                                        
\
+    TYPE __tmp;                                                                
\
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                                
\
+  }                                                                            
\
+  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset,               
\
+                                                  const AS half *mem) {        
\
+    offset *= OFFSET_SIZE;                                                     
\
+    TYPE __tmp;                                                                
\
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                                
\
   }
 
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, 
OFFSET_SIZE, TYPE, AS)
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                          
\
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
 
 #define __CLC_BODY "vload_half.inc"
 #include <clc/math/gentype.inc>

diff --git a/libclc/generic/lib/shared/vload_half.inc b/libclc/generic/lib/shared/vload_half.inc
index ff47969327bab..26716b9960018 100644
--- a/libclc/generic/lib/shared/vload_half.inc
+++ b/libclc/generic/lib/shared/vload_half.inc
@@ -11,21 +11,21 @@
 #ifndef __CLC_SCALAR
 
 #if __CLC_VECSIZE == 3
-#  define __CLC_OFFSET 4
+#define __CLC_OFFSET 4
 #else
-#  define __CLC_OFFSET __CLC_VECSIZE
+#define __CLC_OFFSET __CLC_VECSIZE
 #endif
 
-  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private);
-  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local);
-  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global);
-  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __constant);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __constant);
 
 #undef __CLC_OFFSET
 #else
-  FUNC(, 1, 1, __CLC_GENTYPE, __private);
-  FUNC(, 1, 1, __CLC_GENTYPE, __local);
-  FUNC(, 1, 1, __CLC_GENTYPE, __global);
-  FUNC(, 1, 1, __CLC_GENTYPE, __constant);
+FUNC(, 1, 1, __CLC_GENTYPE, __private);
+FUNC(, 1, 1, __CLC_GENTYPE, __local);
+FUNC(, 1, 1, __CLC_GENTYPE, __global);
+FUNC(, 1, 1, __CLC_GENTYPE, __constant);
 #endif
 #endif

diff --git a/libclc/generic/lib/shared/vstore.cl b/libclc/generic/lib/shared/vstore.cl
index 0a105f5cd8c86..525f3d08bf0d8 100644
--- a/libclc/generic/lib/shared/vstore.cl
+++ b/libclc/generic/lib/shared/vstore.cl
@@ -10,36 +10,50 @@
 
 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
 
-#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, 
ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[2*offset])) 
= vec; \
-  } \
-\
-  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, 
ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[3*offset])) 
= (PRIM_TYPE##2)(vec.s0, vec.s1); \
-    mem[3 * offset + 2] = vec.s2;\
-  } \
-\
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, 
ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&mem[4*offset])) 
= vec; \
-  } \
-\
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, 
ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&mem[8*offset])) 
= vec; \
-  } \
-\
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ 
((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, 
ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) 
(&mem[16*offset])) = vec; \
-  } \
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                                
\
+  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                 
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset,         
\
+                                      ADDR_SPACE PRIM_TYPE *mem) {             
\
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                      
\
+           *)(&mem[2 * offset])) = vec;                                        
\
+  }                                                                            
\
+                                                                               
\
+  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset,         
\
+                                      ADDR_SPACE PRIM_TYPE *mem) {             
\
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                      
\
+           *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1);             
\
+    mem[3 * offset + 2] = vec.s2;                                              
\
+  }                                                                            
\
+                                                                               
\
+  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                 
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset,         
\
+                                      ADDR_SPACE PRIM_TYPE *mem) {             
\
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4                      
\
+           *)(&mem[4 * offset])) = vec;                                        
\
+  }                                                                            
\
+                                                                               
\
+  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                 
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset,         
\
+                                      ADDR_SPACE PRIM_TYPE *mem) {             
\
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8                      
\
+           *)(&mem[8 * offset])) = vec;                                        
\
+  }                                                                            
\
+                                                                               
\
+  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16               
\
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                             
\
+  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset,       
\
+                                       ADDR_SPACE PRIM_TYPE *mem) {            
\
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16                     
\
+           *)(&mem[16 * offset])) = vec;                                       
\
+  }
 
-#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
-    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
-    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
-    VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE)                         
\
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private)                      
\
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local)                        
\
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)
 
 VSTORE_ADDR_SPACES(char)
 VSTORE_ADDR_SPACES(uchar)
@@ -51,26 +65,25 @@ VSTORE_ADDR_SPACES(long)
 VSTORE_ADDR_SPACES(ulong)
 VSTORE_ADDR_SPACES(float)
 
-
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-    VSTORE_ADDR_SPACES(double)
+VSTORE_ADDR_SPACES(double)
 #endif
 
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-    VSTORE_ADDR_SPACES(half)
+VSTORE_ADDR_SPACES(half)
 #endif
 
 /* vstore_half are legal even without cl_khr_fp16 */
 #if __clang_major__ < 6
-#define DECLARE_HELPER(STYPE, AS, builtin) void 
__clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
+#define DECLARE_HELPER(STYPE, AS, builtin)                                     
\
+  void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
 #else
-#define DECLARE_HELPER(STYPE, AS, __builtin) \
-_CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) \
-{ \
-       __builtin(s, d); \
-}
+#define DECLARE_HELPER(STYPE, AS, __builtin)                                   
\
+  _CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) {  
\
+    __builtin(s, d);                                                           
\
+  }
 #endif
 
 DECLARE_HELPER(float, __private, __builtin_store_halff);
@@ -83,176 +96,165 @@ DECLARE_HELPER(double, __global, __builtin_store_half);
 DECLARE_HELPER(double, __local, __builtin_store_half);
 #endif
 
-#define VEC_STORE1(STYPE, AS, val, ROUNDF) 
__clc_vstore_half_##STYPE##_helper##AS (ROUNDF(val), &mem[offset++]);
+#define VEC_STORE1(STYPE, AS, val, ROUNDF)                                     
\
+  __clc_vstore_half_##STYPE##_helper##AS(ROUNDF(val), &mem[offset++]);
 
-#define VEC_STORE2(STYPE, AS, val, ROUNDF) \
-       VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \
-       VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE3(STYPE, AS, val, ROUNDF) \
-       VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \
-       VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \
-       VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
-#define VEC_STORE4(STYPE, AS, val, ROUNDF) \
-       VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \
-       VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE8(STYPE, AS, val, ROUNDF) \
-       VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \
-       VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE16(STYPE, AS, val, ROUNDF) \
-       VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \
-       VEC_STORE8(STYPE, AS, val.hi, ROUNDF)
+#define VEC_STORE2(STYPE, AS, val, ROUNDF)                                     
\
+  VEC_STORE1(STYPE, AS, val.lo, ROUNDF)                                        
\
+  VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
+#define VEC_STORE3(STYPE, AS, val, ROUNDF)                                     
\
+  VEC_STORE1(STYPE, AS, val.s0, ROUNDF)                                        
\
+  VEC_STORE1(STYPE, AS, val.s1, ROUNDF)                                        
\
+  VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
+#define VEC_STORE4(STYPE, AS, val, ROUNDF)                                     
\
+  VEC_STORE2(STYPE, AS, val.lo, ROUNDF)                                        
\
+  VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
+#define VEC_STORE8(STYPE, AS, val, ROUNDF)                                     
\
+  VEC_STORE4(STYPE, AS, val.lo, ROUNDF)                                        
\
+  VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
+#define VEC_STORE16(STYPE, AS, val, ROUNDF)                                    
\
+  VEC_STORE8(STYPE, AS, val.lo, ROUNDF)                                        
\
+  VEC_STORE8(STYPE, AS, val.hi, ROUNDF)
 
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF) \
-  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, AS 
half *mem) { \
-    offset *= VEC_SIZE; \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
-  } \
-  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, AS 
half *mem) { \
-    offset *= OFFSET; \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF)              
\
+  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset,     
\
+                                                  AS half *mem) {              
\
+    offset *= VEC_SIZE;                                                        
\
+    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF)                                
\
+  }                                                                            
\
+  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset,    
\
+                                                   AS half *mem) {             
\
+    offset *= OFFSET;                                                          
\
+    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF)                                
\
   }
 
-_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x)
-{
-       return x;
+_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
+_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
+  /* Remove lower 13 bits to make sure the number is rounded down */
+  int mask = 0xffffe000;
+  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask <<= min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (fabs(x) > 65504.0f && !isinf(x))
+    return copysign(65504.0f, x);
+  /* Handle nan corner case */
+  if (isnan(x))
+    return x;
+  return as_float(as_uint(x) & mask);
 }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x)
-{
-       /* Remove lower 13 bits to make sure the number is rounded down */
-       int mask = 0xffffe000;
-       const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-       /* Denormals cannot be flushed, and they use different bit for rounding */
-       if (exp < -14)
-               mask <<= min(-(exp + 14), 10);
-       /* RTZ does not produce Inf for large numbers */
-       if (fabs(x) > 65504.0f && !isinf(x))
-               return copysign(65504.0f, x);
-       /* Handle nan corner case */
-       if (isnan(x))
-               return x;
-       return as_float(as_uint(x) & mask);
+_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
+  const float inf = copysign(INFINITY, x);
+  /* Set lower 13 bits */
+  int mask = (1 << 13) - 1;
+  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (isnan(x))
+    return x;
+  const float next = nextafter(as_float(as_uint(x) | mask), inf);
+  return ((as_uint(x) & mask) == 0) ? x : next;
 }
-_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x)
-{
-       const float inf = copysign(INFINITY, x);
-       /* Set lower 13 bits */
-       int mask = (1 << 13) - 1;
-       const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-       /* Denormals cannot be flushed, and they use different bit for rounding */
-       if (exp < -14)
-               mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
-       /* Handle nan corner case */
-       if (isnan(x))
-               return x;
-       const float next = nextafter(as_float(as_uint(x) | mask), inf);
-       return ((as_uint(x) & mask) == 0) ? x : next;
+_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
+  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
 }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x)
-{
-       return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
+_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
+  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
 }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x)
-{
-       return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x)
-{
-       /* Mantisa + implicit bit */
-       const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
-       const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-       int shift = 13;
-       if (exp < -14) {
-               /* The default assumes lower 13 bits are rounded,
-                * but it might be more for denormals.
-                * Shifting beyond last == 0b, and qr == 00b is not necessary */
-               shift += min(-(exp + 14), 15);
-       }
-       int mask = (1 << shift) - 1;
-       const uint grs = mantissa & mask;
-       const uint last = mantissa & (1 << shift);
-       /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-        * exp > 15 should round to inf. */
-       bool roundup = (grs > (1 << (shift - 1))) ||
-               (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
-       return roundup ? __clc_rti(x) : __clc_rtz(x);
+_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
+  /* Mantisa + implicit bit */
+  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
+  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
+  int shift = 13;
+  if (exp < -14) {
+    /* The default assumes lower 13 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += min(-(exp + 14), 15);
+  }
+  int mask = (1 << shift) - 1;
+  const uint grs = mantissa & mask;
+  const uint last = mantissa & (1 << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1 << (shift - 1))) ||
+                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
 }
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x)
-{
-       return x;
+_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
+_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
+  /* Remove lower 42 bits to make sure the number is rounded down */
+  ulong mask = 0xfffffc0000000000UL;
+  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask <<= min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (fabs(x) > 65504.0 && !isinf(x))
+    return copysign(65504.0, x);
+  /* Handle nan corner case */
+  if (isnan(x))
+    return x;
+  return as_double(as_ulong(x) & mask);
 }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x)
-{
-       /* Remove lower 42 bits to make sure the number is rounded down */
-       ulong mask = 0xfffffc0000000000UL;
-       const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-       /* Denormals cannot be flushed, and they use different bit for rounding */
-       if (exp < -14)
-               mask <<= min(-(exp + 14), 10);
-       /* RTZ does not produce Inf for large numbers */
-       if (fabs(x) > 65504.0 && !isinf(x))
-               return copysign(65504.0, x);
-       /* Handle nan corner case */
-       if (isnan(x))
-               return x;
-       return as_double(as_ulong(x) & mask);
+_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
+  const double inf = copysign((double)INFINITY, x);
+  /* Set lower 42 bits */
+  long mask = (1UL << 42UL) - 1UL;
+  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use different bit for rounding */
+  if (exp < -14)
+    mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (isnan(x))
+    return x;
+  const double next = nextafter(as_double(as_ulong(x) | mask), inf);
+  return ((as_ulong(x) & mask) == 0) ? x : next;
 }
-_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x)
-{
-       const double inf = copysign((double)INFINITY, x);
-       /* Set lower 42 bits */
-       long mask = (1UL << 42UL) - 1UL;
-       const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-       /* Denormals cannot be flushed, and they use different bit for rounding */
-       if (exp < -14)
-               mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
-       /* Handle nan corner case */
-       if (isnan(x))
-               return x;
-       const double next = nextafter(as_double(as_ulong(x) | mask), inf);
-       return ((as_ulong(x) & mask) == 0) ? x : next;
+_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
+  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
+                                                     : __clc_rti(x);
 }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x)
-{
-       return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x) : 
__clc_rti(x);
+_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
+  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
+                                                     : __clc_rtz(x);
 }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x)
-{
-       return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x) : 
__clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x)
-{
-       /* Mantisa + implicit bit */
-       const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
-       const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-       int shift = 42;
-       if (exp < -14) {
-               /* The default assumes lower 13 bits are rounded,
-                * but it might be more for denormals.
-                * Shifting beyond last == 0b, and qr == 00b is not necessary */
-               shift += min(-(exp + 14), 15);
-       }
-       ulong mask = (1UL << shift) - 1UL;
-       const ulong grs = mantissa & mask;
-       const ulong last = mantissa & (1UL << shift);
-       /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-        * exp > 15 should round to inf. */
-       bool roundup = (grs > (1UL << (shift - 1UL))) ||
-               (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
-       return roundup ? __clc_rti(x) : __clc_rtz(x);
+_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
+  /* Mantisa + implicit bit */
+  const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
+  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
+  int shift = 42;
+  if (exp < -14) {
+    /* The default assumes lower 13 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += min(-(exp + 14), 15);
+  }
+  ulong mask = (1UL << shift) - 1UL;
+  const ulong grs = mantissa & mask;
+  const ulong last = mantissa & (1UL << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1UL << (shift - 1UL))) ||
+                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
 }
 #endif
 
-#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
-       __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \
-       __FUNC(SUFFIX ## _rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \
-       __FUNC(SUFFIX ## _rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \
-       __FUNC(SUFFIX ## _rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \
-       __FUNC(SUFFIX ## _rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)
+#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)                     
\
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop)                
\
+  __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz)           
\
+  __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn)           
\
+  __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp)           
\
+  __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)
 
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
-       __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)                        
\
+  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)
 
 #define __CLC_BODY "vstore_half.inc"
 #include <clc/math/gentype.inc>

diff --git a/libclc/generic/lib/shared/vstore_half.inc b/libclc/generic/lib/shared/vstore_half.inc
index 7c3472804b861..138c19ae78b3f 100644
--- a/libclc/generic/lib/shared/vstore_half.inc
+++ b/libclc/generic/lib/shared/vstore_half.inc
@@ -11,19 +11,22 @@
 #ifndef __CLC_SCALAR
 
 #if __CLC_VECSIZE == 3
-#  define __CLC_OFFSET 4
+#define __CLC_OFFSET 4
 #else
-#  define __CLC_OFFSET __CLC_VECSIZE
+#define __CLC_OFFSET __CLC_VECSIZE
 #endif
 
-  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, 
__CLC_SCALAR_GENTYPE, __private);
-  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, 
__CLC_SCALAR_GENTYPE, __local);
-  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, 
__CLC_SCALAR_GENTYPE, __global);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE,
+     __CLC_SCALAR_GENTYPE, __private);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE,
+     __CLC_SCALAR_GENTYPE, __local);
+FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE,
+     __CLC_SCALAR_GENTYPE, __global);
 
 #undef __CLC_OFFSET
 #else
-  FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private);
-  FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local);
-  FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global);
+FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private);
+FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local);
+FUNC(, 1, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global);
 #endif
 #endif


        