https://github.com/frasercrmck created 
https://github.com/llvm/llvm-project/pull/119596

These functions all map to the corresponding LLVM intrinsics, but the vector
intrinsics weren't being generated. The mapping from CLC vector function to
vector intrinsic was working correctly, but the mapping from OpenCL builtin to
CLC function was suboptimal: it recursively split vectors in half.

For example, with this change, `ceil(float16)` calls `llvm.ceil.v16f32` 
directly.

The CLC versions of each of these builtins are also now enabled for SPIR-V 
targets. The LLVM -> SPIR-V translator maps the intrinsics to the appropriate 
OpExtInst. As such, there is no diff to the SPIR-V binaries before/after this 
change.

The clspv targets show a difference, but it's not expected to be a problem:

    >   %call = tail call spir_func double @llvm.fabs.f64(double noundef %x) #9
    <   %call = tail call spir_func double @_Z4fabsd(double noundef %x) #9

The AMDGPU targets make use of the same `_CLC_DEFINE_UNARY_BUILTIN` macro to 
override `sqrt`, so those functions also appear more optimal with this change, 
calling the vector `llvm.sqrt.vXf32` intrinsics directly.

From 68df2622a3ca1b98a0cbf1fc9e6200e12fecbb2e Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fra...@codeplay.com>
Date: Wed, 11 Dec 2024 17:28:38 +0000
Subject: [PATCH] [libclc] Optimize ceil/fabs/floor/rint/trunc

These functions all map to the corresponding LLVM intrinsics, but the
vector intrinsics weren't being generated. The mapping from CLC vector
function to vector intrinsic was working correctly, but the mapping from
OpenCL builtin to CLC function was suboptimal: it recursively split
vectors in half.

For example, with this change, `ceil(float16)` calls `llvm.ceil.v16f32`
directly.

The CLC versions of each of these builtins are also now enabled for
SPIR-V targets. The LLVM -> SPIR-V translator maps the intrinsics to the
appropriate OpExtInst. As such, there is no diff to the SPIR-V binaries
before/after this change.

The clspv targets show a difference, but it's not expected to be a
problem:

    >   %call = tail call spir_func double @llvm.fabs.f64(double noundef %x) #9
    <   %call = tail call spir_func double @_Z4fabsd(double noundef %x) #9

The AMDGPU targets make use of the same _CLC_DEFINE_UNARY_BUILTIN macro
to override sqrt, so those functions also appear more optimal with this
change, calling the vector `llvm.sqrt.vXf32` intrinsics directly.
---
 libclc/clc/include/clc/clcmacro.h       | 16 +++++++++++++++-
 libclc/clc/include/clc/math/clc_ceil.h  |  7 -------
 libclc/clc/include/clc/math/clc_fabs.h  |  7 -------
 libclc/clc/include/clc/math/clc_floor.h |  7 -------
 libclc/clc/include/clc/math/clc_rint.h  |  7 -------
 libclc/clc/include/clc/math/clc_trunc.h |  7 -------
 6 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h
index 244239284ecabc..c6583749eca661 100644
--- a/libclc/clc/include/clc/clcmacro.h
+++ b/libclc/clc/include/clc/clcmacro.h
@@ -191,7 +191,21 @@
 
 #define _CLC_DEFINE_UNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE)      
\
   _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x) { return BUILTIN(x); } 
\
-  _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, RET_TYPE, FUNCTION, ARG1_TYPE)
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) {                
\
+    return BUILTIN(x);                                                         
\
+  }                                                                            
\
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) {                
\
+    return BUILTIN(x);                                                         
\
+  }                                                                            
\
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) {                
\
+    return BUILTIN(x);                                                         
\
+  }                                                                            
\
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) {                
\
+    return BUILTIN(x);                                                         
\
+  }                                                                            
\
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) {              
\
+    return BUILTIN(x);                                                         
\
+  }
 
 #ifdef cl_khr_fp16
 
diff --git a/libclc/clc/include/clc/math/clc_ceil.h b/libclc/clc/include/clc/math/clc_ceil.h
index 66590687c34220..905aef37e11c66 100644
--- a/libclc/clc/include/clc/math/clc_ceil.h
+++ b/libclc/clc/include/clc/math/clc_ceil.h
@@ -1,11 +1,6 @@
 #ifndef __CLC_MATH_CLC_CEIL_H__
 #define __CLC_MATH_CLC_CEIL_H__
 
-#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
-// clspv and spir-v targets provide their own OpenCL-compatible ceil
-#define __clc_ceil ceil
-#else
-
 // Map the function to an LLVM intrinsic
 #define __CLC_FUNCTION __clc_ceil
 #define __CLC_INTRINSIC "llvm.ceil"
@@ -14,6 +9,4 @@
 #undef __CLC_INTRINSIC
 #undef __CLC_FUNCTION
 
-#endif
-
 #endif // __CLC_MATH_CLC_CEIL_H__
diff --git a/libclc/clc/include/clc/math/clc_fabs.h b/libclc/clc/include/clc/math/clc_fabs.h
index 93367b57313713..525577ab98a389 100644
--- a/libclc/clc/include/clc/math/clc_fabs.h
+++ b/libclc/clc/include/clc/math/clc_fabs.h
@@ -1,11 +1,6 @@
 #ifndef __CLC_MATH_CLC_FABS_H__
 #define __CLC_MATH_CLC_FABS_H__
 
-#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
-// clspv and spir-v targets provide their own OpenCL-compatible fabs
-#define __clc_fabs fabs
-#else
-
 // Map the function to an LLVM intrinsic
 #define __CLC_FUNCTION __clc_fabs
 #define __CLC_INTRINSIC "llvm.fabs"
@@ -14,6 +9,4 @@
 #undef __CLC_INTRINSIC
 #undef __CLC_FUNCTION
 
-#endif
-
 #endif // __CLC_MATH_CLC_FABS_H__
diff --git a/libclc/clc/include/clc/math/clc_floor.h b/libclc/clc/include/clc/math/clc_floor.h
index 9919872ec633c6..e2d9dbadb434db 100644
--- a/libclc/clc/include/clc/math/clc_floor.h
+++ b/libclc/clc/include/clc/math/clc_floor.h
@@ -1,11 +1,6 @@
 #ifndef __CLC_MATH_CLC_FLOOR_H__
 #define __CLC_MATH_CLC_FLOOR_H__
 
-#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
-// clspv and spir-v targets provide their own OpenCL-compatible floor
-#define __clc_floor floor
-#else
-
 // Map the function to an LLVM intrinsic
 #define __CLC_FUNCTION __clc_floor
 #define __CLC_INTRINSIC "llvm.floor"
@@ -14,6 +9,4 @@
 #undef __CLC_INTRINSIC
 #undef __CLC_FUNCTION
 
-#endif
-
 #endif // __CLC_MATH_CLC_FLOOR_H__
diff --git a/libclc/clc/include/clc/math/clc_rint.h b/libclc/clc/include/clc/math/clc_rint.h
index 3761407ad326d7..7bb81100f221c0 100644
--- a/libclc/clc/include/clc/math/clc_rint.h
+++ b/libclc/clc/include/clc/math/clc_rint.h
@@ -1,11 +1,6 @@
 #ifndef __CLC_MATH_CLC_RINT_H__
 #define __CLC_MATH_CLC_RINT_H__
 
-#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
-// clspv and spir-v targets provide their own OpenCL-compatible rint
-#define __clc_rint rint
-#else
-
 // Map the function to an LLVM intrinsic
 #define __CLC_FUNCTION __clc_rint
 #define __CLC_INTRINSIC "llvm.rint"
@@ -14,6 +9,4 @@
 #undef __CLC_INTRINSIC
 #undef __CLC_FUNCTION
 
-#endif
-
 #endif // __CLC_MATH_CLC_RINT_H__
diff --git a/libclc/clc/include/clc/math/clc_trunc.h b/libclc/clc/include/clc/math/clc_trunc.h
index c78c8899d85238..62467fa1144713 100644
--- a/libclc/clc/include/clc/math/clc_trunc.h
+++ b/libclc/clc/include/clc/math/clc_trunc.h
@@ -1,11 +1,6 @@
 #ifndef __CLC_MATH_CLC_TRUNC_H__
 #define __CLC_MATH_CLC_TRUNC_H__
 
-#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
-// clspv and spir-v targets provide their own OpenCL-compatible trunc
-#define __clc_trunc trunc
-#else
-
 // Map the function to an LLVM intrinsic
 #define __CLC_FUNCTION __clc_trunc
 #define __CLC_INTRINSIC "llvm.trunc"
@@ -14,6 +9,4 @@
 #undef __CLC_INTRINSIC
 #undef __CLC_FUNCTION
 
-#endif
-
 #endif // __CLC_MATH_CLC_TRUNC_H__

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to