https://github.com/adelejjeh created
https://github.com/llvm/llvm-project/pull/168770
## Problem Summary
PyTorch's `test_warp_softmax_64bit_indexing` began failing after latest
mainline promotion. The test failure manifested as a numerical precision error
where `log(1.1422761679)` computed with 54% higher error than expected
(9.042e-09 vs 5.859e-09), causing gradient computations to exceed tolerance
thresholds. This precision degradation was reproducible across all AMD GPU
architectures (gfx1100, gfx1200, gfx90a, gfx950).
I tracked down the problem to the upstream commit **4703f8b6610a** (March 6,
2025) titled "clang/HIP: Use generic builtins for f32 exp and log (#129638)".
This commit changed HIP math headers to call `__builtin_logf()` directly
instead of `__ocml_log_f32()`:
```diff
- float logf(float __x) { return __FAST_OR_SLOW(__logf, __ocml_log_f32)(__x); }
+ float logf(float __x) { return __FAST_OR_SLOW(__logf, __builtin_logf)(__x); }
```
This change exposed a bug with how Clang handles the `contract` fast-math flag
on log intrinsics with AMDGCN target.
## Key Findings
**1. Contract flag propagation:** When `-ffp-contract=fast` is enabled (default
for HIP), Clang's CodeGen adds the `contract` flag to all `CallInst`
instructions within the scope of `CGFPOptionsRAII`, including calls to LLVM
intrinsics like `llvm.log.f32`.
**2. Behavior change from OCML to builtin path:**
- **Old path** (via `__ocml_log_f32`): The preprocessed IR showed the call to
the OCML library function had the contract flag, but the OCML implementation
internally dropped the contract flag when calling the `llvm.log.f32` intrinsic.
```llvm
; Function Attrs: alwaysinline convergent mustprogress nounwind
define internal noundef float @_ZL4logff(float noundef %__x) #6 {
entry:
%retval = alloca float, align 4, addrspace(5)
%__x.addr = alloca float, align 4, addrspace(5)
%retval.ascast = addrspacecast ptr addrspace(5) %retval to ptr
%__x.addr.ascast = addrspacecast ptr addrspace(5) %__x.addr to ptr
store float %__x, ptr %__x.addr.ascast, align 4, !tbaa !23
%0 = load float, ptr %__x.addr.ascast, align 4, !tbaa !23
%call = call contract float @__ocml_log_f32(float noundef %0) #23
ret float %call
}
; Function Attrs: convergent mustprogress nofree norecurse nosync nounwind
willreturn memory(none)
define internal noundef float @__ocml_log_f32(float noundef %0) #7 {
%2 = tail call float @llvm.log.f32(float %0)
ret float %2
}
```
- **New path** (via `__builtin_logf`): The call goes directly to `llvm.log.f32`
intrinsic with the contract flag preserved, causing the backend to apply FMA
contraction during polynomial expansion.
```llvm
; Function Attrs: alwaysinline convergent mustprogress nounwind
define internal noundef float @_ZL4logff(float noundef %__x) #6 {
entry:
%retval = alloca float, align 4, addrspace(5)
%__x.addr = alloca float, align 4, addrspace(5)
%retval.ascast = addrspacecast ptr addrspace(5) %retval to ptr
%__x.addr.ascast = addrspacecast ptr addrspace(5) %__x.addr to ptr
store float %__x, ptr %__x.addr.ascast, align 4, !tbaa !24
%0 = load float, ptr %__x.addr.ascast, align 4, !tbaa !24
%1 = call contract float @llvm.log.f32(float %0)
ret float %1
}
```
**3. Why contract breaks log:** Our AMDGCM target back end implements the
natural logarithm by taking the result of the hardware log, then multiply that
by `ln(2)`, and apply some rounding error correction to that multiplication.
This results in something like:
```c
r = y * c1; // y is result of v_log_ instruction, c1 = ln(2)
r = r + fma(y, c2, fma(y, c1, -r)) // c2 is another error-correcting constant
```
```asm
v_log_f32_e32 v1, v1
s_mov_b32 s2, 0x3f317217
v_mul_f32_e32 v3, 0x3f317217, v1
v_fma_f32 v4, v1, s2, -v3
v_fmac_f32_e32 v4, 0x3377d1cf, v1
v_add_f32_e32 v3, v3, v4
```
With the presence of the `contract` flag, the back-end fuses the add (`r + Z`)
with the multiply thinking that it is legal, thus eliminating the intermediate
rounding. The error compensation term, which was calculated based on the
rounded product, is now being added to the full-precision result from the FMA,
leading to incorrect error correction and degraded accuracy. The corresponding
contracted operations become the following:
```c
r = y * c1;
r = fma(y, c1, fma(y, c2, fma(y, c1, -r)));
```
```asm
v_log_f32_e32 v1, v1
s_mov_b32 s2, 0x3f317217
v_mul_f32_e32 v3, 0x3f317217, v1
v_fma_f32 v3, v1, s2, -v3
v_fmac_f32_e32 v3, 0x3377d1cf, v1
v_fmac_f32_e32 v3, 0x3f317217, v1
```
## Solution and Proposed Fix
Based on our implementation of `llvm.log`, it should be illegal to add the
contract flag to the intrinsic call because it uses error-correcting summation.
`contract` on a callinst indicates that it is legal to propagate the flag to
the internals of the called function, but in this case that is not true since
as described above the error-correcting summation we use doesn't allow for
contraction.
My proposed fix involves adding logic to `CGBuiltin.cpp` to explicitly disable
the `contract` flag on the `CallInst` for the llvm.log intrinsic when the
target is AMDGCN/HIP.
>From ea795b9cf67834cd0dff1a1cc23deccd06c93121 Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <[email protected]>
Date: Wed, 19 Nov 2025 14:32:05 -0600
Subject: [PATCH] [AMDGPU] Update log lowering to remove contract for AMDGCN
backend
---
clang/lib/CodeGen/CGBuiltin.cpp | 26 ++++++++++++++++-
clang/test/Headers/__clang_hip_math.hip | 38 ++++++++++++-------------
2 files changed, 44 insertions(+), 20 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3079f8ab7229e..3cf9be8e70b57 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -582,6 +582,23 @@ static Value *EmitISOVolatileStore(CodeGenFunction &CGF,
const CallExpr *E) {
return Store;
}
+// Check if an intrinsic is a transcendental function that is unsafe to
contract.
+static bool isUnsafeToContract(unsigned IntrinsicID, CodeGenFunction &CGF) {
+ switch (IntrinsicID) {
+ // The implementation for log in the AMDGCN backend uses a refinement
algorithm
+ // that requires intermediate rounding. The contract flag
+ // would allow FMA formation that recomputes products, breaking the
+ // refinement algorithm.
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ if ((CGF.getTarget().getTriple().isAMDGCN() ||
+ CGF.getTarget().getTriple().isSPIRV()) &&
+ CGF.getLangOpts().HIP)
+ return true;
+ default:
+ return false;
+ }
+}
// Emit a simple mangled intrinsic that has 1 argument and a return type
// matching the argument type. Depending on mode, this may be a constrained
// floating-point intrinsic.
@@ -596,7 +613,14 @@ Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction
&CGF,
return CGF.Builder.CreateConstrainedFPCall(F, { Src0 });
} else {
Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
- return CGF.Builder.CreateCall(F, Src0);
+ CallInst *Call = CGF.Builder.CreateCall(F, Src0);
+
+ // Check if the intrinsic is unsafe to contract
+ if (isUnsafeToContract(IntrinsicID, CGF)) {
+ Call->setHasAllowContract(false);
+ }
+
+ return Call;
}
}
diff --git a/clang/test/Headers/__clang_hip_math.hip
b/clang/test/Headers/__clang_hip_math.hip
index 7e2691633c215..aa97fc84f0904 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -3673,31 +3673,31 @@ extern "C" __device__ long long int test_llround(double
x) {
// DEFAULT-LABEL: define dso_local noundef float @test_log10f(
// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// DEFAULT-NEXT: [[ENTRY:.*:]]
-// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log10.f32(float [[X]])
+// DEFAULT-NEXT: [[TMP0:%.*]] = tail call noundef float
@llvm.log10.f32(float [[X]])
// DEFAULT-NEXT: ret float [[TMP0]]
//
// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float
@test_log10f(
// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]])
local_unnamed_addr #[[ATTR3]] {
// FINITEONLY-NEXT: [[ENTRY:.*:]]
-// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef
float @llvm.log10.f32(float nofpclass(nan inf) [[X]])
+// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf noundef float
@llvm.log10.f32(float nofpclass(nan inf) [[X]])
// FINITEONLY-NEXT: ret float [[TMP0]]
//
// APPROX-LABEL: define dso_local noundef float @test_log10f(
// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// APPROX-NEXT: [[ENTRY:.*:]]
-// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log10.f32(float [[X]])
+// APPROX-NEXT: [[TMP0:%.*]] = tail call noundef float
@llvm.log10.f32(float [[X]])
// APPROX-NEXT: ret float [[TMP0]]
//
// NCRDIV-LABEL: define dso_local noundef float @test_log10f(
// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// NCRDIV-NEXT: [[ENTRY:.*:]]
-// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log10.f32(float [[X]])
+// NCRDIV-NEXT: [[TMP0:%.*]] = tail call noundef float
@llvm.log10.f32(float [[X]])
// NCRDIV-NEXT: ret float [[TMP0]]
//
// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_log10f(
// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4)
#[[ATTR3]] {
// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4)
float @llvm.log10.f32(float [[X]])
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef addrspace(4) float
@llvm.log10.f32(float [[X]])
// AMDGCNSPIRV-NEXT: ret float [[TMP0]]
//
extern "C" __device__ float test_log10f(float x) {
@@ -3945,25 +3945,25 @@ extern "C" __device__ double test_logb(double x) {
// DEFAULT-LABEL: define dso_local noundef float @test_logf(
// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// DEFAULT-NEXT: [[ENTRY:.*:]]
-// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log.f32(float [[X]])
+// DEFAULT-NEXT: [[TMP0:%.*]] = tail call noundef float @llvm.log.f32(float
[[X]])
// DEFAULT-NEXT: ret float [[TMP0]]
//
// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float
@test_logf(
// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]])
local_unnamed_addr #[[ATTR3]] {
// FINITEONLY-NEXT: [[ENTRY:.*:]]
-// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef
float @llvm.log.f32(float nofpclass(nan inf) [[X]])
+// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf noundef float
@llvm.log.f32(float nofpclass(nan inf) [[X]])
// FINITEONLY-NEXT: ret float [[TMP0]]
//
// APPROX-LABEL: define dso_local noundef float @test_logf(
// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// APPROX-NEXT: [[ENTRY:.*:]]
-// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log.f32(float [[X]])
+// APPROX-NEXT: [[TMP0:%.*]] = tail call noundef float @llvm.log.f32(float
[[X]])
// APPROX-NEXT: ret float [[TMP0]]
//
// NCRDIV-LABEL: define dso_local noundef float @test_logf(
// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// NCRDIV-NEXT: [[ENTRY:.*:]]
-// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log.f32(float [[X]])
+// NCRDIV-NEXT: [[TMP0:%.*]] = tail call noundef float @llvm.log.f32(float
[[X]])
// NCRDIV-NEXT: ret float [[TMP0]]
//
// AMDGCNSPIRV-LABEL: define spir_func noundef float @test_logf(
@@ -8600,31 +8600,31 @@ extern "C" __device__ float test___fsub_rn(float x,
float y) {
// DEFAULT-LABEL: define dso_local noundef float @test___log10f(
// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// DEFAULT-NEXT: [[ENTRY:.*:]]
-// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log10.f32(float [[X]])
+// DEFAULT-NEXT: [[TMP0:%.*]] = tail call noundef float
@llvm.log10.f32(float [[X]])
// DEFAULT-NEXT: ret float [[TMP0]]
//
// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float
@test___log10f(
// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]])
local_unnamed_addr #[[ATTR3]] {
// FINITEONLY-NEXT: [[ENTRY:.*:]]
-// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef
float @llvm.log10.f32(float nofpclass(nan inf) [[X]])
+// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf noundef float
@llvm.log10.f32(float nofpclass(nan inf) [[X]])
// FINITEONLY-NEXT: ret float [[TMP0]]
//
// APPROX-LABEL: define dso_local noundef float @test___log10f(
// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// APPROX-NEXT: [[ENTRY:.*:]]
-// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log10.f32(float [[X]])
+// APPROX-NEXT: [[TMP0:%.*]] = tail call noundef float
@llvm.log10.f32(float [[X]])
// APPROX-NEXT: ret float [[TMP0]]
//
// NCRDIV-LABEL: define dso_local noundef float @test___log10f(
// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// NCRDIV-NEXT: [[ENTRY:.*:]]
-// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log10.f32(float [[X]])
+// NCRDIV-NEXT: [[TMP0:%.*]] = tail call noundef float
@llvm.log10.f32(float [[X]])
// NCRDIV-NEXT: ret float [[TMP0]]
//
// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___log10f(
// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4)
#[[ATTR3]] {
// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4)
float @llvm.log10.f32(float [[X]])
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef addrspace(4) float
@llvm.log10.f32(float [[X]])
// AMDGCNSPIRV-NEXT: ret float [[TMP0]]
//
extern "C" __device__ float test___log10f(float x) {
@@ -8668,31 +8668,31 @@ extern "C" __device__ float test___log2f(float x) {
// DEFAULT-LABEL: define dso_local noundef float @test___logf(
// DEFAULT-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// DEFAULT-NEXT: [[ENTRY:.*:]]
-// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log.f32(float [[X]])
+// DEFAULT-NEXT: [[TMP0:%.*]] = tail call noundef float @llvm.log.f32(float
[[X]])
// DEFAULT-NEXT: ret float [[TMP0]]
//
// FINITEONLY-LABEL: define dso_local noundef nofpclass(nan inf) float
@test___logf(
// FINITEONLY-SAME: float noundef nofpclass(nan inf) [[X:%.*]])
local_unnamed_addr #[[ATTR3]] {
// FINITEONLY-NEXT: [[ENTRY:.*:]]
-// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef
float @llvm.log.f32(float nofpclass(nan inf) [[X]])
+// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf noundef float
@llvm.log.f32(float nofpclass(nan inf) [[X]])
// FINITEONLY-NEXT: ret float [[TMP0]]
//
// APPROX-LABEL: define dso_local noundef float @test___logf(
// APPROX-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// APPROX-NEXT: [[ENTRY:.*:]]
-// APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log.f32(float [[X]])
+// APPROX-NEXT: [[TMP0:%.*]] = tail call noundef float @llvm.log.f32(float
[[X]])
// APPROX-NEXT: ret float [[TMP0]]
//
// NCRDIV-LABEL: define dso_local noundef float @test___logf(
// NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// NCRDIV-NEXT: [[ENTRY:.*:]]
-// NCRDIV-NEXT: [[TMP0:%.*]] = tail call contract noundef float
@llvm.log.f32(float [[X]])
+// NCRDIV-NEXT: [[TMP0:%.*]] = tail call noundef float @llvm.log.f32(float
[[X]])
// NCRDIV-NEXT: ret float [[TMP0]]
//
// AMDGCNSPIRV-LABEL: define spir_func noundef float @test___logf(
// AMDGCNSPIRV-SAME: float noundef [[X:%.*]]) local_unnamed_addr addrspace(4)
#[[ATTR3]] {
// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4)
float @llvm.log.f32(float [[X]])
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef addrspace(4) float
@llvm.log.f32(float [[X]])
// AMDGCNSPIRV-NEXT: ret float [[TMP0]]
//
extern "C" __device__ float test___logf(float x) {
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits