[clang] [llvm] [X86] - Prevent the wrong fold of x86_avx512_mask_cmp_ss/sd to fcmp (PR #202321)

Rohit Aggarwal via cfe-commits Thu, 11 Jun 2026 06:28:51 -0700

https://github.com/rohitaggarwal007 updated 
https://github.com/llvm/llvm-project/pull/202321


>From 31a1e797a31c7770fdb9224575e13aaf45564a35 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <[email protected]>
Date: Fri, 5 Jun 2026 11:01:35 +0530
Subject: [PATCH 1/4] [Testcase][No Real issue]
 058-mask-cmp-ss-imm-immediate-not-validated

---
 .../Target/X86/X86InstCombineIntrinsic.cpp    | 11 ++++
 .../Transforms/InstCombine/X86/x86-avx512.ll  | 61 +++++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp 
b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 932b4a416a8d3..613f614bd8756 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2424,6 +2424,17 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, 
IntrinsicInst &II) const {
   case Intrinsic::x86_avx512_mask_cmp_sd: {
     // These intrinsics only demand the 0th element of their input vectors. If
     // we can simplify the input based on that, do so now.
+    //
+    // NOTE: Only operands 0 and 1 (the scalar-as-vector FP inputs) may be
+    // touched here. For the mask.cmp.ss/sd intrinsics, operand 2 is the
+    // comparison predicate (0..31) and operand 4 is the SAE/embedded-rounding
+    // control; both encode semantics that a plain fcmp cannot represent
+    // (signaling-vs-quiet predicates and FP-exception suppression). Do NOT add
+    // a fold that lowers these to fcmp/select unless it first proves the
+    // predicate is an SSE-compatible *quiet* form and the SAE operand is the
+    // exception-enabled default (4 == _MM_FROUND_CUR_DIRECTION); otherwise the
+    // signaling/QNaN-trap behavior would be silently dropped. The comi/ucomi
+    // intrinsics share this case but have only the two FP operands.
     bool MadeChange = false;
     Value *Arg0 = II.getArgOperand(0);
     Value *Arg1 = II.getArgOperand(1);
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll 
b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
index d89cf6b0bb986..0d35a89f5d2a1 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
@@ -802,6 +802,67 @@ define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, 
i8 %mask) {
   ret i8 %3
 }
 
+; The mask.cmp.ss/sd predicate (arg2) and SAE/rounding (arg4) immediates carry
+; signaling-vs-quiet and exception-suppression semantics that a plain fcmp 
cannot
+; represent. InstCombine may only simplify the demanded (lane 0) operands; it 
must
+; never fold these intrinsics to fcmp, and must preserve arg2/arg4 verbatim.
+
+; predicate 20 = _CMP_NEQ_US (unordered, *signaling*) - aliases fcmp une but 
traps.
+define i8 @test_cmp_ss_signaling_pred(<4 x float> %a, <4 x float> %b, i8 
%mask) {
+; CHECK-LABEL: @test_cmp_ss_signaling_pred(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x 
float> [[A:%.*]], <4 x float> [[B:%.*]], i32 20, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %3, <4 x float> 
%6, i32 20, i8 %mask, i32 4)
+  ret i8 %7
+}
+
+; SAE = 8 = _MM_FROUND_NO_EXC (suppress FP exceptions) - cannot be modeled by 
fcmp.
+define i8 @test_cmp_ss_sae(<4 x float> %a, <4 x float> %b, i8 %mask) {
+; CHECK-LABEL: @test_cmp_ss_sae(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x 
float> [[A:%.*]], <4 x float> [[B:%.*]], i32 4, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %3, <4 x float> 
%6, i32 4, i8 %mask, i32 8)
+  ret i8 %7
+}
+
+; Lane 0 is a QNaN constant with predicate 4 (_CMP_NEQ_UQ) and SAE = 8. The
+; intrinsic must survive unchanged (not be folded to an fcmp that ignores the
+; QNaN class / SAE suppression).
+define i8 @test_cmp_ss_qnan_lane0(<4 x float> %b, i8 %mask) {
+; CHECK-LABEL: @test_cmp_ss_qnan_lane0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x 
float> <float +qnan, float poison, float poison, float poison>, <4 x float> 
[[B:%.*]], i32 4, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> <float 
0x7FF8000000000000, float 1.000000e+00, float 2.000000e+00, float 
3.000000e+00>, <4 x float> %b, i32 4, i8 %mask, i32 8)
+  ret i8 %1
+}
+
+; sd variant: signaling predicate 20 + SAE 8 must both be preserved.
+define i8 @test_cmp_sd_signaling_sae(<2 x double> %a, <2 x double> %b, i8 
%mask) {
+; CHECK-LABEL: @test_cmp_sd_signaling_sae(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x 
double> [[A:%.*]], <2 x double> [[B:%.*]], i32 20, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %3 = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %1, <2 x double> 
%2, i32 20, i8 %mask, i32 8)
+  ret i8 %3
+}
+
 define i64 @test(float %f, double %d) {
 ;
 ; CHECK-LABEL: @test(

>From dc9e272124385a379d7dd6c7b92f682445950e85 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <[email protected]>
Date: Mon, 8 Jun 2026 16:38:43 +0530
Subject: [PATCH 2/4] [X86] - Add negative test case

---
 .../Transforms/InstCombine/X86/x86-avx512.ll    | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll 
b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
index 0d35a89f5d2a1..56cdd11d229dc 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
@@ -863,6 +863,23 @@ define i8 @test_cmp_sd_signaling_sae(<2 x double> %a, <2 x 
double> %b, i8 %mask)
   ret i8 %3
 }
 
+; Explicit negative test: even the most fcmp-like form (predicate 0 = 
_CMP_EQ_OQ,
+; ordered + quiet; rounding 4 = _MM_FROUND_CUR_DIRECTION, i.e. no SAE) must 
NOT be
+; folded to an fcmp. The intrinsic must survive untouched. This function's 
checks
+; are maintained by hand (CHECK-NOT is not emitted by update_test_checks.py); 
do
+; not regenerate it.
+define i8 @test_cmp_ss_not_folded_to_fcmp(<4 x float> %a, <4 x float> %b, i8 
%mask) {
+; CHECK-LABEL: @test_cmp_ss_not_folded_to_fcmp(
+; CHECK-NOT:   fcmp
+; CHECK:       tail call i8 @llvm.x86.avx512.mask.cmp.ss({{.*}}, i32 0, i8 
{{.*}}, i32 4)
+; CHECK-NOT:   fcmp
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %b, float 2.000000e+00, i32 1
+  %3 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %1, <4 x float> 
%2, i32 0, i8 %mask, i32 4)
+  ret i8 %3
+}
+
 define i64 @test(float %f, double %d) {
 ;
 ; CHECK-LABEL: @test(

>From b9bcd60e71fe1b12e7be73a32c4aaaf98da4d580 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <[email protected]>
Date: Thu, 11 Jun 2026 15:05:47 +0530
Subject: [PATCH 3/4] [X86] Move mask cmp ss/sd predicate/SAE coverage to the
 Clang front end

Per review, the scalar masked-compare test coverage should live in the Clang
front-end CodeGen tests, mirroring how the packed vector compares are validated
in clang/lib/CodeGen/TargetBuiltins/X86.cpp.

- avx512f-builtins.c: strengthen the existing _mm_cmp_{ss,sd}_mask /
  _mm_cmp_round_{ss,sd}_mask tests to pin the exact predicate, mask and SAE
  immediates and assert (CHECK-NOT: fcmp) that they are never lowered to fcmp.
  Add per-predicate quiet-vs-signaling cases (_CMP_NEQ_UQ=4 vs _CMP_NEQ_US=20)
  showing the distinct immediates are preserved instead of collapsing to a
  single fcmp predicate, plus an ordered-quiet + SAE case.
- Revert the InstCombine IR-level regression tests and the source NOTE comment
  added previously; the behavior is now demonstrated at the front end.
---
 clang/test/CodeGen/X86/avx512f-builtins.c     | 72 +++++++++++++++--
 .../Target/X86/X86InstCombineIntrinsic.cpp    | 11 ---
 .../Transforms/InstCombine/X86/x86-avx512.ll  | 78 -------------------
 3 files changed, 64 insertions(+), 97 deletions(-)

diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c 
b/clang/test/CodeGen/X86/avx512f-builtins.c
index 249a917a00461..7d8ed733f2e4f 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -9803,52 +9803,108 @@ 
TEST_CONSTEXPR(match_v16si(_mm512_maskz_compress_epi32(0xA635, (__m512i)(__v16si
 
 __mmask8 test_mm_cmp_round_ss_mask(__m128 __X, __m128 __Y) {
   // CHECK-LABEL: test_mm_cmp_round_ss_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
+  // The scalar masked compare keeps its predicate (_CMP_NLT_US = 5) and SAE
+  // (_MM_FROUND_NO_EXC = 8) immediates and must NOT be lowered to fcmp.
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 -1, i32 8)
+  // CHECK-NOT: fcmp
   return _mm_cmp_round_ss_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
 __mmask8 test_mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) {
   // CHECK-LABEL: test_mm_mask_cmp_round_ss_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 %{{.*}}, i32 8)
+  // CHECK-NOT: fcmp
   return _mm_mask_cmp_round_ss_mask(__M, __X, __Y, _CMP_NLT_US, 
_MM_FROUND_NO_EXC);
 }
 
 __mmask8 test_mm_cmp_ss_mask(__m128 __X, __m128 __Y) {
   // CHECK-LABEL: test_mm_cmp_ss_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
+  // No rounding control: SAE defaults to _MM_FROUND_CUR_DIRECTION = 4.
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 -1, i32 4)
+  // CHECK-NOT: fcmp
   return _mm_cmp_ss_mask(__X, __Y, _CMP_NLT_US);
 }
 
 __mmask8 test_mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) {
   // CHECK-LABEL: test_mm_mask_cmp_ss_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 %{{.*}}, i32 4)
+  // CHECK-NOT: fcmp
   return _mm_mask_cmp_ss_mask(__M, __X, __Y, _CMP_NLT_US);
 }
 
 __mmask8 test_mm_cmp_round_sd_mask(__m128d __X, __m128d __Y) {
   // CHECK-LABEL: test_mm_cmp_round_sd_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 -1, i32 8)
+  // CHECK-NOT: fcmp
   return _mm_cmp_round_sd_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
 __mmask8 test_mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X, __m128d 
__Y) {
   // CHECK-LABEL: test_mm_mask_cmp_round_sd_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 %{{.*}}, i32 8)
+  // CHECK-NOT: fcmp
   return _mm_mask_cmp_round_sd_mask(__M, __X, __Y, _CMP_NLT_US, 
_MM_FROUND_NO_EXC);
 }
 
 __mmask8 test_mm_cmp_sd_mask(__m128d __X, __m128d __Y) {
   // CHECK-LABEL: test_mm_cmp_sd_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 -1, i32 4)
+  // CHECK-NOT: fcmp
   return _mm_cmp_sd_mask(__X, __Y, _CMP_NLT_US);
 }
 
 __mmask8 test_mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y) {
   // CHECK-LABEL: test_mm_mask_cmp_sd_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 %{{.*}}, i32 4)
+  // CHECK-NOT: fcmp
   return _mm_mask_cmp_sd_mask(__M, __X, __Y, _CMP_NLT_US);
 }
 
+// The scalar masked compares must preserve the *distinct* signaling-vs-quiet
+// predicate immediate and must never be folded to a single fcmp (unlike the
+// packed _mm512_cmp_ps_mask family above, which is lowered to fcmp). These
+// mirror the per-predicate vector-cmp tests but assert the intrinsic survives.
+
+// _CMP_NEQ_UQ (4) is the *quiet* unordered-not-equal predicate.
+__mmask8 test_mm_cmp_ss_mask_neq_uq(__m128 __X, __m128 __Y) {
+  // CHECK-LABEL: test_mm_cmp_ss_mask_neq_uq
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 4, i8 -1, i32 4)
+  // CHECK-NOT: fcmp
+  return _mm_cmp_ss_mask(__X, __Y, _CMP_NEQ_UQ);
+}
+
+// _CMP_NEQ_US (20) is the *signaling* counterpart of _CMP_NEQ_UQ. A plain fcmp
+// would collapse both onto "une"; the intrinsic keeps them distinct (4 vs 20).
+__mmask8 test_mm_cmp_ss_mask_neq_us(__m128 __X, __m128 __Y) {
+  // CHECK-LABEL: test_mm_cmp_ss_mask_neq_us
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 20, i8 -1, i32 4)
+  // CHECK-NOT: fcmp
+  return _mm_cmp_ss_mask(__X, __Y, _CMP_NEQ_US);
+}
+
+// _CMP_EQ_OQ (0) + _MM_FROUND_NO_EXC (8): even the most fcmp-like (ordered,
+// quiet) predicate with SAE must stay an intrinsic, preserving the SAE bit.
+__mmask8 test_mm_cmp_round_ss_mask_eq_oq_sae(__m128 __X, __m128 __Y) {
+  // CHECK-LABEL: test_mm_cmp_round_ss_mask_eq_oq_sae
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 0, i8 -1, i32 8)
+  // CHECK-NOT: fcmp
+  return _mm_cmp_round_ss_mask(__X, __Y, _CMP_EQ_OQ, _MM_FROUND_NO_EXC);
+}
+
+__mmask8 test_mm_cmp_sd_mask_neq_uq(__m128d __X, __m128d __Y) {
+  // CHECK-LABEL: test_mm_cmp_sd_mask_neq_uq
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 4, i8 -1, i32 4)
+  // CHECK-NOT: fcmp
+  return _mm_cmp_sd_mask(__X, __Y, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm_cmp_sd_mask_neq_us(__m128d __X, __m128d __Y) {
+  // CHECK-LABEL: test_mm_cmp_sd_mask_neq_us
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 20, i8 -1, i32 4)
+  // CHECK-NOT: fcmp
+  return _mm_cmp_sd_mask(__X, __Y, _CMP_NEQ_US);
+}
+
 __m512 test_mm512_movehdup_ps(__m512 __A) {
   // CHECK-LABEL: test_mm512_movehdup_ps
   // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 
11, i32 11, i32 13, i32 13, i32 15, i32 15>
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp 
b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 613f614bd8756..932b4a416a8d3 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2424,17 +2424,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, 
IntrinsicInst &II) const {
   case Intrinsic::x86_avx512_mask_cmp_sd: {
     // These intrinsics only demand the 0th element of their input vectors. If
     // we can simplify the input based on that, do so now.
-    //
-    // NOTE: Only operands 0 and 1 (the scalar-as-vector FP inputs) may be
-    // touched here. For the mask.cmp.ss/sd intrinsics, operand 2 is the
-    // comparison predicate (0..31) and operand 4 is the SAE/embedded-rounding
-    // control; both encode semantics that a plain fcmp cannot represent
-    // (signaling-vs-quiet predicates and FP-exception suppression). Do NOT add
-    // a fold that lowers these to fcmp/select unless it first proves the
-    // predicate is an SSE-compatible *quiet* form and the SAE operand is the
-    // exception-enabled default (4 == _MM_FROUND_CUR_DIRECTION); otherwise the
-    // signaling/QNaN-trap behavior would be silently dropped. The comi/ucomi
-    // intrinsics share this case but have only the two FP operands.
     bool MadeChange = false;
     Value *Arg0 = II.getArgOperand(0);
     Value *Arg1 = II.getArgOperand(1);
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll 
b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
index 56cdd11d229dc..d89cf6b0bb986 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
@@ -802,84 +802,6 @@ define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, 
i8 %mask) {
   ret i8 %3
 }
 
-; The mask.cmp.ss/sd predicate (arg2) and SAE/rounding (arg4) immediates carry
-; signaling-vs-quiet and exception-suppression semantics that a plain fcmp 
cannot
-; represent. InstCombine may only simplify the demanded (lane 0) operands; it 
must
-; never fold these intrinsics to fcmp, and must preserve arg2/arg4 verbatim.
-
-; predicate 20 = _CMP_NEQ_US (unordered, *signaling*) - aliases fcmp une but 
traps.
-define i8 @test_cmp_ss_signaling_pred(<4 x float> %a, <4 x float> %b, i8 
%mask) {
-; CHECK-LABEL: @test_cmp_ss_signaling_pred(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x 
float> [[A:%.*]], <4 x float> [[B:%.*]], i32 20, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT:    ret i8 [[TMP1]]
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
-  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
-  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
-  %7 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %3, <4 x float> 
%6, i32 20, i8 %mask, i32 4)
-  ret i8 %7
-}
-
-; SAE = 8 = _MM_FROUND_NO_EXC (suppress FP exceptions) - cannot be modeled by 
fcmp.
-define i8 @test_cmp_ss_sae(<4 x float> %a, <4 x float> %b, i8 %mask) {
-; CHECK-LABEL: @test_cmp_ss_sae(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x 
float> [[A:%.*]], <4 x float> [[B:%.*]], i32 4, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret i8 [[TMP1]]
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
-  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
-  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
-  %7 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %3, <4 x float> 
%6, i32 4, i8 %mask, i32 8)
-  ret i8 %7
-}
-
-; Lane 0 is a QNaN constant with predicate 4 (_CMP_NEQ_UQ) and SAE = 8. The
-; intrinsic must survive unchanged (not be folded to an fcmp that ignores the
-; QNaN class / SAE suppression).
-define i8 @test_cmp_ss_qnan_lane0(<4 x float> %b, i8 %mask) {
-; CHECK-LABEL: @test_cmp_ss_qnan_lane0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x 
float> <float +qnan, float poison, float poison, float poison>, <4 x float> 
[[B:%.*]], i32 4, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret i8 [[TMP1]]
-;
-  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> <float 
0x7FF8000000000000, float 1.000000e+00, float 2.000000e+00, float 
3.000000e+00>, <4 x float> %b, i32 4, i8 %mask, i32 8)
-  ret i8 %1
-}
-
-; sd variant: signaling predicate 20 + SAE 8 must both be preserved.
-define i8 @test_cmp_sd_signaling_sae(<2 x double> %a, <2 x double> %b, i8 
%mask) {
-; CHECK-LABEL: @test_cmp_sd_signaling_sae(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x 
double> [[A:%.*]], <2 x double> [[B:%.*]], i32 20, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret i8 [[TMP1]]
-;
-  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
-  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
-  %3 = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %1, <2 x double> 
%2, i32 20, i8 %mask, i32 8)
-  ret i8 %3
-}
-
-; Explicit negative test: even the most fcmp-like form (predicate 0 = 
_CMP_EQ_OQ,
-; ordered + quiet; rounding 4 = _MM_FROUND_CUR_DIRECTION, i.e. no SAE) must 
NOT be
-; folded to an fcmp. The intrinsic must survive untouched. This function's 
checks
-; are maintained by hand (CHECK-NOT is not emitted by update_test_checks.py); 
do
-; not regenerate it.
-define i8 @test_cmp_ss_not_folded_to_fcmp(<4 x float> %a, <4 x float> %b, i8 
%mask) {
-; CHECK-LABEL: @test_cmp_ss_not_folded_to_fcmp(
-; CHECK-NOT:   fcmp
-; CHECK:       tail call i8 @llvm.x86.avx512.mask.cmp.ss({{.*}}, i32 0, i8 
{{.*}}, i32 4)
-; CHECK-NOT:   fcmp
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %b, float 2.000000e+00, i32 1
-  %3 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %1, <4 x float> 
%2, i32 0, i8 %mask, i32 4)
-  ret i8 %3
-}
-
 define i64 @test(float %f, double %d) {
 ;
 ; CHECK-LABEL: @test(

>From 65d32c8a97eca592db1ec8116d2e7bc9740de22f Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <[email protected]>
Date: Thu, 11 Jun 2026 18:01:01 +0530
Subject: [PATCH 4/4] [X86] Move scalar mask cmp ss/sd predicate/SAE coverage
 under strict FP

Per review: asserting that the scalar masked compares preserve their predicate
and SAE/rounding immediates (and are not turned into fcmp) is only a meaningful
requirement under -ffp-exception-behavior=strict. Under the default FP model the
compiler is allowed to ignore signaling behavior and SAE, and more generally the
X86 target compare intrinsics do not model the strict FP environment.

- avx512f-builtins.c: revert the default-FP changes (the strengthened immediate
  checks, the CHECK-NOT: fcmp assertions and the extra signaling-vs-quiet
  functions); restore the original weak intrinsic checks.
- avx512f-builtins-constrained-cmp.c: add the scalar _mm_cmp_{ss,sd}_mask /
  _mm_cmp_round_{ss,sd}_mask tests here, where -ffp-exception-behavior=strict is
  in effect, pinning the predicate (_CMP_NLT_US = 5) and SAE immediates
  (_MM_FROUND_NO_EXC = 8, otherwise _MM_FROUND_CUR_DIRECTION = 4), mirroring how
  the packed compares are covered.
---
 .../X86/avx512f-builtins-constrained-cmp.c    | 54 ++++++++++++++
 clang/test/CodeGen/X86/avx512f-builtins.c     | 72 +++----------------
 2 files changed, 62 insertions(+), 64 deletions(-)

diff --git a/clang/test/CodeGen/X86/avx512f-builtins-constrained-cmp.c 
b/clang/test/CodeGen/X86/avx512f-builtins-constrained-cmp.c
index b482466f05d58..b2e58a8baeed3 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins-constrained-cmp.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins-constrained-cmp.c
@@ -793,3 +793,57 @@ __mmask8 test_mm512_mask_cmp_pd_mask_true_us(__mmask8 m, 
__m512d a, __m512d b) {
   // CHECK: call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> 
%{{.*}}, <8 x double> %{{.*}}, i32 31, <8 x i1> {{.*}}, i32 4)
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_US);
 }
+
+// The scalar masked compares are always emitted as the target intrinsic (there
+// is no fcmp lowering for them). Under strict FP the predicate (e.g.
+// _CMP_NLT_US = 5, a signaling predicate) and the SAE/rounding control
+// (_MM_FROUND_NO_EXC = 8, otherwise _MM_FROUND_CUR_DIRECTION = 4) must be
+// preserved verbatim, mirroring the packed compares above.
+
+__mmask8 test_mm_cmp_round_ss_mask(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_round_ss_mask
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 -1, i32 8)
+  return _mm_cmp_round_ss_mask(a, b, _CMP_NLT_US, _MM_FROUND_NO_EXC);
+}
+
+__mmask8 test_mm_mask_cmp_round_ss_mask(__mmask8 m, __m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_mask_cmp_round_ss_mask
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 %{{.*}}, i32 8)
+  return _mm_mask_cmp_round_ss_mask(m, a, b, _CMP_NLT_US, _MM_FROUND_NO_EXC);
+}
+
+__mmask8 test_mm_cmp_ss_mask(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ss_mask
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 -1, i32 4)
+  return _mm_cmp_ss_mask(a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm_mask_cmp_ss_mask(__mmask8 m, __m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_mask_cmp_ss_mask
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 %{{.*}}, i32 4)
+  return _mm_mask_cmp_ss_mask(m, a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm_cmp_round_sd_mask(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_round_sd_mask
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 -1, i32 8)
+  return _mm_cmp_round_sd_mask(a, b, _CMP_NLT_US, _MM_FROUND_NO_EXC);
+}
+
+__mmask8 test_mm_mask_cmp_round_sd_mask(__mmask8 m, __m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_mask_cmp_round_sd_mask
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 %{{.*}}, i32 8)
+  return _mm_mask_cmp_round_sd_mask(m, a, b, _CMP_NLT_US, _MM_FROUND_NO_EXC);
+}
+
+__mmask8 test_mm_cmp_sd_mask(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_sd_mask
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 -1, i32 4)
+  return _mm_cmp_sd_mask(a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm_mask_cmp_sd_mask(__mmask8 m, __m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_mask_cmp_sd_mask
+  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 %{{.*}}, i32 4)
+  return _mm_mask_cmp_sd_mask(m, a, b, _CMP_NLT_US);
+}
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c 
b/clang/test/CodeGen/X86/avx512f-builtins.c
index 7d8ed733f2e4f..249a917a00461 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -9803,108 +9803,52 @@ 
TEST_CONSTEXPR(match_v16si(_mm512_maskz_compress_epi32(0xA635, (__m512i)(__v16si
 
 __mmask8 test_mm_cmp_round_ss_mask(__m128 __X, __m128 __Y) {
   // CHECK-LABEL: test_mm_cmp_round_ss_mask
-  // The scalar masked compare keeps its predicate (_CMP_NLT_US = 5) and SAE
-  // (_MM_FROUND_NO_EXC = 8) immediates and must NOT be lowered to fcmp.
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 -1, i32 8)
-  // CHECK-NOT: fcmp
+  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_cmp_round_ss_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
 __mmask8 test_mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) {
   // CHECK-LABEL: test_mm_mask_cmp_round_ss_mask
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 %{{.*}}, i32 8)
-  // CHECK-NOT: fcmp
+  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_mask_cmp_round_ss_mask(__M, __X, __Y, _CMP_NLT_US, 
_MM_FROUND_NO_EXC);
 }
 
 __mmask8 test_mm_cmp_ss_mask(__m128 __X, __m128 __Y) {
   // CHECK-LABEL: test_mm_cmp_ss_mask
-  // No rounding control: SAE defaults to _MM_FROUND_CUR_DIRECTION = 4.
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 -1, i32 4)
-  // CHECK-NOT: fcmp
+  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_cmp_ss_mask(__X, __Y, _CMP_NLT_US);
 }
 
 __mmask8 test_mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) {
   // CHECK-LABEL: test_mm_mask_cmp_ss_mask
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 5, i8 %{{.*}}, i32 4)
-  // CHECK-NOT: fcmp
+  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_mask_cmp_ss_mask(__M, __X, __Y, _CMP_NLT_US);
 }
 
 __mmask8 test_mm_cmp_round_sd_mask(__m128d __X, __m128d __Y) {
   // CHECK-LABEL: test_mm_cmp_round_sd_mask
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 -1, i32 8)
-  // CHECK-NOT: fcmp
+  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_cmp_round_sd_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
 __mmask8 test_mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X, __m128d 
__Y) {
   // CHECK-LABEL: test_mm_mask_cmp_round_sd_mask
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 %{{.*}}, i32 8)
-  // CHECK-NOT: fcmp
+  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_mask_cmp_round_sd_mask(__M, __X, __Y, _CMP_NLT_US, 
_MM_FROUND_NO_EXC);
 }
 
 __mmask8 test_mm_cmp_sd_mask(__m128d __X, __m128d __Y) {
   // CHECK-LABEL: test_mm_cmp_sd_mask
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 -1, i32 4)
-  // CHECK-NOT: fcmp
+  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_cmp_sd_mask(__X, __Y, _CMP_NLT_US);
 }
 
 __mmask8 test_mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y) {
   // CHECK-LABEL: test_mm_mask_cmp_sd_mask
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 5, i8 %{{.*}}, i32 4)
-  // CHECK-NOT: fcmp
+  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_mask_cmp_sd_mask(__M, __X, __Y, _CMP_NLT_US);
 }
 
-// The scalar masked compares must preserve the *distinct* signaling-vs-quiet
-// predicate immediate and must never be folded to a single fcmp (unlike the
-// packed _mm512_cmp_ps_mask family above, which is lowered to fcmp). These
-// mirror the per-predicate vector-cmp tests but assert the intrinsic survives.
-
-// _CMP_NEQ_UQ (4) is the *quiet* unordered-not-equal predicate.
-__mmask8 test_mm_cmp_ss_mask_neq_uq(__m128 __X, __m128 __Y) {
-  // CHECK-LABEL: test_mm_cmp_ss_mask_neq_uq
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 4, i8 -1, i32 4)
-  // CHECK-NOT: fcmp
-  return _mm_cmp_ss_mask(__X, __Y, _CMP_NEQ_UQ);
-}
-
-// _CMP_NEQ_US (20) is the *signaling* counterpart of _CMP_NEQ_UQ. A plain fcmp
-// would collapse both onto "une"; the intrinsic keeps them distinct (4 vs 20).
-__mmask8 test_mm_cmp_ss_mask_neq_us(__m128 __X, __m128 __Y) {
-  // CHECK-LABEL: test_mm_cmp_ss_mask_neq_us
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 20, i8 -1, i32 4)
-  // CHECK-NOT: fcmp
-  return _mm_cmp_ss_mask(__X, __Y, _CMP_NEQ_US);
-}
-
-// _CMP_EQ_OQ (0) + _MM_FROUND_NO_EXC (8): even the most fcmp-like (ordered,
-// quiet) predicate with SAE must stay an intrinsic, preserving the SAE bit.
-__mmask8 test_mm_cmp_round_ss_mask_eq_oq_sae(__m128 __X, __m128 __Y) {
-  // CHECK-LABEL: test_mm_cmp_round_ss_mask_eq_oq_sae
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %{{.*}}, <4 x 
float> %{{.*}}, i32 0, i8 -1, i32 8)
-  // CHECK-NOT: fcmp
-  return _mm_cmp_round_ss_mask(__X, __Y, _CMP_EQ_OQ, _MM_FROUND_NO_EXC);
-}
-
-__mmask8 test_mm_cmp_sd_mask_neq_uq(__m128d __X, __m128d __Y) {
-  // CHECK-LABEL: test_mm_cmp_sd_mask_neq_uq
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 4, i8 -1, i32 4)
-  // CHECK-NOT: fcmp
-  return _mm_cmp_sd_mask(__X, __Y, _CMP_NEQ_UQ);
-}
-
-__mmask8 test_mm_cmp_sd_mask_neq_us(__m128d __X, __m128d __Y) {
-  // CHECK-LABEL: test_mm_cmp_sd_mask_neq_us
-  // CHECK: call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %{{.*}}, <2 x 
double> %{{.*}}, i32 20, i8 -1, i32 4)
-  // CHECK-NOT: fcmp
-  return _mm_cmp_sd_mask(__X, __Y, _CMP_NEQ_US);
-}
-
 __m512 test_mm512_movehdup_ps(__m512 __A) {
   // CHECK-LABEL: test_mm512_movehdup_ps
   // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 
11, i32 11, i32 13, i32 13, i32 15, i32 15>

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [X86] - Prevent the wrong fold of x86_avx512_mask_cmp_ss/sd to fcmp (PR #202321)

Reply via email to