Author: Oliver Stannard Date: 2024-12-06T08:46:56Z New Revision: f893b475004fdea48288f329124817325e659792
URL: https://github.com/llvm/llvm-project/commit/f893b475004fdea48288f329124817325e659792 DIFF: https://github.com/llvm/llvm-project/commit/f893b475004fdea48288f329124817325e659792.diff LOG: [ARM] Fix instruction selection for MVE vsbciq intrinsic (#118284) There were two bugs in the implementation of the MVE vsbciq (subtract with carry across vector, with initial carry value) intrinsics: * The VSBCI instruction behaves as if the carry-in is always set, but we were selecting it when the carry-in is clear. * The vsbciq intrinsics should generate IR with the carry-in set, but they were leaving it clear. These two bugs almost cancelled each other out, but resulted in incorrect code when the vsbcq intrinsics (with a carry-in) were used, and the carry-in was a compile time constant. Added: Modified: clang/include/clang/Basic/arm_mve.td clang/test/CodeGen/arm-mve-intrinsics/vadc.c llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll Removed: ################################################################################ diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index 93abbc47c54dd5..6dd8c52ddfd772 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -1270,13 +1270,13 @@ defm sqrshr: ScalarSaturatingShiftReg<s32, s64>; def lsll: LongScalarShift<u64, (args s32:$sh), (IRInt<"lsll"> $lo, $hi, $sh)>; def asrl: LongScalarShift<s64, (args s32:$sh), (IRInt<"asrl"> $lo, $hi, $sh)>; -multiclass vadcsbc { +multiclass vadcsbc<dag initial_carry_in> { def q: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry), (seq (IRInt<NAME, [Vector]> $a, $b, (shl (load $carry), 29)):$pair, (store (and 1, (lshr (xval $pair, 1), 29)), $carry), (xval $pair, 0))>; def iq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry), - (seq (IRInt<NAME, [Vector]> $a, $b, 0):$pair, + (seq (IRInt<NAME, [Vector]> $a, $b, initial_carry_in):$pair, (store (and 1, (lshr (xval $pair, 1), 29)), $carry), (xval $pair, 0))>; def q_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b, @@ -1288,13 +1288,13 @@ multiclass vadcsbc { def iq_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Ptr<uint>:$carry, Predicate:$pred), (seq (IRInt<NAME # "_predicated", [Vector, Predicate]> $inactive, $a, $b, - 0, $pred):$pair, + initial_carry_in, $pred):$pair, (store (and 1, (lshr (xval $pair, 1), 29)), $carry), (xval $pair, 0))>; } let params = T.Int32 in { - defm vadc: vadcsbc; - defm vsbc: vadcsbc; + defm vadc: vadcsbc<(u32 0)>; + defm vsbc: vadcsbc<(shl 1, 29)>; } let params = T.Int in { diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c index 21087b83300c88..29e067d65c6c89 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c @@ -92,7 +92,7 @@ int32x4_t test_vadcq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsigne // CHECK-LABEL: @test_vsbciq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1 // CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29 // CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]] @@ -110,7 +110,7 @@ int32x4_t test_vsbciq_s32(int32x4_t a, int32x4_t b, unsigned *carry_out) { // CHECK-LABEL: @test_vsbciq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912) // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1 // CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29 // CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]] @@ -170,7 +170,7 @@ uint32x4_t test_vsbcq_u32(uint32x4_t a, uint32x4_t b, unsigned *carry) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1 // CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29 // CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]] @@ -190,7 +190,7 @@ int32x4_t test_vsbciq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsign // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1 // CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29 // CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]] diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 73ee8cf81adcd6..9714370ec7cc8e 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -5229,7 +5229,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { return; case Intrinsic::arm_mve_vsbc: case Intrinsic::arm_mve_vsbc_predicated: - SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true, + SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, false, IntNo == Intrinsic::arm_mve_vsbc_predicated); return; case Intrinsic::arm_mve_vshlc: diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll index 967e8a94900e16..2b4c478dfe3e6a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll @@ -108,7 +108,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_s32(<4 x i32> %a, <4 x i32> %b, pt ; CHECK-NEXT: str r1, [r0] ; CHECK-NEXT: bx lr entry: - %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0) + %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912) %1 = extractvalue { <4 x i32>, i32 } %0, 1 %2 = lshr i32 %1, 29 %3 = and i32 %2, 1 @@ -125,6 +125,46 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_u32(<4 x i32> %a, <4 x i32> %b, pt ; CHECK-NEXT: ubfx r1, r1, #29, #1 ; CHECK-NEXT: str r1, [r0] ; CHECK-NEXT: bx lr +entry: + %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912) + %1 = extractvalue { <4 x i32>, i32 } %0, 1 + %2 = lshr i32 %1, 29 + %3 = and i32 %2, 1 + store i32 %3, ptr %carry_out, align 4 + %4 = extractvalue { <4 x i32>, i32 } %0, 0 + ret <4 x i32> %4 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_s32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) { +; CHECK-LABEL: test_vsbcq_s32_carry_in_zero: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmsr fpscr_nzcvqc, r1 +; CHECK-NEXT: vsbc.i32 q0, q0, q1 +; CHECK-NEXT: vmrs r1, fpscr_nzcvqc +; CHECK-NEXT: ubfx r1, r1, #29, #1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0) + %1 = extractvalue { <4 x i32>, i32 } %0, 1 + %2 = lshr i32 %1, 29 + %3 = and i32 %2, 1 + store i32 %3, ptr %carry_out, align 4 + %4 = extractvalue { <4 x i32>, i32 } %0, 0 + ret <4 x i32> %4 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_u32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) { +; CHECK-LABEL: test_vsbcq_u32_carry_in_zero: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmsr fpscr_nzcvqc, r1 +; CHECK-NEXT: vsbc.i32 q0, q0, q1 +; CHECK-NEXT: vmrs r1, fpscr_nzcvqc +; CHECK-NEXT: ubfx r1, r1, #29, #1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr entry: %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0) %1 = extractvalue { <4 x i32>, i32 } %0, 1 @@ -196,7 +236,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_s32(<4 x i32> %inactive, <4 x i3 entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) - %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1) + %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1) %3 = extractvalue { <4 x i32>, i32 } %2, 1 %4 = lshr i32 %3, 29 %5 = and i32 %4, 1 @@ -215,6 +255,54 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_u32(<4 x i32> %inactive, <4 x i3 ; CHECK-NEXT: ubfx r1, r1, #29, #1 ; CHECK-NEXT: str r1, [r0] ; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1) + %3 = extractvalue { <4 x i32>, i32 } %2, 1 + %4 = lshr i32 %3, 29 + %5 = and i32 %4, 1 + store i32 %5, ptr %carry_out, align 4 + %6 = extractvalue { <4 x i32>, i32 } %2, 0 + ret <4 x i32> %6 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_s32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) { +; CHECK-LABEL: test_vsbcq_m_s32_carry_in_zero: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vmsr fpscr_nzcvqc, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsbct.i32 q0, q1, q2 +; CHECK-NEXT: vmrs r1, fpscr_nzcvqc +; CHECK-NEXT: ubfx r1, r1, #29, #1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1) + %3 = extractvalue { <4 x i32>, i32 } %2, 1 + %4 = lshr i32 %3, 29 + %5 = and i32 %4, 1 + store i32 %5, ptr %carry_out, align 4 + %6 = extractvalue { <4 x i32>, i32 } %2, 0 + ret <4 x i32> %6 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_u32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) { +; CHECK-LABEL: test_vsbcq_m_u32_carry_in_zero: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vmsr fpscr_nzcvqc, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsbct.i32 q0, q1, q2 +; CHECK-NEXT: vmrs r1, fpscr_nzcvqc +; CHECK-NEXT: ubfx r1, r1, #29, #1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) diff --git a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll index 7fe8e94589a0c5..8a0134a4c5ff29 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll @@ -59,7 +59,7 @@ define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: b use_int32x4_t entry: - %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0) + %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 536870912) %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1 %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0 tail call void @use_int32x4_t(<4 x i32> %result_low) _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits