Torbjörn,
I reproduced this on x86-64 (with -m32) and it is just a type mismatch: the test uses unsigned long as its wide type, which is no wider than unsigned int on ILP32.
Could you confirm that the following fully addresses this for you as well?
Thanks,
Philipp.
diff --git a/gcc/testsuite/gcc.dg/pr124545-2.c b/gcc/testsuite/gcc.dg/pr124545-2.c
index b4806567acce..990f509d3490 100644
--- a/gcc/testsuite/gcc.dg/pr124545-2.c
+++ b/gcc/testsuite/gcc.dg/pr124545-2.c
@@ -4,7 +4,9 @@
computed value. In particular it must NOT fire when CST is not
representable in the inner type (which would silently drop the bits
above the inner precision), and it must stay correct for unsigned
- inner types where the narrow operation wraps. */
+ inner types where the narrow operation wraps. Uses __UINT{32,64}_TYPE__
+ rather than unsigned {int,long} so that the narrow-vs-wide contrast is
+ independent of ILP32 vs LP64. */
/* { dg-do run } */
/* { dg-options "-O2" } */
@@ -13,23 +15,23 @@
__attribute__((noipa)) int
oor_eq (int a)
{
- return ((unsigned long long) a + 0x100000000ULL) == (unsigned long long) a;
+ return ((__UINT64_TYPE__) a + 0x100000000ULL) == (__UINT64_TYPE__) a;
}
-__attribute__((noipa)) unsigned long long
+__attribute__((noipa)) __UINT64_TYPE__
oor_val (int a)
{
- return (unsigned long long) a + 0x100000000ULL;
+ return (__UINT64_TYPE__) a + 0x100000000ULL;
}
/* Unsigned inner: narrow add wraps mod 2^32; the widened add does not.
The result must match the wide arithmetic for every input. */
__attribute__((noipa)) int
-uns_carry (unsigned int a)
+uns_carry (__UINT32_TYPE__ a)
{
- unsigned int t = a + 100u;
- unsigned long w = (unsigned long) a + 100;
- return w == (unsigned long) t;
+ __UINT32_TYPE__ t = a + 100u;
+ __UINT64_TYPE__ w = (__UINT64_TYPE__) a + 100;
+ return w == (__UINT64_TYPE__) t;
}
On Fri, 3 Jul 2026 at 20:12, Philipp Tomsich <[email protected]> wrote:
>
> Torbjörn,
>
> The test (as written today) doesn't really make sense on ILP32 (where
> sizeof(int) == sizeof(long)).
> We'll look into whether to disable (gate on LP64) or to explicitly use
> unsigned long long.
>
> Thanks for the report,
> Philipp.
>
>
> On Fri, 3 Jul 2026 at 20:00, Torbjorn SVENSSON
> <[email protected]> wrote:
> >
> > Hi,
> >
> > The gcc.dg/pr124545-2.c test does not work for arm-none-eabi.
> > > Is this supposed to work or is it missing some dg-require-effective-target?
> >
> > Testing gcc.dg/pr124545-2.c
> > doing compile
> > Executing on host: /build/r17-2109-g2b8f4671103159/bin/arm-none-eabi-gcc
> > /build/gcc_src/gcc/testsuite/gcc.dg/pr124545-2.c -mthumb
> > -march=armv7ve+neon -mtune=cortex-a7 -mfloat-abi=hard -mfpu=auto
> > -dumpbase "" -fdiagnostics-plain-output -O2 --specs=rdimon.specs
> > -Wl,--start-group -lc -lm -Wl,--end-group --specs=nosys.specs
> > -Wl,--allow-multiple-definition -Wl,-u,_isatty,-u,_fstat -Wl,-wrap,exit
> > -Wl,-wrap,_exit -Wl,-wrap,main -Wl,-wrap,abort -Wl,gcc_tg.o -lm -o
> > ./pr124545-2.exe (timeout = 800)
> > spawn -ignore SIGHUP /build/r17-2109-g2b8f4671103159/bin/arm-none-eabi-gcc
> > /build/gcc_src/gcc/testsuite/gcc.dg/pr124545-2.c -mthumb
> > -march=armv7ve+neon -mtune=cortex-a7 -mfloat-abi=hard -mfpu=auto -dumpbase
> > -fdiagnostics-plain-output -O2 --specs=rdimon.specs -Wl,--start-group -lc
> > -lm -Wl,--end-group --specs=nosys.specs -Wl,--allow-multiple-definition
> > -Wl,-u,_isatty,-u,_fstat -Wl,-wrap,exit -Wl,-wrap,_exit -Wl,-wrap,main
> > -Wl,-wrap,abort -Wl,gcc_tg.o -lm -o ./pr124545-2.exe
> > pid is 165557 -165557
> > pid is -1
> > output is status 0
> > PASS: gcc.dg/pr124545-2.c (test for excess errors)
> > spawning command qemu-system-arm -nographic -machine virt -cpu cortex-a7
> > -m 256 -semihosting -monitor /dev/null -kernel ./pr124545-2.exe
> > spawn qemu-system-arm -nographic -machine virt -cpu cortex-a7 -m 256
> > -semihosting -monitor /dev/null -kernel ./pr124545-2.exe
> >
> > *** EXIT code 4242
> >
> > *** EXIT code 1
> > pid is -1
> > Shell closed.
> > Output is
> > *** EXIT code 4242
> >
> > *** EXIT code 1
> >
> > FAIL: gcc.dg/pr124545-2.c execution test
> >
> >
> >
> > This is the assembly:
> > $ /build/r17-2109-g2b8f4671103159/bin/arm-none-eabi-gcc
> > /build/gcc_src/gcc/testsuite/gcc.dg/pr124545-2.c -mthumb
> > -march=armv7ve+neon -mtune=cortex-a7 -mfloat-abi=hard -mfpu=auto -O2 -dp -S
> > -o -
> > .arch armv7-a
> > .arch_extension virt
> > .arch_extension idiv
> > .arch_extension sec
> > .arch_extension mp
> > .fpu neon
> > .eabi_attribute 28, 1
> > .eabi_attribute 20, 1
> > .eabi_attribute 21, 1
> > .eabi_attribute 23, 3
> > .eabi_attribute 24, 1
> > .eabi_attribute 25, 1
> > .eabi_attribute 26, 1
> > .eabi_attribute 30, 2
> > .eabi_attribute 34, 1
> > .eabi_attribute 18, 4
> > .file "pr124545-2.c"
> > .text
> > .align 1
> > .p2align 2,,3
> > .global oor_eq
> > .syntax unified
> > .thumb
> > .thumb_func
> > .type oor_eq, %function
> > oor_eq:
> > @ args = 0, pretend = 0, frame = 0
> > @ frame_needed = 0, uses_anonymous_args = 0
> > @ link register save eliminated.
> > movs r0, #0 @ 10 [c=4 l=2] *thumb2_movsi_shortim
> > bx lr @ 17 [c=8 l=4] *thumb2_return
> > .size oor_eq, .-oor_eq
> > .align 1
> > .p2align 2,,3
> > .global oor_val
> > .syntax unified
> > .thumb
> > .thumb_func
> > .type oor_val, %function
> > oor_val:
> > @ args = 0, pretend = 0, frame = 0
> > @ frame_needed = 0, uses_anonymous_args = 0
> > @ link register save eliminated.
> > asrs r1, r0, #31 @ 6 [c=4 l=2] *thumb2_shiftsi3_short/1
> > adds r1, r1, #1 @ 21 [c=4 l=2] *thumb2_addsi_short/0
> > bx lr @ 27 [c=8 l=4] *thumb2_return
> > .size oor_val, .-oor_val
> > .align 1
> > .p2align 2,,3
> > .global uns_carry
> > .syntax unified
> > .thumb
> > .thumb_func
> > .type uns_carry, %function
> > uns_carry:
> > @ args = 0, pretend = 0, frame = 0
> > @ frame_needed = 0, uses_anonymous_args = 0
> > @ link register save eliminated.
> > movs r0, #1 @ 10 [c=4 l=2] *thumb2_movsi_shortim
> > bx lr @ 17 [c=8 l=4] *thumb2_return
> > .size uns_carry, .-uns_carry
> > .align 1
> > .p2align 2,,3
> > .global inrange_eq
> > .syntax unified
> > .thumb
> > .thumb_func
> > .type inrange_eq, %function
> > inrange_eq:
> > @ args = 0, pretend = 0, frame = 0
> > @ frame_needed = 0, uses_anonymous_args = 0
> > @ link register save eliminated.
> > movs r0, #1 @ 11 [c=4 l=2] *thumb2_movsi_shortim
> > bx lr @ 18 [c=8 l=4] *thumb2_return
> > .size inrange_eq, .-inrange_eq
> > .section .text.startup,"ax",%progbits
> > .align 1
> > .p2align 2,,3
> > .global main
> > .syntax unified
> > .thumb
> > .thumb_func
> > .type main, %function
> > main:
> > @ args = 0, pretend = 0, frame = 16
> > @ frame_needed = 0, uses_anonymous_args = 0
> > push {r4, lr} @ 108 [c=8 l=2] *push_multi
> > movs r0, #5 @ 5 [c=4 l=2] *thumb2_movsi_shortim
> > sub sp, sp, #16 @ 109 [c=4 l=4] *arm_addsi3/11
> > bl oor_eq @ 6 [c=4 l=4] *call_value_symbol
> > cbnz r0, .L8 @ 9 [c=16 l=2] *thumb2_cbnz/0
> > mov r0, #-1 @ 15 [c=4 l=4] *thumb2_movsi_vfp/1
> > bl oor_eq @ 16 [c=4 l=4] *call_value_symbol
> > cbnz r0, .L8 @ 20 [c=16 l=2] *thumb2_cbnz/0
> > movs r0, #5 @ 22 [c=4 l=2] *thumb2_movsi_shortim
> > bl oor_val @ 23 [c=4 l=4] *call_value_symbol
> > cmp r1, #1 @ 26 [c=20 l=6] *cmp_ior/0
> > it eq
> > cmpeq r0, #5
> > bne .L8 @ 27 [c=16 l=2] arm_cond_branch
> > mvn r0, #15 @ 29 [c=4 l=4] *thumb2_movsi_vfp/3
> > bl uns_carry @ 30 [c=4 l=4]
> > *call_value_symbol
> > mov r4, r0 @ 93 [c=4 l=2] *thumb2_movsi_vfp/0
> > cbnz r0, .L8 @ 33 [c=16 l=2] *thumb2_cbnz/0
> > movs r0, #10 @ 35 [c=4 l=2] *thumb2_movsi_shortim
> > bl uns_carry @ 36 [c=4 l=4]
> > *call_value_symbol
> > cmp r0, #1 @ 38 [c=4 l=2] *arm_cmpsi_insn/0
> > bne .L8 @ 39 [c=16 l=2] arm_cond_branch
> > movw r3, #:lower16:.LANCHOR0 @ 106 [c=4 l=4]
> > *thumb2_movsi_vfp/4
> > movt r3, #:upper16:.LANCHOR0 @ 107 [c=4 l=4] *arm_movt/0
> > ldm r3, {r0, r1, r2, r3} @ 44 [c=8 l=4] *ldm4_
> > stm sp, {r0, r1, r2, r3} @ 45 [c=8 l=4] *stm4_
> > movs r1, #2 @ 47 [c=4 l=2] *thumb2_movsi_shortim
> > mov r0, sp @ 48 [c=4 l=2] *thumb2_movsi_vfp/0
> > bl inrange_eq @ 49 [c=4 l=4]
> > *call_value_symbol
> > cmp r0, #1 @ 51 [c=4 l=2] *arm_cmpsi_insn/0
> > bne .L8 @ 52 [c=16 l=2] arm_cond_branch
> > mov r0, r4 @ 58 [c=4 l=2] *thumb2_movsi_vfp/0
> > add sp, sp, #16 @ 113 [c=4 l=4] *arm_addsi3/5
> > @ sp needed @ 114 [c=8 l=0] force_register_use
> > pop {r4, pc} @ 115 [c=8 l=2]
> > *pop_multiple_with_writeback_and_return
> > .L8:
> > bl abort @ 11 [c=8 l=4] *call_symbol
> > .size main, .-main
> > .section .rodata
> > .align 2
> > .set .LANCHOR0,. + 0
> > .LC0:
> > .word 7
> > .word 7
> > .word 7
> > .word 7
> > .ident "GCC: (r17-2109-g2b8f4671103159) 17.0.0 20260703
> > (experimental)"
> >
> >
> > Let me know if you need anything else or want me to test some potential fix.
> >
> > Kind regards,
> > Torbjörn
> >
> > On 2026-07-02 08:56, Richard Biener wrote:
> > > On Wed, 1 Jul 2026, Philipp Tomsich wrote:
> > >
> > >> visit_nary_op canonicalises (T)(A + C) into (T)A + (T)C for its VN
> > >> lookup, but not the reverse -- so whether VN discovers (T)A + C ==
> > >> (T)(A + C) depends on which form it sees first. Add a match.pd rule
> > >> that rewrites (T)A +- CST into (T)(A +- CST') using the op! qualifier,
> > >> so the fold only fires when the narrow expression already has a value
> > >> number -- i.e. only inside VN via mprts_hook.
> > >>
> > >> Restrict to TYPE_OVERFLOW_UNDEFINED inner types: for unsigned inner the
> > >> narrow op wraps mod 2^prec (defined) while the widened outer op does
> > >> not, changing the observed value (bitfld-5.c is the concrete miscompile
> > >> when the guard is loosened).
> > >>
> > >> Use wi::min_precision (CST, SIGNED) rather than int_fits_type_p for the
> > >> fits-check, so sign-encoded small negatives (e.g. -1 as sizetype's
> > >> 0xFFFF...FFFF) qualify.
> > >
> > > OK.
> > >
> > > Thanks,
> > > Richard.
> > >
> > >> PR tree-optimization/124545
> > >>
> > >> gcc/ChangeLog:
> > >>
> > >> * match.pd: Add (T)A +- CST -> (T)(A +- CST') for widening
> > >> conversions from a signed inner type with undefined overflow.
> > >>
> > >> gcc/testsuite/ChangeLog:
> > >>
> > >> * gcc.dg/pr124545.c: New test.
> > >> * gcc.dg/pr124545-2.c: New test.
> > >>
> > >> Signed-off-by: Philipp Tomsich <[email protected]>
> > >>
> > >> ---
> > >>
> > >> gcc/match.pd | 32 ++++++++++++++++++
> > >> gcc/testsuite/gcc.dg/pr124545-2.c | 55 +++++++++++++++++++++++++++++++
> > >> gcc/testsuite/gcc.dg/pr124545.c | 29 ++++++++++++++++
> > >> 3 files changed, 116 insertions(+)
> > >> create mode 100644 gcc/testsuite/gcc.dg/pr124545-2.c
> > >> create mode 100644 gcc/testsuite/gcc.dg/pr124545.c
> > >>
> > >> diff --git a/gcc/match.pd b/gcc/match.pd
> > >> index ddf3b61638ce..817a52499128 100644
> > >> --- a/gcc/match.pd
> > >> +++ b/gcc/match.pd
> > >> @@ -4067,6 +4067,38 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >> (plus (convert @0) (op @2 (convert @1))))))
> > >> #endif
> > >>
> > >> +/* Inverse of the above: (T)(A) +- CST -> (T)(A +- CST') when T is a
> > >> + widening conversion from a type with undefined overflow and the outer
> > >> + type wraps. This allows VN to discover that (T)A + (T)C == (T)(A +
> > >> C)
> > >> + regardless of which form appears first in program order. PR124545.
> > >> + The rewrite is unsound for unsigned inner types: the narrow op wraps
> > >> + mod 2^prec (defined) while the widened op does not, changing the
> > >> + observed value. Cover the unsigned case separately once ranger can
> > >> + prove no wrap. */
> > >> +#if GIMPLE
> > >> + (for op (plus minus)
> > >> + (simplify
> > >> + (op (convert @0) INTEGER_CST@1)
> > >> + (if (TREE_CODE (TREE_TYPE (@0)) == INTEGER_TYPE
> > >> + && TREE_CODE (type) == INTEGER_TYPE
> > >> + && TYPE_PRECISION (type) > TYPE_PRECISION (TREE_TYPE (@0))
> > >> + && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0))
> > >> + && !TYPE_OVERFLOW_SANITIZED (TREE_TYPE (@0))
> > >> + && TYPE_OVERFLOW_WRAPS (type)
> > >> + /* CST must be the sign-extension of its low inner-precision bits,
> > >> + otherwise narrowing changes the value. Use min_precision (..,
> > >> + SIGNED) rather than int_fits_type_p so that small negative
> > >> offsets
> > >> + encoded as large unsigned constants (e.g. -1 as sizetype) still
> > >> + qualify. */
> > >> + && wi::min_precision (wi::to_wide (@1), SIGNED)
> > >> + <= TYPE_PRECISION (TREE_TYPE (@0)))
> > >> + (with {
> > >> + wide_int c1 = wi::to_wide (@1);
> > >> + tree inner_cst = wide_int_to_tree (TREE_TYPE (@0),
> > >> + wi::sext (c1, TYPE_PRECISION (TREE_TYPE
> > >> (@0)))); }
> > >> + (convert (op! @0 { inner_cst; }))))))
> > >> +#endif
> > >> +
> > >> /* (T)(A) +- (T)(B) -> (T)(A +- B) only when (A +- B) could be
> > >> simplified
> > >> to a simple value. */
> > >> (for op (plus minus)
> > >> diff --git a/gcc/testsuite/gcc.dg/pr124545-2.c
> > >> b/gcc/testsuite/gcc.dg/pr124545-2.c
> > >> new file mode 100644
> > >> index 000000000000..b4806567acce
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.dg/pr124545-2.c
> > >> @@ -0,0 +1,55 @@
> > >> +/* PR tree-optimization/124545 */
> > >> +/* Runtime correctness for the inverse-widening VN rewrite
> > >> + (T)A +- CST -> (T)(A +- CST'). The rewrite must never change the
> > >> + computed value. In particular it must NOT fire when CST is not
> > >> + representable in the inner type (which would silently drop the bits
> > >> + above the inner precision), and it must stay correct for unsigned
> > >> + inner types where the narrow operation wraps. */
> > >> +/* { dg-do run } */
> > >> +/* { dg-options "-O2" } */
> > >> +
> > >> +/* CST = 2^32 does not fit in int: the value must be preserved.
> > >> + Before the fix this comparison folded to a constant 1. */
> > >> +__attribute__((noipa)) int
> > >> +oor_eq (int a)
> > >> +{
> > >> + return ((unsigned long long) a + 0x100000000ULL) == (unsigned long
> > >> long) a;
> > >> +}
> > >> +
> > >> +__attribute__((noipa)) unsigned long long
> > >> +oor_val (int a)
> > >> +{
> > >> + return (unsigned long long) a + 0x100000000ULL;
> > >> +}
> > >> +
> > >> +/* Unsigned inner: narrow add wraps mod 2^32; the widened add does not.
> > >> + The result must match the wide arithmetic for every input. */
> > >> +__attribute__((noipa)) int
> > >> +uns_carry (unsigned int a)
> > >> +{
> > >> + unsigned int t = a + 100u;
> > >> + unsigned long w = (unsigned long) a + 100;
> > >> + return w == (unsigned long) t;
> > >> +}
> > >> +
> > >> +/* Legitimate in-range case (matches the PR): k == j - 1, so the two
> > >> + loads are the same address and the rewrite may fire. */
> > >> +__attribute__((noipa)) int
> > >> +inrange_eq (int *p, int j)
> > >> +{
> > >> + int k = j - 1;
> > >> + return p[j - 1] == p[k];
> > >> +}
> > >> +
> > >> +int
> > >> +main (void)
> > >> +{
> > >> + if (oor_eq (5) != 0) __builtin_abort ();
> > >> + if (oor_eq (-1) != 0) __builtin_abort ();
> > >> + if (oor_val (5) != 5ULL + 0x100000000ULL) __builtin_abort ();
> > >> + if (uns_carry (0xfffffff0u) != 0) __builtin_abort ();
> > >> + if (uns_carry (10) != 1) __builtin_abort ();
> > >> + int arr[4] = { 7, 7, 7, 7 };
> > >> + if (inrange_eq (arr, 2) != 1) __builtin_abort ();
> > >> + return 0;
> > >> +}
> > >> diff --git a/gcc/testsuite/gcc.dg/pr124545.c
> > >> b/gcc/testsuite/gcc.dg/pr124545.c
> > >> new file mode 100644
> > >> index 000000000000..a21346b179c7
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.dg/pr124545.c
> > >> @@ -0,0 +1,29 @@
> > >> +/* PR tree-optimization/124545 */
> > >> +/* Verify that VN recognizes (T)A + C == (T)(A + C') regardless of
> > >> + operand order in the equality comparison. */
> > >> +/* { dg-do compile } */
> > >> +/* { dg-options "-O2 -fdump-tree-fre1" } */
> > >> +
> > >> +int func1(int *a, int j) {
> > >> + int k = j - 1;
> > >> + return a[j - 1] == a[k];
> > >> +}
> > >> +
> > >> +int func2(int *a, int j) {
> > >> + int k = j - 1;
> > >> + return a[k] == a[j - 1];
> > >> +}
> > >> +
> > >> +int func3(int *a, int j) {
> > >> + int k = j - 3;
> > >> + return a[k] == a[j - 3];
> > >> +}
> > >> +
> > >> +int func4(int *a, int j) {
> > >> + int k = j + 2;
> > >> + return a[k] == a[j + 2];
> > >> +}
> > >> +
> > >> +/* All four functions should fold to return 1 after FRE. */
> > >> +/* The pattern is not applied on ilp32 targets (PR116845). */
> > >> +/* { dg-final { scan-tree-dump-times "return 1;" 4 "fre1" { xfail {
> > >> ilp32 } } } } */
> > >>
> > >
> >