https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97312
Aldy Hernandez <aldyh at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Last reconfirmed| |2020-10-07 Ever confirmed|0 |1 Status|UNCONFIRMED |WAITING --- Comment #1 from Aldy Hernandez <aldyh at gcc dot gnu.org> --- Confirmed. This test is checking the final assembly for a specific sequence. I don't speak aarch64 assembly, but the IL is different coming out of evrp. The first culprit is this difference in the mergephi1 dump: _9 = .CTZ (x_6(D)); - _10 = _9 & 31; + _10 = _9; These are unsigned ints, so assuming they are 32 bits on aarch64, __builtin_ctz is always less than 32. This is because a CTZ of 0 is undefined according to the GCC manual: Built-in Function: int __builtin_ctz (unsigned int x) Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined. So a bitwise AND of anything less than 32 with 0x1f (31) is a no-op. Are aarch64 ints 32 bits? 
Here are the full IL differences: --- legacy-evrp/pr90838.c.038t.mergephi1 2020-10-07 08:44:12.152358885 -0400 +++ ranger/pr90838.c.038t.mergephi1 2020-10-07 08:39:12.339296502 -0400 @@ -1,41 +1,41 @@ ;; Function ctz1 (ctz1, funcdef_no=0, decl_uid=3587, cgraph_uid=1, symbol_order=0) ctz1 (unsigned int x) { static const char table[32] = "\x00\x01\x1c\x02\x1d\x0e\x18\x03\x1e\x16\x14\x0f\x19\x11\x04\b\x1f\x1b\r\x17\x15\x13\x10\x07\x1a\f\x12\x06\v\x05\n\t"; unsigned int _1; unsigned int _2; unsigned int _3; unsigned int _4; char _5; int _9; int _10; <bb 2> : _1 = -x_6(D); _2 = _1 & x_6(D); _3 = _2 * 125613361; _4 = _3 >> 27; _9 = .CTZ (x_6(D)); - _10 = _9 & 31; + _10 = _9; _5 = (char) _10; return _10; } ;; Function ctz2 (ctz2, funcdef_no=1, decl_uid=3591, cgraph_uid=2, symbol_order=1) ctz2 (unsigned int x) { static short int table[64] = {32, 0, 1, 12, 2, 6, 0, 13, 3, 0, 7, 0, 0, 0, 0, 14, 10, 4, 0, 0, 8, 0, 0, 25, 0, 0, 0, 0, 0, 21, 27, 15, 31, 11, 5, 0, 0, 0, 0, 0, 9, 0, 0, 24, 0, 0, 20, 26, 30, 0, 0, 0, 0, 23, 0, 19, 29, 0, 22, 18, 28, 17, 16, 0}; unsigned int _1; unsigned int _2; unsigned int _3; short int _4; int _8; <bb 2> : _1 = -x_5(D); @@ -87,27 +87,27 @@ ;; Function ctz4 (ctz4, funcdef_no=3, decl_uid=3601, cgraph_uid=4, symbol_order=5) ctz4 (long unsigned int x) { long unsigned int lsb; long unsigned int _1; long long unsigned int _2; long long unsigned int _3; char _4; int _9; int _10; <bb 2> : _1 = -x_5(D); lsb_6 = _1 & x_5(D); _2 = lsb_6 * 283881067100198605; _3 = _2 >> 58; _9 = .CTZ (x_5(D)); - _10 = _9 & 63; + _10 = _9; _4 = (char) _10; return _10; } The difference in assembly matches. 
We have 2 fewer ANDs in the final output: $ diff -u legacy.s ranger.s --- legacy.s 2020-10-07 09:06:13.420446783 -0400 +++ ranger.s 2020-10-07 09:06:42.646646949 -0400 @@ -8,7 +8,6 @@ ctz1: rbit w0, w0 clz w0, w0 - and w0, w0, 31 ret .size ctz1, .-ctz1 .align 2 @@ -36,7 +35,6 @@ ctz4: rbit x0, x0 clz x0, x0 - and w0, w0, 63 ret .size ctz4, .-ctz4 If my analysis is correct, someone aarch64-savvy should adjust this: /* { dg-final { scan-assembler-times "and\t" 2 } } */