On Wed, May 21, 2025 at 11:48:08AM +0200, Jakub Jelinek wrote: > With the following patch instead the test finally FAILs on s390x with -O1 > and higher, which shows that left shifts really need special casing for > info->extended. > I don't have proof of anything else though.
Note, the shifts are the only special case I'm aware of where the most significant limb (if it is has padding bits) is accessed inside of a loop or with access outside of a loop but with variable idx. Everything else should access the most significant limb using INTEGER_CST idx and thus can (and should) deal with the needed extension on that access directly. And RSHIFT_EXPR shouldn't really violate the content of the padding bits. For LSHIFT_EXPR we should IMHO do the following (which fixes the testcase on s390x-linux). The LSHIFT_EXPR is /* Lower dst = src << n; as unsigned n1 = n % limb_prec; size_t n2 = n / limb_prec; size_t n3 = n1 != 0; unsigned n4 = (limb_prec - n1) % limb_prec; size_t idx; size_t p = prec / limb_prec - (prec % limb_prec == 0); for (idx = p; (ssize_t) idx >= (ssize_t) (n2 + n3); --idx) dst[idx] = (src[idx - n2] << n1) | (src[idx - n2 - n3] >> n4); if (n1) { dst[idx] = src[idx - n2] << n1; --idx; } for (; (ssize_t) idx >= 0; --idx) dst[idx] = 0; */ as described in the comment (note, the comments are for the little-endian lowering only, didn't want to complicate it with endianity). As can be seen, the most significant limb can be modifier either inside of the loop or in the if (n1) body if the loop had 0 iterations. In your patch you've modified I believe just the loop and not the if body, and made it conditional on every iteration (furthermore through gimplification of COND_EXPR which is not the way this is done elsewhere in gimple-lower-bitint.cc, there is if_then helper and it builds gimple_build_cond etc.). I think that is way too expensive. In theory we could peel off the first iteration manually and do the info->extended handling in there and do it again inside of the if (n1) case if idx == (bitint_big_endian ? size_zero_node : p) in that case, but I think just doing the extension after the loops is easier. Note, we don't need to worry about volatile here, the shift is done into an addressable variable memory only if it is non-volatile, otherwise it is computed into a temporary and then copied over into the volatile var. 2025-05-21 Jakub Jelinek <ja...@redhat.com> * gimple-lower-bitint.cc (bitint_extended): New variable. (bitint_large_huge::lower_shift_stmt): For LSHIFT_EXPR with bitint_extended if lhs has most significant partial limb extend it afterwards. * gcc.dg/bitintext.h: New file. * gcc.dg/torture/bitint-82.c: New test. --- gcc/gimple-lower-bitint.cc.jj 2025-05-20 08:18:46.863553163 +0200 +++ gcc/gimple-lower-bitint.cc 2025-05-21 12:27:22.718891219 +0200 @@ -77,7 +77,7 @@ enum bitint_prec_kind { static int small_max_prec, mid_min_prec, large_min_prec, huge_min_prec; static int limb_prec; -static bool bitint_big_endian; +static bool bitint_big_endian, bitint_extended; /* Categorize _BitInt(PREC) as small, middle, large or huge. */ @@ -103,6 +103,7 @@ bitint_precision_kind (int prec) return bitint_prec_small; } bitint_big_endian = info.big_endian; + bitint_extended = info.extended; if (!large_min_prec && GET_MODE_PRECISION (limb_mode) < MAX_FIXED_MODE_SIZE) large_min_prec = MAX_FIXED_MODE_SIZE + 1; @@ -3882,6 +3883,23 @@ bitint_large_huge::lower_shift_stmt (tre g = gimple_build_cond (GE_EXPR, add_cast (ssizetype, idx_next), ssize_int (0), NULL_TREE, NULL_TREE); insert_before (g); + if (bitint_extended && prec % limb_prec != 0) + { + /* The most significant limb has been updated either in the + loop or in the if after it. To simplify the code, just + read it back from memory and extend. */ + m_gsi = gsi_after_labels (edge_false->dest); + idx = bitint_big_endian ? size_zero_node : p; + tree l = limb_access (TREE_TYPE (lhs), obj, idx, true); + tree type = limb_access_type (TREE_TYPE (lhs), idx); + tree v = make_ssa_name (m_limb_type); + g = gimple_build_assign (v, l); + insert_before (g); + v = add_cast (type, v); + l = limb_access (TREE_TYPE (lhs), obj, idx, true); + g = gimple_build_assign (l, add_cast (m_limb_type, v)); + insert_before (g); + } } } --- gcc/testsuite/gcc.dg/bitintext.h.jj 2025-05-20 22:39:28.647171638 +0200 +++ gcc/testsuite/gcc.dg/bitintext.h 2025-05-21 11:35:50.642636439 +0200 @@ -0,0 +1,29 @@ +[[gnu::noipa]] void +do_copy (void *p, const void *q, __SIZE_TYPE__ r) +{ + __builtin_memcpy (p, q, r); +} + +/* Macro to test whether (on targets where psABI requires it) _BitInt + with padding bits have those filled with sign or zero extension. */ +#if defined(__s390x__) || defined(__arm__) || defined(__loongarch__) +#define BEXTC(x) \ + do { \ + if ((typeof (x)) -1 < 0) \ + { \ + _BitInt(sizeof (x) * __CHAR_BIT__) __x; \ + do_copy (&__x, &(x), sizeof (__x)); \ + if (__x != (x)) \ + __builtin_abort (); \ + } \ + else \ + { \ + unsigned _BitInt(sizeof (x) * __CHAR_BIT__) __x; \ + do_copy (&__x, &(x), sizeof (__x)); \ + if (__x != (x)) \ + __builtin_abort (); \ + } \ + } while (0) +#else +#define BEXTC(x) do { (void) (x); } while (0) +#endif --- gcc/testsuite/gcc.dg/torture/bitint-82.c.jj 2025-05-20 22:39:28.647171638 +0200 +++ gcc/testsuite/gcc.dg/torture/bitint-82.c 2025-05-21 11:28:06.483905700 +0200 @@ -0,0 +1,94 @@ +/* { dg-do run { target bitint } } */ +/* { dg-options "-std=c23 -pedantic-errors" } */ +/* { dg-skip-if "" { ! run_expensive_tests } { "*" } { "-O0" "-O2" } } */ +/* { dg-skip-if "" { ! run_expensive_tests } { "-flto" } { "" } } */ + +#if __BITINT_MAXWIDTH__ >= 532 +_BitInt(5) a = 2, b = -2; +_BitInt(38) c = 12345, d = -12345; +_BitInt(129) e = 147090211948845388976606115811401318743wb, f = -147090211948845388976606115811401318743wb; +_BitInt(532) g = 34476769918317100226195145251004381172591594205376273814wb, h = -102116935649428556311918486808926113041433456371844211259677321wb; +unsigned _BitInt(1) i = 1; +unsigned _BitInt(17) j = 49127uwb; +unsigned _BitInt(60) k = 588141367522129848uwb; +unsigned _BitInt(205) l = 33991671979236490040668305838261113909013362173682935296620088uwb; +unsigned _BitInt(475) z = 14834685124553929878903720794923785539321715423294864257448721201977655202426343038777008759878862591302200019811097993772912691139803983786083uwb; +#endif + +#include "../bitintext.h" + +#if __BITINT_MAXWIDTH__ >= 532 +[[gnu::noipa]] _BitInt(217) +f1 (_BitInt(9) a, unsigned _BitInt(12) b, _BitInt(36) c, unsigned _BitInt(105) d, + _BitInt(135) e, unsigned _BitInt(168) f, _BitInt(207) g, _BitInt(207) h, + unsigned _BitInt(531) i, _BitInt(36) j) +{ + BEXTC (a); BEXTC (b); BEXTC (c); BEXTC (d); + BEXTC (e); BEXTC (f); BEXTC (g); BEXTC (h); + BEXTC (i); BEXTC (j); + _BitInt(9) k = a + 1; + unsigned _BitInt(12) l = b - a; + _BitInt(36) m = c * j; + unsigned _BitInt(105) n = d >> (-2 * j); + _BitInt(135) o = e | -j; + unsigned _BitInt(168) p = f & 101010101010101010101010uwb; + _BitInt(207) q = g * j; + _BitInt(207) r = g + h; + unsigned _BitInt(531) s = i / j; + BEXTC (k); BEXTC (l); BEXTC (m); BEXTC (n); + BEXTC (o); BEXTC (p); BEXTC (q); BEXTC (r); + BEXTC (s); + unsigned _BitInt(105) t = d << (38 - j); + BEXTC (t); + _Atomic _BitInt(5) u = 15; + u += 8U; + BEXTC (u); + _BitInt(135) v = e << 28; + BEXTC (v); + unsigned _BitInt(475) w = z << (29 + j); + BEXTC (w); + return a + 4; +} +#endif + +int +main () +{ +#if __BITINT_MAXWIDTH__ >= 532 + BEXTC (a); BEXTC (b); + BEXTC (c); BEXTC (d); + BEXTC (e); BEXTC (f); + BEXTC (g); BEXTC (h); + BEXTC (i); + BEXTC (j); + BEXTC (k); + BEXTC (l); + BEXTC (z); + { + _BitInt(5) a = 2, b = -2; + _BitInt(38) c = 12345, d = -12345; + _BitInt(129) e = 147090211948845388976606115811401318743wb, f = -147090211948845388976606115811401318743wb; + _BitInt(532) g = 34476769918317100226195145251004381172591594205376273814wb, h = -102116935649428556311918486808926113041433456371844211259677321wb; + unsigned _BitInt(1) i = 1; + unsigned _BitInt(17) j = 49127uwb; + unsigned _BitInt(60) k = 588141367522129848uwb; + unsigned _BitInt(205) l = 33991671979236490040668305838261113909013362173682935296620088uwb; + BEXTC (a); BEXTC (b); + BEXTC (c); BEXTC (d); + BEXTC (e); BEXTC (f); + BEXTC (g); BEXTC (h); + BEXTC (i); + BEXTC (j); + BEXTC (k); + BEXTC (l); + } + _BitInt(217) m = f1 (57wb, 3927uwb, 10625699364wb, 23030359755638571619326514462579uwb, + 20797625176303404170317957140841712396356wb, + 111831871006433449872067089878311637796827405335256uwb, + 64853652491049541618437564623858346454131583900201311683495230wb, + 25108562626494976011700565632680191924545340440636663075662700wb, + 6366583146545926097709747296452085257498446783797668089081516596003270602920229800152065594152964557479773813310423759077951305431130758723519892452009351743676uwb, + -1); + BEXTC (m); +#endif +} Jakub