[Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257 Bug ID: 100257 Summary: poor codegen with vcvtph2ps / stride of 6 Product: gcc Version: 12.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- gcc (Compiler-Explorer-Build) 12.0.0 20210424 (experimental) https://godbolt.org/z/n6ooMdnz8 This C code: ``` #include #include #include struct float3 { float f1; float f2; float f3; }; struct util_format_r16g16b16_float { uint16_t r; uint16_t g; uint16_t b; }; static inline struct float3 _mesa_half3_to_float3(uint16_t val_0, uint16_t val_1, uint16_t val_2) { #if defined(__F16C__) //const __m128i in = {val_0, val_1, val_2}; //__m128 out; //__asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in)); const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0); const __m128 out = _mm_cvtph_ps(in); const struct float3 r = {out[0], out[1], out[2]}; return r; #endif } void util_format_r16g16b16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width) { float *dst = dst_row; for (unsigned x = 0; x < width; x += 1) { const struct util_format_r16g16b16_float pixel; memcpy(&pixel, src, sizeof pixel); struct float3 r = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b); dst[0] = r.f1; /* r */ dst[1] = r.f2; /* g */ dst[2] = r.f3; /* b */ dst[3] = 1; /* a */ src += 6; dst += 4; } } ``` Is compiled "poorly" by gcc, even worse when compiled on i386 (with -mf16c enabled) when using -FPIE. 
Example: gcc -O3 -m32 -march=znver2 -mfpmath=sse -fPIE util_format_r16g16b16_float_unpack_rgba_float: pushebp pushedi pushesi pushebx sub esp, 28 mov ecx, DWORD PTR 56[esp] mov edx, DWORD PTR 48[esp] call__x86.get_pc_thunk.ax add eax, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_ mov ebx, DWORD PTR 52[esp] testecx, ecx je .L8 vmovss xmm3, DWORD PTR .LC0@GOTOFF[eax] xor esi, esi xor ebp, ebp vpxor xmm2, xmm2, xmm2 .L3: mov eax, DWORD PTR [ebx] vmovss DWORD PTR 12[edx], xmm3 add ebx, 6 add edx, 16 inc esi mov ecx, eax vmovd xmm0, eax shr ecx, 16 mov edi, ecx movzx ecx, WORD PTR -2[ebx] vpinsrw xmm0, xmm0, edi, 1 vmovd xmm1, ecx vpinsrw xmm1, xmm1, ebp, 1 vpunpckldq xmm0, xmm0, xmm1 vpunpcklqdq xmm0, xmm0, xmm2 vcvtph2ps xmm0, xmm0 vmovss DWORD PTR -16[edx], xmm0 vextractps DWORD PTR -12[edx], xmm0, 1 vextractps DWORD PTR -8[edx], xmm0, 2 cmp DWORD PTR 56[esp], esi jne .L3 .L8: add esp, 28 pop ebx pop esi pop edi pop ebp ret .LC0: .long 1065353216 __x86.get_pc_thunk.ax: mov eax, DWORD PTR [esp] ret clang: util_format_r16g16b16_float_unpack_rgba_float: # @util_format_r16g16b16_float_unpack_rgba_float mov eax, dword ptr [esp + 12] testeax, eax je .LBB0_3 mov ecx, dword ptr [esp + 8] mov edx, dword ptr [esp + 4] .LBB0_2:# =>This Inner Loop Header: Depth=1 vmovd xmm0, dword ptr [ecx] # xmm0 = mem[0],zero,zero,zero vpinsrw xmm0, xmm0, word ptr [ecx + 4], 2 add ecx, 6 vcvtph2ps xmm0, xmm0 vmovss dword ptr [edx], xmm0 vextractps dword ptr [edx + 4], xmm0, 1 vextractps dword ptr [edx + 8], xmm0, 2 mov dword ptr [edx + 12], 1065353216 add edx, 16 dec eax jne .LBB0_2 .LBB0_3: ret clang code is essentially optimal. The issue persist if I use `vcvtph2ps` directly via asm, or via intrinsics. The issue might be the src stride, of 6, instead 8, that is confusing gcc. 
Additionally, the constant 1065353216 (which is weird, I would expect it to be 0) is stored in the data section instead of being inlined as an immediate. This makes the code actually larger, and in PIE mode it requires extra pointer trickery, and on -m32 even calling an extra function. Even without -fPIE the main loop has poor codegen even on x86-64 / amd64 compared to clang or what I would consider good code. gcc -m64 -O3 -march=native util_format_r16g16b16_float_unpack_rgba_float: test edx, edx je .L8 mov edx, edx sal rdx, 4 vmovss xmm3, DWORD PTR .LC0[rip] lea rcx, [rdi+rdx]
[Bug tree-optimization/96275] Vectorizer doesn't take into account bitmask condition from branch conditions.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96275 --- Comment #3 from Witold Baryluk --- Thanks for looking into that. I just wanted to update that this is still suboptimal in current gcc trunk 20201226, while clang produces superior code.
[Bug d/98457] New: [d] writef!"%s" doesn't work with MonoTime / SysTick
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98457 Bug ID: 98457 Summary: [d] writef!"%s" doesn't work with MonoTime / SysTick Product: gcc Version: 10.2.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- void main() { import std.stdio; import core.time : MonoTime; writef!"%s"(MonoTime.currTime()); } Doesn't compile with gdc 10.2.1: $ gdc test_monotime.d /usr/lib/gcc/x86_64-linux-gnu/10/include/d/core/time.d:2405:16: error: static variable _ticksPerSecond cannot be read at compile time 2405 | return _ticksPerSecond[_clockIdx]; |^ /usr/lib/gcc/x86_64-linux-gnu/10/include/d/core/time.d:2418:99: note: called from here: ticksPerSecond() 2418 | return "MonoTime(" ~ signedToTempString(_ticks, 10) ~ " ticks, " ~ signedToTempString(ticksPerSecond, 10) ~ " ticks per second)"; | ^ /usr/lib/gcc/x86_64-linux-gnu/10/include/d/core/time.d:2418:98: note: called from here: signedToTempString(ticksPerSecond(), 10u) 2418 | return "MonoTime(" ~ signedToTempString(_ticks, 10) ~ " ticks, " ~ signedToTempString(ticksPerSecond, 10) ~ " ticks per second)"; | ^ /usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:3353:28: note: called from here: val.toString() 3353 | put(w, val.toString()); |^ /usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:3353:12: note: called from here: put(w, val.toString()) 3353 | put(w, val.toString()); |^ /usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:3672:21: note: called from here: formatObject(w, val, f) 3672 | formatObject(w, val, f); | ^ /usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:568:28: note: called from here: formatValue(w, _param_2, spec) 568 | formatValue(w, args[i], spec); |^ /usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:5767:28: note: called from here: formattedWrite(w, fmt, _param_1) 5767 | auto n = formattedWrite(w, fmt, args); |^ 
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:5729:16: note: called from here: format("%s", MonoTimeImpl(0L)) 5729 | .format(fmt, Args.init); |^ /usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:5733:2: note: called from here: (*function () => null)() 5733 | }(); | ^ (null):0: confused by earlier errors, bailing out Adding manually .toString() makes it work (at the expense of possible extra allocation). No issues in ldc2 1.24.0 or dmd2 2.095.0-beta.1 It doesn't look like issue in phobos, but something deeper.
[Bug d/98457] [d] writef!"%s" doesn't work with MonoTime / SysTick
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98457 --- Comment #1 from Witold Baryluk --- Godbolt link: https://godbolt.org/z/q3bzhP with gcc trunk 20201217 and a bit more diagnostic /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/core/time.d:2405:16: error: static variable _ticksPerSecond cannot be read at compile time 2405 | return _ticksPerSecond[_clockIdx]; |^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/core/time.d:2418:99: note: called from here: ticksPerSecond() 2418 | return "MonoTime(" ~ signedToTempString(_ticks, 10) ~ " ticks, " ~ signedToTempString(ticksPerSecond, 10) ~ " ticks per second)"; | ^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/core/time.d:2418:98: note: called from here: signedToTempString(ticksPerSecond(), 10u) 2418 | return "MonoTime(" ~ signedToTempString(_ticks, 10) ~ " ticks, " ~ signedToTempString(ticksPerSecond, 10) ~ " ticks per second)"; | ^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:3353:28: note: called from here: val.toString() 3353 | put(w, val.toString()); |^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:3353:12: note: called from here: put(w, val.toString()) 3353 | put(w, val.toString()); |^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:3672:21: note: called from here: formatObject(w, val, f) 3672 | formatObject(w, val, f); | ^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:568:28: note: called from here: formatValue(w, _param_2, spec) 568 | formatValue(w, args[i], spec); |^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:5767:28: note: called from here: formattedWrite(w, fmt, _param_1) 5767 | auto n = formattedWrite(w, fmt, args); |^ 
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:5729:16: note: called from here: format("%s", MonoTimeImpl(0L)) 5729 | .format(fmt, Args.init); |^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:5733:2: note: called from here: (*function () => null)() 5733 | }(); | ^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/stdio.d:3754:15: error: template instance std.format.checkFormatException!("�}�", MonoTimeImpl!cast(ClockType)0) error instantiating 3754 | alias e = checkFormatException!(fmt, A); | ^ :4:14: note: instantiated from here: writef!("%s", MonoTimeImpl!cast(ClockType)0) 4 | writef!"%s"(MonoTime.currTime()); | ^ /opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/stdio.d:3755:5: note: while evaluating: static assert(!e) 3755 | static assert(!e, e.msg); | ^ Compiler returned: 1
[Bug d/98494] New: libphobos: std.process Config.stderrPassThrough missing
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98494 Bug ID: 98494 Summary: libphobos: std.process Config.stderrPassThrough missing Product: gcc Version: 10.2.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- It appears that gdc version of libphobos is somehow lagging in some aspects behind upstream. One of the things I see missing, is `Config.stderrPassThrough` in std.process. I see it was added upstream about 12 months ago: enum Config { ... /** By default, the $(LREF execute) and $(LREF executeShell) functions will capture child processes' both stdout and stderr. This can be undesirable if the standard output is to be processed or otherwise used by the invoking program, as `execute`'s result would then contain a mix of output and warning/error messages. Specify this flag when calling `execute` or `executeShell` to cause invoked processes' stderr stream to be sent to $(REF stderr, std,stdio), and only capture and return standard output. This flag has no effect on $(LREF spawnProcess) or $(LREF spawnShell). */ stderrPassThrough = 128, } The implementation usage of this is relatively small and easy to backport: in executeImpl: -auto p = pipeFunc(commandLine, Redirect.stdout | Redirect.stderrToStdout, - env, config, workDir, extraArgs); +auto redirect = (config & Config.stderrPassThrough) +? Redirect.stdout +: Redirect.stdout | Redirect.stderrToStdout; + +auto p = pipeFunc(commandLine, redirect, + env, config, workDir, extraArgs); There are some other minor changes there, but nothing functionally significant. Mostly unittests and minor signature changes (adding `scope` to many input parameters). Thank you.
[Bug d/100769] New: [D] memcmp() == 0 for small constant strings not folded
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769 Bug ID: 100769 Summary: [D] memcmp() == 0 for small constant strings not folded Product: gcc Version: 10.2.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- I expect this D code to be quite optimal, but it isn't. ``` extern(C) int memcmp(const void *s1, const void *s2, size_t n); int recognize3(const char* s) { return memcmp(s, "stract class", 12) == 0; } ``` https://godbolt.org/z/vx17WK9rs It produces a call to memcmp, instead of inlining and specializing the code for this specific case. int example.recognize3(const(char*)): sub rsp, 8 mov edx, 12 mov esi, OFFSET FLAT:.LC0 callmemcmp testeax, eax seteal add rsp, 8 movzx eax, al ret ldc2 1.24.0 (for D) and clang 11.0.1-2 (for C and C++), and gcc 10.2.1 (for C and C++) produce close to optimal codes. Similarly ldc2 1.26.0 (for D), and gcc 11.1 (for C and C++): int example.recognize3(const(char*)): movabs rcx, 7142836979195081843 xor rcx, qword ptr [rdi] mov edx, dword ptr [rdi + 8] xor rdx, 1936941420 xor eax, eax or rdx, rcx seteal ret and recognize3: movabs rax, 7142836979195081843 cmp QWORD PTR [rdi], rax je .L6 .L2: mov eax, 1 xor eax, 1 ret .L6: xor eax, eax cmp DWORD PTR [rdi+8], 1936941420 jne .L2 xor eax, 1 ret Notice, how both gcc, clang and ldc2, compare first 8 bytes of input, then 4 bytes of input. clang and ldc2 just xor/or the result, then return, with no conditional jumps. gcc does a bit poorer, with more conditionals and more jumps, but still pretty good and same idea. gdc however, calls the generic memcmp, that does looping and does about 12 jumps and/or 13 exists.
[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769 --- Comment #1 from Witold Baryluk --- There is a typo in the example above (the godbolt link is correct): I forgot the `.ptr`: extern(C) int memcmp(const void *s1, const void *s2, size_t n); int recognize3(const char* s) { return memcmp(s, "stract class".ptr, 12) == 0; } Casting to ubyte*, or void*, doesn't change anything really. Options: -O3 -frelease -fno-semantic-interposition. Tested on amd64, Debian / Linux.
[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769 --- Comment #2 from Witold Baryluk --- Hmm. It appears that using `import core.stdc.string : memcmp;` actually resolves the problem. It looks like my manual declaration of memcmp for some reason disabled optimisations for memcmp.
[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769 Witold Baryluk changed: What|Removed |Added Status|UNCONFIRMED |RESOLVED Resolution|--- |FIXED --- Comment #4 from Witold Baryluk --- Ok. That makes sense. Thanks.
[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769 Witold Baryluk changed: What|Removed |Added Resolution|FIXED |INVALID
[Bug d/105360] New: Inlined lazy parameters / delegate literals, still emitted
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105360 Bug ID: 105360 Summary: Inlined lazy parameters / delegate literals, still emitted Product: gcc Version: 12.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- ``` extern bool g(); extern void f(int n); void log(lazy int num) { if (g()) { const n = num(); f(n); } } void p(int n) { log(n * 137); } ``` This should emit the same (or close to the same) as code with no `lazy` (and num reference changed accordingly) on `log` function. (Because compiler knows that `num ` is called once, has no side effects, is moderately expensive, etc). And the code for p is exactly the same - log and `n * 137` fully inlined. However, the anonymous dgliteral code is still emitted, despite not being referenced anywhere: ``` pure nothrow @nogc @safe int example.p(int).__dgliteral2(): # < This should not be in object file imuleax, DWORD PTR [rdi], 137 ret ``` Rest of the object file is correct and optimal: ``` void example.log(lazy int): pushrbp pushrbx mov rbp, rdi mov rbx, rsi sub rsp, 8 callbool example.g() testal, al je .L3 mov rdi, rbp callrbx add rsp, 8 pop rbx pop rbp mov edi, eax jmp void example.f(int) .L3: add rsp, 8 pop rbx pop rbp ret void example.p(int): pushrbx mov ebx, edi callbool example.g() testal, al je .L6 imuledi, ebx, 137 pop rbx jmp void example.f(int) .L6: pop rbx ret ``` gdc (Compiler-Explorer-Build-gcc-748d46cd049c89a799f99f14547267ebae915af6-binutils-2.36.1) 12.0.1 20220421 (experimental) via godbolt.org For a code passing reasonably big literals, this can lead to object file code duplication. ldc2 shows no such problem.
[Bug d/105360] Inlined lazy parameters / delegate literals, still emitted
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105360 --- Comment #1 from Witold Baryluk --- https://godbolt.org/z/c8oT6E4cf
[Bug d/105413] New: gdc extended assembler cannot constraints r8 - r15
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105413 Bug ID: 105413 Summary: gdc extended assembler cannot constraints r8 - r15 Product: gcc Version: 12.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- gcc in C does not support directly register constraints for x86_64 registers r8 - r15. In C this can be done however using local register variables and asm attributes. https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html There is no way to use this in GDC extended assembler. version (linux) { version (GNU) { enum SYSCALL { OPENAT = 56, } @nogc: nothrow: size_t syscall(SYSCALL ident)(size_t arg1, size_t arg2, size_t arg3, size_t arg4) { version (X86_64) { asm @nogc nothrow { "syscall" // output: : "=a" (arg1) // inputs: : "a" (ident), // rax - syscall number "D" (arg1), // rdi - arg1 "S" (arg2), // rsi - arg2 "d" (arg3), // rdx - arg3 "r10" (arg4), // r10 - arg4 "m"( *cast(ubyte*)arg1) // "dummy" input instead of full memory clobber // clobers : "c", "r11"; // Clobers rax, and rcx and r11. } return arg1; } else { static assert(false, "This platform/architecture is not supported when using GDC compiler"); } } } private int openatdummy() @nogc nothrow { return cast(int)syscall!(SYSCALL.OPENAT)(0, 0, 0, 0); } } myio.d: In function ‘syscall’: myio.d:232:10: error: matching constraint references invalid operand number 232 | ; https://godbolt.org/z/xGzxa6orc
[Bug d/105413] gdc extended assembler cannot constraints r8 - r15
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105413 --- Comment #3 from Witold Baryluk --- It works. Thank you. Any chance this will be in gcc 12.x? I work a lot on Debian Linux, and I doubt I will have gcc trunk or gcc 13 available any time soon. Also weirdly gcc does not inline this function, unless I add @attribute("always_inline") on syscall, or @attribute("flatten") on openatdummy.
[Bug d/107241] New: std.bitmanip.bigEndianToNative et al not inlined
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107241 Bug ID: 107241 Summary: std.bitmanip.bigEndianToNative et al not inlined Product: gcc Version: 12.2.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- gdc fails to inline number of small functions that should fully inline and end in single instruction. on amd64 / x86, for example std.bitmanip.bigEndianToNative causes a chain of calls / jumps, even with @attribute("flatten") import std.bitmanip; import gcc.attributes; @attribute("flatten") size_t f(char[] b) { return std.bitmanip.bigEndianToNative!(size_t, 8)(cast(ubyte[8])(b[2..10])); } gcc -O3 -march=znver2 -frelease pure nothrow @nogc @safe ulong std.bitmanip.swapEndian!(ulong).swapEndian(const(ulong)): mov rax, rdi bswap rax ret pure nothrow @nogc @safe ulong std.bitmanip.endianToNativeImpl!(true, ulong, 8uL).endianToNativeImpl(ubyte[8]): jmp pure nothrow @nogc @safe ulong std.bitmanip.swapEndian!(ulong).swapEndian(const(ulong)) pure nothrow @nogc @safe ulong std.bitmanip.bigEndianToNative!(ulong, 8uL).bigEndianToNative(ubyte[8]): jmp pure nothrow @nogc @safe ulong std.bitmanip.endianToNativeImpl!(true, ulong, 8uL).endianToNativeImpl(ubyte[8]) ulong example.f(char[]): mov rdi, QWORD PTR [rsi+2] jmp pure nothrow @nogc @safe ulong std.bitmanip.bigEndianToNative!(ulong, 8uL).bigEndianToNative(ubyte[8]) No issues with LDC. ulong example.f(char[]): mov rax, qword ptr [rsi + 2] bswap rax ret godbolt: https://godbolt.org/z/Pj3f7oGso
[Bug c++/103966] New: std::atomic relaxed load, inc, store sub-optimal codegen
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103966 Bug ID: 103966 Summary: std::atomic relaxed load, inc, store sub-optimal codegen Product: gcc Version: 12.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c++ Assignee: unassigned at gcc dot gnu.org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- Both functions below, should compile to the same assembly on x86: #include #include uint64_t x; void inc_a() { x++; } std::atomic y; void inc_b_non_atomic() { y.store(y.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); } and it does so in clang. It does not in gcc 12 (and earlier). https://godbolt.org/z/GcM67xz8T This pattern is very popular in approximate statistical counters / metrics, where the flow of information is unidirectional (i.e. from one thread that does updates, to another thread that only reads the counters), and its performance is critical in many codebases.
[Bug c++/103966] std::atomic relaxed load, inc, store sub-optimal codegen
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103966 --- Comment #1 from Witold Baryluk --- Current codegen on gcc 12 on 64-bit x86: inc_a(): inc QWORD PTR x[rip] ret inc_b_non_atomic(): mov rax, QWORD PTR y[rip] inc rax mov QWORD PTR y[rip], rax ret y: .zero 8 x: .zero 8
[Bug c++/103966] std::atomic relaxed load, inc, store sub-optimal codegen
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103966 --- Comment #2 from Witold Baryluk --- Similarly, dec, add, and sub are affected, as well as mul. Example: #include <atomic> #include <cstdint> uint64_t x; void add_a() { x += 5; } std::atomic<uint64_t> y; void add_b_non_atomic() { y.store(y.load(std::memory_order_relaxed) + 5, std::memory_order_relaxed); } Producing: add_a(): add QWORD PTR x[rip], 5 ret add_b_non_atomic(): mov rax, QWORD PTR y[rip] add rax, 5 mov QWORD PTR y[rip], rax ret y: .zero 8 x: .zero 8
[Bug middle-end/35560] Missing CSE/PRE for memory operations involved in virtual call.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=35560 Witold Baryluk changed: What|Removed |Added CC||witold.baryluk+gcc at gmail dot co ||m --- Comment #15 from Witold Baryluk --- I know this is a pretty old bug, but I was exploring some assembly of gcc and clang on godbolt, and also stumbled into same issue. https://godbolt.org/z/qPzMhWse1 class A { public: virtual int f7(int x) const; }; int g(const A * const a, int x) { int r = 0; for (int i = 0; i < 1; i++) r += a->f7(x); return r; } (same happens without loop, when just calling a->f7 multiple times) g(A const*, int): pushr13 mov r13d, esi pushr12 xor r12d, r12d pushrbp mov rbp, rdi pushrbx mov ebx, 1 sub rsp, 8 .L2: mov rax, QWORD PTR [rbp+0] # a vtable deref mov esi, r13d mov rdi, rbp call[QWORD PTR [rax]]# f7 indirect call add r12d, eax dec ebx jne .L2 add rsp, 8 pop rbx pop rbp mov eax, r12d pop r12 pop r13 ret I was expecting mov rax, QWORD PTR [rbp+0] and call[QWORD PTR [rax]], to be hoisted out of the loop (call converted to lea, and call register). A bit sad. Is there some recent work done on this optimization? Are there at least some cases where it is valid to do CSE, or change code so it is moved out of the loop?
[Bug c/108255] New: Repeated address-of (lea) not optimized for size.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108255 Bug ID: 108255 Summary: Repeated address-of (lea) not optimized for size. Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- https://godbolt.org/z/q5sx9e49j void f(int *); int g(int of) { int x = 13; f(&x); f(&x); f(&x); f(&x); f(&x); f(&x); f(&x); f(&x); return 0; } Got: g(int): sub rsp, 24 lea rdi, [rsp+12] mov DWORD PTR [rsp+12], 13 callf(int*) lea rdi, [rsp+12] # compute, 5 bytes callf(int*) lea rdi, [rsp+12] # recompute, 5 bytes callf(int*) lea rdi, [rsp+12] # recompute, 5 bytes callf(int*) lea rdi, [rsp+12] callf(int*) lea rdi, [rsp+12] callf(int*) lea rdi, [rsp+12] callf(int*) lea rdi, [rsp+12] callf(int*) xor eax, eax add rsp, 24 ret But, note that lea is 5 bytes. Expected (generated by clang 3.0 - 15.0): g(int): # @g(int) pushrbx # extra, but just 1 byte sub rsp, 16 mov dword ptr [rsp + 12], 13 # CSE temp lea rbx, [rsp + 12] mov rdi, rbx # use callf(int*)@PLT mov rdi, rbx # reuse, 3 bytes callf(int*)@PLT mov rdi, rbx # reuse, 3 bytes callf(int*)@PLT mov rdi, rbx callf(int*)@PLT mov rdi, rbx callf(int*)@PLT mov rdi, rbx callf(int*)@PLT mov rdi, rbx callf(int*)@PLT mov rdi, rbx callf(int*)@PLT xor eax, eax add rsp, 16 pop rbx # extra, but just 1 byte ret Technically this is more instructions. But mov rdi, rbx is 3 bytes, which is shorter than 5 bytes of lea. This is at minor expense of needing to save and restore rbx. PS. Same happens when using temporary `int *const y = &x;` Also same when optimizing for size (`-Os`). It looks like gcc 4.8.5 produced expected code, but gcc 4.9.0 does not. It is possible that the code produced by gcc 4.9.0 is faster, but it is also likely it contributes quite a bit to binary size. clang uses CSE even if there are even just two uses of `&x` in the above example. 
It is likely that a slightly higher threshold (3 or 4) is actually optimal (this can be calculated knowing the encoding sizes). Weirdly though, gcc -m32 does this: g(): push ebp mov ebp, esp push ebx lea ebx, [ebp-12] sub esp, 32 mov DWORD PTR [ebp-12], 13 push ebx call f(int*) mov DWORD PTR [esp], ebx call f(int*) mov DWORD PTR [esp], ebx call f(int*) mov ebx, DWORD PTR [ebp-4] xor eax, eax leave ret Here, it does compute the address and stores it in a temporary. But it does so on the stack, instead of in a register (my guess is that there is no free register to store it in, so it is spilled), whereas in fact lea here would likely be faster (mov DWORD PTR [esp], ebx requires memory/cache access; lea is 5 bytes, but does not require memory access).
[Bug d/109221] New: std.math.floor, core.math.ldexp, std.math.poly poor inlining
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109221 Bug ID: 109221 Summary: std.math.floor, core.math.ldexp, std.math.poly poor inlining Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- Example: static float sRGB_case4(float x) { // import std.math : exp; return 1.055f * expImpl(x) - 0.055f; // expImpl not inlined by default // (inlined when using pragma(inline, true), but that fails to inline in DMD) } // pragma(inline, true) // This is borrowed from phobos/exponential.d to help gcc inline it fully. // Only T == float case is here (as some traits are private to phobos). // Also isNaN and range checks are removed, as sRGB performs own checks. static private T expImpl(T)(T x) @safe pure nothrow @nogc { //import std.math : floatTraits, RealFormat; //import std.math.traits : isNaN; //import std.math.rounding : floor; //import std.math.algebraic : poly; //import std.math.constants : LOG2E; import std.math; import core.math; static immutable T[6] P = [ 5.001201E-1, 1.665459E-1, 4.1665795894E-2, 8.3334519073E-3, 1.3981999507E-3, 1.9875691500E-4, ]; enum T C1 = 0.693359375; enum T C2 = -2.12194440e-4; // Overflow and Underflow limits. enum T OF = 88.72283905206835; enum T UF = -103.278929903431851103; // ln(2^-149) // Special cases. //if (isNaN(x)) //return x; //if (x > OF) //return real.infinity; //if (x < UF) //return 0.0; // Express: e^^x = e^^g * 2^^n // = e^^g * e^^(n * LOG2E) // = e^^(g + n * LOG2E) T xx = floor((cast(T) LOG2E) * x + cast(T) 0.5); // NOT INLINED! const int n = cast(int) xx; x -= xx * C1; x -= xx * C2; xx = x * x; x = poly(x, P) * xx + x + 1.0f; // poly is generated optimally, but not inlined // Scale by power of 2. 
x = core.math.ldexp(x, n);// NOT INLINED return x; } gdc gdc (Compiler-Explorer-Build-gcc-454a4d5041f53cd1f7d902f6c0017b7ce95b36df-binutils-2.38) 13.0.1 20230318 (experimental) gdc -O3 -march=znver2 -frelease -fbounds-check=off pure nothrow @nogc @safe float std.math.algebraic.poly!(float, float, 6).poly(float, ref const(float[6])): vmovss xmm1, DWORD PTR [rdi+20] vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+16] vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+12] vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+8] vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+4] vfmadd213ss xmm0, xmm1, DWORD PTR [rdi] ret pure nothrow @nogc @safe float example.expImpl!(float).expImpl(float): pushrbx vmovaps xmm1, xmm0 sub rsp, 16 vmovss xmm0, DWORD PTR .LC0[rip] vfmadd213ss xmm0, xmm1, DWORD PTR .LC1[rip] vmovss DWORD PTR [rsp+8], xmm1 callpure nothrow @nogc @trusted float std.math.rounding.floor(float) vmovss xmm1, DWORD PTR [rsp+8] mov edi, OFFSET FLAT:immutable(float[6]) example.expImpl!(float).expImpl(float).P vfnmadd231ssxmm1, xmm0, DWORD PTR .LC2[rip] vmovss DWORD PTR [rsp+12], xmm0 vfnmadd231ssxmm1, xmm0, DWORD PTR .LC3[rip] vmulss xmm3, xmm1, xmm1 vmovaps xmm0, xmm1 vmovss DWORD PTR [rsp+8], xmm1 vmovd ebx, xmm3 callpure nothrow @nogc @safe float std.math.algebraic.poly!(float, float, 6).poly(float, ref const(float[6])) vmovss xmm1, DWORD PTR [rsp+8] vmovd xmm4, ebx vmovss xmm2, DWORD PTR [rsp+12] vfmadd132ss xmm0, xmm1, xmm4 vaddss xmm0, xmm0, DWORD PTR .LC4[rip] add rsp, 16 pop rbx vcvttss2si edi, xmm2 jmp ldexpf float example.sRGB_case4(float): sub rsp, 8 callpure nothrow @nogc @safe float example.expImpl!(float).expImpl(float) vmovss xmm1, DWORD PTR .LC6[rip] vfmadd132ss xmm0, xmm1, DWORD PTR .LC5[rip] add rsp, 8 ret https://godbolt.org/z/YMoMPdjn5 Additionally std.math.exp itself, is never inlined by gcc. This is important, as some early checks (isNaN, OF, UF checks) in exp could be removed by proper inlining.
[Bug d/109221] std.math.floor, core.math.ldexp, std.math.poly poor inlining
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109221 --- Comment #1 from Witold Baryluk --- PS. LDC 1.23.0 - 1.32.0 produce optimal code. LDC 1.22.0 is a bit worse (due to use of x87 codegen), and 1.21 and older fail to inline `ldexp`, but still inline `poly` and `floor` perfectly.
[Bug d/109221] std.math.floor, core.math.ldexp, std.math.poly poor inlining
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109221 --- Comment #2 from Witold Baryluk --- Interestingly enough, GDC 10.2 does inline the `poly` instantiation with all the constants.
[Bug d/110113] New: gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110113 Bug ID: 110113 Summary: gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*) Product: gcc Version: 13.1.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- Created attachment 55254 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=55254&action=edit Minimized test case with dustmite Debian Linux amd64, experimental gcc-13, gdc 13.1.0-3 This is not very deterministic. Run few times to trigger. ``` user@debian:~$ cat lup.d class LUBench { } float lup(ulong , ulong , int , int = 1) { double[] solution; new LUBench; return solution[0] ; } float lup_3200(ulong iters, ulong flops) { return lup(iters, flops, 3200); } float raytrace() { struct V { float x, y, z; auto normalize() { } import std; auto cross() { } auto norm2() { } auto norm() { } auto opBinary(){ } } } user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d lup.d:11:7: error: function ‘lup.raytrace’ has no ‘return’ statement, but is expected to return a value of type ‘float’ 11 | float raytrace() { | ^ user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d lup.d:11:7: error: function ‘lup.raytrace’ has no ‘return’ statement, but is expected to return a value of type ‘float’ 11 | float raytrace() { | ^ user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d lup.d:11:7: error: function ‘lup.raytrace’ has no ‘return’ statement, but is expected to return a value of type ‘float’ 11 | float raytrace() { | ^ user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d lup.d:11:7: error: function ‘lup.raytrace’ has no ‘return’ statement, but is expected to return a value of type ‘float’ 11 | float raytrace() { | ^ user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d /usr/lib/gcc/x86_64-linux-gnu/13/include/d/std/math/algebraic.d:968:47: internal compiler error: Segmentation fault 968 | return cast(Unqual!T) (T(1) << 
bsr(val) + type); | ^ 0xd32f86 crash_signal ../../src/gcc/toplev.cc:314 0x7f53b651cf8f ??? ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 0x17f7d10 _D3dmd4root3aav15dmd_aaGetRvalueFNaNbNiPSQBnQBmQBk2AAPvZQd ../../src/gcc/d/dmd/root/aav.d:127 0x1706b25 DsymbolTable::lookup(Identifier const*) ../../src/gcc/d/dmd/dsymbol.d:2408 0x1706b25 ScopeDsymbol::search(Loc const&, Identifier*, int) ../../src/gcc/d/dmd/dsymbol.d:1470 0x17ef5b3 _D3dmd6opover15search_functionFCQBe7dsymbol12ScopeDsymbolCQCe10identifier10IdentifierZCQDhQCd7Dsymbol ../../src/gcc/d/dmd/opover.d:1435 0x1701fe0 search_toString(StructDeclaration*) ../../src/gcc/d/dmd/dstruct.d:51 0x180310a semanticTypeInfoMembers(StructDeclaration*) ../../src/gcc/d/dmd/semantic3.d:1650 0x1803394 Semantic3Visitor::visit(AggregateDeclaration*) ../../src/gcc/d/dmd/semantic3.d:1590 0x17fef19 semantic3(Dsymbol*, Scope*) ../../src/gcc/d/dmd/semantic3.d:83 0x175dc89 ExpressionSemanticVisitor::visit(DeclarationExp*) ../../src/gcc/d/dmd/expressionsem.d:5572 0x175dc89 ExpressionSemanticVisitor::visit(DeclarationExp*) ../../src/gcc/d/dmd/expressionsem.d:5407 0x175eb82 expressionSemantic(Expression*, Scope*) ../../src/gcc/d/dmd/expressionsem.d:12706 0x18096fa StatementSemanticVisitor::visit(ExpStatement*) ../../src/gcc/d/dmd/statementsem.d:207 0x18228c1 statementSemantic(Statement*, Scope*) ../../src/gcc/d/dmd/statementsem.d:149 0x18228c1 StatementSemanticVisitor::visit(CompoundStatement*) ../../src/gcc/d/dmd/statementsem.d:270 0x1809112 statementSemantic(Statement*, Scope*) ../../src/gcc/d/dmd/statementsem.d:149 0x18002a1 Semantic3Visitor::visit(FuncDeclaration*) ../../src/gcc/d/dmd/semantic3.d:598 0x17feae4 semantic3(Dsymbol*, Scope*) ../../src/gcc/d/dmd/semantic3.d:83 0x17feae4 Semantic3Visitor::visit(Module*) ../../src/gcc/d/dmd/semantic3.d:205 Please submit a full bug report, with preprocessed source (by using -freport-bug). Please include the complete backtrace with any bug report. See for instructions. 
user@debian:~$ ``` Could not reduce further, as it is sensitive to identifiers, and due to its non-deterministic nature, testing requires many repetitions.
[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110113 --- Comment #1 from Witold Baryluk --- BTW, adding a return statement in `raytrace` does not change anything: ``` user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d /usr/lib/gcc/x86_64-linux-gnu/13/include/d/std/math/algebraic.d:968:47: internal compiler error: Segmentation fault 968 | return cast(Unqual!T) (T(1) << bsr(val) + type); | ^ 0xd32f86 crash_signal ../../src/gcc/toplev.cc:314 0x7f7144273f8f ??? ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0 0x17f7d10 _D3dmd4root3aav15dmd_aaGetRvalueFNaNbNiPSQBnQBmQBk2AAPvZQd ../../src/gcc/d/dmd/root/aav.d:127 0x1706b25 DsymbolTable::lookup(Identifier const*) ../../src/gcc/d/dmd/dsymbol.d:2408 0x1706b25 ScopeDsymbol::search(Loc const&, Identifier*, int) ../../src/gcc/d/dmd/dsymbol.d:1470 ... ... ```
[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110113 --- Comment #2 from Witold Baryluk --- Also FYI, I was not able to trigger this on DMD64 D Compiler v2.104.0
[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110113 --- Comment #10 from Witold Baryluk --- Thank you, Iain. Amazing debugging skills. BTW, `import std;` was there because dustmite reduced the original import to just that. The original import was `import std.math.algebraic : sqrt;`. But you already figured this out without even using Phobos.
[Bug d/110516] New: core.volatile.volatileLoad is broken
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110516 Bug ID: 110516 Summary: core.volatile.volatileLoad is broken Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- gcc 12.2.0 (from Debian stable) and gcc trunk 14.0.0 (in godbolt) tested. core.volatile.volatileLoad simply does not work. 1) It merges loads. 2) It removes unused loads at -O1 and higher. Example: void actualRun(ubyte* ptr1) { import core.volatile : volatileLoad; volatileLoad(ptr1); volatileLoad(ptr1); volatileLoad(ptr1); volatileLoad(ptr1); } Without optimisations: void example.actualRun(ubyte*): push rbp mov rbp, rsp mov QWORD PTR [rbp-8], rdi nop pop rbp ret Incorrect. With optimisations: void example.actualRun(ubyte*): ret Incorrect. Expected: void example.actualRun(ubyte*): movzx eax, byte ptr [rdi] movzx eax, byte ptr [rdi] movzx eax, byte ptr [rdi] movzx eax, byte ptr [rdi] ret dmd and ldc behave properly. It looks like it never worked properly. It would be good to have a test case for this, so it does not become a regression later. I did not test volatileStore, but I would not be surprised if it is also broken.
[Bug d/110516] core.volatile.volatileLoad discarded if result is unused
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110516 --- Comment #8 from Witold Baryluk --- I see. Point 1 is definitely incorrect. I interpreted the assembler wrong: void example.actualRun(ubyte*): push rbp mov rbp, rsp mov QWORD PTR [rbp-8], rdi nop pop rbp ret The move there is just some stack manipulation; it has nothing to do with volatileLoad. You are right about the side effect visibility and volatileStore. Still, there should be a way to express a real memory read, with the result not stored anywhere in the program (just written to a register, then discarded). This has some (not very common) uses in memory-mapped IO, i.e. in drivers for devices where the read itself could indicate something (this of course usually also requires setting proper page table attributes to disable caching or other optimizations, etc., not just a volatile load in machine code). I do not have specific examples at hand, but afaik I saw some examples in the past (mostly on older architectures), as well as some watchdog chips that reset the timer on read. Another use is for doing memory and cache read benchmarks and profiling. We want to invoke a read (to a register) from some memory location, but we do not need the value for anything else. And a more esoteric use might be memory probing. On some low-level systems, the kernel or bootloader might not know the memory layout, and resort to just doing reads and relying on CPU fault handlers to report invalid reads. And some people might use a load without a destination as a prefetch hint, or to prefault some memory pages.
[Bug d/110516] core.volatile.volatileLoad discarded if result is unused
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110516 --- Comment #9 from Witold Baryluk --- Thank you for a quick fix, Iain!
[Bug d/113125] New: [D] internal compiler error: in make_import, at d/imports.cc:48
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113125 Bug ID: 113125 Summary: [D] internal compiler error: in make_import, at d/imports.cc:48 Product: gcc Version: 13.2.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: d Assignee: ibuclaw at gdcproject dot org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- Debian testing, amd64, gcc version 13.2.0 (Debian 13.2.0-7) meta.d: ``` module objc.meta; struct A; ``` runtime.d: ``` module objc.runtime; public import meta : A; ``` gdc -v -c -I. runtime.d ``` $ gdc -v -c -I. runtime.d Using built-in specs. COLLECT_GCC=gdc OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa OFFLOAD_TARGET_DEFAULT=1 Target: x86_64-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Debian 13.2.0-7' --with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs --enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-13 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/reproducible-path/gcc-13-13.2.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/reproducible-path/gcc-13-13.2.0/debian/tmp-gcn/usr --enable-offload-defaulted --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=3 
Thread model: posix Supported LTO compression algorithms: zlib zstd gcc version 13.2.0 (Debian 13.2.0-7) COLLECT_GCC_OPTIONS='-v' '-c' '-I' '.' '-o' 'runtime.o' '-shared-libgcc' '-mtune=generic' '-march=x86-64' /usr/libexec/gcc/x86_64-linux-gnu/13/d21 runtime.d -quiet -dumpbase runtime.d -dumpbase-ext .d -mtune=generic -march=x86-64 -version -imultiarch x86_64-linux-gnu -I . -v -o /tmp/ccPyiN0m.s GNU D (Debian 13.2.0-7) version 13.2.0 (x86_64-linux-gnu) compiled by GNU C version 13.2.0, GMP version 6.3.0, MPFR version 4.2.1, MPC version 1.3.1, isl version isl-0.26-GMP GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 binary/usr/libexec/gcc/x86_64-linux-gnu/13/d21 version v2.103.1 predefs GNU D_Version2 LittleEndian GNU_DWARF2_Exceptions GNU_StackGrowsDown GNU_InlineAsm D_LP64 D_PIC D_PIE assert D_PreConditions D_PostConditions D_Invariants D_ModuleInfo D_Exceptions D_TypeInfo all X86_64 D_HardFloat Posix linux CRuntime_Glibc CppRuntime_Gcc parse runtime importall runtime importmeta (meta.d) importobject(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/object.d) importcore.attribute (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/attribute.d) importgcc.attributes (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/gcc/attributes.d) importcore.internal.hash (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/hash.d) importcore.internal.traits (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/traits.d) importcore.internal.entrypoint (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/entrypoint.d) importcore.internal.array.appending (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/appending.d) importcore.internal.array.comparison (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/comparison.d) importcore.internal.array.equality (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/equality.d) importcore.internal.array.casting (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/casting.d) 
importcore.internal.array.concatenation (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/concatenation.d) importcore.internal.array.construction (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/construction.d) importcore.internal.array.arrayassign (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/arrayassign.d) importcore.internal.array.capacity (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/capacity.d) importcore.internal.dassert (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/dassert.d) importcore.atomic (/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/atomic.d) importcore.internal.attributes (/usr/lib/gcc/x86_64-linux-gn