from:"witold.baryluk\+gcc at gmail dot com via Gcc\-bugs"

[Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6

2021-04-25 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

Bug ID: 100257
   Summary: poor codegen with vcvtph2ps / stride of 6
   Product: gcc
   Version: 12.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c
  Assignee: unassigned at gcc dot gnu.org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

gcc (Compiler-Explorer-Build) 12.0.0 20210424 (experimental)


https://godbolt.org/z/n6ooMdnz8


This C code:

```
#include <stdint.h>
#include <string.h>
#include <immintrin.h>

struct float3 {
float f1;
float f2;
float f3;
};

struct util_format_r16g16b16_float {
   uint16_t r;
   uint16_t g;
   uint16_t b;
};

static inline struct float3 _mesa_half3_to_float3(uint16_t val_0, uint16_t
val_1, uint16_t val_2) {
#if defined(__F16C__)
  //const __m128i in = {val_0, val_1, val_2};
  //__m128 out;
  //__asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in));

  const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0);
  const __m128 out = _mm_cvtph_ps(in);

  const struct float3 r = {out[0], out[1], out[2]};
  return r;
#endif
}


void
util_format_r16g16b16_float_unpack_rgba_float(void *restrict dst_row, const
uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
const struct util_format_r16g16b16_float pixel;
memcpy(&pixel, src, sizeof pixel);

struct float3 r = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b);
dst[0] = r.f1; /* r */
dst[1] = r.f2; /* g */
dst[2] = r.f3; /* b */
dst[3] = 1; /* a */

src += 6;
dst += 4;
   }
}

```

Is compiled "poorly" by gcc, even worse when compiled on i386 (with -mf16c
enabled) when using -fPIE.

Example:


gcc -O3 -m32 -march=znver2 -mfpmath=sse -fPIE

util_format_r16g16b16_float_unpack_rgba_float:
push ebp
push edi
push esi
push ebx
sub esp, 28
mov ecx, DWORD PTR 56[esp]
mov edx, DWORD PTR 48[esp]
call __x86.get_pc_thunk.ax
add eax, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_
mov ebx, DWORD PTR 52[esp]
test ecx, ecx
je  .L8
vmovss  xmm3, DWORD PTR .LC0@GOTOFF[eax]
xor esi, esi
xor ebp, ebp
vpxor   xmm2, xmm2, xmm2
.L3:
mov eax, DWORD PTR [ebx]
vmovss  DWORD PTR 12[edx], xmm3
add ebx, 6
add edx, 16
inc esi
mov ecx, eax
vmovd   xmm0, eax
shr ecx, 16
mov edi, ecx
movzx   ecx, WORD PTR -2[ebx]
vpinsrw xmm0, xmm0, edi, 1
vmovd   xmm1, ecx
vpinsrw xmm1, xmm1, ebp, 1
vpunpckldq  xmm0, xmm0, xmm1
vpunpcklqdq xmm0, xmm0, xmm2
vcvtph2ps   xmm0, xmm0
vmovss  DWORD PTR -16[edx], xmm0
vextractps  DWORD PTR -12[edx], xmm0, 1
vextractps  DWORD PTR -8[edx], xmm0, 2
cmp DWORD PTR 56[esp], esi
jne .L3
.L8:
add esp, 28
pop ebx
pop esi
pop edi
pop ebp
ret
.LC0:
.long   1065353216
__x86.get_pc_thunk.ax:
mov eax, DWORD PTR [esp]
ret



clang:

util_format_r16g16b16_float_unpack_rgba_float: #
@util_format_r16g16b16_float_unpack_rgba_float
mov eax, dword ptr [esp + 12]
test eax, eax
je  .LBB0_3
mov ecx, dword ptr [esp + 8]
mov edx, dword ptr [esp + 4]
.LBB0_2:# =>This Inner Loop Header: Depth=1
vmovd   xmm0, dword ptr [ecx]   # xmm0 = mem[0],zero,zero,zero
vpinsrw xmm0, xmm0, word ptr [ecx + 4], 2
add ecx, 6
vcvtph2ps   xmm0, xmm0
vmovss  dword ptr [edx], xmm0
vextractps  dword ptr [edx + 4], xmm0, 1
vextractps  dword ptr [edx + 8], xmm0, 2
mov dword ptr [edx + 12], 1065353216
add edx, 16
dec eax
jne .LBB0_2
.LBB0_3:
ret


clang code is essentially optimal.


The issue persists whether I use `vcvtph2ps` directly via asm, or via intrinsics.

The issue might be the src stride of 6, instead of 8, that is confusing gcc.

Additionally, the constant 1065353216 (which is weird, I would expect it to be 0)
is stored in the data section instead of inline as an immediate. This makes the code
actually larger, and in PIE mode requires extra pointer trickery, and on -m32
even calling an extra function.

Even without -fPIE the main loop has poor codegen even on x86-64 / amd64
compared to clang or what I would consider good code.

gcc -m64 -O3 -march=native

util_format_r16g16b16_float_unpack_rgba_float:
test edx, edx
je  .L8
mov edx, edx
sal rdx, 4
vmovss  xmm3, DWORD PTR .LC0[rip]
lea rcx, [rdi+rdx]

[Bug tree-optimization/96275] Vectorizer doesn't take into account bitmask condition from branch conditions.

2020-12-27 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96275

--- Comment #3 from Witold Baryluk  ---
Thanks for looking into that. I just wanted to update that this is still
suboptimal in current gcc trunk 20201226, while clang produces superior code.

[Bug d/98457] New: [d] writef!"%s" doesn't work with MonoTime / SysTick

2020-12-27 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98457

Bug ID: 98457
   Summary: [d] writef!"%s" doesn't work with MonoTime / SysTick
   Product: gcc
   Version: 10.2.1
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

void main() {
  import std.stdio;
  import core.time : MonoTime;
  writef!"%s"(MonoTime.currTime());
}


Doesn't compile with gdc 10.2.1:

$ gdc test_monotime.d 
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/core/time.d:2405:16: error: static
variable _ticksPerSecond cannot be read at compile time
 2405 | return _ticksPerSecond[_clockIdx];
  |^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/core/time.d:2418:99: note: called
from here: ticksPerSecond()
 2418 | return "MonoTime(" ~ signedToTempString(_ticks, 10) ~ "
ticks, " ~ signedToTempString(ticksPerSecond, 10) ~ " ticks per second)";
  |
  ^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/core/time.d:2418:98: note: called
from here: signedToTempString(ticksPerSecond(), 10u)
 2418 | return "MonoTime(" ~ signedToTempString(_ticks, 10) ~ "
ticks, " ~ signedToTempString(ticksPerSecond, 10) ~ " ticks per second)";
  |
 ^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:3353:28: note: called
from here: val.toString()
 3353 | put(w, val.toString());
  |^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:3353:12: note: called
from here: put(w, val.toString())
 3353 | put(w, val.toString());
  |^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:3672:21: note: called
from here: formatObject(w, val, f)
 3672 | formatObject(w, val, f);
  | ^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:568:28: note: called
from here: formatValue(w, _param_2, spec)
  568 | formatValue(w, args[i], spec);
  |^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:5767:28: note: called
from here: formattedWrite(w, fmt, _param_1)
 5767 | auto n = formattedWrite(w, fmt, args);
  |^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:5729:16: note: called
from here: format("%s", MonoTimeImpl(0L))
 5729 | .format(fmt, Args.init);
  |^
/usr/lib/gcc/x86_64-linux-gnu/10/include/d/std/format.d:5733:2: note: called
from here: (*function () => null)()
 5733 | }();
  |  ^

(null):0: confused by earlier errors, bailing out



Manually adding .toString() makes it work (at the expense of a possible extra
allocation).

No issues in ldc2 1.24.0 or dmd2 2.095.0-beta.1

It doesn't look like an issue in phobos, but something deeper.

[Bug d/98457] [d] writef!"%s" doesn't work with MonoTime / SysTick

2020-12-27 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98457

--- Comment #1 from Witold Baryluk  ---
Godbolt link: https://godbolt.org/z/q3bzhP

with gcc trunk 20201227 and a bit more diagnostics

/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/core/time.d:2405:16:
error: static variable _ticksPerSecond cannot be read at compile time
 2405 | return _ticksPerSecond[_clockIdx];
  |^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/core/time.d:2418:99:
note: called from here: ticksPerSecond()
 2418 | return "MonoTime(" ~ signedToTempString(_ticks, 10) ~ "
ticks, " ~ signedToTempString(ticksPerSecond, 10) ~ " ticks per second)";
  |
  ^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/core/time.d:2418:98:
note: called from here: signedToTempString(ticksPerSecond(), 10u)
 2418 | return "MonoTime(" ~ signedToTempString(_ticks, 10) ~ "
ticks, " ~ signedToTempString(ticksPerSecond, 10) ~ " ticks per second)";
  |
 ^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:3353:28:
note: called from here: val.toString()
 3353 | put(w, val.toString());
  |^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:3353:12:
note: called from here: put(w, val.toString())
 3353 | put(w, val.toString());
  |^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:3672:21:
note: called from here: formatObject(w, val, f)
 3672 | formatObject(w, val, f);
  | ^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:568:28:
note: called from here: formatValue(w, _param_2, spec)
  568 | formatValue(w, args[i], spec);
  |^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:5767:28:
note: called from here: formattedWrite(w, fmt, _param_1)
 5767 | auto n = formattedWrite(w, fmt, args);
  |^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:5729:16:
note: called from here: format("%s", MonoTimeImpl(0L))
 5729 | .format(fmt, Args.init);
  |^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/format.d:5733:2:
note: called from here: (*function () => null)()
 5733 | }();
  |  ^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/stdio.d:3754:15:
error: template instance std.format.checkFormatException!("�}�",
MonoTimeImpl!cast(ClockType)0) error instantiating
 3754 | alias e = checkFormatException!(fmt, A);
  |   ^
:4:14: note: instantiated from here: writef!("%s",
MonoTimeImpl!cast(ClockType)0)
4 |   writef!"%s"(MonoTime.currTime());
  |  ^
/opt/compiler-explorer/gcc-trunk-20201227/lib/gcc/x86_64-linux-gnu/11.0.0/include/d/std/stdio.d:3755:5:
note: while evaluating: static assert(!e)
 3755 | static assert(!e, e.msg);
  | ^
Compiler returned: 1

[Bug d/98494] New: libphobos: std.process Config.stderrPassThrough missing

2020-12-31 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98494

Bug ID: 98494
   Summary: libphobos: std.process Config.stderrPassThrough
missing
   Product: gcc
   Version: 10.2.1
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

It appears that gdc version of libphobos is somehow lagging in some aspects
behind upstream.

One of the things I see missing, is `Config.stderrPassThrough` in std.process.
I see it was added upstream about 12 months ago:

enum Config {
...
/**
By default, the $(LREF execute) and $(LREF executeShell) functions
will capture child processes' both stdout and stderr. This can be
undesirable if the standard output is to be processed or otherwise
used by the invoking program, as `execute`'s result would then
contain a mix of output and warning/error messages.

Specify this flag when calling `execute` or `executeShell` to
cause invoked processes' stderr stream to be sent to $(REF stderr,
std,stdio), and only capture and return standard output.

This flag has no effect on $(LREF spawnProcess) or $(LREF spawnShell).
*/
stderrPassThrough = 128,
}

The implementation usage of this is relatively small and easy to backport:

in executeImpl:

-auto p = pipeFunc(commandLine, Redirect.stdout | Redirect.stderrToStdout,
-  env, config, workDir, extraArgs);
+auto redirect = (config & Config.stderrPassThrough)
+? Redirect.stdout
+: Redirect.stdout | Redirect.stderrToStdout;
+
+auto p = pipeFunc(commandLine, redirect,
+  env, config, workDir, extraArgs);



There are some other minor changes there, but nothing functionally significant.
Mostly unittests and minor signature changes (adding `scope` to many input
parameters).

Thank you.

[Bug d/100769] New: [D] memcmp() == 0 for small constant strings not folded

2021-05-26 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769

Bug ID: 100769
   Summary: [D] memcmp() == 0 for small constant strings not
folded
   Product: gcc
   Version: 10.2.1
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

I expect this D code to be quite optimal, but it isn't.

```
extern(C) int memcmp(const void *s1, const void *s2, size_t n);

int recognize3(const char* s) {
return memcmp(s, "stract class", 12) == 0;
}
```

https://godbolt.org/z/vx17WK9rs


It produces a call to memcmp, instead of inlining and specializing the code for
this specific case.

int example.recognize3(const(char*)):
sub rsp, 8
mov edx, 12
mov esi, OFFSET FLAT:.LC0
call memcmp
test eax, eax
sete al
add rsp, 8
movzx   eax, al
ret



ldc2 1.24.0 (for D) and clang 11.0.1-2 (for C and C++), and gcc 10.2.1 (for C
and C++) produce close to optimal codes. Similarly ldc2 1.26.0 (for D), and gcc
11.1 (for C and C++):

int example.recognize3(const(char*)):
movabs  rcx, 7142836979195081843
xor rcx, qword ptr [rdi]
mov edx, dword ptr [rdi + 8]
xor rdx, 1936941420
xor eax, eax
or  rdx, rcx
sete al
ret

and

recognize3:
movabs  rax, 7142836979195081843
cmp QWORD PTR [rdi], rax
je  .L6
.L2:
mov eax, 1
xor eax, 1
ret
.L6:
xor eax, eax
cmp DWORD PTR [rdi+8], 1936941420
jne .L2
xor eax, 1
ret


Notice how gcc, clang and ldc2 all compare the first 8 bytes of input, then 4
bytes of input. clang and ldc2 just xor/or the result, then return, with no
conditional jumps. gcc does a bit poorer, with more conditionals and more
jumps, but it is still pretty good and uses the same idea.

gdc, however, calls the generic memcmp, which does looping and has about 12
jumps and/or 13 exits.

[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded

2021-05-26 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769

--- Comment #1 from Witold Baryluk  ---
A typo in the example (godbolt is good), I forgot the `.ptr`:

extern(C) int memcmp(const void *s1, const void *s2, size_t n);

int recognize3(const char* s) {
return memcmp(s, "stract class".ptr, 12) == 0;
}

casting to ubyte*, or void*, doesn't change anything really.

options: -O3 -frelease -fno-semantic-interposition 

tested on amd64, Debian / Linux.

[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded

2021-05-26 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769

--- Comment #2 from Witold Baryluk  ---
Hmm. It appears that using `import core.stdc.string : memcmp;` actually
resolves the problem. It looks like my manual declaration of memcmp for some
reason disabled optimisations for memcmp.

[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded

2021-05-26 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769

Witold Baryluk  changed:

   What|Removed |Added

 Status|UNCONFIRMED |RESOLVED
 Resolution|--- |FIXED

--- Comment #4 from Witold Baryluk  ---
Ok. That makes sense. Thanks.

[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded

2021-05-26 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100769

Witold Baryluk  changed:

   What|Removed |Added

 Resolution|FIXED   |INVALID

[Bug d/105360] New: Inlined lazy parameters / delegate literals, still emitted

2022-04-23 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105360

Bug ID: 105360
   Summary: Inlined lazy parameters / delegate literals, still
emitted
   Product: gcc
   Version: 12.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

```
extern bool g();
extern void f(int n);

void log(lazy int num) {
if (g()) {
const n = num();
f(n);
}
}

void p(int n) {
log(n * 137);
}
```


This should emit the same (or close to the same) as code with no `lazy` (and
num reference changed accordingly) on `log` function. (Because compiler knows
that `num ` is called once, has no side effects, is moderately expensive, etc).

And the code for p is exactly the same - log and `n * 137` fully inlined.

However, the anonymous dgliteral code is still emitted, despite not being
referenced anywhere:

```
pure nothrow @nogc @safe int example.p(int).__dgliteral2():  #   < This should
not be in object file
imul eax, DWORD PTR [rdi], 137
ret
```


Rest of the object file is correct and optimal:

```
void example.log(lazy int):
push rbp
push rbx
mov rbp, rdi
mov rbx, rsi
sub rsp, 8
call bool example.g()
test al, al
je  .L3
mov rdi, rbp
call rbx
add rsp, 8
pop rbx
pop rbp
mov edi, eax
jmp void example.f(int)
.L3:
add rsp, 8
pop rbx
pop rbp
ret
void example.p(int):
push rbx
mov ebx, edi
call bool example.g()
test al, al
je  .L6
imul edi, ebx, 137
pop rbx
jmp void example.f(int)
.L6:
pop rbx
ret
```


gdc
(Compiler-Explorer-Build-gcc-748d46cd049c89a799f99f14547267ebae915af6-binutils-2.36.1)
12.0.1 20220421 (experimental)  via godbolt.org


For a code passing reasonably big literals, this can lead to object file code
duplication.

ldc2 shows no such problem.

[Bug d/105360] Inlined lazy parameters / delegate literals, still emitted

2022-04-23 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105360

--- Comment #1 from Witold Baryluk  ---
https://godbolt.org/z/c8oT6E4cf

[Bug d/105413] New: gdc extended assembler cannot constraints r8 - r15

2022-04-27 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105413

Bug ID: 105413
   Summary: gdc extended assembler cannot constraints r8 - r15
   Product: gcc
   Version: 12.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

gcc in C does not support directly register constraints for x86_64 registers r8
- r15.

In C this can be done however using local register variables and asm
attributes.

https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html

There is no way to use this in GDC extended assembler.

version (linux) {
version (GNU) {

enum SYSCALL {
  OPENAT = 56,
}

@nogc:
nothrow:

size_t syscall(SYSCALL ident)(size_t arg1, size_t arg2, size_t arg3, size_t
arg4) {
version (X86_64) {
   asm @nogc nothrow {
 "syscall"
 // output:
 : "=a" (arg1)
 // inputs:
 : "a" (ident),  // rax - syscall number
   "D" (arg1),   // rdi - arg1
   "S" (arg2),   // rsi - arg2
   "d" (arg3),   // rdx - arg3
   "r10" (arg4),  // r10 - arg4
   "m"( *cast(ubyte*)arg1)   // "dummy" input instead of full memory
clobber
 // clobers
 : "c", "r11";  // Clobers rax, and rcx and r11.
   }
   return arg1;
   } else {
   static assert(false, "This platform/architecture is not supported when
using GDC compiler");
   } 
}

}

private int openatdummy() @nogc nothrow {
  return cast(int)syscall!(SYSCALL.OPENAT)(0, 0, 0, 0);
}

}



myio.d: In function ‘syscall’:
myio.d:232:10: error: matching constraint references invalid operand number
  232 |  ;



https://godbolt.org/z/xGzxa6orc

[Bug d/105413] gdc extended assembler cannot constraints r8 - r15

2022-10-08 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105413

--- Comment #3 from Witold Baryluk  ---
It works. Thank you.

Any chance this will be in gcc 12.x? I work a lot on Debian Linux, and I doubt
I will have gcc trunk or gcc 13 available any time soon.


Also weirdly gcc does not inline this function, unless I add
@attribute("always_inline") on syscall, or @attribute("flatten") on
openatdummy.

[Bug d/107241] New: std.bitmanip.bigEndianToNative et al not inlined

2022-10-12 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107241

Bug ID: 107241
   Summary: std.bitmanip.bigEndianToNative et al not inlined
   Product: gcc
   Version: 12.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

gdc fails to inline a number of small functions that should fully inline and end
up as a single instruction.

on amd64 / x86, for example std.bitmanip.bigEndianToNative causes a chain of
calls / jumps, even with @attribute("flatten")



import std.bitmanip;
import gcc.attributes;

@attribute("flatten")
size_t f(char[] b) {
return std.bitmanip.bigEndianToNative!(size_t,
8)(cast(ubyte[8])(b[2..10]));
}




gcc -O3 -march=znver2 -frelease


pure nothrow @nogc @safe ulong
std.bitmanip.swapEndian!(ulong).swapEndian(const(ulong)):
mov rax, rdi
bswap   rax
ret
pure nothrow @nogc @safe ulong std.bitmanip.endianToNativeImpl!(true, ulong,
8uL).endianToNativeImpl(ubyte[8]):
jmp pure nothrow @nogc @safe ulong
std.bitmanip.swapEndian!(ulong).swapEndian(const(ulong))
pure nothrow @nogc @safe ulong std.bitmanip.bigEndianToNative!(ulong,
8uL).bigEndianToNative(ubyte[8]):
jmp pure nothrow @nogc @safe ulong
std.bitmanip.endianToNativeImpl!(true, ulong, 8uL).endianToNativeImpl(ubyte[8])
ulong example.f(char[]):
mov rdi, QWORD PTR [rsi+2]
jmp pure nothrow @nogc @safe ulong
std.bitmanip.bigEndianToNative!(ulong, 8uL).bigEndianToNative(ubyte[8])




No issues with LDC.

ulong example.f(char[]):
mov rax, qword ptr [rsi + 2]
bswap   rax
ret



godbolt: https://godbolt.org/z/Pj3f7oGso

[Bug c++/103966] New: std::atomic relaxed load, inc, store sub-optimal codegen

2022-01-10 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103966

Bug ID: 103966
   Summary: std::atomic relaxed load, inc, store sub-optimal
codegen
   Product: gcc
   Version: 12.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c++
  Assignee: unassigned at gcc dot gnu.org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

Both functions below, should compile to the same assembly on x86:

#include <atomic>
#include <cstdint>

uint64_t x;
void inc_a() {
x++;
}

std::atomic<uint64_t> y;

void inc_b_non_atomic() {
y.store(y.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
}


and it does so in clang.

It does not in gcc 12 (and earlier).

https://godbolt.org/z/GcM67xz8T



This pattern is very popular in approximate statistical counters / metrics,
where the flow of information is unidirectional (i.e. from one thread that does
updates, to another thread that only reads the counters), and its performance
is critical in many codebases.

[Bug c++/103966] std::atomic relaxed load, inc, store sub-optimal codegen

2022-01-10 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103966

--- Comment #1 from Witold Baryluk  ---
Current codegen on gcc 12 on 64-bit x86:

inc_a():
inc QWORD PTR x[rip]
ret
inc_b_non_atomic():
mov rax, QWORD PTR y[rip]
inc rax
mov QWORD PTR y[rip], rax
ret
y:
.zero   8
x:
.zero   8

[Bug c++/103966] std::atomic relaxed load, inc, store sub-optimal codegen

2022-01-10 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103966

--- Comment #2 from Witold Baryluk  ---
Similarly, dec, add, sub are affected, as well as mul.

Example:

#include <atomic>
#include <cstdint>

uint64_t x;
void add_a() {
x += 5;
}

std::atomic<uint64_t> y;

void add_b_non_atomic() {
y.store(y.load(std::memory_order_relaxed) + 5, std::memory_order_relaxed);
}



Producing:

add_a():
add QWORD PTR x[rip], 5
ret
add_b_non_atomic():
mov rax, QWORD PTR y[rip]
add rax, 5
mov QWORD PTR y[rip], rax
ret
y:
.zero   8
x:
.zero   8

[Bug middle-end/35560] Missing CSE/PRE for memory operations involved in virtual call.

2022-12-30 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=35560

Witold Baryluk  changed:

   What|Removed |Added

 CC||witold.baryluk+gcc at gmail 
dot co
   ||m

--- Comment #15 from Witold Baryluk  ---
I know this is a pretty old bug, but I was exploring some assembly of gcc and
clang on godbolt, and also stumbled into the same issue.

https://godbolt.org/z/qPzMhWse1

class A {
public:
virtual int f7(int x) const;
};

int g(const A * const a, int x) {
int r = 0;
for (int i = 0; i < 1; i++)
r += a->f7(x);
return r;
}

(same happens without loop, when just calling a->f7 multiple times)



g(A const*, int):
push r13
mov r13d, esi
push r12
xor r12d, r12d
push rbp
mov rbp, rdi
push rbx
mov ebx, 1
sub rsp, 8
.L2:
mov rax, QWORD PTR [rbp+0]   # a vtable deref
mov esi, r13d
mov rdi, rbp
call [QWORD PTR [rax]]  # f7 indirect call
add r12d, eax
dec ebx
jne .L2

add rsp, 8
pop rbx
pop rbp
mov eax, r12d
pop r12
pop r13
ret


I was expecting `mov rax, QWORD PTR [rbp+0]` and `call [QWORD PTR [rax]]`
to be hoisted out of the loop (call converted to lea, and call register).


A bit sad.

Is there some recent work done on this optimization?

Are there at least some cases where it is valid to do CSE, or change code so it
is moved out of the loop?

[Bug c/108255] New: Repeated address-of (lea) not optimized for size.

2022-12-30 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108255

Bug ID: 108255
   Summary: Repeated address-of (lea) not optimized for size.
   Product: gcc
   Version: 13.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c
  Assignee: unassigned at gcc dot gnu.org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

https://godbolt.org/z/q5sx9e49j


void f(int *);

int g(int of) {
int x = 13;
f(&x);
f(&x);
f(&x);
f(&x);
f(&x);
f(&x);
f(&x);
f(&x);
return 0;
}


Got:

g(int):
sub rsp, 24
lea rdi, [rsp+12]
mov DWORD PTR [rsp+12], 13
call f(int*)
lea rdi, [rsp+12] # compute, 5 bytes
call f(int*)
lea rdi, [rsp+12] # recompute, 5 bytes
call f(int*)
lea rdi, [rsp+12] # recompute, 5 bytes
call f(int*)
lea rdi, [rsp+12]
call f(int*)
lea rdi, [rsp+12]
call f(int*)
lea rdi, [rsp+12]
call f(int*)
lea rdi, [rsp+12]
call f(int*)
xor eax, eax
add rsp, 24
ret


But, note that lea is 5 bytes.

Expected (generated by clang 3.0 - 15.0):

g(int):  # @g(int)
push rbx  # extra, but just 1 byte
sub rsp, 16
mov dword ptr [rsp + 12], 13 # CSE temp
lea rbx, [rsp + 12]
mov rdi, rbx # use
call f(int*)@PLT
mov rdi, rbx # reuse, 3 bytes
call f(int*)@PLT
mov rdi, rbx # reuse, 3 bytes
call f(int*)@PLT
mov rdi, rbx
call f(int*)@PLT
mov rdi, rbx
call f(int*)@PLT
mov rdi, rbx
call f(int*)@PLT
mov rdi, rbx
call f(int*)@PLT
mov rdi, rbx
call f(int*)@PLT
xor eax, eax
add rsp, 16
pop rbx  # extra, but just 1 byte
ret


Technically this is more instructions.

But

mov rdi, rbx is 3 bytes, which is shorter than 5 bytes of lea. This is at minor
expense of needing to save and restore rbx.

PS. Same happens when using temporary `int *const y = &x;`

Also same when optimizing for size (`-Os`).

It looks like gcc 4.8.5 produced expected code, but gcc 4.9.0 does not.

It is possible that the code produced by gcc 4.9.0 is faster, but it is also
likely it contributes quite a bit to binary size.

clang uses CSE even if there are just two uses of `&x` in the above
example. It is likely that a slightly higher threshold (3 or 4) is actually optimal
(this can be calculated knowing the encoding sizes).


Weirdly tho, gcc -m32 does this:

g():
push ebp
mov ebp, esp
push ebx
lea ebx, [ebp-12]
sub esp, 32
mov DWORD PTR [ebp-12], 13
push ebx
call f(int*)
mov DWORD PTR [esp], ebx
call f(int*)
mov DWORD PTR [esp], ebx
call f(int*)
mov ebx, DWORD PTR [ebp-4]
xor eax, eax
leave
ret

Here it does compute the address and store it in a temporary, but it does so on
the stack instead of in a register (my guess is that there is no free register to
store it, so it is spilled). In fact, lea would likely be faster here:
`mov DWORD PTR [esp], ebx` requires a memory/cache access, whereas lea is 5 bytes
but does not require a memory access.

[Bug d/109221] New: std.math.floor, core.math.ldexp, std.math.poly poor inlining

2023-03-20 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109221

Bug ID: 109221
   Summary: std.math.floor, core.math.ldexp, std.math.poly poor
inlining
   Product: gcc
   Version: 13.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

Example:

static float sRGB_case4(float x) {
// import std.math : exp;
return 1.055f * expImpl(x) - 0.055f;  // expImpl not inlined by default
// (inlined when using pragma(inline, true), but that fails to inline in
DMD)
}


// pragma(inline, true)
// This is borrowed from phobos/exponential.d to help gcc inline it fully.
// Only T == float case is here (as some traits are private to phobos).
// Also isNaN and range checks are removed, as sRGB performs own checks.
static private T expImpl(T)(T x) @safe pure nothrow @nogc
{
//import std.math : floatTraits, RealFormat;
//import std.math.traits : isNaN;
//import std.math.rounding : floor;
//import std.math.algebraic : poly;
//import std.math.constants : LOG2E;
import std.math;
import core.math;

static immutable T[6] P = [
5.001201E-1,
1.665459E-1,
4.1665795894E-2,
8.3334519073E-3,
1.3981999507E-3,
1.9875691500E-4,
];

enum T C1 = 0.693359375;
enum T C2 = -2.12194440e-4;

// Overflow and Underflow limits.
enum T OF = 88.72283905206835;
enum T UF = -103.278929903431851103; // ln(2^-149)

// Special cases.
//if (isNaN(x))
//return x;
//if (x > OF)
//return real.infinity;
//if (x < UF)
//return 0.0;

// Express: e^^x = e^^g * 2^^n
//   = e^^g * e^^(n * LOG2E)
//   = e^^(g + n * LOG2E)
T xx = floor((cast(T) LOG2E) * x + cast(T) 0.5);   // NOT INLINED!
const int n = cast(int) xx;
x -= xx * C1;
x -= xx * C2;

xx = x * x;
x = poly(x, P) * xx + x + 1.0f; // poly is generated optimally, but
not inlined

// Scale by power of 2.
x = core.math.ldexp(x, n);// NOT INLINED

return x;
}


gdc gdc
(Compiler-Explorer-Build-gcc-454a4d5041f53cd1f7d902f6c0017b7ce95b36df-binutils-2.38)
13.0.1 20230318 (experimental)
gdc -O3 -march=znver2 -frelease -fbounds-check=off


pure nothrow @nogc @safe float std.math.algebraic.poly!(float, float,
6).poly(float, ref const(float[6])):
vmovss  xmm1, DWORD PTR [rdi+20]
vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+16]
vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+12]
vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+8]
vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+4]
vfmadd213ss xmm0, xmm1, DWORD PTR [rdi]
ret
pure nothrow @nogc @safe float example.expImpl!(float).expImpl(float):
push rbx
vmovaps xmm1, xmm0
sub rsp, 16
vmovss  xmm0, DWORD PTR .LC0[rip]
vfmadd213ss xmm0, xmm1, DWORD PTR .LC1[rip]
vmovss  DWORD PTR [rsp+8], xmm1
call pure nothrow @nogc @trusted float
std.math.rounding.floor(float)
vmovss  xmm1, DWORD PTR [rsp+8]
mov edi, OFFSET FLAT:immutable(float[6])
example.expImpl!(float).expImpl(float).P
vfnmadd231ss xmm1, xmm0, DWORD PTR .LC2[rip]
vmovss  DWORD PTR [rsp+12], xmm0
vfnmadd231ss xmm1, xmm0, DWORD PTR .LC3[rip]
vmulss  xmm3, xmm1, xmm1
vmovaps xmm0, xmm1
vmovss  DWORD PTR [rsp+8], xmm1
vmovd   ebx, xmm3
call pure nothrow @nogc @safe float std.math.algebraic.poly!(float,
float, 6).poly(float, ref const(float[6]))
vmovss  xmm1, DWORD PTR [rsp+8]
vmovd   xmm4, ebx
vmovss  xmm2, DWORD PTR [rsp+12]
vfmadd132ss xmm0, xmm1, xmm4
vaddss  xmm0, xmm0, DWORD PTR .LC4[rip]
add rsp, 16
pop rbx
vcvttss2si  edi, xmm2
jmp ldexpf
float example.sRGB_case4(float):
sub rsp, 8
call    pure nothrow @nogc @safe float
example.expImpl!(float).expImpl(float)
vmovss  xmm1, DWORD PTR .LC6[rip]
vfmadd132ss xmm0, xmm1, DWORD PTR .LC5[rip]
add rsp, 8
ret


https://godbolt.org/z/YMoMPdjn5


Additionally

std.math.exp itself is never inlined by gcc. This is important, as some early
checks (isNaN, OF, UF checks) in exp could be removed by proper inlining.

[Bug d/109221] std.math.floor, core.math.ldexp, std.math.poly poor inlining

2023-03-20 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109221

--- Comment #1 from Witold Baryluk  ---
PS. LDC 1.23.0 - 1.32.0 produce optimal code. LDC 1.22.0 is a bit worse (due to
use of x87 codegen), and 1.21 and older fail to inline `ldexp`, but still
inline `poly` and `floor` perfectly.

[Bug d/109221] std.math.floor, core.math.ldexp, std.math.poly poor inlining

2023-03-20 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109221

--- Comment #2 from Witold Baryluk  ---
Interestingly enough, GDC 10.2 does inline the `poly` instantiation with all the
constants.

[Bug d/110113] New: gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)

2023-06-04 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110113

Bug ID: 110113
   Summary: gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127
dmd_aaGetRvalue from DsymbolTable::lookup(Identifier
const*)
   Product: gcc
   Version: 13.1.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

Created attachment 55254
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=55254&action=edit
Minimized test case with dustmite

Debian Linux amd64, experimental gcc-13, gdc 13.1.0-3


This is not very deterministic. Run it a few times to trigger the crash.

```
user@debian:~$ cat lup.d
class LUBench {
}
float lup(ulong , ulong , int , int = 1) {
double[] solution;
new LUBench;
return solution[0] ;
}
float lup_3200(ulong iters, ulong flops) {
return lup(iters, flops, 3200);
}
float raytrace() {
struct V {
float x, y, z;
auto normalize() {
}
import std;
auto cross() {
}
auto norm2() {
}
auto norm() {
}
auto opBinary(){
}
}
}
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
lup.d:11:7: error: function ‘lup.raytrace’ has no ‘return’ statement, but is
expected to return a value of type ‘float’
   11 | float raytrace() {
  |   ^
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
lup.d:11:7: error: function ‘lup.raytrace’ has no ‘return’ statement, but is
expected to return a value of type ‘float’
   11 | float raytrace() {
  |   ^
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
lup.d:11:7: error: function ‘lup.raytrace’ has no ‘return’ statement, but is
expected to return a value of type ‘float’
   11 | float raytrace() {
  |   ^
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
lup.d:11:7: error: function ‘lup.raytrace’ has no ‘return’ statement, but is
expected to return a value of type ‘float’
   11 | float raytrace() {
  |   ^
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
/usr/lib/gcc/x86_64-linux-gnu/13/include/d/std/math/algebraic.d:968:47:
internal compiler error: Segmentation fault
  968 | return cast(Unqual!T) (T(1) << bsr(val) + type);
  |   ^
0xd32f86 crash_signal
../../src/gcc/toplev.cc:314
0x7f53b651cf8f ???
./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0
0x17f7d10 _D3dmd4root3aav15dmd_aaGetRvalueFNaNbNiPSQBnQBmQBk2AAPvZQd
../../src/gcc/d/dmd/root/aav.d:127
0x1706b25 DsymbolTable::lookup(Identifier const*)
../../src/gcc/d/dmd/dsymbol.d:2408
0x1706b25 ScopeDsymbol::search(Loc const&, Identifier*, int)
../../src/gcc/d/dmd/dsymbol.d:1470
0x17ef5b3
_D3dmd6opover15search_functionFCQBe7dsymbol12ScopeDsymbolCQCe10identifier10IdentifierZCQDhQCd7Dsymbol
../../src/gcc/d/dmd/opover.d:1435
0x1701fe0 search_toString(StructDeclaration*)
../../src/gcc/d/dmd/dstruct.d:51
0x180310a semanticTypeInfoMembers(StructDeclaration*)
../../src/gcc/d/dmd/semantic3.d:1650
0x1803394 Semantic3Visitor::visit(AggregateDeclaration*)
../../src/gcc/d/dmd/semantic3.d:1590
0x17fef19 semantic3(Dsymbol*, Scope*)
../../src/gcc/d/dmd/semantic3.d:83
0x175dc89 ExpressionSemanticVisitor::visit(DeclarationExp*)
../../src/gcc/d/dmd/expressionsem.d:5572
0x175dc89 ExpressionSemanticVisitor::visit(DeclarationExp*)
../../src/gcc/d/dmd/expressionsem.d:5407
0x175eb82 expressionSemantic(Expression*, Scope*)
../../src/gcc/d/dmd/expressionsem.d:12706
0x18096fa StatementSemanticVisitor::visit(ExpStatement*)
../../src/gcc/d/dmd/statementsem.d:207
0x18228c1 statementSemantic(Statement*, Scope*)
../../src/gcc/d/dmd/statementsem.d:149
0x18228c1 StatementSemanticVisitor::visit(CompoundStatement*)
../../src/gcc/d/dmd/statementsem.d:270
0x1809112 statementSemantic(Statement*, Scope*)
../../src/gcc/d/dmd/statementsem.d:149
0x18002a1 Semantic3Visitor::visit(FuncDeclaration*)
../../src/gcc/d/dmd/semantic3.d:598
0x17feae4 semantic3(Dsymbol*, Scope*)
../../src/gcc/d/dmd/semantic3.d:83
0x17feae4 Semantic3Visitor::visit(Module*)
../../src/gcc/d/dmd/semantic3.d:205
Please submit a full bug report, with preprocessed source (by using
-freport-bug).
Please include the complete backtrace with any bug report.
See  for instructions.
user@debian:~$ 
```


I could not reduce it further, as it is sensitive to identifiers, and due to its
non-deterministic nature, testing requires many repetitions.

[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)

2023-06-04 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110113

--- Comment #1 from Witold Baryluk  ---
BTW, adding a return statement in `raytrace` does not change anything:

```
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
user@debian:~$ gdc-13 -c -fpreview=dip1021 lup.d
/usr/lib/gcc/x86_64-linux-gnu/13/include/d/std/math/algebraic.d:968:47:
internal compiler error: Segmentation fault
  968 | return cast(Unqual!T) (T(1) << bsr(val) + type);
  |   ^
0xd32f86 crash_signal
../../src/gcc/toplev.cc:314
0x7f7144273f8f ???
./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0
0x17f7d10 _D3dmd4root3aav15dmd_aaGetRvalueFNaNbNiPSQBnQBmQBk2AAPvZQd
../../src/gcc/d/dmd/root/aav.d:127
0x1706b25 DsymbolTable::lookup(Identifier const*)
../../src/gcc/d/dmd/dsymbol.d:2408
0x1706b25 ScopeDsymbol::search(Loc const&, Identifier*, int)
../../src/gcc/d/dmd/dsymbol.d:1470
...
...
```

[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)

2023-06-04 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110113

--- Comment #2 from Witold Baryluk  ---
Also FYI, I was not able to trigger this on DMD64 D Compiler v2.104.0

[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)

2023-06-11 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110113

--- Comment #10 from Witold Baryluk  ---
Thank you Iain. Amazing debugging skills.

BTW, `import std;` is there because dustmite reduced the original import to just
that. The original import was `import std.math.algebraic : sqrt;`

But you already figured this out without even using Phobos.

[Bug d/110516] New: core.volatile.volatileLoad is broken

2023-07-01 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110516

Bug ID: 110516
   Summary: core.volatile.volatileLoad is broken
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

gcc 12.2.0 (from Debian stable) and gcc trunk 14.0.0 (in godbolt) tested.

core.volatile.volatileLoad simply does not work.

1) It merges loads.
2) It removes unused loads at -O1 and higher.

Example:

void actualRun(ubyte* ptr1) {
  import core.volatile : volatileLoad;
  volatileLoad(ptr1);
  volatileLoad(ptr1);
  volatileLoad(ptr1);
  volatileLoad(ptr1);
}


Without optimisations:

void example.actualRun(ubyte*):
push    rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], rdi
nop
pop rbp
ret


Incorrect.



With optimisations:

void example.actualRun(ubyte*):
ret

Incorrect.


Expected:

void example.actualRun(ubyte*):
movzx   eax, byte ptr [rdi]
movzx   eax, byte ptr [rdi]
movzx   eax, byte ptr [rdi]
movzx   eax, byte ptr [rdi]
ret



dmd and ldc behave properly.


It looks like it never worked properly.

It would be good to have a test case for this, so it does not become a
regression later.


I did not test volatileStore, but I would not be surprised if it is also broken.

[Bug d/110516] core.volatile.volatileLoad discarded if result is unused

2023-07-01 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110516

--- Comment #8 from Witold Baryluk  ---
I see.

Point 1 is definitely incorrect. I interpreted the assembler wrong:

void example.actualRun(ubyte*):
push    rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], rdi
nop
pop rbp
ret


The move there is just some stack manipulation; it has nothing to do with
volatileLoad.



You are right about the side effect visibility and volatileStore.

Still, there should be a way to express a real memory read, with the result not
stored anywhere in the program (just written to a register, then discarded).

This has some (not very common) uses in memory-mapped IO, i.e. in drivers for
devices where the read itself could indicate something (this of course usually
also requires setting proper page table attributes to disable caching or other
optimizations, etc, not just a volatile load in machine code). I do not have
specific examples at hand, but afaik I saw some examples in the past (mostly on
older architectures), as well as some watchdog chips that reset a timer on read.

Another use is for doing memory and cache read benchmarks and profiling. We
want to invoke a read (to a register) from some memory location, but we do not
need the value for anything else.

A more esoteric use might be memory probing. On some low-level systems, the
kernel or bootloader might not know the memory layout, and resort to just doing
reads, relying on CPU fault handlers to report invalid reads.

And some people might use a load without a destination as a prefetch hint, or to
prefault some memory pages.

[Bug d/110516] core.volatile.volatileLoad discarded if result is unused

2023-07-01 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110516

--- Comment #9 from Witold Baryluk  ---
Thank you for a quick fix Iain!

[Bug d/113125] New: [D] internal compiler error: in make_import, at d/imports.cc:48

2023-12-23 Thread witold.baryluk+gcc at gmail dot com via Gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113125

Bug ID: 113125
   Summary: [D] internal compiler error: in make_import, at
d/imports.cc:48
   Product: gcc
   Version: 13.2.1
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: d
  Assignee: ibuclaw at gdcproject dot org
  Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

Debian testing, amd64, gcc version 13.2.0 (Debian 13.2.0-7) 


meta.d:

```
module objc.meta;
struct A;
```


runtime.d:

```
module objc.runtime;
public import meta : A;
```


gdc -v -c -I. runtime.d

```
$ gdc -v -c -I. runtime.d 
Using built-in specs.
COLLECT_GCC=gdc
OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 13.2.0-7'
--with-bugurl=file:///usr/share/doc/gcc-13/README.Bugs
--enable-languages=c,ada,c++,go,d,fortran,objc,obj-c++,m2 --prefix=/usr
--with-gcc-major-version-only --program-suffix=-13
--program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id
--libexecdir=/usr/libexec --without-included-gettext --enable-threads=posix
--libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu
--enable-libstdcxx-debug --enable-libstdcxx-time=yes
--with-default-libstdcxx-abi=new --enable-gnu-unique-object
--disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib
--enable-libphobos-checking=release --with-target-system-zlib=auto
--enable-objc-gc=auto --enable-multiarch --disable-werror --enable-cet
--with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32
--enable-multilib --with-tune=generic
--enable-offload-targets=nvptx-none=/build/reproducible-path/gcc-13-13.2.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/reproducible-path/gcc-13-13.2.0/debian/tmp-gcn/usr
--enable-offload-defaulted --without-cuda-driver --enable-checking=release
--build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
--with-build-config=bootstrap-lto-lean --enable-link-serialization=3
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 13.2.0 (Debian 13.2.0-7) 
COLLECT_GCC_OPTIONS='-v' '-c' '-I' '.' '-o' 'runtime.o' '-shared-libgcc'
'-mtune=generic' '-march=x86-64'
 /usr/libexec/gcc/x86_64-linux-gnu/13/d21 runtime.d -quiet -dumpbase runtime.d
-dumpbase-ext .d -mtune=generic -march=x86-64 -version -imultiarch
x86_64-linux-gnu -I . -v -o /tmp/ccPyiN0m.s
GNU D (Debian 13.2.0-7) version 13.2.0 (x86_64-linux-gnu)
compiled by GNU C version 13.2.0, GMP version 6.3.0, MPFR version
4.2.1, MPC version 1.3.1, isl version isl-0.26-GMP

GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
binary/usr/libexec/gcc/x86_64-linux-gnu/13/d21
version   v2.103.1

predefs   GNU D_Version2 LittleEndian GNU_DWARF2_Exceptions GNU_StackGrowsDown
GNU_InlineAsm D_LP64 D_PIC D_PIE assert D_PreConditions D_PostConditions
D_Invariants D_ModuleInfo D_Exceptions D_TypeInfo all X86_64 D_HardFloat Posix
linux CRuntime_Glibc CppRuntime_Gcc
parse runtime
importall runtime
importmeta  (meta.d)
importobject(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/object.d)
importcore.attribute   
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/attribute.d)
importgcc.attributes   
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/gcc/attributes.d)
importcore.internal.hash   
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/hash.d)
importcore.internal.traits 
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/traits.d)
importcore.internal.entrypoint 
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/entrypoint.d)
importcore.internal.array.appending
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/appending.d)
importcore.internal.array.comparison   
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/comparison.d)
importcore.internal.array.equality 
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/equality.d)
importcore.internal.array.casting  
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/casting.d)
importcore.internal.array.concatenation
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/concatenation.d)
importcore.internal.array.construction 
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/construction.d)
importcore.internal.array.arrayassign  
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/arrayassign.d)
importcore.internal.array.capacity 
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/array/capacity.d)
importcore.internal.dassert
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/internal/dassert.d)
importcore.atomic  
(/usr/lib/gcc/x86_64-linux-gnu/13/include/d/core/atomic.d)
importcore.internal.attributes 
(/usr/lib/gcc/x86_64-linux-gn

[Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6

[Bug tree-optimization/96275] Vectorizer doesn't take into account bitmask condition from branch conditions.

[Bug d/98457] New: [d] writef!"%s" doesn't work with MonoTime / SysTick

[Bug d/98457] [d] writef!"%s" doesn't work with MonoTime / SysTick

[Bug d/98494] New: libphobos: std.process Config.stderrPassThrough missing

[Bug d/100769] New: [D] memcmp() == 0 for small constant strings not folded

[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded

[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded

[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded

[Bug d/100769] [D] memcmp() == 0 for small constant strings not folded

[Bug d/105360] New: Inlined lazy parameters / delegate literals, still emitted

[Bug d/105360] Inlined lazy parameters / delegate literals, still emitted

[Bug d/105413] New: gdc extended assembler cannot constraints r8 - r15

[Bug d/105413] gdc extended assembler cannot constraints r8 - r15

[Bug d/107241] New: std.bitmanip.bigEndianToNative et al not inlined

[Bug c++/103966] New: std::atomic relaxed load, inc, store sub-optimal codegen

[Bug c++/103966] std::atomic relaxed load, inc, store sub-optimal codegen

[Bug c++/103966] std::atomic relaxed load, inc, store sub-optimal codegen

[Bug middle-end/35560] Missing CSE/PRE for memory operations involved in virtual call.

[Bug c/108255] New: Repeated address-of (lea) not optimized for size.

[Bug d/109221] New: std.math.floor, core.math.ldexp, std.math.poly poor inlining

[Bug d/109221] std.math.floor, core.math.ldexp, std.math.poly poor inlining

[Bug d/109221] std.math.floor, core.math.ldexp, std.math.poly poor inlining

[Bug d/110113] New: gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)

[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)

[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)

[Bug d/110113] gdc -fpreview=dip1021 crash in d/dmd/root/aav.d:127 dmd_aaGetRvalue from DsymbolTable::lookup(Identifier const*)

[Bug d/110516] New: core.volatile.volatileLoad is broken

[Bug d/110516] core.volatile.volatileLoad discarded if result is unused

[Bug d/110516] core.volatile.volatileLoad discarded if result is unused

[Bug d/113125] New: [D] internal compiler error: in make_import, at d/imports.cc:48

31 matches

Site Navigation

Mail list logo

Footer information