B^HDEAD code generation (i386)

2023-01-09 Thread Stefan Kanthak
Hi,

compile the following 64-bit GCD routine for the i386 processor,
with full optimization and the preprocessor macro CTZ defined:

--- gcddi3.c ---
// Stein's algorithm: greatest common divisor

// Stein's algorithm: greatest common divisor of two 64-bit operands.
//
// Three variants are selected at compile time via the CTZ macro:
//   CTZ undefined -- open-coded one-bit-at-a-time shift loops,
//   CTZ != 32     -- 64-bit __builtin_ctzll (may call libgcc's __ctzdi2),
//   CTZ == 32     -- 32-bit __builtin_ctzl applied to each 32-bit half.
// Conventions: gcd(0, q) == q and gcd(p, 0) == p.
unsigned long long __gcddi3(unsigned long long p, unsigned long long q)
{
    unsigned r, s = 0;      // r: per-step shift count, s: shared factors of 2
    unsigned long long t;   // swap temporary

    if (p == 0)
        return q;

    if (q == 0)
        return p;

    if (p == q)
        return p;
#ifndef CTZ
    while (((p | q) & 1) == 0)      // strip the common factors of 2
        p >>= 1, q >>= 1, s++;

    while ((p & 1) == 0)            // make p odd
        p >>= 1;

    do {
        while ((q & 1) == 0)        // make q odd
            q >>= 1;
#elif CTZ != 32
    s = __builtin_ctzll(p | q), p >>= s, q >>= s;   // strip common factors of 2
    r = __builtin_ctzll(p), p >>= r;                // make p odd

    do {
        r = __builtin_ctzll(q), q >>= r;            // make q odd
#else
    // Emulate a 64-bit ctz with 32-bit __builtin_ctzl on the low half,
    // shifting by 32 first whenever the low half is all zero.
    if (((unsigned long) p | (unsigned long) q) == 0)
        p >>= 32, q >>= 32, s = 32;

    r = __builtin_ctzl((unsigned long) p | (unsigned long) q), p >>= r, q >>= r, s += r;

    if ((unsigned long) p == 0)
        p >>= 32;

    r = __builtin_ctzl(p), p >>= r;                 // make p odd

    do
    {
        if ((unsigned long) q == 0)
            q >>= 32;

        r = __builtin_ctzl(q), q >>= r;             // make q odd
#endif
        // Both p and q are odd here.  Keep q the LARGER operand so the
        // unsigned subtraction below cannot wrap around.
        // BUGFIX: the originally posted code swapped when p < q, which
        // leaves q as the smaller value; q -= p then underflows and the
        // loop need not terminate (e.g. p = 3, q = 5).
        if (p > q)
            t = q, q = p, p = t;
    } while (q -= p);       // loop until the difference reaches zero

    return p << s;          // reapply the common factors of 2
}
--- EOF ---

GCC 12.2: gcc -DCTZ -m32 -mno-sse -O3 gcddi3.c

# https://godbolt.org/z/c7cao8M57
__gcddi3(unsigned long long, unsigned long long):
push ebp
push edi
push esi
push ebx
sub esp, 28
mov ebx, DWORD PTR [esp+52]
mov ecx, DWORD PTR [esp+48]
mov esi, DWORD PTR [esp+56]
mov edi, DWORD PTR [esp+60]
mov ebp, ebx
mov DWORD PTR [esp+8], ecx
or  ebp, ecx
mov DWORD PTR [esp+12], ebx
mov eax, esi
mov edx, edi
je  .L1
mov eax, ecx
mov edx, ebx
xor eax, esi
xor edx, edi
or  eax, edx
je  .L6
mov eax, esi
or  eax, edi
je  .L6
mov eax, ecx
mov edx, ebx
sub esp, 8
xor ebp, ebp
or  edx, edi
or  eax, esi
push edx   #
push eax   #
call __ctzdi2  # Oops: shouldn't this be inlined with -O3?!
pop edx    #
pop ecx    #
mov ebx, eax
mov edx, DWORD PTR [esp+20]
mov eax, DWORD PTR [esp+16]
mov ecx, ebx
shrd eax, edx, cl
shr edx, cl
test bl, 32
cmovne  eax, edx
cmovne  edx, ebp
shrd esi, edi, cl
xor ebp, ebp
shr edi, cl
test bl, 32
mov DWORD PTR [esp+16], eax
cmovne  esi, edi
cmovne  edi, ebp
mov DWORD PTR [esp+20], edx
push edx   #
push eax   #
call __ctzdi2  # Oops: shouldn't this be inlined with -O3?!
mov edx, DWORD PTR [esp+28]
add esp, 16
mov ebp, eax
mov eax, DWORD PTR [esp+8]
mov ecx, ebp
xor ebp, ebp
shrd eax, edx, cl
shr edx, cl
and ecx, 32
cmovne  eax, edx
cmovne  edx, ebp
mov DWORD PTR [esp+8], eax
mov DWORD PTR [esp+12], edx
.L4:
sub esp, 8
xor ebp, ebp
push edi   #
push esi   #
call __ctzdi2  # Oops: shouldn't this be inlined with -O3?!
add esp, 16
mov edx, DWORD PTR [esp+12]
mov ecx, eax
shrd esi, edi, cl
shr edi, cl
test al, 32
mov eax, DWORD PTR [esp+8]
cmovne  esi, edi
cmovne  edi, ebp
mov ecx, edx
cmp eax, esi
sbb ecx, edi
jnc .L3
mov DWORD PTR [esp+8], esi
mov esi, eax
mov DWORD PTR [esp+12], edi
mov edi, edx
.L3:
sub esi, DWORD PTR [esp+8]
sbb edi, DWORD PTR [esp+12]
mov eax, edi
or  eax, esi
jne .L4
mov eax, DWORD PTR [esp+8]
mov edx, DWORD PTR [esp+12]
mov ecx, ebx
xor ebx, ebx
shld edx, eax, cl
sal eax, cl
and ecx, 32
cmovne  edx, eax
cmovne  eax, ebx
.L1:
add esp, 28
pop ebx
pop esi
pop edi
pop ebp
ret
.L6:
mov eax, DWORD PTR [esp+8]
mov edx, DWORD PTR [esp+12]
add esp, 28
pop ebx
pop esi
pop edi
pop ebp
ret


Compile it again, now with the preprocessor macro CTZ=32 defined
to avoid/inline the calls of __ctzdi2:

GCC 12.2: gcc -DCTZ=32 -m32 -mno-sse -O3 gcddi3.c

# https://godbolt.org/z/deo6538

B^HDEAD code generation (AMD64)

2023-01-09 Thread Stefan Kanthak
Hi,

compile the following 128-bit GCD routine for the AMD64 processor
with full optimization:

--- gcdti3.c ---
// Stein's algorithm: greatest common divisor

// Stein's algorithm: greatest common divisor of two 128-bit operands.
//
// Emulates a 128-bit ctz with 64-bit __builtin_ctzll on the low half,
// shifting by 64 first whenever the low half is all zero.
// Conventions: gcd(0, q) == q and gcd(p, 0) == p.
__uint128_t __gcdti3(__uint128_t p, __uint128_t q)
{
    unsigned r, s = 0;   // r: per-step shift count, s: shared factors of 2
                         // (BUGFIX: the post's garbled "unsignedr, s = 0;")
    __uint128_t t;       // swap temporary

    if (p == 0)
        return q;

    if (q == 0)
        return p;

    if (p == q)
        return p;

    if (((unsigned long long) p | (unsigned long long) q) == 0)
        p >>= 64, q >>= 64, s = 64;     // both low halves zero: shift 64 first

    r = __builtin_ctzll((unsigned long long) p | (unsigned long long) q), p >>= r, q >>= r, s += r;

    if ((unsigned long long) p == 0)
        p >>= 64;

    r = __builtin_ctzll(p), p >>= r;    // make p odd

    do
    {
        if ((unsigned long long) q == 0)
            q >>= 64;

        r = __builtin_ctzll(q), q >>= r;    // make q odd

        // Both p and q are odd here.  Keep q the LARGER operand so the
        // unsigned subtraction below cannot wrap around.
        // BUGFIX: the originally posted code swapped when p < q, which
        // leaves q as the smaller value; q -= p then underflows and the
        // loop need not terminate.
        if (p > q)
            t = q, q = p, p = t;
    } while (q -= p);

    return p << s;       // reapply the common factors of 2
}
--- EOF ---

GCC 12.2: gcc -O3 gcdti3.c

# https://godbolt.org/z/d1Ma9qnsf
__gcdti3(unsigned __int128, unsigned __int128):
mov rax, rsi  # OOPS: GCCs plays six rounds of shell game!
mov r8, rdi   #
mov rsi, rdi  #
mov rdi, rax  #
mov rax, rdx  #
mov rdx, rcx  #
mov rcx, rdi
or  rcx, r8
je  .L1
mov rcx, r8
mov r8, rdi
xor rcx, rax
xor r8, rdx
or  rcx, r8
je  .L9
mov rcx, rax
or  rcx, rdx
je  .L9
mov rcx, rsi
xor r10d, r10d
or  rcx, rax
jne .L3
mov rsi, rdi
mov rax, rdx
xor edi, edi
xor edx, edx
mov rcx, rsi
mov r10d, 64
or  rcx, rax
.L3:
rep bsf rcx, rcx
xor r8d, r8d  # OUCH: BSF and TZCNT return at most 63,
shrd rsi, rdi, cl
shr rdi, cl
test cl, 64   #   so this is dead code!
cmovne  rsi, rdi  #
cmovne  rdi, r8   #
shrd rax, rdx, cl
xor r11d, r11d    # OUCH: BSF and TZCNT return at most 63,
shr rdx, cl
test cl, 64   #   so this is dead code!
cmovne  rax, rdx  #
cmovne  rdx, r11  #
mov r8, rsi
mov r9, rdi
add r10d, ecx
mov rcx, r8
mov rsi, rax
mov rdi, rdx
test r8, r8
je  .L14
.L4:
rep bsf rcx, rcx
mov rax, r8
mov rdx, r9
xor r11d, r11d    # OUCH: BSF and TZCNT return at most 63,
shr rdx, cl
shrd rax, r9, cl
and ecx, 64   #   (there's also no need to modify ECX)
cmovne  rax, rdx  #   so this is dead code!
cmovne  rdx, r11  #
.L7:
mov rcx, rsi
test rsi, rsi
jne .L5
mov rsi, rdi
xor edi, edi
mov rcx, rsi
.L5:
rep bsf rcx, rcx
xor r8d, r8d  # OUCH: BSF and TZCNT return at most 63,
shrd rsi, rdi, cl
shr rdi, cl
test cl, 64   #   so this is dead code,
mov rcx, rdx
cmovne  rsi, rdi  #   and that too!
cmovne  rdi, r8   #
cmp rax, rsi
sbb rcx, rdi
jnc .L6
mov r8, rax
mov r9, rdx
mov rax, rsi
mov rdx, rdi
mov rsi, r8
mov rdi, r9
.L6:
sub rsi, rax
sbb rdi, rdx
mov rcx, rdi
or  rcx, rsi
jne .L7
mov ecx, r10d
xor esi, esi
shld rdx, rax, cl
sal rax, cl
and ecx, 64   # Oops: there's no need to modify ECX!
cmovne  rdx, rax
cmovne  rax, rsi
ret
.L9:
mov rax, rsi
mov rdx, rdi
.L1:
ret
.L14:
mov r8, r9
xor r9d, r9d
mov rcx, r8
jmp .L4

20 superfluous instructions of the total 102 instructions!

NOT AMUSED
Stefan Kanthak

PS: 
shows properly written assembly code.


EPIC optimiser failures (i386)

2023-01-09 Thread Stefan Kanthak
Hi,

compile the following routine for the i386 processor,
with optimisation:

--- double.c  ---
// IEEE-754 binary64 double-precision floating point

// binary64 != ±0.0 ->  0
// binary64 == +0.0 -> +1
// binary64 == -0.0 -> -1

// Classify the raw IEEE-754 binary64 bit pattern in 'binary64':
// only 0 and 0x8000000000000000 are their own two's-complement negation,
// so the first test rejects everything except +0.0 and -0.0.
// Returns 0 for any other pattern, +1 for +0.0, -1 for -0.0.
int plusminus0(unsigned long long binary64)
{
if (binary64 != -binary64) // neither +0.0 nor -0.0
return 0;
if (binary64 == 0)         // all-zero pattern: +0.0
return 1;
return -1;                 // sign bit only: -0.0
}
--- EOF ---

GCC 12.2: gcc -m32 -O2 double.c

# https://godbolt.org/z/17as1M1xM
plusminus0(unsigned long long):
push esi
push ebx
mov ecx, DWORD PTR [esp+12]
mov ebx, DWORD PTR [esp+16]
mov eax, ecx
neg eax
mov edx, ebx
adc edx, 0
xor eax, ecx
neg edx
xor edx, ebx
or  eax, edx
jne .L5
or  ecx, ebx
pop ebx
cmp ecx, 1
sbb esi, esi
and esi, 2
sub esi, 1
mov eax, esi
pop esi
ret
.L5:
xor esi, esi
pop ebx
mov eax, esi
pop esi
ret

OUCH: these 27 instructions in 56 bytes are as BAD^WHORRIBLE as code
  could get!

EVERY optimising^Wcompiler writer should be aware that

if (binary64 == -binary64)

is just a shorthand for

if (binary64 == 0 - binary64)

and thus equivalent to

if (binary64 + binary64 == 0)

which SHOULD lead to the following (optionally branch-free) code:

mov ecx, dword ptr [esp+4]
mov edx, dword ptr [esp+8]  # edx:ecx = binary64
add ecx, ecx
adc edx, edx
sbb eax, eax# eax = (binary64 < 0) ? -1 : 0
.ifnotdef BRANCHFREE
or  ecx, edx
jz  .L0 # binary64 == -binary64?
stc # CF = 1
adc eax, eax# eax = (binary64 < 0) ? -1 : 1
.L0:
.else
stc # CF = 1
adc eax, eax# eax = (binary64 < 0) ? -1 : 1
or  ecx, edx# ecx = (binary64 == -binary64) ? 0 : *
neg ecx # CF = (binary64 != -binary64)
sbb ecx, ecx# ecx = (binary64 != -binary64) ? -1 : 0
not ecx # ecx = (binary64 == -binary64) ? -1 : 0
and eax, ecx
.endif
ret

Either 10 instructions in 22 bytes or 13 instructions in 28 bytes,
i.e. less than half the instructions and bytes!

Since the lower half of the binary64 only needs to be tested against 0,
a TRUE optimising compiler would but come up with the following code:

mov eax, dword ptr [esp+8]  # upper half of binary64
cdq # edx = (binary64 < 0) ? -1 : 0
stc # CF = 1
adc edx, edx# edx = (binary64 < 0) ? -1 : 1
add eax, eax
or  eax, dword ptr [esp+4]
neg eax # CF = (binary64 != -binary64)
sbb eax, eax# eax = (binary64 != -binary64) ? -1 : 0
not eax # eax = (binary64 == -binary64) ? -1 : 0
and eax, edx
ret

11 instructions in 23 bytes.


--- single.c  ---
// IEEE-754 binary32 single-precision floating point

// Classify the raw IEEE-754 binary32 bit pattern in 'binary32'
// (assumes 32-bit 'unsigned long', as on the -m32 i386 target used here):
// only 0 and 0x80000000 are their own two's-complement negation.
// Returns 0 for any other pattern, +1 for +0.0, -1 for -0.0.
int plusminus0(unsigned long binary32)
{
if (binary32 != -binary32) // neither +0.0 nor -0.0
return 0;
if (binary32 == 0)         // all-zero pattern: +0.0
return 1;
return -1;                 // sign bit only: -0.0
}
--- EOF ---

GCC 12.2: gcc -m32 -O2 single.c

# https://godbolt.org/z/djT748e81
plusminus0(unsigned int):
mov edx, DWORD PTR [esp+4]
xor eax, eax
mov ecx, edx
neg ecx
cmp ecx, edx
jne .L1
cmp ecx, 1
sbb eax, eax
and eax, 2
sub eax, 1
.L1:
ret

OOPS (11 instructions in 26 bytes)!
An optimising compiler SHOULD but generate 8 instructions in 16 bytes:

xor eax, eax
mov ecx, DWORD PTR [esp+4]
add ecx, ecx
jnz .L1 # binary32 != -binary32?
sbb eax, eax# eax = (binary32 < 0) ? -1 : 0
stc # CF = 1
adc eax, eax# eax = (binary32 < 0) ? -1 : 1
.L1:
ret


A TRUE optimising compiler would but generate the following branch-free
code, using 7 or 8 instructions in 19 or 18 bytes:

.if 0
mov eax, DWORD PTR [esp+4]
neg eax # OF = (binary32 == -0.0),
# ZF = (binary32 == +0.0)
.else
xor eax, eax
sub eax, DWORD PTR [esp+4]
.endif
seto ah
setz al
sub al, ah  # al = ZF - OF
.if 0
cbw
cwde
.else
movsx   eax, al
.endif
ret

Stefan Kanthak

Re: B^HDEAD code generation (AMD64)

2023-01-09 Thread Thomas Koenig via Gcc

On 09.01.23 12:35, Stefan Kanthak wrote:

20 superfluous instructions of the total 102 instructions!


The proper place for bug reports is https://gcc.gnu.org/bugzilla/ .
Feel free to submit these cases there.


Re: B^HDEAD code generation (AMD64)

2023-01-09 Thread LIU Hao via Gcc

在 2023/1/9 19:48, Thomas Koenig via Gcc 写道:

On 09.01.23 12:35, Stefan Kanthak wrote:

20 superfluous instructions of the total 102 instructions!


The proper place for bug reports is https://gcc.gnu.org/bugzilla/ .
Feel free to submit these cases there.


Created https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108341, although I'm not sure whether it's 
related to this issue.



Given

   ```
   extern int r;

   // __builtin_ctz has undefined behavior for a zero argument, so after
   // the call GCC may assume value != 0 and fold the return expression to
   // a constant 1 -- the missed optimization this bug report is about.
   int
   bz(int value)
 {
   r = __builtin_ctz(value);
   return value != 0;  // always true
 }
   ```

`value` is passed to `__builtin_ctz` and GCC should assume it can't be zero, otherwise the behavior 
is undefined. However GCC fails to optimize it and produces:


   ```
   bz:
 endbr64
  xor    eax, eax
  rep bsf eax, edi
  mov    DWORD PTR r[rip], eax
  xor    eax, eax
 test   edi, edi
 setne  al
 ret
   ```


--
Best regards,
LIU Hao



OpenPGP_signature
Description: OpenPGP digital signature


Widening multiplication, but no narrowing division [i386/AMD64]

2023-01-09 Thread Stefan Kanthak
Hi,

GCC (and other C compilers too) support the widening multiplication
of i386/AMD64 processors, but DON'T support their narrowing division:

--- demo.c ---
// Widening multiplication: 32-bit x 32-bit -> 64-bit product
// ('unsigned long' is 32 bits on the -m32 target used in this post;
// maps to a single i386 MUL, as the listing below shows).
unsigned long long product(unsigned long multiplicand,
   unsigned long multiplier)
{
return (unsigned long long) multiplicand * multiplier; // widen one operand, multiply wide
}

// Narrowing-division demo: 64-bit dividend / 32-bit divisor, returning the
// 64-bit quotient and storing the 32-bit remainder through 'remainder'.
// C's usual arithmetic conversions widen 'divisor' to 64 bits, so the
// compilers shown below emit a full 64/64 library division (__udivmoddi4)
// instead of i386's narrowing DIV -- the point of this post.
unsigned long long quotient(unsigned long long dividend,
unsigned long divisor,
unsigned long *remainder)
{
*remainder = dividend % divisor;   // remainder < divisor, fits in 32 bits
return dividend / divisor;
}
--- EOF ---

GCC 12.2: gcc -m32 -O2 demo.c

# https://godbolt.org/z/1M9dohMcE
product(unsigned long, unsigned long):
mov eax, DWORD PTR [esp+8]
mul DWORD PTR [esp+4]
ret
quotient(unsigned long long, unsigned long, unsigned long*):
push ebx
xor edx, edx
sub esp, 24
mov eax, DWORD PTR [esp+40]
lea ecx, [esp+8]
sub esp, 12
push ecx
push edx
push eax
push DWORD PTR [esp+60]
push DWORD PTR [esp+60]
call __udivmoddi4
mov ebx, DWORD PTR [esp+40]
mov ecx, DWORD PTR [esp+76]
mov DWORD PTR [ecx], ebx
add esp, 56
pop ebx
ret

### Diversion ###

Even worse and completely BRAINDEAD, another compiler calls __udivdi3()
and wastes a multiplication to compute the remainder, ignoring the fact
that __udivdi3() calls __udivmoddi4() which already returns quotient
and remainder:

clang 15.0.0: clang -m32 -O2 demo.c

# https://godbolt.org/z/rv1sTe7xv
product(unsigned long, unsigned long):
mov eax, dword ptr [esp + 8]
mul dword ptr [esp + 4]
ret
quotient(unsigned long long, unsigned long, unsigned long*):
push ebp
push ebx
push edi
push esi
sub esp, 12
call .L1$pb
.L1$pb:
pop ebx
.Ltmp2:
add ebx, offset _GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L1$pb)
mov esi, dword ptr [esp + 44]
mov edi, dword ptr [esp + 32]
mov ebp, dword ptr [esp + 40]
push 0
push ebp
push dword ptr [esp + 44]
push edi
call __udivdi3@PLT
add esp, 16
imul ebp, eax
sub edi, ebp
mov dword ptr [esi], edi
add esp, 12
pop esi
pop edi
pop ebx
pop ebp
ret

### end of diversion ###

Both compilers miss the fact that the i386 processor has a narrowing
integer division and can therefore divide 64-bit / 32-bit numbers,
for example with the well-known "long" alias "schoolbook" division,
returning 64-bit quotient and 32-bit remainder:

.arch   generic 32
.code32
.intel_syntax noprefix
.text

quotient:
mov ecx, [esp+12]   # ecx = divisor
.if 0
xor edx, edx
mov eax, [esp+8]# edx:eax = high dword of dividend
cmp eax, edx
je  0f  # high dword of dividend = 0?
.else
xor eax, eax
mov edx, [esp+8]# eax:edx = high dword of dividend
cmp edx, ecx
jb  0f  # high dword of dividend < divisor?
# quotient < 2**32?
xchg eax, edx
.endif
div ecx # eax = high dword of quotient,
0:  # edx = high dword of dividend'
push eax
mov eax, [esp+8]# edx:eax = dividend'
div ecx # eax = low dword of quotient,
# edx = remainder
mov ecx, remainder  # ecx = address of remainder
mov [ecx], edx
pop edx # edx:eax = quotient
ret
.end

JFTR: dependent on the magnitude of the numbers and the processor
  it MIGHT be better to omit comparison and branch: there's a
  trade-off between the latency of the (un-pipelined) division
  instruction and the latency of the conditional branch due to
  misprediction.

Stefan Kanthak



Re: Widening multiplication, but no narrowing division [i386/AMD64]

2023-01-09 Thread LIU Hao via Gcc

在 2023/1/9 20:20, Stefan Kanthak 写道:

Hi,

GCC (and other C compilers too) support the widening multiplication
of i386/AMD64 processors, but DON'T support their narrowing division:




QWORD-DWORD division would change the behavior of your program.


Given:

   ```
   // 64-by-32 division with the quotient truncated to 32 bits; per C's
   // usual arithmetic conversions the division itself is done at 64 bits.
   uint32_t xdiv(uint64_t x, uint32_t y) { return x / y;  }
   ```

then `xdiv(0x20002, 2)` should first convert both operands to `uint64_t`, perform the division 
which yields `0x10001`, then truncate the quotient to 32-bit which gives `1`. The result is exact.



If DIV was used, it would effect an exception:

   ```
   mov edx, 2
   mov eax, edx   # edx:eax = 0x20002

   mov ecx, edx
   div ecx# division overflows because the quotient
  # can't stored into EAX
   ```





--
Best regards,
LIU Hao



OpenPGP_signature
Description: OpenPGP digital signature


Re: Widening multiplication, but no narrowing division [i386/AMD64]

2023-01-09 Thread Stefan Kanthak
LIU Hao wrote:

>在 2023/1/9 20:20, Stefan Kanthak 写道:
>> Hi,
>>
>> GCC (and other C compilers too) support the widening multiplication
>> of i386/AMD64 processors, but DON'T support their narrowing division:
>>
>>
>
> QWORD-DWORD division would change the behavior of your program.
[...]
> If DIV was used, it would effect an exception:

Guess why I use "schoolbook" division?
Please read the end of my post until you understand the code.

regards
Stefan



Re: Widening multiplication, but no narrowing division [i386/AMD64]

2023-01-09 Thread Paul Koning via Gcc



> On Jan 9, 2023, at 7:20 AM, Stefan Kanthak  wrote:
> 
> Hi,
> 
> GCC (and other C compilers too) support the widening multiplication
> of i386/AMD64 processors, but DON'T support their narrowing division:

I wonder if this changed in the recent past.  I have a pattern for this type of 
thing in pdp11.md:

(define_expand "divmodhi4"
  [(parallel
[(set (subreg:HI (match_dup 1) 0)
(div:HI (match_operand:SI 1 "register_operand" "0")
(match_operand:HI 2 "general_operand" "g")))
 (set (subreg:HI (match_dup 1) 2)
(mod:HI (match_dup 1) (match_dup 2)))])
   (set (match_operand:HI 0 "register_operand" "=r")
(subreg:HI (match_dup 1) 0))
   (set (match_operand:HI 3 "register_operand" "=r")
(subreg:HI (match_dup 1) 2))]
  "TARGET_40_PLUS"
  "")

and I'm pretty sure this worked at some point in the past.  

paul



Re: Widening multiplication, but no narrowing division [i386/AMD64]

2023-01-09 Thread Stefan Kanthak
"Paul Koning"  wrote:

>> On Jan 9, 2023, at 7:20 AM, Stefan Kanthak  wrote:
>> 
>> Hi,
>> 
>> GCC (and other C compilers too) support the widening multiplication
>> of i386/AMD64 processors, but DON'T support their narrowing division:
>
> I wonder if this changed in the recent past.
> I have a pattern for this type of thing in pdp11.md:
[...]
> and I'm pretty sure this worked at some point in the past.  

Unfortunately the C standard defines that the smaller operand (of lesser
conversion rank), here divisor, has to undergo a conversion to the "real
common type", i.e. the broader operand (of higher conversion rank), here
dividend. Unless the information about promotion/conversion is handed over
to the code generator it can't apply such patterns -- as demonstrated by
the demo code.

regards
Stefan


Re: Widening multiplication, but no narrowing division [i386/AMD64]

2023-01-09 Thread Paul Koning via Gcc



> On Jan 9, 2023, at 10:20 AM, Stefan Kanthak  wrote:
> 
> "Paul Koning"  wrote:
> 
>>> On Jan 9, 2023, at 7:20 AM, Stefan Kanthak  wrote:
>>> 
>>> Hi,
>>> 
>>> GCC (and other C compilers too) support the widening multiplication
>>> of i386/AMD64 processors, but DON'T support their narrowing division:
>> 
>> I wonder if this changed in the recent past.
>> I have a pattern for this type of thing in pdp11.md:
> [...]
>> and I'm pretty sure this worked at some point in the past.  
> 
> Unfortunately the C standard defines that the smaller operand (of lesser
> conversion rank), here divisor, has to undergo a conversion to the "real
> common type", i.e. the broader operand (of higher conversion rank), here
> dividend. Unless the information about promotion/conversion is handed over
> to the code generator it can't apply such patterns -- as demonstrated by
> the demo code.
> 
> regards
> Stefan

Yes, I was thinking the same.  But I spent a while on that pattern -- I wanted 
to support div/mod as a single operation because the machine has that 
primitive.  And I'm pretty sure I saw it work before I committed that change.  
That's why I'm wondering if something changed.

paul



Re: Widening multiplication, but no narrowing division [i386/AMD64]

2023-01-09 Thread Stefan Kanthak
"Paul Koning"  wrote:

>> On Jan 9, 2023, at 10:20 AM, Stefan Kanthak  wrote:
>> 
>> "Paul Koning"  wrote:
>> 
 On Jan 9, 2023, at 7:20 AM, Stefan Kanthak  wrote:
 
 Hi,
 
 GCC (and other C compilers too) support the widening multiplication
 of i386/AMD64 processors, but DON'T support their narrowing division:
>>> 
>>> I wonder if this changed in the recent past.
>>> I have a pattern for this type of thing in pdp11.md:
>> [...]
>>> and I'm pretty sure this worked at some point in the past.  
>> 
>> Unfortunately the C standard defines that the smaller operand (of lesser
>> conversion rank), here divisor, has to undergo a conversion to the "real
>> common type", i.e. the broader operand (of higher conversion rank), here
>> dividend. Unless the information about promotion/conversion is handed over
>> to the code generator it can't apply such patterns -- as demonstrated by
>> the demo code.

> Yes, I was thinking the same.  But I spent a while on that pattern -- I
> wanted to support div/mod as a single operation because the machine has
> that primitive.  And I'm pretty sure I saw it work before I committed
> that change.  That's why I'm wondering if something changed.

I can't tell from the past how GCC once worked, but today it can't
(or doesn't) use such patterns, at least not on i386/AMD64 processors.
To give another example where the necessary information is most
obviously NOT propagated from front end to back end:

--- clmul.c ---
// widening carry-less multiplication

// Widening carry-less (polynomial, GF(2)) multiplication:
// 32-bit x 32-bit -> 64-bit, one shift-and-xor step per bit of q,
// scanning q from bit 31 down to bit 0 via the mask s.
unsigned long long clmul(unsigned long p, unsigned long q)
{
unsigned long long r = 0;       // accumulated product
unsigned long  s = 1UL << 31;   // mask selecting the current bit of q

do {
r <<= 1;                        // make room for the next partial product
if (q & s)
#ifdef _MSC_VER
(unsigned long) r ^= p;         // MSC extension: xor into the low half only
#else
r ^= p; // no need to promote/convert p here!
#endif
} while (s >>= 1);              // next lower bit; stop after bit 0

return r;
}
--- EOF ---

# https://gcc.godbolt.org/z/E99v7fEP3
clmul(unsigned long, unsigned long):
pushebp
mov ecx, -2147483648
xor eax, eax
xor edx, edx
push edi       # OOPS: superfluous
xor edi, edi   # OOPS: superfluous
push esi
push ebx       # OUCH: WTF?
mov ebp, DWORD PTR [esp+24]
mov ebx, 32# OUCH: WTF?
mov esi, DWORD PTR [esp+20]
.L3:
shld edx, eax, 1
add eax, eax
test ebp, ecx
je  .L2
xor eax, esi
xor edx, edi   # OOPS: superfluous
.L2:
shr ecx, 1
sub ebx, 1 # OUCH: WTF?
jne .L3
pop ebx        # OUCH: WTF?
pop esi
pop edi        # OOPS: superfluous
pop ebp
ret

8 superfluous instructions out of the total 25 instructions!

NOT AMUSED
Stefan


Re: Bypass assembler when generating LTO object files

2023-01-09 Thread Martin Jambor
Hello,

On Sun, Dec 18 2022, Mohamed Atef wrote:
> Hello,
>I am interested in working in this project during my free time, is
> understanding this https://gcc.gnu.org/wiki/LinkTimeOptimization
> A good starting point

That section of the Wiki is very old.  You may find bits there that are
still valid and relevant, but I would be actually a bit careful with
that content.

If you're looking for high-level overview of LTO, unfortunately I can
only recommend videos:

  - Honza's "Building openSUSE with GCC's link time optimization"
https://events.opensuse.org/conferences/oSC18/program/proposals/1846#2

  - my "Interprodecural optimizations in GCC"
https://www.youtube.com/watch?v=oQ71ZbOuSW4
(the first 12 minutes or so, the rest is then about optimizations)

For the task specifically, the patch from 2014
https://gcc.gnu.org/legacy-ml/gcc/2014-09/msg00340.html is still a good
starting point, even if not a very clear one.  The crux of the matter is
to enhance libiberty/simple-object*.[ch] to be able to create elf from
scratch (as opposed to modifying an existing one).  So look there too.

If you have any questions, feel free to ask.

Good luck,

Martin


Re: B^HDEAD code generation (AMD64)

2023-01-09 Thread Stefan Kanthak
"Thomas Koenig"  wrote:

> On 09.01.23 12:35, Stefan Kanthak wrote:
>> 20 superfluous instructions of the total 102 instructions!
> 
> The proper place for bug reports is https://gcc.gnu.org/bugzilla/ .

OUCH: there's NO proper place for bugs at all!

> Feel free to submit these cases there.

I feel free to do whatever I like to do where I do it, for example:

--- bug.cpp ---
int main() {
__uint128_t long long bug = 0;
}
--- EOF ---

See 

regards
Stefan


Re: B^HDEAD code generation (AMD64)

2023-01-09 Thread Andrew Pinski via Gcc
On Mon, Jan 9, 2023 at 4:42 PM Stefan Kanthak  wrote:
>
> "Thomas Koenig"  wrote:
>
> > On 09.01.23 12:35, Stefan Kanthak wrote:
> >> 20 superfluous instructions of the total 102 instructions!
> >
> > The proper place for bug reports is https://gcc.gnu.org/bugzilla/ .
>
> OUCH: there's NO proper place for bugs at all!

HUH? soon people will ignore emails that are demeaning/ableist like
what you have been recently (saying things like braindead, etc.). And
yes bugzilla is where GCC tracks bug reports.

>
> > Feel free to submit these cases there.
>
> I feel free to do whatever I like to do where I do it, for example:
>
> --- bug.cpp ---
> int main() {
> __uint128_t long long bug = 0;
> }
> --- EOF ---

With -pedantic-errors we get:

: In function 'int main()':
:2:22: error: 'long long' specified with '__int128 unsigned'
[-Wpedantic]
2 | __uint128_t long long bug = 0;
  |  ^~~~

And also run into https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108099 .

This is a known extension but maybe it is not documented ...
Anyways read that bug report.


>
> See 
>
> regards
> Stefan


Re: B^HDEAD code generation (AMD64)

2023-01-09 Thread Gabriel Ravier via Gcc

On 1/10/23 01:34, Stefan Kanthak wrote:

"Thomas Koenig"  wrote:


On 09.01.23 12:35, Stefan Kanthak wrote:

20 superfluous instructions of the total 102 instructions!

The proper place for bug reports is https://gcc.gnu.org/bugzilla/ .

OUCH: there's NO proper place for bugs at all!


Feel free to submit these cases there.

I feel free to do whatever I like to do where I do it, for example:

--- bug.cpp ---
int main() {
 __uint128_t long long bug = 0;
}
--- EOF ---

See 

regards
Stefan


If you're trying to speedrun actually getting banned from this mailing 
list, then sure, I guess you can "do whatever I like to do where I do 
it", but you might find that more difficult after somebody decides to do 
something about it.