[Bug target/71659] _xgetbv intrinsic missing

2017-02-28 Thread postmaster at raasu dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71659

postmaster at raasu dot org changed:

   What|Removed |Added

 CC||postmaster at raasu dot org

--- Comment #2 from postmaster at raasu dot org ---
Portability is one main reason to add missing intrinsics... with a combination of
a CPUID check and _xgetbv() we can cleanly check at run time whether AVX or MPX
is available. We can also check for specific instructions during the configure
process to see if we need to add workarounds for bad or missing
functions/intrinsics.
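
A minimal sketch of such a run-time AVX check, assuming GCC's <cpuid.h>; since
_xgetbv() is exactly the missing intrinsic, the XGETBV instruction is issued
via inline assembly here (the helper name and structure are mine, for
illustration only):

#include <cpuid.h>
#include <stdint.h>

static int avx_available(void)
{
  unsigned int eax, ebx, ecx, edx;
  /* CPUID leaf 1: ECX bit 27 = OSXSAVE, bit 28 = AVX. */
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return 0;
  if (!(ecx & (1u << 27)) || !(ecx & (1u << 28)))
    return 0;
  /* XGETBV with ECX = 0 reads XCR0; bits 1 and 2 both set means the
     OS saves/restores XMM and YMM state, so AVX is safe to use. */
  uint32_t lo, hi;
  __asm__ ("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
  return (lo & 0x6) == 0x6;
}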

Some developers think that the cleanliness of the code is more important than
the need to duplicate hand-written assembler code every time for optimal
performance.

We have to remember that gcc is not only used on Unix-like operating systems
such as OS X, Linux and *BSD, but also on Cygwin, MSYS/MSYS2 and MinGW, which
benefit from gcc being as close as possible to Visual C++ regarding
intrinsics support.

[Bug c/79938] New: gcc unnecessarily spills xmm register to stack when inserting vector items

2017-03-07 Thread postmaster at raasu dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

Bug ID: 79938
   Summary: gcc unnecessarily spills xmm register to stack when
inserting vector items
   Product: gcc
   Version: 6.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c
  Assignee: unassigned at gcc dot gnu.org
  Reporter: postmaster at raasu dot org
  Target Milestone: ---

Created attachment 40906
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=40906&action=edit
assembler output

When adding together values from one vector and storing the results to another,
gcc uses two xmm registers instead of one and spills the second xmm register to
the stack when it runs out of general purpose registers.

Instead of spilling the second xmm register to the stack, it should use only one
xmm register as the destination, because the addition is already being done
using four general purpose registers.

Using gcc -msse4.1 -O3 -S hadd.c -Wall -Wextra -fno-strict-aliasing -fwrapv -o hadd.s

mika@LENOVO:~$ gcc --version
gcc (Ubuntu 6.2.0-3ubuntu11~14.04) 6.2.0 20160901
---
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <smmintrin.h>

typedef uint8_t   v1si __attribute__ ((vector_size (16)));
typedef uint16_t  v2si __attribute__ ((vector_size (16)));
typedef uint32_t  v4si __attribute__ ((vector_size (16)));
typedef uint64_t  v8si __attribute__ ((vector_size (16)));

static __m128i haddd_epu8(__m128i a)
{
  v1si b = (v1si)a;
  v4si ret;
  ret[0]  = (b[ 0] + b[ 1]) + (b[ 2] + b[ 3]);
  ret[1]  = (b[ 4] + b[ 5]) + (b[ 6] + b[ 7]);
  ret[2]  = (b[ 8] + b[ 9]) + (b[10] + b[11]);
  ret[3]  = (b[12] + b[13]) + (b[14] + b[15]);
  return (__m128i)ret;
}

int main(int argc, char *argv[])
{
  __m128i a = _mm_set1_epi8(atoi(argv[1]));
  __m128i b = haddd_epu8(a);
  v4si c = (v4si)b;
  printf("b[0] = %i, b[1] = %i, b[2] = %i, b[3] = %i\n", c[0], c[1], c[2],
c[3]);
}

[Bug c/79938] gcc unnecessarily spills xmm register to stack when inserting vector items

2017-03-07 Thread postmaster at raasu dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #2 from postmaster at raasu dot org ---
(In reply to Richard Biener from comment #1)
> The situation is slightly better with GCC 7, only two spill/loads are
> remaining.
> Possibly BIT_INSERT_EXPR helps here.

With gcc 6.2.0 and

gcc -msse4.1 -mtune=core2 -O3 -S hadd.c -Wall -Wextra -fno-strict-aliasing -fwrapv -o hadd.s

the resulting assembler output is almost perfect, but adding -mtune=core2 makes
the code optimal only for Intel processors.

---
...
pxor    %xmm1, %xmm1
movl    $1, %edi
movd    %eax, %xmm0
pshufb  %xmm1, %xmm0
pextrb  $1, %xmm0, %edx
pextrb  $0, %xmm0, %eax
addl    %edx, %eax
pextrb  $2, %xmm0, %edx
addl    %edx, %eax
pextrb  $4, %xmm0, %ecx
pextrb  $3, %xmm0, %edx
addl    %eax, %edx
pextrb  $5, %xmm0, %eax
addl    %eax, %ecx
pextrb  $6, %xmm0, %eax
addl    %eax, %ecx
pextrb  $9, %xmm0, %esi
pextrb  $7, %xmm0, %eax
addl    %eax, %ecx
pextrb  $8, %xmm0, %eax
addl    %esi, %eax
pextrb  $10, %xmm0, %esi
addl    %esi, %eax
pextrb  $11, %xmm0, %esi
addl    %esi, %eax
pextrb  $13, %xmm0, %esi
movd    %eax, %xmm1
pextrb  $12, %xmm0, %eax
addl    %esi, %eax
pextrb  $14, %xmm0, %esi
addl    %eax, %esi
pextrb  $15, %xmm0, %eax
movd    %edx, %xmm0
addl    %esi, %eax
pinsrd  $1, %ecx, %xmm0
movl    $.LC0, %esi
pinsrd  $1, %eax, %xmm1
xorl    %eax, %eax
punpcklqdq  %xmm1, %xmm0
...

[Bug c/79938] gcc unnecessarily spills xmm register to stack when inserting vector items

2017-03-07 Thread postmaster at raasu dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #3 from postmaster at raasu dot org ---
With -mssse3 instead of -msse4.1, the issue gets even worse:

---
...
pxor    %xmm1, %xmm1
movl    $.LC0, %esi
movl    $1, %edi
movd    %eax, %xmm0
movdqa  %xmm0, %xmm4
pshufb  %xmm1, %xmm4
movaps  %xmm4, (%rsp)
movzbl  (%rsp), %eax
movaps  %xmm4, 224(%rsp)
movzbl  225(%rsp), %edx
movaps  %xmm4, 208(%rsp)
movaps  %xmm4, 192(%rsp)
movaps  %xmm4, 176(%rsp)
addl    %edx, %eax
movzbl  210(%rsp), %edx
movaps  %xmm4, 160(%rsp)
movaps  %xmm4, 144(%rsp)
movaps  %xmm4, 128(%rsp)
movaps  %xmm4, 112(%rsp)
addl    %edx, %eax
movzbl  195(%rsp), %edx
movaps  %xmm4, 96(%rsp)
movzbl  105(%rsp), %ecx
movaps  %xmm4, 80(%rsp)
movaps  %xmm4, 64(%rsp)
movaps  %xmm4, 48(%rsp)
addl    %edx, %eax
movzbl  165(%rsp), %edx
movaps  %xmm4, 32(%rsp)
movd    %eax, %xmm0
movzbl  180(%rsp), %eax
movaps  %xmm4, 16(%rsp)
movaps  %xmm4, 240(%rsp)
addl    %edx, %eax
movzbl  150(%rsp), %edx
addl    %edx, %eax
movzbl  135(%rsp), %edx
addl    %eax, %edx
movzbl  120(%rsp), %eax
movd    %edx, %xmm6
punpckldq   %xmm6, %xmm0
addl    %ecx, %eax
movzbl  90(%rsp), %ecx
addl    %ecx, %eax
movzbl  75(%rsp), %ecx
addl    %ecx, %eax
movzbl  45(%rsp), %ecx
movd    %eax, %xmm1
movzbl  60(%rsp), %eax
addl    %ecx, %eax
movzbl  30(%rsp), %ecx
addl    %ecx, %eax
movzbl  15(%rsp), %ecx
addl    %ecx, %eax
movd    %eax, %xmm5
xorl    %eax, %eax
punpckldq   %xmm5, %xmm1
punpcklqdq  %xmm1, %xmm0
movdqa  %xmm0, %xmm2
movd    %xmm0, %edx
pshufd  $255, %xmm0, %xmm3
punpckhdq   %xmm0, %xmm2
pshufd  $85, %xmm0, %xmm1
...
---

Notice all the lines starting with "movaps  %xmm4,": the contents of the same
register are spilled all over the stack.

[Bug c/108580] New: gcc treats shifts as signed operation, does wrong promotion

2023-01-27 Thread postmaster at raasu dot org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108580

Bug ID: 108580
   Summary: gcc treats shifts as signed operation, does wrong
promotion
   Product: gcc
   Version: 12.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c
  Assignee: unassigned at gcc dot gnu.org
  Reporter: postmaster at raasu dot org
  Target Milestone: ---

I have a simple program that is compiled incorrectly by every common compiler:

#include <stdlib.h>
#include <string.h>

int main()
{
   int bits = 8;
   char* a = (char*)malloc(1 << bits);
   char* b = (char*)malloc(1 << bits);
   memcpy(b, a, 1 << bits);
   return 0;
}

When compiled with "gcc -S", the result is

main:
.LFB6:
.cfi_startproc
endbr64
pushq   %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq    %rsp, %rbp
.cfi_def_cfa_register 6
subq    $32, %rsp
movl    $8, -20(%rbp)
movl    -20(%rbp), %eax
movl    $1, %edx
movl    %eax, %ecx
sall    %cl, %edx
movl    %edx, %eax
cltq
movq    %rax, %rdi
call    malloc@PLT
movq    %rax, -16(%rbp)
movl    -20(%rbp), %eax
movl    $1, %edx
movl    %eax, %ecx
sall    %cl, %edx
movl    %edx, %eax
cltq
movq    %rax, %rdi
call    malloc@PLT
movq    %rax, -8(%rbp)
movl    -20(%rbp), %eax
movl    $1, %edx
movl    %eax, %ecx
sall    %cl, %edx
movl    %edx, %eax
movslq  %eax, %rdx
movq    -16(%rbp), %rcx
movq    -8(%rbp), %rax
movq    %rcx, %rsi
movq    %rax, %rdi
call    memcpy@PLT
movl    $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.cfi_endproc

The part that is incorrect is:

sall    %cl, %edx
movl    %edx, %eax
cltq
movq    %rax, %rdi

It should zero-extend before the shift, but instead it sign-extends after the
shift... Bit shifting is always an unsigned operation. It correctly determines
that the function requires a 64-bit parameter, but fails to determine that it's
unsigned. The integer promotion rules say that an unsigned type in an
expression must be promoted to a larger unsigned type if that type can hold the
result. As a bit shift is an unsigned operation, the temporary should also be
unsigned.

Stock gcc headers don't have a UINTPTR_C() macro which could be used to
explicitly cast the constant "1" to pointer size, giving a hint that the shift
is indeed an unsigned operation.
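
A minimal sketch of that kind of explicit-width workaround, using the
UINT64_C() macro that <stdint.h> does provide (alloc_pow2 is a hypothetical
helper name, not part of the original test case):

#include <stdint.h>
#include <stdlib.h>

static char *alloc_pow2(int bits)
{
  /* UINT64_C(1) forces the shift to be evaluated in uint64_t, so the
     result reaches malloc() without any sign extension. */
  return malloc(UINT64_C(1) << bits);
}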

gcc version is: gcc (Ubuntu 12.2.0-3ubuntu1) 12.2.0

[Bug c/108580] gcc treats shifts as signed operation, does wrong promotion

2023-01-28 Thread postmaster at raasu dot org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108580

--- Comment #2 from postmaster at raasu dot org ---
If I try to shift into the highest bit of a signed type, the compiler will
reject the code, and that is correct behaviour. The point here is that the
left-hand side of the shift operation is by default the same size as "int",
i.e. 32 bits, which means it can't be promoted to "int" again.

This behaviour is the same with gcc, clang and Visual C++, but Visual C++
correctly gives a warning that the code is ambiguous (the exact message is
"Arithmetic overflow"); however, it's also a C++ compiler, which might validate
the code against C++ rules, not C.

[Bug c/108580] gcc treats shifts as signed operation, does wrong promotion

2023-01-28 Thread postmaster at raasu dot org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108580

--- Comment #4 from postmaster at raasu dot org ---
I'm not mixing things... The assembly code clearly shows it's using a 32-bit
shift. On both 32-bit and 64-bit architectures, the left-hand side of a shift
operation is by default 32 bits (EAX instead of RAX) and the right-hand side is
8 bits (CL instead of CX, ECX or RCX).

Using "1U << bits" to explicitly force unsigned 32-bit shift would be incorrect
code. "(size_t)1 << bits", which is also "incorrect" code, would surprisingly
result in correct code generation with both 32-bit and 64-bit targets.
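
For illustration, the two variants discussed above as compilable fragments
(the helper names here are hypothetical):

#include <stddef.h>
#include <stdlib.h>

/* 32-bit unsigned shift; the result is widened to size_t afterwards. */
static void *alloc_v1(int bits) { return malloc(1U << bits); }

/* The shift itself is done in size_t, i.e. at full pointer width. */
static void *alloc_v2(int bits) { return malloc((size_t)1 << bits); }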

The result of any left shift involving negative numbers, including
left-shifting a non-zero bit into the highest bit of a signed integer, is
undefined.

[Bug c/108580] gcc treats shifts as signed operation, does wrong promotion

2023-01-28 Thread postmaster at raasu dot org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108580

--- Comment #6 from postmaster at raasu dot org ---
There is a wrong assumption again... The literal "1" is always unsigned, as
there are no implicit signed literals, even though there are explicit signed
literals... When somebody writes "-1" it is treated as the expression "0 - 1",
not as a literal "negative one"... This is because the subtraction operator has
higher precedence. An empty literal always equals the literal "0".

[Bug c/108580] gcc treats shifts as signed operation, does wrong promotion

2023-01-28 Thread postmaster at raasu dot org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108580

--- Comment #8 from postmaster at raasu dot org ---
I know enough C to know that you can't write code like:

int i = 0xFFFFFFFF;

This is not equal to:

int i = -1;

or

int i = (-1);


---
The largest literal you can assign to "int" is "0x7FFFFFFF". Any larger value
must be either the result of an expression or another variable, otherwise it
will result in an "arithmetic overflow" warning.

Some literals and operations are inherently unsigned, no matter what the
generic rules say. As I already said, writing "1u << bits" would be incorrect,
and a strictly conforming or "pedantic" compiler would throw a warning, as the
types don't match and implicit conversion doesn't happen with sizes larger than
32 bits. Type modifiers are otherwise case-insensitive, but don't support mixed
case.

The C standard doesn't even mention "size_t" or have a type modifier for it.
Even though printf() and the like support "%z", it is considered an extension
and will be rejected when using strict/pedantic mode.

[Bug tree-optimization/79938] gcc unnecessarily spills xmm register to stack when inserting vector items

2021-08-03 Thread postmaster at raasu dot org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #5 from postmaster at raasu dot org ---
To my mind it's basically four shuffles and three vector additions. It's part
of a vectorized adler32 implementation, so there is a real-life use for the
optimization.

[Bug tree-optimization/79938] gcc unnecessarily spills xmm register to stack when inserting vector items

2021-08-05 Thread postmaster at raasu dot org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #6 from postmaster at raasu dot org ---
I tried identical code using intrinsics with both clang and gcc:

clang:

 movdqa xmm1,XMMWORD PTR [rip+0xd98]        # 402050 <_IO_stdin_used+0x50>
 pand   xmm1,xmm0
 movdqa xmm2,xmm0
 pshufb xmm2,XMMWORD PTR [rip+0xd97]        # 402060 <_IO_stdin_used+0x60>
 movdqa xmm3,xmm0
 pshufb xmm3,XMMWORD PTR [rip+0xd9a]        # 402070 <_IO_stdin_used+0x70>
 paddd  xmm2,xmm1
 psrld  xmm0,0x18
 paddd  xmm0,xmm3
 paddd  xmm0,xmm2

gcc:

 movdqa  %xmm0, %xmm1
 movdqa  %xmm0, %xmm2
 movdqa  %xmm0, %xmm3
 pshufb  .LC0(%rip), %xmm1
 pshufb  .LC1(%rip), %xmm2
 pshufb  .LC2(%rip), %xmm3
 pshufb  .LC3(%rip), %xmm0
 paddd   %xmm3, %xmm0
 paddd   %xmm2, %xmm0
 paddd   %xmm1, %xmm0


This is the function using intrinsics:

static __m128i __attribute__((noinline)) haddd_epu8(__m128i a)
{
   __m128i b1 = _mm_shuffle_epi8(a, _mm_set_epi8(
      0x80, 0x80, 0x80, 12, 0x80, 0x80, 0x80,  8,
      0x80, 0x80, 0x80,  4, 0x80, 0x80, 0x80,  0));
   __m128i b2 = _mm_shuffle_epi8(a, _mm_set_epi8(
      0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80,  9,
      0x80, 0x80, 0x80,  5, 0x80, 0x80, 0x80,  1));
   __m128i b3 = _mm_shuffle_epi8(a, _mm_set_epi8(
      0x80, 0x80, 0x80, 14, 0x80, 0x80, 0x80, 10,
      0x80, 0x80, 0x80,  6, 0x80, 0x80, 0x80,  2));
   __m128i b4 = _mm_shuffle_epi8(a, _mm_set_epi8(
      0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11,
      0x80, 0x80, 0x80,  7, 0x80, 0x80, 0x80,  3));
   __m128i c = _mm_add_epi32(b1, _mm_add_epi32(b2, _mm_add_epi32(b3, b4)));
   return c;
}
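
For what it's worth, a small hypothetical test harness for the function above
(assuming it is compiled in the same file, with -mssse3 or higher), comparing
each 32-bit lane against a scalar sum of the corresponding four input bytes:

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

int main(void)
{
   uint8_t in[16];
   uint32_t out[4];
   for (int i = 0; i < 16; i++)
      in[i] = (uint8_t)(i + 1);
   _mm_storeu_si128((__m128i *)out,
                    haddd_epu8(_mm_loadu_si128((const __m128i *)in)));
   for (int g = 0; g < 4; g++) {
      /* Lane g should hold in[4g] + in[4g+1] + in[4g+2] + in[4g+3]. */
      uint32_t ref = in[4*g] + in[4*g+1] + in[4*g+2] + in[4*g+3];
      printf("lane %d: got %u, expected %u\n", g, out[g], ref);
   }
   return 0;
}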