ICE in maybe_record_trace_start, at dwarf2cfi.c:2328 (trunk@243328)

2016-12-10 Thread George Spelvin
This is avr-gcc compiled from the git mirror commit
fcdd7053da, which says it's SVN trunk@243328.

This code used to work on some earlier GCC, but I don't know which
version.

Host is Debian Linux, i686, compiled with gcc (Debian 6.2.1-5) 6.2.1 20161124.

Configured with: ../gcc/configure --target=avr --enable-languages=c,c++ 
--enable-shared --with-system-zlib --enable-long-long --disable-nls 
--disable-libssp --disable-libsanitizer --program-prefix=avr- 
--with-as=/usr/bin/avr-as --with-ld=/usr/bin/avr-ld

$ avr-gcc -gdwarf-4 -Os -mmcu=at90s8515 -c scanf-nul.i
scanf-nul.i: In function 'main':
scanf-nul.i:130:1: internal compiler error: in maybe_record_trace_start, at 
dwarf2cfi.c:2328
 }
 ^
0x837b20c maybe_record_trace_start
../../gcc/gcc/dwarf2cfi.c:2328
0x837b502 create_trace_edges
../../gcc/gcc/dwarf2cfi.c:2424
0x837d412 scan_trace
../../gcc/gcc/dwarf2cfi.c:2638
0x837def9 create_cfi_notes
../../gcc/gcc/dwarf2cfi.c:2664
0x837def9 execute_dwarf2_frame
../../gcc/gcc/dwarf2cfi.c:3022
0x837def9 execute
../../gcc/gcc/dwarf2cfi.c:3502
Please submit a full bug report,
with preprocessed source if appropriate.
Please include the complete backtrace with any bug report.


The code is straight out of the avr-libc test suite.
The version below has the header spam stripped out.

== scanf-nul.i ==
/*
 * Basic integer typedefs, reduced from the avr-libc headers.
 * The widths of uint8_t and uint16_t come from GCC's __mode__ attribute
 * (__QI__ and __HI__ machine modes), not from the spelled base type.
 */
typedef unsigned int uint8_t __attribute__((__mode__(__QI__)));
typedef unsigned int uint16_t __attribute__ ((__mode__ (__HI__)));
typedef unsigned int size_t;
/*
 * Stream object behind FILE.  The test code below never touches these
 * members directly; it only passes FILE pointers to the library functions.
 * put and get are per-stream callbacks taking the stream itself as an
 * argument.
 */
struct __file {
 char *buf;
 unsigned char unget;
 uint8_t flags;
 int size;
 int len;
 int (*put)(char, struct __file *);
 int (*get)(struct __file *);
 void *udata;
};
typedef struct __file FILE;
/*
 * Library functions used by the test, declared by hand because the
 * original headers were stripped from this preprocessed file.
 * fdevopen() builds a stream from a put callback and a get callback.
 */
extern FILE *fdevopen(int (*__put)(char, FILE*), int (*__get)(FILE*));
extern int fclose(FILE *__stream);
extern int fscanf(FILE *__stream, const char *__fmt, ...);
extern void exit(int __status) __attribute__((__noreturn__));
extern void *memset(void *, int, size_t);
extern int memcmp(const void *, const void *, size_t) __attribute__((__pure__));

/* Result of the first fscanf() call in main(); starts out as 1. */
int vrslt = 1;
/*
 * Destination storage for the fscanf() conversions: two ints and two
 * 8-byte character buffers.  main() overwrites the whole object with
 * memset() before every test case, so the initialisers here are not
 * what the checks compare against.
 */
struct {
int i;
int j;
char s[8];
char t[8];
} v = { 1, 1, {1}, {1} };

/* Read cursor and end marker of the in-memory input consumed by ugetc(). */
const char *getpnt;
const char *getend;

/*
 * Stream "get" callback: hand back the next byte of the in-memory input.
 *
 * fp is ignored; the read position lives in getpnt/getend.
 * Returns the byte at getpnt (a plain char converted to int) and advances
 * getpnt, or -2 once getpnt has reached getend.
 */
int ugetc(FILE *fp)
{
    (void)fp;
    return (getpnt != getend) ? *getpnt++ : -2;
}

/*
 * Copy up to size bytes from ugetc() into buf.
 *
 * cookie is forwarded to ugetc() as its stream argument.
 * Reading stops early as soon as ugetc() returns a negative value.
 * Returns the number of bytes actually stored in buf.
 */
int uread(void *cookie, char *buf, size_t size)
{
    size_t count = 0;

    while (count < size) {
        int ch = ugetc(cookie);

        if (ch < 0)
            break;
        *buf++ = ch;
        count++;
    }
    return count;
}

/*
 * Close hook with nothing to release.
 *
 * cookie is accepted and ignored.  Always returns 0.
 */
int uclose(void *cookie)
{
    (void)cookie;   /* deliberately unused */

    return 0;
}

/*
 * (Re)open the test stream over an in-memory buffer.
 *
 * buf/size describe the bytes that ugetc() will subsequently deliver.
 * The stream from the previous call, if any, is closed first; the handle
 * is kept in a function-local static so it survives between calls.
 * Terminates the program with status 117 if fdevopen() fails.
 * Returns the newly opened stream.
 */
static FILE *uopen(const char *buf, int size)
{
    static FILE *stream;

    if (stream != 0)
        fclose(stream);

    stream = fdevopen(0, ugetc);
    if (stream == 0)
        exit(117);

    /* Point the ugetc() cursor at the new input. */
    getend = buf + size;
    getpnt = buf;

    return stream;
}

/*
 * Test driver: runs fscanf() over in-memory inputs that contain embedded
 * NUL bytes and exits with a distinct non-zero status at the first
 * mismatch.  Returns 0 when all four cases pass.
 *
 * Before each case v is refilled by memset() with the byte value of ~0,
 * so a "must be zero" check can only pass if fscanf() really stored a
 * NUL byte.
 *
 * NOTE(review): the exit codes look like line numbers from the original
 * avr-libc test file -- confirm against that source.
 */
int main()
{
FILE *fp;
int i;

/* Case 1: three %c conversions must yield 'A', NUL, 'B' unchanged. */
memset(&v, ~0, sizeof v);
fp = uopen("A\000B", 3);
vrslt = fscanf(fp, "%c%c%c", v.s, v.s + 1, v.s + 2);
if (vrslt != 3)
exit(133);
if(memcmp(v.s, "A\000B", 3))
exit(134);

/* Case 2: " %c" on tab, space, NUL must convert once and store NUL. */
memset(&v, ~0, sizeof v);
fp = uopen("\t \000", 3);
i = fscanf(fp, " %c", v.s);
if (i != 1) exit(140);
if (v.s[0]) exit(141);

/* Case 3: a NUL between two numbers must be picked up by the %c. */
memset(&v, ~0, sizeof v);
fp = uopen("123\000456", 7);
i = fscanf(fp, "%d%c%d", &v.i, v.s, &v.j);
if (i != 3) exit(147);
if (v.i != 123 || v.s[0] || v.j != 456)
exit(148);

/*
 * Case 4: "%s%s" must report a single conversion, and the first four
 * bytes of v.s must be 'A', NUL, 'B', 'C'.
 */
memset(&v, ~0, sizeof v);
fp = uopen("A\000BC", 4);
i = fscanf(fp, "%s%s", v.s, v.t);
if (i != 1) exit(154);
if (memcmp(v.s, "A\000BC", 4))
exit(155);

return 0;
}


How do I make register allocation behave?

2011-05-26 Thread George Spelvin
This is some low-level crypto code, an MMX implementation of Dan
Bernstein's "ChaCha" pseudorandom function.  The input is a 4x4 array
of 32-bit words, and mixing proceeds down either columns or diagonals.

Thus, the implementation keeps each row in a pair of MMX registers,
does mixing down the columns, then swizzles the rows (shear), mixes down
the columns again, then unshears.

It maps very nicely to sse2 registers, but I was trying to write an MMX
implementation for completeness.  This is tricky because I really need
9 registers, but I have only 8.

I could of course write this in straight assembly, but I was trying to get
gcc to do instruction scheduling for me.  I have progressively added
more and more "keep this in MMX registers, damn it!" hints to the source,
but GCC keeps generating preposterously large stack frames.
(This example of 516 bytes is better than the 2000+ bytes I started with
before adding all the explicit register specifications.)

I realize that the register pressure is extreme, but I'm handing gcc
statements that map directly to 2-address instructions, and I'm not sure
how much more I can do.

Is there some elementary mistake I'm making?  Or should I just stop being cruel
to the compiler?

System is (32-bit) Debian Linux, gcc version 4.6.1 20110524 (prerelease) 
(Debian 4.6.0-9)
cc -W -Wall -Os -fomit-frame-pointer -march=pentium2 -mmmx -mno-sse -S chacha1.c
gcc -v  
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/i486-linux-gnu/4.6.1/lto-wrapper
Target: i486-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 4.6.0-9' 
--with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs 
--enable-languages=c,c++,fortran,objc,obj-c++,go --prefix=/usr 
--program-suffix=-4.6 --enable-shared --enable-multiarch 
--with-multiarch-defaults=i386-linux-gnu --enable-linker-build-id 
--with-system-zlib --libexecdir=/usr/lib --without-included-gettext 
--enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 
--libdir=/usr/lib --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug 
--enable-libstdcxx-time=yes --enable-plugin --enable-objc-gc 
--enable-targets=all --with-arch-32=i586 --with-tune=generic 
--enable-checking=release --build=i486-linux-gnu --host=i486-linux-gnu 
--target=i486-linux-gnu
Thread model: posix
gcc version 4.6.1 20110524 (prerelease) (Debian 4.6.0-9) 


Source is as follows, then generated assembly.

#include <stdint.h>

/* Some types and a round constant needed everywhere */
/* v4si:   four int32_t lanes in a 16-byte GCC vector. */
typedef int32_t v4si __attribute__ ((vector_size (16)));
/* v4si_u: the same 16-byte vector, but declared with only 4-byte alignment. */
typedef int32_t v4si_u __attribute__ ((vector_size (16), aligned(4)));
/* v2si:   two int32_t lanes in an 8-byte GCC vector. */
typedef int32_t v2si __attribute__ ((vector_size (8)));

/* Constant row defined elsewhere; chacha1() loads its a0/a1 registers from it. */
extern v4si const sigma;

#define ROUNDS 12   /* 8, 12, or 20 */

/*
 * Entry points.  Each takes the key (declared as 8 words), the iv
 * (declared as 4 words) and an output pointer qualified __restrict.
 * chacha1 is the MMX version defined below.
 * NOTE(review): chacha2 and chacha3 are presumably further variants
 * defined elsewhere, and the required size of out is not visible here --
 * confirm both.
 */
void chacha1(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict 
out);
void chacha2(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict 
out);
void chacha3(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict 
out);

/* Version 1: an mmx implementation */

/*
 * The basic quarter-round step: x ^= y += z; x <<<= k; (rotate left)
 *
 * x, y and z are v2si lvalues: y and x are updated, z is only read.
 * k is the rotate count applied to each 32-bit lane of x.
 * NOTE(review): k is presumably always in 1..31 so that both shift
 * counts (k and 32-k) are meaningful -- confirm at the call sites.
 *
 * The "#if 1" variant spells the step out one builtin per statement and
 * pins the rotate temporary t to %mm7.  The "#else" variant expresses the
 * same computation with GCC vector operators and no explicit temporary.
 */
#if 1
#define OP(x,y,z,k) do { \
register v2si t  asm("%mm7");   \
y = __builtin_ia32_paddd(y, z); \
x = __builtin_ia32_pxor(x, y);  \
t = x;  \
x = __builtin_ia32_pslldi(x, k);\
t = __builtin_ia32_psrldi(t, 32-k); \
x = __builtin_ia32_por(x, t);   \
} while (0)
#else
#define OP(x,y,z,k) ( \
x ^= y += z,\
x = __builtin_ia32_pslldi(x, k) |   \
__builtin_ia32_psrldi(x, 32-k)  \
)
#endif

/* Rotate words right 32 bits */
/* If the words of y:x are 3:2:1:0, rotate right to 0:3:2:1 */
/* Little-endian, that's 0123 -> 1230 */
/*
 * Step by step, writing each register as {low, high} 32-bit words with
 * x = {0,1} and y = {2,3} on entry:
 *   t = punpckldq(t, x)   -> {t.low, 0}
 *   x = punpckhdq(x, x)   -> {1, 1}
 *   x = punpckldq(x, y)   -> {1, 2}
 *   y = punpckhdq(y, t)   -> {3, 0}
 * The incoming low word of t never reaches x or y, so t's initial value
 * does not affect the result.
 * NOTE(review): the self-initialisation "t = t" looks like the GCC idiom
 * for suppressing an uninitialized-variable warning -- confirm.
 */
#define ROTW(x,y) do { \
register v2si t  asm("%mm7") = t;   \
t = __builtin_ia32_punpckldq(t, x); \
x = __builtin_ia32_punpckhdq(x, x); \
x = __builtin_ia32_punpckldq(x, y); \
y = __builtin_ia32_punpckhdq(y, t); \
} while(0)

void
chacha1(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict out)
{
/*
 * There aren't enough MMX registers for all this, plus
 * temporaries, so the compiler will have to do some spilling.
 */
register v2si a0 asm("%mm0") = ((v2si const *)&sigma)[0];
register v2si a1 asm("%mm1") = ((v2si const *)&sigma)[1];
register v2si b0 asm("%mm2") = ((v2si const *)key)[0];
register v2si b1 asm("%mm3") = ((v2si const *)key)[1];
register v2si c0 asm("%mm4") = ((v2si const *)key)[2];
register v2si c1 asm("%mm5") = ((v2si const *)key)[3];
register v2si d  asm("%mm6") = ((v2si const *)iv)[0];
v2si dd[2]; /* On stack */