This is some low-level crypto code, an MMX implementation of Dan
Bernstein's "ChaCha" pseudorandom function. The input is a 4x4 array
of 32-bit words, and mixing proceeds down either columns or diagonals.
Thus, the implementation keeps each row in a pair of MMX registers,
does mixing down the columns, then swizzles the rows (shear), mixes down
the columns again, then unshears.
It maps very nicely to sse2 registers, but I was trying to write an MMX
implementation for completeness. This is tricky because I really need
9 registers, but I have only 8.
I could of course write this in straight assembly, but I was trying to get
gcc to do instruction scheduling for me. I have progressively added
more and more "keep this in MMX registers, damn it!" hints to the source,
but GCC keeps generating preposterously large stack frames.
(This example of 516 bytes is better than the 2000+ bytes I started with
before adding all the explicit register specifications.)
I realize that the register pressure is extreme, but I'm handing gcc
statements that map directly to 2-address instructions, and I'm not sure
how much more I can do.
Is there some elementary mistake I'm making? Or should I just stop being cruel
to the compiler?
System is (32-bit) Debian Linux, gcc version 4.6.1 20110524 (prerelease)
(Debian 4.6.0-9)
cc -W -Wall -Os -fomit-frame-pointer -march=pentium2 -mmmx -mno-sse -S chacha1.c
gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/i486-linux-gnu/4.6.1/lto-wrapper
Target: i486-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 4.6.0-9'
--with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++,go --prefix=/usr
--program-suffix=-4.6 --enable-shared --enable-multiarch
--with-multiarch-defaults=i386-linux-gnu --enable-linker-build-id
--with-system-zlib --libexecdir=/usr/lib --without-included-gettext
--enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6
--libdir=/usr/lib --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug
--enable-libstdcxx-time=yes --enable-plugin --enable-objc-gc
--enable-targets=all --with-arch-32=i586 --with-tune=generic
--enable-checking=release --build=i486-linux-gnu --host=i486-linux-gnu
--target=i486-linux-gnu
Thread model: posix
gcc version 4.6.1 20110524 (prerelease) (Debian 4.6.0-9)
Source is as follows, then generated assembly.
#include <stdint.h>
/*
 * Shared declarations: the GNU C vector types, the constant "sigma",
 * and the round count.
 */

/* Four 32-bit lanes, 16 bytes in total. */
typedef int32_t v4si __attribute__((__vector_size__(16)));

/* Same shape as v4si, with the alignment set to 4 bytes. */
typedef int32_t v4si_u __attribute__((__vector_size__(16), __aligned__(4)));

/* Two 32-bit lanes, 8 bytes in total. */
typedef int32_t v2si __attribute__((__vector_size__(8)));

/* Declaration only; the definition is not visible in this part of the file. */
extern const v4si sigma;

/* Number of rounds: 8, 12, or 20. */
#define ROUNDS 12
/*
 * Entry points.  Each one takes a read-only 8-word key, a read-only
 * 4-word iv, and a restrict-qualified output pointer.
 * Only chacha1 (the MMX version) is visible here; chacha2 and chacha3 are
 * presumably alternative implementations -- TODO confirm.
 */
void chacha1(const uint32_t key[8], const uint32_t iv[4],
             uint32_t *__restrict out);
void chacha2(const uint32_t key[8], const uint32_t iv[4],
             uint32_t *__restrict out);
void chacha3(const uint32_t key[8], const uint32_t iv[4],
             uint32_t *__restrict out);
/* Version 1: an mmx implementation */
/*
 * The basic quarter-round step: y += z; x ^= y; x <<<= k;
 * where "<<<" is a rotate left by k bits, applied to each 32-bit lane
 * of x (built from a left shift by k, a right shift by 32-k, and an OR).
 *
 * x, y and z are v2si lvalues.  x and y are updated in place; z is only
 * read.  k is the rotate count; it is pasted unparenthesized into "32-k",
 * so pass a plain constant rather than an expression.
 *
 * Both variants name their arguments more than once, so the arguments
 * must be free of side effects.
 *
 * The enabled "#if 1" variant spells the operation out as one builtin per
 * statement and pins the scratch copy needed for the rotate to %mm7.
 * The disabled "#else" variant writes the same computation as a single
 * expression using GCC vector operators and lets the compiler pick the
 * temporary.
 */
#if 1
#define OP(x,y,z,k) do { \
register v2si t asm("%mm7"); \
y = __builtin_ia32_paddd(y, z); \
x = __builtin_ia32_pxor(x, y); \
t = x; \
x = __builtin_ia32_pslldi(x, k);\
t = __builtin_ia32_psrldi(t, 32-k); \
x = __builtin_ia32_por(x, t); \
} while (0)
#else
#define OP(x,y,z,k) ( \
x ^= y += z,\
x = __builtin_ia32_pslldi(x, k) | \
__builtin_ia32_psrldi(x, 32-k) \
)
#endif
/*
 * Rotate the four 32-bit words held in the register pair y:x right by
 * one word (32 bits).
 * If the words of y:x are 3:2:1:0, rotate right to 0:3:2:1.
 * Little-endian (lowest word first), that's 0123 -> 1230.
 *
 * x and y are v2si lvalues, both updated in place.  Each is named more
 * than once, so they must be free of side effects.
 *
 * Step by step, writing each register as high:low and using x, y for
 * the values on entry:
 *   t = x.low  : (don't care)
 *   x = x.high : x.high
 *   x = y.low  : x.high
 *   y = x.low  : y.high      (x.low comes from t's high word)
 *
 * The scratch register t is pinned to %mm7 and declared "= t", i.e.
 * initialized from itself.  Only its high word is ever consumed, and the
 * first unpack fills that with x's low word, so the initial contents do
 * not matter.  Presumably the self-initialization is there to silence an
 * "uninitialized variable" warning -- TODO confirm.
 */
#define ROTW(x,y) do { \
register v2si t asm("%mm7") = t; \
t = __builtin_ia32_punpckldq(t, x); \
x = __builtin_ia32_punpckhdq(x, x); \
x = __builtin_ia32_punpckldq(x, y); \
y = __builtin_ia32_punpckhdq(y, t); \
} while(0)
void
chacha1(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict out)
{
/*
* There aren't enough MMX registers for all this, plus
* temporaries, so the compiler will have to do some spilling.
*/
register v2si a0 asm("%mm0") = ((v2si const *)&sigma)[0];
register v2si a1 asm("%mm1") = ((v2si const *)&sigma)[1];
register v2si b0 asm("%mm2") = ((v2si const *)key)[0];
register v2si b1 asm("%mm3") = ((v2si const *)key)[1];
register v2si c0 asm("%mm4") = ((v2si const *)key)[2];
register v2si c1 asm("%mm5") = ((v2si const *)key)[3];
register v2si d asm("%mm6") = ((v2si const *)iv)[0];
v2si dd[2]; /* On stack */