https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108695
--- Comment #9 from Martin Liška <marxin at gcc dot gnu.org> ---
Actually, looking at the tree dumps before and after the revision, it's leading
to a different place:
First difference happens in:
test_aes.ltrans0.ltrans.116t.dse2
<bb 4> [local count: 8687547526]:
- _118 = MEM[(ulong *)iv_4(D)];
- _120 = MEM[(ulong *)input_19];
- _121 = _118 ^ _120;
- MEM[(ulong *)iv_4(D)] = _121;
- _129 = MEM[(ulong *)iv_4(D) + 8B];
- _131 = MEM[(ulong *)input_19 + 8B];
- _132 = _129 ^ _131;
- MEM[(ulong *)iv_4(D) + 8B] = _132;
(There is one more optimized-out block like this.) The removed statements map to:
/*
 * Chained block encryption over `len` bytes of `input`, 16 bytes at a time:
 * each block is combined into `iv` with XOR16, `iv` is passed through
 * `cryptfn`, and the result is written to `output`.
 *
 * cryptfn  block primitive, called as cryptfn(rkeys, rounds, in, out)
 * rkeys    passed through unchanged to cryptfn
 * rounds   passed through unchanged to cryptfn
 * iv       16-byte chaining value; read and overwritten on every block
 * pad      padding mode; compared against PAD_ALWAYS and forwarded to fill_blk
 * input    source bytes; advanced by 16 per full block
 * output   destination bytes; advanced by 16 per full block
 * len      number of input bytes
 * olen     out: set to len, then increased by 16-(len&15) if a final
 *          (partial or padding) block is emitted
 *
 * Returns 16-(len&15), using the post-loop value of len, when
 * pad == PAD_ALWAYS or len&15 is non-zero; otherwise 0.
 *
 * NOTE(review): XOR16, fill_blk, PAD_ALWAYS and `crypto` are defined outside
 * this excerpt. XOR16 is described nearby as being built on XORN, which
 * accesses the buffers through ulong pointers -- confirm alignment and
 * aliasing assumptions for `iv`, `input` and `crypto->blkbuf2`.
 */
int AES_Gen_CBC_Enc(AES_Crypt_Blk_fn *cryptfn,
const uchar* rkeys, uint rounds,
uchar *iv, uint pad,
const uchar *input, uchar *output,
ssize_t len, ssize_t *olen)
{
/* Report the input length first; the tail branch below adds to it. */
*olen = len;
/* Full 16-byte blocks. */
while (len >= 16) {
/* iv ^= input block (these are the loads/stores shown in the dse2 dump). */
XOR16(iv, input, iv);
/* Transform iv in place: source and destination are the same buffer. */
cryptfn(rkeys, rounds, iv, iv);
/* The transformed iv is the output block and stays as the next chaining value. */
memcpy(output, iv, 16);
len -= 16; input += 16; output += 16;
}
/* Tail: leftover bytes, or a whole extra block when padding is forced. */
if (len || pad == PAD_ALWAYS) {
/* Scratch block taken from `crypto`, which is not a parameter of this function. */
uchar *in = crypto->blkbuf2;
/* presumably copies the remaining `len` bytes into `in` and pads per `pad` -- TODO confirm */
fill_blk(input, in, len, pad);
XOR16(iv, in, iv);
/* Unlike the loop, the result goes straight to output rather than back into iv. */
cryptfn(rkeys, rounds, iv, output);
/* Store last IV */
memcpy(iv, output, 16);
/* Account for the bytes added to round the tail up to a 16-byte block. */
*olen += 16-(len&15);
//memset(in, 0, 16);
//LFENCE;
}
/* Uses the post-loop len; yields 16 when pad == PAD_ALWAYS and len&15 == 0. */
return (pad == PAD_ALWAYS || (len&15))? 16-(len&15): 0;
}
where the XOR16 is implemented as:
/*
 * XORN(in1, in2, out, len): word-wise XOR of two buffers.
 *
 * Writes in1[i] ^ in2[i] to out[i] for i in [0, len/sizeof(ulong)), treating
 * all three pointers as arrays of ulong. `out` may be the same buffer as
 * `in1` or `in2`, since each word is read before it is written.
 *
 * - Only whole words are processed: a trailing len % sizeof(ulong) bytes
 *   are left untouched.
 * - `len` is parenthesized so an expression argument such as `a + b` is
 *   divided as a whole.
 * - The pointer arguments are evaluated on every iteration; do not pass
 *   expressions with side effects.
 * - The types `uint` and `ulong` must be in scope at the expansion site,
 *   and the macro declares a local named `_i`.
 *
 * NOTE(review): the (ulong*) casts assume the buffers are suitably aligned
 * for ulong and may legally be accessed as ulong under the compiler's
 * aliasing rules -- confirm at each call site (or build with
 * -fno-strict-aliasing).
 */
#define XORN(in1,in2,out,len) \
do { \
	uint _i; \
	for (_i = 0; _i < (len)/sizeof(ulong); ++_i) \
		*((ulong*)(out)+_i) = *((ulong*)(in1)+_i) ^ *((ulong*)(in2)+_i); \
} while(0)