On Mon, 18 Dec 2017, Richard Henderson wrote:
> The x86 vector instruction set is extremely irregular. With newer
> editions, Intel has filled in some of the blanks. However, we don't
> get many 64-bit operations until SSE4.2, introduced in 2009.
>
> The subsequent edition was for AVX1, introduced in 2011, which added
> three-operand addressing, and adjusts how all instructions should be
> encoded.
>
> Given the relatively narrow 2 year window between possible to support
> and desirable to support, and to vastly simplify code maintenance,
> I am only planning to support AVX1 and later cpus.
>
> Signed-off-by: Richard Henderson <[email protected]>
> ---
> tcg/i386/tcg-target.h | 36 ++-
> tcg/i386/tcg-target.inc.c | 561
> ++++++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 546 insertions(+), 51 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 63d27f10e7..e9a4d92598 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> -static inline void tcg_out_mov(TCGContext *s, TCGType type,
> - TCGReg ret, TCGReg arg)
> +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
> +{
> + int rexw = 0;
> +
> + if (arg == ret) {
> + return;
> + }
> + switch (type) {
> + case TCG_TYPE_I64:
> + rexw = P_REXW;
> + /* fallthru */
> + case TCG_TYPE_I32:
> + if (ret < 16) {
> + if (arg < 16) {
> + tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
> + } else {
> + tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, ret, 0, arg);
> + }
> + } else {
> + if (arg < 16) {
> + tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
> + } else {
> + tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
> + }
> + }
> + break;
> +
> + case TCG_TYPE_V64:
> + tcg_debug_assert(ret >= 16 && arg >= 16);
> + tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
> + break;
> + case TCG_TYPE_V128:
> + tcg_debug_assert(ret >= 16 && arg >= 16);
> + tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
> + break;
> + case TCG_TYPE_V256:
> + tcg_debug_assert(ret >= 16 && arg >= 16);
> + tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
> + break;
> +
> + default:
> + g_assert_not_reached();
> + }
> +}
I think something is wrong with the instruction encodings here. Looks like
tcg_out_mov(&tcg_ctx, TCG_TYPE_I64, TCG_REG_EBP, TCG_REG_XMM0)
produces
vmovq %xmm5, %rax
instead of the expected
vmovq %xmm0, %rbp
— as if the reg and r/m operands were swapped in the VEX encoding.
Here is the dump.
IN:
0x00400580: 4e040c41 dup v1.4s, w2
0x00400584: 4b0203e2 neg w2, w2
0x00400588: 3d800021 str q1, [x1]
0x0040058c: d65f03c0 ret
OP after optimization and liveness analysis:
ld_i32 tmp0,env,$0xffffffffffffffec dead: 1
movi_i32 tmp1,$0x0
brcond_i32 tmp0,tmp1,lt,$L0 dead: 0 1
---- 0000000000400580 0000000000000000 0000000000000000
dup_vec v128,e32,tmp2,x2
st_vec v128,e8,tmp2,env,$0x8b0 dead: 0
---- 0000000000400584 0000000000000000 0000000000000000
ext32u_i64 tmp4,x2 dead: 1
neg_i64 tmp5,tmp4 dead: 1
ext32u_i64 x2,tmp5 sync: 0 dead: 0 1
<...>
OUT: [size=111]
0x6075bf40: 41 8b 6e ec movl -0x14(%r14), %ebp
0x6075bf44: 85 ed testl %ebp, %ebp
0x6075bf46: 0f 8c 59 00 00 00 jl 0x6075bfa5
0x6075bf4c: c4 c1 7a 7e 46 50 vmovq 0x50(%r14), %xmm0
0x6075bf52: c5 f9 70 c8 00 vpshufd $0, %xmm0, %xmm1
0x6075bf57: c4 c1 7a 7f 8e b0 08 00 vmovdqu %xmm1, 0x8b0(%r14)
0x6075bf5f: 00
0x6075bf60: c4 e1 f9 7e e8 vmovq %xmm5, %rax
0x6075bf65: 8b ed movl %ebp, %ebp
0x6075bf67: 48 f7 dd negq %rbp
0x6075bf6a: 8b ed movl %ebp, %ebp
0x6075bf6c: 49 89 6e 50 movq %rbp, 0x50(%r14)
<...>
%xmm5 is used uninitialized, there is no move from either %xmm0 or
0x50(%r14) to %ebp, there are two unnecessary movl %ebp, %ebp.
--
Kirill