On Mon, Feb 4, 2019 at 5:59 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> On Mon, Feb 04, 2019 at 05:36:12AM -0800, H.J. Lu wrote:
> > +      /* Shift __A128 and __N128 to the left by the adjustment. */
> > +      switch (offset)
>
> Ah, no, sorry, that is a bad suggestion then.  On the other side,
The generated code isn't too bad:

(gdb) disass test_maskmovq
Dump of assembler code for function test_maskmovq:
   0x00000000004011b0 <+0>:   mov    %rdx,%rax
   0x00000000004011b3 <+3>:   movq   (%rdi),%xmm0
   0x00000000004011b7 <+7>:   movq   (%rsi),%xmm1
   0x00000000004011bb <+11>:  and    $0xf,%eax
   0x00000000004011be <+14>:  je     0x4011d4 <test_maskmovq+36>
   0x00000000004011c0 <+16>:  cmp    $0x8,%rax
   0x00000000004011c4 <+20>:  jbe    0x4011e0 <test_maskmovq+48>
   0x00000000004011c6 <+22>:  sub    $0x8,%rdx
   0x00000000004011ca <+26>:  pslldq $0x8,%xmm0
   0x00000000004011cf <+31>:  pslldq $0x8,%xmm1
   0x00000000004011d4 <+36>:  mov    %rdx,%rdi
   0x00000000004011d7 <+39>:  maskmovdqu %xmm1,%xmm0
   0x00000000004011db <+43>:  retq
   0x00000000004011dc <+44>:  nopl   0x0(%rax)
   0x00000000004011e0 <+48>:  sub    %rax,%rdx
   0x00000000004011e3 <+51>:  jmpq   *0x402008(,%rax,8)
   0x00000000004011ea <+58>:  nopw   0x0(%rax,%rax,1)
   0x00000000004011f0 <+64>:  pslldq $0x7,%xmm0
   0x00000000004011f5 <+69>:  pslldq $0x7,%xmm1
   0x00000000004011fa <+74>:  jmp    0x4011d4 <test_maskmovq+36>
   0x00000000004011fc <+76>:  nopl   0x0(%rax)
   0x0000000000401200 <+80>:  pslldq $0x2,%xmm0
   0x0000000000401205 <+85>:  pslldq $0x2,%xmm1
   0x000000000040120a <+90>:  jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040120c <+92>:  nopl   0x0(%rax)
   0x0000000000401210 <+96>:  pslldq $0x3,%xmm0
   0x0000000000401215 <+101>: pslldq $0x3,%xmm1
   0x000000000040121a <+106>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040121c <+108>: nopl   0x0(%rax)
   0x0000000000401220 <+112>: pslldq $0x4,%xmm0
   0x0000000000401225 <+117>: pslldq $0x4,%xmm1
   0x000000000040122a <+122>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040122c <+124>: nopl   0x0(%rax)
   0x0000000000401230 <+128>: pslldq $0x5,%xmm0
   0x0000000000401235 <+133>: pslldq $0x5,%xmm1
   0x000000000040123a <+138>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040123c <+140>: nopl   0x0(%rax)
   0x0000000000401240 <+144>: pslldq $0x6,%xmm0
   0x0000000000401245 <+149>: pslldq $0x6,%xmm1
   0x000000000040124a <+154>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040124c <+156>: pslldq $0x1,%xmm0
   0x0000000000401251 <+161>: pslldq $0x1,%xmm1
   0x0000000000401256 <+166>: jmpq   0x4011d4 <test_maskmovq+36>
End of assembler dump.
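The source for this version looks roughly like the following (a sketch
reconstructed from the generated code above, not the exact test case;
the function signature and the page-crossing adjustment are my reading
of the disassembly):

#include <emmintrin.h>

/* Emulate MASKMOVQ (8-byte masked store) with MASKMOVDQU (16-byte
   masked store).  If P isn't 16-byte aligned, move the store window
   down and shift the data and mask left by the same amount, so the
   16-byte access doesn't extend past the bytes the original 8-byte
   store would touch.  */
void
test_maskmovq (const __m64 *data, const __m64 *mask, char *p)
{
  __m128i __A128 = _mm_loadl_epi64 ((const __m128i *) data);
  __m128i __N128 = _mm_loadl_epi64 ((const __m128i *) mask);
  unsigned long offset = (unsigned long) p & 0xf;

  if (offset)
    {
      /* Shift __A128 and __N128 to the left by the adjustment.
	 _mm_slli_si128 requires an immediate count, hence the
	 switch (compiled to the jump table seen above).  */
      unsigned long adjust = offset > 8 ? 8 : offset;
      p -= adjust;
      switch (adjust)
	{
	case 1:
	  __A128 = _mm_slli_si128 (__A128, 1);
	  __N128 = _mm_slli_si128 (__N128, 1);
	  break;
	case 2:
	  __A128 = _mm_slli_si128 (__A128, 2);
	  __N128 = _mm_slli_si128 (__N128, 2);
	  break;
	case 3:
	  __A128 = _mm_slli_si128 (__A128, 3);
	  __N128 = _mm_slli_si128 (__N128, 3);
	  break;
	case 4:
	  __A128 = _mm_slli_si128 (__A128, 4);
	  __N128 = _mm_slli_si128 (__N128, 4);
	  break;
	case 5:
	  __A128 = _mm_slli_si128 (__A128, 5);
	  __N128 = _mm_slli_si128 (__N128, 5);
	  break;
	case 6:
	  __A128 = _mm_slli_si128 (__A128, 6);
	  __N128 = _mm_slli_si128 (__N128, 6);
	  break;
	case 7:
	  __A128 = _mm_slli_si128 (__A128, 7);
	  __N128 = _mm_slli_si128 (__N128, 7);
	  break;
	default:
	  __A128 = _mm_slli_si128 (__A128, 8);
	  __N128 = _mm_slli_si128 (__N128, 8);
	  break;
	}
    }
  _mm_maskmoveu_si128 (__A128, __N128, p);
}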
__int128 isn't much better:

(gdb) disass test_maskmovq
Dump of assembler code for function test_maskmovq:
   0x00000000004011b0 <+0>:   mov    %rdx,%rcx
   0x00000000004011b3 <+3>:   mov    (%rdi),%rax
   0x00000000004011b6 <+6>:   mov    (%rsi),%rdi
   0x00000000004011b9 <+9>:   and    $0xf,%ecx
   0x00000000004011bc <+12>:  je     0x401240 <test_maskmovq+144>
   0x00000000004011c2 <+18>:  cmp    $0x8,%rcx
   0x00000000004011c6 <+22>:  mov    $0x8,%esi
   0x00000000004011cb <+27>:  mov    %rax,%r8
   0x00000000004011ce <+30>:  push   %rbx
   0x00000000004011cf <+31>:  cmova  %rsi,%rcx
   0x00000000004011d3 <+35>:  sar    $0x3f,%rax
   0x00000000004011d7 <+39>:  mov    %r8,%r10
   0x00000000004011da <+42>:  mov    %rdi,%rbx
   0x00000000004011dd <+45>:  mov    %rax,%r11
   0x00000000004011e0 <+48>:  sar    $0x3f,%rdi
   0x00000000004011e4 <+52>:  xor    %eax,%eax
   0x00000000004011e6 <+54>:  sub    %rcx,%rdx
   0x00000000004011e9 <+57>:  shl    $0x3,%ecx
   0x00000000004011ec <+60>:  mov    %rdi,%rsi
   0x00000000004011ef <+63>:  shl    %cl,%r10
   0x00000000004011f2 <+66>:  shld   %cl,%r8,%r11
   0x00000000004011f6 <+70>:  test   $0x40,%cl
   0x00000000004011f9 <+73>:  cmovne %r10,%r11
   0x00000000004011fd <+77>:  cmovne %rax,%r10
   0x0000000000401201 <+81>:  shld   %cl,%rbx,%rsi
   0x0000000000401205 <+85>:  xor    %edi,%edi
   0x0000000000401207 <+87>:  shl    %cl,%rbx
   0x000000000040120a <+90>:  test   $0x40,%cl
   0x000000000040120d <+93>:  mov    %r11,-0x8(%rsp)
   0x0000000000401212 <+98>:  cmovne %rbx,%rsi
   0x0000000000401216 <+102>: movq   %r10,%xmm0
   0x000000000040121b <+107>: cmovne %rdi,%rbx
   0x000000000040121f <+111>: mov    %rdx,%rdi
   0x0000000000401222 <+114>: movq   %rbx,%xmm1
   0x0000000000401227 <+119>: movhps -0x8(%rsp),%xmm0
   0x000000000040122c <+124>: mov    %rsi,-0x8(%rsp)
   0x0000000000401231 <+129>: movhps -0x8(%rsp),%xmm1
   0x0000000000401236 <+134>: maskmovdqu %xmm1,%xmm0
   0x000000000040123a <+138>: pop    %rbx
   0x000000000040123b <+139>: retq
   0x000000000040123c <+140>: nopl   0x0(%rax)
   0x0000000000401240 <+144>: movq   %rdi,%xmm1
   0x0000000000401245 <+149>: movq   %rax,%xmm0
   0x000000000040124a <+154>: mov    %rdx,%rdi
   0x000000000040124d <+157>: maskmovdqu %xmm1,%xmm0
   0x0000000000401251 <+161>: retq
End of assembler dump.

> (zext (word_var)) << shift
> where zext is from "word" to double-word and shift is 1 to word bitsize - 1
> can be done as (word_var << shift) | ((word_var >> (word_bitsize - shift))
> << word_bitsize)
> so you could avoid the int128 shifts anyway and just shift left and right
> and construct v2di from that.

This requires 2 64-bit variables for one 128-bit variable.  There isn't
much of a difference for x86-64.  I don't think we can emulate MMX with
SSE in 32-bit mode since __m64 is passed and returned in MMX registers.

--
H.J.
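P.S. For reference, the decomposition you describe, as a sketch (the
helper name is made up; per your note it only holds for shift counts
from 1 to word bitsize - 1):

#include <stdint.h>
#include <emmintrin.h>

/* Compute ((zext to 128-bit) word_var) << shift without an __int128
   shift, for 1 <= shift <= 63: the low double-word is
   word_var << shift, the high double-word is word_var >> (64 - shift),
   and the v2di is built directly from the two halves.  */
static __m128i
shift_zext_to_v2di (uint64_t word_var, unsigned int shift)
{
  uint64_t lo = word_var << shift;
  uint64_t hi = word_var >> (64 - shift);
  return _mm_set_epi64x (hi, lo);	/* High element first.  */
}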