On Sat, 13 Dec 2025 01:21:14 GMT, Vladimir Ivanov <[email protected]> wrote:

> Good work, Xiaohong!
>
Thanks so much for your review!
 
> Can you, please, include samples of machine code generated before/after the 
> patch (for AVX2 and AVX512)?

Sure. The generated code is identical for cases that need just **1 gather 
load**. For cases that need **2 or 4** gather loads, the main differences 
come from the **duplicated initialization instructions** emitted before each 
loop of 8B gather loads (which could be optimized in the future), and the 
additional code generated for **vector slicing and merging**.

Following is an example of loading a `Short256Vector` under `-XX:UseAVX=2`, 
which needs 2 gather loads. The corresponding Java code is:

private static final VectorSpecies<Short> S_SPECIES = 
ShortVector.SPECIES_PREFERRED;

static void gather_short() {
    for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
        ShortVector.fromArray(S_SPECIES, sa, i, index, i)
                   .intoArray(sr, i);
    }
}

static void gather_short_masked() {
    VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
    for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
        ShortVector.fromArray(S_SPECIES, sa, i, index, i, mask)
                   .intoArray(sr, i);
    }
}




Here is the kernel code generated **without** this patch:

  0x00007a0e8c06ecb0:   vmovd  %r9d,%xmm1
  0x00007a0e8c06ecb5:   lea    0x10(%rbx,%rsi,2),%r14
  0x00007a0e8c06ecba:   mov    %r13,%r8
  0x00007a0e8c06ecbd:   mov    $0x10,%r9d
  0x00007a0e8c06ecc3:   vpxor  %ymm5,%ymm5,%ymm5
  0x00007a0e8c06ecc7:   vpxor  %ymm4,%ymm4,%ymm4
  0x00007a0e8c06eccb:   vpcmpeqd %ymm6,%ymm6,%ymm6
  0x00007a0e8c06eccf:   vpsubd %ymm6,%ymm5,%ymm6
  0x00007a0e8c06ecd3:   vpslld $0x1,%ymm6,%ymm6
  0x00007a0e8c06ecd8:   vmovdqu 0x41020(%rip),%ymm5        # Stub::Stub 
Generator vector_iota_indices_stub+128 0x00007a0e8c0afd00
                                                            ;   {external_word}
  0x00007a0e8c06ece0:   vpxor  %ymm3,%ymm3,%ymm3
  0x00007a0e8c06ece4:   mov    (%r8),%r11d
  0x00007a0e8c06ece7:   vpinsrw $0x0,(%r14,%r11,2),%xmm3,%xmm3
  0x00007a0e8c06ecee:   mov    0x4(%r8),%r11d
  0x00007a0e8c06ecf2:   vpinsrw $0x1,(%r14,%r11,2),%xmm3,%xmm3
  0x00007a0e8c06ecf9:   mov    0x8(%r8),%r11d
  0x00007a0e8c06ecfd:   vpinsrw $0x2,(%r14,%r11,2),%xmm3,%xmm3
  0x00007a0e8c06ed04:   mov    0xc(%r8),%r11d
  0x00007a0e8c06ed08:   vpinsrw $0x3,(%r14,%r11,2),%xmm3,%xmm3
  0x00007a0e8c06ed0f:   vpermd %ymm3,%ymm5,%ymm3
  0x00007a0e8c06ed14:   vpsubd %ymm6,%ymm5,%ymm5
  0x00007a0e8c06ed18:   vpor   %ymm3,%ymm4,%ymm4
  0x00007a0e8c06ed1c:   add    $0x10,%r8
  0x00007a0e8c06ed20:   sub    $0x4,%r9d
  0x00007a0e8c06ed24:   jne    0x00007a0e8c06ece0
  0x00007a0e8c06ed26:   vmovdqu %ymm4,0x10(%rbp,%rsi,2)


And here is the kernel code generated **with** this patch:

  0x000070118c06a033:   vmovd  %edi,%xmm5 
  0x000070118c06a037:   vmovq  %rbp,%xmm3
  0x000070118c06a03c:   vmovd  %ecx,%xmm2
  0x000070118c06a040:   mov    %r9d,(%rsp)
  0x000070118c06a044:   lea    0x10(%rsi,%r10,2),%r14                       # 
start of the second gather_load operation
  0x000070118c06a049:   mov    %r11,%rbp
  0x000070118c06a04c:   mov    $0x8,%ecx
  0x000070118c06a051:   vpxor  %xmm4,%xmm4,%xmm4
  0x000070118c06a055:   vpxor  %xmm10,%xmm10,%xmm10
  0x000070118c06a05a:   vpcmpeqd %xmm11,%xmm11,%xmm11
  0x000070118c06a05f:   vpsubd %xmm11,%xmm4,%xmm11
  0x000070118c06a064:   vpslld $0x1,%xmm11,%xmm11
  0x000070118c06a06a:   vmovdqu 0x45c8e(%rip),%xmm4        # Stub::Stub 
Generator vector_iota_indices_stub+128 0x000070118c0afd00
                                                            ;   {external_word}
  0x000070118c06a072:   vpxor  %xmm6,%xmm6,%xmm6
  0x000070118c06a076:   mov    0x0(%rbp),%edi
  0x000070118c06a079:   vpinsrw $0x0,(%r14,%rdi,2),%xmm6,%xmm6
  0x000070118c06a080:   mov    0x4(%rbp),%edi
  0x000070118c06a083:   vpinsrw $0x1,(%r14,%rdi,2),%xmm6,%xmm6
  0x000070118c06a08a:   mov    0x8(%rbp),%edi
  0x000070118c06a08d:   vpinsrw $0x2,(%r14,%rdi,2),%xmm6,%xmm6
  0x000070118c06a094:   mov    0xc(%rbp),%edi
  0x000070118c06a097:   vpinsrw $0x3,(%r14,%rdi,2),%xmm6,%xmm6
  0x000070118c06a09e:   vpermd %ymm6,%ymm4,%ymm6
  0x000070118c06a0a3:   vpsubd %xmm11,%xmm4,%xmm4
  0x000070118c06a0a8:   vpor   %xmm6,%xmm10,%xmm10
  0x000070118c06a0ac:   add    $0x10,%rbp
  0x000070118c06a0b0:   sub    $0x4,%ecx
  0x000070118c06a0b3:   jne    0x000070118c06a072
  0x000070118c06a0b5:   vmovdqu %xmm10,%xmm4                                # 
vector reinterpret, the end of second gather_load
  0x000070118c06a0ba:   vperm2i128 $0x21,%ymm4,%ymm9,%ymm6      # vector slice
 
  0x000070118c06a0c0:   lea    0x10(%rsi,%r10,2),%r11                           
    # start of the first gather_load operation                       
  0x000070118c06a0c5:   mov    %rax,%rcx
  0x000070118c06a0c8:   mov    $0x8,%r8d
  0x000070118c06a0ce:   vpxor  %xmm10,%xmm10,%xmm10
  0x000070118c06a0d3:   vpxor  %xmm4,%xmm4,%xmm4
  0x000070118c06a0d7:   vpcmpeqd %xmm13,%xmm13,%xmm13
  0x000070118c06a0dc:   vpsubd %xmm13,%xmm10,%xmm13
  0x000070118c06a0e1:   vpslld $0x1,%xmm13,%xmm13
  0x000070118c06a0e7:   vmovdqu 0x45c11(%rip),%xmm10        # Stub::Stub 
Generator vector_iota_indices_stub+128 0x000070118c0afd00
                                                            ;   {external_word}
  0x000070118c06a0ef:   vpxor  %xmm12,%xmm12,%xmm12
  0x000070118c06a0f4:   mov    (%rcx),%r9d
  0x000070118c06a0f7:   vpinsrw $0x0,(%r11,%r9,2),%xmm12,%xmm12
  0x000070118c06a0fe:   mov    0x4(%rcx),%r9d
  0x000070118c06a102:   vpinsrw $0x1,(%r11,%r9,2),%xmm12,%xmm12
  0x000070118c06a109:   mov    0x8(%rcx),%r9d
  0x000070118c06a10d:   vpinsrw $0x2,(%r11,%r9,2),%xmm12,%xmm12
  0x000070118c06a114:   mov    0xc(%rcx),%r9d
  0x000070118c06a118:   vpinsrw $0x3,(%r11,%r9,2),%xmm12,%xmm12
  0x000070118c06a11f:   vpermd %ymm12,%ymm10,%ymm12
  0x000070118c06a124:   vpsubd %xmm13,%xmm10,%xmm10
  0x000070118c06a129:   vpor   %xmm12,%xmm4,%xmm4
  0x000070118c06a12e:   add    $0x10,%rcx
  0x000070118c06a132:   sub    $0x4,%r8d
  0x000070118c06a136:   jne    0x000070118c06a0ef
  0x000070118c06a138:   vmovdqu %xmm4,%xmm4                     ; vector 
reinterpret, the end of the first gather_load 
  0x000070118c06a13c:   vpor   %ymm6,%ymm4,%ymm4             ; final merge
  0x000070118c06a140:   vmovq  %xmm3,%r11
  0x000070118c06a145:   vmovdqu %ymm4,0x10(%r11,%r10,2)     ;*invokestatic 
store {reexecute=0 rethrow=0 return_oop=0}
                                                            ; - 
jdk.incubator.vector.ShortVector::intoArray@44 (line 3514)
                                                            ; - 
VectorAPITest::gather_short@38 (line 116)


For the masked cases, besides the additional instructions shown above, more 
code is generated for the **mask slice** operations.

I have also attached the full generated code for the various cases. Please 
kindly share your feedback. Thanks a lot!
[avx2_short_max_after.txt](https://github.com/user-attachments/files/24160578/avx2_short_max_after.txt)
[avx2_short_max_before.txt](https://github.com/user-attachments/files/24160581/avx2_short_max_before.txt)
[avx3_short_max_before.txt](https://github.com/user-attachments/files/24160582/avx3_short_max_before.txt)
[avx3_short_max_after.txt](https://github.com/user-attachments/files/24160584/avx3_short_max_after.txt)
[avx3_short_max_masked_after.txt](https://github.com/user-attachments/files/24160638/avx3_short_max_masked_after.txt)
[avx3_short_max_masked_before.txt](https://github.com/user-attachments/files/24160644/avx3_short_max_masked_before.txt)

-------------

PR Comment: https://git.openjdk.org/jdk/pull/28520#issuecomment-3654421400

Reply via email to