https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102327

--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
Here's the optimization I did for v8hf/v16hf/v32hf vec_set/init/extract.

diff --git a/origin.s b/after.s
index e43e09c..5ea1fb6 100644
--- a/origin.s
+++ b/after.s
@@ -6,25 +6,13 @@
 vec_init_v8hf:
 .LFB0:
        .cfi_startproc
-       vmovw   %xmm0, %eax
-       vmovw   %xmm2, %esi
-       vmovw   %xmm4, %edx
-       vmovw   %xmm6, %ecx
-       vmovd   %eax, %xmm0
-       vmovw   %xmm1, %eax
-       vmovd   %esi, %xmm1
-       vmovd   %ecx, %xmm2
-       vpinsrw $1, %eax, %xmm0, %xmm0
-       vmovw   %xmm3, %eax
-       vpinsrw $1, %eax, %xmm1, %xmm3
-       vmovw   %xmm5, %eax
-       vmovd   %edx, %xmm1
-       vpinsrw $1, %eax, %xmm1, %xmm1
-       vmovw   %xmm7, %eax
-       vpunpckldq      %xmm3, %xmm0, %xmm0
-       vpinsrw $1, %eax, %xmm2, %xmm2
-       vpunpckldq      %xmm2, %xmm1, %xmm1
-       vpunpcklqdq     %xmm1, %xmm0, %xmm0
+       vpunpcklwd      %xmm1, %xmm0, %xmm0
+       vpunpcklwd      %xmm3, %xmm2, %xmm2
+       vpunpcklwd      %xmm5, %xmm4, %xmm4
+       vpunpcklwd      %xmm7, %xmm6, %xmm6
+       vpunpckldq      %xmm2, %xmm0, %xmm0
+       vpunpckldq      %xmm6, %xmm4, %xmm4
+       vpunpcklqdq     %xmm4, %xmm0, %xmm0
        ret
        .cfi_endproc
 .LFE0:
@@ -35,8 +23,7 @@ vec_init_v8hf:
 vec_extract_v8hf_4:
 .LFB1:
        .cfi_startproc
-       vpextrw $4, %xmm0, %eax
-       vmovw   %eax, %xmm0
+       vpsrldq $8, %xmm0, %xmm0
        ret
        .cfi_endproc
 .LFE1:
@@ -47,8 +34,7 @@ vec_extract_v8hf_4:
 vec_extract_v16hf_3:
 .LFB2:
        .cfi_startproc
-       vpextrw $3, %xmm0, %eax
-       vmovw   %eax, %xmm0
+       vpsrldq $6, %xmm0, %xmm0
        ret
        .cfi_endproc
 .LFE2:
@@ -71,8 +57,7 @@ vec_extract_v16hf_15:
 .LFB4:
        .cfi_startproc
        vextracti128    $0x1, %ymm0, %xmm0
-       vpextrw $7, %xmm0, %eax
-       vmovw   %eax, %xmm0
+       vpsrldq $14, %xmm0, %xmm0
        ret
        .cfi_endproc
 .LFE4:
@@ -83,8 +68,7 @@ vec_extract_v16hf_15:
 vec_extract_v32hf_5:
 .LFB5:
        .cfi_startproc
-       vpextrw $5, %xmm0, %eax
-       vmovw   %eax, %xmm0
+       vpsrldq $10, %xmm0, %xmm0
        ret
        .cfi_endproc
 .LFE5:
@@ -107,8 +91,7 @@ vec_extract_v32hf_14:
 .LFB7:
        .cfi_startproc
        vextracti128    $0x1, %ymm0, %xmm0
-       vpextrw $6, %xmm0, %eax
-       vmovw   %eax, %xmm0
+       vpsrldq $12, %xmm0, %xmm0
        ret
        .cfi_endproc
 .LFE7:
@@ -144,8 +127,7 @@ vec_extract_v32hf_28:
        .cfi_startproc
        vextracti64x4   $0x1, %zmm0, %ymm0
        vextracti128    $0x1, %ymm0, %xmm0
-       vpextrw $4, %xmm0, %eax
-       vmovw   %eax, %xmm0
+       vpsrldq $8, %xmm0, %xmm0
        ret
        .cfi_endproc
 .LFE10:
@@ -156,8 +138,8 @@ vec_extract_v32hf_28:
 vec_set_v8hf_4:
 .LFB11:
        .cfi_startproc
-       vmovw   %xmm1, %eax
-       vpinsrw $4, %eax, %xmm0, %xmm0
+       vpbroadcastw    %xmm1, %xmm1
+       vpblendw        $16, %xmm1, %xmm0, %xmm0
        ret
        .cfi_endproc
 .LFE11:
@@ -168,9 +150,9 @@ vec_set_v8hf_4:
 vec_set_v16hf_3:
 .LFB12:
        .cfi_startproc
-       vmovw   %xmm1, %eax
-       vpinsrw $3, %eax, %xmm0, %xmm2
-       vinserti128     $0x0, %xmm2, %ymm0, %ymm0
+       vpbroadcastw    %xmm1, %ymm1
+       vpblendw        $8, %ymm1, %ymm0, %ymm1
+       vpblendd        $15, %ymm1, %ymm0, %ymm0
        ret
        .cfi_endproc
 .LFE12:
@@ -181,9 +163,9 @@ vec_set_v16hf_3:
 vec_set_v16hf_8:
 .LFB13:
        .cfi_startproc
-       vextracti128    $0x1, %ymm0, %xmm2
-       vmovsh  %xmm1, %xmm2, %xmm2
-       vinserti128     $0x1, %xmm2, %ymm0, %ymm0
+       vpbroadcastw    %xmm1, %ymm1
+       vpblendw        $1, %ymm1, %ymm0, %ymm1
+       vpblendd        $240, %ymm1, %ymm0, %ymm0
        ret
        .cfi_endproc
 .LFE13:
@@ -194,10 +176,9 @@ vec_set_v16hf_8:
 vec_set_v16hf_15:
 .LFB14:
        .cfi_startproc
-       vextracti128    $0x1, %ymm0, %xmm2
-       vmovw   %xmm1, %eax
-       vpinsrw $7, %eax, %xmm2, %xmm2
-       vinserti128     $0x1, %xmm2, %ymm0, %ymm0
+       vpbroadcastw    %xmm1, %ymm1
+       vpblendw        $128, %ymm1, %ymm0, %ymm1
+       vpblendd        $240, %ymm1, %ymm0, %ymm0
        ret
        .cfi_endproc
 .LFE14:
@@ -208,7 +189,8 @@ vec_set_v16hf_15:
 vec_set_v32hf_5:
 .LFB15:
        .cfi_startproc
-       kmovd   .LC0(%rip), %k1
+       movl    $32, %eax
+       kmovd   %eax, %k1
        vpbroadcastw    %xmm1, %zmm0{%k1}
        ret
        .cfi_endproc
@@ -220,7 +202,8 @@ vec_set_v32hf_5:
 vec_set_v32hf_8:
 .LFB16:
        .cfi_startproc
-       kmovd   .LC1(%rip), %k1
+       movl    $256, %eax
+       kmovd   %eax, %k1
        vpbroadcastw    %xmm1, %zmm0{%k1}
        ret
        .cfi_endproc
@@ -232,7 +215,8 @@ vec_set_v32hf_8:
 vec_set_v32hf_14:
 .LFB17:
        .cfi_startproc
-       kmovd   .LC2(%rip), %k1
+       movl    $16384, %eax
+       kmovd   %eax, %k1
        vpbroadcastw    %xmm1, %zmm0{%k1}
        ret
        .cfi_endproc
@@ -244,7 +228,8 @@ vec_set_v32hf_14:
 vec_set_v32hf_16:
 .LFB18:
        .cfi_startproc
-       kmovd   .LC3(%rip), %k1
+       movl    $65536, %eax
+       kmovd   %eax, %k1
        vpbroadcastw    %xmm1, %zmm0{%k1}
        ret
        .cfi_endproc
@@ -256,7 +241,8 @@ vec_set_v32hf_16:
 vec_set_v32hf_24:
 .LFB19:
        .cfi_startproc
-       kmovd   .LC4(%rip), %k1
+       movl    $16777216, %eax
+       kmovd   %eax, %k1
        vpbroadcastw    %xmm1, %zmm0{%k1}
        ret
        .cfi_endproc
@@ -268,30 +254,12 @@ vec_set_v32hf_24:
 vec_set_v32hf_28:
 .LFB20:
        .cfi_startproc
-       kmovd   .LC5(%rip), %k1
+       movl    $268435456, %eax
+       kmovd   %eax, %k1
        vpbroadcastw    %xmm1, %zmm0{%k1}
        ret
        .cfi_endproc
 .LFE20:
        .size   vec_set_v32hf_28, .-vec_set_v32hf_28
-       .section        .rodata.cst4,"aM",@progbits,4
-       .align 4
-.LC0:
-       .long   32
-       .align 4
-.LC1:
-       .long   256
-       .align 4
-.LC2:
-       .long   16384
-       .align 4
-.LC3:
-       .long   65536
-       .align 4
-.LC4:
-       .long   16777216
-       .align 4
-.LC5:
-       .long   268435456
        .ident  "GCC: (GNU) 12.0.0 20210913 (experimental)"
        .section        .note.GNU-stack,"",@progbits

Reply via email to