From 2b3b075aa9078fa94196d1a7f5382566cd353074 Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Tue, 16 Jan 2018 23:30:18 +0100
Subject: [PATCH] avfilter/x86/vf_blend: add SSE4, AVX and AVX2 versions (for testing)

---
 libavfilter/x86/vf_blend.asm    | 226 +++++++++++++++++++++++++++++-----------
 libavfilter/x86/vf_blend_init.c |  93 +++++++++++++++++
 2 files changed, 257 insertions(+), 62 deletions(-)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 4916aaf251..f8eb245eb7 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -34,6 +34,30 @@ pb_255: times 16 db 255
 
 SECTION .text
 
+; %1 dst, %2 src, %3 register holding zero (only used by the SSE2 path): zero-extend bytes to words
+%macro PMOVZXBW 3
+%if cpuflag(avx2)
+    vpmovzxbw %1, %2
+%elif cpuflag(sse4)
+    pmovzxbw %1, %2
+%else; SSE2
+     movh      %1, %2
+     punpcklbw %1, %3 ; interleave the loaded bytes with the zero bytes of %3
+%endif
+%endmacro
+
+; %1 dst (memory), %2 src register number, %3 tmp register number (only used when mmsize == 32): pack words to bytes with unsigned saturation and store
+%macro PACKUSWB_AND_STORE 3
+%if mmsize == 32
+     vextracti128 xm%3, m%2, 1 ; xm%3 = upper 128 bits of m%2
+     packuswb     xm%2, xm%3 ; words of the low half, then of the high half -> 16 bytes
+     movu   %1, xm%2
+%else
+     packuswb  xm%2, xm%2 ; same register for both operands, so the packed bytes appear twice
+     movh   %1, xm%2
+%endif
+%endmacro
+
 %macro BLEND_INIT 2
 %if ARCH_X86_64
 cglobal blend_%1, 6, 9, %2, top, top_linesize, bottom, bottom_linesize, dst, dst_linesize, width, end, x
@@ -74,39 +98,32 @@ BLEND_INIT %1, 2
 BLEND_END
 %endmacro
 
-INIT_XMM sse2
-BLEND_SIMPLE xor,      xor
-BLEND_SIMPLE or,       or
-BLEND_SIMPLE and,      and
-BLEND_SIMPLE addition, addusb
-BLEND_SIMPLE subtract, subusb
-BLEND_SIMPLE darken,   minub
-BLEND_SIMPLE lighten,  maxub
-
+%macro GRAINEXTRACT 0 ; dst = pack_u8(top + [pw_128] - bottom)
+%if cpuflag(avx2)||cpuflag(sse4)
+BLEND_INIT grainextract, 3
+%else ; SSE2
 BLEND_INIT grainextract, 4
-    pxor       m2, m2
-    mova       m3, [pw_128]
+    pxor       m3, m3 ; zero register consumed by the SSE2 path of PMOVZXBW
+%endif
+
+    VBROADCASTI128       m2, [pw_128] ; constant added to every top word
 .nextrow:
     mov        xq, widthq
-
     .loop:
-        movh            m0, [topq + xq]
-        movh            m1, [bottomq + xq]
-        punpcklbw       m0, m2
-        punpcklbw       m1, m2
-        paddw           m0, m3
-        psubw           m0, m1
-        packuswb        m0, m0
-        movh   [dstq + xq], m0
+        PMOVZXBW m0, [topq + xq], m3 ; m0 = top bytes widened to words
+        PMOVZXBW m1, [bottomq + xq], m3 ; m1 = bottom bytes widened to words
+        paddw    m0, m2 ; top + [pw_128]
+        psubw    m0, m1 ; ... - bottom
+        PACKUSWB_AND_STORE [dstq + xq], 0, 1 ; m1 is free here, reuse it as tmp
         add             xq, mmsize / 2
     jl .loop
 BLEND_END
+%endmacro
 
 %macro MULTIPLY 3 ; a, b, pw_1
     pmullw          %1, %2               ; xxxxxxxx  a * b
     paddw           %1, %3
-    mova            %2, %1
-    psrlw           %2, 8
+    psrlw           %2, %1, 8            ; %2 = %1 >> 8, replaces the mova + psrlw pair
     paddw           %1, %2
     psrlw           %1, 8                ; 00xx00xx  a * b / 255
 %endmacro
@@ -118,92 +135,109 @@ BLEND_END
     pxor            %1, %4               ; 00xx00xx  255 - x / 255
 %endmacro
 
+%macro BLEND_MULTIPLY 0 ; dst = MULTIPLY(top, bottom) packed back to bytes
+%if cpuflag(avx2)||cpuflag(sse4)
+BLEND_INIT multiply, 3
+%else
 BLEND_INIT multiply, 4
-    pxor       m2, m2
-    mova       m3, [pw_1]
+    pxor       m3, m3 ; zero register consumed by the SSE2 path of PMOVZXBW
+%endif
+
+    VBROADCASTI128       m2, [pw_1] ; third MULTIPLY operand
 .nextrow:
     mov        xq, widthq
 
     .loop:
                                              ;     word
                                              ;     |--|
-        movh            m0, [topq + xq]      ; 0000xxxx
-        movh            m1, [bottomq + xq]
-        punpcklbw       m0, m2               ; 00xx00xx
-        punpcklbw       m1, m2
+        PMOVZXBW        m0, [topq + xq], m3 ; 00xx00xx
+        PMOVZXBW        m1, [bottomq + xq], m3 ; 00xx00xx
 
-        MULTIPLY        m0, m1, m3
+        MULTIPLY        m0, m1, m2 ; the [pw_1] constant moved from m3 to m2
 
-        packuswb        m0, m0               ; 0000xxxx
-        movh   [dstq + xq], m0
+        PACKUSWB_AND_STORE [dstq + xq], 0, 1 ; 0000xxxx
         add             xq, mmsize / 2
 
     jl .loop
 BLEND_END
+%endmacro
 
+%macro BLEND_SCREEN 0 ; dst = SCREEN(top, bottom) packed back to bytes
+%if cpuflag(avx2)||cpuflag(sse4)
+BLEND_INIT screen, 4
+%else
 BLEND_INIT screen, 5
-    pxor       m2, m2
-    mova       m3, [pw_1]
-    mova       m4, [pw_255]
+    pxor       m4, m4 ; zero register consumed by the SSE2 path of PMOVZXBW
+%endif
+
+    VBROADCASTI128       m2, [pw_1] ; third SCREEN operand
+    VBROADCASTI128       m3, [pw_255] ; fourth SCREEN operand
 .nextrow:
     mov        xq, widthq
 
     .loop:
-        movh            m0, [topq + xq]      ; 0000xxxx
-        movh            m1, [bottomq + xq]
-        punpcklbw       m0, m2               ; 00xx00xx
-        punpcklbw       m1, m2
+        PMOVZXBW        m0, [topq + xq], m4 ; 00xx00xx
+        PMOVZXBW        m1, [bottomq + xq], m4 ; 00xx00xx
 
-        SCREEN          m0, m1, m3, m4
+        SCREEN          m0, m1, m2, m3 ; constants moved from m3/m4 to m2/m3
 
-        packuswb        m0, m0               ; 0000xxxx
-        movh   [dstq + xq], m0
+        PACKUSWB_AND_STORE [dstq + xq], 0, 1 ; 0000xxxx
         add             xq, mmsize / 2
 
     jl .loop
 BLEND_END
+%endmacro
 
+%macro AVERAGE 0 ; dst = (top + bottom) >> 1
+%if cpuflag(avx2)||cpuflag(sse4)
+BLEND_INIT average, 2
+%else
 BLEND_INIT average, 3
     pxor       m2, m2
+%endif
 .nextrow:
     mov        xq, widthq
 
     .loop:
-        movh            m0, [topq + xq]
-        movh            m1, [bottomq + xq]
-        punpcklbw       m0, m2
-        punpcklbw       m1, m2
+        PMOVZXBW        m0, [topq + xq], m2 ; m2 is only read by the SSE2 path
+        PMOVZXBW        m1, [bottomq + xq], m2 ; m2 is only read by the SSE2 path
         paddw           m0, m1
         psrlw           m0, 1
-        packuswb        m0, m0
-        movh   [dstq + xq], m0
+        PACKUSWB_AND_STORE [dstq + xq], 0, 1 ; m1 is free here, reuse it as tmp
         add             xq, mmsize / 2
     jl .loop
 BLEND_END
+%endmacro
+
 
+%macro GRAINMERGE 0 ; dst = pack_u8(top + bottom - [pw_128])
+%if cpuflag(avx2)||cpuflag(sse4)
+BLEND_INIT grainmerge, 3
+%else
 BLEND_INIT grainmerge, 4
-    pxor       m2, m2
-    mova       m3, [pw_128]
+    pxor       m3, m3 ; zero register consumed by the SSE2 path of PMOVZXBW
+%endif
+
+    VBROADCASTI128       m2, [pw_128] ; constant subtracted from every sum
 .nextrow:
     mov        xq, widthq
 
     .loop:
-        movh            m0, [topq + xq]
-        movh            m1, [bottomq + xq]
-        punpcklbw       m0, m2
-        punpcklbw       m1, m2
+        PMOVZXBW        m0, [topq + xq], m3 ; m0 = top bytes widened to words
+        PMOVZXBW        m1, [bottomq + xq], m3 ; m1 = bottom bytes widened to words
         paddw           m0, m1
-        psubw           m0, m3
-        packuswb        m0, m0
-        movh   [dstq + xq], m0
+        psubw           m0, m2 ; the [pw_128] constant moved from m3 to m2
+        PACKUSWB_AND_STORE [dstq + xq], 0, 1 ; m1 is free here, reuse it as tmp
         add             xq, mmsize / 2
     jl .loop
 BLEND_END
+%endmacro
 
+%macro HARDMIX 0
 BLEND_INIT hardmix, 5
-    mova       m2, [pb_255]
-    mova       m3, [pb_128]
-    mova       m4, [pb_127]
+    VBROADCASTI128       m2, [pb_255]
+    VBROADCASTI128       m3, [pb_128]
+    VBROADCASTI128       m4, [pb_127]
 .nextrow:
     mov        xq, widthq
 
@@ -218,7 +252,9 @@ BLEND_INIT hardmix, 5
         add             xq, mmsize
     jl .loop
 BLEND_END
+%endmacro
 
+%macro DIVIDE 0
 BLEND_INIT divide, 4
     pxor       m2, m2
     mova       m3, [ps_255]
@@ -247,9 +283,11 @@ BLEND_INIT divide, 4
 
     jl .loop
 BLEND_END
+%endmacro
 
+%macro PHOENIX 0
 BLEND_INIT phoenix, 4
-    mova       m3, [pb_255]
+    VBROADCASTI128       m3, [pb_255]
 .nextrow:
     mov        xq, widthq
 
@@ -266,6 +304,7 @@ BLEND_INIT phoenix, 4
         add             xq, mmsize
     jl .loop
 BLEND_END
+%endmacro
 
 %macro BLEND_ABS 0
 BLEND_INIT difference, 5
@@ -291,7 +330,7 @@ BLEND_END
 
 BLEND_INIT extremity, 8
     pxor       m2, m2
-    mova       m4, [pw_255]
+    VBROADCASTI128       m4, [pw_255]
 .nextrow:
     mov        xq, widthq
 
@@ -315,7 +354,7 @@ BLEND_END
 
 BLEND_INIT negation, 8
     pxor       m2, m2
-    mova       m4, [pw_255]
+    VBROADCASTI128       m4, [pw_255]
 .nextrow:
     mov        xq, widthq
 
@@ -341,6 +380,69 @@ BLEND_END
 %endmacro
 
 INIT_XMM sse2
+BLEND_SIMPLE xor,      xor
+BLEND_SIMPLE or,       or
+BLEND_SIMPLE and,      and
+BLEND_SIMPLE addition, addusb
+BLEND_SIMPLE subtract, subusb
+BLEND_SIMPLE darken,   minub
+BLEND_SIMPLE lighten,  maxub
+GRAINEXTRACT
+BLEND_MULTIPLY
+BLEND_SCREEN
+AVERAGE
+GRAINMERGE
+HARDMIX
+PHOENIX
+DIVIDE ; the only instantiation of DIVIDE: no sse4/avx/avx2 version is generated
+
 BLEND_ABS
+
 INIT_XMM ssse3
 BLEND_ABS
+
+INIT_XMM sse4 ; the five kernels below all load through PMOVZXBW
+GRAINMERGE
+GRAINEXTRACT
+BLEND_MULTIPLY
+BLEND_SCREEN
+AVERAGE
+
+INIT_XMM avx
+BLEND_SIMPLE xor,      xor
+BLEND_SIMPLE or,       or
+BLEND_SIMPLE and,      and
+BLEND_SIMPLE addition, addusb
+BLEND_SIMPLE subtract, subusb
+BLEND_SIMPLE darken,   minub
+BLEND_SIMPLE lighten,  maxub
+GRAINEXTRACT
+BLEND_MULTIPLY
+BLEND_SCREEN
+AVERAGE
+GRAINMERGE
+HARDMIX
+PHOENIX
+
+BLEND_ABS
+
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+BLEND_SIMPLE xor,      xor
+BLEND_SIMPLE or,       or
+BLEND_SIMPLE and,      and
+BLEND_SIMPLE addition, addusb
+BLEND_SIMPLE subtract, subusb
+BLEND_SIMPLE darken,   minub
+BLEND_SIMPLE lighten,  maxub
+GRAINEXTRACT
+BLEND_MULTIPLY
+BLEND_SCREEN
+AVERAGE
+GRAINMERGE
+HARDMIX
+PHOENIX
+
+BLEND_ABS ; NOTE(review): this patch only changes the constant loads in BLEND_ABS - confirm its loop bodies are valid for mmsize == 32
+%endif
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index a4fc9af246..de841d0b55 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -31,26 +31,65 @@ void ff_blend_##name##_##opt(const uint8_t *top, ptrdiff_t top_linesize,       \
                              struct FilterParams *param, double *values, int starty);
 
 BLEND_FUNC(addition, sse2)
+BLEND_FUNC(addition, avx)//TEST
+BLEND_FUNC(addition, avx2)
 BLEND_FUNC(grainmerge, sse2)
+BLEND_FUNC(grainmerge, sse4)
+BLEND_FUNC(grainmerge, avx)//TEST
+BLEND_FUNC(grainmerge, avx2)
 BLEND_FUNC(average, sse2)
+BLEND_FUNC(average, sse4)//TEST
+BLEND_FUNC(average, avx)//TEST
+BLEND_FUNC(average, avx2)
 BLEND_FUNC(and, sse2)
+BLEND_FUNC(and, avx)//TEST
+BLEND_FUNC(and, avx2)
 BLEND_FUNC(darken, sse2)
+BLEND_FUNC(darken, avx)//TEST
+BLEND_FUNC(darken, avx2)
 BLEND_FUNC(grainextract, sse2)
+BLEND_FUNC(grainextract, sse4)//TEST
+BLEND_FUNC(grainextract, avx)//TEST
+BLEND_FUNC(grainextract, avx2)
 BLEND_FUNC(multiply, sse2)
+BLEND_FUNC(multiply, sse4)//TEST
+BLEND_FUNC(multiply, avx)//TEST
+BLEND_FUNC(multiply, avx2)
 BLEND_FUNC(screen, sse2)
+BLEND_FUNC(screen, sse4)//TEST
+BLEND_FUNC(screen, avx)//TEST
+BLEND_FUNC(screen, avx2)
 BLEND_FUNC(hardmix, sse2)
+BLEND_FUNC(hardmix, avx)//TEST
+BLEND_FUNC(hardmix, avx2)
 BLEND_FUNC(divide, sse2)
 BLEND_FUNC(lighten, sse2)
+BLEND_FUNC(lighten, avx)//TEST
+BLEND_FUNC(lighten, avx2)
 BLEND_FUNC(or, sse2)
+BLEND_FUNC(or, avx)//TEST
+BLEND_FUNC(or, avx2)
 BLEND_FUNC(phoenix, sse2)
+BLEND_FUNC(phoenix, avx)//TEST
+BLEND_FUNC(phoenix, avx2)
 BLEND_FUNC(subtract, sse2)
+BLEND_FUNC(subtract, avx)//TEST
+BLEND_FUNC(subtract, avx2)
 BLEND_FUNC(xor, sse2)
+BLEND_FUNC(xor, avx)//TEST
+BLEND_FUNC(xor, avx2)
 BLEND_FUNC(difference, sse2)
 BLEND_FUNC(difference, ssse3)
+BLEND_FUNC(difference, avx)//TEST
+BLEND_FUNC(difference, avx2)
 BLEND_FUNC(extremity, sse2)
 BLEND_FUNC(extremity, ssse3)
+BLEND_FUNC(extremity, avx)//TEST
+BLEND_FUNC(extremity, avx2)
 BLEND_FUNC(negation, sse2)
 BLEND_FUNC(negation, ssse3)
+BLEND_FUNC(negation, avx)//TEST
+BLEND_FUNC(negation, avx2)
 
 av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 {
@@ -85,4 +124,58 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
         case BLEND_NEGATION:   param->blend = ff_blend_negation_ssse3;   break;
         }
     }
+
+    if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) {
+        switch (param->mode) { /* assigned after the earlier blocks, so these picks override them */
+        case BLEND_GRAINMERGE:   param->blend = ff_blend_grainmerge_sse4;   break;
+        case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_sse4; break;
+        case BLEND_MULTIPLY:     param->blend = ff_blend_multiply_sse4;     break;
+        case BLEND_SCREEN:       param->blend = ff_blend_screen_sse4;       break;
+        case BLEND_AVERAGE:      param->blend = ff_blend_average_sse4;      break;
+        }
+    }
+
+    if (EXTERNAL_AVX(cpu_flags) && param->opacity == 1 && !is_16bit) {
+        switch (param->mode) { /* no BLEND_DIVIDE entry: no divide_avx function is declared */
+        case BLEND_ADDITION:     param->blend = ff_blend_addition_avx;     break;
+        case BLEND_GRAINMERGE:   param->blend = ff_blend_grainmerge_avx;   break;
+        case BLEND_AND:          param->blend = ff_blend_and_avx;          break;
+        case BLEND_AVERAGE:      param->blend = ff_blend_average_avx;      break;
+        case BLEND_DARKEN:       param->blend = ff_blend_darken_avx;       break;
+        case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_avx; break;
+        case BLEND_HARDMIX:      param->blend = ff_blend_hardmix_avx;      break;
+        case BLEND_LIGHTEN:      param->blend = ff_blend_lighten_avx;      break;
+        case BLEND_MULTIPLY:     param->blend = ff_blend_multiply_avx;     break;
+        case BLEND_OR:           param->blend = ff_blend_or_avx;           break;
+        case BLEND_PHOENIX:      param->blend = ff_blend_phoenix_avx;      break;
+        case BLEND_SCREEN:       param->blend = ff_blend_screen_avx;       break;
+        case BLEND_SUBTRACT:     param->blend = ff_blend_subtract_avx;     break;
+        case BLEND_XOR:          param->blend = ff_blend_xor_avx;          break;
+        case BLEND_DIFFERENCE:   param->blend = ff_blend_difference_avx;   break;
+        case BLEND_EXTREMITY:    param->blend = ff_blend_extremity_avx;    break;
+        case BLEND_NEGATION:     param->blend = ff_blend_negation_avx;     break;
+        }
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1 && !is_16bit) {
+        switch (param->mode) { /* last block, so it has the final say when the flag is set */
+        case BLEND_ADDITION:     param->blend = ff_blend_addition_avx2;     break;
+        case BLEND_GRAINMERGE:   param->blend = ff_blend_grainmerge_avx2;   break;
+        case BLEND_AND:          param->blend = ff_blend_and_avx2;          break;
+        case BLEND_AVERAGE:      param->blend = ff_blend_average_avx2;      break;
+        case BLEND_DARKEN:       param->blend = ff_blend_darken_avx2;       break;
+        case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_avx2; break;
+        case BLEND_HARDMIX:      param->blend = ff_blend_hardmix_avx2;      break;
+        case BLEND_LIGHTEN:      param->blend = ff_blend_lighten_avx2;      break;
+        case BLEND_MULTIPLY:     param->blend = ff_blend_multiply_avx2;     break;
+        case BLEND_OR:           param->blend = ff_blend_or_avx2;           break;
+        case BLEND_PHOENIX:      param->blend = ff_blend_phoenix_avx2;      break;
+        case BLEND_SCREEN:       param->blend = ff_blend_screen_avx2;       break;
+        case BLEND_SUBTRACT:     param->blend = ff_blend_subtract_avx2;     break;
+        case BLEND_XOR:          param->blend = ff_blend_xor_avx2;          break;
+        case BLEND_DIFFERENCE:   param->blend = ff_blend_difference_avx2;   break;
+        case BLEND_EXTREMITY:    param->blend = ff_blend_extremity_avx2;    break;
+        case BLEND_NEGATION:     param->blend = ff_blend_negation_avx2;     break;
+        }
+    }
 }
-- 
2.14.3 (Apple Git-98)

