From 1df9ac1968c2da4e924c517f62ab39f6563b21a6 Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Tue, 6 Feb 2018 18:16:11 +0100
Subject: [PATCH 6/7] avfilter/x86/vf_blend: add 16 bit version for phoenix
 and difference mode (SSE4 and AVX2)

---
 libavfilter/x86/vf_blend.asm    | 42 +++++++++++++++++++++++++++++++----------
 libavfilter/x86/vf_blend_init.c |  8 ++++++++
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 082de9c8af..176751b904 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -273,8 +273,9 @@ BLEND_INIT divide, 4
 BLEND_END
 %endmacro
 
-%macro PHOENIX 0
-BLEND_INIT phoenix, 4
+%macro PHOENIX 2-3 ; phoenix blend: dst = constant - max(top, bottom) + min(top, bottom), with unsigned saturation
+; %1 = function name passed to BLEND_INIT, %2 = element suffix pasted into the mnemonics (b or w), %3 = (optional) 1 if 16 bit
+BLEND_INIT %1, 4, %3 ; NOTE(review): %3 expands to nothing for the 2-argument call; confirm BLEND_INIT treats an empty 3rd argument as "not 16 bit"
     VBROADCASTI128       m3, [pb_255]
 .nextrow:
     mov        xq, widthq
@@ -283,19 +284,19 @@ BLEND_INIT phoenix, 4
         movu            m0, [topq + xq]
         movu            m1, [bottomq + xq]
         mova            m2, m0
-        pminub          m0, m1
-        pmaxub          m1, m2
+        pminu%2         m0, m1 ; m0 = unsigned min(top, bottom) per element
+        pmaxu%2         m1, m2 ; m1 = unsigned max(bottom, top); m2 still holds the copy of top
         mova            m2, m3
-        psubusb         m2, m1
-        paddusb         m2, m0
+        psubus%2        m2, m1 ; m2 = constant - max (saturating); presumably pb_255 is all 0xFF bytes, i.e. 0xFFFF per word for the w variant - TODO confirm
+        paddus%2        m2, m0 ; m2 += min (saturating)
         mova   [dstq + xq], m2
         add             xq, mmsize
     jl .loop
 BLEND_END
 %endmacro
 
-%macro BLEND_ABS 0
-BLEND_INIT difference, 5
+%macro DIFFERENCE 1-2 ; absolute difference |top - bottom|; %1 = function name passed to BLEND_INIT, %2 = (optional) 1 if 16 bit
+BLEND_INIT %1, 5, %2 ; NOTE(review): %2 expands to nothing for the 1-argument call; confirm BLEND_INIT treats an empty 3rd argument as "not 16 bit"
     pxor       m2, m2
 .nextrow:
     mov        xq, widthq
@@ -303,6 +304,17 @@ BLEND_INIT difference, 5
     .loop:
         movu            m0, [topq + xq]
         movu            m1, [bottomq + xq]
+%if %0 == 2 ; 16 bit: selected whenever a second macro argument is passed
+        punpckhwd       m3, m0, m2 ; m3 = high words of top interleaved with zero (m2 == 0), i.e. zero-extended to dwords
+        punpcklwd       m0, m2 ; m0 = low words of top, zero-extended to dwords
+        punpckhwd       m4, m1, m2 ; m4 = high words of bottom, zero-extended to dwords
+        punpcklwd       m1, m2 ; m1 = low words of bottom, zero-extended to dwords
+        psubd           m0, m1 ; 32-bit top - bottom (low half); the wider lanes keep the sign of the difference
+        psubd           m3, m4 ; 32-bit top - bottom (high half)
+        pabsd           m0, m0 ; |top - bottom| (low half)
+        pabsd           m3, m3 ; |top - bottom| (high half)
+        packusdw        m0, m3 ; narrow both halves back to words with unsigned saturation
+%else ; 8 bit: same widen / subtract / abs / pack sequence on bytes widened to words
         punpckhbw       m3, m0, m2
         punpcklbw       m0, m2
         punpckhbw       m4, m1, m2
@@ -311,11 +323,14 @@ BLEND_INIT difference, 5
         psubw           m3, m4
         ABS2            m0, m3, m1, m4
         packuswb        m0, m3
+%endif ; both paths leave the result in m0
         mova   [dstq + xq], m0
         add             xq, mmsize
     jl .loop
 BLEND_END
+%endmacro ; DIFFERENCE
 
+%macro BLEND_ABS 0
 BLEND_INIT extremity, 8
     pxor       m2, m2
     VBROADCASTI128       m4, [pw_255]
@@ -381,7 +396,8 @@ BLEND_SCREEN
 AVERAGE
 GRAINMERGE
 HARDMIX
-PHOENIX
+PHOENIX phoenix, b
+DIFFERENCE difference
 DIVIDE
 
 BLEND_ABS
@@ -393,11 +409,14 @@ BLEND_SIMPLE subtract_16, subusw, 1
 BLEND_SIMPLE xor_16,      xor,    1
 
 INIT_XMM ssse3
+DIFFERENCE difference
 BLEND_ABS
 
 INIT_XMM sse4
 BLEND_SIMPLE darken_16,   minuw, 1
 BLEND_SIMPLE lighten_16,  maxuw, 1
+PHOENIX      phoenix_16,      w, 1 ; word variant expands to pminuw/pmaxuw, which are SSE4.1 instructions
+DIFFERENCE   difference_16,      1 ; 16-bit path uses packusdw, which is an SSE4.1 instruction
 
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
@@ -414,8 +433,9 @@ BLEND_SCREEN
 AVERAGE
 GRAINMERGE
 HARDMIX
-PHOENIX
+PHOENIX phoenix, b
 
+DIFFERENCE difference
 BLEND_ABS
 
 BLEND_SIMPLE addition_16, addusw, 1
@@ -425,4 +445,6 @@ BLEND_SIMPLE lighten_16,  maxuw,  1
 BLEND_SIMPLE or_16,       or,     1
 BLEND_SIMPLE subtract_16, subusw, 1
 BLEND_SIMPLE xor_16,      xor,    1
+PHOENIX      phoenix_16,       w, 1 ; 16-bit phoenix, word-suffixed mnemonics, 3rd argument marks 16 bit
+DIFFERENCE   difference_16,       1 ; 16-bit difference, 2nd argument selects the dword widening path
 %endif
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index 9966ead362..95a29ad22d 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -75,10 +75,14 @@ BLEND_FUNC(and_16, sse2)
 BLEND_FUNC(and_16, avx2)
 BLEND_FUNC(darken_16, sse4)
 BLEND_FUNC(darken_16, avx2)
+BLEND_FUNC(difference_16, sse4)
+BLEND_FUNC(difference_16, avx2)
 BLEND_FUNC(lighten_16, sse4)
 BLEND_FUNC(lighten_16, avx2)
 BLEND_FUNC(or_16, sse2)
 BLEND_FUNC(or_16, avx2)
+BLEND_FUNC(phoenix_16, sse4)
+BLEND_FUNC(phoenix_16, avx2)
 BLEND_FUNC(subtract_16, sse2)
 BLEND_FUNC(subtract_16, avx2)
 BLEND_FUNC(xor_16, sse2)
@@ -153,7 +157,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
         if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1) {
             switch (param->mode) {
             case BLEND_DARKEN:   param->blend = ff_blend_darken_16_sse4;     break;
+            case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_sse4; break;
             case BLEND_LIGHTEN:  param->blend = ff_blend_lighten_16_sse4;    break;
+            case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_16_sse4;    break;
             }
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1) {
@@ -161,8 +167,10 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
             case BLEND_ADDITION: param->blend = ff_blend_addition_16_avx2; break;
             case BLEND_AND:      param->blend = ff_blend_and_16_avx2;      break;
             case BLEND_DARKEN:   param->blend = ff_blend_darken_16_avx2;   break;
+            case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_avx2; break;
             case BLEND_LIGHTEN:  param->blend = ff_blend_lighten_16_avx2;  break;
             case BLEND_OR:       param->blend = ff_blend_or_16_avx2;       break;
+            case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_16_avx2;  break;
             case BLEND_SUBTRACT: param->blend = ff_blend_subtract_16_avx2; break;
             case BLEND_XOR:      param->blend = ff_blend_xor_16_avx2;      break;
             }
-- 
2.14.3 (Apple Git-98)

