From ccdde79060a37d4694eafb9e39ce02a4eb026fd0 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisquet@gmail.com>
Date: Fri, 8 Jan 2016 19:55:43 +0100
Subject: [PATCH] x86: float_dsp: avx version of butterflies_float

No performance gain.
---
 libavutil/x86/float_dsp.asm    | 33 ++++++++++++++++++++++++++++-----
 libavutil/x86/float_dsp_init.c |  2 ++
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 021ff03..5948c95 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -357,19 +357,42 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
 ;-----------------------------------------------------------------------------
 ; void ff_butterflies_float(float *src0, float *src1, int len);
 ;-----------------------------------------------------------------------------
-INIT_XMM sse
+%macro BUTTERFLIES 0
 cglobal butterflies_float, 3,3,3, src0, src1, len
     shl       lend, 2
+%if cpuflag(avx)
+    test      lend, mmsize-1          ; lend = len*4 (bytes) after the shl; is it a whole number of mmsize-byte vectors?
+    jz     .normal                    ; yes: no tail to peel, go straight to the main loop
+    sub       lend, 16                ; no: peel the last 16 bytes; ZF set here is consumed by "jz .end" below
+    movaps     xm0, [src0q + lenq]    ; aligned xmm access to the tail; NOTE(review): presumes 16-byte aligned buffers and len % 4 == 0 -- confirm caller contract
+    movaps     xm1, [src1q + lenq]
+    subps      xm2, xm0, xm1          ; xm2 = src0 - src1
+    addps      xm0, xm0, xm1          ; xm0 = src0 + src1
+    movaps     [src1q + lenq], xm2    ; src1 <- difference
+    movaps     [src0q + lenq], xm0    ; src0 <- sum
+    jz        .end                    ; still the flags of "sub": the SIMD moves/arithmetic in between do not write EFLAGS
+.normal:                              ; whatever is left is processed mmsize bytes per iteration by .loop
+%define MOVR  movu
+%else
+%define MOVR  mova
+%endif
     add      src0q, lenq
     add      src1q, lenq
     neg       lenq
 .loop:
-    mova        m0, [src0q + lenq]
-    mova        m1, [src1q + lenq]
+    MOVR        m0, [src0q + lenq]    ; MOVR is movu in the avx build and mova otherwise (see the %define lines above)
+    MOVR        m1, [src1q + lenq]
     subps       m2, m0, m1
     addps       m0, m0, m1
-    mova        [src1q + lenq], m2
-    mova        [src0q + lenq], m0
+    MOVR        [src1q + lenq], m2    ; src1 <- src0 - src1
+    MOVR        [src0q + lenq], m0    ; src0 <- src0 + src1
     add       lenq, mmsize
     jl .loop
+.end:                                 ; also the direct exit of the avx path when the 16-byte tail was the whole input
     REP_RET
+%endmacro
+
+INIT_XMM sse
+BUTTERFLIES                           ; sse build of the macro body (cpuflag(avx) is false, so MOVR = mova and no tail code)
+INIT_YMM avx
+BUTTERFLIES                           ; avx build: takes the cpuflag(avx) branch (xmm tail peel, MOVR = movu)
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index f211f23..f8b645a 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -64,6 +64,7 @@ void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 
 void ff_butterflies_float_sse(float *src0, float *src1, int len);
+void ff_butterflies_float_avx(float *src0, float *src1, int len);
 
 av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
 {
@@ -91,6 +92,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
         fdsp->vector_fmul_add    = ff_vector_fmul_add_avx;
         fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
+        fdsp->butterflies_float   = ff_butterflies_float_avx;
     }
     if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
         fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
-- 
2.6.3

