From 2393313673442256d70e8860f2a4e9df496c1bee Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Thu, 5 Oct 2017 16:31:01 +0200
Subject: [PATCH 2/4] libavcodec/blockdsp: add clear_blocks_prores

Clearing several blocks by calling clear_block in a loop
can be slower than a single memset.

This new function clears an even number of blocks
and is faster than memset.
---
 libavcodec/blockdsp.c          |  6 ++++++
 libavcodec/blockdsp.h          |  1 +
 libavcodec/x86/blockdsp.asm    | 34 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/blockdsp_init.c |  4 ++++
 tests/checkasm/blockdsp.c      | 19 +++++++++++++++++++
 5 files changed, 64 insertions(+)

diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c
index c7efe7e77b..564149243f 100644
--- a/libavcodec/blockdsp.c
+++ b/libavcodec/blockdsp.c
@@ -35,6 +35,11 @@ static void clear_blocks_c(int16_t *blocks)
     memset(blocks, 0, sizeof(int16_t) * 6 * 64);
 }
 
+static void clear_blocks_prores_c(int16_t *blocks, ptrdiff_t block_count)
+{
+    memset(blocks, 0, sizeof(*blocks) * 64 * block_count); /* 64 coefficients per block */
+}
+
 static void fill_block16_c(uint8_t *block, uint8_t value, ptrdiff_t line_size,
                            int h)
 {
@@ -61,6 +66,7 @@ av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx)
 {
     c->clear_block  = clear_block_c;
     c->clear_blocks = clear_blocks_c;
+    c->clear_blocks_prores = clear_blocks_prores_c;
 
     c->fill_block_tab[0] = fill_block16_c;
     c->fill_block_tab[1] = fill_block8_c;
diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h
index 26fc2ea13b..56881b38c9 100644
--- a/libavcodec/blockdsp.h
+++ b/libavcodec/blockdsp.h
@@ -35,6 +35,7 @@ typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
 typedef struct BlockDSPContext {
     void (*clear_block)(int16_t *block /* align 32 */);
     void (*clear_blocks)(int16_t *blocks /* align 32 */);
+    void (*clear_blocks_prores)(int16_t * blocks, ptrdiff_t block_count); /* blocks: align 32; block_count: even and > 0 */
 
     op_fill_func fill_block_tab[2];
 } BlockDSPContext;
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
index 9d203df8f5..e99425272a 100644
--- a/libavcodec/x86/blockdsp.asm
+++ b/libavcodec/x86/blockdsp.asm
@@ -86,3 +86,37 @@ INIT_XMM sse
 CLEAR_BLOCKS 1
 INIT_YMM avx
 CLEAR_BLOCKS 1
+
+
+;----------------------------------------------------------------
+; void ff_clear_blocks_prores(int16_t *blocks, ptrdiff_t block_count);
+;----------------------------------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS_PRORES 1
+cglobal clear_blocks_prores, 2, 2, %1, blocks, block_count
+    shl block_countq, 7 ; block_count *= 64 * sizeof(int16_t): byte size of the area to clear
+    add   blocksq, block_countq ; blocksq = end of the area
+    neg   block_countq ; negative byte offset from the end, counted up to 0
+    ZERO       m0, m0, m0 ; m0 = 0
+.loop: ; 8*mmsize bytes per pass; runs at least once, so block_count must be > 0
+    mova  [blocksq+ block_countq +mmsize*0], m0
+    mova  [blocksq+ block_countq +mmsize*1], m0
+    mova  [blocksq+ block_countq +mmsize*2], m0
+    mova  [blocksq+ block_countq +mmsize*3], m0
+    mova  [blocksq+ block_countq +mmsize*4], m0
+    mova  [blocksq+ block_countq +mmsize*5], m0
+    mova  [blocksq+ block_countq +mmsize*6], m0
+    mova  [blocksq+ block_countq +mmsize*7], m0
+    add   block_countq, mmsize*8 ; with ymm this is 256 bytes = 2 blocks, hence the even block_count
+    js .loop ; repeat while the offset is still negative
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS_PRORES 0 ; NOTE(review): this patch adds no prototype or init hook for the mmx version
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS_PRORES 1
+INIT_YMM avx
+CLEAR_BLOCKS_PRORES 1 ; ZERO is still xorps, as defined for sse above
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
index 8b01a447cd..80988598e2 100644
--- a/libavcodec/x86/blockdsp_init.c
+++ b/libavcodec/x86/blockdsp_init.c
@@ -32,6 +32,8 @@ void ff_clear_block_avx(int16_t *block);
 void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);
 void ff_clear_blocks_avx(int16_t *blocks);
+void ff_clear_blocks_prores_sse(int16_t * blocks, ptrdiff_t block_count); /* 128 bytes (1 block) per loop pass */
+void ff_clear_blocks_prores_avx(int16_t * blocks, ptrdiff_t block_count); /* 256 bytes (2 blocks) per loop pass */
 
 av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
                                   AVCodecContext *avctx)
@@ -51,10 +53,12 @@ av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
     if (EXTERNAL_SSE(cpu_flags)) {
         c->clear_block  = ff_clear_block_sse;
         c->clear_blocks = ff_clear_blocks_sse;
+        c->clear_blocks_prores = ff_clear_blocks_prores_sse;
     }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         c->clear_block  = ff_clear_block_avx;
         c->clear_blocks = ff_clear_blocks_avx;
+        c->clear_blocks_prores = ff_clear_blocks_prores_avx;
     }
 #endif /* HAVE_X86ASM */
 }
diff --git a/tests/checkasm/blockdsp.c b/tests/checkasm/blockdsp.c
index c753506b3c..5136aa80f5 100644
--- a/tests/checkasm/blockdsp.c
+++ b/tests/checkasm/blockdsp.c
@@ -28,6 +28,8 @@
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
 
+#define PRORES_MAX_BUF_SIZE 2048 /* in uint16_t elements: 32 blocks of 8*8 coefficients */
+
 #define randomize_buffers(size)             \
     do {                                    \
         int i;                              \
@@ -51,6 +53,20 @@ do {                                                                \
     }                                                               \
 } while (0)
 
+/* Run the reference and the optimised clear_blocks_prores on block_count blocks and compare. */
+static void check_clear_blocks_prores(ptrdiff_t block_count) {
+    LOCAL_ALIGNED_32(uint16_t, buf0, [PRORES_MAX_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint16_t, buf1, [PRORES_MAX_BUF_SIZE]);
+    declare_func(void, uint16_t *blocks, ptrdiff_t block_count);
+
+    randomize_buffers(PRORES_MAX_BUF_SIZE); /* fill the whole of both buffers, not just the first half */
+    call_ref(buf0, block_count);
+    call_new(buf1, block_count);
+    if (memcmp(buf0, buf1, PRORES_MAX_BUF_SIZE * sizeof(uint16_t))) /* memcmp takes bytes, not elements */
+        fail();
+    bench_new(buf0, block_count);
+}
+
 void checkasm_check_blockdsp(void)
 {
     LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
@@ -64,5 +80,8 @@ void checkasm_check_blockdsp(void)
     check_clear(clear_block,  8 * 8);
     check_clear(clear_blocks, 8 * 8 * 6);
 
+    if (check_func(h.clear_blocks_prores, "blockdsp.clear_blocks_prores"))
+        check_clear_blocks_prores(32);
+
     report("blockdsp");
 }
-- 
2.11.0 (Apple Git-81)

