From 38715daa744ca037b3922c31b6066f6bbf9c8877 Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Sat, 26 Aug 2017 19:57:32 +0200
Subject: [PATCH] libavcodec : add SSE SIMD for reorder pixels

---
 libavcodec/Makefile          |  2 +-
 libavcodec/exr.c             | 61 ++++++++++++++++++++++++++++----
 libavcodec/exrdsp.c          | 48 +++++++++++++++++++++++++
 libavcodec/exrdsp.h          | 31 +++++++++++++++++
 libavcodec/x86/Makefile      |  2 ++
 libavcodec/x86/exrdsp.asm    | 83 ++++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/exrdsp_init.c | 77 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 296 insertions(+), 8 deletions(-)
 create mode 100644 libavcodec/exrdsp.c
 create mode 100644 libavcodec/exrdsp.h
 create mode 100644 libavcodec/x86/exrdsp.asm
 create mode 100644 libavcodec/x86/exrdsp_init.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 982d7f5179..7d93a43b79 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -286,7 +286,7 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
 OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
 OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
 OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
-OBJS-$(CONFIG_EXR_DECODER)             += exr.o
+OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
 OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
 OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
 OBJS-$(CONFIG_FFWAVESYNTH_DECODER)     += ffwavesynth.o
diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index 759880756d..29feb3c092 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -40,6 +40,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/timer.h"
 #include "libavutil/intfloat.h"
 #include "libavutil/opt.h"
 #include "libavutil/color_utils.h"
@@ -55,6 +56,7 @@
 #include "internal.h"
 #include "mathops.h"
 #include "thread.h"
+#include "exrdsp.h"
 
 enum ExrCompr {
     EXR_RAW,
@@ -121,6 +123,7 @@ typedef struct EXRContext {
     AVClass *class;
     AVFrame *picture;
     AVCodecContext *avctx;
+    ExrDSPContext dsp;
 
 #if HAVE_BIGENDIAN
     BswapDSPContext bbdsp;
@@ -275,7 +278,8 @@ static void predictor(uint8_t *src, int size)
     }
 }
 
-static void reorder_pixels(uint8_t *src, uint8_t *dst, int size)
+/*
+static void reorder_pixels_ref(uint8_t *src, uint8_t *dst, int size)
 {
     const uint8_t *t1 = src;
     int half_size     = size / 2;
@@ -290,8 +294,9 @@ static void reorder_pixels(uint8_t *src, uint8_t *dst, int size)
         *(s++) = *(t2++);
     }
 }
+ */
 
-static int zip_uncompress(const uint8_t *src, int compressed_size,
+static int zip_uncompress(EXRContext *s, const uint8_t *src, int compressed_size,
                           int uncompressed_size, EXRThreadData *td)
 {
     unsigned long dest_len = uncompressed_size;
@@ -301,12 +306,52 @@ static int zip_uncompress(const uint8_t *src, int compressed_size,
         return AVERROR_INVALIDDATA;
 
     predictor(td->tmp, uncompressed_size);
-    reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
+
+    /* Disabled ad-hoc check of the DSP routine against reorder_pixels_ref():
+    uint8_t srcBuffer[34] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33};
+    uint8_t dstRef[34];
+    uint8_t dstSse[34];
+
+    memset(dstRef, 0, 34);
+    memset(dstSse, 0, 34);
+
+    int testSize = 34;
+
+    reorder_pixels_ref(srcBuffer, dstRef, testSize);
+    s->dsp.reorder_pixels(srcBuffer, dstSse, testSize);
+
+    printf("REF : ");
+    for (int i = 0; i < 34; i++){
+        printf("%d, ", dstRef[i]);
+    }
+    printf("\n");
+
+    printf("SSE : ");
+    for (int i = 0; i < 34; i++){
+        printf("%d, ", dstSse[i]);
+    }
+    printf("\n");
+
+    if (memcmp ( dstRef, dstSse, 34 ) == 0){
+        printf("EQUAL\n");
+    }else{
+        printf("NOT EQUAL\n");
+        for (int i = 0; i < 34; i++){
+            if (dstRef[i] != dstSse[i]){
+                printf("ERROR : ref != sse version for i = %d : ref %d ; sse %d\n", i, dstRef[i], dstSse[i]);
+            }
+        }
+    }
+    */
+
+    /* Interleave the two halves of td->tmp into td->uncompressed_data via
+     * the routine installed by ff_exrdsp_init() (C or SIMD). */
+    s->dsp.reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
 
     return 0;
 }
 
-static int rle_uncompress(const uint8_t *src, int compressed_size,
+static int rle_uncompress(EXRContext *ctx, const uint8_t *src, int compressed_size,
                           int uncompressed_size, EXRThreadData *td)
 {
     uint8_t *d      = td->tmp;
@@ -346,7 +391,7 @@ static int rle_uncompress(const uint8_t *src, int compressed_size,
         return AVERROR_INVALIDDATA;
 
     predictor(td->tmp, uncompressed_size);
-    reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
+    ctx->dsp.reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
 
     return 0;
 }
@@ -1161,7 +1206,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
         switch (s->compression) {
         case EXR_ZIP1:
         case EXR_ZIP16:
-            ret = zip_uncompress(src, data_size, uncompressed_size, td);
+            ret = zip_uncompress(s, src, data_size, uncompressed_size, td);
             break;
         case EXR_PIZ:
             ret = piz_uncompress(s, src, data_size, uncompressed_size, td);
@@ -1170,7 +1215,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
             ret = pxr24_uncompress(s, src, data_size, uncompressed_size, td);
             break;
         case EXR_RLE:
-            ret = rle_uncompress(src, data_size, uncompressed_size, td);
+            ret = rle_uncompress(s, src, data_size, uncompressed_size, td);
             break;
         case EXR_B44:
         case EXR_B44A:
@@ -1804,6 +1849,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     s->avctx              = avctx;
 
+    ff_exrdsp_init(&s->dsp);
+
 #if HAVE_BIGENDIAN
     ff_bswapdsp_init(&s->bbdsp);
 #endif
diff --git a/libavcodec/exrdsp.c b/libavcodec/exrdsp.c
new file mode 100644
index 0000000000..964665cda3
--- /dev/null
+++ b/libavcodec/exrdsp.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "exrdsp.h"
+#include "config.h"
+
+/* Interleave the two halves of src into dst:
+ * dst[2*i] = src[i] and dst[2*i + 1] = src[size/2 + i]; size is expected even. */
+static void reorder_pixels_scalar(uint8_t *src, uint8_t *dst, int size)
+{
+    const int half_size   = size / 2;
+    const uint8_t *second = src + half_size;
+    int i;
+
+    av_assert1(size % 2 == 0);
+
+    for (i = 0; i < half_size; i++) {
+        dst[2 * i]     = src[i];
+        dst[2 * i + 1] = second[i];
+    }
+}
+
+av_cold void ff_exrdsp_init(ExrDSPContext *c)
+{
+    c->reorder_pixels   = reorder_pixels_scalar;
+    /* The C fallback is set first; the x86 init may then override it. */
+    if (ARCH_X86)
+        ff_exrdsp_init_x86(c);
+}
diff --git a/libavcodec/exrdsp.h b/libavcodec/exrdsp.h
new file mode 100644
index 0000000000..f991164d15
--- /dev/null
+++ b/libavcodec/exrdsp.h
@@ -0,0 +1,31 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_EXRDSP_H
+#define AVCODEC_EXRDSP_H
+
+#include <stdint.h>
+
+typedef struct ExrDSPContext { /* DSP hooks, set by ff_exrdsp_init() */
+    void (*reorder_pixels)(uint8_t *src, uint8_t *dst, int size); /* dst[2i]=src[i], dst[2i+1]=src[size/2+i] */
+} ExrDSPContext;
+
+void ff_exrdsp_init(ExrDSPContext *c);
+void ff_exrdsp_init_x86(ExrDSPContext *c);
+
+#endif /* AVCODEC_EXRDSP_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index e36644c72a..a805cd37b4 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -52,6 +52,7 @@ OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
+OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
 OBJS-$(CONFIG_OPUS_DECODER)            += x86/opus_dsp_init.o
 OBJS-$(CONFIG_OPUS_ENCODER)            += x86/opus_dsp_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
@@ -153,6 +154,7 @@ X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
                                           x86/dirac_dwt.o
 X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)    += x86/dnxhdenc.o
+X86ASM-OBJS-$(CONFIG_EXR_DECODER)      += x86/exrdsp.o
 X86ASM-OBJS-$(CONFIG_FLAC_DECODER)     += x86/flacdsp.o
 ifdef CONFIG_GPL
 X86ASM-OBJS-$(CONFIG_FLAC_ENCODER)     += x86/flac_dsp_gpl.o
diff --git a/libavcodec/x86/exrdsp.asm b/libavcodec/x86/exrdsp.asm
new file mode 100644
index 0000000000..c8419554ca
--- /dev/null
+++ b/libavcodec/x86/exrdsp.asm
@@ -0,0 +1,83 @@
+;******************************************************************************
+;* X86 Optimized functions for Open Exr Decoder
+;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
+;*
+;* reorder_pixels based on patch by John Loy
+;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_reorder_pixels_sse2(uint8_t *src, uint8_t *dst, int size)
+;------------------------------------------------------------------------------
+
+
+INIT_XMM sse2
+cglobal reorder_pixels, 3,5,3, src, dst, size, tmp, cnt
+; dst[2*i] = src[i], dst[2*i+1] = src[half_size+i] for i in [0, half_size).
+; src and dst are accessed with movdqa, so both must be 16-byte aligned.
+; size is a 32-bit int whose upper register half is undefined on x86-64,
+; so it is sign-extended before being used as an address offset.
+    movsxdifnidn       sizeq, sized
+    sar                sizeq, 1;                sizeq = half_size
+    test               sizeq, sizeq
+    jle                 .end;                   size < 2: nothing to interleave
+    mov                 cntq, sizeq
+    shr                 cntq, 4;                cnt = half_size/16 -> loop_simd count
+    jz      .after_loop_simd;                   ZF is defined after shr (OF is not)
+
+.loop_simd:
+    movdqa                m0, [srcq];           load first part
+    movdqu                m1, [srcq + sizeq];   load second part
+    movdqa                m2, m0;               copy m0
+
+    punpcklbw             m2, m1;               interleaved part 1
+    movdqa            [dstq], m2;               copy to dst array
+
+    punpckhbw             m0, m1;               interleaved part 2
+    movdqa     [dstq+mmsize], m0;               copy to dst array
+
+    add                 dstq, 2*mmsize;         inc dst
+    add                 srcq, mmsize;           inc src
+    sub                 cntq, 1
+    jg            .loop_simd
+
+.after_loop_simd:
+;scalar part
+    mov                 cntq, sizeq;            cnt = half_size
+    and                 cntq, 15;               cnt = half_size % 16
+    jz                  .end
+
+.loop_scalar:
+    mov                 tmpb, [srcq];           load byte first part
+    mov               [dstq], tmpb;             copy first byte to dst
+
+    mov                 tmpb, [srcq + sizeq];   load byte second part
+    mov             [dstq+1], tmpb;             copy byte second part
+
+    add                 dstq, 2;                inc dst
+    inc                 srcq
+    sub                 cntq, 1
+    jg          .loop_scalar
+
+.end:
+    RET
diff --git a/libavcodec/x86/exrdsp_init.c b/libavcodec/x86/exrdsp_init.c
new file mode 100644
index 0000000000..a9014f1fcc
--- /dev/null
+++ b/libavcodec/x86/exrdsp_init.c
@@ -0,0 +1,77 @@
+/*
+ * x86 DSP initialization for the OpenEXR (.exr) image decoder
+ *
+ * Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/exrdsp.h"
+
+//#include <emmintrin.h>//SSE2
+
+void ff_reorder_pixels_sse2(uint8_t *src, uint8_t *dst, int size);
+
+/* Disabled SSE2 intrinsics version of reorder_pixels (not compiled):
+static void ff_reorder_pixels_sse2_intrinsics(uint8_t *src, uint8_t *dst, int size)
+{
+    int half_size     = size >> 1;
+
+    int i;
+    const uint8_t *t1;
+    const uint8_t *t2;
+    uint8_t *s;
+
+    const __m128i *vector1 = (__m128i*)src;
+    const __m128i *vector2 = (__m128i*)(src+half_size);
+
+    __m128i *vector_out = (__m128i*)dst;
+
+    for (i = 0; i < half_size/sizeof(__m128i); i++) {
+        __m128i a = _mm_load_si128(vector1++);//Start is aligned
+        __m128i b = _mm_loadu_si128(vector2++);//Half is not aligned
+
+        __m128i lo = _mm_unpacklo_epi8(a, b);
+        __m128i hi = _mm_unpackhi_epi8(a, b);
+
+        _mm_store_si128(vector_out++, lo);//Can be store aligned
+        _mm_store_si128(vector_out++, hi);//here too
+    }
+
+    t1 = (uint8_t*)vector1;
+    t2 = (uint8_t*)vector2;
+    s = (uint8_t*)vector_out;
+
+    for (i = 0; i < half_size % sizeof(__m128i); i++) {
+        *(s++) = *(t1++);
+        *(s++) = *(t2++);
+    }
+}*/
+
+
+av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
+{
+#if ARCH_X86_64
+    const int flags = av_get_cpu_flags();
+
+    /* Replace the default routine with the SSE2 one when the CPU has it. */
+    if (EXTERNAL_SSE2(flags))
+        dsp->reorder_pixels = ff_reorder_pixels_sse2;
+#endif /* ARCH_X86_64 */
+}
-- 
2.11.0 (Apple Git-81)

