From 4316cb54e3ae939ffadee1f59595ad5af5124db6 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefasab@gmail.com>
Date: Fri, 15 May 2015 18:58:17 +0200
Subject: [PATCH] ffmpeg_dxva2.c: add support for optimized GPU to CPU copy

Based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar
<fenrir@videolan.org>.

Needs proper integration with the build system.

Signed-off-by: Stefano Sabatini <stefasab@gmail.com>
---
 ffmpeg_dxva2.c | 228 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 219 insertions(+), 9 deletions(-)

diff --git a/ffmpeg_dxva2.c b/ffmpeg_dxva2.c
index 741c55b..e4e8d07 100644
--- a/ffmpeg_dxva2.c
+++ b/ffmpeg_dxva2.c
@@ -39,6 +39,7 @@
 #include "libavutil/frame.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/pixfmt.h"
+#include "libavutil/x86/asm.h"
 
 /* define all the GUIDs used directly here,
    to avoid problems with inconsistent dxva2api.h versions in mingw-w64 and different MSVC version */
@@ -87,6 +88,210 @@ static const dxva2_mode dxva2_modes[] = {
     { NULL,                      0 },
 };
 
+typedef struct {
+    /* Bounce buffer used by the SSE2 USWC copy path;
+     * left unused when SSE2 support is not compiled in. */
+    uint8_t *buffer;
+    size_t  size;
+} copy_cache;
+
+static int copy_cache_init(copy_cache *cache, unsigned width)
+{
+#if HAVE_SSE2
+    cache->size = FFMAX((width + 0x3f) & ~ 0x3f, 4096);
+    cache->buffer = av_malloc(cache->size);
+    if (!cache->buffer)
+        return AVERROR(ENOMEM);
+#else
+    (void) cache; (void) width;
+#endif
+    return 0;
+}
+
+static void copy_cache_clean(copy_cache *cache)
+{
+#if HAVE_SSE2
+    av_freep(&cache->buffer);
+    cache->size   = 0;
+#else
+    (void) cache;
+#endif
+}
+
+#if HAVE_SSE2
+/* Copy 16 or 64 bytes from srcp to dstp, loading data with the given
+ * SSE>=2 load instruction and storing it with the given SSE>=2 store.
+ */
+#define COPY16(dstp, srcp, load, store) \
+    __asm__ volatile (                      \
+        load "  0(%[src]), %%xmm1\n"    \
+        store " %%xmm1,    0(%[dst])\n" \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
+
+#define COPY64(dstp, srcp, load, store) \
+    __asm__ volatile (                      \
+        load "  0(%[src]), %%xmm1\n"    \
+        load " 16(%[src]), %%xmm2\n"    \
+        load " 32(%[src]), %%xmm3\n"    \
+        load " 48(%[src]), %%xmm4\n"    \
+        store " %%xmm1,    0(%[dst])\n" \
+        store " %%xmm2,   16(%[dst])\n" \
+        store " %%xmm3,   32(%[dst])\n" \
+        store " %%xmm4,   48(%[dst])\n" \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
+
+#define AV_CPU_SSE4()  ((cpu & AV_CPU_FLAG_SSE4)  != 0)
+#define AV_CPU_SSSE3() ((cpu & AV_CPU_FLAG_SSSE3) != 0)
+#define AV_CPU_SSE2()  ((cpu & AV_CPU_FLAG_SSE2)  != 0)
+
+/* Optimized copy from "Uncacheable Speculative Write Combining" memory,
+ * as used by some video surfaces.
+ * XXX It is only really efficient when SSE4.1 is available.
+ */
+static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
+                         const uint8_t *src, size_t src_pitch,
+                         unsigned width, unsigned height,
+                         unsigned cpu)
+{
+#if !HAVE_SSE42
+    (void) cpu; /* cpu is only read by the SSE4 path below */
+#endif
+    av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
+
+    __asm__ volatile ("mfence");
+
+    for (unsigned y = 0; y < height; y++) {
+        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
+        unsigned x = unaligned;
+
+#if HAVE_SSE42
+        if (AV_CPU_SSE4()) {
+            if (!unaligned) {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
+            } else {
+                COPY16(dst, src, "movdqu", "movdqa");
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
+            }
+        } else
+#endif
+        {
+            if (!unaligned) {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
+            } else {
+                COPY16(dst, src, "movdqu", "movdqa");
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+            }
+        }
+
+        for (; x < width; x++)
+            dst[x] = src[x];
+
+        src += src_pitch;
+        dst += dst_pitch;
+    }
+    __asm__ volatile ("mfence");
+}
+
+static void Copy2d(uint8_t *dst, size_t dst_pitch,
+                   const uint8_t *src, size_t src_pitch,
+                   unsigned width, unsigned height)
+{
+    av_assert0(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
+
+    for (unsigned y = 0; y < height; y++) {
+        unsigned x = 0;
+
+        int unaligned = ((intptr_t)dst & 0x0f) != 0;
+        if (!unaligned) {
+            for (; x+63 < width; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
+        } else {
+            for (; x+63 < width; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+        }
+
+        for (; x < width; x++)
+            dst[x] = src[x];
+
+        src += src_pitch;
+        dst += dst_pitch;
+    }
+}
+
+static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
+                          const uint8_t *src, size_t src_pitch,
+                          uint8_t *cache, size_t cache_size,
+                          unsigned width, unsigned height, unsigned cpu)
+{
+    const unsigned w16 = (width+15) & ~15;
+    const unsigned hstep = cache_size / w16;
+    av_assert0(hstep > 0);
+
+    for (unsigned y = 0; y < height; y += hstep) {
+        const unsigned hblock = FFMIN(hstep, height - y);
+
+        /* Copy a batch of lines into our cache */
+        CopyFromUswc(cache, w16,
+                     src, src_pitch,
+                     width, hblock, cpu);
+
+        /* Copy from our cache to the destination */
+        Copy2d(dst, dst_pitch,
+               cache, w16,
+               width, hblock);
+
+        /* */
+        src += src_pitch * hblock;
+        dst += dst_pitch * hblock;
+    }
+}
+
+static void SSE_CopyFromNv12ToNv12(uint8_t *dst[4], int dst_pitch[4],
+                                   uint8_t *src[2], size_t src_pitch[2],
+                                   unsigned width, unsigned height,
+                                   copy_cache *cache, unsigned cpu)
+{
+    SSE_CopyPlane(dst[0], dst_pitch[0],
+                  src[0], src_pitch[0],
+                  cache->buffer, cache->size,
+                  width, height, cpu);
+    SSE_CopyPlane(dst[1], dst_pitch[1],
+                  src[1], src_pitch[1],
+                  cache->buffer, cache->size,
+                  width, height/2, cpu);
+    __asm__ volatile ("emms");
+}
+#undef COPY64
+
+#endif /* HAVE_SSE2 */
+
+static void CopyFromNv12ToNv12(uint8_t *dst[4], int dst_pitch[4],
+                               uint8_t *src[2], size_t src_pitch[2],
+                               unsigned width, unsigned height,
+                               copy_cache *cache)
+{
+#if HAVE_SSE2
+    unsigned cpu = av_get_cpu_flags();
+    if (AV_CPU_SSE2())
+        return SSE_CopyFromNv12ToNv12(dst, dst_pitch,
+                                      src, src_pitch, width, height,
+                                      cache, cpu);
+#else
+    (void) cache;
+#endif
+
+    av_image_copy_plane(dst[0], dst_pitch[0],
+                        src[0], src_pitch[0],
+                        width, height);
+    av_image_copy_plane(dst[1], dst_pitch[1],
+                        src[1], src_pitch[1],
+                        width, height/2);
+}
+
 typedef struct surface_info {
     int used;
     uint64_t age;
@@ -112,6 +317,7 @@ typedef struct DXVA2Context {
     uint32_t                    num_surfaces;
     uint64_t                    surface_age;
 
+    copy_cache                  cache;
     AVFrame                     *tmp_frame;
 } DXVA2Context;
 
@@ -258,6 +464,8 @@ static int dxva2_retrieve_data(AVCodecContext *s, AVFrame *frame)
     D3DLOCKED_RECT     LockedRect;
     HRESULT            hr;
     int                ret;
+    uint8_t *plane[2];
+    size_t  pitch[2];
 
     IDirect3DSurface9_GetDesc(surface, &surfaceDesc);
 
@@ -275,14 +483,12 @@ static int dxva2_retrieve_data(AVCodecContext *s, AVFrame *frame)
         return AVERROR_UNKNOWN;
     }
 
-    av_image_copy_plane(ctx->tmp_frame->data[0], ctx->tmp_frame->linesize[0],
-                        (uint8_t*)LockedRect.pBits,
-                        LockedRect.Pitch, frame->width, frame->height);
-
-    av_image_copy_plane(ctx->tmp_frame->data[1], ctx->tmp_frame->linesize[1],
-                        (uint8_t*)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height,
-                        LockedRect.Pitch, frame->width, frame->height / 2);
-
+    plane[0] = LockedRect.pBits;
+    plane[1] = (uint8_t*)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height;
+    pitch[0] = pitch[1] = LockedRect.Pitch;
+    CopyFromNv12ToNv12(ctx->tmp_frame->data, ctx->tmp_frame->linesize,
+                       plane, pitch,
+                       frame->width, frame->height, &ctx->cache);
     IDirect3DSurface9_UnlockRect(surface);
 
     ret = av_frame_copy_props(ctx->tmp_frame, frame);
@@ -302,6 +508,7 @@ static int dxva2_alloc(AVCodecContext *s)
 {
     InputStream  *ist = s->opaque;
     int loglevel = (ist->hwaccel_id == HWACCEL_AUTO) ? AV_LOG_VERBOSE : AV_LOG_ERROR;
+    int ret;
     DXVA2Context *ctx;
     pDirect3DCreate9      *createD3D = NULL;
     pCreateDeviceManager9 *createDeviceManager = NULL;
@@ -364,7 +571,7 @@ static int dxva2_alloc(AVCodecContext *s)
     d3dpp.SwapEffect       = D3DSWAPEFFECT_DISCARD;
     d3dpp.Flags            = D3DPRESENTFLAG_VIDEO;
 
-    hr = IDirect3D9_CreateDevice(ctx->d3d9, adapter, D3DDEVTYPE_HAL, GetShellWindow(),
+    hr = IDirect3D9_CreateDevice(ctx->d3d9, adapter, D3DDEVTYPE_HAL, GetDesktopWindow(),
                                  D3DCREATE_SOFTWARE_VERTEXPROCESSING | D3DCREATE_MULTITHREADED | D3DCREATE_FPU_PRESERVE,
                                  &d3dpp, &ctx->d3d9device);
     if (FAILED(hr)) {
@@ -400,6 +607,9 @@ static int dxva2_alloc(AVCodecContext *s)
     if (!ctx->tmp_frame)
         goto fail;
 
+    if ((ret = copy_cache_init(&ctx->cache, FFMAX(s->coded_width, 1920))) < 0)
+        goto fail; /* was "return ret", which leaked d3d9/device/tmp_frame */
+
     s->hwaccel_context = av_mallocz(sizeof(struct dxva_context));
     if (!s->hwaccel_context)
         goto fail;
-- 
1.8.4.msysgit.0

