From 5d3b88d2a9bad1f9e9431b6244493cbc6dda5701 Mon Sep 17 00:00:00 2001
From: Dominique Leroux <dominique.leroux@autodesk.com>
Date: Thu, 4 Dec 2014 19:05:44 -0500
Subject: [PATCH] Fixed PNG decoding with Paeth filter for RGBA 64-bits.

I noticed an artifact on the resulting image when passing a RGBA 64-bits PNG
through the copy filter.  The last column, although obviously "derived" from the
original image's last column (i.e. not completely garbage), had way too much
yellow in it; a telltale sign of an off-by-X somewhere.  Original scenario used
the scale filter, but taking it out was still exhibiting the problem.

So my boiled-down test consisted in running:

      ./ffmpeg.exe -i original_rgba64.png -vf copy result.png

and inspecting the result image with a viewer.  I realized the image had some
lines filtered with Paeth, and only those lines would get the artifact on their
rightmost pixel.

In libavcodec/pngdec.c, png_filter_row() has buffer-sizing arithmetic for
feeding the Paeth prediction function with a buffer that has a multiple of 4
bytes.  This is done so the optimized version of the algorithm, using MMX/SSE3,
can work on 4 bytes at a time without causing a write past the end of the buffer
when writing back into the result buffer.  This arithmetic assumed that
situations where the buffer is improperly sized were limited to RGB 24-bits
(bpp, or bytes per pixel, = 3) and that all other formats had buffers properly
sized for SIMD, i.e. a multiple of 4.

There are 2 problems with this:

. bpp == 8 (RGBA 64-bits, my case) also has the proper size for SIMD.  But it
  got handled as if it had a bpp of 3 because the code was testing for bpp == 4
  (with ternary operator's operands reversed).
. bpp == 6 (RGB 48-bits) can also cause writes past the end of the buffer when
  handling the last 2 of 6 bytes for an image that has an odd number of columns.

So I am reformulating the SIMD-adjusted width to ignore the trailing 3 (or less)
bytes when the total width is not a multiple of 4.  These bytes are fed to the
unoptimized version of the algorithm.

I was tempted to apply this to bpp == 1 as well (i.e. take out the bpp > 2 test
in the enclosing if). But looking at pngdec.c's history, this has been done
earlier and differences between the results of the optimized and non-optimized
versions of the algorithm were observed.  So I am leaving this untouched.

It then looked to me that knowing the SIMD stride requirements of the x86
platform implementation was a bit of a long-distance coupling between the
platform-independent pngdec.c and the x86-specific x86/pngdsp.asm.  So I
abstracted this out through a pair of members in the PNGDSPContext structure:
SIMD stride as well as a mask for adjusting the buffer size.

Signed-off-by: Dominique Leroux <dominique.leroux@autodesk.com>
---
 Changelog                    |  1 +
 libavcodec/pngdec.c          | 16 ++++++++++++----
 libavcodec/pngdsp.c          |  7 +++++--
 libavcodec/pngdsp.h          |  5 +++++
 libavcodec/x86/pngdsp_init.c | 17 +++++++++++------
 5 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/Changelog b/Changelog
index c0bbbe2..9a18558 100644
--- a/Changelog
+++ b/Changelog
@@ -2,6 +2,7 @@ Entries are sorted chronologically from oldest to youngest within each release,
 releases are sorted from youngest to oldest.
 
 version <next>:
+- PNG Paeth RGBA64 decoding artifact fixed
 
 
 version 2.5:
diff --git a/libavcodec/pngdec.c b/libavcodec/pngdec.c
index e6b7593..1cedd4f 100644
--- a/libavcodec/pngdec.c
+++ b/libavcodec/pngdec.c
@@ -269,10 +269,18 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
             p      = last[i];
             dst[i] = p + src[i];
         }
-        if (bpp > 2 && size > 4) {
-            /* would write off the end of the array if we let it process
-             * the last pixel with bpp=3 */
-            int w = bpp == 4 ? size : size - 3;
+        if (bpp > 2 && size > dsp->simd_stride_bytes) {
+            /* would write off the end of the array if we let it process the
+             * last pixel with bpp=3 or 6 when prediction is implemented using
+             * SIMD instructions, which work on 4 operands (i.e. bytes) at a
+             * time. Both RGB 24-bits (bpp=3) and 48-bits (bpp=6) can be
+             * affected by this.  So we make sure to only apply the accelerated
+             * version on buffers whose size is a multiple 4 and let the
+             * unaccelerated version handle the last incomplete 4-bytes group,
+             * if any. */
+
+            /* When stride is 4, equivalent to: size - (size % 4) */
+            int w = size & dsp->simd_stride_bytes_mask;
             if (w > i) {
                 dsp->add_paeth_prediction(dst + i, src + i, last + i, w - i, bpp);
                 i = w;
diff --git a/libavcodec/pngdsp.c b/libavcodec/pngdsp.c
index d275316..3c67b5b 100644
--- a/libavcodec/pngdsp.c
+++ b/libavcodec/pngdsp.c
@@ -42,9 +42,12 @@ static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
 
 av_cold void ff_pngdsp_init(PNGDSPContext *dsp)
 {
-    dsp->add_bytes_l2         = add_bytes_l2_c;
-    dsp->add_paeth_prediction = ff_add_png_paeth_prediction;
+    dsp->add_bytes_l2           = add_bytes_l2_c;
+    dsp->add_paeth_prediction   = ff_add_png_paeth_prediction;
+    dsp->simd_stride_bytes      = 1;
 
     if (ARCH_X86)
         ff_pngdsp_init_x86(dsp);
+
+    dsp->simd_stride_bytes_mask = ~(dsp->simd_stride_bytes - 1);
 }
diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h
index fbc1a50..7757fb7 100644
--- a/libavcodec/pngdsp.h
+++ b/libavcodec/pngdsp.h
@@ -32,6 +32,11 @@ typedef struct PNGDSPContext {
     /* this might write to dst[w] */
     void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
                                  uint8_t *top, int w, int bpp);
+    /* number of bytes that SIMD operations will handle in parallel. */
+    int simd_stride_bytes;
+
+    /* "image line width (in bytes)" & "this mask" = SIMD-safe width. */
+    int simd_stride_bytes_mask;
 } PNGDSPContext;
 
 void ff_pngdsp_init(PNGDSPContext *dsp);
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c
index 7dca62c..f81cd52 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/x86/pngdsp_init.c
@@ -39,12 +39,17 @@ av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
 
 #if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags))
-        dsp->add_bytes_l2         = ff_add_bytes_l2_mmx;
+        dsp->add_bytes_l2           = ff_add_bytes_l2_mmx;
 #endif
-    if (EXTERNAL_MMXEXT(cpu_flags))
-        dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        dsp->add_paeth_prediction   = ff_add_png_paeth_prediction_mmxext;
+        dsp->simd_stride_bytes      = 4;
+    }
     if (EXTERNAL_SSE2(cpu_flags))
-        dsp->add_bytes_l2         = ff_add_bytes_l2_sse2;
-    if (EXTERNAL_SSSE3(cpu_flags))
-        dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
+        dsp->add_bytes_l2           = ff_add_bytes_l2_sse2;
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        dsp->add_paeth_prediction   = ff_add_png_paeth_prediction_ssse3;
+        dsp->simd_stride_bytes      = 4;
+    }
+
 }
-- 
2.1.0

