Hello, I've been working on assembly for the VC2 encoder and have reached an impasse. My code produces very visible errors: obvious vertical streaks in the bottom-right half of the image, and what I think is some low-frequency effect.
I cannot see the problem in my code, so I need some fresh eyes to look at it. You can test the code with these two commands:

./ffmpeg -f lavfi -i testsrc=s=1024x576,format=yuv420p -vcodec vc2 -strict -1 -vframes 1 -cpuflags 0 -y temp.vc2
./ffmpeg -f lavfi -i testsrc=s=1024x576,format=yuv420p -vcodec vc2 -strict -1 -vframes 1 -y temp.vc2

Decode each to a png image to see the problem.
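For decoding, something along these lines should work (the output filename is just an example, and you may need to force the demuxer with -f dirac if probing of the raw stream fails on your build); run it after each encode and compare the two pngs:

./ffmpeg -i temp.vc2 -vframes 1 -y out.png

Attached is a diff from master (my branch is two dozen small commits). Thanks.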
diff --git a/libavcodec/vc2enc_dwt.c b/libavcodec/vc2enc_dwt.c
index c60b003..bac262d 100644
--- a/libavcodec/vc2enc_dwt.c
+++ b/libavcodec/vc2enc_dwt.c
@@ -23,6 +23,7 @@
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "vc2enc_dwt.h"
+#include "libavcodec/x86/vc2dsp.h"
/* Since the transforms spit out interleaved coefficients, this function
* rearranges the coefficients into the more traditional subdivision,
@@ -76,16 +77,21 @@ static void vc2_subband_dwt_97(VC2TransformContext *t, dwtcoef *data,
for (y = 0; y < synth_height; y++) {
/* Lifting stage 2. */
synthl[1] -= (8*synthl[0] + 9*synthl[2] - synthl[4] + 8) >> 4;
+
for (x = 1; x < width - 2; x++)
-            synthl[2*x + 1] -= (9*synthl[2*x] + 9*synthl[2*x + 2] - synthl[2*x + 4] -
-                                synthl[2 * x - 2] + 8) >> 4;
+ synthl[2*x + 1] -= (9*synthl[2*x] +
+ 9*synthl[2*x + 2] -
+ synthl[2*x + 4] -
+ synthl[2*x - 2] + 8) >> 4;
+
synthl[synth_width - 1] -= (17*synthl[synth_width - 2] -
- synthl[synth_width - 4] + 8) >> 4;
+ synthl[synth_width - 4] + 8) >> 4;
synthl[synth_width - 3] -= (8*synthl[synth_width - 2] +
9*synthl[synth_width - 4] -
- synthl[synth_width - 6] + 8) >> 4;
+ synthl[synth_width - 6] + 8) >> 4;
/* Lifting stage 1. */
synthl[0] += (synthl[1] + synthl[1] + 2) >> 2;
+
for (x = 1; x < width - 1; x++)
synthl[2*x] += (synthl[2*x - 1] + synthl[2*x + 1] + 2) >> 2;
@@ -97,25 +103,28 @@ static void vc2_subband_dwt_97(VC2TransformContext *t, dwtcoef *data,
/* Vertical synthesis: Lifting stage 2. */
synthl = synth + synth_width;
for (x = 0; x < synth_width; x++)
- synthl[x] -= (8*synthl[x - synth_width] + 9*synthl[x + synth_width] -
- synthl[x + 3 * synth_width] + 8) >> 4;
+ synthl[x] -= (8*synthl[x - synth_width] +
+ 9*synthl[x + synth_width] -
+ synthl[x + 3*synth_width] + 8) >> 4;
synthl = synth + (synth_width << 1);
for (y = 1; y < height - 2; y++) {
for (x = 0; x < synth_width; x++)
synthl[x + synth_width] -= (9*synthl[x] +
- 9*synthl[x + 2 * synth_width] -
- synthl[x - 2 * synth_width] -
- synthl[x + 4 * synth_width] + 8) >> 4;
+ 9*synthl[x + 2*synth_width] -
+ synthl[x - 2*synth_width] -
+ synthl[x + 4*synth_width] + 8) >> 4;
synthl += synth_width << 1;
}
synthl = synth + (synth_height - 1) * synth_width;
for (x = 0; x < synth_width; x++) {
synthl[x] -= (17*synthl[x - synth_width] -
- synthl[x - 3*synth_width] + 8) >> 4;
-        synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
-                8*synthl[x - 1*synth_width] - synthl[x - 5*synth_width] + 8) >> 4;
+ synthl[x - 3*synth_width] + 8) >> 4;
+
+ synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
+ 8*synthl[x - 1*synth_width] -
+ synthl[x - 5*synth_width] + 8) >> 4;
}
/* Vertical synthesis: Lifting stage 1. */
@@ -262,6 +271,10 @@ av_cold int ff_vc2enc_init_transforms(VC2TransformContext *s, int p_width, int p
s->vc2_subband_dwt[VC2_TRANSFORM_HAAR] = vc2_subband_dwt_haar;
s->vc2_subband_dwt[VC2_TRANSFORM_HAAR_S] = vc2_subband_dwt_haar_shift;
+#if ARCH_X86
+ ff_vc2enc_init_x86(s, p_width, p_height);
+#endif
+
s->buffer = av_malloc(2*p_width*p_height*sizeof(dwtcoef));
if (!s->buffer)
return 1;
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 839b5bc..24aedda 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -34,6 +34,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP)             += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
+OBJS-$(CONFIG_VC2_ENCODER) += x86/vc2dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
@@ -122,6 +123,7 @@ YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o \
YASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
YASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
x86/vc1dsp_mc.o
+YASM-OBJS-$(CONFIG_VC2_ENCODER) += x86/vc2enc_dwt.o
YASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
diff --git a/libavcodec/x86/vc2dsp.h b/libavcodec/x86/vc2dsp.h
new file mode 100644
index 0000000..825862e
--- /dev/null
+++ b/libavcodec/x86/vc2dsp.h
@@ -0,0 +1,28 @@
+/*
+ * VC2 encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VC2DSP_H
+#define AVCODEC_X86_VC2DSP_H
+
+#include "libavcodec/vc2enc_dwt.h"
+
+void ff_vc2enc_init_x86(VC2TransformContext *s, int p_width, int p_height);
+
+#endif /* AVCODEC_X86_VC2DSP_H */
diff --git a/libavcodec/x86/vc2dsp_init.c b/libavcodec/x86/vc2dsp_init.c
new file mode 100644
index 0000000..b3bdb53
--- /dev/null
+++ b/libavcodec/x86/vc2dsp_init.c
@@ -0,0 +1,67 @@
+/*
+ * VC2 encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/x86/vc2dsp.h"
+
+#if HAVE_SSE2_EXTERNAL
+void ff_vc2_subband_dwt_97_sse2(VC2TransformContext*, dwtcoef*, ptrdiff_t, int, int);
+#endif
+
+static av_always_inline void deinterleave(dwtcoef *linell, ptrdiff_t stride,
+                                          int width, int height, dwtcoef *synthl)
+{
+ int x, y;
+ ptrdiff_t synthw = width << 1;
+ dwtcoef *linehl = linell + width;
+ dwtcoef *linelh = linell + height*stride;
+ dwtcoef *linehh = linelh + width;
+
+ /* Deinterleave the coefficients. */
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ linell[x] = synthl[(x << 1)];
+ linehl[x] = synthl[(x << 1) + 1];
+ linelh[x] = synthl[(x << 1) + synthw];
+ linehh[x] = synthl[(x << 1) + synthw + 1];
+ }
+ synthl += synthw << 1;
+ linell += stride;
+ linelh += stride;
+ linehl += stride;
+ linehh += stride;
+ }
+}
+
+static void wrapper_97(VC2TransformContext *t, dwtcoef *data, ptrdiff_t stride, int width, int height)
+{
+ ff_vc2_subband_dwt_97_sse2(t, data, stride, width, height);
+ deinterleave(data, stride, width, height, t->buffer);
+}
+
+av_cold void ff_vc2enc_init_x86(VC2TransformContext *s, int p_width, int p_height)
+{
+ int cpuflags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpuflags)) {
+ s->vc2_subband_dwt[VC2_TRANSFORM_9_7] = wrapper_97;
+ }
+}
diff --git a/libavcodec/x86/vc2enc_dwt.asm b/libavcodec/x86/vc2enc_dwt.asm
new file mode 100644
index 0000000..c6fdf36
--- /dev/null
+++ b/libavcodec/x86/vc2enc_dwt.asm
@@ -0,0 +1,312 @@
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pw_2
+cextern pw_8
+cextern pw_9
+cextern pw_17
+
+section .text
+
+INIT_XMM sse2
+cglobal vc2_subband_dwt_97, 5, 12, 5, synth, data, stride, width, height
+ sub rsp, 5 * gprsize
+ mov synthq, [synthq]
+ movsxdifnidn widthq, widthd
+ movsxdifnidn heightq, heightd
+ mov [rsp + gprsize*0], synthq
+ mov [rsp + gprsize*1], dataq
+ mov [rsp + gprsize*2], strideq
+ mov [rsp + gprsize*3], widthq
+ mov [rsp + gprsize*4], heightq
+
+ DECLARE_REG_TMP 9, 10, 11
+ %define synth_w r5
+ %define synth_h r6
+ mov synth_w, widthq
+ mov synth_h, heightq
+ add synth_w, synth_w
+ add synth_h, synth_h
+
+ %define x r7
+ %define y r8
+ ; shifting loop (for precision and copy)
+ lea synthq, [synthq + synth_w*2] ; add 1 synth_w to save comparing x against synth_w
+ lea dataq, [dataq + synth_w*2]
+ mov y, synth_h
+ .loop1_y:
+ mov x, synth_w
+ neg x
+ .loop1_x:
+ movu m0, [dataq + x*2]
+ paddw m0, m0
+ movu [synthq + x*2], m0
+ add x, mmsize/2
+ jl .loop1_x
+
+ lea dataq, [dataq + strideq*2]
+ lea synthq, [synthq + synth_w*2]
+ sub y, 1
+ jg .loop1_y
+
+ ; Horizontal synthesis
+ mov synthq, [rsp]
+ mov y, synth_h
+ .loop2_y:
+ ; Lifting stage 2
+ mov t0w, word [synthq+0]
+ mov t1w, word [synthq+4]
+ imul t0w, 8
+ imul t1w, 9
+ add t0w, 8
+ sub t1w, word [synthq+8]
+ add t0w, t1w
+ sar t0w, 4
+ sub word [synthq+2], t0w
+
+ mov x, 1
+ sub widthq, 2
+ .loop2_x_1:
+%if 0 ; needs horizontal masking
+ movu m0, [synthq + x*4 + 0]
+ movu m1, [synthq + x*4 + 4]
+ movu m2, [synthq + x*4 + 8]
+ movu m3, [synthq + x*4 - 4]
+ pmullw m0, [pw_9]
+ pmullw m1, [pw_9]
+ psubw m0, m2
+ psubw m1, m3
+ paddw m0, m1
+ paddw m0, [pw_8]
+ psraw m0, 4
+ movu m4, [synthq + x*4 + 2]
+ psubw m4, m0
+ movu [synthq + x*4 + 2], m4
+ add x, mmsize / 2
+ cmp x, widthq
+%else
+ mov t0w, word [synthq + x*4 + 0]
+ mov t1w, word [synthq + x*4 + 4]
+ imul t0w, 9
+ imul t1w, 9
+ sub t0w, word [synthq + x*4 + 8]
+ sub t1w, word [synthq + x*4 - 4]
+ add t0w, 8
+ add t0w, t1w
+ sar t0w, 4
+ sub word [synthq + x*4 + 2], t0w
+ add x, 1
+ cmp x, widthq
+%endif
+ jl .loop2_x_1
+
+ mov t0w, word [synthq + synth_w*2 - 4]
+ mov t1w, word [synthq + synth_w*2 - 8]
+ imul t0w, 17
+ sub t1w, 8 ; -8 so that the sign is corrected below
+ sub t0w, t1w
+ sar t0w, 4
+ sub word [synthq + synth_w*2 - 2], t0w
+
+ mov t0w, word [synthq + synth_w*2 - 4]
+ mov t1w, word [synthq + synth_w*2 - 8]
+ imul t0w, 8
+ imul t1w, 9
+ sub t0w, word [synthq + synth_w*2 - 12]
+ add t1w, 8
+ add t0w, t1w
+ sar t0w, 4
+ sub word [synthq + synth_w*2 - 6], t0w
+
+ ; Lifting stage 1
+ mov t0w, word [synthq + 2]
+ add t0w, t0w
+ add t0w, 2
+ sar t0w, 2
+ add word [synthq], t0w
+
+ mov x, 1
+ add widthq, 1
+ .loop2_x_2:
+%if 0 ; needs horizontal masking
+ movu m0, [synthq + x*4 - 2]
+ movu m1, [synthq + x*4 + 2]
+ paddw m0, m1
+ paddw m0, [pw_2]
+ psraw m0, 2
+ movu m2, [synthq + x*4 + 0]
+ paddw m2, m0
+ movu [synthq + x*4 + 0], m2
+ add x, mmsize / 2
+ cmp x, widthq
+%else
+ mov t0w, word [synthq + x*4 - 2]
+ mov t1w, word [synthq + x*4 + 2]
+ add t0w, 2
+ add t0w, t1w
+ sar t0w, 2
+ add word [synthq + x*4 + 0], t0w
+ add x, 1
+ cmp x, widthq
+%endif
+ jl .loop2_x_2
+
+ mov t0w, word [synthq + synth_w*2 - 6]
+ mov t1w, word [synthq + synth_w*2 - 2]
+ add t0w, t1w
+ add t0w, 2
+ sar t0w, 2
+ add word [synthq + synth_w*2 - 4], t0w
+
+ lea synthq, [synthq + synth_w*2]
+ sub y, 1
+ jg .loop2_y
+
+ ; Vertical synthesis: Lifting stage 2
+ mov synthq, [rsp] ; No addition of synth_w so indices in the loop have an extra synth_w added.
+ mov x, synth_w
+ .loop3_x:
+ movu m0, [synthq]
+ movu m1, [synthq + synth_w*4]
+ movu m2, [synthq + synth_w*8]
+ pmullw m0, [pw_8]
+ pmullw m1, [pw_9]
+ psubw m0, m2
+ paddw m1, [pw_8]
+ paddw m0, m1
+ psraw m0, 4
+ movu m3, [synthq + synth_w*2]
+ psubw m3, m0
+ movu [synthq + synth_w*2], m3
+ add synthq, mmsize
+ sub x, mmsize/2
+ jg .loop3_x
+
+ mov synthq, [rsp]
+ mov t0, synth_w
+ neg t0
+ mov y, heightq
+ sub y, 3
+ .loop4_y:
+ lea synthq, [synthq + synth_w*4]
+ mov t1, synthq
+ mov x, synth_w
+ .loop4_x:
+ movu m0, [t1]
+ movu m1, [t1 + synth_w*4]
+ movu m2, [t1 + t0*4]
+ movu m3, [t1 + synth_w*8]
+ pmullw m0, [pw_9]
+ pmullw m1, [pw_9]
+ paddw m2, m3
+ paddw m0, [pw_8]
+ psubw m1, m2
+ paddw m0, m1
+ psraw m0, 4
+ movu m4, [t1 + synth_w*2]
+ psubw m4, m0
+ movu [t1 + synth_w*2], m4
+ add t1, mmsize
+ sub x, mmsize/2
+ jg .loop4_x
+ sub y, 1
+ jg .loop4_y
+
+ mov synthq, [rsp]
+ mov t0, synth_h
+ sub t0, 1
+ imul t0, synth_w
+ lea synthq, [synthq + t0*2]
+ neg synth_w
+ lea t1, [synth_w*3]
+ lea t0, [synth_w*5]
+ mov x, synth_w
+ .loop5_x:
+ movu m0, [synthq + synth_w*2]
+ movu m1, [synthq + t1*2]
+ pmullw m0, [pw_17]
+ psubw m1, [pw_8] ; -8 so that the sign is corrected below
+ psubw m0, m1
+ psraw m0, 4
+ movu m2, [synthq]
+ psubw m2, m0
+ movu [synthq], m2
+
+ movu m0, [synthq + t1*2]
+ movu m1, [synthq + synth_w*2]
+ movu m2, [synthq + t0*2]
+ pmullw m0, [pw_9]
+ pmullw m1, [pw_8]
+ psubw m2, [pw_8] ; -8 so that the sign is corrected below
+ paddw m0, m1
+ psubw m0, m2
+ psraw m0, 4
+ movu m3, [synthq + synth_w*4]
+ psubw m3, m0
+ movu [synthq + synth_w*4], m3
+
+ add synthq, mmsize
+ add x, mmsize/2
+ jl .loop5_x
+
+ ; Vertical synthesis: Lifting stage 1
+ mov synthq, [rsp]
+ mov x, synth_w
+ neg synth_w
+ .loop6_x:
+ movu m0, [synthq + synth_w]
+ paddw m0, m0
+ paddw m0, [pw_2]
+ psraw m0, 2
+ movu m1, [synthq]
+ paddw m1, m0
+ movu [synthq], m1
+ add synthq, mmsize
+ add x, mmsize/2
+ jl .loop6_x
+
+ mov synthq, [rsp]
+ lea synthq, [synthq + synth_w*2] ; 1 synth_w left out so indices in the loop have an extra synth_w added.
+ mov y, heightq
+ sub y, 2
+ .loop7_y:
+ mov t0, synthq
+ mov x, synth_w
+ .loop7_x:
+ movu m0, [t0]
+ movu m1, [t0 + synth_w*4]
+ paddw m0, m1
+ paddw m0, [pw_2]
+ psraw m0, 2
+ movu m2, [t0 + synth_w*2]
+ paddw m2, m0
+ movu [t0 + synth_w*2], m2
+ add t0, mmsize
+ sub x, mmsize/2
+ jg .loop7_x
+ lea synthq, [synthq + synth_w*4]
+ sub y, 1
+ jg .loop7_y
+
+ mov synthq, [rsp]
+ mov t0, synth_h
+ sub t0, 3
+ imul t0, synth_w ; 1 synth_w left out so indices in the loop have an extra synth_w added.
+ lea synthq, [synthq + t0*2]
+ mov x, synth_w
+ .loop8_x:
+ movu m0, [synthq]
+ movu m1, [synthq + synth_w*4]
+ paddw m0, m1
+ paddw m0, [pw_2]
+ psraw m0, 2
+ movu m2, [synthq + synth_w*2]
+ paddw m2, m0
+ movu [synthq + synth_w*2], m2
+ add synthq, mmsize
+ sub x, mmsize/2
+ jg .loop8_x
+
+ add rsp, 5 * gprsize
+RET
+
