Sorry for the delay; I had to set up my new laptop first. Expect faster tests/reviews again.
On Tue, May 17, 2011 at 22:28:28 (CEST), Vitor Sessak wrote: > From 5f3a8fb8f98da0b249db143b43d2914f9c7340dd Mon Sep 17 00:00:00 2001 > From: Vitor Sessak <[email protected]> > Date: Sat, 14 May 2011 14:17:15 +0200 > Subject: [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT > > --- > libavcodec/mpegaudio.h | 4 +- > libavcodec/x86/dct32_sse.asm | 334 > +++++++++++++++++++++++++++--------------- > libavcodec/x86/fft.c | 4 +- > libavcodec/x86/fft.h | 1 + > 4 files changed, 224 insertions(+), 119 deletions(-) > > diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h > index 3422b6d..a9331cb 100644 > --- a/libavcodec/mpegaudio.h > +++ b/libavcodec/mpegaudio.h > @@ -130,9 +130,9 @@ typedef struct MPADecodeContext { > uint32_t free_format_next_header; > GetBitContext gb; > GetBitContext in_gb; > - DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; > + DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; > int synth_buf_offset[MPA_MAX_CHANNELS]; > - DECLARE_ALIGNED(16, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; > + DECLARE_ALIGNED(32, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; > INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, > for layer 3 MDCT */ > GranuleDef granules[2][2]; /* Used in Layer 3 */ > #ifdef DEBUG > diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm > index fa0a502..2e1176c 100644 > --- a/libavcodec/x86/dct32_sse.asm > +++ b/libavcodec/x86/dct32_sse.asm > @@ -20,31 +20,41 @@ > > ;****************************************************************************** > > %include "x86inc.asm" > +%include "config.asm" > > SECTION_RODATA 32 > > align 32 > ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 > dd 0.553104, 0.582935, 0.622504, 0.674808 > - dd -1.169440, -0.972568, -0.839350, -0.744536 > dd -10.190008, -3.407609, -2.057781, -1.484165 > + dd -1.169440, -0.972568, -0.839350, -0.744536 > dd 0.502419, 0.522499, 0.566944, 0.646822 > dd 0.788155, 
1.060678, 1.722447, 5.101149 > dd 0.509796, 0.601345, 0.899976, 2.562916 > + dd 0.509796, 0.601345, 0.899976, 2.562916 > dd 1.000000, 1.000000, 1.306563, 0.541196 > + dd 1.000000, 1.000000, 1.306563, 0.541196 > + dd 1.000000, 0.707107, 1.000000, -0.707107 > dd 1.000000, 0.707107, 1.000000, -0.707107 > > > -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 > > -%macro BUTTERFLY 4 > +%macro BUTTERFLY_SSE 4 > movaps %4, %1 > subps %1, %2 > addps %2, %4 > mulps %1, %3 > %endmacro > > -%macro BUTTERFLY0 5 > +%macro BUTTERFLY_AVX 4 > + vsubps %4, %1, %2 > + vaddps %2, %2, %1 > + vmulps %1, %4, %3 > +%endmacro > + > +%macro BUTTERFLY0_SSE 5 > movaps %4, %1 > shufps %1, %1, %5 > xorps %4, %2 > @@ -52,6 +62,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > mulps %1, %3 > %endmacro > > +%macro BUTTERFLY0_AVX 5 > + vshufps %4, %1, %1, %5 > + vxorps %1, %1, %2 > + vaddps %4, %4, %1 > + vmulps %1, %4, %3 > +%endmacro > + > %macro BUTTERFLY2 4 > BUTTERFLY0 %1, %2, %3, %4, 0x1b > %endmacro > @@ -60,8 +77,199 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > BUTTERFLY0 %1, %2, %3, %4, 0xb1 > %endmacro > > -INIT_XMM > +%macro PASS6_AND_PERMUTE 0 > + mov tmpd, [outq+4] > + movss m7, [outq+72] > + addss m7, [outq+76] > + movss m3, [outq+56] > + addss m3, [outq+60] > + addss m4, m3 > + movss m2, [outq+52] > + addss m2, m3 > + movss m3, [outq+104] > + addss m3, [outq+108] > + addss m1, m3 > + addss m5, m4 > + movss [outq+ 16], m1 > + movss m1, [outq+100] > + addss m1, m3 > + movss m3, [outq+40] > + movss [outq+ 48], m1 > + addss m3, [outq+44] > + movss m1, [outq+100] > + addss m4, m3 > + addss m3, m2 > + addss m1, [outq+108] > + movss [outq+ 40], m3 > + addss m2, [outq+36] > + movss m3, [outq+8] > + movss [outq+ 56], m2 > + addss m3, [outq+12] > + movss [outq+ 32], m3 > + movss m3, [outq+80] > + movss [outq+ 8], m5 > + movss [outq+ 80], m1 > + movss m2, [outq+52] > + movss m5, [outq+120] > + addss m5, 
[outq+124] > + movss m1, [outq+64] > + addss m2, [outq+60] > + addss m0, m5 > + addss m5, [outq+116] > + mov [outq+64], tmpd > + addss m6, m0 > + addss m1, m6 > + mov tmpd, [outq+12] > + mov [outq+ 96], tmpd > + movss [outq+ 4], m1 > + movss m1, [outq+24] > + movss [outq+ 24], m4 > + movss m4, [outq+88] > + addss m4, [outq+92] > + addss m3, m4 > + addss m4, [outq+84] > + mov tmpd, [outq+108] > + addss m1, [outq+28] > + addss m0, m1 > + addss m1, m5 > + addss m6, m3 > + addss m3, m0 > + addss m0, m7 > + addss m5, [outq+20] > + addss m7, m1 > + movss [outq+ 12], m6 > + mov [outq+112], tmpd > + movss m6, [outq+28] > + movss [outq+ 28], m0 > + movss m0, [outq+36] > + movss [outq+ 36], m7 > + addss m1, m4 > + movss m7, [outq+116] > + addss m0, m2 > + addss m7, [outq+124] > + movss [outq+ 72], m0 > + movss m0, [outq+44] > + addss m2, m0 > + movss [outq+ 44], m1 > + movss [outq+ 88], m2 > + addss m0, [outq+60] > + mov tmpd, [outq+60] > + mov [outq+120], tmpd > + movss [outq+104], m0 > + addss m4, m5 > + addss m5, [outq+68] > + movss [outq+52], m4 > + movss [outq+60], m5 > + movss m4, [outq+68] > + movss m5, [outq+20] > + movss [outq+ 20], m3 > + addss m5, m7 > + addss m7, m6 > + addss m4, m5 > + movss m2, [outq+84] > + addss m2, [outq+92] > + addss m5, m2 > + movss [outq+ 68], m4 > + addss m2, m7 > + movss m4, [outq+76] > + movss [outq+ 84], m2 > + movss [outq+ 76], m5 > + addss m7, m4 > + addss m6, [outq+124] > + addss m4, m6 > + addss m6, [outq+92] > + movss [outq+100], m4 > + movss [outq+108], m6 > + movss m6, [outq+92] > + movss [outq+92], m7 > + addss m6, [outq+124] > + movss [outq+116], m6 > +%endmacro > + > +%define BUTTERFLY BUTTERFLY_AVX > +%define BUTTERFLY0 BUTTERFLY0_AVX > + > +INIT_YMM > section .text align=16 > +%ifdef HAVE_AVX > +; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) > +cglobal dct32_float_avx, 2,3,8, out, in, tmp > + ; pass 1 > + vmovaps m4, [inq+0] Unfortunately, it segfaults (again) exactly here with: (gdb) bt full #0 
ff_dct32_float_avx () at libavcodec/x86/dct32_sse.asm:198 No locals. #1 0x00000000006b6e13 in ff_mpa_synth_filter_float (s=0x18b3cf8, synth_buf_ptr=<value optimized out>, synth_buf_offset=0x18ae100, window=0x11d43e0, dither_state=0x18b3ce4, samples=<value optimized out>, incr=1, sb_samples=0x18ae110) at libavcodec/mpegaudiodsp_template.c:172 synth_buf = 0x18ac100 offset = <value optimized out> #2 0x00000000006b5223 in mp_decode_frame (s=0x18abc80, samples=0x18b3d80, buf=<value optimized out>, buf_size=<value optimized out>) at libavcodec/mpegaudiodec.c:1775 i = <value optimized out> nb_frames = 36 ch = <value optimized out> samples_ptr = <value optimized out> #3 0x00000000006b5d45 in decode_frame (avctx=0x1870c40, data=0x18b3d80, data_size=0x7fff6ad05cf8, avpkt=<value optimized out>) at libavcodec/mpegaudiodec.c:1832 buf = 0x18717a0 "\377\373T\304" buf_size = 192 s = 0x18abc80 out_size = <value optimized out> out_samples = 0x18b3d80 #4 0x000000000077eb78 in avcodec_decode_audio3 (avctx=0x1870c40, samples=<value optimized out>, frame_size_ptr=<value optimized out>, avpkt=<value optimized out>) at libavcodec/utils.c:720 ret = <value optimized out> #5 0x0000000000432991 in output_packet (ist=<value optimized out>, ist_index=0, ost_table=0x1876980, nb_ostreams=1, pkt=<value optimized out>) at ffmpeg.c:1520 data_buf = 0x18717a0 "\377\373T\304" decoded_data_buf = 0x0 data_size = 192 decoded_data_size = 0 os = <value optimized out> ost = <value optimized out> ret = <value optimized out> i = <value optimized out> got_output = 0 picture = {data = {0x2ad3be4d10c0 "p\033@", 0x5 <Address 0x5 out of bounds>, 0x7 <Address 0x7 out of bounds>, 0x2ad3bdd06f90 "\350\002\207\275\323*"}, linesize = {-427471993, 0, -1117422261, 10963}, base = {0x7 <Address 0x7 out of bounds>, 0x39a152e <Address 0x39a152e out of bounds>, 0x7 <Address 0x7 out of bounds>, 0x7fff6ad05c30 "\377\377\377\377"}, key_frame = -1105984896, pict_type = 10963, pts = 47088915450976, coded_picture_number = 53, 
display_picture_number = 0, quality = -1105945408, age = 10963, reference = -1105985176, qscale_table = 0x2ad3be140c60 "", qstride = 24, mbskip_table = 0x2ad3be149630 "\037H", motion_val = {0x7fff6ad05a90, 0x0}, mb_type = 0x2ad3bd8702e8, motion_subsample_log2 = 0 '\000', opaque = 0x2ad3be4d1048, error = {47088911018432, 4201052, 47088915503104, 4196064}, type = 0, repeat_pict = 1, qscale_type = 1540, interlaced_frame = 1, top_field_first = -1105930240, pan_scan = 0x400380, palette_has_changed = -1115224512, buffer_hints = 10963, dct_coeff = 0x7fff6ad05c70, ref_index = {0x2ad3be4d1048 "N\033@", 0x7fff6ad05c90 ""}, reordered_opaque = 47088906207976, hwaccel_picture_private = 0xe6854b87, pkt_pts = 47088904011756, pkt_dts = 0, owner = 0x2ad3be4d1048, thread_opaque = 0x1} buffer_to_free = <value optimized out> samples_size = 192000 subtitle = {format = 0, start_display_time = 2147483648, end_display_time = 25620000, num_rects = 0, rects = 0x0, pts = 140734985428496} subtitle_to_free = 0x0 pkt_pts = <value optimized out> frame_available = <value optimized out> avpkt = {pts = 0, dts = 0, data = 0x18717a0 "\377\373T\304", size = 192, stream_index = 0, flags = 1, side_data = 0x0, side_data_elems = 0, duration = 338688, destruct = 0x50e570 <av_destruct_packet>, priv = 0x0, pos = 0, convergence_duration = 0} bps = 4 #6 0x00000000004353f6 in transcode (nb_output_files=1, nb_input_files=1, stream_maps=0x0, nb_stream_maps=0, input_files=0xce52e0, output_files=0xce4fc0) at ffmpeg.c:2646 ist_index = 0 pkt = {pts = 0, dts = 0, data = 0x18717a0 "\377\373T\304", size = 192, stream_index = 0, flags = 1, side_data = 0x0, side_data_elems = 0, duration = 338688, destruct = 0x50e570 <av_destruct_packet>, priv = 0x0, pos = 0, convergence_duration = 0} file_index = <value optimized out> ipts_min = <value optimized out> opts_min = <value optimized out> ret = <value optimized out> i = <value optimized out> j = <value optimized out> k = <value optimized out> n = <value optimized out> 
nb_istreams = 1 nb_ostreams = <value optimized out> is = <value optimized out> os = <value optimized out> codec = <value optimized out> icodec = <value optimized out> ost = <value optimized out> ost_table = 0x1876980 ist = 0x186ec40 ist_table = 0x188b520 file_table = 0x18891c0 error = "\000\000G", '\000' <repeats 13 times>, "\020g\320j\377\177\000\000@\000\000\000\000\000\000\000\001", '\000' <repeats 23 times>, "@\006\207\275\323*\000\000\377\377\377\377\001\000\000\000X\026G\000\000\000\000\000\003\374\000\000\000\000\000\000\300\217L\276\323*\000\000\000\000\000\000\000\000\000\000\300\000\000\000\000\000\000\000\n\000\000\000\000\000\000\000pg\320j\377\177\000\000`g\320j\377\177", '\000' <repeats 34 times>, "G\023\222\000\000\000\000\000\200g\320j\377\177\000\000\220g\320j\377\177\000\000\220g\320j\377\177\000\000\240g\320j\377\177\000\000P\000\000\000\000\000\000\000\300\301L\276\323*\000\000\260g\320j\377\177\000\000\300g\320j\377\177\000\000\237n\223\000\000\000\000\000\365\241!\276\323*\000\000\340~\000\000\000\000\000\000\300\301L\276\323*\000\000`=\212\001\000\000\000\000\251\273\033\276\323*\000\000\320~\000\000\000\000\000\000\005s\033\276"... want_sdp = <value optimized out> no_packet = '\000' <repeats 99 times> no_packet_count = 0 #7 0x000000000043a295 in main (argc=<value optimized out>, argv=<value optimized out>) at ffmpeg.c:4457 No locals. (gdb) :-( -- Gruesse/greetings, Reinhard Tartler, KeyID 945348A4 _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
