Sorry for the delay; I had to set up my new laptop first. Expect faster
tests/reviews again.

On Tue, May 17, 2011 at 22:28:28 (CEST), Vitor Sessak wrote:

> From 5f3a8fb8f98da0b249db143b43d2914f9c7340dd Mon Sep 17 00:00:00 2001
> From: Vitor Sessak <[email protected]>
> Date: Sat, 14 May 2011 14:17:15 +0200
> Subject: [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT
>
> ---
>  libavcodec/mpegaudio.h       |    4 +-
>  libavcodec/x86/dct32_sse.asm |  334 
> +++++++++++++++++++++++++++---------------
>  libavcodec/x86/fft.c         |    4 +-
>  libavcodec/x86/fft.h         |    1 +
>  4 files changed, 224 insertions(+), 119 deletions(-)
>
> diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
> index 3422b6d..a9331cb 100644
> --- a/libavcodec/mpegaudio.h
> +++ b/libavcodec/mpegaudio.h
> @@ -130,9 +130,9 @@ typedef struct MPADecodeContext {
>      uint32_t free_format_next_header;
>      GetBitContext gb;
>      GetBitContext in_gb;
> -    DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
> +    DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
>      int synth_buf_offset[MPA_MAX_CHANNELS];
> -    DECLARE_ALIGNED(16, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
> +    DECLARE_ALIGNED(32, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
>      INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, 
> for layer 3 MDCT */
>      GranuleDef granules[2][2]; /* Used in Layer 3 */
>  #ifdef DEBUG
> diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
> index fa0a502..2e1176c 100644
> --- a/libavcodec/x86/dct32_sse.asm
> +++ b/libavcodec/x86/dct32_sse.asm
> @@ -20,31 +20,41 @@
>  
> ;******************************************************************************
>  
>  %include "x86inc.asm"
> +%include "config.asm"
>  
>  SECTION_RODATA 32
>  
>  align 32
>  ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
>              dd   0.553104,  0.582935,  0.622504,  0.674808
> -            dd  -1.169440, -0.972568, -0.839350, -0.744536
>              dd -10.190008, -3.407609, -2.057781, -1.484165
> +            dd  -1.169440, -0.972568, -0.839350, -0.744536
>              dd   0.502419,  0.522499,  0.566944,  0.646822
>              dd   0.788155,  1.060678,  1.722447,  5.101149
>              dd   0.509796,  0.601345,  0.899976,  2.562916
> +            dd   0.509796,  0.601345,  0.899976,  2.562916
>              dd   1.000000,  1.000000,  1.306563,  0.541196
> +            dd   1.000000,  1.000000,  1.306563,  0.541196
> +            dd   1.000000,  0.707107,  1.000000, -0.707107
>              dd   1.000000,  0.707107,  1.000000, -0.707107
>  
>  
> -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
> +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
>  
> -%macro BUTTERFLY 4
> +%macro BUTTERFLY_SSE 4
>      movaps %4, %1
>      subps  %1, %2
>      addps  %2, %4
>      mulps  %1, %3
>  %endmacro
>  
> -%macro BUTTERFLY0 5
> +%macro BUTTERFLY_AVX 4
> +    vsubps  %4, %1, %2
> +    vaddps  %2, %2, %1
> +    vmulps  %1, %4, %3
> +%endmacro
> +
> +%macro BUTTERFLY0_SSE 5
>      movaps %4, %1
>      shufps %1, %1, %5
>      xorps  %4, %2
> @@ -52,6 +62,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
>      mulps  %1, %3
>  %endmacro
>  
> +%macro BUTTERFLY0_AVX 5
> +    vshufps %4, %1, %1, %5
> +    vxorps  %1, %1, %2
> +    vaddps  %4, %4, %1
> +    vmulps  %1, %4, %3
> +%endmacro
> +
>  %macro BUTTERFLY2 4
>      BUTTERFLY0 %1, %2, %3, %4, 0x1b
>  %endmacro
> @@ -60,8 +77,199 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
>      BUTTERFLY0 %1, %2, %3, %4, 0xb1
>  %endmacro
>  
> -INIT_XMM
> +%macro PASS6_AND_PERMUTE 0
> +    mov         tmpd, [outq+4]
> +    movss         m7, [outq+72]
> +    addss         m7, [outq+76]
> +    movss         m3, [outq+56]
> +    addss         m3, [outq+60]
> +    addss         m4, m3
> +    movss         m2, [outq+52]
> +    addss         m2, m3
> +    movss         m3, [outq+104]
> +    addss         m3, [outq+108]
> +    addss         m1, m3
> +    addss         m5, m4
> +    movss [outq+ 16], m1
> +    movss         m1, [outq+100]
> +    addss         m1, m3
> +    movss         m3, [outq+40]
> +    movss [outq+ 48], m1
> +    addss         m3, [outq+44]
> +    movss         m1, [outq+100]
> +    addss         m4, m3
> +    addss         m3, m2
> +    addss         m1, [outq+108]
> +    movss [outq+ 40], m3
> +    addss         m2, [outq+36]
> +    movss         m3, [outq+8]
> +    movss [outq+ 56], m2
> +    addss         m3, [outq+12]
> +    movss [outq+ 32], m3
> +    movss         m3, [outq+80]
> +    movss [outq+  8], m5
> +    movss [outq+ 80], m1
> +    movss         m2, [outq+52]
> +    movss         m5, [outq+120]
> +    addss         m5, [outq+124]
> +    movss         m1, [outq+64]
> +    addss         m2, [outq+60]
> +    addss         m0, m5
> +    addss         m5, [outq+116]
> +    mov    [outq+64], tmpd
> +    addss         m6, m0
> +    addss         m1, m6
> +    mov         tmpd, [outq+12]
> +    mov   [outq+ 96], tmpd
> +    movss [outq+  4], m1
> +    movss         m1, [outq+24]
> +    movss [outq+ 24], m4
> +    movss         m4, [outq+88]
> +    addss         m4, [outq+92]
> +    addss         m3, m4
> +    addss         m4, [outq+84]
> +    mov         tmpd, [outq+108]
> +    addss         m1, [outq+28]
> +    addss         m0, m1
> +    addss         m1, m5
> +    addss         m6, m3
> +    addss         m3, m0
> +    addss         m0, m7
> +    addss         m5, [outq+20]
> +    addss         m7, m1
> +    movss [outq+ 12], m6
> +    mov   [outq+112], tmpd
> +    movss         m6, [outq+28]
> +    movss [outq+ 28], m0
> +    movss         m0, [outq+36]
> +    movss [outq+ 36], m7
> +    addss         m1, m4
> +    movss         m7, [outq+116]
> +    addss         m0, m2
> +    addss         m7, [outq+124]
> +    movss [outq+ 72], m0
> +    movss         m0, [outq+44]
> +    addss         m2, m0
> +    movss [outq+ 44], m1
> +    movss [outq+ 88], m2
> +    addss         m0, [outq+60]
> +    mov         tmpd, [outq+60]
> +    mov   [outq+120], tmpd
> +    movss [outq+104], m0
> +    addss         m4, m5
> +    addss         m5, [outq+68]
> +    movss  [outq+52], m4
> +    movss  [outq+60], m5
> +    movss         m4, [outq+68]
> +    movss         m5, [outq+20]
> +    movss [outq+ 20], m3
> +    addss         m5, m7
> +    addss         m7, m6
> +    addss         m4, m5
> +    movss         m2, [outq+84]
> +    addss         m2, [outq+92]
> +    addss         m5, m2
> +    movss [outq+ 68], m4
> +    addss         m2, m7
> +    movss         m4, [outq+76]
> +    movss [outq+ 84], m2
> +    movss [outq+ 76], m5
> +    addss         m7, m4
> +    addss         m6, [outq+124]
> +    addss         m4, m6
> +    addss         m6, [outq+92]
> +    movss [outq+100], m4
> +    movss [outq+108], m6
> +    movss         m6, [outq+92]
> +    movss  [outq+92], m7
> +    addss         m6, [outq+124]
> +    movss [outq+116], m6
> +%endmacro
> +
> +%define BUTTERFLY  BUTTERFLY_AVX
> +%define BUTTERFLY0 BUTTERFLY0_AVX
> +
> +INIT_YMM
>  section .text align=16
> +%ifdef HAVE_AVX
> +; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
> +cglobal dct32_float_avx, 2,3,8, out, in, tmp
> +    ; pass 1
> +    vmovaps     m4, [inq+0]

Unfortunately, it segfaults (again) exactly here with:

(gdb) bt full
#0  ff_dct32_float_avx () at libavcodec/x86/dct32_sse.asm:198
No locals.
#1  0x00000000006b6e13 in ff_mpa_synth_filter_float (s=0x18b3cf8, 
synth_buf_ptr=<value optimized out>, synth_buf_offset=0x18ae100, 
window=0x11d43e0, dither_state=0x18b3ce4, samples=<value optimized out>, 
incr=1, sb_samples=0x18ae110) at libavcodec/mpegaudiodsp_template.c:172
        synth_buf = 0x18ac100
        offset = <value optimized out>
#2  0x00000000006b5223 in mp_decode_frame (s=0x18abc80, samples=0x18b3d80, 
buf=<value optimized out>, buf_size=<value optimized out>) at 
libavcodec/mpegaudiodec.c:1775
        i = <value optimized out>
        nb_frames = 36
        ch = <value optimized out>
        samples_ptr = <value optimized out>
#3  0x00000000006b5d45 in decode_frame (avctx=0x1870c40, data=0x18b3d80, 
data_size=0x7fff6ad05cf8, avpkt=<value optimized out>) at 
libavcodec/mpegaudiodec.c:1832
        buf = 0x18717a0 "\377\373T\304"
        buf_size = 192
        s = 0x18abc80
        out_size = <value optimized out>
        out_samples = 0x18b3d80
#4  0x000000000077eb78 in avcodec_decode_audio3 (avctx=0x1870c40, 
samples=<value optimized out>, frame_size_ptr=<value optimized out>, 
avpkt=<value optimized out>) at libavcodec/utils.c:720
        ret = <value optimized out>
#5  0x0000000000432991 in output_packet (ist=<value optimized out>, 
ist_index=0, ost_table=0x1876980, nb_ostreams=1, pkt=<value optimized out>) at 
ffmpeg.c:1520
        data_buf = 0x18717a0 "\377\373T\304"
        decoded_data_buf = 0x0
        data_size = 192
        decoded_data_size = 0
        os = <value optimized out>
        ost = <value optimized out>
        ret = <value optimized out>
        i = <value optimized out>
        got_output = 0
        picture = {data = {0x2ad3be4d10c0 "p\033@", 0x5 <Address 0x5 out of 
bounds>, 0x7 <Address 0x7 out of bounds>, 0x2ad3bdd06f90 
"\350\002\207\275\323*"}, linesize = {-427471993, 0, -1117422261, 10963}, base 
= {0x7 <Address 0x7 out of bounds>, 0x39a152e <Address 0x39a152e out of 
bounds>, 0x7 <Address 0x7 out of bounds>, 0x7fff6ad05c30 "\377\377\377\377"}, 
key_frame = -1105984896, pict_type = 10963, pts = 47088915450976, 
coded_picture_number = 53, display_picture_number = 0, quality = -1105945408, 
age = 10963, reference = -1105985176, qscale_table = 0x2ad3be140c60 "", qstride 
= 24, mbskip_table = 0x2ad3be149630 "\037H", motion_val = {0x7fff6ad05a90, 
0x0}, mb_type = 0x2ad3bd8702e8, motion_subsample_log2 = 0 '\000', opaque = 
0x2ad3be4d1048, error = {47088911018432, 4201052, 47088915503104, 4196064}, 
type = 0, repeat_pict = 1, qscale_type = 1540, interlaced_frame = 1, 
top_field_first = -1105930240, pan_scan = 0x400380, palette_has_changed = 
-1115224512, buffer_hints = 10963, 
 dct_coeff = 0x7fff6ad05c70, ref_index = {0x2ad3be4d1048 "N\033@", 
0x7fff6ad05c90 ""}, reordered_opaque = 47088906207976, hwaccel_picture_private 
= 0xe6854b87, pkt_pts = 47088904011756, pkt_dts = 0, owner = 0x2ad3be4d1048, 
thread_opaque = 0x1}
        buffer_to_free = <value optimized out>
        samples_size = 192000
        subtitle = {format = 0, start_display_time = 2147483648, 
end_display_time = 25620000, num_rects = 0, rects = 0x0, pts = 140734985428496}
        subtitle_to_free = 0x0
        pkt_pts = <value optimized out>
        frame_available = <value optimized out>
        avpkt = {pts = 0, dts = 0, data = 0x18717a0 "\377\373T\304", size = 
192, stream_index = 0, flags = 1, side_data = 0x0, side_data_elems = 0, 
duration = 338688, destruct = 0x50e570 <av_destruct_packet>, priv = 0x0, pos = 
0, convergence_duration = 0}
        bps = 4
#6  0x00000000004353f6 in transcode (nb_output_files=1, nb_input_files=1, 
stream_maps=0x0, nb_stream_maps=0, input_files=0xce52e0, output_files=0xce4fc0) 
at ffmpeg.c:2646
        ist_index = 0
        pkt = {pts = 0, dts = 0, data = 0x18717a0 "\377\373T\304", size = 192, 
stream_index = 0, flags = 1, side_data = 0x0, side_data_elems = 0, duration = 
338688, destruct = 0x50e570 <av_destruct_packet>, priv = 0x0, pos = 0, 
convergence_duration = 0}
        file_index = <value optimized out>
        ipts_min = <value optimized out>
        opts_min = <value optimized out>
        ret = <value optimized out>
        i = <value optimized out>
        j = <value optimized out>
        k = <value optimized out>
        n = <value optimized out>
        nb_istreams = 1
        nb_ostreams = <value optimized out>
        is = <value optimized out>
        os = <value optimized out>
        codec = <value optimized out>
        icodec = <value optimized out>
        ost = <value optimized out>
        ost_table = 0x1876980
        ist = 0x186ec40
        ist_table = 0x188b520
        file_table = 0x18891c0
        error = "\000\000G", '\000' <repeats 13 times>, 
"\020g\320j\377\177\000\000@\000\000\000\000\000\000\000\001", '\000' <repeats 
23 times>, 
"@\006\207\275\323*\000\000\377\377\377\377\001\000\000\000X\026G\000\000\000\000\000\003\374\000\000\000\000\000\000\300\217L\276\323*\000\000\000\000\000\000\000\000\000\000\300\000\000\000\000\000\000\000\n\000\000\000\000\000\000\000pg\320j\377\177\000\000`g\320j\377\177",
 '\000' <repeats 34 times>, 
"G\023\222\000\000\000\000\000\200g\320j\377\177\000\000\220g\320j\377\177\000\000\220g\320j\377\177\000\000\240g\320j\377\177\000\000P\000\000\000\000\000\000\000\300\301L\276\323*\000\000\260g\320j\377\177\000\000\300g\320j\377\177\000\000\237n\223\000\000\000\000\000\365\241!\276\323*\000\000\340~\000\000\000\000\000\000\300\301L\276\323*\000\000`=\212\001\000\000\000\000\251\273\033\276\323*\000\000\320~\000\000\000\000\000\000\005s\033\276"...
        want_sdp = <value optimized out>
        no_packet = '\000' <repeats 99 times>
        no_packet_count = 0
#7  0x000000000043a295 in main (argc=<value optimized out>, argv=<value 
optimized out>) at ffmpeg.c:4457
No locals.
(gdb) 

:-(

-- 
Gruesse/greetings,
Reinhard Tartler, KeyID 945348A4
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to