from:"gramner via ffmpeg\-devel"

[FFmpeg-devel] [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc (PR #20358)

2025-08-27 Thread gramner via ffmpeg-devel

PR #20358 opened by gramner
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20358
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20358.patch

Checkasm numbers on Zen 5 (Strix Halo):
```
vp9_put_8tap_smooth_4h_8bpp_ssse3: 18.0
vp9_put_8tap_smooth_4h_8bpp_avx512icl:  7.9

vp9_put_8tap_smooth_4v_8bpp_ssse3: 18.0
vp9_put_8tap_smooth_4v_8bpp_avx512icl: 10.4

vp9_put_8tap_smooth_4hv_8bpp_ssse3:67.8
vp9_put_8tap_smooth_4hv_8bpp_avx512icl:14.7

vp9_put_8tap_smooth_8h_8bpp_ssse3: 28.0
vp9_put_8tap_smooth_8h_8bpp_avx512icl: 11.1

vp9_put_8tap_smooth_8v_8bpp_ssse3: 28.9
vp9_put_8tap_smooth_8v_8bpp_avx512icl: 15.1

vp9_put_8tap_smooth_8hv_8bpp_ssse3:79.2
vp9_put_8tap_smooth_8hv_8bpp_avx512icl:25.8

vp9_put_8tap_smooth_16h_8bpp_ssse3:78.7
vp9_put_8tap_smooth_16h_8bpp_avx512icl:27.3

vp9_put_8tap_smooth_16v_8bpp_ssse3:79.1
vp9_put_8tap_smooth_16v_8bpp_avx512icl:34.0

vp9_put_8tap_smooth_16hv_8bpp_ssse3:  199.8
vp9_put_8tap_smooth_16hv_8bpp_avx512icl:   70.2

vp9_put_8tap_smooth_32h_8bpp_avx2:151.5
vp9_put_8tap_smooth_32h_8bpp_avx512icl:81.6

vp9_put_8tap_smooth_32v_8bpp_avx2:148.0
vp9_put_8tap_smooth_32v_8bpp_avx512icl:   101.0

vp9_put_8tap_smooth_32hv_8bpp_avx2:   337.7
vp9_put_8tap_smooth_32hv_8bpp_avx512icl:  247.1

vp9_put_8tap_smooth_64h_8bpp_avx2:600.1
vp9_put_8tap_smooth_64h_8bpp_avx512icl:   259.9

vp9_put_8tap_smooth_64v_8bpp_avx2:590.6
vp9_put_8tap_smooth_64v_8bpp_avx512icl:   252.4

vp9_put_8tap_smooth_64hv_8bpp_avx2:  1343.4
vp9_put_8tap_smooth_64hv_8bpp_avx512icl:  938.1
```



>From 915710f2e618f0fcc7a9daacfbd2fcdb3797cf2f Mon Sep 17 00:00:00 2001
From: Henrik Gramner 
Date: Wed, 27 Aug 2025 22:51:08 +0200
Subject: [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc

---
 libavcodec/x86/vp9dsp_init.c |  12 +
 libavcodec/x86/vp9dsp_init.h |  31 ++
 libavcodec/x86/vp9mc.asm | 866 +++
 tests/checkasm/vp9dsp.c  |   6 +-
 4 files changed, 912 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 4373fa3f04..15862f43bf 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -50,6 +50,13 @@ decl_mc_funcs(8, ssse3, int8_t, 32, 8);
 #if ARCH_X86_64
 decl_mc_funcs(16, ssse3, int8_t, 32, 8);
 decl_mc_funcs(32, avx2, int8_t, 32, 8);
+#if HAVE_AVX512ICL_EXTERNAL
+decl_subpel_asm( 4, 8, avx512icl);
+decl_subpel_asm( 8, 8, avx512icl);
+decl_subpel_asm(16, 8, avx512icl);
+decl_subpel_asm(32, 8, avx512icl);
+decl_subpel_asm(64, 8, avx512icl);
+#endif
 #endif
 
 mc_rep_funcs(16,  8,  8,  sse2, int16_t,  8, 8)
@@ -418,6 +425,11 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int 
bpp, int bitexact)
 dsp->itxfm_add[TX_32X32][ADST_DCT]  =
 dsp->itxfm_add[TX_32X32][DCT_ADST]  =
 dsp->itxfm_add[TX_32X32][DCT_DCT]   = 
ff_vp9_idct_idct_32x32_add_avx512icl;
+init_subpel_asm(4,  4, 8, avx512icl);
+init_subpel_asm(3,  8, 8, avx512icl);
+init_subpel_asm(2, 16, 8, avx512icl);
+init_subpel_asm(1, 32, 8, avx512icl);
+init_subpel_asm(0, 64, 8, avx512icl);
 }
 #endif
 
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index fc1e0557fa..5690d16970 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -172,6 +172,37 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
 init_subpel3_8to64(idx, type, bpp, opt); \
 init_subpel2(4, idx,  4, type, bpp, opt)
 
+#define decl_subpel_asm_fn(type) \
+void ff_vp9_put_8tap_##type(uint8_t *dst, ptrdiff_t dst_stride, \
+const uint8_t *src, ptrdiff_t src_stride, \
+int h, int mx, int my); \
+void ff_vp9_avg_8tap_##type(uint8_t *dst, ptrdiff_t dst_stride, \
+const uint8_t *src, ptrdiff_t src_stride, \
+int h, int mx, int my)
+
+#define decl_subpel_asm_dir(type) \
+decl_subpel_asm_fn(regular_##type); \
+decl_subpel_asm_fn(smooth_##type); \
+decl_subpel_asm_fn(sharp_##type)
+
+#define decl_subpel_asm(sz, bpp, opt) \
+decl_subpel_asm_dir(sz##h_##bpp##_##opt); \
+decl_subpel_asm_dir(sz##v_##bpp##_##opt); \
+decl_subpel_asm_dir(sz##hv_##bpp##_##opt)
+
+#define init_subpel_asm_dir(idx1, idx2, idx3, type) \
+dsp->mc[idx1][FILTER_8TAP_REGULAR][0][idx2][idx3] = 
ff_vp9_put_8tap_regular_##type; \
+dsp->mc[idx1][FILTER_8TAP_SHARP  ][0][idx2][idx3] = 
ff_vp9_put_8tap_sharp_##type; \
+dsp->mc[idx1][FILTER_8TAP_SMOOTH ][0][idx2][idx3] = 
ff_vp9_put_8tap_smooth_##type; \
+dsp->mc[idx1][FILTER_8TAP_REGULAR][1][idx2][idx3] = 
ff_vp9_avg_8tap_regular_##type; \
+dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][idx2][idx3] = 
ff_vp9_avg_8tap_sharp_##type; \
+dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][idx2][idx3] = 
ff_vp9_avg_8tap_smooth_##type
+
+#define

Re: [FFmpeg-devel] [PATCH v2] checkasm: add sample argument to adjust during bench

2024-05-21 Thread Henrik Gramner via ffmpeg-devel

On Tue, May 21, 2024 at 2:33 PM J. Dekker  wrote:
> @@ -338,8 +338,9 @@ typedef struct CheckasmPerf {
>  uint64_t tsum = 0;\
>  int ti, tcount = 0;\
>  uint64_t t = 0; \
> +const uint64_t truns = bench_runs;\
>  checkasm_set_signal_handler_state(1);\
> -for (ti = 0; ti < BENCH_RUNS; ti++) {\
> +for (ti = 0; ti < truns; ti++) {\

This is comparing int with uint64_t. We should probably just use int
for the sample count too.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/4] avutil/x86/pixelutils: Empty MMX state in ff_pixelutils_sad_8x8_mmxext

2023-11-01 Thread Henrik Gramner via ffmpeg-devel

On Wed, Nov 1, 2023 at 10:44 AM Andreas Rheinhardt
 wrote:
>  libavutil/x86/pixelutils.asm | 1 +
>  1 file changed, 1 insertion(+)

IIRC the emms instruction is quite slow on many systems, so if this
is the only pixelutils function using mmx it's probably better to just
rewrite it to use SSE2 instead (even if that means only using the
lower half of xmm registers).
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/3] x86/ac3dsp: add ff_float_to_fixed24_avx2()

2023-11-23 Thread Henrik Gramner via ffmpeg-devel

On Thu, Nov 23, 2023 at 12:51 PM James Almer  wrote:
> movdqa with ymm is avx2. I could change it to movaps, but technically
> the registers contain floats and i don't know if any old AVX cpu has
> penalties for changing domains.

Fwiw I believe what domain the result of fp <-> int conversion
instructions belongs to actually differs between µarchs. Realistically
whether movaps or movdqa is used to store the result to memory is
unlikely to matter in practice though.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2] avcodec/amfenc: increase precision of Sleep() on Windows

2023-11-27 Thread Henrik Gramner via ffmpeg-devel

On Mon, Nov 27, 2023 at 2:42 PM Mark Thompson  wrote:
> Is it reasonable to set this global state from a library without the parent 
> program knowing?  We'd really prefer not to affect the global state 
> unexpectedly.

CreateWaitableTimerExW() with the
CREATE_WAITABLE_TIMER_HIGH_RESOLUTION flag might be an alternative?
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avformat/mov_chan: Use anonymous union

2024-03-25 Thread Henrik Gramner via ffmpeg-devel

On Mon, Mar 25, 2024 at 4:01 PM Andreas Rheinhardt
 wrote:
>
> Right, it is an anonymous enum, not union. Amended locally.
>
> - Andreas

Can confirm this eliminates the warnings, lgtm.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec/x86/h264_idct: Fix incorrect xmm spilling on win64

2024-03-25 Thread Henrik Gramner via ffmpeg-devel

On Sun, Mar 24, 2024 at 8:21 PM Henrik Gramner  wrote:
>
> Broken in afa471d0efed1df5dca6eeeb2fcdd211ae4cad4e. It just happened
> to work before due to x86inc.asm previously performing XMM spills in
> INIT_MMX mode which was more of a bug than an intentional feature.

Will apply.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [GASPP PATCH] Implicitly start out in the text section for armasm

2024-04-04 Thread Henrik Gramner via ffmpeg-devel

On Wed, Apr 3, 2024 at 3:47 PM Martin Storsjö  wrote:
>
> This fixes assembling files starting with bare symbol declarations,
> without explicitly switching to .text first.

lgtm.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavf/vsrc_ddagrab: WinAPI functions must be called as stdcall in x86_32

2024-04-07 Thread Henrik Gramner via ffmpeg-devel

On Sun, Apr 7, 2024 at 2:59 AM Vadim Guchenko  wrote:
> +typedef DPI_AWARENESS_CONTEXT (__stdcall 
> *set_thread_dpi_t)(DPI_AWARENESS_CONTEXT);

I believe most existing code uses WINAPI instead of __stdcall.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 2/5] ffbuild/libversion.sh: add shebang

2024-04-09 Thread Henrik Gramner via ffmpeg-devel

On Tue, Apr 9, 2024 at 11:52 PM Marth64  wrote:
> > +#!/bin/sh
> Might I suggest `#!/usr/bin/env sh` instead for this case?
> I tend to prefer it from a portability and usability perspective,
> but I can imagine for sh it might not matter.

/bin/sh exists on virtually every *NIX system whereas /usr/bin/env is
not ubiquitous, so that seems like a terrible idea that would achieve
the opposite result.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow

2024-02-25 Thread Henrik Gramner via ffmpeg-devel

On Sun, Feb 25, 2024 at 5:42 PM Ronald S. Bultje  wrote:
> +movam13, [pw_8]
> +paddw   m10, m12, m12
> +paddw   m12, m10 ; 9 * (q0 - p0) - 3 * ( q1 - p1 )
>  paddw   m12, m13; + 8

Memory operand

> +paddw   m10, m13, m13
> +paddw   m13, m10 ; abs(9 * (q0 - p0) - 3 * ( q1 - p1 ))
> +paddw   m13, [pw_8]
[...]
> +paddw   m13, m12, m12
> +paddw   m13, m12 ; 3*abs(m12)
> +paddw   m13, [pw_8]

Another minor improvement would be to reorder the adds like (x + x) +
(x + 8) instead of ((x + x) + x) + 8 to allow for more
instruction-level parallelism.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] libavcodec/h264pred: Remove pred8x8_horizontal_8_mmxext

2024-03-02 Thread Henrik Gramner via ffmpeg-devel

On Sat, Mar 2, 2024 at 10:13 PM Kieran Kunhya  wrote:
>  SPLATB_LOAD m0, r0+r1*0-1, m2
>  SPLATB_LOAD m1, r0+r1*1-1, m2

This adds an extra unnecessary shuffle in the SSE2 code as it splats
to a full register. The easiest way of fixing it would probably be to
unroll the macro and manually get rid of it.

Although on x86-64 it might be faster to do a 1->8 byte splat using a
GPR multiply with 0x0101010101010101.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

[FFmpeg-devel] [PATCH] x86: Update x86inc.asm

2024-03-16 Thread Henrik Gramner via ffmpeg-devel

Makes things up-to-date with the upstream at
https://code.videolan.org/videolan/x86inc.asm

Specifying every individual change is difficult as there have been
divergences and cherry-picks over time, but the full upstream change
log can be found at
https://code.videolan.org/videolan/x86inc.asm/-/commits/master?ref_type=heads

Tested and passes FATE on Linux (x86-64), macOS (x86-64), and Windows
(x86-64 and x86-32). The macOS system with AVX2, the rest with
AVX512ICL.

The following local changes compared to upstream remain (is the OS/2
part still relevant?):
@@ -37,2 +37,5 @@

+%if HAVE_ALIGNED_STACK
+%define STACK_ALIGNMENT 16
+%endif
 %ifndef STACK_ALIGNMENT
@@ -86,4 +89,11 @@

+; aout does not support align=
+; NOTE: This section is out of sync with x264, in order to
+; keep supporting OS/2.
 %macro SECTION_RODATA 0-1 16
-%ifidn __OUTPUT_FORMAT__,win32
+%ifidn __OUTPUT_FORMAT__,aout
+SECTION .text
+%elifidn __OUTPUT_FORMAT__,coff
+SECTION .text
+%elifidn __OUTPUT_FORMAT__,win32
 SECTION .rdata align=%1


x86_update_x86inc.patch
Description: Binary data
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

[FFmpeg-devel] [PATCH] avutil/x86util: Fix broken pre-SSE4.1 PMINSD emulation

2024-03-17 Thread Henrik Gramner via ffmpeg-devel

Fixes yadif-16 which allows FATE to pass.

Broken since 2904db90458a1253e4aea6844ba9a59ac11923b6 (2017).


pminsd_emulation.patch
Description: Binary data
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avutil/x86util: Fix broken pre-SSE4.1 PMINSD emulation

2024-03-17 Thread Henrik Gramner via ffmpeg-devel

On Sun, Mar 17, 2024 at 1:44 PM James Almer  wrote:
> LGTM. I wonder why we even added a float based fallback for this.

Thanks, pushed.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] x86: Update x86inc.asm

2024-03-19 Thread Henrik Gramner via ffmpeg-devel

On Sat, Mar 16, 2024 at 8:53 PM Henrik Gramner  wrote:
> Makes things up-to-date with the upstream at
> https://code.videolan.org/videolan/x86inc.asm

Will push in a few days if there are no comments.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] x86: Update x86inc.asm

2024-03-24 Thread Henrik Gramner via ffmpeg-devel

On Tue, Mar 19, 2024 at 11:20 AM Henrik Gramner  wrote:
>
> Will push in a few days if there are no comments.

Pushed.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

[FFmpeg-devel] [PATCH] avcodec/x86/h264_idct: Fix incorrect xmm spilling on win64

2024-03-24 Thread Henrik Gramner via ffmpeg-devel

Broken in afa471d0efed1df5dca6eeeb2fcdd211ae4cad4e. It just happened
to work before due to x86inc.asm previously performing XMM spills in
INIT_MMX mode which was more of a bug than an intentional feature.


x86_h264_idct_spill_xmm.patch
Description: Binary data
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] x86inc: Add REPX macro to repeat instructions/operations

2023-10-01 Thread Henrik Gramner via ffmpeg-devel

On Fri, Sep 29, 2023 at 1:38 PM Frank Plowman  wrote:
>  libavutil/x86/x86inc.asm | 10 ++
>  1 file changed, 10 insertions(+)

LGTM.

As a side note https://code.videolan.org/videolan/x86inc.asm is the
upstream repo for x86inc.asm.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: Generalize crash handling

2023-12-21 Thread Henrik Gramner via ffmpeg-devel

On Tue, Dec 19, 2023 at 1:02 PM Martin Storsjö  wrote:
> This replaces the riscv specific handling from
> 7212466e735aa187d82f51dadbce957fe3da77f0 (which essentially is
> reverted, together with 286d6742218ba0235c32876b50bf593cb1986353)
> with a different implementation of the same (plus a bit more), based
> on the corresponding feature in dav1d's checkasm, supporting both Unix
> and Windows.

lgtm
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: Generalize crash handling

2023-12-21 Thread Henrik Gramner via ffmpeg-devel

On Thu, Dec 21, 2023 at 9:16 PM Rémi Denis-Courmont  wrote:
> > +checkasm_fail_func("%s",
> > +   s == SIGFPE ? "fatal arithmetic error" :
> > +   s == SIGILL ? "illegal instruction" :
> > +   s == SIGBUS ? "bus error" :
> > + "segmentation fault");
>
> The current code for the error print-out is both simpler and more versatile,
> so I don't get this.

IMO "illegal instruction" is a far better error message than "fatal
signal 4" (with an implementation-defined number which nobody knows
the meaning of without having to look it up).

> > +/* fall back to the default signal handler */
> > +static const struct sigaction default_sa = { .sa_handler = SIG_DFL
> > }; +sigaction(s, &default_sa, NULL);
> > +raise(s);
>
> Why raise here? Returning from the handler will reevaluate the same code with
> the same thread state, and trigger the default signal handler anyway (since
> you don't modify the user context).

No it won't, it'll get stuck in an infinite loop invoking the signal
handler over and over. At least on my system.

> > +const struct sigaction sa = {
> > +.sa_handler = signal_handler,
> > +.sa_flags = SA_NODEFER,
>
> That does not look very sane to me. If a recursive signal occurs, processing
> it recursively is NOT a good idea. This would cause an infinite loop,
> eventually a literal stack overflow after maxing out the processor for a 
> while.
> I'd rather let the OS kernel deal with the problem, by killing the process or
> whatever the last resort is.
>
> > +#define checkasm_save_context() setjmp(checkasm_context_buf)
> > +#define checkasm_load_context() longjmp(checkasm_context_buf, 1)
> > +#endif
>
> Been there done that and it did not end well.
> sigsetjmp() & co are necessary here.

For all intents and purposes setjmp()/longjmp() with SA_NODEFER does
the same thing as sigsetjmp()/siglongjmp() without SA_NODEFER for this
particular use case (no infinite recursion is possible the way the
code is written). The change isn't necessary per se but it seems
reasonable and I have no objections to it.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: Generalize crash handling

2023-12-22 Thread Henrik Gramner via ffmpeg-devel

On Fri, Dec 22, 2023 at 7:20 AM Rémi Denis-Courmont  wrote:
> >> > +checkasm_fail_func("%s",
> >> > +   s == SIGFPE ? "fatal arithmetic error" :
> >> > +   s == SIGILL ? "illegal instruction" :
> >> > +   s == SIGBUS ? "bus error" :
> >> > + "segmentation fault");
> >>
> >> The current code for the error print-out is both simpler and more 
> >> versatile,
> >> so I don't get this.
> >
> >IMO "illegal instruction" is a far better error message than "fatal
> >signal 4" (with an implementation-defined number which nobody knows
> >the meaning of without having to look it up).
>
> The current code prints the number and the name.

Not on all supported systems. And even when it does it's in an
implementation-defined and locale-dependent form, which isn't great.

> >> +const struct sigaction sa = {
> >> +.sa_handler = signal_handler,
> >> +.sa_flags = SA_NODEFER,
> >
> >That does not look very sane to me. If a recursive signal occurs, processing
> >it recursively is NOT a good idea.
>
> Following that, it actually seems safer to automatically reset the handler, 
> using `signal()` or equivalently passing the `SA_RESETHAND` flag. Then the 
> handler can rearm its own self if and *only* if it was able to actually 
> handle the signal by observing a long jump. Resetting to default explicitly 
> is no longer useful then.

Sure, that approach sounds fine.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 3/3] avfilter/vf_colordetect: add x86 SIMD implementation

2025-07-16 Thread Henrik Gramner via ffmpeg-devel

On Wed, Jul 16, 2025 at 6:26 PM Niklas Haas  wrote:
> +cglobal detect_range%1, 6, 7, 5, data, stride, width, height, mpeg_min, 
> mpeg_max, x
> +movd xm0, mpeg_mind
> +movd xm1, mpeg_maxd
> +vpbroadcast%1 m0, xm0
> +vpbroadcast%1 m1, xm1

You could perhaps also do something like the following to shave off a
few instructions:

cglobal detect_range%1, 4, 7, 5, data, stride, width, height,
mpeg_min, mpeg_max, x
%if UNIX64 && notcpuflag(avx512)
movd xm0, mpeg_mind
movd xm1, mpeg_maxd
vpbroadcast%1 m0, xm0
vpbroadcast%1 m1, xm1
%else
vpbroadcast%1 m0, mpeg_minm
vpbroadcast%1 m1, mpeg_maxm
%endif
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

[FFmpeg-devel] [PATCH] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 10bpc inverse transforms

2025-05-21 Thread Henrik Gramner via ffmpeg-devel

Tested to pass FATE on Linux and Windows.

Checkasm numbers vs the existing SSE2 code on Zen 5 (Strix Halo):
vp9_inv_adst_adst_16x16_sub16_add_10_sse2:   1041.8 ( 1.92x)
vp9_inv_adst_adst_16x16_sub16_add_10_avx512icl:   132.5 (15.06x)

vp9_inv_dct_adst_16x16_sub16_add_10_sse2: 901.0 ( 1.98x)
vp9_inv_dct_adst_16x16_sub16_add_10_avx512icl:120.8 (14.79x)

vp9_inv_dct_dct_16x16_sub16_add_10_sse2:  750.6 ( 2.10x)
vp9_inv_dct_dct_16x16_sub16_add_10_avx512icl: 110.9 (14.18x)

vp9_inv_dct_dct_32x32_sub32_add_10_sse2: 3922.6 ( 2.24x)
vp9_inv_dct_dct_32x32_sub32_add_10_avx512icl: 506.6 (17.37x)


vp9_itx_10_avx512.patch
Description: Binary data
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

[FFmpeg-devel] [PATCH] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms

2025-05-16 Thread Henrik Gramner via ffmpeg-devel

Placed in a new separate file as the existing combined MMX/SSE/AVX
file is humongous and takes forever to assemble as is.

This adds ~16 KiB of .text. The existing 8bpc asm is ~240 KiB of which
the corresponding AVX2 functions make up ~42 KiB.

Tested to pass FATE on Linux and Windows.

Checkasm numbers vs AVX2 on Zen 5 (Strix Halo):
  vp9_inv_adst_adst_16x16_sub16_add_8_avx2:209.3
  vp9_inv_adst_adst_16x16_sub16_add_8_avx512icl:99.5

  vp9_inv_adst_dct_16x16_sub16_add_8_avx2: 165.2
  vp9_inv_adst_dct_16x16_sub16_add_8_avx512icl: 89.7

  vp9_inv_dct_adst_16x16_sub16_add_8_avx2: 165.9
  vp9_inv_dct_adst_16x16_sub16_add_8_avx512icl: 87.7

  vp9_inv_dct_dct_16x16_sub16_add_8_avx2:  121.3
  vp9_inv_dct_dct_16x16_sub16_add_8_avx512icl:  79.2

  vp9_inv_dct_dct_32x32_sub32_add_8_avx2:  745.5
  vp9_inv_dct_dct_32x32_sub32_add_8_avx512icl: 285.5


vp9_itx_avx512.patch
Description: Binary data
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms

2025-05-19 Thread Henrik Gramner via ffmpeg-devel

On Sat, May 17, 2025 at 12:59 AM Henrik Gramner  wrote:
>
> Placed in a new separate file as the existing combined MMX/SSE/AVX
> file is humongous and takes forever to assemble as is.
>
> This adds ~16 KiB of .text. The existing 8bpc asm is ~240 KiB of which
> the corresponding AVX2 functions make up ~42 KiB.
>
> Tested to pass FATE on Linux and Windows.

Pushed.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 10bpc inverse transforms

2025-05-26 Thread Henrik Gramner via ffmpeg-devel

On Wed, May 21, 2025 at 5:48 PM Henrik Gramner  wrote:
>
> Tested to pass FATE on Linux and Windows.

Pushed.
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

[FFmpeg-devel] [PATCH] vp9: Improve 8bpc AVX2 inverse transform asm (PR #20526)

2025-09-15 Thread Henrik Gramner via ffmpeg-devel

PR #20526 opened by Henrik Gramner (gramner)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20526
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20526.patch

This adds size-efficient AVX2 implementations for all the inverse transform 
types (replacing a few fully unrolled existing ones in the process). This 
reduces code size by ~27 kB despite supporting more sizes.

The existing AVX implementations are also removed as they serve very little 
purpose now. This reduces code size by another ~65 kB.

The time to compile `vp9itxfm.asm` (which was kind of an extreme outlier 
before) is reduced to less than half, with the AVX2 code split out to a 
separate file.

Checkasm numbers on Zen 4:
```
                                              old    new
vp9_inv_adst_adst_4x4_sub4_add_8_ssse3:  29.1
vp9_inv_adst_adst_4x4_sub4_add_8_avx2:N/A   22.5

vp9_inv_dct_dct_4x4_sub4_add_8_ssse3:26.2
vp9_inv_dct_dct_4x4_sub4_add_8_avx2:  N/A   16.6

vp9_inv_adst_adst_8x8_sub8_add_8_ssse3: 105.2
vp9_inv_adst_adst_8x8_sub8_add_8_avx2:N/A   62.3

vp9_inv_dct_dct_8x8_sub8_add_8_ssse3:55.7
vp9_inv_dct_dct_8x8_sub8_add_8_avx2:  N/A   47.1

vp9_inv_adst_adst_16x16_sub16_add_8_ssse3:  526.9
vp9_inv_adst_adst_16x16_sub16_add_8_avx2:   261.5  225.8

vp9_inv_dct_dct_16x16_sub8_add_8_ssse3: 142.4
vp9_inv_dct_dct_16x16_sub8_add_8_avx2:  163.3   89.0
vp9_inv_dct_dct_16x16_sub16_add_8_ssse3:305.6
vp9_inv_dct_dct_16x16_sub16_add_8_avx2: 163.3  163.2

vp9_inv_dct_dct_32x32_sub16_add_8_ssse3:893.5
vp9_inv_dct_dct_32x32_sub16_add_8_avx2: 465.2  462.2
vp9_inv_dct_dct_32x32_sub32_add_8_ssse3:   1760.7
vp9_inv_dct_dct_32x32_sub32_add_8_avx2: 903.7  879.9

vp9_inv_wht_wht_4x4_sub4_add_8_mmx:  16.7
vp9_inv_wht_wht_4x4_sub4_add_8_avx2:  N/A   14.8

```


>From 41bf9b5cfcd97cbab5a4554101397b0b31bf0ee1 Mon Sep 17 00:00:00 2001
From: Henrik Gramner 
Date: Mon, 15 Sep 2025 14:11:57 +0200
Subject: [PATCH 1/2] vp9: Add 8bpc AVX2 asm for inverse transforms

---
 libavcodec/x86/Makefile  |1 +
 libavcodec/x86/vp9dsp_init.c |   15 +
 libavcodec/x86/vp9itxfm.asm  |  369 +--
 libavcodec/x86/vp9itxfm_avx2.asm | 1640 ++
 4 files changed, 1666 insertions(+), 359 deletions(-)
 create mode 100644 libavcodec/x86/vp9itxfm_avx2.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index ebd2bdb310..461753c2fe 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -186,6 +186,7 @@ X86ASM-OBJS-$(CONFIG_VP6_DECODER)  += x86/vp6dsp.o
 X86ASM-OBJS-$(CONFIG_VP9_DECODER)  += x86/vp9intrapred.o\
   x86/vp9intrapred_16bpp.o  \
   x86/vp9itxfm.o\
+  x86/vp9itxfm_avx2.o   \
   x86/vp9itxfm_avx512.o \
   x86/vp9itxfm_16bpp.o  \
   x86/vp9itxfm_16bpp_avx512.o   \
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index bbabcf38c3..a1e47445a8 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -108,9 +108,11 @@ itxfm_func(idct,  iadst, 4, sse2);
 itxfm_func(iadst, idct,  4, sse2);
 itxfm_func(iadst, iadst, 4, sse2);
 itxfm_funcs(4, ssse3);
+itxfm_funcs(4, avx2);
 itxfm_funcs(8, sse2);
 itxfm_funcs(8, ssse3);
 itxfm_funcs(8, avx);
+itxfm_funcs(8, avx2);
 itxfm_funcs(16, sse2);
 itxfm_funcs(16, ssse3);
 itxfm_funcs(16, avx);
@@ -118,6 +120,7 @@ itxfm_func(idct, idct, 32, sse2);
 itxfm_func(idct, idct, 32, ssse3);
 itxfm_func(idct, idct, 32, avx);
 itxfm_func(iwht, iwht, 4, mmx);
+itxfm_func(iwht, iwht, 4, avx2);
 itxfm_funcs(16, avx2);
 itxfm_funcs(16, avx512icl);
 itxfm_func(idct, idct, 32, avx2);
@@ -392,6 +395,18 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int 
bpp, int bitexact)
 init_fpel_func(0, 1, 64, avg, _8, avx2);
 if (ARCH_X86_64) {
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+dsp->itxfm_add[4 /* lossless */][ADST_ADST] = 
ff_vp9_iwht_iwht_4x4_add_avx2;
+dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_avx2;
+dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_avx2;
+dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_avx2;
+dsp->itxfm_add[TX_4X4][ADST_ADST] = 
ff_vp9_iadst_iadst_4x4_add_avx2;
+dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx2;
+dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_avx2;
+dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_avx2;
+dsp->itxfm_add[TX_8X8

[FFmpeg-devel] [PATCH] vp9: Add 8bpc intra prediction AVX2 asm (PR #20386)

2025-09-01 Thread Henrik Gramner via ffmpeg-devel

PR #20386 opened by Henrik Gramner (gramner)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20386
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20386.patch

A few of the most basic variants had existing AVX2 implementations since 
before. Those were rewritten to reduce code size.

Checkasm numbers on Zen 5 (Strix Halo):
```
vp9_dc_32x32_8bpp_ssse3: 24.2
vp9_dc_32x32_8bpp_avx2:  10.3

vp9_dc_left_32x32_8bpp_ssse3:23.6
vp9_dc_left_32x32_8bpp_avx2:  9.9

vp9_dc_top_32x32_8bpp_ssse3: 22.9
vp9_dc_top_32x32_8bpp_avx2:  10.0

vp9_diag_downleft_32x32_8bpp_avx:28.5
vp9_diag_downleft_32x32_8bpp_avx2:   13.5

vp9_diag_downright_32x32_8bpp_avx:   35.0
vp9_diag_downright_32x32_8bpp_avx2:  17.0

vp9_hor_32x32_8bpp_avx:  22.3
vp9_hor_32x32_8bpp_avx2: 11.1

vp9_hor_down_32x32_8bpp_avx: 27.5
vp9_hor_down_32x32_8bpp_avx2:19.8

vp9_hor_up_32x32_8bpp_avx:   26.0
vp9_hor_up_32x32_8bpp_avx2:  16.0

vp9_tm_32x32_8bpp_avx:   97.9
vp9_tm_32x32_8bpp_avx2:  23.6

vp9_vert_32x32_8bpp_sse: 20.8
vp9_vert_32x32_8bpp_avx2: 8.9

vp9_vert_left_32x32_8bpp_avx:28.1
vp9_vert_left_32x32_8bpp_avx2:   15.2

vp9_vert_right_32x32_8bpp_avx:   32.0
vp9_vert_right_32x32_8bpp_avx2:  21.3
```



>From ce6ff1b6229f2346e3caee18efbe36e794a94c6d Mon Sep 17 00:00:00 2001
From: Henrik Gramner 
Date: Mon, 1 Sep 2025 02:03:00 +0200
Subject: [PATCH] vp9: Add 8bpc intra prediction AVX2 asm

---
 libavcodec/x86/vp9dsp_init.c|  13 +-
 libavcodec/x86/vp9intrapred.asm | 467 +---
 2 files changed, 309 insertions(+), 171 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 9836b3321c..bbabcf38c3 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -207,11 +207,8 @@ ipred_dir_tm_h_funcs(8, avx);
 ipred_dir_tm_h_funcs(16, avx);
 ipred_dir_tm_h_funcs(32, avx);
 
-ipred_func(32, v, avx);
-
-ipred_dc_funcs(32, avx2);
-ipred_func(32, h, avx2);
-ipred_func(32, tm, avx2);
+ipred_all_funcs(32, avx2);
+ipred_func(32, v, avx2);
 
 #undef ipred_func
 #undef ipred_dir_tm_h_funcs
@@ -388,7 +385,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 if (EXTERNAL_AVX_FAST(cpu_flags)) {
 init_fpel_func(1, 0, 32, put, , avx);
 init_fpel_func(0, 0, 64, put, , avx);
-init_ipred(32, avx, v, VERT);
 }
 
 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
@@ -408,9 +404,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 init_subpel3_32_64(1, avg, 8, avx2);
 #endif
 }
-init_dc_ipred(32, avx2);
-init_ipred(32, avx2, h,  HOR);
-init_ipred(32, avx2, tm, TM_VP8);
+init_all_ipred(32, avx2);
+init_ipred(32, avx2, v, VERT);
 }
 
 #if ARCH_X86_64
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
index 31f7d449fd..b67addd7e3 100644
--- a/libavcodec/x86/vp9intrapred.asm
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -2,6 +2,7 @@
 ;* VP9 Intra prediction SIMD optimizations
 ;*
 ;* Copyright (c) 2013 Ronald S. Bultje 
+;* Copyright (c) 2025 Two Orioles, LLC
 ;*
 ;* Parts based on:
 ;* H.264 intra prediction asm optimizations
@@ -230,40 +231,6 @@ DC_16to32_FUNCS
 INIT_XMM ssse3
 DC_16to32_FUNCS
 
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
-movam0, [lq]
-movam1, [aq]
-DEFINE_ARGS dst, stride, stride3, cnt
-lea   stride3q, [strideq*3]
-pxorm2, m2
-psadbw  m0, m2
-psadbw  m1, m2
-paddw   m0, m1
-vextracti128   xm1, m0, 1
-paddw  xm0, xm1
-movhlpsxm1, xm0
-paddw  xm0, xm1
-pmulhrsw   xm0, [pw_512]
-vpbroadcastbm0, xm0
-mov   cntd, 4
-.loop:
-mova  [dstq+strideq*0], m0
-mova  [dstq+strideq*1], m0
-mova  [dstq+strideq*2], m0
-mova  [dstq+stride3q ], m0
-lea   dstq, [dstq+strideq*4]
-mova  [dstq+strideq*0], m0
-mova  [dstq+strideq*1], m0
-mova  [dstq+strideq*2], m0
-mova  [dstq+stride3q ], m0
-lea   dstq, [dstq+strideq*4]
-dec   cntd
-jg .loop
-RET
-%endif
-
 ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
 
 %macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
@@ -395,44 +362,6 @@ INIT_XMM ssse3
 DC_1D_16to32_FUNCS top,  a
 DC_1D_16to32_FUNCS left, l
 
-%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
-%if HAVE_AVX2_EXTERNAL
-cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
-movam0, [%2q]
-DEFINE_ARGS dst, stride,

[FFmpeg-devel] [PATCH] vp9: Add AVX-512ICL asm for 8bpc subpel mc (PR #20358)

Re: [FFmpeg-devel] [PATCH v2] checkasm: add sample argument to adjust during bench

Re: [FFmpeg-devel] [PATCH 1/4] avutil/x86/pixelutils: Empty MMX state in ff_pixelutils_sad_8x8_mmxext

Re: [FFmpeg-devel] [PATCH 2/3] x86/ac3dsp: add ff_float_to_fixed24_avx2()

Re: [FFmpeg-devel] [PATCH v2] avcodec/amfenc: increase precision of Sleep() on Windows

Re: [FFmpeg-devel] [PATCH] avformat/mov_chan: Use anonymous union

Re: [FFmpeg-devel] [PATCH] avcodec/x86/h264_idct: Fix incorrect xmm spilling on win64

Re: [FFmpeg-devel] [GASPP PATCH] Implicitly start out in the text section for armasm

Re: [FFmpeg-devel] [PATCH] lavf/vsrc_ddagrab: WinAPI functions must be called as stdcall in x86_32

Re: [FFmpeg-devel] [PATCH v3 2/5] ffbuild/libversion.sh: add shebang

Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow

Re: [FFmpeg-devel] [PATCH] libavcodec/h264pred: Remove pred8x8_horizontal_8_mmxext

[FFmpeg-devel] [PATCH] x86: Update x86inc.asm

[FFmpeg-devel] [PATCH] avutil/x86util: Fix broken pre-SSE4.1 PMINSD emulation

Re: [FFmpeg-devel] [PATCH] avutil/x86util: Fix broken pre-SSE4.1 PMINSD emulation

Re: [FFmpeg-devel] [PATCH] x86: Update x86inc.asm

Re: [FFmpeg-devel] [PATCH] x86: Update x86inc.asm

[FFmpeg-devel] [PATCH] avcodec/x86/h264_idct: Fix incorrect xmm spilling on win64

Re: [FFmpeg-devel] [PATCH] x86inc: Add REPX macro to repeat instructions/operations

Re: [FFmpeg-devel] [PATCH] checkasm: Generalize crash handling

Re: [FFmpeg-devel] [PATCH] checkasm: Generalize crash handling

Re: [FFmpeg-devel] [PATCH] checkasm: Generalize crash handling

Re: [FFmpeg-devel] [PATCH v2 3/3] avfilter/vf_colordetect: add x86 SIMD implementation

[FFmpeg-devel] [PATCH] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 10bpc inverse transforms

[FFmpeg-devel] [PATCH] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms

Re: [FFmpeg-devel] [PATCH] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms

Re: [FFmpeg-devel] [PATCH] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 10bpc inverse transforms

[FFmpeg-devel] [PATCH] vp9: Improve 8bpc AVX2 inverse transform asm (PR #20526)

[FFmpeg-devel] [PATCH] vp9: Add 8bpc intra prediction AVX2 asm (PR #20386)

29 matches

Site Navigation

Mail list logo

Footer information