Re: [FFmpeg-devel] [PATCH 1/3] swscale/x86/swscale: Process yuv2yuvX tails using next largest register size

2023-09-04 Thread Alan Kelly via ffmpeg-devel
Hi,

Any issues with this patch or can it be merged?

Thanks,

Alan

On Fri, Jul 14, 2023 at 12:08 PM Alan Kelly  wrote:

> ---
>  libswscale/x86/swscale.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index ff16398988..8c67bf4fab 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -194,7 +194,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int
> filterSize, \
>  return; \
>  }
>
> -#define YUV2YUVX_FUNC(opt, step)  \
> +#define YUV2YUVX_FUNC(opt, step, tail)  \
>  void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int
> srcOffset, \
> uint8_t *dest, int dstW,  \
> const uint8_t *dither, int offset); \
> @@ -211,7 +211,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int
> filterSize, \
>  if(pixelsProcessed > 0) \
>  ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset,
> pixelsProcessed + offset, dither, offset); \
>  if(remainder > 0){ \
> -  ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest -
> offset, pixelsProcessed + remainder + offset, dither, offset); \
> +  yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither,
> offset); \
>  } \
>  return; \
>  }
> @@ -220,10 +220,10 @@ static void yuv2yuvX_ ##opt(const int16_t *filter,
> int filterSize, \
>  YUV2YUVX_FUNC_MMX(mmxext, 16)
>  #endif
>  #if HAVE_SSE3_EXTERNAL
> -YUV2YUVX_FUNC(sse3, 32)
> +YUV2YUVX_FUNC(sse3, 32, mmxext)
>  #endif
>  #if HAVE_AVX2_EXTERNAL
> -YUV2YUVX_FUNC(avx2, 64)
> +YUV2YUVX_FUNC(avx2, 64, sse3)
>  #endif
>
>  #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
> --
> 2.41.0.255.g8b1d071c50-goog
>
>
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/2] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512

2023-09-06 Thread Alan Kelly via ffmpeg-devel
---
 libswscale/x86/swscale.c|  7 +++
 libswscale/x86/yuv2yuvX.asm | 19 ++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index ff16398988..00e42b4bec 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -225,6 +225,9 @@ YUV2YUVX_FUNC(sse3, 32)
 #if HAVE_AVX2_EXTERNAL
 YUV2YUVX_FUNC(avx2, 64)
 #endif
+#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
+YUV2YUVX_FUNC(avx512, 128)
+#endif
 
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
 void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
@@ -467,6 +470,10 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 #if HAVE_AVX2_EXTERNAL
 if (EXTERNAL_AVX2_FAST(cpu_flags))
 c->yuv2planeX = yuv2yuvX_avx2;
+#endif
+#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
+if (EXTERNAL_AVX512ICL(cpu_flags))
+c->yuv2planeX = yuv2yuvX_avx512;
 #endif
 }
 #if ARCH_X86_32 && !HAVE_ALIGNED_STACK
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 369c850674..57bfa09d66 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -22,6 +22,10 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA 64
+
+permutation: dq 0, 2, 4, 6, 1, 3, 5, 7
+
 SECTION .text
 
 ;-
@@ -50,6 +54,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 %else
 movq xm3, [ditherq]
 %endif ; avx2
+
+%if cpuflag(avx512)
+mova m15, [permutation]
+%endif
 cmp  offsetd, 0
 jz   .offset
 
@@ -109,7 +117,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 packuswb m6, m6, m1
 %endif
 mov  srcq, [filterq]
-%if cpuflag(avx2)
+%if cpuflag(avx512)
+vpermt2q m3, m15, m3
+vpermt2q m6, m15, m6
+%elif cpuflag(avx2)
 vpermq   m3, m3, 216
 vpermq   m6, m6, 216
 %endif
@@ -131,4 +142,10 @@ YUV2YUVX_FUNC
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 YUV2YUVX_FUNC
+%if HAVE_AVX512_EXTERNAL
+%if ARCH_X86_64
+INIT_ZMM avx512
+YUV2YUVX_FUNC
+%endif
+%endif
 %endif
-- 
2.42.0.283.g2d96d420d3-goog

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/2] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.

2023-09-06 Thread Alan Kelly via ffmpeg-devel
---
 libswscale/x86/swscale.c| 19 ---
 libswscale/x86/yuv2yuvX.asm | 24 ++--
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 00e42b4bec..6980002e9e 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -194,7 +194,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 return; \
 }
 
-#define YUV2YUVX_FUNC(opt, step)  \
+#define YUV2YUVX_FUNC(opt, step, tail)  \
 void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int srcOffset, \
uint8_t *dest, int dstW,  \
const uint8_t *dither, int offset); \
@@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
 { \
-int remainder = (dstW % step); \
-int pixelsProcessed = dstW - remainder; \
 if(((uintptr_t)dest) & 15){ \
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
-if(pixelsProcessed > 0) \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
-if(remainder > 0){ \
-  ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - 
offset, pixelsProcessed + remainder + offset, dither, offset); \
-} \
+if (dstW >= step) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
+else \
+yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); 
\
 return; \
 }
 
@@ -220,13 +217,13 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 YUV2YUVX_FUNC_MMX(mmxext, 16)
 #endif
 #if HAVE_SSE3_EXTERNAL
-YUV2YUVX_FUNC(sse3, 32)
+YUV2YUVX_FUNC(sse3, 32, mmxext)
 #endif
 #if HAVE_AVX2_EXTERNAL
-YUV2YUVX_FUNC(avx2, 64)
+YUV2YUVX_FUNC(avx2, 64, sse3)
 #endif
 #if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
-YUV2YUVX_FUNC(avx512, 128)
+YUV2YUVX_FUNC(avx512, 128, avx2)
 #endif
 
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 57bfa09d66..03bfd6ad1d 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 %else
 movq xm3, [ditherq]
 %endif ; avx2
+mov  ditherq, dstWq
+sub  dstWq, mmsize * unroll
 
 %if cpuflag(avx512)
 mova m15, [permutation]
@@ -92,13 +94,17 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 %else
 mova m0, [filterSizeq + 8]
 %endif
-pmulhw   m2, m0, [srcq + offsetq * 2]
-pmulhw   m5, m0, [srcq + offsetq * 2 + mmsize]
+movu m2, [srcq + offsetq * 2]
+movu m5, [srcq + offsetq * 2 + mmsize]
+pmulhw   m2, m0, m2
+pmulhw   m5, m0, m5
 paddwm3, m3, m2
 paddwm4, m4, m5
 %if cpuflag(sse3)
-pmulhw   m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
-pmulhw   m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
+movu m2, [srcq + offsetq * 2 + 2 * mmsize]
+movu m5, [srcq + offsetq * 2 + 3 * mmsize]
+pmulhw   m2, m0, m2
+pmulhw   m5, m0, m5
 paddwm6, m6, m2
 paddwm1, m1, m5
 %endif
@@ -131,8 +137,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 add  offsetq, mmsize * unroll
 mov  filterSizeq, filterq
 cmp  offsetq, dstWq
-jb  .outerloop
-RET
+jb   .outerloop
+
+mov  dstWq, offsetq
+mov  offsetq, ditherq
+sub  offsetq, mmsize * unroll
+cmp  dstWq, ditherq
+jb   .outerloop
+REP_RET
 %endmacro
 
 INIT_MMX mmxext
-- 
2.42.0.283.g2d96d420d3-goog

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/3] swscale/x86/swscale: Process yuv2yuvX tails using next largest register size

2023-09-06 Thread Alan Kelly via ffmpeg-devel
On Tue, Sep 5, 2023 at 12:03 AM Michael Niedermayer 
wrote:

> On Mon, Sep 04, 2023 at 02:30:00PM +0200, Alan Kelly via ffmpeg-devel
> wrote:
> > Hi,
> >
> > Any issues with this patch or can it be merged?
>
> are all cases covered by tests ?
> if yes and the tests pass, it should be ok
>
> thx
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Observe your enemies, for they first find out your faults. -- Antisthenes
> ___
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
>

All branches are tested. However, the third patch in this chain supersedes
this change since each version of yuv2yuvX now handles remainders. I have
sent the two other patches as a new chain so let's abandon this. Thanks
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] swscale: Break loop-carried dependency enabling parallel out of order execution of the gathers.

2025-08-07 Thread Alan Kelly via ffmpeg-devel
On Mon, Aug 4, 2025 at 10:04 PM Hendrik Leppkes  wrote:

> On Mon, Aug 4, 2025 at 7:19 PM Jacob Lifshay 
> wrote:
> >
> >
> >
> > On August 4, 2025 6:49:20 AM PDT, Alan Kelly via ffmpeg-devel <
> [email protected]> wrote:
> > > The gather is unmasked but the instruction does a merge into ymm4,
> which
> > > depends on the value of ymm4 from the previous loop iteration. The
> > > out-of-order scheduler does not know statically that the instruction is
> > > fully unmasked, preventing parallel out-of-order execution of the
> > > gathers.
> > > ---
> > >  libswscale/x86/scale_avx2.asm | 3 +++
> > >  1 file changed, 3 insertions(+)
> > >
> > > diff --git a/libswscale/x86/scale_avx2.asm
> b/libswscale/x86/scale_avx2.asm
> > > index b4b852d60b..90ee8b0a0e 100644
> > > --- a/libswscale/x86/scale_avx2.asm
> > > +++ b/libswscale/x86/scale_avx2.asm
> > > @@ -68,8 +68,10 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w,
> srcmem, filter, fltpos, fltsize,
> > >  .innerloop:
> > >  %endif
> > >  vpcmpeqd  m13, m13
> > > +pxor m3, m3  ; break loop-carried dependency
> >
> > this is in AVX2 code, so you should use vpxor since pxor will just clear
> the lower 128 bits and leave the upper 128 bits unmodified. actually, on
> some older intel cpus it will cause a huge stall due to not being
> v-prefixed:
> >
> https://stackoverflow.com/questions/41303780/why-is-this-sse-code-6-times-slower-without-vzeroupper-on-skylake/41349852#41349852
> >
>
> The v is actually automatically added by the pre-processor through
> x86inc.asm if the function is marked as avx - its a bit confusing
> because all other instructions are explicitly using it however, so it
> might still be a good idea to be explicit about it.
>
> As for the patch itself, any numbers?
>
> - Hendrik
> ___
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
>


Thanks for the quick review and sorry for the slow response. I got dragged
down a benchmarking rabbit hole.

On Skylake and Cascade Lake, I was unable to reproduce the benchmark results
I obtained when I originally ported hscale to avx2. I could only reproduce
the speed-up on Broadwell and Sapphire Rapids. The cause appears to be an
Intel security vulnerability called Gather Data Sampling:
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/gather-data-sampling.html
Its mitigation has a large impact on the performance of gather instructions.
The Skylake and Cascade Lake machines I tested on had the mitigation applied,
causing a huge performance loss for avx2 hscale.

Broadwell:
hscale_8_to_15__fs_4_dstW_512_c:      3379.5 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:    615.7 ( 5.49x)
hscale_8_to_15__fs_4_dstW_512_ssse3:   613.4 ( 5.51x)
hscale_8_to_15__fs_4_dstW_512_avx2:    495.7 ( 6.82x)

Skylake:
hscale_8_to_15__fs_4_dstW_512_c:      3411.4 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:    591.0 ( 5.77x)
hscale_8_to_15__fs_4_dstW_512_ssse3:   591.5 ( 5.77x)
hscale_8_to_15__fs_4_dstW_512_avx2:   1386.2 ( 2.46x)

Cascade Lake:
hscale_8_to_15__fs_4_dstW_512_c:      3231.3 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:    517.9 ( 6.24x)
hscale_8_to_15__fs_4_dstW_512_ssse3:   521.6 ( 6.19x)
hscale_8_to_15__fs_4_dstW_512_avx2:   1775.0 ( 1.82x)

Sapphire Rapids:
hscale_8_to_15__fs_4_dstW_512_c:      1840.0 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:    287.9 ( 6.39x)
hscale_8_to_15__fs_4_dstW_512_ssse3:   293.8 ( 6.26x)
hscale_8_to_15__fs_4_dstW_512_avx2:    219.2 ( 8.40x)

This patch increases performance by about 3%. But I think the real question
is what should be done about avx2 hscale. Most machines probably have the
Gather Data Sampling mitigation applied, so should I send a patch removing
avx2 hscale, or one disabling it on Skylake, Ice Lake, Cascade Lake and
possibly other affected machines?
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH] swscale: Break loop-carried dependency enabling parallel out of order execution of the gathers.

2025-08-04 Thread Alan Kelly via ffmpeg-devel
The gather is unmasked, but the instruction does a merge into its
destination register (ymm3/ymm4), which depends on the value of that
register from the previous loop iteration. The out-of-order scheduler does
not know statically that the instruction is fully unmasked, preventing
parallel out-of-order execution of the gathers.
---
 libswscale/x86/scale_avx2.asm | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index b4b852d60b..90ee8b0a0e 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -68,8 +68,10 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 .innerloop:
 %endif
 vpcmpeqd  m13, m13
+pxor m3, m3  ; break loop-carried dependency
 vpgatherdd m3,[srcmemq + m1], m13
 vpcmpeqd  m13, m13
+pxor m4, m4  ; break loop-carried dependency
 vpgatherdd m4,[srcmemq + m2], m13
 vpunpcklbw m5, m3, m0
 vpunpckhbw m6, m3, m0
@@ -119,6 +121,7 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 .tail_innerloop:
 %endif
 vpcmpeqd  xm13, xm13
+pxor m3, m3  ; break loop-carried dependency
 vpgatherdd xm3,[srcmemq + xm1], xm13
 vpunpcklbw xm5, xm3, xm0
 vpunpckhbw xm6, xm3, xm0
-- 
2.50.1.565.gc32cd1483b-goog

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH] swscale: Disable avx2 hscale 8to15 on IceLake and below due to Intel Gather Data Sampling mitigation performance loss

2025-08-08 Thread Alan Kelly via ffmpeg-devel
Intel provided a microcode update to mitigate this security
vulnerability which has a huge negative performance impact on gather
instructions. This means that hscale 8to15 avx2, which uses gather
extensively, is no longer faster than SSSE3 on impacted CPUs.
---
 libavutil/x86/cpu.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index d6cd4fab9c..923c63e0c4 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -244,9 +244,11 @@ int ff_get_cpu_flags_x86(void)
 family == 6 && model < 23)
 rval |= AV_CPU_FLAG_SSSE3SLOW;
 
-/* Haswell has slow gather */
-if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 70)
+/* Ice Lake and below have slow gather due to Gather Data Sampling
+ * mitigation. */
+if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 143) {
 rval |= AV_CPU_FLAG_SLOW_GATHER;
+}
 }
 
 #endif /* cpuid */
-- 
2.50.1.703.g449372360f-goog

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] swscale: Disable avx2 hscale 8to15 on IceLake and below due to Intel Gather Data Sampling mitigation performance loss

2025-08-08 Thread Alan Kelly via ffmpeg-devel
On Fri, Aug 8, 2025 at 2:21 PM Alan Kelly  wrote:

> Intel provided a microcode update to mitigate this security
> vulnerability which has a huge negative performance impact on gather
> instructions. This means that hscale 8to15 avx2, which uses gather
> extensively, is no longer faster than SSSE3 on impacted CPUs.
> ---
>  libavutil/x86/cpu.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> index d6cd4fab9c..923c63e0c4 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -244,9 +244,11 @@ int ff_get_cpu_flags_x86(void)
>  family == 6 && model < 23)
>  rval |= AV_CPU_FLAG_SSSE3SLOW;
>
> -/* Haswell has slow gather */
> -if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 70)
> +/* Ice Lake and below have slow gather due to Gather Data Sampling
> + * mitigation. */
> +if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 143) {
>  rval |= AV_CPU_FLAG_SLOW_GATHER;
> +}
>  }
>
>  #endif /* cpuid */
> --
> 2.50.1.703.g449372360f-goog
>


https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/gather-data-sampling.html



Broadwell:
hscale_8_to_15__fs_4_dstW_512_c:      3379.5 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:    615.7 ( 5.49x)
hscale_8_to_15__fs_4_dstW_512_ssse3:   613.4 ( 5.51x)
hscale_8_to_15__fs_4_dstW_512_avx2:    495.7 ( 6.82x)

Skylake:
hscale_8_to_15__fs_4_dstW_512_c:      3411.4 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:    591.0 ( 5.77x)
hscale_8_to_15__fs_4_dstW_512_ssse3:   591.5 ( 5.77x)
hscale_8_to_15__fs_4_dstW_512_avx2:   1386.2 ( 2.46x)

Cascade Lake:
hscale_8_to_15__fs_4_dstW_512_c:      3231.3 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:    517.9 ( 6.24x)
hscale_8_to_15__fs_4_dstW_512_ssse3:   521.6 ( 6.19x)
hscale_8_to_15__fs_4_dstW_512_avx2:   1775.0 ( 1.82x)

Sapphire Rapids:
hscale_8_to_15__fs_4_dstW_512_c:      1840.0 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:    287.9 ( 6.39x)
hscale_8_to_15__fs_4_dstW_512_ssse3:   293.8 ( 6.26x)
hscale_8_to_15__fs_4_dstW_512_avx2:    219.2 ( 8.40x)
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] swscale: Disable avx2 hscale 8to15 on IceLake and below due to Intel Gather Data Sampling mitigation performance loss

2025-08-21 Thread Alan Kelly via ffmpeg-devel
On Fri, Aug 8, 2025 at 2:23 PM Alan Kelly  wrote:

>
>
> On Fri, Aug 8, 2025 at 2:21 PM Alan Kelly  wrote:
>
>> Intel provided a microcode update to mitigate this security
>> vulnerability which has a huge negative performance impact on gather
>> instructions. This means that hscale 8to15 avx2, which uses gather
>> extensively, is no longer faster than SSSE3 on impacted CPUs.
>> ---
>>  libavutil/x86/cpu.c | 6 --
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
>> index d6cd4fab9c..923c63e0c4 100644
>> --- a/libavutil/x86/cpu.c
>> +++ b/libavutil/x86/cpu.c
>> @@ -244,9 +244,11 @@ int ff_get_cpu_flags_x86(void)
>>  family == 6 && model < 23)
>>  rval |= AV_CPU_FLAG_SSSE3SLOW;
>>
>> -/* Haswell has slow gather */
>> -if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 70)
>> +/* Ice Lake and below have slow gather due to Gather Data
>> Sampling
>> + * mitigation. */
>> +if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 143) {
>>  rval |= AV_CPU_FLAG_SLOW_GATHER;
>> +}
>>  }
>>
>>  #endif /* cpuid */
>> --
>> 2.50.1.703.g449372360f-goog
>>
>
>
> https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/gather-data-sampling.html
>
>
>
> Broadwell:
> hscale_8_to_15__fs_4_dstW_512_c:  3379.5 ( 1.00x)
> hscale_8_to_15__fs_4_dstW_512_sse2:615.7 ( 5.49x)
> hscale_8_to_15__fs_4_dstW_512_ssse3:   613.4 ( 5.51x)
> hscale_8_to_15__fs_4_dstW_512_avx2:495.7 ( 6.82x)
>
> Skylake:
> hscale_8_to_15__fs_4_dstW_512_c:  3411.4 ( 1.00x)
> hscale_8_to_15__fs_4_dstW_512_sse2:591.0 ( 5.77x)
> hscale_8_to_15__fs_4_dstW_512_ssse3:   591.5 ( 5.77x)
> hscale_8_to_15__fs_4_dstW_512_avx2:   1386.2 ( 2.46x)
>
> Cascade Lake:
> hscale_8_to_15__fs_4_dstW_512_c:  3231.3 ( 1.00x)
> hscale_8_to_15__fs_4_dstW_512_sse2:517.9 ( 6.24x)
> hscale_8_to_15__fs_4_dstW_512_ssse3:   521.6 ( 6.19x)
> hscale_8_to_15__fs_4_dstW_512_avx2:   1775.0 ( 1.82x)
>
> Sapphire Rapids:
> hscale_8_to_15__fs_4_dstW_512_c:  1840.0 ( 1.00x)
> hscale_8_to_15__fs_4_dstW_512_sse2:287.9 ( 6.39x)
> hscale_8_to_15__fs_4_dstW_512_ssse3:   293.8 ( 6.26x)
> hscale_8_to_15__fs_4_dstW_512_avx2:219.2 ( 8.40x)
>
>
>


Hi,

Are there any objections to the patch? The performance impact is huge, so
it should be patched quickly.

Thanks,

Alan
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".