Re: [FFmpeg-devel] [PATCH 1/3] swscale/x86/swscale: Process yuv2yuvX tails using next largest register size
Hi,
Any issues with this patch or can it be merged?
Thanks,
Alan
On Fri, Jul 14, 2023 at 12:08 PM Alan Kelly wrote:
> ---
> libswscale/x86/swscale.c | 8
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index ff16398988..8c67bf4fab 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -194,7 +194,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int
> filterSize, \
> return; \
> }
>
> -#define YUV2YUVX_FUNC(opt, step) \
> +#define YUV2YUVX_FUNC(opt, step, tail) \
> void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int
> srcOffset, \
> uint8_t *dest, int dstW, \
> const uint8_t *dither, int offset); \
> @@ -211,7 +211,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int
> filterSize, \
> if(pixelsProcessed > 0) \
> ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset,
> pixelsProcessed + offset, dither, offset); \
> if(remainder > 0){ \
> - ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest -
> offset, pixelsProcessed + remainder + offset, dither, offset); \
> + yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither,
> offset); \
> } \
> return; \
> }
> @@ -220,10 +220,10 @@ static void yuv2yuvX_ ##opt(const int16_t *filter,
> int filterSize, \
> YUV2YUVX_FUNC_MMX(mmxext, 16)
> #endif
> #if HAVE_SSE3_EXTERNAL
> -YUV2YUVX_FUNC(sse3, 32)
> +YUV2YUVX_FUNC(sse3, 32, mmxext)
> #endif
> #if HAVE_AVX2_EXTERNAL
> -YUV2YUVX_FUNC(avx2, 64)
> +YUV2YUVX_FUNC(avx2, 64, sse3)
> #endif
>
> #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
> --
> 2.41.0.255.g8b1d071c50-goog
>
>
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512
--- libswscale/x86/swscale.c| 7 +++ libswscale/x86/yuv2yuvX.asm | 19 ++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index ff16398988..00e42b4bec 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -225,6 +225,9 @@ YUV2YUVX_FUNC(sse3, 32) #if HAVE_AVX2_EXTERNAL YUV2YUVX_FUNC(avx2, 64) #endif +#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL +YUV2YUVX_FUNC(avx512, 128) +#endif #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ @@ -467,6 +470,10 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c) #if HAVE_AVX2_EXTERNAL if (EXTERNAL_AVX2_FAST(cpu_flags)) c->yuv2planeX = yuv2yuvX_avx2; +#endif +#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL +if (EXTERNAL_AVX512ICL(cpu_flags)) +c->yuv2planeX = yuv2yuvX_avx512; #endif } #if ARCH_X86_32 && !HAVE_ALIGNED_STACK diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 369c850674..57bfa09d66 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -22,6 +22,10 @@ %include "libavutil/x86/x86util.asm" +SECTION_RODATA 64 + +permutation: dq 0, 2, 4, 6, 1, 3, 5, 7 + SECTION .text ;- @@ -50,6 +54,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %else movq xm3, [ditherq] %endif ; avx2 + +%if cpuflag(avx512) +mova m15, [permutation] +%endif cmp offsetd, 0 jz .offset @@ -109,7 +117,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset packuswb m6, m6, m1 %endif mov srcq, [filterq] -%if cpuflag(avx2) +%if cpuflag(avx512) +vpermt2q m3, m15, m3 +vpermt2q m6, m15, m6 +%elif cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif @@ -131,4 +142,10 @@ YUV2YUVX_FUNC %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 YUV2YUVX_FUNC +%if HAVE_AVX512_EXTERNAL +%if ARCH_X86_64 +INIT_ZMM avx512 +YUV2YUVX_FUNC +%endif +%endif %endif -- 2.42.0.283.g2d96d420d3-goog ___ ffmpeg-devel mailing list [email 
protected] https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email [email protected] with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/2] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.
---
libswscale/x86/swscale.c| 19 ---
libswscale/x86/yuv2yuvX.asm | 24 ++--
2 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 00e42b4bec..6980002e9e 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -194,7 +194,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int
filterSize, \
return; \
}
-#define YUV2YUVX_FUNC(opt, step) \
+#define YUV2YUVX_FUNC(opt, step, tail) \
void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int srcOffset, \
uint8_t *dest, int dstW, \
const uint8_t *dither, int offset); \
@@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int
filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
{ \
-int remainder = (dstW % step); \
-int pixelsProcessed = dstW - remainder; \
if(((uintptr_t)dest) & 15){ \
yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
return; \
} \
-if(pixelsProcessed > 0) \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset,
pixelsProcessed + offset, dither, offset); \
-if(remainder > 0){ \
- ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest -
offset, pixelsProcessed + remainder + offset, dither, offset); \
-} \
+if (dstW >= step) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW +
offset, dither, offset); \
+else \
+yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset);
\
return; \
}
@@ -220,13 +217,13 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int
filterSize, \
YUV2YUVX_FUNC_MMX(mmxext, 16)
#endif
#if HAVE_SSE3_EXTERNAL
-YUV2YUVX_FUNC(sse3, 32)
+YUV2YUVX_FUNC(sse3, 32, mmxext)
#endif
#if HAVE_AVX2_EXTERNAL
-YUV2YUVX_FUNC(avx2, 64)
+YUV2YUVX_FUNC(avx2, 64, sse3)
#endif
#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
-YUV2YUVX_FUNC(avx512, 128)
+YUV2YUVX_FUNC(avx512, 128, avx2)
#endif
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 57bfa09d66..03bfd6ad1d 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest,
dstW, dither, offset
%else
movq xm3, [ditherq]
%endif ; avx2
+mov ditherq, dstWq
+sub dstWq, mmsize * unroll
%if cpuflag(avx512)
mova m15, [permutation]
@@ -92,13 +94,17 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest,
dstW, dither, offset
%else
mova m0, [filterSizeq + 8]
%endif
-pmulhw m2, m0, [srcq + offsetq * 2]
-pmulhw m5, m0, [srcq + offsetq * 2 + mmsize]
+movu m2, [srcq + offsetq * 2]
+movu m5, [srcq + offsetq * 2 + mmsize]
+pmulhw m2, m0, m2
+pmulhw m5, m0, m5
paddwm3, m3, m2
paddwm4, m4, m5
%if cpuflag(sse3)
-pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
-pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
+movu m2, [srcq + offsetq * 2 + 2 * mmsize]
+movu m5, [srcq + offsetq * 2 + 3 * mmsize]
+pmulhw m2, m0, m2
+pmulhw m5, m0, m5
paddwm6, m6, m2
paddwm1, m1, m5
%endif
@@ -131,8 +137,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest,
dstW, dither, offset
add offsetq, mmsize * unroll
mov filterSizeq, filterq
cmp offsetq, dstWq
-jb .outerloop
-RET
+jb .outerloop
+
+mov dstWq, offsetq
+mov offsetq, ditherq
+sub offsetq, mmsize * unroll
+cmp dstWq, ditherq
+jb .outerloop
+REP_RET
%endmacro
INIT_MMX mmxext
--
2.42.0.283.g2d96d420d3-goog
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/3] swscale/x86/swscale: Process yuv2yuvX tails using next largest register size
On Tue, Sep 5, 2023 at 12:03 AM Michael Niedermayer wrote:
> On Mon, Sep 04, 2023 at 02:30:00PM +0200, Alan Kelly via ffmpeg-devel
> wrote:
> > Hi,
> >
> > Any issues with this patch or can it be merged?
>
> are all cases covered by tests ?
> if yes and the tests pass, it should be ok
>
> thx
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Observe your enemies, for they first find out your faults. -- Antisthenes
> ___
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
>
All branches are tested. However, the third patch in this chain supersedes
this change since each version of yuv2yuvX now handles remainders. I have
sent the two other patches as a new chain so let's abandon this.
Thanks
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale: Break loop-carried dependency enabling parallel out of order execution of the gathers.
On Mon, Aug 4, 2025 at 10:04 PM Hendrik Leppkes wrote: > On Mon, Aug 4, 2025 at 7:19 PM Jacob Lifshay > wrote: > > > > > > > > On August 4, 2025 6:49:20 AM PDT, Alan Kelly via ffmpeg-devel < > [email protected]> wrote: > > > The gather is unmasked but the instruction does a merge into ymm4, > which > > > depends on the value of ymm4 from the previous loop iteration. The > > > out-of-order scheduler does not know statically that the instruction is > > > fully unmasked, preventing parallel out-of-order execution of the > > > gathers. > > > --- > > > libswscale/x86/scale_avx2.asm | 3 +++ > > > 1 file changed, 3 insertions(+) > > > > > > diff --git a/libswscale/x86/scale_avx2.asm > b/libswscale/x86/scale_avx2.asm > > > index b4b852d60b..90ee8b0a0e 100644 > > > --- a/libswscale/x86/scale_avx2.asm > > > +++ b/libswscale/x86/scale_avx2.asm > > > @@ -68,8 +68,10 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, > srcmem, filter, fltpos, fltsize, > > > .innerloop: > > > %endif > > > vpcmpeqd m13, m13 > > > +pxor m3, m3 ; break loop-carried dependency > > > > this is in AVX2 code, so you should use vpxor since pxor will just clear > the lower 128 bits and leave the upper 128 bits unmodified. actually, on > some older intel cpus it will cause a huge stall due to not being > v-prefixed: > > > https://stackoverflow.com/questions/41303780/why-is-this-sse-code-6-times-slower-without-vzeroupper-on-skylake/41349852#41349852 > > > > The v is actually automatically added by the pre-processor through > x86inc.asm if the function is marked as avx - its a bit confusing > because all other instructions are explicitly using it however, so it > might still be a good idea to be explicit about it. > > As for the patch itself, any numbers? > > - Hendrik > ___ > ffmpeg-devel mailing list > [email protected] > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > [email protected] with subject "unsubscribe". 
>
Thanks for the quick review and sorry for the slow response. I got dragged
down a benchmarking rabbit hole. I was unable to reproduce the benchmarks I
did when I originally ported hscale to avx2 on Skylake and Cascade Lake. I
could only reproduce the speed-up on Broadwell and Sapphire Rapids. I found
an Intel security vulnerability called Gather Data Sampling
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/gather-data-sampling.html
which, when mitigated, has a large impact on the performance of gather
instructions. The machines I tested on had the mitigation applied, causing
a huge performance loss for avx2 hscale.

Broadwell:
hscale_8_to_15__fs_4_dstW_512_c: 3379.5 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:615.7 ( 5.49x)
hscale_8_to_15__fs_4_dstW_512_ssse3: 613.4 ( 5.51x)
hscale_8_to_15__fs_4_dstW_512_avx2:495.7 ( 6.82x)

Skylake:
hscale_8_to_15__fs_4_dstW_512_c: 3411.4 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:591.0 ( 5.77x)
hscale_8_to_15__fs_4_dstW_512_ssse3: 591.5 ( 5.77x)
hscale_8_to_15__fs_4_dstW_512_avx2: 1386.2 ( 2.46x)

Cascade Lake:
hscale_8_to_15__fs_4_dstW_512_c: 3231.3 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:517.9 ( 6.24x)
hscale_8_to_15__fs_4_dstW_512_ssse3: 521.6 ( 6.19x)
hscale_8_to_15__fs_4_dstW_512_avx2: 1775.0 ( 1.82x)

Sapphire Rapids:
hscale_8_to_15__fs_4_dstW_512_c: 1840.0 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:287.9 ( 6.39x)
hscale_8_to_15__fs_4_dstW_512_ssse3: 293.8 ( 6.26x)
hscale_8_to_15__fs_4_dstW_512_avx2:219.2 ( 8.40x)

This patch increases performance by about 3%. But I think the real question
is what should be done about avx2 hscale? Most machines probably have the
mitigation applied, so should I send a patch removing avx2 hscale, or
disable it on Skylake, Ice Lake, Cascade Lake and possibly other machines?
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
[FFmpeg-devel] [PATCH] swscale: Break loop-carried dependency enabling parallel out of order execution of the gathers.
The gather is unmasked but the instruction does a merge into ymm4, which depends on the value of ymm4 from the previous loop iteration. The out-of-order scheduler does not know statically that the instruction is fully unmasked, preventing parallel out-of-order execution of the gathers. --- libswscale/x86/scale_avx2.asm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index b4b852d60b..90ee8b0a0e 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -68,8 +68,10 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, .innerloop: %endif vpcmpeqd m13, m13 +pxor m3, m3 ; break loop-carried dependency vpgatherdd m3,[srcmemq + m1], m13 vpcmpeqd m13, m13 +pxor m4, m4 ; break loop-carried dependency vpgatherdd m4,[srcmemq + m2], m13 vpunpcklbw m5, m3, m0 vpunpckhbw m6, m3, m0 @@ -119,6 +121,7 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, .tail_innerloop: %endif vpcmpeqd xm13, xm13 +pxor m3, m3 ; break loop-carried dependency vpgatherdd xm3,[srcmemq + xm1], xm13 vpunpcklbw xm5, xm3, xm0 vpunpckhbw xm6, xm3, xm0 -- 2.50.1.565.gc32cd1483b-goog ___ ffmpeg-devel mailing list [email protected] https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email [email protected] with subject "unsubscribe".
[FFmpeg-devel] [PATCH] swscale: Disable avx2 hscale 8to15 on IceLake and below due to Intel Gather Data Sampling mitigation performance loss
Intel provided a microcode update to mitigate the Gather Data Sampling
security vulnerability. The mitigation has a huge negative performance
impact on gather instructions. This means that hscale 8to15 avx2, which
uses gather extensively, is no longer faster than SSSE3 on impacted CPUs.
---
libavutil/x86/cpu.c | 6 --
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index d6cd4fab9c..923c63e0c4 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -244,9 +244,11 @@ int ff_get_cpu_flags_x86(void)
family == 6 && model < 23)
rval |= AV_CPU_FLAG_SSSE3SLOW;
-/* Haswell has slow gather */
-if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 70)
+/* Ice Lake and below have slow gather due to Gather Data Sampling
+ * mitigation. */
+if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 143) {
rval |= AV_CPU_FLAG_SLOW_GATHER;
+}
}
#endif /* cpuid */
--
2.50.1.703.g449372360f-goog
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale: Disable avx2 hscale 8to15 on IceLake and below due to Intel Gather Data Sampling mitigation performance loss
On Fri, Aug 8, 2025 at 2:21 PM Alan Kelly wrote:
> Intel provided a microcode update to mitigate this security
> vulnerability which has a huge negative performance impact on gather
> instructions. This means that hscale 8to15 avx2, which uses gather
> extensively, is no longer faster than SSSE3 on impacted CPUs.
> ---
> libavutil/x86/cpu.c | 6 --
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> index d6cd4fab9c..923c63e0c4 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -244,9 +244,11 @@ int ff_get_cpu_flags_x86(void)
> family == 6 && model < 23)
> rval |= AV_CPU_FLAG_SSSE3SLOW;
>
> -/* Haswell has slow gather */
> -if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 70)
> +/* Ice Lake and below have slow gather due to Gather Data Sampling
> + * mitigation. */
> +if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 143) {
> rval |= AV_CPU_FLAG_SLOW_GATHER;
> +}
> }
>
> #endif /* cpuid */
> --
> 2.50.1.703.g449372360f-goog
>
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/gather-data-sampling.html
Broadwell:
hscale_8_to_15__fs_4_dstW_512_c: 3379.5 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:615.7 ( 5.49x)
hscale_8_to_15__fs_4_dstW_512_ssse3: 613.4 ( 5.51x)
hscale_8_to_15__fs_4_dstW_512_avx2:495.7 ( 6.82x)
Skylake:
hscale_8_to_15__fs_4_dstW_512_c: 3411.4 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:591.0 ( 5.77x)
hscale_8_to_15__fs_4_dstW_512_ssse3: 591.5 ( 5.77x)
hscale_8_to_15__fs_4_dstW_512_avx2: 1386.2 ( 2.46x)
Cascade Lake:
hscale_8_to_15__fs_4_dstW_512_c: 3231.3 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:517.9 ( 6.24x)
hscale_8_to_15__fs_4_dstW_512_ssse3: 521.6 ( 6.19x)
hscale_8_to_15__fs_4_dstW_512_avx2: 1775.0 ( 1.82x)
Sapphire Rapids:
hscale_8_to_15__fs_4_dstW_512_c: 1840.0 ( 1.00x)
hscale_8_to_15__fs_4_dstW_512_sse2:287.9 ( 6.39x)
hscale_8_to_15__fs_4_dstW_512_ssse3: 293.8 ( 6.26x)
hscale_8_to_15__fs_4_dstW_512_avx2:219.2 ( 8.40x)
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale: Disable avx2 hscale 8to15 on IceLake and below due to Intel Gather Data Sampling mitigation performance loss
On Fri, Aug 8, 2025 at 2:23 PM Alan Kelly wrote:
>
>
> On Fri, Aug 8, 2025 at 2:21 PM Alan Kelly wrote:
>
>> Intel provided a microcode update to mitigate this security
>> vulnerability which has a huge negative performance impact on gather
>> instructions. This means that hscale 8to15 avx2, which uses gather
>> extensively, is no longer faster than SSSE3 on impacted CPUs.
>> ---
>> libavutil/x86/cpu.c | 6 --
>> 1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
>> index d6cd4fab9c..923c63e0c4 100644
>> --- a/libavutil/x86/cpu.c
>> +++ b/libavutil/x86/cpu.c
>> @@ -244,9 +244,11 @@ int ff_get_cpu_flags_x86(void)
>> family == 6 && model < 23)
>> rval |= AV_CPU_FLAG_SSSE3SLOW;
>>
>> -/* Haswell has slow gather */
>> -if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 70)
>> +/* Ice Lake and below have slow gather due to Gather Data
>> Sampling
>> + * mitigation. */
>> +if ((rval & AV_CPU_FLAG_AVX2) && family == 6 && model < 143) {
>> rval |= AV_CPU_FLAG_SLOW_GATHER;
>> +}
>> }
>>
>> #endif /* cpuid */
>> --
>> 2.50.1.703.g449372360f-goog
>>
>
>
> https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/gather-data-sampling.html
>
>
>
> Broadwell:
> hscale_8_to_15__fs_4_dstW_512_c: 3379.5 ( 1.00x)
> hscale_8_to_15__fs_4_dstW_512_sse2:615.7 ( 5.49x)
> hscale_8_to_15__fs_4_dstW_512_ssse3: 613.4 ( 5.51x)
> hscale_8_to_15__fs_4_dstW_512_avx2:495.7 ( 6.82x)
>
> Skylake:
> hscale_8_to_15__fs_4_dstW_512_c: 3411.4 ( 1.00x)
> hscale_8_to_15__fs_4_dstW_512_sse2:591.0 ( 5.77x)
> hscale_8_to_15__fs_4_dstW_512_ssse3: 591.5 ( 5.77x)
> hscale_8_to_15__fs_4_dstW_512_avx2: 1386.2 ( 2.46x)
>
> Cascade Lake:
> hscale_8_to_15__fs_4_dstW_512_c: 3231.3 ( 1.00x)
> hscale_8_to_15__fs_4_dstW_512_sse2:517.9 ( 6.24x)
> hscale_8_to_15__fs_4_dstW_512_ssse3: 521.6 ( 6.19x)
> hscale_8_to_15__fs_4_dstW_512_avx2: 1775.0 ( 1.82x)
>
> Sapphire Rapids:
> hscale_8_to_15__fs_4_dstW_512_c: 1840.0 ( 1.00x)
> hscale_8_to_15__fs_4_dstW_512_sse2:287.9 ( 6.39x)
> hscale_8_to_15__fs_4_dstW_512_ssse3: 293.8 ( 6.26x)
> hscale_8_to_15__fs_4_dstW_512_avx2:219.2 ( 8.40x)
>
>
>
Hi,
Are there any objections to the patch? The performance impact is huge, so
it should be patched quickly.
Thanks,
Alan
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
