A few _CET_ENDBR landing pads were missed in the AVX-512 code. No chance of hitting those on OpenBSD since we don't support AVX-512 yet, but I have a diff for this.
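For anyone not familiar with the pattern: these functions dispatch on block width through a jump table and an indirect "jmp wq", so every ".w4:"/".w8:"/... label is the target of an indirect branch and needs a landing pad when IBT is enforced. Below is a minimal standalone sketch of the idea, not dav1d's actual code; the label/register names are made up and _CET_ENDBR is assumed to expand to endbr64, matching what the port's existing patches do.

    %macro _CET_ENDBR 0
        endbr64
    %endmacro

    SECTION .text
    example_dispatch:
        _CET_ENDBR              ; the function entry itself is an indirect-call target
        lea   rax, [rel .w4]    ; stand-in for the real width-table lookup
        jmp   rax               ; indirect branch into the table below
    .w4:
        _CET_ENDBR              ; landing pad; without it the indirect jmp faults here
        ret

The diff below just applies that last line to every width label that was still missing it.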
Passes regress on a machine with AVX-512 enabled. Allows me to watch youtube vids on that machine. ok? Index: multimedia/dav1d/Makefile =================================================================== RCS file: /cvs/ports/multimedia/dav1d/Makefile,v retrieving revision 1.37 diff -u -p -r1.37 Makefile --- multimedia/dav1d/Makefile 27 Sep 2023 10:10:19 -0000 1.37 +++ multimedia/dav1d/Makefile 18 Feb 2024 18:51:06 -0000 @@ -6,7 +6,7 @@ COMMENT= small and fast AV1 decoder VER= 1.2.1 DISTNAME= dav1d-${VER} -REVISION= 1 +REVISION= 2 CATEGORIES= multimedia SITES= https://downloads.videolan.org/pub/videolan/dav1d/${VER}/ EXTRACT_SUFX= .tar.xz Index: multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm =================================================================== RCS file: multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm diff -N multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm 18 Feb 2024 18:51:06 -0000 @@ -0,0 +1,203 @@ +Index: src/x86/ipred16_avx512.asm +--- src/x86/ipred16_avx512.asm.orig ++++ src/x86/ipred16_avx512.asm +@@ -104,6 +104,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, + add wq, r6 + jmp wq + .w4: ++ _CET_ENDBR + vpbroadcastq m4, [tlq+2] ; top + movsldup m7, [base+ipred_shuf] + lea r6, [strideq*3] +@@ -133,6 +134,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, + .w4_end: + RET + .w8: ++ _CET_ENDBR + vbroadcasti32x4 m4, [tlq+2] + movsldup m7, [base+ipred_shuf] + lea r6, [strideq*3] +@@ -152,6 +154,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, + jg .w8_loop + RET + .w16: ++ _CET_ENDBR + vbroadcasti32x8 m4, [tlq+2] + movsldup m7, [base+ipred_shuf] + psubw m5, m4, m3 +@@ -168,6 +171,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, + jg .w16_loop + RET + .w32: ++ _CET_ENDBR + movu m4, [tlq+2] + psubw m5, m4, m3 + pabsw m6, m5 +@@ -181,6 +185,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + movu m4, [tlq+ 2] + movu m7, [tlq+66] + psubw m5, m4, m3 +@@ -212,6 +217,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + vpbroadcastq m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +@@ -239,6 +245,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl + .end: + RET + .w8: ++ _CET_ENDBR + vbroadcasti32x4 m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +@@ -256,6 +263,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl + jl .w8_loop + RET + .w16: ++ _CET_ENDBR + vbroadcasti32x8 m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +@@ -277,6 +285,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl + jl .w16_loop + RET + .w32: ++ _CET_ENDBR + movu m5, [tlq+2] + psubw m5, m6 + .w32_loop: +@@ -295,6 +304,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl + jl .w32_loop + RET + .w64: ++ _CET_ENDBR + movu m4, [tlq+ 2] + movu m5, [tlq+66] + psubw m4, m6 +@@ -329,6 +339,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl + lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq] + jmp wq + .w4: ++ _CET_ENDBR + movsldup m4, [base+ipred_shuf] + vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2] + .w4_loop: +@@ -356,6 +367,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl + .end: + RET + .w8: ++ _CET_ENDBR + movsldup m4, [base+ipred_shuf] + vbroadcasti32x4 m5, 
[base+smooth_weights_1d_16bpc+8*2] + .w8_loop: +@@ -373,6 +385,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl + jg .w8_loop + RET + .w16: ++ _CET_ENDBR + movsldup m4, [base+ipred_shuf] + vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2] + .w16_loop: +@@ -395,6 +408,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl + jg .w16_loop + RET + .w32: ++ _CET_ENDBR + movu m5, [base+smooth_weights_1d_16bpc+32*2] + .w32_loop: + vpbroadcastq m3, [tlq+hq-8] +@@ -415,6 +429,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + movu m4, [base+smooth_weights_1d_16bpc+64*2] + movu m5, [base+smooth_weights_1d_16bpc+64*3] + .w64_loop: +@@ -456,6 +471,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, + lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2] + jmp wq + .w4: ++ _CET_ENDBR + vpbroadcastq m5, [tlq+hq+2] + movshdup m3, [base+ipred_shuf] + movsldup m4, [base+ipred_shuf] +@@ -483,6 +499,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, + jg .w4_loop + RET + .w8: ++ _CET_ENDBR + vbroadcasti32x4 ym5, [tlq+hq+2] + movshdup m6, [base+ipred_shuf] + movsldup m7, [base+ipred_shuf] +@@ -517,6 +534,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, + jg .w8_loop + RET + .w16: ++ _CET_ENDBR + pmovzxwd m5, [tlq+hq+2] + mova m6, [base+smooth_weights_2d_16bpc+16*4] + vpblendmw m5{k1}, m0, m5 ; top, bottom +@@ -541,6 +559,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, + jg .w16_loop + RET + .w32: ++ _CET_ENDBR + pmovzxwd m5, [tlq+hq+ 2] + pmovzxwd m6, [tlq+hq+34] + mova m7, [base+smooth_weights_2d_16bpc+32*4] +@@ -574,6 +593,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + pmovzxwd m5, [tlq+hq+ 2] + pmovzxwd m6, [tlq+hq+34] + pmovzxwd m7, [tlq+hq+66] +@@ -621,6 +641,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + pmovzxbw ym0, [idxq] + add idxq, 16 + vpermw ym0, ym0, ym3 +@@ -634,6 +655,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx + jg .w4 + RET + .w8: ++ _CET_ENDBR + pmovzxbw m0, [idxq] + add idxq, 32 + vpermw m0, m0, m3 +@@ -646,6 +668,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx + jg .w8 + RET + .w16: ++ _CET_ENDBR + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 +@@ -660,6 +683,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx + jg .w16 + RET + .w32: ++ _CET_ENDBR + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 +@@ -672,6 +696,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx + jg .w32 + RET + .w64: ++ _CET_ENDBR + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 Index: multimedia/dav1d/patches/patch-src_x86_ipred_avx512_asm =================================================================== RCS file: multimedia/dav1d/patches/patch-src_x86_ipred_avx512_asm diff -N multimedia/dav1d/patches/patch-src_x86_ipred_avx512_asm --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ multimedia/dav1d/patches/patch-src_x86_ipred_avx512_asm 18 Feb 2024 18:51:06 -0000 @@ -0,0 +1,374 @@ +Index: src/x86/ipred_avx512.asm +--- src/x86/ipred_avx512.asm.orig ++++ src/x86/ipred_avx512.asm +@@ -168,18 +168,23 @@ cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, + add wq, r5 + jmp r6 + .h64: ++ _CET_ENDBR + movu ym1, [tlq+32] ; unaligned when jumping here from dc_top + vpdpbusd ym0, ym1, ym2 + .h32: ++ _CET_ENDBR + vextracti32x4 xm1, ym0, 1 + paddd xm0, xm1 + .h16: ++ _CET_ENDBR + punpckhqdq xm1, xm0, xm0 + paddd xm0, 
xm1 + .h8: ++ _CET_ENDBR + psrlq xm1, xm0, 32 + paddd xm0, xm1 + .h4: ++ _CET_ENDBR + vpsrlvd xm0, xmm3 + lea stride3q, [strideq*3] + vpbroadcastb m0, xm0 +@@ -204,10 +209,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + lea stride3q, [strideq*3] + jmp r6 + .h4: ++ _CET_ENDBR + movd xmm1, [tlq-4] + vpdpbusd xm0, xmm1, xm3 + jmp wq + .w4: ++ _CET_ENDBR + movd xmm1, [tlq+1] + vpdpbusd xm0, xmm1, xm3 + cmp hd, 4 +@@ -228,6 +235,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + .w4_end: + vpbroadcastb xm0, xmm0 + .s4: ++ _CET_ENDBR + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm0 + movd [dstq+strideq*2], xm0 +@@ -237,10 +245,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + jg .s4 + RET + .h8: ++ _CET_ENDBR + movq xmm1, [tlq-8] + vpdpbusd xm0, xmm1, xm3 + jmp wq + .w8: ++ _CET_ENDBR + movq xmm1, [tlq+1] + vextracti32x4 xm2, ym0, 1 + vpdpbusd xm0, xmm1, xm3 +@@ -261,6 +271,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + .w8_end: + vpbroadcastb xm0, xmm0 + .s8: ++ _CET_ENDBR + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 +@@ -270,10 +281,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + jg .s8 + RET + .h16: ++ _CET_ENDBR + mova xmm1, [tlq-16] + vpdpbusd xm0, xmm1, xm3 + jmp wq + .w16: ++ _CET_ENDBR + movu xmm1, [tlq+1] + vextracti32x4 xm2, ym0, 1 + vpdpbusd xm0, xmm1, xm3 +@@ -294,6 +307,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + .w16_end: + vpbroadcastb xm0, xmm0 + .s16: ++ _CET_ENDBR + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 +@@ -303,10 +317,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + jg .s16 + RET + .h32: ++ _CET_ENDBR + mova ym1, [tlq-32] + vpdpbusd ym0, ym1, ym3 + jmp wq + .w32: ++ _CET_ENDBR + movu ym1, [tlq+1] + vpdpbusd ym0, ym1, ym3 + vextracti32x4 xm1, ym0, 1 +@@ -326,6 +342,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + .w32_end: + vpbroadcastb ym0, xmm0 + .s32: ++ _CET_ENDBR + mova [dstq+strideq*0], ym0 + mova [dstq+strideq*1], ym0 + mova [dstq+strideq*2], ym0 +@@ -335,12 +352,14 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + jg .s32 + RET + .h64: ++ _CET_ENDBR + mova ym1, [tlq-64] + mova ym2, [tlq-32] + vpdpbusd ym0, ym1, ym3 + vpdpbusd ym0, ym2, ym3 + jmp wq + .w64: ++ _CET_ENDBR + movu ym1, [tlq+ 1] + movu ym2, [tlq+33] + vpdpbusd ym0, ym1, ym3 +@@ -361,6 +380,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, + .w64_end: + vpbroadcastb m0, xmm0 + .s64: ++ _CET_ENDBR + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 +@@ -401,6 +421,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, + add wq, r6 + jmp wq + .w4: ++ _CET_ENDBR + mova xmm1, [base+ipred_h_shuf+16] + .w4_loop: + movd xmm0, [tlq+hq-4] +@@ -414,6 +435,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, + jg .w4_loop + RET + .w8: ++ _CET_ENDBR + movsldup xmm2, [base+ipred_h_shuf+16] + movshdup xmm3, [base+ipred_h_shuf+16] + .w8_loop: +@@ -429,6 +451,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, + jg .w8_loop + RET + .w16: ++ _CET_ENDBR + movsldup m1, [base+smooth_shuf] + .w16_loop: + vpbroadcastd m0, [tlq+hq-4] +@@ -442,6 +465,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, + jg .w16 + RET + .w32: ++ _CET_ENDBR + vpbroadcastd ym3, [base+pb_1] + vpord m2, m3, [base+pb_2] {1to16} + .w32_loop: +@@ -457,6 +481,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + vpbroadcastd 
m4, [base+pb_3] + vpbroadcastd m5, [base+pb_2] + vpbroadcastd m6, [base+pb_1] +@@ -509,6 +534,7 @@ cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w + jmp wq + INIT_YMM avx512icl + .w4: ++ _CET_ENDBR + vpbroadcastd m6, [topq] + mova m9, [ipred_h_shuf] + psubusb m7, m5, m6 +@@ -536,6 +562,7 @@ INIT_YMM avx512icl + RET + INIT_ZMM avx512icl + .w8: ++ _CET_ENDBR + vpbroadcastq m6, [topq] + movsldup m9, [smooth_shuf] + psubusb m7, m5, m6 +@@ -564,6 +591,7 @@ INIT_ZMM avx512icl + .w8_ret: + RET + .w16: ++ _CET_ENDBR + vbroadcasti32x4 m6, [topq] + movsldup m9, [smooth_shuf] + psubusb m7, m5, m6 +@@ -582,6 +610,7 @@ INIT_ZMM avx512icl + jg .w16_loop + RET + .w32: ++ _CET_ENDBR + vbroadcasti32x8 m6, [topq] + mova ym9, ym8 + psubusb m7, m5, m6 +@@ -598,6 +627,7 @@ INIT_ZMM avx512icl + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + movu m6, [topq] + psubusb m7, m5, m6 + psubusb m0, m6, m5 +@@ -626,6 +656,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + vpbroadcastd m2, [tlq+1] + movshdup m5, [smooth_shuf] + mova ym6, [smooth_endA] +@@ -656,6 +687,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, + .ret: + RET + .w8: ++ _CET_ENDBR + vpbroadcastq m2, [tlq+1] + movshdup m5, [smooth_shuf] + mova ym6, [smooth_endA] +@@ -679,6 +711,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, + jl .w8_loop + RET + .w16: ++ _CET_ENDBR + vbroadcasti32x4 m3, [tlq+1] + movshdup m6, [smooth_shuf] + mova m7, [smooth_endB] +@@ -707,6 +740,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, + jl .w16_loop + RET + .w32: ++ _CET_ENDBR + vbroadcasti32x8 m3, [tlq+1] + movshdup m6, [smooth_shuf] + mova m7, [smooth_endB] +@@ -733,6 +767,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, + jl .w32_loop + RET + .w64: ++ _CET_ENDBR + movu m3, [tlq+1] + mova m6, [smooth_endB] + punpcklbw m2, m3, m4 +@@ -772,6 +807,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + movsldup m3, [smooth_shuf] + vpbroadcastq m7, [smooth_weights+4*2] + mova ym8, [smooth_endA] +@@ -802,6 +838,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl + .ret: + RET + .w8: ++ _CET_ENDBR + movsldup m3, [smooth_shuf] + vbroadcasti32x4 m7, [smooth_weights+8*2] + mova ym8, [smooth_endA] +@@ -825,6 +862,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl + jg .w8_loop + RET + .w16: ++ _CET_ENDBR + movsldup m7, [smooth_shuf] + vbroadcasti32x4 m8, [smooth_weights+16*2] + vbroadcasti32x4 m9, [smooth_weights+16*3] +@@ -850,6 +888,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl + jg .w16_loop + RET + .w32: ++ _CET_ENDBR + mova m10, [smooth_endA] + vpbroadcastd ym7, [pb_1] + vbroadcasti32x8 m8, [smooth_weights+32*2] +@@ -874,6 +913,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + mova m7, [smooth_weights+64*2] + mova m8, [smooth_weights+64*3] + mova m9, [smooth_endA] +@@ -912,6 +952,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + vpbroadcastd m8, [tlq+hq+1] + movsldup m4, [smooth_shuf] + movshdup m5, [smooth_shuf] +@@ -954,6 +995,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, + .ret: + RET + .w8: ++ _CET_ENDBR + vpbroadcastq m8, [tlq+hq+1] + movsldup m4, [smooth_shuf] + movshdup m5, [smooth_shuf] +@@ -988,6 +1030,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, + jg .w8_loop + RET + .w16: ++ _CET_ENDBR + vbroadcasti32x4 m9, 
[tlq+hq+1] + movsldup m5, [smooth_shuf] + movshdup m10, [smooth_shuf] +@@ -1031,6 +1074,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, + jg .w16_loop + RET + .w32: ++ _CET_ENDBR + vbroadcasti32x8 m9, [tlq+hq+1] + movshdup m10, [smooth_shuf] + mova m12, [smooth_weights+32*2] +@@ -1073,6 +1117,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + movu m9, [tlq+hq+1] + mova m11, [smooth_weights+64*2] + mova m2, [smooth_weights+64*3] +@@ -1122,6 +1167,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + pshufb xmm0, xm4, [idxq] + add idxq, 16 + movd [dstq+strideq*0], xmm0 +@@ -1133,6 +1179,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, + jg .w4 + RET + .w8: ++ _CET_ENDBR + pshufb xmm0, xm4, [idxq+16*0] + pshufb xmm1, xm4, [idxq+16*1] + add idxq, 16*2 +@@ -1145,6 +1192,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, + jg .w8 + RET + .w16: ++ _CET_ENDBR + pshufb m0, m4, [idxq] + add idxq, 64 + mova [dstq+strideq*0], xm0 +@@ -1156,6 +1204,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, + jg .w16 + RET + .w32: ++ _CET_ENDBR + pshufb m0, m4, [idxq+64*0] + pshufb m1, m4, [idxq+64*1] + add idxq, 64*2 +@@ -1168,6 +1217,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, + jg .w32 + RET + .w64: ++ _CET_ENDBR + pshufb m0, m4, [idxq+64*0] + pshufb m1, m4, [idxq+64*1] + pshufb m2, m4, [idxq+64*2] Index: multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm =================================================================== RCS file: /cvs/ports/multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm,v retrieving revision 1.1 diff -u -p -r1.1 patch-src_x86_itx_avx512_asm --- multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm 13 Jul 2023 12:36:36 -0000 1.1 +++ multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm 18 Feb 2024 18:51:06 -0000 @@ -49,7 +49,15 @@ Index: src/x86/itx_avx512.asm vextracti32x4 xm2, m0, 1 vextracti32x4 xm3, m1, 1 pshufd xm4, xm0, q1032 -@@ -818,6 +824,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, str +@@ -787,6 +793,7 @@ cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, str + punpckhwd m1, m3 + jmp tx2q + .pass2: ++ _CET_ENDBR + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + pshufd xm4, xm0, q1032 +@@ -818,6 +825,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, str vextracti32x8 ym1, m0, 1 jmp tx2q .pass2: @@ -57,7 +65,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd ym4, [o(pw_4096)] jmp m(iadst_4x8_internal_8bpc).end2 -@@ -935,6 +942,7 @@ cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, +@@ -935,6 +943,7 @@ cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, pmulhrsw m1, m4 jmp tx2q .pass2: @@ -65,7 +73,7 @@ Index: src/x86/itx_avx512.asm vextracti32x4 xm2, ym0, 1 vextracti32x4 xm3, ym1, 1 vextracti32x4 xm4, m0, 2 -@@ -975,6 +983,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride +@@ -975,6 +984,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride punpcklwd m0, m2 jmp tx2q .pass2: @@ -73,7 +81,7 @@ Index: src/x86/itx_avx512.asm call .main vpbroadcastd m5, [o(pw_2048)] psrlq m10, 4 -@@ -1082,6 +1091,7 @@ cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, st +@@ -1082,6 +1092,7 @@ cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, st punpckhwd m1, m2 jmp tx2q .pass2: @@ -81,7 +89,7 @@ Index: src/x86/itx_avx512.asm call m(iadst_4x16_internal_8bpc).main vpbroadcastd m6, [o(pw_2048)] psrlq m10, 12 -@@ -1109,6 +1119,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, 
dst, st +@@ -1109,6 +1120,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, st punpckhdq m1, m2 jmp tx2q .pass2: @@ -89,7 +97,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m3, [o(pw_1697x16)] vpbroadcastd m5, [o(pw_2048)] pmulhrsw m2, m3, m0 -@@ -1181,6 +1192,7 @@ cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, +@@ -1181,6 +1193,7 @@ cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, pshufb m1, m4 jmp tx2q .pass2: @@ -97,7 +105,7 @@ Index: src/x86/itx_avx512.asm IDCT4_1D_PACKED vpermq m0, m0, q3120 vpermq m1, m1, q2031 -@@ -1210,6 +1222,7 @@ cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, +@@ -1210,6 +1223,7 @@ cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, punpcklwd m0, m3 jmp tx2q .pass2: @@ -105,7 +113,7 @@ Index: src/x86/itx_avx512.asm call .main .end: vpermq m0, m0, q3120 -@@ -1253,6 +1266,7 @@ cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, str +@@ -1253,6 +1267,7 @@ cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, str punpcklwd m0, m3 jmp tx2q .pass2: @@ -113,7 +121,7 @@ Index: src/x86/itx_avx512.asm call m(iadst_8x4_internal_8bpc).main mova m2, m1 vpermq m1, m0, q2031 -@@ -1280,6 +1294,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, str +@@ -1280,6 +1295,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, str paddsw m1, m1 jmp tx2q .pass2: @@ -121,7 +129,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 -@@ -1349,6 +1364,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, +@@ -1349,6 +1365,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, vshufi32x4 m3, m5, m3, 0x03 jmp tx2q .pass2: @@ -129,7 +137,7 @@ Index: src/x86/itx_avx512.asm call .main vpbroadcastd m4, [o(pw_2048)] vpermq m0, m0, q3120 -@@ -1388,6 +1404,7 @@ cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, +@@ -1388,6 +1405,7 @@ cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, vinserti32x4 m1, m4, xm1, 1 jmp tx2q .pass2: @@ -137,7 +145,7 @@ Index: src/x86/itx_avx512.asm pshufd m4, m0, q1032 pshufd m5, m1, q1032 call .main_pass2 -@@ -1455,6 +1472,7 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, str +@@ -1455,6 +1473,7 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, str vshufi32x4 m2, m4, m2, 0x03 jmp tx2q .pass2: @@ -145,7 +153,7 @@ Index: src/x86/itx_avx512.asm pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8bpc).main_pass2 -@@ -1493,6 +1511,7 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, str +@@ -1493,6 +1512,7 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, str punpckhdq m3, m4 jmp tx2q .pass2: @@ -153,7 +161,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_8x8_internal_8bpc).end -@@ -1553,6 +1572,7 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, +@@ -1553,6 +1573,7 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, punpckhdq m3, m4 ; 3 7 11 15 jmp tx2q .pass2: @@ -161,7 +169,7 @@ Index: src/x86/itx_avx512.asm vprord m5, [o(int16_perm)], 16 vshufi32x4 m2, m2, q1320 ; 2 10 14 6 vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11 -@@ -1686,6 +1706,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride +@@ -1686,6 +1707,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride punpckhqdq m3, m5 jmp tx2q .pass2: @@ -169,7 +177,7 @@ Index: src/x86/itx_avx512.asm call .main_pass2 vpbroadcastd m6, [o(pw_2048)] psrlq m10, 4 -@@ -1794,6 +1815,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, st +@@ -1794,6 +1816,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, st pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 
e3 f3 jmp m(iadst_8x16_internal_8bpc).pass1_end .pass2: @@ -177,7 +185,7 @@ Index: src/x86/itx_avx512.asm call m(iadst_8x16_internal_8bpc).main_pass2 vpbroadcastd m7, [o(pw_2048)] psrlq m10, 36 -@@ -1823,6 +1845,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, st +@@ -1823,6 +1846,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, st punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3 jmp tx2q .pass2: @@ -185,7 +193,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m7, [o(pw_1697x16)] mova ym8, [o(gather8b)] lea r3, [dstq+strideq*2] -@@ -1897,6 +1920,7 @@ cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, +@@ -1897,6 +1921,7 @@ cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, punpcklwd m0, m2 jmp tx2q .pass2: @@ -193,7 +201,7 @@ Index: src/x86/itx_avx512.asm IDCT4_1D_PACKED mova m2, [o(permA)] jmp m(iadst_16x4_internal_8bpc).end -@@ -1936,6 +1960,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride +@@ -1936,6 +1961,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride pmulhrsw m1, m6 jmp tx2q .pass2: @@ -201,7 +209,7 @@ Index: src/x86/itx_avx512.asm call .main movu m2, [o(permA+1)] .end: -@@ -1986,6 +2011,7 @@ cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, st +@@ -1986,6 +2012,7 @@ cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, st psrlq m10, 16 jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: @@ -209,7 +217,7 @@ Index: src/x86/itx_avx512.asm call m(iadst_16x4_internal_8bpc).main movu m2, [o(permA+2)] jmp m(iadst_16x4_internal_8bpc).end -@@ -2013,6 +2039,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, st +@@ -2013,6 +2040,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, st vpermb m1, m5, m1 jmp tx2q .pass2: @@ -217,7 +225,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 -@@ -2112,6 +2139,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, +@@ -2112,6 +2140,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3 jmp tx2q .pass2: @@ -225,7 +233,7 @@ Index: src/x86/itx_avx512.asm vshufi32x4 m0, m2, m4, q2020 ; 0 1 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 ; 2 3 -@@ -2211,6 +2239,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride +@@ -2211,6 +2240,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride REPX {pmulhrsw x, m7}, m2, m3, m4, m5 jmp tx2q .pass2: @@ -233,7 +241,7 @@ Index: src/x86/itx_avx512.asm vshufi32x4 m0, m2, m4, q2020 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 -@@ -2265,6 +2294,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, st +@@ -2265,6 +2295,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, st psrlq m10, 20 jmp m(iadst_16x8_internal_8bpc).pass1_end .pass2: @@ -241,7 +249,7 @@ Index: src/x86/itx_avx512.asm vshufi32x4 m0, m2, m4, q2020 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 -@@ -2314,6 +2344,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, st +@@ -2314,6 +2345,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, st REPX {vpermb x, m9, x}, m2, m3, m4, m5 jmp tx2q .pass2: @@ -249,7 +257,7 @@ Index: src/x86/itx_avx512.asm mova m7, [o(permB)] vpbroadcastd m6, [o(pw_4096)] vpermq m0, m7, m2 -@@ -2373,6 +2404,7 @@ cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride +@@ -2373,6 +2405,7 @@ cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride punpckldq m6, m11 jmp tx2q .pass2: @@ -257,7 +265,7 @@ Index: src/x86/itx_avx512.asm vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 vshufi32x4 m6, 
m0, m2, q3232 ; a8 ac e8 ec -@@ -2538,6 +2570,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, strid +@@ -2538,6 +2571,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, strid REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: @@ -265,7 +273,7 @@ Index: src/x86/itx_avx512.asm call .main_pass2 mova m10, [o(permD)] psrlq m8, m10, 8 -@@ -2720,6 +2753,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, s +@@ -2720,6 +2754,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, s punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 jmp m(iadst_16x16_internal_8bpc).pass1_end .pass2: @@ -273,7 +281,7 @@ Index: src/x86/itx_avx512.asm call m(iadst_16x16_internal_8bpc).main_pass2 mova m10, [o(permD)] psrlq m8, m10, 8 -@@ -2789,6 +2823,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, s +@@ -2789,6 +2824,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, s jmp tx2q ALIGN function_align .pass2: @@ -281,7 +289,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m11, [o(pw_1697x16)] pmulhrsw m12, m11, m0 pmulhrsw m13, m11, m1 -@@ -3131,6 +3166,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, +@@ -3131,6 +3167,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, call m(idct_8x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast .pass2: @@ -289,7 +297,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m10, [o(pw_8192)] vpermt2q m0, m15, m4 ; t0 t1 t9 t8 vpermt2q m20, m15, m18 ; t31 t30a t23a t22 -@@ -3586,6 +3622,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst +@@ -3586,6 +3623,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst punpckhwd m17, m17 call .main_oddhalf_fast .pass2: @@ -297,7 +305,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m10, [o(pw_2048)] mova m11, [o(end_16x32p)] lea r3, [strideq*3] -@@ -3798,6 +3835,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst +@@ -3798,6 +3836,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst punpckhwd m17, m17 ; 15 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast .pass2: @@ -305,7 +313,7 @@ Index: src/x86/itx_avx512.asm vpbroadcastd m9, [o(pw_16384)] call .transpose_round vshufi32x4 m16, m14, m2, q3131 ; 5 -@@ -5683,6 +5721,7 @@ ALIGN function_align +@@ -5683,6 +5722,7 @@ ALIGN function_align vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31 ret .pass2: Index: multimedia/dav1d/patches/patch-src_x86_mc16_avx512_asm =================================================================== RCS file: multimedia/dav1d/patches/patch-src_x86_mc16_avx512_asm diff -N multimedia/dav1d/patches/patch-src_x86_mc16_avx512_asm --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ multimedia/dav1d/patches/patch-src_x86_mc16_avx512_asm 18 Feb 2024 18:51:06 -0000 @@ -0,0 +1,867 @@ +Index: src/x86/mc16_avx512.asm +--- src/x86/mc16_avx512.asm.orig ++++ src/x86/mc16_avx512.asm +@@ -276,6 +276,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + add t0, r7 + jmp t0 + .put_w2: ++ _CET_ENDBR + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +@@ -286,6 +287,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .put_w2 + RET + .put_w4: ++ _CET_ENDBR + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +@@ -296,6 +298,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .put_w4 + RET + .put_w8: ++ _CET_ENDBR + movu xmm0, [srcq+ssq*0] + movu xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +@@ -306,6 +309,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .put_w8 + RET + .put_w16: ++ _CET_ENDBR + movu ym0, 
[srcq+ssq*0] + movu ym1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +@@ -316,6 +320,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .put_w16 + RET + .put_w32: ++ _CET_ENDBR + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +@@ -326,6 +331,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .put_w32 + RET + .put_w64: ++ _CET_ENDBR + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*1+64*0] +@@ -340,6 +346,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .put_w64 + RET + .put_w128: ++ _CET_ENDBR + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] + movu m2, [srcq+64*2] +@@ -368,6 +375,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4] + jmp t0 + .h_w2: ++ _CET_ENDBR + movq xmm1, [srcq+ssq*0] + movhps xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +@@ -384,6 +392,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .h_w2 + RET + .h_w4: ++ _CET_ENDBR + movq xmm0, [srcq+ssq*0+0] + movhps xmm0, [srcq+ssq*1+0] + movq xmm1, [srcq+ssq*0+2] +@@ -401,6 +410,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .h_w4 + RET + .h_w8: ++ _CET_ENDBR + movu xm0, [srcq+ssq*0+0] + vinserti32x4 ym0, [srcq+ssq*1+0], 1 + movu xm1, [srcq+ssq*0+2] +@@ -418,6 +428,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .h_w8 + RET + .h_w16: ++ _CET_ENDBR + movu ym0, [srcq+ssq*0+0] + vinserti32x8 m0, [srcq+ssq*1+0], 1 + movu ym1, [srcq+ssq*0+2] +@@ -435,6 +446,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .h_w16 + RET + .h_w32: ++ _CET_ENDBR + pmullw m0, m4, [srcq+ssq*0+0] + pmullw m2, m5, [srcq+ssq*0+2] + pmullw m1, m4, [srcq+ssq*1+0] +@@ -453,6 +465,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .h_w32 + RET + .h_w64: ++ _CET_ENDBR + pmullw m0, m4, [srcq+64*0+0] + pmullw m2, m5, [srcq+64*0+2] + pmullw m1, m4, [srcq+64*1+0] +@@ -471,6 +484,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .h_w64 + RET + .h_w128: ++ _CET_ENDBR + pmullw m0, m4, [srcq+64*0+0] + pmullw m7, m5, [srcq+64*0+2] + pmullw m1, m4, [srcq+64*1+0] +@@ -501,6 +515,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + add t0, r7 + jmp t0 + .v_w2: ++ _CET_ENDBR + movd xmm0, [srcq+ssq*0] + .v_w2_loop: + movd xmm1, [srcq+ssq*1] +@@ -518,6 +533,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .v_w2_loop + RET + .v_w4: ++ _CET_ENDBR + movq xmm0, [srcq+ssq*0] + .v_w4_loop: + movq xmm1, [srcq+ssq*1] +@@ -535,6 +551,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .v_w4_loop + RET + .v_w8: ++ _CET_ENDBR + movu xmm0, [srcq+ssq*0] + .v_w8_loop: + vbroadcasti128 ymm1, [srcq+ssq*1] +@@ -553,6 +570,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + vzeroupper + RET + .v_w16: ++ _CET_ENDBR + movu ym0, [srcq+ssq*0] + .v_w16_loop: + movu ym3, [srcq+ssq*1] +@@ -571,6 +589,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .v_w16_loop + RET + .v_w32: ++ _CET_ENDBR + movu m0, [srcq+ssq*0] + .v_w32_loop: + movu m3, [srcq+ssq*1] +@@ -589,6 +608,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .v_w32_loop + RET + .v_w64: ++ _CET_ENDBR + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + .v_w64_loop: +@@ -618,6 +638,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .v_w64_loop + RET + .v_w128: ++ _CET_ENDBR + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*0+64*2] +@@ -683,6 +704,7 @@ cglobal put_bilin_16bpc, 
4, 8, 13, dst, ds, src, ss, w + .hv_12bpc: + jmp t0 + .hv_w2: ++ _CET_ENDBR + vpbroadcastq xmm1, [srcq+ssq*0] + pmullw xmm0, xmm1, xm4 + psrlq xmm1, 16 +@@ -714,6 +736,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .hv_w2_loop + RET + .hv_w4: ++ _CET_ENDBR + pmullw xmm0, xm4, [srcq+ssq*0-8] + pmullw xmm1, xm5, [srcq+ssq*0-6] + paddw xmm0, xm6 +@@ -744,6 +767,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .hv_w4_loop + RET + .hv_w8: ++ _CET_ENDBR + pmullw xmm0, xm4, [srcq+ssq*0+0] + pmullw xmm1, xm5, [srcq+ssq*0+2] + paddw xmm0, xm6 +@@ -775,6 +799,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + jg .hv_w8_loop + RET + .hv_w16: ++ _CET_ENDBR + pmullw ym0, ym4, [srcq+ssq*0+0] + pmullw ym1, ym5, [srcq+ssq*0+2] + paddw ym0, ym6 +@@ -808,6 +833,7 @@ cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w + .hv_w32: + .hv_w64: + .hv_w128: ++ _CET_ENDBR + movifnidn wd, wm + lea r6d, [hq+wq*8-256] + mov r4, srcq +@@ -874,6 +900,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + lea stride3q, [strideq*3] + jmp wq + .prep_w4: ++ _CET_ENDBR + movq xmm0, [srcq+strideq*0] + movhps xmm0, [srcq+strideq*1] + vpbroadcastq ymm1, [srcq+strideq*2] +@@ -890,6 +917,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + vzeroupper + RET + .prep_w8: ++ _CET_ENDBR + movu xm0, [srcq+strideq*0] + vinserti32x4 ym0, [srcq+strideq*1], 1 + vinserti32x4 m0, [srcq+strideq*2], 2 +@@ -903,6 +931,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .prep_w8 + RET + .prep_w16: ++ _CET_ENDBR + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + movu ym1, [srcq+strideq*2] +@@ -919,6 +948,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .prep_w16 + RET + .prep_w32: ++ _CET_ENDBR + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m4, [srcq+strideq*2] +@@ -934,6 +964,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .prep_w32 + RET + .prep_w64: ++ _CET_ENDBR + pmullw m0, m4, [srcq+strideq*0+64*0] + pmullw m1, m4, [srcq+strideq*0+64*1] + pmullw m2, m4, [srcq+strideq*1+64*0] +@@ -949,6 +980,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .prep_w64 + RET + .prep_w128: ++ _CET_ENDBR + pmullw m0, m4, [srcq+64*0] + pmullw m1, m4, [srcq+64*1] + pmullw m2, m4, [srcq+64*2] +@@ -981,6 +1013,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + lea stride3q, [strideq*3] + jmp wq + .h_w4: ++ _CET_ENDBR + movu xm1, [srcq+strideq*0] + vinserti32x4 ym1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] +@@ -1001,6 +1034,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .h_w4 + RET + .h_w8: ++ _CET_ENDBR + movu xm0, [srcq+strideq*0+0] + movu xm1, [srcq+strideq*0+2] + vinserti32x4 ym0, [srcq+strideq*1+0], 1 +@@ -1021,6 +1055,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .h_w8 + RET + .h_w16: ++ _CET_ENDBR + movu ym0, [srcq+strideq*0+0] + vinserti32x8 m0, [srcq+strideq*1+0], 1 + movu ym1, [srcq+strideq*0+2] +@@ -1037,6 +1072,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .h_w16 + RET + .h_w32: ++ _CET_ENDBR + pmullw m0, m4, [srcq+strideq*0+0] + pmullw m2, m5, [srcq+strideq*0+2] + pmullw m1, m4, [srcq+strideq*1+0] +@@ -1055,6 +1091,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .h_w32 + RET + .h_w64: ++ _CET_ENDBR + pmullw m0, m4, [srcq+ 0] + pmullw m2, m5, [srcq+ 2] + pmullw m1, m4, [srcq+64] +@@ -1073,6 +1110,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .h_w64 + RET + .h_w128: ++ _CET_ENDBR + 
pmullw m0, m4, [srcq+ 0] + pmullw m7, m5, [srcq+ 2] + pmullw m1, m4, [srcq+ 64] +@@ -1111,6 +1149,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + .v_12bpc: + jmp wq + .v_w4: ++ _CET_ENDBR + movq xmm0, [srcq+strideq*0] + .v_w4_loop: + vpbroadcastq xmm2, [srcq+strideq*1] +@@ -1134,6 +1173,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + vzeroupper + RET + .v_w8: ++ _CET_ENDBR + movu xm0, [srcq+strideq*0] + .v_w8_loop: + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 +@@ -1153,6 +1193,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .v_w8_loop + RET + .v_w16: ++ _CET_ENDBR + movu ym0, [srcq+strideq*0] + .v_w16_loop: + vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 +@@ -1179,6 +1220,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .v_w16_loop + RET + .v_w32: ++ _CET_ENDBR + movu m0, [srcq+strideq*0] + .v_w32_loop: + movu m3, [srcq+strideq*1] +@@ -1201,6 +1243,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .v_w32_loop + RET + .v_w64: ++ _CET_ENDBR + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] + .v_w64_loop: +@@ -1224,6 +1267,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .v_w64_loop + RET + .v_w128: ++ _CET_ENDBR + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] + movu m2, [srcq+64*2] +@@ -1264,6 +1308,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + lea stride3q, [strideq*3] + jmp wq + .hv_w4: ++ _CET_ENDBR + movq xmm0, [srcq+strideq*0+0] + movq xmm1, [srcq+strideq*0+2] + pmullw xmm0, xm4 +@@ -1298,6 +1343,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .hv_w4_loop + RET + .hv_w8: ++ _CET_ENDBR + pmullw xm0, xm4, [srcq+strideq*0+0] + pmullw xm1, xm5, [srcq+strideq*0+2] + psubw xm0, xm6 +@@ -1330,6 +1376,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .hv_w8_loop + RET + .hv_w16: ++ _CET_ENDBR + pmullw ym0, ym4, [srcq+strideq*0+0] + pmullw ym1, ym5, [srcq+strideq*0+2] + psubw ym0, ym6 +@@ -1358,6 +1405,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .hv_w16_loop + RET + .hv_w32: ++ _CET_ENDBR + pmullw m0, m4, [srcq+strideq*0+0] + pmullw m1, m5, [srcq+strideq*0+2] + psubw m0, m6 +@@ -1388,6 +1436,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .hv_w32_loop + RET + .hv_w64: ++ _CET_ENDBR + pmullw m0, m4, [srcq+ 0] + pmullw m2, m5, [srcq+ 2] + pmullw m1, m4, [srcq+64] +@@ -1425,6 +1474,7 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, + jg .hv_w64_loop + RET + .hv_w128: ++ _CET_ENDBR + pmullw m0, m4, [srcq+ 0] + pmullw m8, m5, [srcq+ 2] + pmullw m1, m4, [srcq+ 64] +@@ -1534,6 +1584,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + %endif + jmp wq + .h_w2: ++ _CET_ENDBR + movzx mxd, mxb + sub srcq, 2 + mova ym2, [spel_h_shuf2a] +@@ -1559,6 +1610,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + jg .h_w2_loop + RET + .h_w4: ++ _CET_ENDBR + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] +@@ -1608,6 +1660,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + je .h_w16 + jg .h_w32 + .h_w8: ++ _CET_ENDBR + mova m4, [spel_h_shufA] + movu m5, [spel_h_shufB] + movu m6, [spel_h_shufC] +@@ -1636,6 +1689,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + jg .h_w8_loop + RET + .h_w16: ++ _CET_ENDBR + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] + .h_w16_loop: +@@ -1672,6 +1726,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + jg .h_w16_loop + RET + .h_w32: ++ _CET_ENDBR + lea srcq, [srcq+wq*2] + vbroadcasti32x4 m6, [spel_h_shufA] + lea 
dstq, [dstq+wq*2] +@@ -1731,6 +1786,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + vpbroadcastd m15, [rsp+stack_offset+20] + jmp r7 + .v_w2: ++ _CET_ENDBR + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + pinsrd xmm2, [srcq+ssq*2], 2 +@@ -1770,6 +1826,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + jg .v_w2_loop + RET + .v_w4: ++ _CET_ENDBR + movq xmm1, [srcq+ssq*0] + vpbroadcastq ymm0, [srcq+ssq*1] + vpbroadcastq ymm2, [srcq+ssq*2] +@@ -1814,6 +1871,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + vzeroupper + RET + .v_w8: ++ _CET_ENDBR + vbroadcasti32x4 m2, [srcq+ssq*2] + vinserti32x4 m1, m2, [srcq+ssq*0], 0 + vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 +@@ -1852,6 +1910,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + jg .v_w8_loop + RET + .v_w16: ++ _CET_ENDBR + vbroadcasti32x8 m1, [srcq+ssq*1] + vinserti32x8 m0, m1, [srcq+ssq*0], 0 + vinserti32x8 m1, [srcq+ssq*2], 1 +@@ -1904,6 +1963,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, + .v_w32: + .v_w64: + .v_w128: ++ _CET_ENDBR + %if WIN64 + movaps [rsp+stack_offset+8], xmm6 + %endif +@@ -2595,6 +2655,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w + %endif + jmp wq + .h_w4: ++ _CET_ENDBR + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] +@@ -2646,6 +2707,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w + je .h_w16 + jg .h_w32 + .h_w8: ++ _CET_ENDBR + mova m6, [spel_h_shufA] + movu m7, [spel_h_shufB] + movu m8, [spel_h_shufC] +@@ -2682,6 +2744,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w + jg .h_w8_loop + RET + .h_w16: ++ _CET_ENDBR + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] + mova m11, [prep_endC] +@@ -2715,6 +2778,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w + jg .h_w16_loop + RET + .h_w32: ++ _CET_ENDBR + vbroadcasti32x4 m6, [spel_h_shufA] + lea srcq, [srcq+wq*2] + vbroadcasti32x4 m7, [spel_h_shufB] +@@ -2773,6 +2837,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w + vpbroadcastd m15, [tmpq+12] + jmp r7 + .v_w4: ++ _CET_ENDBR + movq xmm1, [srcq+strideq*0] + vpbroadcastq ymm0, [srcq+strideq*1] + vpbroadcastq ymm2, [srcq+strideq*2] +@@ -2814,6 +2879,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w + vzeroupper + RET + .v_w8: ++ _CET_ENDBR + vbroadcasti32x4 m2, [srcq+strideq*2] + vinserti32x4 m1, m2, [srcq+strideq*0], 0 + vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2 +@@ -2849,6 +2915,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w + jg .v_w8_loop + RET + .v_w16: ++ _CET_ENDBR + vbroadcasti32x8 m1, [srcq+strideq*1] + vinserti32x8 m0, m1, [srcq+strideq*0], 0 + vinserti32x8 m1, [srcq+strideq*2], 1 +@@ -2896,6 +2963,7 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w + .v_w32: + .v_w64: + .v_w128: ++ _CET_ENDBR + %if WIN64 + PUSH r8 + movaps [rsp+stack_offset+8], xmm6 +@@ -3613,6 +3681,7 @@ ALIGN function_align + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 +@@ -3647,6 +3716,7 @@ ALIGN function_align + call .main + lea dstq, [dstq+strideq*4] + .w8: ++ _CET_ENDBR + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 +@@ -3665,6 +3735,7 @@ ALIGN function_align + call .main + lea dstq, [dstq+strideq*4] + .w16: ++ _CET_ENDBR + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 +@@ -3676,6 +3747,7 @@ ALIGN function_align + call .main + 
lea dstq, [dstq+strideq*2] + .w32: ++ _CET_ENDBR + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 +@@ -3685,6 +3757,7 @@ ALIGN function_align + call .main + add dstq, strideq + .w64: ++ _CET_ENDBR + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd +@@ -3694,6 +3767,7 @@ ALIGN function_align + call .main + add dstq, strideq + .w128: ++ _CET_ENDBR + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main +@@ -3853,6 +3927,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + mova m4, [w_mask_shuf4] + vpermt2b m2, m4, m3 + mova m3, m14 +@@ -3890,6 +3965,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, + .w4_end: + RET + .w8: ++ _CET_ENDBR + mova m8, [w_mask_shuf8] + vpbroadcastd m9, [pb_64] + jmp .w8_start +@@ -3918,6 +3994,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, + .w8_end: + RET + .w16: ++ _CET_ENDBR + mova m8, [w_mask_shuf16] + vpbroadcastd m9, [pb_64] + jmp .w16_start +@@ -3943,6 +4020,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, + lea dstq, [dstq+strideq*4] + add maskq, 32 + .w32: ++ _CET_ENDBR + paddw m2, m3 + mova m8, m14 + vpdpwssd m8, m11, m2 +@@ -3964,6 +4042,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, + lea dstq, [dstq+strideq*2] + add maskq, 32 + .w64: ++ _CET_ENDBR + mova m8, m2 + mova m9, m3 + mova [dstq+strideq*0+64*0], m0 +@@ -3987,6 +4066,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, + lea dstq, [dstq+strideq*2] + add maskq, 64 + .w128: ++ _CET_ENDBR + mova m16, m2 + mova m8, m3 + mova [dstq+strideq*0+64*0], m0 +@@ -4088,6 +4168,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 +@@ -4122,6 +4203,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, + call .main + lea dstq, [dstq+strideq*4] + .w8: ++ _CET_ENDBR + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 +@@ -4140,6 +4222,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, + call .main + lea dstq, [dstq+strideq*4] + .w16: ++ _CET_ENDBR + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 +@@ -4151,6 +4234,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, + call .main + lea dstq, [dstq+strideq*2] + .w32: ++ _CET_ENDBR + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 +@@ -4160,6 +4244,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, + call .main + add dstq, strideq + .w64: ++ _CET_ENDBR + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd +@@ -4169,6 +4254,7 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, + call .main + add dstq, strideq + .w128: ++ _CET_ENDBR + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main +@@ -4247,6 +4333,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, + lea stride3q, [strideq*3] + jmp wq + .w4: ++ _CET_ENDBR + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 +@@ -4281,6 +4368,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, + call .main + lea dstq, [dstq+strideq*4] + .w8: ++ _CET_ENDBR + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 +@@ -4299,6 +4387,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, + call .main + lea dstq, [dstq+strideq*4] + .w16: ++ _CET_ENDBR + mova 
[dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 +@@ -4310,6 +4399,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, + call .main + lea dstq, [dstq+strideq*2] + .w32: ++ _CET_ENDBR + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 +@@ -4319,6 +4409,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, + call .main + add dstq, strideq + .w64: ++ _CET_ENDBR + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd +@@ -4328,6 +4419,7 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, + call .main + add dstq, strideq + .w128: ++ _CET_ENDBR + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main +@@ -4395,6 +4487,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask + lea r6, [dsq*3] + jmp wq + .w4: ++ _CET_ENDBR + pmovzxbw ym19, [maskq] + movq xm16, [dstq+dsq*0] + movhps xm16, [dstq+dsq*1] +@@ -4419,6 +4512,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask + vzeroupper + RET + .w8: ++ _CET_ENDBR + pmovzxbw m2, [maskq] + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 +@@ -4439,6 +4533,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask + jg .w8 + RET + .w16: ++ _CET_ENDBR + pmovzxbw m4, [maskq+32*0] + pmovzxbw m5, [maskq+32*1] + mova ym0, [dstq+dsq*0] +@@ -4464,6 +4559,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask + jg .w16 + RET + .w32: ++ _CET_ENDBR + pmovzxbw m4, [maskq+32*0] + pmovzxbw m5, [maskq+32*1] + mova m0, [dstq+dsq*0] +@@ -4493,6 +4589,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h + add wq, r5 + jmp wq + .w2: ++ _CET_ENDBR + vpbroadcastd xmm2, [obmc_masks_avx2+2*2] + .w2_loop: + movd xmm0, [dstq+dsq*0] +@@ -4509,6 +4606,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h + jg .w2_loop + RET + .w4: ++ _CET_ENDBR + vpbroadcastq xmm2, [obmc_masks_avx2+4*2] + .w4_loop: + movq xmm0, [dstq+dsq*0] +@@ -4524,6 +4622,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h + jg .w4_loop + RET + .w8: ++ _CET_ENDBR + vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] + .w8_loop: + mova xm0, [dstq+dsq*0] +@@ -4539,6 +4638,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h + jg .w8_loop + RET + .w16: ++ _CET_ENDBR + vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] + .w16_loop: + mova ym0, [dstq+dsq*0] +@@ -4554,6 +4654,7 @@ cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h + jg .w16_loop + RET + .w32: ++ _CET_ENDBR + mova m4, [obmc_masks_avx2+32*2] + .w32_loop: + mova m0, [dstq+dsq*0] +@@ -4586,6 +4687,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma + neg hq + jmp wq + .w2: ++ _CET_ENDBR + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movd xmm2, [maskq+hq*2] +@@ -4602,6 +4704,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma + jl .w2 + RET + .w4: ++ _CET_ENDBR + mova xmm3, [blend_shuf] + .w4_loop: + movq xmm0, [dstq+dsq*0] +@@ -4619,6 +4722,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma + jl .w4_loop + RET + .w8: ++ _CET_ENDBR + vbroadcasti32x4 ym3, [blend_shuf] + shufpd ym3, ym3, 0x0c + .w8_loop: +@@ -4637,6 +4741,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma + jl .w8_loop + RET + .w16: ++ _CET_ENDBR + vbroadcasti32x4 m3, [blend_shuf] + shufpd m3, m3, 0xf0 + .w16_loop: +@@ -4655,6 +4760,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma + jl .w16_loop + RET + .w32: ++ _CET_ENDBR + vpbroadcastw m4, [maskq+hq*2] + vpbroadcastw m5, [maskq+hq*2+2] + mova m0, [dstq+dsq*0] +@@ -4673,6 +4779,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma + jl .w32 + RET + .w64: ++ _CET_ENDBR + 
vpbroadcastw m4, [maskq+hq*2] + mova m0, [dstq+64*0] + psubw m2, m0, [tmpq+64*0] +@@ -4690,6 +4797,7 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, ma + jl .w64 + RET + .w128: ++ _CET_ENDBR + vpbroadcastw m8, [maskq+hq*2] + mova m0, [dstq+64*0] + psubw m4, m0, [tmpq+64*0] Index: multimedia/dav1d/patches/patch-src_x86_mc_avx512_asm =================================================================== RCS file: /cvs/ports/multimedia/dav1d/patches/patch-src_x86_mc_avx512_asm,v retrieving revision 1.1 diff -u -p -r1.1 patch-src_x86_mc_avx512_asm --- multimedia/dav1d/patches/patch-src_x86_mc_avx512_asm 13 Jul 2023 12:36:37 -0000 1.1 +++ multimedia/dav1d/patches/patch-src_x86_mc_avx512_asm 18 Feb 2024 18:51:06 -0000 @@ -904,7 +904,23 @@ Index: src/x86/mc_avx512.asm pmovzxbq m5, [pb_02461357] .w32_loop: W_MASK 0, 4, 0, 1 -@@ -3930,6 +4046,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, +@@ -3874,6 +3990,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + pmovzxbq m5, [pb_02461357] + .w64_loop: + W_MASK 0, 4, 0, 1 +@@ -3892,6 +4009,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, + jg .w64_loop + RET + .w128: ++ _CET_ENDBR + pmovzxbq m13, [pb_02461357] + .w128_loop: + W_MASK 0, 4, 0, 1 +@@ -3930,6 +4048,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, lea stride3q, [strideq*3] jmp wq .w4: @@ -912,7 +928,7 @@ Index: src/x86/mc_avx512.asm cmp hd, 8 jg .w4_h16 WRAP_YMM W_MASK 0, 4, 0, 1, 1 -@@ -3959,6 +4076,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, +@@ -3959,6 +4078,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, vpscatterdd [dstq+m9]{k1}, m0 RET .w8: @@ -920,7 +936,7 @@ Index: src/x86/mc_avx512.asm cmp hd, 4 jne .w8_h8 WRAP_YMM W_MASK 0, 4, 0, 1, 1 -@@ -4001,6 +4119,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, +@@ -4001,6 +4121,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, add maskq, 64 lea dstq, [dstq+strideq*4] .w16: @@ -928,7 +944,7 @@ Index: src/x86/mc_avx512.asm W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 vpermq m0, m0, q3120 -@@ -4013,6 +4132,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, +@@ -4013,6 +4134,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, jg .w16_loop RET .w32: @@ -936,7 +952,23 @@ Index: src/x86/mc_avx512.asm pmovzxbq m9, [pb_02461357] .w32_loop: W_MASK 0, 4, 0, 1, 1 -@@ -4078,6 +4198,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask +@@ -4029,6 +4151,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, + jg .w32_loop + RET + .w64: ++ _CET_ENDBR + pmovzxbq m9, [pb_02461357] + .w64_loop: + W_MASK 0, 4, 0, 1, 1 +@@ -4044,6 +4167,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, + jg .w64_loop + RET + .w128: ++ _CET_ENDBR + pmovzxbq m11, [pb_02461357] + .w128_loop: + W_MASK 0, 4, 0, 1, 1 +@@ -4078,6 +4202,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask lea r6, [dsq*3] jmp wq .w4: @@ -944,7 +976,7 @@ Index: src/x86/mc_avx512.asm movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 vpbroadcastd xmm1, [dstq+dsq*2] -@@ -4104,6 +4225,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask +@@ -4104,6 +4229,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask jg .w4 RET .w8: @@ -952,7 +984,7 @@ Index: src/x86/mc_avx512.asm movq xmm0, [dstq+dsq*0] vpbroadcastq xmm1, [dstq+dsq*1] vpbroadcastq ymm2, [dstq+dsq*2] -@@ -4134,6 +4256,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask +@@ -4134,6 +4260,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask vzeroupper RET .w16: @@ 
-960,7 +992,7 @@ Index: src/x86/mc_avx512.asm mova xm1, [dstq+dsq*0] vinserti32x4 ym1, [dstq+dsq*1], 1 vinserti32x4 m1, [dstq+dsq*2], 2 -@@ -4160,6 +4283,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask +@@ -4160,6 +4287,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask jg .w16 RET .w32: @@ -968,7 +1000,7 @@ Index: src/x86/mc_avx512.asm mova ym1, [dstq+dsq*0] vinserti32x8 m1, [dstq+dsq*1], 1 mova m4, [maskq] -@@ -4193,6 +4317,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas +@@ -4193,6 +4321,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas add maskq, obmc_masks-blend_v_avx512icl_table jmp wq .w2: @@ -976,7 +1008,7 @@ Index: src/x86/mc_avx512.asm vpbroadcastd xmm2, [maskq+2*2] .w2_s0_loop: movd xmm0, [dstq+dsq*0] -@@ -4210,6 +4335,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas +@@ -4210,6 +4339,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas jg .w2_s0_loop RET .w4: @@ -984,7 +1016,7 @@ Index: src/x86/mc_avx512.asm vpbroadcastq xmm2, [maskq+4*2] .w4_loop: movd xmm0, [dstq+dsq*0] -@@ -4227,6 +4353,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas +@@ -4227,6 +4357,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas jg .w4_loop RET .w8: @@ -992,7 +1024,7 @@ Index: src/x86/mc_avx512.asm mova xmm3, [maskq+8*2] .w8_loop: movq xmm0, [dstq+dsq*0] -@@ -4247,6 +4374,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas +@@ -4247,6 +4378,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas jg .w8_loop RET .w16: @@ -1000,7 +1032,7 @@ Index: src/x86/mc_avx512.asm vbroadcasti32x4 ym3, [maskq+16*2] vbroadcasti32x4 ym4, [maskq+16*3] .w16_loop: -@@ -4268,6 +4396,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas +@@ -4268,6 +4400,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas jg .w16_loop RET .w32: @@ -1008,7 +1040,7 @@ Index: src/x86/mc_avx512.asm mova m4, [maskq+32*2] vshufi32x4 m3, m4, m4, q2020 vshufi32x4 m4, m4, q3131 -@@ -4305,6 +4434,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas +@@ -4305,6 +4438,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas neg hq jmp wq .w2: @@ -1016,7 +1048,7 @@ Index: src/x86/mc_avx512.asm movd xmm0, [dstq+dsq*0] pinsrw xmm0, [dstq+dsq*1], 1 movd xmm2, [maskq+hq*2] -@@ -4322,6 +4452,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas +@@ -4322,6 +4456,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas jl .w2 RET .w4: @@ -1024,7 +1056,7 @@ Index: src/x86/mc_avx512.asm mova xmm3, [blend_shuf] .w4_loop: movd xmm0, [dstq+dsq*0] -@@ -4341,6 +4472,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas +@@ -4341,6 +4476,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas jl .w4_loop RET .w8: @@ -1032,7 +1064,7 @@ Index: src/x86/mc_avx512.asm vbroadcasti128 ymm4, [blend_shuf] shufpd ymm4, ymm4, 0x03 .w8_loop: -@@ -4365,6 +4497,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas +@@ -4365,6 +4501,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas vzeroupper RET .w16: @@ -1040,7 +1072,7 @@ Index: src/x86/mc_avx512.asm vbroadcasti32x4 ym4, [blend_shuf] shufpd ym4, ym4, 0x0c .w16_loop: -@@ -4388,6 +4521,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas +@@ -4388,6 +4525,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas jl .w16_loop RET .w32: @@ -1048,7 +1080,7 @@ Index: src/x86/mc_avx512.asm vbroadcasti32x4 m4, [blend_shuf] shufpd m4, m4, 0xf0 .w32_loop: -@@ -4411,6 +4545,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas +@@ -4411,6 +4549,7 @@ cglobal blend_h_8bpc, 3, 7, 
6, dst, ds, tmp, w, h, mas jl .w32_loop RET .w64: @@ -1056,7 +1088,7 @@ Index: src/x86/mc_avx512.asm vpbroadcastw m3, [maskq+hq*2] mova m1, [dstq] mova m2, [tmpq] -@@ -4428,6 +4563,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas +@@ -4428,6 +4567,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas jl .w64 RET .w128: