This makes the regress test pass again on arm64 after the last two
updates.
Please make sure the regress tests pass when updating this port. I can
help with the arm64 asm if needed.
Cheers,
Mark
P.S. Interestingly enough they missed a few BTI landing pads this
time.
Index: multimedia/dav1d/Makefile
===================================================================
RCS file: /cvs/ports/multimedia/dav1d/Makefile,v
retrieving revision 1.34
diff -u -p -r1.34 Makefile
--- multimedia/dav1d/Makefile 11 Jun 2023 07:58:45 -0000 1.34
+++ multimedia/dav1d/Makefile 12 Jul 2023 20:52:07 -0000
@@ -2,6 +2,7 @@ COMMENT= small and fast AV1 decoder
VER= 1.2.1
DISTNAME= dav1d-${VER}
+REVISION= 0
CATEGORIES= multimedia
MASTER_SITES= https://downloads.videolan.org/pub/videolan/dav1d/${VER}/
EXTRACT_SUFX= .tar.xz
Index: multimedia/dav1d/patches/patch-src_arm_64_ipred16_S
===================================================================
RCS file: /cvs/ports/multimedia/dav1d/patches/patch-src_arm_64_ipred16_S,v
retrieving revision 1.3
diff -u -p -r1.3 patch-src_arm_64_ipred16_S
--- multimedia/dav1d/patches/patch-src_arm_64_ipred16_S 11 Jun 2023 07:58:45 -0000 1.3
+++ multimedia/dav1d/patches/patch-src_arm_64_ipred16_S 12 Jul 2023 20:52:07 -0000
@@ -387,7 +387,180 @@ Index: src/arm/64/ipred16.S
endfunc
const padding_mask_buf
-@@ -3880,13 +3898,13 @@ function ipred_filter_\bpc\()bpc_neon
+@@ -1728,11 +1746,11 @@ endfunc
+ // const int dx, const int max_base_x);
+ function ipred_z1_fill1_16bpc_neon, export=1
+ clz w9, w3
+- adr x8, L(ipred_z1_fill1_tbl)
++ adrp x8, L(ipred_z1_fill1_tbl)
++ add x8, x8, :lo12: L(ipred_z1_fill1_tbl)
+ sub w9, w9, #25
+- ldrh w9, [x8, w9, uxtw #1]
++ ldr x8, [x8, w9, uxtw #3]
+ add x10, x2, w6, uxtw #1 // top[max_base_x]
+- sub x8, x8, w9, uxtw
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+@@ -1917,12 +1935,14 @@ function ipred_z1_fill1_16bpc_neon, export=1
+ mov w3, w12
+ b 169b
+
++ .pushsection .data.rel.ro, "aw"
+ L(ipred_z1_fill1_tbl):
+- .hword L(ipred_z1_fill1_tbl) - 640b
+- .hword L(ipred_z1_fill1_tbl) - 320b
+- .hword L(ipred_z1_fill1_tbl) - 160b
+- .hword L(ipred_z1_fill1_tbl) - 80b
+- .hword L(ipred_z1_fill1_tbl) - 40b
++ .xword 640b
++ .xword 320b
++ .xword 160b
++ .xword 80b
++ .xword 40b
++ .popsection
+ endfunc
+
+ function ipred_z1_fill2_16bpc_neon, export=1
+@@ -2050,11 +2070,11 @@ endconst
+ // const int dx, const int dy);
+ function ipred_z2_fill1_16bpc_neon, export=1
+ clz w10, w4
+- adr x9, L(ipred_z2_fill1_tbl)
++ adrp x9, L(ipred_z2_fill1_tbl)
++ add x9, x9, :lo12: L(ipred_z2_fill1_tbl)
+ sub w10, w10, #25
+- ldrh w10, [x9, w10, uxtw #1]
++ ldr x9, [x9, w10, uxtw #3]
+ mov w8, #(1 << 6) // xpos = 1 << 6
+- sub x9, x9, w10, uxtw
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+@@ -2815,12 +2835,14 @@ function ipred_z2_fill1_16bpc_neon, export=1
+ ldp d8, d9, [sp], 0x40
+ ret
+
++ .pushsection .data.rel.ro, "aw"
+ L(ipred_z2_fill1_tbl):
+- .hword L(ipred_z2_fill1_tbl) - 640b
+- .hword L(ipred_z2_fill1_tbl) - 320b
+- .hword L(ipred_z2_fill1_tbl) - 160b
+- .hword L(ipred_z2_fill1_tbl) - 80b
+- .hword L(ipred_z2_fill1_tbl) - 40b
++ .xword 640b
++ .xword 320b
++ .xword 160b
++ .xword 80b
++ .xword 40b
++ .popsection
+ endfunc
+
+ function ipred_z2_fill2_16bpc_neon, export=1
+@@ -3432,11 +3454,11 @@ endfunc
+ // const int dy, const int max_base_y);
+ function ipred_z3_fill1_16bpc_neon, export=1
+ clz w9, w4
+- adr x8, L(ipred_z3_fill1_tbl)
++ adrp x8, L(ipred_z3_fill1_tbl)
++ add x8, x8, :lo12: L(ipred_z3_fill1_tbl)
+ sub w9, w9, #25
+- ldrh w9, [x8, w9, uxtw #1]
++ ldr x8, [x8, w9, uxtw #3]
+ add x10, x2, w6, uxtw #1 // left[max_base_y]
+- sub x8, x8, w9, uxtw
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+@@ -3638,17 +3660,20 @@ function ipred_z3_fill1_16bpc_neon, export=1
+ 9:
+ ret
+
++ .pushsection .data.rel.ro, "aw"
+ L(ipred_z3_fill1_tbl):
+- .hword L(ipred_z3_fill1_tbl) - 640b
+- .hword L(ipred_z3_fill1_tbl) - 320b
+- .hword L(ipred_z3_fill1_tbl) - 160b
+- .hword L(ipred_z3_fill1_tbl) - 80b
+- .hword L(ipred_z3_fill1_tbl) - 40b
++ .xword 640b
++ .xword 320b
++ .xword 160b
++ .xword 80b
++ .xword 40b
++ .popsection
+ endfunc
+
+ function ipred_z3_fill_padding_neon, export=0
+ cmp w3, #8
+- adr x8, L(ipred_z3_fill_padding_tbl)
++ adrp x8, L(ipred_z3_fill_padding_tbl)
++ add x8, x8, :lo12: L(ipred_z3_fill_padding_tbl)
+ b.gt L(ipred_z3_fill_padding_wide)
+ // w3 = remaining width, w4 = constant height
+ mov w12, w4
+@@ -3659,10 +3684,11 @@ function ipred_z3_fill_padding_neon, export=0
+ // power of two in the remaining width, and repeating.
+ clz w9, w3
+ sub w9, w9, #25
+- ldrh w9, [x8, w9, uxtw #1]
+- sub x9, x8, w9, uxtw
++ ldr x9, [x8, w9, uxtw #3]
+ br x9
+
++20:
++ AARCH64_VALID_JUMP_TARGET
+ 2:
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #4
+@@ -3681,6 +3707,8 @@ function ipred_z3_fill_padding_neon, export=0
+ mov w4, w12
+ b 1b
+
++40:
++ AARCH64_VALID_JUMP_TARGET
+ 4:
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #4
+@@ -3699,10 +3727,11 @@ function ipred_z3_fill_padding_neon, export=0
+ mov w4, w12
+ b 1b
+
+-8:
+-16:
+-32:
+-64:
++80:
++160:
++320:
++640:
++ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.8h}, [x13], x1
+@@ -3723,13 +3752,15 @@ function ipred_z3_fill_padding_neon, export=0
+ 9:
+ ret
+
++ .pushsection .data.rel.ro, "aw"
+ L(ipred_z3_fill_padding_tbl):
+- .hword L(ipred_z3_fill_padding_tbl) - 64b
+- .hword L(ipred_z3_fill_padding_tbl) - 32b
+- .hword L(ipred_z3_fill_padding_tbl) - 16b
+- .hword L(ipred_z3_fill_padding_tbl) - 8b
+- .hword L(ipred_z3_fill_padding_tbl) - 4b
+- .hword L(ipred_z3_fill_padding_tbl) - 2b
++ .xword 640b
++ .xword 320b
++ .xword 160b
++ .xword 80b
++ .xword 40b
++ .xword 20b
++ .popsection
+
+ L(ipred_z3_fill_padding_wide):
+ // Fill a WxH rectangle with padding, with W > 8.
+@@ -3880,13 +3911,13 @@ function ipred_filter_\bpc\()bpc_neon
add x6, x6, w5, uxtw
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
clz w9, w3
@@ -404,7 +577,7 @@ Index: src/arm/64/ipred16.S
sxtl v18.8h, v18.8b
sxtl v19.8h, v19.8b
add x6, x0, x1
-@@ -4160,11 +4178,13 @@ function ipred_filter_\bpc\()bpc_neon
+@@ -4160,11 +4191,13 @@ function ipred_filter_\bpc\()bpc_neon
9:
ret
@@ -422,7 +595,7 @@ Index: src/arm/64/ipred16.S
endfunc
.endm
-@@ -4184,11 +4204,11 @@ endfunc
+@@ -4184,11 +4217,11 @@ endfunc
function pal_pred_16bpc_neon, export=1
ld1 {v30.8h}, [x2]
clz w9, w4
@@ -437,7 +610,7 @@ Index: src/arm/64/ipred16.S
br x6
40:
AARCH64_VALID_JUMP_TARGET
-@@ -4357,12 +4377,14 @@ function pal_pred_16bpc_neon, export=1
+@@ -4357,12 +4390,14 @@ function pal_pred_16bpc_neon, export=1
b.gt 64b
ret
@@ -457,7 +630,7 @@ Index: src/arm/64/ipred16.S
endfunc
// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -4373,12 +4395,12 @@ endfunc
+@@ -4373,12 +4408,12 @@ endfunc
function ipred_cfl_128_16bpc_neon, export=1
dup v31.8h, w7 // bitdepth_max
clz w9, w3
@@ -473,7 +646,7 @@ Index: src/arm/64/ipred16.S
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
-@@ -4510,12 +4532,14 @@ L(ipred_cfl_splat_w16):
+@@ -4510,12 +4545,14 @@ L(ipred_cfl_splat_w16):
b.gt 1b
ret
@@ -492,7 +665,7 @@ Index: src/arm/64/ipred16.S
endfunc
// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -4526,12 +4550,12 @@ endfunc
+@@ -4526,12 +4563,12 @@ endfunc
function ipred_cfl_top_16bpc_neon, export=1
dup v31.8h, w7 // bitdepth_max
clz w9, w3
@@ -508,7 +681,7 @@ Index: src/arm/64/ipred16.S
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
-@@ -4569,11 +4593,13 @@ function ipred_cfl_top_16bpc_neon, export=1
+@@ -4569,11 +4606,13 @@ function ipred_cfl_top_16bpc_neon, export=1
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
@@ -526,7 +699,7 @@ Index: src/arm/64/ipred16.S
endfunc
// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -4586,15 +4612,15 @@ function ipred_cfl_left_16bpc_neon, export=1
+@@ -4586,15 +4625,15 @@ function ipred_cfl_left_16bpc_neon, export=1
sub x2, x2, w4, uxtw #1
clz w9, w3
clz w8, w4
@@ -548,7 +721,7 @@ Index: src/arm/64/ipred16.S
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
-@@ -4636,11 +4662,13 @@ L(ipred_cfl_left_h32):
+@@ -4636,11 +4675,13 @@ L(ipred_cfl_left_h32):
dup v0.8h, v0.h[0]
br x9
@@ -566,7 +739,7 @@ Index: src/arm/64/ipred16.S
endfunc
// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -4656,16 +4684,15 @@ function ipred_cfl_16bpc_neon, export=1
+@@ -4656,16 +4697,15 @@ function ipred_cfl_16bpc_neon, export=1
clz w9, w3
clz w6, w4
dup v16.4s, w8 // width + height
@@ -587,7 +760,7 @@ Index: src/arm/64/ipred16.S
ushr v16.4s, v16.4s, #1 // (width + height) >> 1
dup v17.4s, w8 // -ctz(width + height)
add x6, x0, x1
-@@ -4789,15 +4816,17 @@ L(ipred_cfl_w32):
+@@ -4789,15 +4829,17 @@ L(ipred_cfl_w32):
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
@@ -613,7 +786,7 @@ Index: src/arm/64/ipred16.S
endfunc
// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -4806,14 +4835,14 @@ endfunc
+@@ -4806,14 +4848,14 @@ endfunc
function ipred_cfl_ac_420_16bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
@@ -631,7 +804,7 @@ Index: src/arm/64/ipred16.S
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
-@@ -4945,9 +4974,9 @@ L(ipred_cfl_ac_420_w8_hpad):
+@@ -4945,9 +4987,9 @@ L(ipred_cfl_ac_420_w8_hpad):
L(ipred_cfl_ac_420_w16):
AARCH64_VALID_JUMP_TARGET
@@ -644,7 +817,7 @@ Index: src/arm/64/ipred16.S
br x7
L(ipred_cfl_ac_420_w16_wpad0):
-@@ -5124,17 +5153,19 @@ L(ipred_cfl_ac_420_w16_hpad):
+@@ -5124,17 +5166,19 @@ L(ipred_cfl_ac_420_w16_hpad):
lsl w6, w6, #2
b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
@@ -672,7 +845,7 @@ Index: src/arm/64/ipred16.S
endfunc
// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -5143,14 +5174,14 @@ endfunc
+@@ -5143,14 +5187,14 @@ endfunc
function ipred_cfl_ac_422_16bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
@@ -690,7 +863,7 @@ Index: src/arm/64/ipred16.S
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
-@@ -5251,9 +5282,9 @@ L(ipred_cfl_ac_422_w8_wpad):
+@@ -5251,9 +5295,9 @@ L(ipred_cfl_ac_422_w8_wpad):
L(ipred_cfl_ac_422_w16):
AARCH64_VALID_JUMP_TARGET
@@ -703,7 +876,7 @@ Index: src/arm/64/ipred16.S
br x7
L(ipred_cfl_ac_422_w16_wpad0):
-@@ -5372,17 +5403,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
+@@ -5372,17 +5416,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
@@ -731,7 +904,7 @@ Index: src/arm/64/ipred16.S
endfunc
// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -5391,14 +5424,14 @@ endfunc
+@@ -5391,14 +5437,14 @@ endfunc
function ipred_cfl_ac_444_16bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
@@ -749,7 +922,7 @@ Index: src/arm/64/ipred16.S
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
-@@ -5507,10 +5540,11 @@ L(ipred_cfl_ac_444_w16_wpad):
+@@ -5507,10 +5553,11 @@ L(ipred_cfl_ac_444_w16_wpad):
L(ipred_cfl_ac_444_w32):
AARCH64_VALID_JUMP_TARGET
@@ -764,7 +937,7 @@ Index: src/arm/64/ipred16.S
br x7
L(ipred_cfl_ac_444_w32_wpad0):
-@@ -5625,15 +5659,17 @@ L(ipred_cfl_ac_444_w32_hpad):
+@@ -5625,15 +5672,17 @@ L(ipred_cfl_ac_444_w32_hpad):
lsl w6, w6, #3
b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
Index: multimedia/dav1d/patches/patch-src_arm_64_ipred_S
===================================================================
RCS file: /cvs/ports/multimedia/dav1d/patches/patch-src_arm_64_ipred_S,v
retrieving revision 1.3
diff -u -p -r1.3 patch-src_arm_64_ipred_S
--- multimedia/dav1d/patches/patch-src_arm_64_ipred_S 11 Jun 2023 07:58:45 -0000 1.3
+++ multimedia/dav1d/patches/patch-src_arm_64_ipred_S 12 Jul 2023 20:52:07 -0000
@@ -422,7 +422,42 @@ Index: src/arm/64/ipred.S
endfunc
function ipred_z1_fill2_8bpc_neon, export=1
-@@ -3160,11 +3180,11 @@ endfunc
+@@ -1940,11 +1960,11 @@ endconst
+ // const int dx, const int dy);
+ function ipred_z2_fill1_8bpc_neon, export=1
+ clz w10, w4
+- adr x9, L(ipred_z2_fill1_tbl)
++ adrp x9, L(ipred_z2_fill1_tbl)
++ add x9, x9, :lo12: L(ipred_z2_fill1_tbl)
+ sub w10, w10, #25
+- ldrh w10, [x9, w10, uxtw #1]
++ ldr x9, [x9, w10, uxtw #3]
+ mov w8, #(1 << 6) // xpos = 1 << 6
+- sub x9, x9, w10, uxtw
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+@@ -2651,12 +2671,14 @@ function ipred_z2_fill1_8bpc_neon, export=1
+ ldp d8, d9, [sp], 0x40
+ ret
+
++ .pushsection .data.rel.ro, "aw"
+ L(ipred_z2_fill1_tbl):
+- .hword L(ipred_z2_fill1_tbl) - 640b
+- .hword L(ipred_z2_fill1_tbl) - 320b
+- .hword L(ipred_z2_fill1_tbl) - 160b
+- .hword L(ipred_z2_fill1_tbl) - 80b
+- .hword L(ipred_z2_fill1_tbl) - 40b
++ .xword 640b
++ .xword 320b
++ .xword 160b
++ .xword 80b
++ .xword 40b
++ .popsection
+ endfunc
+
+ function ipred_z2_fill2_8bpc_neon, export=1
+@@ -3160,11 +3182,11 @@ endfunc
function ipred_z3_fill1_8bpc_neon, export=1
cmp w6, #64
clz w9, w3
@@ -437,7 +472,7 @@ Index: src/arm/64/ipred.S
movrel x11, increments
ld1r {v31.16b}, [x10] // padding
ld1 {v30.8h}, [x11] // increments
-@@ -3503,17 +3523,20 @@ L(ipred_z3_fill1_large_h16):
+@@ -3503,17 +3525,20 @@ L(ipred_z3_fill1_large_h16):
9:
ret
@@ -464,7 +499,7 @@ Index: src/arm/64/ipred.S
b.gt L(ipred_z3_fill_padding_wide)
// w3 = remaining width, w4 = constant height
mov w12, w4
-@@ -3524,8 +3547,7 @@ function ipred_z3_fill_padding_neon, export=0
+@@ -3524,10 +3549,11 @@ function ipred_z3_fill_padding_neon, export=0
// power of two in the remaining width, and repeating.
clz w9, w3
sub w9, w9, #25
@@ -473,8 +508,45 @@ Index: src/arm/64/ipred.S
+ ldr x9, [x8, w9, uxtw #3]
br x9
++20:
++ AARCH64_VALID_JUMP_TARGET
2:
-@@ -3605,13 +3627,15 @@ function ipred_z3_fill_padding_neon, export=0
+ st1 {v31.h}[0], [x0], x1
+ subs w4, w4, #4
+@@ -3546,6 +3572,8 @@ function ipred_z3_fill_padding_neon, export=0
+ mov w4, w12
+ b 1b
+
++40:
++ AARCH64_VALID_JUMP_TARGET
+ 4:
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #4
+@@ -3564,7 +3592,8 @@ function ipred_z3_fill_padding_neon, export=0
+ mov w4, w12
+ b 1b
+
+-8:
++80:
++ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.8b}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.8b}, [x13], x1
+@@ -3582,9 +3611,10 @@ function ipred_z3_fill_padding_neon, export=0
+ mov w4, w12
+ b 1b
+
+-16:
+-32:
+-64:
++160:
++320:
++640:
++ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.16b}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.16b}, [x13], x1
+@@ -3605,13 +3635,15 @@ function ipred_z3_fill_padding_neon, export=0
9:
ret
@@ -486,17 +558,17 @@ Index: src/arm/64/ipred.S
- .hword L(ipred_z3_fill_padding_tbl) - 8b
- .hword L(ipred_z3_fill_padding_tbl) - 4b
- .hword L(ipred_z3_fill_padding_tbl) - 2b
-+ .xword 64b
-+ .xword 32b
-+ .xword 16b
-+ .xword 8b
-+ .xword 4b
-+ .xword 2b
++ .xword 640b
++ .xword 320b
++ .xword 160b
++ .xword 80b
++ .xword 40b
++ .xword 20b
+ .popsection
L(ipred_z3_fill_padding_wide):
// Fill a WxH rectangle with padding, with W > 16.
-@@ -3766,13 +3790,13 @@ function ipred_filter_8bpc_neon, export=1
+@@ -3766,13 +3798,13 @@ function ipred_filter_8bpc_neon, export=1
add x6, x6, w5, uxtw
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
clz w9, w3
@@ -513,7 +585,7 @@ Index: src/arm/64/ipred.S
sxtl v18.8h, v18.8b
sxtl v19.8h, v19.8b
add x6, x0, x1
-@@ -3913,11 +3937,13 @@ function ipred_filter_8bpc_neon, export=1
+@@ -3913,11 +3945,13 @@ function ipred_filter_8bpc_neon, export=1
9:
ret
@@ -531,7 +603,7 @@ Index: src/arm/64/ipred.S
endfunc
// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -3926,11 +3952,11 @@ endfunc
+@@ -3926,11 +3960,11 @@ endfunc
function pal_pred_8bpc_neon, export=1
ld1 {v0.8h}, [x2]
clz w9, w4
@@ -546,7 +618,7 @@ Index: src/arm/64/ipred.S
add x2, x0, x1
lsl x1, x1, #1
br x6
-@@ -4008,12 +4034,14 @@ function pal_pred_8bpc_neon, export=1
+@@ -4008,12 +4042,14 @@ function pal_pred_8bpc_neon, export=1
b.gt 64b
ret
@@ -566,7 +638,7 @@ Index: src/arm/64/ipred.S
endfunc
// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -4022,12 +4050,12 @@ endfunc
+@@ -4022,12 +4058,12 @@ endfunc
// const int16_t *ac, const int alpha);
function ipred_cfl_128_8bpc_neon, export=1
clz w9, w3
@@ -582,7 +654,7 @@ Index: src/arm/64/ipred.S
add x6, x0, x1
lsl x1, x1, #1
br x7
-@@ -4132,12 +4160,14 @@ L(ipred_cfl_splat_w16):
+@@ -4132,12 +4168,14 @@ L(ipred_cfl_splat_w16):
b.gt 1b
ret
@@ -601,7 +673,7 @@ Index: src/arm/64/ipred.S
endfunc
// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -4146,12 +4176,12 @@ endfunc
+@@ -4146,12 +4184,12 @@ endfunc
// const int16_t *ac, const int alpha);
function ipred_cfl_top_8bpc_neon, export=1
clz w9, w3
@@ -617,7 +689,7 @@ Index: src/arm/64/ipred.S
add x6, x0, x1
lsl x1, x1, #1
br x7
-@@ -4186,11 +4216,13 @@ function ipred_cfl_top_8bpc_neon, export=1
+@@ -4186,11 +4224,13 @@ function ipred_cfl_top_8bpc_neon, export=1
dup v0.8h, v2.h[0]
b L(ipred_cfl_splat_w16)
@@ -635,7 +707,7 @@ Index: src/arm/64/ipred.S
endfunc
// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -4201,15 +4233,15 @@ function ipred_cfl_left_8bpc_neon, export=1
+@@ -4201,15 +4241,15 @@ function ipred_cfl_left_8bpc_neon, export=1
sub x2, x2, w4, uxtw
clz w9, w3
clz w8, w4
@@ -657,7 +729,7 @@ Index: src/arm/64/ipred.S
add x6, x0, x1
lsl x1, x1, #1
br x7
-@@ -4248,11 +4280,13 @@ L(ipred_cfl_left_h32):
+@@ -4248,11 +4288,13 @@ L(ipred_cfl_left_h32):
dup v0.8h, v2.h[0]
br x9
@@ -675,7 +747,7 @@ Index: src/arm/64/ipred.S
endfunc
// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-@@ -4266,16 +4300,15 @@ function ipred_cfl_8bpc_neon, export=1
+@@ -4266,16 +4308,15 @@ function ipred_cfl_8bpc_neon, export=1
clz w9, w3
clz w6, w4
dup v16.8h, w8 // width + height
@@ -696,7 +768,7 @@ Index: src/arm/64/ipred.S
ushr v16.8h, v16.8h, #1 // (width + height) >> 1
dup v17.8h, w8 // -ctz(width + height)
add x6, x0, x1
-@@ -4392,15 +4425,17 @@ L(ipred_cfl_w32):
+@@ -4392,15 +4433,17 @@ L(ipred_cfl_w32):
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
@@ -722,7 +794,7 @@ Index: src/arm/64/ipred.S
endfunc
// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -4409,14 +4444,14 @@ endfunc
+@@ -4409,14 +4452,14 @@ endfunc
function ipred_cfl_ac_420_8bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
@@ -740,7 +812,7 @@ Index: src/arm/64/ipred.S
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
-@@ -4555,9 +4590,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc):
+@@ -4555,9 +4598,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc):
L(ipred_cfl_ac_420_w16):
AARCH64_VALID_JUMP_TARGET
@@ -753,7 +825,7 @@ Index: src/arm/64/ipred.S
br x7
L(ipred_cfl_ac_420_w16_wpad0):
-@@ -4714,17 +4749,19 @@ L(ipred_cfl_ac_420_w16_hpad):
+@@ -4714,17 +4757,19 @@ L(ipred_cfl_ac_420_w16_hpad):
lsl w6, w6, #1
b L(ipred_cfl_ac_420_w8_calc_subtract_dc)
@@ -781,7 +853,7 @@ Index: src/arm/64/ipred.S
endfunc
// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -4733,14 +4770,14 @@ endfunc
+@@ -4733,14 +4778,14 @@ endfunc
function ipred_cfl_ac_422_8bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
@@ -799,7 +871,7 @@ Index: src/arm/64/ipred.S
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
-@@ -4831,9 +4868,9 @@ L(ipred_cfl_ac_422_w8_wpad):
+@@ -4831,9 +4876,9 @@ L(ipred_cfl_ac_422_w8_wpad):
L(ipred_cfl_ac_422_w16):
AARCH64_VALID_JUMP_TARGET
@@ -812,7 +884,7 @@ Index: src/arm/64/ipred.S
br x7
L(ipred_cfl_ac_422_w16_wpad0):
-@@ -4936,17 +4973,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
+@@ -4936,17 +4981,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
@@ -840,7 +912,7 @@ Index: src/arm/64/ipred.S
endfunc
// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
-@@ -4955,14 +4994,14 @@ endfunc
+@@ -4955,14 +5002,14 @@ endfunc
function ipred_cfl_ac_444_8bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
@@ -858,7 +930,7 @@ Index: src/arm/64/ipred.S
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
-@@ -5083,9 +5122,10 @@ L(ipred_cfl_ac_444_w16_wpad):
+@@ -5083,9 +5130,10 @@ L(ipred_cfl_ac_444_w16_wpad):
L(ipred_cfl_ac_444_w32):
AARCH64_VALID_JUMP_TARGET
@@ -872,7 +944,7 @@ Index: src/arm/64/ipred.S
br x7
L(ipred_cfl_ac_444_w32_wpad0):
-@@ -5231,15 +5271,17 @@ L(ipred_cfl_ac_444_w32_hpad):
+@@ -5231,15 +5279,17 @@ L(ipred_cfl_ac_444_w32_hpad):
dup v4.8h, v4.h[0]
b L(ipred_cfl_ac_420_w8_subtract_dc)