ok

On 12/07/23 22:56 +0200, Mark Kettenis wrote:
> This makes the regress test pass again on arm64 after the last two
> updates.
> 
> Please make sure the regress test passes when updating this port.  I can
> help with the arm64 asm if needed.
> 
> Cheers,
> 
> Mark
> 
> P.S. Interestingly enough they missed a few BTI landing pads this
>      time.
> 
> 
> Index: multimedia/dav1d/Makefile
> ===================================================================
> RCS file: /cvs/ports/multimedia/dav1d/Makefile,v
> retrieving revision 1.34
> diff -u -p -r1.34 Makefile
> --- multimedia/dav1d/Makefile 11 Jun 2023 07:58:45 -0000      1.34
> +++ multimedia/dav1d/Makefile 12 Jul 2023 20:52:07 -0000
> @@ -2,6 +2,7 @@ COMMENT=      small and fast AV1 decoder
>  
>  VER=         1.2.1
>  DISTNAME=    dav1d-${VER}
> +REVISION=    0
>  CATEGORIES=  multimedia
>  MASTER_SITES=        https://downloads.videolan.org/pub/videolan/dav1d/${VER}/
>  EXTRACT_SUFX=        .tar.xz
> Index: multimedia/dav1d/patches/patch-src_arm_64_ipred16_S
> ===================================================================
> RCS file: /cvs/ports/multimedia/dav1d/patches/patch-src_arm_64_ipred16_S,v
> retrieving revision 1.3
> diff -u -p -r1.3 patch-src_arm_64_ipred16_S
> --- multimedia/dav1d/patches/patch-src_arm_64_ipred16_S       11 Jun 2023 07:58:45 -0000      1.3
> +++ multimedia/dav1d/patches/patch-src_arm_64_ipred16_S       12 Jul 2023 20:52:07 -0000
> @@ -387,7 +387,180 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   
>   const padding_mask_buf
> -@@ -3880,13 +3898,13 @@ function ipred_filter_\bpc\()bpc_neon
> +@@ -1728,11 +1746,11 @@ endfunc
> + //                                const int dx, const int max_base_x);
> + function ipred_z1_fill1_16bpc_neon, export=1
> +         clz             w9,  w3
> +-        adr             x8,  L(ipred_z1_fill1_tbl)
> ++        adrp            x8,  L(ipred_z1_fill1_tbl)
> ++        add             x8,  x8, :lo12: L(ipred_z1_fill1_tbl)
> +         sub             w9,  w9,  #25
> +-        ldrh            w9,  [x8, w9, uxtw #1]
> ++        ldr             x8,  [x8, w9, uxtw #3]
> +         add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
> +-        sub             x8,  x8,  w9,  uxtw
> +         ld1r            {v31.8h}, [x10]           // padding
> +         mov             w7,  w5
> +         mov             w15, #64
> +@@ -1917,12 +1935,14 @@ function ipred_z1_fill1_16bpc_neon, export=1
> +         mov             w3,  w12
> +         b               169b
> + 
> ++    .pushsection .data.rel.ro, "aw"
> + L(ipred_z1_fill1_tbl):
> +-        .hword L(ipred_z1_fill1_tbl) - 640b
> +-        .hword L(ipred_z1_fill1_tbl) - 320b
> +-        .hword L(ipred_z1_fill1_tbl) - 160b
> +-        .hword L(ipred_z1_fill1_tbl) -  80b
> +-        .hword L(ipred_z1_fill1_tbl) -  40b
> ++        .xword 640b
> ++        .xword 320b
> ++        .xword 160b
> ++        .xword  80b
> ++        .xword  40b
> ++    .popsection
> + endfunc
> + 
> + function ipred_z1_fill2_16bpc_neon, export=1
> +@@ -2050,11 +2070,11 @@ endconst
> + //                                const int dx, const int dy);
> + function ipred_z2_fill1_16bpc_neon, export=1
> +         clz             w10, w4
> +-        adr             x9,  L(ipred_z2_fill1_tbl)
> ++        adrp            x9,  L(ipred_z2_fill1_tbl)
> ++        add             x9,  x9, :lo12: L(ipred_z2_fill1_tbl)
> +         sub             w10, w10, #25
> +-        ldrh            w10, [x9, w10, uxtw #1]
> ++        ldr             x9, [x9, w10, uxtw #3]
> +         mov             w8,  #(1 << 6)            // xpos = 1 << 6
> +-        sub             x9,  x9,  w10, uxtw
> +         sub             w8,  w8,  w6              // xpos -= dx
> + 
> +         movrel          x11, increments
> +@@ -2815,12 +2835,14 @@ function ipred_z2_fill1_16bpc_neon, export=1
> +         ldp             d8,  d9,  [sp], 0x40
> +         ret
> + 
> ++    .pushsection .data.rel.ro, "aw"
> + L(ipred_z2_fill1_tbl):
> +-        .hword L(ipred_z2_fill1_tbl) - 640b
> +-        .hword L(ipred_z2_fill1_tbl) - 320b
> +-        .hword L(ipred_z2_fill1_tbl) - 160b
> +-        .hword L(ipred_z2_fill1_tbl) -  80b
> +-        .hword L(ipred_z2_fill1_tbl) -  40b
> ++        .xword 640b
> ++        .xword 320b
> ++        .xword 160b
> ++        .xword  80b
> ++        .xword  40b
> ++    .popsection
> + endfunc
> + 
> + function ipred_z2_fill2_16bpc_neon, export=1
> +@@ -3432,11 +3454,11 @@ endfunc
> + //                                const int dy, const int max_base_y);
> + function ipred_z3_fill1_16bpc_neon, export=1
> +         clz             w9,  w4
> +-        adr             x8,  L(ipred_z3_fill1_tbl)
> ++        adrp            x8,  L(ipred_z3_fill1_tbl)
> ++        add             x8,  x8, :lo12: L(ipred_z3_fill1_tbl)
> +         sub             w9,  w9,  #25
> +-        ldrh            w9,  [x8, w9, uxtw #1]
> ++        ldr             x8,  [x8, w9, uxtw #3]
> +         add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
> +-        sub             x8,  x8,  w9,  uxtw
> +         ld1r            {v31.8h}, [x10]           // padding
> +         mov             w7,  w5
> +         mov             w15, #64
> +@@ -3638,17 +3660,20 @@ function ipred_z3_fill1_16bpc_neon, export=1
> + 9:
> +         ret
> + 
> ++    .pushsection .data.rel.ro, "aw"
> + L(ipred_z3_fill1_tbl):
> +-        .hword L(ipred_z3_fill1_tbl) - 640b
> +-        .hword L(ipred_z3_fill1_tbl) - 320b
> +-        .hword L(ipred_z3_fill1_tbl) - 160b
> +-        .hword L(ipred_z3_fill1_tbl) -  80b
> +-        .hword L(ipred_z3_fill1_tbl) -  40b
> ++        .xword 640b
> ++        .xword 320b
> ++        .xword 160b
> ++        .xword  80b
> ++        .xword  40b
> ++    .popsection
> + endfunc
> + 
> + function ipred_z3_fill_padding_neon, export=0
> +         cmp             w3,  #8
> +-        adr             x8,  L(ipred_z3_fill_padding_tbl)
> ++        adrp            x8,  L(ipred_z3_fill_padding_tbl)
> ++        add             x8,  x8, :lo12: L(ipred_z3_fill_padding_tbl)
> +         b.gt            L(ipred_z3_fill_padding_wide)
> +         // w3 = remaining width, w4 = constant height
> +         mov             w12, w4
> +@@ -3659,10 +3684,11 @@ function ipred_z3_fill_padding_neon, export=0
> +         // power of two in the remaining width, and repeating.
> +         clz             w9,  w3
> +         sub             w9,  w9,  #25
> +-        ldrh            w9,  [x8, w9, uxtw #1]
> +-        sub             x9,  x8,  w9,  uxtw
> ++        ldr             x9,  [x8, w9, uxtw #3]
> +         br              x9
> + 
> ++20:
> ++        AARCH64_VALID_JUMP_TARGET
> + 2:
> +         st1             {v31.s}[0], [x0],  x1
> +         subs            w4,  w4,  #4
> +@@ -3681,6 +3707,8 @@ function ipred_z3_fill_padding_neon, export=0
> +         mov             w4,  w12
> +         b               1b
> + 
> ++40:
> ++        AARCH64_VALID_JUMP_TARGET
> + 4:
> +         st1             {v31.4h}, [x0],  x1
> +         subs            w4,  w4,  #4
> +@@ -3699,10 +3727,11 @@ function ipred_z3_fill_padding_neon, export=0
> +         mov             w4,  w12
> +         b               1b
> + 
> +-8:
> +-16:
> +-32:
> +-64:
> ++80:
> ++160:
> ++320:
> ++640:
> ++        AARCH64_VALID_JUMP_TARGET
> +         st1             {v31.8h}, [x0],  x1
> +         subs            w4,  w4,  #4
> +         st1             {v31.8h}, [x13], x1
> +@@ -3723,13 +3752,15 @@ function ipred_z3_fill_padding_neon, export=0
> + 9:
> +         ret
> + 
> ++    .pushsection .data.rel.ro, "aw"
> + L(ipred_z3_fill_padding_tbl):
> +-        .hword L(ipred_z3_fill_padding_tbl) - 64b
> +-        .hword L(ipred_z3_fill_padding_tbl) - 32b
> +-        .hword L(ipred_z3_fill_padding_tbl) - 16b
> +-        .hword L(ipred_z3_fill_padding_tbl) -  8b
> +-        .hword L(ipred_z3_fill_padding_tbl) -  4b
> +-        .hword L(ipred_z3_fill_padding_tbl) -  2b
> ++        .xword 640b
> ++        .xword 320b
> ++        .xword 160b
> ++        .xword  80b
> ++        .xword  40b
> ++        .xword  20b
> ++    .popsection
> + 
> + L(ipred_z3_fill_padding_wide):
> +         // Fill a WxH rectangle with padding, with W > 8.
> +@@ -3880,13 +3911,13 @@ function ipred_filter_\bpc\()bpc_neon
>           add             x6,  x6,  w5, uxtw
>           ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
>           clz             w9,  w3
> @@ -404,7 +577,7 @@ Index: src/arm/64/ipred16.S
>           sxtl            v18.8h,  v18.8b
>           sxtl            v19.8h,  v19.8b
>           add             x6,  x0,  x1
> -@@ -4160,11 +4178,13 @@ function ipred_filter_\bpc\()bpc_neon
> +@@ -4160,11 +4191,13 @@ function ipred_filter_\bpc\()bpc_neon
>   9:
>           ret
>   
> @@ -422,7 +595,7 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   .endm
>   
> -@@ -4184,11 +4204,11 @@ endfunc
> +@@ -4184,11 +4217,11 @@ endfunc
>   function pal_pred_16bpc_neon, export=1
>           ld1             {v30.8h}, [x2]
>           clz             w9,  w4
> @@ -437,7 +610,7 @@ Index: src/arm/64/ipred16.S
>           br              x6
>   40:
>           AARCH64_VALID_JUMP_TARGET
> -@@ -4357,12 +4377,14 @@ function pal_pred_16bpc_neon, export=1
> +@@ -4357,12 +4390,14 @@ function pal_pred_16bpc_neon, export=1
>           b.gt            64b
>           ret
>   
> @@ -457,7 +630,7 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   
>   // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -4373,12 +4395,12 @@ endfunc
> +@@ -4373,12 +4408,12 @@ endfunc
>   function ipred_cfl_128_16bpc_neon, export=1
>           dup             v31.8h,  w7   // bitdepth_max
>           clz             w9,  w3
> @@ -473,7 +646,7 @@ Index: src/arm/64/ipred16.S
>           add             x6,  x0,  x1
>           lsl             x1,  x1,  #1
>           movi            v30.8h,  #0
> -@@ -4510,12 +4532,14 @@ L(ipred_cfl_splat_w16):
> +@@ -4510,12 +4545,14 @@ L(ipred_cfl_splat_w16):
>           b.gt            1b
>           ret
>   
> @@ -492,7 +665,7 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   
>   // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -4526,12 +4550,12 @@ endfunc
> +@@ -4526,12 +4563,12 @@ endfunc
>   function ipred_cfl_top_16bpc_neon, export=1
>           dup             v31.8h,  w7   // bitdepth_max
>           clz             w9,  w3
> @@ -508,7 +681,7 @@ Index: src/arm/64/ipred16.S
>           add             x6,  x0,  x1
>           lsl             x1,  x1,  #1
>           movi            v30.8h,  #0
> -@@ -4569,11 +4593,13 @@ function ipred_cfl_top_16bpc_neon, export=1
> +@@ -4569,11 +4606,13 @@ function ipred_cfl_top_16bpc_neon, export=1
>           dup             v0.8h,   v0.h[0]
>           b               L(ipred_cfl_splat_w16)
>   
> @@ -526,7 +699,7 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   
>   // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -4586,15 +4612,15 @@ function ipred_cfl_left_16bpc_neon, export=1
> +@@ -4586,15 +4625,15 @@ function ipred_cfl_left_16bpc_neon, export=1
>           sub             x2,  x2,  w4, uxtw #1
>           clz             w9,  w3
>           clz             w8,  w4
> @@ -548,7 +721,7 @@ Index: src/arm/64/ipred16.S
>           add             x6,  x0,  x1
>           lsl             x1,  x1,  #1
>           movi            v30.8h,  #0
> -@@ -4636,11 +4662,13 @@ L(ipred_cfl_left_h32):
> +@@ -4636,11 +4675,13 @@ L(ipred_cfl_left_h32):
>           dup             v0.8h,   v0.h[0]
>           br              x9
>   
> @@ -566,7 +739,7 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   
>   // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -4656,16 +4684,15 @@ function ipred_cfl_16bpc_neon, export=1
> +@@ -4656,16 +4697,15 @@ function ipred_cfl_16bpc_neon, export=1
>           clz             w9,  w3
>           clz             w6,  w4
>           dup             v16.4s, w8               // width + height
> @@ -587,7 +760,7 @@ Index: src/arm/64/ipred16.S
>           ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
>           dup             v17.4s,  w8              // -ctz(width + height)
>           add             x6,  x0,  x1
> -@@ -4789,15 +4816,17 @@ L(ipred_cfl_w32):
> +@@ -4789,15 +4829,17 @@ L(ipred_cfl_w32):
>           dup             v0.8h,   v0.h[0]
>           b               L(ipred_cfl_splat_w16)
>   
> @@ -613,7 +786,7 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   
>   // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
> -@@ -4806,14 +4835,14 @@ endfunc
> +@@ -4806,14 +4848,14 @@ endfunc
>   function ipred_cfl_ac_420_16bpc_neon, export=1
>           clz             w8,  w5
>           lsl             w4,  w4,  #2
> @@ -631,7 +804,7 @@ Index: src/arm/64/ipred16.S
>           sub             w8,  w6,  w4         // height - h_pad
>           rbit            w9,  w5              // rbit(width)
>           rbit            w10, w6              // rbit(height)
> -@@ -4945,9 +4974,9 @@ L(ipred_cfl_ac_420_w8_hpad):
> +@@ -4945,9 +4987,9 @@ L(ipred_cfl_ac_420_w8_hpad):
>   
>   L(ipred_cfl_ac_420_w16):
>           AARCH64_VALID_JUMP_TARGET
> @@ -644,7 +817,7 @@ Index: src/arm/64/ipred16.S
>           br              x7
>   
>   L(ipred_cfl_ac_420_w16_wpad0):
> -@@ -5124,17 +5153,19 @@ L(ipred_cfl_ac_420_w16_hpad):
> +@@ -5124,17 +5166,19 @@ L(ipred_cfl_ac_420_w16_hpad):
>           lsl             w6,  w6,  #2
>           b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
>   
> @@ -672,7 +845,7 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   
>   // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
> -@@ -5143,14 +5174,14 @@ endfunc
> +@@ -5143,14 +5187,14 @@ endfunc
>   function ipred_cfl_ac_422_16bpc_neon, export=1
>           clz             w8,  w5
>           lsl             w4,  w4,  #2
> @@ -690,7 +863,7 @@ Index: src/arm/64/ipred16.S
>           sub             w8,  w6,  w4         // height - h_pad
>           rbit            w9,  w5              // rbit(width)
>           rbit            w10, w6              // rbit(height)
> -@@ -5251,9 +5282,9 @@ L(ipred_cfl_ac_422_w8_wpad):
> +@@ -5251,9 +5295,9 @@ L(ipred_cfl_ac_422_w8_wpad):
>   
>   L(ipred_cfl_ac_422_w16):
>           AARCH64_VALID_JUMP_TARGET
> @@ -703,7 +876,7 @@ Index: src/arm/64/ipred16.S
>           br              x7
>   
>   L(ipred_cfl_ac_422_w16_wpad0):
> -@@ -5372,17 +5403,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
> +@@ -5372,17 +5416,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
>           mov             v1.16b,  v3.16b
>           b               L(ipred_cfl_ac_420_w16_hpad)
>   
> @@ -731,7 +904,7 @@ Index: src/arm/64/ipred16.S
>   endfunc
>   
>   // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
> -@@ -5391,14 +5424,14 @@ endfunc
> +@@ -5391,14 +5437,14 @@ endfunc
>   function ipred_cfl_ac_444_16bpc_neon, export=1
>           clz             w8,  w5
>           lsl             w4,  w4,  #2
> @@ -749,7 +922,7 @@ Index: src/arm/64/ipred16.S
>           sub             w8,  w6,  w4         // height - h_pad
>           rbit            w9,  w5              // rbit(width)
>           rbit            w10, w6              // rbit(height)
> -@@ -5507,10 +5540,11 @@ L(ipred_cfl_ac_444_w16_wpad):
> +@@ -5507,10 +5553,11 @@ L(ipred_cfl_ac_444_w16_wpad):
>   
>   L(ipred_cfl_ac_444_w32):
>           AARCH64_VALID_JUMP_TARGET
> @@ -764,7 +937,7 @@ Index: src/arm/64/ipred16.S
>           br              x7
>   
>   L(ipred_cfl_ac_444_w32_wpad0):
> -@@ -5625,15 +5659,17 @@ L(ipred_cfl_ac_444_w32_hpad):
> +@@ -5625,15 +5672,17 @@ L(ipred_cfl_ac_444_w32_hpad):
>           lsl             w6,  w6,  #3
>           b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
>   
> Index: multimedia/dav1d/patches/patch-src_arm_64_ipred_S
> ===================================================================
> RCS file: /cvs/ports/multimedia/dav1d/patches/patch-src_arm_64_ipred_S,v
> retrieving revision 1.3
> diff -u -p -r1.3 patch-src_arm_64_ipred_S
> --- multimedia/dav1d/patches/patch-src_arm_64_ipred_S 11 Jun 2023 07:58:45 -0000      1.3
> +++ multimedia/dav1d/patches/patch-src_arm_64_ipred_S 12 Jul 2023 20:52:07 -0000
> @@ -422,7 +422,42 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   function ipred_z1_fill2_8bpc_neon, export=1
> -@@ -3160,11 +3180,11 @@ endfunc
> +@@ -1940,11 +1960,11 @@ endconst
> + //                               const int dx, const int dy);
> + function ipred_z2_fill1_8bpc_neon, export=1
> +         clz             w10, w4
> +-        adr             x9,  L(ipred_z2_fill1_tbl)
> ++        adrp            x9,  L(ipred_z2_fill1_tbl)
> ++        add             x9,  x9, :lo12: L(ipred_z2_fill1_tbl)
> +         sub             w10, w10, #25
> +-        ldrh            w10, [x9, w10, uxtw #1]
> ++        ldr             x9, [x9, w10, uxtw #3]
> +         mov             w8,  #(1 << 6)            // xpos = 1 << 6
> +-        sub             x9,  x9,  w10, uxtw
> +         sub             w8,  w8,  w6              // xpos -= dx
> + 
> +         movrel          x11, increments
> +@@ -2651,12 +2671,14 @@ function ipred_z2_fill1_8bpc_neon, export=1
> +         ldp             d8,  d9,  [sp], 0x40
> +         ret
> + 
> ++    .pushsection .data.rel.ro, "aw"
> + L(ipred_z2_fill1_tbl):
> +-        .hword L(ipred_z2_fill1_tbl) - 640b
> +-        .hword L(ipred_z2_fill1_tbl) - 320b
> +-        .hword L(ipred_z2_fill1_tbl) - 160b
> +-        .hword L(ipred_z2_fill1_tbl) -  80b
> +-        .hword L(ipred_z2_fill1_tbl) -  40b
> ++        .xword 640b
> ++        .xword 320b
> ++        .xword 160b
> ++        .xword  80b
> ++        .xword  40b
> ++    .popsection
> + endfunc
> + 
> + function ipred_z2_fill2_8bpc_neon, export=1
> +@@ -3160,11 +3182,11 @@ endfunc
>   function ipred_z3_fill1_8bpc_neon, export=1
>           cmp             w6,  #64
>           clz             w9,  w3
> @@ -437,7 +472,7 @@ Index: src/arm/64/ipred.S
>           movrel          x11, increments
>           ld1r            {v31.16b}, [x10]          // padding
>           ld1             {v30.8h},  [x11]          // increments
> -@@ -3503,17 +3523,20 @@ L(ipred_z3_fill1_large_h16):
> +@@ -3503,17 +3525,20 @@ L(ipred_z3_fill1_large_h16):
>   9:
>           ret
>   
> @@ -464,7 +499,7 @@ Index: src/arm/64/ipred.S
>           b.gt            L(ipred_z3_fill_padding_wide)
>           // w3 = remaining width, w4 = constant height
>           mov             w12, w4
> -@@ -3524,8 +3547,7 @@ function ipred_z3_fill_padding_neon, export=0
> +@@ -3524,10 +3549,11 @@ function ipred_z3_fill_padding_neon, export=0
>           // power of two in the remaining width, and repeating.
>           clz             w9,  w3
>           sub             w9,  w9,  #25
> @@ -473,8 +508,45 @@ Index: src/arm/64/ipred.S
>  +        ldr             x9,  [x8, w9, uxtw #3]
>           br              x9
>   
> ++20:
> ++        AARCH64_VALID_JUMP_TARGET
>   2:
> -@@ -3605,13 +3627,15 @@ function ipred_z3_fill_padding_neon, export=0
> +         st1             {v31.h}[0], [x0],  x1
> +         subs            w4,  w4,  #4
> +@@ -3546,6 +3572,8 @@ function ipred_z3_fill_padding_neon, export=0
> +         mov             w4,  w12
> +         b               1b
> + 
> ++40:
> ++        AARCH64_VALID_JUMP_TARGET
> + 4:
> +         st1             {v31.s}[0], [x0],  x1
> +         subs            w4,  w4,  #4
> +@@ -3564,7 +3592,8 @@ function ipred_z3_fill_padding_neon, export=0
> +         mov             w4,  w12
> +         b               1b
> + 
> +-8:
> ++80:
> ++        AARCH64_VALID_JUMP_TARGET
> +         st1             {v31.8b}, [x0],  x1
> +         subs            w4,  w4,  #4
> +         st1             {v31.8b}, [x13], x1
> +@@ -3582,9 +3611,10 @@ function ipred_z3_fill_padding_neon, export=0
> +         mov             w4,  w12
> +         b               1b
> + 
> +-16:
> +-32:
> +-64:
> ++160:
> ++320:
> ++640:
> ++        AARCH64_VALID_JUMP_TARGET
> +         st1             {v31.16b}, [x0],  x1
> +         subs            w4,  w4,  #4
> +         st1             {v31.16b}, [x13], x1
> +@@ -3605,13 +3635,15 @@ function ipred_z3_fill_padding_neon, export=0
>   9:
>           ret
>   
> @@ -486,17 +558,17 @@ Index: src/arm/64/ipred.S
>  -        .hword L(ipred_z3_fill_padding_tbl) -  8b
>  -        .hword L(ipred_z3_fill_padding_tbl) -  4b
>  -        .hword L(ipred_z3_fill_padding_tbl) -  2b
> -+        .xword 64b
> -+        .xword 32b
> -+        .xword 16b
> -+        .xword  8b
> -+        .xword  4b
> -+        .xword  2b
> ++        .xword 640b
> ++        .xword 320b
> ++        .xword 160b
> ++        .xword  80b
> ++        .xword  40b
> ++        .xword  20b
>  +    .popsection
>   
>   L(ipred_z3_fill_padding_wide):
>           // Fill a WxH rectangle with padding, with W > 16.
> -@@ -3766,13 +3790,13 @@ function ipred_filter_8bpc_neon, export=1
> +@@ -3766,13 +3798,13 @@ function ipred_filter_8bpc_neon, export=1
>           add             x6,  x6,  w5, uxtw
>           ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
>           clz             w9,  w3
> @@ -513,7 +585,7 @@ Index: src/arm/64/ipred.S
>           sxtl            v18.8h,  v18.8b
>           sxtl            v19.8h,  v19.8b
>           add             x6,  x0,  x1
> -@@ -3913,11 +3937,13 @@ function ipred_filter_8bpc_neon, export=1
> +@@ -3913,11 +3945,13 @@ function ipred_filter_8bpc_neon, export=1
>   9:
>           ret
>   
> @@ -531,7 +603,7 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -3926,11 +3952,11 @@ endfunc
> +@@ -3926,11 +3960,11 @@ endfunc
>   function pal_pred_8bpc_neon, export=1
>           ld1             {v0.8h}, [x2]
>           clz             w9,  w4
> @@ -546,7 +618,7 @@ Index: src/arm/64/ipred.S
>           add             x2,  x0,  x1
>           lsl             x1,  x1,  #1
>           br              x6
> -@@ -4008,12 +4034,14 @@ function pal_pred_8bpc_neon, export=1
> +@@ -4008,12 +4042,14 @@ function pal_pred_8bpc_neon, export=1
>           b.gt            64b
>           ret
>   
> @@ -566,7 +638,7 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -4022,12 +4050,12 @@ endfunc
> +@@ -4022,12 +4058,12 @@ endfunc
>   //                              const int16_t *ac, const int alpha);
>   function ipred_cfl_128_8bpc_neon, export=1
>           clz             w9,  w3
> @@ -582,7 +654,7 @@ Index: src/arm/64/ipred.S
>           add             x6,  x0,  x1
>           lsl             x1,  x1,  #1
>           br              x7
> -@@ -4132,12 +4160,14 @@ L(ipred_cfl_splat_w16):
> +@@ -4132,12 +4168,14 @@ L(ipred_cfl_splat_w16):
>           b.gt            1b
>           ret
>   
> @@ -601,7 +673,7 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -4146,12 +4176,12 @@ endfunc
> +@@ -4146,12 +4184,12 @@ endfunc
>   //                              const int16_t *ac, const int alpha);
>   function ipred_cfl_top_8bpc_neon, export=1
>           clz             w9,  w3
> @@ -617,7 +689,7 @@ Index: src/arm/64/ipred.S
>           add             x6,  x0,  x1
>           lsl             x1,  x1,  #1
>           br              x7
> -@@ -4186,11 +4216,13 @@ function ipred_cfl_top_8bpc_neon, export=1
> +@@ -4186,11 +4224,13 @@ function ipred_cfl_top_8bpc_neon, export=1
>           dup             v0.8h,   v2.h[0]
>           b               L(ipred_cfl_splat_w16)
>   
> @@ -635,7 +707,7 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -4201,15 +4233,15 @@ function ipred_cfl_left_8bpc_neon, export=1
> +@@ -4201,15 +4241,15 @@ function ipred_cfl_left_8bpc_neon, export=1
>           sub             x2,  x2,  w4, uxtw
>           clz             w9,  w3
>           clz             w8,  w4
> @@ -657,7 +729,7 @@ Index: src/arm/64/ipred.S
>           add             x6,  x0,  x1
>           lsl             x1,  x1,  #1
>           br              x7
> -@@ -4248,11 +4280,13 @@ L(ipred_cfl_left_h32):
> +@@ -4248,11 +4288,13 @@ L(ipred_cfl_left_h32):
>           dup             v0.8h,   v2.h[0]
>           br              x9
>   
> @@ -675,7 +747,7 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
> -@@ -4266,16 +4300,15 @@ function ipred_cfl_8bpc_neon, export=1
> +@@ -4266,16 +4308,15 @@ function ipred_cfl_8bpc_neon, export=1
>           clz             w9,  w3
>           clz             w6,  w4
>           dup             v16.8h, w8               // width + height
> @@ -696,7 +768,7 @@ Index: src/arm/64/ipred.S
>           ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
>           dup             v17.8h,  w8              // -ctz(width + height)
>           add             x6,  x0,  x1
> -@@ -4392,15 +4425,17 @@ L(ipred_cfl_w32):
> +@@ -4392,15 +4433,17 @@ L(ipred_cfl_w32):
>           dup             v0.8h,   v0.h[0]
>           b               L(ipred_cfl_splat_w16)
>   
> @@ -722,7 +794,7 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
> -@@ -4409,14 +4444,14 @@ endfunc
> +@@ -4409,14 +4452,14 @@ endfunc
>   function ipred_cfl_ac_420_8bpc_neon, export=1
>           clz             w8,  w5
>           lsl             w4,  w4,  #2
> @@ -740,7 +812,7 @@ Index: src/arm/64/ipred.S
>           sub             w8,  w6,  w4         // height - h_pad
>           rbit            w9,  w5              // rbit(width)
>           rbit            w10, w6              // rbit(height)
> -@@ -4555,9 +4590,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc):
> +@@ -4555,9 +4598,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc):
>   
>   L(ipred_cfl_ac_420_w16):
>           AARCH64_VALID_JUMP_TARGET
> @@ -753,7 +825,7 @@ Index: src/arm/64/ipred.S
>           br              x7
>   
>   L(ipred_cfl_ac_420_w16_wpad0):
> -@@ -4714,17 +4749,19 @@ L(ipred_cfl_ac_420_w16_hpad):
> +@@ -4714,17 +4757,19 @@ L(ipred_cfl_ac_420_w16_hpad):
>           lsl             w6,  w6,  #1
>           b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
>   
> @@ -781,7 +853,7 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
> -@@ -4733,14 +4770,14 @@ endfunc
> +@@ -4733,14 +4778,14 @@ endfunc
>   function ipred_cfl_ac_422_8bpc_neon, export=1
>           clz             w8,  w5
>           lsl             w4,  w4,  #2
> @@ -799,7 +871,7 @@ Index: src/arm/64/ipred.S
>           sub             w8,  w6,  w4         // height - h_pad
>           rbit            w9,  w5              // rbit(width)
>           rbit            w10, w6              // rbit(height)
> -@@ -4831,9 +4868,9 @@ L(ipred_cfl_ac_422_w8_wpad):
> +@@ -4831,9 +4876,9 @@ L(ipred_cfl_ac_422_w8_wpad):
>   
>   L(ipred_cfl_ac_422_w16):
>           AARCH64_VALID_JUMP_TARGET
> @@ -812,7 +884,7 @@ Index: src/arm/64/ipred.S
>           br              x7
>   
>   L(ipred_cfl_ac_422_w16_wpad0):
> -@@ -4936,17 +4973,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
> +@@ -4936,17 +4981,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
>           mov             v1.16b,  v3.16b
>           b               L(ipred_cfl_ac_420_w16_hpad)
>   
> @@ -840,7 +912,7 @@ Index: src/arm/64/ipred.S
>   endfunc
>   
>   // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
> -@@ -4955,14 +4994,14 @@ endfunc
> +@@ -4955,14 +5002,14 @@ endfunc
>   function ipred_cfl_ac_444_8bpc_neon, export=1
>           clz             w8,  w5
>           lsl             w4,  w4,  #2
> @@ -858,7 +930,7 @@ Index: src/arm/64/ipred.S
>           sub             w8,  w6,  w4         // height - h_pad
>           rbit            w9,  w5              // rbit(width)
>           rbit            w10, w6              // rbit(height)
> -@@ -5083,9 +5122,10 @@ L(ipred_cfl_ac_444_w16_wpad):
> +@@ -5083,9 +5130,10 @@ L(ipred_cfl_ac_444_w16_wpad):
>   
>   L(ipred_cfl_ac_444_w32):
>           AARCH64_VALID_JUMP_TARGET
> @@ -872,7 +944,7 @@ Index: src/arm/64/ipred.S
>           br              x7
>   
>   L(ipred_cfl_ac_444_w32_wpad0):
> -@@ -5231,15 +5271,17 @@ L(ipred_cfl_ac_444_w32_hpad):
> +@@ -5231,15 +5279,17 @@ L(ipred_cfl_ac_444_w32_hpad):
>           dup             v4.8h,   v4.h[0]
>           b               L(ipred_cfl_ac_420_w8_subtract_dc)
>   

-- 
Regards,
Robert Nagy

Reply via email to