This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 89c21b5ab7d4880c679b7e28da5fae7d6f9e08f9 Author: Jun Zhao <[email protected]> AuthorDate: Fri Mar 6 17:57:05 2026 +0800 Commit: Jun Zhao <[email protected]> CommitDate: Mon Mar 30 14:32:10 2026 +0000 lavc/hevc: add aarch64 NEON for Planar prediction Add NEON-optimized implementation for HEVC intra Planar prediction at 8-bit depth, supporting all block sizes (4x4 to 32x32). Planar prediction implements bilinear interpolation using an incremental base update: base_{y+1}[x] = base_y[x] - (top[x] - left[N]), reducing per-row computation from 4 multiply-adds to 1 subtract + 1 multiply. Uses rshrn for rounded narrowing shifts, eliminating manual rounding bias. All left[y] values are broadcast in the NEON domain, avoiding GP-to-NEON transfers. 4x4 interleaves row computations across 4 rows to break dependencies. 16x16 uses v19-v22 for persistent base/decrement vectors, avoiding callee-saved register spills. 32x32 processes 8 rows per loop iteration (4 iterations total) to reduce code size while maintaining full NEON utilization. 
Speedup over C on Apple M4 (checkasm --bench): 4x4: 2.25x 8x8: 6.40x 16x16: 9.72x 32x32: 3.21x Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/hevcpred_init_aarch64.c | 14 + libavcodec/aarch64/hevcpred_neon.S | 430 +++++++++++++++++++++++++++++ 2 files changed, 444 insertions(+) diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c b/libavcodec/aarch64/hevcpred_init_aarch64.c index db4029a161..55f3e2e731 100644 --- a/libavcodec/aarch64/hevcpred_init_aarch64.c +++ b/libavcodec/aarch64/hevcpred_init_aarch64.c @@ -39,6 +39,16 @@ void ff_hevc_pred_dc_32x32_8_neon(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int c_idx); +// Planar prediction +void ff_hevc_pred_planar_4x4_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); +void ff_hevc_pred_planar_8x8_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); +void ff_hevc_pred_planar_16x16_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); +void ff_hevc_pred_planar_32x32_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); + static void pred_dc_neon(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int log2_size, int c_idx) @@ -70,5 +80,9 @@ av_cold void ff_hevc_pred_init_aarch64(HEVCPredContext *hpc, int bit_depth) if (bit_depth == 8) { hpc->pred_dc = pred_dc_neon; + hpc->pred_planar[0] = ff_hevc_pred_planar_4x4_8_neon; + hpc->pred_planar[1] = ff_hevc_pred_planar_8x8_8_neon; + hpc->pred_planar[2] = ff_hevc_pred_planar_16x16_8_neon; + hpc->pred_planar[3] = ff_hevc_pred_planar_32x32_8_neon; } } diff --git a/libavcodec/aarch64/hevcpred_neon.S b/libavcodec/aarch64/hevcpred_neon.S index cd1508bb6c..26d4c887ab 100644 --- a/libavcodec/aarch64/hevcpred_neon.S +++ b/libavcodec/aarch64/hevcpred_neon.S @@ -405,3 +405,433 @@ function ff_hevc_pred_dc_32x32_8_neon, export=1 // No edge smoothing for 32x32 (size >= 32) ret endfunc + +// 
============================================================================= +// Planar Prediction +// ============================================================================= + +/* + * Planar prediction algorithm: + * For each pixel (x, y): + * POS(x,y) = ((size-1-x)*left[y] + (x+1)*top[size] + + * (size-1-y)*top[x] + (y+1)*left[size] + size) >> (log2_size+1) + */ +// ----------------------------------------------------------------------------- +// pred_planar_4x4_8: Planar prediction +// Arguments: +// x0: src +// x1: top +// x2: left +// x3: stride +// ----------------------------------------------------------------------------- +function ff_hevc_pred_planar_4x4_8_neon, export=1 + // Load reference samples + ldr s0, [x1] // top[0..3] + ldr s1, [x2] // left[0..3] + ldrb w4, [x1, #4] // top[4] + ldrb w5, [x2, #4] // left[4] + + // Setup weight vectors for x direction + movrel x6, planar_weights_4 + ldp d4, d5, [x6] // weights_dec, weights_inc + + // Precompute base[x] = inc[x]*top[4] + 3*top[x] + left[4] + dup v6.8b, w4 // top[4] + umull v2.8h, v5.8b, v6.8b // inc[x]*top[4] + uxtl v3.8h, v0.8b // widen top[x] + ushll v6.8h, v0.8b, #1 // top[x]<<1 + add v6.8h, v6.8h, v3.8h // 3*top[x] + add v2.8h, v2.8h, v6.8h // + 3*top[x] + dup v5.8h, w5 // left[4] as 16-bit + add v2.8h, v2.8h, v5.8h // + left[4] (y=0: (y+1)*left[4]=1*left[4]) + // v2 = base for row 0 (rounding folded into rshrn below) + + // Precompute decrement: top[x] - left[4] + sub v5.8h, v3.8h, v5.8h // decrement = top[x] - left[4] + + // Pre-dup all left values + dup v16.8b, v1.b[0] // left[0] + dup v17.8b, v1.b[1] // left[1] + dup v18.8b, v1.b[2] // left[2] + dup v19.8b, v1.b[3] // left[3] + + // Compute bases for all 4 rows + mov v3.16b, v2.16b // base0 = base + sub v20.8h, v2.8h, v5.8h // base1 = base - dec + sub v21.8h, v20.8h, v5.8h // base2 = base1 - dec + sub v22.8h, v21.8h, v5.8h // base3 = base2 - dec + + // Interleaved row 0/1 computation + umull v6.8h, v4.8b, v16.8b // row0: 
dec*left[0] + umull v7.8h, v4.8b, v17.8b // row1: dec*left[1] + add v6.8h, v6.8h, v3.8h // row0: + base0 + add v7.8h, v7.8h, v20.8h // row1: + base1 + rshrn v6.8b, v6.8h, #3 // row0: >>3 + rshrn v7.8b, v7.8h, #3 // row1: >>3 + + // Interleaved row 2/3 computation + umull v23.8h, v4.8b, v18.8b // row2: dec*left[2] + st1 {v6.s}[0], [x0], x3 // store row0 + umull v24.8h, v4.8b, v19.8b // row3: dec*left[3] + st1 {v7.s}[0], [x0], x3 // store row1 + add v23.8h, v23.8h, v21.8h // row2: + base2 + add v24.8h, v24.8h, v22.8h // row3: + base3 + rshrn v23.8b, v23.8h, #3 // row2 + rshrn v24.8b, v24.8h, #3 // row3 + st1 {v23.s}[0], [x0], x3 // store row2 + str s24, [x0] // store row3 + + ret +endfunc + +// ----------------------------------------------------------------------------- +// pred_planar_8x8_8: Planar prediction +// Arguments: +// x0: src +// x1: top +// x2: left +// x3: stride +// ----------------------------------------------------------------------------- +function ff_hevc_pred_planar_8x8_8_neon, export=1 + // Load reference samples + ldr d0, [x1] // top[0..7] + ldr d1, [x2] // left[0..7] + ldrb w4, [x1, #8] // top[8] + ldrb w5, [x2, #8] // left[8] + + // Setup weight vectors + movrel x6, planar_weights_8 + ldp d4, d5, [x6] // weights_dec, weights_inc + + // Precompute base[x] = inc[x]*top[8] + 7*top[x] + left[8] + dup v6.8b, w4 // top[8] + umull v2.8h, v5.8b, v6.8b // inc[x]*top[8] + uxtl v3.8h, v0.8b // widen top[x] + ushll v6.8h, v0.8b, #3 // top[x]<<3 + sub v6.8h, v6.8h, v3.8h // 7*top[x] + add v2.8h, v2.8h, v6.8h // + 7*top[x] + dup v5.8h, w5 // left[8] as 16-bit + add v2.8h, v2.8h, v5.8h // + left[8] + // v2 = base for row 0 (rounding folded into rshrn below) + + // Precompute decrement: top[x] - left[8] + sub v5.8h, v3.8h, v5.8h // decrement + + // Precompute all 8 row bases to break serial dependency chain + mov v16.16b, v2.16b // base0 + sub v17.8h, v16.8h, v5.8h // base1 + sub v18.8h, v17.8h, v5.8h // base2 + sub v19.8h, v18.8h, v5.8h // base3 + sub 
v20.8h, v19.8h, v5.8h // base4 + sub v21.8h, v20.8h, v5.8h // base5 + sub v22.8h, v21.8h, v5.8h // base6 + sub v23.8h, v22.8h, v5.8h // base7 + + // Rows 0-1 + dup v6.8b, v1.b[0] + dup v7.8b, v1.b[1] + umull v2.8h, v4.8b, v6.8b + umull v3.8h, v4.8b, v7.8b + add v2.8h, v2.8h, v16.8h + add v3.8h, v3.8h, v17.8h + rshrn v2.8b, v2.8h, #4 + rshrn v3.8b, v3.8h, #4 + st1 {v2.d}[0], [x0], x3 + st1 {v3.d}[0], [x0], x3 + + // Rows 2-3 + dup v6.8b, v1.b[2] + dup v7.8b, v1.b[3] + umull v2.8h, v4.8b, v6.8b + umull v3.8h, v4.8b, v7.8b + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + rshrn v2.8b, v2.8h, #4 + rshrn v3.8b, v3.8h, #4 + st1 {v2.d}[0], [x0], x3 + st1 {v3.d}[0], [x0], x3 + + // Rows 4-5 + dup v6.8b, v1.b[4] + dup v7.8b, v1.b[5] + umull v2.8h, v4.8b, v6.8b + umull v3.8h, v4.8b, v7.8b + add v2.8h, v2.8h, v20.8h + add v3.8h, v3.8h, v21.8h + rshrn v2.8b, v2.8h, #4 + rshrn v3.8b, v3.8h, #4 + st1 {v2.d}[0], [x0], x3 + st1 {v3.d}[0], [x0], x3 + + // Rows 6-7 + dup v6.8b, v1.b[6] + dup v7.8b, v1.b[7] + umull v2.8h, v4.8b, v6.8b + umull v3.8h, v4.8b, v7.8b + add v2.8h, v2.8h, v22.8h + add v3.8h, v3.8h, v23.8h + rshrn v2.8b, v2.8h, #4 + rshrn v3.8b, v3.8h, #4 + st1 {v2.d}[0], [x0], x3 + str d3, [x0] + + ret +endfunc + +// ----------------------------------------------------------------------------- +// pred_planar_16x16_8: Planar prediction +// +// Formula: POS(x,y) = ((15-x)*left[y] + (x+1)*top[16] +// + (15-y)*top[x] + (y+1)*left[16] + 16) >> 5 +// +// Decomposed into incremental base update (same as 4x4/8x8/32x32): +// base_0[x] = inc[x]*top[16] + 15*top[x] + left[16] + 16 +// base_{y+1}[x] = base_y[x] - (top[x] - left[16]) +// POS(x,y) = (base_y[x] + weights_dec[x]*left[y]) >> 5 +// +// 16-wide requires split-half processing (umull/umull2). +// 16 rows fully unrolled with NEON-domain left[y] broadcast. 
+// +// Arguments: +// x0: src +// x1: top +// x2: left +// x3: stride +// ----------------------------------------------------------------------------- +function ff_hevc_pred_planar_16x16_8_neon, export=1 + // Load reference samples + ldr q0, [x1] // top[0..15] + ldr q1, [x2] // left[0..15] + ldrb w4, [x1, #16] // top[16] + ldrb w5, [x2, #16] // left[16] + + // Setup weight vectors for 16 elements + movrel x6, planar_weights_16 + ldp q4, q5, [x6] // weights_dec [15..0], weights_inc [1..16] + + // Precompute base[x] = inc[x]*top[16] + 15*top[x] + left[16] + dup v6.16b, w4 // top[16] broadcast + umull v19.8h, v5.8b, v6.8b // inc[x]*top[16] lo + umull2 v20.8h, v5.16b, v6.16b // inc[x]*top[16] hi + + // 15*top[x] = (top[x]<<4) - top[x] + uxtl v2.8h, v0.8b // widen top[0..7] + uxtl2 v3.8h, v0.16b // widen top[8..15] + ushll v6.8h, v0.8b, #4 // top[0..7]<<4 + ushll2 v7.8h, v0.16b, #4 // top[8..15]<<4 + sub v6.8h, v6.8h, v2.8h // 15*top[0..7] + sub v7.8h, v7.8h, v3.8h // 15*top[8..15] + + add v19.8h, v19.8h, v6.8h // + 15*top lo + add v20.8h, v20.8h, v7.8h // + 15*top hi + + dup v5.8h, w5 // left[16] as 16-bit + add v19.8h, v19.8h, v5.8h // + left[16] + add v20.8h, v20.8h, v5.8h + // v19/v20 = base_lo/base_hi for row 0 (rounding folded into rshrn below) + + // Precompute decrement: top[x] - left[16] + sub v21.8h, v2.8h, v5.8h // dec lo = top[0..7] - left[16] + sub v22.8h, v3.8h, v5.8h // dec hi = top[8..15] - left[16] + + // Persistent registers: + // v19,v20 = base[0..15] (16-bit, decremented each row) + // v21,v22 = decrement[0..15] (16-bit, constant) + // v4 = weight_dec[0..15] (8-bit, constant) + // v1 = left[0..15] (8-bit, preloaded for NEON-domain broadcast) + +.macro planar16_row lane, last=0 + dup v6.16b, v1.b[\lane] // left[y] NEON-domain broadcast + umull v16.8h, v4.8b, v6.8b // dec[x]*left[y] lo + umull2 v17.8h, v4.16b, v6.16b // dec[x]*left[y] hi + add v16.8h, v16.8h, v19.8h // + base lo + add v17.8h, v17.8h, v20.8h // + base hi + rshrn v16.8b, v16.8h, #5 // 
>>5 lo (rounded) + rshrn2 v16.16b, v17.8h, #5 // >>5 hi, merge into q16 + .if \last == 0 + st1 {v16.16b}, [x0], x3 + sub v19.8h, v19.8h, v21.8h // base -= decrement + sub v20.8h, v20.8h, v22.8h + .else + str q16, [x0] + .endif +.endm + + planar16_row 0 + planar16_row 1 + planar16_row 2 + planar16_row 3 + planar16_row 4 + planar16_row 5 + planar16_row 6 + planar16_row 7 + planar16_row 8 + planar16_row 9 + planar16_row 10 + planar16_row 11 + planar16_row 12 + planar16_row 13 + planar16_row 14 + planar16_row 15, last=1 + +.purgem planar16_row + + ret +endfunc + +// ----------------------------------------------------------------------------- +// pred_planar_32x32_8: Planar prediction +// +// Formula: POS(x,y) = ((31-x)*left[y] + (x+1)*top[32] +// + (31-y)*top[x] + (y+1)*left[32] + 32) >> 6 +// +// Decomposed as: base[x] = weight_inc[x]*top[32] + 31*top[x] + 32 +// Per row: base[x] += left[32] (incremental for (y+1)*left[32]) +// base[x] -= top[x] (incremental for (31-y)*top[x]) +// result = base[x] + weight_dec[x]*left[y] +// +// Both row_add and the (31-y)*top[x] term are folded into the base, +// eliminating all per-row GP→NEON scalar broadcasts. +// The loop processes 8 rows per iteration (macro-expanded), with 4 +// iterations total. left[y] values are loaded 8 at a time and +// broadcast per-lane (NEON-domain dup) within each iteration. 
+// +// Arguments: +// x0: src +// x1: top +// x2: left +// x3: stride +// ----------------------------------------------------------------------------- +function ff_hevc_pred_planar_32x32_8_neon, export=1 + // Load top[0..31] + ldp q0, q1, [x1] // top[0..15], top[16..31] + ldrb w4, [x1, #32] // top[32] + ldrb w5, [x2, #32] // left[32] + + // Load weight vectors + movrel x6, planar_weights_32 + ldp q4, q5, [x6] // weight_dec = {31,30,...,0} + ldp q6, q7, [x6, #32] // weight_inc = {1,2,...,32} + + // Precompute term_A = weight_inc * top[32] (16-bit) + dup v2.16b, w4 + umull v20.8h, v6.8b, v2.8b + umull2 v21.8h, v6.16b, v2.16b + umull v22.8h, v7.8b, v2.8b + umull2 v23.8h, v7.16b, v2.16b + + // Widen top[x] for incremental subtraction + uxtl v24.8h, v0.8b + uxtl2 v25.8h, v0.16b + + // 31*top[x] = top[x]<<5 - top[x] + ushll v6.8h, v0.8b, #5 + ushll2 v7.8h, v0.16b, #5 + sub v6.8h, v6.8h, v24.8h // 31*top[0..7] + sub v7.8h, v7.8h, v25.8h // 31*top[8..15] + + // base[0..15] = term_A + 31*top[0..15] + add v20.8h, v20.8h, v6.8h + add v21.8h, v21.8h, v7.8h + + // Same for top[16..31] + uxtl v26.8h, v1.8b + uxtl2 v27.8h, v1.16b + ushll v6.8h, v1.8b, #5 + ushll2 v7.8h, v1.16b, #5 + sub v6.8h, v6.8h, v26.8h // 31*top[16..23] + sub v7.8h, v7.8h, v27.8h // 31*top[24..31] + add v22.8h, v22.8h, v6.8h + add v23.8h, v23.8h, v7.8h + + // Compute combined decrement: top[x] - left[32] + // Each row: base += left[32] and base -= top[x] + // Combined: base -= (top[x] - left[32]) + dup v3.8h, w5 // left[32] as 16-bit + sub v24.8h, v24.8h, v3.8h // top[0..7] - left[32] + sub v25.8h, v25.8h, v3.8h // top[8..15] - left[32] + sub v26.8h, v26.8h, v3.8h // top[16..23] - left[32] + sub v27.8h, v27.8h, v3.8h // top[24..31] - left[32] + + // Now base needs initial +=left[32] for y=0 (row_add = 1*left[32]) + add v20.8h, v20.8h, v3.8h + add v21.8h, v21.8h, v3.8h + add v22.8h, v22.8h, v3.8h + add v23.8h, v23.8h, v3.8h + + // Persistent registers: + // v20-v23 = base[0..31] (includes running row_add, 
decremented by combined each row) + // v24,v25 = top[0..15] - left[32] (combined decrement) + // v26,v27 = top[16..31] - left[32] (combined decrement) + // v4,v5 = weight_dec[0..31] (8-bit) + +.macro planar32_row lane, leftreg + dup v2.16b, \leftreg\().b[\lane] + umull v16.8h, v4.8b, v2.8b + umull2 v17.8h, v4.16b, v2.16b + umull v18.8h, v5.8b, v2.8b + umull2 v19.8h, v5.16b, v2.16b + add v16.8h, v16.8h, v20.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h + add v19.8h, v19.8h, v23.8h + rshrn v28.8b, v16.8h, #6 + rshrn2 v28.16b, v17.8h, #6 + rshrn v29.8b, v18.8h, #6 + rshrn2 v29.16b, v19.8h, #6 + stp q28, q29, [x0] + add x0, x0, x3 + sub v20.8h, v20.8h, v24.8h + sub v21.8h, v21.8h, v25.8h + sub v22.8h, v22.8h, v26.8h + sub v23.8h, v23.8h, v27.8h +.endm + + // Process 32 rows in 4 iterations of 8 rows each + mov w7, #4 +.Lplanar32_loop: + ld1 {v1.8b}, [x2], #8 // load 8 left[] bytes for this iteration + + planar32_row 0, v1 + planar32_row 1, v1 + planar32_row 2, v1 + planar32_row 3, v1 + planar32_row 4, v1 + planar32_row 5, v1 + planar32_row 6, v1 + planar32_row 7, v1 + + subs w7, w7, #1 + b.gt .Lplanar32_loop + +.purgem planar32_row + + ret +endfunc + + +// ============================================================================= +// Weight tables for planar prediction +// ============================================================================= + +const planar_weights_4, align=4 + .byte 3, 2, 1, 0, 0, 0, 0, 0 // weights_dec for 4x4 + .byte 1, 2, 3, 4, 0, 0, 0, 0 // weights_inc for 4x4 +endconst + +const planar_weights_8, align=4 + .byte 7, 6, 5, 4, 3, 2, 1, 0 // weights_dec + .byte 1, 2, 3, 4, 5, 6, 7, 8 // weights_inc +endconst + +const planar_weights_16, align=4 + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +endconst + +const planar_weights_32, align=4 + .byte 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 
4, 3, 2, 1, 0 + .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + .byte 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +endconst _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
