Thank you for doing this. I suggest comparing against the previous effort by
Christophe Fontaine though.
A couple of comments inline; otherwise it superficially looks correct.
> +/*
> + * Helper for emit_ld_mbuf(): fast path.
> + * Compute the packet offset; if it lies inside the first segment leave the
> + * data pointer in R0, otherwise branch to the slow path.
> + */
> +static void
> +emit_ldmb_fast_path(struct a64_jit_ctx *ctx, uint8_t src, uint8_t mode,
> + uint32_t sz, int32_t imm, const uint32_t ofs[LDMB_OFS_NUM])
> +{
> + uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
> + uint8_t r6 = ebpf_to_a64_reg(ctx, EBPF_REG_6);
> + uint8_t tmp1 = ebpf_to_a64_reg(ctx, TMP_REG_1);
> + uint8_t tmp2 = ebpf_to_a64_reg(ctx, TMP_REG_2);
> + uint8_t tmp3 = ebpf_to_a64_reg(ctx, TMP_REG_3);
> +
> + /* off = imm (+ src for BPF_IND) */
> + emit_mov_imm(ctx, 1, tmp1, imm);
> + if (mode == BPF_IND)
> + emit_add(ctx, 1, tmp1, src);
> +
> + /* if ((int64_t)(mbuf->data_len - off) < sz) goto slow_path */
> + emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_len));
> + emit_ldr(ctx, BPF_H, tmp2, r6, tmp2);
> + emit_sub(ctx, 1, tmp2, tmp1);
> + emit_mov_imm(ctx, 1, tmp3, sz);
> + emit_cmp(ctx, 1, tmp2, tmp3);
> + emit_b_cond(ctx, A64_LT, (int32_t)(ofs[LDMB_SLOW_OFS] - ctx->idx));
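Are we checking that (int64_t)off >= 0 anywhere? As far as I can see, a
negative off only makes (data_len - off) larger, so the signed comparison
above does not branch to the slow path and the fast path would compute a
pointer that lies before the start of the packet data.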
> +
> + /* R0 = mbuf->buf_addr + mbuf->data_off + off */
> + emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_off));
> + emit_ldr(ctx, BPF_H, tmp2, r6, tmp2);
> + emit_mov_imm(ctx, 1, r0, offsetof(struct rte_mbuf, buf_addr));
> + emit_ldr(ctx, EBPF_DW, r0, r6, r0);
> + emit_add(ctx, 1, r0, tmp2);
> + emit_add(ctx, 1, r0, tmp1);
> +
> + emit_b(ctx, (int32_t)(ofs[LDMB_FIN_OFS] - ctx->idx));
> +}
// snip
> +/*
> + * Emit code for BPF_LD | BPF_ABS and BPF_LD | BPF_IND packet loads:
> + *
> + * off = imm (+ src for BPF_IND)
> + * if (mbuf->data_len - off >= sz) -- fast path
> + * ptr = mbuf->buf_addr + mbuf->data_off + off;
> + * else -- slow path
> + * ptr = __rte_pktmbuf_read(mbuf, off, sz, buf);
> + * if (ptr == NULL)
> + * return 0;
> + * R0 = ntoh(*(size *)ptr); -- common tail
nit: this pseudo-code could probably be made more C-like.
> + *
> + * The three blocks are sized in a dry run so the forward branches can be
> + * resolved, then emitted for real (arm64 instructions are fixed width, so
> + * the dry run reproduces the real instruction count exactly).
> + */
> +static void
> +emit_ld_mbuf(struct a64_jit_ctx *ctx, uint8_t op, uint8_t src, int32_t imm,
> + uint32_t stack_ofs)
> +{
> + uint8_t mode = BPF_MODE(op);
> + uint8_t opsz = BPF_SIZE(op);
> + uint32_t sz = bpf_size(opsz);
> + uint32_t ofs[LDMB_OFS_NUM];
> +
> + /* seed offsets so the dry-run branches stay in range */
> + ofs[LDMB_FAST_OFS] = ofs[LDMB_SLOW_OFS] = ofs[LDMB_FIN_OFS] = ctx->idx;
> +
> + /* dry run to record block offsets */
> + emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
> + ofs[LDMB_SLOW_OFS] = ctx->idx;
> + emit_ldmb_slow_path(ctx, sz, stack_ofs);
> + ofs[LDMB_FIN_OFS] = ctx->idx;
> + emit_ldmb_fin(ctx, opsz, sz);
nit: we already do two passes over the whole program, so the dry run here
means each of these blocks gets emitted four times in total; we could avoid
that quadruple work here.
> +
> + /* rewind and emit for real with resolved offsets */
> + ctx->idx = ofs[LDMB_FAST_OFS];
> + emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
> + emit_ldmb_slow_path(ctx, sz, stack_ofs);
> + emit_ldmb_fin(ctx, opsz, sz);
> +}
// snip the rest