Thank you for doing this. I suggest comparing it against the previous effort
by Christophe Fontaine, though.

A couple of comments inline; otherwise it superficially looks correct.

> +/*
> + * Helper for emit_ld_mbuf(): fast path.
> + * Compute the packet offset; if it lies inside the first segment leave the
> + * data pointer in R0, otherwise branch to the slow path.
> + */
> +static void
> +emit_ldmb_fast_path(struct a64_jit_ctx *ctx, uint8_t src, uint8_t mode,
> +                 uint32_t sz, int32_t imm, const uint32_t ofs[LDMB_OFS_NUM])
> +{
> +     uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
> +     uint8_t r6 = ebpf_to_a64_reg(ctx, EBPF_REG_6);
> +     uint8_t tmp1 = ebpf_to_a64_reg(ctx, TMP_REG_1);
> +     uint8_t tmp2 = ebpf_to_a64_reg(ctx, TMP_REG_2);
> +     uint8_t tmp3 = ebpf_to_a64_reg(ctx, TMP_REG_3);
> +
> +     /* off = imm (+ src for BPF_IND) */
> +     emit_mov_imm(ctx, 1, tmp1, imm);
> +     if (mode == BPF_IND)
> +             emit_add(ctx, 1, tmp1, src);
> +
> +     /* if ((int64_t)(mbuf->data_len - off) < sz) goto slow_path */
> +     emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_len));
> +     emit_ldr(ctx, BPF_H, tmp2, r6, tmp2);
> +     emit_sub(ctx, 1, tmp2, tmp1);
> +     emit_mov_imm(ctx, 1, tmp3, sz);
> +     emit_cmp(ctx, 1, tmp2, tmp3);
> +     emit_b_cond(ctx, A64_LT, (int32_t)(ofs[LDMB_SLOW_OFS] - ctx->idx));

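Are we checking that (int64_t)off >= 0 anywhere? As far as I can tell, if off
is negative (a negative imm, or a negative src for BPF_IND), data_len - off
becomes larger than data_len, so this signed comparison can pass and the fast
path would then form a pointer before the start of the packet data.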

> +
> +     /* R0 = mbuf->buf_addr + mbuf->data_off + off */
> +     emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_off));
> +     emit_ldr(ctx, BPF_H, tmp2, r6, tmp2);
> +     emit_mov_imm(ctx, 1, r0, offsetof(struct rte_mbuf, buf_addr));
> +     emit_ldr(ctx, EBPF_DW, r0, r6, r0);
> +     emit_add(ctx, 1, r0, tmp2);
> +     emit_add(ctx, 1, r0, tmp1);
> +
> +     emit_b(ctx, (int32_t)(ofs[LDMB_FIN_OFS] - ctx->idx));
> +}

// snip

> +/*
> + * Emit code for BPF_LD | BPF_ABS and BPF_LD | BPF_IND packet loads:
> + *
> + *   off = imm (+ src for BPF_IND)
> + *   if (mbuf->data_len - off >= sz)                     -- fast path
> + *           ptr = mbuf->buf_addr + mbuf->data_off + off;
> + *   else                                                -- slow path
> + *           ptr = __rte_pktmbuf_read(mbuf, off, sz, buf);
> + *           if (ptr == NULL)
> + *                   return 0;
> + *   R0 = ntoh(*(size *)ptr);                            -- common tail

nit: this pseudo-code could probably be made more C-like.

> + *
> + * The three blocks are sized in a dry run so the forward branches can be
> + * resolved, then emitted for real (arm64 instructions are fixed width, so
> + * the dry run reproduces the real instruction count exactly).
> + */
> +static void
> +emit_ld_mbuf(struct a64_jit_ctx *ctx, uint8_t op, uint8_t src, int32_t imm,
> +          uint32_t stack_ofs)
> +{
> +     uint8_t mode = BPF_MODE(op);
> +     uint8_t opsz = BPF_SIZE(op);
> +     uint32_t sz = bpf_size(opsz);
> +     uint32_t ofs[LDMB_OFS_NUM];
> +
> +     /* seed offsets so the dry-run branches stay in range */
> +     ofs[LDMB_FAST_OFS] = ofs[LDMB_SLOW_OFS] = ofs[LDMB_FIN_OFS] = ctx->idx;
> +
> +     /* dry run to record block offsets */
> +     emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
> +     ofs[LDMB_SLOW_OFS] = ctx->idx;
> +     emit_ldmb_slow_path(ctx, sz, stack_ofs);
> +     ofs[LDMB_FIN_OFS] = ctx->idx;
> +     emit_ldmb_fin(ctx, opsz, sz);

nit: we already do two passes over the whole program, so the dry run plus the
real emit here means each packet load is generated four times; we could avoid
that quadruple work.

> +
> +     /* rewind and emit for real with resolved offsets */
> +     ctx->idx = ofs[LDMB_FAST_OFS];
> +     emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
> +     emit_ldmb_slow_path(ctx, sz, stack_ofs);
> +     emit_ldmb_fin(ctx, opsz, sz);
> +}

// snip the rest

Reply via email to