On Thu, 26 Feb 2026 14:49:38 +0800
P1erreCashon <[email protected]> wrote:
> +/* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
> +static inline int
> +rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
> +{
> + const uint8_t *p1 = (const uint8_t *)key1;
> + const uint8_t *p2 = (const uint8_t *)key2;
> + size_t offset = 0;
> +
> + while (offset < 16) {
> + size_t vl = __riscv_vsetvl_e8m1(16 - offset);
> +
> + vuint8m1_t v1 = __riscv_vle8_v_u8m1(p1 + offset, vl);
> + vuint8m1_t v2 = __riscv_vle8_v_u8m1(p2 + offset, vl);
> +
> + /* find != bytes */
> + vbool8_t neq = __riscv_vmsne_vv_u8m1_b8(v1, v2, vl);
> +
> + /* if any byte mismatches, return not equal */
> + if (__riscv_vfirst_m_b8(neq, vl) >= 0)
> + return 1;
> +
> + offset += vl;
> + }
> +
> + /* all bytes equal */
> + return 0;
> +}
> +
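Side note: this helper always compares exactly 16 bytes, and the ratified V
extension implies Zvl128b (VLEN >= 128), so __riscv_vsetvl_e8m1(16) always
returns 16 and the loop body runs exactly once. An untested sketch with the
same intrinsics and the loop dropped:

static inline int
rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
{
	/* V mandates VLEN >= 128, so one vsetvl covers all 16 bytes */
	size_t vl = __riscv_vsetvl_e8m1(16);

	vuint8m1_t v1 = __riscv_vle8_v_u8m1((const uint8_t *)key1, vl);
	vuint8m1_t v2 = __riscv_vle8_v_u8m1((const uint8_t *)key2, vl);

	/* any set bit in the mask marks a mismatching byte */
	vbool8_t neq = __riscv_vmsne_vv_u8m1_b8(v1, v2, vl);

	return __riscv_vfirst_m_b8(neq, vl) >= 0;
}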
Compiling the patch version with godbolt generates much bigger code than the
simple 64-bit version below (both asm listings follow).
Is it really faster?
int
rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
{
const unaligned_uint64_t *k1 = key1;
const unaligned_uint64_t *k2 = key2;
return !!((k1[0] ^ k2[0]) | (k1[1] ^ k2[1]));
}
rte_hash_k16_cmp_eq:
ld a4,8(a0)
ld a5,0(a0)
ld a2,0(a1)
ld a3,8(a1)
xor a0,a5,a2
xor a4,a4,a3
or a0,a0,a4
snez a0,a0
ret
riscv_hash_k16_cmp_eq:
li a4,0
li a6,16
li a7,15
.L5:
sub a5,a6,a4
vsetvli a5,a5,e8,m1,ta,ma
add a2,a0,a4
add a3,a1,a4
vle8.v v1,0(a2)
vle8.v v2,0(a3)
add a4,a4,a5
vmsne.vv v1,v1,v2
vfirst.m a5,v1
bge a5,zero,.L6
bleu a4,a7,.L5
li a0,0
ret
.L6:
li a0,1
ret
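As for "is it really faster" - the only way to settle it is to measure on
real RVV hardware (QEMU cycle counts are meaningless here). A rough sketch
of a microbenchmark; rte_rdtsc() is from rte_cycles.h, ITERS and the key
contents are placeholders, and calling through a function pointer blocks
inlining, so absolute numbers will be pessimistic but the relative
comparison should still hold:

#include <stdint.h>
#include <string.h>
#include <rte_cycles.h>

#define ITERS (1u << 20)

static volatile int sink;	/* keep the compiler from deleting the loop */

static uint64_t
bench_cmp(int (*cmp)(const void *, const void *, size_t))
{
	uint8_t k1[16], k2[16];
	uint64_t start;
	unsigned int i;

	/* equal keys are the worst case: every byte must be checked */
	memset(k1, 0xa5, sizeof(k1));
	memcpy(k2, k1, sizeof(k2));

	start = rte_rdtsc();
	for (i = 0; i < ITERS; i++)
		sink = cmp(k1, k2, sizeof(k1));

	/* average cycles per comparison */
	return (rte_rdtsc() - start) / ITERS;
}

Run it once per implementation and compare the per-call cycle counts.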