Re: [PATCH v1 1/3] examples/l3fwd: relax RSS requirement with option
Hi Trevor, Now the port Rx mq_mode had been set to RTE_ETH_MQ_RX_RSS by default, but some hw and/or virtual interface does not support the RSS and offload mode presupposed, e.g., some virtio interfaces in the cloud don't support RSS and the error msg may like: virtio_dev_configure(): RSS support requested but not supported by the device Port0 dev_configure = -95 So to enable the l3fwd running in that environment, the Rx mode requirement can be relaxed to reflect the hardware feature reality here, and the l3fwd can run smoothly then. An option named "relax-rx-mode" is added to enable the relax action here, and it's disabled by default. Signed-off-by: Trevor Tao --- examples/l3fwd/main.c | 19 +-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 6063eb1399..2c8f528d00 100644 --- a/examples/l3fwd/main.c +++ b/examples/l3fwd/main.c @@ -73,6 +73,7 @@ static enum L3FWD_LOOKUP_MODE lookup_mode; static int numa_on = 1; /**< NUMA is enabled by default. */ static int parse_ptype; /**< Parse packet type using rx callback, and */ /**< disabled by default */ +static int relax_rx_mode; /**< Relax RX mode is disabled by default */ static int per_port_pool; /**< Use separate buffer pools per port; disabled */ /**< by default */ @@ -678,6 +679,7 @@ static const char short_options[] = #define CMD_LINE_OPT_MAX_PKT_LEN "max-pkt-len" #define CMD_LINE_OPT_HASH_ENTRY_NUM "hash-entry-num" #define CMD_LINE_OPT_PARSE_PTYPE "parse-ptype" +#define CMD_LINE_OPT_RELAX_RX_MODE "relax-rx-mode" #define CMD_LINE_OPT_PER_PORT_POOL "per-port-pool" #define CMD_LINE_OPT_MODE "mode" #define CMD_LINE_OPT_EVENTQ_SYNC "eventq-sched" @@ -705,6 +707,7 @@ enum { CMD_LINE_OPT_MAX_PKT_LEN_NUM, CMD_LINE_OPT_HASH_ENTRY_NUM_NUM, CMD_LINE_OPT_PARSE_PTYPE_NUM, + CMD_LINE_OPT_RELAX_RX_MODE_NUM, CMD_LINE_OPT_RULE_IPV4_NUM, CMD_LINE_OPT_RULE_IPV6_NUM, CMD_LINE_OPT_ALG_NUM, @@ -728,6 +731,7 @@ static const struct option lgopts[] = { {CMD_LINE_OPT_MAX_PKT_LEN, 1, 0, CMD_LINE_OPT_MAX_PKT_LEN_NUM}, {CMD_LINE_OPT_HASH_ENTRY_NUM, 1, 0, CMD_LINE_OPT_HASH_ENTRY_NUM_NUM}, {CMD_LINE_OPT_PARSE_PTYPE, 0, 0, CMD_LINE_OPT_PARSE_PTYPE_NUM}, + {CMD_LINE_OPT_RELAX_RX_MODE, 0, 0, CMD_LINE_OPT_RELAX_RX_MODE_NUM}, {CMD_LINE_OPT_PER_PORT_POOL, 0, 0, CMD_LINE_OPT_PARSE_PER_PORT_POOL}, {CMD_LINE_OPT_MODE, 1, 0, CMD_LINE_OPT_MODE_NUM}, {CMD_LINE_OPT_EVENTQ_SYNC, 1, 0, CMD_LINE_OPT_EVENTQ_SYNC_NUM}, @@ -853,6 +857,11 @@ parse_args(int argc, char **argv) parse_ptype = 1; break; + case CMD_LINE_OPT_RELAX_RX_MODE_NUM: + printf("Relax rx mode is enabled\n"); + relax_rx_mode = 1; + break; + case CMD_LINE_OPT_PARSE_PER_PORT_POOL: printf("per port buffer pool is enabled\n"); per_port_pool = 1; @@ -1257,8 +1266,14 @@ l3fwd_poll_resource_setup(void) local_port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads; - if (dev_info.max_rx_queues == 1) - local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE; + /* relax the rx rss requirement */ + if (dev_info.max_rx_queues == 1 || !local_port_conf.rx_adv_conf.rss_conf.rss_hf) { + if (relax_rx_mode) { + printf("warning: modified the rx mq_mode to RTE_ETH_MQ_RX_NONE base on" + " device capability\n"); + local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE; + } + } But that way we change current behavior - always use MQ_RX_NONE for devices with just one RX queue. Was it intended? 
Might be it should be:

	if (dev_info.max_rx_queues == 1)
		local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE;

+	/* relax the rx rss requirement */
+	if (relax_rx_mode &&
+			!local_port_conf.rx_adv_conf.rss_conf.rss_hf) {
+		printf("...");
+		local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE;
+	}

?

	if (local_port_conf.rx_adv_conf.rss_conf.rss_hf !=
			port_conf.rx_adv_conf.rss_conf.rss_hf) {
Re: [PATCH v1 2/3] examples/l3fwd: relax the Offload requirement
02.10.2023 09:53, Trevor Tao пишет: Now the port Rx offload mode is set to RTE_ETH_RX_OFFLOAD_CHECKSUM by default, but some hw and/or virtual interface does not support the offload mode presupposed, e.g., some virtio interfaces in the cloud may only partly support RTE_ETH_RX_OFFLOAD_UDP_CKSUM/ RTE_ETH_RX_OFFLOAD_TCP_CKSUM, but not RTE_ETH_RX_OFFLOAD_IPV4_CKSUM, and the error msg here: Ethdev port_id=0 requested Rx offloads 0xe does not match Rx offloads capabilities 0x201d in rte_eth_dev_configure() So to enable the l3fwd running in that environment, the Rx mode requirement can be relaxed to reflect the hardware feature reality here, and the l3fwd can run smoothly then. A warning msg would be provided to user in case it happens here. On the other side, enabling the software cksum check in case missing the hw support. The relax action for rx cksum offload is just enabled when relax_rx_mode is true which is false by default. Signed-off-by: Trevor Tao --- examples/l3fwd/l3fwd.h | 12 ++-- examples/l3fwd/l3fwd_em.h | 2 +- examples/l3fwd/l3fwd_lpm.h | 2 +- examples/l3fwd/main.c | 14 ++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index b55855c932..fd98ad3373 100644 --- a/examples/l3fwd/l3fwd.h +++ b/examples/l3fwd/l3fwd.h @@ -159,7 +159,7 @@ send_single_packet(struct lcore_conf *qconf, #ifdef DO_RFC_1812_CHECKS static inline int -is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) +is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len, uint64_t ol_flags) { /* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */ /* @@ -170,7 +170,15 @@ is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) return -1; /* 2. The IP checksum must be correct. */ - /* this is checked in H/W */ + /* if this is not checked in H/W, check it. */ + if ((ol_flags & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) { + uint16_t actual_cksum, expected_cksum; + actual_cksum = pkt->hdr_checksum; + pkt->hdr_checksum = 0; + expected_cksum = rte_ipv4_cksum(pkt); + if (actual_cksum != expected_cksum) + return -2; + } /* * 3. The IP version number must be 4. If the version number is not 4 diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h index 7d051fc076..1fee2e2e6c 100644 --- a/examples/l3fwd/l3fwd_em.h +++ b/examples/l3fwd/l3fwd_em.h @@ -20,7 +20,7 @@ l3fwd_em_handle_ipv4(struct rte_mbuf *m, uint16_t portid, #ifdef DO_RFC_1812_CHECKS /* Check to make sure the packet is valid (RFC1812) */ - if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) { + if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len, m->ol_flags) < 0) { rte_pktmbuf_free(m); return BAD_PORT; } diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h index c61b969584..5ddae7da0f 100644 --- a/examples/l3fwd/l3fwd_lpm.h +++ b/examples/l3fwd/l3fwd_lpm.h @@ -22,7 +22,7 @@ l3fwd_lpm_simple_forward(struct rte_mbuf *m, uint16_t portid, #ifdef DO_RFC_1812_CHECKS /* Check to make sure the packet is valid (RFC1812) */ - if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) { + if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0, m->ol_flags) { Typo, pls fix. 
rte_pktmbuf_free(m); return; } diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 2c8f528d00..a48ae7f62b 100644 --- a/examples/l3fwd/main.c +++ b/examples/l3fwd/main.c @@ -1284,6 +1284,20 @@ l3fwd_poll_resource_setup(void) local_port_conf.rx_adv_conf.rss_conf.rss_hf); } + /* relax the rx offload requirement */ + if ((local_port_conf.rxmode.offloads & dev_info.rx_offload_capa) != + local_port_conf.rxmode.offloads) { Ok, but we relax only IP cksum. Though l3fwd tries to enable IP/TCP/UDP cksum. What if TCP/UDP is not supported, should we allow it or fail? + printf("Port %u requested Rx offloads 0x%"PRIx64" does not" + " match Rx offloads capabilities 0x%"PRIx64"\n", + portid, local_port_conf.rxmode.offloads, + dev_info.rx_offload_capa); + if (relax_rx_mode) { + local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa; + printf("warning: modified the rx offload to 0x%"PRIx64" based on device" + " capability\n", local_port_conf.rxmode.offloads); + } + } + ret = rte_eth_dev_configure(portid, nb_rx_queue, (uint16_t)n
Re: [PATCH v6 3/3] power: amd power monitor support
0xf2, 0x0f, 0xae, 0xf7;" - : /* ignore rflags */ - : "D"(0), /* enter C0.2 */ - "a"(tsc_l), "d"(tsc_h)); -#endif + /* execute mwait */ + power_monitor_ops.mwait(tsc_timestamp); end: /* erase sleep address */ @@ -186,6 +238,14 @@ RTE_INIT(rte_power_intrinsics_init) { wait_multi_supported = 1; if (i.power_monitor) monitor_supported = 1; + + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MONITORX)) { /* AMD */ + power_monitor_ops.mmonitor = &amd_monitorx; + power_monitor_ops.mwait = &amd_mwaitx; + } else { /* Intel */ + power_monitor_ops.mmonitor = &intel_umonitor; + power_monitor_ops.mwait = &intel_umwait; + } } int Acked-by: Konstantin Ananyev
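For context: power_monitor_ops is the small dispatch table that the RTE_INIT hook above fills in once at startup, so the power-monitor fast path calls either the Intel umonitor/umwait pair or the AMD monitorx/mwaitx pair without per-call branching. A rough sketch of its shape is below; the field names come from the hunk, the signatures are assumptions, and the real definition lives elsewhere in the patch.

	/* Illustrative sketch only -- approximate shape of the dispatch table
	 * assigned in RTE_INIT(rte_power_intrinsics_init) above.
	 */
	static struct {
		void (*mmonitor)(volatile void *addr);       /* intel_umonitor or amd_monitorx */
		void (*mwait)(const uint64_t tsc_timestamp); /* intel_umwait or amd_mwaitx */
	} power_monitor_ops;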
RE: [PATCH] maintainers: remove olivier.m...@6wind.com
> > > > > On Tue, Oct 17, 2023 at 04:27:37PM +0200, Olivier Matz wrote:
> > > > Unfortunatly I don't have enough time to undertake my maintainer role at
> > > > the expected level. It will probably not going to get better anytime
> > > > soon, so remove myself from maintainers.
> > > >
> > > > Signed-off-by: Olivier Matz
> > > > ---
> > > Sorry to see your name dropped from the file after so many years!
> > >
> > > Sadly,
> > > Acked-by: Bruce Richardson
> >
> > Sorry to see you go.
> > Echo the same here.
> > > Should start a CREDITS file to remember past maintainers?
> > +1
+1 for both.
> >
> > Acked-by: Stephen Hemminger
Re: [PATCH v2 1/3] examples/l3fwd: relax RSS requirement with option
13.10.2023 05:27, Trevor Tao пишет: Now the port Rx mq_mode had been set to RTE_ETH_MQ_RX_RSS by default, but some hw and/or virtual interface does not support the RSS and offload mode presupposed, e.g., some virtio interfaces in the cloud don't support RSS and the error msg may like: virtio_dev_configure(): RSS support requested but not supported by the device Port0 dev_configure = -95 So to enable the l3fwd running in that environment, the Rx mode requirement can be relaxed to reflect the hardware feature reality here, and the l3fwd can run smoothly then. An option named "relax-rx-mode" is added to enable the relax action here, and it's disabled by default. Signed-off-by: Trevor Tao --- examples/l3fwd/main.c | 16 1 file changed, 16 insertions(+) diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 6063eb1399..89ad546a5e 100644 --- a/examples/l3fwd/main.c +++ b/examples/l3fwd/main.c @@ -73,6 +73,7 @@ static enum L3FWD_LOOKUP_MODE lookup_mode; static int numa_on = 1; /**< NUMA is enabled by default. */ static int parse_ptype; /**< Parse packet type using rx callback, and */ /**< disabled by default */ +static int relax_rx_mode; /**< Relax RX mode is disabled by default */ static int per_port_pool; /**< Use separate buffer pools per port; disabled */ /**< by default */ @@ -678,6 +679,7 @@ static const char short_options[] = #define CMD_LINE_OPT_MAX_PKT_LEN "max-pkt-len" #define CMD_LINE_OPT_HASH_ENTRY_NUM "hash-entry-num" #define CMD_LINE_OPT_PARSE_PTYPE "parse-ptype" +#define CMD_LINE_OPT_RELAX_RX_MODE "relax-rx-mode" #define CMD_LINE_OPT_PER_PORT_POOL "per-port-pool" #define CMD_LINE_OPT_MODE "mode" #define CMD_LINE_OPT_EVENTQ_SYNC "eventq-sched" @@ -705,6 +707,7 @@ enum { CMD_LINE_OPT_MAX_PKT_LEN_NUM, CMD_LINE_OPT_HASH_ENTRY_NUM_NUM, CMD_LINE_OPT_PARSE_PTYPE_NUM, + CMD_LINE_OPT_RELAX_RX_MODE_NUM, CMD_LINE_OPT_RULE_IPV4_NUM, CMD_LINE_OPT_RULE_IPV6_NUM, CMD_LINE_OPT_ALG_NUM, @@ -728,6 +731,7 @@ static const struct option lgopts[] = { {CMD_LINE_OPT_MAX_PKT_LEN, 1, 0, CMD_LINE_OPT_MAX_PKT_LEN_NUM}, {CMD_LINE_OPT_HASH_ENTRY_NUM, 1, 0, CMD_LINE_OPT_HASH_ENTRY_NUM_NUM}, {CMD_LINE_OPT_PARSE_PTYPE, 0, 0, CMD_LINE_OPT_PARSE_PTYPE_NUM}, + {CMD_LINE_OPT_RELAX_RX_MODE, 0, 0, CMD_LINE_OPT_RELAX_RX_MODE_NUM}, {CMD_LINE_OPT_PER_PORT_POOL, 0, 0, CMD_LINE_OPT_PARSE_PER_PORT_POOL}, {CMD_LINE_OPT_MODE, 1, 0, CMD_LINE_OPT_MODE_NUM}, {CMD_LINE_OPT_EVENTQ_SYNC, 1, 0, CMD_LINE_OPT_EVENTQ_SYNC_NUM}, @@ -853,6 +857,11 @@ parse_args(int argc, char **argv) parse_ptype = 1; break; + case CMD_LINE_OPT_RELAX_RX_MODE_NUM: + printf("Relax rx mode is enabled\n"); + relax_rx_mode = 1; + break; + case CMD_LINE_OPT_PARSE_PER_PORT_POOL: printf("per port buffer pool is enabled\n"); per_port_pool = 1; @@ -1260,6 +1269,13 @@ l3fwd_poll_resource_setup(void) if (dev_info.max_rx_queues == 1) local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE; + /* relax the rx rss requirement */ + if (relax_rx_mode && !local_port_conf.rx_adv_conf.rss_conf.rss_hf) { + printf("warning: modified the rx mq_mode to RTE_ETH_MQ_RX_NONE base on" + " device capability\n"); + local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE; + } + if (local_port_conf.rx_adv_conf.rss_conf.rss_hf != port_conf.rx_adv_conf.rss_conf.rss_hf) { printf("Port %u modified RSS hash function based on hardware support," Acked-by: Konstantin Ananyev
Re: [PATCH v2 2/3] examples/l3fwd: relax the Offload requirement
13.10.2023 05:27, Trevor Tao пишет: Now the port Rx offload mode is set to RTE_ETH_RX_OFFLOAD_CHECKSUM by default, but some hw and/or virtual interface does not support the offload mode presupposed, e.g., some virtio interfaces in the cloud may only partly support RTE_ETH_RX_OFFLOAD_UDP_CKSUM/ RTE_ETH_RX_OFFLOAD_TCP_CKSUM, but not RTE_ETH_RX_OFFLOAD_IPV4_CKSUM, and the error msg here: Ethdev port_id=0 requested Rx offloads 0xe does not match Rx offloads capabilities 0x201d in rte_eth_dev_configure() So to enable the l3fwd running in that environment, the Rx mode requirement can be relaxed to reflect the hardware feature reality here, and the l3fwd can run smoothly then. A warning msg would be provided to user in case it happens here. On the other side, enabling the software cksum check in case missing the hw support. The relax action for rx cksum offload is just enabled when relax_rx_mode is true which is false by default. Signed-off-by: Trevor Tao --- examples/l3fwd/l3fwd.h | 12 ++-- examples/l3fwd/l3fwd_em.h | 2 +- examples/l3fwd/l3fwd_lpm.h | 2 +- examples/l3fwd/main.c | 14 ++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index b55855c932..fd98ad3373 100644 --- a/examples/l3fwd/l3fwd.h +++ b/examples/l3fwd/l3fwd.h @@ -159,7 +159,7 @@ send_single_packet(struct lcore_conf *qconf, #ifdef DO_RFC_1812_CHECKS static inline int -is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) +is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len, uint64_t ol_flags) { /* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */ /* @@ -170,7 +170,15 @@ is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) return -1; /* 2. The IP checksum must be correct. */ - /* this is checked in H/W */ + /* if this is not checked in H/W, check it. */ + if ((ol_flags & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) { That looks like a wrong flag, I think it should be: if ((ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK) == RTE_MBUF_F_RX_IP_CKSUM_NONE) Which makes me wonder was that piece of code ever tested properly? + uint16_t actual_cksum, expected_cksum; + actual_cksum = pkt->hdr_checksum; + pkt->hdr_checksum = 0; + expected_cksum = rte_ipv4_cksum(pkt); + if (actual_cksum != expected_cksum) + return -2; + } Actually, while looking at it another thing stroke me, when HW ip cksum is enabled, shouldn't we check that it is a valid one? I.E: if (ol_flags & RTE_MBUF_F_RX_L4_CKSUM_MASK) == RTE_MBUF_F_RX_L4_CKSUM_BAD) return -2; /* * 3. The IP version number must be 4. 
If the version number is not 4 diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h index 7d051fc076..1fee2e2e6c 100644 --- a/examples/l3fwd/l3fwd_em.h +++ b/examples/l3fwd/l3fwd_em.h @@ -20,7 +20,7 @@ l3fwd_em_handle_ipv4(struct rte_mbuf *m, uint16_t portid, #ifdef DO_RFC_1812_CHECKS /* Check to make sure the packet is valid (RFC1812) */ - if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) { + if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len, m->ol_flags) < 0) { rte_pktmbuf_free(m); return BAD_PORT; } diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h index c61b969584..4ee61e8d88 100644 --- a/examples/l3fwd/l3fwd_lpm.h +++ b/examples/l3fwd/l3fwd_lpm.h @@ -22,7 +22,7 @@ l3fwd_lpm_simple_forward(struct rte_mbuf *m, uint16_t portid, #ifdef DO_RFC_1812_CHECKS /* Check to make sure the packet is valid (RFC1812) */ - if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) { + if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len, m->ol_flags) < 0) { rte_pktmbuf_free(m); return; } diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 89ad546a5e..2b815375a9 100644 --- a/examples/l3fwd/main.c +++ b/examples/l3fwd/main.c @@ -1285,6 +1285,20 @@ l3fwd_poll_resource_setup(void) local_port_conf.rx_adv_conf.rss_conf.rss_hf); } + /* relax the rx offload requirement */ + if ((local_port_conf.rxmode.offloads & dev_info.rx_offload_capa) != + local_port_conf.rxmode.offloads) { + printf("Port %u requested Rx offloads 0x%"PRIx64" does not" + " match Rx offloads capabilities 0x%"PRIx64"\n", + portid, local_port_conf.rxmode.offloads, + dev_info.rx_offload_capa); + if (relax_rx_mode) { + local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa; + printf("warning: modified the rx offload to 0x%"PRIx64" bas
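Putting the two review points above together, a minimal sketch of a software fallback keyed on the per-mbuf Rx status bits (rather than the port-level RTE_ETH_RX_OFFLOAD_IPV4_CKSUM flag) could look roughly as follows. This is illustrative only, not the code of the series; it assumes the standard RTE_MBUF_F_RX_IP_CKSUM_* flags from rte_mbuf_core.h and rte_ipv4_cksum() from rte_ip.h, and keeps the negative-return convention of is_valid_ipv4_pkt().

	/* Sketch: trust the HW verdict when there is one, otherwise verify the
	 * IPv4 header checksum in software. Returns 0 if the checksum is OK.
	 */
	static inline int
	check_ipv4_hdr_cksum(struct rte_ipv4_hdr *pkt, uint64_t ol_flags)
	{
		uint64_t st = ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK;

		if (st == RTE_MBUF_F_RX_IP_CKSUM_BAD)
			return -2;	/* HW checked it and found it wrong */

		if (st != RTE_MBUF_F_RX_IP_CKSUM_GOOD) {
			/* NONE/UNKNOWN: not verified by HW, check in software */
			uint16_t actual_cksum, expected_cksum;

			actual_cksum = pkt->hdr_checksum;
			pkt->hdr_checksum = 0;
			expected_cksum = rte_ipv4_cksum(pkt);
			pkt->hdr_checksum = actual_cksum;
			if (actual_cksum != expected_cksum)
				return -2;
		}

		return 0;
	}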
Re: [PATCH v2 19/19] ring: use rte optional stdatomic API
17.10.2023 21:31, Tyler Retzlaff пишет: Replace the use of gcc builtin __atomic_xxx intrinsics with corresponding rte_atomic_xxx optional stdatomic API Signed-off-by: Tyler Retzlaff --- drivers/net/mlx5/mlx5_hws_cnt.h | 2 +- lib/ring/rte_ring_c11_pvt.h | 33 + lib/ring/rte_ring_core.h | 10 +- lib/ring/rte_ring_generic_pvt.h | 3 ++- lib/ring/rte_ring_hts_elem_pvt.h | 22 -- lib/ring/rte_ring_peek_elem_pvt.h | 6 +++--- lib/ring/rte_ring_rts_elem_pvt.h | 27 ++- 7 files changed, 54 insertions(+), 49 deletions(-) diff --git a/drivers/net/mlx5/mlx5_hws_cnt.h b/drivers/net/mlx5/mlx5_hws_cnt.h index f462665..cc9ac10 100644 --- a/drivers/net/mlx5/mlx5_hws_cnt.h +++ b/drivers/net/mlx5/mlx5_hws_cnt.h @@ -394,7 +394,7 @@ struct mlx5_hws_age_param { __rte_ring_get_elem_addr(r, revert2head, sizeof(cnt_id_t), n, &zcd->ptr1, &zcd->n1, &zcd->ptr2); /* Update tail */ - __atomic_store_n(&r->prod.tail, revert2head, __ATOMIC_RELEASE); + rte_atomic_store_explicit(&r->prod.tail, revert2head, rte_memory_order_release); return n; } diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h index f895950..f8be538 100644 --- a/lib/ring/rte_ring_c11_pvt.h +++ b/lib/ring/rte_ring_c11_pvt.h @@ -22,9 +22,10 @@ * we need to wait for them to complete */ if (!single) - rte_wait_until_equal_32(&ht->tail, old_val, __ATOMIC_RELAXED); + rte_wait_until_equal_32((volatile uint32_t *)(uintptr_t)&ht->tail, old_val, + rte_memory_order_relaxed); - __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE); + rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release); } /** @@ -61,19 +62,19 @@ unsigned int max = n; int success; - *old_head = __atomic_load_n(&r->prod.head, __ATOMIC_RELAXED); + *old_head = rte_atomic_load_explicit(&r->prod.head, rte_memory_order_relaxed); do { /* Reset n to the initial burst count */ n = max; /* Ensure the head is read before tail */ - __atomic_thread_fence(__ATOMIC_ACQUIRE); + __atomic_thread_fence(rte_memory_order_acquire); /* load-acquire synchronize with store-release of ht->tail * in update_tail. */ - cons_tail = __atomic_load_n(&r->cons.tail, - __ATOMIC_ACQUIRE); + cons_tail = rte_atomic_load_explicit(&r->cons.tail, + rte_memory_order_acquire); /* The subtraction is done between two unsigned 32bits value * (the result is always modulo 32 bits even if we have @@ -95,10 +96,10 @@ r->prod.head = *new_head, success = 1; else /* on failure, *old_head is updated */ - success = __atomic_compare_exchange_n(&r->prod.head, + success = rte_atomic_compare_exchange_strong_explicit(&r->prod.head, old_head, *new_head, - 0, __ATOMIC_RELAXED, - __ATOMIC_RELAXED); + rte_memory_order_relaxed, + rte_memory_order_relaxed); } while (unlikely(success == 0)); return n; } @@ -137,19 +138,19 @@ int success; /* move cons.head atomically */ - *old_head = __atomic_load_n(&r->cons.head, __ATOMIC_RELAXED); + *old_head = rte_atomic_load_explicit(&r->cons.head, rte_memory_order_relaxed); do { /* Restore n as it may change every loop */ n = max; /* Ensure the head is read before tail */ - __atomic_thread_fence(__ATOMIC_ACQUIRE); + __atomic_thread_fence(rte_memory_order_acquire); /* this load-acquire synchronize with store-release of ht->tail * in update_tail. 
*/ - prod_tail = __atomic_load_n(&r->prod.tail, - __ATOMIC_ACQUIRE); + prod_tail = rte_atomic_load_explicit(&r->prod.tail, + rte_memory_order_acquire); /* The subtraction is done between two unsigned 32bits value * (the result is always modulo 32 bits even if we have @@ -170,10 +171,10 @@ r->cons.head = *new_head, success = 1; else /* on failure, *old_head will be updated */ - success = __atomic_compare_exchange_n(&r->cons.head, + success = rte_atomic_compare_exchange_strong_explicit(&r->cons.head,
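For readers following the whole series rather than this one patch: the rte_atomic_*/rte_memory_order_* names are thin wrappers selected at build time. The block below is a simplified paraphrase of what rte_stdatomic.h provides (the real header adds qualifiers and casts; treat this as an approximation, not the actual definitions).

	/* Simplified paraphrase of lib/eal/include/rte_stdatomic.h */
	#ifdef RTE_ENABLE_STDATOMIC            /* C11 stdatomic backend */
	#define RTE_ATOMIC(type)                     _Atomic(type)
	#define rte_memory_order_relaxed             memory_order_relaxed
	#define rte_memory_order_release             memory_order_release
	#define rte_atomic_store_explicit(p, v, mo)  atomic_store_explicit(p, v, mo)
	#else                                  /* GCC builtin backend (default) */
	#define RTE_ATOMIC(type)                     type
	#define rte_memory_order_relaxed             __ATOMIC_RELAXED
	#define rte_memory_order_release             __ATOMIC_RELEASE
	#define rte_atomic_store_explicit(p, v, mo)  __atomic_store_n(p, v, mo)
	#endif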
Re: [PATCH v2 06/19] ipsec: use rte optional stdatomic API
17.10.2023 21:31, Tyler Retzlaff пишет: Replace the use of gcc builtin __atomic_xxx intrinsics with corresponding rte_atomic_xxx optional stdatomic API Signed-off-by: Tyler Retzlaff --- lib/ipsec/ipsec_sqn.h | 2 +- lib/ipsec/sa.h| 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/ipsec/ipsec_sqn.h b/lib/ipsec/ipsec_sqn.h index 505950e..984a9dd 100644 --- a/lib/ipsec/ipsec_sqn.h +++ b/lib/ipsec/ipsec_sqn.h @@ -128,7 +128,7 @@ n = *num; if (SQN_ATOMIC(sa)) - sqn = __atomic_fetch_add(&sa->sqn.outb, n, __ATOMIC_RELAXED) + n; + sqn = rte_atomic_fetch_add_explicit(&sa->sqn.outb, n, rte_memory_order_relaxed) + n; else { sqn = sa->sqn.outb + n; sa->sqn.outb = sqn; diff --git a/lib/ipsec/sa.h b/lib/ipsec/sa.h index ce4af8c..4b30bea 100644 --- a/lib/ipsec/sa.h +++ b/lib/ipsec/sa.h @@ -124,7 +124,7 @@ struct rte_ipsec_sa { * place from other frequently accessed data. */ union { - uint64_t outb; + RTE_ATOMIC(uint64_t) outb; struct { uint32_t rdidx; /* read index */ uint32_t wridx; /* write index */ Acked-by: Konstantin Ananyev
Re: [PATCH v2 07/19] mbuf: use rte optional stdatomic API
17.10.2023 21:31, Tyler Retzlaff пишет: Replace the use of gcc builtin __atomic_xxx intrinsics with corresponding rte_atomic_xxx optional stdatomic API Signed-off-by: Tyler Retzlaff --- lib/mbuf/rte_mbuf.h | 20 ++-- lib/mbuf/rte_mbuf_core.h | 5 +++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h index 913c459..b8ab477 100644 --- a/lib/mbuf/rte_mbuf.h +++ b/lib/mbuf/rte_mbuf.h @@ -361,7 +361,7 @@ struct rte_pktmbuf_pool_private { static inline uint16_t rte_mbuf_refcnt_read(const struct rte_mbuf *m) { - return __atomic_load_n(&m->refcnt, __ATOMIC_RELAXED); + return rte_atomic_load_explicit(&m->refcnt, rte_memory_order_relaxed); } /** @@ -374,15 +374,15 @@ struct rte_pktmbuf_pool_private { static inline void rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value) { - __atomic_store_n(&m->refcnt, new_value, __ATOMIC_RELAXED); + rte_atomic_store_explicit(&m->refcnt, new_value, rte_memory_order_relaxed); } /* internal */ static inline uint16_t __rte_mbuf_refcnt_update(struct rte_mbuf *m, int16_t value) { - return __atomic_fetch_add(&m->refcnt, value, -__ATOMIC_ACQ_REL) + value; + return rte_atomic_fetch_add_explicit(&m->refcnt, value, +rte_memory_order_acq_rel) + value; } /** @@ -463,7 +463,7 @@ struct rte_pktmbuf_pool_private { static inline uint16_t rte_mbuf_ext_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo) { - return __atomic_load_n(&shinfo->refcnt, __ATOMIC_RELAXED); + return rte_atomic_load_explicit(&shinfo->refcnt, rte_memory_order_relaxed); } /** @@ -478,7 +478,7 @@ struct rte_pktmbuf_pool_private { rte_mbuf_ext_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo, uint16_t new_value) { - __atomic_store_n(&shinfo->refcnt, new_value, __ATOMIC_RELAXED); + rte_atomic_store_explicit(&shinfo->refcnt, new_value, rte_memory_order_relaxed); } /** @@ -502,8 +502,8 @@ struct rte_pktmbuf_pool_private { return (uint16_t)value; } - return __atomic_fetch_add(&shinfo->refcnt, value, -__ATOMIC_ACQ_REL) + value; + return rte_atomic_fetch_add_explicit(&shinfo->refcnt, value, +rte_memory_order_acq_rel) + value; } /** Mbuf prefetch */ @@ -1315,8 +1315,8 @@ static inline int __rte_pktmbuf_pinned_extbuf_decref(struct rte_mbuf *m) * Direct usage of add primitive to avoid * duplication of comparing with one. */ - if (likely(__atomic_fetch_add(&shinfo->refcnt, -1, -__ATOMIC_ACQ_REL) - 1)) + if (likely(rte_atomic_fetch_add_explicit(&shinfo->refcnt, -1, +rte_memory_order_acq_rel) - 1)) return 1; /* Reinitialize counter before mbuf freeing. */ diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h index e9bc0d1..5688683 100644 --- a/lib/mbuf/rte_mbuf_core.h +++ b/lib/mbuf/rte_mbuf_core.h @@ -19,6 +19,7 @@ #include #include +#include #ifdef __cplusplus extern "C" { @@ -497,7 +498,7 @@ struct rte_mbuf { * rte_mbuf_refcnt_set(). The functionality of these functions (atomic, * or non-atomic) is controlled by the RTE_MBUF_REFCNT_ATOMIC flag. */ - uint16_t refcnt; + RTE_ATOMIC(uint16_t) refcnt; /** * Number of segments. Only valid for the first segment of an mbuf @@ -674,7 +675,7 @@ struct rte_mbuf { struct rte_mbuf_ext_shared_info { rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */ void *fcb_opaque;/**< Free callback argument */ - uint16_t refcnt; + RTE_ATOMIC(uint16_t) refcnt; }; /** Maximum number of nb_segs allowed. */ Acked-by: Konstantin Ananyev
Re: [PATCH v2 08/19] mempool: use rte optional stdatomic API
17.10.2023 21:31, Tyler Retzlaff пишет: Replace the use of gcc builtin __atomic_xxx intrinsics with corresponding rte_atomic_xxx optional stdatomic API Signed-off-by: Tyler Retzlaff --- lib/mempool/rte_mempool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h index f70bf36..df87cd2 100644 --- a/lib/mempool/rte_mempool.h +++ b/lib/mempool/rte_mempool.h @@ -327,8 +327,8 @@ struct rte_mempool { if (likely(__lcore_id < RTE_MAX_LCORE)) \ (mp)->stats[__lcore_id].name += (n); \ else \ - __atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name), \ - (n), __ATOMIC_RELAXED); \ + rte_atomic_fetch_add_explicit(&((mp)->stats[RTE_MAX_LCORE].name), \ + (n), rte_memory_order_relaxed); \ } while (0) #else #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0) Acked-by: Konstantin Ananyev
Re: [PATCH v2 11/19] stack: use rte optional stdatomic API
RELEASE, - __ATOMIC_RELAXED); + 1, rte_memory_order_release, + rte_memory_order_relaxed); } while (success == 0); /* NOTE: review for potential ordering optimization */ - __atomic_fetch_add(&list->len, num, __ATOMIC_SEQ_CST); + rte_atomic_fetch_add_explicit(&list->len, num, rte_memory_order_seq_cst); } static __rte_always_inline struct rte_stack_lf_elem * @@ -83,15 +83,15 @@ /* Reserve num elements, if available */ while (1) { /* NOTE: review for potential ordering optimization */ - uint64_t len = __atomic_load_n(&list->len, __ATOMIC_SEQ_CST); + uint64_t len = rte_atomic_load_explicit(&list->len, rte_memory_order_seq_cst); /* Does the list contain enough elements? */ if (unlikely(len < num)) return NULL; /* NOTE: review for potential ordering optimization */ - if (__atomic_compare_exchange_n(&list->len, &len, len - num, - 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + if (rte_atomic_compare_exchange_strong_explicit(&list->len, &len, len - num, + rte_memory_order_seq_cst, rte_memory_order_seq_cst)) break; } @@ -143,8 +143,8 @@ (rte_int128_t *)&list->head, (rte_int128_t *)&old_head, (rte_int128_t *)&new_head, - 1, __ATOMIC_RELEASE, - __ATOMIC_RELAXED); + 1, rte_memory_order_release, + rte_memory_order_relaxed); } while (success == 0); return old_head.top; Acked-by: Konstantin Ananyev
RE: [PATCH v2 19/19] ring: use rte optional stdatomic API
> > On Tue, Oct 24, 2023 at 09:43:13AM +0100, Konstantin Ananyev wrote: > > 17.10.2023 21:31, Tyler Retzlaff пишет: > > >Replace the use of gcc builtin __atomic_xxx intrinsics with > > >corresponding rte_atomic_xxx optional stdatomic API > > > > > >Signed-off-by: Tyler Retzlaff > > >--- > > > drivers/net/mlx5/mlx5_hws_cnt.h | 2 +- > > > lib/ring/rte_ring_c11_pvt.h | 33 + > > > lib/ring/rte_ring_core.h | 10 +- > > > lib/ring/rte_ring_generic_pvt.h | 3 ++- > > > lib/ring/rte_ring_hts_elem_pvt.h | 22 -- > > > lib/ring/rte_ring_peek_elem_pvt.h | 6 +++--- > > > lib/ring/rte_ring_rts_elem_pvt.h | 27 ++- > > > 7 files changed, 54 insertions(+), 49 deletions(-) > > > > > >diff --git a/drivers/net/mlx5/mlx5_hws_cnt.h > > >b/drivers/net/mlx5/mlx5_hws_cnt.h > > >index f462665..cc9ac10 100644 > > >--- a/drivers/net/mlx5/mlx5_hws_cnt.h > > >+++ b/drivers/net/mlx5/mlx5_hws_cnt.h > > >@@ -394,7 +394,7 @@ struct mlx5_hws_age_param { > > > __rte_ring_get_elem_addr(r, revert2head, sizeof(cnt_id_t), n, > > > &zcd->ptr1, &zcd->n1, &zcd->ptr2); > > > /* Update tail */ > > >- __atomic_store_n(&r->prod.tail, revert2head, __ATOMIC_RELEASE); > > >+ rte_atomic_store_explicit(&r->prod.tail, revert2head, > > >rte_memory_order_release); > > > return n; > > > } > > >diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h > > >index f895950..f8be538 100644 > > >--- a/lib/ring/rte_ring_c11_pvt.h > > >+++ b/lib/ring/rte_ring_c11_pvt.h > > >@@ -22,9 +22,10 @@ > > >* we need to wait for them to complete > > >*/ > > > if (!single) > > >- rte_wait_until_equal_32(&ht->tail, old_val, __ATOMIC_RELAXED); > > >+ rte_wait_until_equal_32((volatile uint32_t > > >*)(uintptr_t)&ht->tail, old_val, > > >+ rte_memory_order_relaxed); > > >- __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE); > > >+ rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release); > > > } > > > /** > > >@@ -61,19 +62,19 @@ > > > unsigned int max = n; > > > int success; > > >- *old_head = __atomic_load_n(&r->prod.head, __ATOMIC_RELAXED); > > >+ *old_head = rte_atomic_load_explicit(&r->prod.head, > > >rte_memory_order_relaxed); > > > do { > > > /* Reset n to the initial burst count */ > > > n = max; > > > /* Ensure the head is read before tail */ > > >- __atomic_thread_fence(__ATOMIC_ACQUIRE); > > >+ __atomic_thread_fence(rte_memory_order_acquire); > > > /* load-acquire synchronize with store-release of ht->tail > > >* in update_tail. > > >*/ > > >- cons_tail = __atomic_load_n(&r->cons.tail, > > >- __ATOMIC_ACQUIRE); > > >+ cons_tail = rte_atomic_load_explicit(&r->cons.tail, > > >+ rte_memory_order_acquire); > > > /* The subtraction is done between two unsigned 32bits value > > >* (the result is always modulo 32 bits even if we have > > >@@ -95,10 +96,10 @@ > > > r->prod.head = *new_head, success = 1; > > > else > > > /* on failure, *old_head is updated */ > > >- success = __atomic_compare_exchange_n(&r->prod.head, > > >+ success = > > >rte_atomic_compare_exchange_strong_explicit(&r->prod.head, > > > old_head, *new_head, > > >- 0, __ATOMIC_RELAXED, > > >- __ATOMIC_RELAXED); > > >+ rte_memory_order_relaxed, > > >+ rte_memory_order_relaxed); > > > } while (unlikely(success == 0)); > > > return n; > > > } > > >@@ -137,19 +138,19 @@ > > > int success; > > > /* move cons.head atomically */ > > >- *old_head = __atomic_load_n(&r->cons.head, __ATOMIC_RELAXED); > > >+ *old_head = rte_atomic_load_explicit(&r->cons.head, > >
Re: Reply: Reply: [PATCH v2 1/3] ethdev: add API for direct rearm mode
12/10/2022 14:38, Feifei Wang пишет: -邮件原件- 发件人: Konstantin Ananyev 发送时间: Wednesday, October 12, 2022 6:21 AM 收件人: Feifei Wang ; tho...@monjalon.net; Ferruh Yigit ; Andrew Rybchenko ; Ray Kinsella 抄送: dev@dpdk.org; nd ; Honnappa Nagarahalli ; Ruifeng Wang 主题: Re: 回复: [PATCH v2 1/3] ethdev: add API for direct rearm mode Add API for enabling direct rearm mode and for mapping RX and TX queues. Currently, the API supports 1:1(txq : rxq) mapping. Furthermore, to avoid Rx load Tx data directly, add API called 'rte_eth_txq_data_get' to get Tx sw_ring and its information. Suggested-by: Honnappa Nagarahalli Suggested-by: Ruifeng Wang Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- lib/ethdev/ethdev_driver.h | 9 lib/ethdev/ethdev_private.c | 1 + lib/ethdev/rte_ethdev.c | 37 ++ lib/ethdev/rte_ethdev.h | 95 lib/ethdev/rte_ethdev_core.h | 5 ++ lib/ethdev/version.map | 4 ++ 6 files changed, 151 insertions(+) diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h index 47a55a419e..14f52907c1 100644 --- a/lib/ethdev/ethdev_driver.h +++ b/lib/ethdev/ethdev_driver.h @@ -58,6 +58,8 @@ struct rte_eth_dev { eth_rx_descriptor_status_t rx_descriptor_status; /** Check the status of a Tx descriptor */ eth_tx_descriptor_status_t tx_descriptor_status; + /** Use Tx mbufs for Rx to rearm */ + eth_rx_direct_rearm_t rx_direct_rearm; /** * Device data that is shared between primary and secondary processes @@ -486,6 +488,11 @@ typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev, typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev, uint16_t rx_queue_id); +/**< @internal Get Tx information of a transmit queue of an +Ethernet device. */ typedef void (*eth_txq_data_get_t)(struct rte_eth_dev *dev, + uint16_t tx_queue_id, + struct rte_eth_txq_data *txq_data); + /** @internal Release memory resources allocated by given Rx/Tx queue. 
*/ typedef void (*eth_queue_release_t)(struct rte_eth_dev *dev, uint16_t queue_id); @@ -1138,6 +1145,8 @@ struct eth_dev_ops { eth_rxq_info_get_t rxq_info_get; /** Retrieve Tx queue information */ eth_txq_info_get_t txq_info_get; + /** Get the address where Tx data is stored */ + eth_txq_data_get_t txq_data_get; eth_burst_mode_get_t rx_burst_mode_get; /**< Get Rx burst mode */ eth_burst_mode_get_t tx_burst_mode_get; /**< Get Tx burst mode */ eth_fw_version_get_t fw_version_get; /**< Get firmware version */ diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c index 48090c879a..bfe16c7d77 100644 --- a/lib/ethdev/ethdev_private.c +++ b/lib/ethdev/ethdev_private.c @@ -276,6 +276,7 @@ eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo, fpo->rx_queue_count = dev->rx_queue_count; fpo->rx_descriptor_status = dev->rx_descriptor_status; fpo->tx_descriptor_status = dev->tx_descriptor_status; + fpo->rx_direct_rearm = dev->rx_direct_rearm; fpo->rxq.data = dev->data->rx_queues; fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs; diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c index 0c2c1088c0..0dccec2e4b 100644 --- a/lib/ethdev/rte_ethdev.c +++ b/lib/ethdev/rte_ethdev.c @@ -1648,6 +1648,43 @@ rte_eth_dev_is_removed(uint16_t port_id) return ret; } +int +rte_eth_tx_queue_data_get(uint16_t port_id, uint16_t queue_id, + struct rte_eth_txq_data *txq_data) { + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + if (queue_id >= dev->data->nb_tx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid Tx queue_id=%u\n", queue_id); + return -EINVAL; + } + + if (txq_data == NULL) { + RTE_ETHDEV_LOG(ERR, "Cannot get ethdev port %u Tx queue %u data to NULL\n", + port_id, queue_id); + return -EINVAL; + } + + if (dev->data->tx_queues == NULL || + dev->data->tx_queues[queue_id] == NULL) { + RTE_ETHDEV_LOG(ERR, + "Tx queue %"PRIu16" of device with port_id=%" + PRIu16" has not been setup\n", + queue_id, port_id); + return -EINVAL; + } + + if (*dev->dev_ops->txq_data_get == NULL) + return -ENOTSUP; + + dev->dev_ops->txq_d
Minutes of Technical Board Meeting, 2022-Oct-5
Members Attending
-----------------
-Aaron
-Bruce
-Konstantin (chair)
-Kevin
-Maxime
-Stephen
-Thomas

NOTE: The technical board meetings are held every second Wednesday at https://meet.jit.si/DPDK at 3 pm UTC. Meetings are public, and DPDK community members are welcome to attend.

NOTE: The next meeting will be on Wednesday 2022-Oct-19 @3pm UTC, and will be chaired by Jerin.

1) GB representative rotation: Stephen Hemminger will be the TB rep to the GB for the next 3 months.

2) Meeting management tool suggestion from Nathan, followed by a broader discussion about current DPDK project tools and how convenient they are for the different parts of the community. In particular:
   - how end-users (not developers) can provide their input on new features and existing functionality
   - bug-tracking tool improvements
   - new contributors - what are the best options for them?
   No specific decisions were made.

3) Meson version update for 22.11 - decision: let's try to go ahead with it. AR to Bruce to ping the Intel Labs team to confirm that they will be able to deal with that.

4) License exception for GVE code from kernel (under MIT license) - looks OK; to follow the procedure, we need to apply for exception approval from the GB.
Re: flow classify library status
25/10/2022 16:24, Ferruh Yigit writes:

On 10/25/2022 3:37 PM, David Marchand wrote:

Hello,

The flow_classify library has been out and marked experimental for a long time.
Its current maintainer is now leaving the community.
The library did not get much traction: I am not aware of an opensource project using it.

Are there users of this library?
If we deprecated it (to later drop it), do we already have some replacement feature in DPDK?

+1 to remove it.

As far as I remember, initially it started as underlying work to provide stats with common standards like IPFIX, but I am not sure the next stage ever happened.

Yep, it didn't work. Agree to deprecate/remove.
RE: [PATCH] flow_classify: mark library as deprecated
>
> This library has no maintainer and, for now, nobody expressed interest
> in taking over.
> Mark this experimental library as deprecated and announce plan for
> removal in v23.11.
>
> Signed-off-by: David Marchand
> ---

Acked-by: Konstantin Ananyev

> --
> 2.37.3
RE: [dpdk-dev v5] crypto/ipsec_mb: multi-process IPC request handler
> > As the queue pair used in secondary process needs to be set up by > the primary process, this patch adds an IPC register function to help > secondary process to send out queue-pair setup request to primary > process via IPC request messages. A new "qp_in_used_pid" param stores > the PID to provide the ownership of the queue-pair so that only the PID > matched queue-pair can be free'd in the request. As I can see that patch was already merged, but still: How we suppose to guarantee synchronization with such approach? Let say secondary process sends an IPC message to configure crypto-queue, and at the same moment primary process (for whatever reason) decides to reconfigure same crypto-dev. Is there any way to prevent such race-conditions to happen? > > Signed-off-by: Kai Ji > Acked-by: Arek Kusztal > Acked-by: Pablo de Lara > --- > v5: > - minor updates and typo fix > > v4: > - review comments resolved > > v3: > - remove shared memzone as qp_conf params can be passed directly from > ipc message. > > v2: > - add in shared memzone for data exchange between multi-process > --- > drivers/crypto/ipsec_mb/ipsec_mb_ops.c | 129 - > drivers/crypto/ipsec_mb/ipsec_mb_private.c | 24 +++- > drivers/crypto/ipsec_mb/ipsec_mb_private.h | 38 +- > 3 files changed, 185 insertions(+), 6 deletions(-) > > diff --git a/drivers/crypto/ipsec_mb/ipsec_mb_ops.c > b/drivers/crypto/ipsec_mb/ipsec_mb_ops.c > index cedcaa2742..bf18d692bd 100644 > --- a/drivers/crypto/ipsec_mb/ipsec_mb_ops.c > +++ b/drivers/crypto/ipsec_mb/ipsec_mb_ops.c > @@ -3,6 +3,7 @@ > */ > > #include > +#include > > #include > #include > @@ -93,6 +94,46 @@ ipsec_mb_info_get(struct rte_cryptodev *dev, > } > } > > +static int > +ipsec_mb_secondary_qp_op(int dev_id, int qp_id, > + const struct rte_cryptodev_qp_conf *qp_conf, > + int socket_id, enum ipsec_mb_mp_req_type op_type) > +{ > + int ret; > + struct rte_mp_msg qp_req_msg; > + struct rte_mp_msg *qp_resp_msg; > + struct rte_mp_reply qp_resp; > + struct ipsec_mb_mp_param *req_param; > + struct ipsec_mb_mp_param *resp_param; > + struct timespec ts = {.tv_sec = 1, .tv_nsec = 0}; > + > + memset(&qp_req_msg, 0, sizeof(IPSEC_MB_MP_MSG)); > + memcpy(qp_req_msg.name, IPSEC_MB_MP_MSG, sizeof(IPSEC_MB_MP_MSG)); > + req_param = (struct ipsec_mb_mp_param *)&qp_req_msg.param; > + > + qp_req_msg.len_param = sizeof(struct ipsec_mb_mp_param); > + req_param->type = op_type; > + req_param->dev_id = dev_id; > + req_param->qp_id = qp_id; > + req_param->socket_id = socket_id; > + req_param->process_id = getpid(); > + if (qp_conf) { > + req_param->nb_descriptors = qp_conf->nb_descriptors; > + req_param->mp_session = (void *)qp_conf->mp_session; > + } > + > + qp_req_msg.num_fds = 0; > + ret = rte_mp_request_sync(&qp_req_msg, &qp_resp, &ts); > + if (ret) { > + RTE_LOG(ERR, USER1, "Create MR request to primary process > failed."); > + return -1; > + } > + qp_resp_msg = &qp_resp.msgs[0]; > + resp_param = (struct ipsec_mb_mp_param *)qp_resp_msg->param; > + > + return resp_param->result; > +} > + > /** Release queue pair */ > int > ipsec_mb_qp_release(struct rte_cryptodev *dev, uint16_t qp_id) > @@ -100,7 +141,10 @@ ipsec_mb_qp_release(struct rte_cryptodev *dev, uint16_t > qp_id) > struct ipsec_mb_qp *qp = dev->data->queue_pairs[qp_id]; > struct rte_ring *r = NULL; > > - if (qp != NULL && rte_eal_process_type() == RTE_PROC_PRIMARY) { > + if (qp != NULL) > + return 0; > + > + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { > r = rte_ring_lookup(qp->name); > rte_ring_free(r); > > @@ -115,6 +159,9 @@ 
ipsec_mb_qp_release(struct rte_cryptodev *dev, uint16_t > qp_id) > #endif > rte_free(qp); > dev->data->queue_pairs[qp_id] = NULL; > + } else { /* secondary process */ > + return ipsec_mb_secondary_qp_op(dev->data->dev_id, qp_id, > + NULL, 0, RTE_IPSEC_MB_MP_REQ_QP_FREE); > } > return 0; > } > @@ -222,9 +269,13 @@ ipsec_mb_qp_setup(struct rte_cryptodev *dev, uint16_t > qp_id, > #endif > qp = dev->data->queue_pairs[qp_id]; > if (qp == NULL) { > - IPSEC_MB_LOG(ERR, "Primary process hasn't configured > device qp."); > - return -EINVAL; > + IPSEC_MB_LOG(DEBUG, "Secondary process setting up > device qp."); > + return ipsec_mb_secondary_qp_op(dev->data->dev_id, > qp_id, > + qp_conf, socket_id, > RTE_IPSEC_MB_MP_REQ_QP_SET); > } > + > + IPSEC_MB_LOG(ERR, "Queue pair already setup'ed."); > + return -EINVAL; > } else { > /* Free memory
Re: Reply: Reply: Reply: [PATCH v2 1/3] ethdev: add API for direct rearm mode
Hi Feifei, Add API for enabling direct rearm mode and for mapping RX and TX queues. Currently, the API supports 1:1(txq : rxq) mapping. Furthermore, to avoid Rx load Tx data directly, add API called 'rte_eth_txq_data_get' to get Tx sw_ring and its information. Suggested-by: Honnappa Nagarahalli Suggested-by: Ruifeng Wang Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- lib/ethdev/ethdev_driver.h | 9 lib/ethdev/ethdev_private.c | 1 + lib/ethdev/rte_ethdev.c | 37 ++ lib/ethdev/rte_ethdev.h | 95 lib/ethdev/rte_ethdev_core.h | 5 ++ lib/ethdev/version.map | 4 ++ 6 files changed, 151 insertions(+) diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h index 47a55a419e..14f52907c1 100644 --- a/lib/ethdev/ethdev_driver.h +++ b/lib/ethdev/ethdev_driver.h @@ -58,6 +58,8 @@ struct rte_eth_dev { eth_rx_descriptor_status_t rx_descriptor_status; /** Check the status of a Tx descriptor */ eth_tx_descriptor_status_t tx_descriptor_status; + /** Use Tx mbufs for Rx to rearm */ + eth_rx_direct_rearm_t rx_direct_rearm; /** * Device data that is shared between primary and secondary processes @@ -486,6 +488,11 @@ typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev, typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev, uint16_t rx_queue_id); +/**< @internal Get Tx information of a transmit queue of an +Ethernet device. */ typedef void (*eth_txq_data_get_t)(struct rte_eth_dev *dev, + uint16_t tx_queue_id, + struct rte_eth_txq_data *txq_data); + /** @internal Release memory resources allocated by given Rx/Tx queue. */ typedef void (*eth_queue_release_t)(struct rte_eth_dev *dev, uint16_t queue_id); @@ -1138,6 +1145,8 @@ struct eth_dev_ops { eth_rxq_info_get_t rxq_info_get; /** Retrieve Tx queue information */ eth_txq_info_get_t txq_info_get; + /** Get the address where Tx data is stored */ + eth_txq_data_get_t txq_data_get; eth_burst_mode_get_t rx_burst_mode_get; /**< Get Rx burst mode */ eth_burst_mode_get_t tx_burst_mode_get; /**< Get Tx burst mode */ eth_fw_version_get_t fw_version_get; /**< Get firmware version */ diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c index 48090c879a..bfe16c7d77 100644 --- a/lib/ethdev/ethdev_private.c +++ b/lib/ethdev/ethdev_private.c @@ -276,6 +276,7 @@ eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo, fpo->rx_queue_count = dev->rx_queue_count; fpo->rx_descriptor_status = dev->rx_descriptor_status; fpo->tx_descriptor_status = dev->tx_descriptor_status; + fpo->rx_direct_rearm = dev->rx_direct_rearm; fpo->rxq.data = dev->data->rx_queues; fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs; diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c index 0c2c1088c0..0dccec2e4b 100644 --- a/lib/ethdev/rte_ethdev.c +++ b/lib/ethdev/rte_ethdev.c @@ -1648,6 +1648,43 @@ rte_eth_dev_is_removed(uint16_t port_id) return ret; } +int +rte_eth_tx_queue_data_get(uint16_t port_id, uint16_t queue_id, + struct rte_eth_txq_data *txq_data) { + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + if (queue_id >= dev->data->nb_tx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid Tx queue_id=%u\n", queue_id); + return -EINVAL; + } + + if (txq_data == NULL) { + RTE_ETHDEV_LOG(ERR, "Cannot get ethdev port %u Tx queue %u data to NULL\n", + port_id, queue_id); + return -EINVAL; + } + + if (dev->data->tx_queues == NULL || + dev->data->tx_queues[queue_id] == NULL) { + RTE_ETHDEV_LOG(ERR, + "Tx queue %"PRIu16" of device with port_id=%" + PRIu16" has 
not been setup\n", + queue_id, port_id); + return -EINVAL; + } + + if (*dev->dev_ops->txq_data_get == NULL) + return -ENOTSUP; + + dev->dev_ops->txq_data_get(dev, queue_id, txq_data); + + return 0; +} + static int rte_eth_rx_queue_check_split(const struct rte_eth_rxseg_split *rx_seg, uint16_t n_seg, uint32_t *mbp_buf_size, diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h index 2e783536c1..daf7f05d62 100644 --- a/lib/ethdev/rte_ethdev.h +++ b/lib/ethdev/rte_ethdev.h @@ -1949,6 +1949,23 @@ struct rte_eth_txq_info { uint8_t queue
Re: [PATCH v3 7/7] ip_frag: fix whitespace
17/01/2023 00:14, Stephen Hemminger writes:

The style standard is to use blank after keywords.
I.e. "if (" not "if("

Signed-off-by: Stephen Hemminger
---
 lib/ip_frag/rte_ipv4_reassembly.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/ip_frag/rte_ipv4_reassembly.c b/lib/ip_frag/rte_ipv4_reassembly.c
index 4a89a5f5365a..88ee0aa63f62 100644
--- a/lib/ip_frag/rte_ipv4_reassembly.c
+++ b/lib/ip_frag/rte_ipv4_reassembly.c
@@ -34,7 +34,7 @@ ipv4_frag_reassemble(struct ip_frag_pkt *fp)
 	for (i = n; i != IP_FIRST_FRAG_IDX && ofs != first_len; i--) {
 		/* previous fragment found. */
-		if(fp->frags[i].ofs + fp->frags[i].len == ofs) {
+		if (fp->frags[i].ofs + fp->frags[i].len == ofs) {
 			RTE_ASSERT(curr_idx != i);

Acked-by: Konstantin Ananyev
Re: [PATCH v2 1/2] ring: add ring list telemetry cmd
Hi Jie, This patch supports the list of rings with telemetry cmd. An example using this command is shown below: --> /ring/list { "/ring/list": [ "HT_:7d:00.2", "MP_mb_pool_0" ] } Signed-off-by: Jie Hai --- lib/ring/meson.build | 1 + lib/ring/rte_ring.c | 40 2 files changed, 41 insertions(+) diff --git a/lib/ring/meson.build b/lib/ring/meson.build index c20685c689ac..7fca958ed7fa 100644 --- a/lib/ring/meson.build +++ b/lib/ring/meson.build @@ -18,3 +18,4 @@ indirect_headers += files ( 'rte_ring_rts.h', 'rte_ring_rts_elem_pvt.h', ) +deps += ['telemetry'] diff --git a/lib/ring/rte_ring.c b/lib/ring/rte_ring.c index cddaf6b2876f..bb1dafd4d1ca 100644 --- a/lib/ring/rte_ring.c +++ b/lib/ring/rte_ring.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "rte_ring.h" #include "rte_ring_elem.h" @@ -419,3 +420,42 @@ rte_ring_lookup(const char *name) return r; } + +static void +rte_ring_walk(void (*func)(struct rte_ring *, void *), void *arg) As a nit: it is a static function, so I think we can skip 'rte_' prefix for it. Apart from that: Acked-by: Konstantin Ananyev +{ + struct rte_ring_list *ring_list; + struct rte_tailq_entry *te; + + ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list); + rte_mcfg_tailq_read_lock(); + + TAILQ_FOREACH(te, ring_list, next) { + (*func)((struct rte_ring *) te->data, arg); + } + + rte_mcfg_tailq_read_unlock(); +} + +static void +ring_list_cb(struct rte_ring *r, void *arg) +{ + struct rte_tel_data *d = (struct rte_tel_data *)arg; + + rte_tel_data_add_array_string(d, r->name); +} + +static int +ring_handle_list(const char *cmd __rte_unused, + const char *params __rte_unused, struct rte_tel_data *d) +{ + rte_tel_data_start_array(d, RTE_TEL_STRING_VAL); + rte_ring_walk(ring_list_cb, d); + return 0; +} + +RTE_INIT(ring_init_telemetry) +{ + rte_telemetry_register_cmd("/ring/list", ring_handle_list, + "Returns list of available ring. Takes no parameters"); +}
Re: [PATCH v2 2/2] ring: add ring info telemetry cmd
This patch supports dump of the info of ring by its name. An example using this command is shown below: --> /ring/info,MP_mb_pool_0 { "/ring/info": { "name": "MP_mb_pool_0", "socket": 0, "flags": 0, "producer_type": "MP", "consumer_type": "MC", "size": 262144, "mask": 262143, "capacity": 262143, "used_count": 147173, "consumer_tail": 8283, "consumer_head": 8283, "producer_tail": 155456, "producer_head": 155456, "mz_name": "RG_MP_mb_pool_0", "mz_len": 2097920, "mz_hugepage_sz": 1073741824, "mz_socket_id": 0, "mz_flags": 0 } } Signed-off-by: Jie Hai --- lib/ring/rte_ring.c | 88 + 1 file changed, 88 insertions(+) diff --git a/lib/ring/rte_ring.c b/lib/ring/rte_ring.c index bb1dafd4d1ca..82f3d6a6cd60 100644 --- a/lib/ring/rte_ring.c +++ b/lib/ring/rte_ring.c @@ -45,6 +45,9 @@ EAL_REGISTER_TAILQ(rte_ring_tailq) /* by default set head/tail distance as 1/8 of ring capacity */ #define HTD_MAX_DEF 8 +/* size of name of producer/consumer synchronization modes */ +#define SYNC_MODE_NAME_SZ 16 + /* return the size of memory occupied by a ring */ ssize_t rte_ring_get_memsize_elem(unsigned int esize, unsigned int count) @@ -454,8 +457,93 @@ ring_handle_list(const char *cmd __rte_unused, return 0; } +static void +ring_get_sync_name_by_type(struct rte_ring *r, char *prod, char *cons) +{ + switch (r->prod.sync_type) { + case RTE_RING_SYNC_MT: + strcpy(prod, "MP"); + break; + case RTE_RING_SYNC_ST: + strcpy(prod, "SP"); + break; + case RTE_RING_SYNC_MT_RTS: + strcpy(prod, "MP_RTS"); + break; + case RTE_RING_SYNC_MT_HTS: + strcpy(prod, "MP_HTS"); + break; + default: + strcpy(prod, "Unknown"); + } It is probably not the best option to blindly copy strings somewhere. I think it would be better to introduce function like that: static const char * ring_prod_sync_type_to_name(enum rte_ring_sync_type type) { switch(type) { case RTE_RING_SYNC_MT: return "MP"; case RTE_RING_SYNC_ST: return "SP"; ... } return "Unknown"; } Same for _cons_ type and use them accordingly. + + switch (r->cons.sync_type) { + case RTE_RING_SYNC_MT: + strcpy(cons, "MC"); + break; + case RTE_RING_SYNC_ST: + strcpy(cons, "SC"); + break; + case RTE_RING_SYNC_MT_RTS: + strcpy(cons, "MC_RTS"); + break; + case RTE_RING_SYNC_MT_HTS: + strcpy(cons, "MC_HTS"); + break; + default: + strcpy(cons, "Unknown"); + } +} + +static int +ring_handle_info(const char *cmd __rte_unused, const char *params, + struct rte_tel_data *d) +{ + char prod_type[SYNC_MODE_NAME_SZ]; + char cons_type[SYNC_MODE_NAME_SZ]; + const struct rte_memzone *mz; + char name[RTE_RING_NAMESIZE]; + struct rte_ring *r; + + if (params == NULL || strlen(params) == 0 || + strlen(params) >= RTE_RING_NAMESIZE) + return -EINVAL; + + strlcpy(name, params, RTE_RING_NAMESIZE); That copy looks absolutely redundant, you can do just rte_ring_lookup(params) instead. + r = rte_ring_lookup(name); + if (r == NULL) + return -EINVAL; + + rte_tel_data_start_dict(d); + rte_tel_data_add_dict_string(d, "name", r->name); Do I get it right that it could be executed from specific telemetry thread? If so, we probably shouldn't release rte_mcfg_tailq_read_lock while accessing ring data. + rte_tel_data_add_dict_int(d, "socket", r->memzone->socket_id); You do print it below, when printing memzone related data. 
+ rte_tel_data_add_dict_int(d, "flags", r->flags); + ring_get_sync_name_by_type(r, prod_type, cons_type); + rte_tel_data_add_dict_string(d, "producer_type", prod_type); + rte_tel_data_add_dict_string(d, "consumer_type", cons_type); + rte_tel_data_add_dict_u64(d, "size", r->size); + rte_tel_data_add_dict_u64(d, "mask", r->mask); + rte_tel_data_add_dict_u64(d, "capacity", r->capacity); + rte_tel_data_add_dict_u64(d, "used_count", rte_ring_count(r)); + rte_tel_data_add_dict_u64(d, "consumer_tail", r->cons.tail); + rte_tel_data_add_dict_u64(d, "consumer_head", r->cons.head); + rte_tel_data_add_dict_u64(d, "producer_tail", r->prod.tail); + rte_tel_data_add_dict_u64(d, "producer_head", r->prod.head); + + mz = r->memzone;`` Would it make sense to check that mz != NULL here? I know that it shouldn't be NULL for valid ring created by rte_ring_create(), but still probably no harm. + rte_tel_data_add_dict_string(d, "m
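To make the suggestion above concrete, a complete version of the helper could look like the sketch below; the name and the symmetric consumer-side variant are as proposed in the review, and this is illustrative only, not the reworked patch:

	static const char *
	ring_prod_sync_type_to_name(enum rte_ring_sync_type type)
	{
		switch (type) {
		case RTE_RING_SYNC_MT:
			return "MP";
		case RTE_RING_SYNC_ST:
			return "SP";
		case RTE_RING_SYNC_MT_RTS:
			return "MP_RTS";
		case RTE_RING_SYNC_MT_HTS:
			return "MP_HTS";
		default:
			return "Unknown";
		}
	}

	/* and, symmetrically, a ring_cons_sync_type_to_name() returning
	 * "MC"/"SC"/"MC_RTS"/"MC_HTS", used as:
	 *   rte_tel_data_add_dict_string(d, "producer_type",
	 *           ring_prod_sync_type_to_name(r->prod.sync_type));
	 */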
Re: [PATCH v5] mempool cache: add zero-copy get and put functions
Hi Morten, Few nits, see below. Also I still think we do need a test case for _zc_get_ before accepting it in the mainline. With that in place: Acked-by: Konstantin Ananyev Zero-copy access to mempool caches is beneficial for PMD performance, and must be provided by the mempool library to fix [Bug 1052] without a performance regression. [Bug 1052]: https://bugs.dpdk.org/show_bug.cgi?id=1052 Bugzilla ID: 1052 v5: * Bugfix: Compare zero-copy get request to the cache size instead of the flush threshold; otherwise refill could overflow the memory allocated for the cache. (Andrew) * Split the zero-copy put function into an internal function doing the work, and a public function with trace. * Avoid code duplication by rewriting rte_mempool_do_generic_put() to use the internal zero-copy put function. (Andrew) * Corrected the return type of rte_mempool_cache_zc_put_bulk() from void * to void **; it returns a pointer to an array of objects. * Fix coding style: Add missing curly brackets. (Andrew) v4: * Fix checkpatch warnings. v3: * Bugfix: Respect the cache size; compare to the flush threshold instead of RTE_MEMPOOL_CACHE_MAX_SIZE. * Added 'rewind' function for incomplete 'put' operations. (Konstantin) * Replace RTE_ASSERTs with runtime checks of the request size. Instead of failing, return NULL if the request is too big. (Konstantin) * Modified comparison to prevent overflow if n is really huge and len is non-zero. * Updated the comments in the code. v2: * Fix checkpatch warnings. * Fix missing registration of trace points. * The functions are inline, so they don't go into the map file. v1 changes from the RFC: * Removed run-time parameter checks. (Honnappa) This is a hot fast path function; requiring correct application behaviour, i.e. function parameters must be valid. * Added RTE_ASSERT for parameters instead. Code for this is only generated if built with RTE_ENABLE_ASSERT. * Removed fallback when 'cache' parameter is not set. (Honnappa) * Chose the simple get function; i.e. do not move the existing objects in the cache to the top of the new stack, just leave them at the bottom. * Renamed the functions. Other suggestions are welcome, of course. ;-) * Updated the function descriptions. * Added the functions to trace_fp and version.map. 
Signed-off-by: Morten Brørup --- lib/mempool/mempool_trace_points.c | 9 ++ lib/mempool/rte_mempool.h | 237 + lib/mempool/rte_mempool_trace_fp.h | 23 +++ lib/mempool/version.map| 5 + 4 files changed, 245 insertions(+), 29 deletions(-) diff --git a/lib/mempool/mempool_trace_points.c b/lib/mempool/mempool_trace_points.c index 4ad76deb34..83d353a764 100644 --- a/lib/mempool/mempool_trace_points.c +++ b/lib/mempool/mempool_trace_points.c @@ -77,3 +77,12 @@ RTE_TRACE_POINT_REGISTER(rte_mempool_trace_ops_free, RTE_TRACE_POINT_REGISTER(rte_mempool_trace_set_ops_byname, lib.mempool.set.ops.byname) + +RTE_TRACE_POINT_REGISTER(rte_mempool_trace_cache_zc_put_bulk, + lib.mempool.cache.zc.put.bulk) + +RTE_TRACE_POINT_REGISTER(rte_mempool_trace_cache_zc_put_rewind, + lib.mempool.cache.zc.put.rewind) + +RTE_TRACE_POINT_REGISTER(rte_mempool_trace_cache_zc_get_bulk, + lib.mempool.cache.zc.get.bulk) diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h index 9f530db24b..5efd3c2b5b 100644 --- a/lib/mempool/rte_mempool.h +++ b/lib/mempool/rte_mempool.h @@ -47,6 +47,7 @@ #include #include #include +#include #include "rte_mempool_trace_fp.h" @@ -1346,6 +1347,197 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache, cache->len = 0; } +/** + * @internal used by rte_mempool_cache_zc_put_bulk() and rte_mempool_do_generic_put(). + * + * Zero-copy put objects in a user-owned mempool cache backed by the specified mempool. + * + * @param cache + * A pointer to the mempool cache. + * @param mp + * A pointer to the mempool. + * @param n + * The number of objects to be put in the mempool cache. + * @return + * The pointer to where to put the objects in the mempool cache. + * NULL if the request itself is too big for the cache, i.e. + * exceeds the cache flush threshold. + */ +static __rte_always_inline void ** +__rte_mempool_cache_zc_put_bulk(struct rte_mempool_cache *cache, + struct rte_mempool *mp, + unsigned int n) +{ + void **cache_objs; + + RTE_ASSERT(cache != NULL); + RTE_ASSERT(mp != NULL); + + if (n <= cache->flushthresh - cache->len) { + /* +* The objects can be added to the cache without crossing the +* flush threshold. +*/ + cache_objs = &cache->objs[cache->len]; + cache->len += n; + } else if (likely(n <= cache->flushthresh)) { + /* +* The requ
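Since a test case for the zero-copy get is still outstanding, it may help to spell out the intended calling pattern for the put side described above. The sketch below is illustrative only and assumes the v5 prototypes (rte_mempool_cache_zc_put_bulk() returning void ** or NULL, rte_mempool_cache_zc_put_rewind() taking the number of unused slots); fill_objs() is a hypothetical producer, not part of the patch.

	/* Sketch: reserve room for n objects directly in the cache, let a
	 * hypothetical producer fill in as many pointers as it has, and rewind
	 * the unused slots. Illustrative only.
	 */
	static void
	zc_put_example(struct rte_mempool *mp, struct rte_mempool_cache *cache,
			unsigned int n)
	{
		void **dst = rte_mempool_cache_zc_put_bulk(cache, mp, n);
		unsigned int done;

		if (dst == NULL) {
			/* request larger than the flush threshold: the caller
			 * must fall back to the regular copying API.
			 */
			return;
		}

		done = fill_objs(dst, n);	/* hypothetical, writes <= n pointers */
		if (done < n)
			rte_mempool_cache_zc_put_rewind(cache, n - done);
	}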
RE: [PATCH v5] mempool cache: add zero-copy get and put functions
> > Few nits, see below. > > Also I still think we do need a test case for _zc_get_ before > > accepting it in the mainline. > > Poking at my bad conscience... :-) > > It's on my todo-list. Apparently not high enough. ;-) > > > With that in place: > > Acked-by: Konstantin Ananyev > > > > > Zero-copy access to mempool caches is beneficial for PMD performance, > > and > > > must be provided by the mempool library to fix [Bug 1052] without a > > > performance regression. > > > > > > [Bug 1052]: https://bugs.dpdk.org/show_bug.cgi?id=1052 > > > > > > Bugzilla ID: 1052 > > > > > > v5: > > > * Bugfix: Compare zero-copy get request to the cache size instead of > > the > > >flush threshold; otherwise refill could overflow the memory > > allocated > > >for the cache. (Andrew) > > > * Split the zero-copy put function into an internal function doing > > the > > >work, and a public function with trace. > > > * Avoid code duplication by rewriting rte_mempool_do_generic_put() to > > use > > >the internal zero-copy put function. (Andrew) > > > * Corrected the return type of rte_mempool_cache_zc_put_bulk() from > > void * > > >to void **; it returns a pointer to an array of objects. > > > * Fix coding style: Add missing curly brackets. (Andrew) > > > v4: > > > * Fix checkpatch warnings. > > > v3: > > > * Bugfix: Respect the cache size; compare to the flush threshold > > instead > > >of RTE_MEMPOOL_CACHE_MAX_SIZE. > > > * Added 'rewind' function for incomplete 'put' operations. > > (Konstantin) > > > * Replace RTE_ASSERTs with runtime checks of the request size. > > >Instead of failing, return NULL if the request is too big. > > (Konstantin) > > > * Modified comparison to prevent overflow if n is really huge and len > > is > > >non-zero. > > > * Updated the comments in the code. > > > v2: > > > * Fix checkpatch warnings. > > > * Fix missing registration of trace points. > > > * The functions are inline, so they don't go into the map file. > > > v1 changes from the RFC: > > > * Removed run-time parameter checks. (Honnappa) > > >This is a hot fast path function; requiring correct application > > >behaviour, i.e. function parameters must be valid. > > > * Added RTE_ASSERT for parameters instead. > > >Code for this is only generated if built with RTE_ENABLE_ASSERT. > > > * Removed fallback when 'cache' parameter is not set. (Honnappa) > > > * Chose the simple get function; i.e. do not move the existing > > objects in > > >the cache to the top of the new stack, just leave them at the > > bottom. > > > * Renamed the functions. Other suggestions are welcome, of course. ;- > > ) > > > * Updated the function descriptions. > > > * Added the functions to trace_fp and version.map. 
> > > > > > Signed-off-by: Morten Brørup > > > --- > > > lib/mempool/mempool_trace_points.c | 9 ++ > > > lib/mempool/rte_mempool.h | 237 +- > > --- > > > lib/mempool/rte_mempool_trace_fp.h | 23 +++ > > > lib/mempool/version.map| 5 + > > > 4 files changed, 245 insertions(+), 29 deletions(-) > > > > > > diff --git a/lib/mempool/mempool_trace_points.c > > b/lib/mempool/mempool_trace_points.c > > > index 4ad76deb34..83d353a764 100644 > > > --- a/lib/mempool/mempool_trace_points.c > > > +++ b/lib/mempool/mempool_trace_points.c > > > @@ -77,3 +77,12 @@ > > RTE_TRACE_POINT_REGISTER(rte_mempool_trace_ops_free, > > > > > > RTE_TRACE_POINT_REGISTER(rte_mempool_trace_set_ops_byname, > > > lib.mempool.set.ops.byname) > > > + > > > +RTE_TRACE_POINT_REGISTER(rte_mempool_trace_cache_zc_put_bulk, > > > + lib.mempool.cache.zc.put.bulk) > > > + > > > +RTE_TRACE_POINT_REGISTER(rte_mempool_trace_cache_zc_put_rewind, > > > + lib.mempool.cache.zc.put.rewind) > > > + > > > +RTE_TRACE_POINT_REGISTER(rte_mempool_trace_cache_zc_get_bulk, > > > + lib.mempool.cache.zc.get.bulk) > > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h > > > index 9f530db24b..5efd3c2b5b 100644 > > > --- a/lib/mempool/rte_mempool.h > > > +++ b/lib/mempool/rte_mempool.h > > > @@ -47,6 +47,7 @@ > &g
RE: [PATCH v5] mempool cache: add zero-copy get and put functions
> > > > > @@ -1364,32 +1556,25 @@ rte_mempool_do_generic_put(struct > > rte_mempool > > > > *mp, void * const *obj_table, > > > > > { > > > > > void **cache_objs; > > > > > > > > > > - /* No cache provided */ > > > > > - if (unlikely(cache == NULL)) > > > > > - goto driver_enqueue; > > > > > + /* No cache provided? */ > > > > > + if (unlikely(cache == NULL)) { > > > > > + /* Increment stats now, adding in mempool always > > succeeds. > > > > */ > > > > > + RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1); > > > > > + RTE_MEMPOOL_STAT_ADD(mp, put_objs, n); > > > > > > > > > > - /* increment stat now, adding in mempool always success */ > > > > > - RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1); > > > > > - RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n); > > > > > + goto driver_enqueue; > > > > > + } > > > > > > > > > > - /* The request itself is too big for the cache */ > > > > > - if (unlikely(n > cache->flushthresh)) > > > > > - goto driver_enqueue_stats_incremented; > > > > > + /* Prepare to add the objects to the cache. */ > > > > > + cache_objs = __rte_mempool_cache_zc_put_bulk(cache, mp, n); > > > > > > > > > > - /* > > > > > - * The cache follows the following algorithm: > > > > > - * 1. If the objects cannot be added to the cache without > > > > crossing > > > > > - * the flush threshold, flush the cache to the > > backend. > > > > > - * 2. Add the objects to the cache. > > > > > - */ > > > > > + /* The request itself is too big for the cache? */ > > > > > + if (unlikely(cache_objs == NULL)) { > > > > > + /* Increment stats now, adding in mempool always > > succeeds. > > > > */ > > > > > + RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1); > > > > > + RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n); > > > > > > > > Shouldn't it be RTE_MEMPOOL_STAT_ADD() here? > > > > > > I can see why you are wondering, but the answer is no. The statistics > > in mempool cache are not related to the cache, they are related > > > to the mempool; they are there to provide faster per-lcore update > > access [1]. > > > > > > [1]: > > https://elixir.bootlin.com/dpdk/v22.11.1/source/lib/mempool/rte_mempool > > .h#L94 > > > > But the condition above: > > if (unlikely(cache_objs == NULL)) > > means that me can't put these object to the cache and have to put > > objects straight to the pool (skipping cache completely), right? > > Correct. > > > If so, then why to update cache stats instead of pool stats? > > Because updating the stats in the cache structure is faster than updating the > stats in the pool structure. Refer to the two macros: > RTE_MEMPOOL_STAT_ADD() [2] is effectively five lines of code, but > RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) [3] is a one- > liner: ((cache)->stats.name += (n)). > > [2]: > https://elixir.bootlin.com/dpdk/v22.11.1/source/lib/mempool/rte_mempool.h#L325 > [3]: > https://elixir.bootlin.com/dpdk/v22.11.1/source/lib/mempool/rte_mempool.h#L348 > > And to reiterate that this is the correct behavior here, I will rephrase my > previous response: The stats kept in the cache are part of the > pool stats, they are not stats for the cache itself. Ah ok, that's the same as current behavior. It is still looks a bit strange to me that we incrementing cache (not pool) stats here. But that's another story, so no extra comments from me for that case. 
> > > > > > > > > > > - if (cache->len + n <= cache->flushthresh) { > > > > > - cache_objs = &cache->objs[cache->len]; > > > > > - cache->len += n; > > > > > - } else { > > > > > - cache_objs = &cache->objs[0]; > > > > > - rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache- > > >len); > > > > > - cache->len = n; > > > > > + goto driver_enqueue; > > > > > } > > > > > > > > > > /* Add the objects to the cache. */
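Since the thread above asks for a test case exercising _zc_get_ before merge, here is a rough, untested sketch of how the zero-copy put/get API could be driven, assuming the public wrappers keep the signatures shown in the patch; ZC_BURST and the function name are made up for illustration and this is not the final app/test case:

#include <rte_mempool.h>
#include <rte_lcore.h>

#define ZC_BURST 32

static int
test_mempool_cache_zc(struct rte_mempool *mp)
{
	struct rte_mempool_cache *cache =
		rte_mempool_default_cache(mp, rte_lcore_id());
	void *objs[ZC_BURST];
	void **slots;
	unsigned int i;

	if (cache == NULL)
		return -1;

	/* Take ZC_BURST objects the conventional way ... */
	if (rte_mempool_get_bulk(mp, objs, ZC_BURST) != 0)
		return -1;

	/* ... and hand them back through the zero-copy put:
	 * the library returns the cache slots to write into. */
	slots = rte_mempool_cache_zc_put_bulk(cache, mp, ZC_BURST);
	if (slots == NULL)		/* request exceeds the flush threshold */
		return -1;
	for (i = 0; i < ZC_BURST; i++)
		slots[i] = objs[i];

	/* Zero-copy get: the returned pointer refers to object pointers
	 * already sitting in the cache; the caller now owns them. */
	slots = rte_mempool_cache_zc_get_bulk(cache, mp, ZC_BURST);
	if (slots == NULL)		/* request exceeds the cache size */
		return -1;
	for (i = 0; i < ZC_BURST; i++)
		objs[i] = slots[i];

	/* Return the objects so the pool is left as we found it. */
	rte_mempool_put_bulk(mp, objs, ZC_BURST);
	return 0;
}

A put that turns out to be partial would be undone with rte_mempool_cache_zc_put_rewind(cache, n) for the unused tail, as described in the v3 notes above.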
Re: Reply: [PATCH v3 0/3] Direct re-arming of buffers on receive side
Hi Feifei, +ping Konstantin, Would you please give some comments on this patch series? Thanks very much. Sure, will have a look in the next few days. Apologies for the delay.
Re: [PATCH v3 2/2] ring: add ring info telemetry cmd
31/01/2023 02:28, Jie Hai пишет: This patch supports dump of the info of ring by its name. An example using this command is shown below: --> /ring/info,MP_mb_pool_0 { "/ring/info": { "name": "MP_mb_pool_0", "socket": 0, "flags": 0, "producer_type": "MP", "consumer_type": "MC", "size": 262144, "mask": 262143, "capacity": 262143, "used_count": 147173, "consumer_tail": 8283, "consumer_head": 8283, "producer_tail": 155456, "producer_head": 155456, "mz_name": "RG_MP_mb_pool_0", "mz_len": 2097920, "mz_hugepage_sz": 1073741824, "mz_socket_id": 0, "mz_flags": 0 } } Signed-off-by: Jie Hai --- lib/ring/rte_ring.c | 83 + 1 file changed, 83 insertions(+) diff --git a/lib/ring/rte_ring.c b/lib/ring/rte_ring.c index e6aac332d88f..2e57aa653339 100644 --- a/lib/ring/rte_ring.c +++ b/lib/ring/rte_ring.c @@ -454,8 +454,91 @@ ring_handle_list(const char *cmd __rte_unused, return 0; } +static const char * +ring_prod_sync_type_to_name(struct rte_ring *r) +{ + switch (r->prod.sync_type) { + case RTE_RING_SYNC_MT: + return "MP"; + case RTE_RING_SYNC_ST: + return "SP"; + case RTE_RING_SYNC_MT_RTS: + return "MP_RTS"; + case RTE_RING_SYNC_MT_HTS: + return "MP_HTS"; + default: + return "Unknown"; + } +} + +static const char * +ring_cons_sync_type_to_name(struct rte_ring *r) +{ + switch (r->cons.sync_type) { + case RTE_RING_SYNC_MT: + return "MC"; + case RTE_RING_SYNC_ST: + return "SC"; + case RTE_RING_SYNC_MT_RTS: + return "MC_RTS"; + case RTE_RING_SYNC_MT_HTS: + return "MC_HTS"; + default: + return "Unknown"; + } +} + +static int +ring_handle_info(const char *cmd __rte_unused, const char *params, + struct rte_tel_data *d) +{ + const struct rte_memzone *mz; + struct rte_ring *r; + + if (params == NULL || strlen(params) == 0 || + strlen(params) >= RTE_RING_NAMESIZE) + return -EINVAL; + + r = rte_ring_lookup(params); + if (r == NULL) + return -EINVAL; thanks for the update, but I think there still a potential problem here: as we release tailq_lock inside ring_lookup() and then grab it after again. Between these two points we have sort of race condition. We need a way not to release it in between. Probably the simplest way - make this function to use ring_walk() that you introduced in previous patch, instead of ring_lookup(). Similar to what mempool_handle_info() is doing. 
+ + rte_mcfg_tailq_read_lock(); + + rte_tel_data_start_dict(d); + rte_tel_data_add_dict_string(d, "name", r->name); + rte_tel_data_add_dict_int(d, "socket", r->memzone->socket_id); + rte_tel_data_add_dict_int(d, "flags", r->flags); + rte_tel_data_add_dict_string(d, "producer_type", + ring_prod_sync_type_to_name(r)); + rte_tel_data_add_dict_string(d, "consumer_type", + ring_cons_sync_type_to_name(r)); + rte_tel_data_add_dict_u64(d, "size", r->size); + rte_tel_data_add_dict_u64(d, "mask", r->mask); + rte_tel_data_add_dict_u64(d, "capacity", r->capacity); + rte_tel_data_add_dict_u64(d, "used_count", rte_ring_count(r)); + rte_tel_data_add_dict_u64(d, "consumer_tail", r->cons.tail); + rte_tel_data_add_dict_u64(d, "consumer_head", r->cons.head); + rte_tel_data_add_dict_u64(d, "producer_tail", r->prod.tail); + rte_tel_data_add_dict_u64(d, "producer_head", r->prod.head); + + mz = r->memzone; + if (mz == NULL) + return 0; + rte_tel_data_add_dict_string(d, "mz_name", mz->name); + rte_tel_data_add_dict_int(d, "mz_len", mz->len); + rte_tel_data_add_dict_int(d, "mz_hugepage_sz", mz->hugepage_sz); + rte_tel_data_add_dict_int(d, "mz_socket_id", mz->socket_id); + rte_tel_data_add_dict_int(d, "mz_flags", mz->flags); + + rte_mcfg_tailq_read_unlock(); + return 0; +} + RTE_INIT(ring_init_telemetry) { rte_telemetry_register_cmd("/ring/list", ring_handle_list, "Returns list of available ring. Takes no parameters"); + rte_telemetry_register_cmd("/ring/info", ring_handle_info, + "Returns ring info. Parameters: ring_name."); }
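For illustration, a rough sketch of the kind of change being suggested: resolve the name while the tailq read lock is already held, so the ring cannot disappear between lookup and use. The helper below is made up and assumes it lives in lib/ring/rte_ring.c next to the existing tailq definitions; using the series' ring_walk(), as suggested, would achieve the same thing.

static struct rte_ring *
ring_lookup_locked(const char *name)
{
	struct rte_ring_list *ring_list;
	struct rte_tailq_entry *te;

	ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list);
	TAILQ_FOREACH(te, ring_list, next) {
		struct rte_ring *r = (struct rte_ring *)te->data;

		if (strncmp(name, r->name, RTE_RING_NAMESIZE) == 0)
			return r;
	}
	return NULL;
}

static int
ring_handle_info(const char *cmd __rte_unused, const char *params,
	struct rte_tel_data *d)
{
	struct rte_ring *r;

	if (params == NULL || strlen(params) == 0 ||
			strlen(params) >= RTE_RING_NAMESIZE)
		return -EINVAL;

	rte_mcfg_tailq_read_lock();
	r = ring_lookup_locked(params);
	if (r == NULL) {
		rte_mcfg_tailq_read_unlock();
		return -EINVAL;
	}
	/* ... fill the telemetry dict exactly as in the patch above ... */
	rte_mcfg_tailq_read_unlock();
	return 0;
}

Note the lock is released on every return path, so the window between lookup and dict fill is closed.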
Re: [PATCH v3 1/3] ethdev: enable direct rearm with separate API
Hi Feifei, Add 'tx_fill_sw_ring' and 'rx_flush_descriptor' API into direct rearm mode for separate Rx and Tx Operation. And this can support different multiple sources in direct rearm mode. For examples, Rx driver is ixgbe, and Tx driver is i40e. Thanks for your effort and thanks for taking comments provided into consideration. That approach looks much better then previous ones. Few nits below. Konstantin Suggested-by: Honnappa Nagarahalli Suggested-by: Ruifeng Wang Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- lib/ethdev/ethdev_driver.h | 10 ++ lib/ethdev/ethdev_private.c | 2 + lib/ethdev/rte_ethdev.c | 52 +++ lib/ethdev/rte_ethdev.h | 174 +++ lib/ethdev/rte_ethdev_core.h | 11 +++ lib/ethdev/version.map | 6 ++ 6 files changed, 255 insertions(+) diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h index 6a550cfc83..bc539ec862 100644 --- a/lib/ethdev/ethdev_driver.h +++ b/lib/ethdev/ethdev_driver.h @@ -59,6 +59,10 @@ struct rte_eth_dev { eth_rx_descriptor_status_t rx_descriptor_status; /** Check the status of a Tx descriptor */ eth_tx_descriptor_status_t tx_descriptor_status; + /** Fill Rx sw-ring with Tx buffers in direct rearm mode */ + eth_tx_fill_sw_ring_t tx_fill_sw_ring; + /** Flush Rx descriptor in direct rearm mode */ + eth_rx_flush_descriptor_t rx_flush_descriptor; /** * Device data that is shared between primary and secondary processes @@ -504,6 +508,10 @@ typedef void (*eth_rxq_info_get_t)(struct rte_eth_dev *dev, typedef void (*eth_txq_info_get_t)(struct rte_eth_dev *dev, uint16_t tx_queue_id, struct rte_eth_txq_info *qinfo); +/**< @internal Get rearm data for a receive queue of an Ethernet device. */ +typedef void (*eth_rxq_rearm_data_get_t)(struct rte_eth_dev *dev, + uint16_t tx_queue_id, struct rte_eth_rxq_rearm_data *rxq_rearm_data); + typedef int (*eth_burst_mode_get_t)(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_burst_mode *mode); @@ -1215,6 +1223,8 @@ struct eth_dev_ops { eth_rxq_info_get_t rxq_info_get; /** Retrieve Tx queue information */ eth_txq_info_get_t txq_info_get; + /** Get Rx queue rearm data */ + eth_rxq_rearm_data_get_t rxq_rearm_data_get; eth_burst_mode_get_t rx_burst_mode_get; /**< Get Rx burst mode */ eth_burst_mode_get_t tx_burst_mode_get; /**< Get Tx burst mode */ eth_fw_version_get_t fw_version_get; /**< Get firmware version */ diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c index 48090c879a..c5dd5e30f6 100644 --- a/lib/ethdev/ethdev_private.c +++ b/lib/ethdev/ethdev_private.c @@ -276,6 +276,8 @@ eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo, fpo->rx_queue_count = dev->rx_queue_count; fpo->rx_descriptor_status = dev->rx_descriptor_status; fpo->tx_descriptor_status = dev->tx_descriptor_status; + fpo->tx_fill_sw_ring = dev->tx_fill_sw_ring; + fpo->rx_flush_descriptor = dev->rx_flush_descriptor; fpo->rxq.data = dev->data->rx_queues; fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs; diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c index 5d5e18db1e..2af5cb42fe 100644 --- a/lib/ethdev/rte_ethdev.c +++ b/lib/ethdev/rte_ethdev.c @@ -3282,6 +3282,21 @@ rte_eth_dev_set_rx_queue_stats_mapping(uint16_t port_id, uint16_t rx_queue_id, stat_idx, STAT_QMAP_RX)); } +int +rte_eth_dev_direct_rearm(uint16_t rx_port_id, uint16_t rx_queue_id, + uint16_t tx_port_id, uint16_t tx_rx_queue_id, + struct rte_eth_rxq_rearm_data *rxq_rearm_data) +{ + int nb_rearm = 0; + + nb_rearm = rte_eth_tx_fill_sw_ring(tx_port_id, tx_rx_queue_id, rxq_rearm_data); + + if 
(nb_rearm > 0) + return rte_eth_rx_flush_descriptor(rx_port_id, rx_queue_id, nb_rearm); + + return 0; +} + int rte_eth_dev_fw_version_get(uint16_t port_id, char *fw_version, size_t fw_size) { @@ -5323,6 +5338,43 @@ rte_eth_tx_queue_info_get(uint16_t port_id, uint16_t queue_id, return 0; } +int +rte_eth_rx_queue_rearm_data_get(uint16_t port_id, uint16_t queue_id, + struct rte_eth_rxq_rearm_data *rxq_rearm_data) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + if (queue_id >= dev->data->nb_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid Rx queue_id=%u\n", queue_id); + return -EINVAL; + } + + if (rxq_rearm_data == NULL) { + RTE_ETHDEV_LOG(ERR, "Cannot get ethdev port %u Rx queue %u rearm data to NULL\n", + port_id, queue_id); + return -EINVAL; + } +
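For context, a hedged sketch of how a forwarding loop might drive the proposed API, based only on the prototypes visible in the patch above; this series was still under discussion, so names such as rx_port, tx_port, BURST and fwd_loop below are placeholders, not an agreed usage model.

#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define BURST 32

static void
fwd_loop(uint16_t rx_port, uint16_t rx_queue, uint16_t tx_port, uint16_t tx_queue)
{
	struct rte_eth_rxq_rearm_data rearm_data;
	struct rte_mbuf *pkts[BURST];
	uint16_t nb_rx, nb_tx;

	/* Done once after queue setup: learn where the Rx sw-ring and
	 * rearm indexes of rx_port/rx_queue live. */
	if (rte_eth_rx_queue_rearm_data_get(rx_port, rx_queue, &rearm_data) != 0)
		return;

	for (;;) {
		/* Refill rx_port's descriptors directly with mbufs just
		 * transmitted on tx_port, skipping the usual free + alloc. */
		rte_eth_dev_direct_rearm(rx_port, rx_queue, tx_port, tx_queue,
				&rearm_data);

		nb_rx = rte_eth_rx_burst(rx_port, rx_queue, pkts, BURST);
		if (nb_rx == 0)
			continue;
		nb_tx = rte_eth_tx_burst(tx_port, tx_queue, pkts, nb_rx);
		while (nb_tx < nb_rx)
			rte_pktmbuf_free(pkts[nb_tx++]);
	}
}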
Re: [PATCH v3 2/3] net/i40e: enable direct rearm with separate API
04/01/2023 07:30, Feifei Wang пишет: Add internal API to separate direct rearm operations between Rx and Tx. Suggested-by: Honnappa Nagarahalli Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- drivers/net/i40e/i40e_ethdev.c | 1 + drivers/net/i40e/i40e_ethdev.h | 2 + drivers/net/i40e/i40e_rxtx.c| 19 + drivers/net/i40e/i40e_rxtx.h| 4 ++ drivers/net/i40e/i40e_rxtx_vec_common.h | 54 + drivers/net/i40e/i40e_rxtx_vec_neon.c | 42 +++ 6 files changed, 122 insertions(+) diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c index 7726a89d99..29c1ce2470 100644 --- a/drivers/net/i40e/i40e_ethdev.c +++ b/drivers/net/i40e/i40e_ethdev.c @@ -497,6 +497,7 @@ static const struct eth_dev_ops i40e_eth_dev_ops = { .flow_ops_get = i40e_dev_flow_ops_get, .rxq_info_get = i40e_rxq_info_get, .txq_info_get = i40e_txq_info_get, + .rxq_rearm_data_get = i40e_rxq_rearm_data_get, .rx_burst_mode_get= i40e_rx_burst_mode_get, .tx_burst_mode_get= i40e_tx_burst_mode_get, .timesync_enable = i40e_timesync_enable, diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h index fe943a45ff..6a6a2a6d3c 100644 --- a/drivers/net/i40e/i40e_ethdev.h +++ b/drivers/net/i40e/i40e_ethdev.h @@ -1352,6 +1352,8 @@ void i40e_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_rxq_info *qinfo); void i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_txq_info *qinfo); +void i40e_rxq_rearm_data_get(struct rte_eth_dev *dev, uint16_t queue_id, + struct rte_eth_rxq_rearm_data *rxq_rearm_data); int i40e_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_burst_mode *mode); int i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 788ffb51c2..d8d801acaf 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -3197,6 +3197,19 @@ i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, qinfo->conf.offloads = txq->offloads; } +void +i40e_rxq_rearm_data_get(struct rte_eth_dev *dev, uint16_t queue_id, + struct rte_eth_rxq_rearm_data *rxq_rearm_data) +{ + struct i40e_rx_queue *rxq; + + rxq = dev->data->rx_queues[queue_id]; + + rxq_rearm_data->rx_sw_ring = rxq->sw_ring; + rxq_rearm_data->rearm_start = &rxq->rxrearm_start; + rxq_rearm_data->rearm_nb = &rxq->rxrearm_nb; +} + #ifdef RTE_ARCH_X86 static inline bool get_avx_supported(bool request_avx512) @@ -3321,6 +3334,9 @@ i40e_set_rx_function(struct rte_eth_dev *dev) PMD_INIT_LOG(DEBUG, "Using Vector Rx (port %d).", dev->data->port_id); dev->rx_pkt_burst = i40e_recv_pkts_vec; +#ifdef RTE_ARCH_ARM64 + dev->rx_flush_descriptor = i40e_rx_flush_descriptor_vec; +#endif } #endif /* RTE_ARCH_X86 */ } else if (!dev->data->scattered_rx && ad->rx_bulk_alloc_allowed) { @@ -3484,6 +3500,9 @@ i40e_set_tx_function(struct rte_eth_dev *dev) PMD_INIT_LOG(DEBUG, "Using Vector Tx (port %d).", dev->data->port_id); dev->tx_pkt_burst = i40e_xmit_pkts_vec; +#ifdef RTE_ARCH_ARM64 + dev->tx_fill_sw_ring = i40e_tx_fill_sw_ring; +#endif As I can see tx_fill_sw_ring() is non ARM specific, any reason to guard it with #ifdef ARM? Actually same ask for rx_flush_descriptor() - can we have generic version too? 
#endif /* RTE_ARCH_X86 */ } else { PMD_INIT_LOG(DEBUG, "Simple tx finally be used."); diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h index 5e6eecc501..8a29bd89df 100644 --- a/drivers/net/i40e/i40e_rxtx.h +++ b/drivers/net/i40e/i40e_rxtx.h @@ -233,6 +233,10 @@ uint32_t i40e_dev_rx_queue_count(void *rx_queue); int i40e_dev_rx_descriptor_status(void *rx_queue, uint16_t offset); int i40e_dev_tx_descriptor_status(void *tx_queue, uint16_t offset); +int i40e_tx_fill_sw_ring(void *tx_queue, + struct rte_eth_rxq_rearm_data *rxq_rearm_data); +int i40e_rx_flush_descriptor_vec(void *rx_queue, uint16_t nb_rearm); + uint16_t i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); uint16_t i40e_recv_scattered_pkts_vec(void *rx_queue, diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h b/drivers/net/i40e/i40e_rxtx_vec_common.h index fe1a6ec75e..eb96301a43 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_common.h +++ b/drivers/net/i40e/i40e_rx
Re: [PATCH] acl: fix trie splitting
It is not really clear to me what is the actual problem and what are you trying to fix here... When using a large number of ACL rules, the trie is supposed to split when there are over 2048 nodes. What do you mean by 'there'? As I remember, it decided to split set of rules, when trie starts to grow too fast: if merging of last rule creates more then cur_node_max new nodes, then it splits. However, node_count is negative, so node_count > context->cur_node_max never actually runs, so all the nodes created from the rules end up being in one trie. Hmm... sentence that node count is negative is too cryptic for me. Obviously there are plenty examples with rule-sets that causes more then one trie to be created. Simplest way to check that, just run acl autotest: echo 'acl_autotest' | ./x86_64-default-linuxapp-gcc-dbg/app/test/dpdk-test --lcores=15 --no-pci --no-huge --log-level=acl,debug ... You will see that there are bunch of cases with more then one trie. Another way - try with dts test-cases for acl. few of them will create multiple tries: git clone -v http://dpdk.org/git/tools/dts/ cd dts gzip -cd dep/test-acl-input.tar.gz | tar xfv - ./x86_64-default-linuxapp-gcc-dbg/app/dpdk-test-acl --lcores=15 --no-pci \ --no-huge --log-level=acl,debug -- \ --rulesf=/home/kananyev/devel/dts/test-acl-input/acl2v4_10k_rule \ --tracef=/home/kananyev/devel/dts/test-acl-input/acl2v4_10k_trace \ --verbose=0 ... dump_config: rulesf:/home/kananyev/devel/dts/test-acl-input/acl2v4_10k_rule tracef:/home/kananyev/devel/dts/test-acl-input/acl2v4_10k_trace rulenum:65536 tracenum:65536 tracestep:256 bldcat:3 runcat:1 maxsize:0 iter:1 verbose:0 alg:0(default) ipv6:0 ACL: Gen phase for ACL "TESTACL": runtime memory footprint on socket -1: single nodes/bytes used: 54437/435496 quad nodes/vectors/bytes used: 55775/189941/1519528 DFA nodes/group64/bytes used: 9516/28940/14819336 match nodes/bytes used: 9760/1249280 total: 18027888 bytes max limit: 18446744073709551615 bytes ACL: Build phase for ACL "TESTACL": node limit for tree split: 2048 nodes created: 129488 memory consumed: 184549530 ACL: trie 0: number of rules: 655, indexes: 4 ACL: trie 1: number of rules: 9129, indexes: 4 rte_acl_build(3) finished with 0 acl context @0x103adfa80 socket_id=-1 alg=3 first_load_sz=4 max_rules=65536 rule_size=96 num_rules=9784 num_categories=3 num_tries=2 Original PR with sample files and test output can be found here: https://github.com/DPDK/dpdk/pull/50 Fixes: dc276b5780c2 ("acl: new library") Signed-off-by: Arthur Leung --- app/test-acl/test-acl.sh | 2 +- lib/acl/acl_bld.c| 9 +++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/app/test-acl/test-acl.sh b/app/test-acl/test-acl.sh index 30814f3fe2..59bfa121cf 100644 --- a/app/test-acl/test-acl.sh +++ b/app/test-acl/test-acl.sh @@ -17,7 +17,7 @@ # '/' # trace record format: # \ -# ... +# ... 
# # As an example: # /bin/bash app/test-acl/test-acl.sh build/app/dpdk-test-acl \ diff --git a/lib/acl/acl_bld.c b/lib/acl/acl_bld.c index 2816632803..6064a8103b 100644 --- a/lib/acl/acl_bld.c +++ b/lib/acl/acl_bld.c @@ -946,7 +946,7 @@ build_trie(struct acl_build_context *context, struct rte_acl_build_rule *head, struct rte_acl_build_rule **last, uint32_t *count) { uint32_t n, m; - int field_index, node_count; + int field_index; struct rte_acl_node *trie; struct rte_acl_build_rule *prev, *rule; struct rte_acl_node *end, *merge, *root, *end_prev; @@ -1048,15 +1048,13 @@ build_trie(struct acl_build_context *context, struct rte_acl_build_rule *head, } } - node_count = context->num_nodes; (*count)++; /* merge this rule into the trie */ if (acl_merge_trie(context, trie, root, 0, NULL)) return NULL; - node_count = context->num_nodes - node_count; - if (node_count > context->cur_node_max) { + if (context->num_nodes > (context->cur_node_max * context->num_tries)) { *last = prev; return trie; } @@ -1368,6 +1366,7 @@ acl_build_tries(struct acl_build_context *context, for (n = 0;; n = num_tries) { num_tries = n + 1; + context->num_tries = num_tries; last = build_one_trie(context, rule_sets, n, context->node_max); if (context->bld_tries[n].trie == NULL) { @@ -1411,8 +1410,6 @@ acl_build_tries(struct acl_build_context *context, } } - - context->num_tries = num_tries; return 0; }
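For reference, the pre-patch logic in build_trie() that the hunk above removes, re-stated with comments (reconstructed from the diff): node_count holds the per-rule delta of created nodes, not a running total that could go negative.

	node_count = context->num_nodes;	/* nodes before merging this rule */
	(*count)++;

	/* merge this rule into the trie */
	if (acl_merge_trie(context, trie, root, 0, NULL))
		return NULL;

	node_count = context->num_nodes - node_count;	/* nodes added by this rule */
	if (node_count > context->cur_node_max) {
		/* this one rule grew the trie by more than cur_node_max nodes:
		 * stop here and let the caller start a new trie for the rest */
		*last = prev;
		return trie;
	}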
RE: [PATCH v5 2/4] eal: allow applications to report their cpu usage
> > Konstantin Ananyev, Jan 04, 2023 at 11:53: > > Probably we can even print a warning or so if someone tries to overwrite > > it once again. > > I'm not sure that is necessary. If an application wants to reset the > callback to NULL at any point in time, I don't see why DPDK should tell > them it is a bad thing. The problem is not in resetting the CB function itself. Usually with a CB the user needs some sort of data structure (to accumulate stats, track states, etc.). If we allow resetting the CB, then it raises the question of when/how we should allow the user to free the associated data. And, as I understand, we don't have a clear way to do it.
RE: [PATCH v5 2/4] eal: allow applications to report their cpu usage
> -Original Message- > From: Robin Jarry > Sent: Monday, February 6, 2023 8:29 PM > To: Konstantin Ananyev ; dev@dpdk.org > Cc: Tyler Retzlaff ; Kevin Laatz > ; Morten Brørup > > Subject: Re: [PATCH v5 2/4] eal: allow applications to report their cpu usage > > Konstantin Ananyev, Feb 06, 2023 at 21:07: > > The problem is not in resetting the CB function itself. > > > > Usually with a CB the user needs some sort of data structure (to accumulate > > stats, track states, etc.). If we allow resetting the CB, then it > > raises the question of when/how we should allow the user to free the associated > > data. > > > > And, as I understand, we don't have a clear way to do it. > > If the application decides to reset the callback function, they are in > a good position to determine what resources they need to free. Yes, the app knows what resources it wants to free. But it has no way to know *when* it is safe to free them. Just a bit more explanation: The app invokes your function, which resets the global value of the CB. How would it know that, after returning from this function, no other thread is still executing this CB right now? And how would it determine when that thread will finish executing the CB function? That's why it might be easier simply not to allow resetting it at all. > I don't > see why EAL should get involved here but I may be missing a point.
RE: [PATCH v5 2/4] eal: allow applications to report their cpu usage
> Konstantin Ananyev, Feb 06, 2023 at 21:34: > > Yes, the app knows what resources it wants to free. > > But it has no way to know *when* it is safe to free them. > > Just a bit more explanation: > > The app invokes your function, which resets the global value of the CB. > > How would it know that, after returning from this function, no > > other thread is still executing this CB right now? > > And how would it determine when that thread will finish executing the CB function? > > That's why it might be easier simply not to allow resetting it at all > > Ok I see. But what should we do to prevent this? Simply ignore the > request and log a warning? That seems like the simplest choice to me... Or if you still prefer to allow it - put a special comment that it is the user's responsibility to handle such a possible race condition (inside the CB function or so).
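To make the race discussed above concrete, a minimal illustration; the names below are made up and this is not the actual EAL code, it only shows why freeing the callback's data right after resetting the pointer is unsafe.

#include <stdlib.h>

typedef int (*usage_cb_t)(unsigned int lcore_id, void *arg);

static usage_cb_t lcore_usage_cb;	/* set and reset by the application */
static void *lcore_usage_arg;

/* Thread A: the telemetry/consumer side invoking the callback */
static void
report_usage(unsigned int lcore_id)
{
	usage_cb_t cb = lcore_usage_cb;		/* reads the pointer ... */

	if (cb != NULL)
		cb(lcore_id, lcore_usage_arg);	/* ... and may still be executing
						 * the callback at this point */
}

/* Thread B: the application "resetting" the callback */
static void
app_reset_usage_cb(void)
{
	lcore_usage_cb = NULL;
	/*
	 * Nothing tells us when every thread has left the old callback,
	 * so freeing its data here can race with Thread A above.
	 */
	free(lcore_usage_arg);
	lcore_usage_arg = NULL;
}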
RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
> > > > Define specific function implementation for i40e driver. > > Currently, mbufs recycle mode can support 128bit vector path and avx2 path. > > And can be enabled both in fast free and no fast free mode. > > > > Suggested-by: Honnappa Nagarahalli > > Signed-off-by: Feifei Wang > > Reviewed-by: Ruifeng Wang > > Reviewed-by: Honnappa Nagarahalli > > --- > > drivers/net/i40e/i40e_ethdev.c| 1 + > > drivers/net/i40e/i40e_ethdev.h| 2 + > > .../net/i40e/i40e_recycle_mbufs_vec_common.c | 147 > > ++ > > drivers/net/i40e/i40e_rxtx.c | 32 > > drivers/net/i40e/i40e_rxtx.h | 4 + > > drivers/net/i40e/meson.build | 1 + > > 6 files changed, 187 insertions(+) > > create mode 100644 drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c > > index 8271bbb394..50ba9aac94 100644 > > --- a/drivers/net/i40e/i40e_ethdev.c > > +++ b/drivers/net/i40e/i40e_ethdev.c > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops i40e_eth_dev_ops = { > > .flow_ops_get = i40e_dev_flow_ops_get, > > .rxq_info_get = i40e_rxq_info_get, > > .txq_info_get = i40e_txq_info_get, > > + .recycle_rxq_info_get = i40e_recycle_rxq_info_get, > > .rx_burst_mode_get= i40e_rx_burst_mode_get, > > .tx_burst_mode_get= i40e_tx_burst_mode_get, > > .timesync_enable = i40e_timesync_enable, > > diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h > > index 6f65d5e0ac..af758798e1 100644 > > --- a/drivers/net/i40e/i40e_ethdev.h > > +++ b/drivers/net/i40e/i40e_ethdev.h > > @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct rte_eth_dev *dev, > > uint16_t queue_id, > > struct rte_eth_rxq_info *qinfo); > > void i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, > > struct rte_eth_txq_info *qinfo); > > +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, > > + struct rte_eth_recycle_rxq_info *recycle_rxq_info); > > int i40e_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, > >struct rte_eth_burst_mode *mode); int > > i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, diff -- > > git a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > new file mode 100644 > > index 00..5663ecccde > > --- /dev/null > > +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > @@ -0,0 +1,147 @@ > > +/* SPDX-License-Identifier: BSD-3-Clause > > + * Copyright (c) 2023 Arm Limited. > > + */ > > + > > +#include > > +#include > > + > > +#include "base/i40e_prototype.h" > > +#include "base/i40e_type.h" > > +#include "i40e_ethdev.h" > > +#include "i40e_rxtx.h" > > + > > +#pragma GCC diagnostic ignored "-Wcast-qual" > > + > > +void > > +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue, uint16_t > > +nb_mbufs) { > > + struct i40e_rx_queue *rxq = rx_queue; > > + struct i40e_rx_entry *rxep; > > + volatile union i40e_rx_desc *rxdp; > > + uint16_t rx_id; > > + uint64_t paddr; > > + uint64_t dma_addr; > > + uint16_t i; > > + > > + rxdp = rxq->rx_ring + rxq->rxrearm_start; > > + rxep = &rxq->sw_ring[rxq->rxrearm_start]; > > + > > + for (i = 0; i < nb_mbufs; i++) { > > + /* Initialize rxdp descs. 
*/ > > + paddr = (rxep[i].mbuf)->buf_iova + > > RTE_PKTMBUF_HEADROOM; > > + dma_addr = rte_cpu_to_le_64(paddr); > > + /* flush desc with pa dma_addr */ > > + rxdp[i].read.hdr_addr = 0; > > + rxdp[i].read.pkt_addr = dma_addr; > > + } > > + > > + /* Update the descriptor initializer index */ > > + rxq->rxrearm_start += nb_mbufs; > > + rx_id = rxq->rxrearm_start - 1; > > + > > + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { > > + rxq->rxrearm_start = 0; > > + rx_id = rxq->nb_rx_desc - 1; > > + } > > + > > + rxq->rxrearm_nb -= nb_mbufs; > > + > > + rte_io_wmb(); > > + /* Update the tail pointer on the NIC */ > > + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); } > > + > > +uint16_t > > +i40e_recycle_tx_mbufs_reuse_vec(void *tx_queue, > > + struct rte_eth_recycle_rxq_info *recycle_rxq_info) { > > + struct i40e_tx_queue *txq = tx_queue; > > + struct i40e_tx_entry *txep; > > + struct rte_mbuf **rxep; > > + int i, n; > > + uint16_t nb_recycle_mbufs; > > + uint16_t avail = 0; > > + uint16_t mbuf_ring_size = recycle_rxq_info->mbuf_ring_size; > > + uint16_t mask = recycle_rxq_info->mbuf_ring_size - 1; > > + uint16_t refill_requirement = recycle_rxq_info->refill_requirement; > > + uint16_t refill_head = *recycle_rxq_info->refill_head; > > + uint16_t receive_tail = *recycle_rxq_info->receive_tail; > > + > > + /* Get available recycling Rx buffers. */ > > +
Re: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
31/08/2023 18:24, Konstantin Ananyev пишет: Define specific function implementation for i40e driver. Currently, mbufs recycle mode can support 128bit vector path and avx2 path. And can be enabled both in fast free and no fast free mode. Suggested-by: Honnappa Nagarahalli Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- drivers/net/i40e/i40e_ethdev.c| 1 + drivers/net/i40e/i40e_ethdev.h| 2 + .../net/i40e/i40e_recycle_mbufs_vec_common.c | 147 ++ drivers/net/i40e/i40e_rxtx.c | 32 drivers/net/i40e/i40e_rxtx.h | 4 + drivers/net/i40e/meson.build | 1 + 6 files changed, 187 insertions(+) create mode 100644 drivers/net/i40e/i40e_recycle_mbufs_vec_common.c diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c index 8271bbb394..50ba9aac94 100644 --- a/drivers/net/i40e/i40e_ethdev.c +++ b/drivers/net/i40e/i40e_ethdev.c @@ -496,6 +496,7 @@ static const struct eth_dev_ops i40e_eth_dev_ops = { .flow_ops_get = i40e_dev_flow_ops_get, .rxq_info_get = i40e_rxq_info_get, .txq_info_get = i40e_txq_info_get, + .recycle_rxq_info_get = i40e_recycle_rxq_info_get, .rx_burst_mode_get= i40e_rx_burst_mode_get, .tx_burst_mode_get= i40e_tx_burst_mode_get, .timesync_enable = i40e_timesync_enable, diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h index 6f65d5e0ac..af758798e1 100644 --- a/drivers/net/i40e/i40e_ethdev.h +++ b/drivers/net/i40e/i40e_ethdev.h @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_rxq_info *qinfo); void i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_txq_info *qinfo); +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, + struct rte_eth_recycle_rxq_info *recycle_rxq_info); int i40e_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_burst_mode *mode); int i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, diff -- git a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c new file mode 100644 index 00..5663ecccde --- /dev/null +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2023 Arm Limited. + */ + +#include +#include + +#include "base/i40e_prototype.h" +#include "base/i40e_type.h" +#include "i40e_ethdev.h" +#include "i40e_rxtx.h" + +#pragma GCC diagnostic ignored "-Wcast-qual" + +void +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue, uint16_t +nb_mbufs) { + struct i40e_rx_queue *rxq = rx_queue; + struct i40e_rx_entry *rxep; + volatile union i40e_rx_desc *rxdp; + uint16_t rx_id; + uint64_t paddr; + uint64_t dma_addr; + uint16_t i; + + rxdp = rxq->rx_ring + rxq->rxrearm_start; + rxep = &rxq->sw_ring[rxq->rxrearm_start]; + + for (i = 0; i < nb_mbufs; i++) { + /* Initialize rxdp descs. 
*/ + paddr = (rxep[i].mbuf)->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr = rte_cpu_to_le_64(paddr); + /* flush desc with pa dma_addr */ + rxdp[i].read.hdr_addr = 0; + rxdp[i].read.pkt_addr = dma_addr; + } + + /* Update the descriptor initializer index */ + rxq->rxrearm_start += nb_mbufs; + rx_id = rxq->rxrearm_start - 1; + + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { + rxq->rxrearm_start = 0; + rx_id = rxq->nb_rx_desc - 1; + } + + rxq->rxrearm_nb -= nb_mbufs; + + rte_io_wmb(); + /* Update the tail pointer on the NIC */ + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); } + +uint16_t +i40e_recycle_tx_mbufs_reuse_vec(void *tx_queue, + struct rte_eth_recycle_rxq_info *recycle_rxq_info) { + struct i40e_tx_queue *txq = tx_queue; + struct i40e_tx_entry *txep; + struct rte_mbuf **rxep; + int i, n; + uint16_t nb_recycle_mbufs; + uint16_t avail = 0; + uint16_t mbuf_ring_size = recycle_rxq_info->mbuf_ring_size; + uint16_t mask = recycle_rxq_info->mbuf_ring_size - 1; + uint16_t refill_requirement = recycle_rxq_info->refill_requirement; + uint16_t refill_head = *recycle_rxq_info->refill_head; + uint16_t receive_tail = *recycle_rxq_info->receive_tail; + + /* Get available recycling Rx buffers. */ + avail = (mbuf_ring_size - (refill_head - receive_tail)) & mask; + + /* Check Tx
RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
> > > > > > > > Define specific function implementation for i40e driver. > > > > Currently, mbufs recycle mode can support 128bit vector path and avx2 > > path. > > > > And can be enabled both in fast free and no fast free mode. > > > > > > > > Suggested-by: Honnappa Nagarahalli > > > > Signed-off-by: Feifei Wang > > > > Reviewed-by: Ruifeng Wang > > > > Reviewed-by: Honnappa Nagarahalli > > > > --- > > > > drivers/net/i40e/i40e_ethdev.c| 1 + > > > > drivers/net/i40e/i40e_ethdev.h| 2 + > > > > .../net/i40e/i40e_recycle_mbufs_vec_common.c | 147 > > > > ++ > > > > drivers/net/i40e/i40e_rxtx.c | 32 > > > > drivers/net/i40e/i40e_rxtx.h | 4 + > > > > drivers/net/i40e/meson.build | 1 + > > > > 6 files changed, 187 insertions(+) > > > > create mode 100644 drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c > > > > b/drivers/net/i40e/i40e_ethdev.c index 8271bbb394..50ba9aac94 100644 > > > > --- a/drivers/net/i40e/i40e_ethdev.c > > > > +++ b/drivers/net/i40e/i40e_ethdev.c > > > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops i40e_eth_dev_ops > > = { > > > > .flow_ops_get = i40e_dev_flow_ops_get, > > > > .rxq_info_get = i40e_rxq_info_get, > > > > .txq_info_get = i40e_txq_info_get, > > > > + .recycle_rxq_info_get = i40e_recycle_rxq_info_get, > > > > .rx_burst_mode_get= i40e_rx_burst_mode_get, > > > > .tx_burst_mode_get= i40e_tx_burst_mode_get, > > > > .timesync_enable = i40e_timesync_enable, > > > > diff --git a/drivers/net/i40e/i40e_ethdev.h > > > > b/drivers/net/i40e/i40e_ethdev.h index 6f65d5e0ac..af758798e1 100644 > > > > --- a/drivers/net/i40e/i40e_ethdev.h > > > > +++ b/drivers/net/i40e/i40e_ethdev.h > > > > @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct rte_eth_dev > > > > *dev, uint16_t queue_id, > > > > struct rte_eth_rxq_info *qinfo); > > > > void i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, > > > > struct rte_eth_txq_info *qinfo); > > > > +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev, uint16_t > > queue_id, > > > > + struct rte_eth_recycle_rxq_info *recycle_rxq_info); > > > > int i40e_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, > > > >struct rte_eth_burst_mode *mode); int > > > > i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, > > > > diff -- git a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > new file mode 100644 > > > > index 00..5663ecccde > > > > --- /dev/null > > > > +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > @@ -0,0 +1,147 @@ > > > > +/* SPDX-License-Identifier: BSD-3-Clause > > > > + * Copyright (c) 2023 Arm Limited. > > > > + */ > > > > + > > > > +#include > > > > +#include > > > > + > > > > +#include "base/i40e_prototype.h" > > > > +#include "base/i40e_type.h" > > > > +#include "i40e_ethdev.h" > > > > +#include "i40e_rxtx.h" > > > > + > > > > +#pragma GCC diagnostic ignored "-Wcast-qual" > > > > + > > > > +void > > > > +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue, uint16_t > > > > +nb_mbufs) { > > > > + struct i40e_rx_queue *rxq = rx_queue; > > > > + struct i40e_rx_entry *rxep; > > > > + volatile union i40e_rx_desc *rxdp; > > > > + uint16_t rx_id; > > > > + uint64_t paddr; > > > > + uint64_t dma_addr; > > > > + uint16_t i; > > > > + > > > > + rxdp = rxq->rx_ring + rxq->rxrearm_start; > > > > + rxep = &rxq->sw_ring[rxq->rxrearm_start]; > > > > + > > > > + for (i = 0; i < nb_mbufs; i++) { > > > > + /* Initialize rxdp descs. 
*/ > > > > + paddr = (rxep[i].mbuf)->buf_iova + > > > > RTE_PKTMBUF_HEADROOM; > > > > + dma_addr = rte_cpu_to_le_64(paddr); > > > > + /* flush desc with pa dma_addr */ > > > > + rxdp[i].read.hdr_addr = 0; > > > > + rxdp[i].read.pkt_addr = dma_addr; > > > > + } > > > > + > > > > + /* Update the descriptor initializer index */ > > > > + rxq->rxrearm_start += nb_mbufs; > > > > + rx_id = rxq->rxrearm_start - 1; > > > > + > > > > + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { > > > > + rxq->rxrearm_start = 0; > > > > + rx_id = rxq->nb_rx_desc - 1; > > > > + } > > > > + > > > > + rxq->rxrearm_nb -= nb_mbufs; > > > > + > > > > + rte_io_wmb(); > > > > + /* Update the tail pointer on the NIC */ > > > > + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); } > > > > + > > > > +uint16_t > > > > +i40e_recycle_tx_mbufs_reuse_vec(void *tx_queue, > > > > + struct rte_eth_recycle_rxq_info *recy
RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
Hi Feifei, > > > > > > Define specific function implementation for i40e driver. > > > > > > Currently, mbufs recycle mode can support 128bit vector path and > > > > > > avx2 > > > > path. > > > > > > And can be enabled both in fast free and no fast free mode. > > > > > > > > > > > > Suggested-by: Honnappa Nagarahalli > > > > > > > > > > > > Signed-off-by: Feifei Wang > > > > > > Reviewed-by: Ruifeng Wang > > > > > > Reviewed-by: Honnappa Nagarahalli > > > > > > > > --- > > > > > > drivers/net/i40e/i40e_ethdev.c| 1 + > > > > > > drivers/net/i40e/i40e_ethdev.h| 2 + > > > > > > .../net/i40e/i40e_recycle_mbufs_vec_common.c | 147 > > > > > > ++ > > > > > > drivers/net/i40e/i40e_rxtx.c | 32 > > > > > > drivers/net/i40e/i40e_rxtx.h | 4 + > > > > > > drivers/net/i40e/meson.build | 1 + > > > > > > 6 files changed, 187 insertions(+) create mode 100644 > > > > > > drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c > > > > > > b/drivers/net/i40e/i40e_ethdev.c index 8271bbb394..50ba9aac94 > > > > > > 100644 > > > > > > --- a/drivers/net/i40e/i40e_ethdev.c > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.c > > > > > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops > > > > > > i40e_eth_dev_ops > > > > = { > > > > > > .flow_ops_get = i40e_dev_flow_ops_get, > > > > > > .rxq_info_get = i40e_rxq_info_get, > > > > > > .txq_info_get = i40e_txq_info_get, > > > > > > + .recycle_rxq_info_get = i40e_recycle_rxq_info_get, > > > > > > .rx_burst_mode_get= i40e_rx_burst_mode_get, > > > > > > .tx_burst_mode_get= i40e_tx_burst_mode_get, > > > > > > .timesync_enable = i40e_timesync_enable, > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.h > > > > > > b/drivers/net/i40e/i40e_ethdev.h index 6f65d5e0ac..af758798e1 > > > > > > 100644 > > > > > > --- a/drivers/net/i40e/i40e_ethdev.h > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.h > > > > > > @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct rte_eth_dev > > > > > > *dev, uint16_t queue_id, > > > > > > struct rte_eth_rxq_info *qinfo); void > > > > > > i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, > > > > > > struct rte_eth_txq_info *qinfo); > > > > > > +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev, > > > > > > +uint16_t > > > > queue_id, > > > > > > + struct rte_eth_recycle_rxq_info *recycle_rxq_info); > > > > > > int i40e_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t > > queue_id, > > > > > >struct rte_eth_burst_mode *mode); int > > > > > > i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t > > > > > > queue_id, diff -- git > > > > > > a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > new file mode 100644 > > > > > > index 00..5663ecccde > > > > > > --- /dev/null > > > > > > +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > @@ -0,0 +1,147 @@ > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause > > > > > > + * Copyright (c) 2023 Arm Limited. 
> > > > > > + */ > > > > > > + > > > > > > +#include > > > > > > +#include > > > > > > + > > > > > > +#include "base/i40e_prototype.h" > > > > > > +#include "base/i40e_type.h" > > > > > > +#include "i40e_ethdev.h" > > > > > > +#include "i40e_rxtx.h" > > > > > > + > > > > > > +#pragma GCC diagnostic ignored "-Wcast-qual" > > > > > > + > > > > > > +void > > > > > > +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue, uint16_t > > > > > > +nb_mbufs) { > > > > > > + struct i40e_rx_queue *rxq = rx_queue; > > > > > > + struct i40e_rx_entry *rxep; > > > > > > + volatile union i40e_rx_desc *rxdp; > > > > > > + uint16_t rx_id; > > > > > > + uint64_t paddr; > > > > > > + uint64_t dma_addr; > > > > > > + uint16_t i; > > > > > > + > > > > > > + rxdp = rxq->rx_ring + rxq->rxrearm_start; > > > > > > + rxep = &rxq->sw_ring[rxq->rxrearm_start]; > > > > > > + > > > > > > + for (i = 0; i < nb_mbufs; i++) { > > > > > > + /* Initialize rxdp descs. */ > > > > > > + paddr = (rxep[i].mbuf)->buf_iova + > > > > > > RTE_PKTMBUF_HEADROOM; > > > > > > + dma_addr = rte_cpu_to_le_64(paddr); > > > > > > + /* flush desc with pa dma_addr */ > > > > > > + rxdp[i].read.hdr_addr = 0; > > > > > > + rxdp[i].read.pkt_addr = dma_addr; > > > > > > + } > > > > > > + > > > > > > + /* Update the descriptor initializer index */ > > > > > > + rxq->rxrearm_start += nb_mbufs; > > > > > > + rx_id = rxq->rxrearm_start - 1; > > > > > > + > > > > > > + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { > > > > > > + rxq->rxrearm_start = 0; > > > > > > + rx_id = rxq->nb_rx_desc - 1; > > > > > > +
RE: [RFC] lib/st_ring: add single thread ring
> Add a single thread safe and multi-thread unsafe ring data structure. > This library provides an simple and efficient alternative to multi-thread > safe ring when multi-thread safety is not required. Just a thought: do we really need whole new library for that? >From what I understand all we need right now just one extra function: rte_ring_mt_unsafe_prod_deque(...) Sorry for ugly name :) To dequeue N elems from prod.tail. Or you think there would be some extra advantages in ST version of the ring: extra usages, better performance, etc.? > > Signed-off-by: Honnappa Nagarahalli > --- > v1: > 1) The code is very prelimnary and is not even compiled > 2) This is intended to show the APIs and some thoughts on implementation > 3) More APIs and the rest of the implementation will come in subsequent >versions > > lib/st_ring/rte_st_ring.h | 567 ++ > 1 file changed, 567 insertions(+) > create mode 100644 lib/st_ring/rte_st_ring.h > > diff --git a/lib/st_ring/rte_st_ring.h b/lib/st_ring/rte_st_ring.h > new file mode 100644 > index 00..8cb8832591 > --- /dev/null > +++ b/lib/st_ring/rte_st_ring.h > @@ -0,0 +1,567 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2023 Arm Limited > + */ > + > +#ifndef _RTE_ST_RING_H_ > +#define _RTE_ST_RING_H_ > + > +/** > + * @file > + * RTE Signle Thread Ring (ST Ring) > + * > + * The ST Ring is a fixed-size queue intended to be accessed > + * by one thread at a time. It does not provide concurrent access to > + * multiple threads. If there are multiple threads accessing the ST ring, > + * then the threads have to use locks to protect the ring from > + * getting corrupted. > + * > + * - FIFO (First In First Out) > + * - Maximum size is fixed; the pointers are stored in a table. > + * - Consumer and producer part of same thread. > + * - Multi-thread producers and consumers need locking. > + * - Single/Bulk/burst dequeue at Tail or Head > + * - Single/Bulk/burst enqueue at Head or Tail > + * > + */ > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +#include > +#include > + > +/** > + * Calculate the memory size needed for a ST ring > + * > + * This function returns the number of bytes needed for a ST ring, given > + * the number of elements in it. This value is the sum of the size of > + * the structure rte_st_ring and the size of the memory needed by the > + * elements. The value is aligned to a cache line size. > + * > + * @param count > + * The number of elements in the ring (must be a power of 2). > + * @return > + * - The memory size needed for the ST ring on success. > + * - -EINVAL if count is not a power of 2. > + */ > +ssize_t rte_st_ring_get_memsize(unsigned int count); > + > +/** > + * Initialize a ST ring structure. > + * > + * Initialize a ST ring structure in memory pointed by "r". The size of the > + * memory area must be large enough to store the ring structure and the > + * object table. It is advised to use rte_st_ring_get_memsize() to get the > + * appropriate size. > + * > + * The ST ring size is set to *count*, which must be a power of two. > + * The real usable ring size is *count-1* instead of *count* to > + * differentiate a full ring from an empty ring. > + * > + * The ring is not added in RTE_TAILQ_ST_RING global list. Indeed, the > + * memory given by the caller may not be shareable among dpdk > + * processes. > + * > + * @param r > + * The pointer to the ring structure followed by the elements table. > + * @param name > + * The name of the ring. 
> + * @param count > + * The number of elements in the ring (must be a power of 2, > + * unless RTE_ST_RING_F_EXACT_SZ is set in flags). > + * @param flags > + * An OR of the following: > + * - RTE_ST_RING_F_EXACT_SZ: If this flag is set, the ring will hold > + * exactly the requested number of entries, and the requested size > + * will be rounded up to the next power of two, but the usable space > + * will be exactly that requested. Worst case, if a power-of-2 size is > + * requested, half the ring space will be wasted. > + * Without this flag set, the ring size requested must be a power of 2, > + * and the usable space will be that size - 1. > + * @return > + * 0 on success, or a negative value on error. > + */ > +int rte_st_ring_init(struct rte_st_ring *r, const char *name, > + unsigned int count, unsigned int flags); > + > +/** > + * Create a new ST ring named *name* in memory. > + * > + * This function uses ``memzone_reserve()`` to allocate memory. Then it > + * calls rte_st_ring_init() to initialize an empty ring. > + * > + * The new ring size is set to *count*, which must be a power of two. > + * The real usable ring size is *count-1* instead of *count* to > + * differentiate a full ring from an empty ring. > + * > + * The ring is added in RTE_TAILQ_ST_RING list. > + * > + * @param name > + * The name of the ring. > + * @param count > + * The size of the ring (must be a pow
RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
> > > > > > > > Define specific function implementation for i40e driver. > > > > > > > > Currently, mbufs recycle mode can support 128bit vector path > > > > > > > > and > > > > > > > > avx2 > > > > > > path. > > > > > > > > And can be enabled both in fast free and no fast free mode. > > > > > > > > > > > > > > > > Suggested-by: Honnappa Nagarahalli > > > > > > > > > > > > > > > > Signed-off-by: Feifei Wang > > > > > > > > Reviewed-by: Ruifeng Wang > > > > > > > > Reviewed-by: Honnappa Nagarahalli > > > > > > > > > > > > --- > > > > > > > > drivers/net/i40e/i40e_ethdev.c| 1 + > > > > > > > > drivers/net/i40e/i40e_ethdev.h| 2 + > > > > > > > > .../net/i40e/i40e_recycle_mbufs_vec_common.c | 147 > > > > > > > > ++ > > > > > > > > drivers/net/i40e/i40e_rxtx.c | 32 > > > > > > > > drivers/net/i40e/i40e_rxtx.h | 4 + > > > > > > > > drivers/net/i40e/meson.build | 1 + > > > > > > > > 6 files changed, 187 insertions(+) create mode 100644 > > > > > > > > drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > > > > > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c > > > > > > > > b/drivers/net/i40e/i40e_ethdev.c index > > > > > > > > 8271bbb394..50ba9aac94 > > > > > > > > 100644 > > > > > > > > --- a/drivers/net/i40e/i40e_ethdev.c > > > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.c > > > > > > > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops > > > > > > > > i40e_eth_dev_ops > > > > > > = { > > > > > > > > .flow_ops_get = i40e_dev_flow_ops_get, > > > > > > > > .rxq_info_get = i40e_rxq_info_get, > > > > > > > > .txq_info_get = i40e_txq_info_get, > > > > > > > > + .recycle_rxq_info_get = > > > > > > > > i40e_recycle_rxq_info_get, > > > > > > > > .rx_burst_mode_get= i40e_rx_burst_mode_get, > > > > > > > > .tx_burst_mode_get= i40e_tx_burst_mode_get, > > > > > > > > .timesync_enable = i40e_timesync_enable, > > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.h > > > > > > > > b/drivers/net/i40e/i40e_ethdev.h index > > > > > > > > 6f65d5e0ac..af758798e1 > > > > > > > > 100644 > > > > > > > > --- a/drivers/net/i40e/i40e_ethdev.h > > > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.h > > > > > > > > @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct > > > > > > > > rte_eth_dev *dev, uint16_t queue_id, > > > > > > > > struct rte_eth_rxq_info *qinfo); void > > > > > > > > i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, > > > > > > > > struct rte_eth_txq_info *qinfo); > > > > > > > > +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev, > > > > > > > > +uint16_t > > > > > > queue_id, > > > > > > > > + struct rte_eth_recycle_rxq_info *recycle_rxq_info); > > > > > > > > int i40e_rx_burst_mode_get(struct rte_eth_dev *dev, > > > > > > > > uint16_t > > > > queue_id, > > > > > > > >struct rte_eth_burst_mode *mode); > > > > > > > > int > > > > > > > > i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t > > > > > > > > queue_id, diff -- git > > > > > > > > a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > > > b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > > > new file mode 100644 > > > > > > > > index 00..5663ecccde > > > > > > > > --- /dev/null > > > > > > > > +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > > > @@ -0,0 +1,147 @@ > > > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause > > > > > > > > + * Copyright (c) 2023 Arm Limited. 
> > > > > > > > + */ > > > > > > > > + > > > > > > > > +#include > > > > > > > > +#include > > > > > > > > + > > > > > > > > +#include "base/i40e_prototype.h" > > > > > > > > +#include "base/i40e_type.h" > > > > > > > > +#include "i40e_ethdev.h" > > > > > > > > +#include "i40e_rxtx.h" > > > > > > > > + > > > > > > > > +#pragma GCC diagnostic ignored "-Wcast-qual" > > > > > > > > + > > > > > > > > +void > > > > > > > > +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue, > > > > > > > > +uint16_t > > > > > > > > +nb_mbufs) { > > > > > > > > + struct i40e_rx_queue *rxq = rx_queue; > > > > > > > > + struct i40e_rx_entry *rxep; > > > > > > > > + volatile union i40e_rx_desc *rxdp; > > > > > > > > + uint16_t rx_id; > > > > > > > > + uint64_t paddr; > > > > > > > > + uint64_t dma_addr; > > > > > > > > + uint16_t i; > > > > > > > > + > > > > > > > > + rxdp = rxq->rx_ring + rxq->rxrearm_start; > > > > > > > > + rxep = &rxq->sw_ring[rxq->rxrearm_start]; > > > > > > > > + > > > > > > > > + for (i = 0; i < nb_mbufs; i++) { > > > > > > > > + /* Initialize rxdp descs. */ > > > > > > > > + paddr = (rxep[i].mbuf)->buf_iova + > > > > > > > > RTE_PKTMBUF_HEADROOM; > > > > > > > > + dma_add
RE: [RFC] lib/st_ring: add single thread ring
> > > Add a single thread safe and multi-thread unsafe ring data structure. > > > This library provides an simple and efficient alternative to > > > multi-thread safe ring when multi-thread safety is not required. > > > > Just a thought: do we really need whole new library for that? > > From what I understand all we need right now just one extra function: > > rte_ring_mt_unsafe_prod_deque(...) > > Sorry for ugly name :) > > To dequeue N elems from prod.tail. > > Or you think there would be some extra advantages in ST version of the ring: > > extra usages, better performance, etc.? > There are multiple implementations of the ST ring being used in other parts > of DPDK. Mattias Ronnblom pointed out some (distributed > scheduler, eth RX adapter, cmdline) [1] existing ones which will be replaced > by this one. > This implementation will not use atomic instructions, head and tail indices > will be in the same cache line and it will be a double ended > queue. So, I am expecting better perf and more use cases (some might not be > applicable currently). Yep, I do understand that we can skip sync logic for ST case. Ok, if we do have multiple use-cases it might be plausible to have a separate API for it. > > [1] https://mails.dpdk.org/archives/dev/2023-August/275003.html > > > > > > > > > Signed-off-by: Honnappa Nagarahalli > > > --- > > > v1: > > > 1) The code is very prelimnary and is not even compiled > > > 2) This is intended to show the APIs and some thoughts on > > > implementation > > > 3) More APIs and the rest of the implementation will come in subsequent > > >versions > > > > > > lib/st_ring/rte_st_ring.h | 567 > > > ++ > > > 1 file changed, 567 insertions(+) > > > create mode 100644 lib/st_ring/rte_st_ring.h > > > > > > diff --git a/lib/st_ring/rte_st_ring.h b/lib/st_ring/rte_st_ring.h new > > > file mode 100644 index 00..8cb8832591 > > > --- /dev/null > > > +++ b/lib/st_ring/rte_st_ring.h > > > @@ -0,0 +1,567 @@ > > > +/* SPDX-License-Identifier: BSD-3-Clause > > > + * Copyright(c) 2023 Arm Limited > > > + */ > > > + > > > +#ifndef _RTE_ST_RING_H_ > > > +#define _RTE_ST_RING_H_ > > > + > > > +/** > > > + * @file > > > + * RTE Signle Thread Ring (ST Ring) > > > + * > > > + * The ST Ring is a fixed-size queue intended to be accessed > > > + * by one thread at a time. It does not provide concurrent access to > > > + * multiple threads. If there are multiple threads accessing the ST > > > +ring, > > > + * then the threads have to use locks to protect the ring from > > > + * getting corrupted. > > > + * > > > + * - FIFO (First In First Out) > > > + * - Maximum size is fixed; the pointers are stored in a table. > > > + * - Consumer and producer part of same thread. > > > + * - Multi-thread producers and consumers need locking. > > > + * - Single/Bulk/burst dequeue at Tail or Head > > > + * - Single/Bulk/burst enqueue at Head or Tail > > > + * > > > + */ > > > + > > > +#ifdef __cplusplus > > > +extern "C" { > > > +#endif > > > + > > > +#include > > > +#include > > > + > > > +/** > > > + * Calculate the memory size needed for a ST ring > > > + * > > > + * This function returns the number of bytes needed for a ST ring, > > > +given > > > + * the number of elements in it. This value is the sum of the size of > > > + * the structure rte_st_ring and the size of the memory needed by the > > > + * elements. The value is aligned to a cache line size. > > > + * > > > + * @param count > > > + * The number of elements in the ring (must be a power of 2). 
> > > + * @return > > > + * - The memory size needed for the ST ring on success. > > > + * - -EINVAL if count is not a power of 2. > > > + */ > > > +ssize_t rte_st_ring_get_memsize(unsigned int count); > > > + > > > +/** > > > + * Initialize a ST ring structure. > > > + * > > > + * Initialize a ST ring structure in memory pointed by "r". The size > > > +of the > > > + * memory area must be large enough to store the ring structure and > > > +the > > > + * object table. It is advised to use rte_st_ring_get_memsize() to > > > +get the > > > + * appropriate size. > > > + * > > > + * The ST ring size is set to *count*, which must be a power of two. > > > + * The real usable ring size is *count-1* instead of *count* to > > > + * differentiate a full ring from an empty ring. > > > + * > > > + * The ring is not added in RTE_TAILQ_ST_RING global list. Indeed, > > > +the > > > + * memory given by the caller may not be shareable among dpdk > > > + * processes. > > > + * > > > + * @param r > > > + * The pointer to the ring structure followed by the elements table. > > > + * @param name > > > + * The name of the ring. > > > + * @param count > > > + * The number of elements in the ring (must be a power of 2, > > > + * unless RTE_ST_RING_F_EXACT_SZ is set in flags). > > > + * @param flags > > > + * An OR of the following: > > > + * - RTE_ST_RING_F_EXACT_SZ: If this flag is set, the ring will hold > > > +
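For readers skimming the thread, a minimal sketch of the kind of data structure being proposed is shown below. The names, layout and error codes are illustrative assumptions only, not the proposed rte_st_ring API; the point is simply that with a single accessing thread the head/tail updates need no atomics, and one slot is reserved to tell a full ring from an empty one.

#include <errno.h>
#include <stdint.h>

/* Illustrative single-thread ring; NOT the proposed rte_st_ring API. */
struct st_ring {
	uint32_t head;  /* next slot to enqueue at */
	uint32_t tail;  /* next slot to dequeue from */
	uint32_t mask;  /* count - 1, count must be a power of two */
	void *elems[];  /* element table follows the header */
};

static inline int
st_ring_enqueue(struct st_ring *r, void *obj)
{
	uint32_t next = (r->head + 1) & r->mask;

	if (next == r->tail)
		return -ENOBUFS;  /* full: usable size is count - 1 */
	r->elems[r->head] = obj;
	r->head = next;           /* plain store - single-thread access only */
	return 0;
}

static inline int
st_ring_dequeue(struct st_ring *r, void **obj)
{
	if (r->tail == r->head)
		return -ENOENT;   /* empty */
	*obj = r->elems[r->tail];
	r->tail = (r->tail + 1) & r->mask;
	return 0;
}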
Re: [RFC] random: use per lcore state
06/09/2023 21:02, Mattias Rönnblom пишет: On 2023-09-06 19:20, Stephen Hemminger wrote: Move the random number state into thread local storage. Me and Morten discussed TLS versus other alternatives in some other thread. The downside of TLS that Morten pointed out, from what I recall, is that lazy initialization is *required* (since the number of threads is open-ended), and the data ends up in non-huge page memory. Hmm.. correct me if I am wrong, but with current implementation, rand state is also in non-huge memory: static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1]; It was also unclear to me what the memory footprint implications would be,h would large per-lcore data structures be put in TLS. More specifically, if they would be duplicated across all threads, even non-lcore threads. None of these issues affect rte_random.c's potential usage of TLS (except lazy [re-]initialization makes things more complicated). Preferably, there should be one pattern that is usable across all or at least most DPDK modules requiring per-lcore state. This has a several benefits. - no false cache sharing from cpu prefetching - fixes initialization of random state for non-DPDK threads This seems like a non-reason to me. That bug is easily fixed, if it isn't already. - fixes unsafe usage of random state by non-DPDK threads "Makes random number generation MT safe from all threads (including unregistered non-EAL threads)." With current API semantics you may still register an non-EAL thread, to get MT safe access to this API, so I guess it's more about being more convenient and less error prone, than anything else. I understand that we never guaranteed MT safety for non-EAL threads here, but as a user of rte_rand() - it would be much more convenient, if I can use it from any thread wthout worring is it a EAL thread or not. About TlS usage and re-seeding - can we use some sort of middle-ground: extend rte_rand_state with some gen-counter. Make a 'master' copy of rte_rand_state that will be updated by rte_srand(), and TLS copies of rte_rand_state, so rte_rand() can fist compare its gen-counter value with master copy to decide, does it need to copy new state from master or not. The new MT safety guarantees should be in the API docs as well. Yes, it is an extension to the current API, not a fix. The initialization of random number state is done by the lcore (lazy initialization). Signed-off-by: Stephen Hemminger --- lib/eal/common/rte_random.c | 38 +++-- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c index 53636331a27b..9657adf6ad3b 100644 --- a/lib/eal/common/rte_random.c +++ b/lib/eal/common/rte_random.c @@ -19,13 +19,14 @@ struct rte_rand_state { uint64_t z3; uint64_t z4; uint64_t z5; -} __rte_cache_aligned; + uint64_t seed; +}; -/* One instance each for every lcore id-equipped thread, and one - * additional instance to be shared by all others threads (i.e., all - * unregistered non-EAL threads). - */ -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1]; +/* Global random seed */ +static uint64_t rte_rand_seed; + +/* Per lcore random state. 
*/ +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state); static uint32_t __rte_rand_lcg32(uint32_t *seed) @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct rte_rand_state *state) void rte_srand(uint64_t seed) { - unsigned int lcore_id; - - /* add lcore_id to seed to avoid having the same sequence */ - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) - __rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]); + __atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED); } static __rte_always_inline uint64_t @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state *state) static __rte_always_inline struct rte_rand_state *__rte_rand_get_state(void) { - unsigned int idx; + struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state); There should really be a RTE_PER_THREAD, an alias to RTE_PER_LCORE, to cover this usage. Or just use __thread (or _Thread_local?). + uint64_t seed; - idx = rte_lcore_id(); + seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED); + if (unlikely(seed != rand_state->seed)) { + rand_state->seed = seed; Re-seeding should restart the series, on all lcores. There's nothing preventing the user from re-seeding the machinery repeatedly, with the same seed. Seems like an unusual, but still valid, use case, if you run repeated tests of some sort. Use a seqlock? :) I guess you need a seed generation number as well (e.g., is this the first time you seed with X, or the second one, etc.) - /* last instance reserved for unregistered non-EAL threads */ - if (unlikely(idx == LCORE_ID_ANY)) - idx = RTE_MAX_LCORE; + seed += rte_thread
Re: [RFC] random: use per lcore state
09/09/2023 07:45, Mattias Rönnblom пишет: On 2023-09-09 02:13, Konstantin Ananyev wrote: 06/09/2023 21:02, Mattias Rönnblom пишет: On 2023-09-06 19:20, Stephen Hemminger wrote: Move the random number state into thread local storage. Me and Morten discussed TLS versus other alternatives in some other thread. The downside of TLS that Morten pointed out, from what I recall, is that lazy initialization is *required* (since the number of threads is open-ended), and the data ends up in non-huge page memory. Hmm.. correct me if I am wrong, but with current implementation, rand state is also in non-huge memory: static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1]; Yes. The current pattern is certainly not perfect. It was also unclear to me what the memory footprint implications would be,h would large per-lcore data structures be put in TLS. More specifically, if they would be duplicated across all threads, even non-lcore threads. None of these issues affect rte_random.c's potential usage of TLS (except lazy [re-]initialization makes things more complicated). Preferably, there should be one pattern that is usable across all or at least most DPDK modules requiring per-lcore state. This has a several benefits. - no false cache sharing from cpu prefetching - fixes initialization of random state for non-DPDK threads This seems like a non-reason to me. That bug is easily fixed, if it isn't already. - fixes unsafe usage of random state by non-DPDK threads "Makes random number generation MT safe from all threads (including unregistered non-EAL threads)." With current API semantics you may still register an non-EAL thread, to get MT safe access to this API, so I guess it's more about being more convenient and less error prone, than anything else. I understand that we never guaranteed MT safety for non-EAL threads here, Registered non-EAL threads have a lcore id and thus may safely call rte_rand(). I am aware about such ability, but for me register/unregister thread just to call rte_rand() seems like way too much hassle. Multiple unregistered non-EAL threads may not do so, in parallel. but as a user of rte_rand() - it would be much more convenient, if I can use it from any thread wthout worring is it a EAL thread or not. Sure, especially if it comes for free. The for-free solution has yet to reveal itself though. About TlS usage and re-seeding - can we use some sort of middle-ground: extend rte_rand_state with some gen-counter. Make a 'master' copy of rte_rand_state that will be updated by rte_srand(), and TLS copies of rte_rand_state, so rte_rand() can fist compare its gen-counter value with master copy to decide, does it need to copy new state from master or not. Calling threads shouldn't all produce the same sequence. That would be silly and not very random. The generation number should be tied to the seed. Actually, yes you right, probably we don't need a master copy of rte_rand_state itself. It seems that just having a 'master' copy of 'seed' value, plus some counter (to indicate that seed has been updated) is enough here. The new MT safety guarantees should be in the API docs as well. Yes, it is an extension to the current API, not a fix. The initialization of random number state is done by the lcore (lazy initialization). 
Signed-off-by: Stephen Hemminger --- lib/eal/common/rte_random.c | 38 +++-- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c index 53636331a27b..9657adf6ad3b 100644 --- a/lib/eal/common/rte_random.c +++ b/lib/eal/common/rte_random.c @@ -19,13 +19,14 @@ struct rte_rand_state { uint64_t z3; uint64_t z4; uint64_t z5; -} __rte_cache_aligned; + uint64_t seed; +}; -/* One instance each for every lcore id-equipped thread, and one - * additional instance to be shared by all others threads (i.e., all - * unregistered non-EAL threads). - */ -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1]; +/* Global random seed */ +static uint64_t rte_rand_seed; + +/* Per lcore random state. */ +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state); static uint32_t __rte_rand_lcg32(uint32_t *seed) @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct rte_rand_state *state) void rte_srand(uint64_t seed) { - unsigned int lcore_id; - - /* add lcore_id to seed to avoid having the same sequence */ - for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) - __rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]); + __atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED); } static __rte_always_inline uint64_t @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state *state) static __rte_always_inline struct rte_rand_state *__rte_rand_get_s
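A minimal sketch of the "global seed plus per-thread lazy re-seed" scheme discussed above might look as follows. It is purely illustrative: the toy xorshift generator, the C11 thread_local storage and all names are assumptions, not the actual rte_random.c implementation (which uses LFSR258 and DPDK's per-lcore and atomic facilities).

#include <stdatomic.h>
#include <stdint.h>
#include <threads.h>

struct rand_state {
	uint64_t x;    /* toy PRNG state; stand-in for the LFSR258 words */
	uint64_t seed; /* seed this state was derived from */
};

static _Atomic uint64_t global_seed = 42;        /* written by the srand() equivalent */
static thread_local struct rand_state tls_state; /* lazily (re)seeded per thread */

static void
toy_srand(uint64_t seed) /* rte_srand() equivalent: just publish the new seed */
{
	atomic_store_explicit(&global_seed, seed, memory_order_relaxed);
}

static uint64_t
toy_rand(void) /* rte_rand() equivalent */
{
	uint64_t seed = atomic_load_explicit(&global_seed, memory_order_relaxed);

	/* Lazy re-seed: rebuild the local state whenever the published seed
	 * differs from the one this thread's state was derived from. A real
	 * version would also mix in a per-thread id so sequences differ. */
	if (tls_state.seed != seed) {
		tls_state.seed = seed;
		tls_state.x = seed ^ 0x9e3779b97f4a7c15ULL;
		if (tls_state.x == 0)
			tls_state.x = 1; /* xorshift state must be non-zero */
	}

	/* xorshift64 step - placeholder for the real LFSR258 update */
	tls_state.x ^= tls_state.x << 13;
	tls_state.x ^= tls_state.x >> 7;
	tls_state.x ^= tls_state.x << 17;
	return tls_state.x;
}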
Re: [PATCH v3 00/11] rework thread management
| 2 +- lib/eal/linux/eal_alarm.c | 1 + lib/eal/linux/eal_interrupts.c| 10 +- lib/eal/linux/eal_thread.c| 18 +-- lib/eal/linux/eal_timer.c | 11 +- lib/eal/unix/rte_thread.c | 2 +- lib/eal/version.map | 44 +++--- lib/eal/windows/eal.c | 2 +- lib/eal/windows/eal_interrupts.c | 2 +- lib/eal/windows/eal_thread.c | 8 -- lib/eal/windows/rte_thread.c | 2 +- lib/ethdev/ethdev_driver.c| 1 + lib/ethdev/ethdev_driver.h| 2 + lib/ethdev/rte_ethdev_core.h | 2 - lib/ethdev/rte_flow.c | 1 + lib/eventdev/rte_event_eth_rx_adapter.c | 24 ++-- lib/vhost/fd_man.c| 6 +- lib/vhost/fd_man.h| 2 +- lib/vhost/socket.c| 23 ++- lib/vhost/vduse.c | 5 +- lib/vhost/vhost.c | 1 + 81 files changed, 466 insertions(+), 632 deletions(-) Series-acked-by: Konstantin Ananyev
Minutes of Technical Board Meeting, 2023-Sep-06
Minutes of Technical Board Meeting, 2023-September-06

Members Attending
-----------------
- Aaron
- Bruce
- Hemant
- Jerin
- Kevin
- Konstantin (Chair)
- Maxime
- Stephen
- Thomas

NOTE: The technical board meets every second Wednesday at https://meet.jit.si/DPDK at 3 pm UTC. Meetings are public, and DPDK community members are welcome to attend.
NOTE: Next meeting will be on Wednesday 2023-September-20 @3pm UTC, and will be chaired by Maxime.

1) Aaron: Intel test lab update
- Intel test lab is up and running.
- To avoid such shortcomings in future, the plan is to duplicate Intel-Lab specific test-cases in the UNH test lab. Right now it is at the stage of identifying such test-cases and the required HW.

2) Jerin: Policy for trace point addition is automated via devtools/checkpatches.sh and proposed in: https://patches.dpdk.org/project/dpdk/patch/20230307120514.2774917-2-adwiv...@marvell.com/
- Main idea: libraries that are already instrumented with trace-points (cryptodev, ethdev, eventdev and mempool) will require a trace-point to be present in their new public API functions.
- General ACK from the TB members.

3) Thomas: Call for reviews for the patch-series: http://patchwork.dpdk.org/project/dpdk/list/?series=29438
Main changes:
- Promote the DPDK thread API to stable (helps MSVC integration).
- Reduce the usage of pthread functions inside DPDK code (pthread_cancel and pthread locks still remain for now).
- Remove the 'attributes' parameter from the rte_thread_create_control() function - a public API change.

4) Ben: quick update on marketing activity
- DPDK August newsletter is out: https://www.dpdk.org/monthly-newsletter-august-2023/?utm_content=263403350&utm_medium=social&utm_source=linkedin&hss_channel=lcp-33275771
- Update on recent DPDK site activities and the expected number of attendees for the DPDK user-space event.
Re: [PATCH v2 01/29] bpf: make rte_bpf_dump and rte_bpf_convert stable API's
These two API's were introduced in 23.11 and can now be made not experimental. Signed-off-by: Stephen Hemminger --- lib/bpf/rte_bpf.h | 2 -- lib/bpf/version.map | 9 ++--- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/lib/bpf/rte_bpf.h b/lib/bpf/rte_bpf.h index 4d71120dbd9d..f70d8dacd0d3 100644 --- a/lib/bpf/rte_bpf.h +++ b/lib/bpf/rte_bpf.h @@ -209,7 +209,6 @@ rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit); * @param len * Number of BPF instructions to dump. */ -__rte_experimental void rte_bpf_dump(FILE *f, const struct ebpf_insn *buf, uint32_t len); @@ -229,7 +228,6 @@ struct bpf_program; * - ENOMEM - can't reserve enough memory * - ENOTSUP - operation not supported */ -__rte_experimental struct rte_bpf_prm * rte_bpf_convert(const struct bpf_program *prog); diff --git a/lib/bpf/version.map b/lib/bpf/version.map index c49bf1701f0a..2e957494e9df 100644 --- a/lib/bpf/version.map +++ b/lib/bpf/version.map @@ -1,7 +1,9 @@ DPDK_24 { global: + rte_bpf_convert; rte_bpf_destroy; + rte_bpf_dump; rte_bpf_elf_load; rte_bpf_eth_rx_elf_load; rte_bpf_eth_rx_unload; @@ -14,10 +16,3 @@ DPDK_24 { local: *; }; - -EXPERIMENTAL { - global: - - rte_bpf_convert; - rte_bpf_dump; -}; -- Acked-by: Konstantin Ananyev 2.39.2
Re: [PATCH v2 23/29] ipsec: remove experimental from SA API
These API's were added in 21.11, remove experimental flag. Signed-off-by: Stephen Hemminger --- lib/ipsec/rte_ipsec.h | 2 -- lib/ipsec/version.map | 9 + 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/lib/ipsec/rte_ipsec.h b/lib/ipsec/rte_ipsec.h index 04129926b69f..81624f909192 100644 --- a/lib/ipsec/rte_ipsec.h +++ b/lib/ipsec/rte_ipsec.h @@ -168,7 +168,6 @@ rte_ipsec_pkt_process(const struct rte_ipsec_session *ss, struct rte_mbuf *mb[], * @return * 0 on success, negative value otherwise. */ -__rte_experimental int rte_ipsec_telemetry_sa_add(const struct rte_ipsec_sa *sa); @@ -178,7 +177,6 @@ rte_ipsec_telemetry_sa_add(const struct rte_ipsec_sa *sa); * @param sa * Pointer to the *rte_ipsec_sa* object that will have telemetry disabled. */ -__rte_experimental void rte_ipsec_telemetry_sa_del(const struct rte_ipsec_sa *sa); diff --git a/lib/ipsec/version.map b/lib/ipsec/version.map index f0063af354f0..9d01ebeadc4c 100644 --- a/lib/ipsec/version.map +++ b/lib/ipsec/version.map @@ -16,15 +16,8 @@ DPDK_24 { rte_ipsec_sad_lookup; rte_ipsec_ses_from_crypto; rte_ipsec_session_prepare; - - local: *; -}; - -EXPERIMENTAL { - global: - - # added in 21.11 rte_ipsec_telemetry_sa_add; rte_ipsec_telemetry_sa_del; + local: *; }; -- Acked-by: Konstantin Ananyev 2.39.2
Re: [PATCH v2 18/29] ip_frag: mark a couple of functions stable
There were two functions added in 22.11 which were marked as experimental. Remove the experimental tag. Signed-off-by: Stephen Hemminger --- lib/ip_frag/rte_ip_frag.h | 2 -- lib/ip_frag/version.map | 9 ++--- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/lib/ip_frag/rte_ip_frag.h b/lib/ip_frag/rte_ip_frag.h index feab72ae64b9..cd3a3e143ee7 100644 --- a/lib/ip_frag/rte_ip_frag.h +++ b/lib/ip_frag/rte_ip_frag.h @@ -205,7 +205,6 @@ int32_t rte_ipv4_fragment_packet(struct rte_mbuf *pkt_in, * in the pkts_out array. * Otherwise - (-1) * errno. */ -__rte_experimental int32_t rte_ipv4_fragment_copy_nonseg_packet(struct rte_mbuf *pkt_in, struct rte_mbuf **pkts_out, @@ -289,7 +288,6 @@ rte_ip_frag_table_statistics_dump(FILE * f, const struct rte_ip_frag_tbl *tbl); * @param tms * Current timestamp */ -__rte_experimental void rte_ip_frag_table_del_expired_entries(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr, uint64_t tms); diff --git a/lib/ip_frag/version.map b/lib/ip_frag/version.map index 7ba446c9938b..3e7e573dc412 100644 --- a/lib/ip_frag/version.map +++ b/lib/ip_frag/version.map @@ -3,19 +3,14 @@ DPDK_24 { rte_ip_frag_free_death_row; rte_ip_frag_table_create; + rte_ip_frag_table_del_expired_entries; rte_ip_frag_table_destroy; rte_ip_frag_table_statistics_dump; rte_ipv4_frag_reassemble_packet; + rte_ipv4_fragment_copy_nonseg_packet; rte_ipv4_fragment_packet; rte_ipv6_frag_reassemble_packet; rte_ipv6_fragment_packet; local: *; }; - -EXPERIMENTAL { - global: - - rte_ip_frag_table_del_expired_entries; - rte_ipv4_fragment_copy_nonseg_packet; -}; -- Acked-by: Konstantin Ananyev 2.39.2
Re: [PATCH v1] examples/l3fwd: relax the RSS/Offload requirement
03/09/2023 05:01, Trevor Tao пишет: Now the port Rx mq_mode had been set to RTE_ETH_MQ_RX_RSS, and offload mode set to RTE_ETH_RX_OFFLOAD_CHECKSUM by default, but some hardware and/or virtual interface does not support the RSS and offload mode presupposed, e.g., some virtio interfaces in the cloud don't support RSS and may only partly support RTE_ETH_RX_OFFLOAD_UDP_CKSUM/ RTE_ETH_RX_OFFLOAD_TCP_CKSUM, but not RTE_ETH_RX_OFFLOAD_IPV4_CKSUM, and the error msg here: virtio_dev_configure(): RSS support requested but not supported by the device Port0 dev_configure = -95 and: Ethdev port_id=0 requested Rx offloads 0xe does not match Rx offloads capabilities 0x201d in rte_eth_dev_configure() So to enable the l3fwd running in that environment, the Rx mode requirement can be relaxed to reflect the hardware feature reality here, and the l3fwd can run smoothly then. A warning msg would be provided to user in case it happens here. On the other side, enabling the software cksum check in case the hw support missing. Fixes: af75078fece3 ("first public release") Cc: sta...@dpdk.org I don't think there was abug here. We are talking about changing current requirements for the app. So not sure it is a real fix and that such change can be propagated to stable releases. Signed-off-by: Trevor Tao --- examples/l3fwd/l3fwd.h | 12 +++- examples/l3fwd/main.c | 21 +++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index b55855c932..cc10643c4b 100644 --- a/examples/l3fwd/l3fwd.h +++ b/examples/l3fwd/l3fwd.h @@ -115,6 +115,8 @@ extern struct acl_algorithms acl_alg[]; extern uint32_t max_pkt_len; +extern struct rte_eth_conf port_conf; + /* Send burst of packets on an output interface */ static inline int send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port) @@ -170,7 +172,15 @@ is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) return -1; /* 2. The IP checksum must be correct. */ - /* this is checked in H/W */ + /* if this is not checked in H/W, check it. */ + if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) { Might be better to check particular mbuf flag: if ((mbuf->ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK) == TE_MBUF_F_RX_IP_CKSUM_UNKNOWN) {...} + uint16_t actual_cksum, expected_cksum; + actual_cksum = pkt->hdr_checksum; + pkt->hdr_checksum = 0; + expected_cksum = rte_ipv4_cksum(pkt); + if (actual_cksum != expected_cksum) + return -2; + } /* * 3. The IP version number must be 4. 
If the version number is not 4 diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 6063eb1399..37aec64718 100644 --- a/examples/l3fwd/main.c +++ b/examples/l3fwd/main.c @@ -117,7 +117,7 @@ static struct lcore_params * lcore_params = lcore_params_array_default; static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) / sizeof(lcore_params_array_default[0]); -static struct rte_eth_conf port_conf = { +struct rte_eth_conf port_conf = { .rxmode = { .mq_mode = RTE_ETH_MQ_RX_RSS, .offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM, @@ -1257,8 +1257,12 @@ l3fwd_poll_resource_setup(void) local_port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads; - if (dev_info.max_rx_queues == 1) + /* relax the rx rss requirement */ + if (dev_info.max_rx_queues == 1 || !local_port_conf.rx_adv_conf.rss_conf.rss_hf) { + printf("warning: modified the rx mq_mode to RTE_ETH_MQ_RX_NONE base on" + " device capability\n"); local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE; Should we probably instead have a new commnad-line option to explicitly disable RSS? Something like: '--no-rss' or so? + } if (local_port_conf.rx_adv_conf.rss_conf.rss_hf != port_conf.rx_adv_conf.rss_conf.rss_hf) { @@ -1269,6 +1273,19 @@ l3fwd_poll_resource_setup(void) local_port_conf.rx_adv_conf.rss_conf.rss_hf); } + /* relax the rx offload requirement */ + if ((local_port_conf.rxmode.offloads & dev_info.rx_offload_capa) != + local_port_conf.rxmode.offloads) { + printf("Port %u requested Rx offloads 0x%"PRIx64" does not" + " match Rx offloads capabilities 0x%"PRIx64"\n", + portid, local_port_conf.rxmode.offloads, + dev_info.rx_offload_capa); + local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa; + port_conf.rxmode.offloads = loc
Re:Re: [PATCH v1] examples/l3fwd: relax the RSS/Offload requirement
Hi Trevor, At 2023-09-18 02:04:19, "Konstantin Ananyev" wrote: 03/09/2023 05:01, Trevor Tao пишет: Now the port Rx mq_mode had been set to RTE_ETH_MQ_RX_RSS, and offload mode set to RTE_ETH_RX_OFFLOAD_CHECKSUM by default, but some hardware and/or virtual interface does not support the RSS and offload mode presupposed, e.g., some virtio interfaces in the cloud don't support RSS and may only partly support RTE_ETH_RX_OFFLOAD_UDP_CKSUM/ RTE_ETH_RX_OFFLOAD_TCP_CKSUM, but not RTE_ETH_RX_OFFLOAD_IPV4_CKSUM, and the error msg here: virtio_dev_configure(): RSS support requested but not supported by the device Port0 dev_configure = -95 and: Ethdev port_id=0 requested Rx offloads 0xe does not match Rx offloads capabilities 0x201d in rte_eth_dev_configure() So to enable the l3fwd running in that environment, the Rx mode requirement can be relaxed to reflect the hardware feature reality here, and the l3fwd can run smoothly then. A warning msg would be provided to user in case it happens here. On the other side, enabling the software cksum check in case the hw support missing. Fixes: af75078fece3 ("first public release") Cc: sta...@dpdk.org I don't think there was abug here. We are talking about changing current requirements for the app. So not sure it is a real fix and that such change can be propagated to stable releases. Trevor: I think it's not a bug fix but a feature enhancement, it would enable l3fwd to work smoothly on the HW/virtual interfaces which don't support RSS and/or cksum offloading. Yes. it seems like sort of an enhancement. While 'Fixes: ...' are for bugs only. AFAIK, only bug-fixes are take for backporting by stable releases. That's why there seems no point to add CC: sta...@dpdk.org Another generic things: - l3fwd doc and release notes probably need to be updated - as you areintroducing 2 distinct features: no-rss and no-ipv4-cksum it is probably better to split it into 2 different patches (in the same series). Signed-off-by: Trevor Tao --- examples/l3fwd/l3fwd.h | 12 +++- examples/l3fwd/main.c | 21 +++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index b55855c932..cc10643c4b 100644 --- a/examples/l3fwd/l3fwd.h +++ b/examples/l3fwd/l3fwd.h @@ -115,6 +115,8 @@ extern struct acl_algorithms acl_alg[]; extern uint32_t max_pkt_len; +extern struct rte_eth_conf port_conf; + /* Send burst of packets on an output interface */ static inline int send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port) @@ -170,7 +172,15 @@ is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) return -1; /* 2. The IP checksum must be correct. */ - /* this is checked in H/W */ + /* if this is not checked in H/W, check it. */ + if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) { Might be better to check particular mbuf flag: if ((mbuf->ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK) == TE_MBUF_F_RX_IP_CKSUM_UNKNOWN) {...} Trevor: the utility function is_valid_ipv4_pkt is just against an IPv4 pkt, and there's no mbuf information, and if needed, there would be an extra ol_flags added here to check if it was already done by the ethernet device, but look for a sample in: https://github.com/DPDK/dpdk/blob/main/examples/l3fwd-power/main.c#L487 so I think it's ok to just use the port_conf here. If you still think it's better to use m->ol_flags, please tell me. Yep, passing ol_flags, or mbuf itself seems like a proper way to do it. Aproach taken in l3fwd-power doesn't look right to me, see below. 
+ uint16_t actual_cksum, expected_cksum; + actual_cksum = pkt->hdr_checksum; + pkt->hdr_checksum = 0; + expected_cksum = rte_ipv4_cksum(pkt); + if (actual_cksum != expected_cksum) + return -2; + } /* * 3. The IP version number must be 4. If the version number is not 4 diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 6063eb1399..37aec64718 100644 --- a/examples/l3fwd/main.c +++ b/examples/l3fwd/main.c @@ -117,7 +117,7 @@ static struct lcore_params * lcore_params = lcore_params_array_default; static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) / sizeof(lcore_params_array_default[0]); -static struct rte_eth_conf port_conf = { +struct rte_eth_conf port_conf = { .rxmode = { .mq_mode = RTE_ETH_MQ_RX_RSS, .offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM, @@ -1257,8 +1257,12 @@ l3fwd_poll_resource_setup(void) local_port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads; - if (dev_info.max_rx_queues == 1)
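For reference, the mbuf-flag based variant discussed above could look roughly like the sketch below. It is only an assumption of how is_valid_ipv4_pkt() might be reworked (passing ol_flags in), not the final patch: the software checksum is computed only when the hardware did not already classify the header checksum as good or bad.

#include <rte_ip.h>
#include <rte_mbuf.h>

/* Hypothetical rework: the caller passes mbuf->ol_flags along with the header.
 * A complete version would also treat RTE_MBUF_F_RX_IP_CKSUM_NONE specially. */
static inline int
ipv4_cksum_ok(struct rte_ipv4_hdr *pkt, uint64_t ol_flags)
{
	uint64_t f = ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK;
	uint16_t actual, expected;

	if (f == RTE_MBUF_F_RX_IP_CKSUM_GOOD)
		return 1; /* already verified by HW */
	if (f == RTE_MBUF_F_RX_IP_CKSUM_BAD)
		return 0; /* already rejected by HW */

	/* Unknown: fall back to a software check. */
	actual = pkt->hdr_checksum;
	pkt->hdr_checksum = 0;
	expected = rte_ipv4_cksum(pkt);
	pkt->hdr_checksum = actual;
	return actual == expected;
}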
RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
Hi Feifei, > > > -Original Message- > > > From: Feifei Wang > > > Sent: Tuesday, September 5, 2023 11:11 AM > > > To: Konstantin Ananyev ; Konstantin > > > Ananyev > > > Cc: dev@dpdk.org; nd ; Honnappa Nagarahalli > > > ; Ruifeng Wang > > ; > > > Yuying Zhang ; Beilei Xing > > > ; nd ; nd ; nd > > > ; nd ; nd > > > Subject: RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode > > > > > > > > > > > > > -----Original Message- > > > > From: Konstantin Ananyev > > > > Sent: Monday, September 4, 2023 6:22 PM > > > > To: Feifei Wang ; Konstantin Ananyev > > > > > > > > Cc: dev@dpdk.org; nd ; Honnappa Nagarahalli > > > > ; Ruifeng Wang > > > ; > > > > Yuying Zhang ; Beilei Xing > > > > ; nd ; nd ; nd > > > > ; nd > > > > Subject: RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode > > > > > > > > > > > > > > > > > > > > > > > > Define specific function implementation for i40e driver. > > > > > > > > > > > > Currently, mbufs recycle mode can support 128bit > > > > > > > > > > > > vector path and > > > > > > > > > > > > avx2 > > > > > > > > > > path. > > > > > > > > > > > > And can be enabled both in fast free and no fast free > > > > > > > > > > > > mode. > > > > > > > > > > > > > > > > > > > > > > > > Suggested-by: Honnappa Nagarahalli > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Feifei Wang > > > > > > > > > > > > Reviewed-by: Ruifeng Wang > > > > > > > > > > > > Reviewed-by: Honnappa Nagarahalli > > > > > > > > > > > > > > > > > > > > --- > > > > > > > > > > > > drivers/net/i40e/i40e_ethdev.c| 1 + > > > > > > > > > > > > drivers/net/i40e/i40e_ethdev.h| 2 + > > > > > > > > > > > > .../net/i40e/i40e_recycle_mbufs_vec_common.c | 147 > > > > > > > > > > > > ++ > > > > > > > > > > > > drivers/net/i40e/i40e_rxtx.c | 32 > > > > > > > > > > > > > > > > > > > > > > > > drivers/net/i40e/i40e_rxtx.h | 4 + > > > > > > > > > > > > drivers/net/i40e/meson.build | 1 + > > > > > > > > > > > > 6 files changed, 187 insertions(+) create mode > > > > > > > > > > > > 100644 > > > > > > > > > > > > drivers/net/i40e/i40e_recycle_mbufs_vec_common.c > > > > > > > > > > > > > > > > > > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c > > > > > > > > > > > > b/drivers/net/i40e/i40e_ethdev.c index > > > > > > > > > > > > 8271bbb394..50ba9aac94 > > > > > > > > > > > > 100644 > > > > > > > > > > > > --- a/drivers/net/i40e/i40e_ethdev.c > > > > > > > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.c > > > > > > > > > > > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops > > > > > > > > > > > > i40e_eth_dev_ops > > > > > > > > > > = { > > > > > > > > > > > > .flow_ops_get = > > > > > > > > > > > > i40e_dev_flow_ops_get, > > > > > > > > > > > > .rxq_info_get = > > > > > > > > > > > > i40e_rxq_info_get, > > > > > > > > > > > > .txq_info_get = > > > > > > > > > > > > i40e_txq_info_get, > > > > > > > > > > > > + .recycle_rxq_info_get = > > > i40e_recycle_rxq_info_get,
Re: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
Define specific function implementation for i40e driver. Currently, mbufs recycle mode can support 128bit vector path and avx2 path. And can be enabled both in fast free and no fast free mode. Suggested-by: Honnappa Nagarahalli Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- drivers/net/i40e/i40e_ethdev.c| 1 + drivers/net/i40e/i40e_ethdev.h| 2 + .../net/i40e/i40e_recycle_mbufs_vec_common.c | 147 ++ drivers/net/i40e/i40e_rxtx.c | 32 drivers/net/i40e/i40e_rxtx.h | 4 + drivers/net/i40e/meson.build | 1 + 6 files changed, 187 insertions(+) create mode 100644 drivers/net/i40e/i40e_recycle_mbufs_vec_common.c diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c index 8271bbb394..50ba9aac94 100644 --- a/drivers/net/i40e/i40e_ethdev.c +++ b/drivers/net/i40e/i40e_ethdev.c @@ -496,6 +496,7 @@ static const struct eth_dev_ops i40e_eth_dev_ops = { .flow_ops_get = i40e_dev_flow_ops_get, .rxq_info_get = i40e_rxq_info_get, .txq_info_get = i40e_txq_info_get, + .recycle_rxq_info_get = i40e_recycle_rxq_info_get, .rx_burst_mode_get= i40e_rx_burst_mode_get, .tx_burst_mode_get= i40e_tx_burst_mode_get, .timesync_enable = i40e_timesync_enable, diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h index 6f65d5e0ac..af758798e1 100644 --- a/drivers/net/i40e/i40e_ethdev.h +++ b/drivers/net/i40e/i40e_ethdev.h @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_rxq_info *qinfo); void i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_txq_info *qinfo); +void i40e_recycle_rxq_info_get(struct +rte_eth_dev *dev, uint16_t queue_id, + struct rte_eth_recycle_rxq_info +*recycle_rxq_info); int i40e_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, struct rte_eth_burst_mode *mode); int i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t queue_id, diff -- git a/drivers/net/i40e/i40e_recycle_mbufs_vec_common .c b/drivers/net/i40e/i40e_recycle_mbufs_vec_common .c new file mode 100644 index 00..5663ecccde --- /dev/null +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_co +++ mmon +++ .c @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2023 Arm Limited. + */ + +#include #include + +#include "base/i40e_prototype.h" +#include "base/i40e_type.h" +#include "i40e_ethdev.h" +#include "i40e_rxtx.h" + +#pragma GCC diagnostic ignored "-Wcast-qual" + +void +i40e_recycle_rx_descriptors_refill_vec(void +*rx_queue, uint16_t +nb_mbufs) { + struct i40e_rx_queue *rxq = rx_queue; + struct i40e_rx_entry *rxep; + volatile union i40e_rx_desc *rxdp; + uint16_t rx_id; + uint64_t paddr; + uint64_t dma_addr; + uint16_t i; + + rxdp = rxq->rx_ring + rxq->rxrearm_start; + rxep = &rxq->sw_ring[rxq->rxrearm_start]; + + for (i = 0; i < nb_mbufs; i++) { + /* Initialize rxdp descs. 
*/ + paddr = (rxep[i].mbuf)->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr = rte_cpu_to_le_64(paddr); + /* flush desc with pa dma_addr */ + rxdp[i].read.hdr_addr = 0; + rxdp[i].read.pkt_addr = dma_addr; + } + + /* Update the descriptor initializer index */ + rxq->rxrearm_start += nb_mbufs; + rx_id = rxq->rxrearm_start - 1; + + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) { + rxq->rxrearm_start = 0; + rx_id = rxq->nb_rx_desc - 1; + } + + rxq->rxrearm_nb -= nb_mbufs; + + rte_io_wmb(); + /* Update the tail pointer on the NIC */ + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, +rx_id); } + +uint16_t +i40e_recycle_tx_mbufs_reuse_vec(void *tx_queue, + struct rte_eth_recycle_rxq_info *recycle_rxq_info) { + struct i40e_tx_queue *txq = tx_queue; + struct i40e_tx_entry *txep; + struct rte_mbuf **rxep; + int i, n; + uint16_t nb_recycle_mbufs; + uint16_t avail = 0; + uint16_t mbuf_ring_size = recycle_rxq_info- mbuf_ring_size; + uint16_t mask = recycle_rxq_info->mbuf_ring_size - 1; + uint16_t refill_requirement = +recycle_rxq_info- refill_requirement; + uint16_t refill_head = *recycle_rxq_info->refill_head; + uint16_t receive_tail = +*recycle_rxq_info->receive_tail; + + /* Get available recycling Rx buffers. */ + avail = (mbuf_ring_size - (refill_head - +receive_tail)) & mask; + + /* Check Tx free thresh and Rx available space. */ + if (txq->nb_tx_free > txq->tx_free_thresh || +avail <= +txq- tx_rs_thresh) +
RE: [PATCH v13 2/4] net/i40e: implement mbufs recycle mode
> On 9/25/2023 4:19 AM, Feifei Wang wrote: > > Define specific function implementation for i40e driver. > > Currently, mbufs recycle mode can support 128bit > > vector path and avx2 path. And can be enabled both in > > fast free and no fast free mode. > > > > Suggested-by: Honnappa Nagarahalli > > Signed-off-by: Feifei Wang > > Reviewed-by: Ruifeng Wang > > Reviewed-by: Honnappa Nagarahalli > > > > Hi Konstantin, > > I guess this patch was under discussion, can you please check the latest > version? Hi Ferruh, Yes, I saw the new version, but didn't have time yet, to review/ack it. Will try to do it by today's COB. Konstantin > > > > Hi Yuying, Beilei, Qiming, Wenjun, > > This set includes i40e & ixgbe recycle_mbufs implementations, patch will > be merged if there is no objection.
Re: [PATCH v13 4/4] app/testpmd: add recycle mbufs engine
25.09.2023 04:19, Feifei Wang wrote: Add a recycle mbufs engine for testpmd. This engine forwards packets in I/O forward mode, but enables the mbuf recycle feature to recycle used Tx queue mbufs into the Rx queue mbuf ring, which can bypass the mempool path and save CPU cycles. Suggested-by: Jerin Jacob Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang --- Acked-by: Konstantin Ananyev
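For readers wanting to see how the engine ties the pieces together, a rough per-burst call sequence is sketched below. It follows my reading of the new experimental recycle API (the rxq_info argument would have been filled earlier with rte_eth_recycle_rx_queue_info_get()); the exact prototypes and the point at which rte_eth_recycle_mbufs() is invoked should be checked against rte_ethdev.h and the testpmd engine itself, so treat this as an assumption-laden outline rather than the actual implementation.

#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define BURST 32

/* Outline of an I/O-forward burst with mbuf recycling (illustrative). */
static void
recycle_fwd_once(uint16_t rx_port, uint16_t rx_q, uint16_t tx_port, uint16_t tx_q,
		struct rte_eth_recycle_rxq_info *rxq_info)
{
	struct rte_mbuf *pkts[BURST];
	uint16_t nb_rx, nb_tx;

	nb_rx = rte_eth_rx_burst(rx_port, rx_q, pkts, BURST);
	if (nb_rx == 0)
		return;

	/* Move mbufs already freed by the Tx queue straight back into the
	 * Rx descriptor ring, bypassing the mempool put/get path. */
	rte_eth_recycle_mbufs(rx_port, rx_q, tx_port, tx_q, rxq_info);

	nb_tx = rte_eth_tx_burst(tx_port, tx_q, pkts, nb_rx);
	if (nb_tx < nb_rx)
		rte_pktmbuf_free_bulk(&pkts[nb_tx], nb_rx - nb_tx);
}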
Re: [PATCH v13 3/4] net/ixgbe: implement mbufs recycle mode
25.09.2023 04:19, Feifei Wang wrote: Define the specific function implementation for the ixgbe driver. Currently, recycle buffer mode supports the 128-bit vector path and can be enabled in both fast-free and no-fast-free mode. Suggested-by: Honnappa Nagarahalli Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- Acked-by: Konstantin Ananyev
Re: [PATCH v13 2/4] net/i40e: implement mbufs recycle mode
25.09.2023 04:19, Feifei Wang wrote: Define the specific function implementation for the i40e driver. Currently, mbufs recycle mode supports the 128-bit vector path and the avx2 path, and can be enabled in both fast-free and no-fast-free mode. Suggested-by: Honnappa Nagarahalli Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- Acked-by: Konstantin Ananyev
Re: [PATCH v1] examples/l3fwd: relax the RSS/Offload requirement
Hi Trevor, At 2023-09-18 02:04:19, "Konstantin Ananyev" wrote: 03/09/2023 05:01, Trevor Tao пишет: Now the port Rx mq_mode had been set to RTE_ETH_MQ_RX_RSS, and offload mode set to RTE_ETH_RX_OFFLOAD_CHECKSUM by default, but some hardware and/or virtual interface does not support the RSS and offload mode presupposed, e.g., some virtio interfaces in the cloud don't support RSS and may only partly support RTE_ETH_RX_OFFLOAD_UDP_CKSUM/ RTE_ETH_RX_OFFLOAD_TCP_CKSUM, but not RTE_ETH_RX_OFFLOAD_IPV4_CKSUM, and the error msg here: virtio_dev_configure(): RSS support requested but not supported by the device Port0 dev_configure = -95 and: Ethdev port_id=0 requested Rx offloads 0xe does not match Rx offloads capabilities 0x201d in rte_eth_dev_configure() So to enable the l3fwd running in that environment, the Rx mode requirement can be relaxed to reflect the hardware feature reality here, and the l3fwd can run smoothly then. A warning msg would be provided to user in case it happens here. On the other side, enabling the software cksum check in case the hw support missing. Fixes: af75078fece3 ("first public release") Cc: sta...@dpdk.org I don't think there was abug here. We are talking about changing current requirements for the app. So not sure it is a real fix and that such change can be propagated to stable releases. Trevor: I think it's not a bug fix but a feature enhancement, it would enable l3fwd to work smoothly on the HW/virtual interfaces which don't support RSS and/or cksum offloading. Yes. it seems like sort of an enhancement. While 'Fixes: ...' are for bugs only. AFAIK, only bug-fixes are take for backporting by stable releases. That's why there seems no point to add CC: sta...@dpdk.org Another generic things: >- l3fwd doc and release notes probably need to be updated *Trevor>>I think it's ok to update the l3fwd doc and release notes, but I would like to know which part of the doc/notes is approriate to add the enhancement declaration. * think both: http://doc.dpdk.org/guides/sample_app_ug/l3_forward.html and elease notes in doc/guides/rel_notes/ need to be updated. - as you areintroducing 2 distinct features: no-rss and no-ipv4-cksum it is probably better to split it into 2 different patches (in the >same series). *Trevor>>I think it's ok to split it into 2 patches here in the same series, if you would like to.* *Thanks.* That is not my own desire, but usual contrution practise we all try to comply with. You can find more details at: https://doc.dpdk.org/guides/contributing/patches.html Thanks Konstantin Signed-off-by: Trevor Tao --- examples/l3fwd/l3fwd.h | 12 +++- examples/l3fwd/main.c | 21 +++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index b55855c932..cc10643c4b 100644 --- a/examples/l3fwd/l3fwd.h +++ b/examples/l3fwd/l3fwd.h @@ -115,6 +115,8 @@ extern struct acl_algorithms acl_alg[]; extern uint32_t max_pkt_len; +extern struct rte_eth_conf port_conf; + /* Send burst of packets on an output interface */ static inline int send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port) @@ -170,7 +172,15 @@ is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) return -1; /* 2. The IP checksum must be correct. */ - /* this is checked in H/W */ + /* if this is not checked in H/W, check it. 
*/ + if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) { Might be better to check particular mbuf flag: if ((mbuf->ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK) == TE_MBUF_F_RX_IP_CKSUM_UNKNOWN) {...} Trevor: the utility function is_valid_ipv4_pkt is just against an IPv4 pkt, and there's no mbuf information, and if needed, there would be an extra ol_flags added here to check if it was already done by the ethernet device, but look for a sample in: https://github.com/DPDK/dpdk/blob/main/examples/l3fwd-power/main.c#L487 so I think it's ok to just use the port_conf here. If you still think it's better to use m->ol_flags, please tell me. Yep, passing ol_flags, or mbuf itself seems like a proper way to do it. Aproach taken in l3fwd-power doesn't look right to me, see below. + uint16_t actual_cksum, expected_cksum; + actual_cksum = pkt->hdr_checksum; + pkt->hdr_checksum = 0; + expected_cksum = rte_ipv4_cksum(pkt); + if (actual_cksum != expected_cksum) + return -2; + } /* * 3. The IP version number must be 4. If the version number is not 4 diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 6063eb1399..37aec64718 100644 --- a/examples/l3fwd/main.c +++ b/examples/l3fwd/main.c @@ -117,7 +117,7 @@ static struct lcore
RE: [PATCH v5 3/3] power: amd power monitor support
> > Caution: This message originated from an External Source. Use proper caution > > when opening attachments, clicking links, or responding. > > > > > > 25/08/2023 17:00, Tyler Retzlaff пишет: > > > On Thu, Aug 24, 2023 at 10:04:42AM +0100, Ferruh Yigit wrote: > > >> On 8/23/2023 5:03 PM, Tyler Retzlaff wrote: > > >>> On Wed, Aug 23, 2023 at 10:19:39AM +0100, Ferruh Yigit wrote: > > >>>> On 8/22/2023 11:30 PM, Konstantin Ananyev wrote: > > >>>>> 18/08/2023 14:48, Bruce Richardson пишет: > > >>>>>> On Fri, Aug 18, 2023 at 02:25:14PM +0100, Ferruh Yigit wrote: > > >>>>>>> On 8/17/2023 3:18 PM, Konstantin Ananyev wrote: > > >>>>>>>> > > >>>>>>>>>> Caution: This message originated from an External Source. Use > > >>>>>>>>>> proper caution when opening attachments, clicking links, or > > >>>>>>>>>> responding. > > >>>>>>>>>> > > >>>>>>>>>> > > >>>>>>>>>> On Wed, Aug 16, 2023 at 11:59:59AM -0700, Sivaprasad Tummala > > wrote: > > >>>>>>>>>>> mwaitx allows EPYC processors to enter a implementation > > >>>>>>>>>>> dependent power/performance optimized state (C1 state) for a > > >>>>>>>>>>> specific period or until a store to the monitored address > > >>>>>>>>>>> range. > > >>>>>>>>>>> > > >>>>>>>>>>> Signed-off-by: Sivaprasad Tummala > > >>>>>>>>>>> > > >>>>>>>>>>> Acked-by: Anatoly Burakov > > >>>>>>>>>>> --- > > >>>>>>>>>>>lib/eal/x86/rte_power_intrinsics.c | 77 > > >>>>>>>>>>> +- > > >>>>>>>>>>>1 file changed, 66 insertions(+), 11 deletions(-) > > >>>>>>>>>>> > > >>>>>>>>>>> diff --git a/lib/eal/x86/rte_power_intrinsics.c > > >>>>>>>>>>> b/lib/eal/x86/rte_power_intrinsics.c > > >>>>>>>>>>> index 6eb9e50807..b4754e17da 100644 > > >>>>>>>>>>> --- a/lib/eal/x86/rte_power_intrinsics.c > > >>>>>>>>>>> +++ b/lib/eal/x86/rte_power_intrinsics.c > > >>>>>>>>>>> @@ -17,6 +17,60 @@ static struct power_wait_status { > > >>>>>>>>>>> volatile void *monitor_addr; /**< NULL if not > > >>>>>>>>>>> currently sleeping */ } __rte_cache_aligned > > >>>>>>>>>>> wait_status[RTE_MAX_LCORE]; > > >>>>>>>>>>> > > >>>>>>>>>>> +/** > > >>>>>>>>>>> + * These functions uses UMONITOR/UMWAIT instructions and > > >>>>>>>>>>> +will > > >>>>>>>>>>> enter C0.2 > > >>>>>>>>>> state. > > >>>>>>>>>>> + * For more information about usage of these instructions, > > >>>>>>>>>>> +please refer to > > >>>>>>>>>>> + * Intel(R) 64 and IA-32 Architectures Software Developer's > > Manual. > > >>>>>>>>>>> + */ > > >>>>>>>>>>> +static void intel_umonitor(volatile void *addr) { > > >>>>>>>>>>> + /* UMONITOR */ > > >>>>>>>>>>> + asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;" > > >>>>>>>>>>> + : > > >>>>>>>>>>> + : "D"(addr)); } > > >>>>>>>>>>> + > > >>>>>>>>>>> +static void intel_umwait(const uint64_t timeout) { > > >>>>>>>>>>> + const uint32_t tsc_l = (uint32_t)timeout; > > >>>>>>>>>>> + const uint32_t tsc_h = (uint32_t)(timeout >> 32); > > >>
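For context, the AMD-side counterpart discussed in this thread (MONITORX/MWAITX entering C1 with an optional TSC-based timeout) can also be expressed with compiler intrinsics instead of raw byte-encoded assembly. The sketch below is illustrative only: it uses the _mm_monitorx()/_mm_mwaitx() intrinsics provided by GCC/Clang (built with -mmwaitx) rather than the patch's inline asm, and the hint/extension values are assumptions to be checked against the AMD manuals.

#include <stdint.h>
#include <x86intrin.h> /* _mm_monitorx/_mm_mwaitx, build with -mmwaitx */

/* Wait on 'addr' for at most 'tsc_timeout' TSC ticks in C1 (illustrative). */
static inline void
amd_monitorx_wait(volatile void *addr, uint32_t tsc_timeout)
{
	/* Arm the monitor on the target cache line (no extensions/hints). */
	_mm_monitorx((void *)(uintptr_t)addr, 0, 0);

	/* MWAITX: extension bit 1 enables the timer, the last argument is
	 * the timeout in TSC ticks; hint 0 requests C1. */
	_mm_mwaitx(1u << 1, 0, tsc_timeout);
}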
[dpdk-dev] [PATCH] acl: fix build with gcc 11
gcc 11 with '-O2' complains about some variables being used without being initialized: In file included from ../lib/librte_acl/acl_run_avx512x8.h:201, from ../lib/librte_acl/acl_run_avx512.c:110: In function ‘start_flow_avx512x8’, inlined from ‘search_trie_avx512x8.constprop’ at ../lib/librte_acl/acl_run_avx512_common.h:317:2: ../lib/librte_acl/acl_run_avx512_common.h:210:13: warning: ‘pdata’ is used uninitialized [-Wuninitialized] In file included from ../lib/librte_acl/acl_run_avx512x8.h:201, from ../lib/librte_acl/acl_run_avx512.c:110: ../lib/librte_acl/acl_run_avx512_common.h: In function ‘search_trie_avx512x8.constprop’: ../lib/librte_acl/acl_run_avx512_common.h:314:32: note: ‘pdata’ declared here In file included from ../lib/librte_acl/acl_run_avx512x8.h:201, from ../lib/librte_acl/acl_run_avx512.c:110: Indeed, these variables are not explicitly initialized, but this is done intentionally. We rely on constant mask value that we pass to start_flow*() functions as a parameter Note that gcc 11 with '-O3' and gcc 9/10 with both '-O2' and '-O3' doesn't produce this warning, same as clang. Which makes me think that they are able to successfully propagate this constant mask value though the code. Also even gcc 11 with '-O2' produces a warning, it is able to generate an output binary with properly propagated constant values. Anyway, to support clean build with gcc-11 this patch adds explicit initialization for these variables. I checked the output binary: with '-O3' both clang and gcc 10/11 generate no extra code for it. Also performance test didn't reveal any regressions. Bugzilla ID: 673 Fixes: b64c2295f7fc ("acl: add 256-bit AVX512 classify method") Fixes: 45da22e42ec3 ("acl: add 512-bit AVX512 classify method") Cc: sta...@dpdk.org Reported-by: Ali Alnubani Signed-off-by: Konstantin Ananyev --- lib/acl/acl_run_avx512_common.h | 24 1 file changed, 24 insertions(+) diff --git a/lib/acl/acl_run_avx512_common.h b/lib/acl/acl_run_avx512_common.h index fafaf591e..fbad74d45 100644 --- a/lib/acl/acl_run_avx512_common.h +++ b/lib/acl/acl_run_avx512_common.h @@ -303,6 +303,28 @@ _F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2], } } +static inline void +_F_(reset_flow_vars)(_T_simd di[2], _T_simd idx[2], _T_simd pdata[4], + _T_simd tr_lo[2], _T_simd tr_hi[2]) +{ + di[0] = _M_SI_(setzero)(); + di[1] = _M_SI_(setzero)(); + + idx[0] = _M_SI_(setzero)(); + idx[1] = _M_SI_(setzero)(); + + pdata[0] = _M_SI_(setzero)(); + pdata[1] = _M_SI_(setzero)(); + pdata[2] = _M_SI_(setzero)(); + pdata[3] = _M_SI_(setzero)(); + + tr_lo[0] = _M_SI_(setzero)(); + tr_lo[1] = _M_SI_(setzero)(); + + tr_hi[0] = _M_SI_(setzero)(); + tr_hi[1] = _M_SI_(setzero)(); +} + /* * Perform search for up to (2 * _N_) flows in parallel. * Use two sets of metadata, each serves _N_ flows max. @@ -313,6 +335,8 @@ _F_(search_trie)(struct acl_flow_avx512 *flow) uint32_t fm[2]; _T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2]; + _F_(reset_flow_vars)(di, idx, pdata, tr_lo, tr_hi); + /* first 1B load */ _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_, &pdata[0], &idx[0], &di[0]); -- 2.26.3
[dpdk-dev] [PATCH v2] acl: fix build with gcc 11
gcc 11 with '-O2' complains about some variables being used without being initialized: In file included from ../lib/librte_acl/acl_run_avx512x8.h:201, from ../lib/librte_acl/acl_run_avx512.c:110: In function ‘start_flow_avx512x8’, inlined from ‘search_trie_avx512x8.constprop’ at ../lib/librte_acl/acl_run_avx512_common.h:317:2: ../lib/librte_acl/acl_run_avx512_common.h:210:13: warning: ‘pdata’ is used uninitialized [-Wuninitialized] In file included from ../lib/librte_acl/acl_run_avx512x8.h:201, from ../lib/librte_acl/acl_run_avx512.c:110: ../lib/librte_acl/acl_run_avx512_common.h: In function ‘search_trie_avx512x8.constprop’: ../lib/librte_acl/acl_run_avx512_common.h:314:32: note: ‘pdata’ declared here In file included from ../lib/librte_acl/acl_run_avx512x8.h:201, from ../lib/librte_acl/acl_run_avx512.c:110: Indeed, these variables are not explicitly initialized, but this is done intentionally. We rely on constant mask value that we pass to start_flow*() functions as a parameter to mask out uninitialized values. Note that '-O3' doesn't produce this warning. Anyway, to support clean build with gcc-11 this patch adds explicit initialization for these variables. I checked the output binary: with '-O3' both clang and gcc 10/11 generate no extra code for it. Also performance test didn't reveal any regressions. Bugzilla ID: 673 Fixes: b64c2295f7fc ("acl: add 256-bit AVX512 classify method") Fixes: 45da22e42ec3 ("acl: add 512-bit AVX512 classify method") Cc: sta...@dpdk.org Reported-by: Ali Alnubani Signed-off-by: Konstantin Ananyev --- v2: update commit log --- lib/acl/acl_run_avx512_common.h | 24 1 file changed, 24 insertions(+) diff --git a/lib/acl/acl_run_avx512_common.h b/lib/acl/acl_run_avx512_common.h index fafaf591e..fbad74d45 100644 --- a/lib/acl/acl_run_avx512_common.h +++ b/lib/acl/acl_run_avx512_common.h @@ -303,6 +303,28 @@ _F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2], } } +static inline void +_F_(reset_flow_vars)(_T_simd di[2], _T_simd idx[2], _T_simd pdata[4], + _T_simd tr_lo[2], _T_simd tr_hi[2]) +{ + di[0] = _M_SI_(setzero)(); + di[1] = _M_SI_(setzero)(); + + idx[0] = _M_SI_(setzero)(); + idx[1] = _M_SI_(setzero)(); + + pdata[0] = _M_SI_(setzero)(); + pdata[1] = _M_SI_(setzero)(); + pdata[2] = _M_SI_(setzero)(); + pdata[3] = _M_SI_(setzero)(); + + tr_lo[0] = _M_SI_(setzero)(); + tr_lo[1] = _M_SI_(setzero)(); + + tr_hi[0] = _M_SI_(setzero)(); + tr_hi[1] = _M_SI_(setzero)(); +} + /* * Perform search for up to (2 * _N_) flows in parallel. * Use two sets of metadata, each serves _N_ flows max. @@ -313,6 +335,8 @@ _F_(search_trie)(struct acl_flow_avx512 *flow) uint32_t fm[2]; _T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2]; + _F_(reset_flow_vars)(di, idx, pdata, tr_lo, tr_hi); + /* first 1B load */ _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_, &pdata[0], &idx[0], &di[0]); -- 2.26.3
[dpdk-dev] [PATCH] examples/ipsec-secgw: fix handling IPv6 extension headers
Recent patch to support UDP encapsulation introduced problem with handling inbound IPv6 packets with header extensions. This patch aims to fix the issue. Bugzilla ID: 695 Fixes: 9a1cc8f1ed74 ("examples/ipsec-secgw: support UDP encapsulation") Reported-by: Vladimir Medvedkin Signed-off-by: Konstantin Ananyev --- examples/ipsec-secgw/ipsec-secgw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c index a9f9b5859..f252d3498 100644 --- a/examples/ipsec-secgw/ipsec-secgw.c +++ b/examples/ipsec-secgw/ipsec-secgw.c @@ -426,7 +426,7 @@ prepare_one_packet(struct rte_mbuf *pkt, struct ipsec_traffic *t) return; } - switch (iph6->proto) { + switch (next_proto) { case IPPROTO_ESP: t->ipsec.pkts[(t->ipsec.num)++] = pkt; break; -- 2.26.3
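As background for the fix: when IPv6 extension headers are present, the upper-layer protocol (e.g. ESP) is only known after walking the extension chain, which is what the previously computed next_proto holds; iph6->proto merely names the first extension header. A simplified, illustrative walk is shown below - the function name and the handling of only the generic hop-by-hop/routing/destination-options headers are assumptions, not the application's actual code.

#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>
#include <rte_ip.h>

/* Illustrative only: find the upper-layer protocol behind generic IPv6
 * extension headers (Next Header in byte 0, length in 8-byte units
 * excluding the first 8 bytes in byte 1). */
static uint8_t
ipv6_upper_proto(const struct rte_ipv6_hdr *ip6, const uint8_t *pkt_end)
{
	uint8_t proto = ip6->proto;
	const uint8_t *p = (const uint8_t *)(ip6 + 1);

	while ((proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
			proto == IPPROTO_DSTOPTS) && p + 8 <= pkt_end) {
		proto = p[0];
		p += (size_t)(p[1] + 1) * 8;
	}
	return proto; /* e.g. IPPROTO_ESP for an inbound IPsec packet */
}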
RE: [PATCH 3/3] examples/l3fwd: fix maximum acceptable port ID in routes
> Application is accepting routes for port ID up to UINT8_MAX > for LPM amd EM routes on parsing the given rule file, but only > up to 32 ports can be enabled as per the variable enabled_port_mask > which is defined as uint32_t. > > This patch restricts the rules parsing code to accept routes for > port ID up to 31 only to avoid any unnecessary maintenance of rules > which will never be used. If we want to add this extra check, probably better to do it in setup_lpm(). Where we already check that port is enabled, and If not, then this route rule will be skipped: /* populate the LPM table */ for (i = 0; i < route_num_v4; i++) { struct in_addr in; /* skip unused ports */ if ((1 << route_base_v4[i].if_out & enabled_port_mask) == 0) continue; Same for EM. Another question here - why we just silently skip the rule with invalid port? Probably need to fail with error... that what ACL code-path does. > Fixes: e7e6dd643092 ("examples/l3fwd: support config file for EM") > Fixes: 52def963fc1c ("examples/l3fwd: support config file for LPM/FIB") > Cc: sean.morris...@intel.com > Cc: sta...@dpdk.org > > Signed-off-by: Gagandeep Singh > --- > examples/l3fwd/em_route_parse.c | 6 -- > examples/l3fwd/lpm_route_parse.c | 6 -- > 2 files changed, 8 insertions(+), 4 deletions(-) > > diff --git a/examples/l3fwd/em_route_parse.c b/examples/l3fwd/em_route_parse.c > index 8b534de5f1..65c71cd1ba 100644 > --- a/examples/l3fwd/em_route_parse.c > +++ b/examples/l3fwd/em_route_parse.c > @@ -65,7 +65,8 @@ em_parse_v6_rule(char *str, struct em_rule *v) > /* protocol. */ > GET_CB_FIELD(in[CB_FLD_PROTO], v->v6_key.proto, 0, UINT8_MAX, 0); > /* out interface. */ > - GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, UINT8_MAX, 0); > + GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, > + (sizeof(enabled_port_mask) * CHAR_BIT) - 1, 0); > > return 0; > } > @@ -102,7 +103,8 @@ em_parse_v4_rule(char *str, struct em_rule *v) > /* protocol. */ > GET_CB_FIELD(in[CB_FLD_PROTO], v->v4_key.proto, 0, UINT8_MAX, 0); > /* out interface. */ > - GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, UINT8_MAX, 0); > + GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, > + (sizeof(enabled_port_mask) * CHAR_BIT) - 1, 0); > > return 0; > } > diff --git a/examples/l3fwd/lpm_route_parse.c > b/examples/l3fwd/lpm_route_parse.c > index f27b66e838..357c12d9fe 100644 > --- a/examples/l3fwd/lpm_route_parse.c > +++ b/examples/l3fwd/lpm_route_parse.c > @@ -110,7 +110,8 @@ lpm_parse_v6_rule(char *str, struct lpm_route_rule *v) > > rc = lpm_parse_v6_net(in[CB_FLD_DST_ADDR], v->ip_32, &v->depth); > > - GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, UINT8_MAX, 0); > + GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, > + (sizeof(enabled_port_mask) * CHAR_BIT) - 1, 0); > > return rc; > } > @@ -132,7 +133,8 @@ lpm_parse_v4_rule(char *str, struct lpm_route_rule *v) > > rc = parse_ipv4_addr_mask(in[CB_FLD_DST_ADDR], &v->ip, &v->depth); > > - GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, UINT8_MAX, 0); > + GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, > + (sizeof(enabled_port_mask) * CHAR_BIT) - 1, 0); > > return rc; > } > -- > 2.25.1
RE: [RFC v2] ethdev: an API for cache stashing hints
> An application provides cache stashing hints to the ethernet devices to > improve memory access latencies from the CPU and the NIC. This patch > introduces three distinct hints for this purpose. > > The RTE_ETH_DEV_STASH_HINT_HOST_WILLNEED hint indicates that the host > (CPU) requires the data written by the NIC immediately. This implies > that the CPU expects to read data from its local cache rather than LLC > or main memory if possible. This would improve memory access latency in > the Rx path. For PCI devices with TPH capability, these hints translate > into DWHR (Device Writes Host Reads) access pattern. This hint is only > valid for receive queues. > > The RTE_ETH_DEV_STASH_HINT_BI_DIR_DATA hint indicates that the host and > the device access the data structure equally. Rx/Tx queue descriptors > fit the description of such data. This hint applies to both Rx and Tx > directions. In the PCI TPH context, this hint translates into a > Bi-Directional access pattern. > > RTE_ETH_DEV_STASH_HINT_DEV_ONLY hint indicates that the CPU is not > involved in a given device's receive or transmit paths. This implies > that only devices are involved in the IO path. Depending on the > implementation, this hint may result in data getting placed in a cache > close to the device or not cached at all. For PCI devices with TPH > capability, this hint translates into D*D* (DWDR, DRDW, DWDW, DRDR) > access patterns. This is a bidirectional hint, and it can be applied to > both Rx and Tx queues. > > The RTE_ETH_DEV_STASH_HINT_HOST_DONTNEED hint indicates that the device > reads data written by the host (CPU) that may still be in the host's > local cache but is not required by the host anytime soon. This hint is > intended to prevent unnecessary cache invalidations that cause > interconnect latencies when a device writes to a buffer already in host > cache memory. In DPDK, this could happen with the recycling of mbufs > where a mbuf is placed in the Tx queue that then gets back into mempool > and gets recycled back into the Rx queue, all while a copy is being held > in the CPU's local cache unnecessarily. By using this hint on supported > platforms, the mbuf will be invalidated after the device completes the > buffer reading, but it will be well before the buffer gets recycled and > updated in the Rx path. This hint is only valid for transmit queues. > > Applications use three main interfaces in the ethdev library to discover > and set cache stashing hints. rte_eth_dev_stashing_hints_tx interface is > used to set hints on a Tx queue. rte_eth_dev_stashing_hints_rx interface > is used to set hints on an Rx queue. Both of these functions take the > following parameters as inputs: a port_id (the id of the ethernet > device), a cpu_id (the target CPU), a cache_level (the level of the > cache hierarchy the data should be stashed into), a queue_id (the queue > the hints are applied to). In addition to the above list of parameters, > a type parameter indicates the type of the object the application > expects to be stashed by the hardware. Depending on the hardware, these > may vary. Intel E810 NICs support the stashing of Rx/Tx descriptors, > packet headers, and packet payloads. These are indicated by the macros > RTE_ETH_DEV_STASH_TYPE_DESC, RTE_ETH_DEV_STASH_TYPE_HEADER, > RTE_ETH_DEV_STASH_TYPE_PAYLOAD. Hardware capable of stashing data at any > given offset into a packet can use the RTE_ETH_DEV_STASH_TYPE_OFFSET > type. 
When an offset is used, the offset parameter in the above two > functions should be set appropriately. > > rte_eth_dev_stashing_hints_discover is used to discover the object types > and hints supported in the platform and the device. The function takes > types and hints pointers used as a bit vector to indicate hints and > types supported by the NIC. An application that intends to use stashing > hints should first discover supported hints and types and then use the > functions rte_eth_dev_stashing_hints_tx and > rte_eth_dev_stashing_hints_rx as required to set stashing hints > accordingly. eth_dev_ops structure has been updated with two new ops > that a PMD should implement to support cache stashing hints. A PMD that > intends to support cache stashing hints should initialize the > set_stashing_hints function pointer to a function that issues hints to > the underlying hardware in compliance with platform capabilities. The > same PMD should also implement a function that can return two-bit fields > indicating supported types and hints and then initialize the > discover_stashing_hints function pointer with it. If the NIC supports > cache stashing hints, the NIC should always set the > RTE_ETH_DEV_CAPA_CACHE_STASHING device capability. Sounds like an interesting idea... Do you plan to have a reference implementation in one (or few) actual PMDs?
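To make the proposed flow concrete, here is a rough usage sketch based only on the parameters listed above - the exact prototypes, parameter order and the bit layout of the discover output are assumptions, not part of the RFC text:

#include <stdint.h>
#include <rte_ethdev.h>

/* sketch: stash Rx packet headers into the given cache level of the given CPU,
 * but only if the port and platform advertise support for it */
static void
try_enable_rx_header_stashing(uint16_t port_id, uint16_t queue_id,
		uint16_t cpu_id, uint8_t cache_level)
{
	uint64_t types = 0, hints = 0;
	struct rte_eth_dev_info dev_info;

	if (rte_eth_dev_info_get(port_id, &dev_info) != 0 ||
			(dev_info.dev_capa & RTE_ETH_DEV_CAPA_CACHE_STASHING) == 0)
		return; /* device does not advertise stashing support */

	/* 'types' and 'hints' are used as bit vectors, as described above */
	rte_eth_dev_stashing_hints_discover(port_id, &types, &hints);

	if ((types & RTE_ETH_DEV_STASH_TYPE_HEADER) != 0 &&
			(hints & RTE_ETH_DEV_STASH_HINT_HOST_WILLNEED) != 0)
		rte_eth_dev_stashing_hints_rx(port_id, cpu_id, cache_level,
				queue_id, RTE_ETH_DEV_STASH_TYPE_HEADER,
				RTE_ETH_DEV_STASH_HINT_HOST_WILLNEED,
				0 /* offset: only meaningful for STASH_TYPE_OFFSET */);
}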
RE: [PATCH 3/3] examples/l3fwd: fix maximum acceptable port ID in routes
> > > Application is accepting routes for port ID up to UINT8_MAX for LPM > > > amd EM routes on parsing the given rule file, but only up to 32 ports > > > can be enabled as per the variable enabled_port_mask which is defined > > > as uint32_t. > > > > > > This patch restricts the rules parsing code to accept routes for port > > > ID up to 31 only to avoid any unnecessary maintenance of rules which > > > will never be used. > > > > If we want to add this extra check, probably better to do it in setup_lpm(). > > Where we already check that port is enabled, and If not, then this route > > rule will > > be skipped: > > > > /* populate the LPM table */ > > for (i = 0; i < route_num_v4; i++) { > > struct in_addr in; > > > > /* skip unused ports */ > > if ((1 << route_base_v4[i].if_out & > > enabled_port_mask) == 0) > > continue; > > > > Same for EM. > I am trying to update the check for MAX if_out value in rules config file > parsing which will be before setup_lpm(). > The reason is, restricting and adding only those rules which can be used by > the application > while populating the route_base_v4/v6 at first step and avoid unnecessary > memory allocation > for local variables to store more not required rules. Hmm... but why it is a problem? > > > ((1 << route_base_v4[i].if_out & > > enabled_port_mask) > By looking into this check, it seems restriction to maximum 31 port ID while > parsing rule file becomes > more valid as this check can pass due to overflow in case value of > route_base_v4[i].if_out > Is 31+. Agree, I think we need both, and it probably need to be in setup_lpm(). Something like: if (route_base_v4[i].if_out >= sizeof(enabled_port_mask) * CHAR_BIT || ((1 << route_base_v4[i].if_out & enabled_port_mask) == 0) { /* print some error message here*/ rte_exiit(...); /* or return an error */ } > > > Another question here - why we just silently skip the rule with invalid > > port? > In read_config_files_lpm() we are calling the rte_exit in case port ID is 31+. > In setup_lpm, skipping the rules for the ports which are not enabled and not > giving error, > I guess probably because of ease of use. > e.g. user has only single ipv4_routes config file with route rules for port > ID 0,1,2,3,4 > and want to use same file for multiple test cases like > 1. when only port 0 enabled > 2. when only port 0 and 1 enabled and so on. > In this case, user can avoid to have separate route files for each of the > test case. The problem as I see it - we are not consistent here. In some cases we just silently skip rules with invalid (or disabled) port numbers, in other cases we generate an error and exit. For me it would be better, if we follow one simple policy (abort with error) here for all cases. > > > Probably need to fail with error... that what ACL code-path does. > > > > > Fixes: e7e6dd643092 ("examples/l3fwd: support config file for EM") > > > Fixes: 52def963fc1c ("examples/l3fwd: support config file for > > > LPM/FIB") > > > Cc: sean.morris...@intel.com > > > Cc: sta...@dpdk.org > > > > > > Signed-off-by: Gagandeep Singh > > > --- > > > examples/l3fwd/em_route_parse.c | 6 -- > > > examples/l3fwd/lpm_route_parse.c | 6 -- > > > 2 files changed, 8 insertions(+), 4 deletions(-) > > > > > > diff --git a/examples/l3fwd/em_route_parse.c > > > b/examples/l3fwd/em_route_parse.c index 8b534de5f1..65c71cd1ba 100644 > > > --- a/examples/l3fwd/em_route_parse.c > > > +++ b/examples/l3fwd/em_route_parse.c > > > @@ -65,7 +65,8 @@ em_parse_v6_rule(char *str, struct em_rule *v) > > > /* protocol. 
*/ > > > GET_CB_FIELD(in[CB_FLD_PROTO], v->v6_key.proto, 0, UINT8_MAX, 0); > > > /* out interface. */ > > > - GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, UINT8_MAX, 0); > > > + GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, > > > + (sizeof(enabled_port_mask) * CHAR_BIT) - 1, 0); > > > > > > return 0; > > > } > > > @@ -102,7 +103,8 @@ em_parse_v4_rule(char *str, struct em_rule *v) > > > /* protocol. */ > > > GET_CB_FIELD(in[CB_FLD_PROTO], v->v4_key.proto, 0, UINT8_MAX, 0); > > > /* out interface. */ > > > - GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, UINT8_MAX, 0); > > > + GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, > > > + (sizeof(enabled_port_mask) * CHAR_BIT) - 1, 0); > > > > > > return 0; > > > } > > > diff --git a/examples/l3fwd/lpm_route_parse.c > > > b/examples/l3fwd/lpm_route_parse.c > > > index f27b66e838..357c12d9fe 100644 > > > --- a/examples/l3fwd/lpm_route_parse.c > > > +++ b/examples/l3fwd/lpm_route_parse.c > > > @@ -110,7 +110,8 @@ lpm_parse_v6_rule(char *str, struct lpm_route_rule > > > *v) > > > > > > rc = lpm_parse_v6_net(in[CB_FLD_DST_ADDR], v->ip_32, &v->depth); > > > > > > - GET_CB_FIELD(in[CB_FLD_IF_OUT], v->if_out, 0, UINT8_MAX, 0); > > > + GET_CB_FIELD(in[C
RE: BPF standardization
> It would be good to make sure that DPDK BPF conforms to IETF draft. > https://datatracker.ietf.org/doc/draft-ietf-bpf-isa/ > > Based on LWN article on presentation at Linux Storage, Filesystem, > Memory Management, and BPF Summit. > > https://lwn.net/SubscriberLink/975830/3b32df6be23d3abf/ Yes, it would be really good... Another interesting option that was raised a few times by different people would be the opportunity to re-use external eBPF verifiers with DPDK eBPF progs: either the one from the linux kernel or the user-space one (PREVAIL: https://github.com/vbpf/ebpf-verifier/tree/main), or even both. One of the main obstacles with that: both the linux kernel and PREVAIL assume the input context for an eBPF prog in a particular format (usually struct __sk_buff or struct xdp_md). In fact, PREVAIL is more flexible here and allows you to specify your own format, but it still expects some main things (data, data_end) to be present and located in the same way as in the linux kernel. On further thought, a simple way to overcome that might be to mimic what the linux kernel does with 'direct' packet access: at verify stage it rewrites the given BPF prog to convert load instructions that access fields of a context type into a sequence of instructions that access fields of the underlying structure: struct __sk_buff -> struct sk_buff, struct bpf_sock_ops -> struct sock, etc. (for more details see convert_ctx_accesses() in linux/kernel/bpf/verifier.c). Inside the DPDK verifier/loader we can probably do the same: convert direct access of __sk_buff and/or xdp_md fields into rte_mbuf fields, i.e.: (__sk_buff->data) -> (mbuf->buf_addr + mbuf->data_off) (__sk_buff->data_end) -> (mbuf->buf_addr + mbuf->data_off + mbuf->data_len) and so on. BTW, right now, eBPF programs produced by the DPDK cBPF->eBPF converter can be successfully verified by the linux kernel. Things are easy here, as the cBPF converter doesn't try to access packet contents directly (but only through special instructions: BPF_LD_ABS, BPF_LD_IND). Just a small fix is required in rte_bpf_convert() to achieve that, see below. In theory, that would not only give us the ability to re-use external verifiers, but should also make it possible to execute a subset of eBPF progs written for the linux kernel within a DPDK app. Of course, not all of them, as right now linux eBPF has much richer functionality than we have (MAPs, tail calls, etc.), but that's another story. We plan to do some work for eBPF+DPDK within the next several months, so might be able to look at it too... though no hard promises here. Meanwhile interested in comments/thoughts/volunteers :) Thanks Konstantin == [PATCH 1/2] bpf: fix converter emitted code fails with linux verifier bpf_convert_filter() uses the standard approach of XOR-ing a register with itself: xor r0, r0, r0 to reset some register values. Unfortunately the linux verifier seems way too strict here and doesn't allow access to a register with an undefined value.
It generates error log like that for this op: Failed to verify program: Permission denied (13) LOG: func#0 @0 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (af) r0 ^= r0 R0 !read_ok processed 1 insns (limit 100) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 To overcome that, simply replace XOR with itself to explicit mov32 r0, #0x0 Signed-off-by: Konstantin Ananyev --- lib/bpf/bpf_convert.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/bpf/bpf_convert.c b/lib/bpf/bpf_convert.c index d7ff2b4325..eceaa19c76 100644 --- a/lib/bpf/bpf_convert.c +++ b/lib/bpf/bpf_convert.c @@ -267,8 +267,11 @@ static int bpf_convert_filter(const struct bpf_insn *prog, size_t len, /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = EBPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = EBPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + //*new_insn++ = EBPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + //*new_insn++ = EBPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + + *new_insn++ = BPF_MOV32_IMM(BPF_REG_A, 0); + *new_insn++ = BPF_MOV32_IMM(BPF_REG_X, 0); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to -- 2.35.3
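As a side note to the ctx-access rewrite idea above, the DPDK-side equivalents of the two fields are straightforward to express in plain C; this is only what the rewritten loads would evaluate to, not actual converter code:

#include <stdint.h>
#include <rte_mbuf.h>

/* __sk_buff->data equivalent: start of packet data in the first segment */
static inline void *
ctx_data(const struct rte_mbuf *m)
{
	return (uint8_t *)m->buf_addr + m->data_off; /* same as rte_pktmbuf_mtod(m, void *) */
}

/* __sk_buff->data_end equivalent: one past the last data byte of the segment */
static inline void *
ctx_data_end(const struct rte_mbuf *m)
{
	return (uint8_t *)m->buf_addr + m->data_off + m->data_len;
}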
RE: BPF standardization
> This was taken from how kernel converts cBPF and the prologue it generates. > Surprising that verifier gags? Indeed it looks strange... Have to admit I didn't dig into how linux converter works... Below is the patch I used to test it from DPDK UT - might be you'll spot something obvious here. Subject: [PATCH 2/2] test/bpf: call linux verifier for converted cBPF filter To ensure DPDK/kernel compatibility for converted cBPF filters, etc. Signed-off-by: Konstantin Ananyev --- app/test/meson.build | 1 + app/test/test_bpf.c | 11 + app/test/test_bpf.h | 12 ++ app/test/test_bpf_verify_linux.c | 39 4 files changed, 63 insertions(+) create mode 100644 app/test/test_bpf.h create mode 100644 app/test/test_bpf_verify_linux.c diff --git a/app/test/meson.build b/app/test/meson.build index e29258e6ec..d46e688db5 100644 --- a/app/test/meson.build +++ b/app/test/meson.build @@ -35,6 +35,7 @@ source_file_deps = { 'test_bitops.c': [], 'test_bitratestats.c': ['metrics', 'bitratestats', 'ethdev'] + sample_packet_forward_deps, 'test_bpf.c': ['bpf', 'net'], +'test_bpf_verify_linux.c': [], 'test_byteorder.c': [], #'test_cfgfile.c': ['cfgfile'], 'test_cksum.c': ['net'], diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c index 7819d6aba9..749f9de20c 100644 --- a/app/test/test_bpf.c +++ b/app/test/test_bpf.c @@ -15,6 +15,7 @@ #include #include #include "test.h" +#include "test_bpf.h" #if !defined(RTE_LIB_BPF) @@ -3431,10 +3432,13 @@ static const char * const sample_filters[] = { static int test_bpf_filter(pcap_t *pcap, const char *s) { + int32_t rc; struct bpf_program fcode; struct rte_bpf_prm *prm = NULL; struct rte_bpf *bpf = NULL; + static char logbuf[UINT16_MAX + 1]; + if (pcap_compile(pcap, &fcode, s, 1, PCAP_NETMASK_UNKNOWN)) { printf("%s@%d: pcap_compile('%s') failed: %s;\n", __func__, __LINE__, s, pcap_geterr(pcap)); @@ -3451,6 +3455,13 @@ test_bpf_filter(pcap_t *pcap, const char *s) printf("bpf convert for \"%s\" produced:\n", s); rte_bpf_dump(stdout, prm->ins, prm->nb_ins); + rc = bpf_linux_verify(prm, logbuf, sizeof(logbuf)); + printf("%s@%d: linux verifier reports: %d(%s);\n", + __func__, __LINE__, rc, strerror(-rc)); + printf("linux verifier log: %s\n", logbuf); + if (rc != 0) + goto error; + bpf = rte_bpf_load(prm); if (bpf == NULL) { printf("%s@%d: failed to load bpf code, error=%d(%s);\n", diff --git a/app/test/test_bpf.h b/app/test/test_bpf.h new file mode 100644 index 00..e5be2be920 --- /dev/null +++ b/app/test/test_bpf.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef _TEST_BPF_H_ +#define _TEST_BPF_H_ + +#include + +int bpf_linux_verify(const struct rte_bpf_prm *prm, char *logbuf, + uint32_t lbsz); + +#endif diff --git a/app/test/test_bpf_verify_linux.c b/app/test/test_bpf_verify_linux.c new file mode 100644 index 00..2a42d61fb5 --- /dev/null +++ b/app/test/test_bpf_verify_linux.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "test_bpf.h" + +int +bpf_linux_verify(const struct rte_bpf_prm *prm, char *logbuf, uint32_t lbsz) +{ + int32_t rc; + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insn_cnt = prm->nb_ins; + attr.insns = (uintptr_t)prm->ins; + attr.license = (uintptr_t)"GPL"; + attr.log_buf = (uintptr_t)logbuf; + attr.log_size = lbsz; + attr.log_level = (logbuf == NULL) ? 
0 : 3; + + rc = syscall(SYS_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + if (rc < 0) + rc = -errno; + else { + /* closed fd for BPF prog */ + close(rc); + rc = 0; + } + + return rc; +} -- 2.35.3
RE: IPv6 APIs rework
> Vladimir Medvedkin, Jul 18, 2024 at 23:25: > > I think alignment should be 1 since in FIB6 users usually don't copy IPv6 > > address and just provide a pointer to the memory inside the packet. Current > > vector implementation loads IPv6 addresses using unaligned access ( > > _mm512_loadu_si512) so it doesn't rely on alignment. > > Yes, my intention was exactly that, being able to map that structure > directly in packets without copying them on the stack. > > > > 2. In the IPv6 packet header, the IPv6 addresses are not 16 byte aligned, > > > they are 8 byte aligned. So we cannot make the IPv6 address type 16 byte > > > aligned. > > > Not necessary, if Ethernet frame in mbuf starts on 8b aligned address, then > > IPv6 is aligned only by 2 bytes. > > We probably could safely say that aligning on 2 bytes would be OK. But > is there any benefit, performance wise, in doing so? Keeping the same > alignment as before the change would at least make it ABI compatible. I am also not sure that this extra alignment (2B or 4B) here will give us any benefit, while it most likely will introduce extra restrictions. AFAIK, right now we have the IPv6 address as an array of plain chars, and there have not been many complaints about it. So I am for keeping it 1B aligned. Overall the proposal looks reasonable to me... 24.11 might be a good opportunity for such a change. Konstantin
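A minimal sketch of what keeping 1B alignment buys us - the address type can be overlaid directly on packet data, with no copy to the stack. The struct name below is illustrative only, not the final API:

#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_mbuf.h>

/* illustrative stand-in for the new address type: 16 bytes, alignment 1 */
struct ipv6_addr_example {
	uint8_t a[16];
};

static inline const struct ipv6_addr_example *
ipv6_dst_in_pkt(const struct rte_mbuf *m)
{
	const struct rte_ipv6_hdr *ip6;

	ip6 = rte_pktmbuf_mtod_offset(m, const struct rte_ipv6_hdr *,
			sizeof(struct rte_ether_hdr));
	/* valid for any address because alignof(struct ipv6_addr_example) == 1 */
	return (const struct ipv6_addr_example *)&ip6->dst_addr;
}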
RE: [PATCH dpdk v2] rel_notes: announce 24.11 ipv6 api breakage
> In 24.11, all IPv6 public APIs will be modified to use a structure > instead of fixed size arrays. > > Signed-off-by: Robin Jarry > Acked-by: Morten Brørup > --- > > Notes: > v2: updated with the exhaustive list of symbols > > doc/guides/rel_notes/deprecation.rst | 42 > 1 file changed, 42 insertions(+) > > diff --git a/doc/guides/rel_notes/deprecation.rst > b/doc/guides/rel_notes/deprecation.rst > index 6948641ff69b..bb17b78d193a 100644 > --- a/doc/guides/rel_notes/deprecation.rst > +++ b/doc/guides/rel_notes/deprecation.rst > @@ -147,3 +147,45 @@ Deprecation Notices >will be deprecated and subsequently removed in DPDK 24.11 release. >Before this, the new port library API (functions rte_swx_port_*) >will gradually transition from experimental to stable status. > + > +* net: A new IPv6 address structure will be introduced in DPDK 24.11. > + It will replace all ad-hoc ``uint8_t[16]`` arrays in all public APIs and > structures. > + The following libraries and symbols are expected to be affected: > + > + ethdev > +- ``struct rte_flow_item_icmp6_nd_ns`` > +- ``struct rte_flow_item_icmp6_nd_na`` > +- ``struct rte_flow_action_set_ipv6`` > +- ``struct rte_flow_tunnel`` > + fib > +- ``rte_fib6_add()`` > +- ``rte_fib6_delete()`` > +- ``rte_fib6_lookup_bulk()`` > + gro > +- ``struct tcp6_flow_key`` > + hash > +- ``struct rte_ipv6_tuple`` > + ipsec > +- ``struct rte_ipsec_sadv6_key`` > + lpm > +- ``rte_lpm6_add()`` > +- ``rte_lpm6_is_rule_present()`` > +- ``rte_lpm6_delete()`` > +- ``rte_lpm6_delete_bulk_func()`` > +- ``rte_lpm6_lookup()`` > +- ``rte_lpm6_lookup_bulk_func()`` > + net > +- ``struct rte_ipv6_hdr`` > + node > +- ``rte_node_ip6_route_add()`` > + pipeline > +- ``struct rte_table_action_ipv6_header`` > + rib > +- ``rte_rib6_lookup()`` > +- ``rte_rib6_lookup_exact()`` > +- ``rte_rib6_get_nxt()`` > +- ``rte_rib6_insert()`` > +- ``rte_rib6_remove()`` > +- ``rte_rib6_get_ip()`` > + table > +- ``struct rte_table_lpm_ipv6_key`` > -- Acked-by: Konstantin Ananyev > 2.45.2 >
RE: [PATCH v2] doc: announce rte_ipsec API changes
Hi, > In case of event mode operations where event device can help in atomic > sequence number increment across cores, sequence number need to be > provided by the application instead of being updated in rte_ipsec or the > PMD. To support this, a new flag ``RTE_IPSEC_SAFLAG_SQN_ASSIGN_DISABLE`` > will be added to disable sequence number update inside IPsec library and > the API rte_ipsec_pkt_crypto_prepare will be extended to include ``sqn`` > as an additional parameter to specify sequence number to be used for > IPsec from the application. Could you probably elaborate a bit more: Why such change is necessary for event-dev mode, what exactly will be affected in librte_ipsec (would it be for outbound mode, or both), etc. > > Signed-off-by: Aakash Sasidharan > --- > doc/guides/rel_notes/deprecation.rst | 7 +++ > 1 file changed, 7 insertions(+) > > diff --git a/doc/guides/rel_notes/deprecation.rst > b/doc/guides/rel_notes/deprecation.rst > index 6948641ff6..bc1d93cca7 100644 > --- a/doc/guides/rel_notes/deprecation.rst > +++ b/doc/guides/rel_notes/deprecation.rst > @@ -133,6 +133,13 @@ Deprecation Notices >Since these functions are not called directly by the application, >the API remains unaffected. > > +* ipsec: The rte_ipsec library is updated to support sequence number provided > + by application. A new flag ``RTE_IPSEC_SAFLAG_SQN_ASSIGN_DISABLE`` is > introduced > + to disable sequence number assignment in lib IPsec. > + The API rte_ipsec_pkt_crypto_prepare is extended to include ``sqn`` as an > + additional parameter allowing application to specify the sequence number > to be > + used for the IPsec operation. > + > * pipeline: The pipeline library legacy API (functions rte_pipeline_*) >will be deprecated and subsequently removed in DPDK 24.11 release. >Before this, the new pipeline library API (functions rte_swx_pipeline_*) > -- > 2.25.1
RE: [PATCH 3/3] examples/l3fwd: fix maximum acceptable port ID in routes
> > > > > > Application is accepting routes for port ID up to UINT8_MAX for > > > > > > LPM amd EM routes on parsing the given rule file, but only up to > > > > > > 32 ports can be enabled as per the variable enabled_port_mask > > > > > > which is defined as uint32_t. > > > > > > > > > > > > This patch restricts the rules parsing code to accept routes for > > > > > > port ID up to 31 only to avoid any unnecessary maintenance of > > > > > > rules which will never be used. > > > > > > > > > > If we want to add this extra check, probably better to do it in > > > > > setup_lpm(). > > > > > Where we already check that port is enabled, and If not, then this > > > > > route rule will be skipped: > > > > > > > > > > /* populate the LPM table */ > > > > > for (i = 0; i < route_num_v4; i++) { > > > > > struct in_addr in; > > > > > > > > > > /* skip unused ports */ > > > > > if ((1 << route_base_v4[i].if_out & > > > > > enabled_port_mask) == 0) > > > > > continue; > > > > > > > > > > Same for EM. > > > > I am trying to update the check for MAX if_out value in rules config > > > > file parsing > > > which will be before setup_lpm(). > > > > The reason is, restricting and adding only those rules which can be > > > > used by the application while populating the route_base_v4/v6 at > > > > first step and avoid unnecessary memory allocation for local > > > > variables to store more > > > not required rules. > > > > > > Hmm... but why it is a problem? > > Not really a problem, Just trying to optimize wherever it Is possible. > > > > > > > > > > > > > > ((1 << route_base_v4[i].if_out & > > > > > enabled_port_mask) > > > > By looking into this check, it seems restriction to maximum 31 port > > > > ID while parsing rule file becomes more valid as this check can pass > > > > due to overflow in case value of route_base_v4[i].if_out Is 31+. > > > > > > Agree, I think we need both, and it probably need to be in setup_lpm(). > > > Something like: > > > > > > if (route_base_v4[i].if_out >= sizeof(enabled_port_mask) * CHAR_BIT || > > >((1 << route_base_v4[i].if_out & enabled_port_mask) == 0) { > > > /* print some error message here*/ > > > rte_exiit(...); /* or return an error */ } > > > > > Yes, I can change it to this. > > I re-checked the code, IMO we should restrict the rules in " > read_config_files" > May be we can move this check to read_config_files. > As having this check in the setup can result in rte_exit() call when no user > rule file > Is given and application is using the default rules. In that case > route_base_v4 will > Have 16 rules for 16 ports (default rules). > So this check will fails always unless user enable all the 16 ports with -p > option. Ah yes, you are right. That's why probably right now we probably just do 'continue;' here... Yeh, probably the easiest way is to put this check before setup_lpm() - in parsing code, or straight after that. Can I ask you for one more thing: can we add a new function that would do this check and use it everywhere (lpm/em/acl). > > > > > > > > > > > Another question here - why we just silently skip the rule with > > > > > invalid port? > > > > In read_config_files_lpm() we are calling the rte_exit in case port ID > > > > is 31+. > > > > In setup_lpm, skipping the rules for the ports which are not enabled > > > > and not giving error, I guess probably because of ease of use. > > > > e.g. user has only single ipv4_routes config file with route rules > > > > for port ID 0,1,2,3,4 and want to use same file for multiple test > > > > cases like 1. 
when only port 0 enabled 2. when only port 0 and 1 > > > > enabled and so on. > > > > In this case, user can avoid to have separate route files for each of > > > > the test > > case. > > > > > > The problem as I see it - we are not consistent here. > > > In some cases we just silently skip rules with invalid (or disabled) > > > port numbers, in other cases we generate an error and exit. > > > For me it would be better, if we follow one simple policy (abort with > > > error) here for all cases. > > Ok, I will add the rte_exit if route port is invalid or not enabled. > > With this change onwards It will be assumed user will add only those routes > > With > > port IDs which are valid and enabled in the application. > > > > > > > > > > > > > > Probably need to fail with error... that what ACL code-path does. > > > > > > > > > > > Fixes: e7e6dd643092 ("examples/l3fwd: support config file for > > > > > > EM") > > > > > > Fixes: 52def963fc1c ("examples/l3fwd: support config file for > > > > > > LPM/FIB") > > > > > > Cc: sean.morris...@intel.com > > > > > > Cc: sta...@dpdk.org > > > > > > > > > > > > Signed-off-by: Gagandeep Singh > > > > > > --- > > > > > > examples/l3fwd/em_route_parse.c | 6 -- > > > > > > examples/l3fwd/lpm_route_parse.c | 6 -- > > > > >
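A possible shape for the shared helper suggested above - just a sketch combining the two conditions already quoted in this thread, using the existing enabled_port_mask global from l3fwd.h; the name and error handling are arbitrary:

#include <limits.h>
#include <stdint.h>
#include "l3fwd.h"	/* for enabled_port_mask */

/* returns 0 if the route's output port fits into enabled_port_mask and is
 * actually enabled, -1 otherwise (caller decides whether to skip or exit) */
static int
check_route_port(uint32_t if_out)
{
	if (if_out >= sizeof(enabled_port_mask) * CHAR_BIT)
		return -1; /* would overflow the '1 << if_out' shift */
	if ((((uint32_t)1 << if_out) & enabled_port_mask) == 0)
		return -1; /* port not enabled via -p */
	return 0;
}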
RE: [PATCH 3/3] examples/l3fwd: fix maximum acceptable port ID in routes
> > > > > > > Application is accepting routes for port ID up to UINT8_MAX for > > > > > > > LPM amd EM routes on parsing the given rule file, but only up to > > > > > > > 32 ports can be enabled as per the variable enabled_port_mask > > > > > > > which is defined as uint32_t. > > > > > > > > > > > > > > This patch restricts the rules parsing code to accept routes for > > > > > > > port ID up to 31 only to avoid any unnecessary maintenance of > > > > > > > rules which will never be used. > > > > > > > > > > > > If we want to add this extra check, probably better to do it in > > > > > > setup_lpm(). > > > > > > Where we already check that port is enabled, and If not, then this > > > > > > route rule will be skipped: > > > > > > > > > > > > /* populate the LPM table */ > > > > > > for (i = 0; i < route_num_v4; i++) { > > > > > > struct in_addr in; > > > > > > > > > > > > /* skip unused ports */ > > > > > > if ((1 << route_base_v4[i].if_out & > > > > > > enabled_port_mask) == 0) > > > > > > continue; > > > > > > > > > > > > Same for EM. > > > > > I am trying to update the check for MAX if_out value in rules config > > > > > file parsing > > > > which will be before setup_lpm(). > > > > > The reason is, restricting and adding only those rules which can be > > > > > used by the application while populating the route_base_v4/v6 at > > > > > first step and avoid unnecessary memory allocation for local > > > > > variables to store more > > > > not required rules. > > > > > > > > Hmm... but why it is a problem? > > > Not really a problem, Just trying to optimize wherever it Is possible. > > > > > > > > > > > > > > > > > > ((1 << route_base_v4[i].if_out & > > > > > > enabled_port_mask) > > > > > By looking into this check, it seems restriction to maximum 31 port > > > > > ID while parsing rule file becomes more valid as this check can pass > > > > > due to overflow in case value of route_base_v4[i].if_out Is 31+. > > > > > > > > Agree, I think we need both, and it probably need to be in setup_lpm(). > > > > Something like: > > > > > > > > if (route_base_v4[i].if_out >= sizeof(enabled_port_mask) * CHAR_BIT || > > > >((1 << route_base_v4[i].if_out & enabled_port_mask) == 0) { > > > > /* print some error message here*/ > > > > rte_exiit(...); /* or return an error */ } > > > > > > > Yes, I can change it to this. > > > > I re-checked the code, IMO we should restrict the rules in " > > read_config_files" > > May be we can move this check to read_config_files. > > As having this check in the setup can result in rte_exit() call when no > > user rule file > > Is given and application is using the default rules. In that case > > route_base_v4 will > > Have 16 rules for 16 ports (default rules). > > So this check will fails always unless user enable all the 16 ports with -p > > option. > > Ah yes, you are right. > That's why probably right now we probably just do 'continue;' here... > Yeh, probably the easiest way is to put this check before setup_lpm() - > in parsing code, or straight after that. > Can I ask you for one more thing: can we add a new function that would > do this check and use it everywhere (lpm/em/acl). As alternative thought - we might add to setup_lpm() an extra parameter to indicate what do we want to do on rule with invalid/disabled port - just skip it or fail. Another alternative - remove default route ability at all, though that one is a change in behavior and probably there would be some complaints. 
> > > > > > > > > > > > > > > Another question here - why we just silently skip the rule with > > > > > > invalid port? > > > > > In read_config_files_lpm() we are calling the rte_exit in case port > > > > > ID is 31+. > > > > > In setup_lpm, skipping the rules for the ports which are not enabled > > > > > and not giving error, I guess probably because of ease of use. > > > > > e.g. user has only single ipv4_routes config file with route rules > > > > > for port ID 0,1,2,3,4 and want to use same file for multiple test > > > > > cases like 1. when only port 0 enabled 2. when only port 0 and 1 > > > > > enabled and so on. > > > > > In this case, user can avoid to have separate route files for each of > > > > > the test > > > case. > > > > > > > > The problem as I see it - we are not consistent here. > > > > In some cases we just silently skip rules with invalid (or disabled) > > > > port numbers, in other cases we generate an error and exit. > > > > For me it would be better, if we follow one simple policy (abort with > > > > error) here for all cases. > > > Ok, I will add the rte_exit if route port is invalid or not enabled. > > > With this change onwards It will be assumed user will add only those > > > routes With > > > port IDs which are valid and enabled in the application. > > > > > > > > > > > > > > > > > > Probably need
RE: [PATCH v2] doc: announce rte_ipsec API changes
> > > In case of event mode operations where event device can help in atomic > > > sequence number increment across cores, sequence number need to be > > > provided by the application instead of being updated in rte_ipsec or > > > the PMD. To support this, a new flag > > > ``RTE_IPSEC_SAFLAG_SQN_ASSIGN_DISABLE`` > > > will be added to disable sequence number update inside IPsec library > > > and the API rte_ipsec_pkt_crypto_prepare will be extended to include > > > ``sqn`` as an additional parameter to specify sequence number to be > > > used for IPsec from the application. > > > > Could you probably elaborate a bit more: > > Why such change is necessary for event-dev mode, what exactly will be > > affected in librte_ipsec (would it be for outbound mode, or both), etc. > > > > [Aakash] When using eventdev, it is possible to have multiple cores process > packets from the same flow at the same time, but still > have ordering maintained. > > Sequence for IPsec would be like below, > 1. Ethdev Rx computes flow hash and submits packets to an ORDERED eventdev > queue. > One flow would always hit one event dev queue. > One eventdev queue can be attached to multiple eventdev ports. > 2. Lcores receives packets via these eventdev ports. > Lcores can now process the packets from the same flow in parallel. > 3. Lcores submit the packets to an ATOMIC queue > This is needed as IPsec seq no update needs to be done atomically. > 4. After seq no update, packets are moved to ORDERED queue. > Lcores can now processes the packets in parallel again. > 5. During Tx, eventdev ensures packet ordering based on ORDERED queue. > > Since lib IPsec takes care of sequence number assignment, complete > rte_ipsec_pkt_crypto_prepare() routine need to be made as > ATOMIC stage. > But apart from seq no update, rest of the operations can be done in parallel. Thanks for explanation. Basically you are seeking ability to split rte_ipsec_pkt_crypto_prepare() for outbound into two stages: 1. update sqn 2. all other preps To be able to do step #2 in parallel, correct? My thought always was that step #2 is not that expensive in terms of performance, and there probably not much point to make it parallel. But I suppose you measured step#2 overhead on your platform and concluded that it worth it... One concern I have with the way you suggested - now we need to store/update sa.sqn by some external entity. Another thing - don't really want to pollute crypto_prepare() API with new parameters which meaning is a bit obscure and depends on other API calls... Wouldn't it be easier and probably more straightforward to just introduce 2 new functions here that would represent step #1 and step #2? Then we can keep crypto_prepare() intact, and user will have a choice: - either use original crypto_prepare() - nothing needs to be changed - or instead call these new functions on his own, if he wants to. > In addition, we are also looking at another use case when a set of packets > from a session can be IPsec processed by rte_security > device and some packets from the same session would need to be SW processed > with lib IPsec. Here again the sequence number > assignment would need to occur at central place so that sequence number is > not repeated. Interesting, and how SW/HW SQN will be synchronized in that case? > Initially we are looking at outbound only. But similar kind of use case would > be applicable for inbound also. 
> > > > > > > Signed-off-by: Aakash Sasidharan > > > --- > > > doc/guides/rel_notes/deprecation.rst | 7 +++ > > > 1 file changed, 7 insertions(+) > > > > > > diff --git a/doc/guides/rel_notes/deprecation.rst > > > b/doc/guides/rel_notes/deprecation.rst > > > index 6948641ff6..bc1d93cca7 100644 > > > --- a/doc/guides/rel_notes/deprecation.rst > > > +++ b/doc/guides/rel_notes/deprecation.rst > > > @@ -133,6 +133,13 @@ Deprecation Notices > > >Since these functions are not called directly by the application, > > >the API remains unaffected. > > > > > > +* ipsec: The rte_ipsec library is updated to support sequence number > > > +provided > > > + by application. A new flag ``RTE_IPSEC_SAFLAG_SQN_ASSIGN_DISABLE`` > > > +is introduced > > > + to disable sequence number assignment in lib IPsec. > > > + The API rte_ipsec_pkt_crypto_prepare is extended to include ``sqn`` > > > +as an > > > + additional parameter allowing application to specify the sequence > > > +number to be > > > + used for the IPsec operation. > > > + > > > * pipeline: The pipeline library legacy API (functions rte_pipeline_*) > > >will be deprecated and subsequently removed in DPDK 24.11 release. > > >Before this, the new pipeline library API (functions > > > rte_swx_pipeline_*) > > > -- > > > 2.25.1 >
RE: [PATCH v2] doc: announce rte_ipsec API changes
> > > > > In case of event mode operations where event device can help in > > > > > atomic sequence number increment across cores, sequence number > > > > > need to be provided by the application instead of being updated in > > > > > rte_ipsec or the PMD. To support this, a new flag > > > > > ``RTE_IPSEC_SAFLAG_SQN_ASSIGN_DISABLE`` > > > > > will be added to disable sequence number update inside IPsec > > > > > library and the API rte_ipsec_pkt_crypto_prepare will be extended > > > > > to include ``sqn`` as an additional parameter to specify sequence > > > > > number to be used for IPsec from the application. > > > > > > > > Could you probably elaborate a bit more: > > > > Why such change is necessary for event-dev mode, what exactly will > > > > be affected in librte_ipsec (would it be for outbound mode, or both), > > > > etc. > > > > > > > > > > [Aakash] When using eventdev, it is possible to have multiple cores > > > process packets from the same flow at the same time, but still have > > > ordering > > maintained. > > > > > > Sequence for IPsec would be like below, 1. Ethdev Rx computes flow > > > hash and submits packets to an ORDERED eventdev queue. > > > One flow would always hit one event dev queue. > > > One eventdev queue can be attached to multiple eventdev ports. > > > 2. Lcores receives packets via these eventdev ports. > > > Lcores can now process the packets from the same flow in parallel. > > > 3. Lcores submit the packets to an ATOMIC queue > > > This is needed as IPsec seq no update needs to be done atomically. > > > 4. After seq no update, packets are moved to ORDERED queue. > > > Lcores can now processes the packets in parallel again. > > > 5. During Tx, eventdev ensures packet ordering based on ORDERED queue. > > > > > > Since lib IPsec takes care of sequence number assignment, complete > > > rte_ipsec_pkt_crypto_prepare() routine need to be made as ATOMIC stage. > > > But apart from seq no update, rest of the operations can be done in > > > parallel. > > > > Thanks for explanation. > > Basically you are seeking ability to split rte_ipsec_pkt_crypto_prepare() > > for > > outbound into two stages: > > 1. update sqn > > 2. all other preps > > To be able to do step #2 in parallel, correct? > > My thought always was that step #2 is not that expensive in terms of > > performance, and there probably not much point to make it parallel. > > But I suppose you measured step#2 overhead on your platform and > > concluded that it worth it... > > > > One concern I have with the way you suggested - now we need to > > store/update sa.sqn by some external entity. > > Another thing - don't really want to pollute crypto_prepare() API with new > > parameters which meaning is a bit obscure and depends on other API calls... > > > > Wouldn't it be easier and probably more straightforward to just introduce 2 > > new functions here that would represent step #1 and step #2? > > Then we can keep crypto_prepare() intact, and user will have a choice: > > - either use original crypto_prepare() - nothing needs to be changed > > - or instead call these new functions on his own, if he wants to. > > > > [Aakash] As I understand, your suggestion is to introduce a set of two new > APIs by splitting the current logic in crypto_prepare(). This > should be okay. > For this, I believe we would need change in the structure > rte_ipsec_sa_pkt_func to hold the function pointers for the new APIs. Yes, that was my thought. 
> > Assuming that, introduction of the new flag > RTE_IPSEC_SAFLAG_SQN_ASSIGN_DISABLE to disable seq no assignment in lib IPsec > is > fine, shall I send v3 announcing changes in ``struct rte_ipsec_sa_pkt_func``? I am definitely not against this new flag, but if we'll have 2 new functions instead, do you still need it? > > > In addition, we are also looking at another use case when a set of > > > packets from a session can be IPsec processed by rte_security device > > > and some packets from the same session would need to be SW processed > > with lib IPsec. Here again the sequence number assignment would need to > > occur at central place so that sequence number is not repeated. > > > > Interesting, and how SW/HW SQN will be synchronized in that case? > > > > [Aakash] The design is such that HW would assign sequence number for all > cases. HW would then pass this data as a metadata to SW > so that it can do SW processing with the assigned sequence number. As I can see there are two options to fulfill that requirement: 1. Introduce a new function that would update sa.sqn value. Something like rte_ipsec_sa_update_sqn(...). So when metadata from HW arrives, SW can call it and sync sa.sqn with new HW value, and then continue with conventional rte_ipsec_crypto_prepare(...); 2. Introduce new (extended) variants of ipsec_crypto_prepare/process that would take SQN (might be something else ?) as extra parameter, something like: rte_ipcec_xprepare(const struct rte_ipsec_sess
[PATCH] examples/l3fwd: fix read beyond array boundaries
From: Konstantin Ananyev ASAN report: ERROR: AddressSanitizer: unknown-crash on address 0x7ef92e32 at pc 0x0053d1e9 bp 0x7ef92c00 sp 0x7ef92bf8 READ of size 16 at 0x7ef92e32 thread T0 #0 0x53d1e8 in _mm_loadu_si128 /usr/lib64/gcc/x86_64-suse-linux/11/include/emmintrin.h:703 #1 0x53d1e8 in send_packets_multi ../examples/l3fwd/l3fwd_sse.h:125 #2 0x53d1e8 in acl_send_packets ../examples/l3fwd/l3fwd_acl.c:1048 #3 0x53ec18 in acl_main_loop ../examples/l3fwd/l3fwd_acl.c:1127 #4 0x12151eb in rte_eal_mp_remote_launch ../lib/eal/common/eal_common_launch.c:83 #5 0x5bf2df in main ../examples/l3fwd/main.c:1647 #6 0x7f6d42a0d2bc in __libc_start_main (/lib64/libc.so.6+0x352bc) #7 0x527499 in _start (/home/kananyev/dpdk-l3fwd-acl/x86_64-native-linuxapp-gcc-dbg-b1/examples/dpdk-l3fwd+0x527499) Reason for that is that send_packets_multi() uses 16B loads to access input dst_port[]and might read beyond array boundaries. Right now, it doesn't cause any real issue - junk values are ignored, also inside l3fwd we always allocate dst_port[] array on the stack, so memory beyond it is always available. Anyway, it probably need to be fixed. The patch below simply allocates extra space for dst_port[], so send_packets_multi() will never read beyond its boundaries. Probably a better fix would be to change send_packets_multi() itself to avoid access beyond 'nb_rx' entries. Bugzilla ID: 1502 Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match") Cc: sta...@dpdk.org Signed-off-by: Konstantin Ananyev --- examples/l3fwd/l3fwd_acl.c | 2 +- examples/l3fwd/l3fwd_altivec.h | 6 +- examples/l3fwd/l3fwd_common.h| 7 +++ examples/l3fwd/l3fwd_em_hlm.h| 2 +- examples/l3fwd/l3fwd_em_sequential.h | 2 +- examples/l3fwd/l3fwd_fib.c | 2 +- examples/l3fwd/l3fwd_lpm_altivec.h | 2 +- examples/l3fwd/l3fwd_lpm_neon.h | 2 +- examples/l3fwd/l3fwd_lpm_sse.h | 2 +- examples/l3fwd/l3fwd_neon.h | 6 +- examples/l3fwd/l3fwd_sse.h | 6 +- 11 files changed, 29 insertions(+), 10 deletions(-) diff --git a/examples/l3fwd/l3fwd_acl.c b/examples/l3fwd/l3fwd_acl.c index b635011ef7..baa01e6dde 100644 --- a/examples/l3fwd/l3fwd_acl.c +++ b/examples/l3fwd/l3fwd_acl.c @@ -1056,7 +1056,7 @@ int acl_main_loop(__rte_unused void *dummy) { struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; - uint16_t hops[MAX_PKT_BURST]; + uint16_t hops[SENDM_PORT_OVERHEAD(MAX_PKT_BURST)]; unsigned int lcore_id; uint64_t prev_tsc, diff_tsc, cur_tsc; int i, nb_rx; diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h index e45e138e59..b91a6b5587 100644 --- a/examples/l3fwd/l3fwd_altivec.h +++ b/examples/l3fwd/l3fwd_altivec.h @@ -11,6 +11,9 @@ #include "altivec/port_group.h" #include "l3fwd_common.h" +#undef SENDM_PORT_OVERHEAD +#define SENDM_PORT_OVERHEAD(x) ((x) + 2 * FWDSTEP) + /* * Update source and destination MAC addresses in the ethernet header. * Perform RFC1812 checks and updates for IPV4 packets. @@ -117,7 +120,8 @@ process_packet(struct rte_mbuf *pkt, uint16_t *dst_port) */ static __rte_always_inline void send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst, - uint16_t dst_port[MAX_PKT_BURST], int nb_rx) + uint16_t dst_port[SENDM_PORT_OVERHEAD(MAX_PKT_BURST)], + int nb_rx) { int32_t k; int j = 0; diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h index 224b1c08e8..d94e5f1357 100644 --- a/examples/l3fwd/l3fwd_common.h +++ b/examples/l3fwd/l3fwd_common.h @@ -18,6 +18,13 @@ /* Minimum value of IPV4 total length (20B) in network byte order. 
*/ #defineIPV4_MIN_LEN_BE (sizeof(struct rte_ipv4_hdr) << 8) +/* + * send_packet_multi() specific number of dest ports + * due to implementation we need to allocate array bigger then + * actual max number of elements in the array. + */ +#define SENDM_PORT_OVERHEAD(x) (x) + /* * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2: * - The IP version number must be 4. diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h index 31cda9ddc1..c1d819997a 100644 --- a/examples/l3fwd/l3fwd_em_hlm.h +++ b/examples/l3fwd/l3fwd_em_hlm.h @@ -249,7 +249,7 @@ static inline void l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid, struct lcore_conf *qconf) { - uint16_t dst_port[MAX_PKT_BURST]; + uint16_t dst_port[SENDM_PORT_OVERHEAD(MAX_PKT_BURST)]; l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0); send_packets_multi(qconf, pkts_burst, dst_port, nb_rx); diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h index 067f23889a..3a40b2e434 100644 --- a/examples/l3fwd/l3fwd_em_sequential.h +++ b/
RE: [PATCH v3] doc: announce rte_ipsec API changes
> In case of event mode operations where event device can help in atomic > sequence number increment across cores, sequence number need to be > provided by the application instead of being updated in rte_ipsec or the > PMD. > > To support this, two new APIs rte_ipsec_pkt_crypto_sqn_assign and > rte_ipsec_pkt_crypto_xprepare are introduced decoupling the seq no update > functionality from the existing rte_ipsec_pkt_crypto_prepare API. > Additionally, a new flag ``RTE_IPSEC_SAFLAG_SQN_ASSIGN_DISABLE`` will be > added to allow disabling of internal sequence number update inside IPsec > library. > > Signed-off-by: Aakash Sasidharan > --- > doc/guides/rel_notes/deprecation.rst | 8 > 1 file changed, 8 insertions(+) > > diff --git a/doc/guides/rel_notes/deprecation.rst > b/doc/guides/rel_notes/deprecation.rst > index 6948641ff6..9be7b90b94 100644 > --- a/doc/guides/rel_notes/deprecation.rst > +++ b/doc/guides/rel_notes/deprecation.rst > @@ -133,6 +133,14 @@ Deprecation Notices >Since these functions are not called directly by the application, >the API remains unaffected. > > +* ipsec: The rte_ipsec library is updated to support sequence number provided > + by application. To allow the same, two new APIs are being introduced: > + rte_ipsec_pkt_crypto_sqn_assign and rte_ipsec_pkt_crypto_xprepare. These > APIs > + separate the seq no update functionality from the existing > rte_ipsec_pkt_crypto_prepare > + API. Corresponding configure structure changes are being made for the new > APIs. > + Additionally a new flag ``RTE_IPSEC_SAFLAG_SQN_ASSIGN_DISABLE`` is > + introduced to disable sequence number assignment in lib IPsec. > + > * pipeline: The pipeline library legacy API (functions rte_pipeline_*) >will be deprecated and subsequently removed in DPDK 24.11 release. >Before this, the new pipeline library API (functions rte_swx_pipeline_*) > -- Acked-by: Konstantin Ananyev > 2.25.1
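For illustration, a rough outline of how the split could look from the application side in event mode - the function names come from the notice above, but the prototypes shown here are assumptions:

#include <rte_ipsec.h>

/* Sketch only: the ATOMIC eventdev stage serializes just the SQN assignment,
 * the rest of the prepare work runs on an ORDERED stage in parallel. */
static inline uint16_t
event_mode_outb_prepare(struct rte_ipsec_session *ss, struct rte_mbuf *mb[],
		struct rte_crypto_op *cop[], uint32_t sqn[], uint16_t num)
{
	/* ATOMIC stage: reserve sequence numbers for these packets
	 * (assumed signature) */
	rte_ipsec_pkt_crypto_sqn_assign(ss, mb, sqn, num);

	/* ORDERED stage: remaining crypto_prepare work, consuming the
	 * pre-assigned SQNs (assumed signature: crypto_prepare() plus sqn[]) */
	return rte_ipsec_pkt_crypto_xprepare(ss, mb, cop, sqn, num);
}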
[PATCH v2 0/2] examples/l3fwd fixes for ACL mode
From: Konstantin Ananyev As Song Jiale pointed out, the previous fix is not enough to fix the problem he is observing with l3fwd in ACL mode: https://bugs.dpdk.org/show_bug.cgi?id=1502 This is a second attempt to fix it. Konstantin Ananyev (2): examples/l3fwd: fix read beyond array boundaries examples/l3fwd: fix read beyond array boundaries in ACL mode examples/l3fwd/l3fwd_acl.c | 37 examples/l3fwd/l3fwd_altivec.h | 6 - examples/l3fwd/l3fwd_common.h| 7 ++ examples/l3fwd/l3fwd_em_hlm.h| 2 +- examples/l3fwd/l3fwd_em_sequential.h | 2 +- examples/l3fwd/l3fwd_fib.c | 2 +- examples/l3fwd/l3fwd_lpm_altivec.h | 2 +- examples/l3fwd/l3fwd_lpm_neon.h | 2 +- examples/l3fwd/l3fwd_lpm_sse.h | 2 +- examples/l3fwd/l3fwd_neon.h | 6 - examples/l3fwd/l3fwd_sse.h | 6 - 11 files changed, 55 insertions(+), 19 deletions(-) -- 2.35.3
[PATCH v2 1/2] examples/l3fwd: fix read beyond array boundaries
From: Konstantin Ananyev ASAN report: ERROR: AddressSanitizer: unknown-crash on address 0x7ef92e32 at pc 0x0053d1e9 bp 0x7ef92c00 sp 0x7ef92bf8 READ of size 16 at 0x7ef92e32 thread T0 #0 0x53d1e8 in _mm_loadu_si128 /usr/lib64/gcc/x86_64-suse-linux/11/include/emmintrin.h:703 #1 0x53d1e8 in send_packets_multi ../examples/l3fwd/l3fwd_sse.h:125 #2 0x53d1e8 in acl_send_packets ../examples/l3fwd/l3fwd_acl.c:1048 #3 0x53ec18 in acl_main_loop ../examples/l3fwd/l3fwd_acl.c:1127 #4 0x12151eb in rte_eal_mp_remote_launch ../lib/eal/common/eal_common_launch.c:83 #5 0x5bf2df in main ../examples/l3fwd/main.c:1647 #6 0x7f6d42a0d2bc in __libc_start_main (/lib64/libc.so.6+0x352bc) #7 0x527499 in _start (/home/kananyev/dpdk-l3fwd-acl/x86_64-native-linuxapp-gcc-dbg-b1/examples/dpdk-l3fwd+0x527499) Reason for that is that send_packets_multi() uses 16B loads to access input dst_port[]and might read beyond array boundaries. Right now, it doesn't cause any real issue - junk values are ignored, also inside l3fwd we always allocate dst_port[] array on the stack, so memory beyond it is always available. Anyway, it probably need to be fixed. The patch below simply allocates extra space for dst_port[], so send_packets_multi() will never read beyond its boundaries. Probably a better fix would be to change send_packets_multi() itself to avoid access beyond 'nb_rx' entries. Bugzilla ID: 1502 Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match") Cc: sta...@dpdk.org Signed-off-by: Konstantin Ananyev --- examples/l3fwd/l3fwd_acl.c | 2 +- examples/l3fwd/l3fwd_altivec.h | 6 +- examples/l3fwd/l3fwd_common.h| 7 +++ examples/l3fwd/l3fwd_em_hlm.h| 2 +- examples/l3fwd/l3fwd_em_sequential.h | 2 +- examples/l3fwd/l3fwd_fib.c | 2 +- examples/l3fwd/l3fwd_lpm_altivec.h | 2 +- examples/l3fwd/l3fwd_lpm_neon.h | 2 +- examples/l3fwd/l3fwd_lpm_sse.h | 2 +- examples/l3fwd/l3fwd_neon.h | 6 +- examples/l3fwd/l3fwd_sse.h | 6 +- 11 files changed, 29 insertions(+), 10 deletions(-) diff --git a/examples/l3fwd/l3fwd_acl.c b/examples/l3fwd/l3fwd_acl.c index b635011ef7..baa01e6dde 100644 --- a/examples/l3fwd/l3fwd_acl.c +++ b/examples/l3fwd/l3fwd_acl.c @@ -1056,7 +1056,7 @@ int acl_main_loop(__rte_unused void *dummy) { struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; - uint16_t hops[MAX_PKT_BURST]; + uint16_t hops[SENDM_PORT_OVERHEAD(MAX_PKT_BURST)]; unsigned int lcore_id; uint64_t prev_tsc, diff_tsc, cur_tsc; int i, nb_rx; diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h index e45e138e59..b91a6b5587 100644 --- a/examples/l3fwd/l3fwd_altivec.h +++ b/examples/l3fwd/l3fwd_altivec.h @@ -11,6 +11,9 @@ #include "altivec/port_group.h" #include "l3fwd_common.h" +#undef SENDM_PORT_OVERHEAD +#define SENDM_PORT_OVERHEAD(x) ((x) + 2 * FWDSTEP) + /* * Update source and destination MAC addresses in the ethernet header. * Perform RFC1812 checks and updates for IPV4 packets. @@ -117,7 +120,8 @@ process_packet(struct rte_mbuf *pkt, uint16_t *dst_port) */ static __rte_always_inline void send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst, - uint16_t dst_port[MAX_PKT_BURST], int nb_rx) + uint16_t dst_port[SENDM_PORT_OVERHEAD(MAX_PKT_BURST)], + int nb_rx) { int32_t k; int j = 0; diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h index 224b1c08e8..d94e5f1357 100644 --- a/examples/l3fwd/l3fwd_common.h +++ b/examples/l3fwd/l3fwd_common.h @@ -18,6 +18,13 @@ /* Minimum value of IPV4 total length (20B) in network byte order. 
*/ #defineIPV4_MIN_LEN_BE (sizeof(struct rte_ipv4_hdr) << 8) +/* + * send_packet_multi() specific number of dest ports + * due to implementation we need to allocate array bigger then + * actual max number of elements in the array. + */ +#define SENDM_PORT_OVERHEAD(x) (x) + /* * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2: * - The IP version number must be 4. diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h index 31cda9ddc1..c1d819997a 100644 --- a/examples/l3fwd/l3fwd_em_hlm.h +++ b/examples/l3fwd/l3fwd_em_hlm.h @@ -249,7 +249,7 @@ static inline void l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid, struct lcore_conf *qconf) { - uint16_t dst_port[MAX_PKT_BURST]; + uint16_t dst_port[SENDM_PORT_OVERHEAD(MAX_PKT_BURST)]; l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0); send_packets_multi(qconf, pkts_burst, dst_port, nb_rx); diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h index 067f23889a..3a40b2e434 100644 --- a/examples/l3fwd/l3fwd_em_sequential.h +++ b/
[PATCH v2 2/2] examples/l3fwd: fix read beyond array boundaries in ACL mode
From: Konstantin Ananyev With commit: ACL mode can now use send_packets_multi(). What I missed with that change: send_packets_multi() can't deal properly with input dst_port[i] == BAD_PORT (though it can set it itself), as it uses dst_port[i] values to read L2 addresses for the port and assumes dst_port[] to contain only valid values. To fix that, just add a check that all dst_port[] entries are valid before calling send_packets_multi(). Otherwise use send_packets_single(). An alternative, and probably more logical approach would be to re-arrange send_packets_multi() so that it updates L2 packet headers at the very last stage - when dst_port[] values are finalized. But that would affect all other modes and would require many more code changes and testing. Bugzilla ID: 1502 Fixes: aa7c6077c19b ("examples/l3fwd: avoid packets reorder in ACL mode") Reported-by: Song Jiale Signed-off-by: Konstantin Ananyev --- examples/l3fwd/l3fwd_acl.c | 35 ++- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/examples/l3fwd/l3fwd_acl.c b/examples/l3fwd/l3fwd_acl.c index baa01e6dde..45013ac06e 100644 --- a/examples/l3fwd/l3fwd_acl.c +++ b/examples/l3fwd/l3fwd_acl.c @@ -993,11 +993,15 @@ dump_denied_pkt(const struct rte_mbuf *pkt, uint32_t res) #endif } -static inline void +/* + * run packets through ACL classify. + * returns number of packets to be dropped (hops[i] == BAD_PORT) + */ +static inline uint32_t acl_process_pkts(struct rte_mbuf *pkts[MAX_PKT_BURST], uint16_t hops[MAX_PKT_BURST], uint32_t num, int32_t socketid) { - uint32_t i, n4, n6, res; + uint32_t i, k, n4, n6, res; struct acl_search_t acl_search; /* split packets burst depending on packet type (IPv4/IPv6) */ @@ -1020,6 +1024,7 @@ acl_process_pkts(struct rte_mbuf *pkts[MAX_PKT_BURST], /* combine lookup results back, into one array of next hops */ n4 = 0; n6 = 0; + k = 0; for (i = 0; i != num; i++) { switch (acl_search.types[i]) { case TYPE_IPV4: @@ -1034,21 +1039,33 @@ acl_process_pkts(struct rte_mbuf *pkts[MAX_PKT_BURST], if (likely((res & ACL_DENY_SIGNATURE) == 0 && res != 0)) hops[i] = res - FWD_PORT_SHIFT; else { + /* bad or denied by ACL rule packets */ hops[i] = BAD_PORT; dump_denied_pkt(pkts[i], res); + k++; } } + + return k; } +/* + * send_packets_multi() can't deal properly with hops[i] == BAD_PORT + * (it assumes input hops[] contain only valid port numbers), + * so it is ok to use it only when there are no denied packets. + */ static inline void acl_send_packets(struct lcore_conf *qconf, struct rte_mbuf *pkts[], - uint16_t hops[], uint32_t num) + uint16_t hops[], uint32_t num, uint32_t nb_drop) { #if defined ACL_SEND_MULTI - send_packets_multi(qconf, pkts, hops, num); + if (nb_drop == 0) + send_packets_multi(qconf, pkts, hops, num); + else #else - send_packets_single(qconf, pkts, hops, num); + RTE_SET_USED(nb_drop); #endif + send_packets_single(qconf, pkts, hops, num); } /* main processing loop */ @@ -1059,7 +1076,7 @@ acl_main_loop(__rte_unused void *dummy) uint16_t hops[SENDM_PORT_OVERHEAD(MAX_PKT_BURST)]; unsigned int lcore_id; uint64_t prev_tsc, diff_tsc, cur_tsc; - int i, nb_rx; + int i, nb_drop, nb_rx; uint16_t portid; uint16_t queueid; struct lcore_conf *qconf; @@ -1122,10 +1139,10 @@ acl_main_loop(__rte_unused void *dummy) pkts_burst, MAX_PKT_BURST); if (nb_rx > 0) { - acl_process_pkts(pkts_burst, hops, nb_rx, - socketid); + nb_drop = acl_process_pkts(pkts_burst, hops, + nb_rx, socketid); acl_send_packets(qconf, pkts_burst, hops, - nb_rx); + nb_rx, nb_drop); } } } -- 2.35.3
Minutes of Technical Board Meeting, 2024-July-24
Minutes of Technical Board Meeting, 2024-July-24 Members Attending - -Aaron -Bruce -Hemant -Jerin -Konstantin -Maxime -Stephen -Thomas NOTE: The technical board meets every second Wednesday at: https://zoom-lfx.platform.linuxfoundation.org/meeting/96459488340?password=d808f1f6-0a28-4165-929e-5a5bcae7efeb at 3 pm UTC. Meetings are public, and DPDK community members are welcome to attend. NOTE: Next meeting will be on Wednesday 2024-Aug-7th @3pm UTC, and will be chaired by Maxime # Approved introducing git tree next-dts with Juraj Linkes (juraj.lin...@pantheon.tech) as a maintainer for it. # DPDK Summit (Montreal) CFP timeline update: - CFP Close: Wednesday, July 31 at 11:59 pm EDT (UTC-4) - CFP Notifications: Tuesday, August 6 - Schedule to be announced: Wednesday, August 7 # DPDK Summit APAC (Bangkok) - videos are available on the DPDK YouTube channel: https://www.youtube.com/channel/UCNdmaRMwOBu0H8a7DdXAT9A # Pending Process on Robin's Grout Router (https://github.com/rjarry/grout/tree/main) as DPDK hosted project - Previous Approval: Hosting Grout on dpdk.org was approved in a previous TB meeting. - Next Steps: 1) The final approval process requires GB approval. 2) As the next GB representative from TB, Konstantin Ananyev presented this information to GB. 3) GB to vote over email and come back with final approval.
RE: [PATCH] build: output a dependency log in build directory
> As meson processes our DPDK source tree it manages dependencies > specified by each individual driver. To enable future analysis of the > dependency links between components, log the dependencies of each DPDK > component as it gets processed. This could potentially allow other tools > to automatically enable or disable components based on the desired end > components to be built, e.g. if the user requests net/ice, ensure that > common/iavf is also enabled in the drivers. > > The output file produced is in "dot" or "graphviz" format, which allows > producing a graphical representation of the DPDK dependency tree if so > desired. For example: "dot -Tpng -O build/deps.dot" to produce the > image file "build/deps.dot.png". > > Signed-off-by: Bruce Richardson I think it is a great idea. Acked-by: Konstantin Ananyev > --- > app/meson.build| 1 + > buildtools/log-deps.py | 43 ++ > buildtools/meson.build | 2 ++ > drivers/meson.build| 1 + > lib/meson.build| 1 + > 5 files changed, 48 insertions(+) > create mode 100644 buildtools/log-deps.py > > diff --git a/app/meson.build b/app/meson.build > index 5b2c80c7a1..6afa457f4c 100644 > --- a/app/meson.build > +++ b/app/meson.build > @@ -76,6 +76,7 @@ foreach app:apps > > if build > subdir(name) > +run_command([log_deps_cmd, name, deps]) > if not build and require_apps > error('Cannot build explicitly requested app > "@0@".\n'.format(name) >+ '\tReason: ' + reason) > diff --git a/buildtools/log-deps.py b/buildtools/log-deps.py > new file mode 100644 > index 00..a4331fa15b > --- /dev/null > +++ b/buildtools/log-deps.py > @@ -0,0 +1,43 @@ > +#! /usr/bin/env python3 > +# SPDX-License-Identifier: BSD-3-Clause > +# Copyright(c) 2024 Intel Corporation > + > +"""Utility script to build up a list of dependencies from meson.""" > + > +import os > +import sys > + > + > +def file_to_list(filename): > +"""Read file into a list of strings.""" > +with open(filename) as f: > +return f.readlines() > + > + > +def list_to_file(filename, lines): > +"""Write a list of strings out to a file.""" > +with open(filename, 'w') as f: > +f.writelines(lines) > + > + > +depsfile = f'{os.environ["MESON_BUILD_ROOT"]}/deps.dot' > + > +# to reset the deps file on each build, the script is called without any > params > +if len(sys.argv) == 1: > +os.remove(depsfile) > +sys.exit(0) > + > +try: > +contents = file_to_list(depsfile) > +except FileNotFoundError: > +contents = ['digraph {\n', '}\n'] > + > +component = sys.argv[1] > +if len(sys.argv) > 2: > +contents[-1] = f'"{component}" -> {{ "{"\", \"".join(sys.argv[2:])}" > }}\n' > +else: > +contents[-1] = f'"{component}"\n' > + > +contents.append('}\n') > + > +list_to_file(depsfile, contents) > diff --git a/buildtools/meson.build b/buildtools/meson.build > index 3adf34e1a8..332f0f3d38 100644 > --- a/buildtools/meson.build > +++ b/buildtools/meson.build > @@ -24,6 +24,8 @@ get_numa_count_cmd = py3 + files('get-numa-count.py') > get_test_suites_cmd = py3 + files('get-test-suites.py') > has_hugepages_cmd = py3 + files('has-hugepages.py') > cmdline_gen_cmd = py3 + files('dpdk-cmdline-gen.py') > +log_deps_cmd = py3 + files('log-deps.py') > +run_command(log_deps_cmd) # call with no parameters to reset the file > > # install any build tools that end-users might want also > install_data([ > diff --git a/drivers/meson.build b/drivers/meson.build > index 66931d4241..44935e067c 100644 > --- a/drivers/meson.build > +++ b/drivers/meson.build > @@ -154,6 +154,7 @@ foreach subpath:subdirs > if build > # pull in driver directory which should update all the 
local > variables > subdir(drv_path) > +run_command([log_deps_cmd, drv_path.underscorify(), deps]) > > if dpdk_conf.get('RTE_IOVA_IN_MBUF') == 0 and > require_iova_in_mbuf > build = false > diff --git a/lib/meson.build b/lib/meson.build > index 162287753f..da2815465f 100644 > --- a/lib/meson.build > +++ b/lib/meson.build > @@ -160,6 +160,7 @@ foreach l:libraries > > if build > subdir(l) > +run_command([log_deps_cmd, l, deps]) > if not build and require_libs > error('Cannot build explicitly requested lib > "@0@".\n'.format(name) > +'\tReason: ' + reason) > -- > 2.43.0
[PATCH v2 0/2] examples/l3fwd fixes for ACL mode
From: Konstantin Ananyev

As Song Jiale pointed out, the previous fix is not enough to fix the problem he is observing with l3fwd in ACL mode: https://bugs.dpdk.org/show_bug.cgi?id=1502
This is a second attempt to fix it.

Konstantin Ananyev (2):
  examples/l3fwd: fix read beyond array boundaries
  examples/l3fwd: fix read beyond array boundaries in ACL mode

 examples/l3fwd/l3fwd_acl.c | 37
 examples/l3fwd/l3fwd_altivec.h | 6 -
 examples/l3fwd/l3fwd_common.h | 7 ++
 examples/l3fwd/l3fwd_em_hlm.h | 2 +-
 examples/l3fwd/l3fwd_em_sequential.h | 2 +-
 examples/l3fwd/l3fwd_fib.c | 2 +-
 examples/l3fwd/l3fwd_lpm_altivec.h | 2 +-
 examples/l3fwd/l3fwd_lpm_neon.h | 2 +-
 examples/l3fwd/l3fwd_lpm_sse.h | 2 +-
 examples/l3fwd/l3fwd_neon.h | 6 -
 examples/l3fwd/l3fwd_sse.h | 6 -
 11 files changed, 55 insertions(+), 19 deletions(-)

--
2.35.3
[RFC 0/6] Stage-Ordered API and other extensions for ring library
From: Konstantin Ananyev

Konstantin Ananyev (6):
  ring: common functions for 'move head' ops
  ring: make copying functions generic
  ring/soring: introduce Staged Ordered Ring
  app/test: add unit tests for soring API
  examples/l3fwd: make ACL work in pipeline and eventdev modes
  ring: minimize reads of the counterpart cache-line

The main aim of this series is to extend the ring library with a new API that allows the user to create/use a Staged-Ordered-Ring (SORING) abstraction. In addition to that there are a few other patches that serve different purposes:
- The first two patches are just code reordering to de-duplicate and generalize existing rte_ring code.
- The next two patches introduce the SORING API into the ring library and provide UT for it.
- Patch #5 extends the l3fwd sample app to work in pipeline (worker-pool) mode. Right now it is done for demonstration and performance comparison purposes: it makes it possible to run l3fwd in different modes (run-to-completion, eventdev, pipeline) and perform sort-of 'apples-to-apples' performance comparisons. I am aware that the general community consensus on l3fwd is to keep its functionality simple and limited. On the other hand, we already have an eventdev mode for it, so why should pipeline be prohibited? Though if l3fwd is not an option, then we need to select some other existing sample app to integrate with. Probably ipsec-secgw would be the second best choice from my perspective, though it would require much more effort. Have to say that the current l3fwd patch is way too big and unfinished, so if we decide to go forward with it, it has to be split and reworked.
- Patch #6 is an attempt to optimize (by caching the counterpart tail value) enqueue/dequeue operations for vanilla rte_ring. Logically it is not linked with patches 3-5 and probably should be in a separate series. I put it here for now just to minimize 'Depends-on' hassle, so everyone can build/try everything in one go.

Seeking community help/feedback (apart from usual patch review activity):
=========================================================================
- While we tested these changes quite extensively, our platform coverage is limited to x86 right now. So we would appreciate feedback on how it behaves on other architectures DPDK supports (ARM, PPC, etc.). Especially for patch #6: so far we didn't observe a noticeable performance improvement with it on x86_64, so if there is no real gain on other platforms (or scenarios) - I am ok to drop that patch.
- Adding a new (pipeline) mode for the l3fwd sample app. Is it worth it? If not, what other sample app should be used to demonstrate the new functionality we worked on? ipsec-secgw? Something else?

SORING overview
===============
Staged-Ordered-Ring (SORING) provides a SW abstraction for 'ordered' queues with multiple processing 'stages'. It is based on the conventional DPDK rte_ring, re-uses many of its concepts, and even a substantial part of its code. It can be viewed as an 'extension' of rte_ring functionality. In particular, the main SORING properties are:
- circular ring buffer with fixed size objects
- producer, consumer plus multiple processing stages in between.
- allows to split objects processing into multiple stages.
- objects remain in the same ring while moving from one stage to the other, initial order is preserved, no extra copying needed.
- preserves the ingress order of objects within the queue across multiple stages
- each stage (and producer/consumer) can be served by single and/or multiple threads.
- number of stages, size and number of objects in the ring are configurable at ring initialization time.
The data-path API provides four main operations:
- enqueue/dequeue works in the same manner as for conventional rte_ring, all rte_ring synchronization types are supported.
- acquire/release - for each stage there is an acquire (start) and release (finish) operation. After some objects are 'acquired', the given thread can safely assume that it has exclusive ownership of these objects until it invokes 'release' for them. After 'release', objects can be 'acquired' by the next stage and/or dequeued by the consumer (in case of the last stage).

Expected use-case: applications that use a pipeline model (probably with multiple stages) for packet processing, when preserving incoming packet order is important.

The concept of 'ring with stages' is similar to the DPDK OPDL eventdev PMD [1], but the internals are different. In particular, SORING maintains an internal array of 'states' for each element in the ring that is shared by all threads/processes that access the ring. That allows 'release' to avoid excessive waits on the tail value and helps to improve performance and scalability. In terms of performance, with our measurements rte_soring and conventional rte_ring provide near
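To make the acquire/release flow above more concrete, here is a minimal sketch of one processing stage. It follows the API spelling that appears in the RFC's unit tests (rte_soring_acquire()/rte_soring_release()); the exact signatures may still change, and the stage number, burst size and process_one() helper are purely illustrative assumptions.

/*
 * Sketch only: a single SORING processing stage.
 * Function names follow the RFC's unit tests; STAGE_ID, BURST_SIZE and
 * process_one() are hypothetical.
 */
#include <stdint.h>
#include <rte_ring.h>
#include <rte_soring.h>

#define STAGE_ID   0
#define BURST_SIZE 32

static void
process_one(void *obj)
{
	/* application-specific work on one object (e.g. an mbuf) */
	(void)obj;
}

static void
worker_stage_loop(struct rte_soring *sor)
{
	void *objs[BURST_SIZE];
	uint32_t ftoken, n, i;

	for (;;) {
		/* take exclusive ownership of up to BURST_SIZE objects */
		n = rte_soring_acquire(sor, objs, NULL, STAGE_ID, BURST_SIZE,
				RTE_RING_QUEUE_VARIABLE, &ftoken, NULL);
		if (n == 0)
			continue;

		for (i = 0; i != n; i++)
			process_one(objs[i]);

		/* hand the same objects over to the next stage (or the
		 * consumer); ingress order is restored by the library.
		 */
		rte_soring_release(sor, NULL, NULL, STAGE_ID, n, ftoken);
	}
}

No copying of the objects happens here: acquire/release only move per-element state, which is exactly what makes the abstraction attractive for pipeline models.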
[RFC 1/6] ring: common functions for 'move head' ops
From: Konstantin Ananyev Note upfront: that change doesn't introduce any functional or performance changes. It is just a code-reordering for: - code deduplication - ability in future to re-use the same code to introduce new functionality For each sync mode corresponding move_prod_head() and move_cons_head() are nearly identical to each other, the only differences are: - do we need to use a @capacity to calculate number of entries or not. - what we need to update (prod/cons) and what is used as read-only counterpart. So instead of having 2 copies of nearly identical functions, introduce a new common one that could be used by both functions: move_prod_head() and move_cons_head(). As another positive thing - we can get rid of referencing whole rte_ring structure in that new common sub-function. Signed-off-by: Konstantin Ananyev --- lib/ring/rte_ring_c11_pvt.h | 134 +-- lib/ring/rte_ring_elem_pvt.h | 66 +++ lib/ring/rte_ring_generic_pvt.h | 121 lib/ring/rte_ring_hts_elem_pvt.h | 85 ++-- lib/ring/rte_ring_rts_elem_pvt.h | 85 ++-- 5 files changed, 149 insertions(+), 342 deletions(-) diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h index 629b2d9288..048933ddc6 100644 --- a/lib/ring/rte_ring_c11_pvt.h +++ b/lib/ring/rte_ring_c11_pvt.h @@ -28,41 +28,19 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val, rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release); } -/** - * @internal This function updates the producer head for enqueue - * - * @param r - * A pointer to the ring structure - * @param is_sp - * Indicates whether multi-producer path is needed or not - * @param n - * The number of elements we will want to enqueue, i.e. how far should the - * head be moved - * @param behavior - * RTE_RING_QUEUE_FIXED:Enqueue a fixed number of items from a ring - * RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible from ring - * @param old_head - * Returns head value as it was before the move, i.e. where enqueue starts - * @param new_head - * Returns the current/new head value i.e. where enqueue finishes - * @param free_entries - * Returns the amount of free space in the ring BEFORE head was moved - * @return - * Actual number of objects enqueued. - * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only. - */ static __rte_always_inline unsigned int -__rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp, - unsigned int n, enum rte_ring_queue_behavior behavior, - uint32_t *old_head, uint32_t *new_head, - uint32_t *free_entries) +__rte_ring_headtail_move_head(struct rte_ring_headtail *d, + const struct rte_ring_headtail *s, uint32_t capacity, + unsigned int is_st, unsigned int n, + enum rte_ring_queue_behavior behavior, + uint32_t *old_head, uint32_t *new_head, uint32_t *entries) { - const uint32_t capacity = r->capacity; - uint32_t cons_tail; - unsigned int max = n; + uint32_t stail; int success; + unsigned int max = n; - *old_head = rte_atomic_load_explicit(&r->prod.head, rte_memory_order_relaxed); + *old_head = rte_atomic_load_explicit(&d->head, + rte_memory_order_relaxed); do { /* Reset n to the initial burst count */ n = max; @@ -73,112 +51,36 @@ __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp, /* load-acquire synchronize with store-release of ht->tail * in update_tail. 
*/ - cons_tail = rte_atomic_load_explicit(&r->cons.tail, + stail = rte_atomic_load_explicit(&s->tail, rte_memory_order_acquire); /* The subtraction is done between two unsigned 32bits value * (the result is always modulo 32 bits even if we have -* *old_head > cons_tail). So 'free_entries' is always between 0 +* *old_head > s->tail). So 'free_entries' is always between 0 * and capacity (which is < size). */ - *free_entries = (capacity + cons_tail - *old_head); + *entries = (capacity + stail - *old_head); /* check that we have enough room in ring */ - if (unlikely(n > *free_entries)) + if (unlikely(n > *entries)) n = (behavior == RTE_RING_QUEUE_FIXED) ? - 0 : *free_entries; + 0 : *entries; if (n == 0) return 0; *new_head = *old_hea
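For illustration, with such a common helper the two per-direction functions can collapse into thin wrappers along the following lines. This is a sketch based on the description above (assuming the context of the private ring headers), not a verbatim copy of the patch.

/*
 * Sketch only: move_prod_head()/move_cons_head() as thin wrappers around
 * the common __rte_ring_headtail_move_head() helper described above.
 */
static __rte_always_inline unsigned int
__rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
		unsigned int n, enum rte_ring_queue_behavior behavior,
		uint32_t *old_head, uint32_t *new_head, uint32_t *free_entries)
{
	/* producer head moves forward, consumer tail is the read-only side,
	 * and free space is bounded by the ring capacity.
	 */
	return __rte_ring_headtail_move_head(&r->prod, &r->cons, r->capacity,
			is_sp, n, behavior, old_head, new_head, free_entries);
}

static __rte_always_inline unsigned int
__rte_ring_move_cons_head(struct rte_ring *r, unsigned int is_sc,
		unsigned int n, enum rte_ring_queue_behavior behavior,
		uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
{
	/* consumer head moves forward, producer tail is the read-only side;
	 * capacity is not needed to compute the number of ready entries,
	 * so 0 is passed (the helper only adds it for the free-space case).
	 */
	return __rte_ring_headtail_move_head(&r->cons, &r->prod, 0,
			is_sc, n, behavior, old_head, new_head, entries);
}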
[RFC 2/6] ring: make copying functions generic
From: Konstantin Ananyev Note upfront: that change doesn't introduce any functional or performance changes. It is just a code-reordering for: - improve code modularity and re-usability - ability in future to re-use the same code to introduce new functionality There is no real need for enqueue_elems()/dequeue_elems() to get pointer to actual rte_ring structure, instead it is enough to pass a pointer to actual elements buffer inside the ring. In return, we'll get a copying functions that could be used for other queueing abstractions that do have circular ring buffer inside. Signed-off-by: Konstantin Ananyev --- lib/ring/rte_ring_elem_pvt.h | 117 --- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h index 3a83668a08..216cb6089f 100644 --- a/lib/ring/rte_ring_elem_pvt.h +++ b/lib/ring/rte_ring_elem_pvt.h @@ -17,12 +17,14 @@ #endif static __rte_always_inline void -__rte_ring_enqueue_elems_32(struct rte_ring *r, const uint32_t size, - uint32_t idx, const void *obj_table, uint32_t n) +__rte_ring_enqueue_elems_32(void *ring_table, const void *obj_table, + uint32_t size, uint32_t idx, uint32_t n) { unsigned int i; - uint32_t *ring = (uint32_t *)&r[1]; + + uint32_t *ring = ring_table; const uint32_t *obj = (const uint32_t *)obj_table; + if (likely(idx + n <= size)) { for (i = 0; i < (n & ~0x7); i += 8, idx += 8) { ring[idx] = obj[i]; @@ -60,14 +62,14 @@ __rte_ring_enqueue_elems_32(struct rte_ring *r, const uint32_t size, } static __rte_always_inline void -__rte_ring_enqueue_elems_64(struct rte_ring *r, uint32_t prod_head, - const void *obj_table, uint32_t n) +__rte_ring_enqueue_elems_64(void *ring_table, const void *obj_table, + uint32_t size, uint32_t idx, uint32_t n) { unsigned int i; - const uint32_t size = r->size; - uint32_t idx = prod_head & r->mask; - uint64_t *ring = (uint64_t *)&r[1]; + + uint64_t *ring = ring_table; const unaligned_uint64_t *obj = (const unaligned_uint64_t *)obj_table; + if (likely(idx + n <= size)) { for (i = 0; i < (n & ~0x3); i += 4, idx += 4) { ring[idx] = obj[i]; @@ -93,14 +95,14 @@ __rte_ring_enqueue_elems_64(struct rte_ring *r, uint32_t prod_head, } static __rte_always_inline void -__rte_ring_enqueue_elems_128(struct rte_ring *r, uint32_t prod_head, - const void *obj_table, uint32_t n) +__rte_ring_enqueue_elems_128(void *ring_table, const void *obj_table, + uint32_t size, uint32_t idx, uint32_t n) { unsigned int i; - const uint32_t size = r->size; - uint32_t idx = prod_head & r->mask; - rte_int128_t *ring = (rte_int128_t *)&r[1]; + + rte_int128_t *ring = ring_table; const rte_int128_t *obj = (const rte_int128_t *)obj_table; + if (likely(idx + n <= size)) { for (i = 0; i < (n & ~0x1); i += 2, idx += 2) memcpy((void *)(ring + idx), @@ -126,37 +128,47 @@ __rte_ring_enqueue_elems_128(struct rte_ring *r, uint32_t prod_head, * single and multi producer enqueue functions. */ static __rte_always_inline void -__rte_ring_enqueue_elems(struct rte_ring *r, uint32_t prod_head, - const void *obj_table, uint32_t esize, uint32_t num) +__rte_ring_do_enqueue_elems(void *ring_table, const void *obj_table, + uint32_t size, uint32_t idx, uint32_t esize, uint32_t num) { /* 8B and 16B copies implemented individually to retain * the current performance. 
*/ if (esize == 8) - __rte_ring_enqueue_elems_64(r, prod_head, obj_table, num); + __rte_ring_enqueue_elems_64(ring_table, obj_table, size, + idx, num); else if (esize == 16) - __rte_ring_enqueue_elems_128(r, prod_head, obj_table, num); + __rte_ring_enqueue_elems_128(ring_table, obj_table, size, + idx, num); else { - uint32_t idx, scale, nr_idx, nr_num, nr_size; + uint32_t scale, nr_idx, nr_num, nr_size; /* Normalize to uint32_t */ scale = esize / sizeof(uint32_t); nr_num = num * scale; - idx = prod_head & r->mask; nr_idx = idx * scale; - nr_size = r->size * scale; - __rte_ring_enqueue_elems_32(r, nr_size, nr_idx, - obj_table, nr_num); + nr_size = size * scale; + __rte_ring_enqueue_elems_32(ring_table, obj_table, nr_size, + nr_idx, nr_num); } } static __rte_always_inline void -__rte_ring_dequeue_elems_32(struct rte_ring
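A sketch of the kind of reuse this enables: once the copy helpers take a plain buffer pointer, any structure that owns a power-of-two circular element buffer can call them directly. The helper name below matches the diff; 'struct my_staged_ring' and my_stage_enqueue() are purely hypothetical.

/*
 * Sketch only: a hypothetical ring-like structure reusing the generic
 * copy helper introduced by this patch.
 */
struct my_staged_ring {
	uint32_t size;   /* number of elements, power of two */
	uint32_t mask;   /* size - 1 */
	uint32_t head;
	uint32_t esize;  /* element size in bytes, multiple of 4 */
	void *elems;     /* the actual element buffer */
};

static inline void
my_stage_enqueue(struct my_staged_ring *sr, const void *objs, uint32_t num)
{
	/* same copy routine as rte_ring, but fed with our own buffer;
	 * wrap-around is handled inside the helper.
	 */
	__rte_ring_do_enqueue_elems(sr->elems, objs, sr->size,
			sr->head & sr->mask, sr->esize, num);
	sr->head += num;
}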
[RFC 3/6] ring/soring: introduce Staged Ordered Ring
From: Konstantin Ananyev Staged-Ordered-Ring (SORING) provides a SW abstraction for 'ordered' queues with multiple processing 'stages'. It is based on conventional DPDK rte_ring, re-uses many of its concepts, and even substantial part of its code. It can be viewed as an 'extension' of rte_ring functionality. In particular, main SORING properties: - circular ring buffer with fixed size objects - producer, consumer plus multiple processing stages in the middle. - allows to split objects processing into multiple stages. - objects remain in the same ring while moving from one stage to the other, initial order is preserved, no extra copying needed. - preserves the ingress order of objects within the queue across multiple stages, i.e.: at the same stage multiple threads can process objects from the ring in any order, but for the next stage objects will always appear in the original order. - each stage (and producer/consumer) can be served by single and/or multiple threads. - number of stages, size and number of objects in the ring are configurable at ring initialization time. Data-path API provides four main operations: - enqueue/dequeue works in the same manner as for conventional rte_ring, all rte_ring synchronization types are supported. - acquire/release - for each stage there is an acquire (start) and release (finish) operation. after some objects are 'acquired' - given thread can safely assume that it has exclusive possession of these objects till 'release' for them is invoked. Note that right now user has to release exactly the same number of objects that was acquired before. After 'release', objects can be 'acquired' by next stage and/or dequeued by the consumer (in case of last stage). Expected use-case: applications that uses pipeline model (probably with multiple stages) for packet processing, when preserving incoming packet order is important. I.E.: IPsec processing, etc. Signed-off-by: Konstantin Ananyev --- lib/ring/meson.build | 4 +- lib/ring/rte_soring.c | 144 ++ lib/ring/rte_soring.h | 270 ++ lib/ring/soring.c | 431 ++ lib/ring/soring.h | 124 lib/ring/version.map | 13 ++ 6 files changed, 984 insertions(+), 2 deletions(-) create mode 100644 lib/ring/rte_soring.c create mode 100644 lib/ring/rte_soring.h create mode 100644 lib/ring/soring.c create mode 100644 lib/ring/soring.h diff --git a/lib/ring/meson.build b/lib/ring/meson.build index 7fca958ed7..21f2c12989 100644 --- a/lib/ring/meson.build +++ b/lib/ring/meson.build @@ -1,8 +1,8 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -sources = files('rte_ring.c') -headers = files('rte_ring.h') +sources = files('rte_ring.c', 'rte_soring.c', 'soring.c') +headers = files('rte_ring.h', 'rte_soring.h') # most sub-headers are not for direct inclusion indirect_headers += files ( 'rte_ring_core.h', diff --git a/lib/ring/rte_soring.c b/lib/ring/rte_soring.c new file mode 100644 index 00..17b1b73a42 --- /dev/null +++ b/lib/ring/rte_soring.c @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2024 Huawei Technologies Co., Ltd + */ + +#include "soring.h" +#include + +RTE_LOG_REGISTER_DEFAULT(soring_logtype, INFO); +#define RTE_LOGTYPE_SORING soring_logtype +#define SORING_LOG(level, ...) 
\ + RTE_LOG_LINE(level, SORING, "" __VA_ARGS__) + +static uint32_t +soring_calc_elem_num(uint32_t count) +{ + return rte_align32pow2(count + 1); +} + +static int +soring_check_param(uint32_t esize, uint32_t stsize, uint32_t count, + uint32_t stages) +{ + if (stages == 0) { + SORING_LOG(ERR, "invalid number of stages: %u", stages); + return -EINVAL; + } + + /* Check if element size is a multiple of 4B */ + if (esize == 0 || esize % 4 != 0) { + SORING_LOG(ERR, "invalid element size: %u", esize); + return -EINVAL; + } + + /* Check if ret-code size is a multiple of 4B */ + if (stsize % 4 != 0) { + SORING_LOG(ERR, "invalid retcode size: %u", stsize); + return -EINVAL; + } + +/* count must be a power of 2 */ + if (rte_is_power_of_2(count) == 0 || + (count > RTE_SORING_ELEM_MAX + 1)) { + SORING_LOG(ERR, "invalid number of elements: %u", count); + return -EINVAL; + } + + return 0; +} + +/* + * Calculate size offsets for SORING internal data layout. + */ +static size_t +soring_get_szofs(uint32_t esize, uint32_t stsize, uint32_t count, + uint32_t stages, size_t *elst_ofs, size_t *state_ofs, + size_t *st
[RFC 4/6] app/test: add unit tests for soring API
From: Konstantin Ananyev Add both functional and stess test-cases for soring API. Stress test serves as both functional and performance test of soring enqueue/dequeue/acquire/release operations under high contention (for both over committed and non-over committed scenarios). Signed-off-by: Eimear Morrissey Signed-off-by: Konstantin Ananyev --- app/test/meson.build | 3 + app/test/test_soring.c | 452 app/test/test_soring_mt_stress.c | 45 ++ app/test/test_soring_stress.c | 48 ++ app/test/test_soring_stress.h | 35 ++ app/test/test_soring_stress_impl.h | 832 + 6 files changed, 1415 insertions(+) create mode 100644 app/test/test_soring.c create mode 100644 app/test/test_soring_mt_stress.c create mode 100644 app/test/test_soring_stress.c create mode 100644 app/test/test_soring_stress.h create mode 100644 app/test/test_soring_stress_impl.h diff --git a/app/test/meson.build b/app/test/meson.build index e29258e6ec..c290162e43 100644 --- a/app/test/meson.build +++ b/app/test/meson.build @@ -175,6 +175,9 @@ source_file_deps = { 'test_security_proto.c' : ['cryptodev', 'security'], 'test_seqlock.c': [], 'test_service_cores.c': [], +'test_soring.c': [], +'test_soring_mt_stress.c': [], +'test_soring_stress.c': [], 'test_spinlock.c': [], 'test_stack.c': ['stack'], 'test_stack_perf.c': ['stack'], diff --git a/app/test/test_soring.c b/app/test/test_soring.c new file mode 100644 index 00..381979bc6f --- /dev/null +++ b/app/test/test_soring.c @@ -0,0 +1,452 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2024 Huawei Technologies Co., Ltd + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "test.h" + +#define MAX_ACQUIRED 20 + +#define SORING_TEST_ASSERT(val, expected) do { \ + RTE_TEST_ASSERT(expected == val, \ + "%s: expected %u got %u\n", #val, expected, val); \ +} while (0) + +static void +set_soring_init_param(struct rte_soring_param *prm, + const char *name, uint32_t esize, uint32_t elems, + uint32_t stages, uint32_t stsize, + enum rte_ring_sync_type rst_prod, + enum rte_ring_sync_type rst_cons) +{ + prm->name = name; + prm->esize = esize; + prm->elems = elems; + prm->stages = stages; + prm->stsize = stsize; + prm->prod_synt = rst_prod; + prm->cons_synt = rst_cons; +} + +static int +move_forward_stage(struct rte_soring *sor, + uint32_t num_packets, uint32_t stage) +{ + uint32_t acquired; + uint32_t ftoken; + uint32_t *acquired_objs[MAX_ACQUIRED]; + + acquired = rte_soring_acquire(sor, acquired_objs, NULL, stage, + num_packets, RTE_RING_QUEUE_FIXED, &ftoken, NULL); + SORING_TEST_ASSERT(acquired, num_packets); + rte_soring_release(sor, NULL, NULL, stage, num_packets, + ftoken); + + return 0; +} + +/* + * struct rte_soring_param param checking. 
+ */ +static int +test_soring_init(void) +{ + struct rte_soring *sor = NULL; + struct rte_soring_param prm; + int rc; + size_t sz; + memset(&prm, 0, sizeof(prm)); + +/*init memory*/ + set_soring_init_param(&prm, "alloc_memory", sizeof(uintptr_t), + 4, 1, 4, RTE_RING_SYNC_MT, RTE_RING_SYNC_MT); + sz = rte_soring_get_memsize(&prm); + sor = rte_zmalloc(NULL, sz, RTE_CACHE_LINE_SIZE); + RTE_TEST_ASSERT_NOT_NULL(sor, "could not allocate memory for soring"); + + set_soring_init_param(&prm, "test_invalid_stages", sizeof(uintptr_t), + 4, 0, 4, RTE_RING_SYNC_MT, RTE_RING_SYNC_MT); + rc = rte_soring_init(sor, &prm); + RTE_TEST_ASSERT_FAIL(rc, "initted soring with invalid num stages"); + + set_soring_init_param(&prm, "test_invalid_esize", 0, + 4, 1, 4, RTE_RING_SYNC_MT, RTE_RING_SYNC_MT); + rc = rte_soring_init(sor, &prm); + RTE_TEST_ASSERT_FAIL(rc, "initted soring with 0 esize"); + + set_soring_init_param(&prm, "test_invalid_esize", 9, + 4, 1, 4, RTE_RING_SYNC_MT, RTE_RING_SYNC_MT); + rc = rte_soring_init(sor, &prm); + RTE_TEST_ASSERT_FAIL(rc, "initted soring with esize not multiple of 4"); + + set_soring_init_param(&prm, "test_invalid_rsize", sizeof(uintptr_t), +
[RFC 5/6] examples/l3fwd: make ACL work in pipeline and eventdev modes
From: Konstantin Ananyev

Note upfront: This is a huge commit that is combined from several ones. For now, I submit it just for reference and demonstration purposes and will probably remove it in future versions. If we decide to go ahead with it, then it needs to be reworked and split into several proper commits.

It adds for l3fwd:
- eventdev mode for ACL lookup-mode
- a worker-pool mode (right now implemented for ACL lookup-mode only).

Worker-Pool mode is a simple pipeline model, with the following stages:
1) I/O thread receives packets from NIC RX HW queues and enqueues them into the work queue.
2) Worker thread reads packets from the work queue(s), processes them and then puts the processed packets back into the work queue along with the processing status (routing info/error code).
3) I/O thread dequeues packets and their status from the work queue and, based on it, either TXes or drops each packet.
Very similar to the l3fwd-eventdev working model. (A rough sketch of both loops is shown after the patch below.)

Note that there can be several I/O threads, each of which can serve one or multiple HW RX queues. Also there can be several Worker threads, each of which can process packets from multiple work queues in round-robin fashion.

A work queue can be one of the following types:
- wqorder: allows Worker threads to process packets in any order, but guarantees that at the dequeue stage the ingress order of packets will be preserved. I.e. at stage #3, the I/O thread will get packets in exactly the same order as they were enqueued at stage #1.
- wqunorder: doesn't provide any ordering guarantees.
'wqunorder' mode is implemented using 2 rte_ring structures per queue.
'wqorder' mode is implemented using an rte_soring structure per queue.

To facilitate this new functionality, the command line parameters were extended:
--mode: Possible values: one of poll/eventdev/wqorder/wqorderS/wqunorder/wqunorderS. Default value: poll
  - wqorder: Worker-Pool ordered mode with a separate work queue for each HW RX queue.
  - wqorderS: Worker-Pool ordered mode with one work queue per I/O thread.
  - wqunorder: Worker-Pool un-ordered mode with a separate work queue for each HW RX queue.
  - wqunorderS: Worker-Pool un-ordered mode with one work queue per I/O thread.
--wqsize: number of elements for each worker queue.
--lookup-iter: forces the ACL lookup to be performed several times over the same packet. This is an artificial parameter and is added temporarily for benchmarking purposes. It will be removed in later versions (if any).

Note that in Worker-Pool mode all free lcores that were not assigned as I/O threads will be used as Worker threads.
As an example:
dpdk-l3fwd --lcores=53,55,57,59,61 ... -- \
 -P -p f --config '(0,0,53)(1,0,53)(2,0,53)(3,0,53)' --lookup acl \
 --parse-ptype --mode=wqorder ...
In that case lcore 53 will be used as the I/O thread (stages #1,3) to serve 4 HW RX queues, while lcores 55,57,59,61 will serve as Worker threads (stage #2).
Signed-off-by: Konstantin Ananyev --- examples/l3fwd/l3fwd.h | 55 +++ examples/l3fwd/l3fwd_acl.c | 125 +++--- examples/l3fwd/l3fwd_acl_event.h | 258 + examples/l3fwd/l3fwd_event.c | 14 ++ examples/l3fwd/l3fwd_event.h | 1 + examples/l3fwd/l3fwd_sse.h | 49 +- examples/l3fwd/l3fwd_wqp.c | 274 +++ examples/l3fwd/l3fwd_wqp.h | 132 +++ examples/l3fwd/main.c| 75 - examples/l3fwd/meson.build | 1 + 10 files changed, 956 insertions(+), 28 deletions(-) create mode 100644 examples/l3fwd/l3fwd_acl_event.h create mode 100644 examples/l3fwd/l3fwd_wqp.c create mode 100644 examples/l3fwd/l3fwd_wqp.h diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index 93ce652d02..218f363764 100644 --- a/examples/l3fwd/l3fwd.h +++ b/examples/l3fwd/l3fwd.h @@ -77,6 +77,42 @@ struct __rte_cache_aligned lcore_rx_queue { uint16_t queue_id; }; +enum L3FWD_WORKER_MODE { + L3FWD_WORKER_POLL, + L3FWD_WORKER_UNQUE, + L3FWD_WORKER_ORQUE, +}; + +struct l3fwd_wqp_param { + enum L3FWD_WORKER_MODE mode; + uint32_t qsize;/**< Number of elems in worker queue */ + int32_t single;/**< use single queue per I/O (poll) thread */ +}; + +extern struct l3fwd_wqp_param l3fwd_wqp_param; + +enum { + LCORE_WQ_IN, + LCORE_WQ_OUT, + LCORE_WQ_NUM, +}; + +union lcore_wq { + struct rte_ring *r[LCORE_WQ_NUM]; + struct { + struct rte_soring *sor; + /* used by WQ, sort of thred-local var */ + uint32_t ftoken; + }; +}; + +struct lcore_wq_pool { + uint32_t nb_queue; + uint32_t qmask; + union lcore_wq queue[MAX_RX_QUEUE_PER_LCORE]; + struct l3fwd_wqp_param prm; +}; + struct __rte_cache_aligned lcore_conf { uint16_t n_rx_queue; struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; @@ -
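As referenced in the description above, here is a minimal sketch of the stage #1/#3 I/O loop and the stage #2 worker loop in the simplest un-ordered flavour (two rte_rings per work queue). All names (io_lcore_loop(), worker_lcore_loop(), wq_in/wq_out, BURST) are illustrative and do not come from the patch; the real code uses its own lcore_wq_pool structures and also has an rte_soring-based ordered mode.

/*
 * Sketch only: Worker-Pool model with plain rte_rings as work queues.
 */
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_ring.h>

#define BURST 32

/* stage #1 + stage #3: I/O lcore */
static void
io_lcore_loop(uint16_t port, uint16_t rxq, uint16_t txq,
		struct rte_ring *wq_in, struct rte_ring *wq_out)
{
	struct rte_mbuf *pkts[BURST];
	unsigned int n;
	uint16_t sent;

	for (;;) {
		/* stage #1: RX and hand packets to the workers */
		n = rte_eth_rx_burst(port, rxq, pkts, BURST);
		if (n != 0)
			rte_ring_enqueue_burst(wq_in, (void **)pkts, n, NULL);

		/* stage #3: collect processed packets and TX them */
		n = rte_ring_dequeue_burst(wq_out, (void **)pkts, BURST, NULL);
		if (n != 0) {
			sent = rte_eth_tx_burst(port, txq, pkts, n);
			/* free whatever the TX queue did not accept */
			while (sent < n)
				rte_pktmbuf_free(pkts[sent++]);
		}
	}
}

/* stage #2: worker lcore */
static void
worker_lcore_loop(struct rte_ring *wq_in, struct rte_ring *wq_out)
{
	struct rte_mbuf *pkts[BURST];
	unsigned int i, n;

	for (;;) {
		n = rte_ring_dequeue_burst(wq_in, (void **)pkts, BURST, NULL);
		for (i = 0; i != n; i++) {
			/* ACL/route lookup would go here; the result
			 * (next hop or drop) is stored with the packet.
			 */
		}
		if (n != 0)
			rte_ring_enqueue_burst(wq_out, (void **)pkts, n, NULL);
	}
}

With plain rte_rings the dequeue order at stage #3 depends on how workers race on the queues, which is exactly why the ordered 'wqorder' flavour uses rte_soring instead.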
[RFC 6/6] ring: minimize reads of the counterpart cache-line
From: Konstantin Ananyev Note upfront: this change shouldn't affect rte_ring public API. Though as layout of public structures have changed - it is an ABI breakage. This is an attempt to implement rte_ring optimization that was suggested by Morten and discussed on this mailing list a while ago. The idea is to optimize MP/SP & MC/SC ring enqueue/dequeue ops by storing along with the head its Cached Foreign Tail (CFT) value. I.E.: for producer we cache consumer tail value and visa-versa. To avoid races head and CFT values are read/written using atomic 64-bit ops. In theory that might help by reducing number of times producer needs to access consumer's cache-line and visa-versa. In practice, I didn't see any impressive boost so far: - ring_per_autotest micro-bench - results are a mixed bag, Some are a bit better, some are worse. - [so]ring_stress_autotest micro-benchs: ~10-15% improvement - l3fwd in wqorder/wqundorder (see previous patch for details): no real difference. Though so far my testing scope was quite limited, I tried it only on x86 machines. So can I ask all interested parties: different platform vendors (ARM, PPC, etc.) and people who do use rte_ring extensively to give it a try and come up with the feedback. If there would be no real performance improvements on any platform we support, or some problems will be encountered - I am ok to drop that patch. Signed-off-by: Konstantin Ananyev --- drivers/net/mlx5/mlx5_hws_cnt.h | 5 ++-- drivers/net/ring/rte_eth_ring.c | 2 +- lib/ring/rte_ring.c | 6 ++-- lib/ring/rte_ring_core.h | 12 +++- lib/ring/rte_ring_generic_pvt.h | 46 +-- lib/ring/rte_ring_peek_elem_pvt.h | 4 +-- lib/ring/soring.c | 31 +++-- lib/ring/soring.h | 4 +-- 8 files changed, 77 insertions(+), 33 deletions(-) diff --git a/drivers/net/mlx5/mlx5_hws_cnt.h b/drivers/net/mlx5/mlx5_hws_cnt.h index 996ac8dd9a..663146563c 100644 --- a/drivers/net/mlx5/mlx5_hws_cnt.h +++ b/drivers/net/mlx5/mlx5_hws_cnt.h @@ -388,11 +388,12 @@ __mlx5_hws_cnt_pool_enqueue_revert(struct rte_ring *r, unsigned int n, MLX5_ASSERT(r->prod.sync_type == RTE_RING_SYNC_ST); MLX5_ASSERT(r->cons.sync_type == RTE_RING_SYNC_ST); - current_head = rte_atomic_load_explicit(&r->prod.head, rte_memory_order_relaxed); + current_head = rte_atomic_load_explicit(&r->prod.head.val.pos, + rte_memory_order_relaxed); MLX5_ASSERT(n <= r->capacity); MLX5_ASSERT(n <= rte_ring_count(r)); revert2head = current_head - n; - r->prod.head = revert2head; /* This ring should be SP. */ + r->prod.head.val.pos = revert2head; /* This ring should be SP. 
*/ __rte_ring_get_elem_addr(r, revert2head, sizeof(cnt_id_t), n, &zcd->ptr1, &zcd->n1, &zcd->ptr2); /* Update tail */ diff --git a/drivers/net/ring/rte_eth_ring.c b/drivers/net/ring/rte_eth_ring.c index 1346a0dba3..31009e90d2 100644 --- a/drivers/net/ring/rte_eth_ring.c +++ b/drivers/net/ring/rte_eth_ring.c @@ -325,7 +325,7 @@ eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) */ pmc->addr = &rng->prod.head; pmc->size = sizeof(rng->prod.head); - pmc->opaque[0] = rng->prod.head; + pmc->opaque[0] = rng->prod.head.val.pos; pmc->fn = ring_monitor_callback; return 0; } diff --git a/lib/ring/rte_ring.c b/lib/ring/rte_ring.c index aebb6d6728..cb2c39c7ad 100644 --- a/lib/ring/rte_ring.c +++ b/lib/ring/rte_ring.c @@ -102,7 +102,7 @@ reset_headtail(void *p) switch (ht->sync_type) { case RTE_RING_SYNC_MT: case RTE_RING_SYNC_ST: - ht->head = 0; + ht->head.raw = 0; ht->tail = 0; break; case RTE_RING_SYNC_MT_RTS: @@ -373,9 +373,9 @@ rte_ring_dump(FILE *f, const struct rte_ring *r) fprintf(f, " size=%"PRIu32"\n", r->size); fprintf(f, " capacity=%"PRIu32"\n", r->capacity); fprintf(f, " ct=%"PRIu32"\n", r->cons.tail); - fprintf(f, " ch=%"PRIu32"\n", r->cons.head); + fprintf(f, " ch=%"PRIu32"\n", r->cons.head.val.pos); fprintf(f, " pt=%"PRIu32"\n", r->prod.tail); - fprintf(f, " ph=%"PRIu32"\n", r->prod.head); + fprintf(f, " ph=%"PRIu32"\n", r->prod.head.val.pos); fprintf(f, " used=%u\n", rte_ring_count(r)); fprintf(f, " avail=%u\n", rte_ring_free_count(r)); } diff --git a/lib/ring/rte_ring_core.h b/lib/ring/rte_ring_core.h index 270869d214..b88a1bc352 100644 --- a/lib/ring/rte_ring_core.h +++
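To illustrate the layout change behind the '.val.pos'/'.raw' accesses visible in the diff, here is a sketch of packing the head position together with a Cached Foreign Tail into one 64-bit atomic word. Only 'val.pos' and 'raw' are visible in the patch; the 'cft' field name and the helper below are assumptions, and the multi-producer CAS path is deliberately ignored.

/*
 * Sketch only: head + Cached Foreign Tail (CFT) packed into one 64-bit
 * atomically accessed word, so the producer rarely has to touch the
 * consumer's cache-line (and vice versa).
 */
#include <stdint.h>
#include <rte_stdatomic.h>

union ring_head_cft {
	RTE_ATOMIC(uint64_t) raw;
	struct {
		uint32_t pos;	/* own head position */
		uint32_t cft;	/* cached copy of the counterpart's tail */
	} val;
};

static inline uint32_t
head_free_entries(union ring_head_cft *h, RTE_ATOMIC(uint32_t) *cons_tail,
		uint32_t capacity)
{
	union ring_head_cft old;
	uint32_t free;

	old.raw = rte_atomic_load_explicit(&h->raw, rte_memory_order_relaxed);

	/* first try with the cached consumer tail: stale values can only
	 * under-estimate the free space, so this is always safe.
	 */
	free = capacity + old.val.cft - old.val.pos;
	if (free != 0)
		return free;

	/* cache exhausted: read the real counterpart tail once and refresh
	 * the cached copy (single-producer path shown for simplicity).
	 */
	old.val.cft = rte_atomic_load_explicit(cons_tail,
			rte_memory_order_acquire);
	rte_atomic_store_explicit(&h->raw, old.raw, rte_memory_order_relaxed);
	return capacity + old.val.cft - old.val.pos;
}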
RE: [PATCH v2 0/2] examples/l3fwd fixes for ACL mode
Sorry, that's a dup, sent by mistake this time. Please disregard. Konstantin > -Original Message- > From: Konstantin Ananyev > Sent: Thursday, August 15, 2024 9:53 AM > To: dev@dpdk.org > Cc: honnappa.nagaraha...@arm.com; jer...@marvell.com; hemant.agra...@nxp.com; > bruce.richard...@intel.com; > d...@linux.vnet.ibm.com; ruifeng.w...@arm.com; m...@smartsharesystems.com; > Konstantin Ananyev > > Subject: [PATCH v2 0/2] examples/l3fwd fixes for ACL mode > > From: Konstantin Ananyev > > As Song Jiale pointed outprevious fix is not enough to fix > the problem he is observing with l3fwd in ACl mode: > https://bugs.dpdk.org/show_bug.cgi?id=1502 > This is a second attempt to fix it. > > Konstantin Ananyev (2): > examples/l3fwd: fix read beyond array bondaries > examples/l3fwd: fix read beyond array boundaries in ACL mode > > examples/l3fwd/l3fwd_acl.c | 37 > examples/l3fwd/l3fwd_altivec.h | 6 - > examples/l3fwd/l3fwd_common.h| 7 ++ > examples/l3fwd/l3fwd_em_hlm.h| 2 +- > examples/l3fwd/l3fwd_em_sequential.h | 2 +- > examples/l3fwd/l3fwd_fib.c | 2 +- > examples/l3fwd/l3fwd_lpm_altivec.h | 2 +- > examples/l3fwd/l3fwd_lpm_neon.h | 2 +- > examples/l3fwd/l3fwd_lpm_sse.h | 2 +- > examples/l3fwd/l3fwd_neon.h | 6 - > examples/l3fwd/l3fwd_sse.h | 6 - > 11 files changed, 55 insertions(+), 19 deletions(-) > > -- > 2.35.3
RE: [RFC 3/6] ring/soring: introduce Staged Ordered Ring
> > From: Konstantin Ananyev > > > > Staged-Ordered-Ring (SORING) provides a SW abstraction for 'ordered' queues > > with multiple processing 'stages'. > > It is based on conventional DPDK rte_ring, re-uses many of its concepts, > > and even substantial part of its code. > > It can be viewed as an 'extension' of rte_ring functionality. > > In particular, main SORING properties: > > - circular ring buffer with fixed size objects > > - producer, consumer plus multiple processing stages in the middle. > > - allows to split objects processing into multiple stages. > > - objects remain in the same ring while moving from one stage to the other, > > initial order is preserved, no extra copying needed. > > - preserves the ingress order of objects within the queue across multiple > > stages, i.e.: > > at the same stage multiple threads can process objects from the ring in > > any order, but for the next stage objects will always appear in the > > original order. > > - each stage (and producer/consumer) can be served by single and/or > > multiple threads. > > - number of stages, size and number of objects in the ring are > > configurable at ring initialization time. > > > > Data-path API provides four main operations: > > - enqueue/dequeue works in the same manner as for conventional rte_ring, > > all rte_ring synchronization types are supported. > > - acquire/release - for each stage there is an acquire (start) and > > release (finish) operation. > > after some objects are 'acquired' - given thread can safely assume that > > it has exclusive possession of these objects till 'release' for them is > > invoked. > > Note that right now user has to release exactly the same number of > > objects that was acquired before. > > After 'release', objects can be 'acquired' by next stage and/or dequeued > > by the consumer (in case of last stage). > > > > Expected use-case: applications that uses pipeline model > > (probably with multiple stages) for packet processing, when preserving > > incoming packet order is important. I.E.: IPsec processing, etc. > > > > Signed-off-by: Konstantin Ananyev > > --- > > The existing RING library is for a ring of objects. > > It is very confusing that the new SORING library is for a ring of object > pairs (obj, objst). > > The new SORING library should be for a ring of objects, like the existing > RING library. Please get rid of all the objst stuff. > > This might also improve performance when not using the optional secondary > object. > > > With that in place, you can extend the SORING library with additional APIs > for object pairs. > > I suggest calling the secondary object "metadata" instead of "status" or > "state" or "ret-value". > I agree that data passed as {obj[num], meta[num]} is more efficient than > {obj, meta}[num] in some use cases, which is why your API > uses two vector pointers instead of one. I suppose what you suggest is to have 2 set of functions: one that takes both objs[] and meta[] and second that takes just objs[]? If so, yes I can do that - in fact I was thinking about same thing. BTW, right now meta[] is an optional one anyway. Also will probably get rid of explicit 'behavior' and will have '_burst_' and '_bulk_' versions instead, same as rte_ring. 
> > Furthermore, you should consider semi-zero-copy APIs for the > "acquire"/"release" functions: > > The "acquire" function can use a concept similar to rte_pktmbuf_read(), where > a vector is provided for copying (if the ring wraps), and > the return value either points directly to the objects in the ring > (zero-copy), or to the vector where the objects were copied to. You mean to introduce analog of rte_ring '_zc_' functions? Yes, I considered that, but decided to leave it for the future. First, because we do need a generic and simple function with copying things anyway. Second I am not so convinced that this _zc_ will give much performance gain, while it definitely makes API not that straightforward. > And the "release" function does not need to copy the object vector back if > the "acquire" function returned a zero-copy pointer. For "release" you don't need to *always* copy objs[] and meta[]. It is optional and is left for the user to decide based on the use-case. If he doesn't need to update objs[] or meta[] he can just pass a NULL ptr here.
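For clarity, the API split discussed above (objects only vs. objects plus metadata) could look roughly like this. These are purely hypothetical prototypes to illustrate the idea; neither name is taken from the RFC, and the final API may well differ.

/*
 * Hypothetical prototypes only: an acquire variant that returns just the
 * objects, and a '_meta' variant that additionally returns per-object
 * metadata.
 */
#include <stdint.h>

struct rte_soring;

/* objects only */
uint32_t
rte_soring_acquire_burst(struct rte_soring *r, void **objs,
		uint32_t stage, uint32_t num, uint32_t *ftoken, uint32_t *avail);

/* objects plus optional per-object metadata */
uint32_t
rte_soring_acquire_meta_burst(struct rte_soring *r, void **objs, void *meta,
		uint32_t stage, uint32_t num, uint32_t *ftoken, uint32_t *avail);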
RE: [PATCH v2 1/3] app/testpmd: add register keyword
> > > > > >> diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h > > >> index 223f87a539..29088843b7 100644 > > >> --- a/app/test-pmd/macswap_sse.h > > >> +++ b/app/test-pmd/macswap_sse.h > > >> @@ -16,13 +16,13 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb, > > >>uint64_t ol_flags; > > >>int i; > > >>int r; > > >> - __m128i addr0, addr1, addr2, addr3; > > >> + register __m128i addr0, addr1, addr2, addr3; > > > Some compilers treat register as a no-op. Are you sure? Did you check > > > with godbolt. > > > > Thank you Stephen, I have tested the code changes on Linux using GCC and > > Clang compiler. > > > > In both cases in Linux environment, we have seen the the values loaded > > onto register `xmm`. > > > > ``` > > registerconst__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12, 5, 4, 3, 2, > > 1, 0, 11, 10, 9, 8, 7, 6); > > vmovdqaxmm0, xmmwordptr[rip+ .LCPI0_0] Yep, that what I would probably expect: one time load before the loop starts, right? Curious what exactly it would generate then if 'register' keyword is missed? BTW, on my box, gcc-11 with '-O3 -msse4.2 ...' I am seeing expected behavior without 'register' keyword. Is it some particular compiler version that misbehaves? > > > > ``` > > > > Both cases we have performance improvement. > > > > > > Can you please help us understand if we have missed out something? > > Ok, not sure why compiler would not decide to already use a register here?
RE: [RFC 3/6] ring/soring: introduce Staged Ordered Ring
> > Staged-Ordered-Ring (SORING) provides a SW abstraction for 'ordered' queues > > with multiple processing 'stages'. > > It is based on conventional DPDK rte_ring, re-uses many of its concepts, > > and even substantial part of its code. > > It can be viewed as an 'extension' of rte_ring functionality. > > In particular, main SORING properties: > > - circular ring buffer with fixed size objects > > - producer, consumer plus multiple processing stages in the middle. > > - allows to split objects processing into multiple stages. > > - objects remain in the same ring while moving from one stage to the other, > >initial order is preserved, no extra copying needed. > > - preserves the ingress order of objects within the queue across multiple > >stages, i.e.: > >at the same stage multiple threads can process objects from the ring in > >any order, but for the next stage objects will always appear in the > >original order. > > - each stage (and producer/consumer) can be served by single and/or > >multiple threads. > > - number of stages, size and number of objects in the ring are > >configurable at ring initialization time. > > > > Data-path API provides four main operations: > > - enqueue/dequeue works in the same manner as for conventional rte_ring, > >all rte_ring synchronization types are supported. > > - acquire/release - for each stage there is an acquire (start) and > >release (finish) operation. > >after some objects are 'acquired' - given thread can safely assume that > >it has exclusive possession of these objects till 'release' for them is > >invoked. > >Note that right now user has to release exactly the same number of > >objects that was acquired before. > >After 'release', objects can be 'acquired' by next stage and/or dequeued > >by the consumer (in case of last stage). > > > > Expected use-case: applications that uses pipeline model > > (probably with multiple stages) for packet processing, when preserving > > incoming packet order is important. I.E.: IPsec processing, etc. > > > > How does SORING related to Eventdev? So far there is no direct relation. Though yes, DPDK eventdev framework also provides ‘ordered’ queue ability (along with other modes). Again, as I mentioned in the cover-letter rte_soring uses similar concept as OPDL eventdev implementation. One of the main aims with rte_soring was to introduce sort of extension to rte_ring, while keeping its API and implementation as lightweight and generic as possible. So it could be consumed by various apps that do use pipeline model, but for whatever reason do not use (/plan to use) eventdev framework. > Would it be feasible to reshape this into a SW event device? I guess an opposite approach might work better - i.e. make some SW-based eventdev Implementation to use rte_soring internally. Though I didn't try to do that so far.
Re: [PATCH v1 0/5] Direct re-arming of buffers on receive side
[konstantin.v.anan...@yandex.ru appears similar to someone who previously sent you email, but may not be that person. Learn why this could be a risk at https://aka.ms/LearnAboutSenderIdentification.] 16/05/2022 07:10, Feifei Wang пишет: Currently, the transmit side frees the buffers into the lcore cache and the receive side allocates buffers from the lcore cache. The transmit side typically frees 32 buffers resulting in 32*8=256B of stores to lcore cache. The receive side allocates 32 buffers and stores them in the receive side software ring, resulting in 32*8=256B of stores and 256B of load from the lcore cache. This patch proposes a mechanism to avoid freeing to/allocating from the lcore cache. i.e. the receive side will free the buffers from transmit side directly into it's software ring. This will avoid the 256B of loads and stores introduced by the lcore cache. It also frees up the cache lines used by the lcore cache. However, this solution poses several constraints: 1)The receive queue needs to know which transmit queue it should take the buffers from. The application logic decides which transmit port to use to send out the packets. In many use cases the NIC might have a single port ([1], [2], [3]), in which case a given transmit queue is always mapped to a single receive queue (1:1 Rx queue: Tx queue). This is easy to configure. If the NIC has 2 ports (there are several references), then we will have 1:2 (RX queue: TX queue) mapping which is still easy to configure. However, if this is generalized to 'N' ports, the configuration can be long. More over the PMD would have to scan a list of transmit queues to pull the buffers from. Just to re-iterate some generic concerns about this proposal: - We effectively link RX and TX queues - when this feature is enabled, user can't stop TX queue without stopping linked RX queue first. Right now user is free to start/stop any queues at his will. If that feature will allow to link queues from different ports, then even ports will become dependent and user will have to pay extra care when managing such ports. [Feifei] When direct rearm enabled, there are two path for thread to choose. If there are enough Tx freed buffers, Rx can put buffers from Tx. Otherwise, Rx will put buffers from mempool as usual. Thus, users do not need to pay much attention managing ports. What I am talking about: right now different port or different queues of the same port can be treated as independent entities: in general user is free to start/stop (and even reconfigure in some cases) one entity without need to stop other entity. I.E user can stop and re-configure TX queue while keep receiving packets from RX queue. With direct re-arm enabled, I think it wouldn't be possible any more: before stopping/reconfiguring TX queue user would have make sure that corresponding RX queue wouldn't be used by datapath. I am trying to understand the problem better. For the TX queue to be stopped, the user must have blocked the data plane from accessing the TX queue. Surely it is user responsibility tnot to call tx_burst() for stopped/released queue. The problem is that while TX for that queue is stopped, RX for related queue still can continue. So rx_burst() will try to read/modify TX queue data, that might be already freed, or simultaneously modified by control path. Understood, agree on the issue Again, it all can be mitigated by carefully re-designing and modifying control and data-path inside user app - by doing extra checks and synchronizations, etc. 
But from a practical point of view - I presume most users would simply avoid using this feature due to all the potential problems it might cause. That is subjective, it all depends on the performance improvements users see in their application. IMO, the performance improvement seen with this patch is worth a few changes. Yes, it is subjective to some extent, though my feeling is that it might end up being a sort of synthetic improvement used only by some show-case benchmarks. From my perspective, it would be much more plausible if we could introduce some sort of generic improvement that doesn't impose all these extra constraints and implications. Like the one discussed below in this thread with the ZC mempool approach. Like Feifei says, the RX side has the normal packet allocation path still available. Also this sounds like a corner case to me, we can handle this through checks in the queue_stop API. Depends. If it would be allowed to link queues only from the same port, then yes, extra checks for queue-stop might be enough, as right now DPDK doesn't allow the user to change the number of queues without dev_stop() first. Though if it would be allowed to link queues from different ports, then the situation will be much worse. Right now ports are totally independent entities (except some special cases like link-bonding, etc.). As one port can keep doing RX/TX, the second one can be stopped, re-configured, even deta
Re: [PATCH] ip_frag: add IPv4 fast fragment switch and test data
Some NIC drivers support DEV_TX_OFFLOAD_MBUF_FAST_FREE offload( Device supports optimization for fast release of mbufs.When set application must guarantee that per-queue all mbufs comes from the same mempool and has refcnt = 1).In order to adapt to this offload function,we need to modify the existing fragment logic(attach mbuf, so it is fast,we can call it fast fragment mode) and add the fragment logic in the non-attach mbuf mode(slow fragment mode).Add some test data for this modification. That doesn't look like a good idea to me. Yes, drivers configured with MBUF_FAST_FREE would not work correctly with indirect mbufs. So it is application responsibility to choose what it wants: either indirect mbufs enabled or MBUF_FAST_FREE enabled. Inside the lib we shouldn't try to guess what was particular driver configuration. Plus it is the change (and regression) of existing behavior - right now it is perfectly fine to use the same mbuf for both direct and indirect mbufs. If you really have a use-case for doing fragmentation via full copying all packet data, then probably the easiest and safest way would be to introduce new function: rte_ipv4_fragment_clone_packet(...) or so. Signed-off-by: Huichao Cai --- app/test/test_ipfrag.c | 14 +++-- lib/ip_frag/rte_ipv4_fragmentation.c | 56 +--- 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/app/test/test_ipfrag.c b/app/test/test_ipfrag.c index 610a86b..f5fe4b8 100644 --- a/app/test/test_ipfrag.c +++ b/app/test/test_ipfrag.c @@ -407,12 +407,20 @@ static void ut_teardown(void) pktid); } - if (tests[i].ipv == 4) - len = rte_ipv4_fragment_packet(b, pkts_out, BURST, + if (tests[i].ipv == 4) { + if (i % 2) + len = rte_ipv4_fragment_packet(b, + pkts_out, BURST, tests[i].mtu_size, direct_pool, indirect_pool); - else if (tests[i].ipv == 6) + else + len = rte_ipv4_fragment_packet(b, + pkts_out, BURST, + tests[i].mtu_size, + direct_pool, + direct_pool); + } else if (tests[i].ipv == 6) len = rte_ipv6_fragment_packet(b, pkts_out, BURST, tests[i].mtu_size, direct_pool, diff --git a/lib/ip_frag/rte_ipv4_fragmentation.c b/lib/ip_frag/rte_ipv4_fragmentation.c index a562424..65bfad7 100644 --- a/lib/ip_frag/rte_ipv4_fragmentation.c +++ b/lib/ip_frag/rte_ipv4_fragmentation.c @@ -102,6 +102,11 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph, * MBUF pool used for allocating direct buffers for the output fragments. * @param pool_indirect * MBUF pool used for allocating indirect buffers for the output fragments. + * If pool_indirect == pool_direct,this means that the fragment will adapt + * to DEV_TX_OFFLOAD_MBUF_FAST_FREE offload. + * DEV_TX_OFFLOAD_MBUF_FAST_FREE: Device supports optimization + * for fast release of mbufs. When set application must guarantee that + * per-queue all mbufs comes from the same mempool and has refcnt = 1. * @return * Upon successful completion - number of output fragments placed * in the pkts_out array. @@ -123,6 +128,7 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph, uint16_t frag_bytes_remaining; uint8_t ipopt_frag_hdr[IPV4_HDR_MAX_LEN]; uint16_t ipopt_len; + bool is_fast_frag_mode = true; /* * Formal parameter checking. 
@@ -133,6 +139,9 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph, unlikely(mtu_size < RTE_ETHER_MIN_MTU)) return -EINVAL; + if (pool_indirect == pool_direct) + is_fast_frag_mode = false; + in_hdr = rte_pktmbuf_mtod(pkt_in, struct rte_ipv4_hdr *); header_len = (in_hdr->version_ihl & RTE_IPV4_HDR_IHL_MASK) * RTE_IPV4_IHL_MULTIPLIER; @@ -190,30 +199,45 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph, out_seg_prev = out_pkt; more_out_segs = 1; while (likely(more_out_segs && more_in_segs)) { - struct rte_mbuf *out_seg = NULL; uint32_t len; - /* Allocate indirect buffer */ - out_seg = rte_pktmbuf_alloc(pool_indirect); -
Re: [PATCH] ip_frag: add IPv4 fast fragment switch and test data
04/06/2022 03:19, Huichao Cai writes:
I've seen some applications that have to rewrite the fragment functions themselves in order to use the MBUF_FAST_FREE feature, such as iQiYi's DPVS.

I am not sure that it will really help to improve performance: if you have a lot of packets to fragment, you'll probably spend more time copying them. Maybe it will help somewhat if such packets occur very rarely. Also please keep in mind that ip_frag is not the only library that uses indirect mbufs and refcnt. As another example - the GSO implementation. So the application writer has to be extremely careful when enabling MBUF_FAST_FREE. My personal advice - just don't use it, though I am quite conservative here. Anyway, as I said before, if there is a real use-case for it - I am ok to introduce a new function that would do copying while fragmenting.
Konstantin
Re: Optimizations are not features
04/06/2022 13:51, Andrew Rybchenko пишет: On 6/4/22 15:19, Morten Brørup wrote: From: Jerin Jacob [mailto:jerinjac...@gmail.com] Sent: Saturday, 4 June 2022 13.10 On Sat, Jun 4, 2022 at 3:30 PM Andrew Rybchenko wrote: On 6/4/22 12:33, Jerin Jacob wrote: On Sat, Jun 4, 2022 at 2:39 PM Morten Brørup wrote: I would like the DPDK community to change its view on compile time options. Here is why: Application specific performance micro-optimizations like “fast mbuf free” and “mbuf direct re-arm” are being added to DPDK and presented as features. They are not features, but optimizations, and I don’t understand the need for them to be available at run-time! Instead of adding a bunch of exotic exceptions to the fast path of the PMDs, they should be compile time options. This will improve performance by avoiding branches in the fast path, both for the applications using them, and for generic applications (where the exotic code is omitted). Agree. I think, keeping the best of both worlds would be -Enable the feature/optimization as runtime -Have a compile-time option to disable the feature/optimization as an override. It is hard to find the right balance, but in general compile time options are a nightmare for maintenance. Number of required builds will grow as an exponent. Test combinations are exponential for N features, regardless if N are runtime or compile time options. But since I'm talking about build checks I don't care about exponential grows in run time. Yes, testing should care, but it is a separate story. Of course, we can limit number of checked combinations, but it will result in flow of patches to fix build in other cases. The build breakage can be fixed if we use (2) vs (1) 1) #ifdef ... My feature #endif 2) static __rte_always_inline int rte_has_xyz_feature(void) { #ifdef RTE_LIBRTE_XYZ_FEATURE return RTE_LIBRTE_XYZ_FEATURE; #else return 0; #endif } if(rte_has_xyz_feature())) { My feature code } Jerin, thanks, very good example. I'm not sure all the features can be covered by that, e.g. added fields in structures. +1 Also, I would consider such features "opt in" at compile time only. As such, they could be allowed to break the ABI/API. Also compile time options tend to make code less readable which makes all aspects of the development harder. Yes, compile time is nice for micro optimizations, but I have great concerns that it is a right way to go. Please note that I am only talking about the performance optimizations that are limited to application specific use cases. I think it makes sense to require that performance optimizing an application also requires recompiling the performance critical libraries used by it. abandon some of existing functionality to create a 'short-cut' Allowing compile time options for application specific performance optimizations in DPDK would also open a path for other optimizations, which can only be achieved at compile time, such as “no fragmented packets”, “no attached mbufs” and “single mbuf pool”. And even more exotic optimizations, such as the “indexed mempool cache”, which was rejected due to ABI violations – they could be marked as “risky and untested” or similar, but still be part of the DPDK main repository. Thanks Morten for bringing it up, it is an interesting topic. Though I look at it from different angle. 
All optimizations you mentioned above introduce new limitations: MBUF_FAST_FREE - no indirect mbufs and multiple mempools, mempool object indexes - mempool size is limited to 4GB, direct rearm - drop ability to stop/reconfigure TX queue, while RX queue is still running, etc. Note that all these limitations are not forced by HW. All of them are pure SW limitations that developers forced in (or tried to) to get few extra performance. That's concerning tendency. As more and more such 'optimization via limitation' will come in: - DPDK feature list will become more and more fragmented. - Would cause more and more confusion for the users. - Unmet expectations - difference in performance between 'default' and 'optimized' version of DPDK will become bigger and bigger. - As Andrew already mentioned, maintaining all these 'sub-flavours' of DPDK will become more and more difficult. So, probably instead of making such changes easier, we need somehow to persuade developers to think more about optimizations that would be generic and transparent to the user. I do realize that it is not always possible due to various reasons (HW limitations, external dependencies, etc.) but that's another story. Let's take for example MBUF_FAST_FREE. In fact, I am not sure that we need it as tx offload flag at all. PMD TX-path has all necessary information to decide at run-time can it do fast_free() for not: At tx_burst() PMD can check are all mbufs satisfy these conditions (same mempool, refcnt==1) and update some fields and/or counters inside TXQ to reflect it. Then, at tx_free() we ca
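Roughly, the run-time check suggested above could look like the sketch below. It is illustrative only and not taken from any existing PMD: 'struct txq' and the function names are made up, and a real driver would also have to account for multi-segment and indirect mbufs before taking the bulk path.

/*
 * Sketch only: run-time detection of the "fast free" conditions
 * (single mempool, refcnt == 1) inside a PMD TX path.
 */
#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>

struct txq {
	struct rte_mempool *fast_pool;	/* non-NULL if the burst qualifies */
};

static inline void
txq_note_burst(struct txq *q, struct rte_mbuf **pkts, uint16_t n)
{
	struct rte_mempool *mp = (n != 0) ? pkts[0]->pool : NULL;
	uint16_t i;

	/* all mbufs must come from one mempool and must not be shared */
	for (i = 0; i != n; i++) {
		if (pkts[i]->pool != mp || rte_mbuf_refcnt_read(pkts[i]) != 1) {
			mp = NULL;
			break;
		}
	}
	q->fast_pool = mp;
}

static inline void
txq_free_burst(struct txq *q, struct rte_mbuf **pkts, uint16_t n)
{
	uint16_t i;

	if (q->fast_pool != NULL) {
		/* fast path: return the whole burst in one bulk operation */
		rte_mempool_put_bulk(q->fast_pool, (void **)pkts, n);
		return;
	}
	/* slow path: per-mbuf free handles refcnt, chains, etc. */
	for (i = 0; i != n; i++)
		rte_pktmbuf_free(pkts[i]);
}

The point is that the decision is taken per burst from information the driver already has at tx_burst() time, so no user-visible offload flag (and none of its limitations) would be needed.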
Re: [PATCH v1 5/5] examples/l3fwd: enable direct rearm mode
31/05/2022 18:14, Honnappa Nagarahalli пишет: 25/05/2022 01:24, Honnappa Nagarahalli пишет: From: Konstantin Ananyev 20/04/2022 09:16, Feifei Wang пишет: Enable direct rearm mode. The mapping is decided in the data plane based on the first packet received. Suggested-by: Honnappa Nagarahalli Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang Reviewed-by: Honnappa Nagarahalli --- examples/l3fwd/l3fwd_lpm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c index bec22c44cd..38ffdf4636 100644 --- a/examples/l3fwd/l3fwd_lpm.c +++ b/examples/l3fwd/l3fwd_lpm.c @@ -147,7 +147,7 @@ lpm_main_loop(__rte_unused void *dummy) unsigned lcore_id; uint64_t prev_tsc, diff_tsc, cur_tsc; int i, nb_rx; - uint16_t portid; + uint16_t portid, tx_portid; uint8_t queueid; struct lcore_conf *qconf; const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / @@ -158,6 +158,8 @@ lpm_main_loop(__rte_unused void *dummy) const uint16_t n_rx_q = qconf->n_rx_queue; const uint16_t n_tx_p = qconf->n_tx_port; + int direct_rearm_map[n_rx_q]; + if (n_rx_q == 0) { RTE_LOG(INFO, L3FWD, "lcore %u has nothing to do\n", lcore_id); return 0; @@ -169,6 +171,7 @@ lpm_main_loop(__rte_unused void *dummy) portid = qconf->rx_queue_list[i].port_id; queueid = qconf->rx_queue_list[i].queue_id; + direct_rearm_map[i] = 0; RTE_LOG(INFO, L3FWD, " -- lcoreid=%u portid=%u rxqueueid=%hhu\n", lcore_id, portid, queueid); @@ -209,6 +212,17 @@ lpm_main_loop(__rte_unused void *dummy) if (nb_rx == 0) continue; + /* Determine the direct rearm mapping based on the +first + * packet received on the rx queue + */ + if (direct_rearm_map[i] == 0) { + tx_portid = lpm_get_dst_port(qconf, pkts_burst[0], + portid); + rte_eth_direct_rxrearm_map(portid, queueid, + tx_portid, queueid); + direct_rearm_map[i] = 1; + } + That just doesn't look right to me: why to make decision based on the first packet? The TX queue depends on the incoming packet. So, this method covers more scenarios than doing it in the control plane where the outgoing queue is not known. What would happen if second and all other packets have to be routed to different ports? This is an example application and it should be fine to make this assumption. More over, it does not cause any problems if packets change in between. When the packets change back, the feature works again. In fact, this direct-rearm mode seems suitable only for hard-coded one to one mapped forwarding (examples/l2fwd, testpmd). For l3fwd it can be used safely only when we have one port in use. Can you elaborate more on the safety issue when more than one port is used? Also I think it should be selected at init-time and it shouldn't be on by default. To summarize, my opinion: special cmd-line parameter to enable it. Can you please elaborate why a command line parameter is required? Other similar features like RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE are enabled without a command line parameter. IMO, this is how it should ber. Essentially we are trying to measure how different PMDs perform, the ones that have implemented performance improvement features would show better performance (i.e. the PMDs implementing the features should not be penalized by asking for additional user input). From my perspective, main purpose of l3fwd application is to demonstrate DPDK ability to do packet routing based on input packet contents. Making guesses about packet contents is a change in expected behavior. 
For some cases it might improve performance; for many others it will most likely cause a performance drop. I think that a performance drop as the default behavior (running with the same parameters as before) should not be allowed. Plus, you did not provide the ability to switch off that behavior if it is undesired. There is no drop in L3fwd performance due to this patch. Hmm.. Are you saying that even when your guess is wrong, and you are constantly hitting the slow path (check the tx_queue first, fail, then allocate from the mempool), you didn't observe any performance drop? There is more work to do, and if the workload is CPU-bound, my guess is that it should be noticeable. Also, from previous experience, quite often some slowdown was reported even after tiny changes in the rx/tx code path. Usually that happened on some low-end ARM CPUs (Marvell, NXP). About the comparison with RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE default enablement - I don't think it is correct. Within the l3fwd app we can safely guarantee that all RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE pre-requirements are met: in each TX
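For context, the usual capability-gated, init-time pattern for enabling MBUF_FAST_FREE in DPDK example applications looks roughly like the sketch below (the helper name is illustrative; error handling and the rest of the port setup are omitted):

#include <rte_ethdev.h>

/* Enable fast free on a port only when the device advertises the
 * capability; the application must still guarantee a single mempool and
 * refcnt == 1 mbufs on each TX queue using it. */
static void
enable_fast_free_if_supported(uint16_t portid, struct rte_eth_conf *port_conf)
{
	struct rte_eth_dev_info dev_info;

	if (rte_eth_dev_info_get(portid, &dev_info) != 0)
		return;

	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
}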
Re: [RFC 8/8] ip_frag: fix gcc-12 warnings
07/06/2022 18:17, Stephen Hemminger writes: The function rte_memcpy can dereference past the source buffer, which will cause array out-of-bounds warnings. But there is no good reason to use rte_memcpy instead of memcpy in this code. memcpy is just as fast for these small inputs, and the compiler will optimize it.

AFAIK, rte_memcpy() will outperform memcpy() when the _size_ parameter is a variable. Unfortunately, that's exactly the case here. So I am not sure it is a good change, at least not without extensive perf testing. BTW, if rte_memcpy() really accesses the src buffer beyond its boundaries, I think that's definitely a bug that needs to be fixed.

Signed-off-by: Stephen Hemminger --- lib/ip_frag/rte_ipv4_fragmentation.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/ip_frag/rte_ipv4_fragmentation.c b/lib/ip_frag/rte_ipv4_fragmentation.c index a19f6fda6408..27a8ad224dec 100644 --- a/lib/ip_frag/rte_ipv4_fragmentation.c +++ b/lib/ip_frag/rte_ipv4_fragmentation.c @@ -5,7 +5,6 @@ #include #include -#include #include #include "ip_frag_common.h" @@ -26,7 +25,7 @@ static inline void __fill_ipv4hdr_frag(struct rte_ipv4_hdr *dst, const struct rte_ipv4_hdr *src, uint16_t header_len, uint16_t len, uint16_t fofs, uint16_t dofs, uint32_t mf) { - rte_memcpy(dst, src, header_len); + memcpy(dst, src, header_len); fofs = (uint16_t)(fofs + (dofs >> RTE_IPV4_HDR_FO_SHIFT)); fofs = (uint16_t)(fofs | mf << RTE_IPV4_HDR_MF_SHIFT); dst->fragment_offset = rte_cpu_to_be_16(fofs); @@ -48,7 +47,7 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph, struct rte_ipv4_hdr *iph_opt = (struct rte_ipv4_hdr *)ipopt_frag_hdr; ipopt_len = 0; - rte_memcpy(ipopt_frag_hdr, iph, sizeof(struct rte_ipv4_hdr)); + memcpy(ipopt_frag_hdr, iph, sizeof(struct rte_ipv4_hdr)); ipopt_frag_hdr += sizeof(struct rte_ipv4_hdr); uint8_t *p_opt = iph + sizeof(struct rte_ipv4_hdr); @@ -65,7 +64,7 @@ static inline uint16_t __create_ipopt_frag_hdr(uint8_t *iph, break; if (RTE_IPV4_HDR_OPT_COPIED(*p_opt)) { - rte_memcpy(ipopt_frag_hdr + ipopt_len, + memcpy(ipopt_frag_hdr + ipopt_len, p_opt, p_opt[1]); ipopt_len += p_opt[1]; }
Re: [PATCH 3/6] eal: add basic rte thread ID equal API
09/06/2022 14:58, Tyler Retzlaff writes: Add rte_thread_equal() that tests if two rte_thread_id are equal. Signed-off-by: Narcisa Vasile Signed-off-by: Tyler Retzlaff --- lib/eal/common/rte_thread.c | 6 ++ lib/eal/include/rte_thread.h | 19 +++ lib/eal/version.map | 1 + 3 files changed, 26 insertions(+) diff --git a/lib/eal/common/rte_thread.c b/lib/eal/common/rte_thread.c index 10d6652..21ed042 100644 --- a/lib/eal/common/rte_thread.c +++ b/lib/eal/common/rte_thread.c @@ -6,6 +6,12 @@ #include int +rte_thread_equal(rte_thread_t t1, rte_thread_t t2) +{ + return t1.opaque_id == t2.opaque_id; for posix systems, why not: return pthread_equal(t1.opaque_id, t2.opaque_id); ? +} + +int rte_thread_attr_init(rte_thread_attr_t *attr) { RTE_VERIFY(attr != NULL); diff --git a/lib/eal/include/rte_thread.h b/lib/eal/include/rte_thread.h index 321fb59..32ab745 100644 --- a/lib/eal/include/rte_thread.h +++ b/lib/eal/include/rte_thread.h @@ -134,6 +134,25 @@ int rte_thread_create(rte_thread_t *thread_id, __rte_experimental rte_thread_t rte_thread_self(void); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Check if 2 thread ids are equal. + * + * @param t1 + * First thread id. + * + * @param t2 + * Second thread id. + * + * @return + * If the ids are equal, return nonzero. + * Otherwise, return 0. + */ +__rte_experimental +int rte_thread_equal(rte_thread_t t1, rte_thread_t t2); + #ifdef RTE_HAS_CPUSET /** diff --git a/lib/eal/version.map b/lib/eal/version.map index 22e5c85..4a52484 100644 --- a/lib/eal/version.map +++ b/lib/eal/version.map @@ -428,6 +428,7 @@ EXPERIMENTAL { rte_thread_attr_set_priority; rte_thread_create; rte_thread_detach; + rte_thread_equal; rte_thread_get_affinity_by_id; rte_thread_get_priority; rte_thread_join;
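Purely as a usage illustration (not part of the patch), an application could use the proposed API like this; main_thread_id is an assumed application-level variable recorded at startup:

#include <rte_thread.h>

/* Assumed application state: the thread id recorded during initialization. */
static rte_thread_t main_thread_id;

/* Return nonzero when called from the recorded main thread. */
static int
is_main_thread(void)
{
	return rte_thread_equal(rte_thread_self(), main_thread_id);
}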
Re: [RFC] eal/x86: disable array bounds checks in rte_memcpy_generic with gcc-12
08/06/2022 23:49, Stephen Hemminger writes: GCC 12 adds more array bounds checking (good); but it is not smart enough to realize that for small fixed sizes, the bigger move options are not used. An example is that using rte_memcpy() on an RSS key of 40 bytes may trigger rte_memcpy complaints from rte_mov128 reading past the end of the input. In order to keep some of the checks, add a special case for calls to rte_memcpy() with fixed-size arguments to use the compiler builtin instead. Don't want to give up all the checking for code that uses rte_memcpy() everywhere. Signed-off-by: Stephen Hemminger --- lib/eal/x86/include/rte_memcpy.h | 16 +++- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 18aa4e43a743..b90cdd8d7326 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,10 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 12) +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -842,19 +846,21 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) return ret; } +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 10) +#pragma GCC diagnostic pop +#endif + static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n) { - if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) + if (__builtin_constant_p(n)) + return __builtin_memcpy(dst, src, n); + else if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) return rte_memcpy_aligned(dst, src, n); else return rte_memcpy_generic(dst, src, n); } -#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 10) +#pragma GCC diagnostic pop -#endif - #ifdef __cplusplus } #endif Acked-by: Konstantin Ananyev
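To illustrate the constant-size case discussed above (the 40-byte RSS key), here is a minimal sketch; RSS_KEY_LEN and copy_rss_key() are illustrative names, not DPDK API:

#include <stdint.h>
#include <rte_memcpy.h>

#define RSS_KEY_LEN 40	/* typical RSS hash key size */

/* With the patch, n is a compile-time constant here, so rte_memcpy()
 * dispatches to __builtin_memcpy() and gcc-12 sees the real 40-byte
 * bounds instead of the worst-case rte_mov128() access pattern. */
static void
copy_rss_key(uint8_t dst[RSS_KEY_LEN], const uint8_t src[RSS_KEY_LEN])
{
	rte_memcpy(dst, src, RSS_KEY_LEN);
}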
Re: [PATCH 3/6] eal: add basic rte thread ID equal API
Add rte_thread_equal() that tests if two rte_thread_id are equal. Signed-off-by: Narcisa Vasile Signed-off-by: Tyler Retzlaff --- lib/eal/common/rte_thread.c | 6 ++ lib/eal/include/rte_thread.h | 19 +++ lib/eal/version.map | 1 + 3 files changed, 26 insertions(+) diff --git a/lib/eal/common/rte_thread.c b/lib/eal/common/rte_thread.c index 10d6652..21ed042 100644 --- a/lib/eal/common/rte_thread.c +++ b/lib/eal/common/rte_thread.c @@ -6,6 +6,12 @@ #include int +rte_thread_equal(rte_thread_t t1, rte_thread_t t2) +{ + return t1.opaque_id == t2.opaque_id; for posix systems, why not: return pthread_equal(t1.opaque_id, t2.opaque_id); because it would require 2 implementations. We already have plenty of such cases in the rte_thread implementation. Why did it become a problem here? When this works for both Windows and POSIX platforms (less code to maintain, no functional difference). Well, POSIX insists that the only safe way for applications to directly compare two pthread_t values is to call pthread_equal(). So I'd suggest we do what is recommended.
Re: [PATCH v2 3/6] eal: add basic rte thread ID equal API
Add rte_thread_equal() that tests if two rte_thread_id are equal. Signed-off-by: Narcisa Vasile Signed-off-by: Tyler Retzlaff Acked-by: Chengwen Feng --- lib/eal/include/rte_thread.h | 19 +++ lib/eal/unix/rte_thread.c| 6 ++ lib/eal/version.map | 1 + lib/eal/windows/rte_thread.c | 6 ++ 4 files changed, 32 insertions(+) diff --git a/lib/eal/include/rte_thread.h b/lib/eal/include/rte_thread.h index c27e580..de0486d 100644 --- a/lib/eal/include/rte_thread.h +++ b/lib/eal/include/rte_thread.h @@ -134,6 +134,25 @@ int rte_thread_create(rte_thread_t *thread_id, __rte_experimental rte_thread_t rte_thread_self(void); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Check if 2 thread ids are equal. + * + * @param t1 + * First thread id. + * + * @param t2 + * Second thread id. + * + * @return + * If the ids are equal, return nonzero. + * Otherwise, return 0. + */ +__rte_experimental +int rte_thread_equal(rte_thread_t t1, rte_thread_t t2); + #ifdef RTE_HAS_CPUSET /** diff --git a/lib/eal/unix/rte_thread.c b/lib/eal/unix/rte_thread.c index 19c7b80..0304d53 100644 --- a/lib/eal/unix/rte_thread.c +++ b/lib/eal/unix/rte_thread.c @@ -183,6 +183,12 @@ struct eal_tls_key { return pthread_detach((pthread_t)thread_id.opaque_id); } +int +rte_thread_equal(rte_thread_t t1, rte_thread_t t2) +{ + return pthread_equal((pthread_t)t1.opaque_id, (pthread_t)t2.opaque_id); +} + rte_thread_t rte_thread_self(void) { diff --git a/lib/eal/version.map b/lib/eal/version.map index 22e5c85..4a52484 100644 --- a/lib/eal/version.map +++ b/lib/eal/version.map @@ -428,6 +428,7 @@ EXPERIMENTAL { rte_thread_attr_set_priority; rte_thread_create; rte_thread_detach; + rte_thread_equal; rte_thread_get_affinity_by_id; rte_thread_get_priority; rte_thread_join; diff --git a/lib/eal/windows/rte_thread.c b/lib/eal/windows/rte_thread.c index b5f2b04..1352513 100644 --- a/lib/eal/windows/rte_thread.c +++ b/lib/eal/windows/rte_thread.c @@ -291,6 +291,12 @@ struct thread_routine_ctx { return 0; } +int +rte_thread_equal(rte_thread_t t1, rte_thread_t t2) +{ + return t1.opaque_id == t2.opaque_id; +} + rte_thread_t rte_thread_self(void) { Acked-by: Konstantin Ananyev