On Thu, Apr 27, 2023 at 04:24:10PM +0900, Akihiko Odaki wrote: > On 2023/04/27 1:09, Tomasz Dzieciol wrote: > > Format of Intel 82576 was changed in comparison to Intel 82574 extended > > descriptors. This change updates filling of advanced descriptors fields > > accordingly: > > * remove TCP ACK detection > > * add IPv6 with extensions traffic detection > > * fragment checksum and IP ID is filled only when RXCSUM.IPPCSE is set (in > > addition to RXCSUM.PCSD bit cleared condition) > > > > Refactoring is done in preparation for support of multiple advanced > > descriptors RX modes, especially packet-split modes. > > > > Signed-off-by: Tomasz Dzieciol <t.dziec...@partner.samsung.com> > > As I wrote earlier: > > Please rebase to: > > https://protect2.fireeye.com/v1/url?k=512318e9-30a80dd9-512293a6-000babffaa23-3bee4c44b5723bf0&q=1&e=355018c1-72f7-43ba-be71-fd86193bc66d&u=https%3A%2F%2Fpatchew.org%2FQEMU%2F20230420054657.50367-1-akihiko.odaki%40daynix.com%2F > > > > And add a cover letter with the following lines: > > Based-on: <20230420054657.50367-1-akihiko.od...@daynix.com> > > ("[PATCH v2 00/41] igb: Fix for DPDK") > > > > I want the series to be merged first as it includes several bug fixes. > > Also, next time CC: Sriram Yagnaraman <sriram.yagnara...@est.tech> > He is a reviewer of igb.
I rebased it to v3, just didn't added cover letter. Should I rebase it to v2 as pointed by the link you provided or to v4 as it it currently the newest? I will apply rest of comments. > > > --- > > hw/net/e1000e_core.c | 16 +- > > hw/net/e1000x_regs.h | 1 + > > hw/net/igb_core.c | 505 ++++++++++++++++++++++++--------------- > > hw/net/igb_regs.h | 21 +- > > hw/net/trace-events | 6 +- > > tests/qtest/libqos/igb.c | 5 + > > 6 files changed, 344 insertions(+), 210 deletions(-) > > > > diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c > > index 78373d7db7..d874809d93 100644 > > --- a/hw/net/e1000e_core.c > > +++ b/hw/net/e1000e_core.c > > @@ -1418,11 +1418,11 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core, > > } > > static void > > -e1000e_write_to_rx_buffers(E1000ECore *core, > > - hwaddr ba[MAX_PS_BUFFERS], > > - e1000e_ba_state *bastate, > > - const char *data, > > - dma_addr_t data_len) > > +e1000e_write_payload_to_rx_buffers(E1000ECore *core, > > + hwaddr ba[MAX_PS_BUFFERS], > > + e1000e_ba_state *bastate, > > + const char *data, > > + dma_addr_t data_len) > > { > > while (data_len > 0) { > > uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx]; > > @@ -1594,8 +1594,8 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct > > NetRxPkt *pkt, > > while (copy_size) { > > iov_copy = MIN(copy_size, iov->iov_len - iov_ofs); > > - e1000e_write_to_rx_buffers(core, ba, &bastate, > > - iov->iov_base + iov_ofs, > > iov_copy); > > + e1000e_write_payload_to_rx_buffers(core, ba, &bastate, > > + iov->iov_base + > > iov_ofs, iov_copy); > > copy_size -= iov_copy; > > iov_ofs += iov_copy; > > @@ -1607,7 +1607,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct > > NetRxPkt *pkt, > > if (desc_offset + desc_size >= total_size) { > > /* Simulate FCS checksum presence in the last > > descriptor */ > > - e1000e_write_to_rx_buffers(core, ba, &bastate, > > + e1000e_write_payload_to_rx_buffers(core, ba, &bastate, > > (const char *) &fcs_pad, > > e1000x_fcs_len(core->mac)); > > } > > } > > diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h > > index 13760c66d3..344fd10359 100644 > > --- a/hw/net/e1000x_regs.h > > +++ b/hw/net/e1000x_regs.h > > @@ -827,6 +827,7 @@ union e1000_rx_desc_packet_split { > > /* Receive Checksum Control bits */ > > #define E1000_RXCSUM_IPOFLD 0x100 /* IP Checksum Offload Enable */ > > #define E1000_RXCSUM_TUOFLD 0x200 /* TCP/UDP Checksum Offload > > Enable */ > > +#define E1000_RXCSUM_IPPCSE 0x1000 /* IP Payload Checksum enable */ > > #define E1000_RXCSUM_PCSD 0x2000 /* Packet Checksum Disable */ > > #define E1000_RING_DESC_LEN (16) > > diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c > > index 96b7335b31..47c4a16e84 100644 > > --- a/hw/net/igb_core.c > > +++ b/hw/net/igb_core.c > > @@ -267,6 +267,21 @@ igb_rx_use_legacy_descriptor(IGBCore *core) > > return false; > > } > > +typedef struct E1000E_RingInfo_st { > > + int dbah; > > + int dbal; > > + int dlen; > > + int dh; > > + int dt; > > + int idx; > > +} E1000E_RingInfo; > > + > > +static uint32_t > > +igb_rx_queue_desctyp_get(IGBCore *core, const E1000E_RingInfo *r) > > +{ > > + return core->mac[E1000_SRRCTL(r->idx) >> 2] & > > E1000_SRRCTL_DESCTYPE_MASK; > > +} > > + > > static inline bool > > igb_rss_enabled(IGBCore *core) > > { > > @@ -694,15 +709,6 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int > > queue_idx) > > return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0; > > } > > -typedef struct E1000E_RingInfo_st { > > - int dbah; > > - int dbal; > > - int dlen; > > - int dh; > > - int dt; > > - int idx; > > -} E1000E_RingInfo; > > - > > static inline bool > > igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r) > > { > > @@ -941,6 +947,14 @@ igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo > > *r, size_t total_size) > > bufsize; > > } > > +static uint32_t > > +igb_get_queue_rx_header_buf_size(IGBCore *core, const E1000E_RingInfo *r) > > +{ > > + uint32_t srr_size = (core->mac[E1000_SRRCTL(r->idx) >> 2] & > > + E1000_SRRCTL_BSIZEHDR_MASK) >> 8; > > + return srr_size * 64; > > Instead, do: > uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2]; > return (srrctl & E1000_SRRCTL_BSIZEHDRSIZE_MASK) >> > E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; > > x >> E1000_SRRCTL_BSIZEHDRSIZE_SHIFT is identical with: (x >> 8) * 64 > > Also, you should use E1000_SRRCTL_BSIZEHDRSIZE_MASK rather than > E1000_SRRCTL_BSIZEHDR_MASK here. Section 8.10.2 of the datasheet says the > field is 3-bit. > > > +} > > + > > void > > igb_start_recv(IGBCore *core) > > { > > @@ -1225,21 +1239,34 @@ igb_read_lgcy_rx_descr(IGBCore *core, struct > > e1000_rx_desc *desc, > > } > > static inline void > > -igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc, > > - hwaddr *buff_addr) > > +igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc > > *desc, > > + hwaddr *buff_addr) > > { > > *buff_addr = le64_to_cpu(desc->read.pkt_addr); > > } > > static inline void > > igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc, > > - hwaddr *buff_addr) > > + hwaddr buff_addr[MAX_PS_BUFFERS], > > Don't use an array here. Instead use a struct as e1000_adv_rx_desc does; it > will be more explicit and easier to read. And I think this should be rather > part of the next patch. > > > + const E1000E_RingInfo *r) > > { > > + uint32_t desc_type; > > + > > if (igb_rx_use_legacy_descriptor(core)) { > > - igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr); > > - } else { > > - igb_read_adv_rx_descr(core, &desc->adv, buff_addr); > > + igb_read_lgcy_rx_descr(core, &desc->legacy, &buff_addr[1]); > > + buff_addr[0] = 0; > > + return; > > } > > + > > + /* modes other than advanced single buffer descriptor not supported */ > > + desc_type = igb_rx_queue_desctyp_get(core, r); > > + if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) { > > + trace_igb_wrn_rx_desc_modes_not_supp(desc_type); > > + } > > + > > + /* advanced single buffer descriptor */ > > + igb_read_adv_rx_single_buf_descr(core, &desc->adv, &buff_addr[1]); > > + buff_addr[0] = 0; > > } > > static void > > @@ -1281,15 +1308,11 @@ igb_verify_csum_in_sw(IGBCore *core, > > } > > static void > > -igb_build_rx_metadata(IGBCore *core, > > - struct NetRxPkt *pkt, > > - bool is_eop, > > - const E1000E_RSSInfo *rss_info, uint16_t etqf, bool > > ts, > > - uint16_t *pkt_info, uint16_t *hdr_info, > > - uint32_t *rss, > > - uint32_t *status_flags, > > - uint16_t *ip_id, > > - uint16_t *vlan_tag) > > +igb_build_rx_metadata_common(IGBCore *core, > > + struct NetRxPkt *pkt, > > + bool is_eop, > > + uint32_t *status_flags, > > + uint16_t *vlan_tag) > > { > > struct virtio_net_hdr *vhdr; > > bool hasip4, hasip6, csum_valid; > > @@ -1298,7 +1321,6 @@ igb_build_rx_metadata(IGBCore *core, > > *status_flags = E1000_RXD_STAT_DD; > > /* No additional metadata needed for non-EOP descriptors */ > > - /* TODO: EOP apply only to status so don't skip whole function. */ > > if (!is_eop) { > > goto func_exit; > > } > > @@ -1315,64 +1337,6 @@ igb_build_rx_metadata(IGBCore *core, > > trace_e1000e_rx_metadata_vlan(*vlan_tag); > > } > > - /* Packet parsing results */ > > - if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) { > > - if (rss_info->enabled) { > > - *rss = cpu_to_le32(rss_info->hash); > > - trace_igb_rx_metadata_rss(*rss); > > - } > > - } else if (hasip4) { > > - *status_flags |= E1000_RXD_STAT_IPIDV; > > - *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt)); > > - trace_e1000e_rx_metadata_ip_id(*ip_id); > > - } > > - > > - if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) > > { > > - *status_flags |= E1000_RXD_STAT_ACK; > > - trace_e1000e_rx_metadata_ack(); > > - } > > - > > - if (pkt_info) { > > - *pkt_info = rss_info->enabled ? rss_info->type : 0; > > - > > - if (etqf < 8) { > > - *pkt_info |= BIT(11) | (etqf << 4); > > - } else { > > - if (hasip4) { > > - *pkt_info |= E1000_ADVRXD_PKT_IP4; > > - } > > - > > - if (hasip6) { > > - *pkt_info |= E1000_ADVRXD_PKT_IP6; > > - } > > - > > - switch (l4hdr_proto) { > > - case ETH_L4_HDR_PROTO_TCP: > > - *pkt_info |= E1000_ADVRXD_PKT_TCP; > > - break; > > - > > - case ETH_L4_HDR_PROTO_UDP: > > - *pkt_info |= E1000_ADVRXD_PKT_UDP; > > - break; > > - > > - case ETH_L4_HDR_PROTO_SCTP: > > - *pkt_info |= E1000_ADVRXD_PKT_SCTP; > > - break; > > - > > - default: > > - break; > > - } > > - } > > - } > > - > > - if (hdr_info) { > > - *hdr_info = 0; > > - } > > - > > - if (ts) { > > - *status_flags |= BIT(16); > > - } > > - > > /* RX CSO information */ > > if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) { > > trace_e1000e_rx_metadata_ipv6_sum_disabled(); > > @@ -1426,58 +1390,125 @@ func_exit: > > } > > static inline void > > -igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc, > > +igb_write_lgcy_rx_descr(IGBCore *core, > > + struct e1000_rx_desc *d, > > Why renaming desc to d? > > > struct NetRxPkt *pkt, > > - const E1000E_RSSInfo *rss_info, uint16_t etqf, > > bool ts, > > + const E1000E_RSSInfo *rss_info, > > uint16_t length) > > { > > - uint32_t status_flags, rss; > > - uint16_t ip_id; > > + uint32_t status_flags; > > assert(!rss_info->enabled); > > - desc->length = cpu_to_le16(length); > > - desc->csum = 0; > > - igb_build_rx_metadata(core, pkt, pkt != NULL, > > - rss_info, etqf, ts, > > - NULL, NULL, &rss, > > - &status_flags, &ip_id, > > - &desc->special); > > - desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24); > > - desc->status = (uint8_t) le32_to_cpu(status_flags); > > + memset(d, 0, sizeof(*d)); > > + d->length = cpu_to_le16(length); > > + igb_build_rx_metadata_common(core, pkt, pkt != NULL, > > + &status_flags, > > + &d->special); > > + > > + d->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24); > > + d->status = (uint8_t) le32_to_cpu(status_flags); > > +} > > + > > +static uint16_t > > +igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t > > etqf) > > +{ > > + uint16_t pkt_type = 0; > > + bool hasip4, hasip6; > > + EthL4HdrProto l4hdr_proto; > > + > > + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); > > + > > + if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) { > > + eth_ip6_hdr_info *ip6hdr_info = net_rx_pkt_get_ip6_info(pkt); > > + pkt_type |= ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E : > > + E1000_ADVRXD_PKT_IP6; > > + } else if (hasip4) { > > + pkt_type = E1000_ADVRXD_PKT_IP4; > > + } > > + > > + if (etqf < 8) { > > + pkt_type |= (BIT(11) >> 4) | etqf; > > + return pkt_type; > > + } > > + > > + switch (l4hdr_proto) { > > + case ETH_L4_HDR_PROTO_TCP: > > + pkt_type |= E1000_ADVRXD_PKT_TCP; > > + break; > > + case ETH_L4_HDR_PROTO_UDP: > > + pkt_type |= E1000_ADVRXD_PKT_UDP; > > + break; > > + case ETH_L4_HDR_PROTO_SCTP: > > + pkt_type |= E1000_ADVRXD_PKT_SCTP; > > + break; > > + default: > > + break; > > + } > > + > > + return pkt_type; > > } > > static inline void > > -igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc, > > +igb_write_adv_rx_descr(IGBCore *core, > > + union e1000_adv_rx_desc *d, > > struct NetRxPkt *pkt, > > - const E1000E_RSSInfo *rss_info, uint16_t etqf, bool > > ts, > > + const E1000E_RSSInfo *rss_info, > > + uint16_t etqf, > > + bool ts, > > uint16_t length) > > { > > - memset(&desc->wb, 0, sizeof(desc->wb)); > > + bool hasip4, hasip6; > > + EthL4HdrProto l4hdr_proto; > > + uint16_t rss_type = 0, pkt_type; > > + bool eop = (pkt != NULL); > > + memset(&d->wb, 0, sizeof(d->wb)); > > + > > + d->wb.upper.length = cpu_to_le16(length); > > + igb_build_rx_metadata_common(core, pkt, eop, > > + &d->wb.upper.status_error, > > + &d->wb.upper.vlan); > > + > > + if (!eop) { > > + return; > > + } > > + > > + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); > > - desc->wb.upper.length = cpu_to_le16(length); > > + if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) && rss_info->enabled) { > > I think the older style better described that RSS and checksumming is > mutally exclusive. > > > + d->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash); > > + rss_type = rss_info->type; > > + trace_igb_rx_metadata_rss(d->wb.lower.hi_dword.rss, rss_type); > > + } else if (!(core->mac[RXCSUM] & E1000_RXCSUM_PCSD) && > > + core->mac[RXCSUM] & E1000_RXCSUM_IPPCSE && > > + hasip4) { > > + d->wb.lower.hi_dword.csum_ip.ip_id = > > cpu_to_le16(net_rx_pkt_get_ip_id(pkt)); > > + trace_e1000e_rx_metadata_ip_id(d->wb.lower.hi_dword.csum_ip.ip_id); > > + } > > - igb_build_rx_metadata(core, pkt, pkt != NULL, > > - rss_info, etqf, ts, > > - &desc->wb.lower.lo_dword.pkt_info, > > - &desc->wb.lower.lo_dword.hdr_info, > > - &desc->wb.lower.hi_dword.rss, > > - &desc->wb.upper.status_error, > > - &desc->wb.lower.hi_dword.csum_ip.ip_id, > > - &desc->wb.upper.vlan); > > + if (ts) { > > + d->wb.upper.status_error |= BIT(16); > > + } > > + > > + pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf); > > + trace_e1000e_rx_metadata_pkt_type(pkt_type); > > + d->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << > > 4)); > > } > > static inline void > > -igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc, > > - struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, > > - uint16_t etqf, bool ts, uint16_t length) > > +igb_write_rx_descr(IGBCore *core, > > + union e1000_rx_desc_union *desc, > > + struct NetRxPkt *pkt, > > + const E1000E_RSSInfo *rss_info, > > + uint16_t etqf, > > + bool ts, > > + uint16_t(*written)[MAX_PS_BUFFERS], > > + const E1000E_RingInfo *r) > > { > > if (igb_rx_use_legacy_descriptor(core)) { > > - igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, > > - etqf, ts, length); > > + igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, > > (*written)[1]); > > } else { > > - igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, > > - etqf, ts, length); > > + igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, etqf, ts, > > (*written)[1]); > > } > > } > > @@ -1513,19 +1544,6 @@ igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice > > *dev, dma_addr_t addr, > > } > > } > > -static void > > -igb_write_to_rx_buffers(IGBCore *core, > > - PCIDevice *d, > > - hwaddr ba, > > - uint16_t *written, > > - const char *data, > > - dma_addr_t data_len) > > -{ > > - trace_igb_rx_desc_buff_write(ba, *written, data, data_len); > > - pci_dma_write(d, ba + *written, data, data_len); > > - *written += data_len; > > -} > > - > > static void > > igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi, > > size_t pkt_size, size_t pkt_fcs_size) > > @@ -1551,6 +1569,136 @@ igb_rx_descr_threshold_hit(IGBCore *core, const > > E1000E_RingInfo *rxi) > > ((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16; > > } > > +typedef struct igb_ba_state_st { > > + uint16_t written[MAX_PS_BUFFERS]; > > + uint8_t cur_idx; > > +} igb_ba_state; > > + > > +typedef struct { > > + size_t size; > > + size_t total_size; > > + size_t ps_hdr_len; > > + size_t desc_size; > > + size_t desc_offset; > > + uint32_t rx_desc_packet_buf_size; > > + uint32_t rx_desc_header_buf_size; > > + struct iovec *iov; > > + size_t iov_ofs; > > + bool is_first; > > + igb_ba_state bastate; > > + hwaddr ba[MAX_PS_BUFFERS]; > > +} igb_packet_tx_dma_state; > > Follow the convention described in docs/devel/style.rst for new typedefs. > > Also, it says tx but it's used for rx. > > > + > > +static void > > +igb_truncate_to_descriptor_size(igb_packet_tx_dma_state *pdma_st, size_t > > *size) > > +{ > > + if (*size > pdma_st->rx_desc_packet_buf_size) { > > + *size = pdma_st->rx_desc_packet_buf_size; > > + } > > +} > > + > > +static void > > +igb_write_payload_to_rx_buffers(IGBCore *core, > > + PCIDevice *d, > > + hwaddr (*ba)[MAX_PS_BUFFERS], > > > > + igb_ba_state *bastate, > > + uint32_t cur_buf_len, > > + const char *data, > > + dma_addr_t data_len) > > +{ > > + while (data_len > 0) { > > + assert(bastate->cur_idx < MAX_PS_BUFFERS); > > + > > + uint32_t cur_buf_bytes_left = cur_buf_len - > > + bastate->written[bastate->cur_idx]; > > + uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left); > > + > > + trace_igb_rx_desc_buff_write(bastate->cur_idx, > > + (*ba)[bastate->cur_idx], > > + bastate->written[bastate->cur_idx], > > + data, > > + bytes_to_write); > > + > > + pci_dma_write(d, > > + (*ba)[bastate->cur_idx] + > > + bastate->written[bastate->cur_idx], > > + data, bytes_to_write); > > + > > + bastate->written[bastate->cur_idx] += bytes_to_write; > > + data += bytes_to_write; > > + data_len -= bytes_to_write; > > + > > + if (bastate->written[bastate->cur_idx] == cur_buf_len) { > > + bastate->cur_idx++; > > + } > > + } > > +} > > + > > +static void > > +igb_write_packet_payload_to_descr_addr(IGBCore *core, > > The names of igb_write_payload_to_rx_buffers() and > igb_write_packet_payload_to_descr_addr() don't disambiguate themselves. > Since igb_write_payload_to_rx_buffers() only writes a framgent of the > payload, you may rename it to igb_write_payload_fragment_to_rx_buffers(), > and rename igb_write_packet_payload_to_descr_addr() to > igb_write_payload_to_rx_buffers(). > > > + struct NetRxPkt *pkt, > > + PCIDevice *d, > > + igb_packet_tx_dma_state *pdma_st, > > + size_t *copy_size) > > +{ > > + static const uint32_t fcs_pad; > > + size_t iov_copy; > > + > > + /* Copy packet payload */ > > + while (*copy_size) { > > + iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - > > pdma_st->iov_ofs); > > + igb_write_payload_to_rx_buffers(core, d, > > + &pdma_st->ba, > > + &pdma_st->bastate, > > + pdma_st->rx_desc_packet_buf_size, > > + pdma_st->iov->iov_base + > > pdma_st->iov_ofs, > > + iov_copy); > > + > > + *copy_size -= iov_copy; > > + pdma_st->iov_ofs += iov_copy; > > + if (pdma_st->iov_ofs == pdma_st->iov->iov_len) { > > + pdma_st->iov++; > > + pdma_st->iov_ofs = 0; > > + } > > + } > > + > > + if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) { > > + /* Simulate FCS checksum presence in the last descriptor */ > > + igb_write_payload_to_rx_buffers(core, d, > > + &pdma_st->ba, > > + &pdma_st->bastate, > > + pdma_st->rx_desc_packet_buf_size, > > + (const char *) &fcs_pad, > > + e1000x_fcs_len(core->mac)); > > + } > > +} > > + > > +static void > > +igb_write_to_rx_buffers(IGBCore *core, > > + struct NetRxPkt *pkt, > > + PCIDevice *d, > > + igb_packet_tx_dma_state *pdma_st) > > +{ > > + size_t copy_size; > > + > > + if (!(pdma_st->ba)[1]) { > > + /* as per intel docs; skip descriptors with null buf addr */ > > + trace_e1000e_rx_null_descriptor(); > > + return; > > + } > > + > > + if (pdma_st->desc_offset >= pdma_st->size) { > > + return; > > + } > > + > > + pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset; > > + igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size); > > + copy_size = pdma_st->size - pdma_st->desc_offset; > > + igb_truncate_to_descriptor_size(pdma_st, ©_size); > > + pdma_st->bastate.cur_idx = 1; > > + igb_write_packet_payload_to_descr_addr(core, pkt, d, pdma_st, > > ©_size); > > +} > > + > > static void > > igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt, > > const E1000E_RxRing *rxr, > > @@ -1560,91 +1708,58 @@ igb_write_packet_to_guest(IGBCore *core, struct > > NetRxPkt *pkt, > > PCIDevice *d; > > dma_addr_t base; > > union e1000_rx_desc_union desc; > > - size_t desc_size; > > - size_t desc_offset = 0; > > - size_t iov_ofs = 0; > > - > > - struct iovec *iov = net_rx_pkt_get_iovec(pkt); > > - size_t size = net_rx_pkt_get_total_len(pkt); > > - size_t total_size = size + e1000x_fcs_len(core->mac); > > - const E1000E_RingInfo *rxi = rxr->i; > > - size_t bufsize = igb_rxbufsize(core, rxi); > > - > > + const E1000E_RingInfo *rxi; > > + size_t rx_desc_len; > > + > > + igb_packet_tx_dma_state pdma_st = {0}; > > + pdma_st.is_first = true; > > + pdma_st.size = net_rx_pkt_get_total_len(pkt); > > + pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac); > > + > > + rxi = rxr->i; > > + rx_desc_len = core->rx_desc_len; > > + pdma_st.rx_desc_packet_buf_size = > > + igb_rxbufsize(core, rxi); > > + pdma_st.rx_desc_header_buf_size = > > + igb_get_queue_rx_header_buf_size(core, rxi); > > + pdma_st.iov = net_rx_pkt_get_iovec(pkt); > > d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8); > > if (!d) { > > d = core->owner; > > } > > do { > > - hwaddr ba; > > - uint16_t written = 0; > > + memset(&pdma_st.bastate, 0, sizeof(igb_ba_state)); > > bool is_last = false; > > - desc_size = total_size - desc_offset; > > - > > - if (desc_size > bufsize) { > > - desc_size = bufsize; > > - } > > - > > if (igb_ring_empty(core, rxi)) { > > return; > > } > > base = igb_ring_head_descr(core, rxi); > > + pci_dma_read(d, base, &desc, rx_desc_len); > > + trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len); > > - pci_dma_read(d, base, &desc, core->rx_desc_len); > > - > > - trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len); > > - > > - igb_read_rx_descr(core, &desc, &ba); > > - > > - if (ba) { > > - if (desc_offset < size) { > > - static const uint32_t fcs_pad; > > - size_t iov_copy; > > - size_t copy_size = size - desc_offset; > > - if (copy_size > bufsize) { > > - copy_size = bufsize; > > - } > > - > > - /* Copy packet payload */ > > - while (copy_size) { > > - iov_copy = MIN(copy_size, iov->iov_len - iov_ofs); > > - > > - igb_write_to_rx_buffers(core, d, ba, &written, > > - iov->iov_base + iov_ofs, > > iov_copy); > > + igb_read_rx_descr(core, &desc, pdma_st.ba, rxi); > > - copy_size -= iov_copy; > > - iov_ofs += iov_copy; > > - if (iov_ofs == iov->iov_len) { > > - iov++; > > - iov_ofs = 0; > > - } > > - } > > - > > - if (desc_offset + desc_size >= total_size) { > > - /* Simulate FCS checksum presence in the last > > descriptor */ > > - igb_write_to_rx_buffers(core, d, ba, &written, > > - (const char *) &fcs_pad, > > e1000x_fcs_len(core->mac)); > > - } > > - } > > - } else { /* as per intel docs; skip descriptors with null buf addr > > */ > > - trace_e1000e_rx_null_descriptor(); > > - } > > - desc_offset += desc_size; > > - if (desc_offset >= total_size) { > > + igb_write_to_rx_buffers(core, pkt, d, &pdma_st); > > + pdma_st.desc_offset += pdma_st.desc_size; > > + if (pdma_st.desc_offset >= pdma_st.total_size) { > > is_last = true; > > } > > - igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL, > > - rss_info, etqf, ts, written); > > - igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len); > > - > > - igb_ring_advance(core, rxi, core->rx_desc_len / > > E1000_MIN_RX_DESC_LEN); > > - > > - } while (desc_offset < total_size); > > - > > - igb_update_rx_stats(core, rxi, size, total_size); > > + igb_write_rx_descr(core, &desc, > > + is_last ? pkt : NULL, > > + rss_info, > > + etqf, ts, > > + &pdma_st.bastate.written, > > + rxi); > > + pci_dma_write(d, base, &desc, rx_desc_len); > > + igb_ring_advance(core, rxi, > > + rx_desc_len / E1000_MIN_RX_DESC_LEN); > > + } while (pdma_st.desc_offset < pdma_st.total_size); > > + > > + igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size); > > } > > static bool > > diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h > > index 82ff195dfc..b14ca834ad 100644 > > --- a/hw/net/igb_regs.h > > +++ b/hw/net/igb_regs.h > > @@ -76,6 +76,15 @@ union e1000_adv_rx_desc { > > } wb; /* writeback */ > > }; > > +#define IGB_MAX_PS_BUFFERS 2 > > +#define IGB_MAX_RX_DESC_LEN (sizeof(uint64_t) * MAX_PS_BUFFERS) > > + > > +#define RX_DESC_ADV_HDR_LEN_OFFSET (21 - 16) > > +#define RX_DESC_ADV_HDR_LEN_MASK ((BIT(10) - 1) << \ > > + RX_DESC_ADV_HDR_LEN_OFFSET) > > +#define RX_DESC_ADV_HDR_SPH BIT(15) > > +#define RX_DESC_ADV_ST_ERR_HBO_OFFSET BIT(3 + 20) > > + > > These should be added afer /* new */ > > And use the name ADVRXD to denote the advanced Rx descriptor to make it > consistent with ADVTXD. > > > /* from igb/e1000_phy.h */ > > /* IGP01E1000 Specific Registers */ > > @@ -452,6 +461,7 @@ union e1000_adv_rx_desc { > > #define E1000_SRRCTL_BSIZEHDRSIZE_MASK 0x00000F00 > > #define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT 2 /* Shift _left_ */ > > #define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF 0x02000000 > > +#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT 0x04000000 > > #define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000 > > #define E1000_SRRCTL_DESCTYPE_MASK 0x0E000000 > > #define E1000_SRRCTL_DROP_EN 0x80000000 > > @@ -692,11 +702,12 @@ union e1000_adv_rx_desc { > > #define E1000_STATUS_NUM_VFS_SHIFT 14 > > -#define E1000_ADVRXD_PKT_IP4 BIT(4) > > -#define E1000_ADVRXD_PKT_IP6 BIT(6) > > -#define E1000_ADVRXD_PKT_TCP BIT(8) > > -#define E1000_ADVRXD_PKT_UDP BIT(9) > > -#define E1000_ADVRXD_PKT_SCTP BIT(10) > > +#define E1000_ADVRXD_PKT_IP4 BIT(0) > > +#define E1000_ADVRXD_PKT_IP6 BIT(2) > > +#define E1000_ADVRXD_PKT_IP6E BIT(3) > > +#define E1000_ADVRXD_PKT_TCP BIT(4) > > +#define E1000_ADVRXD_PKT_UDP BIT(5) > > +#define E1000_ADVRXD_PKT_SCTP BIT(6) > > static inline uint8_t igb_ivar_entry_rx(uint8_t i) > > { > > diff --git a/hw/net/trace-events b/hw/net/trace-events > > index e4a98b2c7d..9a02261360 100644 > > --- a/hw/net/trace-events > > +++ b/hw/net/trace-events > > @@ -277,9 +277,9 @@ igb_core_mdic_write_unhandled(uint32_t addr) "MDIC > > WRITE: PHY[%u] UNHANDLED" > > igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool > > pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, > > PF reset done: %d" > > igb_rx_desc_buff_size(uint32_t b) "buffer size: %u" > > -igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, > > uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u" > > +igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const > > void* source, uint32_t len) "buffer #%u, addr: 0x%"PRIx64", offset: %u, > > from: %p, length: %u" > > -igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X" > > +igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: > > 0x%X, rss_pkt_type: 0x%X" > > igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to > > GPIE.NSICR enabled" > > igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x" > > @@ -294,6 +294,8 @@ igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) > > "EITR[%u] = 0x%x" > > igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x" > > igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x" > > +igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor > > type: %d" > > + > > # igbvf.c > > igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64 > > diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c > > index a603468beb..4e720ee7e7 100644 > > --- a/tests/qtest/libqos/igb.c > > +++ b/tests/qtest/libqos/igb.c > > @@ -55,6 +55,7 @@ static void igb_pci_start_hw(QOSGraphObject *obj) > > { > > static const uint8_t address[] = E1000E_ADDRESS; > > QE1000E_PCI *d = (QE1000E_PCI *) obj; > > + uint32_t srrctl0; > > uint32_t val; > > /* Enable the device */ > > @@ -109,6 +110,10 @@ static void igb_pci_start_hw(QOSGraphObject *obj) > > E1000_RAH_AV | E1000_RAH_POOL_1 | > > le16_to_cpu(*(uint16_t *)(address + 4))); > > + /* Set supported receive descriptor mode */ > > + srrctl0 = e1000e_macreg_read(&d->e1000e, E1000_SRRCTL(0)); > > + e1000e_macreg_write(&d->e1000e, E1000_SRRCTL(0), srrctl0 | BIT(25)); > > As I wrote in previous review: > > You don't need to read the initial value. The Linux driver also > > doesn't do that. > > > > Also use: E1000_SRRCTL_DESCTYPE_ADV_ONEBUF > > > + > > /* Enable receive */ > > e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN); > > e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN); >