[PATCH 23.11 1/1] net/e1000: fix igc launch time calculation
Shani Peretz
shperetz at nvidia.com
Tue Apr 21 07:36:03 CEST 2026
> -----Original Message-----
> From: Song Yoong Siang <yoong.siang.song at intel.com>
> Sent: Tuesday, 7 April 2026 8:23
> To: stable at dpdk.org
> Cc: Shani Peretz <shperetz at nvidia.com>; Bruce Richardson
> <bruce.richardson at intel.com>; David Zage <david.zage at intel.com>; Song
> Yoong Siang <yoong.siang.song at intel.com>
> Subject: [PATCH 23.11 1/1] net/e1000: fix igc launch time calculation
>
> [ upstream commit 2e79349dcd07440a7aecd61f00792d82e1bfebbc ]
>
> Improve the launch time calculation logic to handle different scenarios
> (see the sketch after this list):
> - Set launch time to 0 if txtime has expired.
> - Set launch time to 0 if txtime exceeds the horizon (beyond the end of
> the next Qbv cycle).
> - Mark the first flag in the context descriptor when the packet is the
> first one scheduled in the next Qbv cycle.
> - Create a dummy packet to dirty the current cycle before sending
> packets intended for the next Qbv cycle.
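>
> In short, the new igc_tx_launchtime() decision is (a simplified sketch of
> the diff below; the SYSTIM register reads that produce systime are elided):
>
>     cycles_elapsed    = (systime - base_time) / cycle_time;
>     current_cycle_end = base_time + (cycles_elapsed + 1) * cycle_time;
>
>     if (txtime <= systime ||                       /* already expired */
>         txtime >= current_cycle_end + cycle_time)  /* beyond the horizon */
>             return 0;
>
>     launchtime = (txtime - base_time) % cycle_time;
>     if (txtime >= current_cycle_end) {
>             /* First packet scheduled into the next cycle: set the FRST
>              * flag and, if nothing was sent in the current cycle yet,
>              * insert a dummy packet to dirty it. */
>     }
>     return launchtime;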
>
> Testing was performed on two Intel ADL-S platforms with i226 NICs connected
> back-to-back. A DPDK sample application was created to send 10 UDP packets at
> 20,000-nanosecond intervals, with their txtime set to the time of the next
> Qbv cycle. Meanwhile, the tcpdump command below was used on the link partner
> to capture the deltas between the Rx hardware timestamps of the 10 packets:
>
> tcpdump -ttt -ni enp1s0 --time-stamp-precision=nano -j adapter_unsynced
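>
> The sample application itself is not part of this patch; setting the
> per-packet txtime would look roughly like the sketch below (assumptions:
> the RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP offload is enabled, and
> next_cycle_start is a hypothetical variable holding the PTP time of the
> next Qbv cycle in nanoseconds):
>
>     #include <rte_mbuf_dyn.h>
>
>     /* One-time lookups of the Tx timestamp dynfield and dynflag. */
>     int ts_off = rte_mbuf_dynfield_lookup(RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL);
>     uint64_t ts_flag =
>         1ULL << rte_mbuf_dynflag_lookup(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL);
>
>     /* Per packet i: launch times 20,000 ns apart, starting at the next cycle. */
>     *RTE_MBUF_DYNFIELD(mbufs[i], ts_off, uint64_t *) = next_cycle_start + i * 20000;
>     mbufs[i]->ol_flags |= ts_flag;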
>
> Without this patch, packets are transmitted immediately because the hardware
> interprets their launch time as expired, resulting in 8,384-nanosecond
> intervals (wire speed for 1024-byte packets at 1 Gbps), as shown in the
> tcpdump log below:
>
> 00:00:00.000000000 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000008384 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
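>
> (Sanity check on the 8,384 ns spacing: assuming the 1024 bytes exclude the
> FCS, each frame occupies 1024 + 4 (FCS) + 8 (preamble/SFD) + 12 (IFG) =
> 1048 bytes = 8384 bits on the wire, i.e. exactly 8384 ns at 1 Gbps.)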
>
> With this patch, packets are properly held until the next Qbv cycle and
> transmitted at the intended 20,000-nanosecond intervals, demonstrating
> correct launch time behavior, as shown in the tcpdump log below:
>
> 00:00:00.000000000 [|llc]
> 00:00:00.000862592 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000019993 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000020000 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000020010 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000019997 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000020000 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000020003 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000019990 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000020000 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
> 00:00:00.000020000 IP 192.168.1.100.2 > 224.1.1.1.5: UDP, length 982
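>
> (The leading [|llc] entry is presumably the dummy frame inserted to dirty
> the current cycle, and the subsequent deltas stay within roughly 10 ns of
> the programmed 20,000 ns interval.)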
>
> Fixes: 9630f7c71ecd ("net/igc: enable launch time offloading")
> Cc: stable at dpdk.org
>
> Signed-off-by: David Zage <david.zage at intel.com>
> Signed-off-by: Song Yoong Siang <yoong.siang.song at intel.com>
> Acked-by: Bruce Richardson <bruce.richardson at intel.com>
> ---
> drivers/net/igc/igc_txrx.c | 222 +++++++++++++++++++++++++++++++------
> drivers/net/igc/igc_txrx.h | 9 ++
> 2 files changed, 199 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/net/igc/igc_txrx.c b/drivers/net/igc/igc_txrx.c
> index 5241d2fd1c..b774b4df14 100644
> --- a/drivers/net/igc/igc_txrx.c
> +++ b/drivers/net/igc/igc_txrx.c
> @@ -92,9 +92,13 @@
> /* L4 Packet TYPE of Reserved */
> #define IGC_ADVTXD_TUCMD_L4T_RSV 0x00001800
>
> +/* Indicate the first packet in a Qbv cycle */
> +#define IGC_ADVTXD_TSN_CNTX_FRST 0x00000080
> +
> #define IGC_TX_OFFLOAD_NOTSUP_MASK (RTE_MBUF_F_TX_OFFLOAD_MASK ^ IGC_TX_OFFLOAD_MASK)
>
> #define IGC_TS_HDR_LEN 16
> +#define IGC_DUMMY_PKT_SIZE 64
>
> static inline uint64_t
> rx_desc_statuserr_to_pkt_flags(uint32_t statuserr)
> @@ -1442,34 +1446,166 @@ what_advctx_update(struct igc_tx_queue *txq, uint64_t flags,
> return IGC_CTX_NUM;
> }
>
> -static uint32_t igc_tx_launchtime(uint64_t txtime, uint16_t port_id)
> +static uint32_t
> +igc_tx_launchtime(uint64_t txtime, struct igc_tx_queue *txq,
> + bool *need_dummy_pkt, bool *need_frst_flag)
> {
> - struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> + struct rte_eth_dev *dev = &rte_eth_devices[txq->port_id];
> struct igc_adapter *adapter = IGC_DEV_PRIVATE(dev);
> - uint64_t base_time = adapter->base_time;
> + struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
> uint64_t cycle_time = adapter->cycle_time;
> + uint64_t base_time = adapter->base_time;
> + uint64_t current_cycle_end;
> + uint64_t cycles_elapsed;
> uint32_t launchtime;
> + uint32_t nsec, sec;
> + uint64_t systime;
> +
> + /*
> + * Read current PTP hardware time from SYSTIM registers.
> + * Reading the SYSTIML register latches the upper 32 bits to the SYSTIMH
> + * shadow register for coherent access. As long as we read SYSTIML first
> + * followed by SYSTIMH, we avoid race conditions where the time rolls
> + * over between the two register reads.
> + */
> + nsec = IGC_READ_REG(hw, IGC_SYSTIML);
> + sec = IGC_READ_REG(hw, IGC_SYSTIMH);
> + systime = (uint64_t)sec * NSEC_PER_SEC + (uint64_t)nsec;
>
> + /* Calculate end time of current Qbv cycle */
> + cycles_elapsed = (systime - base_time) / cycle_time;
> +	current_cycle_end = (cycles_elapsed + 1) * cycle_time + base_time;
> +
> + /* Set launchtime to 0 if txtime has expired or exceeds the horizon */
> + if (txtime <= systime || txtime >= current_cycle_end + cycle_time) {
> + txq->last_packet_cycle = current_cycle_end;
> + return 0;
> + }
> +
> +	/* Calculate launchtime to be inserted into Tx context descriptor */
> launchtime = (txtime - base_time) % cycle_time;
>
> +	/* Handle txtime that falls into the next Qbv cycle */
> + if (txtime >= current_cycle_end) {
> + /* Only mark as first if the cycle hasn't had a first pkt yet */
> + if (txq->last_frst_flag != current_cycle_end) {
> + *need_frst_flag = true;
> + txq->last_frst_flag = current_cycle_end;
> +
> + /* Check if we need dummy pkt to dirty current cycle */
> + if (txq->last_packet_cycle < current_cycle_end)
> + *need_dummy_pkt = true;
> + }
> + txq->last_packet_cycle = current_cycle_end + cycle_time;
> + } else {
> + txq->last_packet_cycle = current_cycle_end;
> + }
> +
> 	return rte_cpu_to_le_32(launchtime);
> }
>
> +/*
> + * If the IGC_ADVTXD_TSN_CNTX_FRST flag is used to schedule a packet for
> + * the next Qbv cycle while no packet was transmitted from that queue in
> + * the current cycle, then the IGC_ADVTXD_TSN_CNTX_FRST flag may be valid
> + * in the current cycle and the packet will be transmitted in the current
> + * cycle. To overcome this issue, we transmit an IGC_DUMMY_PKT_SIZE byte
> + * "dummy" packet to "dirty" the current cycle before sending the packet
> + * intended for the next cycle.
> + */
> +static void
> +igc_insert_dummy_packet(struct igc_tx_queue *txq, uint16_t *tx_id)
> +{
> + volatile union igc_adv_tx_desc * const txr = txq->tx_ring;
> + struct igc_tx_entry * const sw_ring = txq->sw_ring;
> + volatile struct igc_adv_tx_context_desc *ctx_txd;
> + volatile union igc_adv_tx_desc *txd;
> + struct igc_tx_entry *txe, *txn;
> +
> + /* Get Tx entry (txe) for Tx context descriptor of dummy packet */
> + txe = &sw_ring[*tx_id];
> +
> + /* Prepare for next Tx entry (txn) */
> + txn = &sw_ring[txe->next_id];
> + RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
> +
> + /* Set up Tx context descriptor for dummy packet */
> + ctx_txd = (volatile struct igc_adv_tx_context_desc *)&txr[*tx_id];
> +	ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(IGC_ADVTXD_DTYP_CTXT |
> +			IGC_ADVTXD_DCMD_DEXT);
> + ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(txq->ctx_curr <<
> + IGC_ADVTXD_IDX_SHIFT);
> + ctx_txd->vlan_macip_lens = 0;
> + ctx_txd->u.launch_time = 0;
> +
> + /* Update tx_id and last_id */
> + *tx_id = txe->next_id;
> + txe->last_id = *tx_id;
> +
> + /* Get Tx entry (txe) for Tx data descriptor of dummy packet */
> + txe = txn;
> +
> + /* Prepare for next Tx entry (txn) */
> + txn = &sw_ring[txe->next_id];
> + RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
> +
> + /* Free previous mbuf */
> + if (txe->mbuf != NULL) {
> + rte_pktmbuf_free_seg(txe->mbuf);
> + txe->mbuf = NULL;
> + }
> +
> + /* Set up Tx data descriptor for dummy packet */
> + txd = &txr[*tx_id];
> + txd->read.buffer_addr = rte_cpu_to_le_64(txq->dummy_pkt_dma);
> + txd->read.cmd_type_len = rte_cpu_to_le_32(txq->txd_type |
> + IGC_DUMMY_PKT_SIZE | IGC_ADVTXD_DCMD_IFCS |
> + IGC_ADVTXD_DCMD_DEXT | IGC_TXD_CMD_EOP |
> + IGC_TXD_CMD_RS);
> + txd->read.olinfo_status = rte_cpu_to_le_32(IGC_DUMMY_PKT_SIZE <<
> + IGC_ADVTXD_PAYLEN_SHIFT);
> +
> + /* Update last_id and tx_id */
> + txe->last_id = *tx_id;
> + *tx_id = txe->next_id;
> +
> + /* Get Tx entry (txe) for Tx context descriptor of actual packet */
> + txe = txn;
> +
> + /* Prepare for next Tx entry (txn) */
> + txn = &sw_ring[txe->next_id];
> + RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
> +
> + /* Free previous mbuf */
> + if (txe->mbuf != NULL) {
> + rte_pktmbuf_free_seg(txe->mbuf);
> + txe->mbuf = NULL;
> + }
> +
> + /* Update ctx_curr */
> + txq->ctx_curr ^= 1;
> +}
> +
> /*
> * This is a separate function, looking for optimization opportunity here
> * Rework required to go with the pre-defined values.
> */
> static inline void
> -igc_set_xmit_ctx(struct igc_tx_queue *txq,
> - volatile struct igc_adv_tx_context_desc *ctx_txd,
> +igc_set_xmit_ctx(struct igc_tx_queue *txq, uint16_t *tx_id,
> uint64_t ol_flags, union igc_tx_offload tx_offload,
> - uint64_t txtime)
> + uint64_t txtime, uint16_t tx_last)
> {
> + volatile union igc_adv_tx_desc * const txr = txq->tx_ring;
> + struct igc_tx_entry * const sw_ring = txq->sw_ring;
> + volatile struct igc_adv_tx_context_desc *ctx_txd;
> + struct igc_tx_entry *txe;
> uint32_t type_tucmd_mlhl;
> - uint32_t mss_l4len_idx;
> + uint32_t mss_l4len_idx = 0;
> uint32_t ctx_curr;
> uint32_t vlan_macip_lens;
> union igc_tx_offload tx_offload_mask;
> + bool need_frst_flag = false;
> + bool need_dummy_pkt = false;
> + uint32_t launch_time = 0;
>
> /* Use the previous context */
> txq->ctx_curr ^= 1;
> @@ -1478,9 +1614,6 @@ igc_set_xmit_ctx(struct igc_tx_queue *txq,
> tx_offload_mask.data = 0;
> type_tucmd_mlhl = 0;
>
> - /* Specify which HW CTX to upload. */
> - mss_l4len_idx = (ctx_curr << IGC_ADVTXD_IDX_SHIFT);
> -
> if (ol_flags & RTE_MBUF_F_TX_VLAN)
> tx_offload_mask.vlan_tci = 0xffff;
>
> @@ -1542,18 +1675,32 @@ igc_set_xmit_ctx(struct igc_tx_queue *txq,
> txq->ctx_cache[ctx_curr].tx_offload.data =
> tx_offload_mask.data & tx_offload.data;
> txq->ctx_cache[ctx_curr].tx_offload_mask = tx_offload_mask;
> + } else {
> + launch_time = igc_tx_launchtime(txtime, txq, &need_dummy_pkt,
> + &need_frst_flag);
> }
>
> + if (need_frst_flag)
> + mss_l4len_idx |= IGC_ADVTXD_TSN_CNTX_FRST;
> +
> + if (need_dummy_pkt)
> + igc_insert_dummy_packet(txq, tx_id);
> +
> + /* Specify which HW CTX to upload. */
> + mss_l4len_idx |= (txq->ctx_curr << IGC_ADVTXD_IDX_SHIFT);
> +
> + /* Set up Tx context descriptor */
> +	ctx_txd = (volatile struct igc_adv_tx_context_desc *)&txr[*tx_id];
> ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
> vlan_macip_lens = (uint32_t)tx_offload.data;
> ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
> ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
> + ctx_txd->u.launch_time = launch_time;
>
> - if (txtime)
> - ctx_txd->u.launch_time = igc_tx_launchtime(txtime,
> - txq->port_id);
> - else
> - ctx_txd->u.launch_time = 0;
> + /* Update last_id and tx_id */
> + txe = &sw_ring[*tx_id];
> + txe->last_id = tx_last;
> + *tx_id = txe->next_id;
> }
>
> static inline uint32_t
> @@ -1603,7 +1750,7 @@ igc_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> uint64_t tx_ol_req;
> uint32_t new_ctx = 0;
> union igc_tx_offload tx_offload = {0};
> - uint64_t ts;
> + uint64_t ts = 0;
>
> tx_id = txq->tx_tail;
> txe = &sw_ring[tx_id];
> @@ -1653,7 +1800,7 @@ igc_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> /*
> * Check if there are enough free descriptors in the TX ring
> * to transmit the next packet.
> - * This operation is based on the two following rules:
> + * This operation is based on the three following rules:
> *
> * 1- Only check that the last needed TX descriptor can be
> * allocated (by construction, if that descriptor is free,
> @@ -1674,13 +1821,17 @@ igc_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> * By extension, avoid to allocate a free descriptor that
> * belongs to the last set of free descriptors allocated
> * to the same packet previously transmitted.
> + *
> + * 3- Make sure there are two extra descriptors available in
> + * the ring, in case a dummy packet is needed to dirty the
> + * current Qbv cycle when using launch time feature.
> */
>
> /*
> * The "last descriptor" of the previously sent packet, if any,
> * which used the last descriptor to allocate.
> */
> - tx_end = sw_ring[tx_last].last_id;
> + tx_end = sw_ring[tx_last + 2].last_id;
>
> /*
> 	 * The next descriptor following that "last descriptor" in the
> @@ -1740,10 +1891,6 @@ igc_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> if (tx_ol_req) {
> /* Setup TX Advanced context descriptor if required */
> if (new_ctx) {
> - volatile struct igc_adv_tx_context_desc *
> - ctx_txd = (volatile struct
> - igc_adv_tx_context_desc *)&txr[tx_id];
> -
> txn = &sw_ring[txe->next_id];
> RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
>
> @@ -1752,20 +1899,15 @@ igc_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> txe->mbuf = NULL;
> }
>
> - if (igc_tx_timestamp_dynflag > 0) {
> + if (igc_tx_timestamp_dynflag > 0)
> ts = *RTE_MBUF_DYNFIELD(tx_pkt,
> igc_tx_timestamp_dynfield_offset,
> uint64_t *);
> - igc_set_xmit_ctx(txq, ctx_txd,
> - tx_ol_req, tx_offload, ts);
> - } else {
> - igc_set_xmit_ctx(txq, ctx_txd,
> - tx_ol_req, tx_offload, 0);
> - }
>
> - txe->last_id = tx_last;
> - tx_id = txe->next_id;
> - txe = txn;
> + igc_set_xmit_ctx(txq, &tx_id, tx_ol_req,
> + tx_offload, ts,
> + tx_last);
> +
> + txe = &sw_ring[tx_id];
> }
>
> 		/* Setup the TX Advanced Data Descriptor */
> @@ -1863,6 +2005,7 @@ static void igc_tx_queue_release(struct igc_tx_queue *txq)
> {
> igc_tx_queue_release_mbufs(txq);
> + rte_free(txq->dummy_pkt_buf);
> rte_free(txq->sw_ring);
> rte_free(txq);
> }
> @@ -2017,6 +2160,21 @@ int eth_igc_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
> 	PMD_DRV_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%" PRIx64,
> txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
>
> + /* Allocate dummy packet buffer */
> +	txq->dummy_pkt_buf = rte_zmalloc("dummy_pkt", IGC_DUMMY_PKT_SIZE,
> +			RTE_CACHE_LINE_SIZE);
> + if (txq->dummy_pkt_buf == NULL) {
> + igc_tx_queue_release(txq);
> + return -ENOMEM;
> + }
> +
> + txq->dummy_pkt_dma = rte_mem_virt2iova(txq->dummy_pkt_buf);
> + if (txq->dummy_pkt_dma == RTE_BAD_IOVA) {
> +		PMD_DRV_LOG(ERR, "Failed to get DMA address for dummy packet");
> + igc_tx_queue_release(txq);
> + return -ENOMEM;
> + }
> +
> igc_reset_tx_queue(txq);
> dev->tx_pkt_burst = igc_xmit_pkts;
> 	dev->tx_pkt_prepare = &eth_igc_prep_pkts;
> diff --git a/drivers/net/igc/igc_txrx.h b/drivers/net/igc/igc_txrx.h
> index ad7d3b4ca5..21286db00f 100644
> --- a/drivers/net/igc/igc_txrx.h
> +++ b/drivers/net/igc/igc_txrx.h
> @@ -128,6 +128,15 @@ struct igc_tx_queue {
> struct igc_advctx_info ctx_cache[IGC_CTX_NUM];
> /**< Hardware context history.*/
> uint64_t offloads; /**< offloads of RTE_ETH_TX_OFFLOAD_* */
> +
> + /**< Qbv cycle when the last first flag was marked. */
> + uint64_t last_frst_flag;
> + /**< Qbv cycle when the last packet was transmitted. */
> + uint64_t last_packet_cycle;
> + /**< Virtual address of dummy packet buffer for Qbv cycle dirtying. */
> + void *dummy_pkt_buf;
> +	/**< DMA/physical address of dummy packet buffer for hardware access. */
> + rte_iova_t dummy_pkt_dma;
> };
>
> /*
> --
> 2.34.1
Thanks, applied to 23.11