From 0cfcb2963d1739588c033c7d18d28f181dd909e4 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:42 +0100 Subject: [PATCH 01/23] xdp: use modulo operation to calculate XDP frag tailroom The current formula for calculating XDP tailroom in mbuf packets works only if each frag has its own page (i.e. if rxq->frag_size is PAGE_SIZE). This defeats the purpose of the parameter overall and, without any indication, leads to a negative calculated tailroom on at least half of the frags if shared pages are used. There are not many drivers that set rxq->frag_size. Among them: * i40e and enetc always split the page uniformly between frags and use shared pages * ice uses page_pool frags via libeth; those are power-of-2 sized and uniformly distributed across the page * idpf has variable frag_size with XDP on, so the current API is not applicable * mlx5, mtk and mvneta use PAGE_SIZE or 0 as frag_size for page_pool As for AF_XDP ZC, only ice, i40e and idpf declare frag_size for it. The modulo operation yields good results for aligned chunks, as they are all power-of-2 sized, between 2K and PAGE_SIZE. The formula without modulo fails when chunk_size is 2K. Buffers in unaligned mode are not distributed uniformly, so the modulo operation would not work for them. To accommodate unaligned buffers, we could define frag_size as data + tailroom, and hence not subtract the offset when calculating tailroom, but this would necessitate more changes in the drivers. Define rxq->frag_size as an even portion of a page that fully belongs to a single frag. When calculating tailroom, locate the data start within such a portion by performing a modulo operation on the page offset. 
Fixes: bf25146a5595 ("bpf: add frags support to the bpf_xdp_adjust_tail() API") Acked-by: Jakub Kicinski Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-2-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 029e560e32ce3e..31a8536eb03ebb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4156,7 +4156,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; - tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + tailroom = rxq->frag_size - skb_frag_size(frag) - + skb_frag_off(frag) % rxq->frag_size; if (unlikely(offset > tailroom)) return -EINVAL; From d040ccb028f88717fc295a338627de36e7943086 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:43 +0100 Subject: [PATCH 02/23] xsk: introduce helper to determine rxq->frag_size rxq->frag_size is basically a step between consecutive strictly aligned frames. In ZC mode, chunk size fits exactly, but if chunks are unaligned, there is no safe way to determine accessible space to grow tailroom. Report frag_size to be zero, if chunks are unaligned, chunk_size otherwise. 
Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-3-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 242e34f771cca6..09d972f4bd6089 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -51,6 +51,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } +static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) +{ + return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool); +} + static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { @@ -337,6 +342,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) return 0; } +static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) +{ + return 0; +} + static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { From 8468091cbe9bab37b33a04c73eb8ef3e1a7c7263 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:48 +0100 Subject: [PATCH 03/23] libeth, idpf: use truesize as XDP RxQ info frag_size The only user of frag_size field in XDP RxQ info is bpf_xdp_frags_increase_tail(). It clearly expects whole buffer size instead of DMA write size. Different assumptions in idpf driver configuration lead to negative tailroom. To make it worse, buffer sizes are not actually uniform in idpf when splitq is enabled, as there are several buffer queues, so rxq->rx_buf_size is meaningless in this case. Use truesize of the first bufq in AF_XDP ZC, as there is only one. Disable growing tail for regular splitq. 
Fixes: ac8a861f632e ("idpf: prepare structures to support XDP") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-8-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/xdp.c | 6 +++++- drivers/net/ethernet/intel/idpf/xsk.c | 1 + drivers/net/ethernet/intel/libeth/xsk.c | 1 + include/net/libeth/xsk.h | 3 +++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c index 958d16f874248d..7d91f21174de1c 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.c +++ b/drivers/net/ethernet/intel/idpf/xdp.c @@ -46,11 +46,15 @@ static int __idpf_xdp_rxq_info_init(struct idpf_rx_queue *rxq, void *arg) { const struct idpf_vport *vport = rxq->q_vector->vport; bool split = idpf_is_queue_model_split(vport->rxq_model); + u32 frag_size = 0; int err; + if (idpf_queue_has(XSK, rxq)) + frag_size = rxq->bufq_sets[0].bufq.truesize; + err = __xdp_rxq_info_reg(&rxq->xdp_rxq, vport->netdev, rxq->idx, rxq->q_vector->napi.napi_id, - rxq->rx_buf_size); + frag_size); if (err) return err; diff --git a/drivers/net/ethernet/intel/idpf/xsk.c b/drivers/net/ethernet/intel/idpf/xsk.c index fd2cc43ab43cba..95a665cb2f33b4 100644 --- a/drivers/net/ethernet/intel/idpf/xsk.c +++ b/drivers/net/ethernet/intel/idpf/xsk.c @@ -401,6 +401,7 @@ int idpf_xskfq_init(struct idpf_buf_queue *bufq) bufq->pending = fq.pending; bufq->thresh = fq.thresh; bufq->rx_buf_size = fq.buf_len; + bufq->truesize = fq.truesize; if (!idpf_xskfq_refill(bufq)) netdev_err(bufq->pool->netdev, diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c index 846e902e31b600..4882951d5c9c43 100644 --- a/drivers/net/ethernet/intel/libeth/xsk.c +++ b/drivers/net/ethernet/intel/libeth/xsk.c @@ -167,6 +167,7 @@ int libeth_xskfq_create(struct libeth_xskfq *fq) fq->pending = fq->count; fq->thresh = 
libeth_xdp_queue_threshold(fq->count); fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool); + fq->truesize = xsk_pool_get_rx_frag_step(fq->pool); return 0; } diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h index 481a7b28e6f248..82b5d21aae8784 100644 --- a/include/net/libeth/xsk.h +++ b/include/net/libeth/xsk.h @@ -597,6 +597,7 @@ __libeth_xsk_run_pass(struct libeth_xdp_buff *xdp, * @pending: current number of XSkFQEs to refill * @thresh: threshold below which the queue is refilled * @buf_len: HW-writeable length per each buffer + * @truesize: step between consecutive buffers, 0 if none exists * @nid: ID of the closest NUMA node with memory */ struct libeth_xskfq { @@ -614,6 +615,8 @@ struct libeth_xskfq { u32 thresh; u32 buf_len; + u32 truesize; + int nid; }; From 05efcc8b93dc5d955e3b25ced3b9dd6bd73e8a31 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:50 +0100 Subject: [PATCH 04/23] xdp: produce a warning when calculated tailroom is negative MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many ethernet drivers report xdp Rx queue frag size as being the same as DMA write size. However, the only user of this field, namely bpf_xdp_frags_increase_tail(), clearly expects a truesize. Such difference leads to unspecific memory corruption issues under certain circumstances, e.g. in ixgbevf maximum DMA write size is 3 KB, so when running xskxceiver's XDP_ADJUST_TAIL_GROW_MULTI_BUFF, 6K packet fully uses all DMA-writable space in 2 buffers. This would be fine, if only rxq->frag_size was properly set to 4K, but value of 3K results in a negative tailroom, because there is a non-zero page offset. We are supposed to return -EINVAL and be done with it in such case, but due to tailroom being stored as an unsigned int, it is reported to be somewhere near UINT_MAX, resulting in a tail being grown, even if the requested offset is too much (it is around 2K in the abovementioned test). 
This later leads to all kinds of unspecific calltraces. [ 7340.337579] xskxceiver[1440]: segfault at 1da718 ip 00007f4161aeac9d sp 00007f41615a6a00 error 6 [ 7340.338040] xskxceiver[1441]: segfault at 7f410000000b ip 00000000004042b5 sp 00007f415bffecf0 error 4 [ 7340.338179] in libc.so.6[61c9d,7f4161aaf000+160000] [ 7340.339230] in xskxceiver[42b5,400000+69000] [ 7340.340300] likely on CPU 6 (core 0, socket 6) [ 7340.340302] Code: ff ff 01 e9 f4 fe ff ff 0f 1f 44 00 00 4c 39 f0 74 73 31 c0 ba 01 00 00 00 f0 0f b1 17 0f 85 ba 00 00 00 49 8b 87 88 00 00 00 <4c> 89 70 08 eb cc 0f 1f 44 00 00 48 8d bd f0 fe ff ff 89 85 ec fe [ 7340.340888] likely on CPU 3 (core 0, socket 3) [ 7340.345088] Code: 00 00 00 ba 00 00 00 00 be 00 00 00 00 89 c7 e8 31 ca ff ff 89 45 ec 8b 45 ec 85 c0 78 07 b8 00 00 00 00 eb 46 e8 0b c8 ff ff <8b> 00 83 f8 69 74 24 e8 ff c7 ff ff 8b 00 83 f8 0b 74 18 e8 f3 c7 [ 7340.404334] Oops: general protection fault, probably for non-canonical address 0x6d255010bdffc: 0000 [#1] SMP NOPTI [ 7340.405972] CPU: 7 UID: 0 PID: 1439 Comm: xskxceiver Not tainted 6.19.0-rc1+ #21 PREEMPT(lazy) [ 7340.408006] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014 [ 7340.409716] RIP: 0010:lookup_swap_cgroup_id+0x44/0x80 [ 7340.410455] Code: 83 f8 1c 73 39 48 ba ff ff ff ff ff ff ff 03 48 8b 04 c5 20 55 fa bd 48 21 d1 48 89 ca 83 e1 01 48 d1 ea c1 e1 04 48 8d 04 90 <8b> 00 48 83 c4 10 d3 e8 c3 cc cc cc cc 31 c0 e9 98 b7 dd 00 48 89 [ 7340.412787] RSP: 0018:ffffcc5c04f7f6d0 EFLAGS: 00010202 [ 7340.413494] RAX: 0006d255010bdffc RBX: ffff891f477895a8 RCX: 0000000000000010 [ 7340.414431] RDX: 0001c17e3fffffff RSI: 00fa070000000000 RDI: 000382fc7fffffff [ 7340.415354] RBP: 00fa070000000000 R08: ffffcc5c04f7f8f8 R09: ffffcc5c04f7f7d0 [ 7340.416283] R10: ffff891f4c1a7000 R11: ffffcc5c04f7f9c8 R12: ffffcc5c04f7f7d0 [ 7340.417218] R13: 03ffffffffffffff R14: 00fa06fffffffe00 R15: ffff891f47789500 [ 7340.418229] FS: 0000000000000000(0000) 
GS:ffff891ffdfaa000(0000) knlGS:0000000000000000 [ 7340.419489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7340.420286] CR2: 00007f415bfffd58 CR3: 0000000103f03002 CR4: 0000000000772ef0 [ 7340.421237] PKRU: 55555554 [ 7340.421623] Call Trace: [ 7340.421987] [ 7340.422309] ? softleaf_from_pte+0x77/0xa0 [ 7340.422855] swap_pte_batch+0xa7/0x290 [ 7340.423363] zap_nonpresent_ptes.constprop.0.isra.0+0xd1/0x270 [ 7340.424102] zap_pte_range+0x281/0x580 [ 7340.424607] zap_pmd_range.isra.0+0xc9/0x240 [ 7340.425177] unmap_page_range+0x24d/0x420 [ 7340.425714] unmap_vmas+0xa1/0x180 [ 7340.426185] exit_mmap+0xe1/0x3b0 [ 7340.426644] __mmput+0x41/0x150 [ 7340.427098] exit_mm+0xb1/0x110 [ 7340.427539] do_exit+0x1b2/0x460 [ 7340.427992] do_group_exit+0x2d/0xc0 [ 7340.428477] get_signal+0x79d/0x7e0 [ 7340.428957] arch_do_signal_or_restart+0x34/0x100 [ 7340.429571] exit_to_user_mode_loop+0x8e/0x4c0 [ 7340.430159] do_syscall_64+0x188/0x6b0 [ 7340.430672] ? __do_sys_clone3+0xd9/0x120 [ 7340.431212] ? switch_fpu_return+0x4e/0xd0 [ 7340.431761] ? arch_exit_to_user_mode_prepare.isra.0+0xa1/0xc0 [ 7340.432498] ? do_syscall_64+0xbb/0x6b0 [ 7340.433015] ? __handle_mm_fault+0x445/0x690 [ 7340.433582] ? count_memcg_events+0xd6/0x210 [ 7340.434151] ? handle_mm_fault+0x212/0x340 [ 7340.434697] ? do_user_addr_fault+0x2b4/0x7b0 [ 7340.435271] ? clear_bhb_loop+0x30/0x80 [ 7340.435788] ? clear_bhb_loop+0x30/0x80 [ 7340.436299] ? clear_bhb_loop+0x30/0x80 [ 7340.436812] ? clear_bhb_loop+0x30/0x80 [ 7340.437323] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 7340.437973] RIP: 0033:0x7f4161b14169 [ 7340.438468] Code: Unable to access opcode bytes at 0x7f4161b1413f. 
[ 7340.439242] RSP: 002b:00007ffc6ebfa770 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca [ 7340.440173] RAX: fffffffffffffe00 RBX: 00000000000005a1 RCX: 00007f4161b14169 [ 7340.441061] RDX: 00000000000005a1 RSI: 0000000000000109 RDI: 00007f415bfff990 [ 7340.441943] RBP: 00007ffc6ebfa7a0 R08: 0000000000000000 R09: 00000000ffffffff [ 7340.442824] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 7340.443707] R13: 0000000000000000 R14: 00007f415bfff990 R15: 00007f415bfff6c0 [ 7340.444586] [ 7340.444922] Modules linked in: rfkill intel_rapl_msr intel_rapl_common intel_uncore_frequency_common skx_edac_common nfit libnvdimm kvm_intel vfat fat kvm snd_pcm irqbypass rapl iTCO_wdt snd_timer intel_pmc_bxt iTCO_vendor_support snd ixgbevf virtio_net soundcore i2c_i801 pcspkr libeth_xdp net_failover i2c_smbus lpc_ich failover libeth virtio_balloon joydev 9p fuse loop zram lz4hc_compress lz4_compress 9pnet_virtio 9pnet netfs ghash_clmulni_intel serio_raw qemu_fw_cfg [ 7340.449650] ---[ end trace 0000000000000000 ]--- The issue can be fixed in all in-tree drivers, but we cannot just trust OOT drivers to not do this. Therefore, make tailroom a signed int and produce a warning when it is negative to prevent such mistakes in the future. 
Fixes: bf25146a5595 ("bpf: add frags support to the bpf_xdp_adjust_tail() API") Reviewed-by: Aleksandr Loktionov Reviewed-by: Toke Høiland-Jørgensen Acked-by: Martin KaFai Lau Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-10-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 31a8536eb03ebb..40ed01579c1bbb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4151,13 +4151,14 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; - unsigned int tailroom; + int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag) % rxq->frag_size; + WARN_ON_ONCE(tailroom < 0); if (unlikely(offset > tailroom)) return -EINVAL; From f2908e792d084828b66528bea6afa674354f5f66 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 23 Jun 2025 14:06:41 +0200 Subject: [PATCH 05/23] ixgbevf: remove legacy Rx Similarly as in commit 53844673d555 ("iavf: kill "legacy-rx" for good"), drop skb construction logic in favor of only using napi_build_skb() as a superior option that reduces the need to allocate and copy memory. As IXGBEVF_PRIV_FLAGS_LEGACY_RX is the only private flag in ixgbevf, entirely remove private flags support from the driver. When compared to iavf changes, ixgbevf has a single complication: MAC type 82599 cannot finely limit the DMA write size with RXDCTL.RLPML, only 1024 increments through SRRCTL are available, see commit fe68195daf34 ("ixgbevf: Require large buffers for build_skb on 82599VF") and commit 2bafa8fac19a ("ixgbe: don't set RXDCTL.RLPML for 82599"). 
Therefore, this is a special case requiring legacy RX unless large buffers are used. For now, solve this by always using large buffers for this MAC type. Suggested-by: Alexander Lobakin Reviewed-by: Aleksandr Loktionov Reviewed-by: Alexander Lobakin Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/ixgbevf/ethtool.c | 48 -------- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 13 +-- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 106 ++---------------- 3 files changed, 13 insertions(+), 154 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c index 537a60d5276f0f..274eef39c58618 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c +++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c @@ -72,13 +72,6 @@ static const char ixgbe_gstrings_test[][ETH_GSTRING_LEN] = { #define IXGBEVF_TEST_LEN (sizeof(ixgbe_gstrings_test) / ETH_GSTRING_LEN) -static const char ixgbevf_priv_flags_strings[][ETH_GSTRING_LEN] = { -#define IXGBEVF_PRIV_FLAGS_LEGACY_RX BIT(0) - "legacy-rx", -}; - -#define IXGBEVF_PRIV_FLAGS_STR_LEN ARRAY_SIZE(ixgbevf_priv_flags_strings) - static int ixgbevf_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { @@ -216,8 +209,6 @@ static void ixgbevf_get_drvinfo(struct net_device *netdev, strscpy(drvinfo->driver, ixgbevf_driver_name, sizeof(drvinfo->driver)); strscpy(drvinfo->bus_info, pci_name(adapter->pdev), sizeof(drvinfo->bus_info)); - - drvinfo->n_priv_flags = IXGBEVF_PRIV_FLAGS_STR_LEN; } static void ixgbevf_get_ringparam(struct net_device *netdev, @@ -409,8 +400,6 @@ static int ixgbevf_get_sset_count(struct net_device *netdev, int stringset) return IXGBEVF_TEST_LEN; case ETH_SS_STATS: return IXGBEVF_STATS_LEN; - case ETH_SS_PRIV_FLAGS: - return IXGBEVF_PRIV_FLAGS_STR_LEN; default: return -EINVAL; } @@ -538,10 +527,6 @@ static void ixgbevf_get_strings(struct net_device *netdev, u32 stringset, p += ETH_GSTRING_LEN; } break; - case ETH_SS_PRIV_FLAGS: - 
memcpy(data, ixgbevf_priv_flags_strings, - IXGBEVF_PRIV_FLAGS_STR_LEN * ETH_GSTRING_LEN); - break; } } @@ -931,37 +916,6 @@ static int ixgbevf_get_rxfh(struct net_device *netdev, return err; } -static u32 ixgbevf_get_priv_flags(struct net_device *netdev) -{ - struct ixgbevf_adapter *adapter = netdev_priv(netdev); - u32 priv_flags = 0; - - if (adapter->flags & IXGBEVF_FLAGS_LEGACY_RX) - priv_flags |= IXGBEVF_PRIV_FLAGS_LEGACY_RX; - - return priv_flags; -} - -static int ixgbevf_set_priv_flags(struct net_device *netdev, u32 priv_flags) -{ - struct ixgbevf_adapter *adapter = netdev_priv(netdev); - unsigned int flags = adapter->flags; - - flags &= ~IXGBEVF_FLAGS_LEGACY_RX; - if (priv_flags & IXGBEVF_PRIV_FLAGS_LEGACY_RX) - flags |= IXGBEVF_FLAGS_LEGACY_RX; - - if (flags != adapter->flags) { - adapter->flags = flags; - - /* reset interface to repopulate queues */ - if (netif_running(netdev)) - ixgbevf_reinit_locked(adapter); - } - - return 0; -} - static const struct ethtool_ops ixgbevf_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_USECS, .get_drvinfo = ixgbevf_get_drvinfo, @@ -984,8 +938,6 @@ static const struct ethtool_ops ixgbevf_ethtool_ops = { .get_rxfh_key_size = ixgbevf_get_rxfh_key_size, .get_rxfh = ixgbevf_get_rxfh, .get_link_ksettings = ixgbevf_get_link_ksettings, - .get_priv_flags = ixgbevf_get_priv_flags, - .set_priv_flags = ixgbevf_set_priv_flags, }; void ixgbevf_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 516a6fdd23d076..ae2763fea2be2e 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -73,7 +73,6 @@ struct ixgbevf_rx_queue_stats { enum ixgbevf_ring_state_t { __IXGBEVF_RX_3K_BUFFER, - __IXGBEVF_RX_BUILD_SKB_ENABLED, __IXGBEVF_TX_DETECT_HANG, __IXGBEVF_HANG_CHECK_ARMED, __IXGBEVF_TX_XDP_RING, @@ -176,21 +175,13 @@ struct ixgbevf_ring { #define 
clear_ring_uses_large_buffer(ring) \ clear_bit(__IXGBEVF_RX_3K_BUFFER, &(ring)->state) -#define ring_uses_build_skb(ring) \ - test_bit(__IXGBEVF_RX_BUILD_SKB_ENABLED, &(ring)->state) -#define set_ring_build_skb_enabled(ring) \ - set_bit(__IXGBEVF_RX_BUILD_SKB_ENABLED, &(ring)->state) -#define clear_ring_build_skb_enabled(ring) \ - clear_bit(__IXGBEVF_RX_BUILD_SKB_ENABLED, &(ring)->state) - static inline unsigned int ixgbevf_rx_bufsz(struct ixgbevf_ring *ring) { #if (PAGE_SIZE < 8192) if (ring_uses_large_buffer(ring)) return IXGBEVF_RXBUFFER_3072; - if (ring_uses_build_skb(ring)) - return IXGBEVF_MAX_FRAME_BUILD_SKB; + return IXGBEVF_MAX_FRAME_BUILD_SKB; #endif return IXGBEVF_RXBUFFER_2048; } @@ -377,8 +368,6 @@ struct ixgbevf_adapter { u32 flags; bool link_state; -#define IXGBEVF_FLAGS_LEGACY_RX BIT(1) - #ifdef CONFIG_XFRM struct ixgbevf_ipsec *ipsec; #endif /* CONFIG_XFRM */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index d5ce20f47def1f..fc48c89c7bb857 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -602,7 +602,7 @@ static bool ixgbevf_is_non_eop(struct ixgbevf_ring *rx_ring, static inline unsigned int ixgbevf_rx_offset(struct ixgbevf_ring *rx_ring) { - return ring_uses_build_skb(rx_ring) ? IXGBEVF_SKB_PAD : 0; + return IXGBEVF_SKB_PAD; } static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, @@ -832,9 +832,7 @@ static void ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring, #if (PAGE_SIZE < 8192) unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2; #else - unsigned int truesize = ring_uses_build_skb(rx_ring) ? 
- SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) : - SKB_DATA_ALIGN(size); + unsigned int truesize = SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size); #endif skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, rx_buffer->page_offset, size, truesize); @@ -845,74 +843,6 @@ static void ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring, #endif } -static -struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer *rx_buffer, - struct xdp_buff *xdp, - union ixgbe_adv_rx_desc *rx_desc) -{ - unsigned int size = xdp->data_end - xdp->data; -#if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2; -#else - unsigned int truesize = SKB_DATA_ALIGN(xdp->data_end - - xdp->data_hard_start); -#endif - unsigned int headlen; - struct sk_buff *skb; - - /* prefetch first cache line of first page */ - net_prefetch(xdp->data); - - /* Note, we get here by enabling legacy-rx via: - * - * ethtool --set-priv-flags legacy-rx on - * - * In this mode, we currently get 0 extra XDP headroom as - * opposed to having legacy-rx off, where we process XDP - * packets going to stack via ixgbevf_build_skb(). - * - * For ixgbevf_construct_skb() mode it means that the - * xdp->data_meta will always point to xdp->data, since - * the helper cannot expand the head. Should this ever - * changed in future for legacy-rx mode on, then lets also - * add xdp->data_meta handling here. 
- */ - - /* allocate a skb to store the frags */ - skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBEVF_RX_HDR_SIZE); - if (unlikely(!skb)) - return NULL; - - /* Determine available headroom for copy */ - headlen = size; - if (headlen > IXGBEVF_RX_HDR_SIZE) - headlen = eth_get_headlen(skb->dev, xdp->data, - IXGBEVF_RX_HDR_SIZE); - - /* align pull length to size of long to optimize memcpy performance */ - memcpy(__skb_put(skb, headlen), xdp->data, - ALIGN(headlen, sizeof(long))); - - /* update all of the pointers */ - size -= headlen; - if (size) { - skb_add_rx_frag(skb, 0, rx_buffer->page, - (xdp->data + headlen) - - page_address(rx_buffer->page), - size, truesize); -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif - } else { - rx_buffer->pagecnt_bias++; - } - - return skb; -} - static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, u32 qmask) { @@ -1092,10 +1022,8 @@ static unsigned int ixgbevf_rx_frame_truesize(struct ixgbevf_ring *rx_ring, #if (PAGE_SIZE < 8192) truesize = ixgbevf_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ #else - truesize = ring_uses_build_skb(rx_ring) ? 
- SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : - SKB_DATA_ALIGN(size); + truesize = SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); #endif return truesize; } @@ -1182,12 +1110,9 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, total_rx_bytes += size; } else if (skb) { ixgbevf_add_rx_frag(rx_ring, rx_buffer, skb, size); - } else if (ring_uses_build_skb(rx_ring)) { + } else { skb = ixgbevf_build_skb(rx_ring, rx_buffer, &xdp, rx_desc); - } else { - skb = ixgbevf_construct_skb(rx_ring, rx_buffer, - &xdp, rx_desc); } /* exit if we failed to retrieve a buffer */ @@ -1958,8 +1883,7 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, #if (PAGE_SIZE < 8192) /* Limit the maximum frame size so we don't overrun the skb */ - if (ring_uses_build_skb(ring) && - !ring_uses_large_buffer(ring)) + if (!ring_uses_large_buffer(ring)) rxdctl |= IXGBEVF_MAX_FRAME_BUILD_SKB | IXGBE_RXDCTL_RLPML_EN; #endif @@ -1978,22 +1902,16 @@ static void ixgbevf_set_rx_buffer_len(struct ixgbevf_adapter *adapter, struct net_device *netdev = adapter->netdev; unsigned int max_frame = netdev->mtu + ETH_HLEN + ETH_FCS_LEN; - /* set build_skb and buffer size flags */ - clear_ring_build_skb_enabled(rx_ring); + /* set buffer size flags */ clear_ring_uses_large_buffer(rx_ring); - if (adapter->flags & IXGBEVF_FLAGS_LEGACY_RX) - return; - if (PAGE_SIZE < 8192) - if (max_frame > IXGBEVF_MAX_FRAME_BUILD_SKB) + /* 82599 can't rely on RXDCTL.RLPML to restrict + * the size of the frame + */ + if (max_frame > IXGBEVF_MAX_FRAME_BUILD_SKB || + adapter->hw.mac.type == ixgbe_mac_82599_vf) set_ring_uses_large_buffer(rx_ring); - - /* 82599 can't rely on RXDCTL.RLPML to restrict the size of the frame */ - if (adapter->hw.mac.type == ixgbe_mac_82599_vf && !ring_uses_large_buffer(rx_ring)) - return; - - set_ring_build_skb_enabled(rx_ring); } /** From d7bf74b613322bb4682073988e4f98baece9370d Mon 
Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 24 Jun 2025 12:49:22 +0200 Subject: [PATCH 06/23] ixgbevf: do not share pages between packets Again, same as in the related iavf commit 920d86f3c552 ("iavf: drop page splitting and recycling"), as an intermediate step, drop the page sharing and recycling logic in a preparation to offload it to page_pool. Instead of the previous sharing and recycling, just allocate a new page every time. Suggested-by: Alexander Lobakin Reviewed-by: Aleksandr Loktionov Reviewed-by: Alexander Lobakin Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 44 +--- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 239 ++---------------- 2 files changed, 28 insertions(+), 255 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index ae2763fea2be2e..2d7ca3f86868bc 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -45,12 +45,7 @@ struct ixgbevf_tx_buffer { struct ixgbevf_rx_buffer { dma_addr_t dma; struct page *page; -#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) __u32 page_offset; -#else - __u16 page_offset; -#endif - __u16 pagecnt_bias; }; struct ixgbevf_stats { @@ -72,7 +67,6 @@ struct ixgbevf_rx_queue_stats { }; enum ixgbevf_ring_state_t { - __IXGBEVF_RX_3K_BUFFER, __IXGBEVF_TX_DETECT_HANG, __IXGBEVF_HANG_CHECK_ARMED, __IXGBEVF_TX_XDP_RING, @@ -143,8 +137,7 @@ struct ixgbevf_ring { #define IXGBEVF_MIN_RXD 64 /* Supported Rx Buffer Sizes */ -#define IXGBEVF_RXBUFFER_256 256 /* Used for packet split */ -#define IXGBEVF_RXBUFFER_2048 2048 +#define IXGBEVF_RXBUFFER_256 256 #define IXGBEVF_RXBUFFER_3072 3072 #define IXGBEVF_RX_HDR_SIZE IXGBEVF_RXBUFFER_256 @@ -152,12 +145,6 @@ struct ixgbevf_ring { #define MAXIMUM_ETHERNET_VLAN_SIZE (VLAN_ETH_FRAME_LEN + ETH_FCS_LEN) #define IXGBEVF_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) -#if (PAGE_SIZE < 8192) -#define IXGBEVF_MAX_FRAME_BUILD_SKB \ - 
(SKB_WITH_OVERHEAD(IXGBEVF_RXBUFFER_2048) - IXGBEVF_SKB_PAD) -#else -#define IXGBEVF_MAX_FRAME_BUILD_SKB IXGBEVF_RXBUFFER_2048 -#endif #define IXGBE_TX_FLAGS_CSUM BIT(0) #define IXGBE_TX_FLAGS_VLAN BIT(1) @@ -168,35 +155,6 @@ struct ixgbevf_ring { #define IXGBE_TX_FLAGS_VLAN_PRIO_MASK 0x0000e000 #define IXGBE_TX_FLAGS_VLAN_SHIFT 16 -#define ring_uses_large_buffer(ring) \ - test_bit(__IXGBEVF_RX_3K_BUFFER, &(ring)->state) -#define set_ring_uses_large_buffer(ring) \ - set_bit(__IXGBEVF_RX_3K_BUFFER, &(ring)->state) -#define clear_ring_uses_large_buffer(ring) \ - clear_bit(__IXGBEVF_RX_3K_BUFFER, &(ring)->state) - -static inline unsigned int ixgbevf_rx_bufsz(struct ixgbevf_ring *ring) -{ -#if (PAGE_SIZE < 8192) - if (ring_uses_large_buffer(ring)) - return IXGBEVF_RXBUFFER_3072; - - return IXGBEVF_MAX_FRAME_BUILD_SKB; -#endif - return IXGBEVF_RXBUFFER_2048; -} - -static inline unsigned int ixgbevf_rx_pg_order(struct ixgbevf_ring *ring) -{ -#if (PAGE_SIZE < 8192) - if (ring_uses_large_buffer(ring)) - return 1; -#endif - return 0; -} - -#define ixgbevf_rx_pg_size(_ring) (PAGE_SIZE << ixgbevf_rx_pg_order(_ring)) - #define check_for_tx_hang(ring) \ test_bit(__IXGBEVF_TX_DETECT_HANG, &(ring)->state) #define set_check_for_tx_hang(ring) \ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index fc48c89c7bb857..05baf28823c869 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -112,9 +112,6 @@ static void ixgbevf_service_event_complete(struct ixgbevf_adapter *adapter) static void ixgbevf_queue_reset_subtask(struct ixgbevf_adapter *adapter); static void ixgbevf_set_itr(struct ixgbevf_q_vector *q_vector); static void ixgbevf_free_all_rx_resources(struct ixgbevf_adapter *adapter); -static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer); -static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer 
*old_buff); static void ixgbevf_remove_adapter(struct ixgbe_hw *hw) { @@ -537,40 +534,20 @@ struct ixgbevf_rx_buffer *ixgbevf_get_rx_buffer(struct ixgbevf_ring *rx_ring, rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; prefetchw(rx_buffer->page); - /* we are reusing so sync this buffer for CPU use */ dma_sync_single_range_for_cpu(rx_ring->dev, rx_buffer->dma, rx_buffer->page_offset, size, DMA_FROM_DEVICE); - rx_buffer->pagecnt_bias--; - return rx_buffer; } static void ixgbevf_put_rx_buffer(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer *rx_buffer, - struct sk_buff *skb) + struct ixgbevf_rx_buffer *rx_buffer) { - if (ixgbevf_can_reuse_rx_page(rx_buffer)) { - /* hand second half of page back to the ring */ - ixgbevf_reuse_rx_page(rx_ring, rx_buffer); - } else { - if (IS_ERR(skb)) - /* We are not reusing the buffer so unmap it and free - * any references we are holding to it - */ - dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, - ixgbevf_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, - IXGBEVF_RX_DMA_ATTR); - __page_frag_cache_drain(rx_buffer->page, - rx_buffer->pagecnt_bias); - } - - /* clear contents of rx_buffer */ - rx_buffer->page = NULL; + dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE, + DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); } /** @@ -600,38 +577,28 @@ static bool ixgbevf_is_non_eop(struct ixgbevf_ring *rx_ring, return true; } -static inline unsigned int ixgbevf_rx_offset(struct ixgbevf_ring *rx_ring) -{ - return IXGBEVF_SKB_PAD; -} - static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, struct ixgbevf_rx_buffer *bi) { - struct page *page = bi->page; + struct page *page; dma_addr_t dma; - /* since we are recycling buffers we should seldom need to alloc */ - if (likely(page)) - return true; - /* alloc new page for storage */ - page = dev_alloc_pages(ixgbevf_rx_pg_order(rx_ring)); + page = dev_alloc_page(); if (unlikely(!page)) { rx_ring->rx_stats.alloc_rx_page_failed++; return false; } /* map page for use */ - dma 
= dma_map_page_attrs(rx_ring->dev, page, 0, - ixgbevf_rx_pg_size(rx_ring), + dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use */ if (dma_mapping_error(rx_ring->dev, dma)) { - __free_pages(page, ixgbevf_rx_pg_order(rx_ring)); + __free_page(page); rx_ring->rx_stats.alloc_rx_page_failed++; return false; @@ -639,8 +606,7 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, bi->dma = dma; bi->page = page; - bi->page_offset = ixgbevf_rx_offset(rx_ring); - bi->pagecnt_bias = 1; + bi->page_offset = IXGBEVF_SKB_PAD; rx_ring->rx_stats.alloc_rx_page++; return true; @@ -673,7 +639,7 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, /* sync the buffer for use by the device */ dma_sync_single_range_for_device(rx_ring->dev, bi->dma, bi->page_offset, - ixgbevf_rx_bufsz(rx_ring), + IXGBEVF_RXBUFFER_3072, DMA_FROM_DEVICE); /* Refresh the desc even if pkt_addr didn't change @@ -755,66 +721,6 @@ static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring, return false; } -/** - * ixgbevf_reuse_rx_page - page flip buffer and store it back on the ring - * @rx_ring: rx descriptor ring to store buffers on - * @old_buff: donor buffer to have page reused - * - * Synchronizes page for reuse by the adapter - **/ -static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer *old_buff) -{ - struct ixgbevf_rx_buffer *new_buff; - u16 nta = rx_ring->next_to_alloc; - - new_buff = &rx_ring->rx_buffer_info[nta]; - - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? 
nta : 0; - - /* transfer page from old buffer to new buffer */ - new_buff->page = old_buff->page; - new_buff->dma = old_buff->dma; - new_buff->page_offset = old_buff->page_offset; - new_buff->pagecnt_bias = old_buff->pagecnt_bias; -} - -static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer) -{ - unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; - struct page *page = rx_buffer->page; - - /* avoid re-using remote and pfmemalloc pages */ - if (!dev_page_is_reusable(page)) - return false; - -#if (PAGE_SIZE < 8192) - /* if we are only owner of page we can reuse it */ - if (unlikely((page_ref_count(page) - pagecnt_bias) > 1)) - return false; -#else -#define IXGBEVF_LAST_OFFSET \ - (SKB_WITH_OVERHEAD(PAGE_SIZE) - IXGBEVF_RXBUFFER_2048) - - if (rx_buffer->page_offset > IXGBEVF_LAST_OFFSET) - return false; - -#endif - - /* If we have drained the page fragment pool we need to update - * the pagecnt_bias and page count so that we fully restock the - * number of references the driver holds. 
- */ - if (unlikely(!pagecnt_bias)) { - page_ref_add(page, USHRT_MAX); - rx_buffer->pagecnt_bias = USHRT_MAX; - } - - return true; -} - /** * ixgbevf_add_rx_frag - Add contents of Rx buffer to sk_buff * @rx_ring: rx descriptor ring to transact packets on @@ -829,18 +735,10 @@ static void ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring, struct sk_buff *skb, unsigned int size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2; -#else unsigned int truesize = SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size); -#endif + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, rx_buffer->page_offset, size, truesize); -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif } static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, @@ -857,13 +755,9 @@ static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring, union ixgbe_adv_rx_desc *rx_desc) { unsigned int metasize = xdp->data - xdp->data_meta; -#if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2; -#else unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + SKB_DATA_ALIGN(xdp->data_end - xdp->data_hard_start); -#endif struct sk_buff *skb; /* Prefetch first cache line of first page. 
If xdp->data_meta @@ -884,13 +778,6 @@ static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring, if (metasize) skb_metadata_set(skb, metasize); - /* update buffer offset */ -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif - return skb; } @@ -1014,38 +901,11 @@ static int ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, return result; } -static unsigned int ixgbevf_rx_frame_truesize(struct ixgbevf_ring *rx_ring, - unsigned int size) -{ - unsigned int truesize; - -#if (PAGE_SIZE < 8192) - truesize = ixgbevf_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ -#else - truesize = SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); -#endif - return truesize; -} - -static void ixgbevf_rx_buffer_flip(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer *rx_buffer, - unsigned int size) -{ - unsigned int truesize = ixgbevf_rx_frame_truesize(rx_ring, size); - -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif -} - static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0, frame_sz = 0; + unsigned int total_rx_bytes = 0, total_rx_packets = 0; struct ixgbevf_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbevf_desc_unused(rx_ring); struct sk_buff *skb = rx_ring->skb; @@ -1054,10 +914,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, int xdp_res = 0; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ -#if (PAGE_SIZE < 8192) - frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0); -#endif - xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); + xdp_init_buff(&xdp, IXGBEVF_RXBUFFER_3072, &rx_ring->xdp_rxq); while (likely(total_rx_packets < budget)) { struct ixgbevf_rx_buffer *rx_buffer; @@ -1081,31 +938,24 @@ static int ixgbevf_clean_rx_irq(struct 
ixgbevf_q_vector *q_vector, */ rmb(); - rx_buffer = ixgbevf_get_rx_buffer(rx_ring, size); + rx_buffer = + ixgbevf_get_rx_buffer(rx_ring, IXGBEVF_RXBUFFER_3072); /* retrieve a buffer from the ring */ if (!skb) { - unsigned int offset = ixgbevf_rx_offset(rx_ring); + unsigned int offset = rx_buffer->page_offset; unsigned char *hard_start; hard_start = page_address(rx_buffer->page) + rx_buffer->page_offset - offset; xdp_prepare_buff(&xdp, hard_start, offset, size, true); -#if (PAGE_SIZE > 4096) - /* At larger PAGE_SIZE, frame_sz depend on len size */ - xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size); -#endif xdp_res = ixgbevf_run_xdp(adapter, rx_ring, &xdp); } if (xdp_res) { - if (xdp_res == IXGBEVF_XDP_TX) { + if (xdp_res == IXGBEVF_XDP_TX) xdp_xmit = true; - ixgbevf_rx_buffer_flip(rx_ring, rx_buffer, - size); - } else { - rx_buffer->pagecnt_bias++; - } + total_rx_packets++; total_rx_bytes += size; } else if (skb) { @@ -1118,11 +968,13 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* exit if we failed to retrieve a buffer */ if (!xdp_res && !skb) { rx_ring->rx_stats.alloc_rx_buff_failed++; - rx_buffer->pagecnt_bias++; break; } - ixgbevf_put_rx_buffer(rx_ring, rx_buffer, skb); + ixgbevf_put_rx_buffer(rx_ring, rx_buffer); + if (xdp_res == IXGBEVF_XDP_CONSUMED) + __free_page(rx_buffer->page); + rx_buffer->page = NULL; cleaned_count++; /* fetch next buffer in frame if non-eop */ @@ -1699,10 +1551,7 @@ static void ixgbevf_configure_srrctl(struct ixgbevf_adapter *adapter, srrctl = IXGBE_SRRCTL_DROP_EN; srrctl |= IXGBEVF_RX_HDR_SIZE << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT; - if (ring_uses_large_buffer(ring)) - srrctl |= IXGBEVF_RXBUFFER_3072 >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; - else - srrctl |= IXGBEVF_RXBUFFER_2048 >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; + srrctl |= IXGBEVF_RXBUFFER_3072 >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF; IXGBE_WRITE_REG(hw, IXGBE_VFSRRCTL(index), srrctl); @@ -1880,13 +1729,6 @@ static void 
ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, if (adapter->hw.mac.type != ixgbe_mac_82599_vf) { rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | IXGBE_RXDCTL_RLPML_EN); - -#if (PAGE_SIZE < 8192) - /* Limit the maximum frame size so we don't overrun the skb */ - if (!ring_uses_large_buffer(ring)) - rxdctl |= IXGBEVF_MAX_FRAME_BUILD_SKB | - IXGBE_RXDCTL_RLPML_EN; -#endif } rxdctl |= IXGBE_RXDCTL_ENABLE | IXGBE_RXDCTL_VME; @@ -1896,24 +1738,6 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, ixgbevf_alloc_rx_buffers(ring, ixgbevf_desc_unused(ring)); } -static void ixgbevf_set_rx_buffer_len(struct ixgbevf_adapter *adapter, - struct ixgbevf_ring *rx_ring) -{ - struct net_device *netdev = adapter->netdev; - unsigned int max_frame = netdev->mtu + ETH_HLEN + ETH_FCS_LEN; - - /* set buffer size flags */ - clear_ring_uses_large_buffer(rx_ring); - - if (PAGE_SIZE < 8192) - /* 82599 can't rely on RXDCTL.RLPML to restrict - * the size of the frame - */ - if (max_frame > IXGBEVF_MAX_FRAME_BUILD_SKB || - adapter->hw.mac.type == ixgbe_mac_82599_vf) - set_ring_uses_large_buffer(rx_ring); -} - /** * ixgbevf_configure_rx - Configure 82599 VF Receive Unit after Reset * @adapter: board private structure @@ -1944,7 +1768,6 @@ static void ixgbevf_configure_rx(struct ixgbevf_adapter *adapter) for (i = 0; i < adapter->num_rx_queues; i++) { struct ixgbevf_ring *rx_ring = adapter->rx_ring[i]; - ixgbevf_set_rx_buffer_len(adapter, rx_ring); ixgbevf_configure_rx_ring(adapter, rx_ring); } } @@ -2323,19 +2146,13 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) dma_sync_single_range_for_cpu(rx_ring->dev, rx_buffer->dma, rx_buffer->page_offset, - ixgbevf_rx_bufsz(rx_ring), + IXGBEVF_RXBUFFER_3072, DMA_FROM_DEVICE); /* free resources associated with mapping */ - dma_unmap_page_attrs(rx_ring->dev, - rx_buffer->dma, - ixgbevf_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, - IXGBEVF_RX_DMA_ATTR); - - __page_frag_cache_drain(rx_buffer->page, - 
rx_buffer->pagecnt_bias); - + ixgbevf_put_rx_buffer(rx_ring, rx_buffer); + __free_page(rx_buffer->page); + rx_buffer->page = NULL; i++; if (i == rx_ring->count) i = 0; @@ -4394,9 +4211,7 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog) /* verify ixgbevf ring attributes are sufficient for XDP */ for (i = 0; i < adapter->num_rx_queues; i++) { - struct ixgbevf_ring *ring = adapter->rx_ring[i]; - - if (frame_size > ixgbevf_rx_bufsz(ring)) + if (frame_size > IXGBEVF_RXBUFFER_3072) return -EINVAL; } From c0702722e59959f89932c56acbfed9c326a4573c Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 9 Jul 2025 09:28:02 +0200 Subject: [PATCH 07/23] ixgbevf: use libeth in Rx processing Use page_pool buffers by means of libeth in the Rx queues; this significantly reduces the code complexity of the driver itself. Suggested-by: Alexander Lobakin Reviewed-by: Alexander Lobakin Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/Kconfig | 1 + drivers/net/ethernet/intel/ixgbevf/defines.h | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 21 +- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 324 ++++++------------ 4 files changed, 125 insertions(+), 223 deletions(-) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index 288fa8ce53af0a..b513baf3cbb296 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -203,6 +203,7 @@ config IXGBE_IPSEC config IXGBEVF tristate "Intel(R) 10GbE PCI Express Virtual Function Ethernet support" depends on PCI_MSI + select LIBETH_XDP help This driver supports Intel(R) PCI Express virtual functions for the Intel(R) ixgbe driver.
For more information on how to identify your diff --git a/drivers/net/ethernet/intel/ixgbevf/defines.h b/drivers/net/ethernet/intel/ixgbevf/defines.h index e177d1d58696aa..afc927dd14381b 100644 --- a/drivers/net/ethernet/intel/ixgbevf/defines.h +++ b/drivers/net/ethernet/intel/ixgbevf/defines.h @@ -71,7 +71,7 @@ typedef u32 ixgbe_link_speed; #define IXGBE_PSRTYPE_L2HDR 0x00001000 /* SRRCTL bit definitions */ -#define IXGBE_SRRCTL_BSIZEPKT_SHIFT 10 /* so many KBs */ +#define IXGBE_SRRCTL_BSIZEPKT_STEP 1024 #define IXGBE_SRRCTL_RDMTS_SHIFT 22 #define IXGBE_SRRCTL_RDMTS_MASK 0x01C00000 #define IXGBE_SRRCTL_DROP_EN 0x10000000 diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 2d7ca3f86868bc..ebf771f0caa4b3 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -42,12 +42,6 @@ struct ixgbevf_tx_buffer { u32 tx_flags; }; -struct ixgbevf_rx_buffer { - dma_addr_t dma; - struct page *page; - __u32 page_offset; -}; - struct ixgbevf_stats { u64 packets; u64 bytes; @@ -84,19 +78,22 @@ struct ixgbevf_ring { struct ixgbevf_ring *next; struct ixgbevf_q_vector *q_vector; /* backpointer to q_vector */ struct net_device *netdev; - struct bpf_prog *xdp_prog; - struct device *dev; + struct bpf_prog __rcu *xdp_prog; + union { + struct page_pool *pp; /* Rx ring */ + struct device *dev; /* Tx ring */ + }; void *desc; /* descriptor ring memory */ dma_addr_t dma; /* phys. 
address of descriptor ring */ unsigned int size; /* length in bytes */ + u32 truesize; /* Rx buffer full size */ u16 count; /* amount of descriptors */ u16 next_to_use; u16 next_to_clean; - u16 next_to_alloc; union { + struct libeth_fqe *rx_fqes; struct ixgbevf_tx_buffer *tx_buffer_info; - struct ixgbevf_rx_buffer *rx_buffer_info; }; unsigned long state; struct ixgbevf_stats stats; @@ -115,6 +112,7 @@ struct ixgbevf_ring { */ u16 reg_idx; int queue_index; /* needed for multiqueue queue management */ + u32 rx_buf_len; } ____cacheline_internodealigned_in_smp; /* How many Rx Buffers do we bundle into one write to the hardware ? */ @@ -144,7 +142,8 @@ struct ixgbevf_ring { #define MAXIMUM_ETHERNET_VLAN_SIZE (VLAN_ETH_FRAME_LEN + ETH_FCS_LEN) -#define IXGBEVF_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) +#define IXGBEVF_RX_PAGE_LEN(hr) (ALIGN_DOWN(LIBETH_RX_PAGE_LEN(hr), \ + IXGBE_SRRCTL_BSIZEPKT_STEP)) #define IXGBE_TX_FLAGS_CSUM BIT(0) #define IXGBE_TX_FLAGS_VLAN BIT(1) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 05baf28823c869..663d4062a4c34f 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "ixgbevf.h" @@ -82,6 +83,7 @@ static const struct pci_device_id ixgbevf_pci_tbl[] = { MODULE_DEVICE_TABLE(pci, ixgbevf_pci_tbl); MODULE_DESCRIPTION("Intel(R) 10 Gigabit Virtual Function Network Driver"); +MODULE_IMPORT_NS("LIBETH"); MODULE_LICENSE("GPL v2"); #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV|NETIF_MSG_PROBE|NETIF_MSG_LINK) @@ -304,7 +306,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector, /* free the skb */ if (ring_is_xdp(tx_ring)) - page_frag_free(tx_buffer->data); + libeth_xdp_return_va(tx_buffer->data, true); else napi_consume_skb(tx_buffer->skb, napi_budget); @@ -521,33 +523,6 @@ static void ixgbevf_process_skb_fields(struct ixgbevf_ring 
*rx_ring, if (ixgbevf_test_staterr(rx_desc, IXGBE_RXDADV_STAT_SECP)) ixgbevf_ipsec_rx(rx_ring, rx_desc, skb); - - skb->protocol = eth_type_trans(skb, rx_ring->netdev); -} - -static -struct ixgbevf_rx_buffer *ixgbevf_get_rx_buffer(struct ixgbevf_ring *rx_ring, - const unsigned int size) -{ - struct ixgbevf_rx_buffer *rx_buffer; - - rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; - prefetchw(rx_buffer->page); - - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - size, - DMA_FROM_DEVICE); - - return rx_buffer; -} - -static void ixgbevf_put_rx_buffer(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer *rx_buffer) -{ - dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE, - DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); } /** @@ -577,41 +552,6 @@ static bool ixgbevf_is_non_eop(struct ixgbevf_ring *rx_ring, return true; } -static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer *bi) -{ - struct page *page; - dma_addr_t dma; - - /* alloc new page for storage */ - page = dev_alloc_page(); - if (unlikely(!page)) { - rx_ring->rx_stats.alloc_rx_page_failed++; - return false; - } - - /* map page for use */ - dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, - DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); - - /* if mapping failed free memory back to system since - * there isn't much point in holding memory we can't use - */ - if (dma_mapping_error(rx_ring->dev, dma)) { - __free_page(page); - - rx_ring->rx_stats.alloc_rx_page_failed++; - return false; - } - - bi->dma = dma; - bi->page = page; - bi->page_offset = IXGBEVF_SKB_PAD; - rx_ring->rx_stats.alloc_rx_page++; - - return true; -} - /** * ixgbevf_alloc_rx_buffers - Replace used receive buffers; packet split * @rx_ring: rx descriptor ring (for a specific queue) to setup buffers on @@ -621,39 +561,34 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, u16 cleaned_count) { union ixgbe_adv_rx_desc *rx_desc; - 
struct ixgbevf_rx_buffer *bi; - unsigned int i = rx_ring->next_to_use; + const struct libeth_fq_fp fq = { + .pp = rx_ring->pp, + .fqes = rx_ring->rx_fqes, + .truesize = rx_ring->truesize, + .count = rx_ring->count, + }; + u16 ntu = rx_ring->next_to_use; /* nothing to do or no valid netdev defined */ if (!cleaned_count || !rx_ring->netdev) return; - rx_desc = IXGBEVF_RX_DESC(rx_ring, i); - bi = &rx_ring->rx_buffer_info[i]; - i -= rx_ring->count; + rx_desc = IXGBEVF_RX_DESC(rx_ring, ntu); do { - if (!ixgbevf_alloc_mapped_page(rx_ring, bi)) - break; + dma_addr_t addr; - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, bi->dma, - bi->page_offset, - IXGBEVF_RXBUFFER_3072, - DMA_FROM_DEVICE); + addr = libeth_rx_alloc(&fq, ntu); + if (addr == DMA_MAPPING_ERROR) + break; - /* Refresh the desc even if pkt_addr didn't change - * because each write-back erases this info. - */ - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); + rx_desc->read.pkt_addr = cpu_to_le64(addr); rx_desc++; - bi++; - i++; - if (unlikely(!i)) { + ntu++; + if (unlikely(ntu == rx_ring->count)) { rx_desc = IXGBEVF_RX_DESC(rx_ring, 0); - bi = rx_ring->rx_buffer_info; - i -= rx_ring->count; + ntu = 0; } /* clear the length for the next_to_use descriptor */ @@ -662,14 +597,9 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, cleaned_count--; } while (cleaned_count); - i += rx_ring->count; - - if (rx_ring->next_to_use != i) { + if (likely(rx_ring->next_to_use != ntu)) { /* record the next descriptor to use */ - rx_ring->next_to_use = i; - - /* update next to alloc since we have filled the ring */ - rx_ring->next_to_alloc = i; + rx_ring->next_to_use = ntu; /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only @@ -677,7 +607,7 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, * such as IA-64). 
*/ wmb(); - ixgbevf_write_tail(rx_ring, i); + ixgbevf_write_tail(rx_ring, ntu); } } @@ -714,10 +644,6 @@ static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring, } } - /* if eth_skb_pad returns an error the skb was freed */ - if (eth_skb_pad(skb)) - return true; - return false; } @@ -730,15 +656,15 @@ static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring, * * This function will add the data contained in rx_buffer->page to the skb. **/ -static void ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer *rx_buffer, +static void ixgbevf_add_rx_frag(const struct libeth_fqe *rx_buffer, struct sk_buff *skb, unsigned int size) { - unsigned int truesize = SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size); + u32 hr = netmem_get_pp(rx_buffer->netmem)->p.offset; - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, - rx_buffer->page_offset, size, truesize); + skb_add_rx_frag_netmem(skb, skb_shinfo(skb)->nr_frags, + rx_buffer->netmem, rx_buffer->offset + hr, + size, rx_buffer->truesize); } static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, @@ -749,38 +675,6 @@ static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, qmask); } -static struct sk_buff *ixgbevf_build_skb(struct ixgbevf_ring *rx_ring, - struct ixgbevf_rx_buffer *rx_buffer, - struct xdp_buff *xdp, - union ixgbe_adv_rx_desc *rx_desc) -{ - unsigned int metasize = xdp->data - xdp->data_meta; - unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + - SKB_DATA_ALIGN(xdp->data_end - - xdp->data_hard_start); - struct sk_buff *skb; - - /* Prefetch first cache line of first page. If xdp->data_meta - * is unused, this points to xdp->data, otherwise, we likely - * have a consumer accessing first few bytes of meta data, - * and then actual data. 
- */ - net_prefetch(xdp->data_meta); - - /* build an skb around the page buffer */ - skb = napi_build_skb(xdp->data_hard_start, truesize); - if (unlikely(!skb)) - return NULL; - - /* update pointers within the skb to store the data */ - skb_reserve(skb, xdp->data - xdp->data_hard_start); - __skb_put(skb, xdp->data_end - xdp->data); - if (metasize) - skb_metadata_set(skb, metasize); - - return skb; -} - #define IXGBEVF_XDP_PASS 0 #define IXGBEVF_XDP_CONSUMED 1 #define IXGBEVF_XDP_TX 2 @@ -864,25 +758,25 @@ static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring, static int ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, struct ixgbevf_ring *rx_ring, - struct xdp_buff *xdp) + struct libeth_xdp_buff *xdp) { int result = IXGBEVF_XDP_PASS; struct ixgbevf_ring *xdp_ring; struct bpf_prog *xdp_prog; u32 act; - xdp_prog = READ_ONCE(rx_ring->xdp_prog); + xdp_prog = rcu_dereference(rx_ring->xdp_prog); if (!xdp_prog) goto xdp_out; - act = bpf_prog_run_xdp(xdp_prog, xdp); + act = bpf_prog_run_xdp(xdp_prog, &xdp->base); switch (act) { case XDP_PASS: break; case XDP_TX: xdp_ring = adapter->xdp_ring[rx_ring->queue_index]; - result = ixgbevf_xmit_xdp_ring(xdp_ring, xdp); + result = ixgbevf_xmit_xdp_ring(xdp_ring, &xdp->base); if (result == IXGBEVF_XDP_CONSUMED) goto out_failure; break; @@ -895,6 +789,7 @@ static int ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, fallthrough; /* handle aborts by dropping packet */ case XDP_DROP: result = IXGBEVF_XDP_CONSUMED; + libeth_xdp_return_buff(xdp); break; } xdp_out: @@ -909,16 +804,15 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbevf_desc_unused(rx_ring); struct sk_buff *skb = rx_ring->skb; + LIBETH_XDP_ONSTACK_BUFF(xdp); bool xdp_xmit = false; - struct xdp_buff xdp; int xdp_res = 0; - /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ - xdp_init_buff(&xdp, IXGBEVF_RXBUFFER_3072, &rx_ring->xdp_rxq); + xdp->base.rxq = 
&rx_ring->xdp_rxq; while (likely(total_rx_packets < budget)) { - struct ixgbevf_rx_buffer *rx_buffer; union ixgbe_adv_rx_desc *rx_desc; + struct libeth_fqe *rx_buffer; unsigned int size; /* return some buffers to hardware, one at a time is too slow */ @@ -938,18 +832,14 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, */ rmb(); - rx_buffer = - ixgbevf_get_rx_buffer(rx_ring, IXGBEVF_RXBUFFER_3072); + rx_buffer = &rx_ring->rx_fqes[rx_ring->next_to_clean]; + libeth_rx_sync_for_cpu(rx_buffer, size); /* retrieve a buffer from the ring */ if (!skb) { - unsigned int offset = rx_buffer->page_offset; - unsigned char *hard_start; - - hard_start = page_address(rx_buffer->page) + - rx_buffer->page_offset - offset; - xdp_prepare_buff(&xdp, hard_start, offset, size, true); - xdp_res = ixgbevf_run_xdp(adapter, rx_ring, &xdp); + libeth_xdp_prepare_buff(xdp, rx_buffer, size); + prefetch(xdp->data); + xdp_res = ixgbevf_run_xdp(adapter, rx_ring, xdp); } if (xdp_res) { @@ -959,10 +849,9 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, total_rx_packets++; total_rx_bytes += size; } else if (skb) { - ixgbevf_add_rx_frag(rx_ring, rx_buffer, skb, size); + ixgbevf_add_rx_frag(rx_buffer, skb, size); } else { - skb = ixgbevf_build_skb(rx_ring, rx_buffer, - &xdp, rx_desc); + skb = xdp_build_skb_from_buff(&xdp->base); } /* exit if we failed to retrieve a buffer */ @@ -971,10 +860,6 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, break; } - ixgbevf_put_rx_buffer(rx_ring, rx_buffer); - if (xdp_res == IXGBEVF_XDP_CONSUMED) - __free_page(rx_buffer->page); - rx_buffer->page = NULL; cleaned_count++; /* fetch next buffer in frame if non-eop */ @@ -1543,7 +1428,8 @@ static void ixgbevf_configure_tx(struct ixgbevf_adapter *adapter) #define IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT 2 static void ixgbevf_configure_srrctl(struct ixgbevf_adapter *adapter, - struct ixgbevf_ring *ring, int index) + struct ixgbevf_ring *ring, int index, + bool rlpml_valid) 
{ struct ixgbe_hw *hw = &adapter->hw; u32 srrctl; @@ -1551,7 +1437,11 @@ static void ixgbevf_configure_srrctl(struct ixgbevf_adapter *adapter, srrctl = IXGBE_SRRCTL_DROP_EN; srrctl |= IXGBEVF_RX_HDR_SIZE << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT; - srrctl |= IXGBEVF_RXBUFFER_3072 >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; + if (rlpml_valid) + srrctl |= DIV_ROUND_UP(ring->rx_buf_len, + IXGBE_SRRCTL_BSIZEPKT_STEP); + else + srrctl |= ring->rx_buf_len / IXGBE_SRRCTL_BSIZEPKT_STEP; srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF; IXGBE_WRITE_REG(hw, IXGBE_VFSRRCTL(index), srrctl); @@ -1682,9 +1572,10 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, { struct ixgbe_hw *hw = &adapter->hw; union ixgbe_adv_rx_desc *rx_desc; + u8 reg_idx = ring->reg_idx; + bool rlpml_valid = false; u64 rdba = ring->dma; u32 rxdctl; - u8 reg_idx = ring->reg_idx; /* disable queue to avoid issues while updating state */ rxdctl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(reg_idx)); @@ -1710,10 +1601,6 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, IXGBE_WRITE_REG(hw, IXGBE_VFRDT(reg_idx), 0); ring->tail = adapter->io_addr + IXGBE_VFRDT(reg_idx); - /* initialize rx_buffer_info */ - memset(ring->rx_buffer_info, 0, - sizeof(struct ixgbevf_rx_buffer) * ring->count); - /* initialize Rx descriptor 0 */ rx_desc = IXGBEVF_RX_DESC(ring, 0); rx_desc->wb.upper.length = 0; @@ -1721,16 +1608,21 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, /* reset ntu and ntc to place SW in sync with hardwdare */ ring->next_to_clean = 0; ring->next_to_use = 0; - ring->next_to_alloc = 0; - - ixgbevf_configure_srrctl(adapter, ring, reg_idx); /* RXDCTL.RLPML does not work on 82599 */ if (adapter->hw.mac.type != ixgbe_mac_82599_vf) { - rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | - IXGBE_RXDCTL_RLPML_EN); + u32 pkt_len = + READ_ONCE(adapter->netdev->mtu) + LIBETH_RX_LL_LEN; + + rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | IXGBE_RXDCTL_RLPML_EN); + if (pkt_len <= IXGBE_RXDCTL_RLPMLMASK) { + 
rxdctl |= pkt_len | IXGBE_RXDCTL_RLPML_EN; + rlpml_valid = true; + } } + ixgbevf_configure_srrctl(adapter, ring, reg_idx, rlpml_valid); + rxdctl |= IXGBE_RXDCTL_ENABLE | IXGBE_RXDCTL_VME; IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(reg_idx), rxdctl); @@ -2126,8 +2018,6 @@ void ixgbevf_up(struct ixgbevf_adapter *adapter) **/ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) { - u16 i = rx_ring->next_to_clean; - /* Free Rx ring sk_buff */ if (rx_ring->skb) { dev_kfree_skb(rx_ring->skb); @@ -2135,30 +2025,14 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) } /* Free all the Rx ring pages */ - while (i != rx_ring->next_to_alloc) { - struct ixgbevf_rx_buffer *rx_buffer; + for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) { + const struct libeth_fqe *rx_fqe = &rx_ring->rx_fqes[i]; - rx_buffer = &rx_ring->rx_buffer_info[i]; - - /* Invalidate cache lines that may have been written to by - * device so that we avoid corrupting memory. - */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - IXGBEVF_RXBUFFER_3072, - DMA_FROM_DEVICE); - - /* free resources associated with mapping */ - ixgbevf_put_rx_buffer(rx_ring, rx_buffer); - __free_page(rx_buffer->page); - rx_buffer->page = NULL; - i++; - if (i == rx_ring->count) + libeth_rx_recycle_slow(rx_fqe->netmem); + if (unlikely(++i == rx_ring->count)) i = 0; } - rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; } @@ -2177,7 +2051,7 @@ static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring) /* Free all the Tx ring sk_buffs */ if (ring_is_xdp(tx_ring)) - page_frag_free(tx_buffer->data); + libeth_xdp_return_va(tx_buffer->data, false); else dev_kfree_skb_any(tx_buffer->skb); @@ -3259,12 +3133,25 @@ static int ixgbevf_setup_all_tx_resources(struct ixgbevf_adapter *adapter) int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, struct ixgbevf_ring *rx_ring) { - int size; + struct libeth_fq fq = { + .count = 
rx_ring->count, + .nid = NUMA_NO_NODE, + .type = LIBETH_FQE_MTU, + .xdp = !!rx_ring->xdp_prog, + .buf_len = IXGBEVF_RX_PAGE_LEN(rx_ring->xdp_prog ? + LIBETH_XDP_HEADROOM : + LIBETH_SKB_HEADROOM), + }; + int ret; - size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count; - rx_ring->rx_buffer_info = vmalloc(size); - if (!rx_ring->rx_buffer_info) - goto err; + ret = libeth_rx_fq_create(&fq, &rx_ring->q_vector->napi); + if (ret) + return ret; + + rx_ring->pp = fq.pp; + rx_ring->rx_fqes = fq.fqes; + rx_ring->truesize = fq.truesize; + rx_ring->rx_buf_len = fq.buf_len; u64_stats_init(&rx_ring->syncp); @@ -3272,25 +3159,31 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, rx_ring->size = rx_ring->count * sizeof(union ixgbe_adv_rx_desc); rx_ring->size = ALIGN(rx_ring->size, 4096); - rx_ring->desc = dma_alloc_coherent(rx_ring->dev, rx_ring->size, + rx_ring->desc = dma_alloc_coherent(fq.pp->p.dev, rx_ring->size, &rx_ring->dma, GFP_KERNEL); - if (!rx_ring->desc) + if (!rx_ring->desc) { + ret = -ENOMEM; goto err; + } /* XDP RX-queue info */ - if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, - rx_ring->queue_index, 0) < 0) + ret = __xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, + rx_ring->queue_index, 0, rx_ring->truesize); + if (ret) goto err; - rx_ring->xdp_prog = adapter->xdp_prog; + xdp_rxq_info_attach_page_pool(&rx_ring->xdp_rxq, fq.pp); + + rcu_assign_pointer(rx_ring->xdp_prog, adapter->xdp_prog); return 0; err: - vfree(rx_ring->rx_buffer_info); - rx_ring->rx_buffer_info = NULL; + libeth_rx_fq_destroy(&fq); + rx_ring->rx_fqes = NULL; + rx_ring->pp = NULL; dev_err(rx_ring->dev, "Unable to allocate memory for the Rx descriptor ring\n"); - return -ENOMEM; + return ret; } /** @@ -3331,17 +3224,24 @@ static int ixgbevf_setup_all_rx_resources(struct ixgbevf_adapter *adapter) **/ void ixgbevf_free_rx_resources(struct ixgbevf_ring *rx_ring) { + struct libeth_fq fq = { + .fqes = rx_ring->rx_fqes, + .pp = rx_ring->pp, + }; + 
ixgbevf_clean_rx_ring(rx_ring); - rx_ring->xdp_prog = NULL; + rcu_assign_pointer(rx_ring->xdp_prog, NULL); + xdp_rxq_info_detach_mem_model(&rx_ring->xdp_rxq); xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - vfree(rx_ring->rx_buffer_info); - rx_ring->rx_buffer_info = NULL; - dma_free_coherent(rx_ring->dev, rx_ring->size, rx_ring->desc, + dma_free_coherent(fq.pp->p.dev, rx_ring->size, rx_ring->desc, rx_ring->dma); - rx_ring->desc = NULL; + + libeth_rx_fq_destroy(&fq); + rx_ring->rx_fqes = NULL; + rx_ring->pp = NULL; } /** @@ -4233,7 +4133,9 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog) ixgbevf_open(dev); } else { for (i = 0; i < adapter->num_rx_queues; i++) - xchg(&adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog); + rcu_assign_pointer(adapter->rx_ring[i]->xdp_prog, + adapter->xdp_prog); + synchronize_net(); } if (old_prog) From 6f14b567a8f718e17936babd02b10f2e526dec7a Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 28 Jul 2025 16:52:18 +0200 Subject: [PATCH 08/23] ixgbevf: branch prediction and cleanup Add likely/unlikely markers for better branch prediction. While touching some functions, clean up the code a little bit. This patch is not supposed to make any logic changes aside from making total_rx_bytes and total_rx_packets more correlated and returning the XDP buffer via libeth_xdp_return_buff_slow() on the Rx exit path taken when neither an XDP verdict nor an skb was produced.
Reviewed-by: Aleksandr Loktionov Reviewed-by: Alexander Lobakin Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 663d4062a4c34f..cedbf0a4d0a546 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -570,7 +570,7 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, u16 ntu = rx_ring->next_to_use; /* nothing to do or no valid netdev defined */ - if (!cleaned_count || !rx_ring->netdev) + if (unlikely(!cleaned_count || !rx_ring->netdev)) return; rx_desc = IXGBEVF_RX_DESC(rx_ring, ntu); @@ -586,7 +586,7 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, rx_desc++; ntu++; - if (unlikely(ntu == rx_ring->count)) { + if (unlikely(ntu == fq.count)) { rx_desc = IXGBEVF_RX_DESC(rx_ring, 0); ntu = 0; } @@ -823,7 +823,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, rx_desc = IXGBEVF_RX_DESC(rx_ring, rx_ring->next_to_clean); size = le16_to_cpu(rx_desc->wb.upper.length); - if (!size) + if (unlikely(!size)) break; /* This memory barrier is needed to keep us from reading @@ -855,7 +855,8 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, } /* exit if we failed to retrieve a buffer */ - if (!xdp_res && !skb) { + if (unlikely(!xdp_res && !skb)) { + libeth_xdp_return_buff_slow(xdp); rx_ring->rx_stats.alloc_rx_buff_failed++; break; } @@ -867,21 +868,19 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, continue; /* verify the packet layout is correct */ - if (xdp_res || ixgbevf_cleanup_headers(rx_ring, rx_desc, skb)) { + if (xdp_res || + unlikely(ixgbevf_cleanup_headers(rx_ring, rx_desc, skb))) { skb = NULL; continue; } - /* probably a little skewed due to removing CRC */ - total_rx_bytes += skb->len; 
- /* Workaround hardware that can't do proper VEPA multicast * source pruning. */ - if ((skb->pkt_type == PACKET_BROADCAST || - skb->pkt_type == PACKET_MULTICAST) && - ether_addr_equal(rx_ring->netdev->dev_addr, - eth_hdr(skb)->h_source)) { + if (unlikely((skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) && + ether_addr_equal(rx_ring->netdev->dev_addr, + eth_hdr(skb)->h_source))) { dev_kfree_skb_irq(skb); continue; } @@ -889,13 +888,14 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* populate checksum, VLAN, and protocol */ ixgbevf_process_skb_fields(rx_ring, rx_desc, skb); + /* probably a little skewed due to removing CRC */ + total_rx_bytes += skb->len; + total_rx_packets++; + ixgbevf_rx_skb(q_vector, skb); /* reset skb pointer */ skb = NULL; - - /* update budget accounting */ - total_rx_packets++; } /* place incomplete frames back on ring for completion */ From 83ccc4decdd10aa0d7117286d3c4421e0053baa5 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 2 Sep 2025 16:31:51 +0200 Subject: [PATCH 09/23] ixgbevf: support XDP multi-buffer on Rx path Implement XDP support for received fragmented packets; this requires using some helpers from libeth_xdp. 
Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 3 +- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 97 +++++++------------ 2 files changed, 35 insertions(+), 65 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index ebf771f0caa4b3..2626af0393614b 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "vf.h" @@ -105,7 +106,6 @@ struct ixgbevf_ring { struct xdp_rxq_info xdp_rxq; u64 hw_csum_rx_error; u8 __iomem *tail; - struct sk_buff *skb; /* holds the special value that gets the hardware register offset * associated with this ring, which is different for DCB and RSS modes @@ -113,6 +113,7 @@ struct ixgbevf_ring { u16 reg_idx; int queue_index; /* needed for multiqueue queue management */ u32 rx_buf_len; + struct libeth_xdp_buff_stash xdp_stash; } ____cacheline_internodealigned_in_smp; /* How many Rx Buffers do we bundle into one write to the hardware ? 
*/ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index cedbf0a4d0a546..641d87f93864c2 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -84,6 +84,7 @@ MODULE_DEVICE_TABLE(pci, ixgbevf_pci_tbl); MODULE_DESCRIPTION("Intel(R) 10 Gigabit Virtual Function Network Driver"); MODULE_IMPORT_NS("LIBETH"); +MODULE_IMPORT_NS("LIBETH_XDP"); MODULE_LICENSE("GPL v2"); #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV|NETIF_MSG_PROBE|NETIF_MSG_LINK) @@ -647,26 +648,6 @@ static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring, return false; } -/** - * ixgbevf_add_rx_frag - Add contents of Rx buffer to sk_buff - * @rx_ring: rx descriptor ring to transact packets on - * @rx_buffer: buffer containing page to add - * @skb: sk_buff to place the data into - * @size: size of buffer to be added - * - * This function will add the data contained in rx_buffer->page to the skb. 
- **/ -static void ixgbevf_add_rx_frag(const struct libeth_fqe *rx_buffer, - struct sk_buff *skb, - unsigned int size) -{ - u32 hr = netmem_get_pp(rx_buffer->netmem)->p.offset; - - skb_add_rx_frag_netmem(skb, skb_shinfo(skb)->nr_frags, - rx_buffer->netmem, rx_buffer->offset + hr, - size, rx_buffer->truesize); -} - static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, u32 qmask) { @@ -803,16 +784,16 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, unsigned int total_rx_bytes = 0, total_rx_packets = 0; struct ixgbevf_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbevf_desc_unused(rx_ring); - struct sk_buff *skb = rx_ring->skb; LIBETH_XDP_ONSTACK_BUFF(xdp); bool xdp_xmit = false; int xdp_res = 0; - xdp->base.rxq = &rx_ring->xdp_rxq; + libeth_xdp_init_buff(xdp, &rx_ring->xdp_stash, &rx_ring->xdp_rxq); while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; struct libeth_fqe *rx_buffer; + struct sk_buff *skb; unsigned int size; /* return some buffers to hardware, one at a time is too slow */ @@ -833,43 +814,38 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, rmb(); rx_buffer = &rx_ring->rx_fqes[rx_ring->next_to_clean]; - libeth_rx_sync_for_cpu(rx_buffer, size); + libeth_xdp_process_buff(xdp, rx_buffer, size); - /* retrieve a buffer from the ring */ - if (!skb) { - libeth_xdp_prepare_buff(xdp, rx_buffer, size); - prefetch(xdp->data); - xdp_res = ixgbevf_run_xdp(adapter, rx_ring, xdp); - } + cleaned_count++; + /* fetch next buffer in frame if non-eop */ + if (ixgbevf_is_non_eop(rx_ring, rx_desc)) + continue; + + total_rx_packets++; + total_rx_bytes += xdp_get_buff_len(&xdp->base); + xdp_res = ixgbevf_run_xdp(adapter, rx_ring, xdp); if (xdp_res) { if (xdp_res == IXGBEVF_XDP_TX) xdp_xmit = true; - total_rx_packets++; - total_rx_bytes += size; - } else if (skb) { - ixgbevf_add_rx_frag(rx_buffer, skb, size); - } else { - skb = xdp_build_skb_from_buff(&xdp->base); + xdp->data 
= NULL; + continue; } + skb = xdp_build_skb_from_buff(&xdp->base); + /* exit if we failed to retrieve a buffer */ - if (unlikely(!xdp_res && !skb)) { + if (unlikely(!skb)) { libeth_xdp_return_buff_slow(xdp); rx_ring->rx_stats.alloc_rx_buff_failed++; break; } - cleaned_count++; - - /* fetch next buffer in frame if non-eop */ - if (ixgbevf_is_non_eop(rx_ring, rx_desc)) - continue; + xdp->data = NULL; /* verify the packet layout is correct */ - if (xdp_res || - unlikely(ixgbevf_cleanup_headers(rx_ring, rx_desc, skb))) { + if (unlikely(ixgbevf_cleanup_headers(rx_ring, rx_desc, skb))) { skb = NULL; continue; } @@ -888,18 +864,11 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* populate checksum, VLAN, and protocol */ ixgbevf_process_skb_fields(rx_ring, rx_desc, skb); - /* probably a little skewed due to removing CRC */ - total_rx_bytes += skb->len; - total_rx_packets++; - ixgbevf_rx_skb(q_vector, skb); - - /* reset skb pointer */ - skb = NULL; } /* place incomplete frames back on ring for completion */ - rx_ring->skb = skb; + libeth_xdp_save_buff(&rx_ring->xdp_stash, xdp); if (xdp_xmit) { struct ixgbevf_ring *xdp_ring = @@ -2019,10 +1988,7 @@ void ixgbevf_up(struct ixgbevf_adapter *adapter) static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) { /* Free Rx ring sk_buff */ - if (rx_ring->skb) { - dev_kfree_skb(rx_ring->skb); - rx_ring->skb = NULL; - } + libeth_xdp_return_stash(&rx_ring->xdp_stash); /* Free all the Rx ring pages */ for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) { @@ -4103,16 +4069,19 @@ ixgbevf_features_check(struct sk_buff *skb, struct net_device *dev, return features; } -static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog) +static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) { - int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + u32 frame_size = READ_ONCE(dev->mtu) + LIBETH_RX_LL_LEN; struct 
ixgbevf_adapter *adapter = netdev_priv(dev); struct bpf_prog *old_prog; + bool requires_mbuf; - /* verify ixgbevf ring attributes are sufficient for XDP */ - for (i = 0; i < adapter->num_rx_queues; i++) { - if (frame_size > IXGBEVF_RXBUFFER_3072) - return -EINVAL; + requires_mbuf = frame_size > IXGBEVF_RX_PAGE_LEN(LIBETH_XDP_HEADROOM); + if (prog && !prog->aux->xdp_has_frags && requires_mbuf) { + NL_SET_ERR_MSG_MOD(extack, + "Configured MTU requires non-linear frames and XDP prog does not support frags"); + return -EOPNOTSUPP; } old_prog = xchg(&adapter->xdp_prog, prog); @@ -4132,7 +4101,7 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog) if (netif_running(dev)) ixgbevf_open(dev); } else { - for (i = 0; i < adapter->num_rx_queues; i++) + for (int i = 0; i < adapter->num_rx_queues; i++) rcu_assign_pointer(adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog); synchronize_net(); @@ -4148,7 +4117,7 @@ static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: - return ixgbevf_xdp_setup(dev, xdp->prog); + return ixgbevf_xdp_setup(dev, xdp->prog, xdp->extack); default: return -EINVAL; } @@ -4300,7 +4269,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_HW_VLAN_CTAG_TX; netdev->priv_flags |= IFF_UNICAST_FLT; - netdev->xdp_features = NETDEV_XDP_ACT_BASIC; + netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_RX_SG; /* MTU range: 68 - 1504 or 9710 */ netdev->min_mtu = ETH_MIN_MTU; From 7875adb85e44fb7764e29a17957455eb7a735478 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 9 Sep 2025 13:46:44 +0200 Subject: [PATCH 10/23] ixgbevf: XDP_TX in multi-buffer through libeth Use libeth to support XDP_TX action for segmented packets. 
Reviewed-by: Alexander Lobakin Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 14 +- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 296 ++++++++++++------ 2 files changed, 202 insertions(+), 108 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 2626af0393614b..a27081ee764b40 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -81,20 +81,22 @@ struct ixgbevf_ring { struct net_device *netdev; struct bpf_prog __rcu *xdp_prog; union { - struct page_pool *pp; /* Rx ring */ + struct page_pool *pp; /* Rx and XDP rings */ struct device *dev; /* Tx ring */ }; void *desc; /* descriptor ring memory */ - dma_addr_t dma; /* phys. address of descriptor ring */ - unsigned int size; /* length in bytes */ - u32 truesize; /* Rx buffer full size */ + union { + u32 truesize; /* Rx buffer full size */ + u32 pending; /* Sent-not-completed descriptors */ + }; u16 count; /* amount of descriptors */ - u16 next_to_use; u16 next_to_clean; + u32 next_to_use; union { struct libeth_fqe *rx_fqes; struct ixgbevf_tx_buffer *tx_buffer_info; + struct libeth_sqe *xdp_sqes; }; unsigned long state; struct ixgbevf_stats stats; @@ -114,6 +116,8 @@ struct ixgbevf_ring { int queue_index; /* needed for multiqueue queue management */ u32 rx_buf_len; struct libeth_xdp_buff_stash xdp_stash; + unsigned int dma_size; /* length in bytes */ + dma_addr_t dma; /* phys. address of descriptor ring */ } ____cacheline_internodealigned_in_smp; /* How many Rx Buffers do we bundle into one write to the hardware ? 
*/ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 641d87f93864c2..4c8279a438f53c 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -306,10 +306,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector, total_ipsec++; /* free the skb */ - if (ring_is_xdp(tx_ring)) - libeth_xdp_return_va(tx_buffer->data, true); - else - napi_consume_skb(tx_buffer->skb, napi_budget); + napi_consume_skb(tx_buffer->skb, napi_budget); /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -392,9 +389,8 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector, eop_desc, (eop_desc ? eop_desc->wb.status : 0), tx_ring->tx_buffer_info[i].time_stamp, jiffies); - if (!ring_is_xdp(tx_ring)) - netif_stop_subqueue(tx_ring->netdev, - tx_ring->queue_index); + netif_stop_subqueue(tx_ring->netdev, + tx_ring->queue_index); /* schedule immediate reset if we believe we hung */ ixgbevf_tx_timeout_reset(adapter); @@ -402,9 +398,6 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector, return true; } - if (ring_is_xdp(tx_ring)) - return !!budget; - #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2) if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) && (ixgbevf_desc_unused(tx_ring) >= TX_WAKE_THRESHOLD))) { @@ -660,44 +653,83 @@ static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, #define IXGBEVF_XDP_CONSUMED 1 #define IXGBEVF_XDP_TX 2 -static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring, - struct xdp_buff *xdp) +static void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, + u16 to_clean) +{ + struct libeth_xdpsq_napi_stats stats = { }; + u32 ntc = xdp_ring->next_to_clean; + struct xdp_frame_bulk cbulk; + struct libeth_cq_pp cp = { + .bq = &cbulk, + .dev = xdp_ring->dev, + .xss = &stats, + .napi = in_napi, + }; + + xdp_frame_bulk_init(&cbulk); + 
xdp_ring->pending -= to_clean; + + while (likely(to_clean--)) { + libeth_xdp_complete_tx(&xdp_ring->xdp_sqes[ntc], &cp); + ntc++; + ntc = unlikely(ntc == xdp_ring->count) ? 0 : ntc; + } + + xdp_ring->next_to_clean = ntc; + xdp_flush_frame_bulk(&cbulk); +} + +static u16 ixgbevf_tx_get_num_sent(struct ixgbevf_ring *xdp_ring) { - struct ixgbevf_tx_buffer *tx_buffer; - union ixgbe_adv_tx_desc *tx_desc; - u32 len, cmd_type; - dma_addr_t dma; - u16 i; + u16 ntc = xdp_ring->next_to_clean; + u16 to_clean = 0; - len = xdp->data_end - xdp->data; + while (likely(to_clean < xdp_ring->pending)) { + u32 idx = xdp_ring->xdp_sqes[ntc].rs_idx; + union ixgbe_adv_tx_desc *rs_desc; - if (unlikely(!ixgbevf_desc_unused(ring))) - return IXGBEVF_XDP_CONSUMED; + if (!idx--) + break; - dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE); - if (dma_mapping_error(ring->dev, dma)) - return IXGBEVF_XDP_CONSUMED; + rs_desc = IXGBEVF_TX_DESC(xdp_ring, idx); - /* record the location of the first descriptor for this packet */ - i = ring->next_to_use; - tx_buffer = &ring->tx_buffer_info[i]; - - dma_unmap_len_set(tx_buffer, len, len); - dma_unmap_addr_set(tx_buffer, dma, dma); - tx_buffer->data = xdp->data; - tx_buffer->bytecount = len; - tx_buffer->gso_segs = 1; - tx_buffer->protocol = 0; - - /* Populate minimal context descriptor that will provide for the - * fact that we are expected to process Ethernet frames. - */ - if (!test_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state)) { + if (!(rs_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD))) + break; + + xdp_ring->xdp_sqes[ntc].rs_idx = 0; + + to_clean += + (idx >= ntc ? idx : idx + xdp_ring->count) - ntc + 1; + + ntc = (idx + 1 == xdp_ring->count) ? 
0 : idx + 1; + } + + return to_clean; +} + +static void ixgbevf_clean_xdp_ring(struct ixgbevf_ring *xdp_ring) +{ + ixgbevf_clean_xdp_num(xdp_ring, false, xdp_ring->pending); +} + +static u32 ixgbevf_prep_xdp_sq(void *xdpsq, struct libeth_xdpsq *sq) +{ + struct ixgbevf_ring *xdp_ring = xdpsq; + + if (unlikely(ixgbevf_desc_unused(xdp_ring) < LIBETH_XDP_TX_BULK)) { + u16 to_clean = ixgbevf_tx_get_num_sent(xdp_ring); + + if (likely(to_clean)) + ixgbevf_clean_xdp_num(xdp_ring, true, to_clean); + } + + if (unlikely(!test_bit(__IXGBEVF_TX_XDP_RING_PRIMED, + &xdp_ring->state))) { struct ixgbe_adv_tx_context_desc *context_desc; - set_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state); + set_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &xdp_ring->state); - context_desc = IXGBEVF_TX_CTXTDESC(ring, 0); + context_desc = IXGBEVF_TX_CTXTDESC(xdp_ring, 0); context_desc->vlan_macip_lens = cpu_to_le32(ETH_HLEN << IXGBE_ADVTXD_MACLEN_SHIFT); context_desc->fceof_saidx = 0; @@ -706,48 +738,101 @@ static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring, IXGBE_ADVTXD_DTYP_CTXT); context_desc->mss_l4len_idx = 0; - i = 1; + xdp_ring->next_to_use = 1; + xdp_ring->pending = 1; + + /* Finish descriptor writes before bumping tail */ + wmb(); + ixgbevf_write_tail(xdp_ring, 1); } - /* put descriptor type bits */ - cmd_type = IXGBE_ADVTXD_DTYP_DATA | - IXGBE_ADVTXD_DCMD_DEXT | - IXGBE_ADVTXD_DCMD_IFCS; - cmd_type |= len | IXGBE_TXD_CMD; + *sq = (struct libeth_xdpsq) { + .count = xdp_ring->count, + .descs = xdp_ring->desc, + .lock = NULL, + .ntu = &xdp_ring->next_to_use, + .pending = &xdp_ring->pending, + .pool = NULL, + .sqes = xdp_ring->xdp_sqes, + }; - tx_desc = IXGBEVF_TX_DESC(ring, i); - tx_desc->read.buffer_addr = cpu_to_le64(dma); + return ixgbevf_desc_unused(xdp_ring); +} - tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); - tx_desc->read.olinfo_status = - cpu_to_le32((len << IXGBE_ADVTXD_PAYLEN_SHIFT) | +static void ixgbevf_xdp_xmit_desc(struct libeth_xdp_tx_desc desc, u32 i, + const struct 
libeth_xdpsq *sq, + u64 priv) +{ + union ixgbe_adv_tx_desc *tx_desc = + &((union ixgbe_adv_tx_desc *)sq->descs)[i]; + + u32 cmd_type = IXGBE_ADVTXD_DTYP_DATA | + IXGBE_ADVTXD_DCMD_DEXT | + IXGBE_ADVTXD_DCMD_IFCS | + desc.len; + + if (desc.flags & LIBETH_XDP_TX_LAST) + cmd_type |= IXGBE_TXD_CMD_EOP; + + if (desc.flags & LIBETH_XDP_TX_FIRST) { + struct skb_shared_info *sinfo = sq->sqes[i].sinfo; + u16 full_len = desc.len; + + if (desc.flags & LIBETH_XDP_TX_MULTI) + full_len += sinfo->xdp_frags_size; + + tx_desc->read.olinfo_status = + cpu_to_le32((full_len << IXGBE_ADVTXD_PAYLEN_SHIFT) | IXGBE_ADVTXD_CC); + } - /* Avoid any potential race with cleanup */ - smp_wmb(); + tx_desc->read.buffer_addr = cpu_to_le64(desc.addr); + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); +} - /* set next_to_watch value indicating a packet is present */ - i++; - if (i == ring->count) - i = 0; +LIBETH_XDP_DEFINE_START(); +LIBETH_XDP_DEFINE_FLUSH_TX(static ixgbevf_xdp_flush_tx, ixgbevf_prep_xdp_sq, + ixgbevf_xdp_xmit_desc); +LIBETH_XDP_DEFINE_END(); - tx_buffer->next_to_watch = tx_desc; - ring->next_to_use = i; +static void ixgbevf_xdp_set_rs(struct ixgbevf_ring *xdp_ring, u32 cached_ntu) +{ + u32 ltu = (xdp_ring->next_to_use ? 
: xdp_ring->count) - 1; + union ixgbe_adv_tx_desc *desc; - return IXGBEVF_XDP_TX; + desc = IXGBEVF_TX_DESC(xdp_ring, ltu); + xdp_ring->xdp_sqes[cached_ntu].rs_idx = ltu + 1; + desc->read.cmd_type_len |= cpu_to_le32(IXGBE_TXD_CMD); } -static int ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, - struct ixgbevf_ring *rx_ring, +static void ixgbevf_rx_finalize_xdp(struct libeth_xdp_tx_bulk *tx_bulk, + bool xdp_xmit, u32 cached_ntu) +{ + struct ixgbevf_ring *xdp_ring = tx_bulk->xdpsq; + + if (!xdp_xmit) + goto unlock; + + if (tx_bulk->count) + ixgbevf_xdp_flush_tx(tx_bulk, LIBETH_XDP_TX_DROP); + + ixgbevf_xdp_set_rs(xdp_ring, cached_ntu); + + /* Finish descriptor writes before bumping tail */ + wmb(); + ixgbevf_write_tail(xdp_ring, xdp_ring->next_to_use); +unlock: + rcu_read_unlock(); +} + +static int ixgbevf_run_xdp(struct libeth_xdp_tx_bulk *tx_bulk, struct libeth_xdp_buff *xdp) { int result = IXGBEVF_XDP_PASS; - struct ixgbevf_ring *xdp_ring; - struct bpf_prog *xdp_prog; + const struct bpf_prog *xdp_prog; u32 act; - xdp_prog = rcu_dereference(rx_ring->xdp_prog); - + xdp_prog = tx_bulk->prog; if (!xdp_prog) goto xdp_out; @@ -756,17 +841,16 @@ static int ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, case XDP_PASS: break; case XDP_TX: - xdp_ring = adapter->xdp_ring[rx_ring->queue_index]; - result = ixgbevf_xmit_xdp_ring(xdp_ring, &xdp->base); - if (result == IXGBEVF_XDP_CONSUMED) - goto out_failure; + result = IXGBEVF_XDP_TX; + if (!libeth_xdp_tx_queue_bulk(tx_bulk, xdp, + ixgbevf_xdp_flush_tx)) + result = IXGBEVF_XDP_CONSUMED; break; default: - bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act); + bpf_warn_invalid_xdp_action(tx_bulk->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: -out_failure: - trace_xdp_exception(rx_ring->netdev, xdp_prog, act); + trace_xdp_exception(tx_bulk->dev, xdp_prog, act); fallthrough; /* handle aborts by dropping packet */ case XDP_DROP: result = IXGBEVF_XDP_CONSUMED; @@ -784,11 +868,19 @@ static int 
ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, unsigned int total_rx_bytes = 0, total_rx_packets = 0; struct ixgbevf_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbevf_desc_unused(rx_ring); + LIBETH_XDP_ONSTACK_BULK(xdp_tx_bulk); LIBETH_XDP_ONSTACK_BUFF(xdp); + u32 cached_ntu; bool xdp_xmit = false; int xdp_res = 0; libeth_xdp_init_buff(xdp, &rx_ring->xdp_stash, &rx_ring->xdp_rxq); + libeth_xdp_tx_init_bulk(&xdp_tx_bulk, rx_ring->xdp_prog, + adapter->netdev, adapter->xdp_ring, + adapter->num_xdp_queues); + if (xdp_tx_bulk.prog) + cached_ntu = + ((struct ixgbevf_ring *)xdp_tx_bulk.xdpsq)->next_to_use; while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; @@ -824,7 +916,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, total_rx_packets++; total_rx_bytes += xdp_get_buff_len(&xdp->base); - xdp_res = ixgbevf_run_xdp(adapter, rx_ring, xdp); + xdp_res = ixgbevf_run_xdp(&xdp_tx_bulk, xdp); if (xdp_res) { if (xdp_res == IXGBEVF_XDP_TX) xdp_xmit = true; @@ -870,16 +962,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* place incomplete frames back on ring for completion */ libeth_xdp_save_buff(&rx_ring->xdp_stash, xdp); - if (xdp_xmit) { - struct ixgbevf_ring *xdp_ring = - adapter->xdp_ring[rx_ring->queue_index]; - - /* Force memory writes to complete before letting h/w - * know there are new descriptors to fetch. 
- */ - wmb(); - ixgbevf_write_tail(xdp_ring, xdp_ring->next_to_use); - } + ixgbevf_rx_finalize_xdp(&xdp_tx_bulk, xdp_xmit, cached_ntu); u64_stats_update_begin(&rx_ring->syncp); rx_ring->stats.packets += total_rx_packets; @@ -909,6 +992,8 @@ static int ixgbevf_poll(struct napi_struct *napi, int budget) bool clean_complete = true; ixgbevf_for_each_ring(ring, q_vector->tx) { + if (ring_is_xdp(ring)) + continue; if (!ixgbevf_clean_tx_irq(q_vector, ring, budget)) clean_complete = false; } @@ -1348,6 +1433,7 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, /* reset ntu and ntc to place SW in sync with hardwdare */ ring->next_to_clean = 0; ring->next_to_use = 0; + ring->pending = 0; /* In order to avoid issues WTHRESH + PTHRESH should always be equal * to or less than the number of on chip descriptors, which is @@ -1360,8 +1446,12 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, 32; /* PTHRESH = 32 */ /* reinitialize tx_buffer_info */ - memset(ring->tx_buffer_info, 0, - sizeof(struct ixgbevf_tx_buffer) * ring->count); + if (!ring_is_xdp(ring)) + memset(ring->tx_buffer_info, 0, + sizeof(struct ixgbevf_tx_buffer) * ring->count); + else + memset(ring->xdp_sqes, 0, + sizeof(struct libeth_sqe) * ring->count); clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state); clear_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state); @@ -2016,10 +2106,7 @@ static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring) union ixgbe_adv_tx_desc *eop_desc, *tx_desc; /* Free all the Tx ring sk_buffs */ - if (ring_is_xdp(tx_ring)) - libeth_xdp_return_va(tx_buffer->data, false); - else - dev_kfree_skb_any(tx_buffer->skb); + dev_kfree_skb_any(tx_buffer->skb); /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -2088,7 +2175,7 @@ static void ixgbevf_clean_all_tx_rings(struct ixgbevf_adapter *adapter) for (i = 0; i < adapter->num_tx_queues; i++) ixgbevf_clean_tx_ring(adapter->tx_ring[i]); for (i = 0; i < adapter->num_xdp_queues; i++) - 
ixgbevf_clean_tx_ring(adapter->xdp_ring[i]); + ixgbevf_clean_xdp_ring(adapter->xdp_ring[i]); } void ixgbevf_down(struct ixgbevf_adapter *adapter) @@ -2834,8 +2921,6 @@ static void ixgbevf_check_hang_subtask(struct ixgbevf_adapter *adapter) if (netif_carrier_ok(adapter->netdev)) { for (i = 0; i < adapter->num_tx_queues; i++) set_check_for_tx_hang(adapter->tx_ring[i]); - for (i = 0; i < adapter->num_xdp_queues; i++) - set_check_for_tx_hang(adapter->xdp_ring[i]); } /* get one bit for every active Tx/Rx interrupt vector */ @@ -2979,7 +3064,10 @@ static void ixgbevf_service_task(struct work_struct *work) **/ void ixgbevf_free_tx_resources(struct ixgbevf_ring *tx_ring) { - ixgbevf_clean_tx_ring(tx_ring); + if (!ring_is_xdp(tx_ring)) + ixgbevf_clean_tx_ring(tx_ring); + else + ixgbevf_clean_xdp_ring(tx_ring); vfree(tx_ring->tx_buffer_info); tx_ring->tx_buffer_info = NULL; @@ -2988,7 +3076,7 @@ void ixgbevf_free_tx_resources(struct ixgbevf_ring *tx_ring) if (!tx_ring->desc) return; - dma_free_coherent(tx_ring->dev, tx_ring->size, tx_ring->desc, + dma_free_coherent(tx_ring->dev, tx_ring->dma_size, tx_ring->desc, tx_ring->dma); tx_ring->desc = NULL; @@ -3023,7 +3111,9 @@ int ixgbevf_setup_tx_resources(struct ixgbevf_ring *tx_ring) struct ixgbevf_adapter *adapter = netdev_priv(tx_ring->netdev); int size; - size = sizeof(struct ixgbevf_tx_buffer) * tx_ring->count; + size = (!ring_is_xdp(tx_ring) ? 
sizeof(struct ixgbevf_tx_buffer) : + sizeof(struct libeth_sqe)) * tx_ring->count; + tx_ring->tx_buffer_info = vmalloc(size); if (!tx_ring->tx_buffer_info) goto err; @@ -3031,10 +3121,10 @@ int ixgbevf_setup_tx_resources(struct ixgbevf_ring *tx_ring) u64_stats_init(&tx_ring->syncp); /* round up to nearest 4K */ - tx_ring->size = tx_ring->count * sizeof(union ixgbe_adv_tx_desc); - tx_ring->size = ALIGN(tx_ring->size, 4096); + tx_ring->dma_size = tx_ring->count * sizeof(union ixgbe_adv_tx_desc); + tx_ring->dma_size = ALIGN(tx_ring->dma_size, 4096); - tx_ring->desc = dma_alloc_coherent(tx_ring->dev, tx_ring->size, + tx_ring->desc = dma_alloc_coherent(tx_ring->dev, tx_ring->dma_size, &tx_ring->dma, GFP_KERNEL); if (!tx_ring->desc) goto err; @@ -3122,10 +3212,10 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, u64_stats_init(&rx_ring->syncp); /* Round up to nearest 4K */ - rx_ring->size = rx_ring->count * sizeof(union ixgbe_adv_rx_desc); - rx_ring->size = ALIGN(rx_ring->size, 4096); + rx_ring->dma_size = rx_ring->count * sizeof(union ixgbe_adv_rx_desc); + rx_ring->dma_size = ALIGN(rx_ring->dma_size, 4096); - rx_ring->desc = dma_alloc_coherent(fq.pp->p.dev, rx_ring->size, + rx_ring->desc = dma_alloc_coherent(fq.pp->p.dev, rx_ring->dma_size, &rx_ring->dma, GFP_KERNEL); if (!rx_ring->desc) { @@ -3201,7 +3291,7 @@ void ixgbevf_free_rx_resources(struct ixgbevf_ring *rx_ring) xdp_rxq_info_detach_mem_model(&rx_ring->xdp_rxq); xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - dma_free_coherent(fq.pp->p.dev, rx_ring->size, rx_ring->desc, + dma_free_coherent(fq.pp->p.dev, rx_ring->dma_size, rx_ring->desc, rx_ring->dma); rx_ring->desc = NULL; From aef135f88169fe7eede5981fad30e5e4fbbe0c1c Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 22 Sep 2025 07:14:24 +0200 Subject: [PATCH 11/23] ixgbevf: support XDP_REDIRECT and .ndo_xdp_xmit To fully support XDP_REDIRECT, utilize more libeth helpers in XDP Rx path, hence save cached_ntu in the ring structure instead of 
on the stack. ixgbevf-supported VFs usually have few queues, so use libeth_xdpsq_lock functionality for XDP queue sharing. Adjust filling-in of XDP Tx descriptors to use data from the XDP frame. Otherwise, simply use libeth helpers to implement .ndo_xdp_xmit(). While at it, fix a typo in libeth docs. Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 2 + .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 151 ++++++++---------- include/net/libeth/xdp.h | 2 +- 3 files changed, 71 insertions(+), 84 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index a27081ee764b40..ea86679e4f81d0 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -98,6 +98,8 @@ struct ixgbevf_ring { struct ixgbevf_tx_buffer *tx_buffer_info; struct libeth_sqe *xdp_sqes; }; + struct libeth_xdpsq_lock xdpq_lock; + u32 cached_ntu; unsigned long state; struct ixgbevf_stats stats; struct u64_stats_sync syncp; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 4c8279a438f53c..8a137e2f78f827 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -649,10 +649,6 @@ static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, qmask); } -#define IXGBEVF_XDP_PASS 0 -#define IXGBEVF_XDP_CONSUMED 1 -#define IXGBEVF_XDP_TX 2 - static void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, u16 to_clean) { @@ -710,12 +706,14 @@ static u16 ixgbevf_tx_get_num_sent(struct ixgbevf_ring *xdp_ring) static void ixgbevf_clean_xdp_ring(struct ixgbevf_ring *xdp_ring) { ixgbevf_clean_xdp_num(xdp_ring, false, xdp_ring->pending); + libeth_xdpsq_put(&xdp_ring->xdpq_lock, xdp_ring->netdev); } static u32 ixgbevf_prep_xdp_sq(void *xdpsq, struct libeth_xdpsq 
*sq) { struct ixgbevf_ring *xdp_ring = xdpsq; + libeth_xdpsq_lock(&xdp_ring->xdpq_lock); if (unlikely(ixgbevf_desc_unused(xdp_ring) < LIBETH_XDP_TX_BULK)) { u16 to_clean = ixgbevf_tx_get_num_sent(xdp_ring); @@ -749,7 +747,7 @@ static u32 ixgbevf_prep_xdp_sq(void *xdpsq, struct libeth_xdpsq *sq) *sq = (struct libeth_xdpsq) { .count = xdp_ring->count, .descs = xdp_ring->desc, - .lock = NULL, + .lock = &xdp_ring->xdpq_lock, .ntu = &xdp_ring->next_to_use, .pending = &xdp_ring->pending, .pool = NULL, @@ -775,11 +773,16 @@ static void ixgbevf_xdp_xmit_desc(struct libeth_xdp_tx_desc desc, u32 i, cmd_type |= IXGBE_TXD_CMD_EOP; if (desc.flags & LIBETH_XDP_TX_FIRST) { - struct skb_shared_info *sinfo = sq->sqes[i].sinfo; + struct libeth_sqe *sqe = &sq->sqes[i]; + struct skb_shared_info *sinfo; u16 full_len = desc.len; - if (desc.flags & LIBETH_XDP_TX_MULTI) + if (desc.flags & LIBETH_XDP_TX_MULTI) { + sinfo = sqe->type == LIBETH_SQE_XDP_TX ? + sqe->sinfo : + xdp_get_shared_info_from_frame(sqe->xdpf); full_len += sinfo->xdp_frags_size; + } tx_desc->read.olinfo_status = cpu_to_le32((full_len << IXGBE_ADVTXD_PAYLEN_SHIFT) | @@ -790,77 +793,43 @@ static void ixgbevf_xdp_xmit_desc(struct libeth_xdp_tx_desc desc, u32 i, tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); } -LIBETH_XDP_DEFINE_START(); -LIBETH_XDP_DEFINE_FLUSH_TX(static ixgbevf_xdp_flush_tx, ixgbevf_prep_xdp_sq, - ixgbevf_xdp_xmit_desc); -LIBETH_XDP_DEFINE_END(); - -static void ixgbevf_xdp_set_rs(struct ixgbevf_ring *xdp_ring, u32 cached_ntu) +static void ixgbevf_xdp_rs_and_bump(void *xdpsq, bool sent, bool flush) { - u32 ltu = (xdp_ring->next_to_use ? 
: xdp_ring->count) - 1; + struct ixgbevf_ring *xdp_ring = xdpsq; union ixgbe_adv_tx_desc *desc; + u32 ltu; - desc = IXGBEVF_TX_DESC(xdp_ring, ltu); - xdp_ring->xdp_sqes[cached_ntu].rs_idx = ltu + 1; - desc->read.cmd_type_len |= cpu_to_le32(IXGBE_TXD_CMD); -} - -static void ixgbevf_rx_finalize_xdp(struct libeth_xdp_tx_bulk *tx_bulk, - bool xdp_xmit, u32 cached_ntu) -{ - struct ixgbevf_ring *xdp_ring = tx_bulk->xdpsq; - - if (!xdp_xmit) - goto unlock; + libeth_xdpsq_lock(&xdp_ring->xdpq_lock); - if (tx_bulk->count) - ixgbevf_xdp_flush_tx(tx_bulk, LIBETH_XDP_TX_DROP); + if ((!flush && xdp_ring->pending < xdp_ring->count - 1) || + xdp_ring->cached_ntu == xdp_ring->next_to_use) { + libeth_xdpsq_unlock(&xdp_ring->xdpq_lock); + return; + } - ixgbevf_xdp_set_rs(xdp_ring, cached_ntu); + ltu = (xdp_ring->next_to_use ? : xdp_ring->count) - 1; + desc = IXGBEVF_TX_DESC(xdp_ring, ltu); + xdp_ring->xdp_sqes[xdp_ring->cached_ntu].rs_idx = ltu + 1; + desc->read.cmd_type_len |= cpu_to_le32(IXGBE_TXD_CMD); + xdp_ring->cached_ntu = xdp_ring->next_to_use; /* Finish descriptor writes before bumping tail */ wmb(); ixgbevf_write_tail(xdp_ring, xdp_ring->next_to_use); -unlock: - rcu_read_unlock(); -} - -static int ixgbevf_run_xdp(struct libeth_xdp_tx_bulk *tx_bulk, - struct libeth_xdp_buff *xdp) -{ - int result = IXGBEVF_XDP_PASS; - const struct bpf_prog *xdp_prog; - u32 act; - - xdp_prog = tx_bulk->prog; - if (!xdp_prog) - goto xdp_out; - act = bpf_prog_run_xdp(xdp_prog, &xdp->base); - switch (act) { - case XDP_PASS: - break; - case XDP_TX: - result = IXGBEVF_XDP_TX; - if (!libeth_xdp_tx_queue_bulk(tx_bulk, xdp, - ixgbevf_xdp_flush_tx)) - result = IXGBEVF_XDP_CONSUMED; - break; - default: - bpf_warn_invalid_xdp_action(tx_bulk->dev, xdp_prog, act); - fallthrough; - case XDP_ABORTED: - trace_xdp_exception(tx_bulk->dev, xdp_prog, act); - fallthrough; /* handle aborts by dropping packet */ - case XDP_DROP: - result = IXGBEVF_XDP_CONSUMED; - libeth_xdp_return_buff(xdp); - break; - } -xdp_out: 
- return result; + libeth_xdpsq_unlock(&xdp_ring->xdpq_lock); } +LIBETH_XDP_DEFINE_START(); +LIBETH_XDP_DEFINE_FLUSH_TX(static ixgbevf_xdp_flush_tx, ixgbevf_prep_xdp_sq, + ixgbevf_xdp_xmit_desc); +LIBETH_XDP_DEFINE_FLUSH_XMIT(static ixgbevf_xdp_flush_xmit, ixgbevf_prep_xdp_sq, + ixgbevf_xdp_xmit_desc); +LIBETH_XDP_DEFINE_RUN_PROG(static ixgbevf_xdp_run_prog, ixgbevf_xdp_flush_tx); +LIBETH_XDP_DEFINE_FINALIZE(static ixgbevf_xdp_finalize_xdp_napi, + ixgbevf_xdp_flush_tx, ixgbevf_xdp_rs_and_bump); +LIBETH_XDP_DEFINE_END(); + static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *rx_ring, int budget) @@ -870,17 +839,11 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, u16 cleaned_count = ixgbevf_desc_unused(rx_ring); LIBETH_XDP_ONSTACK_BULK(xdp_tx_bulk); LIBETH_XDP_ONSTACK_BUFF(xdp); - u32 cached_ntu; - bool xdp_xmit = false; - int xdp_res = 0; libeth_xdp_init_buff(xdp, &rx_ring->xdp_stash, &rx_ring->xdp_rxq); libeth_xdp_tx_init_bulk(&xdp_tx_bulk, rx_ring->xdp_prog, adapter->netdev, adapter->xdp_ring, adapter->num_xdp_queues); - if (xdp_tx_bulk.prog) - cached_ntu = - ((struct ixgbevf_ring *)xdp_tx_bulk.xdpsq)->next_to_use; while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; @@ -916,14 +879,9 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, total_rx_packets++; total_rx_bytes += xdp_get_buff_len(&xdp->base); - xdp_res = ixgbevf_run_xdp(&xdp_tx_bulk, xdp); - if (xdp_res) { - if (xdp_res == IXGBEVF_XDP_TX) - xdp_xmit = true; - - xdp->data = NULL; + if (xdp_tx_bulk.prog && + !ixgbevf_xdp_run_prog(xdp, &xdp_tx_bulk)) continue; - } skb = xdp_build_skb_from_buff(&xdp->base); @@ -962,7 +920,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* place incomplete frames back on ring for completion */ libeth_xdp_save_buff(&rx_ring->xdp_stash, xdp); - ixgbevf_rx_finalize_xdp(&xdp_tx_bulk, xdp_xmit, cached_ntu); + ixgbevf_xdp_finalize_xdp_napi(&xdp_tx_bulk); 
u64_stats_update_begin(&rx_ring->syncp); rx_ring->stats.packets += total_rx_packets; @@ -974,6 +932,23 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, return total_rx_packets; } +static int ixgbevf_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) +{ + struct ixgbevf_adapter *adapter = netdev_priv(dev); + + if (unlikely(test_bit(__IXGBEVF_DOWN, &adapter->state))) + return -ENETDOWN; + + if (unlikely(!adapter->num_xdp_queues)) + return -ENXIO; + + return libeth_xdp_xmit_do_bulk(dev, n, frames, flags, adapter->xdp_ring, + adapter->num_xdp_queues, + ixgbevf_xdp_flush_xmit, + ixgbevf_xdp_rs_and_bump); +} + /** * ixgbevf_poll - NAPI polling calback * @napi: napi struct with our devices info in it @@ -1434,6 +1409,7 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, ring->next_to_clean = 0; ring->next_to_use = 0; ring->pending = 0; + ring->cached_ntu = 0; /* In order to avoid issues WTHRESH + PTHRESH should always be equal * to or less than the number of on chip descriptors, which is @@ -1446,12 +1422,15 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, 32; /* PTHRESH = 32 */ /* reinitialize tx_buffer_info */ - if (!ring_is_xdp(ring)) + if (!ring_is_xdp(ring)) { memset(ring->tx_buffer_info, 0, sizeof(struct ixgbevf_tx_buffer) * ring->count); - else + } else { memset(ring->xdp_sqes, 0, sizeof(struct libeth_sqe) * ring->count); + libeth_xdpsq_get(&ring->xdpq_lock, ring->netdev, + num_possible_cpus() > adapter->num_xdp_queues); + } clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state); clear_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state); @@ -4178,6 +4157,8 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog, /* If transitioning XDP modes reconfigure rings */ if (!!prog != !!old_prog) { + xdp_features_clear_redirect_target(dev); + /* Hardware has to reinitialize queues and interrupts to * match packet buffer alignment. 
Unfortunately, the * hardware is not flexible enough to do this dynamically. @@ -4197,6 +4178,9 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog, synchronize_net(); } + if (prog) + xdp_features_set_redirect_target(dev, true); + if (old_prog) bpf_prog_put(old_prog); @@ -4227,6 +4211,7 @@ static const struct net_device_ops ixgbevf_netdev_ops = { .ndo_vlan_rx_kill_vid = ixgbevf_vlan_rx_kill_vid, .ndo_features_check = ixgbevf_features_check, .ndo_bpf = ixgbevf_xdp, + .ndo_xdp_xmit = ixgbevf_xdp_xmit, }; static void ixgbevf_assign_netdev_ops(struct net_device *dev) @@ -4359,7 +4344,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_HW_VLAN_CTAG_TX; netdev->priv_flags |= IFF_UNICAST_FLT; - netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_RX_SG; + libeth_xdp_set_features_noredir(netdev, NULL, 0, NULL); /* MTU range: 68 - 1504 or 9710 */ netdev->min_mtu = ETH_MIN_MTU; diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index 898723ab62e812..2e2154ccecae61 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -1094,7 +1094,7 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, * @xqs: array of XDPSQs driver structs * @nqs: number of active XDPSQs, the above array length * @fl: driver callback to flush an XDP xmit bulk - * @fin: driver cabback to finalize the queue + * @fin: driver callback to finalize the queue * * If the driver has active XDPSQs, perform common checks and send the frames. * Finalize the queue, if requested. From a9ab74d4640c829c45aade70988d8b40f50214f6 Mon Sep 17 00:00:00 2001 From: Natalia Wochtman Date: Fri, 19 Sep 2025 15:30:15 +0200 Subject: [PATCH 12/23] ixgbevf: add pseudo header split Introduce pseudo header split support in the ixgbevf driver, specifically targeting ixgbe_mac_82599_vf. On older hardware (e.g. ixgbe_mac_82599_vf), RX DMA write size can only be limited in 1K increments. 
This causes issues when attempting to fit multiple packets per page, as a DMA write may overwrite the headroom of the next packet. To address this, introduce pseudo header split support, where the driver copies the L2 header (or the entire packet, when it is no longer than L1_CACHE_BYTES) from the data buffer into a dedicated header buffer. This avoids the need for HR/TR alignment and allows safe skb construction from the header buffer without risking overwrites. Given that once a packet is too big to fit into a single page, the behaviour is the same for all supported HW, use pseudo header split only for smaller packets. Signed-off-by: Natalia Wochtman Reviewed-by: Aleksandr Loktionov Co-developed-by: Larysa Zaremba Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 7 + .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 181 +++++++++++++++--- 2 files changed, 164 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index ea86679e4f81d0..17958cfb4ee65b 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -89,6 +89,7 @@ struct ixgbevf_ring { u32 truesize; /* Rx buffer full size */ u32 pending; /* Sent-not-completed descriptors */ }; + u32 hdr_truesize; /* Rx header buffer full size */ u16 count; /* amount of descriptors */ u16 next_to_clean; u32 next_to_use; @@ -107,6 +108,8 @@ struct ixgbevf_ring { struct ixgbevf_tx_queue_stats tx_stats; struct ixgbevf_rx_queue_stats rx_stats; }; + struct libeth_fqe *hdr_fqes; + struct page_pool *hdr_pp; struct xdp_rxq_info xdp_rxq; u64 hw_csum_rx_error; u8 __iomem *tail; @@ -151,6 +154,8 @@ struct ixgbevf_ring { #define IXGBEVF_RX_PAGE_LEN(hr) (ALIGN_DOWN(LIBETH_RX_PAGE_LEN(hr), \ IXGBE_SRRCTL_BSIZEPKT_STEP)) +#define IXGBEVF_RX_SRRCTL_BUF_SIZE(mtu) (ALIGN((mtu) + LIBETH_RX_LL_LEN, \ + IXGBE_SRRCTL_BSIZEPKT_STEP)) #define IXGBE_TX_FLAGS_CSUM BIT(0) #define IXGBE_TX_FLAGS_VLAN BIT(1) @@ -349,6 +354,8 @@ enum ixbgevf_state_t {
__IXGBEVF_QUEUE_RESET_REQUESTED, }; +#define IXGBEVF_FLAG_HSPLIT BIT(0) + enum ixgbevf_boards { board_82599_vf, board_82599_vf_hv, diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 8a137e2f78f827..eb6581828f4180 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -561,6 +561,12 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, .truesize = rx_ring->truesize, .count = rx_ring->count, }; + const struct libeth_fq_fp hdr_fq = { + .pp = rx_ring->hdr_pp, + .fqes = rx_ring->hdr_fqes, + .truesize = rx_ring->hdr_truesize, + .count = rx_ring->count, + }; u16 ntu = rx_ring->next_to_use; /* nothing to do or no valid netdev defined */ @@ -578,6 +584,14 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, rx_desc->read.pkt_addr = cpu_to_le64(addr); + if (hdr_fq.pp) { + addr = libeth_rx_alloc(&hdr_fq, ntu); + if (addr == DMA_MAPPING_ERROR) { + libeth_rx_recycle_slow(fq.fqes[ntu].netmem); + break; + } + } + rx_desc++; ntu++; if (unlikely(ntu == fq.count)) { @@ -830,6 +844,32 @@ LIBETH_XDP_DEFINE_FINALIZE(static ixgbevf_xdp_finalize_xdp_napi, ixgbevf_xdp_flush_tx, ixgbevf_xdp_rs_and_bump); LIBETH_XDP_DEFINE_END(); +static u32 ixgbevf_rx_hsplit_wa(const struct libeth_fqe *hdr, + struct libeth_fqe *buf, u32 data_len) +{ + u32 copy = data_len <= L1_CACHE_BYTES ? 
data_len : ETH_HLEN; + struct page *hdr_page, *buf_page; + const void *src; + void *dst; + + if (unlikely(netmem_is_net_iov(buf->netmem)) || + !libeth_rx_sync_for_cpu(buf, copy)) + return 0; + + hdr_page = __netmem_to_page(hdr->netmem); + buf_page = __netmem_to_page(buf->netmem); + + dst = page_address(hdr_page) + hdr->offset + + pp_page_to_nmdesc(hdr_page)->pp->p.offset; + src = page_address(buf_page) + buf->offset + + pp_page_to_nmdesc(buf_page)->pp->p.offset; + + memcpy(dst, src, LARGEST_ALIGN(copy)); + buf->offset += copy; + + return copy; +} + static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *rx_ring, int budget) @@ -869,6 +909,23 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, rmb(); rx_buffer = &rx_ring->rx_fqes[rx_ring->next_to_clean]; + + if (unlikely(rx_ring->hdr_pp)) { + struct libeth_fqe *hdr_buff; + unsigned int hdr_size = 0; + + hdr_buff = &rx_ring->hdr_fqes[rx_ring->next_to_clean]; + + if (!xdp->data) { + hdr_size = ixgbevf_rx_hsplit_wa(hdr_buff, + rx_buffer, + size); + size -= hdr_size ? 
: size; + } + + libeth_xdp_process_buff(xdp, hdr_buff, hdr_size); + } + libeth_xdp_process_buff(xdp, rx_buffer, size); cleaned_count++; @@ -1605,6 +1662,87 @@ static void ixgbevf_setup_vfmrqc(struct ixgbevf_adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_VFMRQC, vfmrqc); } +static void ixgbevf_rx_destroy_pp(struct ixgbevf_ring *rx_ring) +{ + struct libeth_fq fq = { + .pp = rx_ring->pp, + .fqes = rx_ring->rx_fqes, + }; + + libeth_rx_fq_destroy(&fq); + rx_ring->rx_fqes = NULL; + rx_ring->pp = NULL; + + if (!rx_ring->hdr_pp) + return; + + fq = (struct libeth_fq) { + .pp = rx_ring->hdr_pp, + .fqes = rx_ring->hdr_fqes, + }; + + libeth_rx_fq_destroy(&fq); + rx_ring->hdr_fqes = NULL; + rx_ring->hdr_pp = NULL; +} + +static int ixgbevf_rx_create_pp(struct ixgbevf_ring *rx_ring) +{ + u32 adapter_flags = rx_ring->q_vector->adapter->flags; + struct libeth_fq fq = { + .count = rx_ring->count, + .nid = NUMA_NO_NODE, + .type = LIBETH_FQE_MTU, + .xdp = !!rx_ring->xdp_prog, + .buf_len = IXGBEVF_RX_PAGE_LEN(rx_ring->xdp_prog ? 
+ LIBETH_XDP_HEADROOM : + LIBETH_SKB_HEADROOM), + }; + u32 frame_size; + int ret; + + /* Some HW requires DMA write sizes to be aligned to 1K, + * which warrants fake header split usage, but this is + * not an issue if the frame size is at its maximum of 3K + */ + frame_size = + IXGBEVF_RX_SRRCTL_BUF_SIZE(READ_ONCE(rx_ring->netdev->mtu)); + fq.hsplit = (adapter_flags & IXGBEVF_FLAG_HSPLIT) && + frame_size < fq.buf_len; + ret = libeth_rx_fq_create(&fq, &rx_ring->q_vector->napi); + if (ret) + return ret; + + rx_ring->pp = fq.pp; + rx_ring->rx_fqes = fq.fqes; + rx_ring->truesize = fq.truesize; + rx_ring->rx_buf_len = fq.buf_len; + + if (!fq.hsplit) + return 0; + + fq = (struct libeth_fq) { + .count = rx_ring->count, + .nid = NUMA_NO_NODE, + .type = LIBETH_FQE_HDR, + .xdp = !!rx_ring->xdp_prog, + }; + + ret = libeth_rx_fq_create(&fq, &rx_ring->q_vector->napi); + if (ret) + goto err; + + rx_ring->hdr_pp = fq.pp; + rx_ring->hdr_fqes = fq.fqes; + rx_ring->hdr_truesize = fq.truesize; + + return 0; + +err: + ixgbevf_rx_destroy_pp(rx_ring); + return ret; +} + static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, struct ixgbevf_ring *ring) { @@ -2062,8 +2200,13 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) /* Free all the Rx ring pages */ for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) { const struct libeth_fqe *rx_fqe = &rx_ring->rx_fqes[i]; + const struct libeth_fqe *hdr_fqe = rx_ring->hdr_fqes ? 
+ &rx_ring->hdr_fqes[i] : + NULL; libeth_rx_recycle_slow(rx_fqe->netmem); + if (hdr_fqe) + libeth_rx_recycle_slow(hdr_fqe->netmem); if (unlikely(++i == rx_ring->count)) i = 0; } @@ -2725,6 +2868,9 @@ static int ixgbevf_sw_init(struct ixgbevf_adapter *adapter) goto out; } + if (adapter->hw.mac.type == ixgbe_mac_82599_vf) + adapter->flags |= IXGBEVF_FLAG_HSPLIT; + /* assume legacy case in which PF would only give VF 2 queues */ hw->mac.max_tx_queues = 2; hw->mac.max_rx_queues = 2; @@ -3159,42 +3305,29 @@ static int ixgbevf_setup_all_tx_resources(struct ixgbevf_adapter *adapter) } /** - * ixgbevf_setup_rx_resources - allocate Rx resources (Descriptors) + * ixgbevf_setup_rx_resources - allocate Rx resources * @adapter: board private structure * @rx_ring: Rx descriptor ring (for a specific queue) to setup * - * Returns 0 on success, negative on failure + * Returns: 0 on success, negative on failure. **/ int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, struct ixgbevf_ring *rx_ring) { - struct libeth_fq fq = { - .count = rx_ring->count, - .nid = NUMA_NO_NODE, - .type = LIBETH_FQE_MTU, - .xdp = !!rx_ring->xdp_prog, - .buf_len = IXGBEVF_RX_PAGE_LEN(rx_ring->xdp_prog ? 
- LIBETH_XDP_HEADROOM : - LIBETH_SKB_HEADROOM), - }; int ret; - ret = libeth_rx_fq_create(&fq, &rx_ring->q_vector->napi); + ret = ixgbevf_rx_create_pp(rx_ring); if (ret) return ret; - rx_ring->pp = fq.pp; - rx_ring->rx_fqes = fq.fqes; - rx_ring->truesize = fq.truesize; - rx_ring->rx_buf_len = fq.buf_len; - u64_stats_init(&rx_ring->syncp); /* Round up to nearest 4K */ rx_ring->dma_size = rx_ring->count * sizeof(union ixgbe_adv_rx_desc); rx_ring->dma_size = ALIGN(rx_ring->dma_size, 4096); - rx_ring->desc = dma_alloc_coherent(fq.pp->p.dev, rx_ring->dma_size, + rx_ring->desc = dma_alloc_coherent(rx_ring->pp->p.dev, + rx_ring->dma_size, &rx_ring->dma, GFP_KERNEL); if (!rx_ring->desc) { @@ -3208,16 +3341,15 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, if (ret) goto err; - xdp_rxq_info_attach_page_pool(&rx_ring->xdp_rxq, fq.pp); + xdp_rxq_info_attach_page_pool(&rx_ring->xdp_rxq, rx_ring->pp); rcu_assign_pointer(rx_ring->xdp_prog, adapter->xdp_prog); return 0; err: - libeth_rx_fq_destroy(&fq); - rx_ring->rx_fqes = NULL; - rx_ring->pp = NULL; + ixgbevf_rx_destroy_pp(rx_ring); dev_err(rx_ring->dev, "Unable to allocate memory for the Rx descriptor ring\n"); + return ret; } @@ -4146,10 +4278,11 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog, struct bpf_prog *old_prog; bool requires_mbuf; - requires_mbuf = frame_size > IXGBEVF_RX_PAGE_LEN(LIBETH_XDP_HEADROOM); + requires_mbuf = frame_size > IXGBEVF_RX_PAGE_LEN(LIBETH_XDP_HEADROOM) || + adapter->flags & IXGBEVF_FLAG_HSPLIT; if (prog && !prog->aux->xdp_has_frags && requires_mbuf) { NL_SET_ERR_MSG_MOD(extack, - "Configured MTU requires non-linear frames and XDP prog does not support frags"); + "Configured MTU or HW limitations require non-linear frames and XDP prog does not support frags"); return -EOPNOTSUPP; } From 096cf89d05e0e3792a14a20ad7fe138cf655e1a2 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 25 Nov 2025 16:31:14 +0100 Subject: [PATCH 13/23] ixgbevf: 
reconfigure page pool when reallocating buffers Currently, when MTU is changed, page pool is not reconfigured, which leads to usage of suboptimal buffer sizes. Always destroy page pool when cleaning the ring up and create it anew when we first allocate Rx buffers. Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 80 ++++++++++--------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index eb6581828f4180..b1ae29b0344a93 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -569,8 +569,8 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, }; u16 ntu = rx_ring->next_to_use; - /* nothing to do or no valid netdev defined */ - if (unlikely(!cleaned_count || !rx_ring->netdev)) + /* nothing to do or page pool is not present */ + if (unlikely(!cleaned_count || !fq.pp)) return; rx_desc = IXGBEVF_RX_DESC(rx_ring, ntu); @@ -1669,6 +1669,14 @@ static void ixgbevf_rx_destroy_pp(struct ixgbevf_ring *rx_ring) .fqes = rx_ring->rx_fqes, }; + if (!fq.pp) + return; + + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) { + xdp_rxq_info_detach_mem_model(&rx_ring->xdp_rxq); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + } + libeth_rx_fq_destroy(&fq); rx_ring->rx_fqes = NULL; rx_ring->pp = NULL; @@ -1718,6 +1726,14 @@ static int ixgbevf_rx_create_pp(struct ixgbevf_ring *rx_ring) rx_ring->truesize = fq.truesize; rx_ring->rx_buf_len = fq.buf_len; + /* XDP RX-queue info */ + ret = __xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, + rx_ring->queue_index, 0, rx_ring->truesize); + if (ret) + goto err; + + xdp_rxq_info_attach_page_pool(&rx_ring->xdp_rxq, rx_ring->pp); + if (!fq.hsplit) return 0; @@ -1752,6 +1768,7 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, bool rlpml_valid = false; u64 rdba = ring->dma; u32 
rxdctl; + int err; /* disable queue to avoid issues while updating state */ rxdctl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(reg_idx)); @@ -1785,6 +1802,14 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, ring->next_to_clean = 0; ring->next_to_use = 0; + err = ixgbevf_rx_create_pp(ring); + if (err) { + netdev_err(ring->netdev, + "Failed to create Page Pool for buffer allocation: (%pe), RxQ %d is disabled, driver reload may be needed\n", + ERR_PTR(err), ring->queue_index); + return; + } + /* RXDCTL.RLPML does not work on 82599 */ if (adapter->hw.mac.type != ixgbe_mac_82599_vf) { u32 pkt_len = @@ -2282,8 +2307,10 @@ static void ixgbevf_clean_all_rx_rings(struct ixgbevf_adapter *adapter) { int i; - for (i = 0; i < adapter->num_rx_queues; i++) + for (i = 0; i < adapter->num_rx_queues; i++) { ixgbevf_clean_rx_ring(adapter->rx_ring[i]); + ixgbevf_rx_destroy_pp(adapter->rx_ring[i]); + } } /** @@ -3304,6 +3331,11 @@ static int ixgbevf_setup_all_tx_resources(struct ixgbevf_adapter *adapter) return err; } +static struct device *ixgbevf_dma_dev_from_ring(struct ixgbevf_ring *ring) +{ + return &ring->q_vector->adapter->pdev->dev; +} + /** * ixgbevf_setup_rx_resources - allocate Rx resources * @adapter: board private structure @@ -3314,43 +3346,25 @@ static int ixgbevf_setup_all_tx_resources(struct ixgbevf_adapter *adapter) int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, struct ixgbevf_ring *rx_ring) { - int ret; - - ret = ixgbevf_rx_create_pp(rx_ring); - if (ret) - return ret; - u64_stats_init(&rx_ring->syncp); /* Round up to nearest 4K */ rx_ring->dma_size = rx_ring->count * sizeof(union ixgbe_adv_rx_desc); rx_ring->dma_size = ALIGN(rx_ring->dma_size, 4096); - rx_ring->desc = dma_alloc_coherent(rx_ring->pp->p.dev, + rx_ring->desc = dma_alloc_coherent(ixgbevf_dma_dev_from_ring(rx_ring), rx_ring->dma_size, &rx_ring->dma, GFP_KERNEL); if (!rx_ring->desc) { - ret = -ENOMEM; - goto err; + dev_err(rx_ring->dev, + "Unable to allocate memory for the 
Rx descriptor ring\n"); + return -ENOMEM; } - /* XDP RX-queue info */ - ret = __xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, - rx_ring->queue_index, 0, rx_ring->truesize); - if (ret) - goto err; - - xdp_rxq_info_attach_page_pool(&rx_ring->xdp_rxq, rx_ring->pp); - rcu_assign_pointer(rx_ring->xdp_prog, adapter->xdp_prog); return 0; -err: - ixgbevf_rx_destroy_pp(rx_ring); - dev_err(rx_ring->dev, "Unable to allocate memory for the Rx descriptor ring\n"); - - return ret; } /** @@ -3391,24 +3405,14 @@ static int ixgbevf_setup_all_rx_resources(struct ixgbevf_adapter *adapter) **/ void ixgbevf_free_rx_resources(struct ixgbevf_ring *rx_ring) { - struct libeth_fq fq = { - .fqes = rx_ring->rx_fqes, - .pp = rx_ring->pp, - }; - ixgbevf_clean_rx_ring(rx_ring); - + ixgbevf_rx_destroy_pp(rx_ring); rcu_assign_pointer(rx_ring->xdp_prog, NULL); - xdp_rxq_info_detach_mem_model(&rx_ring->xdp_rxq); - xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - dma_free_coherent(fq.pp->p.dev, rx_ring->dma_size, rx_ring->desc, + dma_free_coherent(ixgbevf_dma_dev_from_ring(rx_ring), + rx_ring->dma_size, rx_ring->desc, rx_ring->dma); rx_ring->desc = NULL; - - libeth_rx_fq_destroy(&fq); - rx_ring->rx_fqes = NULL; - rx_ring->pp = NULL; } /** From ffc83fd9468538517d15a67d06db05f1b18bea63 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 8 Jan 2026 13:44:03 +0100 Subject: [PATCH 14/23] ixgbevf: allow changing MTU when XDP program is attached xskxceiver attempts to change the MTU after attaching an XDP program; ixgbevf rejects the request, causing the test to fail. Support the MTU change operation even when an XDP program is already attached, performing the same frame size check as when attaching a program.
Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index b1ae29b0344a93..8978f57452194c 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4098,6 +4098,18 @@ static int ixgbevf_set_mac(struct net_device *netdev, void *p) return 0; } +static bool ixgbevf_xdp_mtu_ok(const struct ixgbevf_adapter *adapter, + const struct bpf_prog *prog, unsigned int mtu) +{ + u32 frame_size = mtu + LIBETH_RX_LL_LEN; + bool requires_mbuf; + + requires_mbuf = frame_size > IXGBEVF_RX_PAGE_LEN(LIBETH_XDP_HEADROOM) || + adapter->flags & IXGBEVF_FLAG_HSPLIT; + + return prog->aux->xdp_has_frags || !requires_mbuf; +} + /** * ixgbevf_change_mtu - Change the Maximum Transfer Unit * @netdev: network interface device structure @@ -4113,8 +4125,10 @@ static int ixgbevf_change_mtu(struct net_device *netdev, int new_mtu) int ret; /* prevent MTU being changed to a size unsupported by XDP */ - if (adapter->xdp_prog) { - dev_warn(&adapter->pdev->dev, "MTU cannot be changed while XDP program is loaded\n"); + if (adapter->xdp_prog && + !ixgbevf_xdp_mtu_ok(adapter, adapter->xdp_prog, new_mtu)) { + netdev_warn(netdev, + "MTU value provided cannot be set while current XDP program is attached\n"); return -EPERM; } @@ -4277,14 +4291,10 @@ ixgbevf_features_check(struct sk_buff *skb, struct net_device *dev, static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { - u32 frame_size = READ_ONCE(dev->mtu) + LIBETH_RX_LL_LEN; struct ixgbevf_adapter *adapter = netdev_priv(dev); struct bpf_prog *old_prog; - bool requires_mbuf; - requires_mbuf = frame_size > IXGBEVF_RX_PAGE_LEN(LIBETH_XDP_HEADROOM) || - adapter->flags & IXGBEVF_FLAG_HSPLIT; - if (prog 
&& !prog->aux->xdp_has_frags && requires_mbuf) { + if (prog && !ixgbevf_xdp_mtu_ok(adapter, prog, READ_ONCE(dev->mtu))) { NL_SET_ERR_MSG_MOD(extack, "Configured MTU or HW limitations require non-linear frames and XDP prog does not support frags"); return -EOPNOTSUPP; From b460fa643b9f18782970576746a0aa3531e5976d Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 22 Oct 2025 18:19:46 +0200 Subject: [PATCH 15/23] ixgbevf: add a helper to flush Tx queue The same register write operation is already used twice in code, it will be used again by AF_XDP configuration. Wrap it in a helper function. Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 8978f57452194c..96133b063e7e94 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -2327,10 +2327,17 @@ static void ixgbevf_clean_all_tx_rings(struct ixgbevf_adapter *adapter) ixgbevf_clean_xdp_ring(adapter->xdp_ring[i]); } +static void ixgbevf_flush_tx_queue(struct ixgbevf_ring *ring) +{ + u8 reg_idx = ring->reg_idx; + + IXGBE_WRITE_REG(&ring->q_vector->adapter->hw, IXGBE_VFTXDCTL(reg_idx), + IXGBE_TXDCTL_SWFLSH); +} + void ixgbevf_down(struct ixgbevf_adapter *adapter) { struct net_device *netdev = adapter->netdev; - struct ixgbe_hw *hw = &adapter->hw; int i; /* signal that we are down to the interrupt handler */ @@ -2356,19 +2363,11 @@ void ixgbevf_down(struct ixgbevf_adapter *adapter) timer_delete_sync(&adapter->service_timer); /* disable transmits in the hardware now that interrupts are off */ - for (i = 0; i < adapter->num_tx_queues; i++) { - u8 reg_idx = adapter->tx_ring[i]->reg_idx; - - IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(reg_idx), - IXGBE_TXDCTL_SWFLSH); - } - - for (i = 0; i < adapter->num_xdp_queues; i++) { - u8 reg_idx = 
adapter->xdp_ring[i]->reg_idx; + for (i = 0; i < adapter->num_tx_queues; i++) + ixgbevf_flush_tx_queue(adapter->tx_ring[i]); - IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(reg_idx), - IXGBE_TXDCTL_SWFLSH); - } + for (i = 0; i < adapter->num_xdp_queues; i++) + ixgbevf_flush_tx_queue(adapter->xdp_ring[i]); if (!pci_channel_offline(adapter->pdev)) ixgbevf_reset(adapter); From 105a17e2ed4458d9c26e2c5f7eb2502142d69911 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 24 Oct 2025 12:19:51 +0200 Subject: [PATCH 16/23] ixgbevf: move skb-filling code to a header AF_XDP ZC Rx path is also required to implement skb creation. Move all common functions to a header file as inlines. Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 195 +----------------- .../ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h | 192 +++++++++++++++++ 2 files changed, 194 insertions(+), 193 deletions(-) create mode 100644 drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 96133b063e7e94..26649934a8159c 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -33,7 +33,7 @@ #include #include -#include "ixgbevf.h" +#include "ixgbevf_txrx_lib.h" const char ixgbevf_driver_name[] = "ixgbevf"; static const char ixgbevf_driver_string[] = @@ -418,134 +418,6 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector, return !!budget; } -/** - * ixgbevf_rx_skb - Helper function to determine proper Rx method - * @q_vector: structure containing interrupt and ring information - * @skb: packet to send up - **/ -static void ixgbevf_rx_skb(struct ixgbevf_q_vector *q_vector, - struct sk_buff *skb) -{ - napi_gro_receive(&q_vector->napi, skb); -} - -#define IXGBE_RSS_L4_TYPES_MASK \ - ((1ul << IXGBE_RXDADV_RSSTYPE_IPV4_TCP) | \ - (1ul << IXGBE_RXDADV_RSSTYPE_IPV4_UDP) | \ - (1ul << 
IXGBE_RXDADV_RSSTYPE_IPV6_TCP) | \ - (1ul << IXGBE_RXDADV_RSSTYPE_IPV6_UDP)) - -static inline void ixgbevf_rx_hash(struct ixgbevf_ring *ring, - union ixgbe_adv_rx_desc *rx_desc, - struct sk_buff *skb) -{ - u16 rss_type; - - if (!(ring->netdev->features & NETIF_F_RXHASH)) - return; - - rss_type = le16_to_cpu(rx_desc->wb.lower.lo_dword.hs_rss.pkt_info) & - IXGBE_RXDADV_RSSTYPE_MASK; - - if (!rss_type) - return; - - skb_set_hash(skb, le32_to_cpu(rx_desc->wb.lower.hi_dword.rss), - (IXGBE_RSS_L4_TYPES_MASK & (1ul << rss_type)) ? - PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3); -} - -/** - * ixgbevf_rx_checksum - indicate in skb if hw indicated a good cksum - * @ring: structure containig ring specific data - * @rx_desc: current Rx descriptor being processed - * @skb: skb currently being received and modified - **/ -static inline void ixgbevf_rx_checksum(struct ixgbevf_ring *ring, - union ixgbe_adv_rx_desc *rx_desc, - struct sk_buff *skb) -{ - skb_checksum_none_assert(skb); - - /* Rx csum disabled */ - if (!(ring->netdev->features & NETIF_F_RXCSUM)) - return; - - /* if IP and error */ - if (ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_IPCS) && - ixgbevf_test_staterr(rx_desc, IXGBE_RXDADV_ERR_IPE)) { - ring->rx_stats.csum_err++; - return; - } - - if (!ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_L4CS)) - return; - - if (ixgbevf_test_staterr(rx_desc, IXGBE_RXDADV_ERR_TCPE)) { - ring->rx_stats.csum_err++; - return; - } - - /* It must be a TCP or UDP packet with a valid checksum */ - skb->ip_summed = CHECKSUM_UNNECESSARY; -} - -/** - * ixgbevf_process_skb_fields - Populate skb header fields from Rx descriptor - * @rx_ring: rx descriptor ring packet is being transacted on - * @rx_desc: pointer to the EOP Rx descriptor - * @skb: pointer to current skb being populated - * - * This function checks the ring, descriptor, and packet information in - * order to populate the checksum, VLAN, protocol, and other fields within - * the skb. 
- **/ -static void ixgbevf_process_skb_fields(struct ixgbevf_ring *rx_ring, - union ixgbe_adv_rx_desc *rx_desc, - struct sk_buff *skb) -{ - ixgbevf_rx_hash(rx_ring, rx_desc, skb); - ixgbevf_rx_checksum(rx_ring, rx_desc, skb); - - if (ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_VP)) { - u16 vid = le16_to_cpu(rx_desc->wb.upper.vlan); - unsigned long *active_vlans = netdev_priv(rx_ring->netdev); - - if (test_bit(vid & VLAN_VID_MASK, active_vlans)) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vid); - } - - if (ixgbevf_test_staterr(rx_desc, IXGBE_RXDADV_STAT_SECP)) - ixgbevf_ipsec_rx(rx_ring, rx_desc, skb); -} - -/** - * ixgbevf_is_non_eop - process handling of non-EOP buffers - * @rx_ring: Rx ring being processed - * @rx_desc: Rx descriptor for current buffer - * - * This function updates next to clean. If the buffer is an EOP buffer - * this function exits returning false, otherwise it will place the - * sk_buff in the next buffer to be chained and return true indicating - * that this is in fact a non-EOP buffer. - **/ -static bool ixgbevf_is_non_eop(struct ixgbevf_ring *rx_ring, - union ixgbe_adv_rx_desc *rx_desc) -{ - u32 ntc = rx_ring->next_to_clean + 1; - - /* fetch, update, and store next to clean */ - ntc = (ntc < rx_ring->count) ? 
ntc : 0; - rx_ring->next_to_clean = ntc; - - prefetch(IXGBEVF_RX_DESC(rx_ring, ntc)); - - if (likely(ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))) - return false; - - return true; -} - /** * ixgbevf_alloc_rx_buffers - Replace used receive buffers; packet split * @rx_ring: rx descriptor ring (for a specific queue) to setup buffers on @@ -619,42 +491,6 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, } } -/** - * ixgbevf_cleanup_headers - Correct corrupted or empty headers - * @rx_ring: rx descriptor ring packet is being transacted on - * @rx_desc: pointer to the EOP Rx descriptor - * @skb: pointer to current skb being fixed - * - * Check for corrupted packet headers caused by senders on the local L2 - * embedded NIC switch not setting up their Tx Descriptors right. These - * should be very rare. - * - * Also address the case where we are pulling data in on pages only - * and as such no data is present in the skb header. - * - * In addition if skb is not at least 60 bytes we need to pad it so that - * it is large enough to qualify as a valid Ethernet frame. - * - * Returns true if an error was encountered and skb was freed. 
- **/ -static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring, - union ixgbe_adv_rx_desc *rx_desc, - struct sk_buff *skb) -{ - /* verify that the packet does not have any known errors */ - if (unlikely(ixgbevf_test_staterr(rx_desc, - IXGBE_RXDADV_ERR_FRAME_ERR_MASK))) { - struct net_device *netdev = rx_ring->netdev; - - if (!(netdev->features & NETIF_F_RXALL)) { - dev_kfree_skb_any(skb); - return true; - } - } - - return false; -} - static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, u32 qmask) { @@ -807,33 +643,6 @@ static void ixgbevf_xdp_xmit_desc(struct libeth_xdp_tx_desc desc, u32 i, tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); } -static void ixgbevf_xdp_rs_and_bump(void *xdpsq, bool sent, bool flush) -{ - struct ixgbevf_ring *xdp_ring = xdpsq; - union ixgbe_adv_tx_desc *desc; - u32 ltu; - - libeth_xdpsq_lock(&xdp_ring->xdpq_lock); - - if ((!flush && xdp_ring->pending < xdp_ring->count - 1) || - xdp_ring->cached_ntu == xdp_ring->next_to_use) { - libeth_xdpsq_unlock(&xdp_ring->xdpq_lock); - return; - } - - ltu = (xdp_ring->next_to_use ? 
: xdp_ring->count) - 1; - desc = IXGBEVF_TX_DESC(xdp_ring, ltu); - xdp_ring->xdp_sqes[xdp_ring->cached_ntu].rs_idx = ltu + 1; - desc->read.cmd_type_len |= cpu_to_le32(IXGBE_TXD_CMD); - xdp_ring->cached_ntu = xdp_ring->next_to_use; - - /* Finish descriptor writes before bumping tail */ - wmb(); - ixgbevf_write_tail(xdp_ring, xdp_ring->next_to_use); - - libeth_xdpsq_unlock(&xdp_ring->xdpq_lock); -} - LIBETH_XDP_DEFINE_START(); LIBETH_XDP_DEFINE_FLUSH_TX(static ixgbevf_xdp_flush_tx, ixgbevf_prep_xdp_sq, ixgbevf_xdp_xmit_desc); @@ -971,7 +780,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* populate checksum, VLAN, and protocol */ ixgbevf_process_skb_fields(rx_ring, rx_desc, skb); - ixgbevf_rx_skb(q_vector, skb); + napi_gro_receive(&q_vector->napi, skb); } /* place incomplete frames back on ring for completion */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h new file mode 100644 index 00000000000000..c96dba45fb2047 --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2010-2026 Intel Corporation */ + +#ifndef _IXGBEVF_TXRX_LIB_H_ +#define _IXGBEVF_TXRX_LIB_H_ + +#include + +#include "ixgbevf.h" + +static inline void ixgbevf_xdp_rs_and_bump(void *xdpsq, bool sent, bool flush) +{ + struct ixgbevf_ring *xdp_ring = xdpsq; + union ixgbe_adv_tx_desc *desc; + u32 ltu; + + libeth_xdpsq_lock(&xdp_ring->xdpq_lock); + + if ((!flush && xdp_ring->pending < xdp_ring->count - 1) || + xdp_ring->cached_ntu == xdp_ring->next_to_use) { + libeth_xdpsq_unlock(&xdp_ring->xdpq_lock); + return; + } + + ltu = (xdp_ring->next_to_use ? 
: xdp_ring->count) - 1; + desc = IXGBEVF_TX_DESC(xdp_ring, ltu); + desc->read.cmd_type_len |= cpu_to_le32(IXGBE_TXD_CMD); + + xdp_ring->xdp_sqes[xdp_ring->cached_ntu].rs_idx = ltu + 1; + xdp_ring->cached_ntu = xdp_ring->next_to_use; + + /* Finish descriptor writes before bumping tail */ + wmb(); + ixgbevf_write_tail(xdp_ring, xdp_ring->next_to_use); + + libeth_xdpsq_unlock(&xdp_ring->xdpq_lock); +} + +/** + * ixgbevf_is_non_eop - process handling of non-EOP buffers + * @rx_ring: Rx ring being processed + * @rx_desc: Rx descriptor for current buffer + * + * This function updates next to clean. If the buffer is an EOP buffer + * this function exits returning false, otherwise it will place the + * sk_buff in the next buffer to be chained and return true indicating + * that this is in fact a non-EOP buffer. + **/ +static inline bool ixgbevf_is_non_eop(struct ixgbevf_ring *rx_ring, + union ixgbe_adv_rx_desc *rx_desc) +{ + u32 ntc = rx_ring->next_to_clean + 1; + + /* fetch, update, and store next to clean */ + ntc = (ntc < rx_ring->count) ? ntc : 0; + rx_ring->next_to_clean = ntc; + + prefetch(IXGBEVF_RX_DESC(rx_ring, ntc)); + + if (likely(ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))) + return false; + + return true; +} + +/** + * ixgbevf_cleanup_headers - Correct corrupted or empty headers + * @rx_ring: rx descriptor ring packet is being transacted on + * @rx_desc: pointer to the EOP Rx descriptor + * @skb: pointer to current skb being fixed + * + * Check for corrupted packet headers caused by senders on the local L2 + * embedded NIC switch not setting up their Tx Descriptors right. These + * should be very rare. + * + * Also address the case where we are pulling data in on pages only + * and as such no data is present in the skb header. + * + * In addition if skb is not at least 60 bytes we need to pad it so that + * it is large enough to qualify as a valid Ethernet frame. + * + * Returns true if an error was encountered and skb was freed. 
+ **/ +static inline bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring, + union ixgbe_adv_rx_desc *rx_desc, + struct sk_buff *skb) +{ + /* verify that the packet does not have any known errors */ + if (unlikely(ixgbevf_test_staterr(rx_desc, + IXGBE_RXDADV_ERR_FRAME_ERR_MASK))) { + struct net_device *netdev = rx_ring->netdev; + + if (!(netdev->features & NETIF_F_RXALL)) { + dev_kfree_skb_any(skb); + return true; + } + } + + return false; +} + +#define IXGBE_RSS_L4_TYPES_MASK \ + ((1ul << IXGBE_RXDADV_RSSTYPE_IPV4_TCP) | \ + (1ul << IXGBE_RXDADV_RSSTYPE_IPV4_UDP) | \ + (1ul << IXGBE_RXDADV_RSSTYPE_IPV6_TCP) | \ + (1ul << IXGBE_RXDADV_RSSTYPE_IPV6_UDP)) + +static inline void ixgbevf_rx_hash(struct ixgbevf_ring *ring, + union ixgbe_adv_rx_desc *rx_desc, + struct sk_buff *skb) +{ + u16 rss_type; + + if (!(ring->netdev->features & NETIF_F_RXHASH)) + return; + + rss_type = le16_to_cpu(rx_desc->wb.lower.lo_dword.hs_rss.pkt_info) & + IXGBE_RXDADV_RSSTYPE_MASK; + + if (!rss_type) + return; + + skb_set_hash(skb, le32_to_cpu(rx_desc->wb.lower.hi_dword.rss), + (IXGBE_RSS_L4_TYPES_MASK & (1ul << rss_type)) ? 
+ PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3); +} + +/** + * ixgbevf_rx_checksum - indicate in skb if hw indicated a good cksum + * @ring: structure containing ring specific data + * @rx_desc: current Rx descriptor being processed + * @skb: skb currently being received and modified + **/ +static inline void ixgbevf_rx_checksum(struct ixgbevf_ring *ring, + union ixgbe_adv_rx_desc *rx_desc, + struct sk_buff *skb) +{ + skb_checksum_none_assert(skb); + + /* Rx csum disabled */ + if (!(ring->netdev->features & NETIF_F_RXCSUM)) + return; + + /* if IP and error */ + if (ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_IPCS) && + ixgbevf_test_staterr(rx_desc, IXGBE_RXDADV_ERR_IPE)) { + ring->rx_stats.csum_err++; + return; + } + + if (!ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_L4CS)) + return; + + if (ixgbevf_test_staterr(rx_desc, IXGBE_RXDADV_ERR_TCPE)) { + ring->rx_stats.csum_err++; + return; + } + + /* It must be a TCP or UDP packet with a valid checksum */ + skb->ip_summed = CHECKSUM_UNNECESSARY; +} + +/** + * ixgbevf_process_skb_fields - Populate skb header fields from Rx descriptor + * @rx_ring: rx descriptor ring packet is being transacted on + * @rx_desc: pointer to the EOP Rx descriptor + * @skb: pointer to current skb being populated + * + * This function checks the ring, descriptor, and packet information in + * order to populate the checksum, VLAN, protocol, and other fields within + * the skb. 
+ **/ +static inline void ixgbevf_process_skb_fields(struct ixgbevf_ring *rx_ring, + union ixgbe_adv_rx_desc *rx_desc, + struct sk_buff *skb) +{ + ixgbevf_rx_hash(rx_ring, rx_desc, skb); + ixgbevf_rx_checksum(rx_ring, rx_desc, skb); + + if (ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_VP)) { + u16 vid = le16_to_cpu(rx_desc->wb.upper.vlan); + unsigned long *active_vlans = netdev_priv(rx_ring->netdev); + + if (test_bit(vid & VLAN_VID_MASK, active_vlans)) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vid); + } + + if (ixgbevf_test_staterr(rx_desc, IXGBE_RXDADV_STAT_SECP)) + ixgbevf_ipsec_rx(rx_ring, rx_desc, skb); +} + +#endif /* _IXGBEVF_TXRX_LIB_H_ */ From 604d88721666a372d6cdb67079554c6a34e242d8 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 24 Oct 2025 14:42:09 +0200 Subject: [PATCH 17/23] ixgbevf: move XDP queue management code to a header Plenty of code can be shared between ZC and normal XDP Tx queues. Expose such code through the previously added header file. Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 80 +------------------ .../ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h | 79 ++++++++++++++++++ 2 files changed, 81 insertions(+), 78 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 26649934a8159c..1cb307bcbe0310 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -499,8 +499,8 @@ static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter, IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, qmask); } -static void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, - u16 to_clean) +void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, + u16 to_clean) { struct libeth_xdpsq_napi_stats stats = { }; u32 ntc = xdp_ring->next_to_clean; @@ -525,88 +525,12 @@ static void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, 
xdp_flush_frame_bulk(&cbulk); } -static u16 ixgbevf_tx_get_num_sent(struct ixgbevf_ring *xdp_ring) -{ - u16 ntc = xdp_ring->next_to_clean; - u16 to_clean = 0; - - while (likely(to_clean < xdp_ring->pending)) { - u32 idx = xdp_ring->xdp_sqes[ntc].rs_idx; - union ixgbe_adv_tx_desc *rs_desc; - - if (!idx--) - break; - - rs_desc = IXGBEVF_TX_DESC(xdp_ring, idx); - - if (!(rs_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD))) - break; - - xdp_ring->xdp_sqes[ntc].rs_idx = 0; - - to_clean += - (idx >= ntc ? idx : idx + xdp_ring->count) - ntc + 1; - - ntc = (idx + 1 == xdp_ring->count) ? 0 : idx + 1; - } - - return to_clean; -} - static void ixgbevf_clean_xdp_ring(struct ixgbevf_ring *xdp_ring) { ixgbevf_clean_xdp_num(xdp_ring, false, xdp_ring->pending); libeth_xdpsq_put(&xdp_ring->xdpq_lock, xdp_ring->netdev); } -static u32 ixgbevf_prep_xdp_sq(void *xdpsq, struct libeth_xdpsq *sq) -{ - struct ixgbevf_ring *xdp_ring = xdpsq; - - libeth_xdpsq_lock(&xdp_ring->xdpq_lock); - if (unlikely(ixgbevf_desc_unused(xdp_ring) < LIBETH_XDP_TX_BULK)) { - u16 to_clean = ixgbevf_tx_get_num_sent(xdp_ring); - - if (likely(to_clean)) - ixgbevf_clean_xdp_num(xdp_ring, true, to_clean); - } - - if (unlikely(!test_bit(__IXGBEVF_TX_XDP_RING_PRIMED, - &xdp_ring->state))) { - struct ixgbe_adv_tx_context_desc *context_desc; - - set_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &xdp_ring->state); - - context_desc = IXGBEVF_TX_CTXTDESC(xdp_ring, 0); - context_desc->vlan_macip_lens = - cpu_to_le32(ETH_HLEN << IXGBE_ADVTXD_MACLEN_SHIFT); - context_desc->fceof_saidx = 0; - context_desc->type_tucmd_mlhl = - cpu_to_le32(IXGBE_TXD_CMD_DEXT | - IXGBE_ADVTXD_DTYP_CTXT); - context_desc->mss_l4len_idx = 0; - - xdp_ring->next_to_use = 1; - xdp_ring->pending = 1; - - /* Finish descriptor writes before bumping tail */ - wmb(); - ixgbevf_write_tail(xdp_ring, 1); - } - - *sq = (struct libeth_xdpsq) { - .count = xdp_ring->count, - .descs = xdp_ring->desc, - .lock = &xdp_ring->xdpq_lock, - .ntu = &xdp_ring->next_to_use, - .pending 
= &xdp_ring->pending, - .pool = NULL, - .sqes = xdp_ring->xdp_sqes, - }; - - return ixgbevf_desc_unused(xdp_ring); -} - static void ixgbevf_xdp_xmit_desc(struct libeth_xdp_tx_desc desc, u32 i, const struct libeth_xdpsq *sq, u64 priv) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h index c96dba45fb2047..cd65c323a1ff6a 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h @@ -189,4 +189,83 @@ static inline void ixgbevf_process_skb_fields(struct ixgbevf_ring *rx_ring, ixgbevf_ipsec_rx(rx_ring, rx_desc, skb); } +static inline u16 ixgbevf_tx_get_num_sent(struct ixgbevf_ring *xdp_ring) +{ + u16 ntc = xdp_ring->next_to_clean; + u16 to_clean = 0; + + while (likely(to_clean < xdp_ring->pending)) { + u32 idx = xdp_ring->xdp_sqes[ntc].rs_idx; + union ixgbe_adv_tx_desc *rs_desc; + + if (!idx--) + break; + + rs_desc = IXGBEVF_TX_DESC(xdp_ring, idx); + + if (!(rs_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD))) + break; + + xdp_ring->xdp_sqes[ntc].rs_idx = 0; + + to_clean += + (idx >= ntc ? idx : idx + xdp_ring->count) - ntc + 1; + + ntc = (idx + 1 == xdp_ring->count) ? 
0 : idx + 1; + } + + return to_clean; +} + +void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, + u16 to_clean); + +static inline u32 ixgbevf_prep_xdp_sq(void *xdpsq, struct libeth_xdpsq *sq) +{ + struct ixgbevf_ring *xdp_ring = xdpsq; + + libeth_xdpsq_lock(&xdp_ring->xdpq_lock); + if (unlikely(ixgbevf_desc_unused(xdp_ring) < LIBETH_XDP_TX_BULK)) { + u16 to_clean = ixgbevf_tx_get_num_sent(xdp_ring); + + if (likely(to_clean)) + ixgbevf_clean_xdp_num(xdp_ring, true, to_clean); + } + + if (unlikely(!test_bit(__IXGBEVF_TX_XDP_RING_PRIMED, + &xdp_ring->state))) { + struct ixgbe_adv_tx_context_desc *context_desc; + + set_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &xdp_ring->state); + + context_desc = IXGBEVF_TX_CTXTDESC(xdp_ring, 0); + context_desc->vlan_macip_lens = + cpu_to_le32(ETH_HLEN << IXGBE_ADVTXD_MACLEN_SHIFT); + context_desc->fceof_saidx = 0; + context_desc->type_tucmd_mlhl = + cpu_to_le32(IXGBE_TXD_CMD_DEXT | + IXGBE_ADVTXD_DTYP_CTXT); + context_desc->mss_l4len_idx = 0; + + xdp_ring->next_to_use = 1; + xdp_ring->pending = 1; + + /* Finish descriptor writes before bumping tail */ + wmb(); + ixgbevf_write_tail(xdp_ring, 1); + } + + *sq = (struct libeth_xdpsq) { + .count = xdp_ring->count, + .descs = xdp_ring->desc, + .lock = &xdp_ring->xdpq_lock, + .ntu = &xdp_ring->next_to_use, + .pending = &xdp_ring->pending, + .pool = NULL, + .sqes = xdp_ring->xdp_sqes, + }; + + return ixgbevf_desc_unused(xdp_ring); +} + #endif /* _IXGBEVF_TXRX_LIB_H_ */ From 3fd82a9d502cea72839fa14bbb8f55d7053781ed Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 24 Oct 2025 14:47:35 +0200 Subject: [PATCH 18/23] ixgbevf: handle single context descriptor on an XDP queue Before starting transmission XDP queue first fills a single context descriptor, on which we cannot check DD bit later. This is not a problem in case of XDP_TX and .ndo_xdp_xmit(), because preparation happens only if we already have packets to send. This is different for ZC though. 
Wakeup must trigger queue preparation even if no new packets are queued, hence a single context descriptor can block completions. Modify RS-setting logic to account for such a case. Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h index cd65c323a1ff6a..14eb36717fc4ce 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h @@ -17,12 +17,15 @@ static inline void ixgbevf_xdp_rs_and_bump(void *xdpsq, bool sent, bool flush) libeth_xdpsq_lock(&xdp_ring->xdpq_lock); if ((!flush && xdp_ring->pending < xdp_ring->count - 1) || - xdp_ring->cached_ntu == xdp_ring->next_to_use) { - libeth_xdpsq_unlock(&xdp_ring->xdpq_lock); - return; - } + xdp_ring->cached_ntu == xdp_ring->next_to_use) + goto unlock; ltu = (xdp_ring->next_to_use ? 
: xdp_ring->count) - 1; + + /* We will not get DD on a context descriptor */ + if (unlikely(xdp_ring->xdp_sqes[ltu].type == LIBETH_SQE_CTX)) + goto unlock; + desc = IXGBEVF_TX_DESC(xdp_ring, ltu); desc->read.cmd_type_len |= cpu_to_le32(IXGBE_TXD_CMD); @@ -33,6 +36,7 @@ static inline void ixgbevf_xdp_rs_and_bump(void *xdpsq, bool sent, bool flush) wmb(); ixgbevf_write_tail(xdp_ring, xdp_ring->next_to_use); +unlock: libeth_xdpsq_unlock(&xdp_ring->xdpq_lock); } @@ -249,6 +253,7 @@ static inline u32 ixgbevf_prep_xdp_sq(void *xdpsq, struct libeth_xdpsq *sq) xdp_ring->next_to_use = 1; xdp_ring->pending = 1; + xdp_ring->xdp_sqes[0].type = LIBETH_SQE_CTX; /* Finish descriptor writes before bumping tail */ wmb(); From 4b4822ebde7eaf19fc3504e2faacfb139291adc2 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 27 Oct 2025 13:32:49 +0100 Subject: [PATCH 19/23] ixgbevf: implement AF_XDP ZC initialization Implement xsk_buff_pool configuration and supporting functionality, such as a single queue pair reconfiguration. Also, properly initialize Rx buffers. 
Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/ixgbevf/Makefile | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 32 ++++- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 132 +++++++++++++++--- .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.c | 131 +++++++++++++++++ .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.h | 12 ++ 5 files changed, 286 insertions(+), 23 deletions(-) create mode 100644 drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c create mode 100644 drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h diff --git a/drivers/net/ethernet/intel/ixgbevf/Makefile b/drivers/net/ethernet/intel/ixgbevf/Makefile index 01d3e892f3fa7b..cdae62f25fd926 100644 --- a/drivers/net/ethernet/intel/ixgbevf/Makefile +++ b/drivers/net/ethernet/intel/ixgbevf/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IXGBEVF) += ixgbevf.o -ixgbevf-y := vf.o mbx.o ethtool.o ixgbevf_main.o +ixgbevf-y := vf.o mbx.o ethtool.o ixgbevf_main.o ixgbevf_xsk.o ixgbevf-$(CONFIG_IXGBEVF_IPSEC) += ipsec.o diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 17958cfb4ee65b..d8f841515ca62a 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -66,6 +66,7 @@ enum ixgbevf_ring_state_t { __IXGBEVF_HANG_CHECK_ARMED, __IXGBEVF_TX_XDP_RING, __IXGBEVF_TX_XDP_RING_PRIMED, + __IXGBEVF_RXTX_XSK_RING, }; #define ring_is_xdp(ring) \ @@ -75,6 +76,13 @@ enum ixgbevf_ring_state_t { #define clear_ring_xdp(ring) \ clear_bit(__IXGBEVF_TX_XDP_RING, &(ring)->state) +#define ring_is_xsk(ring) \ + test_bit(__IXGBEVF_RXTX_XSK_RING, &(ring)->state) +#define set_ring_xsk(ring) \ + set_bit(__IXGBEVF_RXTX_XSK_RING, &(ring)->state) +#define clear_ring_xsk(ring) \ + clear_bit(__IXGBEVF_RXTX_XSK_RING, &(ring)->state) + struct ixgbevf_ring { struct ixgbevf_ring *next; struct ixgbevf_q_vector *q_vector; /* backpointer to q_vector */ @@ -85,22 +93,22 @@ struct ixgbevf_ring { struct device *dev; /* Tx ring */ }; void *desc; /* 
descriptor ring memory */ - union { - u32 truesize; /* Rx buffer full size */ - u32 pending; /* Sent-not-completed descriptors */ - }; + u32 truesize; /* Rx buffer full size */ u32 hdr_truesize; /* Rx header buffer full size */ u16 count; /* amount of descriptors */ u16 next_to_clean; u32 next_to_use; + u32 pending; /* Sent-not-completed descriptors */ union { struct libeth_fqe *rx_fqes; + struct libeth_xdp_buff **xsk_fqes; struct ixgbevf_tx_buffer *tx_buffer_info; struct libeth_sqe *xdp_sqes; }; struct libeth_xdpsq_lock xdpq_lock; u32 cached_ntu; + u32 thresh; unsigned long state; struct ixgbevf_stats stats; struct u64_stats_sync syncp; @@ -121,8 +129,10 @@ struct ixgbevf_ring { int queue_index; /* needed for multiqueue queue management */ u32 rx_buf_len; struct libeth_xdp_buff_stash xdp_stash; + struct libeth_xdp_buff *xsk_xdp_head; unsigned int dma_size; /* length in bytes */ dma_addr_t dma; /* phys. address of descriptor ring */ + struct xsk_buff_pool *xsk_pool; /* AF_XDP ZC rings */ } ____cacheline_internodealigned_in_smp; /* How many Rx Buffers do we bundle into one write to the hardware ? 
*/ @@ -399,14 +409,28 @@ int ixgbevf_open(struct net_device *netdev); int ixgbevf_close(struct net_device *netdev); void ixgbevf_up(struct ixgbevf_adapter *adapter); void ixgbevf_down(struct ixgbevf_adapter *adapter); +void ixgbevf_flush_tx_queue(struct ixgbevf_ring *ring); +void ixgbevf_disable_rx_queue(struct ixgbevf_adapter *adapter, + struct ixgbevf_ring *ring); +void ixgbevf_rx_desc_queue_enable(struct ixgbevf_adapter *adapter, + struct ixgbevf_ring *ring); void ixgbevf_reinit_locked(struct ixgbevf_adapter *adapter); void ixgbevf_reset(struct ixgbevf_adapter *adapter); void ixgbevf_set_ethtool_ops(struct net_device *netdev); int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, struct ixgbevf_ring *rx_ring); +void ixgbevf_irq_enable(struct ixgbevf_adapter *adapter); +void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, + struct ixgbevf_ring *ring); int ixgbevf_setup_tx_resources(struct ixgbevf_ring *); +void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, + struct ixgbevf_ring *ring); void ixgbevf_free_rx_resources(struct ixgbevf_ring *); +void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring); +void ixgbevf_rx_destroy_pp(struct ixgbevf_ring *rx_ring); void ixgbevf_free_tx_resources(struct ixgbevf_ring *); +void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring); +void ixgbevf_clean_xdp_ring(struct ixgbevf_ring *xdp_ring); void ixgbevf_update_stats(struct ixgbevf_adapter *adapter); int ethtool_ioctl(struct ifreq *ifr); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 1cb307bcbe0310..6fef5950efaa4a 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -30,10 +30,11 @@ #include #include #include -#include +#include #include #include "ixgbevf_txrx_lib.h" +#include "ixgbevf_xsk.h" const char ixgbevf_driver_name[] = "ixgbevf"; static const char ixgbevf_driver_string[] = @@ -525,7 +526,7 @@ 
void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, xdp_flush_frame_bulk(&cbulk); } -static void ixgbevf_clean_xdp_ring(struct ixgbevf_ring *xdp_ring) +void ixgbevf_clean_xdp_ring(struct ixgbevf_ring *xdp_ring) { ixgbevf_clean_xdp_num(xdp_ring, false, xdp_ring->pending); libeth_xdpsq_put(&xdp_ring->xdpq_lock, xdp_ring->netdev); @@ -1147,7 +1148,7 @@ static inline void ixgbevf_irq_disable(struct ixgbevf_adapter *adapter) * ixgbevf_irq_enable - Enable default interrupt generation settings * @adapter: board private structure **/ -static inline void ixgbevf_irq_enable(struct ixgbevf_adapter *adapter) +void ixgbevf_irq_enable(struct ixgbevf_adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; @@ -1156,6 +1157,24 @@ static inline void ixgbevf_irq_enable(struct ixgbevf_adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, adapter->eims_enable_mask); } +/** + * ixgbevf_xsk_pool_from_q - get ZC XSK buffer pool bound to a queue ID + * @ring: Rx or Tx ring + * + * Return: A pointer to xsk_buff_pool structure if there is a buffer pool + * attached, configured as zero-copy, and usable by this queue, NULL otherwise. + */ +static struct xsk_buff_pool *ixgbevf_xsk_pool_from_q(struct ixgbevf_ring *ring) +{ + struct xsk_buff_pool *pool = + xsk_get_pool_from_qid(ring->netdev, ring->queue_index); + + if (!READ_ONCE(ring->xdp_prog) && !ring_is_xdp(ring)) + return NULL; + + return (pool && pool->dev) ? pool : NULL; +} + /** * ixgbevf_configure_tx_ring - Configure 82599 VF Tx ring after Reset * @adapter: board private structure @@ -1163,8 +1182,8 @@ static inline void ixgbevf_irq_enable(struct ixgbevf_adapter *adapter) * * Configure the Tx descriptor ring after a reset. 
**/ -static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, - struct ixgbevf_ring *ring) +void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, + struct ixgbevf_ring *ring) { struct ixgbe_hw *hw = &adapter->hw; u64 tdba = ring->dma; @@ -1222,6 +1241,12 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, num_possible_cpus() > adapter->num_xdp_queues); } + ring->xsk_pool = ixgbevf_xsk_pool_from_q(ring); + if (ring_is_xdp(ring) && ring->xsk_pool) + set_ring_xsk(ring); + else + clear_ring_xsk(ring); + clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state); clear_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state); @@ -1291,8 +1316,8 @@ static void ixgbevf_setup_psrtype(struct ixgbevf_adapter *adapter) } #define IXGBEVF_MAX_RX_DESC_POLL 10 -static void ixgbevf_disable_rx_queue(struct ixgbevf_adapter *adapter, - struct ixgbevf_ring *ring) +void ixgbevf_disable_rx_queue(struct ixgbevf_adapter *adapter, + struct ixgbevf_ring *ring) { struct ixgbe_hw *hw = &adapter->hw; int wait_loop = IXGBEVF_MAX_RX_DESC_POLL; @@ -1316,10 +1341,15 @@ static void ixgbevf_disable_rx_queue(struct ixgbevf_adapter *adapter, if (!wait_loop) pr_err("RXDCTL.ENABLE queue %d not cleared while polling\n", reg_idx); + + /* Specification calls for 100 usec of delay after + * RXDCTL.ENABLE is cleared + */ + usleep_range(100, 200); } -static void ixgbevf_rx_desc_queue_enable(struct ixgbevf_adapter *adapter, - struct ixgbevf_ring *ring) +void ixgbevf_rx_desc_queue_enable(struct ixgbevf_adapter *adapter, + struct ixgbevf_ring *ring) { struct ixgbe_hw *hw = &adapter->hw; int wait_loop = IXGBEVF_MAX_RX_DESC_POLL; @@ -1395,14 +1425,14 @@ static void ixgbevf_setup_vfmrqc(struct ixgbevf_adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_VFMRQC, vfmrqc); } -static void ixgbevf_rx_destroy_pp(struct ixgbevf_ring *rx_ring) +void ixgbevf_rx_destroy_pp(struct ixgbevf_ring *rx_ring) { struct libeth_fq fq = { .pp = rx_ring->pp, .fqes = rx_ring->rx_fqes, }; - if (!fq.pp) + if (!fq.pp && 
!rx_ring->xsk_fqes) return; if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) { @@ -1410,6 +1440,21 @@ static void ixgbevf_rx_destroy_pp(struct ixgbevf_ring *rx_ring) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); } + if (test_and_clear_bit(__IXGBEVF_RXTX_XSK_RING, &rx_ring->state)) { + struct libeth_xskfq xskfq = { + .fqes = rx_ring->xsk_fqes, + }; + + libeth_xskfq_destroy(&xskfq); + rx_ring->xsk_fqes = NULL; + rx_ring->pending = xskfq.pending; + rx_ring->thresh = xskfq.thresh; + rx_ring->rx_buf_len = xskfq.buf_len; + rx_ring->xsk_pool = NULL; + + return; + } + libeth_rx_fq_destroy(&fq); rx_ring->rx_fqes = NULL; rx_ring->pp = NULL; @@ -1439,9 +1484,44 @@ static int ixgbevf_rx_create_pp(struct ixgbevf_ring *rx_ring) LIBETH_XDP_HEADROOM : LIBETH_SKB_HEADROOM), }; + struct xsk_buff_pool *pool; u32 frame_size; int ret; + pool = ixgbevf_xsk_pool_from_q(rx_ring); + if (pool) { + u32 frag_sz = xsk_pool_get_rx_frag_step(pool); + struct libeth_xskfq xskfq = { + .nid = numa_node_id(), + .count = rx_ring->count, + .pool = pool, + }; + + ret = libeth_xskfq_create(&xskfq); + if (ret) + return ret; + + rx_ring->xsk_pool = xskfq.pool; + rx_ring->xsk_fqes = xskfq.fqes; + rx_ring->pending = xskfq.count - 1; + rx_ring->thresh = xskfq.thresh; + rx_ring->rx_buf_len = xskfq.buf_len; + set_ring_xsk(rx_ring); + + ret = __xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, + rx_ring->queue_index, 0, frag_sz); + if (ret) + goto err; + + ret = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + rx_ring->xsk_pool); + if (ret) + goto err; + + return 0; + } + /* Some HW requires DMA write sizes to be aligned to 1K, * which warrants fake header split usage, but this is * not an issue if the frame size is at its maximum of 3K @@ -1492,8 +1572,8 @@ static int ixgbevf_rx_create_pp(struct ixgbevf_ring *rx_ring) return ret; } -static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, - struct ixgbevf_ring *ring) +void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, 
+ struct ixgbevf_ring *ring) { struct ixgbe_hw *hw = &adapter->hw; union ixgbe_adv_rx_desc *rx_desc; @@ -1534,6 +1614,7 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, /* reset ntu and ntc to place SW in sync with hardwdare */ ring->next_to_clean = 0; ring->next_to_use = 0; + ring->pending = ixgbevf_desc_unused(ring); err = ixgbevf_rx_create_pp(ring); if (err) { @@ -1551,7 +1632,8 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | IXGBE_RXDCTL_RLPML_EN); if (pkt_len <= IXGBE_RXDCTL_RLPMLMASK) { rxdctl |= pkt_len | IXGBE_RXDCTL_RLPML_EN; - rlpml_valid = true; + if (pkt_len <= ring->rx_buf_len) + rlpml_valid = true; } } @@ -1561,7 +1643,11 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(reg_idx), rxdctl); ixgbevf_rx_desc_queue_enable(adapter, ring); - ixgbevf_alloc_rx_buffers(ring, ixgbevf_desc_unused(ring)); + + if (ring_is_xsk(ring)) + ixgbevf_xsk_alloc_rx_bufs(ring, ring->pending); + else + ixgbevf_alloc_rx_buffers(ring, ring->pending); } /** @@ -1950,8 +2036,13 @@ void ixgbevf_up(struct ixgbevf_adapter *adapter) * ixgbevf_clean_rx_ring - Free Rx Buffers per Queue * @rx_ring: ring to free buffers from **/ -static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) +void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) { + if (ring_is_xsk(rx_ring)) { + ixgbevf_rx_xsk_ring_free_buffs(rx_ring); + goto reset; + } + /* Free Rx ring sk_buff */ libeth_xdp_return_stash(&rx_ring->xdp_stash); @@ -1969,15 +2060,17 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) i = 0; } +reset: rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; + rx_ring->pending = 0; } /** * ixgbevf_clean_tx_ring - Free Tx Buffers * @tx_ring: ring to be cleaned **/ -static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring) +void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring) { u16 i = tx_ring->next_to_clean; struct 
ixgbevf_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i]; @@ -2060,7 +2153,7 @@ static void ixgbevf_clean_all_tx_rings(struct ixgbevf_adapter *adapter) ixgbevf_clean_xdp_ring(adapter->xdp_ring[i]); } -static void ixgbevf_flush_tx_queue(struct ixgbevf_ring *ring) +void ixgbevf_flush_tx_queue(struct ixgbevf_ring *ring) { u8 reg_idx = ring->reg_idx; @@ -4071,6 +4164,9 @@ static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return ixgbevf_xdp_setup(dev, xdp->prog, xdp->extack); + case XDP_SETUP_XSK_POOL: + return ixgbevf_setup_xsk_pool(netdev_priv(dev), xdp->xsk.pool, + xdp->xsk.queue_id); default: return -EINVAL; } diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c new file mode 100644 index 00000000000000..134e7670018700 --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2026 Intel Corporation */ + +#include + +#include "ixgbevf.h" +#include "ixgbevf_xsk.h" + +/** + * ixgbevf_single_irq_disable - Mask off interrupt generation on a single vector + * @adapter: board private structure + * @vidx: vector id + **/ +static void ixgbevf_single_irq_disable(struct ixgbevf_adapter *adapter, + u16 vidx) +{ + struct ixgbe_hw *hw = &adapter->hw; + + IXGBE_WRITE_REG(hw, IXGBE_VTEIAM, + adapter->eims_enable_mask & ~BIT(vidx)); + IXGBE_WRITE_REG(hw, IXGBE_VTEIMC, BIT(vidx)); + IXGBE_WRITE_REG(hw, IXGBE_VTEIAC, + adapter->eims_enable_mask & ~BIT(vidx)); + + IXGBE_WRITE_FLUSH(hw); + + synchronize_irq(adapter->msix_entries[vidx].vector); +} + +static void ixgbevf_qp_dis(struct ixgbevf_adapter *adapter, u16 qid) +{ + struct ixgbevf_ring *tx_ring, *rx_ring = adapter->rx_ring[qid]; + struct ixgbevf_q_vector *q_vector = rx_ring->q_vector; + + netif_stop_subqueue(adapter->netdev, qid); + ixgbevf_single_irq_disable(adapter, q_vector->v_idx); + napi_disable(&q_vector->napi); 
+ + ixgbevf_disable_rx_queue(adapter, adapter->rx_ring[qid]); + ixgbevf_clean_rx_ring(rx_ring); + ixgbevf_rx_destroy_pp(rx_ring); + + /* Clean both XDP and normal Tx queue */ + ixgbevf_for_each_ring(tx_ring, q_vector->tx) { + ixgbevf_flush_tx_queue(tx_ring); + if (ring_is_xdp(tx_ring)) + ixgbevf_clean_xdp_ring(tx_ring); + else + ixgbevf_clean_tx_ring(tx_ring); + } +} + +static void ixgbevf_qp_ena(struct ixgbevf_adapter *adapter, u16 qid) +{ + struct ixgbevf_ring *tx_ring, *rx_ring = adapter->rx_ring[qid]; + struct ixgbevf_q_vector *q_vector = rx_ring->q_vector; + + ixgbevf_configure_rx_ring(adapter, rx_ring); + ixgbevf_for_each_ring(tx_ring, q_vector->tx) + ixgbevf_configure_tx_ring(adapter, tx_ring); + + napi_enable(&q_vector->napi); + ixgbevf_irq_enable(adapter); + netif_start_subqueue(adapter->netdev, qid); +} + +int ixgbevf_setup_xsk_pool(struct ixgbevf_adapter *adapter, + struct xsk_buff_pool *pool, u16 qid) +{ + bool running = !test_bit(__IXGBEVF_DOWN, &adapter->state) && + adapter->xdp_prog; + int err; + + if (running) + ixgbevf_qp_dis(adapter, qid); + + err = libeth_xsk_setup_pool(adapter->netdev, qid, !!pool); + + if (running) + ixgbevf_qp_ena(adapter, qid); + + return err; +} + +static void ixgbevf_fill_rx_xsk_desc(const struct libeth_xskfq_fp *fq, u32 i) +{ + union ixgbe_adv_rx_desc *rx_desc = + &((union ixgbe_adv_rx_desc *)fq->descs)[i]; + + rx_desc->read.pkt_addr = + cpu_to_le64(libeth_xsk_buff_xdp_get_dma(fq->fqes[i])); + rx_desc->wb.upper.length = 0; +} + +void ixgbevf_xsk_alloc_rx_bufs(struct ixgbevf_ring *rx_ring, u32 num) +{ + struct libeth_xskfq_fp fq = { + .count = rx_ring->count, + .descs = rx_ring->desc, + .fqes = rx_ring->xsk_fqes, + .ntu = rx_ring->next_to_use, + .pool = rx_ring->xsk_pool, + }; + u32 done; + + done = libeth_xskfqe_alloc(&fq, num, ixgbevf_fill_rx_xsk_desc); + if (likely(done)) { + /* Finish descriptor writes before bumping tail */ + wmb(); + ixgbevf_write_tail(rx_ring, fq.ntu); + } + + rx_ring->next_to_use = fq.ntu; + 
rx_ring->pending -= done; +} + +void ixgbevf_rx_xsk_ring_free_buffs(struct ixgbevf_ring *rx_ring) +{ + u32 ntc = rx_ring->next_to_clean; + + if (rx_ring->xsk_xdp_head) + xsk_buff_free(&rx_ring->xsk_xdp_head->base); + + rx_ring->xsk_xdp_head = NULL; + + while (ntc != rx_ring->next_to_use) { + xsk_buff_free(&rx_ring->xsk_fqes[ntc]->base); + ntc++; + ntc = ntc == rx_ring->count ? 0 : ntc; + } +} diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h new file mode 100644 index 00000000000000..1cbcea803509b6 --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2026 Intel Corporation */ + +#ifndef _IXGBEVF_XSK_H_ +#define _IXGBEVF_XSK_H_ + +int ixgbevf_setup_xsk_pool(struct ixgbevf_adapter *adapter, + struct xsk_buff_pool *pool, u16 qid); +void ixgbevf_xsk_alloc_rx_bufs(struct ixgbevf_ring *rx_ring, u32 num); +void ixgbevf_rx_xsk_ring_free_buffs(struct ixgbevf_ring *rx_ring); + +#endif /* _IXGBEVF_XSK_H_ */ From bb6974a8d3eccd3b4eac3829942768900c1e48aa Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 27 Oct 2025 14:21:11 +0100 Subject: [PATCH 20/23] ixgbevf: implement AF_XDP zero-copy Tx Add code that handles Tx ZC queues inside of napi_poll(), utilize libeth. As NIC's multiple buffer conventions do not play nicely with AF_XDP's, leave handling of segments for later. 
Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 20 ++++++++--- .../ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h | 6 ++-- .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.c | 33 ++++++++++++++++++- .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.h | 5 +++ 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 6fef5950efaa4a..73d11c781c3c6a 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -504,6 +504,7 @@ void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, u16 to_clean) { struct libeth_xdpsq_napi_stats stats = { }; + bool xsk_ring = ring_is_xsk(xdp_ring); u32 ntc = xdp_ring->next_to_clean; struct xdp_frame_bulk cbulk; struct libeth_cq_pp cp = { @@ -512,11 +513,14 @@ void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, .xss = &stats, .napi = in_napi, }; + u32 xsk_frames = 0; xdp_frame_bulk_init(&cbulk); xdp_ring->pending -= to_clean; while (likely(to_clean--)) { + xsk_frames += xsk_ring && + likely(!xdp_ring->xdp_sqes[ntc].type) ? 1 : 0; libeth_xdp_complete_tx(&xdp_ring->xdp_sqes[ntc], &cp); ntc++; ntc = unlikely(ntc == xdp_ring->count) ? 
0 : ntc; @@ -524,6 +528,8 @@ void ixgbevf_clean_xdp_num(struct ixgbevf_ring *xdp_ring, bool in_napi, xdp_ring->next_to_clean = ntc; xdp_flush_frame_bulk(&cbulk); + if (xsk_frames) + xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); } void ixgbevf_clean_xdp_ring(struct ixgbevf_ring *xdp_ring) @@ -758,10 +764,13 @@ static int ixgbevf_poll(struct napi_struct *napi, int budget) bool clean_complete = true; ixgbevf_for_each_ring(ring, q_vector->tx) { - if (ring_is_xdp(ring)) - continue; - if (!ixgbevf_clean_tx_irq(q_vector, ring, budget)) - clean_complete = false; + if (ring_is_xsk(ring)) + clean_complete &= + ixgbevf_clean_xsk_tx_irq(q_vector, ring, + budget); + else if (!ring_is_xdp(ring)) + clean_complete &= + ixgbevf_clean_tx_irq(q_vector, ring, budget); } if (budget <= 0) @@ -1247,6 +1256,9 @@ void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, else clear_ring_xsk(ring); + ring->thresh = ring_is_xsk(ring) ? IXGBEVF_XSK_TX_CLEAN_THRESH(ring) : + XDP_BULK_QUEUE_SIZE; + clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state); clear_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h index 14eb36717fc4ce..c153c1b56b6549 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h @@ -229,8 +229,8 @@ static inline u32 ixgbevf_prep_xdp_sq(void *xdpsq, struct libeth_xdpsq *sq) struct ixgbevf_ring *xdp_ring = xdpsq; libeth_xdpsq_lock(&xdp_ring->xdpq_lock); - if (unlikely(ixgbevf_desc_unused(xdp_ring) < LIBETH_XDP_TX_BULK)) { - u16 to_clean = ixgbevf_tx_get_num_sent(xdp_ring); + if (unlikely(ixgbevf_desc_unused(xdp_ring) < xdp_ring->thresh)) { + u16 to_clean = ixgbevf_tx_get_num_sent(xdpsq); if (likely(to_clean)) ixgbevf_clean_xdp_num(xdp_ring, true, to_clean); @@ -266,7 +266,7 @@ static inline u32 ixgbevf_prep_xdp_sq(void *xdpsq, struct libeth_xdpsq *sq) .lock = &xdp_ring->xdpq_lock, 
.ntu = &xdp_ring->next_to_use, .pending = &xdp_ring->pending, - .pool = NULL, + .pool = xdp_ring->xsk_pool, .sqes = xdp_ring->xdp_sqes, }; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c index 134e7670018700..46737368908566 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c @@ -3,7 +3,7 @@ #include -#include "ixgbevf.h" +#include "ixgbevf_txrx_lib.h" #include "ixgbevf_xsk.h" /** @@ -129,3 +129,34 @@ void ixgbevf_rx_xsk_ring_free_buffs(struct ixgbevf_ring *rx_ring) ntc = ntc == rx_ring->count ? 0 : ntc; } } + +static void ixgbevf_xsk_xmit_desc(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + union ixgbe_adv_tx_desc *tx_desc = + &((union ixgbe_adv_tx_desc *)sq->descs)[i]; + + u32 cmd_type = IXGBE_ADVTXD_DTYP_DATA | + IXGBE_ADVTXD_DCMD_DEXT | + IXGBE_ADVTXD_DCMD_IFCS | + IXGBE_TXD_CMD_EOP | + desc.len; + + tx_desc->read.olinfo_status = + cpu_to_le32((desc.len << IXGBE_ADVTXD_PAYLEN_SHIFT) | + IXGBE_ADVTXD_CC); + + tx_desc->read.buffer_addr = cpu_to_le64(desc.addr); + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); +} + +bool ixgbevf_clean_xsk_tx_irq(struct ixgbevf_q_vector *q_vector, + struct ixgbevf_ring *tx_ring, int napi_budget) +{ + u32 budget = min_t(u32, napi_budget, tx_ring->thresh); + + return libeth_xsk_xmit_do_bulk(tx_ring->xsk_pool, tx_ring, budget, + NULL, ixgbevf_prep_xdp_sq, + ixgbevf_xsk_xmit_desc, + ixgbevf_xdp_rs_and_bump); +} diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h index 1cbcea803509b6..eda3e9b9554763 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h @@ -4,9 +4,14 @@ #ifndef _IXGBEVF_XSK_H_ #define _IXGBEVF_XSK_H_ +/* Process completions as soon as possible */ +#define IXGBEVF_XSK_TX_CLEAN_THRESH(r) ((r)->count - 1) + int 
ixgbevf_setup_xsk_pool(struct ixgbevf_adapter *adapter, struct xsk_buff_pool *pool, u16 qid); void ixgbevf_xsk_alloc_rx_bufs(struct ixgbevf_ring *rx_ring, u32 num); void ixgbevf_rx_xsk_ring_free_buffs(struct ixgbevf_ring *rx_ring); +bool ixgbevf_clean_xsk_tx_irq(struct ixgbevf_q_vector *q_vector, + struct ixgbevf_ring *tx_ring, int napi_budget); #endif /* _IXGBEVF_XSK_H_ */ From f2f514bdf0d890ad38a3c7ae35b1284718df8400 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 27 Oct 2025 14:24:57 +0100 Subject: [PATCH 21/23] ixgbevf: implement AF_XDP zero-copy Rx Add code that handles AF_XDP ZC Rx queues inside of napi_poll(), utilize libeth helpers. Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 5 +- .../ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h | 1 + .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.c | 118 +++++++++++++++++- .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.h | 4 +- 4 files changed, 125 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 73d11c781c3c6a..679a6bc6ec2fac 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -785,7 +785,10 @@ static int ixgbevf_poll(struct napi_struct *napi, int budget) per_ring_budget = budget; ixgbevf_for_each_ring(ring, q_vector->rx) { - int cleaned = ixgbevf_clean_rx_irq(q_vector, ring, + int cleaned = ring_is_xsk(ring) ? 
+ ixgbevf_clean_xsk_rx_irq(q_vector, ring, + per_ring_budget) : + ixgbevf_clean_rx_irq(q_vector, ring, per_ring_budget); work_done += cleaned; if (cleaned >= per_ring_budget) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h index c153c1b56b6549..64ca27429bef53 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h @@ -58,6 +58,7 @@ static inline bool ixgbevf_is_non_eop(struct ixgbevf_ring *rx_ring, /* fetch, update, and store next to clean */ ntc = (ntc < rx_ring->count) ? ntc : 0; rx_ring->next_to_clean = ntc; + rx_ring->pending++; prefetch(IXGBEVF_RX_DESC(rx_ring, ntc)); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c index 46737368908566..f42e9c2b109279 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c @@ -62,6 +62,7 @@ static void ixgbevf_qp_ena(struct ixgbevf_adapter *adapter, u16 qid) napi_enable(&q_vector->napi); ixgbevf_irq_enable(adapter); netif_start_subqueue(adapter->netdev, qid); + napi_schedule(&q_vector->napi); } int ixgbevf_setup_xsk_pool(struct ixgbevf_adapter *adapter, @@ -92,7 +93,7 @@ static void ixgbevf_fill_rx_xsk_desc(const struct libeth_xskfq_fp *fq, u32 i) rx_desc->wb.upper.length = 0; } -void ixgbevf_xsk_alloc_rx_bufs(struct ixgbevf_ring *rx_ring, u32 num) +bool ixgbevf_xsk_alloc_rx_bufs(struct ixgbevf_ring *rx_ring, u32 num) { struct libeth_xskfq_fp fq = { .count = rx_ring->count, @@ -112,6 +113,8 @@ void ixgbevf_xsk_alloc_rx_bufs(struct ixgbevf_ring *rx_ring, u32 num) rx_ring->next_to_use = fq.ntu; rx_ring->pending -= done; + + return done == num; } void ixgbevf_rx_xsk_ring_free_buffs(struct ixgbevf_ring *rx_ring) @@ -150,6 +153,119 @@ static void ixgbevf_xsk_xmit_desc(struct libeth_xdp_tx_desc desc, u32 i, tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); } 
+LIBETH_XDP_DEFINE_START(); +LIBETH_XSK_DEFINE_FLUSH_TX(static ixgbevf_xsk_flush_tx, ixgbevf_prep_xdp_sq, + ixgbevf_xsk_xmit_desc); +LIBETH_XSK_DEFINE_RUN_PROG(static ixgbevf_xsk_run_prog, ixgbevf_xsk_flush_tx); +LIBETH_XSK_DEFINE_FINALIZE(static ixgbevf_xsk_finalize_xdp_napi, + ixgbevf_xsk_flush_tx, ixgbevf_xdp_rs_and_bump); +LIBETH_XDP_DEFINE_END(); + +u32 ixgbevf_clean_xsk_rx_irq(struct ixgbevf_q_vector *q_vector, + struct ixgbevf_ring *rx_ring, int budget) +{ + struct ixgbevf_adapter *adapter = q_vector->adapter; + u32 total_rx_bytes = 0, total_rx_packets = 0; + LIBETH_XDP_ONSTACK_BULK(xdp_tx_bulk); + struct libeth_xdp_buff *head_xdp; + bool failure = false, wake; + struct sk_buff *skb; + + wake = xsk_uses_need_wakeup(rx_ring->xsk_pool); + if (wake) + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + + head_xdp = rx_ring->xsk_xdp_head; + libeth_xsk_tx_init_bulk(&xdp_tx_bulk, rx_ring->xdp_prog, + adapter->netdev, adapter->xdp_ring, + adapter->num_xdp_queues); + + while (likely(total_rx_packets < budget)) { + union ixgbe_adv_rx_desc *rx_desc; + struct libeth_xdp_buff *rx_buffer; + unsigned int size; + u32 xdp_result; + + rx_desc = IXGBEVF_RX_DESC(rx_ring, rx_ring->next_to_clean); + size = le16_to_cpu(rx_desc->wb.upper.length); + if (unlikely(!size)) + break; + + /* Avoid reading other descriptor fields before checking size */ + rmb(); + + rx_buffer = rx_ring->xsk_fqes[rx_ring->next_to_clean]; + head_xdp = libeth_xsk_process_buff(head_xdp, rx_buffer, size); + if (unlikely(!head_xdp) || ixgbevf_is_non_eop(rx_ring, rx_desc)) + continue; + + total_rx_packets++; + total_rx_bytes += xdp_get_buff_len(&head_xdp->base); + + xdp_result = ixgbevf_xsk_run_prog(head_xdp, &xdp_tx_bulk); + if (xdp_result) { + head_xdp = NULL; + if (likely(xdp_result != LIBETH_XDP_ABORTED)) + continue; + failure = true; + break; + } + + skb = xdp_build_skb_from_zc(&head_xdp->base); + + if (unlikely(!skb)) { + libeth_xdp_return_buff_slow(head_xdp); + head_xdp = NULL; + 
rx_ring->rx_stats.alloc_rx_buff_failed++; + break; + } + + head_xdp = NULL; + + if (unlikely(ixgbevf_cleanup_headers(rx_ring, rx_desc, skb))) { + skb = NULL; + continue; + } + + if (unlikely((skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) && + ether_addr_equal(rx_ring->netdev->dev_addr, + eth_hdr(skb)->h_source))) { + dev_kfree_skb_irq(skb); + continue; + } + + /* populate checksum, VLAN, and protocol */ + ixgbevf_process_skb_fields(rx_ring, rx_desc, skb); + + napi_gro_receive(&q_vector->napi, skb); + } + + if (rx_ring->pending >= rx_ring->thresh) + failure |= !ixgbevf_xsk_alloc_rx_bufs(rx_ring, + rx_ring->pending); + + /* place incomplete frames back on ring for completion */ + rx_ring->xsk_xdp_head = head_xdp; + + ixgbevf_xsk_finalize_xdp_napi(&xdp_tx_bulk); + + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->stats.packets += total_rx_packets; + rx_ring->stats.bytes += total_rx_bytes; + u64_stats_update_end(&rx_ring->syncp); + q_vector->rx.total_packets += total_rx_packets; + q_vector->rx.total_bytes += total_rx_bytes; + + if (likely(!failure)) + return total_rx_packets; + + if (wake) + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + + return budget; +} + bool ixgbevf_clean_xsk_tx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *tx_ring, int napi_budget) { diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h index eda3e9b9554763..042a90cfa9131c 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h @@ -9,8 +9,10 @@ int ixgbevf_setup_xsk_pool(struct ixgbevf_adapter *adapter, struct xsk_buff_pool *pool, u16 qid); -void ixgbevf_xsk_alloc_rx_bufs(struct ixgbevf_ring *rx_ring, u32 num); +bool ixgbevf_xsk_alloc_rx_bufs(struct ixgbevf_ring *rx_ring, u32 num); void ixgbevf_rx_xsk_ring_free_buffs(struct ixgbevf_ring *rx_ring); +u32 ixgbevf_clean_xsk_rx_irq(struct ixgbevf_q_vector *q_vector, + struct ixgbevf_ring 
*rx_ring, int budget); bool ixgbevf_clean_xsk_tx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *tx_ring, int napi_budget); From d4e29073257e229472ca7bf4db1587da66830789 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Mon, 27 Oct 2025 14:26:46 +0100 Subject: [PATCH 22/23] ixgbevf: implement .ndo_xsk_wakeup() and set features To finalize basic AF_XDP implementation, set features and add .ndo_xsk_wakeup() handler. Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 ++- .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.c | 24 +++++++++++++++++++ .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.h | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 679a6bc6ec2fac..08b3bcd3eae730 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4202,6 +4202,7 @@ static const struct net_device_ops ixgbevf_netdev_ops = { .ndo_features_check = ixgbevf_features_check, .ndo_bpf = ixgbevf_xdp, .ndo_xdp_xmit = ixgbevf_xdp_xmit, + .ndo_xsk_wakeup = ixgbevf_xsk_wakeup, }; static void ixgbevf_assign_netdev_ops(struct net_device *dev) @@ -4334,7 +4335,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_HW_VLAN_CTAG_TX; netdev->priv_flags |= IFF_UNICAST_FLT; - libeth_xdp_set_features_noredir(netdev, NULL, 0, NULL); + libeth_xdp_set_features_noredir(netdev, NULL, 1, NULL); /* MTU range: 68 - 1504 or 9710 */ netdev->min_mtu = ETH_MIN_MTU; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c index f42e9c2b109279..76504a94ac38a2 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c @@ -276,3 +276,27 @@ bool ixgbevf_clean_xsk_tx_irq(struct ixgbevf_q_vector *q_vector, ixgbevf_xsk_xmit_desc, ixgbevf_xdp_rs_and_bump); 
} + +int ixgbevf_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) +{ + struct ixgbevf_adapter *adapter = netdev_priv(dev); + struct ixgbevf_q_vector *q_vector; + struct ixgbevf_ring *rx_ring; + + if (unlikely(test_bit(__IXGBEVF_DOWN, &adapter->state))) + return -ENETDOWN; + + if (unlikely(queue_id >= adapter->num_xdp_queues)) + return -EINVAL; + + rx_ring = adapter->rx_ring[queue_id]; + if (unlikely(!ring_is_xsk(rx_ring))) + return -EINVAL; + + q_vector = rx_ring->q_vector; + if (!napi_if_scheduled_mark_missed(&q_vector->napi)) + IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEICS, + BIT(q_vector->v_idx)); + + return 0; +} diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h index 042a90cfa9131c..7af14c78ead978 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h @@ -15,5 +15,6 @@ u32 ixgbevf_clean_xsk_rx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *rx_ring, int budget); bool ixgbevf_clean_xsk_tx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *tx_ring, int napi_budget); +int ixgbevf_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags); #endif /* _IXGBEVF_XSK_H_ */ From 25ec6fa5b75cada97b4c966e69c8d392210850e4 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 19 Dec 2025 14:48:57 +0100 Subject: [PATCH 23/23] ixgbevf: multi-buffer AF_XDP Tx Transmitting multi-buffer AF_XDP packets is not very straightforward given HW limitations in ixgbevf, namely that the first data descriptor must contain the length of the whole packet. Use private data of an sqe to store the length of an unfinished packet so far and the first descriptor index. Once EoP zero-copy descriptor is processed, write the accumulated length into the saved first descriptor. 
Signed-off-by: Larysa Zaremba --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- .../ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h | 3 + .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.c | 65 ++++++++++++++++--- .../net/ethernet/intel/ixgbevf/ixgbevf_xsk.h | 1 + 4 files changed, 62 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 08b3bcd3eae730..dd6a9f32f309c1 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4335,7 +4335,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_HW_VLAN_CTAG_TX; netdev->priv_flags |= IFF_UNICAST_FLT; - libeth_xdp_set_features_noredir(netdev, NULL, 1, NULL); + libeth_xdp_set_features_noredir(netdev, NULL, IXGBEVF_XSK_MAX_ZC_FRAGS); /* MTU range: 68 - 1504 or 9710 */ netdev->min_mtu = ETH_MIN_MTU; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h index 64ca27429bef53..5bad6990060715 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_txrx_lib.h @@ -32,6 +32,9 @@ static inline void ixgbevf_xdp_rs_and_bump(void *xdpsq, bool sent, bool flush) xdp_ring->xdp_sqes[xdp_ring->cached_ntu].rs_idx = ltu + 1; xdp_ring->cached_ntu = xdp_ring->next_to_use; + /* In case the packet was interrupted, discard it */ + xdp_ring->xdp_sqes[ltu].priv = 0; + /* Finish descriptor writes before bumping tail */ wmb(); ixgbevf_write_tail(xdp_ring, xdp_ring->next_to_use); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c index 76504a94ac38a2..1545fed4cdb51b 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.c @@ -133,24 +133,73 @@ void ixgbevf_rx_xsk_ring_free_buffs(struct ixgbevf_ring *rx_ring) } } 
+struct ixgbevf_zc_sqe_priv { + u16 first_desc; + u16 len; +}; + +static_assert(sizeof(struct ixgbevf_zc_sqe_priv) <= + sizeof_field(struct libeth_sqe, priv)); + static void ixgbevf_xsk_xmit_desc(struct libeth_xdp_tx_desc desc, u32 i, const struct libeth_xdpsq *sq, u64 priv) { - union ixgbe_adv_tx_desc *tx_desc = - &((union ixgbe_adv_tx_desc *)sq->descs)[i]; + union ixgbe_adv_tx_desc *descs = sq->descs, *tx_desc = &descs[i]; + u32 ltu = (i ? : sq->count) - 1; u32 cmd_type = IXGBE_ADVTXD_DTYP_DATA | IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DCMD_IFCS | - IXGBE_TXD_CMD_EOP | desc.len; - tx_desc->read.olinfo_status = - cpu_to_le32((desc.len << IXGBE_ADVTXD_PAYLEN_SHIFT) | - IXGBE_ADVTXD_CC); - tx_desc->read.buffer_addr = cpu_to_le64(desc.addr); - tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); + + if (likely((desc.flags & LIBETH_XDP_TX_LAST) && !sq->sqes[ltu].priv)) { + tx_desc->read.olinfo_status = + cpu_to_le32((desc.len << IXGBE_ADVTXD_PAYLEN_SHIFT) | + IXGBE_ADVTXD_CC); + tx_desc->read.cmd_type_len = + cpu_to_le32(cmd_type | IXGBE_TXD_CMD_EOP); + return; + } + + /* No previous packet */ + if (!sq->sqes[ltu].priv) { + struct ixgbevf_zc_sqe_priv *sqe_priv = + (void *)&sq->sqes[i].priv; + + sqe_priv->first_desc = i; + sqe_priv->len = desc.len; + + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); + + return; + } + + if (sq->sqes[ltu].priv) { + struct ixgbevf_zc_sqe_priv *sqe_priv = + (void *)&sq->sqes[i].priv; + + sq->sqes[i].priv = sq->sqes[ltu].priv; + sq->sqes[ltu].priv = 0; + sqe_priv->len += desc.len; + + if (desc.flags & LIBETH_XDP_TX_LAST) { + union ixgbe_adv_tx_desc *first_desc = + &descs[sqe_priv->first_desc]; + + first_desc->read.olinfo_status = + cpu_to_le32((sqe_priv->len << + IXGBE_ADVTXD_PAYLEN_SHIFT) | + IXGBE_ADVTXD_CC); + tx_desc->read.cmd_type_len = + cpu_to_le32(cmd_type | IXGBE_TXD_CMD_EOP); + cmd_type |= IXGBE_TXD_CMD_EOP; + sq->sqes[i].priv = 0; + } + + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); + } } LIBETH_XDP_DEFINE_START(); diff 
--git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h index 7af14c78ead978..2bb39735b10efd 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_xsk.h @@ -6,6 +6,7 @@ /* Process completions as soon as possible */ #define IXGBEVF_XSK_TX_CLEAN_THRESH(r) ((r)->count - 1) +#define IXGBEVF_XSK_MAX_ZC_FRAGS min(18, MAX_SKB_FRAGS) int ixgbevf_setup_xsk_pool(struct ixgbevf_adapter *adapter, struct xsk_buff_pool *pool, u16 qid);