kernel: improve GRO performance

For packets not belonging to a local socket, use fraglist GRO instead of
regular GRO. This makes segmenting packets very cheap and avoids the need to
selectively disable GRO.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Author: Felix Fietkau <nbd@nbd.name>
Date:   2024-04-23 12:37:05 +02:00
Parent: 98834a4c3f
Commit: b5c53848c3
12 changed files with 1133 additions and 318 deletions
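
In short, the patches below (1) enable fraglist GRO (NETIF_F_GRO_FRAGLIST) by
default, (2) add TCP fraglist GRO that keys the decision on an
established-socket lookup, so only flows terminating locally are merged the
conventional way, and (3) drop the old workaround that set skb->gro_skip for
foreign MAC addresses. The remaining files are refreshes of existing patches to
account for the shifted hunk offsets.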

View File

@ -0,0 +1,24 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 23 Apr 2024 12:35:21 +0200
Subject: [PATCH] net: enable fraglist GRO by default
This can significantly improve performance for packet forwarding/bridging
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -242,10 +242,10 @@ static inline int find_next_netdev_featu
#define NETIF_F_UPPER_DISABLES NETIF_F_LRO
/* changeable features with no special hardware requirements */
-#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO)
+#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO | NETIF_F_GRO_FRAGLIST)
/* Changeable features with no special hardware requirements that defaults to off. */
-#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)
+#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_UDP_FWD)
#define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \
NETIF_F_HW_VLAN_CTAG_RX | \
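
Note: with NETIF_F_GRO_FRAGLIST moved from the default-off set into
NETIF_F_SOFT_FEATURES, newly registered devices start with fraglist GRO
enabled. It remains a changeable software feature, so it can still be turned
off per interface through the same rx-gro-list knob referenced in the
measurements further down (e.g. ethtool -K eth0 rx-gro-list off, interface
name used here only as an example).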

View File

@ -47,7 +47,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
const struct header_ops *header_ops;
unsigned char operstate;
@@ -2206,6 +2213,10 @@ struct net_device {
@@ -2204,6 +2211,10 @@ struct net_device {
struct mctp_dev __rcu *mctp_ptr;
#endif
@ -60,7 +60,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
*/
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3046,6 +3046,10 @@ static inline int pskb_trim(struct sk_bu
@@ -3045,6 +3045,10 @@ static inline int pskb_trim(struct sk_bu
return (len < skb->len) ? __pskb_trim(skb, len) : 0;
}
@ -71,7 +71,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
/**
* pskb_trim_unique - remove end from a paged unique (not cloned) buffer
* @skb: buffer to alter
@@ -3195,16 +3199,6 @@ static inline struct sk_buff *dev_alloc_
@@ -3194,16 +3198,6 @@ static inline struct sk_buff *dev_alloc_
}
@ -152,7 +152,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
{
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -171,6 +171,12 @@ __be16 eth_type_trans(struct sk_buff *sk
@@ -159,6 +159,12 @@ __be16 eth_type_trans(struct sk_buff *sk
const struct ethhdr *eth;
skb->dev = dev;

View File

@ -0,0 +1,24 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 23 Apr 2024 12:35:21 +0200
Subject: [PATCH] net: enable fraglist GRO by default
This can significantly improve performance for packet forwarding/bridging
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -242,10 +242,10 @@ static inline int find_next_netdev_featu
#define NETIF_F_UPPER_DISABLES NETIF_F_LRO
/* changeable features with no special hardware requirements */
-#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO)
+#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO | NETIF_F_GRO_FRAGLIST)
/* Changeable features with no special hardware requirements that defaults to off. */
-#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)
+#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_UDP_FWD)
#define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \
NETIF_F_HW_VLAN_CTAG_RX | \

View File

@ -47,7 +47,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
const struct header_ops *header_ops;
unsigned char operstate;
@@ -2259,6 +2266,10 @@ struct net_device {
@@ -2257,6 +2264,10 @@ struct net_device {
struct mctp_dev __rcu *mctp_ptr;
#endif
@ -60,7 +60,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
*/
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3081,6 +3081,10 @@ static inline int pskb_trim(struct sk_bu
@@ -3080,6 +3080,10 @@ static inline int pskb_trim(struct sk_bu
return (len < skb->len) ? __pskb_trim(skb, len) : 0;
}
@ -71,7 +71,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
/**
* pskb_trim_unique - remove end from a paged unique (not cloned) buffer
* @skb: buffer to alter
@@ -3246,16 +3250,6 @@ static inline struct sk_buff *dev_alloc_
@@ -3245,16 +3249,6 @@ static inline struct sk_buff *dev_alloc_
}
@ -152,7 +152,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
{
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -171,6 +171,12 @@ __be16 eth_type_trans(struct sk_buff *sk
@@ -159,6 +159,12 @@ __be16 eth_type_trans(struct sk_buff *sk
const struct ethhdr *eth;
skb->dev = dev;

View File

@ -1,151 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: net: replace GRO optimization patch with a new one that supports VLANs/bridges with different MAC addresses
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/linux/netdevice.h | 2 ++
include/linux/skbuff.h | 3 ++-
net/core/dev.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++
net/ethernet/eth.c | 18 +++++++++++++++++-
4 files changed, 69 insertions(+), 2 deletions(-)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2157,6 +2157,8 @@ struct net_device {
struct netdev_hw_addr_list mc;
struct netdev_hw_addr_list dev_addrs;
+ unsigned char local_addr_mask[MAX_ADDR_LEN];
+
#ifdef CONFIG_SYSFS
struct kset *queues_kset;
#endif
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -967,6 +967,7 @@ struct sk_buff {
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
+ __u8 gro_skip:1;
__u8 ipvs_property:1;
__u8 inner_protocol_type:1;
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -492,6 +492,9 @@ static enum gro_result dev_gro_receive(s
int same_flow;
int grow;
+ if (skb->gro_skip)
+ goto normal;
+
if (netif_elide_gro(skb->dev))
goto normal;
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7628,6 +7628,48 @@ static void __netdev_adjacent_dev_unlink
&upper_dev->adj_list.lower);
}
+static void __netdev_addr_mask(unsigned char *mask, const unsigned char *addr,
+ struct net_device *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->addr_len; i++)
+ mask[i] |= addr[i] ^ dev->dev_addr[i];
+}
+
+static void __netdev_upper_mask(unsigned char *mask, struct net_device *dev,
+ struct net_device *lower)
+{
+ struct net_device *cur;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(dev, cur, iter) {
+ __netdev_addr_mask(mask, cur->dev_addr, lower);
+ __netdev_upper_mask(mask, cur, lower);
+ }
+}
+
+static void __netdev_update_addr_mask(struct net_device *dev)
+{
+ unsigned char mask[MAX_ADDR_LEN];
+ struct net_device *cur;
+ struct list_head *iter;
+
+ memset(mask, 0, sizeof(mask));
+ __netdev_upper_mask(mask, dev, dev);
+ memcpy(dev->local_addr_mask, mask, dev->addr_len);
+
+ netdev_for_each_lower_dev(dev, cur, iter)
+ __netdev_update_addr_mask(cur);
+}
+
+static void netdev_update_addr_mask(struct net_device *dev)
+{
+ rcu_read_lock();
+ __netdev_update_addr_mask(dev);
+ rcu_read_unlock();
+}
+
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info,
@@ -7679,6 +7721,7 @@ static int __netdev_upper_dev_link(struc
if (ret)
return ret;
+ netdev_update_addr_mask(dev);
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
ret = notifier_to_errno(ret);
@@ -7775,6 +7818,7 @@ static void __netdev_upper_dev_unlink(st
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+ netdev_update_addr_mask(dev);
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
@@ -8827,6 +8871,7 @@ int dev_set_mac_address(struct net_devic
if (err)
return err;
dev->addr_assign_type = NET_ADDR_SET;
+ netdev_update_addr_mask(dev);
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -143,6 +143,18 @@ u32 eth_get_headlen(const struct net_dev
}
EXPORT_SYMBOL(eth_get_headlen);
+static inline bool
+eth_check_local_mask(const void *addr1, const void *addr2, const void *mask)
+{
+ const u16 *a1 = addr1;
+ const u16 *a2 = addr2;
+ const u16 *m = mask;
+
+ return (((a1[0] ^ a2[0]) & ~m[0]) |
+ ((a1[1] ^ a2[1]) & ~m[1]) |
+ ((a1[2] ^ a2[2]) & ~m[2]));
+}
+
/**
* eth_type_trans - determine the packet's protocol ID.
* @skb: received socket data
@@ -174,6 +186,10 @@ __be16 eth_type_trans(struct sk_buff *sk
} else {
skb->pkt_type = PACKET_OTHERHOST;
}
+
+ if (eth_check_local_mask(eth->h_dest, dev->dev_addr,
+ dev->local_addr_mask))
+ skb->gro_skip = 1;
}
/*
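
For reference, the heuristic removed above marked a packet with gro_skip when
its destination MAC could not belong to the local stack: local_addr_mask
accumulates the bit positions in which the MACs of upper devices (bridge,
VLAN, macvlan) differ from the port's own address, and eth_check_local_mask()
reports a mismatch only in bits outside that mask. A minimal userspace sketch
of the idea (illustration only, byte-wise instead of the kernel's 16-bit
loads, not the kernel code itself):

#include <stdio.h>

#define ETH_ALEN 6

/* Accumulate, per byte, the bits in which an upper device's MAC differs
 * from the port's own MAC (mirrors __netdev_addr_mask() above). */
static void addr_mask(unsigned char *mask, const unsigned char *upper,
                      const unsigned char *dev_addr)
{
	int i;

	for (i = 0; i < ETH_ALEN; i++)
		mask[i] |= upper[i] ^ dev_addr[i];
}

/* Non-zero means the destination cannot match any local/upper address,
 * so the old patch would set skb->gro_skip (mirrors eth_check_local_mask()). */
static int check_local_mask(const unsigned char *dst,
                            const unsigned char *dev_addr,
                            const unsigned char *mask)
{
	int i, diff = 0;

	for (i = 0; i < ETH_ALEN; i++)
		diff |= (dst[i] ^ dev_addr[i]) & ~mask[i];

	return diff;
}

int main(void)
{
	unsigned char dev_addr[ETH_ALEN]  = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x10 };
	unsigned char bridge[ETH_ALEN]    = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x11 };
	unsigned char mask[ETH_ALEN]      = { 0 };
	unsigned char dst_local[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x11 };
	unsigned char dst_other[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x42 };

	addr_mask(mask, bridge, dev_addr);

	printf("bridge MAC: %s\n", check_local_mask(dst_local, dev_addr, mask) ?
	       "foreign -> gro_skip" : "possibly local -> keep GRO");
	printf("other MAC:  %s\n", check_local_mask(dst_other, dev_addr, mask) ?
	       "foreign -> gro_skip" : "possibly local -> keep GRO");
	return 0;
}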

View File

@ -0,0 +1,559 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 23 Apr 2024 11:23:03 +0200
Subject: [PATCH] net: add TCP fraglist GRO support
When forwarding TCP after GRO, software segmentation is very expensive,
especially when the checksum needs to be recalculated.
One case where that's currently unavoidable is when routing packets over
PPPoE. Performance improves significantly when using fraglist GRO
implemented in the same way as for UDP.
Here's a measurement of running 2 TCP streams through a MediaTek MT7622
device (2-core Cortex-A53), which runs NAT with flow offload enabled from
one ethernet port to PPPoE on another ethernet port + cake qdisc set to
1Gbps.
rx-gro-list off: 630 Mbit/s, CPU 35% idle
rx-gro-list on: 770 Mbit/s, CPU 40% idle
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -424,6 +424,7 @@ static inline __wsum ip6_gro_compute_pse
}
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
static inline void gro_normal_list(struct napi_struct *napi)
@@ -446,5 +447,48 @@ static inline void gro_normal_one(struct
gro_normal_list(napi);
}
+/* This function is the alternative of 'inet_iif' and 'inet_sdif'
+ * functions in case we can not rely on fields of IPCB.
+ *
+ * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
+ * The caller must hold the RCU read lock.
+ */
+static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
+{
+ *iif = inet_iif(skb) ?: skb->dev->ifindex;
+ *sdif = 0;
+
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (netif_is_l3_slave(skb->dev)) {
+ struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
+
+ *sdif = *iif;
+ *iif = master ? master->ifindex : 0;
+ }
+#endif
+}
+
+/* This function is the alternative of 'inet6_iif' and 'inet6_sdif'
+ * functions in case we can not rely on fields of IP6CB.
+ *
+ * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
+ * The caller must hold the RCU read lock.
+ */
+static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
+{
+ /* using skb->dev->ifindex because skb_dst(skb) is not initialized */
+ *iif = skb->dev->ifindex;
+ *sdif = 0;
+
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (netif_is_l3_slave(skb->dev)) {
+ struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
+
+ *sdif = *iif;
+ *iif = master ? master->ifindex : 0;
+ }
+#endif
+}
+
#endif /* _NET_IPV6_GRO_H */
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2057,7 +2057,10 @@ void tcp_v4_destroy_sock(struct sock *sk
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features);
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th);
INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -290,6 +290,33 @@ done:
return 0;
}
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+
+ /* sk ownership - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
+
static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -27,6 +27,68 @@ static void tcp_gso_tstamp(struct sk_buf
}
}
+static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
+ __be32 *oldip, __be32 *newip,
+ __be16 *oldport, __be16 *newport)
+{
+ struct tcphdr *th;
+ struct iphdr *iph;
+
+ if (*oldip == *newip && *oldport == *newport)
+ return;
+
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ inet_proto_csum_replace4(&th->check, seg, *oldip, *newip, true);
+ inet_proto_csum_replace2(&th->check, seg, *oldport, *newport, false);
+ *oldport = *newport;
+
+ csum_replace4(&iph->check, *oldip, *newip);
+ *oldip = *newip;
+}
+
+static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+ struct sk_buff *seg;
+ struct tcphdr *th, *th2;
+ struct iphdr *iph, *iph2;
+
+ seg = segs;
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+ th2 = tcp_hdr(seg->next);
+ iph2 = ip_hdr(seg->next);
+
+ if (!(*(u32 *)&th->source ^ *(u32 *)&th2->source) &&
+ iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
+ return segs;
+
+ while ((seg = seg->next)) {
+ th2 = tcp_hdr(seg);
+ iph2 = ip_hdr(seg);
+
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->saddr, &iph->saddr,
+ &th2->source, &th->source);
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->daddr, &iph->daddr,
+ &th2->dest, &th->dest);
+ }
+
+ return segs;
+}
+
+static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ return __tcpv4_gso_segment_list_csum(skb);
+}
+
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -36,6 +98,9 @@ static struct sk_buff *tcp4_gso_segment(
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
return ERR_PTR(-EINVAL);
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+ return __tcp4_gso_segment_list(skb, features);
+
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
@@ -177,61 +242,76 @@ out:
return segs;
}
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
{
- struct sk_buff *pp = NULL;
+ struct tcphdr *th2;
struct sk_buff *p;
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ th2 = tcp_hdr(p);
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ return p;
+ }
+
+ return NULL;
+}
+
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
+{
+ unsigned int thlen, hlen, off;
struct tcphdr *th;
- struct tcphdr *th2;
- unsigned int len;
- unsigned int thlen;
- __be32 flags;
- unsigned int mss = 1;
- unsigned int hlen;
- unsigned int off;
- int flush = 1;
- int i;
off = skb_gro_offset(skb);
hlen = off + sizeof(*th);
th = skb_gro_header(skb, hlen, off);
if (unlikely(!th))
- goto out;
+ return NULL;
thlen = th->doff * 4;
if (thlen < sizeof(*th))
- goto out;
+ return NULL;
hlen = off + thlen;
if (skb_gro_header_hard(skb, hlen)) {
th = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!th))
- goto out;
+ return NULL;
}
skb_gro_pull(skb, thlen);
- len = skb_gro_len(skb);
- flags = tcp_flag_word(th);
-
- list_for_each_entry(p, head, list) {
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
+ return th;
+}
- th2 = tcp_hdr(p);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ unsigned int thlen = th->doff * 4;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th2;
+ unsigned int len;
+ __be32 flags;
+ unsigned int mss = 1;
+ int flush = 1;
+ int i;
- if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
- goto found;
- }
- p = NULL;
- goto out_check_final;
+ p = tcp_gro_lookup(head, th);
+ if (!p)
+ goto out_check_final;
-found:
/* Include the IP ID check below from the inner most IP hdr */
+ th2 = tcp_hdr(p);
flush = NAPI_GRO_CB(p)->flush;
flush |= (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
@@ -268,6 +348,19 @@ found:
flush |= p->decrypted ^ skb->decrypted;
#endif
+ if (NAPI_GRO_CB(p)->is_flist) {
+ flush |= (__force int)(flags ^ tcp_flag_word(th2));
+ flush |= skb->ip_summed != p->ip_summed;
+ flush |= skb->csum_level != p->csum_level;
+ flush |= !pskb_may_pull(skb, skb_gro_offset(skb));
+ flush |= NAPI_GRO_CB(p)->count >= 64;
+
+ if (flush || skb_gro_receive_list(p, skb))
+ mss = 1;
+
+ goto out_check_final;
+ }
+
if (flush || skb_gro_receive(p, skb)) {
mss = 1;
goto out_check_final;
@@ -289,7 +382,6 @@ out_check_final:
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
pp = p;
-out:
NAPI_GRO_CB(skb)->flush |= (flush != 0);
return pp;
@@ -315,18 +407,56 @@ int tcp_gro_complete(struct sk_buff *skb
}
EXPORT_SYMBOL(tcp_gro_complete);
+static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ const struct iphdr *iph = skb_gro_network_header(skb);
+ struct net *net = dev_net(skb->dev);
+ struct sk_buff *p;
+ struct sock *sk;
+ int iif, sdif;
+
+ if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+ sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_put(sk);
+}
+
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
+ struct tcphdr *th;
+
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
- inet_gro_compute_pseudo)) {
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
- }
+ inet_gro_compute_pseudo))
+ goto flush;
+
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
- return tcp_gro_receive(head, skb);
+ tcp4_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
}
INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
@@ -334,6 +464,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_com
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
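
The cheap segmentation depends on __tcpv4_gso_segment_list_csum() above: when
NAT has rewritten the tuple of the GRO head, each trailing segment only has its
IP/TCP checksums patched incrementally (csum_replace4 and
inet_proto_csum_replace4 in the kernel) instead of being recomputed over the
payload. A small standalone illustration of that incremental update (RFC 1624),
using 16-bit words for simplicity rather than the kernel helpers:

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit accumulator into a 16-bit one's-complement checksum. */
static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Full checksum over an array of 16-bit words. */
static uint16_t csum16(const uint16_t *data, int words)
{
	uint32_t sum = 0;
	int i;

	for (i = 0; i < words; i++)
		sum += data[i];
	return csum_fold(sum);
}

/* RFC 1624 incremental update: HC' = ~(~HC + ~m + m'). */
static uint16_t csum_replace16(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	return csum_fold(sum);
}

int main(void)
{
	/* Toy "header": the last word stands in for a NAT-rewritten field. */
	uint16_t hdr[4] = { 0x4500, 0x0028, 0xc0a8, 0x0001 };
	uint16_t check = csum16(hdr, 4);
	uint16_t new_val = 0x00fe;

	check = csum_replace16(check, hdr[3], new_val);
	hdr[3] = new_val;

	/* Both values match (0xf930 for these inputs). */
	printf("incremental 0x%04x, recomputed 0x%04x\n", check, csum16(hdr, 4));
	return 0;
}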
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -425,33 +425,6 @@ out:
return segs;
}
-static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
- if (unlikely(p->len + skb->len >= 65536))
- return -E2BIG;
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
-
- skb_pull(skb, skb_gro_offset(skb));
-
- NAPI_GRO_CB(p)->last = skb;
- NAPI_GRO_CB(p)->count++;
- p->data_len += skb->len;
-
- /* sk ownership - if any - completely transferred to the aggregated packet */
- skb->destructor = NULL;
- skb->sk = NULL;
- p->truesize += skb->truesize;
- p->len += skb->len;
-
- NAPI_GRO_CB(skb)->same_flow = 1;
-
- return 0;
-}
-
#define UDP_GRO_CNT_MAX 64
static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -7,24 +7,65 @@
*/
#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
+#include <net/inet6_hashtables.h>
#include <net/gro.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
+static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const struct ipv6hdr *hdr = skb_gro_network_header(skb);
+ struct net *net = dev_net(skb->dev);
+ struct sk_buff *p;
+ struct sock *sk;
+ int iif, sdif;
+
+ if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet6_get_iif_sdif(skb, &iif, &sdif);
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+ &hdr->saddr, th->source,
+ &hdr->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_put(sk);
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
+
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
+ struct tcphdr *th;
+
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
- ip6_gro_compute_pseudo)) {
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
- }
+ ip6_gro_compute_pseudo))
+ goto flush;
- return tcp_gro_receive(head, skb);
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
+
+ tcp6_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
}
INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
@@ -32,6 +73,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
&iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
@@ -50,6 +100,9 @@ static struct sk_buff *tcp6_gso_segment(
if (!pskb_may_pull(skb, sizeof(*th)))
return ERR_PTR(-EINVAL);
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+ return skb_segment_list(skb, features, skb_mac_header_len(skb));
+
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
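
To summarize the flow added above: tcp4_gro_receive()/tcp6_gro_receive() pull
the TCP header, then tcp4_check_fraglist_gro()/tcp6_check_fraglist_gro() look
up an established local socket. If none is found, the flow is assumed to be
forwarded and is_flist is set, so further segments are chained onto the head
skb's frag_list by skb_gro_receive_list() without copying payload (capped at
64 KiB and 64 segments per aggregate). On the output side,
tcp4_gso_segment()/tcp6_gso_segment() see SKB_GSO_FRAGLIST and call
skb_segment_list(), which essentially turns each frag_list member back into a
separate packet and copies only the headers; that is what makes re-segmenting
forwarded traffic cheap compared to skb_segment() plus a full checksum
recalculation.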

View File

@ -17,7 +17,7 @@ Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2192,7 +2192,7 @@ struct net_device {
@@ -2190,7 +2190,7 @@ struct net_device {
#if IS_ENABLED(CONFIG_AX25)
void *ax25_ptr;
#endif

View File

@ -20,7 +20,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
/**
* napi_disable - prevent NAPI from scheduling
@@ -3152,6 +3153,7 @@ struct softnet_data {
@@ -3150,6 +3151,7 @@ struct softnet_data {
unsigned int processed;
unsigned int time_squeeze;
unsigned int received_rps;
@ -157,7 +157,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -11171,6 +11242,9 @@ static int dev_cpu_dead(unsigned int old
@@ -11126,6 +11197,9 @@ static int dev_cpu_dead(unsigned int old
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
@ -167,7 +167,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
#ifdef CONFIG_RPS
remsd = oldsd->rps_ipi_list;
oldsd->rps_ipi_list = NULL;
@@ -11483,6 +11557,7 @@ static int __init net_dev_init(void)
@@ -11438,6 +11512,7 @@ static int __init net_dev_init(void)
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
spin_lock_init(&sd->defer_lock);

View File

@ -1,151 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: net: replace GRO optimization patch with a new one that supports VLANs/bridges with different MAC addresses
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/linux/netdevice.h | 2 ++
include/linux/skbuff.h | 3 ++-
net/core/dev.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++
net/ethernet/eth.c | 18 +++++++++++++++++-
4 files changed, 69 insertions(+), 2 deletions(-)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2210,6 +2210,8 @@ struct net_device {
struct netdev_hw_addr_list mc;
struct netdev_hw_addr_list dev_addrs;
+ unsigned char local_addr_mask[MAX_ADDR_LEN];
+
#ifdef CONFIG_SYSFS
struct kset *queues_kset;
#endif
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -959,6 +959,7 @@ struct sk_buff {
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
+ __u8 gro_skip:1;
#if IS_ENABLED(CONFIG_IP_VS)
__u8 ipvs_property:1;
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -446,6 +446,9 @@ static enum gro_result dev_gro_receive(s
enum gro_result ret;
int same_flow;
+ if (skb->gro_skip)
+ goto normal;
+
if (netif_elide_gro(skb->dev))
goto normal;
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7689,6 +7689,48 @@ static void __netdev_adjacent_dev_unlink
&upper_dev->adj_list.lower);
}
+static void __netdev_addr_mask(unsigned char *mask, const unsigned char *addr,
+ struct net_device *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->addr_len; i++)
+ mask[i] |= addr[i] ^ dev->dev_addr[i];
+}
+
+static void __netdev_upper_mask(unsigned char *mask, struct net_device *dev,
+ struct net_device *lower)
+{
+ struct net_device *cur;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(dev, cur, iter) {
+ __netdev_addr_mask(mask, cur->dev_addr, lower);
+ __netdev_upper_mask(mask, cur, lower);
+ }
+}
+
+static void __netdev_update_addr_mask(struct net_device *dev)
+{
+ unsigned char mask[MAX_ADDR_LEN];
+ struct net_device *cur;
+ struct list_head *iter;
+
+ memset(mask, 0, sizeof(mask));
+ __netdev_upper_mask(mask, dev, dev);
+ memcpy(dev->local_addr_mask, mask, dev->addr_len);
+
+ netdev_for_each_lower_dev(dev, cur, iter)
+ __netdev_update_addr_mask(cur);
+}
+
+static void netdev_update_addr_mask(struct net_device *dev)
+{
+ rcu_read_lock();
+ __netdev_update_addr_mask(dev);
+ rcu_read_unlock();
+}
+
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info,
@@ -7740,6 +7782,7 @@ static int __netdev_upper_dev_link(struc
if (ret)
return ret;
+ netdev_update_addr_mask(dev);
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
ret = notifier_to_errno(ret);
@@ -7836,6 +7879,7 @@ static void __netdev_upper_dev_unlink(st
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+ netdev_update_addr_mask(dev);
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
@@ -8892,6 +8936,7 @@ int dev_set_mac_address(struct net_devic
return err;
}
dev->addr_assign_type = NET_ADDR_SET;
+ netdev_update_addr_mask(dev);
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -143,6 +143,18 @@ u32 eth_get_headlen(const struct net_dev
}
EXPORT_SYMBOL(eth_get_headlen);
+static inline bool
+eth_check_local_mask(const void *addr1, const void *addr2, const void *mask)
+{
+ const u16 *a1 = addr1;
+ const u16 *a2 = addr2;
+ const u16 *m = mask;
+
+ return (((a1[0] ^ a2[0]) & ~m[0]) |
+ ((a1[1] ^ a2[1]) & ~m[1]) |
+ ((a1[2] ^ a2[2]) & ~m[2]));
+}
+
/**
* eth_type_trans - determine the packet's protocol ID.
* @skb: received socket data
@@ -174,6 +186,10 @@ __be16 eth_type_trans(struct sk_buff *sk
} else {
skb->pkt_type = PACKET_OTHERHOST;
}
+
+ if (eth_check_local_mask(eth->h_dest, dev->dev_addr,
+ dev->local_addr_mask))
+ skb->gro_skip = 1;
}
/*

View File

@ -0,0 +1,510 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 23 Apr 2024 11:23:03 +0200
Subject: [PATCH] net: add TCP fraglist GRO support
When forwarding TCP after GRO, software segmentation is very expensive,
especially when the checksum needs to be recalculated.
One case where that's currently unavoidable is when routing packets over
PPPoE. Performance improves significantly when using fraglist GRO
implemented in the same way as for UDP.
Here's a measurement of running 2 TCP streams through a MediaTek MT7622
device (2-core Cortex-A53), which runs NAT with flow offload enabled from
one ethernet port to PPPoE on another ethernet port + cake qdisc set to
1Gbps.
rx-gro-list off: 630 Mbit/s, CPU 35% idle
rx-gro-list on: 770 Mbit/s, CPU 40% idle
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -430,6 +430,7 @@ static inline __wsum ip6_gro_compute_pse
}
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
static inline void gro_normal_list(struct napi_struct *napi)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2082,7 +2082,10 @@ void tcp_v4_destroy_sock(struct sock *sk
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features);
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th);
INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -233,6 +233,33 @@ done:
return 0;
}
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+
+ /* sk ownership - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
+
static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -28,6 +28,68 @@ static void tcp_gso_tstamp(struct sk_buf
}
}
+static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
+ __be32 *oldip, __be32 *newip,
+ __be16 *oldport, __be16 *newport)
+{
+ struct tcphdr *th;
+ struct iphdr *iph;
+
+ if (*oldip == *newip && *oldport == *newport)
+ return;
+
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ inet_proto_csum_replace4(&th->check, seg, *oldip, *newip, true);
+ inet_proto_csum_replace2(&th->check, seg, *oldport, *newport, false);
+ *oldport = *newport;
+
+ csum_replace4(&iph->check, *oldip, *newip);
+ *oldip = *newip;
+}
+
+static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+ struct sk_buff *seg;
+ struct tcphdr *th, *th2;
+ struct iphdr *iph, *iph2;
+
+ seg = segs;
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+ th2 = tcp_hdr(seg->next);
+ iph2 = ip_hdr(seg->next);
+
+ if (!(*(u32 *)&th->source ^ *(u32 *)&th2->source) &&
+ iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
+ return segs;
+
+ while ((seg = seg->next)) {
+ th2 = tcp_hdr(seg);
+ iph2 = ip_hdr(seg);
+
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->saddr, &iph->saddr,
+ &th2->source, &th->source);
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->daddr, &iph->daddr,
+ &th2->dest, &th->dest);
+ }
+
+ return segs;
+}
+
+static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ return __tcpv4_gso_segment_list_csum(skb);
+}
+
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -37,6 +99,9 @@ static struct sk_buff *tcp4_gso_segment(
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
return ERR_PTR(-EINVAL);
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+ return __tcp4_gso_segment_list(skb, features);
+
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
@@ -178,61 +243,76 @@ out:
return segs;
}
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
{
- struct sk_buff *pp = NULL;
+ struct tcphdr *th2;
struct sk_buff *p;
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ th2 = tcp_hdr(p);
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ return p;
+ }
+
+ return NULL;
+}
+
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
+{
+ unsigned int thlen, hlen, off;
struct tcphdr *th;
- struct tcphdr *th2;
- unsigned int len;
- unsigned int thlen;
- __be32 flags;
- unsigned int mss = 1;
- unsigned int hlen;
- unsigned int off;
- int flush = 1;
- int i;
off = skb_gro_offset(skb);
hlen = off + sizeof(*th);
th = skb_gro_header(skb, hlen, off);
if (unlikely(!th))
- goto out;
+ return NULL;
thlen = th->doff * 4;
if (thlen < sizeof(*th))
- goto out;
+ return NULL;
hlen = off + thlen;
if (skb_gro_header_hard(skb, hlen)) {
th = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!th))
- goto out;
+ return NULL;
}
skb_gro_pull(skb, thlen);
- len = skb_gro_len(skb);
- flags = tcp_flag_word(th);
-
- list_for_each_entry(p, head, list) {
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
+ return th;
+}
- th2 = tcp_hdr(p);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ unsigned int thlen = th->doff * 4;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th2;
+ unsigned int len;
+ __be32 flags;
+ unsigned int mss = 1;
+ int flush = 1;
+ int i;
- if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
- goto found;
- }
- p = NULL;
- goto out_check_final;
+ p = tcp_gro_lookup(head, th);
+ if (!p)
+ goto out_check_final;
-found:
/* Include the IP ID check below from the inner most IP hdr */
+ th2 = tcp_hdr(p);
flush = NAPI_GRO_CB(p)->flush;
flush |= (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
@@ -269,6 +349,19 @@ found:
flush |= p->decrypted ^ skb->decrypted;
#endif
+ if (NAPI_GRO_CB(p)->is_flist) {
+ flush |= (__force int)(flags ^ tcp_flag_word(th2));
+ flush |= skb->ip_summed != p->ip_summed;
+ flush |= skb->csum_level != p->csum_level;
+ flush |= !pskb_may_pull(skb, skb_gro_offset(skb));
+ flush |= NAPI_GRO_CB(p)->count >= 64;
+
+ if (flush || skb_gro_receive_list(p, skb))
+ mss = 1;
+
+ goto out_check_final;
+ }
+
if (flush || skb_gro_receive(p, skb)) {
mss = 1;
goto out_check_final;
@@ -290,7 +383,6 @@ out_check_final:
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
pp = p;
-out:
NAPI_GRO_CB(skb)->flush |= (flush != 0);
return pp;
@@ -314,18 +406,56 @@ void tcp_gro_complete(struct sk_buff *sk
}
EXPORT_SYMBOL(tcp_gro_complete);
+static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ const struct iphdr *iph = skb_gro_network_header(skb);
+ struct net *net = dev_net(skb->dev);
+ struct sk_buff *p;
+ struct sock *sk;
+ int iif, sdif;
+
+ if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+ sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_put(sk);
+}
+
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
+ struct tcphdr *th;
+
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
- inet_gro_compute_pseudo)) {
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
- }
+ inet_gro_compute_pseudo))
+ goto flush;
+
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
- return tcp_gro_receive(head, skb);
+ tcp4_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
}
INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
@@ -333,6 +463,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_com
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
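
One detail worth calling out in tcp_gro_lookup() above: source and destination
port are adjacent 16-bit fields at the start of the TCP header, so the single
expression *(u32 *)&th->source ^ *(u32 *)&th2->source compares both ports of
two packets at once. A standalone userspace sketch of the same idea (memcpy is
used here to sidestep type-punning; the kernel variant casts the skb data
directly):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* The first four bytes of a TCP header are the source and destination
 * ports, so one 32-bit value identifies both at once. */
struct ports {
	uint16_t source;
	uint16_t dest;
};

static uint32_t port_pair(const struct ports *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));
	return v;
}

int main(void)
{
	struct ports a = { .source = 0x1234, .dest = 0x0050 };
	struct ports b = a;
	struct ports c = { .source = 0x1234, .dest = 0x01bb };

	printf("a vs b: %s\n", (port_pair(&a) ^ port_pair(&b)) ? "different flow" : "same flow");
	printf("a vs c: %s\n", (port_pair(&a) ^ port_pair(&c)) ? "different flow" : "same flow");
	return 0;
}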
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -433,33 +433,6 @@ out:
return segs;
}
-static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
- if (unlikely(p->len + skb->len >= 65536))
- return -E2BIG;
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
-
- skb_pull(skb, skb_gro_offset(skb));
-
- NAPI_GRO_CB(p)->last = skb;
- NAPI_GRO_CB(p)->count++;
- p->data_len += skb->len;
-
- /* sk ownership - if any - completely transferred to the aggregated packet */
- skb->destructor = NULL;
- skb->sk = NULL;
- p->truesize += skb->truesize;
- p->len += skb->len;
-
- NAPI_GRO_CB(skb)->same_flow = 1;
-
- return 0;
-}
-
#define UDP_GRO_CNT_MAX 64
static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -7,24 +7,65 @@
*/
#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
+#include <net/inet6_hashtables.h>
#include <net/gro.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
+static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const struct ipv6hdr *hdr = skb_gro_network_header(skb);
+ struct net *net = dev_net(skb->dev);
+ struct sk_buff *p;
+ struct sock *sk;
+ int iif, sdif;
+
+ if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet6_get_iif_sdif(skb, &iif, &sdif);
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+ &hdr->saddr, th->source,
+ &hdr->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_put(sk);
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
+
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
+ struct tcphdr *th;
+
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
- ip6_gro_compute_pseudo)) {
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
- }
+ ip6_gro_compute_pseudo))
+ goto flush;
- return tcp_gro_receive(head, skb);
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
+
+ tcp6_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
}
INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
@@ -32,6 +73,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
&iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
@@ -51,6 +101,9 @@ static struct sk_buff *tcp6_gso_segment(
if (!pskb_may_pull(skb, sizeof(*th)))
return ERR_PTR(-EINVAL);
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+ return skb_segment_list(skb, features, skb_mac_header_len(skb));
+
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);

View File

@ -17,7 +17,7 @@ Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2245,7 +2245,7 @@ struct net_device {
@@ -2243,7 +2243,7 @@ struct net_device {
#if IS_ENABLED(CONFIG_AX25)
void *ax25_ptr;
#endif

View File

@ -20,7 +20,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
/**
* napi_disable - prevent NAPI from scheduling
@@ -3238,6 +3239,7 @@ struct softnet_data {
@@ -3236,6 +3237,7 @@ struct softnet_data {
/* stats */
unsigned int processed;
unsigned int time_squeeze;
@ -157,7 +157,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -11351,6 +11422,9 @@ static int dev_cpu_dead(unsigned int old
@@ -11306,6 +11377,9 @@ static int dev_cpu_dead(unsigned int old
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
@ -167,7 +167,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
#ifdef CONFIG_RPS
remsd = oldsd->rps_ipi_list;
oldsd->rps_ipi_list = NULL;
@@ -11666,6 +11740,7 @@ static int __init net_dev_init(void)
@@ -11621,6 +11695,7 @@ static int __init net_dev_init(void)
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
spin_lock_init(&sd->defer_lock);