kernel: backport the upstream implementation of threaded NAPI to 5.4

The workqueue based implementation has a few corner cases and typically lower
performance than the upstream one

Signed-off-by: Felix Fietkau <nbd@nbd.name>
This commit is contained in:
Felix Fietkau 2021-12-13 18:15:07 +01:00
parent c772783394
commit 01bebc070c
12 changed files with 692 additions and 375 deletions

View File

@ -0,0 +1,88 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 8 Feb 2021 11:34:08 -0800
Subject: [PATCH] net: extract napi poll functionality to __napi_poll()
This commit introduces a new function __napi_poll() which does the main
logic of the existing napi_poll() function, and will be called by other
functions in later commits.
This idea and implementation is done by Felix Fietkau <nbd@nbd.name> and
is proposed as part of the patch to move napi work to work_queue
context.
This commit by itself is a code restructure.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6322,15 +6322,10 @@ void netif_napi_del(struct napi_struct *
}
EXPORT_SYMBOL(netif_napi_del);
-static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+static int __napi_poll(struct napi_struct *n, bool *repoll)
{
- void *have;
int work, weight;
- list_del_init(&n->poll_list);
-
- have = netpoll_poll_lock(n);
-
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
@@ -6348,7 +6343,7 @@ static int napi_poll(struct napi_struct
WARN_ON_ONCE(work > weight);
if (likely(work < weight))
- goto out_unlock;
+ return work;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
@@ -6357,7 +6352,7 @@ static int napi_poll(struct napi_struct
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
- goto out_unlock;
+ return work;
}
if (n->gro_bitmask) {
@@ -6375,12 +6370,29 @@ static int napi_poll(struct napi_struct
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
- goto out_unlock;
+ return work;
}
- list_add_tail(&n->poll_list, repoll);
+ *repoll = true;
+
+ return work;
+}
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+ bool do_repoll = false;
+ void *have;
+ int work;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ work = __napi_poll(n, &do_repoll);
+
+ if (do_repoll)
+ list_add_tail(&n->poll_list, repoll);
-out_unlock:
netpoll_poll_unlock(have);
return work;

View File

@ -0,0 +1,261 @@
From: Wei Wang <weiwan@google.com>
Date: Mon, 8 Feb 2021 11:34:09 -0800
Subject: [PATCH] net: implement threaded-able napi poll loop support
This patch allows running each napi poll loop inside its own
kernel thread.
The kthread is created during netif_napi_add() if dev->threaded
is set. And threaded mode is enabled in napi_enable(). We will
provide a way to set dev->threaded and enable threaded mode
without a device up/down in the following patch.
Once that threaded mode is enabled and the kthread is
started, napi_schedule() will wake-up such thread instead
of scheduling the softirq.
The threaded poll loop behaves quite likely the net_rx_action,
but it does not have to manipulate local irqs and uses
an explicit scheduling point based on netdev_budget.
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -340,6 +340,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
+ struct task_struct *thread;
};
enum {
@@ -350,6 +351,7 @@ enum {
NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
};
enum {
@@ -360,6 +362,7 @@ enum {
NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
};
enum gro_result {
@@ -504,20 +507,7 @@ bool napi_hash_del(struct napi_struct *n
*/
void napi_disable(struct napi_struct *n);
-/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
- *
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
- */
-static inline void napi_enable(struct napi_struct *n)
-{
- BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
- smp_mb__before_atomic();
- clear_bit(NAPI_STATE_SCHED, &n->state);
- clear_bit(NAPI_STATE_NPSVC, &n->state);
-}
+void napi_enable(struct napi_struct *n);
/**
* napi_synchronize - wait until NAPI is not running
@@ -1783,6 +1773,8 @@ enum netdev_ml_priv_type {
*
* @wol_enabled: Wake-on-LAN is enabled
*
+ * @threaded: napi threaded mode is enabled
+ *
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
@@ -2075,6 +2067,7 @@ struct net_device {
struct lock_class_key addr_list_lock_key;
bool proto_down;
unsigned wol_enabled:1;
+ unsigned threaded:1;
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -91,6 +91,7 @@
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
+#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
@@ -1289,6 +1290,27 @@ void netdev_notify_peers(struct net_devi
}
EXPORT_SYMBOL(netdev_notify_peers);
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+ int err = 0;
+
+ /* Create and wake up the kthread once to put it in
+ * TASK_INTERRUPTIBLE mode to avoid the blocked task
+ * warning and work with loadavg.
+ */
+ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+ n->dev->name, n->napi_id);
+ if (IS_ERR(n->thread)) {
+ err = PTR_ERR(n->thread);
+ pr_err("kthread_run failed with err %d\n", err);
+ n->thread = NULL;
+ }
+
+ return err;
+}
+
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -3885,6 +3907,21 @@ int gro_normal_batch __read_mostly = 8;
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
+ struct task_struct *thread;
+
+ if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+ /* Paired with smp_mb__before_atomic() in
+ * napi_enable(). Use READ_ONCE() to guarantee
+ * a complete read on napi->thread. Only call
+ * wake_up_process() when it's not NULL.
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
+ wake_up_process(thread);
+ return;
+ }
+ }
+
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
@@ -6276,6 +6313,12 @@ void netif_napi_add(struct net_device *d
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
napi_hash_add(napi);
+ /* Create kthread for this napi if dev->threaded is set.
+ * Clear dev->threaded if kthread creation failed so that
+ * threaded mode will not be enabled in napi_enable().
+ */
+ if (dev->threaded && napi_kthread_create(napi))
+ dev->threaded = 0;
}
EXPORT_SYMBOL(netif_napi_add);
@@ -6292,9 +6335,28 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
clear_bit(NAPI_STATE_DISABLE, &n->state);
+ clear_bit(NAPI_STATE_THREADED, &n->state);
}
EXPORT_SYMBOL(napi_disable);
+/**
+ * napi_enable - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Resume NAPI from being scheduled on this context.
+ * Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+ smp_mb__before_atomic();
+ clear_bit(NAPI_STATE_SCHED, &n->state);
+ clear_bit(NAPI_STATE_NPSVC, &n->state);
+ if (n->dev->threaded && n->thread)
+ set_bit(NAPI_STATE_THREADED, &n->state);
+}
+EXPORT_SYMBOL(napi_enable);
+
static void flush_gro_hash(struct napi_struct *napi)
{
int i;
@@ -6319,6 +6381,11 @@ void netif_napi_del(struct napi_struct *
flush_gro_hash(napi);
napi->gro_bitmask = 0;
+
+ if (napi->thread) {
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+ }
}
EXPORT_SYMBOL(netif_napi_del);
@@ -6398,6 +6465,51 @@ static int napi_poll(struct napi_struct
return work;
}
+static int napi_thread_wait(struct napi_struct *napi)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+ if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ WARN_ON(!list_empty(&napi->poll_list));
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
+ void *have;
+
+ while (!napi_thread_wait(napi)) {
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
+
+ __kfree_skb_flush();
+ local_bh_enable();
+
+ if (!repoll)
+ break;
+
+ cond_resched();
+ }
+ }
+ return 0;
+}
+
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);

View File

@ -0,0 +1,177 @@
From: Wei Wang <weiwan@google.com>
Date: Mon, 8 Feb 2021 11:34:10 -0800
Subject: [PATCH] net: add sysfs attribute to control napi threaded mode
This patch adds a new sysfs attribute to the network device class.
Said attribute provides a per-device control to enable/disable the
threaded mode for all the napi instances of the given network device,
without the need for a device up/down.
User sets it to 1 or 0 to enable or disable threaded mode.
Note: when switching between threaded and the current softirq based mode
for a napi instance, it will not immediately take effect if the napi is
currently being polled. The mode switch will happen for the next time
napi_schedule() is called.
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -301,3 +301,18 @@ Contact: netdev@vger.kernel.org
Description:
32-bit unsigned integer counting the number of times the link has
been down
+
+What: /sys/class/net/<iface>/threaded
+Date: Jan 2021
+KernelVersion: 5.12
+Contact: netdev@vger.kernel.org
+Description:
+ Boolean value to control the threaded mode per device. User could
+ set this value to enable/disable threaded mode for all napi
+ belonging to this device, without the need to do device up/down.
+
+ Possible values:
+ == ==================================
+ 0 threaded mode disabled for this dev
+ 1 threaded mode enabled for this dev
+ == ==================================
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -498,6 +498,8 @@ static inline bool napi_complete(struct
*/
bool napi_hash_del(struct napi_struct *napi);
+int dev_set_threaded(struct net_device *dev, bool threaded);
+
/**
* napi_disable - prevent NAPI from scheduling
* @n: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3911,8 +3911,9 @@ static inline void ____napi_schedule(str
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
/* Paired with smp_mb__before_atomic() in
- * napi_enable(). Use READ_ONCE() to guarantee
- * a complete read on napi->thread. Only call
+ * napi_enable()/dev_set_threaded().
+ * Use READ_ONCE() to guarantee a complete
+ * read on napi->thread. Only call
* wake_up_process() when it's not NULL.
*/
thread = READ_ONCE(napi->thread);
@@ -6290,6 +6291,49 @@ static void init_gro_hash(struct napi_st
napi->gro_bitmask = 0;
}
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+ struct napi_struct *napi;
+ int err = 0;
+
+ if (dev->threaded == threaded)
+ return 0;
+
+ if (threaded) {
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (!napi->thread) {
+ err = napi_kthread_create(napi);
+ if (err) {
+ threaded = false;
+ break;
+ }
+ }
+ }
+ }
+
+ dev->threaded = threaded;
+
+ /* Make sure kthread is created before THREADED bit
+ * is set.
+ */
+ smp_mb__before_atomic();
+
+ /* Setting/unsetting threaded mode on a napi might not immediately
+ * take effect, if the current napi instance is actively being
+ * polled. In this case, the switch between threaded mode and
+ * softirq mode will happen in the next round of napi_schedule().
+ * This should not cause hiccups/stalls to the live traffic.
+ */
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (threaded)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ else
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
+ }
+
+ return err;
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -557,6 +557,45 @@ static ssize_t phys_switch_id_show(struc
}
static DEVICE_ATTR_RO(phys_switch_id);
+static ssize_t threaded_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev))
+ ret = sprintf(buf, fmt_dec, netdev->threaded);
+
+ rtnl_unlock();
+ return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ int ret;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ if (val != 0 && val != 1)
+ return -EOPNOTSUPP;
+
+ ret = dev_set_threaded(dev, val);
+
+ return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
@@ -587,6 +626,7 @@ static struct attribute *net_class_attrs
&dev_attr_proto_down.attr,
&dev_attr_carrier_up_count.attr,
&dev_attr_carrier_down_count.attr,
+ &dev_attr_threaded.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);

View File

@ -0,0 +1,93 @@
From: Wei Wang <weiwan@google.com>
Date: Mon, 1 Mar 2021 17:21:13 -0800
Subject: [PATCH] net: fix race between napi kthread mode and busy poll
Currently, napi_thread_wait() checks for NAPI_STATE_SCHED bit to
determine if the kthread owns this napi and could call napi->poll() on
it. However, if socket busy poll is enabled, it is possible that the
busy poll thread grabs this SCHED bit (after the previous napi->poll()
invokes napi_complete_done() and clears SCHED bit) and tries to poll
on the same napi. napi_disable() could grab the SCHED bit as well.
This patch tries to fix this race by adding a new bit
NAPI_STATE_SCHED_THREADED in napi->state. This bit gets set in
____napi_schedule() if the threaded mode is enabled, and gets cleared
in napi_complete_done(), and we only poll the napi in kthread if this
bit is set. This helps distinguish the ownership of the napi between
kthread and other scenarios and fixes the race issue.
Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support")
Reported-by: Martin Zaharinov <micron10@gmail.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Cc: Alexander Duyck <alexanderduyck@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -352,6 +352,7 @@ enum {
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
+ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */
};
enum {
@@ -363,6 +364,7 @@ enum {
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
+ NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED),
};
enum gro_result {
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3918,6 +3918,8 @@ static inline void ____napi_schedule(str
*/
thread = READ_ONCE(napi->thread);
if (thread) {
+ if (thread->state != TASK_INTERRUPTIBLE)
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
wake_up_process(thread);
return;
}
@@ -6078,7 +6080,8 @@ bool napi_complete_done(struct napi_stru
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_SCHED_THREADED);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@@ -6511,16 +6514,25 @@ static int napi_poll(struct napi_struct
static int napi_thread_wait(struct napi_struct *napi)
{
+ bool woken = false;
+
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop() && !napi_disable_pending(napi)) {
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ /* Testing SCHED_THREADED bit here to make sure the current
+ * kthread owns this napi and could poll on this napi.
+ * Testing SCHED bit is not enough because SCHED bit might be
+ * set by some other busy poll thread or by napi_disable().
+ */
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
+ /* woken being true indicates this thread owns this napi. */
+ woken = true;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);

View File

@ -0,0 +1,53 @@
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 9 Apr 2021 17:24:17 +0200
Subject: [PATCH] net: fix hangup on napi_disable for threaded napi
napi_disable() is subject to an hangup, when the threaded
mode is enabled and the napi is under heavy traffic.
If the relevant napi has been scheduled and the napi_disable()
kicks in before the next napi_threaded_wait() completes - so
that the latter quits due to the napi_disable_pending() condition,
the existing code leaves the NAPI_STATE_SCHED bit set and the
napi_disable() loop waiting for such bit will hang.
This patch addresses the issue by dropping the NAPI_STATE_DISABLE
bit test in napi_thread_wait(). The later napi_threaded_poll()
iteration will take care of clearing the NAPI_STATE_SCHED.
This also addresses a related problem reported by Jakub:
before this patch a napi_disable()/napi_enable() pair killed
the napi thread, effectively disabling the threaded mode.
On the patched kernel napi_disable() simply stops scheduling
the relevant thread.
v1 -> v2:
- let the main napi_thread_poll() loop clear the SCHED bit
Reported-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/883923fa22745a9589e8610962b7dc59df09fb1f.1617981844.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6518,7 +6518,7 @@ static int napi_thread_wait(struct napi_
set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+ while (!kthread_should_stop()) {
/* Testing SCHED_THREADED bit here to make sure the current
* kthread owns this napi and could poll on this napi.
* Testing SCHED bit is not enough because SCHED bit might be
@@ -6536,6 +6536,7 @@ static int napi_thread_wait(struct napi_
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
+
return -1;
}

View File

@ -66,7 +66,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5432,8 +5432,7 @@ static inline void skb_gro_reset_offset(
@@ -5472,8 +5472,7 @@ static inline void skb_gro_reset_offset(
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;

View File

@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1549,6 +1549,7 @@ enum netdev_priv_flags {
@@ -1540,6 +1540,7 @@ enum netdev_priv_flags {
IFF_FAILOVER_SLAVE = 1<<28,
IFF_L3MDEV_RX_HANDLER = 1<<29,
IFF_LIVE_RENAME_OK = 1<<30,
@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
@@ -1581,6 +1582,7 @@ enum netdev_priv_flags {
@@ -1572,6 +1573,7 @@ enum netdev_priv_flags {
#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE
#define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER
#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK
@ -31,7 +31,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
/* Specifies the type of the struct net_device::ml_priv pointer */
enum netdev_ml_priv_type {
@@ -1889,6 +1891,11 @@ struct net_device {
@@ -1882,6 +1884,11 @@ struct net_device {
const struct tlsdev_ops *tlsdev_ops;
#endif
@ -43,7 +43,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
const struct header_ops *header_ops;
unsigned int flags;
@@ -1971,6 +1978,10 @@ struct net_device {
@@ -1964,6 +1971,10 @@ struct net_device {
struct mpls_dev __rcu *mpls_ptr;
#endif
@ -101,7 +101,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
help
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3200,10 +3200,20 @@ static int xmit_one(struct sk_buff *skb,
@@ -3221,10 +3221,20 @@ static int xmit_one(struct sk_buff *skb,
if (dev_nit_active(dev))
dev_queue_xmit_nit(skb, dev);

View File

@ -23,7 +23,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -928,6 +928,13 @@ struct devlink;
@@ -922,6 +922,13 @@ struct devlink;
struct tlsdev_ops;
@ -37,7 +37,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
/*
* This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are
@@ -1160,6 +1167,10 @@ struct tlsdev_ops;
@@ -1154,6 +1161,10 @@ struct tlsdev_ops;
* int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
* u16 flags);
*
@ -48,7 +48,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
* int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
* Called to change device carrier. Soft-devices (like dummy, team, etc)
* which do not represent real hardware may define this to allow their
@@ -1407,6 +1418,8 @@ struct net_device_ops {
@@ -1401,6 +1412,8 @@ struct net_device_ops {
int (*ndo_bridge_dellink)(struct net_device *dev,
struct nlmsghdr *nlh,
u16 flags);

View File

@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -929,6 +929,7 @@ struct tlsdev_ops;
@@ -923,6 +923,7 @@ struct tlsdev_ops;
struct flow_offload;
@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
enum flow_offload_type {
FLOW_OFFLOAD_ADD = 0,
@@ -1167,8 +1168,15 @@ enum flow_offload_type {
@@ -1161,8 +1162,15 @@ enum flow_offload_type {
* int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
* u16 flags);
*
@ -40,7 +40,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
* Adds/deletes flow entry to/from net device flowtable.
*
* int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
@@ -1418,8 +1426,11 @@ struct net_device_ops {
@@ -1412,8 +1420,11 @@ struct net_device_ops {
int (*ndo_bridge_dellink)(struct net_device *dev,
struct nlmsghdr *nlh,
u16 flags);

View File

@ -11,7 +11,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1931,6 +1931,8 @@ struct net_device {
@@ -1927,6 +1927,8 @@ struct net_device {
struct netdev_hw_addr_list mc;
struct netdev_hw_addr_list dev_addrs;
@ -32,7 +32,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
__u16 tc_index; /* traffic control index */
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5498,6 +5498,9 @@ static enum gro_result dev_gro_receive(s
@@ -5538,6 +5538,9 @@ static enum gro_result dev_gro_receive(s
int same_flow;
int grow;
@ -42,7 +42,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
if (netif_elide_gro(skb->dev))
goto normal;
@@ -7300,6 +7303,48 @@ static void __netdev_adjacent_dev_unlink
@@ -7481,6 +7484,48 @@ static void __netdev_adjacent_dev_unlink
&upper_dev->adj_list.lower);
}
@ -91,7 +91,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info,
@@ -7350,6 +7395,7 @@ static int __netdev_upper_dev_link(struc
@@ -7531,6 +7576,7 @@ static int __netdev_upper_dev_link(struc
if (ret)
return ret;
@ -99,7 +99,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
ret = notifier_to_errno(ret);
@@ -7443,6 +7489,7 @@ void netdev_upper_dev_unlink(struct net_
@@ -7624,6 +7670,7 @@ void netdev_upper_dev_unlink(struct net_
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
@ -107,7 +107,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
@@ -8173,6 +8220,7 @@ int dev_set_mac_address(struct net_devic
@@ -8354,6 +8401,7 @@ int dev_set_mac_address(struct net_devic
if (err)
return err;
dev->addr_assign_type = NET_ADDR_SET;

View File

@ -1,355 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 26 Jul 2020 14:03:21 +0200
Subject: [PATCH] net: add support for threaded NAPI polling
For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd and some idle ones.
Introduce threaded NAPI for such drivers based on a workqueue. The API is the
same except for using netif_threaded_napi_add instead of netif_napi_add.
In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
thread.
With threaded NAPI it seems stable and consistent (and higher than the best
results I got without it).
Based on a patch by Hillf Danton
Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -340,6 +340,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
+ struct work_struct work;
};
enum {
@@ -350,6 +351,7 @@ enum {
NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_THREADED, /* Use threaded NAPI */
};
enum {
@@ -360,6 +362,7 @@ enum {
NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
};
enum gro_result {
@@ -2101,6 +2104,7 @@ struct net_device {
struct lock_class_key addr_list_lock_key;
bool proto_down;
unsigned wol_enabled:1;
+ unsigned threaded:1;
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
@@ -2281,6 +2285,26 @@ void netif_napi_add(struct net_device *d
int (*poll)(struct napi_struct *, int), int weight);
/**
+ * netif_threaded_napi_add - initialize a NAPI context
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU intensive poll functions.
+ * This will schedule polling from a high priority workqueue
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ netif_napi_add(dev, napi, poll, weight);
+}
+
+/**
* netif_tx_napi_add - initialize a NAPI context
* @dev: network device
* @napi: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -156,6 +156,7 @@ static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
@@ -5940,6 +5941,11 @@ void __napi_schedule(struct napi_struct
{
unsigned long flags;
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+ queue_work(napi_workq, &n->work);
+ return;
+ }
+
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
@@ -5991,6 +5997,10 @@ EXPORT_SYMBOL(napi_schedule_prep);
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+ queue_work(napi_workq, &n->work);
+ return;
+ }
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
____napi_schedule(this_cpu_ptr(&softnet_data), n);
else
@@ -6255,9 +6265,89 @@ static void init_gro_hash(struct napi_st
napi->gro_bitmask = 0;
}
+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+ int work, weight;
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n, work, weight);
+ }
+
+ WARN_ON_ONCE(work > weight);
+
+ if (likely(work < weight))
+ return work;
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ return work;
+ }
+
+ if (n->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(n, HZ >= 1000);
+ }
+
+ gro_normal_list(n);
+
+ *repoll = true;
+
+ return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+ struct napi_struct *n = container_of(work, struct napi_struct, work);
+ void *have;
+
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(n);
+ __napi_poll(n, &repoll);
+ netpoll_poll_unlock(have);
+
+ local_bh_enable();
+
+ if (!repoll)
+ return;
+
+ if (!need_resched())
+ continue;
+
+ /*
+ * have to pay for the latency of task switch even if
+ * napi is scheduled
+ */
+ queue_work(napi_workq, work);
+ return;
+ }
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
+ if (dev->threaded)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
INIT_LIST_HEAD(&napi->poll_list);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
@@ -6274,6 +6364,7 @@ void netif_napi_add(struct net_device *d
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ INIT_WORK(&napi->work, napi_workfn);
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
@@ -6314,6 +6405,7 @@ static void flush_gro_hash(struct napi_s
void netif_napi_del(struct napi_struct *napi)
{
might_sleep();
+ cancel_work_sync(&napi->work);
if (napi_hash_del(napi))
synchronize_net();
list_del_init(&napi->dev_list);
@@ -6326,50 +6418,18 @@ EXPORT_SYMBOL(netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
+ bool do_repoll = false;
void *have;
- int work, weight;
+ int work;
list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
- weight = n->weight;
-
- /* This NAPI_STATE_SCHED test is for avoiding a race
- * with netpoll's poll_napi(). Only the entity which
- * obtains the lock and sees NAPI_STATE_SCHED set will
- * actually make the ->poll() call. Therefore we avoid
- * accidentally calling ->poll() when NAPI is not scheduled.
- */
- work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- work = n->poll(n, weight);
- trace_napi_poll(n, work, weight);
- }
-
- WARN_ON_ONCE(work > weight);
+ work = __napi_poll(n, &do_repoll);
- if (likely(work < weight))
- goto out_unlock;
-
- /* Drivers must not modify the NAPI state if they
- * consume the entire weight. In such cases this code
- * still "owns" the NAPI instance and therefore can
- * move the instance around on the list at-will.
- */
- if (unlikely(napi_disable_pending(n))) {
- napi_complete(n);
+ if (!do_repoll)
goto out_unlock;
- }
-
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
- }
-
- gro_normal_list(n);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
@@ -10349,6 +10409,10 @@ static int __init net_dev_init(void)
sd->backlog.weight = weight_p;
}
+ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
+ WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
+ BUG_ON(!napi_workq);
+
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -470,6 +470,52 @@ static ssize_t proto_down_store(struct d
}
NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ struct napi_struct *napi;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (val)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ else
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
+ }
+
+ return 0;
+}
+
+static ssize_t napi_threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_napi_threaded);
+}
+
+static ssize_t napi_threaded_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct napi_struct *napi;
+ bool enabled = false;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+ if (test_bit(NAPI_STATE_THREADED, &napi->state))
+ enabled = true;
+ }
+
+ rtnl_unlock();
+
+ return sprintf(buf, fmt_dec, enabled);
+}
+static DEVICE_ATTR_RW(napi_threaded);
+
static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -581,6 +627,7 @@ static struct attribute *net_class_attrs
&dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
+ &dev_attr_napi_threaded.attr,
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,

View File

@ -24,7 +24,7 @@ Reviewed-by: Grant Grundler <grundler@chromium.org>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -776,6 +776,16 @@ struct xps_map {
@@ -767,6 +767,16 @@ struct xps_map {
#define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \
- sizeof(struct xps_map)) / sizeof(u16))
@ -41,7 +41,7 @@ Reviewed-by: Grant Grundler <grundler@chromium.org>
/*
* This structure holds all XPS maps for device. Maps are indexed by CPU.
*/
@@ -1379,6 +1389,9 @@ struct net_device_ops {
@@ -1370,6 +1380,9 @@ struct net_device_ops {
const struct sk_buff *skb,
u16 rxq_index,
u32 flow_id);