Merge branch 'bridge-mdb-events'
Tobias Waldekranz says:

====================
net: bridge: switchdev: Ensure MDB events are delivered exactly once

When a device is attached to a bridge, drivers will request a replay of
objects that were created before the device joined the bridge and that
are still of interest to the joining port. Typical examples include FDB
entries and MDB memberships on other ports ("foreign interfaces") or on
the bridge itself.
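
For illustration, a minimal sketch of the driver side of that handshake
follows. The example_* structures, notifier blocks and join handler are
hypothetical; switchdev_bridge_port_offload() is the real helper that
asks the bridge to replay pre-existing objects:

#include <net/switchdev.h>

/* Hypothetical driver-private data for one front-panel port. */
struct example_port {
	struct net_device *netdev;
};

/* Hypothetical notifier blocks, registered elsewhere with
 * register_switchdev_notifier() / register_switchdev_blocking_notifier().
 */
extern struct notifier_block example_switchdev_nb;
extern struct notifier_block example_switchdev_blocking_nb;

/* Called when the port is enslaved to a bridge. The offload call makes
 * the bridge replay FDB/MDB/VLAN objects that already exist, so the
 * hardware can catch up with the bridge's current state.
 */
static int example_port_bridge_join(struct example_port *port,
				    struct net_device *bridge_dev,
				    struct netlink_ext_ack *extack)
{
	return switchdev_bridge_port_offload(port->netdev, port->netdev,
					     port, &example_switchdev_nb,
					     &example_switchdev_blocking_nb,
					     false, extack);
}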

Conversely, when a device is detached, the bridge will synthesize
deletion events for all those objects that are still live but no longer
applicable to the device in question.
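
The leave path is the mirror image. Under the same hypothetical naming,
a sketch of the driver's leave handler, built on the real
switchdev_bridge_port_unoffload() helper, which is what prompts the
bridge to synthesize those deletions:

/* Called when the port leaves the bridge (or leaves the intermediate
 * master it was bridged through). Remaining objects are torn down via
 * synthesized deletion events.
 */
static void example_port_bridge_leave(struct example_port *port)
{
	switchdev_bridge_port_unoffload(port->netdev, port,
					&example_switchdev_nb,
					&example_switchdev_blocking_nb);
}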

This series eliminates two races related to the sync and unsync phases
of a bridge's MDB with a joining or leaving device, which could cause
notifications of such objects to be either delivered twice (1/2) or not
at all (2/2).

A race similar to the one solved by 1/2 remains for the FDB. It is much
harder to solve, due to the lockless operation of the FDB's rhashtable,
and is therefore knowingly left out of this series.

v1 -> v2:
- Squash the previously separate addition of
  switchdev_port_obj_act_is_deferred into first consumer.
- Use ether_addr_equal to compare MAC addresses.
- Document switchdev_port_obj_act_is_deferred (renamed from
  switchdev_port_obj_is_deferred in v1, to indicate that we also match
  on the action).
- Delay allocations of MDB objects until we know they're needed.
- Use non-RCU version of the hash list iterator, now that the MDB is
  not scanned while holding the RCU read lock.
- Add Fixes tag to commit message

v2 -> v3:
- Fix unlocking in error paths
- Access RCU protected port list via mlock_dereference, since MDB is
  guaranteed to remain constant for the duration of the scan.

v3 -> v4:
- Limit the search for existing deferred events in 1/2 to additions
  only, since the problem does not exist in the deletion case.
- Add 2/2, to plug a related race when unoffloading an indirectly
  associated device.

v4 -> v5:
- Fix grammatical errors in kerneldoc of
  switchdev_port_obj_act_is_deferred
====================

Signed-off-by: David S. Miller <[email protected]>
davem330 committed Feb 16, 2024
2 parents b4ea9b6 + f7a70d6 commit 82a678e
Showing 3 changed files with 132 additions and 28 deletions.
3 changes: 3 additions & 0 deletions include/net/switchdev.h
@@ -308,6 +308,9 @@ void switchdev_deferred_process(void);
 int switchdev_port_attr_set(struct net_device *dev,
 			    const struct switchdev_attr *attr,
 			    struct netlink_ext_ack *extack);
+bool switchdev_port_obj_act_is_deferred(struct net_device *dev,
+					enum switchdev_notifier_type nt,
+					const struct switchdev_obj *obj);
 int switchdev_port_obj_add(struct net_device *dev,
 			   const struct switchdev_obj *obj,
 			   struct netlink_ext_ack *extack);
84 changes: 56 additions & 28 deletions net/bridge/br_switchdev.c
@@ -595,21 +595,40 @@ br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
 }
 
 static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
+				      struct net_device *dev,
+				      unsigned long action,
 				      enum switchdev_obj_id id,
 				      const struct net_bridge_mdb_entry *mp,
 				      struct net_device *orig_dev)
 {
-	struct switchdev_obj_port_mdb *mdb;
+	struct switchdev_obj_port_mdb mdb = {
+		.obj = {
+			.id = id,
+			.orig_dev = orig_dev,
+		},
+	};
+	struct switchdev_obj_port_mdb *pmdb;
 
-	mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC);
-	if (!mdb)
-		return -ENOMEM;
+	br_switchdev_mdb_populate(&mdb, mp);
 
-	mdb->obj.id = id;
-	mdb->obj.orig_dev = orig_dev;
-	br_switchdev_mdb_populate(mdb, mp);
-	list_add_tail(&mdb->obj.list, mdb_list);
+	if (action == SWITCHDEV_PORT_OBJ_ADD &&
+	    switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) {
+		/* This event is already in the deferred queue of
+		 * events, so this replay must be elided, lest the
+		 * driver receives duplicate events for it. This can
+		 * only happen when replaying additions, since
+		 * modifications are always immediately visible in
+		 * br->mdb_list, whereas actual event delivery may be
+		 * delayed.
+		 */
+		return 0;
+	}
+
+	pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC);
+	if (!pmdb)
+		return -ENOMEM;
 
+	list_add_tail(&pmdb->obj.list, mdb_list);
 	return 0;
 }
 
@@ -677,51 +696,50 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
 	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		return 0;
 
-	/* We cannot walk over br->mdb_list protected just by the rtnl_mutex,
-	 * because the write-side protection is br->multicast_lock. But we
-	 * need to emulate the [ blocking ] calling context of a regular
-	 * switchdev event, so since both br->multicast_lock and RCU read side
-	 * critical sections are atomic, we have no choice but to pick the RCU
-	 * read side lock, queue up all our events, leave the critical section
-	 * and notify switchdev from blocking context.
+	if (adding)
+		action = SWITCHDEV_PORT_OBJ_ADD;
+	else
+		action = SWITCHDEV_PORT_OBJ_DEL;
+
+	/* br_switchdev_mdb_queue_one() will take care to not queue a
+	 * replay of an event that is already pending in the switchdev
+	 * deferred queue. In order to safely determine that, there
+	 * must be no new deferred MDB notifications enqueued for the
+	 * duration of the MDB scan. Therefore, grab the write-side
+	 * lock to avoid racing with any concurrent IGMP/MLD snooping.
 	 */
-	rcu_read_lock();
+	spin_lock_bh(&br->multicast_lock);
 
-	hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
+	hlist_for_each_entry(mp, &br->mdb_list, mdb_node) {
 		struct net_bridge_port_group __rcu * const *pp;
 		const struct net_bridge_port_group *p;
 
 		if (mp->host_joined) {
-			err = br_switchdev_mdb_queue_one(&mdb_list,
+			err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
 							 SWITCHDEV_OBJ_ID_HOST_MDB,
 							 mp, br_dev);
 			if (err) {
-				rcu_read_unlock();
+				spin_unlock_bh(&br->multicast_lock);
 				goto out_free_mdb;
 			}
 		}
 
-		for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+		for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
 		     pp = &p->next) {
 			if (p->key.port->dev != dev)
 				continue;
 
-			err = br_switchdev_mdb_queue_one(&mdb_list,
+			err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
 							 SWITCHDEV_OBJ_ID_PORT_MDB,
 							 mp, dev);
 			if (err) {
-				rcu_read_unlock();
+				spin_unlock_bh(&br->multicast_lock);
 				goto out_free_mdb;
 			}
 		}
 	}
 
-	rcu_read_unlock();
-
-	if (adding)
-		action = SWITCHDEV_PORT_OBJ_ADD;
-	else
-		action = SWITCHDEV_PORT_OBJ_DEL;
+	spin_unlock_bh(&br->multicast_lock);
 
 	list_for_each_entry(obj, &mdb_list, list) {
 		err = br_switchdev_mdb_replay_one(nb, dev,
@@ -786,6 +804,16 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
 	br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
 
 	br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL);
+
+	/* Make sure that the device leaving this bridge has seen all
+	 * relevant events before it is disassociated. In the normal
+	 * case, when the device is directly attached to the bridge,
+	 * this is covered by del_nbp(). If the association was indirect
+	 * however, e.g. via a team or bond, and the device is leaving
+	 * that intermediate device, then the bridge port remains in
+	 * place.
+	 */
+	switchdev_deferred_process();
 }
 
 /* Let the bridge know that this port is offloaded, so that it can assign a
73 changes: 73 additions & 0 deletions net/switchdev/switchdev.c
@@ -19,6 +19,35 @@
 #include <linux/rtnetlink.h>
 #include <net/switchdev.h>
 
+static bool switchdev_obj_eq(const struct switchdev_obj *a,
+			     const struct switchdev_obj *b)
+{
+	const struct switchdev_obj_port_vlan *va, *vb;
+	const struct switchdev_obj_port_mdb *ma, *mb;
+
+	if (a->id != b->id || a->orig_dev != b->orig_dev)
+		return false;
+
+	switch (a->id) {
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		va = SWITCHDEV_OBJ_PORT_VLAN(a);
+		vb = SWITCHDEV_OBJ_PORT_VLAN(b);
+		return va->flags == vb->flags &&
+			va->vid == vb->vid &&
+			va->changed == vb->changed;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		ma = SWITCHDEV_OBJ_PORT_MDB(a);
+		mb = SWITCHDEV_OBJ_PORT_MDB(b);
+		return ma->vid == mb->vid &&
+			ether_addr_equal(ma->addr, mb->addr);
+	default:
+		break;
+	}
+
+	BUG();
+}
+
 static LIST_HEAD(deferred);
 static DEFINE_SPINLOCK(deferred_lock);
 
@@ -307,6 +336,50 @@ int switchdev_port_obj_del(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(switchdev_port_obj_del);
 
+/**
+ * switchdev_port_obj_act_is_deferred - Is object action pending?
+ *
+ * @dev: port device
+ * @nt: type of action; add or delete
+ * @obj: object to test
+ *
+ * Returns true if a deferred item is pending, which is
+ * equivalent to the action @nt on an object @obj.
+ *
+ * rtnl_lock must be held.
+ */
+bool switchdev_port_obj_act_is_deferred(struct net_device *dev,
+					enum switchdev_notifier_type nt,
+					const struct switchdev_obj *obj)
+{
+	struct switchdev_deferred_item *dfitem;
+	bool found = false;
+
+	ASSERT_RTNL();
+
+	spin_lock_bh(&deferred_lock);
+
+	list_for_each_entry(dfitem, &deferred, list) {
+		if (dfitem->dev != dev)
+			continue;
+
+		if ((dfitem->func == switchdev_port_obj_add_deferred &&
+		     nt == SWITCHDEV_PORT_OBJ_ADD) ||
+		    (dfitem->func == switchdev_port_obj_del_deferred &&
+		     nt == SWITCHDEV_PORT_OBJ_DEL)) {
+			if (switchdev_obj_eq((const void *)dfitem->data, obj)) {
+				found = true;
+				break;
+			}
+		}
+	}
+
+	spin_unlock_bh(&deferred_lock);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred);
+
 static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain);
 static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain);
 
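
For context on why nbp_switchdev_unsync_objs() above ends with a call to
switchdev_deferred_process(): deferred notifications sit on the file-local
"deferred" list until they are delivered under rtnl. Below is a simplified
illustration of that drain, using only the switchdev_deferred_item fields
visible in the hunk above. It is not the verbatim kernel loop (which lives
in net/switchdev/switchdev.c); device refcounting and freeing are omitted:

/* Simplified sketch: pop each pending item and invoke its add/del
 * handler under rtnl, so every queued event reaches the drivers
 * before the caller proceeds with teardown.
 */
static void example_drain_deferred(void)
{
	struct switchdev_deferred_item *dfitem;

	ASSERT_RTNL();

	for (;;) {
		spin_lock_bh(&deferred_lock);
		dfitem = list_first_entry_or_null(&deferred,
						  struct switchdev_deferred_item,
						  list);
		if (dfitem)
			list_del_init(&dfitem->list);
		spin_unlock_bh(&deferred_lock);

		if (!dfitem)
			break;

		/* Deliver the queued object add/del to the driver. */
		dfitem->func(dfitem->dev, dfitem->data);
	}
}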
