From c4893c7e31c35f1b056b5462cb135a9c15f8b8f4 Mon Sep 17 00:00:00 2001 From: David Porter Date: Wed, 17 Apr 2024 22:30:24 -0700 Subject: [PATCH] Fix deadlock during NRI plugin registration During NRI external plugin registration: * `acceptPluginConnections()` is called * adaptation lock from `nri/pkg/adaptation` is acquired * `syncFn` is invoked * `syncFn` acquires NRI lock in `pkg/nri/nri.go` During container lifecycle events such as `ContainerStart` * NRI lock is acquired in pkg/nri.go * adaptation lock is acquired in `StateChange()` in `nri/pkg/adaptation` As a result, the locking order during NRI plugin registration is: * adaptation lock -> NRI lock While the locking order during container starts is: * NRI lock -> adaptation lock Due the fact that the locking order is inverted and not consistent, it it possible to encounter a deadlock. To fix the issue, during NRI plugin registration, first acquire the NRI lock (done via `syncFn` call) and only after acquire the adaptation lock. This ensures that NRI plugin registration the locking order is adaption lock -> NRI lock, which is consistent with the locking order during container lifecycle events. Fixes https://github.com/containerd/containerd/issues/10085 Signed-off-by: David Porter --- pkg/adaptation/adaptation.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pkg/adaptation/adaptation.go b/pkg/adaptation/adaptation.go index 141cb85b..9201e641 100644 --- a/pkg/adaptation/adaptation.go +++ b/pkg/adaptation/adaptation.go @@ -431,18 +431,16 @@ func (r *Adaptation) acceptPluginConnections(l net.Listener) error { continue } - r.Lock() - err = r.syncFn(ctx, p.synchronize) if err != nil { log.Infof(ctx, "failed to synchronize plugin: %v", err) } else { + r.Lock() r.plugins = append(r.plugins, p) r.sortPlugins() + r.Unlock() } - r.Unlock() - log.Infof(ctx, "plugin %q connected", p.name()) } }()