From 222d5778873e637989ef47e6b3522b8e47a8779b Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 30 Nov 2022 13:25:20 +0800 Subject: [PATCH] get rid of getting daemon state in single flight It is possible that snapshotter waits for RUNNING state before waits for INIT stat which is a deadlock Signed-off-by: Changwei Ge --- pkg/daemon/daemon.go | 53 +++++++++++++++-------------------- pkg/manager/daemon_adaptor.go | 2 +- pkg/manager/manager.go | 5 ++-- 3 files changed, 27 insertions(+), 33 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index e2b953f7ce..916c60d5ea 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -26,7 +26,6 @@ import ( "github.com/containerd/nydus-snapshotter/pkg/utils/mount" "github.com/containerd/nydus-snapshotter/pkg/utils/retry" "github.com/pkg/errors" - "golang.org/x/sync/singleflight" ) const ( @@ -73,8 +72,6 @@ type Daemon struct { Supervisor *supervisor.Supervisor Config daemonconfig.DaemonConfig - stateGetterGroup singleflight.Group - ref int32 // Cache the nydusd daemon state to avoid frequently querying nydusd by API. State types.DaemonState @@ -162,7 +159,9 @@ func (d *Daemon) GetState() (types.DaemonState, error) { st := info.DaemonState() + d.Lock() d.State = st + d.Unlock() return st, nil } @@ -173,37 +172,31 @@ func (d *Daemon) GetState() (types.DaemonState, error) { // 2. READY // 3. RUNNING func (d *Daemon) WaitUntilState(expected types.DaemonState) error { - stateGetter := func() (v any, err error) { - err = retry.Do(func() error { - if expected == d.State { - return nil - } - - state, err := d.GetState() - if err != nil { - return errors.Wrapf(err, "wait until daemon is %s", expected) - } - - if state != expected { - return errors.Errorf("daemon %s is not %s yet, current state %s", - d.ID(), expected, state) - } - collector.CollectDaemonEvent(d.ID(), string(expected)) - + return retry.Do(func() error { + d.Lock() + if expected == d.State { + d.Unlock() return nil - }, - retry.Attempts(20), // totally wait for 2 seconds, should be enough - retry.LastErrorOnly(true), - retry.Delay(100*time.Millisecond), - ) + } + d.Unlock() - return - } + state, err := d.GetState() + if err != nil { + return errors.Wrapf(err, "wait until daemon is %s", expected) + } - _, err, shared := d.stateGetterGroup.Do(d.ID(), stateGetter) - log.L.Debugf("Get daemon %s with shared result: %v ", d.ID(), shared) + if state != expected { + return errors.Errorf("daemon %s is not %s yet, current state %s", + d.ID(), expected, state) + } + collector.CollectDaemonEvent(d.ID(), string(expected)) - return err + return nil + }, + retry.Attempts(20), // totally wait for 2 seconds, should be enough + retry.LastErrorOnly(true), + retry.Delay(100*time.Millisecond), + ) } func (d *Daemon) SharedMount(rafs *Rafs) error { diff --git a/pkg/manager/daemon_adaptor.go b/pkg/manager/daemon_adaptor.go index d70c12b3f4..1142208166 100644 --- a/pkg/manager/daemon_adaptor.go +++ b/pkg/manager/daemon_adaptor.go @@ -66,7 +66,7 @@ func (m *Manager) StartDaemon(d *daemon.Daemon) error { } if err := d.WaitUntilState(types.DaemonStateRunning); err != nil { - log.L.Errorf("daemon %s is not managed to reach RUNNING state", d.ID()) + log.L.WithError(err).Errorf("daemon %s is not managed to reach RUNNING state", d.ID()) return } diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index ac8f34c3e8..dadbbf033b 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -257,7 +257,7 @@ func (m *Manager) doDaemonFailover(d *daemon.Daemon) { } if err := d.WaitUntilState(types.DaemonStateInit); err != nil { - log.L.Errorf("daemon din't reach state %s", types.DaemonStateInit) + log.L.WithError(err).Errorf("daemon didn't reach state %s,", types.DaemonStateInit) return } @@ -311,8 +311,9 @@ func (m *Manager) handleDaemonDeathEvent() { log.L.Warnf("Daemon %s was not found", ev.daemonID) return } - + d.Lock() d.State = types.DaemonStateUnknown + d.Unlock() if m.RecoverPolicy == RecoverPolicyRestart { log.L.Infof("Restart daemon %s", ev.daemonID) go m.doDaemonRestart(d)