From 6cdb2e0be30c4813933b79fa45e2b28020cc032a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Pa=C3=9F?= <22845248+mpass99@users.noreply.github.com> Date: Tue, 24 Sep 2024 18:22:06 +0200 Subject: [PATCH] Restrict false positive warning (#693) "Environment stopped unexpectedly" that got also triggered on migrations. --- internal/runner/nomad_manager.go | 33 ++++++++++++++++++++++++++- internal/runner/nomad_manager_test.go | 10 -------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/internal/runner/nomad_manager.go b/internal/runner/nomad_manager.go index edc83f3c..ea1df3bc 100644 --- a/internal/runner/nomad_manager.go +++ b/internal/runner/nomad_manager.go @@ -10,6 +10,7 @@ import ( "time" nomadApi "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/nomad/structs" influxdb2 "github.com/influxdata/influxdb-client-go/v2" "github.com/openHPI/poseidon/internal/config" "github.com/openHPI/poseidon/internal/nomad" @@ -20,6 +21,8 @@ import ( "github.com/openHPI/poseidon/pkg/util" ) +const environmentMigrationDelay = time.Minute + var ( log = logging.GetLogger("runner") ErrUnknownExecutionEnvironment = errors.New("execution environment not found") @@ -330,6 +333,34 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en monitoring.WriteInfluxPoint(p) } +// checkForMigratingEnvironmentJob checks if the Nomad environment job is still running after the delay. +func (m *NomadRunnerManager) checkForMigratingEnvironmentJob(ctx context.Context, jobID string, delay time.Duration) { + log.WithField(dto.KeyEnvironmentID, jobID).Debug("Environment stopped unexpectedly. Checking again...") + + select { + case <-ctx.Done(): + return + case <-time.After(delay): + } + + templateJobs, err := m.apiClient.LoadEnvironmentJobs() + if err != nil { + log.WithError(err).Warn("couldn't load template jobs") + } + + var environmentStillRunning bool + for _, job := range templateJobs { + if jobID == *job.ID && *job.Status == structs.JobStatusRunning { + environmentStillRunning = true + break + } + } + + if !environmentStillRunning { + log.WithField(dto.KeyEnvironmentID, jobID).Warn("Environment stopped unexpectedly") + } +} + // onAllocationStopped is the callback for when Nomad stopped an allocation. func (m *NomadRunnerManager) onAllocationStopped(ctx context.Context, runnerID string, reason error) (alreadyRemoved bool) { log.WithField(dto.KeyRunnerID, runnerID).Debug("Runner stopped") @@ -343,7 +374,7 @@ func (m *NomadRunnerManager) onAllocationStopped(ctx context.Context, runnerID s } _, ok := m.environments.Get(environmentID.ToString()) if ok { - log.WithField(dto.KeyEnvironmentID, environmentID).Warn("Environment stopped unexpectedly") + go m.checkForMigratingEnvironmentJob(ctx, runnerID, environmentMigrationDelay) } return !ok } diff --git a/internal/runner/nomad_manager_test.go b/internal/runner/nomad_manager_test.go index da109a63..b356be6a 100644 --- a/internal/runner/nomad_manager_test.go +++ b/internal/runner/nomad_manager_test.go @@ -510,16 +510,6 @@ func (s *ManagerTestSuite) TestOnAllocationStopped() { s.True(alreadyRemoved) }) }) - s.Run("logs unexpectedly stopped environments", func() { - logger, hook := test.NewNullLogger() - log = logger.WithField("package", "runner") - - alreadyRemoved := s.nomadRunnerManager.onAllocationStopped(s.TestCtx, tests.DefaultTemplateJobID, nil) - s.False(alreadyRemoved) - - s.Len(hook.Entries, 1) - s.Equal(logrus.WarnLevel, hook.LastEntry().Level) - }) s.Run("does not log expectedly stopped environments", func() { logger, hook := test.NewNullLogger() log = logger.WithField("package", "runner")