Skip to content

Commit

Permalink
Restrict false positive warning (#693)
Browse files Browse the repository at this point in the history
"Environment stopped unexpectedly" that also got triggered on migrations.
  • Loading branch information
mpass99 authored Sep 24, 2024
1 parent b966aac commit 6cdb2e0
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 11 deletions.
33 changes: 32 additions & 1 deletion internal/runner/nomad_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

nomadApi "github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
"github.com/openHPI/poseidon/internal/config"
"github.com/openHPI/poseidon/internal/nomad"
Expand All @@ -20,6 +21,8 @@ import (
"github.com/openHPI/poseidon/pkg/util"
)

const environmentMigrationDelay = time.Minute

var (
log = logging.GetLogger("runner")
ErrUnknownExecutionEnvironment = errors.New("execution environment not found")
Expand Down Expand Up @@ -330,6 +333,34 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
monitoring.WriteInfluxPoint(p)
}

// checkForMigratingEnvironmentJob checks if the Nomad environment job is still running after the delay.
//
// It first waits for delay, returning early without any further check when ctx is done.
// Afterwards it reloads the environment (template) jobs through the API client and logs the
// "Environment stopped unexpectedly" warning only when no job with the passed jobID is
// reported with the running status.
//
// If the jobs cannot be loaded, the state of the environment is unknown: only the load
// error is logged and no "stopped unexpectedly" warning is emitted, because that warning
// would be a guess rather than an observation.
func (m *NomadRunnerManager) checkForMigratingEnvironmentJob(ctx context.Context, jobID string, delay time.Duration) {
	log.WithField(dto.KeyEnvironmentID, jobID).Debug("Environment stopped unexpectedly. Checking again...")

	// Use an explicit timer so it is released right away when the context is cancelled first.
	timer := time.NewTimer(delay)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return
	case <-timer.C:
	}

	templateJobs, err := m.apiClient.LoadEnvironmentJobs()
	if err != nil {
		log.WithError(err).Warn("couldn't load template jobs")
		// Without the job list we cannot tell whether the environment is running.
		return
	}

	for _, job := range templateJobs {
		// ID and Status are pointers; skip incomplete entries instead of panicking on a nil dereference.
		if job.ID == nil || job.Status == nil {
			continue
		}
		if jobID == *job.ID && *job.Status == structs.JobStatusRunning {
			// The environment job is running (again), e.g. after a migration. Nothing to report.
			return
		}
	}

	log.WithField(dto.KeyEnvironmentID, jobID).Warn("Environment stopped unexpectedly")
}

// onAllocationStopped is the callback for when Nomad stopped an allocation.
func (m *NomadRunnerManager) onAllocationStopped(ctx context.Context, runnerID string, reason error) (alreadyRemoved bool) {
log.WithField(dto.KeyRunnerID, runnerID).Debug("Runner stopped")
Expand All @@ -343,7 +374,7 @@ func (m *NomadRunnerManager) onAllocationStopped(ctx context.Context, runnerID s
}
_, ok := m.environments.Get(environmentID.ToString())
if ok {
log.WithField(dto.KeyEnvironmentID, environmentID).Warn("Environment stopped unexpectedly")
go m.checkForMigratingEnvironmentJob(ctx, runnerID, environmentMigrationDelay)
}
return !ok
}
Expand Down
10 changes: 0 additions & 10 deletions internal/runner/nomad_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -510,16 +510,6 @@ func (s *ManagerTestSuite) TestOnAllocationStopped() {
s.True(alreadyRemoved)
})
})
s.Run("logs unexpectedly stopped environments", func() {
logger, hook := test.NewNullLogger()
log = logger.WithField("package", "runner")

alreadyRemoved := s.nomadRunnerManager.onAllocationStopped(s.TestCtx, tests.DefaultTemplateJobID, nil)
s.False(alreadyRemoved)

s.Len(hook.Entries, 1)
s.Equal(logrus.WarnLevel, hook.LastEntry().Level)
})
s.Run("does not log expectedly stopped environments", func() {
logger, hook := test.NewNullLogger()
log = logger.WithField("package", "runner")
Expand Down

0 comments on commit 6cdb2e0

Please sign in to comment.