From 212e7159125773bd0030b8404219deb81a15ffca Mon Sep 17 00:00:00 2001
From: Herko Lategan <herko@cockroachlabs.com>
Date: Wed, 6 Nov 2024 13:59:26 +0000
Subject: [PATCH] roachtest: minor task manager refactoring

Implement a few refactoring changes based on recent feedback.
1. Rename the `ExpectedCancel` event field to `TriggeredByTest` as it makes more
sense in the context of the flag that gets set on the event. A comment has been
added to explain the reasoning behind this flag.
2. Update the commentary on task termination to indicate that it does not wait
indefinitely for tasks to stop.
3. Update the test framework panic handler logging to be more inline with how
panics are logged in a test's main goroutine.

Epic: None
Release note: None
---
 .../roachtestutil/mixedversion/runner.go      |  2 +-
 .../roachtest/roachtestutil/task/manager.go   | 37 ++++++++++++-------
 pkg/cmd/roachtest/test_impl.go                |  4 +-
 pkg/cmd/roachtest/test_runner.go              |  2 +-
 4 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go
index c29c3c8e99b6..5a5af126494f 100644
--- a/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go
+++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go
@@ -187,7 +187,7 @@ func (tr *testRunner) run() (retErr error) {
 			if event.Err == nil {
 				tr.logger.Printf("background step finished: %s", event.Name)
 				continue
-			} else if event.ExpectedCancel {
+			} else if event.TriggeredByTest {
 				tr.logger.Printf("background step canceled by test: %s", event.Name)
 				continue
 			}
diff --git a/pkg/cmd/roachtest/roachtestutil/task/manager.go b/pkg/cmd/roachtest/roachtestutil/task/manager.go
index d15d783d5e37..de4bbd0e3c37 100644
--- a/pkg/cmd/roachtest/roachtestutil/task/manager.go
+++ b/pkg/cmd/roachtest/roachtestutil/task/manager.go
@@ -28,9 +28,9 @@ type (
 
 	// Event represents the result of a task execution.
 	Event struct {
-		Name           string
-		Err            error
-		ExpectedCancel bool
+		Name            string
+		Err             error
+		TriggeredByTest bool
 	}
 
 	manager struct {
@@ -59,7 +59,7 @@ func NewManager(ctx context.Context, l *logger.Logger) Manager {
 func (m *manager) defaultOptions() []Option {
 	// The default panic handler simply returns the panic as an error.
 	defaultPanicHandlerFn := func(_ context.Context, name string, l *logger.Logger, r interface{}) error {
-		return r.(error)
+		return fmt.Errorf("panic: %v", r)
 	}
 	// The default error handler simply returns the error as is.
 	defaultErrorHandlerFn := func(_ context.Context, name string, l *logger.Logger, err error) error {
@@ -98,17 +98,26 @@ func (m *manager) GoWithCancel(fn Func, opts ...Option) context.CancelFunc {
 		}
 		err = internalFunc(l)
 		event := Event{
-			Name:           opt.Name,
-			Err:            err,
-			ExpectedCancel: err != nil && IsContextCanceled(groupCtx) && expectedContextCancellation.Load(),
+			Name: opt.Name,
+			Err:  err,
+			// TriggeredByTest is set to true if the task was canceled intentionally,
+			// by the test, and we encounter an error. The assumption is that we
+			// expect the error to have been caused by the cancelation, hence the
+			// error above was not caused by a failure. This ensures we don't register
+			// a test failure if the task was meant to be canceled. It's possible that
+			// `expectedContextCancellation` could be set before the context is
+			// canceled, thus we also ensure that the context is canceled.
+			TriggeredByTest: err != nil && IsContextCanceled(groupCtx) && expectedContextCancellation.Load(),
 		}
-		select {
-		case m.events <- event:
-			// exit goroutine
-		case <-m.ctx.Done():
-			// Parent context already finished, exit goroutine.
+
+		// Do not send the event if the parent context is canceled. The test is
+		// already aware of the cancelation and sending an event would be redundant.
+		// For instance, a call to test.Fatal would already have captured the error
+		// and canceled the context.
+		if IsContextCanceled(m.ctx) {
 			return nil
 		}
+		m.events <- event
 		return err
 	})
 
@@ -129,7 +138,9 @@ func (m *manager) Go(fn Func, opts ...Option) {
 }
 
 // Terminate will call the stop functions for every task started during the
-// test. Returns when all task functions have returned.
+// test. Returns when all task functions have returned, or after a 5-minute
+// timeout, whichever comes first. If the timeout is reached, the function logs
+// a warning message and returns.
 func (m *manager) Terminate(l *logger.Logger) {
 	func() {
 		m.mu.Lock()
diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go
index 0fade5bfa0d0..a58d4b726778 100644
--- a/pkg/cmd/roachtest/test_impl.go
+++ b/pkg/cmd/roachtest/test_impl.go
@@ -12,7 +12,6 @@ import (
 	"math/rand"
 	"os"
 	"regexp"
-	"runtime/debug"
 	"strings"
 	"sync"
 	"time"
@@ -654,8 +653,7 @@ func (t *testImpl) IsBuildVersion(minVersion string) bool {
 }
 
 func panicHandler(_ context.Context, name string, l *logger.Logger, r interface{}) error {
-	l.Printf("panic stack trace in task %s:\n%s", name, string(debug.Stack()))
-	return fmt.Errorf("panic (stack trace above): %v", r)
+	return fmt.Errorf("test task %s panicked: %v", name, r)
 }
 
 // GoWithCancel runs the given function in a goroutine and returns a
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 69d43859c58a..a8bf62bf5b57 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -1299,7 +1299,7 @@ func (r *testRunner) runTest(
 				if event.Err == nil {
 					t.L().Printf("task finished: %s", event.Name)
 					continue
-				} else if event.ExpectedCancel {
+				} else if event.TriggeredByTest {
 					t.L().Printf("task canceled by test: %s", event.Name)
 					continue
 				}