Skip to content

Commit

Permalink
Add flake attempts to the flaky tests and include known flakes patter…
Browse files Browse the repository at this point in the history
…n detection

This adds retry attempts to the flaky tests, whose failures are caused by two
known issues. The fix for the first one isn't yet available in the CI cluster;
the second is known and needs to be fixed in the Velero code:

 - kubernetes-csi/external-snapshotter#876
 - vmware-tanzu/velero#5856

The known flake pattern detection allows us to specify which flakes
are the ones on which we will retry.

Signed-off-by: Michal Pryc <[email protected]>
  • Loading branch information
mpryc committed Jan 24, 2024
1 parent 7ce66f1 commit a503a51
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 31 deletions.
50 changes: 37 additions & 13 deletions tests/e2e/backup_restore_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ type BackupRestoreCase struct {
BackupRestoreType BackupRestoreType
PreBackupVerify VerificationFunction
PostRestoreVerify VerificationFunction
AppReadyDelay time.Duration
MustGatherFiles []string // list of files expected in must-gather under quay.io.../clusters/clustername/... ie. "namespaces/openshift-adp/oadp.openshift.io/dpa-ts-example-velero/ts-example-velero.yml"
MustGatherValidationFunction *func(string) error // validation function for must-gather where string parameter is the path to "quay.io.../clusters/clustername/"
}
Expand Down Expand Up @@ -142,6 +143,9 @@ func runBackupAndRestore(brCase BackupRestoreCase, expectedErr error, updateLast
nsRequiresResticDCWorkaround, err := NamespaceRequiresResticDCWorkaround(dpaCR.Client, brCase.ApplicationNamespace)
Expect(err).ToNot(HaveOccurred())

// TODO this should be a function, not an arbitrary sleep
log.Printf("Sleeping for %v to allow application to be ready for case %s", brCase.AppReadyDelay, brCase.Name)
time.Sleep(brCase.AppReadyDelay)
// create backup
log.Printf("Creating backup %s for case %s", backupName, brCase.Name)
backup, err := CreateBackupForNamespaces(dpaCR.Client, namespace, backupName, []string{brCase.ApplicationNamespace}, brCase.BackupRestoreType == RESTIC || brCase.BackupRestoreType == KOPIA, brCase.BackupRestoreType == CSIDataMover)
Expand All @@ -151,7 +155,12 @@ func runBackupAndRestore(brCase BackupRestoreCase, expectedErr error, updateLast
Eventually(IsBackupDone(dpaCR.Client, namespace, backupName), timeoutMultiplier*time.Minute*20, time.Second*10).Should(BeTrue())
// TODO only log on fail?
GinkgoWriter.Println(DescribeBackup(veleroClientForSuiteRun, csiClientForSuiteRun, dpaCR.Client, backup))
Expect(BackupErrorLogs(kubernetesClientForSuiteRun, dpaCR.Client, backup)).To(Equal([]string{}))

backupLogs := BackupLogs(kubernetesClientForSuiteRun, dpaCR.Client, backup)
backupErrorLogs := BackupErrorLogs(kubernetesClientForSuiteRun, dpaCR.Client, backup)
accumulatedTestLogs = append(accumulatedTestLogs, backupLogs)

Expect(backupErrorLogs).Should(Equal([]string{}))

// check if backup succeeded
succeeded, err := IsBackupCompletedSuccessfully(kubernetesClientForSuiteRun, dpaCR.Client, backup)
Expand Down Expand Up @@ -180,12 +189,17 @@ func runBackupAndRestore(brCase BackupRestoreCase, expectedErr error, updateLast
Eventually(IsRestoreDone(dpaCR.Client, namespace, restoreName), timeoutMultiplier*time.Minute*60, time.Second*10).Should(BeTrue())
// TODO only log on fail?
GinkgoWriter.Println(DescribeRestore(veleroClientForSuiteRun, dpaCR.Client, restore))
Expect(RestoreErrorLogs(kubernetesClientForSuiteRun, dpaCR.Client, restore)).To(Equal([]string{}))

restoreLogs := RestoreLogs(kubernetesClientForSuiteRun, dpaCR.Client, restore)
restoreErrorLogs := RestoreErrorLogs(kubernetesClientForSuiteRun, dpaCR.Client, restore)
accumulatedTestLogs = append(accumulatedTestLogs, restoreLogs)

Expect(restoreErrorLogs).Should(Equal([]string{}))

// Check if restore succeeded: the lookup itself must not fail, and the
// restore must be reported as completed successfully for the case to pass.
succeeded, err = IsRestoreCompletedSuccessfully(kubernetesClientForSuiteRun, dpaCR.Client, namespace, restoreName)
Expect(err).ToNot(HaveOccurred())
Expect(succeeded).To(Equal(true))

if nsRequiresResticDCWorkaround {
// We run the dc-post-restore.sh script for both restic and
Expand Down Expand Up @@ -257,86 +271,96 @@ var _ = Describe("Backup and restore tests", func() {
}

var _ = AfterEach(func(ctx SpecContext) {
	// Start from "no known flake" and let the detector decide, based on every
	// log collected while the spec that just finished was running.
	knownFlake = false
	CheckIfFlakeOccured(strings.Join(accumulatedTestLogs, "\n"), &knownFlake)

	// Drop the collected logs so the next spec (or retry attempt) starts clean.
	accumulatedTestLogs = nil

	tearDownBackupAndRestore(lastBRCase, lastInstallTime, ctx.SpecReport())
})

DescribeTable("Backup and restore applications",
func(brCase BackupRestoreCase, expectedErr error) {
if CurrentSpecReport().NumAttempts > 1 && !knownFlake {
Fail("No known FLAKE found in a subsequent run, marking test as failed.")
}
runBackupAndRestore(brCase, expectedErr, updateLastBRcase, updateLastInstallTime)
},
Entry("MySQL application CSI", BackupRestoreCase{
Entry("MySQL application CSI", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mysql-persistent/mysql-persistent-csi.yaml",
ApplicationNamespace: "mysql-persistent",
Name: "mysql-csi-e2e",
BackupRestoreType: CSI,
AppReadyDelay: 30 * time.Second,
PreBackupVerify: mysqlReady(true, false, CSI),
PostRestoreVerify: mysqlReady(false, false, CSI),
}, nil),
Entry("Mongo application CSI", BackupRestoreCase{
Entry("Mongo application CSI", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mongo-persistent/mongo-persistent-csi.yaml",
ApplicationNamespace: "mongo-persistent",
Name: "mongo-csi-e2e",
BackupRestoreType: CSI,
AppReadyDelay: 30 * time.Second,
PreBackupVerify: mongoready(true, false, CSI),
PostRestoreVerify: mongoready(false, false, CSI),
}, nil),
Entry("MySQL application two Vol CSI", BackupRestoreCase{
Entry("MySQL application two Vol CSI", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: fmt.Sprintf("./sample-applications/mysql-persistent/mysql-persistent-twovol-csi.yaml"),
ApplicationNamespace: "mysql-persistent",
Name: "mysql-twovol-csi-e2e",
BackupRestoreType: CSI,
AppReadyDelay: 30 * time.Second,
PreBackupVerify: mysqlReady(true, true, CSI),
PostRestoreVerify: mysqlReady(false, true, CSI),
}, nil),
Entry("Mongo application RESTIC", BackupRestoreCase{
Entry("Mongo application RESTIC", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mongo-persistent/mongo-persistent.yaml",
ApplicationNamespace: "mongo-persistent",
Name: "mongo-restic-e2e",
BackupRestoreType: RESTIC,
PreBackupVerify: mongoready(true, false, RESTIC),
PostRestoreVerify: mongoready(false, false, RESTIC),
}, nil),
Entry("MySQL application RESTIC", BackupRestoreCase{
Entry("MySQL application RESTIC", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mysql-persistent/mysql-persistent.yaml",
ApplicationNamespace: "mysql-persistent",
Name: "mysql-restic-e2e",
BackupRestoreType: RESTIC,
PreBackupVerify: mysqlReady(true, false, RESTIC),
PostRestoreVerify: mysqlReady(false, false, RESTIC),
}, nil),
Entry("Mongo application KOPIA", BackupRestoreCase{
Entry("Mongo application KOPIA", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mongo-persistent/mongo-persistent.yaml",
ApplicationNamespace: "mongo-persistent",
Name: "mongo-kopia-e2e",
BackupRestoreType: KOPIA,
PreBackupVerify: mongoready(true, false, KOPIA),
PostRestoreVerify: mongoready(false, false, KOPIA),
}, nil),
Entry("MySQL application KOPIA", BackupRestoreCase{
Entry("MySQL application KOPIA", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mysql-persistent/mysql-persistent.yaml",
ApplicationNamespace: "mysql-persistent",
Name: "mysql-kopia-e2e",
BackupRestoreType: KOPIA,
PreBackupVerify: mysqlReady(true, false, KOPIA),
PostRestoreVerify: mysqlReady(false, false, KOPIA),
}, nil),
Entry("Mongo application DATAMOVER", BackupRestoreCase{
Entry("Mongo application DATAMOVER", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mongo-persistent/mongo-persistent-csi.yaml",
ApplicationNamespace: "mongo-persistent",
Name: "mongo-datamover-e2e",
BackupRestoreType: CSIDataMover,
PreBackupVerify: mongoready(true, false, CSIDataMover),
PostRestoreVerify: mongoready(false, false, CSIDataMover),
}, nil),
Entry("MySQL application DATAMOVER", BackupRestoreCase{
Entry("MySQL application DATAMOVER", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mysql-persistent/mysql-persistent-csi.yaml",
ApplicationNamespace: "mysql-persistent",
Name: "mysql-datamover-e2e",
BackupRestoreType: CSIDataMover,
PreBackupVerify: mysqlReady(true, false, CSIDataMover),
PostRestoreVerify: mysqlReady(false, false, CSIDataMover),
}, nil),
Entry("Mongo application BlockDevice DATAMOVER", BackupRestoreCase{
Entry("Mongo application BlockDevice DATAMOVER", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mongo-persistent/mongo-persistent-block.yaml",
PvcSuffixName: "-block-mode",
ApplicationNamespace: "mongo-persistent",
Expand Down
15 changes: 14 additions & 1 deletion tests/e2e/e2e_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"flag"
"log"
"os"
"strconv"
"testing"
"time"

Expand All @@ -21,7 +22,7 @@ import (

// Common vars obtained from flags passed in ginkgo.
var bslCredFile, namespace, credSecretRef, instanceName, provider, vslCredFile, settings, artifact_dir, oc_cli, stream string
var timeoutMultiplierInput int64
var timeoutMultiplierInput, flakeAttempts int64
var timeoutMultiplier time.Duration

func init() {
Expand All @@ -39,6 +40,7 @@ func init() {
flag.StringVar(&stream, "stream", "up", "[up, down] upstream or downstream")
flag.Int64Var(&timeoutMultiplierInput, "timeout_multiplier", 1, "Customize timeout multiplier from default (1)")
timeoutMultiplier = time.Duration(timeoutMultiplierInput)
flag.Int64Var(&flakeAttempts, "flakeAttempts", 3, "Customize the number of flake retries (3)")

// helps with launching debug sessions from IDE
if os.Getenv("E2E_USE_ENV_FLAGS") == "true" {
Expand Down Expand Up @@ -74,6 +76,15 @@ func init() {
if os.Getenv("OC_CLI") != "" {
oc_cli = os.Getenv("OC_CLI")
}
if envValue := os.Getenv("FLAKE_ATTEMPTS"); envValue != "" {
// Parse the environment variable as int64
parsedValue, err := strconv.ParseInt(envValue, 10, 64)
if err != nil {
log.Println("Error parsing FLAKE_ATTEMPTS, default flake number will be used:", err)
} else {
flakeAttempts = parsedValue
}
}
}
}

Expand All @@ -93,6 +104,8 @@ var runTimeClientForSuiteRun client.Client
var veleroClientForSuiteRun veleroClientset.Interface
var csiClientForSuiteRun *snapshotv1client.Clientset
var dpaCR *DpaCustomResource
var knownFlake bool
var accumulatedTestLogs []string

var _ = BeforeSuite(func() {
// TODO create logger (hh:mm:ss message) to be used by all functions
Expand Down
62 changes: 62 additions & 0 deletions tests/e2e/lib/flakes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package lib

import (
"log"
"regexp"
)

// errorIgnorePatterns lists text fragments that mark a log line as a known,
// harmless message which should be ignored rather than treated as an error.
// NOTE(review): the code consuming this list is not in this file — presumably
// the backup/restore error-log helpers; confirm against the callers.
var errorIgnorePatterns = []string{
"received EOF, stopping recv loop",
"Checking for AWS specific error information",
"awserr.Error contents",
"Error creating parent directories for blob-info-cache-v1.boltdb",
"blob unknown",
"num errors=0",
"level=debug", // debug logs may contain the text error about recoverable errors so ignore them
"Unable to retrieve in-cluster version",
"restore warning",

// Ignore managed fields errors per https://github.com/vmware-tanzu/velero/pull/6110 and avoid e2e failure.
// https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/openshift_oadp-operator/1126/pull-ci-openshift-oadp-operator-master-4.10-operator-e2e-aws/1690109468546699264#1:build-log.txt%3A686
"level=error msg=\"error patch for managed fields ",
}

// FlakePattern describes one known, tracked source of test flakiness.
type FlakePattern struct {
	// Issue is a link to the upstream issue or pull request tracking the flake.
	Issue string
	// Description is a short human-readable summary of the flake.
	Description string
	// StringSearchPattern is compiled as a regular expression and matched
	// against the test logs.
	StringSearchPattern string
}

// knownFlakePatterns is the table of flakes on which a test retry is allowed.
var knownFlakePatterns = []FlakePattern{
	{
		Issue:               "https://github.com/kubernetes-csi/external-snapshotter/pull/876",
		Description:         "Race condition in the VolumeSnapshotBeingCreated",
		StringSearchPattern: "Failed to check and update snapshot content: failed to remove VolumeSnapshotBeingCreated annotation on the content snapcontent-",
	},
	{
		Issue:               "https://github.com/vmware-tanzu/velero/issues/5856",
		Description:         "Transient S3 bucket errors and limits",
		StringSearchPattern: "Error copying image: writing blob: uploading layer chunked: received unexpected HTTP status: 500 Internal Server Error",
	},
}

// knownFlakeRegexps holds the compiled form of knownFlakePatterns, index-aligned
// with it. Compiling once at package initialisation avoids recompiling every
// pattern on each call and surfaces an invalid pattern immediately.
var knownFlakeRegexps = func() []*regexp.Regexp {
	compiled := make([]*regexp.Regexp, len(knownFlakePatterns))
	for i := range knownFlakePatterns {
		compiled[i] = regexp.MustCompile(knownFlakePatterns[i].StringSearchPattern)
	}
	return compiled
}()

// CheckIfFlakeOccured checks for known flake patterns in the provided input
// string (typically the logs of the test that just ran) and stores the result
// in the value pointed to by knownFlake.
//
// Parameters:
//
//	input (string): The text to be examined for known flake patterns.
//	knownFlake (*bool): Set to true when a known flake pattern is found in
//	input and to false otherwise. Must not be nil.
func CheckIfFlakeOccured(input string, knownFlake *bool) {
	for i, re := range knownFlakeRegexps {
		if !re.MatchString(input) {
			continue
		}
		// The first matching pattern wins; later patterns are not evaluated.
		pattern := knownFlakePatterns[i]
		log.Printf("FLAKE DETECTION: Match found for issue %s: %s\n", pattern.Issue, pattern.Description)
		*knownFlake = true
		return
	}
	log.Println("FLAKE DETECTION: No known flakes found.")
	*knownFlake = false
}
16 changes: 0 additions & 16 deletions tests/e2e/lib/velero_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,22 +134,6 @@ func RestoreLogs(c *kubernetes.Clientset, ocClient client.Client, restore velero
return logs.String()
}

var errorIgnorePatterns = []string{
"received EOF, stopping recv loop",
"Checking for AWS specific error information",
"awserr.Error contents",
"Error creating parent directories for blob-info-cache-v1.boltdb",
"blob unknown",
"num errors=0",
"level=debug", // debug logs may contain the text error about recoverable errors so ignore them
"Unable to retrieve in-cluster version",
"restore warning",

// Ignore managed fields errors per https://github.com/vmware-tanzu/velero/pull/6110 and avoid e2e failure.
// https://prow.ci.openshift.org/view/gs/origin-ci-test/pr-logs/pull/openshift_oadp-operator/1126/pull-ci-openshift-oadp-operator-master-4.10-operator-e2e-aws/1690109468546699264#1:build-log.txt%3A686
"level=error msg=\"error patch for managed fields ",
}

func recoverFromPanicLogs(c *kubernetes.Clientset, veleroNamespace string, panicReason interface{}, panicFrom string) string {
log.Printf("Recovered from panic in %s: %v\n", panicFrom, panicReason)
log.Print("returning container logs instead")
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/must-gather_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ var _ = Describe("Backup and restore tests with must-gather", func() {
Expect(err).ToNot(HaveOccurred())
}
},
Entry("Mongo application DATAMOVER", BackupRestoreCase{
Entry("Mongo application DATAMOVER", FlakeAttempts(flakeAttempts), BackupRestoreCase{
ApplicationTemplate: "./sample-applications/mongo-persistent/mongo-persistent-csi.yaml",
ApplicationNamespace: "mongo-persistent",
Name: "mongo-datamover-e2e",
Expand Down

0 comments on commit a503a51

Please sign in to comment.