tests(e2e): minor e2e test fixups (#941)

Some low-hanging fruit I noticed as we got some activity on the E2E test workflow: ### Deprovisioning can happen from multiple initial states Our current model only allows for one initial state and one target state. Some commands such as the deprovisioning one, can work incrementally for the client/AD. For example, if we manage to provision the client but fail to provision the AD, the script should still release any client resources. As of now, it only works if the AD was successfully provisioned. There are multiple ways to handle this, but I've opted for the less intrusive one of making states variadic arguments, where all but the last are considered the "initial states", and the last is the state that is being transitioned to. We still sacrifice some readability but existing code will work, and I've made sure to document the new intricacies of the `WithStateTransition` function. ### Add simple retry mechanism for SSH connections As seen in some failing CI runs, sometimes we fail to establish SSH connection to either a client or the AD controller. I believe this is a transient failure that could be mitigated if we retry the operation multiple times. This implements a simple retry mechanism that for now attempts to connect a maximum of 5 times at an interval of 3 seconds. ### Update apt package list before realm join It seems that realm join installs some packages on its own, but doesn't run `apt update` first, resulting in errors such as: * Installing necessary packages: sssd-tools adcli sssd libnss-sss libpam-sss Password for localadmin: ! E: http://azure.archive.ubuntu.com/ubuntu focal-updates/main amd64 libc-ares2 amd64 1.15.0-1ubuntu0.4 is not (yet) available (404 Not Found [IP: 20.54.144.51 80]) We already run `apt update` just after, before installing adsys, so this is a case of moving the update bit above `realm join`. In addition, upgrade packages to avoid Ubuntu running unattended-upgr on its own and breaking our use of `apt`. ### Move realm join after installing all packages There's a pesky unattended upgrade that hits us often on focal... let's group all apt commands before joining the realm so there's less chance of it starting between our apt runs. If it becomes a bigger problem we can stop and disable the unattended upgrade service. ### Add a sleep before checking machine scripts Once a while we seem to have issues asserting the existence of the startup script. Let's see if we can rule this out with a sleep after reboot. ### Increase max timeout for client reboot 1 minute may not be enough, as per https://github.com/ubuntu/adsys/actions/runs/8195833180/job/22414854516 ------------- Passing CI run: https://github.com/ubuntu/adsys/actions/runs/8201760903
ubuntu · Mar 8, 2024 · 2cef786 · 2cef786
2 parents d45043e + 1df84dd
commit 2cef786
Show file tree

Hide file tree

Showing 6 changed files with 107 additions and 51 deletions.
diff --git a/e2e/cmd/run_tests/01_provision_client/main.go b/e2e/cmd/run_tests/01_provision_client/main.go
@@ -176,14 +176,14 @@ func action(ctx context.Context, cmd *command.Command) error {
 		}
 	}
 
-	log.Infof("Joining VM to domain...")
-	_, err = client.Run(ctx, fmt.Sprintf("realm join warthogs.biz -U localadmin -v --unattended <<<'%s'", adPassword))
+	log.Infof("Upgrading packages...")
+	_, err = client.Run(ctx, "apt-get -y update && DEBIAN_FRONTEND=noninteractive apt-get -y upgrade")
 	if err != nil {
-		return fmt.Errorf("failed to join VM to domain: %w", err)
+		return fmt.Errorf("failed to update package list: %w", err)
 	}
 
 	log.Infof("Installing adsys package...")
-	_, err = client.Run(ctx, "apt-get -y update && DEBIAN_FRONTEND=noninteractive apt-get install -y /debs/*.deb")
+	_, err = client.Run(ctx, "DEBIAN_FRONTEND=noninteractive apt-get install -y /debs/*.deb")
 	if err != nil {
 		return fmt.Errorf("failed to install adsys package: %w", err)
 	}
@@ -195,6 +195,12 @@ func action(ctx context.Context, cmd *command.Command) error {
 		log.Warningf("Some packages failed to install: %v", err)
 	}
 
+	log.Infof("Joining VM to domain...")
+	_, err = client.Run(ctx, fmt.Sprintf("realm join warthogs.biz -U localadmin -v --unattended <<<'%s'", adPassword))
+	if err != nil {
+		return fmt.Errorf("failed to join VM to domain: %w", err)
+	}
+
 	cmd.Inventory.IP = ipAddress
 	cmd.Inventory.VMID = id
 	cmd.Inventory.UUID = uuid

diff --git a/e2e/cmd/run_tests/04_test_pro_managers/main.go b/e2e/cmd/run_tests/04_test_pro_managers/main.go
@@ -198,6 +198,10 @@ ftp_proxy="http://127.0.0.1:8080"`); err != nil {
 	if err := client.Reboot(); err != nil {
 		return err
 	}
+
+	// Sleep a few seconds to ensure the machine startup script has time to run
+	time.Sleep(10 * time.Second)
+
 	if err := client.RequireFileExists(ctx, "/etc/created-by-adsys-machine-shutdown-script"); err != nil {
 		return err
 	}

diff --git a/e2e/cmd/run_tests/99_deprovision/main.go b/e2e/cmd/run_tests/99_deprovision/main.go
@@ -25,7 +25,7 @@ func main() {
 func run() int {
 	cmd := command.New(action,
 		command.WithValidateFunc(validate),
-		command.WithStateTransition(inventory.ADProvisioned, inventory.Deprovisioned),
+		command.WithStateTransition(inventory.ClientProvisioned, inventory.ADProvisioned, inventory.Deprovisioned),
 	)
 	cmd.Usage = fmt.Sprintf(`go run ./%s [options]
 
@@ -66,6 +66,23 @@ func action(ctx context.Context, cmd *command.Command) error {
 		return fmt.Errorf("failed to leave domain: %w", err)
 	}
 
+	// Destroy the client VM
+	log.Infof("Destroying client VM %q", cmd.Inventory.VMName)
+	_, _, err = az.RunCommand(ctx, "vm", "delete",
+		"--resource-group", "AD",
+		"--name", cmd.Inventory.VMName,
+		"--force-deletion", "true",
+		"--yes",
+	)
+	if err != nil {
+		return err
+	}
+
+	// Return early if we don't need to deprovision AD resources
+	if cmd.Inventory.State != inventory.ADProvisioned {
+		return nil
+	}
+
 	// Connect to the domain controller
 	client, err = remote.NewClient(inventory.DomainControllerIP, "localadmin", sshKey)
 	if err != nil {
@@ -82,17 +99,5 @@ func action(ctx context.Context, cmd *command.Command) error {
 		return fmt.Errorf("error running the PowerShell script: %w", err)
 	}
 
-	// Destroy the client VM
-	log.Infof("Destroying client VM %q", cmd.Inventory.VMName)
-	_, _, err = az.RunCommand(ctx, "vm", "delete",
-		"--resource-group", "AD",
-		"--name", cmd.Inventory.VMName,
-		"--force-deletion", "true",
-		"--yes",
-	)
-	if err != nil {
-		return err
-	}
-
 	return nil
 }
diff --git a/e2e/internal/command/command.go b/e2e/internal/command/command.go
@@ -40,62 +40,75 @@ type Command struct {
 
 	fSet *flag.FlagSet
 
-	fromState inventory.State
-	toState   inventory.State
+	fromStates []inventory.State
+	toState    inventory.State
 }
 
-// WithStateTransition sets the expected state transition for the command.
-func WithStateTransition(from, to inventory.State) func(*options) {
-	return func(a *options) {
-		a.fromState = from
-		a.toState = to
+// WithStateTransition sets the expected state transition for the command,
+// allowing for one or more initial states, and a final state.
+func WithStateTransition(states ...inventory.State) func(*options) error {
+	return func(a *options) error {
+		if len(states) < 2 {
+			return errors.New("expected at least two states")
+		}
+
+		a.fromStates = states[:len(states)-1]
+		a.toState = states[len(states)-1]
+
+		return nil
 	}
 }
 
 // WithRequiredState ensures that the inventory file is in the given state,
 // without a transition being performed.
-func WithRequiredState(state inventory.State) func(*options) {
-	return func(a *options) {
-		a.fromState = state
+func WithRequiredState(state inventory.State) func(*options) error {
+	return func(a *options) error {
+		a.fromStates = []inventory.State{state}
 		a.toState = state
+
+		return nil
 	}
 }
 
 // WithValidateFunc sets the validation function for the command.
-func WithValidateFunc(validate cmdFunc) func(*options) {
-	return func(a *options) {
+func WithValidateFunc(validate cmdFunc) func(*options) error {
+	return func(a *options) error {
 		a.validate = validate
+
+		return nil
 	}
 }
 
 type options struct {
-	validate  cmdFunc
-	fromState inventory.State
-	toState   inventory.State
+	validate   cmdFunc
+	fromStates []inventory.State
+	toState    inventory.State
 }
 
 // Option is a function that configures the command.
-type Option func(*options)
+type Option func(*options) error
 
 // New creates a new command.
 func New(action cmdFunc, args ...Option) *Command {
 	// Apply given options
 	opts := options{
-		fromState: inventory.Null,
-		toState:   inventory.Null,
+		fromStates: []inventory.State{inventory.Null},
+		toState:    inventory.Null,
 	}
 
 	for _, f := range args {
-		f(&opts)
+		if err := f(&opts); err != nil {
+			log.Fatalf("Failed to apply option: %s", err)
+		}
 	}
 
 	return &Command{
 		action:   action,
 		validate: opts.validate,
 
-		fSet:      flag.NewFlagSet("", flag.ContinueOnError),
-		fromState: opts.fromState,
-		toState:   opts.toState,
+		fSet:       flag.NewFlagSet("", flag.ContinueOnError),
+		fromStates: opts.fromStates,
+		toState:    opts.toState,
 	}
 }
 
@@ -195,8 +208,16 @@ func (c *Command) Execute(ctx context.Context) int {
 			return 1
 		}
 
-		if c.Inventory.State != c.fromState {
-			log.Errorf("Inventory file is not in the expected state: %s", c.fromState)
+		// Allow at least one of the expected states
+		found := false
+		for _, s := range c.fromStates {
+			if c.Inventory.State == s {
+				found = true
+				break
+			}
+		}
+		if !found {
+			log.Errorf("Inventory file is not in any of the expected initial states: %v", c.fromStates)
 			return 1
 		}
 	}
@@ -227,7 +248,7 @@ func (c *Command) Execute(ctx context.Context) int {
 }
 
 func (c *Command) requireInventory() bool {
-	return c.fromState != inventory.Null
+	return c.fromStates[0] != inventory.Null
 }
 
 func (c *Command) installSignalHandler(cancel context.CancelFunc) func() {

diff --git a/e2e/internal/command/command_test.go b/e2e/internal/command/command_test.go
@@ -36,8 +36,9 @@ func TestAddFlags(t *testing.T) {
 
 func TestInventory(t *testing.T) {
 	tests := map[string]struct {
-		fromState inventory.State
-		toState   inventory.State
+		fromState           inventory.State
+		toState             inventory.State
+		additionalFromState inventory.State
 
 		existingInventory string
 
@@ -47,9 +48,11 @@ func TestInventory(t *testing.T) {
 		"From null state doesn't require existing data": {toState: inventory.BaseVMCreated},
 		"From existing state requires existing data":    {fromState: inventory.BaseVMCreated, toState: inventory.TemplateCreated, existingInventory: "inventory_from_template_created"},
 		"To null state doesn't write data":              {toState: inventory.Null, wantNoFile: true},
+		"Multiple from states requires at least one":    {fromState: inventory.ClientProvisioned, toState: inventory.TemplateCreated, additionalFromState: inventory.BaseVMCreated, existingInventory: "inventory_from_template_created"},
 
-		"Error if inventory file is required and doesn't exist":  {fromState: inventory.TemplateCreated, wantErr: true},
-		"Error if inventory state does not match expected state": {fromState: inventory.TemplateCreated, existingInventory: "inventory_from_template_created", wantErr: true},
+		"Error if inventory file is required and doesn't exist":      {fromState: inventory.TemplateCreated, wantErr: true},
+		"Error if inventory state does not match expected state":     {fromState: inventory.TemplateCreated, existingInventory: "inventory_from_template_created", wantErr: true},
+		"Error if inventory state does not match any expected state": {fromState: inventory.TemplateCreated, additionalFromState: inventory.Null, existingInventory: "inventory_from_template_created", wantErr: true},
 	}
 
 	for name, tc := range tests {
@@ -73,7 +76,11 @@ func TestInventory(t *testing.T) {
 			inventoryPath := filepath.Join(tempDir, "inventory", tc.existingInventory)
 			os.Args = append(args, "--inventory-file", inventoryPath)
 
-			cmd := command.New(mockAction, command.WithStateTransition(tc.fromState, tc.toState))
+			states := []inventory.State{tc.fromState}
+			states = append(states, tc.additionalFromState)
+			states = append(states, tc.toState)
+
+			cmd := command.New(mockAction, command.WithStateTransition(states...))
 			ret := cmd.Execute(context.Background())
 
 			if tc.wantErr {

diff --git a/e2e/internal/remote/remote.go b/e2e/internal/remote/remote.go
@@ -57,9 +57,22 @@ func NewClient(host string, username string, secret string) (Client, error) {
 		Timeout:         10 * time.Second,
 	}
 
-	client, err := ssh.Dial("tcp", host+":22", config)
+	var client *ssh.Client
+
+	interval := 3 * time.Second
+	retries := 10
+
+	for i := 1; i <= retries; i++ {
+		log.Debugf("Establishing SSH connection to %q (attempt %d/%d)", host, i, retries)
+		client, err = ssh.Dial("tcp", host+":22", config)
+		if err == nil {
+			break
+		}
+		log.Warningf("Failed to connect to %q: %v (attempt %d/%d)", host, err, i, retries)
+		time.Sleep(interval)
+	}
 	if err != nil {
-		return Client{}, fmt.Errorf("failed to establish connection to remote host: %w", err)
+		return Client{}, fmt.Errorf("failed to connect to %q: %w", host, err)
 	}
 
 	return Client{
@@ -218,7 +231,7 @@ func (c Client) Upload(localPath string, remotePath string) error {
 // Reboot reboots the remote host and waits for it to come back online, then
 // reestablishes the SSH connection.
 // It first waits for the host to go offline, then returns an error if the host
-// does not come back online within 1 minute.
+// does not come back online within 3 minutes.
 func (c *Client) Reboot() error {
 	log.Infof("Rebooting host %q", c.client.RemoteAddr().String())
 	_, _ = c.Run(context.Background(), "reboot")
@@ -238,7 +251,7 @@ func (c *Client) Reboot() error {
 		return fmt.Errorf("host did not go offline in time")
 	}
 
-	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
 
 	// Wait for the host to come back online