From 2f8689e8b860e84a8db69df2670124dafe9714b0 Mon Sep 17 00:00:00 2001
From: Gabriel Nagy <gabriel.nagy@canonical.com>
Date: Thu, 7 Mar 2024 21:26:33 +0200
Subject: [PATCH 1/6] Deprovisioning can happen from two initial states

Our current model only allows for one initial state and one target
state. Some commands such as the deprovisioning one, can work
incrementally for the client/AD. For example, if we manage to provision
the client but fail to provision the AD, the script should still release
any client resources. As of now, it only works if the AD was successfully
provisioned.

There are multiple ways to handle this, but I've opted for the less
intrusive one of making states variadic arguments, where all but the
last are considered the "initial states", and the last is the state that
is being transitioned to. We still sacrifice some readability but
existing code will work, and I've made sure to document the new
intricacies of the WithStateTransition function.
---
 e2e/cmd/run_tests/99_deprovision/main.go | 31 ++++++-----
 e2e/internal/command/command.go          | 71 +++++++++++++++---------
 e2e/internal/command/command_test.go     | 17 ++++--
 3 files changed, 76 insertions(+), 43 deletions(-)

diff --git a/e2e/cmd/run_tests/99_deprovision/main.go b/e2e/cmd/run_tests/99_deprovision/main.go
index ef736aaa1..32a94a6ba 100644
--- a/e2e/cmd/run_tests/99_deprovision/main.go
+++ b/e2e/cmd/run_tests/99_deprovision/main.go
@@ -25,7 +25,7 @@ func main() {
 func run() int {
 	cmd := command.New(action,
 		command.WithValidateFunc(validate),
-		command.WithStateTransition(inventory.ADProvisioned, inventory.Deprovisioned),
+		command.WithStateTransition(inventory.ClientProvisioned, inventory.ADProvisioned, inventory.Deprovisioned),
 	)
 	cmd.Usage = fmt.Sprintf(`go run ./%s [options]
 
@@ -66,6 +66,23 @@ func action(ctx context.Context, cmd *command.Command) error {
 		return fmt.Errorf("failed to leave domain: %w", err)
 	}
 
+	// Destroy the client VM
+	log.Infof("Destroying client VM %q", cmd.Inventory.VMName)
+	_, _, err = az.RunCommand(ctx, "vm", "delete",
+		"--resource-group", "AD",
+		"--name", cmd.Inventory.VMName,
+		"--force-deletion", "true",
+		"--yes",
+	)
+	if err != nil {
+		return err
+	}
+
+	// Return early if we don't need to deprovision AD resources
+	if cmd.Inventory.State != inventory.ADProvisioned {
+		return nil
+	}
+
 	// Connect to the domain controller
 	client, err = remote.NewClient(inventory.DomainControllerIP, "localadmin", sshKey)
 	if err != nil {
@@ -82,17 +99,5 @@ func action(ctx context.Context, cmd *command.Command) error {
 		return fmt.Errorf("error running the PowerShell script: %w", err)
 	}
 
-	// Destroy the client VM
-	log.Infof("Destroying client VM %q", cmd.Inventory.VMName)
-	_, _, err = az.RunCommand(ctx, "vm", "delete",
-		"--resource-group", "AD",
-		"--name", cmd.Inventory.VMName,
-		"--force-deletion", "true",
-		"--yes",
-	)
-	if err != nil {
-		return err
-	}
-
 	return nil
 }
diff --git a/e2e/internal/command/command.go b/e2e/internal/command/command.go
index 537856aed..d1db9c1dd 100644
--- a/e2e/internal/command/command.go
+++ b/e2e/internal/command/command.go
@@ -40,62 +40,75 @@ type Command struct {
 
 	fSet *flag.FlagSet
 
-	fromState inventory.State
-	toState   inventory.State
+	fromStates []inventory.State
+	toState    inventory.State
 }
 
-// WithStateTransition sets the expected state transition for the command.
-func WithStateTransition(from, to inventory.State) func(*options) {
-	return func(a *options) {
-		a.fromState = from
-		a.toState = to
+// WithStateTransition sets the expected state transition for the command,
+// allowing for one or more initial states, and a final state.
+func WithStateTransition(states ...inventory.State) func(*options) error {
+	return func(a *options) error {
+		if len(states) < 2 {
+			return errors.New("expected at least two states")
+		}
+
+		a.fromStates = states[:len(states)-1]
+		a.toState = states[len(states)-1]
+
+		return nil
 	}
 }
 
 // WithRequiredState ensures that the inventory file is in the given state,
 // without a transition being performed.
-func WithRequiredState(state inventory.State) func(*options) {
-	return func(a *options) {
-		a.fromState = state
+func WithRequiredState(state inventory.State) func(*options) error {
+	return func(a *options) error {
+		a.fromStates = []inventory.State{state}
 		a.toState = state
+
+		return nil
 	}
 }
 
 // WithValidateFunc sets the validation function for the command.
-func WithValidateFunc(validate cmdFunc) func(*options) {
-	return func(a *options) {
+func WithValidateFunc(validate cmdFunc) func(*options) error {
+	return func(a *options) error {
 		a.validate = validate
+
+		return nil
 	}
 }
 
 type options struct {
-	validate  cmdFunc
-	fromState inventory.State
-	toState   inventory.State
+	validate   cmdFunc
+	fromStates []inventory.State
+	toState    inventory.State
 }
 
 // Option is a function that configures the command.
-type Option func(*options)
+type Option func(*options) error
 
 // New creates a new command.
 func New(action cmdFunc, args ...Option) *Command {
 	// Apply given options
 	opts := options{
-		fromState: inventory.Null,
-		toState:   inventory.Null,
+		fromStates: []inventory.State{inventory.Null},
+		toState:    inventory.Null,
 	}
 
 	for _, f := range args {
-		f(&opts)
+		if err := f(&opts); err != nil {
+			log.Fatalf("Failed to apply option: %s", err)
+		}
 	}
 
 	return &Command{
 		action:   action,
 		validate: opts.validate,
 
-		fSet:      flag.NewFlagSet("", flag.ContinueOnError),
-		fromState: opts.fromState,
-		toState:   opts.toState,
+		fSet:       flag.NewFlagSet("", flag.ContinueOnError),
+		fromStates: opts.fromStates,
+		toState:    opts.toState,
 	}
 }
 
@@ -195,8 +208,16 @@ func (c *Command) Execute(ctx context.Context) int {
 			return 1
 		}
 
-		if c.Inventory.State != c.fromState {
-			log.Errorf("Inventory file is not in the expected state: %s", c.fromState)
+		// Allow at least one of the expected states
+		found := false
+		for _, s := range c.fromStates {
+			if c.Inventory.State == s {
+				found = true
+				break
+			}
+		}
+		if !found {
+			log.Errorf("Inventory file is not in any of the expected initial states: %v", c.fromStates)
 			return 1
 		}
 	}
@@ -227,7 +248,7 @@ func (c *Command) Execute(ctx context.Context) int {
 }
 
 func (c *Command) requireInventory() bool {
-	return c.fromState != inventory.Null
+	return c.fromStates[0] != inventory.Null
 }
 
 func (c *Command) installSignalHandler(cancel context.CancelFunc) func() {
diff --git a/e2e/internal/command/command_test.go b/e2e/internal/command/command_test.go
index 83632b271..203b08583 100644
--- a/e2e/internal/command/command_test.go
+++ b/e2e/internal/command/command_test.go
@@ -36,8 +36,9 @@ func TestAddFlags(t *testing.T) {
 
 func TestInventory(t *testing.T) {
 	tests := map[string]struct {
-		fromState inventory.State
-		toState   inventory.State
+		fromState           inventory.State
+		toState             inventory.State
+		additionalFromState inventory.State
 
 		existingInventory string
 
@@ -47,9 +48,11 @@ func TestInventory(t *testing.T) {
 		"From null state doesn't require existing data": {toState: inventory.BaseVMCreated},
 		"From existing state requires existing data":    {fromState: inventory.BaseVMCreated, toState: inventory.TemplateCreated, existingInventory: "inventory_from_template_created"},
 		"To null state doesn't write data":              {toState: inventory.Null, wantNoFile: true},
+		"Multiple from states requires at least one":    {fromState: inventory.ClientProvisioned, toState: inventory.TemplateCreated, additionalFromState: inventory.BaseVMCreated, existingInventory: "inventory_from_template_created"},
 
-		"Error if inventory file is required and doesn't exist":  {fromState: inventory.TemplateCreated, wantErr: true},
-		"Error if inventory state does not match expected state": {fromState: inventory.TemplateCreated, existingInventory: "inventory_from_template_created", wantErr: true},
+		"Error if inventory file is required and doesn't exist":      {fromState: inventory.TemplateCreated, wantErr: true},
+		"Error if inventory state does not match expected state":     {fromState: inventory.TemplateCreated, existingInventory: "inventory_from_template_created", wantErr: true},
+		"Error if inventory state does not match any expected state": {fromState: inventory.TemplateCreated, additionalFromState: inventory.Null, existingInventory: "inventory_from_template_created", wantErr: true},
 	}
 
 	for name, tc := range tests {
@@ -73,7 +76,11 @@ func TestInventory(t *testing.T) {
 			inventoryPath := filepath.Join(tempDir, "inventory", tc.existingInventory)
 			os.Args = append(args, "--inventory-file", inventoryPath)
 
-			cmd := command.New(mockAction, command.WithStateTransition(tc.fromState, tc.toState))
+			states := []inventory.State{tc.fromState}
+			states = append(states, tc.additionalFromState)
+			states = append(states, tc.toState)
+
+			cmd := command.New(mockAction, command.WithStateTransition(states...))
 			ret := cmd.Execute(context.Background())
 
 			if tc.wantErr {

From 7fec9aa6927e572245e3ff8380a55004eee1a223 Mon Sep 17 00:00:00 2001
From: Gabriel Nagy <gabriel.nagy@canonical.com>
Date: Thu, 7 Mar 2024 21:45:59 +0200
Subject: [PATCH 2/6] Add simple retry mechanism for SSH connections

As seen in some failing CI runs, sometimes we fail to establish SSH
connection to either a client or the AD controller. I believe this is a
transient failure that could be mitigated if we retry the operation
multiple times.

This implements a simple retry mechanism that for now attempts to
connect a maximum of 10 times at an interval of 3 seconds.
---
 e2e/internal/remote/remote.go | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/e2e/internal/remote/remote.go b/e2e/internal/remote/remote.go
index 3f53681fc..1d8347f32 100644
--- a/e2e/internal/remote/remote.go
+++ b/e2e/internal/remote/remote.go
@@ -57,9 +57,22 @@ func NewClient(host string, username string, secret string) (Client, error) {
 		Timeout:         10 * time.Second,
 	}
 
-	client, err := ssh.Dial("tcp", host+":22", config)
+	var client *ssh.Client
+
+	interval := 3 * time.Second
+	retries := 10
+
+	for i := 1; i <= retries; i++ {
+		log.Debugf("Establishing SSH connection to %q (attempt %d/%d)", host, i, retries)
+		client, err = ssh.Dial("tcp", host+":22", config)
+		if err == nil {
+			break
+		}
+		log.Warningf("Failed to connect to %q: %v (attempt %d/%d)", host, err, i, retries)
+		time.Sleep(interval)
+	}
 	if err != nil {
-		return Client{}, fmt.Errorf("failed to establish connection to remote host: %w", err)
+		return Client{}, fmt.Errorf("failed to connect to %q: %w", host, err)
 	}
 
 	return Client{

From 0b00c6c1f6c2cddf5192fd5587642e2217419b5c Mon Sep 17 00:00:00 2001
From: Gabriel Nagy <gabriel.nagy@canonical.com>
Date: Thu, 7 Mar 2024 22:30:28 +0200
Subject: [PATCH 3/6] Update apt package list before realm join

It seems that realm join installs some packages on its own, but doesn't
run `apt update` first, resulting in errors such as:

    * Installing necessary packages: sssd-tools adcli sssd libnss-sss libpam-sss
    Password for localadmin:
    ! E: http://azure.archive.ubuntu.com/ubuntu focal-updates/main amd64 libc-ares2 amd64 1.15.0-1ubuntu0.4 is not (yet) available (404  Not Found [IP: 20.54.144.51 80])

We already run `apt update` just after, before installing adsys, so this
is a case of moving the update bit above `realm join`. In addition,
upgrade packages to avoid Ubuntu running unattended-upgr on its own and
breaking our use of `apt`.
---
 e2e/cmd/run_tests/01_provision_client/main.go | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/e2e/cmd/run_tests/01_provision_client/main.go b/e2e/cmd/run_tests/01_provision_client/main.go
index 8099984b8..6c66843a3 100644
--- a/e2e/cmd/run_tests/01_provision_client/main.go
+++ b/e2e/cmd/run_tests/01_provision_client/main.go
@@ -176,6 +176,12 @@ func action(ctx context.Context, cmd *command.Command) error {
 		}
 	}
 
+	log.Infof("Upgrading packages...")
+	_, err = client.Run(ctx, "apt-get -y update && DEBIAN_FRONTEND=noninteractive apt-get -y upgrade")
+	if err != nil {
+		return fmt.Errorf("failed to update package list: %w", err)
+	}
+
 	log.Infof("Joining VM to domain...")
 	_, err = client.Run(ctx, fmt.Sprintf("realm join warthogs.biz -U localadmin -v --unattended <<<'%s'", adPassword))
 	if err != nil {
@@ -183,7 +189,7 @@ func action(ctx context.Context, cmd *command.Command) error {
 	}
 
 	log.Infof("Installing adsys package...")
-	_, err = client.Run(ctx, "apt-get -y update && DEBIAN_FRONTEND=noninteractive apt-get install -y /debs/*.deb")
+	_, err = client.Run(ctx, "DEBIAN_FRONTEND=noninteractive apt-get install -y /debs/*.deb")
 	if err != nil {
 		return fmt.Errorf("failed to install adsys package: %w", err)
 	}

From 13c891c0ab04a6170a1e7203b30952feab09fda0 Mon Sep 17 00:00:00 2001
From: Gabriel Nagy <gabriel.nagy@canonical.com>
Date: Thu, 7 Mar 2024 23:36:48 +0200
Subject: [PATCH 4/6] Move realm join after installing all packages

There's a pesky unattended upgrade that hits us often on focal... let's
group all apt commands before joining the realm so there's less chance
of it starting between our apt runs.

If it becomes a bigger problem we can stop and disable the unattended
upgrade service.
---
 e2e/cmd/run_tests/01_provision_client/main.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/e2e/cmd/run_tests/01_provision_client/main.go b/e2e/cmd/run_tests/01_provision_client/main.go
index 6c66843a3..a28de2a8d 100644
--- a/e2e/cmd/run_tests/01_provision_client/main.go
+++ b/e2e/cmd/run_tests/01_provision_client/main.go
@@ -182,12 +182,6 @@ func action(ctx context.Context, cmd *command.Command) error {
 		return fmt.Errorf("failed to update package list: %w", err)
 	}
 
-	log.Infof("Joining VM to domain...")
-	_, err = client.Run(ctx, fmt.Sprintf("realm join warthogs.biz -U localadmin -v --unattended <<<'%s'", adPassword))
-	if err != nil {
-		return fmt.Errorf("failed to join VM to domain: %w", err)
-	}
-
 	log.Infof("Installing adsys package...")
 	_, err = client.Run(ctx, "DEBIAN_FRONTEND=noninteractive apt-get install -y /debs/*.deb")
 	if err != nil {
@@ -201,6 +195,12 @@ func action(ctx context.Context, cmd *command.Command) error {
 		log.Warningf("Some packages failed to install: %v", err)
 	}
 
+	log.Infof("Joining VM to domain...")
+	_, err = client.Run(ctx, fmt.Sprintf("realm join warthogs.biz -U localadmin -v --unattended <<<'%s'", adPassword))
+	if err != nil {
+		return fmt.Errorf("failed to join VM to domain: %w", err)
+	}
+
 	cmd.Inventory.IP = ipAddress
 	cmd.Inventory.VMID = id
 	cmd.Inventory.UUID = uuid

From edbb00ecc7fb072dcbc5f769d83f9aa67d744a1c Mon Sep 17 00:00:00 2001
From: Gabriel Nagy <gabriel.nagy@canonical.com>
Date: Thu, 7 Mar 2024 23:52:52 +0200
Subject: [PATCH 5/6] Add a sleep before checking machine scripts

Once a while we seem to have issues asserting the existence of the
startup script. Let's see if we can rule this out with a sleep after
reboot.
---
 e2e/cmd/run_tests/04_test_pro_managers/main.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/e2e/cmd/run_tests/04_test_pro_managers/main.go b/e2e/cmd/run_tests/04_test_pro_managers/main.go
index c02be2f72..d6f423699 100644
--- a/e2e/cmd/run_tests/04_test_pro_managers/main.go
+++ b/e2e/cmd/run_tests/04_test_pro_managers/main.go
@@ -198,6 +198,10 @@ ftp_proxy="http://127.0.0.1:8080"`); err != nil {
 	if err := client.Reboot(); err != nil {
 		return err
 	}
+
+	// Sleep a few seconds to ensure the machine startup script has time to run
+	time.Sleep(10 * time.Second)
+
 	if err := client.RequireFileExists(ctx, "/etc/created-by-adsys-machine-shutdown-script"); err != nil {
 		return err
 	}

From 1df84dde3d09031c8bf835f3c797865237630ca1 Mon Sep 17 00:00:00 2001
From: Gabriel Nagy <gabriel.nagy@canonical.com>
Date: Fri, 8 Mar 2024 00:58:59 +0200
Subject: [PATCH 6/6] Increase max timeout for client reboot

1 minute may not be enough, as per
https://github.com/ubuntu/adsys/actions/runs/8195833180/job/22414854516
---
 e2e/internal/remote/remote.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/e2e/internal/remote/remote.go b/e2e/internal/remote/remote.go
index 1d8347f32..593f5d8a6 100644
--- a/e2e/internal/remote/remote.go
+++ b/e2e/internal/remote/remote.go
@@ -231,7 +231,7 @@ func (c Client) Upload(localPath string, remotePath string) error {
 // Reboot reboots the remote host and waits for it to come back online, then
 // reestablishes the SSH connection.
 // It first waits for the host to go offline, then returns an error if the host
-// does not come back online within 1 minute.
+// does not come back online within 3 minutes.
 func (c *Client) Reboot() error {
 	log.Infof("Rebooting host %q", c.client.RemoteAddr().String())
 	_, _ = c.Run(context.Background(), "reboot")
@@ -251,7 +251,7 @@ func (c *Client) Reboot() error {
 		return fmt.Errorf("host did not go offline in time")
 	}
 
-	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
 
 	// Wait for the host to come back online