Commit

Cleanup janitor
Signed-off-by: Stefan Büringer [email protected]
sbueringer committed Jun 17, 2024
1 parent bace233 commit dcd79c8
Showing 6 changed files with 94 additions and 299 deletions.
12 changes: 3 additions & 9 deletions Makefile
@@ -930,15 +930,9 @@ clean-ci: ## Cleanup orphaned objects in CI
@if [ -z "${GOVC_USERNAME}" ]; then echo "GOVC_USERNAME is not set"; exit 1; fi
@if [ -z "${GOVC_PASSWORD}" ]; then echo "GOVC_PASSWORD is not set"; exit 1; fi
@if [ -z "${GOVC_URL}" ]; then echo "GOVC_URL is not set"; exit 1; fi
go run $(JANITOR_DIR) \
--dry-run=false \
--max-age=12h \
--ipam-namespace=default \
--folder=/SDDC-Datacenter/vm/Workloads/cluster-api-provider-vsphere \
--resource-pool=/SDDC-Datacenter/host/Cluster-1/Resources/Compute-ResourcePool/cluster-api-provider-vsphere \
--vm-folder=/SDDC-Datacenter/vm/Workloads/cluster-api-provider-vsphere \
--vm-folder=/SDDC-Datacenter/vm/Workloads/cloud-provider-vsphere \
--vm-folder=/SDDC-Datacenter/vm/Workloads/image-builder
@if [ -z "${VSPHERE_TLS_THUMBPRINT}" ]; then echo "VSPHERE_TLS_THUMBPRINT is not set"; exit 1; fi
@if [ -z "${BOSKOS_HOST}" ]; then echo "BOSKOS_HOST is not set"; exit 1; fi
go run $(JANITOR_DIR) --dry-run=false

.PHONY: clean-temporary
clean-temporary: ## Remove all temporary files and folders
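With the folder, resource-pool and IPAM flags gone, clean-ci now only guards the required credentials and the Boskos endpoint and leaves everything else to the janitor binary, which picks BOSKOS_HOST up as the default for its --boskos-host flag (see hack/tools/janitor/main.go below). The following Go sketch mirrors that guard pattern with a hypothetical requireEnv helper — it is not part of the repository, and only the environment variable names are taken from the target above:

package main

import (
	"fmt"
	"os"
)

// requireEnv returns an error naming any of the given environment variables
// that are unset or empty, mirroring the shell `if [ -z ... ]` guards in the
// clean-ci target. Illustrative only.
func requireEnv(names ...string) error {
	var missing []string
	for _, name := range names {
		if os.Getenv(name) == "" {
			missing = append(missing, name)
		}
	}
	if len(missing) > 0 {
		return fmt.Errorf("required environment variables not set: %v", missing)
	}
	return nil
}

func main() {
	required := []string{"GOVC_USERNAME", "GOVC_PASSWORD", "GOVC_URL", "VSPHERE_TLS_THUMBPRINT", "BOSKOS_HOST"}
	if err := requireEnv(required...); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("environment looks complete; safe to run the janitor")
}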
3 changes: 1 addition & 2 deletions hack/tools/boskosctl/main.go
@@ -359,8 +359,7 @@ func release(ctx context.Context, client *boskos.Client, resourceName, vSphereUs
defer vSphereClients.Logout(ctx)

// Delete all VMs created up until now.
maxCreationDate := time.Now()
j := janitor.NewJanitor(vSphereClients, nil, maxCreationDate, "", false)
j := janitor.NewJanitor(vSphereClients, false)

log.Info("Cleaning up vSphere")
// Note: We intentionally want to skip clusterModule cleanup. If we run this too often we might hit race conditions
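NewJanitor loses the IPAM client, max-creation-date and namespace arguments: callers now pass only the vSphere clients plus what is taken here to be the dry-run switch, and the paths to clean are handed to CleanupVSphere instead (its folder, resource-pool and VM-folder parameters are visible further down in hack/tools/janitor/main.go). A minimal sketch of the new calling convention — the NewVSphereClientsInput fields are elided, the paths are placeholders, and the final CleanupVSphere argument is assumed, from the comment above, to control whether cluster-module cleanup is skipped:

package main

import (
	"context"
	"log"

	"sigs.k8s.io/cluster-api-provider-vsphere/hack/tools/pkg/janitor"
)

func main() {
	if err := run(context.Background()); err != nil {
		log.Fatal(err)
	}
}

func run(ctx context.Context) error {
	// Connection details elided; see NewVSphereClientsInput in the janitor package.
	vSphereClients, err := janitor.NewVSphereClients(ctx, janitor.NewVSphereClientsInput{})
	if err != nil {
		return err
	}
	defer vSphereClients.Logout(ctx)

	// New constructor shape: only the clients and a dry-run switch.
	j := janitor.NewJanitor(vSphereClients, true)

	// Placeholder paths; in CI these come from Boskos user data (see main.go below).
	folder := "/DC/vm/Workloads/some-project"
	resourcePool := "/DC/host/Cluster-1/Resources/some-project"

	return j.CleanupVSphere(ctx,
		[]string{folder},       // folders to clean up recursively
		[]string{resourcePool}, // resource pools to clean up recursively
		[]string{folder},       // VM folders to delete leftover VMs from
		false,                  // assumed: whether to skip cluster-module cleanup
	)
}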
173 changes: 68 additions & 105 deletions hack/tools/janitor/main.go
@@ -20,8 +20,8 @@ package main
import (
"context"
"flag"
"fmt"
"os"
"time"

"github.com/pkg/errors"
"github.com/spf13/pflag"
@@ -30,7 +30,6 @@ import (
"k8s.io/klog/v2"
ipamv1 "sigs.k8s.io/cluster-api/exp/ipam/api/v1beta1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"sigs.k8s.io/cluster-api-provider-vsphere/hack/tools/pkg/boskos"
"sigs.k8s.io/cluster-api-provider-vsphere/hack/tools/pkg/janitor"
@@ -45,28 +44,16 @@

var (
dryRun bool
ipamNamespace string
maxAge time.Duration
// Flags to get folders and resource pools from Boskos.
boskosHost string
resourceOwner string
resourceTypes []string
// Flags to directly specify folders and resource pools.
vsphereVMFolders []string
vsphereFolders []string
vsphereResourcePools []string
)

func initFlags(fs *pflag.FlagSet) {
// Note: Intentionally not adding a fallback value, so it is still possible to not use Boskos.
fs.StringVar(&boskosHost, "boskos-host", os.Getenv("BOSKOS_HOST"), "Boskos server URL. Boskos is only used to retrieve resources if this flag is set.")
fs.StringVar(&resourceOwner, "resource-owner", "vsphere-janitor", "Owner for the resource.")
fs.StringArrayVar(&resourceTypes, "resource-type", []string{"vsphere-project-cluster-api-provider", "vsphere-project-cloud-provider", "vsphere-project-image-builder"}, "Types of the resources")
fs.StringArrayVar(&vsphereVMFolders, "vm-folder", []string{}, "Path to folders in vCenter to cleanup virtual machines.")
fs.StringArrayVar(&vsphereFolders, "folder", []string{}, "Path to a folder in vCenter to recursively cleanup empty subfolders.")
fs.StringArrayVar(&vsphereResourcePools, "resource-pool", []string{}, "Path to a resource pool in vCenter to recursively cleanup empty child resource pools.")
fs.StringVar(&ipamNamespace, "ipam-namespace", "", "Namespace for IPAddressClaim cleanup.")
fs.DurationVar(&maxAge, "max-age", time.Hour*12, "Maximum age of an object before it is getting deleted.")
fs.BoolVar(&dryRun, "dry-run", false, "dry-run results in not deleting anything but printing the actions.")
}

@@ -90,6 +77,17 @@ func main()

func run(ctx context.Context) error {
log := ctrl.LoggerFrom(ctx)
log.Info("Configured settings", "dry-run", dryRun)

if boskosHost == "" {
return fmt.Errorf("--boskos-host must be set")
}
if resourceOwner == "" {
return fmt.Errorf("--resource-owner must be set")
}
if len(resourceTypes) == 0 {
return fmt.Errorf("--resource-type must be set")
}

// Create clients for vSphere.
vSphereClients, err := janitor.NewVSphereClients(ctx, janitor.NewVSphereClientsInput{
@@ -104,111 +102,76 @@ func run(ctx context.Context) error {
}
defer vSphereClients.Logout(ctx)

// Create controller-runtime client for IPAM.
restConfig, err := ctrl.GetConfig()
if err != nil {
return errors.Wrap(err, "unable to get kubeconfig")
}
ipamClient, err := client.New(restConfig, client.Options{Scheme: ipamScheme})
log = log.WithValues("boskosHost", boskosHost, "resourceOwner", resourceOwner)
log.Info("Getting resources to cleanup from Boskos")
client, err := boskos.NewClient(resourceOwner, boskosHost)
if err != nil {
return errors.Wrap(err, "creating IPAM client")
return err
}

if boskosHost != "" {
log = log.WithValues("boskosHost", boskosHost, "resourceOwner", resourceOwner)
log.Info("Getting resources to cleanup from Boskos")
client, err := boskos.NewClient(resourceOwner, boskosHost)
if err != nil {
return err
}

var allErrs []error
for _, resourceType := range resourceTypes {
// For all resource in state dirty that are currently not owned:
// * acquire the resource (and set it to state "cleaning")
// * try to clean up vSphere
// * if cleanup succeeds, release the resource as free
// * if cleanup fails, resource will stay in cleaning and become stale (reaper will move it to dirty)
for {
log.Info("Acquiring resource")
res, err := client.Acquire(resourceType, boskos.Dirty, boskos.Cleaning)
if err != nil {
// If we get an error on acquire we're done looping through all dirty resources
if errors.Is(err, boskos.ErrNotFound) {
// Note: ErrNotFound means there are no more dirty resources that are not owned.
log.Info("No more resources to cleanup")
break
}
allErrs = append(allErrs, errors.Wrapf(err, "failed to acquire resource"))
var allErrs []error
for _, resourceType := range resourceTypes {
log = log.WithValues("resourceType", resourceType)
// For all resource in state dirty that are currently not owned:
// * acquire the resource (and set it to state "cleaning")
// * try to clean up vSphere
// * if cleanup succeeds, release the resource as free
// * if cleanup fails, resource will stay in cleaning and become stale (reaper will move it to dirty)
for {
log.Info("Acquiring resource")
res, err := client.Acquire(resourceType, boskos.Dirty, boskos.Cleaning)
if err != nil {
// If we get an error on acquire we're done looping through all dirty resources
if errors.Is(err, boskos.ErrNotFound) {
// Note: ErrNotFound means there are no more dirty resources that are not owned.
log.Info("No more resources to cleanup")
break
}
log := log.WithValues("resourceName", res.Name)
allErrs = append(allErrs, errors.Wrapf(err, "failed to acquire resource"))
break
}
log := log.WithValues("resourceName", res.Name)

if res.UserData == nil {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing user data", res.Name))
continue
}
if res.UserData == nil {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing user data", res.Name))
continue
}

folder, hasFolder := res.UserData.Load("folder")
if !hasFolder {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing \"folder\" key", res.Name))
continue
}
resourcePool, hasResourcePool := res.UserData.Load("resourcePool")
if !hasResourcePool {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing \"resourcePool\" key", res.Name))
continue
}
folder, hasFolder := res.UserData.Load("folder")
if !hasFolder {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing \"folder\" key", res.Name))
continue
}
resourcePool, hasResourcePool := res.UserData.Load("resourcePool")
if !hasResourcePool {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing \"resourcePool\" key", res.Name))
continue
}

// Delete all VMs created up until now.
maxCreationDate := time.Now()
j := janitor.NewJanitor(vSphereClients, nil, maxCreationDate, "", false)
j := janitor.NewJanitor(vSphereClients, false)

log.Info("Cleaning up vSphere")
if err := j.CleanupVSphere(ctx, []string{folder.(string)}, []string{resourcePool.(string)}, []string{folder.(string)}, false); err != nil {
log.Info("Cleaning up vSphere failed")
log.Info("Cleaning up vSphere")
if err := j.CleanupVSphere(ctx, []string{folder.(string)}, []string{resourcePool.(string)}, []string{folder.(string)}, false); err != nil {
log.Info("Cleaning up vSphere failed")

// Intentionally keep this resource in cleaning state. The reaper will move it from cleaning to dirty
// and we'll retry the cleanup.
// If we move it to dirty here, the for loop will pick it up again, and we get stuck in an infinite loop.
allErrs = append(allErrs, errors.Wrapf(err, "cleaning up vSphere failed, resource %q will now become stale", res.Name))
continue
}
log.Info("Cleaning up vSphere succeeded")
// Intentionally keep this resource in cleaning state. The reaper will move it from cleaning to dirty
// and we'll retry the cleanup.
// If we move it to dirty here, the for loop will pick it up again, and we get stuck in an infinite loop.
allErrs = append(allErrs, errors.Wrapf(err, "cleaning up vSphere failed, resource %q will now become stale", res.Name))
continue
}
log.Info("Cleaning up vSphere succeeded")

// Try to release resource as free.
log.Info("Releasing resource as free")
if releaseErr := client.Release(res.Name, boskos.Free); releaseErr != nil {
allErrs = append(allErrs, errors.Wrapf(releaseErr, "cleaning up vSphere succeeded and releasing resource as free failed, resource %q will now become stale", res.Name))
}
log.Info("Releasing resource as free succeeded")
// Try to release resource as free.
log.Info("Releasing resource as free")
if releaseErr := client.Release(res.Name, boskos.Free); releaseErr != nil {
allErrs = append(allErrs, errors.Wrapf(releaseErr, "cleaning up vSphere succeeded and releasing resource as free failed, resource %q will now become stale", res.Name))
}
log.Info("Releasing resource as free succeeded")
}
if len(allErrs) > 0 {
return errors.Wrap(kerrors.NewAggregate(allErrs), "cleaning up Boskos resources")
}
}

// Note: The following will be deleted once we migrated all repos to Boskos.
maxCreationDate := time.Now().Add(-maxAge)
janitor := janitor.NewJanitor(vSphereClients, ipamClient, maxCreationDate, ipamNamespace, dryRun)

log.Info("Configured settings", "dry-run", dryRun)
log.Info("Configured settings", "folders", vsphereFolders)
log.Info("Configured settings", "vm-folders", vsphereVMFolders)
log.Info("Configured settings", "resource-pools", vsphereResourcePools)
log.Info("Configured settings", "ipam-namespace", ipamNamespace)
log.Info("Configured settings", "max-age", maxAge)
log.Info("Configured settings", "janitor.maxCreationDate", maxCreationDate)

// First cleanup old vms and other vSphere resources to free up IPAddressClaims or cluster modules which are still in-use.
if err := janitor.CleanupVSphere(ctx, vsphereFolders, vsphereResourcePools, vsphereVMFolders, false); err != nil {
return errors.Wrap(err, "cleaning up vSphere")
}

// Second cleanup IPAddressClaims.
if err := janitor.DeleteIPAddressClaims(ctx); err != nil {
return errors.Wrap(err, "cleaning up IPAddressClaims")
if len(allErrs) > 0 {
return errors.Wrap(kerrors.NewAggregate(allErrs), "cleaning up Boskos resources")
}

return nil
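Condensed, the Boskos-driven cleanup that run() now performs looks roughly like the sketch below. It is reconstructed from the added lines above — log statements and error messages are abbreviated, and the *janitor.VSphereClients parameter type is inferred from NewVSphereClients — so treat it as a reading aid rather than a verbatim copy of the file:

package sketch

import (
	"context"

	"github.com/pkg/errors"
	kerrors "k8s.io/apimachinery/pkg/util/errors"

	"sigs.k8s.io/cluster-api-provider-vsphere/hack/tools/pkg/boskos"
	"sigs.k8s.io/cluster-api-provider-vsphere/hack/tools/pkg/janitor"
)

// cleanupBoskosResources condenses the loop added to run() above.
func cleanupBoskosResources(ctx context.Context, client *boskos.Client, vSphereClients *janitor.VSphereClients, resourceTypes []string) error {
	var allErrs []error
	for _, resourceType := range resourceTypes {
		for {
			// Acquire a dirty, currently un-owned resource and move it to "cleaning".
			res, err := client.Acquire(resourceType, boskos.Dirty, boskos.Cleaning)
			if err != nil {
				if errors.Is(err, boskos.ErrNotFound) {
					break // no more dirty resources of this type
				}
				allErrs = append(allErrs, errors.Wrapf(err, "failed to acquire resource"))
				break
			}

			// The folder and resource pool to clean are carried as Boskos user data.
			if res.UserData == nil {
				allErrs = append(allErrs, errors.Errorf("resource %q is missing user data", res.Name))
				continue
			}
			folder, hasFolder := res.UserData.Load("folder")
			resourcePool, hasResourcePool := res.UserData.Load("resourcePool")
			if !hasFolder || !hasResourcePool {
				allErrs = append(allErrs, errors.Errorf("resource %q is missing \"folder\" or \"resourcePool\" user data", res.Name))
				continue
			}

			// Delete everything created in the project up until now.
			j := janitor.NewJanitor(vSphereClients, false)
			if err := j.CleanupVSphere(ctx, []string{folder.(string)}, []string{resourcePool.(string)}, []string{folder.(string)}, false); err != nil {
				// Keep the resource in "cleaning"; the Boskos reaper will move it back
				// to dirty and the cleanup is retried on a later run.
				allErrs = append(allErrs, errors.Wrapf(err, "cleaning up vSphere failed, resource %q will now become stale", res.Name))
				continue
			}

			// Cleanup succeeded: hand the resource back to Boskos as free.
			if releaseErr := client.Release(res.Name, boskos.Free); releaseErr != nil {
				allErrs = append(allErrs, errors.Wrapf(releaseErr, "releasing resource %q as free failed", res.Name))
			}
		}
	}
	if len(allErrs) > 0 {
		return errors.Wrap(kerrors.NewAggregate(allErrs), "cleaning up Boskos resources")
	}
	return nil
}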