Commit 553d4d1: Cleanup janitor

Signed-off-by: Stefan Büringer [email protected]
sbueringer committed Jun 17, 2024
1 parent: bace233
Showing 12 changed files with 145 additions and 344 deletions.
12 changes: 3 additions & 9 deletions Makefile
@@ -930,15 +930,9 @@ clean-ci: ## Cleanup orphaned objects in CI
@if [ -z "${GOVC_USERNAME}" ]; then echo "GOVC_USERNAME is not set"; exit 1; fi
@if [ -z "${GOVC_PASSWORD}" ]; then echo "GOVC_PASSWORD is not set"; exit 1; fi
@if [ -z "${GOVC_URL}" ]; then echo "GOVC_URL is not set"; exit 1; fi
go run $(JANITOR_DIR) \
--dry-run=false \
--max-age=12h \
--ipam-namespace=default \
--folder=/SDDC-Datacenter/vm/Workloads/cluster-api-provider-vsphere \
--resource-pool=/SDDC-Datacenter/host/Cluster-1/Resources/Compute-ResourcePool/cluster-api-provider-vsphere \
--vm-folder=/SDDC-Datacenter/vm/Workloads/cluster-api-provider-vsphere \
--vm-folder=/SDDC-Datacenter/vm/Workloads/cloud-provider-vsphere \
--vm-folder=/SDDC-Datacenter/vm/Workloads/image-builder
@if [ -z "${VSPHERE_TLS_THUMBPRINT}" ]; then echo "VSPHERE_TLS_THUMBPRINT is not set"; exit 1; fi
@if [ -z "${BOSKOS_HOST}" ]; then echo "BOSKOS_HOST is not set"; exit 1; fi
go run $(JANITOR_DIR) --dry-run=false

.PHONY: clean-temporary
clean-temporary: ## Remove all temporary files and folders
8 changes: 4 additions & 4 deletions go.mod
@@ -2,7 +2,7 @@ module sigs.k8s.io/cluster-api-provider-vsphere

go 1.22.0

replace sigs.k8s.io/cluster-api => sigs.k8s.io/cluster-api v1.7.0-rc.0.0.20240610140608-2e3860ac7408
replace sigs.k8s.io/cluster-api => sigs.k8s.io/cluster-api v1.7.0-rc.0.0.20240617064349-5b6043e1b6ec

replace github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels => github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels v0.0.0-20240404200847-de75746a9505

@@ -37,7 +37,7 @@ require (
k8s.io/klog/v2 v2.120.1
k8s.io/utils v0.0.0-20231127182322-b307cd553661
sigs.k8s.io/cluster-api v1.7.0-rc.0.0.20240610140608-2e3860ac7408
sigs.k8s.io/controller-runtime v0.18.3
sigs.k8s.io/controller-runtime v0.18.4
sigs.k8s.io/kustomize/api v0.17.2
sigs.k8s.io/kustomize/kyaml v0.17.1
sigs.k8s.io/yaml v1.4.0
@@ -75,14 +75,14 @@ require (
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/oauth2 v0.20.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/term v0.21.0 // indirect
golang.org/x/text v0.16.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240311132316-a219d84964c2 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240314234333-6e1732d8331c // indirect
google.golang.org/grpc v1.62.1 // indirect
google.golang.org/grpc v1.62.2 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.0 // indirect
16 changes: 8 additions & 8 deletions go.sum
@@ -714,8 +714,8 @@ golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4Iltr
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.20.0 h1:4mQdhULixXKP1rwYBW0vAijoXnkTG0BLCDRzfe1idMo=
golang.org/x/oauth2 v0.20.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -900,8 +900,8 @@ google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac
google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.62.1 h1:B4n+nfKzOICUXMgyrNd19h/I9oH0L1pizfk1d4zSgTk=
google.golang.org/grpc v1.62.1/go.mod h1:IWTG0VlJLCh1SkC58F7np9ka9mx/WNkjl4PGJaiq+QE=
google.golang.org/grpc v1.62.2 h1:iEIj1U5qjyBjzkM5nk3Fq+S1IbjbXSyqeULZ1Nfo4AA=
google.golang.org/grpc v1.62.2/go.mod h1:IWTG0VlJLCh1SkC58F7np9ka9mx/WNkjl4PGJaiq+QE=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
@@ -1011,11 +1011,11 @@ rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.15/go.mod h1:LEScyzhFmoF5pso/YSeBstl57mOzx9xlU9n85RGrDQg=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.0 h1:Tc9rS7JJoZ9sl3OpL4842oIk6lH7gWBb0JOmJ0ute7M=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.0/go.mod h1:1ewhL9l1gkPcU/IU/6rFYfikf+7Y5imWv7ARVbBOzNs=
sigs.k8s.io/cluster-api v1.7.0-rc.0.0.20240610140608-2e3860ac7408 h1:2YkK+3O1A7HsQV9Cw8ncH+DGa1GuSFwv6lbl1bOyZgI=
sigs.k8s.io/cluster-api v1.7.0-rc.0.0.20240610140608-2e3860ac7408/go.mod h1:qN/cGR3Ww2GlMTcM47Abeob4SvpkN/8II439eNbPz6w=
sigs.k8s.io/cluster-api v1.7.0-rc.0.0.20240617064349-5b6043e1b6ec h1:ikgHzieJg7LTJjvL/o4gcs8pcGBLwXyvuRvrC4Uj7kk=
sigs.k8s.io/cluster-api v1.7.0-rc.0.0.20240617064349-5b6043e1b6ec/go.mod h1:tDxEz5a0levoOzLKny7JMW5S7g2P4fKYHNOMsS9IH/c=
sigs.k8s.io/controller-runtime v0.9.0/go.mod h1:TgkfvrhhEw3PlI0BRL/5xM+89y3/yc0ZDfdbTl84si8=
sigs.k8s.io/controller-runtime v0.18.3 h1:B5Wmmo8WMWK7izei+2LlXLVDGzMwAHBNLX68lwtlSR4=
sigs.k8s.io/controller-runtime v0.18.3/go.mod h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg=
sigs.k8s.io/controller-runtime v0.18.4 h1:87+guW1zhvuPLh1PHybKdYFLU0YJp4FhJRmiHvm5BZw=
sigs.k8s.io/controller-runtime v0.18.4/go.mod h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/kustomize/api v0.17.2 h1:E7/Fjk7V5fboiuijoZHgs4aHuexi5Y2loXlVOAVAG5g=
3 changes: 1 addition & 2 deletions hack/tools/boskosctl/main.go
@@ -359,8 +359,7 @@ func release(ctx context.Context, client *boskos.Client, resourceName, vSphereUs
defer vSphereClients.Logout(ctx)

// Delete all VMs created up until now.
maxCreationDate := time.Now()
j := janitor.NewJanitor(vSphereClients, nil, maxCreationDate, "", false)
j := janitor.NewJanitor(vSphereClients, false)

log.Info("Cleaning up vSphere")
// Note: We intentionally want to skip clusterModule cleanup. If we run this too often we might hit race conditions
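The call to janitor.NewJanitor above reflects this commit's simplification of the constructor: the IPAM client, maxCreationDate, and IPAM-namespace parameters are gone. The janitor package itself is among the changed files not shown in this excerpt, so the following is only a sketch under that assumption — the struct fields are guesses; only the two-argument signature is confirmed by the call sites in this diff.

```go
package janitor

// Sketch of the simplified Janitor after this commit. Field names are
// assumptions; only the NewJanitor(vSphereClients, dryRun) signature is
// confirmed by the call sites in boskosctl and the janitor's main.go.
type Janitor struct {
	vSphereClients *VSphereClients // authenticated vCenter clients
	dryRun         bool            // when true, only log what would be deleted
}

func NewJanitor(vSphereClients *VSphereClients, dryRun bool) *Janitor {
	return &Janitor{
		vSphereClients: vSphereClients,
		dryRun:         dryRun,
	}
}
```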
7 changes: 2 additions & 5 deletions hack/tools/janitor/README.md
@@ -3,8 +3,5 @@
The janitor is a tool for CI to clean up objects left over from failed or killed prowjobs.
It can be run regularly as a prowjob.

It tries to delete:

* vSphere: virtual machines in the configured folders which exist longer than the configured `--max-age` flag.
* vSphere: cluster modules which do not refer any virtual machine
* IPAM: IPAddressClaims which exist longer than the configured `--max-age` flag
It retrieves vSphere projects from Boskos and then deletes VMs and resource pools accordingly.
Additionally, it will delete cluster modules which do not refer to any virtual machine.
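For orientation before the full diff of hack/tools/janitor/main.go below: a condensed sketch of the per-resource-type cleanup loop this commit introduces, assuming client, vSphereClients, and ctx are already set up as in run(); error aggregation and logging are abbreviated.

```go
// Condensed sketch of the cleanup loop (see hack/tools/janitor/main.go below).
for {
	// Acquire moves a dirty, unowned resource into the "cleaning" state.
	res, err := client.Acquire(resourceType, boskos.Dirty, boskos.Cleaning)
	if err != nil {
		// boskos.ErrNotFound means no more dirty, unowned resources exist;
		// other errors also end the loop (the real code aggregates them).
		break
	}

	// Each Boskos project carries the vSphere folder and resource pool to clean.
	folder, _ := res.UserData.Load("folder")
	resourcePool, _ := res.UserData.Load("resourcePool")

	j := janitor.NewJanitor(vSphereClients, false)
	if err := j.CleanupVSphere(ctx, []string{folder.(string)}, []string{resourcePool.(string)}, []string{folder.(string)}, false); err != nil {
		// Keep the resource in "cleaning": the Boskos reaper will move it back
		// to dirty and the cleanup is retried on a later run.
		continue
	}

	// Cleanup succeeded: hand the project back to the free pool.
	_ = client.Release(res.Name, boskos.Free)
}
```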
185 changes: 83 additions & 102 deletions hack/tools/janitor/main.go
@@ -20,8 +20,8 @@ package main
import (
"context"
"flag"
"fmt"
"os"
"time"

"github.com/pkg/errors"
"github.com/spf13/pflag"
@@ -30,7 +30,6 @@ import (
"k8s.io/klog/v2"
ipamv1 "sigs.k8s.io/cluster-api/exp/ipam/api/v1beta1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"sigs.k8s.io/cluster-api-provider-vsphere/hack/tools/pkg/boskos"
"sigs.k8s.io/cluster-api-provider-vsphere/hack/tools/pkg/janitor"
@@ -45,28 +44,16 @@ func init() {

var (
dryRun bool
ipamNamespace string
maxAge time.Duration
// Flags to get folders and resource pools from Boskos.
boskosHost string
resourceOwner string
resourceTypes []string
// Flags to directly specify folders and resource pools.
vsphereVMFolders []string
vsphereFolders []string
vsphereResourcePools []string
)

func initFlags(fs *pflag.FlagSet) {
// Note: Intentionally not adding a fallback value, so it is still possible to not use Boskos.
fs.StringVar(&boskosHost, "boskos-host", os.Getenv("BOSKOS_HOST"), "Boskos server URL. Boskos is only used to retrieve resources if this flag is set.")
fs.StringVar(&resourceOwner, "resource-owner", "vsphere-janitor", "Owner for the resource.")
fs.StringVar(&resourceOwner, "resource-owner", "vsphere-janitor", "Owner for the resource during cleanup.")
fs.StringArrayVar(&resourceTypes, "resource-type", []string{"vsphere-project-cluster-api-provider", "vsphere-project-cloud-provider", "vsphere-project-image-builder"}, "Types of the resources")
fs.StringArrayVar(&vsphereVMFolders, "vm-folder", []string{}, "Path to folders in vCenter to cleanup virtual machines.")
fs.StringArrayVar(&vsphereFolders, "folder", []string{}, "Path to a folder in vCenter to recursively cleanup empty subfolders.")
fs.StringArrayVar(&vsphereResourcePools, "resource-pool", []string{}, "Path to a resource pool in vCenter to recursively cleanup empty child resource pools.")
fs.StringVar(&ipamNamespace, "ipam-namespace", "", "Namespace for IPAddressClaim cleanup.")
fs.DurationVar(&maxAge, "max-age", time.Hour*12, "Maximum age of an object before it is getting deleted.")
fs.BoolVar(&dryRun, "dry-run", false, "dry-run results in not deleting anything but printing the actions.")
}

@@ -90,6 +77,17 @@ func main() {

func run(ctx context.Context) error {
log := ctrl.LoggerFrom(ctx)
log.Info("Configured settings", "dry-run", dryRun)

if boskosHost == "" {
return fmt.Errorf("--boskos-host must be set")
}
if resourceOwner == "" {
return fmt.Errorf("--resource-owner must be set")
}
if len(resourceTypes) == 0 {
return fmt.Errorf("--resource-type must be set")
}

// Create clients for vSphere.
vSphereClients, err := janitor.NewVSphereClients(ctx, janitor.NewVSphereClientsInput{
@@ -104,111 +102,94 @@ func run(ctx context.Context) error {
}
defer vSphereClients.Logout(ctx)

// Create controller-runtime client for IPAM.
restConfig, err := ctrl.GetConfig()
if err != nil {
return errors.Wrap(err, "unable to get kubeconfig")
}
ipamClient, err := client.New(restConfig, client.Options{Scheme: ipamScheme})
log = log.WithValues("boskosHost", boskosHost, "resourceOwner", resourceOwner)
ctx = ctrl.LoggerInto(ctx, log)
log.Info("Getting resources to cleanup from Boskos")
client, err := boskos.NewClient(resourceOwner, boskosHost)
if err != nil {
return errors.Wrap(err, "creating IPAM client")
return err
}

if boskosHost != "" {
log = log.WithValues("boskosHost", boskosHost, "resourceOwner", resourceOwner)
log.Info("Getting resources to cleanup from Boskos")
client, err := boskos.NewClient(resourceOwner, boskosHost)
var allErrs []error
for _, resourceType := range resourceTypes {
log := log.WithValues("resourceType", resourceType)
ctx := ctrl.LoggerInto(ctx, log)

metrics, err := client.Metric(resourceType)
if err != nil {
return err
allErrs = append(allErrs, errors.Errorf("failed to get metrics before cleanup for resource type %q", resourceType))
} else {
log.Info("State before cleanup", "resourceOwners", metrics.Owners, "resourceStates", metrics.Current)
}

var allErrs []error
for _, resourceType := range resourceTypes {
// For all resources in state dirty that are currently not owned:
// * acquire the resource (and set it to state "cleaning")
// * try to clean up vSphere
// * if cleanup succeeds, release the resource as free
// * if cleanup fails, resource will stay in cleaning and become stale (reaper will move it to dirty)
for {
log.Info("Acquiring resource")
res, err := client.Acquire(resourceType, boskos.Dirty, boskos.Cleaning)
if err != nil {
// If we get an error on acquire we're done looping through all dirty resources
if errors.Is(err, boskos.ErrNotFound) {
// Note: ErrNotFound means there are no more dirty resources that are not owned.
log.Info("No more resources to cleanup")
break
}
allErrs = append(allErrs, errors.Wrapf(err, "failed to acquire resource"))
// For all resources in state dirty that are currently not owned:
// * acquire the resource (and set it to state "cleaning")
// * try to clean up vSphere
// * if cleanup succeeds, release the resource as free
// * if cleanup fails, resource will stay in cleaning and become stale (reaper will move it to dirty)
for {
log.Info("Acquiring resource")
res, err := client.Acquire(resourceType, boskos.Dirty, boskos.Cleaning)
if err != nil {
// If we get an error on acquire we're done looping through all dirty resources
if errors.Is(err, boskos.ErrNotFound) {
// Note: ErrNotFound means there are no more dirty resources that are not owned.
log.Info("No more resources to cleanup")
break
}
log := log.WithValues("resourceName", res.Name)
allErrs = append(allErrs, errors.Wrapf(err, "failed to acquire resource"))
break
}
log := log.WithValues("resourceName", res.Name)
ctx := ctrl.LoggerInto(ctx, log)

if res.UserData == nil {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing user data", res.Name))
continue
}
if res.UserData == nil {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing user data", res.Name))
continue
}

folder, hasFolder := res.UserData.Load("folder")
if !hasFolder {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing \"folder\" key", res.Name))
continue
}
resourcePool, hasResourcePool := res.UserData.Load("resourcePool")
if !hasResourcePool {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing \"resourcePool\" key", res.Name))
continue
}
folder, hasFolder := res.UserData.Load("folder")
if !hasFolder {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing \"folder\" key", res.Name))
continue
}
resourcePool, hasResourcePool := res.UserData.Load("resourcePool")
if !hasResourcePool {
allErrs = append(allErrs, errors.Errorf("failed to get user data, resource %q is missing \"resourcePool\" key", res.Name))
continue
}

// Delete all VMs created up until now.
maxCreationDate := time.Now()
j := janitor.NewJanitor(vSphereClients, nil, maxCreationDate, "", false)
j := janitor.NewJanitor(vSphereClients, false)

log.Info("Cleaning up vSphere")
if err := j.CleanupVSphere(ctx, []string{folder.(string)}, []string{resourcePool.(string)}, []string{folder.(string)}, false); err != nil {
log.Info("Cleaning up vSphere failed")
log.Info("Cleaning up vSphere")
if err := j.CleanupVSphere(ctx, []string{folder.(string)}, []string{resourcePool.(string)}, []string{folder.(string)}, false); err != nil {
log.Info("Cleaning up vSphere failed")

// Intentionally keep this resource in cleaning state. The reaper will move it from cleaning to dirty
// and we'll retry the cleanup.
// If we move it to dirty here, the for loop will pick it up again, and we get stuck in an infinite loop.
allErrs = append(allErrs, errors.Wrapf(err, "cleaning up vSphere failed, resource %q will now become stale", res.Name))
continue
}
log.Info("Cleaning up vSphere succeeded")
// Intentionally keep this resource in cleaning state. The reaper will move it from cleaning to dirty
// and we'll retry the cleanup.
// If we move it to dirty here, the for loop will pick it up again, and we get stuck in an infinite loop.
allErrs = append(allErrs, errors.Wrapf(err, "cleaning up vSphere failed, resource %q will now become stale", res.Name))
continue
}
log.Info("Cleaning up vSphere succeeded")

// Try to release resource as free.
log.Info("Releasing resource as free")
if releaseErr := client.Release(res.Name, boskos.Free); releaseErr != nil {
allErrs = append(allErrs, errors.Wrapf(releaseErr, "cleaning up vSphere succeeded and releasing resource as free failed, resource %q will now become stale", res.Name))
}
log.Info("Releasing resource as free succeeded")
// Try to release resource as free.
log.Info("Releasing resource as free")
if releaseErr := client.Release(res.Name, boskos.Free); releaseErr != nil {
allErrs = append(allErrs, errors.Wrapf(releaseErr, "cleaning up vSphere succeeded and releasing resource as free failed, resource %q will now become stale", res.Name))
}
log.Info("Releasing resource as free succeeded")
}
if len(allErrs) > 0 {
return errors.Wrap(kerrors.NewAggregate(allErrs), "cleaning up Boskos resources")
}
}

// Note: The following will be deleted once we migrated all repos to Boskos.
maxCreationDate := time.Now().Add(-maxAge)
janitor := janitor.NewJanitor(vSphereClients, ipamClient, maxCreationDate, ipamNamespace, dryRun)

log.Info("Configured settings", "dry-run", dryRun)
log.Info("Configured settings", "folders", vsphereFolders)
log.Info("Configured settings", "vm-folders", vsphereVMFolders)
log.Info("Configured settings", "resource-pools", vsphereResourcePools)
log.Info("Configured settings", "ipam-namespace", ipamNamespace)
log.Info("Configured settings", "max-age", maxAge)
log.Info("Configured settings", "janitor.maxCreationDate", maxCreationDate)

// First cleanup old vms and other vSphere resources to free up IPAddressClaims or cluster modules which are still in-use.
if err := janitor.CleanupVSphere(ctx, vsphereFolders, vsphereResourcePools, vsphereVMFolders, false); err != nil {
return errors.Wrap(err, "cleaning up vSphere")
metrics, err = client.Metric(resourceType)
if err != nil {
allErrs = append(allErrs, errors.Errorf("failed to get metrics after cleanup for resource type %q", resourceType))
} else {
log.Info("State after cleanup", "resourceOwners", metrics.Owners, "resourceStates", metrics.Current)
}
}

// Second cleanup IPAddressClaims.
if err := janitor.DeleteIPAddressClaims(ctx); err != nil {
return errors.Wrap(err, "cleaning up IPAddressClaims")
if len(allErrs) > 0 {
return errors.Wrap(kerrors.NewAggregate(allErrs), "cleaning up Boskos resources")
}

return nil
Expand Down