Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 test: fix collector for machines not having an IP in status and change ignition ssh user to capv #3010

Merged
merged 2 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ endif

# Helper function to get dependency version from go.mod
get_go_version = $(shell go list -m $1 | awk '{print $$NF}')
get_test_go_version = $(shell cat test/go.mod | grep $1 | awk '{print $$NF}')
get_test_go_version = $(shell cat test/go.mod | grep $1 | awk '{print $$2}')

#
# Binaries.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,10 @@ patches:
- target:
kind: VSphereMachineTemplate
path: ../commons/remove-storage-policy.yaml
# Replace ssh user to match expected user by the e2e machine collector
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

q: is it possible to change it cluster-template-ignition.yaml instead?

Copy link
Member

@sbueringer sbueringer May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should keep the core user there. It's sort of standard with Ignition I think? (IIRC)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, core is the standard in ignition.

- target:
kind: KubeadmControlPlane
path: patch-user-kcp.yaml
- target:
kind: KubeadmConfigTemplate
path: patch-user-md.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- op: add
path: /spec/kubeadmConfigSpec/users/0/name
value: "capv"
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- op: add
path: /spec/template/spec/users/0/name
value: "capv"
5 changes: 4 additions & 1 deletion test/e2e/e2e_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,10 @@ var _ = SynchronizedBeforeSuite(func() []byte {
// vspherelog.MachineLogCollector tries to ssh to the machines to collect logs.
// This does not work when using vcsim because there are no real machines running ssh.
if testTarget != VCSimTestTarget {
clusterProxyOptions = append(clusterProxyOptions, framework.WithMachineLogCollector(vspherelog.MachineLogCollector{}))
clusterProxyOptions = append(clusterProxyOptions, framework.WithMachineLogCollector(&vspherelog.MachineLogCollector{
Client: vsphereClient,
Finder: vsphereFinder,
}))
}
bootstrapClusterProxy = framework.NewClusterProxy("bootstrap", kubeconfigPath, initScheme(), clusterProxyOptions...)

Expand Down
116 changes: 101 additions & 15 deletions test/framework/log/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,37 +22,42 @@ import (
"context"
"fmt"
"io"
"net"
"os"
"path/filepath"
"strings"
"sync"

"github.com/pkg/errors"
"github.com/vmware/govmomi"
"github.com/vmware/govmomi/find"
"github.com/vmware/govmomi/vim25/mo"
"golang.org/x/crypto/ssh"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog/v2"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
"sigs.k8s.io/controller-runtime/pkg/client"
kinderrors "sigs.k8s.io/kind/pkg/errors"
)

const (
DefaultUserName = "capv"
VSpherePrivateKeyFilePath = "VSPHERE_SSH_PRIVATE_KEY"
)

type MachineLogCollector struct{}
type MachineLogCollector struct {
Client *govmomi.Client
Finder *find.Finder
}

func (collector MachineLogCollector) CollectMachinePoolLog(_ context.Context, _ client.Client, _ *expv1.MachinePool, _ string) error {
func (c *MachineLogCollector) CollectMachinePoolLog(_ context.Context, _ client.Client, _ *expv1.MachinePool, _ string) error {
return nil
}

func (collector MachineLogCollector) CollectMachineLog(_ context.Context, _ client.Client, m *clusterv1.Machine, outputPath string) error {
var hostIPAddr string
for _, address := range m.Status.Addresses {
if address.Type != clusterv1.MachineExternalIP {
continue
}
hostIPAddr = address.Address
break
func (c *MachineLogCollector) CollectMachineLog(ctx context.Context, _ client.Client, m *clusterv1.Machine, outputPath string) error {
machineIPAddresses, err := c.machineIPAddresses(ctx, m)
if err != nil {
return err
}

captureLogs := func(hostFileName, command string, args ...string) func() error {
Expand All @@ -62,11 +67,24 @@ func (collector MachineLogCollector) CollectMachineLog(_ context.Context, _ clie
return err
}
defer f.Close()
return executeRemoteCommand(f, hostIPAddr, command, args...)
var errs []error
// Try with all available IPs unless it succeeded.
for _, machineIPAddress := range machineIPAddresses {
if err := executeRemoteCommand(f, machineIPAddress, command, args...); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: does this imply we do expect ssh connectivity from the prow container to the machine? is this a new constraint introduced by this PR or it already exists for something else (thinking about constraints for the new test environment)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already did this for a while to get logs

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR just fixes the usage where we try to pull data via SSH to make it work in more failure cases. Before we tried but often did not get any data because there was no IP address yet set on the Machine's status object.

errs = append(errs, err)
continue
}
return nil
}

if err := kerrors.NewAggregate(errs); err != nil {
return errors.Wrapf(err, "failed to run command %s for machine %s on ips [%s]", command, klog.KObj(m), strings.Join(machineIPAddresses, ", "))
}
return nil
}
}

return kinderrors.AggregateConcurrent([]func() error{
return aggregateConcurrent(
captureLogs("kubelet.log",
"sudo journalctl", "--no-pager", "--output=short-precise", "-u", "kubelet.service"),
captureLogs("containerd.log",
Expand All @@ -75,13 +93,56 @@ func (collector MachineLogCollector) CollectMachineLog(_ context.Context, _ clie
"sudo", "cat", "/var/log/cloud-init.log"),
captureLogs("cloud-init-output.log",
"sudo", "cat", "/var/log/cloud-init-output.log"),
})
)
}

func (collector MachineLogCollector) CollectInfrastructureLogs(_ context.Context, _ client.Client, _ *clusterv1.Cluster, _ string) error {
func (c *MachineLogCollector) CollectInfrastructureLogs(_ context.Context, _ client.Client, _ *clusterv1.Cluster, _ string) error {
return nil
}

func (c *MachineLogCollector) machineIPAddresses(ctx context.Context, m *clusterv1.Machine) ([]string, error) {
for _, address := range m.Status.Addresses {
if address.Type != clusterv1.MachineExternalIP {
continue
}
return []string{address.Address}, nil
}

vmObj, err := c.Finder.VirtualMachine(ctx, m.GetName())
if err != nil {
return nil, err
}

var vm mo.VirtualMachine

if err := c.Client.RetrieveOne(ctx, vmObj.Reference(), []string{"guest.net"}, &vm); err != nil {
// We cannot get the properties e.g. when the vm already got deleted or is getting deleted.
return nil, errors.Errorf("error retrieving properties for machine %s", klog.KObj(m))
}

addresses := []string{}

// Return all IPs so we can try each of them until one succeeded.
for _, nic := range vm.Guest.Net {
if nic.IpConfig == nil {
continue
}
for _, ip := range nic.IpConfig.IpAddress {
netIP := net.ParseIP(ip.IpAddress)
ipv4 := netIP.To4()
if ipv4 != nil {
addresses = append(addresses, ip.IpAddress)
}
}
}

if len(addresses) == 0 {
return nil, errors.Errorf("unable to find IP Addresses for Machine %s", klog.KObj(m))
}

return addresses, nil
}

func createOutputFile(path string) (*os.File, error) {
if err := os.MkdirAll(filepath.Dir(path), os.ModePerm); err != nil {
return nil, err
Expand Down Expand Up @@ -155,3 +216,28 @@ func readPrivateKey() ([]byte, error) {

return os.ReadFile(filepath.Clean(privateKeyFilePath))
}

// aggregateConcurrent runs fns concurrently, returning aggregated errors.
func aggregateConcurrent(funcs ...func() error) error {
// run all fns concurrently
ch := make(chan error, len(funcs))
var wg sync.WaitGroup
for _, f := range funcs {
f := f
wg.Add(1)
go func() {
defer wg.Done()
ch <- f()
}()
}
wg.Wait()
close(ch)
// collect up and return errors
errs := []error{}
for err := range ch {
if err != nil {
errs = append(errs, err)
}
}
return kerrors.NewAggregate(errs)
}
2 changes: 1 addition & 1 deletion test/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ require (
sigs.k8s.io/cluster-api-provider-vsphere v0.0.0-00010101000000-000000000000
sigs.k8s.io/cluster-api/test v0.0.0-00010101000000-000000000000
sigs.k8s.io/controller-runtime v0.17.5
sigs.k8s.io/kind v0.22.0
sigs.k8s.io/yaml v1.4.0
)

Expand Down Expand Up @@ -168,5 +167,6 @@ require (
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/kind v0.22.0 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
)