From c6e10b499220e3d18d459493f5134cf1e05fcf2d Mon Sep 17 00:00:00 2001 From: Anna Khmelnitsky Date: Wed, 18 Oct 2023 02:48:48 +0000 Subject: [PATCH] Probe NSX API endpoint in manager cluster In manager cluster resource, probe for NSX connectivity before joining cluster nodes. Parameters for connection probing are specified within `node_connectivity` block. This option is useful since nsx manager nodes might be spawned within same apply process, in which case it would take a while for NSX API endpoint to become responsive. Rather than using custom provisioner script to ensure connectivity, we give a probe and wait option in manager cluster resource. Standard retry mechanism will not be present in this connection probing in order to avoid duplication (we expect retry parameters to be very different for regular retry and initial connection probing) Signed-off-by: Anna Khmelnitsky --- nsxt/provider.go | 18 +-- nsxt/resource_nsxt_manager_cluster.go | 120 +++++++++++++++++++ website/docs/r/manager_cluster.html.markdown | 7 ++ 3 files changed, 138 insertions(+), 7 deletions(-) diff --git a/nsxt/provider.go b/nsxt/provider.go index 33eec89bf..c10a3bf0a 100644 --- a/nsxt/provider.go +++ b/nsxt/provider.go @@ -712,13 +712,13 @@ func configurePolicyConnectorData(d *schema.ResourceData, clients *nsxtClients) } if !isVMC { - err = configureLicenses(getPolicyConnectorForInit(*clients), clients.CommonConfig.LicenseDiff) + err = configureLicenses(getPolicyConnectorForInit(*clients, true), clients.CommonConfig.LicenseDiff) if err != nil { return err } } - err = initNSXVersion(getPolicyConnectorForInit(*clients)) + err = initNSXVersion(getPolicyConnectorForInit(*clients, true)) if err != nil && isVMC { // In case version API does not work for VMC, we workaround by testing version-specific APIs // TODO - remove this when /node/version API works for all auth methods on VMC @@ -978,14 +978,14 @@ func providerConfigure(d *schema.ResourceData) (interface{}, error) { } func 
getPolicyConnector(clients interface{}) client.Connector { - return getPolicyConnectorWithHeaders(clients, nil, false) + return getPolicyConnectorWithHeaders(clients, nil, false, true) } -func getPolicyConnectorForInit(clients interface{}) client.Connector { - return getPolicyConnectorWithHeaders(clients, nil, true) +func getPolicyConnectorForInit(clients interface{}, withRetry bool) client.Connector { + return getPolicyConnectorWithHeaders(clients, nil, true, withRetry) } -func getPolicyConnectorWithHeaders(clients interface{}, customHeaders *map[string]string, initFlow bool) client.Connector { +func getPolicyConnectorWithHeaders(clients interface{}, customHeaders *map[string]string, initFlow bool, withRetry bool) client.Connector { c := clients.(nsxtClients) retryFunc := func(retryContext retry.RetryContext) bool { @@ -1018,10 +1018,14 @@ func getPolicyConnectorWithHeaders(clients interface{}, customHeaders *map[strin return true } - connectorOptions := []client.ConnectorOption{client.UsingRest(nil), client.WithHttpClient(c.PolicyHTTPClient), client.WithDecorators(retry.NewRetryDecorator(uint(c.CommonConfig.MaxRetries), retryFunc))} + connectorOptions := []client.ConnectorOption{client.UsingRest(nil), client.WithHttpClient(c.PolicyHTTPClient)} var requestProcessors []core.RequestProcessor var responseAcceptors []core.ResponseAcceptor + if withRetry { + connectorOptions = append(connectorOptions, client.WithDecorators(retry.NewRetryDecorator(uint(c.CommonConfig.MaxRetries), retryFunc))) + } + if c.PolicySecurityContext != nil { connectorOptions = append(connectorOptions, client.WithSecurityContext(c.PolicySecurityContext)) } diff --git a/nsxt/resource_nsxt_manager_cluster.go b/nsxt/resource_nsxt_manager_cluster.go index 2154391b4..623d919ec 100644 --- a/nsxt/resource_nsxt_manager_cluster.go +++ b/nsxt/resource_nsxt_manager_cluster.go @@ -13,12 +13,19 @@ import ( "golang.org/x/exp/slices" + "github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource" 
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/validation" + "github.com/vmware/vsphere-automation-sdk-go/runtime/protocol/client" "github.com/vmware/vsphere-automation-sdk-go/services/nsxt-mp/nsx" nsxModel "github.com/vmware/vsphere-automation-sdk-go/services/nsxt-mp/nsx/model" + "github.com/vmware/vsphere-automation-sdk-go/services/nsxt/infra" ) +const nodeConnectivityInitialDelay int = 20 +const nodeConnectivityInterval int = 16 +const nodeConnectivityTimeout int = 1800 + func resourceNsxtManagerCluster() *schema.Resource { return &schema.Resource{ Create: resourceNsxtManagerClusterCreate, @@ -28,6 +35,40 @@ func resourceNsxtManagerCluster() *schema.Resource { Schema: map[string]*schema.Schema{ "revision": getRevisionSchema(), + "api_probing": { + Type: schema.TypeList, + MaxItems: 1, + Description: "Settings that control initial node connection", + Elem: &schema.Resource{ + Schema: map[string]*schema.Schema{ + "enabled": { + Type: schema.TypeBool, + Description: "Whether API probing for NSX nodes is enabled", + Optional: true, + Default: true, + }, + "delay": { + Type: schema.TypeInt, + Description: "Initial delay in seconds before probing connection", + Optional: true, + Default: nodeConnectivityInitialDelay, + }, + "interval": { + Type: schema.TypeInt, + Description: "Connection probing interval in seconds", + Optional: true, + Default: nodeConnectivityInterval, + }, + "timeout": { + Type: schema.TypeInt, + Description: "Timeout for connection probing in seconds", + Optional: true, + Default: nodeConnectivityTimeout, + }, + }, + }, + Optional: true, + }, "node": { Type: schema.TypeList, Description: "Nodes in the cluster", @@ -82,6 +123,76 @@ type NsxClusterNode struct { Status string } +func getNodeConnectivityStateConf(connector client.Connector, delay int, interval int, timeout int) *resource.StateChangeConf { + + return &resource.StateChangeConf{ + Pending: []string{"notyet"}, + Target: 
[]string{"success"}, + Refresh: func() (interface{}, string, error) { + siteClient := infra.NewSitesClient(connector) + // We use default site API to probe NSX manager API endpoint readiness, + // since it may take a while to auto-generate default site after API is responsive + resp, err := siteClient.Get("default") + if err != nil { + log.Printf("[DEBUG]: NSX API endpoint not ready: %v", err) + return nil, "notyet", nil + } + + log.Printf("[INFO]: NSX API endpoint ready") + return resp, "success", nil + }, + Delay: time.Duration(delay) * time.Second, + Timeout: time.Duration(timeout) * time.Second, + PollInterval: time.Duration(interval) * time.Second, + } +} + +func waitForNodeStatus(d *schema.ResourceData, m interface{}, nodes []NsxClusterNode) error { + + delay := nodeConnectivityInitialDelay + interval := nodeConnectivityInterval + timeout := nodeConnectivityTimeout + probingEnabled := true + probing := d.Get("api_probing").([]interface{}) + for _, item := range probing { + entry := item.(map[string]interface{}) + probingEnabled = entry["enabled"].(bool) + delay = entry["delay"].(int) + interval = entry["interval"].(int) + timeout = entry["timeout"].(int) + break + } + + // Wait for main mode + if !probingEnabled { + log.Printf("[DEBUG]: API probing for NSX is disabled") + return nil + } + connector := getPolicyConnectorForInit(m, false) + stateConf := getNodeConnectivityStateConf(connector, delay, interval, timeout) + _, err := stateConf.WaitForState() + if err != nil { + return fmt.Errorf("Failed to connect to main NSX manager endpoint") + } + + // Wait for joining nodes + for _, node := range nodes { + c, err := getNewNsxtClient(node, d, m) + if err != nil { + return err + } + newNsxClients := c.(nsxtClients) + nodeConnector := getPolicyConnectorForInit(newNsxClients, false) + nodeConf := getNodeConnectivityStateConf(nodeConnector, 0, interval, timeout) + _, err = nodeConf.WaitForState() + if err != nil { + return fmt.Errorf("Failed to connect to NSX node 
endpoint %s", node.IPAddress) + } + } + + return nil +} + func getClusterNodesFromSchema(d *schema.ResourceData) []NsxClusterNode { nodes := d.Get("node").([]interface{}) var clusterNodes []NsxClusterNode @@ -108,6 +219,11 @@ func resourceNsxtManagerClusterCreate(d *schema.ResourceData, m interface{}) err if len(nodes) == 0 { return fmt.Errorf("At least a manager appliance must be provided to form a cluster") } + + err := waitForNodeStatus(d, m, nodes) + if err != nil { + return fmt.Errorf("Failed to establish connection to NSX API: %v", err) + } clusterID, certSha256Thumbprint, hostIPs, err := getClusterInfoFromHostNode(d, m) if err != nil { return handleCreateError("ManagerCluster", "", err) @@ -329,6 +445,10 @@ func resourceNsxtManagerClusterRead(d *schema.ResourceData, m interface{}) error } func resourceNsxtManagerClusterUpdate(d *schema.ResourceData, m interface{}) error { + if !d.HasChange("node") { + // CHanges to attributes other than "node" should be ignored + return nil + } id := d.Id() connector := getPolicyConnector(m) client := nsx.NewClusterClient(connector) diff --git a/website/docs/r/manager_cluster.html.markdown b/website/docs/r/manager_cluster.html.markdown index edba82b4d..6f2c7fb9c 100644 --- a/website/docs/r/manager_cluster.html.markdown +++ b/website/docs/r/manager_cluster.html.markdown @@ -12,6 +12,8 @@ This resource is supported with NSX 4.1.0 onwards. The main node for the cluster is the host in terraform nsxt provider config, user will need to specify the nodes that will join the cluster in the resource config. Only one instance of nsxt_manager_cluster resource is supported. +If `api_probing` is enabled, this resource will wait for NSX API endpoints to come up +before performing cluster joining. ## Example Usage @@ -38,6 +40,11 @@ The following arguments are supported: * `ip_address` - (Required) Ip address of the node. * `username` - (Required) The username for login to the node. 
* `password` - (Required) The password for login to the node. +* `api_probing` - (Optional) Parameters for probing NSX API endpoint connection. Since NSX nodes might have been created during the same apply, we might need to wait until the API endpoint becomes available and all required default objects are created. + * `enabled` - (Optional) Whether API connectivity check is enabled. Default is `true`. + * `delay` - (Optional) Initial delay before we start probing API endpoint in seconds. Default is 20. + * `interval` - (Optional) Interval for probing API endpoint in seconds. Default is 16. + * `timeout` - (Optional) Timeout for probing the API endpoint in seconds. Default is 1800. ## Argument Reference