From 94d0bf70721321a296593e8f9cfa6e5a10b98fae Mon Sep 17 00:00:00 2001 From: Joe Webster <31218426+jwebster7@users.noreply.github.com> Date: Fri, 22 Nov 2024 10:01:11 -0600 Subject: [PATCH] Updates comments, logic, and logs for NVMe and LUKS --- frontend/csi/node_server.go | 103 +++++++++---------- storage_drivers/ontap/ontap_san_nvme_test.go | 1 - utils/iscsi/iscsi.go | 10 +- utils/nvme.go | 30 ++++-- utils/nvme_linux.go | 48 ++++----- 5 files changed, 103 insertions(+), 89 deletions(-) diff --git a/frontend/csi/node_server.go b/frontend/csi/node_server.go index 8ca843b47..a1d2d8dac 100644 --- a/frontend/csi/node_server.go +++ b/frontend/csi/node_server.go @@ -502,20 +502,25 @@ func (p *Plugin) nodeExpandVolume( if !utils.IsLegacyLUKSDevicePath(devicePath) { devicePath, err = utils.GetLUKSDeviceForMultipathDevice(devicePath) if err != nil { + Logc(ctx).WithFields(LogFields{ + "volumeId": volumeId, + "publishedPath": publishInfo.DevicePath, + }).WithError(err).Error("Failed to get LUKS device path from device path.") return status.Error(codes.Internal, err.Error()) } } Logc(ctx).WithField("volumeId", volumeId).Info("Resizing the LUKS mapping.") - // Refresh the luks device - // cryptsetup resize << + + // Refresh the LUKS device. + // "cryptsetup resize << " passphrase, ok := secrets["luks-passphrase"] if !ok { return status.Error(codes.InvalidArgument, "cannot expand LUKS encrypted volume; no passphrase provided") } else if passphrase == "" { return status.Error(codes.InvalidArgument, "cannot expand LUKS encrypted volume; empty passphrase provided") } - err := utils.ResizeLUKSDevice(ctx, devicePath, passphrase) - if err != nil { + + if err := utils.ResizeLUKSDevice(ctx, devicePath, passphrase); err != nil { if errors.IsIncorrectLUKSPassphraseError(err) { return status.Error(codes.InvalidArgument, err.Error()) } @@ -1302,7 +1307,7 @@ func (p *Plugin) nodeUnstageFCPVolume( return status.Error(codes.Internal, errStr) } - // If the luks device still exists, it means the device was unable to be closed prior to removing the block + // If the LUKS device still exists, it means the device was unable to be closed prior to removing the block // device. This can happen if the LUN was deleted or offline. It should be removable by this point. // It needs to be removed prior to removing the 'unmappedMpathDevice' device below. if luksMapperPath != "" { @@ -1752,7 +1757,7 @@ func (p *Plugin) nodeUnstageISCSIVolume( return status.Error(codes.Internal, errStr) } - // If the luks device still exists, it means the device was unable to be closed prior to removing the block + // If the LUKS device still exists, it means the device was unable to be closed prior to removing the block // device. This can happen if the LUN was deleted or offline. It should be removable by this point. // It needs to be removed prior to removing the 'unmappedMpathDevice' device below. if luksMapperPath != "" { @@ -2472,7 +2477,7 @@ func (p *Plugin) nodeStageNVMeVolume( // Ensure we update the passphrase in case it has never been set before err = ensureLUKSVolumePassphrase(ctx, p.restClient, luksDevice, volumeId, req.GetSecrets(), true) if err != nil { - return fmt.Errorf("could not set LUKS volume passphrase") + return fmt.Errorf("could not set LUKS volume passphrase; %v", err) } } @@ -2482,13 +2487,12 @@ func (p *Plugin) nodeStageNVMeVolume( PublishedPaths: map[string]struct{}{}, } - // Save the device info to the volume tracking info path for use in the publish & unstage calls. + // Save the device info to the volume tracking info path for use in future CSI node publish & unstage calls. if err := p.nodeHelper.WriteTrackingInfo(ctx, volumeId, volTrackingInfo); err != nil { return err } p.nvmeHandler.AddPublishedNVMeSession(&publishedNVMeSessions, publishInfo) - return nil } @@ -2507,7 +2511,7 @@ func (p *Plugin) nodeUnstageNVMeVolume( // Proceed further with unstage flow, if device is not found. nvmeDev, err := p.nvmeHandler.NewNVMeDevice(ctx, publishInfo.NVMeNamespaceUUID) if err != nil && !errors.IsNotFoundError(err) { - return nil, fmt.Errorf("error while getting NVMe device, %v", err) + return nil, fmt.Errorf("failed to get NVMe device; %v", err) } var devicePath string @@ -2518,18 +2522,19 @@ func (p *Plugin) nodeUnstageNVMeVolume( var luksMapperPath string if utils.ParseBool(publishInfo.LUKSEncryption) && devicePath != "" { fields := LogFields{ - "lunID": publishInfo.IscsiLunNumber, - "publishedDevice": publishInfo.DevicePath, - "nvmeDevPath": nvmeDev.GetPath(), + "namespace": publishInfo.NVMeNamespaceUUID, + "devicePath": devicePath, + "publishedPath": publishInfo.DevicePath, } luksMapperPath, err = p.devices.GetLUKSDeviceForMultipathDevice(devicePath) if err != nil { + Logc(ctx).WithFields(fields).WithError(err).Error("Failed to get LUKS device path from device path.") return &csi.NodeUnstageVolumeResponse{}, err } - // Ensure the LUKS device is closed if the luksMapperPath is set. if luksMapperPath != "" { + fields["luksMapperPath"] = luksMapperPath if err = p.devices.EnsureLUKSDeviceClosedWithMaxWaitLimit(ctx, luksMapperPath); err != nil { if !errors.IsMaxWaitExceededError(err) { Logc(ctx).WithFields(fields).WithError(err).Error("Failed to close LUKS device.") @@ -2540,51 +2545,43 @@ func (p *Plugin) nodeUnstageNVMeVolume( } } + // Attempt to flush the NVMe device. if !nvmeDev.IsNil() { - // If device is found, proceed to flush and clean up. - err := nvmeDev.FlushDevice(ctx, p.unsafeDetach, force) // If flush fails, give a grace period of 6 minutes (nvmeMaxFlushWaitDuration) before giving up. - if err != nil { + if err := nvmeDev.FlushDevice(ctx, p.unsafeDetach, force); err != nil { if NVMeNamespacesFlushRetry[publishInfo.NVMeNamespaceUUID].IsZero() { NVMeNamespacesFlushRetry[publishInfo.NVMeNamespaceUUID] = time.Now() - return nil, fmt.Errorf("error while flushing NVMe device, %v", err) - } else { - elapsed := time.Since(NVMeNamespacesFlushRetry[publishInfo.NVMeNamespaceUUID]) - if elapsed > nvmeMaxFlushWaitDuration { - // Waited enough, log it and proceed with next step in detach flow. - Logc(ctx).WithFields( - LogFields{ - "namespace": publishInfo.NVMeNamespaceUUID, - "elapsed": elapsed, - "maxWait": nvmeMaxFlushWaitDuration, - }).Debug("Volume is not safe to be detached. But, waited enough time.") - // Done with this, remove entry from exceptions list. - delete(NVMeNamespacesFlushRetry, publishInfo.NVMeNamespaceUUID) - } else { - // Allowing to wait for some more time. Let the kubelet retry. - Logc(ctx).WithFields( - LogFields{ - "namespace": publishInfo.NVMeNamespaceUUID, - "elapsed": elapsed, - }).Debug("Waiting for some more time.") - return nil, fmt.Errorf("error while flushing NVMe device, %v", err) - } + return nil, fmt.Errorf("failed to flush NVMe device; %v", err) } - } else { - // No error in 'flush', remove entry from exceptions list in case it was added earlier. - delete(NVMeNamespacesFlushRetry, publishInfo.NVMeNamespaceUUID) + + // If the max wait time for flush isn't hit yet, fail and let the CSI node agent call again. + elapsed := time.Since(NVMeNamespacesFlushRetry[publishInfo.NVMeNamespaceUUID]) + if elapsed <= nvmeMaxFlushWaitDuration { + Logc(ctx).WithFields(LogFields{ + "devicePath": devicePath, + "namespace": publishInfo.NVMeNamespaceUUID, + "elapsed": elapsed, + }).WithError(err).Debug("Could not flush NVMe device.") + return nil, fmt.Errorf("failed to flush NVMe device; %v", err) + } + + Logc(ctx).WithFields(LogFields{ + "namespace": publishInfo.NVMeNamespaceUUID, + "elapsed": elapsed, + "maxWait": nvmeMaxFlushWaitDuration, + }).Warn("Could not flush device within expected time period.") } + + delete(NVMeNamespacesFlushRetry, publishInfo.NVMeNamespaceUUID) } // Get the number of namespaces associated with the subsystem nvmeSubsys := p.nvmeHandler.NewNVMeSubsystem(ctx, publishInfo.NVMeSubsystemNQN) numNs, err := nvmeSubsys.GetNamespaceCount(ctx) if err != nil { - Logc(ctx).WithFields( - LogFields{ - "subsystem": publishInfo.NVMeSubsystemNQN, - "error": err, - }).Debug("Error getting Namespace count.") + Logc(ctx).WithField( + "subsystem", publishInfo.NVMeSubsystemNQN, + ).WithError(err).Debug("Error getting Namespace count.") } // If number of namespaces is more than 1, don't disconnect the subsystem. If we get any issues while getting the @@ -2592,11 +2589,9 @@ func (p *Plugin) nodeUnstageNVMeVolume( // NVMe self-healing is enabled), which keeps track of namespaces associated with the subsystem. if (err == nil && numNs <= 1) || (p.nvmeSelfHealingInterval > 0 && err != nil && disconnect) { if err := nvmeSubsys.Disconnect(ctx); err != nil { - Logc(ctx).WithFields( - LogFields{ - "subsystem": publishInfo.NVMeSubsystemNQN, - "error": err, - }).Debug("Error disconnecting subsystem.") + Logc(ctx).WithField( + "subsystem", publishInfo.NVMeSubsystemNQN, + ).WithError(err).Debug("Error disconnecting subsystem.") } } @@ -2614,7 +2609,7 @@ func (p *Plugin) nodeUnstageNVMeVolume( return nil, status.Error(codes.Internal, errStr) } - // If the luks device still exists, it means the device was unable to be closed prior to removing the block + // If the LUKS device still exists, it means the device was unable to be closed prior to removing the block // device. This can happen if the LUN was deleted or offline. It should be removable by this point. // It needs to be removed prior to removing the 'unmappedMpathDevice' device below. if luksMapperPath != "" { @@ -2669,7 +2664,7 @@ func (p *Plugin) nodePublishNVMeVolume( Logc(ctx).WithError(err).Error("Failed to ensure current LUKS passphrase.") } - // At this point, we must reassign the device path to the luks mapper path for mounts to work. + // At this point, we must reassign the device path to the LUKS mapper path for mounts to work. devicePath = luksDevice.MappedDevicePath() } diff --git a/storage_drivers/ontap/ontap_san_nvme_test.go b/storage_drivers/ontap/ontap_san_nvme_test.go index b2a4991bd..3028f8103 100644 --- a/storage_drivers/ontap/ontap_san_nvme_test.go +++ b/storage_drivers/ontap/ontap_san_nvme_test.go @@ -2284,7 +2284,6 @@ func TestImport_LUKSNamespace(t *testing.T) { mAPI.EXPECT().VolumeInfo(ctx, gomock.Any()).Return(vol, nil) mAPI.EXPECT().NVMeNamespaceGetByName(ctx, "/vol/"+originalName+"/*").Return(ns, nil) mAPI.EXPECT().NVMeIsNamespaceMapped(ctx, "", ns.UUID).Return(false, nil) - // mAPI.EXPECT().VolumeRename(ctx, originalName, volConfig.InternalName).Return(nil) beforeLUKSOverheadBytesStr, err := utils.ConvertSizeToBytes(volConfig.Size) if err != nil { diff --git a/utils/iscsi/iscsi.go b/utils/iscsi/iscsi.go index e9186b2b2..c55b5b6be 100644 --- a/utils/iscsi/iscsi.go +++ b/utils/iscsi/iscsi.go @@ -404,6 +404,7 @@ func (client *Client) AttachVolume( if err != nil { return mpathSize, err } + devicePath = luksDevice.MappedDevicePath() } @@ -418,11 +419,14 @@ func (client *Client) AttachVolume( if existingFstype == "" { if !isLUKSDevice { if unformatted, err := client.deviceClient.IsDeviceUnformatted(ctx, devicePath); err != nil { - Logc(ctx).WithField("device", - devicePath).Errorf("Unable to identify if the device is unformatted; err: %v", err) + Logc(ctx).WithField( + "device", devicePath, + ).WithError(err).Errorf("Unable to identify if the device is unformatted.") return mpathSize, err } else if !unformatted { - Logc(ctx).WithField("device", devicePath).Errorf("Device is not unformatted; err: %v", err) + Logc(ctx).WithField( + "device", devicePath, + ).WithError(err).Errorf("Device is not unformatted.") return mpathSize, fmt.Errorf("device %v is not unformatted", devicePath) } } else { diff --git a/utils/nvme.go b/utils/nvme.go index fccf208e5..6378f6cd4 100644 --- a/utils/nvme.go +++ b/utils/nvme.go @@ -313,7 +313,7 @@ func NVMeMountVolume( Logc(ctx).Debug(">>>> nvme.NVMeMountVolume") defer Logc(ctx).Debug("<<<< nvme.NVMeMountVolume") - // This is the raw device path for a nvme namespace. + // Initially, the device path raw device path for this NVMe namespace. devicePath := publishInfo.DevicePath // Format and open a LUKS device if LUKS Encryption is set to true. @@ -326,12 +326,19 @@ func NVMeMountVolume( if err != nil { return err } + devicePath = luksDevice.MappedDevicePath() } + // Fail fast if the device should be a LUKS device but is not LUKS formatted. if isLUKSDevice && !luksFormatted { - Logc(ctx).Errorf("Unable to identify if luks device.", devicePath) - return err + Logc(ctx).WithFields(LogFields{ + "devicePath": publishInfo.DevicePath, + "luksMapperPath": devicePath, + "isLUKSFormatted": luksFormatted, + "shouldBeLUKS": isLUKSDevice, + }).Error("Device should be a LUKS device but is not LUKS formatted.") + return fmt.Errorf("device should be a LUKS device but is not LUKS formatted") } // No filesystem work is required for raw block; return early. @@ -346,16 +353,23 @@ func NVMeMountVolume( if existingFstype == "" { if !isLUKSDevice { if unformatted, err := isDeviceUnformatted(ctx, devicePath); err != nil { - Logc(ctx).WithField("device", - devicePath).Errorf("Unable to identify if the device is not formatted; err: %v", err) + Logc(ctx).WithField( + "device", devicePath, + ).WithError(err).Error("Unable to identify if the device is not formatted.") return err } else if !unformatted { - Logc(ctx).WithField("device", devicePath).Errorf("Device is not not formatted; err: %v", err) - return fmt.Errorf("device %v is not unformatted", devicePath) + Logc(ctx).WithField( + "device", devicePath, + ).WithError(err).Error("Device is not unformatted.") + return fmt.Errorf("device %v is already formatted", devicePath) } } - Logc(ctx).WithFields(LogFields{"volume": name, "fstype": publishInfo.FilesystemType}).Debug("Formatting LUN.") + Logc(ctx).WithFields(LogFields{ + "volume": name, + "namespace": publishInfo.NVMeNamespaceUUID, + "fstype": publishInfo.FilesystemType, + }).Debug("Formatting NVMe Namespace.") err := fsClient.FormatVolume(ctx, devicePath, publishInfo.FilesystemType, publishInfo.FormatOptions) if err != nil { return fmt.Errorf("error formatting Namespace %s, device %s: %v", name, devicePath, err) diff --git a/utils/nvme_linux.go b/utils/nvme_linux.go index d6b0ae6fd..99c46cb7b 100644 --- a/utils/nvme_linux.go +++ b/utils/nvme_linux.go @@ -22,8 +22,8 @@ func GetHostNqn(ctx context.Context) (string, error) { out, err := command.Execute(ctx, "cat", "/etc/nvme/hostnqn") if err != nil { - Logc(ctx).WithField("Error", err).Warn("Could not read hostnqn; perhaps NVMe is not installed?") - return "", fmt.Errorf("failed to get hostnqn: %v", err) + Logc(ctx).WithError(err).Warn("Could not read hostnqn; perhaps NVMe is not installed?") + return "", fmt.Errorf("failed to get hostnqn; %v", err) } newout := strings.Split(string(out), "\n") @@ -37,21 +37,23 @@ func NVMeActiveOnHost(ctx context.Context) (bool, error) { _, err := command.ExecuteWithTimeout(ctx, "nvme", NVMeListCmdTimeoutInSeconds*time.Second, false, "version") if err != nil { - Logc(ctx).WithField("Error", err).Warn("Could not read nvme cli version; perhaps NVMe cli is not installed?") + Logc(ctx).WithError(err).Warn("Could not read NVMe CLI version; perhaps NVMe CLI is not installed?") return false, fmt.Errorf("failed to get hostnqn: %v", err) } out, err := command.ExecuteWithTimeout(ctx, "lsmod", NVMeListCmdTimeoutInSeconds*time.Second, false) if err != nil { - Logc(ctx).WithField("Error", err).Warn("Could not read the modules loaded on the host") - return false, fmt.Errorf("failed to get nvme driver info") + Logc(ctx).WithError(err).Warn("Could not read the modules loaded on the host.") + return false, fmt.Errorf("failed to get NVMe driver info; %v", err) } + newout := strings.Split(string(out), "\n") for _, s := range newout { if strings.Contains(s, fmt.Sprintf("%s_%s", sa.NVMe, transport)) { return true, nil } } + return false, fmt.Errorf("NVMe driver is not loaded on the host") } @@ -65,16 +67,16 @@ func GetNVMeSubsystemList(ctx context.Context) (Subsystems, error) { out, err := command.ExecuteWithTimeout(ctx, "nvme", NVMeListCmdTimeoutInSeconds*time.Second, false, "list-subsys", "-o", "json") if err != nil { - Logc(ctx).WithField("Error", err).Errorf("Failed to list subsystem: %v", err) + Logc(ctx).WithError(err).Error("Failed to list subsystem.") return subs, fmt.Errorf("failed to list subsys %v", err) } // For RHEL, the output is present in array for this command. if string(out)[0] == '[' { var rhelSubs []Subsystems - if err = json.Unmarshal([]byte(out), &rhelSubs); err != nil { - Logc(ctx).WithField("Error", err).Errorf("Failed to unmarshal ontap nvme devices: %v", err) - return subs, fmt.Errorf("failed to unmarshal ontap nvme devices: %v", err) + if err = json.Unmarshal(out, &rhelSubs); err != nil { + Logc(ctx).WithError(err).Error("Failed to unmarshal ONTAP NVMe devices.") + return subs, fmt.Errorf("failed to unmarshal ONTAP NVMe devices: %v", err) } if len(rhelSubs) > 0 { @@ -85,8 +87,8 @@ func GetNVMeSubsystemList(ctx context.Context) (Subsystems, error) { } if err = json.Unmarshal(out, &subs); err != nil { - Logc(ctx).WithField("Error", err).Errorf("Failed to unmarshal subsystem: %v", err) - return subs, fmt.Errorf("failed to unmarshal subsys %v", err) + Logc(ctx).WithError(err).Error("Failed to unmarshal subsystem.") + return subs, fmt.Errorf("failed to unmarshal subsystems; %v", err) } return subs, nil @@ -101,8 +103,8 @@ func ConnectSubsystemToHost(ctx context.Context, subsNqn, IP string) error { _, err := command.Execute(ctx, "nvme", "connect", "-t", "tcp", "-n", subsNqn, "-a", IP, "-s", "4420", "-l", "-1") if err != nil { - Logc(ctx).WithField("Error", err).Errorf("Failed to connect subsystem to host: %v", err) - return fmt.Errorf("failed to connect subsystem %s to %s: %v", subsNqn, IP, err) + Logc(ctx).WithError(err).Errorf("Failed to connect subsystem %s to host.", subsNqn) + return fmt.Errorf("failed to connect subsystem %s to %s; %v", subsNqn, IP, err) } return nil @@ -115,8 +117,8 @@ func DisconnectSubsystemFromHost(ctx context.Context, subsysNqn string) error { _, err := command.Execute(ctx, "nvme", "disconnect", "-n", subsysNqn) if err != nil { - Logc(ctx).WithField("Error", err).Errorf("Failed to disconnect subsystem %s: %v", subsysNqn, err) - return fmt.Errorf("failed to disconnect subsystem %s: %v", subsysNqn, err) + Logc(ctx).WithError(err).Errorf("Failed to disconnect subsystem %s.", subsysNqn) + return fmt.Errorf("failed to disconnect subsystem %s; %v", subsysNqn, err) } return nil @@ -130,8 +132,8 @@ func GetNamespaceCountForSubsDevice(ctx context.Context, subsDevice string) (int out, err := command.ExecuteWithTimeout(ctx, "nvme", NVMeListCmdTimeoutInSeconds*time.Second, false, "list-ns", subsDevice) if err != nil { - Logc(ctx).WithField("Error", err).Errorf("Failed to get namespace count: %v", err) - return 0, fmt.Errorf("failed to get namespace count: %v", err) + Logc(ctx).WithError(err).Error("Failed to get namespace count.") + return 0, fmt.Errorf("failed to get namespace count; %v", err) } return strings.Count(string(out), "["), nil @@ -147,8 +149,8 @@ func GetNVMeDeviceList(ctx context.Context) (NVMeDevices, error) { out, err := command.ExecuteWithTimeout(ctx, "nvme", NVMeListCmdTimeoutInSeconds*time.Second, false, "netapp", "ontapdevices", "-o", "json") if err != nil { - Logc(ctx).WithField("Error", err).Errorf("Failed to exec list nvme ontap devices: %v", err) - return ontapDevs, fmt.Errorf("failed to exec list nvme ontap devices: %v", err) + Logc(ctx).WithError(err).Error("Failed to list NVMe ONTAP devices.") + return ontapDevs, fmt.Errorf("failed to list NVMe ONTAP devices; %v", err) } // There are 2 use cases where we may need to format the output before unmarshalling it. @@ -173,8 +175,8 @@ func GetNVMeDeviceList(ctx context.Context) (NVMeDevices, error) { if string(out) != "" { // "out" would be empty string if there are no devices if err = json.Unmarshal(out, &ontapDevs); err != nil { - Logc(ctx).WithField("Error", err).Errorf("Failed to unmarshal ontap nvme devices: %v", err) - return ontapDevs, fmt.Errorf("failed to unmarshal ontap nvme devices: %v", err) + Logc(ctx).WithError(err).Error("Failed to unmarshal NVMe ONTAP devices.") + return ontapDevs, fmt.Errorf("failed to unmarshal NVMe ONTAP devices; %v", err) } } @@ -188,8 +190,8 @@ func FlushNVMeDevice(ctx context.Context, device string) error { _, err := command.Execute(ctx, "nvme", "flush", device) if err != nil { - Logc(ctx).Error("Error while flushing the device %s, %v", device, err) - return fmt.Errorf("error while flushing the device %s, %v", device, err) + Logc(ctx).WithError(err).Errorf("Failed to flush device: %s", device) + return fmt.Errorf("failed to flush device: %s; %v", device, err) } return nil