Skip to content

Commit

Permalink
data mover restore for Windows
Browse files Browse the repository at this point in the history
Signed-off-by: Lyndon-Li <[email protected]>
  • Loading branch information
Lyndon-Li committed Jan 9, 2025
1 parent be5f56a commit e9cd583
Show file tree
Hide file tree
Showing 18 changed files with 214 additions and 88 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/8594-Lyndon-Li
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Data mover restore for Windows
3 changes: 3 additions & 0 deletions config/crd/v2alpha1/bases/velero.io_datadownloads.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ spec:
DataMover specifies the data mover to be used by the backup.
If DataMover is "" or "velero", the built-in data mover will be used.
type: string
nodeOS:
description: Node is OS of the node where the DataUpload is processed.
type: string
operationTimeout:
description: |-
OperationTimeout specifies the time used to wait internal operations,
Expand Down
6 changes: 5 additions & 1 deletion config/crd/v2alpha1/bases/velero.io_datauploads.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,8 @@ spec:
description: DataUploadStatus is the current status of a DataUpload.
properties:
acceptedByNode:
description: Node is name of the node where the DataUpload is prepared.
description: AcceptedByNode is name of the node where the DataUpload
is prepared.
type: string
acceptedTimestamp:
description: |-
Expand Down Expand Up @@ -175,6 +176,9 @@ spec:
node:
description: Node is name of the node where the DataUpload is processed.
type: string
nodeOS:
description: Node is OS of the node where the DataUpload is processed.
type: string
path:
description: Path is the full path of the snapshot volume being backed
up.
Expand Down
4 changes: 2 additions & 2 deletions config/crd/v2alpha1/crds/crds.go

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions pkg/apis/velero/v2alpha1/data_download_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ type DataDownloadSpec struct {
// OperationTimeout specifies the time used to wait internal operations,
// before returning error as timeout.
OperationTimeout metav1.Duration `json:"operationTimeout"`

// NodeOS is OS of the node where the DataUpload is processed.
// +optional
NodeOS NodeOS `json:"nodeOS,omitempty"`
}

// TargetVolumeSpec is the specification for a target PVC.
Expand Down
19 changes: 18 additions & 1 deletion pkg/apis/velero/v2alpha1/data_upload_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ const (
DataUploadPhaseFailed DataUploadPhase = "Failed"
)

type NodeOS string

const (
NodeOSLinux NodeOS = "linux"
NodeOSWindows NodeOS = "windows"
NodeOSAuto NodeOS = "auto"
)

// DataUploadStatus is the current status of a DataUpload.
type DataUploadStatus struct {
// Phase is the current state of the DataUpload.
Expand Down Expand Up @@ -144,7 +152,12 @@ type DataUploadStatus struct {
// Node is name of the node where the DataUpload is processed.
// +optional
Node string `json:"node,omitempty"`
// Node is name of the node where the DataUpload is prepared.

// NodeOS is OS of the node where the DataUpload is processed.
// +optional
NodeOS NodeOS `json:"nodeOS,omitempty"`

// AcceptedByNode is name of the node where the DataUpload is prepared.
// +optional
AcceptedByNode string `json:"acceptedByNode,omitempty"`

Expand Down Expand Up @@ -221,4 +234,8 @@ type DataUploadResult struct {
// +optional
// +nullable
DataMoverResult *map[string]string `json:"dataMoverResult,omitempty"`

// NodeOS is OS of the node where the DataUpload is processed.
// +optional
NodeOS NodeOS `json:"nodeOS,omitempty"`
}
6 changes: 6 additions & 0 deletions pkg/builder/data_download_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ func (d *DataDownloadBuilder) Node(node string) *DataDownloadBuilder {
return d
}

// NodeOS sets the DataDownload's Node OS.
func (d *DataDownloadBuilder) NodeOS(nodeOS velerov2alpha1api.NodeOS) *DataDownloadBuilder {
d.object.Spec.NodeOS = nodeOS
return d
}

// AcceptedByNode sets the DataDownload's AcceptedByNode.
func (d *DataDownloadBuilder) AcceptedByNode(node string) *DataDownloadBuilder {
d.object.Status.AcceptedByNode = node
Expand Down
6 changes: 6 additions & 0 deletions pkg/builder/data_upload_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ func (d *DataUploadBuilder) Node(node string) *DataUploadBuilder {
return d
}

// NodeOS sets the DataUpload's Node OS.
func (d *DataUploadBuilder) NodeOS(nodeOS velerov2alpha1api.NodeOS) *DataUploadBuilder {
d.object.Status.NodeOS = nodeOS
return d
}

// AcceptedByNode sets the DataUpload's AcceptedByNode.
func (d *DataUploadBuilder) AcceptedByNode(node string) *DataUploadBuilder {
d.object.Status.AcceptedByNode = node
Expand Down
19 changes: 18 additions & 1 deletion pkg/cmd/cli/datamover/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,24 @@ func newdataMoverRestore(logger logrus.FieldLogger, factory client.Factory, conf
return nil, errors.Wrap(err, "error to create client")
}

cache, err := ctlcache.New(clientConfig, cacheOption)
var cache ctlcache.Cache
retry := 10
for {
cache, err = ctlcache.New(clientConfig, cacheOption)
if err == nil {
break
}

retry--
if retry == 0 {
break
}

logger.WithError(err).Warn("Failed to create client cache, need retry")

time.Sleep(time.Second)
}

if err != nil {
cancelFunc()
return nil, errors.Wrap(err, "error to create client cache")
Expand Down
57 changes: 39 additions & 18 deletions pkg/controller/data_download_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,28 +183,15 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
return ctrl.Result{}, nil
}

hostingPodLabels := map[string]string{velerov1api.DataDownloadLabel: dd.Name}
for _, k := range util.ThirdPartyLabels {
if v, err := nodeagent.GetLabelValue(ctx, r.kubeClient, dd.Namespace, k, kube.NodeOSLinux); err != nil {
if err != nodeagent.ErrNodeAgentLabelNotFound {
log.WithError(err).Warnf("Failed to check node-agent label, skip adding host pod label %s", k)
}
} else {
hostingPodLabels[k] = v
}
exposeParam, err := r.setupExposeParam(dd)
if err != nil {
return r.errorOut(ctx, dd, err, "failed to set exposer parameters", log)
}

// Expose() will trigger to create one pod whose volume is restored by a given volume snapshot,
// but the pod maybe is not in the same node of the current controller, so we need to return it here.
// And then only the controller who is in the same node could do the rest work.
err = r.restoreExposer.Expose(ctx, getDataDownloadOwnerObject(dd), exposer.GenericRestoreExposeParam{
TargetPVCName: dd.Spec.TargetVolume.PVC,
SourceNamespace: dd.Spec.TargetVolume.Namespace,
HostingPodLabels: hostingPodLabels,
Resources: r.podResources,
ExposeTimeout: dd.Spec.OperationTimeout.Duration,
RestorePVCConfig: r.restorePVCConfig,
})
err = r.restoreExposer.Expose(ctx, getDataDownloadOwnerObject(dd), exposeParam)
if err != nil {
if err := r.client.Get(ctx, req.NamespacedName, dd); err != nil {
if !apierrors.IsNotFound(err) {
Expand Down Expand Up @@ -243,7 +230,7 @@ func (r *DataDownloadReconciler) Reconcile(ctx context.Context, req ctrl.Request
log.Debugf("Data download is been canceled %s in Phase %s", dd.GetName(), dd.Status.Phase)
r.tryCancelAcceptedDataDownload(ctx, dd, "")
} else if peekErr := r.restoreExposer.PeekExposed(ctx, getDataDownloadOwnerObject(dd)); peekErr != nil {
r.tryCancelAcceptedDataDownload(ctx, dd, fmt.Sprintf("found a dataupload %s/%s with expose error: %s. mark it as cancel", dd.Namespace, dd.Name, peekErr))
r.tryCancelAcceptedDataDownload(ctx, dd, fmt.Sprintf("found a datadownload %s/%s with expose error: %s. mark it as cancel", dd.Namespace, dd.Name, peekErr))
log.Errorf("Cancel dd %s/%s because of expose error %s", dd.Namespace, dd.Name, peekErr)
} else if dd.Status.AcceptedTimestamp != nil {
if time.Since(dd.Status.AcceptedTimestamp.Time) >= r.preparingTimeout {
Expand Down Expand Up @@ -737,6 +724,40 @@ func (r *DataDownloadReconciler) closeDataPath(ctx context.Context, ddName strin
r.dataPathMgr.RemoveAsyncBR(ddName)
}

func (r *DataDownloadReconciler) setupExposeParam(dd *velerov2alpha1api.DataDownload) (exposer.GenericRestoreExposeParam, error) {
nodeOS := string(dd.Spec.NodeOS)
if nodeOS == "" {
r.logger.Info("nodeOS is empty in DD, fallback to linux")
nodeOS = kube.NodeOSLinux
}

if err := kube.HasNodeWithOS(context.Background(), string(dd.Spec.NodeOS), r.kubeClient.CoreV1()); err != nil {
return exposer.GenericRestoreExposeParam{}, errors.Wrapf(err, "no appropriate node to run datadownload %s/%s", dd.Namespace, dd.Name)
}

hostingPodLabels := map[string]string{velerov1api.DataDownloadLabel: dd.Name}
for _, k := range util.ThirdPartyLabels {
if v, err := nodeagent.GetLabelValue(context.Background(), r.kubeClient, dd.Namespace, k, nodeOS); err != nil {
if err != nodeagent.ErrNodeAgentLabelNotFound {
r.logger.WithError(err).Warnf("Failed to check node-agent label, skip adding host pod label %s", k)
}
} else {
hostingPodLabels[k] = v
}
}

return exposer.GenericRestoreExposeParam{
TargetPVCName: dd.Spec.TargetVolume.PVC,
TargetNamespace: dd.Spec.TargetVolume.Namespace,
HostingPodLabels: hostingPodLabels,
Resources: r.podResources,
OperationTimeout: dd.Spec.OperationTimeout.Duration,
ExposeTimeout: r.preparingTimeout,
NodeOS: nodeOS,
RestorePVCConfig: r.restorePVCConfig,
}, nil
}

func getDataDownloadOwnerObject(dd *velerov2alpha1api.DataDownload) v1.ObjectReference {
return v1.ObjectReference{
Kind: dd.Kind,
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/data_download_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func dataDownloadBuilder() *builder.DataDownloadBuilder {
PV: "test-pv",
PVC: "test-pvc",
Namespace: "test-ns",
})
}).NodeOS(velerov2alpha1api.NodeOS("test-node-os"))
}

func initDataDownloadReconciler(objects []runtime.Object, needError ...bool) (*DataDownloadReconciler, error) {
Expand Down
12 changes: 12 additions & 0 deletions pkg/controller/data_upload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,17 @@ func (r *DataUploadReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{}, nil
}

var nodeOS velerov2alpha1api.NodeOS
if res.ByPod.NodeOS == nil {
nodeOS = velerov2alpha1api.NodeOSAuto
} else if *res.ByPod.NodeOS == "linux" {
nodeOS = velerov2alpha1api.NodeOSLinux
} else if *res.ByPod.NodeOS == "windows" {
nodeOS = velerov2alpha1api.NodeOSWindows
} else {
return r.errorOut(ctx, du, errors.Errorf("invalid node OS %s", *res.ByPod.NodeOS), "invalid expose result", log)
}

log.Info("Exposed snapshot is ready and creating data path routine")

// Need to first create file system BR and get data path instance then update data upload status
Expand Down Expand Up @@ -317,6 +328,7 @@ func (r *DataUploadReconciler) Reconcile(ctx context.Context, req ctrl.Request)
original := du.DeepCopy()
du.Status.Phase = velerov2alpha1api.DataUploadPhaseInProgress
du.Status.StartTimestamp = &metav1.Time{Time: r.Clock.Now()}
du.Status.NodeOS = nodeOS
if err := r.client.Patch(ctx, du, client.MergeFrom(original)); err != nil {
log.WithError(err).Warnf("Failed to update dataupload %s to InProgress, will data path close and retry", du.Name)

Expand Down
6 changes: 6 additions & 0 deletions pkg/exposer/csi_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,10 +281,16 @@ func (e *csiSnapshotExposer) GetExposed(ctx context.Context, ownerObject corev1.

curLog.WithField("pod", pod.Name).Infof("Backup volume is found in pod at index %v", i)

var nodeOS *string
if os, found := pod.Spec.NodeSelector[kube.NodeOSLabel]; found {
nodeOS = &os
}

return &ExposeResult{ByPod: ExposeByPod{
HostingPod: pod,
HostingContainer: containerName,
VolumeName: volumeName,
NodeOS: nodeOS,
}}, nil
}

Expand Down
Loading

0 comments on commit e9cd583

Please sign in to comment.