From 229229e62d02ee0b508a30b8a6ee84417761dc1e Mon Sep 17 00:00:00 2001 From: Carolina Calderon Date: Mon, 16 Sep 2024 10:19:19 -0600 Subject: [PATCH 1/7] chore: change log level for log retention policies --- master/internal/logretention/logretention.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/master/internal/logretention/logretention.go b/master/internal/logretention/logretention.go index 0334541ec0f..f73b099e641 100644 --- a/master/internal/logretention/logretention.go +++ b/master/internal/logretention/logretention.go @@ -92,7 +92,7 @@ func DeleteExpiredTaskLogs(ctx context.Context, days *int16) (int64, error) { if days != nil { defaultLogRetentionDays = *days } - log.WithField("default-retention-days", defaultLogRetentionDays).Trace("deleting expired task logs") + log.WithField("default-retention-days", defaultLogRetentionDays).Info("deleting expired task logs") r, err := db.Bun().NewRaw(fmt.Sprintf(` WITH log_retention_tasks AS ( SELECT COALESCE(r.log_retention_days, %d) as log_retention_days, t.task_id, t.end_time @@ -112,6 +112,6 @@ func DeleteExpiredTaskLogs(ctx context.Context, days *int16) (int64, error) { return 0, errors.Wrap(err, "error deleting expired task logs") } rows, err := r.RowsAffected() - log.WithFields(logrus.Fields{"rows": rows, "err": err}).Trace("deleted expired task logs") + log.WithFields(logrus.Fields{"rows": rows, "err": err}).Info("deleted expired task logs") return rows, err } From 1d52fa7ee48d5379208074e8ed1b6090cc58d1e7 Mon Sep 17 00:00:00 2001 From: Carolina Calderon Date: Tue, 17 Sep 2024 10:15:49 -0600 Subject: [PATCH 2/7] changes needed for gke --- helm/charts/determined/templates/db-deployment.yaml | 9 +++++++-- helm/charts/determined/values.yaml | 7 ++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/helm/charts/determined/templates/db-deployment.yaml b/helm/charts/determined/templates/db-deployment.yaml index 7fdf866613a..9c322879266 100644 --- a/helm/charts/determined/templates/db-deployment.yaml +++ b/helm/charts/determined/templates/db-deployment.yaml @@ -61,6 +61,11 @@ spec: volumes: - name: determined-db-storage persistentVolumeClaim: - claimName: determined-db-pvc-{{ .Release.Name }} + claimName: + {{ if .Values.db.claimName }} + {{ .Values.db.claimName }} + {{- else -}} + determined-db-pvc-{{ .Release.Name }} + {{ end }} {{ end }} -{{ end}} +{{ end }} diff --git a/helm/charts/determined/values.yaml b/helm/charts/determined/values.yaml index 488909382a3..463e9fb9ec7 100644 --- a/helm/charts/determined/values.yaml +++ b/helm/charts/determined/values.yaml @@ -196,6 +196,11 @@ db: # resourceType: # certResourceName: + # claim_name refers to the persistent volume claim name for the database. + # If left undefined, a new PVC is created. Otherwise, use the pvc provided, given + # that it exists in the current context. + # claimName: + # checkpointStorage controls where checkpoints are stored. Supported types include `shared_fs`, # `gcs`, and `s3`. @@ -362,7 +367,7 @@ resourcePools: # defaultComputeResourcePool: default ## Configure the initial user password for the cluster -# initialUserPassword: +initialUserPassword: Abcd1234 # additional_resource_managers: # - resource_manager: From cc561b856465e379ca9a2b9e0c73844dc21f9eda Mon Sep 17 00:00:00 2001 From: Carolina Calderon Date: Wed, 18 Sep 2024 12:38:06 -0600 Subject: [PATCH 3/7] more wip --- helm/charts/determined/Chart.yaml | 2 +- .../templates/db-persistent-volume-claim.yaml | 13 ++++++++++++- .../templates/db-volume-snapshot-class.yaml | 8 ++++++++ .../templates/db-volume-snapshot.yaml | 19 +++++++++++++++++++ .../templates/master-deployment.yaml | 2 ++ helm/charts/determined/values.yaml | 14 +++++++++++--- 6 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 helm/charts/determined/templates/db-volume-snapshot-class.yaml create mode 100644 helm/charts/determined/templates/db-volume-snapshot.yaml diff --git a/helm/charts/determined/Chart.yaml b/helm/charts/determined/Chart.yaml index 38c18fe6a78..e8509324336 100644 --- a/helm/charts/determined/Chart.yaml +++ b/helm/charts/determined/Chart.yaml @@ -9,4 +9,4 @@ home: https://github.com/determined-ai/determined.git # If using a non-release version (e.g., X.Y.Z.dev0) you will have to specify an # existing official release version (e.g., X.Y.Z) or specify a commit has # that has been publicly published (all commits from main). -appVersion: "0.36.1-dev0" +appVersion: "0.36.0" diff --git a/helm/charts/determined/templates/db-persistent-volume-claim.yaml b/helm/charts/determined/templates/db-persistent-volume-claim.yaml index 7d5c6c9fcd2..e47f43ef0b4 100644 --- a/helm/charts/determined/templates/db-persistent-volume-claim.yaml +++ b/helm/charts/determined/templates/db-persistent-volume-claim.yaml @@ -3,7 +3,12 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: determined-db-pvc-{{ .Release.Name }} + name: + {{ if and .Release.IsUpgrade .Values.db.claimName }} + determined-db-pvc-{{ .Values.db.claimName }} + {{- else -}} + determined-db-pvc-{{ .Release.Name }} + {{ end }} namespace: {{ .Release.Namespace }} labels: app: determined-db-{{ .Release.Name }} @@ -11,6 +16,12 @@ metadata: spec: accessModes: - ReadWriteOnce + {{- if and .Release.IsUpgrade .Values.db.restoreSnapshotName }} + dataSource: + name: determined-db-snapshot-{{ .Values.db.restoreSnapshotName }} + kind: VolumeSnapshot + apiGroup: snapshot.storage.k8s.io + {{ end }} resources: requests: storage: {{ required "A valid Values.db.storageSize entry is required!" .Values.db.storageSize }} diff --git a/helm/charts/determined/templates/db-volume-snapshot-class.yaml b/helm/charts/determined/templates/db-volume-snapshot-class.yaml new file mode 100644 index 00000000000..3d86459b402 --- /dev/null +++ b/helm/charts/determined/templates/db-volume-snapshot-class.yaml @@ -0,0 +1,8 @@ +{{- if and .Release.IsUpgrade .Values.db.snapshotName }} +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshotClass +metadata: + name: determined-db-snapshot-class +driver: pd.csi.storage.gke.io +deletionPolicy: Delete +{{ end }} \ No newline at end of file diff --git a/helm/charts/determined/templates/db-volume-snapshot.yaml b/helm/charts/determined/templates/db-volume-snapshot.yaml new file mode 100644 index 00000000000..36dbd3fd192 --- /dev/null +++ b/helm/charts/determined/templates/db-volume-snapshot.yaml @@ -0,0 +1,19 @@ +{{- if and .Release.IsUpgrade .Values.db.snapshotName }} +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshot +metadata: + name: determined-db-snapshot-{{ .Values.db.snapshotName }} + namespace: default + labels: + app: determined-db-{{ .Release.Name }} + release: {{ .Release.Name }} +spec: + volumeSnapshotClassName: determined-db-snapshot-class + source: + persistentVolumeClaimName: + {{ if .Values.db.claimName }} + determined-db-pvc-{{ .Values.db.claimName }} + {{- else -}} + determined-db-pvc-{{ .Release.Name }} + {{ end }} +{{ end }} \ No newline at end of file diff --git a/helm/charts/determined/templates/master-deployment.yaml b/helm/charts/determined/templates/master-deployment.yaml index 8af99b769dd..eb84c0adf42 100644 --- a/helm/charts/determined/templates/master-deployment.yaml +++ b/helm/charts/determined/templates/master-deployment.yaml @@ -26,6 +26,8 @@ spec: serviceAccount: determined-master-{{ .Release.Name }} containers: - name: determined-master-{{ .Release.Name }} + command: ["sleep"] + args: ["999999m"] #or some other abnormally large number {{ $image := "determined-master" }} {{- if .Values.enterpriseEdition -}} {{ $image = "hpe-mlde-master" }} diff --git a/helm/charts/determined/values.yaml b/helm/charts/determined/values.yaml index 463e9fb9ec7..188b35770eb 100644 --- a/helm/charts/determined/values.yaml +++ b/helm/charts/determined/values.yaml @@ -196,11 +196,19 @@ db: # resourceType: # certResourceName: - # claim_name refers to the persistent volume claim name for the database. - # If left undefined, a new PVC is created. Otherwise, use the pvc provided, given - # that it exists in the current context. + # claimName refers to the persistent volume claim name for the database. + # If left undefined, a new PVC is created, named "determined-db-pvc-{releaseName}". + # Otherwise, use the pvc provided, given that it exists in the current context. # claimName: + # snapshotName refers to the volume snapshot name, a backup of the database's persistent volume. + # If defined, Helm will "snapshot" the DB upon its next upgrade, creating a volumeSnapshot called + # determined-db-snapsnot-{snapshotName}. + # snapshotName: + + # restoreSnapshotName refers to the volume snapshot name which you wish to restore the DB to. + # This only applies during a Helm upgrade, and if defined. + # restoreSnapshotName: # checkpointStorage controls where checkpoints are stored. Supported types include `shared_fs`, # `gcs`, and `s3`. From a64649ebd50ac9161a5de43f127a139774141320 Mon Sep 17 00:00:00 2001 From: Carolina Calderon Date: Wed, 18 Sep 2024 12:56:18 -0600 Subject: [PATCH 4/7] tested with CLI; todo test this on actual cluster & write docs --- helm/charts/determined/Chart.yaml | 2 +- .../templates/db-persistent-volume-claim.yaml | 4 ++-- .../templates/db-volume-snapshot-class.yaml | 2 +- .../determined/templates/db-volume-snapshot.yaml | 12 ++++++------ helm/charts/determined/values.yaml | 12 +++++++----- master/internal/logretention/logretention.go | 4 ++-- 6 files changed, 19 insertions(+), 17 deletions(-) diff --git a/helm/charts/determined/Chart.yaml b/helm/charts/determined/Chart.yaml index e8509324336..38c18fe6a78 100644 --- a/helm/charts/determined/Chart.yaml +++ b/helm/charts/determined/Chart.yaml @@ -9,4 +9,4 @@ home: https://github.com/determined-ai/determined.git # If using a non-release version (e.g., X.Y.Z.dev0) you will have to specify an # existing official release version (e.g., X.Y.Z) or specify a commit has # that has been publicly published (all commits from main). -appVersion: "0.36.0" +appVersion: "0.36.1-dev0" diff --git a/helm/charts/determined/templates/db-persistent-volume-claim.yaml b/helm/charts/determined/templates/db-persistent-volume-claim.yaml index e47f43ef0b4..cf04cbf9a3c 100644 --- a/helm/charts/determined/templates/db-persistent-volume-claim.yaml +++ b/helm/charts/determined/templates/db-persistent-volume-claim.yaml @@ -4,8 +4,8 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: name: - {{ if and .Release.IsUpgrade .Values.db.claimName }} - determined-db-pvc-{{ .Values.db.claimName }} + {{ if and .Release.IsUpgrade .Values.db.restoreSnapshotName }} + determined-db-pvc-{{ .Values.db.restoreSnapshotName }} {{- else -}} determined-db-pvc-{{ .Release.Name }} {{ end }} diff --git a/helm/charts/determined/templates/db-volume-snapshot-class.yaml b/helm/charts/determined/templates/db-volume-snapshot-class.yaml index 3d86459b402..26ddca28f46 100644 --- a/helm/charts/determined/templates/db-volume-snapshot-class.yaml +++ b/helm/charts/determined/templates/db-volume-snapshot-class.yaml @@ -5,4 +5,4 @@ metadata: name: determined-db-snapshot-class driver: pd.csi.storage.gke.io deletionPolicy: Delete -{{ end }} \ No newline at end of file +{{ end }} diff --git a/helm/charts/determined/templates/db-volume-snapshot.yaml b/helm/charts/determined/templates/db-volume-snapshot.yaml index 36dbd3fd192..32da0e91541 100644 --- a/helm/charts/determined/templates/db-volume-snapshot.yaml +++ b/helm/charts/determined/templates/db-volume-snapshot.yaml @@ -11,9 +11,9 @@ spec: volumeSnapshotClassName: determined-db-snapshot-class source: persistentVolumeClaimName: - {{ if .Values.db.claimName }} - determined-db-pvc-{{ .Values.db.claimName }} - {{- else -}} - determined-db-pvc-{{ .Release.Name }} - {{ end }} -{{ end }} \ No newline at end of file + {{ if .Values.db.claimName }} + determined-db-pvc-{{ .Values.db.claimName }} + {{- else -}} + determined-db-pvc-{{ .Release.Name }} + {{ end }} +{{ end }} diff --git a/helm/charts/determined/values.yaml b/helm/charts/determined/values.yaml index 188b35770eb..5f5482801ae 100644 --- a/helm/charts/determined/values.yaml +++ b/helm/charts/determined/values.yaml @@ -203,12 +203,14 @@ db: # snapshotName refers to the volume snapshot name, a backup of the database's persistent volume. # If defined, Helm will "snapshot" the DB upon its next upgrade, creating a volumeSnapshot called - # determined-db-snapsnot-{snapshotName}. - # snapshotName: + # determined-db-snapsnot-{snapshotName}. This CANNOT be the name of a PV that + # already exists, like the {{Release.Name}}. + # snapshotName: determined1 # restoreSnapshotName refers to the volume snapshot name which you wish to restore the DB to. - # This only applies during a Helm upgrade, and if defined. - # restoreSnapshotName: + # This only applies during a Helm upgrade, and if defined. This CANNOT be the name of a PVC that + # already exists, like the {{Release.Name}}. + # restoreSnapshotName: # checkpointStorage controls where checkpoints are stored. Supported types include `shared_fs`, # `gcs`, and `s3`. @@ -375,7 +377,7 @@ resourcePools: # defaultComputeResourcePool: default ## Configure the initial user password for the cluster -initialUserPassword: Abcd1234 +# initialUserPassword: # additional_resource_managers: # - resource_manager: diff --git a/master/internal/logretention/logretention.go b/master/internal/logretention/logretention.go index f73b099e641..0334541ec0f 100644 --- a/master/internal/logretention/logretention.go +++ b/master/internal/logretention/logretention.go @@ -92,7 +92,7 @@ func DeleteExpiredTaskLogs(ctx context.Context, days *int16) (int64, error) { if days != nil { defaultLogRetentionDays = *days } - log.WithField("default-retention-days", defaultLogRetentionDays).Info("deleting expired task logs") + log.WithField("default-retention-days", defaultLogRetentionDays).Trace("deleting expired task logs") r, err := db.Bun().NewRaw(fmt.Sprintf(` WITH log_retention_tasks AS ( SELECT COALESCE(r.log_retention_days, %d) as log_retention_days, t.task_id, t.end_time @@ -112,6 +112,6 @@ func DeleteExpiredTaskLogs(ctx context.Context, days *int16) (int64, error) { return 0, errors.Wrap(err, "error deleting expired task logs") } rows, err := r.RowsAffected() - log.WithFields(logrus.Fields{"rows": rows, "err": err}).Info("deleted expired task logs") + log.WithFields(logrus.Fields{"rows": rows, "err": err}).Trace("deleted expired task logs") return rows, err } From f2a9d612da38cb48d758db296ffb8bd4fe18887b Mon Sep 17 00:00:00 2001 From: Carolina Calderon Date: Thu, 19 Sep 2024 11:08:38 -0600 Subject: [PATCH 5/7] cleaning up helm chart; todo add docs --- helm/charts/determined/values.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/helm/charts/determined/values.yaml b/helm/charts/determined/values.yaml index 5f5482801ae..30348ee8c54 100644 --- a/helm/charts/determined/values.yaml +++ b/helm/charts/determined/values.yaml @@ -197,19 +197,21 @@ db: # certResourceName: # claimName refers to the persistent volume claim name for the database. - # If left undefined, a new PVC is created, named "determined-db-pvc-{releaseName}". + # If left undefined, a new PVC is created, named "determined-db-pvc-{{ .Release.name }}". # Otherwise, use the pvc provided, given that it exists in the current context. # claimName: # snapshotName refers to the volume snapshot name, a backup of the database's persistent volume. # If defined, Helm will "snapshot" the DB upon its next upgrade, creating a volumeSnapshot called - # determined-db-snapsnot-{snapshotName}. This CANNOT be the name of a PV that - # already exists, like the {{Release.Name}}. + # `determined-db-snapsnot-{{ .Values.snapshotName}}`. This CANNOT be the name of a persistent volume claim + # that already exists, like the `determined-db-pvc-{{Release.Name}}`. # snapshotName: determined1 # restoreSnapshotName refers to the volume snapshot name which you wish to restore the DB to. # This only applies during a Helm upgrade, and if defined. This CANNOT be the name of a PVC that - # already exists, like the {{Release.Name}}. + # already exists, like the {{Release.Name}}. A new persistent volume & claim will be created, named + # `determined-db-pvc-{{ .Values.restoreSnapshotName}}`, restoring the data from + # `determined-db-snapsnot-{{ .Values.restoreSnapshotName}}`. # restoreSnapshotName: # checkpointStorage controls where checkpoints are stored. Supported types include `shared_fs`, From 3b820b5b4838521684a154dca132ab09f6963700 Mon Sep 17 00:00:00 2001 From: Carolina Calderon Date: Thu, 19 Sep 2024 11:09:14 -0600 Subject: [PATCH 6/7] cleaning up stray change --- helm/charts/determined/templates/master-deployment.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/helm/charts/determined/templates/master-deployment.yaml b/helm/charts/determined/templates/master-deployment.yaml index eb84c0adf42..8af99b769dd 100644 --- a/helm/charts/determined/templates/master-deployment.yaml +++ b/helm/charts/determined/templates/master-deployment.yaml @@ -26,8 +26,6 @@ spec: serviceAccount: determined-master-{{ .Release.Name }} containers: - name: determined-master-{{ .Release.Name }} - command: ["sleep"] - args: ["999999m"] #or some other abnormally large number {{ $image := "determined-master" }} {{- if .Values.enterpriseEdition -}} {{ $image = "hpe-mlde-master" }} From 05b4c406189612afb2a7bde755247a68cf66be9c Mon Sep 17 00:00:00 2001 From: Carolina Calderon Date: Thu, 19 Sep 2024 13:08:09 -0600 Subject: [PATCH 7/7] edit --- helm/charts/determined/templates/db-deployment.yaml | 2 +- .../charts/determined/templates/db-volume-snapshot-class.yaml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/helm/charts/determined/templates/db-deployment.yaml b/helm/charts/determined/templates/db-deployment.yaml index 9c322879266..b41e82a0392 100644 --- a/helm/charts/determined/templates/db-deployment.yaml +++ b/helm/charts/determined/templates/db-deployment.yaml @@ -63,7 +63,7 @@ spec: persistentVolumeClaim: claimName: {{ if .Values.db.claimName }} - {{ .Values.db.claimName }} + determined-db-pvc-{{ .Values.db.claimName }} {{- else -}} determined-db-pvc-{{ .Release.Name }} {{ end }} diff --git a/helm/charts/determined/templates/db-volume-snapshot-class.yaml b/helm/charts/determined/templates/db-volume-snapshot-class.yaml index 26ddca28f46..b58850b40d0 100644 --- a/helm/charts/determined/templates/db-volume-snapshot-class.yaml +++ b/helm/charts/determined/templates/db-volume-snapshot-class.yaml @@ -1,8 +1,6 @@ -{{- if and .Release.IsUpgrade .Values.db.snapshotName }} apiVersion: snapshot.storage.k8s.io/v1 kind: VolumeSnapshotClass metadata: name: determined-db-snapshot-class driver: pd.csi.storage.gke.io -deletionPolicy: Delete -{{ end }} +deletionPolicy: Retain