From c30b8cdc9da3fe434607454e834fd8777eb0bd2d Mon Sep 17 00:00:00 2001 From: sotojn Date: Thu, 5 Sep 2024 11:19:38 -0700 Subject: [PATCH 01/13] pass in ALLOW_EX_RESTART env var to ex in k8sV2 --- .../cluster/backends/kubernetesV2/k8sResource.ts | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts b/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts index bb135e55cf2..dea4db69200 100644 --- a/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts +++ b/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts @@ -115,7 +115,14 @@ export class K8sResource { } _setEnvVariables() { - /// TODO: Use this later when we need to set env vars in workers/ex controllers + // Pass in env var to let ex know it can restart in + // certain scenarios + this.resource.spec.template.spec.containers[0].env.push( + { + name: 'ALLOW_EX_RESTART', + value: 'true' + } + ); } _mountLocalTeraslice(contextType: string): void { @@ -292,14 +299,14 @@ export class K8sResource { // eslint-disable-next-line max-len this.resource.spec.template.spec.priorityClassName = this.terasliceConfig.kubernetes_priority_class_name; if (this.execution.stateful) { - + this.resource.spec.template.metadata.labels[`${this.jobPropertyLabelPrefix}/stateful`] = 'true'; } } if (this.nodeType === 'worker' && this.execution.stateful) { // eslint-disable-next-line max-len this.resource.spec.template.spec.priorityClassName = this.terasliceConfig.kubernetes_priority_class_name; - + this.resource.spec.template.metadata.labels[`${this.jobPropertyLabelPrefix}/stateful`] = 'true'; } } From 864d04efb34f971b24cfa606cfa2163b4503aa45 Mon Sep 17 00:00:00 2001 From: sotojn Date: Thu, 5 Sep 2024 11:24:08 -0700 Subject: [PATCH 02/13] only pass in ALLOW_EX_RESTART var to ex resource --- .../backends/kubernetesV2/k8sResource.ts | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts b/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts index dea4db69200..43c53890284 100644 --- a/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts +++ b/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts @@ -89,7 +89,7 @@ export class K8sResource { if (process.env.MOUNT_LOCAL_TERASLICE !== undefined) { this._mountLocalTeraslice(resourceName); } - this._setEnvVariables(); + this._setEnvVariables(resourceName); this._setAssetsVolume(); this._setImagePullSecret(); this._setEphemeralStorage(); @@ -114,15 +114,17 @@ export class K8sResource { } } - _setEnvVariables() { + _setEnvVariables(resourceName: string) { // Pass in env var to let ex know it can restart in // certain scenarios - this.resource.spec.template.spec.containers[0].env.push( - { - name: 'ALLOW_EX_RESTART', - value: 'true' - } - ); + if (resourceName === 'execution_controller') { + this.resource.spec.template.spec.containers[0].env.push( + { + name: 'ALLOW_EX_RESTART', + value: 'true' + } + ); + } } _mountLocalTeraslice(contextType: string): void { From 856afc565b509515cd8c5e4b322a4d70f1484bf8 Mon Sep 17 00:00:00 2001 From: sotojn Date: Thu, 5 Sep 2024 11:25:55 -0700 Subject: [PATCH 03/13] allow allowNonZeroExitCode for ex in k8sV2 --- .../teraslice/src/lib/workers/helpers/worker-shutdown.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts b/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts index 6ba60ef450a..47a2faaa022 100644 --- a/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts +++ b/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts @@ -44,7 +44,12 @@ export function shutdownHandler( const isProcessRestart = process.env.process_restart; // everything but the k8s execution_controller should not be allowed be allowed to // set a non-zero exit code (to avoid being restarted) - const allowNonZeroExitCode = !(isK8s && assignment === 'execution_controller'); + // This is overridden in V2 because it can restart + const allowNonZeroExitCode = !( + isK8s + && assignment === 'execution_controller' + && process.env.ALLOW_EX_RESTART !== 'true' + ); const api = { exiting: false, exit From a670873365795689cba5016715cd22e8bdb6b477 Mon Sep 17 00:00:00 2001 From: sotojn Date: Thu, 5 Sep 2024 11:27:09 -0700 Subject: [PATCH 04/13] add baseline logic for ex restart scenarios --- .../lib/workers/execution-controller/index.ts | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index c8dc1fe0e89..61edc9ca817 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -42,7 +42,7 @@ export class ExecutionController { private isExecutionFinished = false; isExecutionDone = false; private workersHaveConnected = false; - + private _handlers = new Map void>(); executionAnalytics: ExecutionAnalytics; readonly scheduler: Scheduler; @@ -420,6 +420,15 @@ export class ExecutionController { await this.client.sendExecutionFinished(shutdownError.message); } } + await this.stateStorage.refresh(); + const status = await this.executionStorage.getStatus(this.exId); + /// This is an indication that the cluster_master did not call for this + /// shutdown. We want to restart in this case. + if (process.env.ALLOW_EX_RESTART === 'true' && status === 'running') { + this.logger.info('Skipping shutdown to allow restart...'); + return; + } + if (this.isShutdown) return; if (!this.isInitialized) return; if (this.isShuttingDown) { @@ -921,6 +930,11 @@ export class ExecutionController { if (includes(terminalStatuses, status)) { error = new Error(invalidStateMsg('terminal')); } else if (includes(runningStatuses, status)) { + // In the case of a `running` state on startup we + // want to continue to start up. Only in V2. + if (process.env.ALLOW_EX_RESTART === 'true') { + return true; + } error = new Error(invalidStateMsg('running')); // If in a running status the execution process // crashed and k8s is trying to restart the pod, From eaaaf92ffcd95943f5985c0bb09d746940dec0bf Mon Sep 17 00:00:00 2001 From: sotojn Date: Thu, 5 Sep 2024 12:44:09 -0700 Subject: [PATCH 05/13] properly wrap ex restart logic in if statments --- .../lib/workers/execution-controller/index.ts | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index 61edc9ca817..13f167a84a7 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -420,13 +420,17 @@ export class ExecutionController { await this.client.sendExecutionFinished(shutdownError.message); } } - await this.stateStorage.refresh(); - const status = await this.executionStorage.getStatus(this.exId); - /// This is an indication that the cluster_master did not call for this - /// shutdown. We want to restart in this case. - if (process.env.ALLOW_EX_RESTART === 'true' && status === 'running') { - this.logger.info('Skipping shutdown to allow restart...'); - return; + + /// This only applies to kubernetesV2 + if (process.env.ALLOW_EX_RESTART === 'true') { + await this.stateStorage.refresh(); + const status = await this.executionStorage.getStatus(this.exId); + /// This is an indication that the cluster_master did not call for this + /// shutdown. We want to restart in this case. + if (status === 'running') { + this.logger.info('Skipping shutdown to allow restart...'); + return; + } } if (this.isShutdown) return; From 45d8db86c37085771a577aec26b8b4ac36a1af5f Mon Sep 17 00:00:00 2001 From: sotojn Date: Thu, 5 Sep 2024 13:35:32 -0700 Subject: [PATCH 06/13] add isRestartable() func to Slicer-core class --- .../job-components/src/operations/core/slicer-core.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/packages/job-components/src/operations/core/slicer-core.ts b/packages/job-components/src/operations/core/slicer-core.ts index c667084bb67..02824a3dfe4 100644 --- a/packages/job-components/src/operations/core/slicer-core.ts +++ b/packages/job-components/src/operations/core/slicer-core.ts @@ -147,6 +147,14 @@ export default abstract class SlicerCore return false; } + /** + * Used to indicate whether this slicer is restartable. Only relevant for + * kubernetesV2 backend + */ + isRestartable(): boolean { + return false; + } + /** * Used to determine the maximum number of slices queued. * Defaults to 10000 From cb26f542d1bf740b15c2e1787cb3276a2671fbcf Mon Sep 17 00:00:00 2001 From: sotojn Date: Thu, 5 Sep 2024 13:37:12 -0700 Subject: [PATCH 07/13] add isRestartable() check in ex _verifyExecution() func --- .../teraslice/src/lib/workers/execution-controller/index.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index 13f167a84a7..c8ea2fb3695 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -937,7 +937,11 @@ export class ExecutionController { // In the case of a `running` state on startup we // want to continue to start up. Only in V2. if (process.env.ALLOW_EX_RESTART === 'true') { - return true; + // Check to see if `isRestartable` exists. + // Allows for older assets to work with k8sV2 + if (this.executionContext.slicer().isRestartable) { + return this.executionContext.slicer().isRestartable(); + } } error = new Error(invalidStateMsg('running')); // If in a running status the execution process From 7d31adfc6f503da4e47271f1fffc542f3eaad314 Mon Sep 17 00:00:00 2001 From: sotojn Date: Thu, 5 Sep 2024 13:57:00 -0700 Subject: [PATCH 08/13] add logging to restart logic in ex class --- .../src/lib/workers/execution-controller/index.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index c8ea2fb3695..03bbbda3c94 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -425,6 +425,7 @@ export class ExecutionController { if (process.env.ALLOW_EX_RESTART === 'true') { await this.stateStorage.refresh(); const status = await this.executionStorage.getStatus(this.exId); + this.logger.debug(`Execution ${this.exId} is currently in a ${status} state`); /// This is an indication that the cluster_master did not call for this /// shutdown. We want to restart in this case. if (status === 'running') { @@ -940,7 +941,14 @@ export class ExecutionController { // Check to see if `isRestartable` exists. // Allows for older assets to work with k8sV2 if (this.executionContext.slicer().isRestartable) { - return this.executionContext.slicer().isRestartable(); + this.logger.info(`Execution ${this.exId} detected to have been restarted..`); + const restartable = this.executionContext.slicer().isRestartable(); + if (restartable) { + this.logger.info(`Execution ${this.exId} is restarable and will continue reinitializing...`); + } else { + this.logger.error(`Execution ${this.exId} is not restarable and will shutdown...`); + } + return restartable; } } error = new Error(invalidStateMsg('running')); From 78b7e43f6765a4d2e3cf9da182bc8a473135af5e Mon Sep 17 00:00:00 2001 From: sotojn Date: Mon, 9 Sep 2024 08:09:35 -0700 Subject: [PATCH 09/13] remove env ALLOW_EX_RESTART --- .../cluster/backends/kubernetesV2/k8sResource.ts | 14 ++------------ .../src/lib/workers/execution-controller/index.ts | 7 +++++-- .../src/lib/workers/helpers/worker-shutdown.ts | 2 +- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts b/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts index 43c53890284..53797d10168 100644 --- a/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts +++ b/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts @@ -89,7 +89,7 @@ export class K8sResource { if (process.env.MOUNT_LOCAL_TERASLICE !== undefined) { this._mountLocalTeraslice(resourceName); } - this._setEnvVariables(resourceName); + this._setEnvVariables(); this._setAssetsVolume(); this._setImagePullSecret(); this._setEphemeralStorage(); @@ -114,17 +114,7 @@ export class K8sResource { } } - _setEnvVariables(resourceName: string) { - // Pass in env var to let ex know it can restart in - // certain scenarios - if (resourceName === 'execution_controller') { - this.resource.spec.template.spec.containers[0].env.push( - { - name: 'ALLOW_EX_RESTART', - value: 'true' - } - ); - } + _setEnvVariables() { } _mountLocalTeraslice(contextType: string): void { diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index 03bbbda3c94..fff71b92d1c 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -422,7 +422,10 @@ export class ExecutionController { } /// This only applies to kubernetesV2 - if (process.env.ALLOW_EX_RESTART === 'true') { + if ( + this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2' + && eventType === 'SIGTERM' + ) { await this.stateStorage.refresh(); const status = await this.executionStorage.getStatus(this.exId); this.logger.debug(`Execution ${this.exId} is currently in a ${status} state`); @@ -937,7 +940,7 @@ export class ExecutionController { } else if (includes(runningStatuses, status)) { // In the case of a `running` state on startup we // want to continue to start up. Only in V2. - if (process.env.ALLOW_EX_RESTART === 'true') { + if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2') { // Check to see if `isRestartable` exists. // Allows for older assets to work with k8sV2 if (this.executionContext.slicer().isRestartable) { diff --git a/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts b/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts index 47a2faaa022..e00cad5c7fe 100644 --- a/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts +++ b/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts @@ -48,7 +48,7 @@ export function shutdownHandler( const allowNonZeroExitCode = !( isK8s && assignment === 'execution_controller' - && process.env.ALLOW_EX_RESTART !== 'true' + && context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2' ); const api = { exiting: false, From 4ac80bb732e5e6dd9cb7d3a1edddbdafdc17d976 Mon Sep 17 00:00:00 2001 From: sotojn Date: Mon, 9 Sep 2024 08:43:30 -0700 Subject: [PATCH 10/13] add refactored restart logic to ex controler --- .../src/lib/workers/execution-controller/index.ts | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index fff71b92d1c..d100303c0fb 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -428,11 +428,13 @@ export class ExecutionController { ) { await this.stateStorage.refresh(); const status = await this.executionStorage.getStatus(this.exId); + const runningStatuses = this.executionStorage.getRunningStatuses(); this.logger.debug(`Execution ${this.exId} is currently in a ${status} state`); /// This is an indication that the cluster_master did not call for this /// shutdown. We want to restart in this case. - if (status === 'running') { + if (status !== 'stopping' && includes(runningStatuses, status)) { this.logger.info('Skipping shutdown to allow restart...'); + await this.executionStorage.setStatus(this.exId, 'paused'); return; } } @@ -938,9 +940,12 @@ export class ExecutionController { if (includes(terminalStatuses, status)) { error = new Error(invalidStateMsg('terminal')); } else if (includes(runningStatuses, status)) { - // In the case of a `running` state on startup we + // In the case of a `paused` state on startup we // want to continue to start up. Only in V2. - if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2') { + if ( + this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2' + && status === 'paused' + ) { // Check to see if `isRestartable` exists. // Allows for older assets to work with k8sV2 if (this.executionContext.slicer().isRestartable) { From d1beaa31f6b425932a81588b7a268e2748282b84 Mon Sep 17 00:00:00 2001 From: sotojn Date: Mon, 9 Sep 2024 08:44:16 -0700 Subject: [PATCH 11/13] add catch when setting state in ex controller --- .../teraslice/src/lib/workers/execution-controller/index.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index d100303c0fb..efba6184bf9 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -434,7 +434,10 @@ export class ExecutionController { /// shutdown. We want to restart in this case. if (status !== 'stopping' && includes(runningStatuses, status)) { this.logger.info('Skipping shutdown to allow restart...'); - await this.executionStorage.setStatus(this.exId, 'paused'); + await this.executionStorage.setStatus(this.exId, 'paused') + .catch((err) => { + logError(this.logger, err, 'failure to set status to paused while restarting..'); + }); return; } } From e929358ec2f362ececcd7b8e1b10e617e79cd6ab Mon Sep 17 00:00:00 2001 From: sotojn Date: Tue, 10 Sep 2024 08:19:52 -0700 Subject: [PATCH 12/13] remove paused status check for interrupted execution --- .../src/lib/workers/execution-controller/index.ts | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index efba6184bf9..42449136d60 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -434,10 +434,6 @@ export class ExecutionController { /// shutdown. We want to restart in this case. if (status !== 'stopping' && includes(runningStatuses, status)) { this.logger.info('Skipping shutdown to allow restart...'); - await this.executionStorage.setStatus(this.exId, 'paused') - .catch((err) => { - logError(this.logger, err, 'failure to set status to paused while restarting..'); - }); return; } } @@ -943,12 +939,9 @@ export class ExecutionController { if (includes(terminalStatuses, status)) { error = new Error(invalidStateMsg('terminal')); } else if (includes(runningStatuses, status)) { - // In the case of a `paused` state on startup we + // In the case of a running status on startup we // want to continue to start up. Only in V2. - if ( - this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2' - && status === 'paused' - ) { + if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2') { // Check to see if `isRestartable` exists. // Allows for older assets to work with k8sV2 if (this.executionContext.slicer().isRestartable) { From c55e25b0583e1e3ef1e2e60e2984e3a1fa8eee5a Mon Sep 17 00:00:00 2001 From: sotojn Date: Tue, 10 Sep 2024 08:24:46 -0700 Subject: [PATCH 13/13] add notes about restartable ex startup --- .../teraslice/src/lib/workers/execution-controller/index.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index 42449136d60..9cfc7a7a638 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -941,6 +941,9 @@ export class ExecutionController { } else if (includes(runningStatuses, status)) { // In the case of a running status on startup we // want to continue to start up. Only in V2. + // Right now we will depend on kubernetes `crashloopbackoff` in the case of + // an unexpected exit to the ex process. Ex: an OOM + // NOTE: If this becomes an issue we may want to add a new state. Maybe `interrupted` if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2') { // Check to see if `isRestartable` exists. // Allows for older assets to work with k8sV2