diff --git a/packages/job-components/src/operations/core/slicer-core.ts b/packages/job-components/src/operations/core/slicer-core.ts index c667084bb67..02824a3dfe4 100644 --- a/packages/job-components/src/operations/core/slicer-core.ts +++ b/packages/job-components/src/operations/core/slicer-core.ts @@ -147,6 +147,14 @@ export default abstract class SlicerCore return false; } + /** + * Used to indicate whether this slicer is restartable. Only relevant for + * kubernetesV2 backend + */ + isRestartable(): boolean { + return false; + } + /** * Used to determine the maximum number of slices queued. * Defaults to 10000 diff --git a/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts b/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts index bb135e55cf2..53797d10168 100644 --- a/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts +++ b/packages/teraslice/src/lib/cluster/services/cluster/backends/kubernetesV2/k8sResource.ts @@ -115,7 +115,6 @@ export class K8sResource { } _setEnvVariables() { - /// TODO: Use this later when we need to set env vars in workers/ex controllers } _mountLocalTeraslice(contextType: string): void { @@ -292,14 +291,14 @@ export class K8sResource { // eslint-disable-next-line max-len this.resource.spec.template.spec.priorityClassName = this.terasliceConfig.kubernetes_priority_class_name; if (this.execution.stateful) { - + this.resource.spec.template.metadata.labels[`${this.jobPropertyLabelPrefix}/stateful`] = 'true'; } } if (this.nodeType === 'worker' && this.execution.stateful) { // eslint-disable-next-line max-len this.resource.spec.template.spec.priorityClassName = this.terasliceConfig.kubernetes_priority_class_name; - + this.resource.spec.template.metadata.labels[`${this.jobPropertyLabelPrefix}/stateful`] = 'true'; } } diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index c8dc1fe0e89..9cfc7a7a638 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -42,7 +42,7 @@ export class ExecutionController { private isExecutionFinished = false; isExecutionDone = false; private workersHaveConnected = false; - + private _handlers = new Map void>(); executionAnalytics: ExecutionAnalytics; readonly scheduler: Scheduler; @@ -420,6 +420,24 @@ export class ExecutionController { await this.client.sendExecutionFinished(shutdownError.message); } } + + /// This only applies to kubernetesV2 + if ( + this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2' + && eventType === 'SIGTERM' + ) { + await this.stateStorage.refresh(); + const status = await this.executionStorage.getStatus(this.exId); + const runningStatuses = this.executionStorage.getRunningStatuses(); + this.logger.debug(`Execution ${this.exId} is currently in a ${status} state`); + /// This is an indication that the cluster_master did not call for this + /// shutdown. We want to restart in this case. + if (status !== 'stopping' && includes(runningStatuses, status)) { + this.logger.info('Skipping shutdown to allow restart...'); + return; + } + } + if (this.isShutdown) return; if (!this.isInitialized) return; if (this.isShuttingDown) { @@ -921,6 +939,25 @@ export class ExecutionController { if (includes(terminalStatuses, status)) { error = new Error(invalidStateMsg('terminal')); } else if (includes(runningStatuses, status)) { + // In the case of a running status on startup we + // want to continue to start up. Only in V2. + // Right now we will depend on kubernetes `crashloopbackoff` in the case of + // an unexpected exit to the ex process. Ex: an OOM + // NOTE: If this becomes an issue we may want to add a new state. Maybe `interrupted` + if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2') { + // Check to see if `isRestartable` exists. + // Allows for older assets to work with k8sV2 + if (this.executionContext.slicer().isRestartable) { + this.logger.info(`Execution ${this.exId} detected to have been restarted..`); + const restartable = this.executionContext.slicer().isRestartable(); + if (restartable) { + this.logger.info(`Execution ${this.exId} is restarable and will continue reinitializing...`); + } else { + this.logger.error(`Execution ${this.exId} is not restarable and will shutdown...`); + } + return restartable; + } + } error = new Error(invalidStateMsg('running')); // If in a running status the execution process // crashed and k8s is trying to restart the pod, diff --git a/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts b/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts index 6ba60ef450a..e00cad5c7fe 100644 --- a/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts +++ b/packages/teraslice/src/lib/workers/helpers/worker-shutdown.ts @@ -44,7 +44,12 @@ export function shutdownHandler( const isProcessRestart = process.env.process_restart; // everything but the k8s execution_controller should not be allowed be allowed to // set a non-zero exit code (to avoid being restarted) - const allowNonZeroExitCode = !(isK8s && assignment === 'execution_controller'); + // This is overridden in V2 because it can restart + const allowNonZeroExitCode = !( + isK8s + && assignment === 'execution_controller' + && context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2' + ); const api = { exiting: false, exit