Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[teraslice] - Add execution controller restart logic in kubernetesV2 #3740

Merged
merged 13 commits into from
Sep 11, 2024
Merged
8 changes: 8 additions & 0 deletions packages/job-components/src/operations/core/slicer-core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,14 @@ export default abstract class SlicerCore<T = OpConfig>
return false;
}

/**
* Used to indicate whether this slicer is restartable. Only relevant for
* kubernetesV2 backend
*/
isRestartable(): boolean {
return false;
}

/**
* Used to determine the maximum number of slices queued.
* Defaults to 10000
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ export class K8sResource {
}

_setEnvVariables() {
/// TODO: Use this later when we need to set env vars in workers/ex controllers
}

_mountLocalTeraslice(contextType: string): void {
Expand Down Expand Up @@ -292,14 +291,14 @@ export class K8sResource {
// eslint-disable-next-line max-len
this.resource.spec.template.spec.priorityClassName = this.terasliceConfig.kubernetes_priority_class_name;
if (this.execution.stateful) {

this.resource.spec.template.metadata.labels[`${this.jobPropertyLabelPrefix}/stateful`] = 'true';
}
}
if (this.nodeType === 'worker' && this.execution.stateful) {
// eslint-disable-next-line max-len
this.resource.spec.template.spec.priorityClassName = this.terasliceConfig.kubernetes_priority_class_name;

this.resource.spec.template.metadata.labels[`${this.jobPropertyLabelPrefix}/stateful`] = 'true';
}
}
Expand Down
39 changes: 38 additions & 1 deletion packages/teraslice/src/lib/workers/execution-controller/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ export class ExecutionController {
private isExecutionFinished = false;
isExecutionDone = false;
private workersHaveConnected = false;

private _handlers = new Map<string, (arg: any) => void>();
executionAnalytics: ExecutionAnalytics;
readonly scheduler: Scheduler;
Expand Down Expand Up @@ -420,6 +420,24 @@ export class ExecutionController {
await this.client.sendExecutionFinished(shutdownError.message);
}
}

/// This only applies to kubernetesV2
if (
this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2'
&& eventType === 'SIGTERM'
) {
await this.stateStorage.refresh();
const status = await this.executionStorage.getStatus(this.exId);
const runningStatuses = this.executionStorage.getRunningStatuses();
this.logger.debug(`Execution ${this.exId} is currently in a ${status} state`);
/// This is an indication that the cluster_master did not call for this
/// shutdown. We want to restart in this case.
sotojn marked this conversation as resolved.
Show resolved Hide resolved
if (status !== 'stopping' && includes(runningStatuses, status)) {
this.logger.info('Skipping shutdown to allow restart...');
return;
}
}

if (this.isShutdown) return;
if (!this.isInitialized) return;
if (this.isShuttingDown) {
Expand Down Expand Up @@ -921,6 +939,25 @@ export class ExecutionController {
if (includes(terminalStatuses, status)) {
error = new Error(invalidStateMsg('terminal'));
} else if (includes(runningStatuses, status)) {
// In the case of a running status on startup we
// want to continue to start up. Only in V2.
// Right now we will depend on kubernetes `crashloopbackoff` in the case of
// an unexpected exit to the ex process. Ex: an OOM
// NOTE: If this becomes an issue we may want to add a new state. Maybe `interrupted`
if (this.context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2') {
// Check to see if `isRestartable` exists.
// Allows for older assets to work with k8sV2
if (this.executionContext.slicer().isRestartable) {
this.logger.info(`Execution ${this.exId} detected to have been restarted..`);
const restartable = this.executionContext.slicer().isRestartable();
if (restartable) {
this.logger.info(`Execution ${this.exId} is restarable and will continue reinitializing...`);
} else {
this.logger.error(`Execution ${this.exId} is not restarable and will shutdown...`);
}
return restartable;
}
}
sotojn marked this conversation as resolved.
Show resolved Hide resolved
error = new Error(invalidStateMsg('running'));
// If in a running status the execution process
// crashed and k8s is trying to restart the pod,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,12 @@ export function shutdownHandler(
const isProcessRestart = process.env.process_restart;
// everything but the k8s execution_controller should not be allowed be allowed to
// set a non-zero exit code (to avoid being restarted)
const allowNonZeroExitCode = !(isK8s && assignment === 'execution_controller');
// This is overridden in V2 because it can restart
const allowNonZeroExitCode = !(
isK8s
&& assignment === 'execution_controller'
&& context.sysconfig.teraslice.cluster_manager_type === 'kubernetesV2'
);
const api = {
exiting: false,
exit
Expand Down
Loading