Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Spark pod template overrides #4183

Merged
merged 7 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package flytek8s

import (
pluginsCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core"
)

// Wraps a regular TaskExecutionMetadata and overrides the IsInterruptible method to always return false
// This is useful as the runner and the scheduler pods should never be interruptible
type NonInterruptibleTaskExecutionMetadata struct {
pluginsCore.TaskExecutionMetadata
}

func (n NonInterruptibleTaskExecutionMetadata) IsInterruptible() bool {
return false
}

// A wrapper around a regular TaskExecutionContext allowing to inject a custom TaskExecutionMetadata which is
// non-interruptible
type NonInterruptibleTaskExecutionContext struct {
pluginsCore.TaskExecutionContext
metadata NonInterruptibleTaskExecutionMetadata
}

func (n NonInterruptibleTaskExecutionContext) TaskExecutionMetadata() pluginsCore.TaskExecutionMetadata {
return n.metadata
}

func NewNonInterruptibleTaskExecutionContext(ctx pluginsCore.TaskExecutionContext) NonInterruptibleTaskExecutionContext {
return NonInterruptibleTaskExecutionContext{
TaskExecutionContext: ctx,
metadata: NonInterruptibleTaskExecutionMetadata{
ctx.TaskExecutionMetadata(),
},
}
}
9 changes: 9 additions & 0 deletions flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,15 @@ func ToK8sPodSpec(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (*
return podSpec, objectMeta, primaryContainerName, nil
}

func GetContainer(podSpec *v1.PodSpec, name string) (*v1.Container, error) {
for _, container := range podSpec.Containers {
if container.Name == name {
return &container, nil
}
}
return nil, pluginserrors.Errorf(pluginserrors.BadTaskSpecification, "invalid TaskSpecification, container [%s] not defined", name)
}

// getBasePodTemplate attempts to retrieve the PodTemplate to use as the base for k8s Pod configuration. This value can
// come from one of the following:
// (1) PodTemplate name in the TaskMetadata: This name is then looked up in the PodTemplateStore.
Expand Down
50 changes: 10 additions & 40 deletions flyteplugins/go/tasks/plugins/k8s/dask/dask.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ import (
"time"

daskAPI "github.com/dask/dask-kubernetes/v2023/dask_kubernetes/operator/go_client/pkg/apis/kubernetes.dask.org/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/plugins"
"github.com/flyteorg/flyte/flyteplugins/go/tasks/errors"
"github.com/flyteorg/flyte/flyteplugins/go/tasks/logs"
Expand All @@ -15,54 +21,19 @@ import (
"github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/k8s"
"github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog"
"github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/utils"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
daskTaskType = "dask"
KindDaskJob = "DaskJob"
)

// Wraps a regular TaskExecutionMetadata and overrides the IsInterruptible method to always return false
// This is useful as the runner and the scheduler pods should never be interruptible
type nonInterruptibleTaskExecutionMetadata struct {
pluginsCore.TaskExecutionMetadata
}

func (n nonInterruptibleTaskExecutionMetadata) IsInterruptible() bool {
return false
}

// A wrapper around a regular TaskExecutionContext allowing to inject a custom TaskExecutionMetadata which is
// non-interruptible
type nonInterruptibleTaskExecutionContext struct {
pluginsCore.TaskExecutionContext
metadata nonInterruptibleTaskExecutionMetadata
}

func (n nonInterruptibleTaskExecutionContext) TaskExecutionMetadata() pluginsCore.TaskExecutionMetadata {
return n.metadata
}

func mergeMapInto(src map[string]string, dst map[string]string) {
for key, value := range src {
dst[key] = value
}
}

func getPrimaryContainer(spec *v1.PodSpec, primaryContainerName string) (*v1.Container, error) {
for _, container := range spec.Containers {
if container.Name == primaryContainerName {
return &container, nil
}
}
return nil, errors.Errorf(errors.BadTaskSpecification, "primary container [%v] not found in pod spec", primaryContainerName)
}

func replacePrimaryContainer(spec *v1.PodSpec, primaryContainerName string, container v1.Container) error {
for i, c := range spec.Containers {
if c.Name == primaryContainerName {
Expand Down Expand Up @@ -104,8 +75,7 @@ func (p daskResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
if err != nil {
return nil, err
}
nonInterruptibleTaskMetadata := nonInterruptibleTaskExecutionMetadata{taskCtx.TaskExecutionMetadata()}
nonInterruptibleTaskCtx := nonInterruptibleTaskExecutionContext{taskCtx, nonInterruptibleTaskMetadata}
nonInterruptibleTaskCtx := flytek8s.NewNonInterruptibleTaskExecutionContext(taskCtx)
nonInterruptiblePodSpec, _, _, err := flytek8s.ToK8sPodSpec(ctx, nonInterruptibleTaskCtx)
if err != nil {
return nil, err
Expand Down Expand Up @@ -144,7 +114,7 @@ func (p daskResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC

func createWorkerSpec(cluster plugins.DaskWorkerGroup, podSpec *v1.PodSpec, primaryContainerName string) (*daskAPI.WorkerSpec, error) {
workerPodSpec := podSpec.DeepCopy()
primaryContainer, err := getPrimaryContainer(workerPodSpec, primaryContainerName)
primaryContainer, err := flytek8s.GetContainer(workerPodSpec, primaryContainerName)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -206,7 +176,7 @@ func createWorkerSpec(cluster plugins.DaskWorkerGroup, podSpec *v1.PodSpec, prim

func createSchedulerSpec(scheduler plugins.DaskScheduler, clusterName string, podSpec *v1.PodSpec, primaryContainerName string) (*daskAPI.SchedulerSpec, error) {
schedulerPodSpec := podSpec.DeepCopy()
primaryContainer, err := getPrimaryContainer(schedulerPodSpec, primaryContainerName)
primaryContainer, err := flytek8s.GetContainer(schedulerPodSpec, primaryContainerName)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -283,7 +253,7 @@ func createJobSpec(workerSpec daskAPI.WorkerSpec, schedulerSpec daskAPI.Schedule
jobPodSpec := podSpec.DeepCopy()
jobPodSpec.RestartPolicy = v1.RestartPolicyNever

primaryContainer, err := getPrimaryContainer(jobPodSpec, primaryContainerName)
primaryContainer, err := flytek8s.GetContainer(jobPodSpec, primaryContainerName)
if err != nil {
return nil, err
}
Expand Down
Loading