Skip to content

Commit

Permalink
run-job-scheduler-on-arm (#1149)
Browse files Browse the repository at this point in the history
* Always run job-scheduler on arm platform

* Fixed unit-tests

* Set arm and cache to workflows

* Fire build

* Cleanup

* Cleanup
  • Loading branch information
satr authored Jul 25, 2024
1 parent 6c79307 commit febfc9c
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 25 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/build-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
arch: [arm64]
target:
- name: "dev"
ref: "refs/heads/master"
Expand Down Expand Up @@ -133,6 +134,8 @@ jobs:
ts=$(date +%s)
echo "radix-operator-tag=${GITHUB_REF_NAME}-${sha}-${ts}" >> $GITHUB_OUTPUT
echo "pipeline-runner-tag=${GITHUB_REF_NAME}-latest" >> $GITHUB_OUTPUT
echo "cache-radix-operator-tag=cache-radix-operator-${GITHUB_REF_NAME}" >> $GITHUB_OUTPUT
echo "cache-pipeline-runner-tag=cache-pipeline-runner-${GITHUB_REF_NAME}" >> $GITHUB_OUTPUT
- name: Extract labels from metadata for Docker
if: matrix.target.ref == github.ref
Expand All @@ -153,6 +156,8 @@ jobs:
linux/arm64
tags: "${{ steps.build-image-names.outputs.radix-operator-image-name }}:${{ steps.build-tags.outputs.radix-operator-tag }}"
labels: ${{ steps.radix-operator-meta.outputs.labels }}
cache-from: "type=registry,ref=${{ steps.build-image-names.outputs.radix-operator-image-name }}:${{ steps.build-tags.outputs.cache-radix-operator-tag }}"
cache-to: "type=registry,ref=${{ steps.build-image-names.outputs.radix-operator-image-name }}:${{ steps.build-tags.outputs.cache-radix-operator-tag }},mode=max"

- name: Build and push pipeline-runner docker image
if: matrix.target.ref == github.ref
Expand All @@ -166,6 +171,8 @@ jobs:
linux/arm64
tags: "${{ steps.build-image-names.outputs.pipeline-runner-image-name }}:${{ steps.build-tags.outputs.pipeline-runner-tag }}"
labels: ${{ steps.pipeline-runner-meta.outputs.labels }}
cache-from: "type=registry,ref=${{ steps.build-image-names.outputs.pipeline-runner-image-name }}:${{ steps.build-tags.outputs.cache-pipeline-runner-tag }}"
cache-to: "type=registry,ref=${{ steps.build-image-names.outputs.pipeline-runner-image-name }}:${{ steps.build-tags.outputs.cache-pipeline-runner-tag }},mode=max"

- name: Revoke GitHub IP on ACR
if: ${{ matrix.target.ref == github.ref && steps.update_firewall.outcome == 'success' && !cancelled()}} # Always run this step even if previous step failed
Expand Down
16 changes: 15 additions & 1 deletion .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ on:
pull_request:
branches:
- master

jobs:
build:
name: pull-request-check
runs-on: ubuntu-latest
strategy:
matrix:
arch: [arm64]
steps:
- uses: actions/checkout@v4
- name: Set up Docker Buildx
Expand All @@ -34,6 +36,9 @@ jobs:
radix-operator-test:
name: Pipeline-runner unit tests
runs-on: ubuntu-latest
strategy:
matrix:
arch: [arm64]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
Expand All @@ -51,6 +56,9 @@ jobs:
pipeline-runner-test:
name: Pipeline-runner unit tests
runs-on: ubuntu-latest
strategy:
matrix:
arch: [arm64]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
Expand All @@ -68,6 +76,9 @@ jobs:
radix-operator-lint:
name: Lint
runs-on: ubuntu-latest
strategy:
matrix:
arch: [arm64]
steps:
- uses: actions/checkout@v4
with:
Expand All @@ -84,6 +95,9 @@ jobs:
verify-code-generation:
name: Verify Code Generation
runs-on: ubuntu-latest
strategy:
matrix:
arch: [arm64]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
Expand Down
4 changes: 2 additions & 2 deletions charts/radix-operator/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: v2
name: radix-operator
version: 1.37.6
appVersion: 1.57.5
version: 1.37.7
appVersion: 1.57.6
kubeVersion: ">=1.24.0"
description: Radix Operator
keywords:
Expand Down
59 changes: 38 additions & 21 deletions pkg/apis/deployment/deployment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,7 @@ func TestObjectSynced_MultiJob_ContainsAllElements(t *testing.T) {
expectedAffinity := &corev1.Affinity{
NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{
{Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}},
{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}},
{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(radixv1.RuntimeArchitectureArm64)}},
}}}}},
PodAntiAffinity: &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{
{Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{appName}},
Expand Down Expand Up @@ -2977,7 +2977,7 @@ func TestUseGpuNodeOnDeploy(t *testing.T) {
expectedAffinity := &corev1.Affinity{
NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{
{Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}},
{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}},
{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(radixv1.RuntimeArchitectureArm64)}},
}}}}},
PodAntiAffinity: &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{
{Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{anyAppName}},
Expand Down Expand Up @@ -3153,22 +3153,10 @@ func TestUseGpuNodeCountOnDeployment(t *testing.T) {
WithNodeGpuCount(nodeGpuCount10)))
require.NoError(t, err)

defaultAffinityBuilder := func(componentName string) *corev1.Affinity {
return &corev1.Affinity{
NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{
{Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}},
{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}},
}}}}},
PodAntiAffinity: &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{
{Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{anyAppName}},
{Key: kube.RadixComponentLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{componentName}},
}}}}}},
}
}
t.Run("missing node.gpu", func(t *testing.T) {
t.Parallel()
deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName1, metav1.GetOptions{})
assert.Equal(t, defaultAffinityBuilder(componentName1), deployment.Spec.Template.Spec.Affinity)
assert.Equal(t, getDefaultComponentAffinityBuilder(componentName1, anyAppName), deployment.Spec.Template.Spec.Affinity)
tolerations := deployment.Spec.Template.Spec.Tolerations
assert.Len(t, tolerations, 0) // missing node.gpu
})
Expand Down Expand Up @@ -3199,35 +3187,35 @@ func TestUseGpuNodeCountOnDeployment(t *testing.T) {
t.Run("has node with gpu-count 0", func(t *testing.T) {
t.Parallel()
deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName3, metav1.GetOptions{})
assert.Equal(t, defaultAffinityBuilder(componentName3), deployment.Spec.Template.Spec.Affinity)
assert.Equal(t, getDefaultComponentAffinityBuilder(componentName3, anyAppName), deployment.Spec.Template.Spec.Affinity)
tolerations := deployment.Spec.Template.Spec.Tolerations
assert.Len(t, tolerations, 0)
})
t.Run("has node with gpu-count -1", func(t *testing.T) {
t.Parallel()
deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName4, metav1.GetOptions{})
assert.Equal(t, defaultAffinityBuilder(componentName4), deployment.Spec.Template.Spec.Affinity)
assert.Equal(t, getDefaultComponentAffinityBuilder(componentName4, anyAppName), deployment.Spec.Template.Spec.Affinity)
tolerations := deployment.Spec.Template.Spec.Tolerations
assert.Len(t, tolerations, 0)
})
t.Run("has node with invalid value of gpu-count", func(t *testing.T) {
t.Parallel()
deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName5, metav1.GetOptions{})
assert.Equal(t, defaultAffinityBuilder(componentName5), deployment.Spec.Template.Spec.Affinity)
assert.Equal(t, getDefaultComponentAffinityBuilder(componentName5, anyAppName), deployment.Spec.Template.Spec.Affinity)
tolerations := deployment.Spec.Template.Spec.Tolerations
assert.Len(t, tolerations, 0)
})
t.Run("has node with no gpu-count", func(t *testing.T) {
t.Parallel()
deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName6, metav1.GetOptions{})
assert.Equal(t, defaultAffinityBuilder(componentName6), deployment.Spec.Template.Spec.Affinity)
assert.Equal(t, getDefaultComponentAffinityBuilder(componentName6, anyAppName), deployment.Spec.Template.Spec.Affinity)
tolerations := deployment.Spec.Template.Spec.Tolerations
assert.Len(t, tolerations, 0)
})
t.Run("job has node, but pod template of Job Scheduler does not have it", func(t *testing.T) {
t.Parallel()
deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), jobComponentName, metav1.GetOptions{})
assert.Equal(t, defaultAffinityBuilder(jobComponentName), deployment.Spec.Template.Spec.Affinity)
assert.Equal(t, getDefaultJobComponentAffinityBuilder(anyAppName, jobComponentName), deployment.Spec.Template.Spec.Affinity)
tolerations := deployment.Spec.Template.Spec.Tolerations
assert.Len(t, tolerations, 0)
})
Expand Down Expand Up @@ -3290,7 +3278,7 @@ func TestUseGpuNodeWithGpuCountOnDeployment(t *testing.T) {
expectedAffinity := &corev1.Affinity{
NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{
{Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}},
{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}},
{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(radixv1.RuntimeArchitectureArm64)}},
}}}}},
PodAntiAffinity: &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{
{Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{anyAppName}},
Expand Down Expand Up @@ -4569,3 +4557,32 @@ func getPortByName(name string, ports []corev1.ContainerPort) *corev1.ContainerP
}
return nil
}

// getDefaultComponentAffinityBuilder assembles the affinity the tests expect on
// a regular component: the node affinity produced by getLinuxAmd64NodeAffinity
// combined with the anti-affinity produced by getComponentPodAntiAffinity for
// the given app and component names.
//
// Note the argument order: componentName comes first, appName second.
func getDefaultComponentAffinityBuilder(componentName string, appName string) *corev1.Affinity {
	expected := corev1.Affinity{}
	expected.NodeAffinity = getLinuxAmd64NodeAffinity()
	expected.PodAntiAffinity = getComponentPodAntiAffinity(appName, componentName)
	return &expected
}

// getDefaultJobComponentAffinityBuilder assembles the affinity the tests expect
// on a job component's scheduler deployment: the arm64 node affinity produced by
// getLinuxArm64NodeAffinity combined with the anti-affinity produced by
// getComponentPodAntiAffinity for the given app and job component names.
//
// Note the argument order: appName comes first, jobComponentName second.
func getDefaultJobComponentAffinityBuilder(appName string, jobComponentName string) *corev1.Affinity {
	expected := corev1.Affinity{}
	expected.NodeAffinity = getLinuxArm64NodeAffinity()
	expected.PodAntiAffinity = getComponentPodAntiAffinity(appName, jobComponentName)
	return &expected
}

// getComponentPodAntiAffinity builds a preferred (not required) pod
// anti-affinity with a single term of weight 1. The term uses the hostname
// topology key and selects pods whose Radix app label is anyAppName and whose
// Radix component label is componentName.
func getComponentPodAntiAffinity(anyAppName string, componentName string) *corev1.PodAntiAffinity {
	// Pods are matched on both the application and the component label.
	podSelector := &metav1.LabelSelector{
		MatchExpressions: []metav1.LabelSelectorRequirement{
			{Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{anyAppName}},
			{Key: kube.RadixComponentLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{componentName}},
		},
	}

	weightedTerm := corev1.WeightedPodAffinityTerm{
		Weight: 1,
		PodAffinityTerm: corev1.PodAffinityTerm{
			TopologyKey:   corev1.LabelHostname,
			LabelSelector: podSelector,
		},
	}

	return &corev1.PodAntiAffinity{
		PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{weightedTerm},
	}
}

// getLinuxArm64NodeAffinity builds a required node affinity with one selector
// term holding two "In" requirements: the OS label must equal
// defaults.DefaultNodeSelectorOS and the architecture label must equal
// radixv1.RuntimeArchitectureArm64.
func getLinuxArm64NodeAffinity() *corev1.NodeAffinity {
	requirements := []corev1.NodeSelectorRequirement{
		{Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}},
		{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(radixv1.RuntimeArchitectureArm64)}},
	}

	selector := &corev1.NodeSelector{
		NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: requirements}},
	}

	return &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: selector}
}

// getLinuxAmd64NodeAffinity builds a required node affinity with one selector
// term holding two "In" requirements: the OS label must equal
// defaults.DefaultNodeSelectorOS and the architecture label must equal
// defaults.DefaultNodeSelectorArchitecture.
//
// NOTE(review): the default architecture is presumably amd64, judging by this
// helper's name; the constant's value is not visible here, so confirm in the
// defaults package.
func getLinuxAmd64NodeAffinity() *corev1.NodeAffinity {
	requirements := []corev1.NodeSelectorRequirement{
		{Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}},
		{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}},
	}

	selector := &corev1.NodeSelector{
		NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: requirements}},
	}

	return &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: selector}
}
2 changes: 1 addition & 1 deletion pkg/apis/deployment/jobschedulercomponent.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func (js *jobSchedulerComponent) GetNode() *radixv1.RadixNode {
}

// GetRuntime returns the runtime settings for the job scheduler component. Only
// the Architecture field is set, and it is hard-coded here rather than derived
// from the receiver.
//
// NOTE(review): this hunk looks like an unmarked diff. The amd64 return is
// presumably the removed line and the arm64 return the added one (the commit
// message says the job scheduler should always run on arm). Confirm against
// the real file, where only one return should remain.
func (js *jobSchedulerComponent) GetRuntime() *radixv1.Runtime {
return &radixv1.Runtime{Architecture: radixv1.RuntimeArchitectureAmd64}
return &radixv1.Runtime{Architecture: radixv1.RuntimeArchitectureArm64}
}

func isDeployComponentJobSchedulerDeployment(deployComponent radixv1.RadixCommonDeployComponent) bool {
Expand Down

0 comments on commit febfc9c

Please sign in to comment.