fix failing test
asm582 committed Dec 16, 2023
1 parent 883e04a commit ff376f5
Showing 3 changed files with 67 additions and 22 deletions.
30 changes: 15 additions & 15 deletions .github/workflows/mcad-CI.yml
@@ -12,21 +12,21 @@ jobs:
steps:
- name: run docker resource config
run: |
-sudo touch /etc/systemd/system/docker_limit.slice
-cat <<EOF > /etc/systemd/system/docker_limit.slice
-[Unit]
-Description=Slice that limits docker resources
-Before=slices.target
-[Slice]
-CPUAccounting=true
-CPUQuota=50%
-EOF
-sudo systemctl start /etc/systemd/system/docker_limit.slice
-new_content='{ "exec-opts": ["native.cgroupdriver=cgroupfs"], "cgroup-parent": "/docker_limit.slice" }'
-sudo sed -i 's|{ "exec-opts": \["native.cgroupdriver=cgroupfs"\], "cgroup-parent": "/actions_job" }|'"$new_content"'|' /etc/docker/daemon.json
-cat /etc/docker/daemon.json
-sudo systemctl restart docker
-sleep 10
+# sudo touch /etc/systemd/system/docker_limit.slice
+# cat <<EOF > /etc/systemd/system/docker_limit.slice
+# [Unit]
+# Description=Slice that limits docker resources
+# Before=slices.target
+# [Slice]
+# CPUAccounting=true
+# CPUQuota=50%
+# EOF
+# sudo systemctl start /etc/systemd/system/docker_limit.slice
+# new_content='{ "exec-opts": ["native.cgroupdriver=cgroupfs"], "cgroup-parent": "/docker_limit.slice" }'
+# sudo sed -i 's|{ "exec-opts": \["native.cgroupdriver=cgroupfs"\], "cgroup-parent": "/actions_job" }|'"$new_content"'|' /etc/docker/daemon.json
+# cat /etc/docker/daemon.json
+# sudo systemctl restart docker
+# sleep 10
docker info | grep CPU
- name: checkout code
uses: actions/checkout@v3
21 changes: 14 additions & 7 deletions test/e2e/queue.go
@@ -427,15 +427,21 @@ var _ = Describe("AppWrapper E2E Test", func() {
appwrappersPtr := &appwrappers
defer cleanupTestObjectsPtr(context, appwrappersPtr)

-// This should fill up the worker node and most of the master node
-aw := createDeploymentAWwith550CPU(context, appendRandomString("aw-deployment-2-550cpu"))
+//This should fill up the worker node and most of the master node
+//aw := createDeploymentAWwith550CPU(context, appendRandomString("aw-deployment-2-550cpu"))
+cap := getClusterCapacitycontext(context)
+resource := cpuDemand(cap, 0.275).String()
+aw := createGenericDeploymentCustomPodResourcesWithCPUAW(
+context, appendRandomString("aw-ff-deployment-55-percent-cpu"), resource, resource, 2, 60)
appwrappers = append(appwrappers, aw)
err := waitAWPodsReady(context, aw)
-Expect(err).NotTo(HaveOccurred(), "Expecting pods for app wrapper: aw-deployment-2-550cpu")
+fmt.Fprintf(os.Stdout, "The aw status is %v", aw.Status.State)
+Expect(err).NotTo(HaveOccurred(), "Expecting pods for app wrapper: aw-ff-deployment-1-3500-cpu")

// This should not fit on any node but should dispatch because there is enough aggregated resources.
+resource2 := cpuDemand(cap, 0.4).String()
aw2 := createGenericDeploymentCustomPodResourcesWithCPUAW(
context, appendRandomString("aw-ff-deployment-1-850-cpu"), "850m", "850m", 1, 60)
context, appendRandomString("aw-ff-deployment-40-percent-cpu"), resource2, resource2, 1, 60)

appwrappers = append(appwrappers, aw2)

@@ -448,18 +454,19 @@ var _ = Describe("AppWrapper E2E Test", func() {

// This should fit on cluster after AW aw-deployment-1-850-cpu above is automatically preempted on
// scheduling failure
+resource3 := cpuDemand(cap, 0.15).String()
aw3 := createGenericDeploymentCustomPodResourcesWithCPUAW(
context, appendRandomString("aw-ff-deployment-2-340-cpu"), "340m", "340m", 2, 60)
context, appendRandomString("aw-ff-deployment-15-percent-cpu"), resource3, resource3, 2, 60)

appwrappers = append(appwrappers, aw3)

// Wait for pods to get created, assumes preemption around 10 minutes
err = waitAWPodsExists(context, aw3, 720000*time.Millisecond)
-Expect(err).NotTo(HaveOccurred(), "Expecting pods for app wrapper: aw-ff-deployment-2-340-cpu")
+Expect(err).NotTo(HaveOccurred(), "Expecting pods for app wrapper: aw-ff-deployment-15-percent-cpu")
fmt.Fprintf(GinkgoWriter, "[e2e] MCAD Scheduling Fail Fast Preemption Test - Pods not found for app wrapper aw-ff-deployment-2-340-cpu\n")

err = waitAWPodsReady(context, aw3)
-Expect(err).NotTo(HaveOccurred(), "Expecting no pods for app wrapper: aw-ff-deployment-2-340-cpu")
+Expect(err).NotTo(HaveOccurred(), "Expecting no pods for app wrapper: aw-ff-deployment-15-percent-cpu")
fmt.Fprintf(GinkgoWriter, "[e2e] MCAD Scheduling Fail Fast Preemption Test - Ready pods found for app wrapper aw-ff-deployment-2-340-cpu\n")

// Make sure pods from AW aw-deployment-1-850-cpu have preempted
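The fail-fast preemption test now sizes its AppWrappers as fractions of the measured cluster capacity instead of hard-coded millicpu values: two replicas at 27.5% (≈55% of the cluster), then one replica at 40% that dispatches because aggregate capacity suffices but cannot be scheduled on any single node, so it is preempted on scheduling failure, and finally two replicas at 15% that fit once the 40% AppWrapper is gone. Below is a minimal standalone sketch of that arithmetic — not part of the commit — assuming a hypothetical 4-CPU (4000m) cluster and using only k8s.io/apimachinery:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// demand mirrors the cpuDemand helper added in test/e2e/util.go: a fraction of the
// cluster's milli-CPU, rendered as a Kubernetes quantity string such as "1100m".
func demand(clusterMilliCPU int64, fraction float64) string {
	return resource.NewMilliQuantity(int64(float64(clusterMilliCPU)*fraction), resource.DecimalSI).String()
}

func main() {
	const clusterMilliCPU = 4000 // hypothetical 4-CPU cluster

	fmt.Println(demand(clusterMilliCPU, 0.275)) // "1100m" per replica; two replicas consume ~55% of the cluster
	fmt.Println(demand(clusterMilliCPU, 0.4))   // "1600m"; aggregate capacity suffices, but no single node has this much free
	fmt.Println(demand(clusterMilliCPU, 0.15))  // "600m" per replica; two replicas fit after the 40% AppWrapper is preempted
}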
38 changes: 38 additions & 0 deletions test/e2e/util.go
@@ -40,6 +40,8 @@ import (

arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
versioned "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/versioned"
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api"
clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api"
)

var ninetySeconds = 90 * time.Second
@@ -793,6 +795,36 @@ func createDeploymentAWwith550CPU(context *context, name string) *arbv1.AppWrapper {
return appwrapper
}

func getClusterCapacitycontext(context *context) *clusterstateapi.Resource {
capacity := clusterstateapi.EmptyResource()
nodes, _ := context.kubeclient.CoreV1().Nodes().List(context.ctx, metav1.ListOptions{})
for _, node := range nodes.Items {
// skip unschedulable nodes
if node.Spec.Unschedulable {
continue
}
nodeResource := clusterstateapi.NewResource(node.Status.Allocatable)
capacity.Add(nodeResource)
var specNodeName = "spec.nodeName"
labelSelector := fmt.Sprintf("%s=%s", specNodeName, node.Name)
podList, err := context.kubeclient.CoreV1().Pods("").List(context.ctx, metav1.ListOptions{FieldSelector: labelSelector})
// TODO: when no pods are listed, do we send entire node capacity as available
// this will cause false positive dispatch.
if err != nil {
fmt.Errorf("[allocatableCapacity] Error listing pods %v", err)
}
for _, pod := range podList.Items {
if _, ok := pod.GetLabels()["appwrappers.mcad.ibm.com"]; !ok && pod.Status.Phase != v1.PodFailed && pod.Status.Phase != v1.PodSucceeded {
for _, container := range pod.Spec.Containers {
usedResource := clusterstateapi.NewResource(container.Resources.Requests)
capacity.Sub(usedResource)
}
}
}
}
return capacity
}
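For readers unfamiliar with MCAD's clusterstateapi.Resource type, the same idea — sum allocatable CPU over schedulable nodes, then subtract the requests of pods that are still holding resources — can be sketched with plain client-go. This sketch is not part of the commit; the kubeconfig wiring and function name are illustrative, and it ignores the AppWrapper-label and per-node field-selector details of the helper above:

package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// availableMilliCPU approximates getClusterCapacitycontext: allocatable CPU on
// schedulable nodes minus the CPU requests of pods that have not yet terminated.
func availableMilliCPU(ctx context.Context, client kubernetes.Interface) (int64, error) {
	var milli int64

	nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return 0, err
	}
	for _, node := range nodes.Items {
		if node.Spec.Unschedulable {
			continue // skip unschedulable nodes, as the test helper does
		}
		milli += node.Status.Allocatable.Cpu().MilliValue()
	}

	pods, err := client.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
	if err != nil {
		return 0, err
	}
	for _, pod := range pods.Items {
		if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded {
			continue // terminated pods no longer hold their requests
		}
		for _, c := range pod.Spec.Containers {
			milli -= c.Resources.Requests.Cpu().MilliValue()
		}
	}
	return milli, nil
}

func main() {
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	milli, err := availableMilliCPU(context.Background(), kubernetes.NewForConfigOrDie(config))
	if err != nil {
		panic(err)
	}
	fmt.Printf("available CPU: %dm\n", milli)
}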

func createDeploymentAWwith350CPU(context *context, name string) *arbv1.AppWrapper {
rb := []byte(`{"apiVersion": "apps/v1",
"kind": "Deployment",
@@ -2705,3 +2737,9 @@ func AppWrapper(context *context, namespace string, name string) func(g gomega.G
func AppWrapperState(aw *arbv1.AppWrapper) arbv1.AppWrapperState {
return aw.Status.State
}

func cpuDemand(cap *api.Resource, fractionOfCluster float64) *resource.Quantity {
//klog.Infof("[allocatableCapacity] The available capacity to dispatch appwrapper is %v and time took to calculate is %v", capacity, time.Since(startTime))
milliDemand := int64(float64(cap.MilliCPU) * fractionOfCluster)
return resource.NewMilliQuantity(milliDemand, resource.DecimalSI)
}
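One small observation, not part of the commit: the int64 conversion in cpuDemand truncates toward zero, so on clusters whose milli-CPU capacity does not divide evenly, the per-replica demand rounds down, keeping the aggregate request at or slightly below the target fraction. A tiny illustration with an assumed 3850m cluster:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// 27.5% of a hypothetical 3850m cluster is 1058.75m; int64() floors it to 1058m,
	// matching the int64(float64(cap.MilliCPU) * fractionOfCluster) conversion in cpuDemand.
	fmt.Println(resource.NewMilliQuantity(int64(float64(3850)*0.275), resource.DecimalSI).String()) // "1058m"
}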
