Skip to content

Commit

Permalink
use string for storing mem resources (#522)
Browse files Browse the repository at this point in the history
Signed-off-by: Kevin <[email protected]>
  • Loading branch information
KPostOffice authored Apr 26, 2024
1 parent 82d2c5b commit 59cbccc
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 41 deletions.
34 changes: 14 additions & 20 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
cluster setup queue, a list of all existing clusters, and the user's working namespace.
"""

import re
from time import sleep
from typing import List, Optional, Tuple, Dict

Expand All @@ -41,6 +42,7 @@
RayClusterStatus,
)
from kubernetes import client, config
from kubernetes.utils import parse_quantity
import yaml
import os
import requests
Expand Down Expand Up @@ -513,26 +515,18 @@ def from_k8_cluster_object(
namespace=rc["metadata"]["namespace"],
machine_types=machine_types,
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
min_cpus=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["requests"]["cpu"]
),
max_cpus=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["cpu"]
),
min_memory=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["requests"]["memory"][:-1]
),
max_memory=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["memory"][:-1]
),
min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["cpu"],
max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["cpu"],
min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["memory"],
max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
num_gpus=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
Expand Down
31 changes: 25 additions & 6 deletions src/codeflare_sdk/cluster/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from dataclasses import dataclass, field
import pathlib
import typing

dir = pathlib.Path(__file__).parent.parent.resolve()

Expand All @@ -34,15 +35,15 @@ class ClusterConfiguration:
name: str
namespace: str = None
head_info: list = field(default_factory=list)
head_cpus: int = 2
head_memory: int = 8
head_cpus: typing.Union[int, str] = 2
head_memory: typing.Union[int, str] = 8
head_gpus: int = 0
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
min_cpus: int = 1
max_cpus: int = 1
min_cpus: typing.Union[int, str] = 1
max_cpus: typing.Union[int, str] = 1
num_workers: int = 1
min_memory: int = 2
max_memory: int = 2
min_memory: typing.Union[int, str] = 2
max_memory: typing.Union[int, str] = 2
num_gpus: int = 0
template: str = f"{dir}/templates/base-template.yaml"
instascale: bool = False
Expand All @@ -59,5 +60,23 @@ def __post_init__(self):
print(
"Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
)
self._memory_to_string()
self._str_mem_no_unit_add_GB()

def _str_mem_no_unit_add_GB(self):
    """Append a ``G`` suffix to memory settings given as bare digit strings.

    Visits ``head_memory``, ``min_memory`` and ``max_memory`` in that order.
    A value that is a ``str`` consisting only of decimal characters is
    replaced by the same text followed by ``"G"``; any other value is left
    as it is.
    """
    for attr in ("head_memory", "min_memory", "max_memory"):
        value = getattr(self, attr)
        if not isinstance(value, str):
            continue
        # An empty string is not "decimal", so it is left untouched as well.
        if value.isdecimal():
            setattr(self, attr, f"{value}G")

def _memory_to_string(self):
    """Convert integer memory settings into strings with a ``G`` suffix.

    Visits ``head_memory``, ``min_memory`` and ``max_memory`` in that order.
    A value that is an ``int`` is replaced by its decimal text followed by
    ``"G"``; values of any other type are left as they are.
    """
    for attr in ("head_memory", "min_memory", "max_memory"):
        amount = getattr(self, attr)
        if isinstance(amount, int):
            setattr(self, attr, f"{amount}G")

local_queue: str = None
12 changes: 6 additions & 6 deletions src/codeflare_sdk/utils/generate_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ def update_custompodresources(
# Leave head node resources as template default
resource["requests"]["cpu"] = head_cpus
resource["limits"]["cpu"] = head_cpus
resource["requests"]["memory"] = str(head_memory) + "G"
resource["limits"]["memory"] = str(head_memory) + "G"
resource["requests"]["memory"] = head_memory
resource["limits"]["memory"] = head_memory
resource["requests"]["nvidia.com/gpu"] = head_gpus
resource["limits"]["nvidia.com/gpu"] = head_gpus

Expand All @@ -158,9 +158,9 @@ def update_custompodresources(
resource[k][spec] = min_cpu
if spec == "memory":
if k == "limits":
resource[k][spec] = str(max_memory) + "G"
resource[k][spec] = max_memory
else:
resource[k][spec] = str(min_memory) + "G"
resource[k][spec] = min_memory
if spec == "nvidia.com/gpu":
if i == 0:
resource[k][spec] = 0
Expand Down Expand Up @@ -213,12 +213,12 @@ def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
requests = resource.get("resources").get("requests")
if requests is not None:
requests["cpu"] = min_cpu
requests["memory"] = str(min_memory) + "G"
requests["memory"] = min_memory
requests["nvidia.com/gpu"] = gpu
limits = resource.get("resources").get("limits")
if limits is not None:
limits["cpu"] = max_cpu
limits["memory"] = str(max_memory) + "G"
limits["memory"] = max_memory
limits["nvidia.com/gpu"] = gpu


Expand Down
2 changes: 1 addition & 1 deletion src/codeflare_sdk/utils/pretty_print.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]):
name = cluster.name
dashboard = cluster.dashboard
workers = str(cluster.workers)
memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max)
memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
cpu = str(cluster.worker_cpu)
gpu = str(cluster.worker_gpu)

Expand Down
16 changes: 8 additions & 8 deletions tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def test_config_creation():
assert config.name == "unit-test-cluster" and config.namespace == "ns"
assert config.num_workers == 2
assert config.min_cpus == 3 and config.max_cpus == 4
assert config.min_memory == 5 and config.max_memory == 6
assert config.min_memory == "5G" and config.max_memory == "6G"
assert config.num_gpus == 7
assert config.image == "quay.io/project-codeflare/ray:latest-py39-cu118"
assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml"
Expand Down Expand Up @@ -851,8 +851,8 @@ def test_ray_details(mocker, capsys):
name="raytest1",
status=RayClusterStatus.READY,
workers=1,
worker_mem_min=2,
worker_mem_max=2,
worker_mem_min="2G",
worker_mem_max="2G",
worker_cpu=1,
worker_gpu=0,
namespace="ns",
Expand Down Expand Up @@ -911,7 +911,7 @@ def test_ray_details(mocker, capsys):
" │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n"
" │ │ # Workers │ │ Memory CPU GPU │ │ \n"
" │ │ │ │ │ │ \n"
" │ │ 1 │ │ 2~2 1 0 │ │ \n"
" │ │ 1 │ │ 2G~2G 1 0 │ │ \n"
" │ │ │ │ │ │ \n"
" │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n"
" ╰───────────────────────────────────────────────────────────────╯ \n"
Expand All @@ -929,7 +929,7 @@ def test_ray_details(mocker, capsys):
" │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n"
" │ │ # Workers │ │ Memory CPU GPU │ │ \n"
" │ │ │ │ │ │ \n"
" │ │ 1 │ │ 2~2 1 0 │ │ \n"
" │ │ 1 │ │ 2G~2G 1 0 │ │ \n"
" │ │ │ │ │ │ \n"
" │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n"
" ╰───────────────────────────────────────────────────────────────╯ \n"
Expand All @@ -945,7 +945,7 @@ def test_ray_details(mocker, capsys):
"│ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │\n"
"│ │ # Workers │ │ Memory CPU GPU │ │\n"
"│ │ │ │ │ │\n"
"│ │ 1 │ │ 2~2 1 0 │ │\n"
"│ │ 1 │ │ 2G~2G 1 0 │ │\n"
"│ │ │ │ │ │\n"
"│ ╰─────────────╯ ╰──────────────────────────────────────╯ │\n"
"╰───────────────────────────────────────────────────────────────╯\n"
Expand Down Expand Up @@ -2438,7 +2438,7 @@ def custom_side_effect(group, version, namespace, plural, **kwargs):
and "g4dn.xlarge" in cluster_config.machine_types
)
assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
assert cluster_config.num_gpus == 0
assert (
cluster_config.image
Expand Down Expand Up @@ -2470,7 +2470,7 @@ def test_get_cluster(mocker):
and "g4dn.xlarge" in cluster_config.machine_types
)
assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
assert cluster_config.num_gpus == 0
assert cluster_config.instascale
assert (
Expand Down

0 comments on commit 59cbccc

Please sign in to comment.