Skip to content

Commit

Permalink
Nits
Browse files Browse the repository at this point in the history
  • Loading branch information
alaydshah committed May 9, 2024
1 parent 17c857a commit 4534716
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 5 deletions.
3 changes: 3 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ class DockerArgs:
registry: str = ""
ports: List[int] = field(default_factory=lambda: [2345])

def __post_init__(self):
self.client = JobRunnerUtils.get_docker_client(self)


class JobRunnerUtils(Singleton):
STATIC_RUN_LOCK_KEY_SUFFIX = "STATIC"
Expand Down
12 changes: 7 additions & 5 deletions python/fedml/computing/scheduler/slave/client_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import multiprocessing
import sys
from datetime import datetime

from multiprocessing import Process
import os
Expand Down Expand Up @@ -642,6 +643,7 @@ def execute_train_job_task(self, job_args: JobArgs, unzip_package_path, entry_fi
container = self.create_docker_container(docker_args=job_args.docker_args,
unzip_package_path=unzip_package_path,
image_pull_policy=job_args.image_pull_policy)

try:
job_executing_commands = JobRunnerUtils.generate_launch_docker_command(docker_args=job_args.docker_args,
run_id=self.run_id,
Expand Down Expand Up @@ -674,18 +676,17 @@ def create_docker_container(self, docker_args: DockerArgs,
unzip_package_path: str,
image_pull_policy: str = None):

docker_client = JobRunnerUtils.get_docker_client(docker_args=docker_args)
logging.info(f"Start pulling the launch job image {docker_args.image}... "
f"with policy {image_pull_policy}")
try:
ContainerUtils.get_instance().pull_image_with_policy(image_name=docker_args.image,
client=docker_client,
client=docker_args.client,
image_pull_policy=image_pull_policy)
except Exception as e:
raise Exception(f"Failed to pull the launch job image {docker_args.image} with Exception {e}")

container_name = JobRunnerUtils.get_run_container_name(self.run_id)
JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client)
JobRunnerUtils.remove_run_container_if_exists(container_name, docker_args.client)
device_requests = []
volumes = []
binds = {}
Expand All @@ -706,11 +707,11 @@ def create_docker_container(self, docker_args: DockerArgs,
logging.info(f"device_requests: {device_requests}")

try:
host_config = docker_client.api.create_host_config(
host_config = docker_args.client.api.create_host_config(
binds=binds,
device_requests=device_requests,
)
container = docker_client.api.create_container(
container = docker_args.client.api.create_container(
image=docker_args.image,
name=container_name,
tty=True,
Expand All @@ -719,6 +720,7 @@ def create_docker_container(self, docker_args: DockerArgs,
volumes=volumes,
detach=True # Run container in detached mode
)
docker_args.client.api.start(container=container.get("Id"))
except Exception as e:
logging.error(f"Failed to create docker container with Exception {e}. Traceback: {traceback.format_exc()}")
raise Exception(f"Failed to create docker container with Exception {e}")
Expand Down

0 comments on commit 4534716

Please sign in to comment.