From a8d4d2ac43a2ab4e42414f900a5869c21778965d Mon Sep 17 00:00:00 2001 From: Kunal Bhalla Date: Thu, 31 Aug 2023 10:36:07 -0700 Subject: [PATCH] Wait for app to start with torchx log (#763) Summary: Pull Request resolved: https://github.com/pytorch/torchx/pull/763 Right now it silently times out after waiting for 10s and then (depending on the scheduler) fails because the job hasn't actually started; I'm trying to use `--log` with the `run` command and inevitably run into this. I wasn't sure if we want to explicitly time out, or what the best ergonomics would be -- I can increase the timeout to 10 minutes and blow up if it doesn't start in that time, or change it to a while loop (as done here) -- wanted to check :). Differential Revision: D48840617 fbshipit-source-id: 8c2326d07d4df8c69a15e3d747ba3b5162ab6bec --- torchx/cli/cmd_log.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torchx/cli/cmd_log.py b/torchx/cli/cmd_log.py index b3854f7b5..e814d0da3 100644 --- a/torchx/cli/cmd_log.py +++ b/torchx/cli/cmd_log.py @@ -101,12 +101,14 @@ def get_logs( if len(path) == 4: replica_ids = [(role_name, int(id)) for id in path[3].split(",") if id] else: - for i in range(10): + display_waiting = True + while True: status = runner.status(app_handle) if status and is_started(status.state): break - if i == 0: - logger.info("Waiting for app to start before logging...") + elif display_waiting: + logger.info("Waiting for app to start before fetching logs...") + display_waiting = False time.sleep(1) app = none_throws(runner.describe(app_handle))