From 2d22b201af18065978a954808fb3c4aa1315152b Mon Sep 17 00:00:00 2001 From: caozhou Date: Fri, 7 Jun 2024 14:52:34 +0800 Subject: [PATCH] fix multi node bug --- flagscale/auto_tuner/record/recorder.py | 2 +- flagscale/auto_tuner/tuner.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/flagscale/auto_tuner/record/recorder.py b/flagscale/auto_tuner/record/recorder.py index 24d0885ff..3b9f216dd 100644 --- a/flagscale/auto_tuner/record/recorder.py +++ b/flagscale/auto_tuner/record/recorder.py @@ -309,4 +309,4 @@ def save(self, history): df = df.reindex(columns=cols) if "stopped_by_tuner" in df.columns: df = df.drop(columns=["stopped_by_tuner"]) - df.to_csv(self.path, index=False) + df.to_csv(self.path, index=False, escapechar='\\') diff --git a/flagscale/auto_tuner/tuner.py b/flagscale/auto_tuner/tuner.py index fd41d6bf8..abe20f53e 100644 --- a/flagscale/auto_tuner/tuner.py +++ b/flagscale/auto_tuner/tuner.py @@ -242,7 +242,7 @@ def monitor(self): """Monitor the task until task timeout or completed.""" # Sleep 3s to ensure the task is started time.sleep(3) - + running = False while True: # If the task timeout, stop monitoring end_time = time.time() @@ -262,6 +262,12 @@ def monitor(self): f"task_{self.cur_strategy['idx']} status: {status.name}") if status == JobStatus.COMPLETED_OR_IDLE: break + if status == JobStatus.RUNNING: + running = True + if status == JobStatus.TRANSITIONAL: + if running: + self.runner.stop() + break except Exception as e: self.logger.info(e) time.sleep(self.interval)