Skip to content

Commit

Permalink
add more logging
Browse files Browse the repository at this point in the history
  • Loading branch information
robnagler committed Dec 23, 2024
1 parent 668bea8 commit 5684755
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 12 deletions.
15 changes: 13 additions & 2 deletions sirepo/job_supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,8 @@ def _process_run_status_update(self, msg):
d.error = msg.get("error", "<unknown error>")
if self.db.status != msg.state:
pkdlog("new status={} {}", msg.state, self)
else:
pkdlog("same status={} {}", msg.state, self)
# must be set because _db_status_update requires it
d.status = msg.state
if self.db.isParallel and d.status != job.PENDING:
Expand Down Expand Up @@ -1155,19 +1157,28 @@ def _update_db(reply, op):
# computeJobSerial is checked by the agent, so we do not need to check it here, since
# we already know the db is in sync from when the request was sent.
# Reply only includes state at this point; OP_RUN_STATUS_UPDATE handles parallelStatus
pkdlog("{} status change reply={}", self, reply)
self._db_status_update(status=reply.state)
op.destroy()
return
if not (e := reply.get("error")):
pkdlog("{} normal sbatch case reply={}", self, reply)
# normal sbatch case of UNKNOWN state and _run_status_op continues.
# if is_running_pending, the ui will request frames, but that's ok.
return
op.destroy()
if self._is_running_pending():
# Only set status in running/pending case, otherwise already in exit state
self._db_status_update(status=job.ERROR, error=e)
pkdlog("{} agent does not know status, error={}, ", self, e)
# else leave status alone, likely supervisor knows more than agent
pkdlog(
"{} agent does not know status state={}, reply={}, ",
self,
job.ERROR,
reply,
)
else:
# leave status alone, likely supervisor knows more than agent
pkdlog("{} leave status unchanged, not running reply={}, ", self, reply)

# TODO(robnagler) consolidate _receive_api_runSimulation
d = self.db.dbUpdateTime
Expand Down
25 changes: 15 additions & 10 deletions sirepo/pkcli/job_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,16 +241,20 @@ async def loop(self):
while True:
self._websocket = None
try:
self._websocket = await tornado.websocket.websocket_connect(
tornado.httpclient.HTTPRequest(
connect_timeout=_CONNECT_SECS,
url=_cfg.supervisor_uri,
validate_cert=job.cfg().verify_tls,
),
max_message_size=job.cfg().max_message_bytes,
ping_interval=job.cfg().ping_interval_secs,
ping_timeout=job.cfg().ping_timeout_secs,
)
try:
self._websocket = await tornado.websocket.websocket_connect(
tornado.httpclient.HTTPRequest(
connect_timeout=_CONNECT_SECS,
url=_cfg.supervisor_uri,
validate_cert=job.cfg().verify_tls,
),
max_message_size=job.cfg().max_message_bytes,
ping_interval=job.cfg().ping_interval_secs,
ping_timeout=job.cfg().ping_timeout_secs,
)
except ConnectionError as e:
pkdlog("unable to connect to supervisor exception={}", e)
continue
s = self.format_op(None, job.OP_ALIVE)
while True:
if s and not await self.send(s):
Expand Down Expand Up @@ -803,6 +807,7 @@ def start(self):

def _on_exit(self, returncode):
self.returncode = returncode
pkdlog("{} returncode={}", self, returncode)
self._exit.set()


Expand Down

0 comments on commit 5684755

Please sign in to comment.