Skip to content

Commit

Permalink
Log cpu and wall time for torchx events (pytorch#754) (pytorch#754)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch#754

This diff adds support for logging CPU and wall times for torchx events. These numbers can provide an overall estimate of the time taken to submit/launch.

Will export this diff to GitHub as well after I have landed the sync diff in this stack.

Reviewed By: kurman

Differential Revision: D48375892

fbshipit-source-id: ac5daa231da2adb707b0a8c39e4a2526b4595895
  • Loading branch information
manav-a authored and KPostOffice committed Sep 7, 2023
1 parent e880542 commit ac38a8a
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 8 deletions.
11 changes: 11 additions & 0 deletions torchx/runner/events/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"""

import logging
import time
import traceback
from types import TracebackType
from typing import Optional, Type
Expand Down Expand Up @@ -86,8 +87,12 @@ def __init__(
self._torchx_event: TorchxEvent = self._generate_torchx_event(
api, scheduler or "", app_id, app_image=app_image, runcfg=runcfg
)
self._start_cpu_time_ns = 0
self._start_wall_time_ns = 0

# Context-manager entry: snapshot the CPU and wall clocks so that __exit__
# can compute the elapsed cpu_time_usec / wall_time_usec for the event.
def __enter__(self) -> "log_event":
# Process CPU time at entry, in nanoseconds.
self._start_cpu_time_ns = time.process_time_ns()
# Performance-counter reading at entry, in nanoseconds; used as the
# start point for the wall-time measurement.
self._start_wall_time_ns = time.perf_counter_ns()
# Return this object so `with log_event(...) as ctx` binds it to ctx.
return self

def __exit__(
Expand All @@ -96,6 +101,12 @@ def __exit__(
exec_value: Optional[BaseException],
traceback_type: Optional[TracebackType],
) -> Optional[bool]:
self._torchx_event.cpu_time_usec = (
time.process_time_ns() - self._start_cpu_time_ns
) // 1000
self._torchx_event.wall_time_usec = (
time.perf_counter_ns() - self._start_wall_time_ns
) // 1000
if traceback_type:
self._torchx_event.raw_exception = traceback.format_exc()
record(self._torchx_event)
Expand Down
4 changes: 4 additions & 0 deletions torchx/runner/events/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class TorchxEvent:
image: Image/container bundle that is used to execute request.
runcfg: Run config that was used to schedule app.
source: Type of source the event is generated.
cpu_time_usec: CPU time spent in usec
wall_time_usec: Wall time spent in usec
"""

session: str
Expand All @@ -40,6 +42,8 @@ class TorchxEvent:
runcfg: Optional[str] = None
raw_exception: Optional[str] = None
source: SourceType = SourceType.UNKNOWN
cpu_time_usec: Optional[int] = None
wall_time_usec: Optional[int] = None

def __str__(self) -> str:
return self.serialize()
Expand Down
19 changes: 11 additions & 8 deletions torchx/runner/events/test/lib_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,6 @@ def test_create_context(self, _) -> None:

def test_record_event(self, record_mock: MagicMock) -> None:
cfg = json.dumps({"test_key": "test_value"})
expected_torchx_event = TorchxEvent(
"test_app_id",
"local",
"test_call",
"test_app_id",
app_image="test_app_image_id",
runcfg=cfg,
)
with log_event(
"test_call",
"local",
Expand All @@ -110,6 +102,17 @@ def test_record_event(self, record_mock: MagicMock) -> None:
runcfg=cfg,
) as ctx:
pass

expected_torchx_event = TorchxEvent(
"test_app_id",
"local",
"test_call",
"test_app_id",
app_image="test_app_image_id",
runcfg=cfg,
cpu_time_usec=ctx._torchx_event.cpu_time_usec,
wall_time_usec=ctx._torchx_event.wall_time_usec,
)
self.assert_torchx_event(expected_torchx_event, ctx._torchx_event)

def test_record_event_with_exception(self, record_mock: MagicMock) -> None:
Expand Down

0 comments on commit ac38a8a

Please sign in to comment.