From 4f0e75ba0e78c8c91cbba01b474b55a032562101 Mon Sep 17 00:00:00 2001 From: Maksim Novikov Date: Sat, 17 Jul 2021 11:50:56 +0200 Subject: [PATCH 1/3] Add note on running remote server in readme. Closes: #181 --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b3ab13a2..366bc4bd 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,15 @@ To install tiktorch and start server run: conda create -n tiktorch-server-env -c ilastik-forge -c conda-forge -c pytorch tiktorch conda activate tiktorch-server-env - +``` +To run server locally use +``` tiktorch-server ``` +To be able to connect to remote machine use (this will bind to all available addresses) +``` +tiktorch-server --addr 0.0.0.0 +``` ## Development environment From 8d1a452e059ada6053f6b16c429b1ee17fb3bc49 Mon Sep 17 00:00:00 2001 From: Maksim Novikov Date: Sat, 17 Jul 2021 11:51:36 +0200 Subject: [PATCH 2/3] List available devices on startup This will simplify cuda issues debugging --- tiktorch/server/grpc/__init__.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tiktorch/server/grpc/__init__.py b/tiktorch/server/grpc/__init__.py index 2af48293..f40190d5 100644 --- a/tiktorch/server/grpc/__init__.py +++ b/tiktorch/server/grpc/__init__.py @@ -7,7 +7,7 @@ from tiktorch.proto import data_store_pb2_grpc, inference_pb2_grpc from tiktorch.server.data_store import DataStore -from tiktorch.server.device_pool import TorchDevicePool +from tiktorch.server.device_pool import IDevicePool, TorchDevicePool from tiktorch.server.session_manager import SessionManager from .data_store_servicer import DataStoreServicer @@ -15,6 +15,12 @@ from .inference_servicer import InferenceServicer +def _print_available_devices(device_pool: IDevicePool) -> None: + print("Available devices:") + for device in device_pool.list_devices(): + print(f" * {device.id}") + + def serve(host, port, *, connection_file_path: Optional[str] = None, kill_timeout: Optional[float] = None): """ Starts grpc server on given host and port and writes connection details to json file @@ -37,7 +43,8 @@ def serve(host, port, *, connection_file_path: Optional[str] = None, kill_timeou data_store = DataStore() - inference_svc = InferenceServicer(TorchDevicePool(), SessionManager(), data_store) + device_pool = TorchDevicePool() + inference_svc = InferenceServicer(device_pool, SessionManager(), data_store) fligh_svc = FlightControlServicer(done_evt=done_evt, kill_timeout=kill_timeout) data_svc = DataStoreServicer(data_store) @@ -52,6 +59,7 @@ def serve(host, port, *, connection_file_path: Optional[str] = None, kill_timeou with open(connection_file_path, "w") as conn_file: json.dump({"addr": host, "port": acquired_port}, conn_file) + _print_available_devices(device_pool) server.start() done_evt.wait() From 2c05cdd2663099041313d8b63b4a640f28111a27 Mon Sep 17 00:00:00 2001 From: Maksim Novikov Date: Sat, 17 Jul 2021 12:40:47 +0200 Subject: [PATCH 3/3] Add cuda info to startup log --- tests/test_server/test_device_pool.py | 17 +++++++++++++++++ tiktorch/server/device_pool.py | 17 ++++++++++++++++- tiktorch/server/grpc/__init__.py | 12 +++++++++++- 3 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 tests/test_server/test_device_pool.py diff --git a/tests/test_server/test_device_pool.py b/tests/test_server/test_device_pool.py new file mode 100644 index 00000000..9822b269 --- /dev/null +++ b/tests/test_server/test_device_pool.py @@ -0,0 +1,17 @@ +import pytest +import torch.cuda +import torch.version + +from tiktorch.server.device_pool import TorchDevicePool + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available") +def test_device_pool_with_cuda(): + device_pool = TorchDevicePool() + assert device_pool.cuda_version == torch.version.cuda + + +@pytest.mark.skipif(torch.cuda.is_available(), reason="cuda is available") +def test_device_pool_without_cuda(): + device_pool = TorchDevicePool() + assert device_pool.cuda_version is None diff --git a/tiktorch/server/device_pool.py b/tiktorch/server/device_pool.py index 0a3dad21..6487ce01 100644 --- a/tiktorch/server/device_pool.py +++ b/tiktorch/server/device_pool.py @@ -5,7 +5,7 @@ import threading import uuid from collections import defaultdict -from typing import List +from typing import List, Optional import torch @@ -60,6 +60,14 @@ def devices(self) -> List[IDevice]: class IDevicePool(abc.ABC): + @property + @abc.abstractmethod + def cuda_version(self) -> Optional[str]: + """ + Returns CUDA version if available + """ + ... + @abc.abstractmethod def list_devices(self) -> List[IDevice]: """ @@ -112,6 +120,13 @@ def __init__(self): self.__device_ids_by_lease_id = defaultdict(list) self.__lock = threading.Lock() + @property + def cuda_version(self) -> Optional[str]: + if torch.cuda.is_available(): + return torch.version.cuda # type: ignore + else: + return None + def list_devices(self) -> List[IDevice]: with self.__lock: ids = ["cpu"] diff --git a/tiktorch/server/grpc/__init__.py b/tiktorch/server/grpc/__init__.py index f40190d5..a2132a51 100644 --- a/tiktorch/server/grpc/__init__.py +++ b/tiktorch/server/grpc/__init__.py @@ -1,4 +1,5 @@ import json +import os import threading from concurrent import futures from typing import Optional @@ -16,6 +17,14 @@ def _print_available_devices(device_pool: IDevicePool) -> None: + cuda = device_pool.cuda_version + print() + print("CUDA version:", cuda or "not available") + for env_var, value in os.environ.items(): + if env_var.startswith("CUDA_"): + print(env_var, value.strip() or "") + + print() print("Available devices:") for device in device_pool.list_devices(): print(f" * {device.id}") @@ -47,19 +56,20 @@ def serve(host, port, *, connection_file_path: Optional[str] = None, kill_timeou inference_svc = InferenceServicer(device_pool, SessionManager(), data_store) fligh_svc = FlightControlServicer(done_evt=done_evt, kill_timeout=kill_timeout) data_svc = DataStoreServicer(data_store) + _print_available_devices(device_pool) inference_pb2_grpc.add_InferenceServicer_to_server(inference_svc, server) inference_pb2_grpc.add_FlightControlServicer_to_server(fligh_svc, server) data_store_pb2_grpc.add_DataStoreServicer_to_server(data_svc, server) acquired_port = server.add_insecure_port(f"{host}:{port}") + print() print(f"Starting server on {host}:{acquired_port}") if connection_file_path: print(f"Writing connection data to {connection_file_path}") with open(connection_file_path, "w") as conn_file: json.dump({"addr": host, "port": acquired_port}, conn_file) - _print_available_devices(device_pool) server.start() done_evt.wait()