From 4f0e75ba0e78c8c91cbba01b474b55a032562101 Mon Sep 17 00:00:00 2001
From: Maksim Novikov <mnovikov.work@gmail.com>
Date: Sat, 17 Jul 2021 11:50:56 +0200
Subject: [PATCH 1/3] Add note on running remote server in readme. Closes: #181

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b3ab13a2..366bc4bd 100644
--- a/README.md
+++ b/README.md
@@ -26,9 +26,15 @@ To install tiktorch and start server run:
 conda create -n tiktorch-server-env -c ilastik-forge -c conda-forge -c pytorch tiktorch
 
 conda activate tiktorch-server-env
-
+```
+To run the server locally, use:
+```
 tiktorch-server
 ```
+To be able to connect to the server from a remote machine use (this will bind to all available addresses)
+```
+tiktorch-server --addr 0.0.0.0
+```
 
 ## Development environment
 

From 8d1a452e059ada6053f6b16c429b1ee17fb3bc49 Mon Sep 17 00:00:00 2001
From: Maksim Novikov <mnovikov.work@gmail.com>
Date: Sat, 17 Jul 2021 11:51:36 +0200
Subject: [PATCH 2/3] List available devices on startup

This simplifies debugging of CUDA issues.
---
 tiktorch/server/grpc/__init__.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tiktorch/server/grpc/__init__.py b/tiktorch/server/grpc/__init__.py
index 2af48293..f40190d5 100644
--- a/tiktorch/server/grpc/__init__.py
+++ b/tiktorch/server/grpc/__init__.py
@@ -7,7 +7,7 @@
 
 from tiktorch.proto import data_store_pb2_grpc, inference_pb2_grpc
 from tiktorch.server.data_store import DataStore
-from tiktorch.server.device_pool import TorchDevicePool
+from tiktorch.server.device_pool import IDevicePool, TorchDevicePool
 from tiktorch.server.session_manager import SessionManager
 
 from .data_store_servicer import DataStoreServicer
@@ -15,6 +15,12 @@
 from .inference_servicer import InferenceServicer
 
 
+def _print_available_devices(device_pool: IDevicePool) -> None:
+    print("Available devices:")
+    for device in device_pool.list_devices():
+        print(f"  * {device.id}")
+
+
 def serve(host, port, *, connection_file_path: Optional[str] = None, kill_timeout: Optional[float] = None):
     """
     Starts grpc server on given host and port and writes connection details to json file
@@ -37,7 +43,8 @@ def serve(host, port, *, connection_file_path: Optional[str] = None, kill_timeou
 
     data_store = DataStore()
 
-    inference_svc = InferenceServicer(TorchDevicePool(), SessionManager(), data_store)
+    device_pool = TorchDevicePool()
+    inference_svc = InferenceServicer(device_pool, SessionManager(), data_store)
     fligh_svc = FlightControlServicer(done_evt=done_evt, kill_timeout=kill_timeout)
     data_svc = DataStoreServicer(data_store)
 
@@ -52,6 +59,7 @@ def serve(host, port, *, connection_file_path: Optional[str] = None, kill_timeou
         with open(connection_file_path, "w") as conn_file:
             json.dump({"addr": host, "port": acquired_port}, conn_file)
 
+    _print_available_devices(device_pool)
     server.start()
 
     done_evt.wait()

From 2c05cdd2663099041313d8b63b4a640f28111a27 Mon Sep 17 00:00:00 2001
From: Maksim Novikov <mnovikov.work@gmail.com>
Date: Sat, 17 Jul 2021 12:40:47 +0200
Subject: [PATCH 3/3] Add cuda info to startup log

---
 tests/test_server/test_device_pool.py | 17 +++++++++++++++++
 tiktorch/server/device_pool.py        | 17 ++++++++++++++++-
 tiktorch/server/grpc/__init__.py      | 12 +++++++++++-
 3 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_server/test_device_pool.py

diff --git a/tests/test_server/test_device_pool.py b/tests/test_server/test_device_pool.py
new file mode 100644
index 00000000..9822b269
--- /dev/null
+++ b/tests/test_server/test_device_pool.py
@@ -0,0 +1,17 @@
+import pytest
+import torch.cuda
+import torch.version
+
+from tiktorch.server.device_pool import TorchDevicePool
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
+def test_device_pool_with_cuda():
+    """When CUDA is available the pool reports torch.version.cuda."""
+    assert TorchDevicePool().cuda_version == torch.version.cuda
+
+
+@pytest.mark.skipif(torch.cuda.is_available(), reason="cuda is available")
+def test_device_pool_without_cuda():
+    """When CUDA is not available the pool reports no CUDA version."""
+    assert TorchDevicePool().cuda_version is None
diff --git a/tiktorch/server/device_pool.py b/tiktorch/server/device_pool.py
index 0a3dad21..6487ce01 100644
--- a/tiktorch/server/device_pool.py
+++ b/tiktorch/server/device_pool.py
@@ -5,7 +5,7 @@
 import threading
 import uuid
 from collections import defaultdict
-from typing import List
+from typing import List, Optional
 
 import torch
 
@@ -60,6 +60,14 @@ def devices(self) -> List[IDevice]:
 
 
 class IDevicePool(abc.ABC):
+    @property
+    @abc.abstractmethod
+    def cuda_version(self) -> Optional[str]:
+        """
+        Returns the CUDA version string, or None when CUDA is not available
+        """
+        ...
+
     @abc.abstractmethod
     def list_devices(self) -> List[IDevice]:
         """
@@ -112,6 +120,13 @@ def __init__(self):
         self.__device_ids_by_lease_id = defaultdict(list)
         self.__lock = threading.Lock()
 
+    @property
+    def cuda_version(self) -> Optional[str]:
+        """Return torch.version.cuda, or None when torch.cuda is not available."""
+        if not torch.cuda.is_available():
+            return None
+        return torch.version.cuda  # type: ignore
+
     def list_devices(self) -> List[IDevice]:
         with self.__lock:
             ids = ["cpu"]
diff --git a/tiktorch/server/grpc/__init__.py b/tiktorch/server/grpc/__init__.py
index f40190d5..a2132a51 100644
--- a/tiktorch/server/grpc/__init__.py
+++ b/tiktorch/server/grpc/__init__.py
@@ -1,4 +1,5 @@
 import json
+import os
 import threading
 from concurrent import futures
 from typing import Optional
@@ -16,6 +17,14 @@
 
 
 def _print_available_devices(device_pool: IDevicePool) -> None:
+    # Report the CUDA version and every CUDA_* environment variable up front.
+    cuda_version = device_pool.cuda_version
+    print()
+    print("CUDA version:", cuda_version if cuda_version else "not available")
+    cuda_env = {k: v for k, v in os.environ.items() if k.startswith("CUDA_")}
+    for name, raw_value in cuda_env.items():
+        print(name, raw_value.strip() or "<empty>")
+    print()
     print("Available devices:")
     for device in device_pool.list_devices():
         print(f"  * {device.id}")
@@ -47,19 +56,20 @@ def serve(host, port, *, connection_file_path: Optional[str] = None, kill_timeou
     inference_svc = InferenceServicer(device_pool, SessionManager(), data_store)
     fligh_svc = FlightControlServicer(done_evt=done_evt, kill_timeout=kill_timeout)
     data_svc = DataStoreServicer(data_store)
+    _print_available_devices(device_pool)
 
     inference_pb2_grpc.add_InferenceServicer_to_server(inference_svc, server)
     inference_pb2_grpc.add_FlightControlServicer_to_server(fligh_svc, server)
     data_store_pb2_grpc.add_DataStoreServicer_to_server(data_svc, server)
 
     acquired_port = server.add_insecure_port(f"{host}:{port}")
+    print()
     print(f"Starting server on {host}:{acquired_port}")
     if connection_file_path:
         print(f"Writing connection data to {connection_file_path}")
         with open(connection_file_path, "w") as conn_file:
             json.dump({"addr": host, "port": acquired_port}, conn_file)
 
-    _print_available_devices(device_pool)
     server.start()
 
     done_evt.wait()