[Runtime] Allow query of available device memory through DeviceAPI (#16994)

* [Runtime] Allow query of available device memory through DeviceAPI

Prior to this commit, the total device memory could be queried through
the `DeviceAPI` interface, but the currently available device memory
could not.  This functionality may be useful for debugging, or for
validating available memory prior to model execution.

This commit implements the property `Device.available_global_memory`,
which queries `DeviceAttrKind::kAvailableGlobalMemory`.  As with all
device attribute queries, support may vary across backends, and the
property returns `None` for backends that do not support it.  This
commit currently implements `kAvailableGlobalMemory` only for TVM's
CUDA backend.
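
A minimal usage sketch of the new property (assumes a CUDA-enabled
build; the device index and printed values are illustrative):

    import tvm

    dev = tvm.cuda(0)
    total = dev.total_global_memory
    avail = dev.available_global_memory  # None if the backend lacks support
    if avail is not None:
        print(f"{avail} of {total} bytes free")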

* Updated docstring to fix copy/paste typo

* Lint fix, cover all enum values in switch/case

* Fix ROCm compilation warning
Lunderberg authored May 19, 2024
1 parent afb6416 commit 3cd6673
Showing 7 changed files with 86 additions and 22 deletions.
1 change: 1 addition & 0 deletions include/tvm/runtime/device_api.h
@@ -51,6 +51,7 @@ enum DeviceAttrKind : int {
   kDriverVersion = 12,
   kL2CacheSizeBytes = 13,
   kTotalGlobalMemory = 14,
+  kAvailableGlobalMemory = 15,
 };
 
 #ifdef TVM_KALLOC_ALIGNMENT
16 changes: 15 additions & 1 deletion python/tvm/_ffi/runtime_ctypes.py
@@ -539,11 +539,25 @@ def total_global_memory(self):
         Returns
         -------
         total_global_memory : int or None
-            Return the global memory available on device in bytes.
+            Return the total size of global memory on device in bytes.
             Return None if the device does not support this feature.
         """
         return self._GetDeviceAttr(self.device_type, self.device_id, 14)
 
+    @property
+    def available_global_memory(self):
+        """Return size of the available global memory.
+
+        Supported devices include CUDA.
+
+        Returns
+        -------
+        available_global_memory : int or None
+            Return the amount of unallocated global memory on device in bytes.
+            Return None if the device does not support this feature.
+        """
+        return self._GetDeviceAttr(self.device_type, self.device_id, 15)
+
     def texture_spatial_limit(self):
         """Returns limits for textures by spatial dimensions
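
As the commit message notes, one intended use is validating available
memory prior to model execution.  A sketch of that pattern (the helper
name and byte estimate are hypothetical, not part of this commit):

    import tvm

    def fits_on_device(dev, required_bytes):
        # Hypothetical helper: best-effort check against free device memory.
        avail = dev.available_global_memory
        if avail is None:
            # Backend cannot report free memory; proceed optimistically.
            return True
        return avail >= required_bytes

    dev = tvm.cuda(0)
    if not fits_on_device(dev, 512 * 1024 * 1024):  # e.g. a 512 MiB workload
        raise MemoryError("Not enough free device memory for this model")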
6 changes: 6 additions & 0 deletions src/runtime/cuda/cuda_device_api.cc
@@ -121,6 +121,12 @@ class CUDADeviceAPI final : public DeviceAPI {
       *rv = total_global_memory;
       return;
     }
+    case kAvailableGlobalMemory: {
+      size_t free_mem, total_mem;
+      CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+      *rv = static_cast<int64_t>(free_mem);
+      return;
+    }
   }
   *rv = value;
 }
6 changes: 6 additions & 0 deletions src/runtime/opencl/opencl_device_api.cc
@@ -214,6 +214,12 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = static_cast<int64_t>(total_global_memory);
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.  Based on
+      // https://stackoverflow.com/a/3568223, this may not be
+      // implementable at all through the OpenCL API.
+      break;
   }
 }
4 changes: 4 additions & 0 deletions src/runtime/rocm/rocm_device_api.cc
@@ -136,6 +136,10 @@ class ROCMDeviceAPI final : public DeviceAPI {
       *rv = total_global_memory;
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.
+      break;
   }
   *rv = value;
 }
5 changes: 5 additions & 0 deletions src/runtime/vulkan/vulkan_device_api.cc
@@ -168,6 +168,11 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = device(index).compute_memory_size;
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.  Will only be implementable for
+      // devices that support the VK_EXT_memory_budget extension.
+      break;
   }
 }
70 changes: 49 additions & 21 deletions tests/python/all-platform-minimal-test/test_runtime_ndarray.py
@@ -16,33 +16,63 @@
 # under the License.
 """Basic runtime enablement test."""
 
-import tvm
-from tvm import te
+import math
+
+import pytest
 import numpy as np
 
+import tvm
+import tvm.testing
+from tvm import te
+
+dtype = tvm.testing.parameter("uint8", "int8", "uint16", "int16", "uint32", "int32", "float32")
+
+
+def test_nd_create(target, dev, dtype):
+    x = np.random.randint(0, 10, size=(3, 4))
+    x = np.array(x, dtype=dtype)
+    y = tvm.nd.array(x, device=dev)
+    z = y.copyto(dev)
+    assert y.dtype == x.dtype
+    assert y.shape == x.shape
+    assert isinstance(y, tvm.nd.NDArray)
+    np.testing.assert_equal(x, y.numpy())
+    np.testing.assert_equal(x, z.numpy())
+
+    # no need here, just to test usability
+    dev.sync()
+
+
+def test_memory_usage(target, dev, dtype):
+    available_memory_before = dev.available_global_memory
+    if available_memory_before is None:
+        pytest.skip(reason=f"Target '{target}' does not support queries of available memory")
+
+    arr = tvm.nd.empty([1024, 1024], dtype=dtype, device=dev)
+    available_memory_after = dev.available_global_memory
+
+    num_elements = math.prod(arr.shape)
+    element_nbytes = tvm.runtime.DataType(dtype).itemsize()
+    expected_memory_after = available_memory_before - num_elements * element_nbytes
+
+    # Allocations may be padded out to provide alignment, to match a
+    # page boundary, due to additional device-side bookkeeping
+    # required by the TVM backend or the driver, etc.  Therefore, the
+    # available memory may decrease by more than the requested amount.
+    assert available_memory_after <= expected_memory_after
+
+    # TVM's NDArray type is a reference-counted handle to the
+    # underlying allocation.  After the last reference to an NDArray
+    # is cleared, the backing allocation will be freed.
+    del arr
+
+    assert dev.available_global_memory == available_memory_before
 
-@tvm.testing.uses_gpu
-def test_nd_create():
-    for target, dev in tvm.testing.enabled_targets():
-        for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32", "float32"]:
-            x = np.random.randint(0, 10, size=(3, 4))
-            x = np.array(x, dtype=dtype)
-            y = tvm.nd.array(x, device=dev)
-            z = y.copyto(dev)
-            assert y.dtype == x.dtype
-            assert y.shape == x.shape
-            assert isinstance(y, tvm.nd.NDArray)
-            np.testing.assert_equal(x, y.numpy())
-            np.testing.assert_equal(x, z.numpy())
-            # no need here, just to test usablity
-            dev.sync()
 
 def test_fp16_conversion():
     n = 100
 
-    for (src, dst) in [("float32", "float16"), ("float16", "float32")]:
+    for src, dst in [("float32", "float16"), ("float16", "float32")]:
         A = te.placeholder((n,), dtype=src)
         B = te.compute((n,), lambda i: A[i].astype(dst))

@@ -66,6 +96,4 @@ def test_dtype():
 
 
 if __name__ == "__main__":
-    test_nd_create()
-    test_fp16_conversion()
-    test_dtype()
+    tvm.testing.main()
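
For a concrete sense of the arithmetic in `test_memory_usage` above, a
sketch of the expected lower bound on the drop in available memory (the
exact post-allocation value is device- and driver-dependent due to
padding and bookkeeping):

    import math
    import tvm

    shape = [1024, 1024]
    dtype = "float32"
    num_elements = math.prod(shape)                          # 1048576
    element_nbytes = tvm.runtime.DataType(dtype).itemsize()  # 4
    print(num_elements * element_nbytes)                     # 4194304 bytes (4 MiB)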
