[Runtime] Allow query of available device memory through DeviceAPI (#16994)

* [Runtime] Allow query of available device memory through DeviceAPI

Prior to this commit, the total device memory could be queried through
the `DeviceAPI` interface, but the currently available device memory
could not.  This functionality may be useful for debugging, or for
validating available memory prior to model execution.

This commit implements the property `Device.available_global_memory`,
which queries `DeviceAttrKind::kAvailableGlobalMemory`.  As with all
device attribute queries, support may vary across backends, and the
property returns `None` for backends that do not support it.  This
commit currently implements `kAvailableGlobalMemory` only for TVM's
CUDA backend.
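
A minimal usage sketch of the new property (assumes a CUDA-enabled
build; the device index and printed values are illustrative):

    import tvm

    dev = tvm.cuda(0)
    total = dev.total_global_memory
    avail = dev.available_global_memory  # None if the backend lacks support
    if avail is not None:
        print(f"{avail} of {total} bytes free")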

* Updated docstring to fix copy/paste typo

* Lint fix, cover all enum values in switch/case

* Fix ROCm compilation warning
Lunderberg authored May 19, 2024
1 parent afb6416 commit 3cd6673
Showing 7 changed files with 86 additions and 22 deletions.
1 change: 1 addition & 0 deletions include/tvm/runtime/device_api.h
@@ -51,6 +51,7 @@ enum DeviceAttrKind : int {
   kDriverVersion = 12,
   kL2CacheSizeBytes = 13,
   kTotalGlobalMemory = 14,
+  kAvailableGlobalMemory = 15,
 };
 
 #ifdef TVM_KALLOC_ALIGNMENT
16 changes: 15 additions & 1 deletion python/tvm/_ffi/runtime_ctypes.py
@@ -539,11 +539,25 @@ def total_global_memory(self):
         Returns
         -------
         total_global_memory : int or None
-            Return the global memory available on device in bytes.
+            Return the total size of global memory on device in bytes.
             Return None if the device does not support this feature.
         """
         return self._GetDeviceAttr(self.device_type, self.device_id, 14)
 
+    @property
+    def available_global_memory(self):
+        """Return size of the available global memory.
+
+        Supported devices include CUDA.
+
+        Returns
+        -------
+        available_global_memory : int or None
+            Return the amount of unallocated global memory on device in bytes.
+            Return None if the device does not support this feature.
+        """
+        return self._GetDeviceAttr(self.device_type, self.device_id, 15)
+
     def texture_spatial_limit(self):
         """Returns limits for textures by spatial dimensions
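
As the commit message notes, one intended use is validating available
memory prior to model execution.  A sketch of that pattern (the helper
name and byte estimate are hypothetical, not part of this commit):

    import tvm

    def fits_on_device(dev, required_bytes):
        # Hypothetical helper: best-effort check against free device memory.
        avail = dev.available_global_memory
        if avail is None:
            # Backend cannot report free memory; proceed optimistically.
            return True
        return avail >= required_bytes

    dev = tvm.cuda(0)
    if not fits_on_device(dev, 512 * 1024 * 1024):  # e.g. a 512 MiB workload
        raise MemoryError("Not enough free device memory for this model")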
6 changes: 6 additions & 0 deletions src/runtime/cuda/cuda_device_api.cc
@@ -121,6 +121,12 @@ class CUDADeviceAPI final : public DeviceAPI {
       *rv = total_global_memory;
       return;
     }
+    case kAvailableGlobalMemory: {
+      size_t free_mem, total_mem;
+      CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+      *rv = static_cast<int64_t>(free_mem);
+      return;
+    }
   }
   *rv = value;
 }
6 changes: 6 additions & 0 deletions src/runtime/opencl/opencl_device_api.cc
@@ -214,6 +214,12 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = static_cast<int64_t>(total_global_memory);
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.  Based on
+      // https://stackoverflow.com/a/3568223, this may not be
+      // implementable at all through the OpenCL API.
+      break;
   }
 }
4 changes: 4 additions & 0 deletions src/runtime/rocm/rocm_device_api.cc
@@ -136,6 +136,10 @@ class ROCMDeviceAPI final : public DeviceAPI {
       *rv = total_global_memory;
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.
+      break;
   }
   *rv = value;
 }
5 changes: 5 additions & 0 deletions src/runtime/vulkan/vulkan_device_api.cc
@@ -168,6 +168,11 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = device(index).compute_memory_size;
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.  Will only be implementable for
+      // devices that support the VK_EXT_memory_budget extension.
+      break;
   }
 }
70 changes: 49 additions & 21 deletions tests/python/all-platform-minimal-test/test_runtime_ndarray.py
@@ -16,33 +16,63 @@
 # under the License.
 """Basic runtime enablement test."""
 
-import tvm
-from tvm import te
+import math
+
+import pytest
 import numpy as np
 
+import tvm
+import tvm.testing
+from tvm import te
+
+dtype = tvm.testing.parameter("uint8", "int8", "uint16", "int16", "uint32", "int32", "float32")
+
+
+def test_nd_create(target, dev, dtype):
+    x = np.random.randint(0, 10, size=(3, 4))
+    x = np.array(x, dtype=dtype)
+    y = tvm.nd.array(x, device=dev)
+    z = y.copyto(dev)
+    assert y.dtype == x.dtype
+    assert y.shape == x.shape
+    assert isinstance(y, tvm.nd.NDArray)
+    np.testing.assert_equal(x, y.numpy())
+    np.testing.assert_equal(x, z.numpy())
+
+    # no need here, just to test usability
+    dev.sync()
+
+
+def test_memory_usage(target, dev, dtype):
+    available_memory_before = dev.available_global_memory
+    if available_memory_before is None:
+        pytest.skip(reason=f"Target '{target}' does not support queries of available memory")
+
+    arr = tvm.nd.empty([1024, 1024], dtype=dtype, device=dev)
+    available_memory_after = dev.available_global_memory
+
+    num_elements = math.prod(arr.shape)
+    element_nbytes = tvm.runtime.DataType(dtype).itemsize()
+    expected_memory_after = available_memory_before - num_elements * element_nbytes
+
+    # Allocations may be padded out to provide alignment, to match a
+    # page boundary, due to additional device-side bookkeeping
+    # required by the TVM backend or the driver, etc.  Therefore, the
+    # available memory may decrease by more than the requested amount.
+    assert available_memory_after <= expected_memory_after
+
+    # TVM's NDArray type is a reference-counted handle to the
+    # underlying allocation.  After the last reference to an NDArray
+    # is cleared, the backing allocation will be freed.
+    del arr
+
+    assert dev.available_global_memory == available_memory_before
 
-@tvm.testing.uses_gpu
-def test_nd_create():
-    for target, dev in tvm.testing.enabled_targets():
-        for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32", "float32"]:
-            x = np.random.randint(0, 10, size=(3, 4))
-            x = np.array(x, dtype=dtype)
-            y = tvm.nd.array(x, device=dev)
-            z = y.copyto(dev)
-            assert y.dtype == x.dtype
-            assert y.shape == x.shape
-            assert isinstance(y, tvm.nd.NDArray)
-            np.testing.assert_equal(x, y.numpy())
-            np.testing.assert_equal(x, z.numpy())
-            # no need here, just to test usablity
-            dev.sync()
 
 def test_fp16_conversion():
     n = 100
 
-    for (src, dst) in [("float32", "float16"), ("float16", "float32")]:
+    for src, dst in [("float32", "float16"), ("float16", "float32")]:
         A = te.placeholder((n,), dtype=src)
         B = te.compute((n,), lambda i: A[i].astype(dst))

@@ -66,6 +96,4 @@ def test_dtype():
 
 
 if __name__ == "__main__":
-    test_nd_create()
-    test_fp16_conversion()
-    test_dtype()
+    tvm.testing.main()
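
For a concrete sense of the arithmetic in `test_memory_usage` above, a
sketch of the expected lower bound on the drop in available memory (the
exact post-allocation value is device- and driver-dependent due to
padding and bookkeeping):

    import math
    import tvm

    shape = [1024, 1024]
    dtype = "float32"
    num_elements = math.prod(shape)                          # 1048576
    element_nbytes = tvm.runtime.DataType(dtype).itemsize()  # 4
    print(num_elements * element_nbytes)                     # 4194304 bytes (4 MiB)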
