From 0befd4ec92b132245fef945454fb19104b40f59c Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 19 Oct 2022 12:08:43 -0300 Subject: [PATCH 01/23] dsl: Creates CupyAllocator class --- devito/data/allocators.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index d940a006bf..397d4c5f65 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -5,6 +5,7 @@ import os import sys +import cupy as cp import numpy as np import ctypes from ctypes.util import find_library @@ -15,7 +16,7 @@ __all__ = ['ALLOC_FLAT', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY', 'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD', - 'default_allocator'] + 'CUPY_ALLOC', 'default_allocator'] class MemoryAllocator(object): @@ -317,6 +318,33 @@ def put_local(self): return self._node == 'local' +class CupyAllocator(MemoryAllocator): + + """ + Memory allocator based on ``posix`` functions. The allocated memory is + aligned to page boundaries. + """ + + is_Posix = True + + def __init__(self): + cp.cuda.set_allocator(cp.cuda.MemoryPool(cp.cuda.malloc_managed).malloc) + + @classmethod + def initialize(cls): + pass + + + def _alloc_C_libcall(self, size, ctype): + + mem_obj = cp.zeros(size, dtype=cp.float64) + return mem_obj.data.ptr, mem_obj + + def free(self, c_pointer): + pass + + + class ExternalAllocator(MemoryAllocator): """ @@ -373,6 +401,7 @@ def alloc(self, shape, dtype): ALLOC_KNL_MCDRAM = NumaAllocator(1) ALLOC_NUMA_ANY = NumaAllocator('any') ALLOC_NUMA_LOCAL = NumaAllocator('local') +CUPY_ALLOC = CupyAllocator() custom_allocators = {} """User-defined allocators.""" From db8736288ec109612bbe8f79bf2395b5e9687496 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 19 Oct 2022 12:13:01 -0300 Subject: [PATCH 02/23] misc: Fix indentation --- devito/data/allocators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 397d4c5f65..c812393d09 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -344,7 +344,6 @@ def free(self, c_pointer): pass - class ExternalAllocator(MemoryAllocator): """ From ef1f36828c68e1af7d78af8799eb61fdeb5cdfc9 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 19 Oct 2022 12:23:19 -0300 Subject: [PATCH 03/23] dsl: Fix del method allowing the dealocation of the Cupy data --- devito/data/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/devito/data/data.py b/devito/data/data.py index 046b162342..be1c7e2bcb 100644 --- a/devito/data/data.py +++ b/devito/data/data.py @@ -3,7 +3,7 @@ import numpy as np -from devito.data.allocators import ALLOC_FLAT +from devito.data.allocators import ALLOC_FLAT, CUPY_ALLOC from devito.data.utils import * from devito.logger import warning from devito.parameters import configuration @@ -82,7 +82,8 @@ def __del__(self): # Dask/Distributed context), which may (re)create a Data object # without going through `__array_finalize__` return - self._allocator.free(*self._memfree_args) + if self._allocator is not CUPY_ALLOC: + self._allocator.free(*self._memfree_args) self._memfree_args = None def __reduce__(self): From ca806b3fb54541f86a7510427e1701ec561615b5 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 19 Oct 2022 13:52:17 -0300 Subject: [PATCH 04/23] dsl: Changes that exclude copyin and copyout pragmas from source code if it is CupyAllocator --- devito/passes/iet/definitions.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py index eb088cc3e6..6aee8f73ed 100644 --- a/devito/passes/iet/definitions.py +++ b/devito/passes/iet/definitions.py @@ -20,6 +20,8 @@ from devito.tools import as_mapper, as_list, as_tuple, filter_sorted, flatten from devito.types import DeviceRM, Symbol +from devito.data.allocators import CUPY_ALLOC + __all__ = ['DataManager', 'DeviceAwareDataManager', 'Storage'] @@ -435,6 +437,9 @@ def _map_function_on_high_bw_mem(self, site, obj, storage, devicerm, read_only=F """ mmap = self.lang._map_to(obj) + if obj._allocator is CUPY_ALLOC: + return + if read_only is False: unmap = [self.lang._map_update(obj), self.lang._map_release(obj, devicerm=devicerm)] From 50cd5340c28dd60f8eee8229b753be662998d108 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 26 Oct 2022 19:53:17 -0300 Subject: [PATCH 05/23] dsl: Remove the part of the code that makes the source code be generate without openacc pragmas --- devito/passes/iet/definitions.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py index 6aee8f73ed..749dcd00aa 100644 --- a/devito/passes/iet/definitions.py +++ b/devito/passes/iet/definitions.py @@ -437,9 +437,6 @@ def _map_function_on_high_bw_mem(self, site, obj, storage, devicerm, read_only=F """ mmap = self.lang._map_to(obj) - if obj._allocator is CUPY_ALLOC: - return - if read_only is False: unmap = [self.lang._map_update(obj), self.lang._map_release(obj, devicerm=devicerm)] From 539254cc7b97ba80f46885dbc933a8c3ee166965 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 26 Oct 2022 20:04:10 -0300 Subject: [PATCH 06/23] dsl: Change from CUPY_ALLOC to ALLOC_CUPY --- devito/data/allocators.py | 4 ++-- devito/data/data.py | 4 ++-- devito/passes/iet/definitions.py | 2 -- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index c812393d09..eaf5a2c5a5 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -16,7 +16,7 @@ __all__ = ['ALLOC_FLAT', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY', 'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD', - 'CUPY_ALLOC', 'default_allocator'] + 'ALLOC_CUPY', 'default_allocator'] class MemoryAllocator(object): @@ -400,7 +400,7 @@ def alloc(self, shape, dtype): ALLOC_KNL_MCDRAM = NumaAllocator(1) ALLOC_NUMA_ANY = NumaAllocator('any') ALLOC_NUMA_LOCAL = NumaAllocator('local') -CUPY_ALLOC = CupyAllocator() +ALLOC_CUPY = CupyAllocator() custom_allocators = {} """User-defined allocators.""" diff --git a/devito/data/data.py b/devito/data/data.py index be1c7e2bcb..d00cb1d761 100644 --- a/devito/data/data.py +++ b/devito/data/data.py @@ -3,7 +3,7 @@ import numpy as np -from devito.data.allocators import ALLOC_FLAT, CUPY_ALLOC +from devito.data.allocators import ALLOC_FLAT, ALLOC_CUPY from devito.data.utils import * from devito.logger import warning from devito.parameters import configuration @@ -82,7 +82,7 @@ def __del__(self): # Dask/Distributed context), which may (re)create a Data object # without going through `__array_finalize__` return - if self._allocator is not CUPY_ALLOC: + if self._allocator is not ALLOC_CUPY: self._allocator.free(*self._memfree_args) self._memfree_args = None diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py index 749dcd00aa..eb088cc3e6 100644 --- a/devito/passes/iet/definitions.py +++ b/devito/passes/iet/definitions.py @@ -20,8 +20,6 @@ from devito.tools import as_mapper, as_list, as_tuple, filter_sorted, flatten from devito.types import DeviceRM, Symbol -from devito.data.allocators import CUPY_ALLOC - __all__ = ['DataManager', 'DeviceAwareDataManager', 'Storage'] From ddb5991db92dd0f618614cfa4c17c7fdf02ab7ff Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Thu, 27 Oct 2022 10:31:25 -0300 Subject: [PATCH 07/23] dsl: Update CupyAllocator's mem_free_args as a tuple, allowing removal of conditional inside 'del' method --- devito/data/allocators.py | 4 ++-- devito/data/data.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index eaf5a2c5a5..bd7b554dcb 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -338,9 +338,9 @@ def initialize(cls): def _alloc_C_libcall(self, size, ctype): mem_obj = cp.zeros(size, dtype=cp.float64) - return mem_obj.data.ptr, mem_obj + return mem_obj.data.ptr, (mem_obj,) - def free(self, c_pointer): + def free(self, _): pass diff --git a/devito/data/data.py b/devito/data/data.py index d00cb1d761..046b162342 100644 --- a/devito/data/data.py +++ b/devito/data/data.py @@ -3,7 +3,7 @@ import numpy as np -from devito.data.allocators import ALLOC_FLAT, ALLOC_CUPY +from devito.data.allocators import ALLOC_FLAT from devito.data.utils import * from devito.logger import warning from devito.parameters import configuration @@ -82,8 +82,7 @@ def __del__(self): # Dask/Distributed context), which may (re)create a Data object # without going through `__array_finalize__` return - if self._allocator is not ALLOC_CUPY: - self._allocator.free(*self._memfree_args) + self._allocator.free(*self._memfree_args) self._memfree_args = None def __reduce__(self): From d337ac85e98b70f37bcd76b396b44cc136d136d2 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Thu, 27 Oct 2022 14:47:27 -0300 Subject: [PATCH 08/23] misc: Fix indentation and comments --- devito/data/allocators.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index bd7b554dcb..78e7ebc106 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -321,12 +321,9 @@ def put_local(self): class CupyAllocator(MemoryAllocator): """ - Memory allocator based on ``posix`` functions. The allocated memory is - aligned to page boundaries. + Memory allocator based on Unified Memory concept. The allocation is made using Cupy. """ - is_Posix = True - def __init__(self): cp.cuda.set_allocator(cp.cuda.MemoryPool(cp.cuda.malloc_managed).malloc) @@ -334,9 +331,7 @@ def __init__(self): def initialize(cls): pass - def _alloc_C_libcall(self, size, ctype): - mem_obj = cp.zeros(size, dtype=cp.float64) return mem_obj.data.ptr, (mem_obj,) From 6511b06d4c8dd972509513ad36cd8d5b88abf005 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Fri, 4 Nov 2022 13:16:31 -0300 Subject: [PATCH 09/23] dsl: Update free method inside CupyAllocator --- devito/data/allocators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 78e7ebc106..35f128e906 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -325,7 +325,8 @@ class CupyAllocator(MemoryAllocator): """ def __init__(self): - cp.cuda.set_allocator(cp.cuda.MemoryPool(cp.cuda.malloc_managed).malloc) + self.mempool = cp.cuda.MemoryPool(cp.cuda.malloc_managed) + cp.cuda.set_allocator(self.mempool.malloc) @classmethod def initialize(cls): @@ -336,7 +337,7 @@ def _alloc_C_libcall(self, size, ctype): return mem_obj.data.ptr, (mem_obj,) def free(self, _): - pass + self.mempool.free_all_blocks() class ExternalAllocator(MemoryAllocator): From ce12f56ffedb1b872bd13fd429c5323f5fdf035c Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Tue, 7 Feb 2023 14:17:12 -0300 Subject: [PATCH 10/23] tests: Add test to unified memory allocator --- tests/test_data.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/test_data.py b/tests/test_data.py index 0a03e7a4c6..eaae9912fb 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,8 +1,9 @@ import pytest import numpy as np +import cupy as cp from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Dimension, # noqa - Eq, Operator, ALLOC_GUARD, ALLOC_FLAT, configuration, switchconfig) + Eq, Operator, ALLOC_GUARD, ALLOC_FLAT, ALLOC_CUPY, configuration, switchconfig) from devito.data import LEFT, RIGHT, Decomposition, loc_data_idx, convert_index from devito.tools import as_tuple from devito.types import Scalar @@ -206,6 +207,26 @@ def test_indexing_into_sparse(self): sf.data[1:-1, 0] = np.arange(8) assert np.all(sf.data[1:-1, 0] == np.arange(8)) + def test_uma_allocation(self): + """ + Test Unified Memory allocation. + """ + nt = 5 + grid = Grid(shape=(4, 4, 4)) + + u = Function(name='u', grid=grid, allocator=ALLOC_CUPY ) + u.data[:] = 5 + address = u.data.ctypes.data + pointerAttr = cp.cuda.runtime.pointerGetAttributes(address) + assert pointerAttr.devicePointer == pointerAttr.hostPointer + + v = TimeFunction(name='v', grid=grid, save=nt, allocator=ALLOC_CUPY ) + v.data[:] = 5 + address = v.data.ctypes.data + pointerAttr = cp.cuda.runtime.pointerGetAttributes(address) + assert pointerAttr.devicePointer == pointerAttr.hostPointer + + class TestLocDataIDX(object): """ From 3ce03baceaf542f26e5f4a7eb6e3b61d32f94f34 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 8 Mar 2023 10:34:47 -0300 Subject: [PATCH 11/23] dsl: Add conditional import for Cupy module --- devito/data/allocators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 35f128e906..79dee296b5 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -5,7 +5,6 @@ import os import sys -import cupy as cp import numpy as np import ctypes from ctypes.util import find_library @@ -14,6 +13,11 @@ from devito.parameters import configuration from devito.tools import dtype_to_ctype +try: + import cupy as cp +except: + cp = None + __all__ = ['ALLOC_FLAT', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY', 'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD', 'ALLOC_CUPY', 'default_allocator'] @@ -325,9 +329,12 @@ class CupyAllocator(MemoryAllocator): """ def __init__(self): + if not cp: + raise ImportError("Couldn't find `cupy` to " + "allocate memory") self.mempool = cp.cuda.MemoryPool(cp.cuda.malloc_managed) cp.cuda.set_allocator(self.mempool.malloc) - + @classmethod def initialize(cls): pass From f4231e24e7642ad3d10d7a8698408f0e93e7c0b4 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 8 Mar 2023 11:13:03 -0300 Subject: [PATCH 12/23] test: Update tests adding a class responsible for test external and uma allocators --- tests/test_data.py | 91 +++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index eaae9912fb..b6679abed8 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -207,26 +207,6 @@ def test_indexing_into_sparse(self): sf.data[1:-1, 0] = np.arange(8) assert np.all(sf.data[1:-1, 0] == np.arange(8)) - def test_uma_allocation(self): - """ - Test Unified Memory allocation. - """ - nt = 5 - grid = Grid(shape=(4, 4, 4)) - - u = Function(name='u', grid=grid, allocator=ALLOC_CUPY ) - u.data[:] = 5 - address = u.data.ctypes.data - pointerAttr = cp.cuda.runtime.pointerGetAttributes(address) - assert pointerAttr.devicePointer == pointerAttr.hostPointer - - v = TimeFunction(name='v', grid=grid, save=nt, allocator=ALLOC_CUPY ) - v.data[:] = 5 - address = v.data.ctypes.data - pointerAttr = cp.cuda.runtime.pointerGetAttributes(address) - assert pointerAttr.devicePointer == pointerAttr.hostPointer - - class TestLocDataIDX(object): """ @@ -1494,6 +1474,52 @@ def test_gather_time_function(self): assert ans == np.array(None) +class TestAllocators(object): + + def test_uma_allocation(self): + """ + Test Unified Memory allocation. + """ + nt = 5 + grid = Grid(shape=(4, 4, 4)) + + u = Function(name='u', grid=grid, allocator=ALLOC_CUPY ) + u.data[:] = 5 + address = u.data.ctypes.data + pointerAttr = cp.cuda.runtime.pointerGetAttributes(address) + assert pointerAttr.devicePointer == pointerAttr.hostPointer + + v = TimeFunction(name='v', grid=grid, save=nt, allocator=ALLOC_CUPY ) + v.data[:] = 5 + address = v.data.ctypes.data + pointerAttr = cp.cuda.runtime.pointerGetAttributes(address) + assert pointerAttr.devicePointer == pointerAttr.hostPointer + + def test_external_allocator(self): + shape = (2, 2) + space_order = 0 + numpy_array = np.ones(shape, dtype=np.float32) + g = Grid(shape) + f = Function(name='f', space_order=space_order, grid=g, + allocator=ExternalAllocator(numpy_array), initializer=lambda x: None) + + # Ensure the two arrays have the same value + assert(np.array_equal(f.data, numpy_array)) + + # Ensure the original numpy array is unchanged + assert(np.array_equal(numpy_array, np.ones(shape, dtype=np.float32))) + + # Change the underlying numpy array + numpy_array[:] = 3. + # Ensure the function.data changes too + assert(np.array_equal(f.data, numpy_array)) + + # Change the function.data + f.data[:] = 4. + # Ensure the underlying numpy array changes too + assert(np.array_equal(f.data, numpy_array)) + + def test_scalar_arg_substitution(): """ Tests the relaxed (compared to other devito sympy subclasses) @@ -1540,31 +1566,6 @@ def test_numpy_c_contiguous(): assert(u._data_allocated.flags.c_contiguous) -def test_external_allocator(): - shape = (2, 2) - space_order = 0 - numpy_array = np.ones(shape, dtype=np.float32) - g = Grid(shape) - f = Function(name='f', space_order=space_order, grid=g, - allocator=ExternalAllocator(numpy_array), initializer=lambda x: None) - - # Ensure the two arrays have the same value - assert(np.array_equal(f.data, numpy_array)) - - # Ensure the original numpy array is unchanged - assert(np.array_equal(numpy_array, np.ones(shape, dtype=np.float32))) - - # Change the underlying numpy array - numpy_array[:] = 3. - # Ensure the function.data changes too - assert(np.array_equal(f.data, numpy_array)) - - # Change the function.data - f.data[:] = 4. - # Ensure the underlying numpy array changes too - assert(np.array_equal(f.data, numpy_array)) - - def test_boolean_masking_array(): """ Test truth value of array, raised in Python 3.9 (MFE for issue #1788) From c4444a12452e3c79491c6c29e5c9557bbee66f2f Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Thu, 13 Apr 2023 15:09:48 -0300 Subject: [PATCH 13/23] dsl: Changing import cupy from init() to initialize() --- devito/data/allocators.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 79dee296b5..7d6db16496 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -327,24 +327,27 @@ class CupyAllocator(MemoryAllocator): """ Memory allocator based on Unified Memory concept. The allocation is made using Cupy. """ - - def __init__(self): - if not cp: - raise ImportError("Couldn't find `cupy` to " - "allocate memory") - self.mempool = cp.cuda.MemoryPool(cp.cuda.malloc_managed) - cp.cuda.set_allocator(self.mempool.malloc) + _mempool = None @classmethod def initialize(cls): - pass + try: + import cupy as cp + cls.lib = cp + cls._mempool = cls.lib.cuda.MemoryPool(cls.lib.cuda.malloc_managed) + cls.lib.cuda.set_allocator(cls._mempool.malloc) + except: + cls.lib = None def _alloc_C_libcall(self, size, ctype): + if not self.available(): + raise ImportError("Couldn't find `cupy` to " + "allocate memory") mem_obj = cp.zeros(size, dtype=cp.float64) return mem_obj.data.ptr, (mem_obj,) def free(self, _): - self.mempool.free_all_blocks() + self._mempool.free_all_blocks() class ExternalAllocator(MemoryAllocator): @@ -447,6 +450,7 @@ def default_allocator(name=None): Custom allocators may be added with `register_allocator`. """ + return ALLOC_CUPY if name is not None: try: return custom_allocators[name] From f3f90c16e1e48c27cd8c7fb25a1d29e1111fe60e Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Mon, 24 Apr 2023 16:18:59 -0300 Subject: [PATCH 14/23] dsl: Update to fix the problem when ALLOC_CUPY tries to alloc data with size = 0 (MPI) --- devito/data/allocators.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 7d6db16496..2d94a0030b 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -81,10 +81,13 @@ def alloc(self, shape, dtype): if c_pointer is None: raise RuntimeError("Unable to allocate %d elements in memory", str(size)) - # cast to 1D array of the specified size - ctype_1d = ctype * size - buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents - pointer = np.frombuffer(buf, dtype=dtype) + if c_pointer: + # cast to 1D array of the specified size + ctype_1d = ctype * size + buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents + pointer = np.frombuffer(buf, dtype=dtype) + else: + pointer = np.empty(shape = (0), dtype=dtype) # pointer.reshape should not be used here because it may introduce a copy # From https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html: # It is not always possible to change the shape of an array without copying the @@ -450,7 +453,6 @@ def default_allocator(name=None): Custom allocators may be added with `register_allocator`. """ - return ALLOC_CUPY if name is not None: try: return custom_allocators[name] From 41838aef9c087f83f051fdc43b225dd63eca5327 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Thu, 11 May 2023 16:40:30 -0300 Subject: [PATCH 15/23] dsl: Update CupyAllocator to run at multiples nodes using MPI --- devito/data/allocators.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 2d94a0030b..703e894e6e 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -13,10 +13,11 @@ from devito.parameters import configuration from devito.tools import dtype_to_ctype -try: - import cupy as cp -except: - cp = None +# try: +# from mpi4py import MPI # noqa +# except ImportError: +# MPI = None + __all__ = ['ALLOC_FLAT', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY', 'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD', @@ -336,17 +337,22 @@ class CupyAllocator(MemoryAllocator): def initialize(cls): try: import cupy as cp + from mpi4py import MPI # noqa + cls.lib = cp cls._mempool = cls.lib.cuda.MemoryPool(cls.lib.cuda.malloc_managed) cls.lib.cuda.set_allocator(cls._mempool.malloc) + n_gpu = cls.lib.cuda.runtime.getDeviceCount() + if MPI.Is_initialized(): + rank_local = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED).Get_rank() + cls.lib.cuda.runtime.setDevice(rank_local%n_gpu) except: cls.lib = None def _alloc_C_libcall(self, size, ctype): if not self.available(): - raise ImportError("Couldn't find `cupy` to " - "allocate memory") - mem_obj = cp.zeros(size, dtype=cp.float64) + raise ImportError("Couldn't initialize cupy or MPI elements of alocation") + mem_obj = self.lib.zeros(size, dtype=self.lib.float64) return mem_obj.data.ptr, (mem_obj,) def free(self, _): @@ -453,6 +459,7 @@ def default_allocator(name=None): Custom allocators may be added with `register_allocator`. """ + return ALLOC_CUPY if name is not None: try: return custom_allocators[name] From 241e44453da44eac0898abe58c155e8a463ba844 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 17 May 2023 18:01:37 -0300 Subject: [PATCH 16/23] dsl: Fix CupyAllocator to properly support MPI execution. --- devito/data/allocators.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 703e894e6e..54bdf27d88 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -335,19 +335,31 @@ class CupyAllocator(MemoryAllocator): @classmethod def initialize(cls): + try: import cupy as cp - from mpi4py import MPI # noqa - cls.lib = cp - cls._mempool = cls.lib.cuda.MemoryPool(cls.lib.cuda.malloc_managed) - cls.lib.cuda.set_allocator(cls._mempool.malloc) - n_gpu = cls.lib.cuda.runtime.getDeviceCount() - if MPI.Is_initialized(): - rank_local = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED).Get_rank() - cls.lib.cuda.runtime.setDevice(rank_local%n_gpu) + cls._initialize_shared_memory() + try: + from mpi4py import MPI + cls.MPI = MPI + cls._set_device_for_mpi() + except: + cls.MPI = None except: cls.lib = None + + @classmethod + def _initialize_shared_memory(cls): + cls._mempool = cls.lib.cuda.MemoryPool(cls.lib.cuda.malloc_managed) + cls.lib.cuda.set_allocator(cls._mempool.malloc) + + @classmethod + def _set_device_for_mpi(cls): + if cls.MPI.Is_initialized(): + n_gpu = cls.lib.cuda.runtime.getDeviceCount() + rank_local = cls.MPI.COMM_WORLD.Split_type(cls.MPI.COMM_TYPE_SHARED).Get_rank() + cls.lib.cuda.runtime.setDevice(rank_local % n_gpu) def _alloc_C_libcall(self, size, ctype): if not self.available(): @@ -459,7 +471,6 @@ def default_allocator(name=None): Custom allocators may be added with `register_allocator`. """ - return ALLOC_CUPY if name is not None: try: return custom_allocators[name] From e724ffb5e74506ea12e6514a3cdd7f86ce748df4 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Wed, 17 May 2023 18:09:34 -0300 Subject: [PATCH 17/23] misc: Fix indentation --- devito/data/allocators.py | 10 +++++----- tests/test_data.py | 9 +++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 54bdf27d88..6b4cbbe6e4 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -88,7 +88,7 @@ def alloc(self, shape, dtype): buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents pointer = np.frombuffer(buf, dtype=dtype) else: - pointer = np.empty(shape = (0), dtype=dtype) + pointer = np.empty(shape=(0), dtype=dtype) # pointer.reshape should not be used here because it may introduce a copy # From https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html: # It is not always possible to change the shape of an array without copying the @@ -343,12 +343,12 @@ def initialize(cls): try: from mpi4py import MPI cls.MPI = MPI - cls._set_device_for_mpi() + cls._set_device_for_mpi() except: cls.MPI = None except: cls.lib = None - + @classmethod def _initialize_shared_memory(cls): cls._mempool = cls.lib.cuda.MemoryPool(cls.lib.cuda.malloc_managed) @@ -358,8 +358,8 @@ def _initialize_shared_memory(cls): def _set_device_for_mpi(cls): if cls.MPI.Is_initialized(): n_gpu = cls.lib.cuda.runtime.getDeviceCount() - rank_local = cls.MPI.COMM_WORLD.Split_type(cls.MPI.COMM_TYPE_SHARED).Get_rank() - cls.lib.cuda.runtime.setDevice(rank_local % n_gpu) + rank_l = cls.MPI.COMM_WORLD.Split_type(cls.MPI.COMM_TYPE_SHARED).Get_rank() + cls.lib.cuda.runtime.setDevice(rank_l % n_gpu) def _alloc_C_libcall(self, size, ctype): if not self.available(): diff --git a/tests/test_data.py b/tests/test_data.py index b6679abed8..801d74aa41 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -3,7 +3,8 @@ import cupy as cp from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Dimension, # noqa - Eq, Operator, ALLOC_GUARD, ALLOC_FLAT, ALLOC_CUPY, configuration, switchconfig) + Eq, Operator, ALLOC_GUARD, ALLOC_FLAT, ALLOC_CUPY, + configuration, switchconfig) from devito.data import LEFT, RIGHT, Decomposition, loc_data_idx, convert_index from devito.tools import as_tuple from devito.types import Scalar @@ -1483,13 +1484,13 @@ def test_uma_allocation(self): nt = 5 grid = Grid(shape=(4, 4, 4)) - u = Function(name='u', grid=grid, allocator=ALLOC_CUPY ) + u = Function(name='u', grid=grid, allocator=ALLOC_CUPY) u.data[:] = 5 address = u.data.ctypes.data pointerAttr = cp.cuda.runtime.pointerGetAttributes(address) assert pointerAttr.devicePointer == pointerAttr.hostPointer - v = TimeFunction(name='v', grid=grid, save=nt, allocator=ALLOC_CUPY ) + v = TimeFunction(name='v', grid=grid, save=nt, allocator=ALLOC_CUPY) v.data[:] = 5 address = v.data.ctypes.data pointerAttr = cp.cuda.runtime.pointerGetAttributes(address) @@ -1501,7 +1502,7 @@ def test_external_allocator(self): numpy_array = np.ones(shape, dtype=np.float32) g = Grid(shape) f = Function(name='f', space_order=space_order, grid=g, - allocator=ExternalAllocator(numpy_array), initializer=lambda x: None) + allocator=ExternalAllocator(numpy_array), initializer=lambda x: None) # Ensure the two arrays have the same value assert(np.array_equal(f.data, numpy_array)) From 9379b31b8b5ad020995bb459e10108902db5ae25 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Fri, 8 Mar 2024 12:26:34 -0300 Subject: [PATCH 18/23] misc: Removes unwanted leftover comments. --- devito/data/allocators.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 6b4cbbe6e4..84c8417a7b 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -13,11 +13,6 @@ from devito.parameters import configuration from devito.tools import dtype_to_ctype -# try: -# from mpi4py import MPI # noqa -# except ImportError: -# MPI = None - __all__ = ['ALLOC_FLAT', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY', 'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD', From 7814a46f68f8e0f04bcbb8966721ea7f9c374365 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Fri, 8 Mar 2024 12:33:45 -0300 Subject: [PATCH 19/23] dsl: Update the way MPI is imported at CupyAllocator --- devito/data/allocators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 84c8417a7b..8701c285bf 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -336,7 +336,7 @@ def initialize(cls): cls.lib = cp cls._initialize_shared_memory() try: - from mpi4py import MPI + from devito.mpi import MPI cls.MPI = MPI cls._set_device_for_mpi() except: From 6df7a06e03aab98bdffbfbdca535c8c09bfe1146 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Fri, 8 Mar 2024 12:35:37 -0300 Subject: [PATCH 20/23] misc: Add explanatory comment --- devito/data/allocators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 8701c285bf..c070ee9ead 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -82,8 +82,14 @@ def alloc(self, shape, dtype): ctype_1d = ctype * size buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents pointer = np.frombuffer(buf, dtype=dtype) + + # During the execution in MPI, domain splitting can generate a situation where + # the allocated data size is zero, as we have observed with Sparse Functions. + # When this occurs, Cupy returns a pointer with a value of zero. This + # conditional statement was defined for this case. else: pointer = np.empty(shape=(0), dtype=dtype) + # pointer.reshape should not be used here because it may introduce a copy # From https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html: # It is not always possible to change the shape of an array without copying the From 76dcdb153c88f7beb7ef8be3b38c89102c2f3059 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Fri, 8 Mar 2024 12:47:59 -0300 Subject: [PATCH 21/23] dsl: Update "except" to "except ImportError". Other errors should be caught --- devito/data/allocators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index c070ee9ead..64317d8fac 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -345,9 +345,9 @@ def initialize(cls): from devito.mpi import MPI cls.MPI = MPI cls._set_device_for_mpi() - except: + except ImportError: cls.MPI = None - except: + except ImportError: cls.lib = None @classmethod From 6ad66113a2cbbbe79e230b0338491a9c377f564e Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Fri, 8 Mar 2024 13:00:13 -0300 Subject: [PATCH 22/23] tests: Update memory allocator test to use skipif('nodevice') --- tests/test_data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_data.py b/tests/test_data.py index 801d74aa41..712b36ffcb 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,6 +1,5 @@ import pytest import numpy as np -import cupy as cp from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Dimension, # noqa Eq, Operator, ALLOC_GUARD, ALLOC_FLAT, ALLOC_CUPY, @@ -10,6 +9,7 @@ from devito.types import Scalar from devito.data.allocators import ExternalAllocator +from conftest import skipif class TestDataBasic(object): @@ -1477,10 +1477,13 @@ def test_gather_time_function(self): class TestAllocators(object): + @skipif('nodevice') def test_uma_allocation(self): """ Test Unified Memory allocation. """ + import cupy as cp + nt = 5 grid = Grid(shape=(4, 4, 4)) From 92ba35c3542ad0a758f2948825f00a7c5a7d4368 Mon Sep 17 00:00:00 2001 From: Gustavo Coelho Date: Mon, 18 Mar 2024 14:29:29 -0300 Subject: [PATCH 23/23] dsl: Update of the way data type allocation is defined --- devito/data/allocators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/data/allocators.py b/devito/data/allocators.py index 64317d8fac..4ba29e141a 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -365,7 +365,7 @@ def _set_device_for_mpi(cls): def _alloc_C_libcall(self, size, ctype): if not self.available(): raise ImportError("Couldn't initialize cupy or MPI elements of alocation") - mem_obj = self.lib.zeros(size, dtype=self.lib.float64) + mem_obj = self.lib.zeros(size, dtype=ctype) return mem_obj.data.ptr, (mem_obj,) def free(self, _):