Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/CUDA: Runtime CUDA >= 12.3 to enable VMM #10396

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 42 additions & 18 deletions src/uct/cuda/cuda_copy/cuda_copy_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
#define UCT_CUDA_DEV_NAME_MAX_LEN 64
#define UCT_CUDA_MAX_DEVICES 32

#define UCT_CUDA_VERSION_VMM 12030 /* for VMM: cuCtxSetFlags() >= cuda 12.1 */
#define UCT_CUDA_MAJOR(_version) ((_version) / 1000)
#define UCT_CUDA_MINOR(_version) (((_version) % 1000) / 10)


static const char *uct_cuda_pref_loc[] = {
[UCT_CUDA_PREF_LOC_CPU] = "cpu",
Expand Down Expand Up @@ -515,22 +519,27 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
static void
uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address)
{
unsigned value = 1;

#if HAVE_CUDA_FABRIC
ucs_status_t status;
if (!md->sync_memops_set) {
/* Synchronize future DMA operations for all memory types */
status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
if (status == UCS_OK) {
md->sync_memops_set = 1;
if (md->config.cuda_ctx_set_flags) {
if (!md->sync_memops_set) {
/* Synchronize future DMA operations for all memory types */
status = UCT_CUDADRV_FUNC_LOG_WARN(cuCtxSetFlags(CU_CTX_SYNC_MEMOPS));
rakhmets marked this conversation as resolved.
Show resolved Hide resolved
if (status == UCS_OK) {
md->sync_memops_set = 1;
}
}

return;
}
#else
unsigned value = 1;
#endif

/* Synchronize for DMA for legacy memory types*/
UCT_CUDADRV_FUNC_LOG_WARN(
cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
(CUdeviceptr)address));
#endif
}

static ucs_status_t
Expand Down Expand Up @@ -830,7 +839,7 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
uct_cuda_copy_md_config_t *config = ucs_derived_of(md_config,
uct_cuda_copy_md_config_t);
uct_cuda_copy_md_t *md;
int dmabuf_supported;
int dmabuf_supported, version;
ucs_status_t status;

md = ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t");
Expand All @@ -840,15 +849,30 @@ uct_cuda_copy_md_open(uct_component_t *component, const char *md_name,
goto err;
}

md->super.ops = &md_ops;
md->super.component = &uct_cuda_copy_component;
md->config.alloc_whole_reg = config->alloc_whole_reg;
md->config.max_reg_ratio = config->max_reg_ratio;
md->config.pref_loc = config->pref_loc;
md->config.enable_fabric = config->enable_fabric;
md->config.dmabuf_supported = 0;
md->sync_memops_set = 0;
md->granularity = SIZE_MAX;
md->super.ops = &md_ops;
md->super.component = &uct_cuda_copy_component;
md->config.alloc_whole_reg = config->alloc_whole_reg;
md->config.max_reg_ratio = config->max_reg_ratio;
md->config.pref_loc = config->pref_loc;
md->config.enable_fabric = config->enable_fabric;
md->config.dmabuf_supported = 0;
md->config.cuda_ctx_set_flags = 1;
md->sync_memops_set = 0;
md->granularity = SIZE_MAX;

if ((cuDriverGetVersion(&version) == CUDA_SUCCESS) &&
(version < UCT_CUDA_VERSION_VMM)) {
if (md->config.enable_fabric != UCS_NO) {
ucs_warn("disabled fabric memory allocations as cuda driver "
"library %d.%d < %d.%d",
UCT_CUDA_MAJOR(version), UCT_CUDA_MINOR(version),
UCT_CUDA_MAJOR(UCT_CUDA_VERSION_VMM),
UCT_CUDA_MINOR(UCT_CUDA_VERSION_VMM));
}

md->config.enable_fabric = UCS_NO;
md->config.cuda_ctx_set_flags = 0;
}

if ((config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA) &&
(config->cuda_async_mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED)) {
Expand Down
2 changes: 2 additions & 0 deletions src/uct/cuda/cuda_copy/cuda_copy_md.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ typedef struct uct_cuda_copy_md {
ucs_ternary_auto_value_t enable_fabric;
uct_cuda_pref_loc_t pref_loc;
int cuda_async_managed;
int cuda_ctx_set_flags; /* missing cuCtxSetFlags()
below CUDA 12.1 */
} config;
} uct_cuda_copy_md_t;

Expand Down
Loading