From 37e99580315c1f7ed17f4bb5ee24d55181b55c99 Mon Sep 17 00:00:00 2001 From: Leo Prokh Date: Wed, 6 Sep 2023 19:02:11 +0300 Subject: [PATCH 1/4] Init cudagraph Co-authored-by: Mit Kotak <53411468+mitkotak@users.noreply.github.com> Co-authored-by: Gerlof Fokkema --- doc/driver.rst | 125 ++++++++++++++ examples/demo_graph.py | 57 +++++++ pycuda/driver.py | 121 ++++++++------ src/cpp/cuda.hpp | 305 ++++++++++++++++++++++++++++++++--- src/wrapper/wrap_cudadrv.cpp | 56 +++++++ test/test_graph.py | 151 +++++++++++++++++ 6 files changed, 747 insertions(+), 68 deletions(-) create mode 100644 examples/demo_graph.py create mode 100644 test/test_graph.py diff --git a/doc/driver.rst b/doc/driver.rst index 6e6d65f5..870ff2b7 100644 --- a/doc/driver.rst +++ b/doc/driver.rst @@ -605,6 +605,22 @@ Constants .. attribute:: LAZY_ENABLE_PEER_ACCESS +.. class:: capture_mode + + CUDA 10 and newer. + + .. attribute:: GLOBAL + .. attribute:: THREAD_LOCAL + .. attribute:: RELAXED + +.. class:: capture_status + + CUDA 10 and newer. + + .. attribute:: NONE + .. attribute:: ACTIVE + .. attribute:: INVALIDATED + Graphics-related constants ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -845,6 +861,43 @@ Concurrency and Streams .. versionadded:: 2011.1 + .. method:: begin_capture(capture_mode=capture_mode.GLOBAL) + + Begins graph stream capture on a stream. + + When a stream is in capture mode, all operations pushed into the stream + will not be executed, but will instead be captured into a graph. + + :arg capture_mode: A :class:`capture_mode` specifying mode for capturing graph. + + CUDA 10 and above. + + .. method:: end_capture() + + Ends stream capture and returns a :class:`Graph` object. + + CUDA 10 and above. + + .. method:: get_capture_info_v2() + + Query a stream's capture state. + + Return a :class:`tuple` of (:class:`capture_status` capture status, :class:`int` id for the capture sequence, + :class:`Graph` the graph being captured into, a :class:`list` of :class:`GraphNode` specifying set of nodes the + next node to be captured in the stream will depend on) + + CUDA 10 and above. + + .. method:: update_capture_dependencies(dependencies, flags) + + Modifies the dependency set of a capturing stream. + The dependency set is the set of nodes that the next captured node in the stream will depend on. + + :arg dependencies: A :class:`list` of :class:`GraphNode` specifying the new list of dependencies. + :arg flags: A :class:`int` controlling whether the set passed to the API is added to the existing set or replaces it. + + CUDA 11.3 and above. + .. class:: Event(flags=0) An event is a temporal 'marker' in a :class:`Stream` that allows taking the time @@ -895,6 +948,78 @@ Concurrency and Streams .. versionadded: 2011.2 +CUDAGraphs +---------- + +CUDA 10.0 and above + +Launching a simple kernel using CUDAGraphs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. literalinclude:: ../examples/cudagraph_kernel.py + +.. class:: GraphNode + + An object representing a node on :class:`Graph`. + + Wraps `cuGraphNode ` + +.. class:: GraphExec + + An executable graph to be launched on a stream. + + Wraps `cuGraphExec `_ + + .. method:: launch(stream_py=None) + + Launches an executable graph in a stream. + + :arg stream_py: :class:`Stream` object specifying device stream. + Will use default stream if *stream_py* is None. + + .. method:: kernel_node_set_params(*args, kernel_node, func=None, block=(), grid=(), shared_mem_bytes=0) + + Sets a kernel node's parameters. Refer to :meth:`add_kernel_node` for argument specifications. + +.. class:: Graph() + + A cudagraph is a data dependency graph meant to + serve as an alternative to :class:`Stream`. + + Wraps `cuGraph ` + + .. method:: add_kernel_node(*args, func, block, grid=(1, ), dependencies=[], shared_mem_bytes=0) + + Returns and adds a :class:`GraphNode` object specifying + kernel node to the graph. + + Will be placed at the root of the graph if dependencies + are not specified. + + :arg args: *arg1* through *argn* are the positional C arguments to the kernel. + See :meth:`Function.__call__` for more argument details. + + :arg func: a :class:`Function`object specifying kernel function. + + :arg block: a :class:`tuple` of up to three integer entries specifying the number + of thread blocks to launch, as a multi-dimensional grid. + + :arg grid: a :class:`tuple` of up to three integer entries specifying the grid configuration. + + :arg dependencies: A :class:`list` of :class:`GraphNode` objects specifying dependency nodes. + + :arg shared_mem_bytes: A :class:`int` specifying size of shared memory. + + .. method:: instantiate() + + Returns and instantiates a :class:`GraphExec` object. + + .. method:: debug_dot_print(path) + + Returns a DOT file describing graph structure at specifed path. + + :arg path: String specifying path for saving DOT file. + Memory ------ diff --git a/examples/demo_graph.py b/examples/demo_graph.py new file mode 100644 index 00000000..57e1f3ce --- /dev/null +++ b/examples/demo_graph.py @@ -0,0 +1,57 @@ +# Sample source code from the Tutorial Introduction in the documentation. +import pycuda.driver as cuda +import pycuda.autoinit # noqa +from pycuda.compiler import SourceModule + +mod = SourceModule(""" + __global__ void plus(float *a, int num) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] += num; + } + __global__ void times(float *a, float *b) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] *= b[idx]; + } + """) +func_plus = mod.get_function("plus") +func_times = mod.get_function("times") + +import numpy +a = numpy.zeros((4, 4)).astype(numpy.float32) +a_gpu = cuda.mem_alloc_like(a) +b = numpy.zeros((4, 4)).astype(numpy.float32) +b_gpu = cuda.mem_alloc_like(b) +result = numpy.zeros_like(b) +b2_gpu = cuda.mem_alloc_like(b) + +stream_1 = cuda.Stream() +stream_1.begin_capture() +cuda.memcpy_htod_async(a_gpu, a, stream_1) +cuda.memcpy_htod_async(b_gpu, b, stream_1) +cuda.memcpy_htod_async(b2_gpu, b, stream_1) +func_plus(a_gpu, numpy.int32(2), block=(4, 4, 1), stream=stream_1) +_, _, graph, deps = stream_1.get_capture_info_v2() +first_node = graph.add_kernel_node(b_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) +stream_1.update_capture_dependencies([first_node], 1) + +_, _, graph, deps = stream_1.get_capture_info_v2() +second_node = graph.add_kernel_node(a_gpu, b_gpu, block=(4, 4, 1), func=func_times, dependencies=deps) +stream_1.update_capture_dependencies([second_node], 1) +cuda.memcpy_dtoh_async(result, a_gpu, stream_1) + +graph = stream_1.end_capture() +graph.debug_dot_print("test.dot") # print dotfile of graph +instance = graph.instantiate() + +# Setting dynamic parameters +instance.kernel_node_set_params(b2_gpu, numpy.int32(100), block=(4, 4, 1), func=func_plus, kernel_node=first_node) +instance.kernel_node_set_params(a_gpu, b2_gpu, block=(4, 4, 1), func=func_times, kernel_node=second_node) +instance.launch() + +print("original arrays:") +print(a) +print(b) +print("(0+2)x(0+100) = 200, using a kernel graph of 3 kernels:") +print(result) \ No newline at end of file diff --git a/pycuda/driver.py b/pycuda/driver.py index 4bce8347..17e3b2b5 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -159,6 +159,54 @@ def post_call(self, stream): class InOut(In, Out): pass +from functools import lru_cache + +@lru_cache(maxsize=None) +def _build_arg_buf(args): + handlers = [] + + arg_data = [] + format = "" + for i, arg in enumerate(args): + if isinstance(arg, np.number): + arg_data.append(arg) + format += arg.dtype.char + elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)): + arg_data.append(int(arg)) + format += "P" + elif isinstance(arg, ArgumentHandler): + handlers.append(arg) + arg_data.append(int(arg.get_device_alloc())) + format += "P" + elif isinstance(arg, np.ndarray): + if isinstance(arg.base, ManagedAllocationOrStub): + arg_data.append(int(arg.base)) + format += "P" + else: + arg_data.append(arg) + format += "%ds" % arg.nbytes + elif isinstance(arg, np.void): + arg_data.append(_my_bytes(_memoryview(arg))) + format += "%ds" % arg.itemsize + else: + cai = getattr(arg, "__cuda_array_interface__", None) + if cai: + arg_data.append(cai["data"][0]) + format += "P" + continue + + try: + gpudata = np.uintp(arg.gpudata) + except AttributeError: + raise TypeError("invalid type on parameter #%d (0-based)" % i) + else: + # for gpuarrays + arg_data.append(int(gpudata)) + format += "P" + + from pycuda._pvt_struct import pack + + return handlers, pack(format, *arg_data) def _add_functionality(): def device_get_attributes(dev): @@ -187,52 +235,6 @@ def device_get_attributes(dev): def device___getattr__(dev, name): return dev.get_attribute(getattr(device_attribute, name.upper())) - def _build_arg_buf(args): - handlers = [] - - arg_data = [] - format = "" - for i, arg in enumerate(args): - if isinstance(arg, np.number): - arg_data.append(arg) - format += arg.dtype.char - elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)): - arg_data.append(int(arg)) - format += "P" - elif isinstance(arg, ArgumentHandler): - handlers.append(arg) - arg_data.append(int(arg.get_device_alloc())) - format += "P" - elif isinstance(arg, np.ndarray): - if isinstance(arg.base, ManagedAllocationOrStub): - arg_data.append(int(arg.base)) - format += "P" - else: - arg_data.append(arg) - format += "%ds" % arg.nbytes - elif isinstance(arg, np.void): - arg_data.append(_my_bytes(_memoryview(arg))) - format += "%ds" % arg.itemsize - else: - cai = getattr(arg, "__cuda_array_interface__", None) - if cai: - arg_data.append(cai["data"][0]) - format += "P" - continue - - try: - gpudata = np.uintp(arg.gpudata) - except AttributeError: - raise TypeError("invalid type on parameter #%d (0-based)" % i) - else: - # for gpuarrays - arg_data.append(int(gpudata)) - format += "P" - - from pycuda._pvt_struct import pack - - return handlers, pack(format, *arg_data) - # {{{ pre-CUDA 4 call interface (stateful) def function_param_set_pre_v4(func, *args): @@ -710,6 +712,33 @@ def new_func(*args, **kwargs): _add_functionality() +# {{{ cudagraph + +def patch_cudagraph(): + def graph_add_kernel_node_call(graph, *args, func, block, grid=(1, ), dependencies=[], shared_mem_bytes=0): + if func is None: + raise ValueError("must specify func") + if block is None: + raise ValueError("must specify block size") + _, arg_buf = _build_arg_buf(args) + return graph._add_kernel_node(dependencies, func, grid, block, arg_buf, shared_mem_bytes) + + def exec_graph_set_kernel_node_call(exec_graph, *args, kernel_node, func, block, grid=(1, ), shared_mem_bytes=0): + if kernel_node is None: + raise ValueError("must specify kernel_node") + if func is None: + raise ValueError("must specify func") + if block is None: + raise ValueError("must specify block size") + _, arg_buf = _build_arg_buf(args) + return exec_graph._kernel_node_set_params(kernel_node, func, grid, block, arg_buf, shared_mem_bytes) + + Graph.add_kernel_node = graph_add_kernel_node_call + GraphExec.kernel_node_set_params = exec_graph_set_kernel_node_call +if get_version() >= (10,): + patch_cudagraph() + +# }}} # {{{ pagelocked numpy arrays diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index 18079ab4..07fd1c29 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -408,6 +408,41 @@ namespace pycuda #endif // }}} + // {{{ helpers + + template + inline T* vector_data_ptr(std::vector& v) + { return v.empty() ? nullptr : &v[0]; } + + inline + void preprocess_grid_block(py::tuple block_dim_py, unsigned int block_dim[], + py::tuple grid_dim_py, unsigned int grid_dim[], + const unsigned axis_count) { + for (unsigned i = 0; i < axis_count; ++i) + { + grid_dim[i] = 1; + block_dim[i] = 1; + } + + pycuda_size_t gd_length = py::len(grid_dim_py); + if (gd_length > axis_count) + throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE, + "too many grid dimensions in kernel launch"); + + for (unsigned i = 0; i < gd_length; ++i) + grid_dim[i] = py::extract(grid_dim_py[i]); + + pycuda_size_t bd_length = py::len(block_dim_py); + if (bd_length > axis_count) + throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE, + "too many block dimensions in kernel launch"); + + for (unsigned i = 0; i < bd_length; ++i) + block_dim[i] = py::extract(block_dim_py[i]); + } + + // }}} + // {{{ device class context; class primary_context; @@ -993,6 +1028,8 @@ namespace pycuda // {{{ stream class event; + class graph; + class graph_node; class stream : public boost::noncopyable, public context_dependent { @@ -1024,6 +1061,14 @@ namespace pycuda #if CUDAPP_CUDA_VERSION >= 3020 void wait_for_event(const event &evt); +#endif +#if CUDAPP_CUDA_VERSION >= 10000 + void begin_capture(CUstreamCaptureMode mode); + graph *end_capture(); + py::tuple get_capture_info_v2(); +#if CUDAPP_CUDA_VERSION >= 11030 + void update_capture_dependencies(py::list deps_py, unsigned int flags); +#endif #endif bool is_done() const @@ -1447,28 +1492,7 @@ namespace pycuda const unsigned axis_count = 3; unsigned grid_dim[axis_count]; unsigned block_dim[axis_count]; - - for (unsigned i = 0; i < axis_count; ++i) - { - grid_dim[i] = 1; - block_dim[i] = 1; - } - - pycuda_size_t gd_length = py::len(grid_dim_py); - if (gd_length > axis_count) - throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE, - "too many grid dimensions in kernel launch"); - - for (unsigned i = 0; i < gd_length; ++i) - grid_dim[i] = py::extract(grid_dim_py[i]); - - pycuda_size_t bd_length = py::len(block_dim_py); - if (bd_length > axis_count) - throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE, - "too many block dimensions in kernel launch"); - - for (unsigned i = 0; i < bd_length; ++i) - block_dim[i] = py::extract(block_dim_py[i]); + preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count); PYCUDA_PARSE_STREAM_PY; @@ -1500,6 +1524,9 @@ namespace pycuda } #endif + CUfunction handle() const + { return m_function; } + }; inline @@ -1512,6 +1539,240 @@ namespace pycuda // }}} + // {{{ graph +#if CUDAPP_CUDA_VERSION >= 10000 + class graph_exec : public boost::noncopyable, public context_dependent + { + private: + CUgraphExec m_exec; + + public: + graph_exec(CUgraphExec exec) + : m_exec(exec) + { } + + ~graph_exec() { + free(); + } + + void free() { + try + { + scoped_context_activation ca(get_context()); + CUDAPP_CALL_GUARDED_CLEANUP(cuGraphExecDestroy,(m_exec)); + } + CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(graph_exec); + release_context(); + } + + void launch(py::object stream_py) + { + PYCUDA_PARSE_STREAM_PY; + CUDAPP_CALL_GUARDED(cuGraphLaunch, (m_exec, s_handle)) + } + + void kernel_node_set_params(py::object graph_node_py, py::object function_py, + py::tuple grid_dim_py, py::tuple block_dim_py, + py::object parameter_buffer, + unsigned shared_mem_bytes); + }; + + class graph_node : public boost::noncopyable, public context_dependent + { + private: + CUgraphNode m_node; + + public: + graph_node(CUgraphNode node) + : m_node(node) + { } + + bool operator==(const graph_node& rhs) + { + return m_node == rhs.m_node; + } + + bool operator!=(const graph_node& rhs) + { + return !(*this == rhs); + } + + CUgraphNode handle() const + { return m_node; } + }; + + inline py::list array_of_nodes_to_list(const CUgraphNode* nodes, size_t length) { + py::list list_nodes; + for (size_t i = 0; i < length; ++i) { + graph_node* pre_to_python_node = new graph_node(nodes[i]); + list_nodes.append(boost::ref(pre_to_python_node)); + } + return list_nodes; + } + + inline std::vector list_to_vector_of_nodes(py::list list_nodes) { + std::vector v; + for (ssize_t i = 0; i < len(list_nodes); ++i) { + graph_node* node = boost::python::extract(list_nodes[i]); + v.push_back(node->handle()); + } + return v; + } + + class graph : public boost::noncopyable, public context_dependent + { + private: + CUgraph m_graph; + + public: + graph(CUgraph graph) + : m_graph(graph) + { } + + bool operator==(const graph& rhs) + { + return m_graph == rhs.m_graph; + } + + bool operator!=(const graph& rhs) + { + return !(*this == rhs); + } + + graph_exec *instantiate() + { + CUgraphExec instance; +#if CUDAPP_CUDA_VERSION >= 12000 + CUDAPP_CALL_GUARDED(cuGraphInstantiate, (&instance, m_graph, 0)) +#else + CUDAPP_CALL_GUARDED(cuGraphInstantiate, (&instance, m_graph, nullptr, nullptr, 0)) +#endif + return new graph_exec(instance); + } + + graph_node *add_kernel_node(py::list deps_py, py::object function_py, + py::tuple grid_dim_py, py::tuple block_dim_py, py::object parameter_buffer, + unsigned shared_mem_bytes); + + void debug_dot_print(std::string path) + { + CUDAPP_CALL_GUARDED(cuGraphDebugDotPrint, (m_graph, path.c_str(), 1<<1)) + } + + CUgraph handle() const + { return m_graph; } + }; + + inline void stream::begin_capture(CUstreamCaptureMode mode = CU_STREAM_CAPTURE_MODE_GLOBAL) + { + CUDAPP_CALL_GUARDED(cuStreamBeginCapture, (m_stream, mode)); + } + + inline graph *stream::end_capture() + { + CUgraph result; + CUDAPP_CALL_GUARDED(cuStreamEndCapture, (m_stream, &result)) + return new graph(result); + } + + inline py::tuple stream::get_capture_info_v2() + { + CUgraph _capturing_graph; + CUstreamCaptureStatus _capture_status; + const CUgraphNode *_deps; + size_t _dep_count; + uint64_t _id_out; + CUDAPP_CALL_GUARDED(cuStreamGetCaptureInfo_v2, (m_stream, &_capture_status, &_id_out, &_capturing_graph, &_deps, &_dep_count)); + py::list list_root_nodes = array_of_nodes_to_list(_deps, _dep_count); + graph *node = new graph(_capturing_graph); + return py::make_tuple(_capture_status, _id_out, boost::ref(node), list_root_nodes); + } + +#if CUDAPP_CUDA_VERSION >= 11030 + inline void stream::update_capture_dependencies(py::list deps_py, unsigned int flags) + { + std::vector deps = list_to_vector_of_nodes(deps_py); + CUDAPP_CALL_GUARDED(cuStreamUpdateCaptureDependencies, (m_stream, vector_data_ptr(deps), len(deps_py), flags)); + } +#endif + + inline graph_node *graph::add_kernel_node(py::list deps_py, py::object function_py, + py::tuple grid_dim_py, py::tuple block_dim_py, py::object parameter_buffer, + unsigned shared_mem_bytes) + { + const unsigned axis_count = 3; + unsigned grid_dim[axis_count]; + unsigned block_dim[axis_count]; + preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count); + + py_buffer_wrapper par_buf_wrapper; + par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS); + size_t par_len = par_buf_wrapper.m_buf.len; + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast(par_buf_wrapper.m_buf.buf), + CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len, + CU_LAUNCH_PARAM_END + }; + + CUgraphNode result; + function &fn = py::extract(function_py); + std::vector deps = list_to_vector_of_nodes(deps_py); + + CUDA_KERNEL_NODE_PARAMS _dynamic_params_cuda = { 0 }; + _dynamic_params_cuda.blockDimX = block_dim[0]; + _dynamic_params_cuda.blockDimY = block_dim[1]; + _dynamic_params_cuda.blockDimZ = block_dim[2]; + _dynamic_params_cuda.extra = config; + _dynamic_params_cuda.func = fn.handle(); + _dynamic_params_cuda.gridDimX = grid_dim[0]; + _dynamic_params_cuda.gridDimY = grid_dim[1]; + _dynamic_params_cuda.gridDimZ = grid_dim[2]; + _dynamic_params_cuda.kernelParams = 0; + _dynamic_params_cuda.sharedMemBytes = shared_mem_bytes; + CUDAPP_CALL_GUARDED(cuGraphAddKernelNode, (&result, m_graph, vector_data_ptr(deps), len(deps_py), &_dynamic_params_cuda)); + return new graph_node(result); + } + + inline void graph_exec::kernel_node_set_params(py::object graph_node_py, py::object function_py, + py::tuple grid_dim_py, py::tuple block_dim_py, + py::object parameter_buffer, + unsigned shared_mem_bytes) + { + const unsigned axis_count = 3; + unsigned grid_dim[axis_count]; + unsigned block_dim[axis_count]; + preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count); + + py_buffer_wrapper par_buf_wrapper; + par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS); + size_t par_len = par_buf_wrapper.m_buf.len; + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast(par_buf_wrapper.m_buf.buf), + CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len, + CU_LAUNCH_PARAM_END + }; + + graph_node &gn = py::extract(graph_node_py); + function &fn = py::extract(function_py); + + CUDA_KERNEL_NODE_PARAMS _dynamic_params_cuda = { 0 }; + _dynamic_params_cuda.blockDimX = block_dim[0]; + _dynamic_params_cuda.blockDimY = block_dim[1]; + _dynamic_params_cuda.blockDimZ = block_dim[2]; + _dynamic_params_cuda.extra = config; + _dynamic_params_cuda.func = fn.handle(); + _dynamic_params_cuda.gridDimX = grid_dim[0]; + _dynamic_params_cuda.gridDimY = grid_dim[1]; + _dynamic_params_cuda.gridDimZ = grid_dim[2]; + _dynamic_params_cuda.kernelParams = 0; + _dynamic_params_cuda.sharedMemBytes = shared_mem_bytes; + CUDAPP_CALL_GUARDED(cuGraphExecKernelNodeSetParams, (m_exec, gn.handle(), &_dynamic_params_cuda)); + } +#endif + // }}} + // {{{ device memory inline py::tuple mem_get_info() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 3758689b..a3626866 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -1231,7 +1231,53 @@ BOOST_PYTHON_MODULE(_driver) } // }}} + // {{{ graph +#if CUDAPP_CUDA_VERSION >= 10000 + { + typedef graph_node cl; + py::class_("GraphNode", py::no_init) + .def("__eq__", &cl::operator==) + .def("__ne__", &cl::operator!=) + ; + } + + { + typedef graph_exec cl; + py::class_("GraphExec", py::no_init) + .def("launch", &cl::launch, + py::arg("stream")=py::object()) + .def("_kernel_node_set_params", &cl::kernel_node_set_params) + ; + } + + { + typedef graph cl; + py::class_("Graph", py::no_init) + .def("__eq__", &cl::operator==) + .def("__ne__", &cl::operator!=) + .def("_add_kernel_node", &cl::add_kernel_node, + py::return_value_policy()) + .def("instantiate", &cl::instantiate, + py::return_value_policy()) + .DEF_SIMPLE_METHOD(debug_dot_print) + ; + } +#endif + // }}} + // {{{ stream +#if CUDAPP_CUDA_VERSION >= 10000 + py::enum_("capture_mode") + .value("GLOBAL", CU_STREAM_CAPTURE_MODE_GLOBAL) + .value("THREAD_LOCAL", CU_STREAM_CAPTURE_MODE_THREAD_LOCAL) + .value("RELAXED", CU_STREAM_CAPTURE_MODE_RELAXED) + ; + py::enum_("capture_status") + .value("NONE", CU_STREAM_CAPTURE_STATUS_NONE) + .value("ACTIVE", CU_STREAM_CAPTURE_STATUS_ACTIVE) + .value("INVALIDATED", CU_STREAM_CAPTURE_STATUS_INVALIDATED) + ; +#endif { typedef stream cl; py::class_ > @@ -1240,6 +1286,16 @@ BOOST_PYTHON_MODULE(_driver) .DEF_SIMPLE_METHOD(is_done) #if CUDAPP_CUDA_VERSION >= 3020 .DEF_SIMPLE_METHOD(wait_for_event) +#endif +#if CUDAPP_CUDA_VERSION >= 10000 + .def("begin_capture", &cl::begin_capture, + py::arg("capture_mode") = CU_STREAM_CAPTURE_MODE_GLOBAL) + .def("end_capture", &cl::end_capture, + py::return_value_policy()) + .def("get_capture_info_v2", &cl::get_capture_info_v2) +#if CUDAPP_CUDA_VERSION >= 11030 + .def("update_capture_dependencies", &cl::update_capture_dependencies) +#endif #endif .add_property("handle", &cl::handle_int) ; diff --git a/test/test_graph.py b/test/test_graph.py new file mode 100644 index 00000000..ca3ec196 --- /dev/null +++ b/test/test_graph.py @@ -0,0 +1,151 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + +import numpy as np +import numpy.linalg as la +from pycuda.tools import mark_cuda_test, dtype_to_ctype +import pytest # noqa + + +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv +from pycuda.compiler import SourceModule + + +class TestGraph: + @mark_cuda_test + def test_static_params(self): + mod = SourceModule(""" + __global__ void plus(float *a, int num) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] += num; + } + """) + func_plus = mod.get_function("plus") + + import numpy + a = numpy.zeros((4, 4)).astype(numpy.float32) + a_gpu = drv.mem_alloc_like(a) + result = numpy.zeros_like(a) + stream_1 = drv.Stream() + stream_1.begin_capture() + func_plus(a_gpu, numpy.int32(3), block=(4, 4, 1), stream=stream_1) + graph = stream_1.end_capture() + + instance = graph.instantiate() + instance.launch() + drv.memcpy_dtoh_async(result, a_gpu, stream_1) + np.testing.assert_allclose(result, np.full((4, 4), 3), rtol=1e-5) + + @mark_cuda_test + def test_dynamic_params(self): + mod = SourceModule(""" + __global__ void plus(float *a, int num) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] += num; + } + """) + func_plus = mod.get_function("plus") + + stream_1 = drv.Stream() + import numpy + a = numpy.zeros((4, 4)).astype(numpy.float32) + a_gpu = drv.mem_alloc_like(a) + result = numpy.zeros_like(a) + stream_1.begin_capture() + stat , _, x_graph, deps = stream_1.get_capture_info_v2() + assert stat == drv.capture_status.ACTIVE, "Capture should be active" + assert len(deps) == 0, "Nothing on deps" + newnode = x_graph.add_kernel_node(a_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) + stream_1.update_capture_dependencies([newnode], 1) + drv.memcpy_dtoh_async(result, a_gpu, stream_1) # Capture a copy as well. + graph = stream_1.end_capture() + assert graph == x_graph, "Should be the same" + + instance = graph.instantiate() + + stat, _, _, _ = stream_1.get_capture_info_v2() + assert stat == drv.capture_status.NONE, "No capture should be active" + + wanna = 0 + for i in range(4): + instance.kernel_node_set_params(a_gpu, numpy.int32(i), block=(4, 4, 1), func=func_plus, kernel_node=newnode) + instance.launch() + wanna += i + np.testing.assert_allclose(result, np.full((4, 4), wanna), rtol=1e-5) + + @mark_cuda_test + def test_many_dynamic_params(self): + mod = SourceModule(""" + __global__ void plus(float *a, int num) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] += num; + } + __global__ void times(float *a, float *b) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] *= b[idx]; + } + """) + func_plus = mod.get_function("plus") + func_times = mod.get_function("times") + + stream_1 = drv.Stream() + + import numpy + a = numpy.zeros((4, 4)).astype(numpy.float32) + a_gpu = drv.mem_alloc_like(a) + b = numpy.zeros((4, 4)).astype(numpy.float32) + b_gpu = drv.mem_alloc_like(b) + result = numpy.zeros_like(b) + stream_1.begin_capture() + stat , _, x_graph, deps = stream_1.get_capture_info_v2() + assert stat == drv.capture_status.ACTIVE, "Capture should be active" + assert len(deps) == 0, "Nothing on deps" + newnode = x_graph.add_kernel_node(a_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) + stream_1.update_capture_dependencies([newnode], 1) + _, _, x_graph, deps = stream_1.get_capture_info_v2() + assert deps == [newnode], "Call to update_capture_dependencies should set newnode as the only dep" + newnode2 = x_graph.add_kernel_node(b_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) + stream_1.update_capture_dependencies([newnode2], 1) + + # Static capture + func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1) + drv.memcpy_dtoh_async(result, a_gpu, stream_1) # Capture a copy as well. + graph = stream_1.end_capture() + assert graph == x_graph, "Should be the same" + + instance = graph.instantiate() + + stat, _, _, _ = stream_1.get_capture_info_v2() + assert stat == drv.capture_status.NONE, "No capture be active" + + instance.kernel_node_set_params(a_gpu, numpy.int32(4), block=(4, 4, 1), func=func_plus, kernel_node=newnode) + instance.kernel_node_set_params(b_gpu, numpy.int32(9), block=(4, 4, 1), func=func_plus, kernel_node=newnode2) + instance.launch() + np.testing.assert_allclose(result, np.full((4, 4), 4*9), rtol=1e-5) + + a = numpy.zeros((4, 4)).astype(numpy.float32) + a_gpu_fake = drv.mem_alloc_like(a) + instance.kernel_node_set_params(a_gpu_fake, numpy.int32(5), block=(4, 4, 1), func=func_plus, kernel_node=newnode) + instance.kernel_node_set_params(b_gpu, numpy.int32(4), block=(4, 4, 1), func=func_plus, kernel_node=newnode2) + instance.launch() + np.testing.assert_allclose(result, np.full((4, 4), (4*9)*(9+4)), rtol=1e-5) # b is now (9 + 4), a is 4*9 as it was after func_times, since we write to another buffer this launch. + +if __name__ == "__main__": + # make sure that import failures get reported, instead of skipping the tests. + import pycuda.autoinit # noqa + + import sys + + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + + main([__file__]) From 66d7e3693f111adbf99f5be0ac08606c89fb8c35 Mon Sep 17 00:00:00 2001 From: Leo Prokh Date: Fri, 20 Oct 2023 11:13:35 +0300 Subject: [PATCH 2/4] Fix segfault in get_capture_info_v2 --- src/cpp/cuda.hpp | 6 +++--- test/test_graph.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index 07fd1c29..7d175db1 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -1679,9 +1679,9 @@ namespace pycuda { CUgraph _capturing_graph; CUstreamCaptureStatus _capture_status; - const CUgraphNode *_deps; - size_t _dep_count; - uint64_t _id_out; + const CUgraphNode *_deps = nullptr; + size_t _dep_count = 0; + uint64_t _id_out = 0; CUDAPP_CALL_GUARDED(cuStreamGetCaptureInfo_v2, (m_stream, &_capture_status, &_id_out, &_capturing_graph, &_deps, &_dep_count)); py::list list_root_nodes = array_of_nodes_to_list(_deps, _dep_count); graph *node = new graph(_capturing_graph); diff --git a/test/test_graph.py b/test/test_graph.py index ca3ec196..794ac134 100644 --- a/test/test_graph.py +++ b/test/test_graph.py @@ -96,6 +96,7 @@ def test_many_dynamic_params(self): func_times = mod.get_function("times") stream_1 = drv.Stream() + _ , _, _, _ = stream_1.get_capture_info_v2() import numpy a = numpy.zeros((4, 4)).astype(numpy.float32) @@ -103,6 +104,7 @@ def test_many_dynamic_params(self): b = numpy.zeros((4, 4)).astype(numpy.float32) b_gpu = drv.mem_alloc_like(b) result = numpy.zeros_like(b) + _ , _, _, _ = stream_1.get_capture_info_v2() stream_1.begin_capture() stat , _, x_graph, deps = stream_1.get_capture_info_v2() assert stat == drv.capture_status.ACTIVE, "Capture should be active" From f4e3b04b81606dc19babaace4dfd5b307fe455d5 Mon Sep 17 00:00:00 2001 From: Leo Prokh Date: Fri, 20 Oct 2023 11:33:31 +0300 Subject: [PATCH 3/4] Add update_capture_dependencies flags --- doc/driver.rst | 7 +++++++ examples/demo_graph.py | 4 ++-- src/wrapper/wrap_cudadrv.cpp | 10 +++++++++- test/test_graph.py | 6 +++--- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/driver.rst b/doc/driver.rst index 870ff2b7..666cc3a8 100644 --- a/doc/driver.rst +++ b/doc/driver.rst @@ -621,6 +621,13 @@ Constants .. attribute:: ACTIVE .. attribute:: INVALIDATED +.. class:: update_capture_dependencies_flags + + CUDA 11.3 and newer. + + .. attribute:: ADD_CAPTURE_DEPENDENCIES + .. attribute:: SET_CAPTURE_DEPENDENCIES + Graphics-related constants ^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/examples/demo_graph.py b/examples/demo_graph.py index 57e1f3ce..865b2695 100644 --- a/examples/demo_graph.py +++ b/examples/demo_graph.py @@ -34,11 +34,11 @@ func_plus(a_gpu, numpy.int32(2), block=(4, 4, 1), stream=stream_1) _, _, graph, deps = stream_1.get_capture_info_v2() first_node = graph.add_kernel_node(b_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) -stream_1.update_capture_dependencies([first_node], 1) +stream_1.update_capture_dependencies([first_node], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) _, _, graph, deps = stream_1.get_capture_info_v2() second_node = graph.add_kernel_node(a_gpu, b_gpu, block=(4, 4, 1), func=func_times, dependencies=deps) -stream_1.update_capture_dependencies([second_node], 1) +stream_1.update_capture_dependencies([second_node], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) cuda.memcpy_dtoh_async(result, a_gpu, stream_1) graph = stream_1.end_capture() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index a3626866..e225ddbc 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -1277,6 +1277,12 @@ BOOST_PYTHON_MODULE(_driver) .value("ACTIVE", CU_STREAM_CAPTURE_STATUS_ACTIVE) .value("INVALIDATED", CU_STREAM_CAPTURE_STATUS_INVALIDATED) ; +#endif +#if CUDAPP_CUDA_VERSION >= 11030 + py::enum_("update_capture_dependencies_flags") + .value("ADD_CAPTURE_DEPENDENCIES", CU_STREAM_ADD_CAPTURE_DEPENDENCIES) + .value("SET_CAPTURE_DEPENDENCIES", CU_STREAM_SET_CAPTURE_DEPENDENCIES) + ; #endif { typedef stream cl; @@ -1294,7 +1300,9 @@ BOOST_PYTHON_MODULE(_driver) py::return_value_policy()) .def("get_capture_info_v2", &cl::get_capture_info_v2) #if CUDAPP_CUDA_VERSION >= 11030 - .def("update_capture_dependencies", &cl::update_capture_dependencies) + .def("update_capture_dependencies", &cl::update_capture_dependencies, + (py::arg("dependencies"), + py::arg("flags") = CU_STREAM_ADD_CAPTURE_DEPENDENCIES)) #endif #endif .add_property("handle", &cl::handle_int) diff --git a/test/test_graph.py b/test/test_graph.py index 794ac134..bcf92b9e 100644 --- a/test/test_graph.py +++ b/test/test_graph.py @@ -61,7 +61,7 @@ def test_dynamic_params(self): assert stat == drv.capture_status.ACTIVE, "Capture should be active" assert len(deps) == 0, "Nothing on deps" newnode = x_graph.add_kernel_node(a_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) - stream_1.update_capture_dependencies([newnode], 1) + stream_1.update_capture_dependencies([newnode], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) drv.memcpy_dtoh_async(result, a_gpu, stream_1) # Capture a copy as well. graph = stream_1.end_capture() assert graph == x_graph, "Should be the same" @@ -110,11 +110,11 @@ def test_many_dynamic_params(self): assert stat == drv.capture_status.ACTIVE, "Capture should be active" assert len(deps) == 0, "Nothing on deps" newnode = x_graph.add_kernel_node(a_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) - stream_1.update_capture_dependencies([newnode], 1) + stream_1.update_capture_dependencies([newnode], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) _, _, x_graph, deps = stream_1.get_capture_info_v2() assert deps == [newnode], "Call to update_capture_dependencies should set newnode as the only dep" newnode2 = x_graph.add_kernel_node(b_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) - stream_1.update_capture_dependencies([newnode2], 1) + stream_1.update_capture_dependencies([newnode2], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) # Static capture func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1) From fb9ccefb8323fd078403e53f12db142964461d16 Mon Sep 17 00:00:00 2001 From: Leo Prokh Date: Thu, 26 Oct 2023 13:24:10 +0300 Subject: [PATCH 4/4] Allow Graph creation --- src/cpp/cuda.hpp | 26 +++++++++++++++++++++++++- src/wrapper/wrap_cudadrv.cpp | 2 +- test/test_graph.py | 31 ++++++++++++++++++++++++++++--- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index 7d175db1..c57a48d8 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -1552,7 +1552,7 @@ namespace pycuda { } ~graph_exec() { - free(); + free(); } void free() { @@ -1623,12 +1623,36 @@ namespace pycuda { private: CUgraph m_graph; + bool m_own_graph; public: + graph() + : m_own_graph(true) + { + cuGraphCreate(&m_graph, 0); + } + graph(CUgraph graph) : m_graph(graph) + , m_own_graph(false) { } + ~graph() { + if (m_own_graph) { + free(); + } + } + + void free() { + try + { + scoped_context_activation ca(get_context()); + CUDAPP_CALL_GUARDED_CLEANUP(cuGraphDestroy,(m_graph)); + } + CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(graph); + release_context(); + } + bool operator==(const graph& rhs) { return m_graph == rhs.m_graph; diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index e225ddbc..0931a271 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -1252,7 +1252,7 @@ BOOST_PYTHON_MODULE(_driver) { typedef graph cl; - py::class_("Graph", py::no_init) + py::class_("Graph") .def("__eq__", &cl::operator==) .def("__ne__", &cl::operator!=) .def("_add_kernel_node", &cl::add_kernel_node, diff --git a/test/test_graph.py b/test/test_graph.py index bcf92b9e..530986af 100644 --- a/test/test_graph.py +++ b/test/test_graph.py @@ -61,7 +61,7 @@ def test_dynamic_params(self): assert stat == drv.capture_status.ACTIVE, "Capture should be active" assert len(deps) == 0, "Nothing on deps" newnode = x_graph.add_kernel_node(a_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) - stream_1.update_capture_dependencies([newnode], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) + stream_1.update_capture_dependencies([newnode], drv.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) drv.memcpy_dtoh_async(result, a_gpu, stream_1) # Capture a copy as well. graph = stream_1.end_capture() assert graph == x_graph, "Should be the same" @@ -110,11 +110,11 @@ def test_many_dynamic_params(self): assert stat == drv.capture_status.ACTIVE, "Capture should be active" assert len(deps) == 0, "Nothing on deps" newnode = x_graph.add_kernel_node(a_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) - stream_1.update_capture_dependencies([newnode], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) + stream_1.update_capture_dependencies([newnode], drv.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) _, _, x_graph, deps = stream_1.get_capture_info_v2() assert deps == [newnode], "Call to update_capture_dependencies should set newnode as the only dep" newnode2 = x_graph.add_kernel_node(b_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps) - stream_1.update_capture_dependencies([newnode2], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) + stream_1.update_capture_dependencies([newnode2], drv.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES) # Static capture func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1) @@ -139,6 +139,31 @@ def test_many_dynamic_params(self): instance.launch() np.testing.assert_allclose(result, np.full((4, 4), (4*9)*(9+4)), rtol=1e-5) # b is now (9 + 4), a is 4*9 as it was after func_times, since we write to another buffer this launch. + @mark_cuda_test + def test_graph_create(self): + mod = SourceModule(""" + __global__ void plus(float *a, int num) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] += num; + } + """) + func_plus = mod.get_function("plus") + + import numpy + a = numpy.zeros((4, 4)).astype(numpy.float32) + a_gpu = drv.mem_alloc_like(a) + result = numpy.zeros_like(a) + + graph = drv.Graph() + node1 = graph.add_kernel_node(a_gpu, numpy.int32(1), block=(4, 4, 1), func=func_plus, dependencies=[]) + node2 = graph.add_kernel_node(a_gpu, numpy.int32(2), block=(4, 4, 1), func=func_plus, dependencies=[node1]) + + instance = graph.instantiate() + instance.launch() + drv.memcpy_dtoh_async(result, a_gpu) + np.testing.assert_allclose(result, np.full((4, 4), 1+2), rtol=1e-5) + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. import pycuda.autoinit # noqa