diff --git a/doc/driver.rst b/doc/driver.rst
index 6e6d65f5..666cc3a8 100644
--- a/doc/driver.rst
+++ b/doc/driver.rst
@@ -605,6 +605,29 @@ Constants
 
     .. attribute:: LAZY_ENABLE_PEER_ACCESS
 
+.. class:: capture_mode
+
+    CUDA 10 and newer.
+
+    .. attribute:: GLOBAL
+    .. attribute:: THREAD_LOCAL
+    .. attribute:: RELAXED
+
+.. class:: capture_status
+
+    CUDA 10 and newer.
+
+    .. attribute:: NONE
+    .. attribute:: ACTIVE
+    .. attribute:: INVALIDATED
+
+.. class:: update_capture_dependencies_flags
+
+    CUDA 11.3 and newer.
+
+    .. attribute:: ADD_CAPTURE_DEPENDENCIES
+    .. attribute:: SET_CAPTURE_DEPENDENCIES
+
 
 Graphics-related constants
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -845,6 +868,43 @@ Concurrency and Streams
 
         .. versionadded:: 2011.1
 
+    .. method:: begin_capture(capture_mode=capture_mode.GLOBAL)
+
+        Begins graph stream capture on a stream.
+
+        When a stream is in capture mode, all operations pushed into the stream
+        will not be executed, but will instead be captured into a graph.
+
+        :arg capture_mode: A :class:`capture_mode` specifying mode for capturing graph.
+
+        CUDA 10 and above.
+
+    .. method:: end_capture()
+
+        Ends stream capture and returns a :class:`Graph` object.
+
+        CUDA 10 and above.
+
+    .. method:: get_capture_info_v2()
+
+        Query a stream's capture state.
+
+        Return a :class:`tuple` of (:class:`capture_status` capture status, :class:`int` id for the capture sequence,
+        :class:`Graph` the graph being captured into, a :class:`list` of :class:`GraphNode` specifying set of nodes the
+        next node to be captured in the stream will depend on)
+
+        CUDA 10 and above.
+
+    .. method:: update_capture_dependencies(dependencies, flags)
+
+        Modifies the dependency set of a capturing stream.
+        The dependency set is the set of nodes that the next captured node in the stream will depend on.
+
+        :arg dependencies: A :class:`list` of :class:`GraphNode` specifying the new list of dependencies.
+        :arg flags: A :class:`int` controlling whether the set passed to the API is added to the existing set or replaces it.
+
+        CUDA 11.3 and above.
+
 .. class:: Event(flags=0)
 
     An event is a temporal 'marker' in a :class:`Stream` that allows taking the time
@@ -895,6 +955,78 @@ Concurrency and Streams
 
         .. versionadded: 2011.2
 
+CUDAGraphs
+----------
+
+CUDA 10.0 and above
+
+Launching a simple kernel using CUDAGraphs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. literalinclude:: ../examples/cudagraph_kernel.py
+
+.. class:: GraphNode
+
+    An object representing a node on :class:`Graph`.
+
+    Wraps `cuGraphNode <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc72514a94dacc85ed0617f979211079c>`
+
+.. class:: GraphExec
+
+    An executable graph to be launched on a stream.
+
+    Wraps `cuGraphExec <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gf0abeceeaa9f0a39592fe36a538ea1f0>`_
+
+    .. method:: launch(stream_py=None)
+
+        Launches an executable graph in a stream.
+
+        :arg stream_py: :class:`Stream` object specifying device stream.
+                    Will use default stream if *stream_py* is None.
+
+    .. method:: kernel_node_set_params(*args, kernel_node, func=None, block=(), grid=(), shared_mem_bytes=0)
+
+        Sets a kernel node's parameters. Refer to :meth:`add_kernel_node` for argument specifications.
+
+.. class:: Graph()
+
+    A cudagraph is a data dependency graph meant to
+    serve as an alternative to :class:`Stream`.
+
+    Wraps `cuGraph <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1g69f555c38df5b3fa1ed25efef794739a>`
+
+    .. method:: add_kernel_node(*args, func, block, grid=(1, ), dependencies=[], shared_mem_bytes=0)
+
+        Returns and adds a :class:`GraphNode` object specifying
+        kernel node to the graph.
+
+        Will be placed at the root of the graph if dependencies
+        are not specified.
+
+        :arg args:  *arg1* through *argn* are the positional C arguments to the kernel.
+        See :meth:`Function.__call__` for more argument details.
+
+        :arg func: a :class:`Function`object specifying kernel function.
+
+        :arg block: a :class:`tuple` of up to three integer entries specifying the number
+        of thread blocks to launch, as a multi-dimensional grid.
+
+        :arg grid: a :class:`tuple` of up to three integer entries specifying the grid configuration.
+
+        :arg dependencies: A :class:`list` of :class:`GraphNode` objects specifying dependency nodes.
+
+        :arg shared_mem_bytes: A :class:`int` specifying size of shared memory.
+
+    .. method:: instantiate()
+
+        Returns and instantiates a :class:`GraphExec` object.
+
+    .. method:: debug_dot_print(path)
+
+        Returns a DOT file describing graph structure at specifed path.
+
+        :arg path: String specifying path for saving DOT file.
+
 Memory
 ------
 
diff --git a/examples/demo_graph.py b/examples/demo_graph.py
new file mode 100644
index 00000000..865b2695
--- /dev/null
+++ b/examples/demo_graph.py
@@ -0,0 +1,57 @@
+# Sample source code from the Tutorial Introduction in the documentation.
+import pycuda.driver as cuda
+import pycuda.autoinit  # noqa
+from pycuda.compiler import SourceModule
+
+mod = SourceModule("""
+    __global__ void plus(float *a, int num)
+    {
+      int idx = threadIdx.x + threadIdx.y*4;
+      a[idx] += num;
+    }
+    __global__ void times(float *a, float *b)
+    {
+      int idx = threadIdx.x + threadIdx.y*4;
+      a[idx] *= b[idx];
+    }
+    """)
+func_plus = mod.get_function("plus")
+func_times = mod.get_function("times")
+
+import numpy
+a = numpy.zeros((4, 4)).astype(numpy.float32)
+a_gpu = cuda.mem_alloc_like(a)
+b = numpy.zeros((4, 4)).astype(numpy.float32)
+b_gpu = cuda.mem_alloc_like(b)
+result = numpy.zeros_like(b)
+b2_gpu = cuda.mem_alloc_like(b)
+
+stream_1 = cuda.Stream()
+stream_1.begin_capture()
+cuda.memcpy_htod_async(a_gpu, a, stream_1)
+cuda.memcpy_htod_async(b_gpu, b, stream_1)
+cuda.memcpy_htod_async(b2_gpu, b, stream_1)
+func_plus(a_gpu, numpy.int32(2), block=(4, 4, 1), stream=stream_1)
+_, _, graph, deps = stream_1.get_capture_info_v2()
+first_node = graph.add_kernel_node(b_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps)
+stream_1.update_capture_dependencies([first_node], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES)
+
+_, _, graph, deps = stream_1.get_capture_info_v2()
+second_node = graph.add_kernel_node(a_gpu, b_gpu, block=(4, 4, 1), func=func_times, dependencies=deps)
+stream_1.update_capture_dependencies([second_node], cuda.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES)
+cuda.memcpy_dtoh_async(result, a_gpu, stream_1)
+
+graph = stream_1.end_capture()
+graph.debug_dot_print("test.dot")  # print dotfile of graph
+instance = graph.instantiate()
+
+# Setting dynamic parameters
+instance.kernel_node_set_params(b2_gpu, numpy.int32(100), block=(4, 4, 1), func=func_plus, kernel_node=first_node)
+instance.kernel_node_set_params(a_gpu, b2_gpu, block=(4, 4, 1), func=func_times, kernel_node=second_node)
+instance.launch()
+
+print("original arrays:")
+print(a)
+print(b)
+print("(0+2)x(0+100) = 200, using a kernel graph of 3 kernels:")
+print(result)
\ No newline at end of file
diff --git a/pycuda/driver.py b/pycuda/driver.py
index 4bce8347..17e3b2b5 100644
--- a/pycuda/driver.py
+++ b/pycuda/driver.py
@@ -159,6 +159,54 @@ def post_call(self, stream):
 class InOut(In, Out):
     pass
 
+from functools import lru_cache
+
+@lru_cache(maxsize=None)
+def _build_arg_buf(args):
+    handlers = []
+
+    arg_data = []
+    format = ""
+    for i, arg in enumerate(args):
+        if isinstance(arg, np.number):
+            arg_data.append(arg)
+            format += arg.dtype.char
+        elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
+            arg_data.append(int(arg))
+            format += "P"
+        elif isinstance(arg, ArgumentHandler):
+            handlers.append(arg)
+            arg_data.append(int(arg.get_device_alloc()))
+            format += "P"
+        elif isinstance(arg, np.ndarray):
+            if isinstance(arg.base, ManagedAllocationOrStub):
+                arg_data.append(int(arg.base))
+                format += "P"
+            else:
+                arg_data.append(arg)
+                format += "%ds" % arg.nbytes
+        elif isinstance(arg, np.void):
+            arg_data.append(_my_bytes(_memoryview(arg)))
+            format += "%ds" % arg.itemsize
+        else:
+            cai = getattr(arg, "__cuda_array_interface__", None)
+            if cai:
+                arg_data.append(cai["data"][0])
+                format += "P"
+                continue
+
+            try:
+                gpudata = np.uintp(arg.gpudata)
+            except AttributeError:
+                raise TypeError("invalid type on parameter #%d (0-based)" % i)
+            else:
+                # for gpuarrays
+                arg_data.append(int(gpudata))
+                format += "P"
+
+    from pycuda._pvt_struct import pack
+
+    return handlers, pack(format, *arg_data)
 
 def _add_functionality():
     def device_get_attributes(dev):
@@ -187,52 +235,6 @@ def device_get_attributes(dev):
     def device___getattr__(dev, name):
         return dev.get_attribute(getattr(device_attribute, name.upper()))
 
-    def _build_arg_buf(args):
-        handlers = []
-
-        arg_data = []
-        format = ""
-        for i, arg in enumerate(args):
-            if isinstance(arg, np.number):
-                arg_data.append(arg)
-                format += arg.dtype.char
-            elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
-                arg_data.append(int(arg))
-                format += "P"
-            elif isinstance(arg, ArgumentHandler):
-                handlers.append(arg)
-                arg_data.append(int(arg.get_device_alloc()))
-                format += "P"
-            elif isinstance(arg, np.ndarray):
-                if isinstance(arg.base, ManagedAllocationOrStub):
-                    arg_data.append(int(arg.base))
-                    format += "P"
-                else:
-                    arg_data.append(arg)
-                    format += "%ds" % arg.nbytes
-            elif isinstance(arg, np.void):
-                arg_data.append(_my_bytes(_memoryview(arg)))
-                format += "%ds" % arg.itemsize
-            else:
-                cai = getattr(arg, "__cuda_array_interface__", None)
-                if cai:
-                    arg_data.append(cai["data"][0])
-                    format += "P"
-                    continue
-
-                try:
-                    gpudata = np.uintp(arg.gpudata)
-                except AttributeError:
-                    raise TypeError("invalid type on parameter #%d (0-based)" % i)
-                else:
-                    # for gpuarrays
-                    arg_data.append(int(gpudata))
-                    format += "P"
-
-        from pycuda._pvt_struct import pack
-
-        return handlers, pack(format, *arg_data)
-
     # {{{ pre-CUDA 4 call interface (stateful)
 
     def function_param_set_pre_v4(func, *args):
@@ -710,6 +712,33 @@ def new_func(*args, **kwargs):
 
 _add_functionality()
 
+# {{{ cudagraph
+
+def patch_cudagraph():
+    def graph_add_kernel_node_call(graph, *args, func, block, grid=(1, ), dependencies=[], shared_mem_bytes=0):
+        if func is None:
+            raise ValueError("must specify func")
+        if block is None:
+            raise ValueError("must specify block size")
+        _, arg_buf = _build_arg_buf(args)
+        return graph._add_kernel_node(dependencies, func, grid, block, arg_buf, shared_mem_bytes)
+
+    def exec_graph_set_kernel_node_call(exec_graph, *args, kernel_node, func, block, grid=(1, ), shared_mem_bytes=0):
+        if kernel_node is None:
+            raise ValueError("must specify kernel_node")
+        if func is None:
+            raise ValueError("must specify func")
+        if block is None:
+            raise ValueError("must specify block size")
+        _, arg_buf = _build_arg_buf(args)
+        return exec_graph._kernel_node_set_params(kernel_node, func, grid, block, arg_buf, shared_mem_bytes)
+
+    Graph.add_kernel_node = graph_add_kernel_node_call
+    GraphExec.kernel_node_set_params = exec_graph_set_kernel_node_call
+if get_version() >= (10,):
+    patch_cudagraph()
+
+# }}}
 
 # {{{ pagelocked numpy arrays
 
diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp
index 18079ab4..c57a48d8 100644
--- a/src/cpp/cuda.hpp
+++ b/src/cpp/cuda.hpp
@@ -408,6 +408,41 @@ namespace pycuda
 #endif
   // }}}
 
+  // {{{ helpers
+
+  template <class T>
+  inline T* vector_data_ptr(std::vector<T>& v)
+  { return v.empty() ? nullptr : &v[0]; }
+
+  inline
+  void preprocess_grid_block(py::tuple block_dim_py, unsigned int block_dim[],
+                              py::tuple grid_dim_py, unsigned int grid_dim[],
+                              const unsigned axis_count) {
+    for (unsigned i = 0; i < axis_count; ++i)
+    {
+      grid_dim[i] = 1;
+      block_dim[i] = 1;
+    }
+
+    pycuda_size_t gd_length = py::len(grid_dim_py);
+    if (gd_length > axis_count)
+      throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+          "too many grid dimensions in kernel launch");
+
+    for (unsigned i = 0; i < gd_length; ++i)
+      grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);
+
+    pycuda_size_t bd_length = py::len(block_dim_py);
+    if (bd_length > axis_count)
+      throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+          "too many block dimensions in kernel launch");
+
+    for (unsigned i = 0; i < bd_length; ++i)
+      block_dim[i] = py::extract<unsigned>(block_dim_py[i]);
+  }
+
+  // }}}
+
   // {{{ device
   class context;
   class primary_context;
@@ -993,6 +1028,8 @@ namespace pycuda
 
   // {{{ stream
   class event;
+  class graph;
+  class graph_node;
 
   class stream : public boost::noncopyable, public context_dependent
   {
@@ -1024,6 +1061,14 @@ namespace pycuda
 
 #if CUDAPP_CUDA_VERSION >= 3020
       void wait_for_event(const event &evt);
+#endif
+#if CUDAPP_CUDA_VERSION >= 10000
+      void begin_capture(CUstreamCaptureMode mode);
+      graph *end_capture();
+      py::tuple get_capture_info_v2();
+#if CUDAPP_CUDA_VERSION >= 11030
+      void update_capture_dependencies(py::list deps_py, unsigned int flags);
+#endif
 #endif
 
       bool is_done() const
@@ -1447,28 +1492,7 @@ namespace pycuda
         const unsigned axis_count = 3;
         unsigned grid_dim[axis_count];
         unsigned block_dim[axis_count];
-
-        for (unsigned i = 0; i < axis_count; ++i)
-        {
-          grid_dim[i] = 1;
-          block_dim[i] = 1;
-        }
-
-        pycuda_size_t gd_length = py::len(grid_dim_py);
-        if (gd_length > axis_count)
-          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
-              "too many grid dimensions in kernel launch");
-
-        for (unsigned i = 0; i < gd_length; ++i)
-          grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);
-
-        pycuda_size_t bd_length = py::len(block_dim_py);
-        if (bd_length > axis_count)
-          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
-              "too many block dimensions in kernel launch");
-
-        for (unsigned i = 0; i < bd_length; ++i)
-          block_dim[i] = py::extract<unsigned>(block_dim_py[i]);
+        preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count);
 
         PYCUDA_PARSE_STREAM_PY;
 
@@ -1500,6 +1524,9 @@ namespace pycuda
       }
 #endif
 
+    CUfunction handle() const
+    { return m_function; }
+
   };
 
   inline
@@ -1512,6 +1539,264 @@ namespace pycuda
 
   // }}}
 
+  // {{{ graph
+#if CUDAPP_CUDA_VERSION >= 10000
+  class graph_exec : public boost::noncopyable, public context_dependent
+  {
+    private:
+      CUgraphExec m_exec;
+
+    public:
+      graph_exec(CUgraphExec exec)
+        : m_exec(exec)
+      { }
+
+      ~graph_exec() {
+        free();
+      }
+
+      void free() {
+        try
+        {
+          scoped_context_activation ca(get_context());
+          CUDAPP_CALL_GUARDED_CLEANUP(cuGraphExecDestroy,(m_exec));
+        }
+        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(graph_exec);
+        release_context();
+      }
+
+      void launch(py::object stream_py)
+      {
+        PYCUDA_PARSE_STREAM_PY;
+        CUDAPP_CALL_GUARDED(cuGraphLaunch, (m_exec, s_handle))
+      }
+
+      void kernel_node_set_params(py::object graph_node_py, py::object function_py, 
+        py::tuple grid_dim_py, py::tuple block_dim_py,
+        py::object parameter_buffer,
+        unsigned shared_mem_bytes);
+  };
+
+  class graph_node : public boost::noncopyable, public context_dependent
+  {
+    private:
+      CUgraphNode m_node;
+
+    public:
+      graph_node(CUgraphNode node)
+        : m_node(node)
+      { }
+
+      bool operator==(const graph_node& rhs)
+      {
+        return m_node == rhs.m_node;
+      }
+
+      bool operator!=(const graph_node& rhs)
+      {
+        return !(*this == rhs);
+      }
+
+      CUgraphNode handle() const
+      { return m_node; }
+  };
+
+  inline py::list array_of_nodes_to_list(const CUgraphNode* nodes, size_t length) {
+    py::list list_nodes;
+    for (size_t i = 0; i < length; ++i) {
+      graph_node* pre_to_python_node = new graph_node(nodes[i]);
+      list_nodes.append(boost::ref(pre_to_python_node));
+    }
+    return list_nodes;
+  }
+
+  inline std::vector<CUgraphNode> list_to_vector_of_nodes(py::list list_nodes) {
+    std::vector<CUgraphNode> v;
+    for (ssize_t i = 0; i < len(list_nodes); ++i) {
+      graph_node* node = boost::python::extract<graph_node*>(list_nodes[i]);
+      v.push_back(node->handle());
+    }
+    return v;
+  }
+
+  class graph : public boost::noncopyable, public context_dependent
+  {
+    private:
+      CUgraph m_graph;
+      bool m_own_graph;
+
+    public:
+      graph()
+        : m_own_graph(true)
+      {
+        cuGraphCreate(&m_graph, 0);
+      }
+
+      graph(CUgraph graph)
+        : m_graph(graph)
+        , m_own_graph(false)
+      { }
+
+      ~graph() {
+        if (m_own_graph) {
+          free();
+        }
+      }
+
+      void free() {
+        try
+        {
+          scoped_context_activation ca(get_context());
+          CUDAPP_CALL_GUARDED_CLEANUP(cuGraphDestroy,(m_graph));
+        }
+        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(graph);
+        release_context();
+      }
+
+      bool operator==(const graph& rhs)
+      {
+        return m_graph == rhs.m_graph;
+      }
+
+      bool operator!=(const graph& rhs)
+      {
+        return !(*this == rhs);
+      }
+
+      graph_exec *instantiate()
+      {
+        CUgraphExec instance;
+#if CUDAPP_CUDA_VERSION >= 12000
+        CUDAPP_CALL_GUARDED(cuGraphInstantiate, (&instance, m_graph, 0))
+#else
+        CUDAPP_CALL_GUARDED(cuGraphInstantiate, (&instance, m_graph, nullptr, nullptr, 0))
+#endif
+        return new graph_exec(instance);
+      }
+
+      graph_node *add_kernel_node(py::list deps_py, py::object function_py,
+        py::tuple grid_dim_py, py::tuple block_dim_py, py::object parameter_buffer,
+        unsigned shared_mem_bytes);
+
+      void debug_dot_print(std::string path)
+      {
+        CUDAPP_CALL_GUARDED(cuGraphDebugDotPrint, (m_graph, path.c_str(), 1<<1))
+      }
+
+      CUgraph handle() const
+      { return m_graph; }
+  };
+
+  inline void stream::begin_capture(CUstreamCaptureMode mode = CU_STREAM_CAPTURE_MODE_GLOBAL)
+  {
+    CUDAPP_CALL_GUARDED(cuStreamBeginCapture, (m_stream, mode));
+  }
+
+  inline graph *stream::end_capture()
+  {
+    CUgraph result;
+    CUDAPP_CALL_GUARDED(cuStreamEndCapture, (m_stream, &result))
+    return new graph(result);
+  }
+
+  inline py::tuple stream::get_capture_info_v2()
+  {
+    CUgraph _capturing_graph;
+    CUstreamCaptureStatus _capture_status;
+    const CUgraphNode *_deps = nullptr;
+    size_t _dep_count = 0;
+    uint64_t _id_out = 0;
+    CUDAPP_CALL_GUARDED(cuStreamGetCaptureInfo_v2, (m_stream, &_capture_status, &_id_out, &_capturing_graph, &_deps, &_dep_count));
+    py::list list_root_nodes = array_of_nodes_to_list(_deps, _dep_count);
+    graph *node = new graph(_capturing_graph);
+    return py::make_tuple(_capture_status, _id_out, boost::ref(node), list_root_nodes);
+  }
+
+#if CUDAPP_CUDA_VERSION >= 11030
+  inline void stream::update_capture_dependencies(py::list deps_py, unsigned int flags)
+  {
+    std::vector<CUgraphNode> deps = list_to_vector_of_nodes(deps_py);
+    CUDAPP_CALL_GUARDED(cuStreamUpdateCaptureDependencies, (m_stream, vector_data_ptr(deps), len(deps_py), flags));
+  }
+#endif
+
+  inline graph_node *graph::add_kernel_node(py::list deps_py, py::object function_py,
+        py::tuple grid_dim_py, py::tuple block_dim_py, py::object parameter_buffer,
+        unsigned shared_mem_bytes)
+  {
+    const unsigned axis_count = 3;
+    unsigned grid_dim[axis_count];
+    unsigned block_dim[axis_count];
+    preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count);
+
+    py_buffer_wrapper par_buf_wrapper;
+    par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+    size_t par_len = par_buf_wrapper.m_buf.len;
+
+    void *config[] = {
+      CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf_wrapper.m_buf.buf),
+      CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
+      CU_LAUNCH_PARAM_END
+    };
+
+    CUgraphNode result;
+    function &fn = py::extract<function &>(function_py);
+    std::vector<CUgraphNode> deps = list_to_vector_of_nodes(deps_py);
+
+    CUDA_KERNEL_NODE_PARAMS _dynamic_params_cuda = { 0 };
+    _dynamic_params_cuda.blockDimX = block_dim[0];
+    _dynamic_params_cuda.blockDimY = block_dim[1];
+    _dynamic_params_cuda.blockDimZ = block_dim[2];
+    _dynamic_params_cuda.extra = config;
+    _dynamic_params_cuda.func = fn.handle();
+    _dynamic_params_cuda.gridDimX = grid_dim[0];
+    _dynamic_params_cuda.gridDimY = grid_dim[1];
+    _dynamic_params_cuda.gridDimZ = grid_dim[2];
+    _dynamic_params_cuda.kernelParams = 0;
+    _dynamic_params_cuda.sharedMemBytes = shared_mem_bytes;
+    CUDAPP_CALL_GUARDED(cuGraphAddKernelNode, (&result, m_graph, vector_data_ptr(deps), len(deps_py), &_dynamic_params_cuda));
+    return new graph_node(result);
+  }
+
+  inline void graph_exec::kernel_node_set_params(py::object graph_node_py, py::object function_py, 
+        py::tuple grid_dim_py, py::tuple block_dim_py,
+        py::object parameter_buffer,
+        unsigned shared_mem_bytes)
+  {
+    const unsigned axis_count = 3;
+    unsigned grid_dim[axis_count];
+    unsigned block_dim[axis_count];
+    preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count);
+
+    py_buffer_wrapper par_buf_wrapper;
+    par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+    size_t par_len = par_buf_wrapper.m_buf.len;
+
+    void *config[] = {
+      CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf_wrapper.m_buf.buf),
+      CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
+      CU_LAUNCH_PARAM_END
+    };
+
+    graph_node &gn = py::extract<graph_node &>(graph_node_py);
+    function &fn = py::extract<function &>(function_py);
+
+    CUDA_KERNEL_NODE_PARAMS _dynamic_params_cuda = { 0 };
+    _dynamic_params_cuda.blockDimX = block_dim[0];
+    _dynamic_params_cuda.blockDimY = block_dim[1];
+    _dynamic_params_cuda.blockDimZ = block_dim[2];
+    _dynamic_params_cuda.extra = config;
+    _dynamic_params_cuda.func = fn.handle();
+    _dynamic_params_cuda.gridDimX = grid_dim[0];
+    _dynamic_params_cuda.gridDimY = grid_dim[1];
+    _dynamic_params_cuda.gridDimZ = grid_dim[2];
+    _dynamic_params_cuda.kernelParams = 0;
+    _dynamic_params_cuda.sharedMemBytes = shared_mem_bytes;
+    CUDAPP_CALL_GUARDED(cuGraphExecKernelNodeSetParams, (m_exec, gn.handle(), &_dynamic_params_cuda));
+  }
+#endif
+  // }}}
+
   // {{{ device memory
   inline
   py::tuple mem_get_info()
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index 3758689b..0931a271 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -1231,7 +1231,59 @@ BOOST_PYTHON_MODULE(_driver)
   }
   // }}}
 
+  // {{{ graph
+#if CUDAPP_CUDA_VERSION >= 10000
+  {
+    typedef graph_node cl;
+    py::class_<cl, boost::noncopyable>("GraphNode", py::no_init)
+      .def("__eq__", &cl::operator==)
+      .def("__ne__", &cl::operator!=)
+    ;
+  }
+
+  {
+    typedef graph_exec cl;
+    py::class_<cl, boost::noncopyable>("GraphExec", py::no_init)
+      .def("launch", &cl::launch,
+        py::arg("stream")=py::object())
+      .def("_kernel_node_set_params", &cl::kernel_node_set_params)
+      ;
+  }
+
+  {
+    typedef graph cl;
+    py::class_<cl, boost::noncopyable>("Graph")
+      .def("__eq__", &cl::operator==)
+      .def("__ne__", &cl::operator!=)
+      .def("_add_kernel_node", &cl::add_kernel_node,
+        py::return_value_policy<py::manage_new_object>())
+      .def("instantiate", &cl::instantiate,
+        py::return_value_policy<py::manage_new_object>())
+      .DEF_SIMPLE_METHOD(debug_dot_print)
+      ;
+  }
+#endif
+  // }}}
+
   // {{{ stream
+#if CUDAPP_CUDA_VERSION >= 10000
+  py::enum_<CUstreamCaptureMode>("capture_mode")
+    .value("GLOBAL", CU_STREAM_CAPTURE_MODE_GLOBAL)
+    .value("THREAD_LOCAL", CU_STREAM_CAPTURE_MODE_THREAD_LOCAL)
+    .value("RELAXED", CU_STREAM_CAPTURE_MODE_RELAXED)
+    ;
+  py::enum_<CUstreamCaptureStatus>("capture_status")
+    .value("NONE", CU_STREAM_CAPTURE_STATUS_NONE)
+    .value("ACTIVE", CU_STREAM_CAPTURE_STATUS_ACTIVE)
+    .value("INVALIDATED", CU_STREAM_CAPTURE_STATUS_INVALIDATED)
+    ;
+#endif
+#if CUDAPP_CUDA_VERSION >= 11030
+  py::enum_<CUstreamUpdateCaptureDependencies_flags>("update_capture_dependencies_flags")
+    .value("ADD_CAPTURE_DEPENDENCIES", CU_STREAM_ADD_CAPTURE_DEPENDENCIES)
+    .value("SET_CAPTURE_DEPENDENCIES", CU_STREAM_SET_CAPTURE_DEPENDENCIES)
+    ;
+#endif
   {
     typedef stream cl;
     py::class_<cl, boost::noncopyable, shared_ptr<cl> >
@@ -1240,6 +1292,18 @@ BOOST_PYTHON_MODULE(_driver)
       .DEF_SIMPLE_METHOD(is_done)
 #if CUDAPP_CUDA_VERSION >= 3020
       .DEF_SIMPLE_METHOD(wait_for_event)
+#endif
+#if CUDAPP_CUDA_VERSION >= 10000
+      .def("begin_capture", &cl::begin_capture,
+        py::arg("capture_mode") = CU_STREAM_CAPTURE_MODE_GLOBAL)
+      .def("end_capture", &cl::end_capture,
+        py::return_value_policy<py::manage_new_object>())
+      .def("get_capture_info_v2", &cl::get_capture_info_v2)
+#if CUDAPP_CUDA_VERSION >= 11030
+      .def("update_capture_dependencies", &cl::update_capture_dependencies,
+        (py::arg("dependencies"),
+        py::arg("flags") = CU_STREAM_ADD_CAPTURE_DEPENDENCIES))
+#endif
 #endif
       .add_property("handle", &cl::handle_int)
       ;
diff --git a/test/test_graph.py b/test/test_graph.py
new file mode 100644
index 00000000..530986af
--- /dev/null
+++ b/test/test_graph.py
@@ -0,0 +1,178 @@
+__copyright__ = """
+Copyright 2008-2021 Andreas Kloeckner
+Copyright 2021 NVIDIA Corporation
+"""
+
+import numpy as np
+import numpy.linalg as la
+from pycuda.tools import mark_cuda_test, dtype_to_ctype
+import pytest  # noqa
+
+
+import pycuda.gpuarray as gpuarray
+import pycuda.driver as drv
+from pycuda.compiler import SourceModule
+
+
+class TestGraph:
+    @mark_cuda_test
+    def test_static_params(self):
+        mod = SourceModule("""
+            __global__ void plus(float *a, int num)
+            {
+            int idx = threadIdx.x + threadIdx.y*4;
+            a[idx] += num;
+            }
+            """)
+        func_plus = mod.get_function("plus")
+
+        import numpy
+        a = numpy.zeros((4, 4)).astype(numpy.float32)
+        a_gpu = drv.mem_alloc_like(a)
+        result = numpy.zeros_like(a)
+        stream_1 = drv.Stream()
+        stream_1.begin_capture()
+        func_plus(a_gpu, numpy.int32(3), block=(4, 4, 1), stream=stream_1)
+        graph = stream_1.end_capture()
+
+        instance = graph.instantiate()
+        instance.launch()
+        drv.memcpy_dtoh_async(result, a_gpu, stream_1)
+        np.testing.assert_allclose(result, np.full((4, 4), 3), rtol=1e-5)
+
+    @mark_cuda_test
+    def test_dynamic_params(self):
+        mod = SourceModule("""
+            __global__ void plus(float *a, int num)
+            {
+            int idx = threadIdx.x + threadIdx.y*4;
+            a[idx] += num;
+            }
+            """)
+        func_plus = mod.get_function("plus")
+
+        stream_1 = drv.Stream()
+        import numpy
+        a = numpy.zeros((4, 4)).astype(numpy.float32)
+        a_gpu = drv.mem_alloc_like(a)
+        result = numpy.zeros_like(a)
+        stream_1.begin_capture()
+        stat , _, x_graph, deps = stream_1.get_capture_info_v2()
+        assert stat == drv.capture_status.ACTIVE, "Capture should be active"
+        assert len(deps) == 0, "Nothing on deps"
+        newnode = x_graph.add_kernel_node(a_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps)
+        stream_1.update_capture_dependencies([newnode], drv.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES)
+        drv.memcpy_dtoh_async(result, a_gpu, stream_1) # Capture a copy as well.
+        graph = stream_1.end_capture()
+        assert graph == x_graph, "Should be the same"
+
+        instance = graph.instantiate()
+
+        stat, _, _, _ = stream_1.get_capture_info_v2()
+        assert stat == drv.capture_status.NONE, "No capture should be active"
+
+        wanna = 0
+        for i in range(4):
+            instance.kernel_node_set_params(a_gpu, numpy.int32(i), block=(4, 4, 1), func=func_plus, kernel_node=newnode)
+            instance.launch()
+            wanna += i
+            np.testing.assert_allclose(result, np.full((4, 4), wanna), rtol=1e-5)
+
+    @mark_cuda_test
+    def test_many_dynamic_params(self):
+        mod = SourceModule("""
+            __global__ void plus(float *a, int num)
+            {
+            int idx = threadIdx.x + threadIdx.y*4;
+            a[idx] += num;
+            }
+            __global__ void times(float *a, float *b)
+            {
+            int idx = threadIdx.x + threadIdx.y*4;
+            a[idx] *= b[idx];
+            }
+            """)
+        func_plus = mod.get_function("plus")
+        func_times = mod.get_function("times")
+
+        stream_1 = drv.Stream()
+        _ , _, _, _ = stream_1.get_capture_info_v2()
+
+        import numpy
+        a = numpy.zeros((4, 4)).astype(numpy.float32)
+        a_gpu = drv.mem_alloc_like(a)
+        b = numpy.zeros((4, 4)).astype(numpy.float32)
+        b_gpu = drv.mem_alloc_like(b)
+        result = numpy.zeros_like(b)
+        _ , _, _, _ = stream_1.get_capture_info_v2()
+        stream_1.begin_capture()
+        stat , _, x_graph, deps = stream_1.get_capture_info_v2()
+        assert stat == drv.capture_status.ACTIVE, "Capture should be active"
+        assert len(deps) == 0, "Nothing on deps"
+        newnode = x_graph.add_kernel_node(a_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps)
+        stream_1.update_capture_dependencies([newnode], drv.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES)
+        _, _, x_graph, deps = stream_1.get_capture_info_v2()
+        assert deps == [newnode], "Call to update_capture_dependencies should set newnode as the only dep"
+        newnode2 = x_graph.add_kernel_node(b_gpu, numpy.int32(3), block=(4, 4, 1), func=func_plus, dependencies=deps)
+        stream_1.update_capture_dependencies([newnode2], drv.update_capture_dependencies_flags.SET_CAPTURE_DEPENDENCIES)
+
+        # Static capture
+        func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1)
+        drv.memcpy_dtoh_async(result, a_gpu, stream_1) # Capture a copy as well.
+        graph = stream_1.end_capture()
+        assert graph == x_graph, "Should be the same"
+
+        instance = graph.instantiate()
+
+        stat, _, _, _ = stream_1.get_capture_info_v2()
+        assert stat == drv.capture_status.NONE, "No capture be active"
+
+        instance.kernel_node_set_params(a_gpu, numpy.int32(4), block=(4, 4, 1), func=func_plus, kernel_node=newnode)
+        instance.kernel_node_set_params(b_gpu, numpy.int32(9), block=(4, 4, 1), func=func_plus, kernel_node=newnode2)
+        instance.launch()
+        np.testing.assert_allclose(result, np.full((4, 4), 4*9), rtol=1e-5)
+
+        a = numpy.zeros((4, 4)).astype(numpy.float32)
+        a_gpu_fake = drv.mem_alloc_like(a)
+        instance.kernel_node_set_params(a_gpu_fake, numpy.int32(5), block=(4, 4, 1), func=func_plus, kernel_node=newnode)
+        instance.kernel_node_set_params(b_gpu, numpy.int32(4), block=(4, 4, 1), func=func_plus, kernel_node=newnode2)
+        instance.launch()
+        np.testing.assert_allclose(result, np.full((4, 4), (4*9)*(9+4)), rtol=1e-5) # b is now (9 + 4), a is 4*9 as it was after func_times, since we write to another buffer this launch.
+
+    @mark_cuda_test
+    def test_graph_create(self):
+        mod = SourceModule("""
+            __global__ void plus(float *a, int num)
+            {
+            int idx = threadIdx.x + threadIdx.y*4;
+            a[idx] += num;
+            }
+            """)
+        func_plus = mod.get_function("plus")
+
+        import numpy
+        a = numpy.zeros((4, 4)).astype(numpy.float32)
+        a_gpu = drv.mem_alloc_like(a)
+        result = numpy.zeros_like(a)
+
+        graph = drv.Graph()
+        node1 = graph.add_kernel_node(a_gpu, numpy.int32(1), block=(4, 4, 1), func=func_plus, dependencies=[])
+        node2 = graph.add_kernel_node(a_gpu, numpy.int32(2), block=(4, 4, 1), func=func_plus, dependencies=[node1])
+
+        instance = graph.instantiate()
+        instance.launch()
+        drv.memcpy_dtoh_async(result, a_gpu)
+        np.testing.assert_allclose(result, np.full((4, 4), 1+2), rtol=1e-5)
+
+if __name__ == "__main__":
+    # make sure that import failures get reported, instead of skipping the tests.
+    import pycuda.autoinit  # noqa
+
+    import sys
+
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+
+        main([__file__])