diff --git a/.gitmodules b/.gitmodules index 5ca5d423..01f38c60 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "bpl-subset"] - path = bpl-subset - url = https://github.com/inducer/bpl-subset [submodule "pycuda/compyte"] path = pycuda/compyte url = https://github.com/inducer/compyte diff --git a/bpl-subset b/bpl-subset deleted file mode 160000 index 3702fb11..00000000 --- a/bpl-subset +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3702fb119804dddde5eaf4a254822b891a947104 diff --git a/pyproject.toml b/pyproject.toml index 2bc8218a..81576809 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ # implementing that C_API_VERSION. requires = [ "setuptools", + "pybind11>=2.5.0", "wheel", "oldest-supported-numpy", ] diff --git a/setup.py b/setup.py index 9adccd87..ba6adae7 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import absolute_import, print_function from os.path import dirname, join, normpath @@ -28,10 +27,8 @@ def get_config_schema(): IncludeDir, LibraryDir, Libraries, - BoostLibraries, Switch, StringListOption, - make_boost_base_options, ) nvcc_path = search_on_path(["nvcc", "nvcc.exe"]) @@ -76,11 +73,7 @@ def get_config_schema(): default_lib_dirs.append("/usr/local/cuda/lib") return ConfigSchema( - make_boost_base_options() - + [ - Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"), - BoostLibraries("python"), - BoostLibraries("thread"), + [ Switch("CUDA_TRACE", False, "Enable CUDA API tracing"), Option( "CUDA_ROOT", default=cuda_root_default, help="Path to the CUDA toolkit" @@ -118,27 +111,26 @@ def main(): get_config, setup, ExtensionUsingNumpy, - set_up_shipped_boost_if_requested, check_git_submodules, - NumpyBuildExtCommand, + check_pybind11, + get_pybind_include, + PybindBuildExtCommand, ) + check_pybind11() check_git_submodules() hack_distutils() conf = get_config(get_config_schema()) - EXTRA_SOURCES, EXTRA_DEFINES = set_up_shipped_boost_if_requested("pycuda", conf) + EXTRA_SOURCES = [] + EXTRA_DEFINES = {} EXTRA_DEFINES["PYGPU_PACKAGE"] = "pycuda" EXTRA_DEFINES["PYGPU_PYCUDA"] = "1" - LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] - LIBRARIES = ( - conf["BOOST_PYTHON_LIBNAME"] - + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"] - ) + LIBRARY_DIRS = conf["CUDADRV_LIB_DIR"] + LIBRARIES = conf["CUDADRV_LIBNAME"] if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] @@ -149,7 +141,7 @@ def main(): if conf["CUDA_PRETEND_VERSION"]: EXTRA_DEFINES["CUDAPP_PRETEND_CUDA_VERSION"] = conf["CUDA_PRETEND_VERSION"] - INCLUDE_DIRS = ["src/cpp"] + conf["BOOST_INC_DIR"] + INCLUDE_DIRS = ["src/cpp", get_pybind_include()] if conf["CUDA_INC_DIR"]: INCLUDE_DIRS += conf["CUDA_INC_DIR"] @@ -186,11 +178,6 @@ def main(): import sys - if sys.version_info >= (3,): - pvt_struct_source = "src/wrapper/_pvt_struct_v3.cpp" - else: - pvt_struct_source = "src/wrapper/_pvt_struct_v2.cpp" - setup( name="pycuda", # metadata @@ -254,12 +241,12 @@ def main(): ), ExtensionUsingNumpy( "_pvt_struct", - [pvt_struct_source], + ["src/wrapper/_pvt_struct_v3.cpp"], extra_compile_args=conf["CXXFLAGS"], extra_link_args=conf["LDFLAGS"], ), ], - cmdclass={"build_ext": NumpyBuildExtCommand}, + cmdclass={"build_ext": PybindBuildExtCommand}, include_package_data=True, package_data={ "pycuda": [ diff --git a/src/cpp/bitlog.cpp b/src/cpp/bitlog.cpp index a09a1168..6f011c6f 100644 --- a/src/cpp/bitlog.cpp +++ b/src/cpp/bitlog.cpp @@ -1,9 
+1,33 @@ -#include +// Base-2 logarithm bithack +// +// Copyright (C) 2009 Andreas Kloeckner +// Copyright (C) Sean Eron Anderson (in the public domain) +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +#include "bitlog.hpp" -/* from http://graphics.stanford.edu/~seander/bithacks.html */ const char pycuda::log_table_8[] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, diff --git a/src/cpp/bitlog.hpp b/src/cpp/bitlog.hpp index 9343051b..1e95917c 100644 --- a/src/cpp/bitlog.hpp +++ b/src/cpp/bitlog.hpp @@ -1,47 +1,79 @@ // Base-2 logarithm bithack. - - - - -#ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_BITLOG_HPP -#define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_BITLOG_HPP - - +// +// Copyright (C) 2009 Andreas Kloeckner +// Copyright (C) Sean Eron Anderson (in the public domain) +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
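// [Editorial sketch, not part of the patch] The bitlog helpers now rely only on
// the <cstdint> fixed-width integer types. A minimal standalone check of the
// intended behaviour (floor(log2(v))), assuming only the pycuda::bitlog2_32
// declaration from bitlog.hpp and linking against bitlog.cpp for the table:
//
//   #include <cassert>
//   #include <cstdint>
//   #include "bitlog.hpp"
//
//   int main()
//   {
//     assert(pycuda::bitlog2_32(1u) == 0);          // 2^0
//     assert(pycuda::bitlog2_32(4096u) == 12);      // 2^12
//     assert(pycuda::bitlog2_32(UINT32_MAX) == 31); // highest set bit of 2^32-1
//     return 0;
//   }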
+ + +#ifndef _AFJDFJSDFSD_PYOPENCL_HEADER_SEEN_BITLOG_HPP +#define _AFJDFJSDFSD_PYOPENCL_HEADER_SEEN_BITLOG_HPP #include -#include +#include -namespace pycuda +namespace pycuda { + /* from http://graphics.stanford.edu/~seander/bithacks.html */ + extern const char log_table_8[]; - inline unsigned bitlog2_16(boost::uint16_t v) + inline unsigned bitlog2_16(uint16_t v) { if (unsigned long t = v >> 8) return 8+log_table_8[t]; - else + else return log_table_8[v]; } - inline unsigned bitlog2_32(boost::uint32_t v) + inline unsigned bitlog2_32(uint32_t v) { - if (boost::uint16_t t = v >> 16) + if (uint16_t t = v >> 16) return 16+bitlog2_16(t); - else - return bitlog2_16(boost::uint16_t(v)); + else + return bitlog2_16(v); } - inline unsigned bitlog2(size_t v) +#if defined(UINT64_MAX) + inline unsigned bitlog2(uint64_t v) + { + if (uint32_t t = v >> 32) + return 32+bitlog2_32(t); + else + return bitlog2_32(v); + } +#else + inline unsigned bitlog2(unsigned long v) { -#if (ULONG_MAX != 4294967295) || defined(_WIN64) - if (boost::uint32_t t = v >> 32) +#if (ULONG_MAX != 4294967295) + if (uint32_t t = v >> 32) return 32+bitlog2_32(t); - else + else +#endif + return bitlog2_32(v); + } #endif - return bitlog2_32(unsigned(v)); - } } diff --git a/src/cpp/cuda.cpp b/src/cpp/cuda.cpp index 00c7072c..e8c404d0 100644 --- a/src/cpp/cuda.cpp +++ b/src/cpp/cuda.cpp @@ -3,4 +3,4 @@ #include "cuda.hpp" -boost::thread_specific_ptr pycuda::context_stack_ptr; +std::thread_specific_ptr pycuda::context_stack_ptr; diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index 18079ab4..d9e2ee37 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -20,29 +20,22 @@ #endif #if CUDAPP_CUDA_VERSION >= 4000 -#include +// FIXME reenable +//#include #endif #ifndef _MSC_VER #include #endif #include -#include -#include +#include #include #include #include #include -#include -#include -#include -#include +#include -#if (BOOST_VERSION/100) < 1035 -#warning ***************************************************************** -#warning **** Your version of Boost C++ is likely too old for PyCUDA. **** -#warning ***************************************************************** -#endif +#include // MAYBE? 
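// [Editorial sketch, not part of the patch] Note that std::thread_specific_ptr
// does not exist; the standard-library counterpart of boost::thread_specific_ptr
// is thread_local storage. One possible shape for the per-thread context stack,
// assuming context_stack keeps its current interface (the accessor name
// get_context_stack is hypothetical, not a PyCUDA symbol):
//
//   // cuda.hpp
//   namespace pycuda {
//     class context_stack;
//     context_stack &get_context_stack();
//   }
//
//   // cuda.cpp
//   pycuda::context_stack &pycuda::get_context_stack()
//   {
//     static thread_local context_stack stack;
//     return stack;
//   }
//
// Call sites that currently reach the stack through context_stack_ptr or
// context_stack::get() would then route through this accessor instead.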
cuMemcpy, cuPointerGetAttribute // TODO: cuCtxSetCurrent, cuCtxGetCurrent @@ -79,7 +72,7 @@ typedef Py_ssize_t PYCUDA_BUFFER_SIZE_T; CUstream s_handle; \ if (stream_py.ptr() != Py_None) \ { \ - const stream &s = py::extract(stream_py); \ + const stream &s = py::cast(stream_py); \ s_handle = s.handle(); \ } \ else \ @@ -180,7 +173,20 @@ typedef Py_ssize_t PYCUDA_BUFFER_SIZE_T; namespace pycuda { - namespace py = boost::python; + namespace py = pybind11; + + + // https://stackoverflow.com/a/44175911 + class noncopyable { + public: + noncopyable() = default; + ~noncopyable() = default; + + private: + noncopyable(const noncopyable&) = delete; + noncopyable& operator=(const noncopyable&) = delete; + }; + typedef #if CUDAPP_CUDA_VERSION >= 3020 @@ -367,7 +373,7 @@ namespace pycuda // {{{ buffer interface helper - class py_buffer_wrapper : public boost::noncopyable + class py_buffer_wrapper : public noncopyable { private: bool m_initialized; @@ -482,9 +488,9 @@ namespace pycuda return m_device; } - boost::shared_ptr make_context(unsigned int flags); + std::shared_ptr make_context(unsigned int flags); #if CUDAPP_CUDA_VERSION >= 7000 - boost::shared_ptr retain_primary_context(); + std::shared_ptr retain_primary_context(); #endif CUdevice handle() const @@ -537,12 +543,12 @@ namespace pycuda // for friend decl namespace gl { - boost::shared_ptr + std::shared_ptr make_gl_context(device const &dev, unsigned int flags); } class context_stack; - extern boost::thread_specific_ptr context_stack_ptr; + extern std::thread_specific_ptr context_stack_ptr; class context_stack { @@ -552,7 +558,7 @@ namespace pycuda * to be destroyed. */ private: - typedef std::stack > stack_t; + typedef std::stack > stack_t; typedef stack_t::value_type value_type;; stack_t m_stack; @@ -587,18 +593,18 @@ namespace pycuda } }; - class context : boost::noncopyable + class context : noncopyable { protected: CUcontext m_context; bool m_valid; unsigned m_use_count; - boost::thread::id m_thread; + std::thread::id m_thread; public: context(CUcontext ctx) : m_context(ctx), m_valid(true), m_use_count(1), - m_thread(boost::this_thread::get_id()) + m_thread(std::this_thread::get_id()) { } virtual ~context() @@ -634,7 +640,7 @@ namespace pycuda return hash_type(m_context) ^ hash_type(this); } - boost::thread::id thread_id() const + std::thread::id thread_id() const { return m_thread; } bool is_valid() const @@ -642,11 +648,11 @@ namespace pycuda return m_valid; } - static boost::shared_ptr attach(unsigned int flags) + static std::shared_ptr attach(unsigned int flags) { CUcontext current; CUDAPP_CALL_GUARDED(cuCtxAttach, (¤t, flags)); - boost::shared_ptr result(new context(current)); + std::shared_ptr result(new context(current)); context_stack::get().push(result); return result; } @@ -669,7 +675,7 @@ namespace pycuda } else { - if (m_thread == boost::this_thread::get_id()) + if (m_thread == std::this_thread::get_id()) { CUDAPP_CALL_GUARDED_CLEANUP(cuCtxPushCurrent, (m_context)); detach_internal(); @@ -689,7 +695,7 @@ namespace pycuda if (active_before_destruction) { - boost::shared_ptr new_active = current_context(this); + std::shared_ptr new_active = current_context(this); if (new_active.get()) { CUDAPP_CALL_GUARDED(cuCtxPushCurrent, (new_active->m_context)); @@ -730,7 +736,7 @@ namespace pycuda "cannot pop non-current context"); } - boost::shared_ptr current = current_context(); + std::shared_ptr current = current_context(); if (current) --current->m_use_count; @@ -747,14 +753,14 @@ namespace pycuda static void synchronize() { 
CUDAPP_CALL_GUARDED_THREADED(cuCtxSynchronize, ()); } - static boost::shared_ptr current_context(context *except=0) + static std::shared_ptr current_context(context *except=0) { while (true) { if (context_stack::get().empty()) - return boost::shared_ptr(); + return std::shared_ptr(); - boost::shared_ptr result(context_stack::get().top()); + std::shared_ptr result(context_stack::get().top()); if (result.get() != except && result->is_valid()) { @@ -829,8 +835,8 @@ namespace pycuda #endif friend class device; - friend void context_push(boost::shared_ptr ctx); - friend boost::shared_ptr + friend void context_push(std::shared_ptr ctx); + friend std::shared_ptr gl::make_gl_context(device const &dev, unsigned int flags); friend class primary_context; }; @@ -854,24 +860,24 @@ namespace pycuda }; inline - boost::shared_ptr device::make_context(unsigned int flags) + std::shared_ptr device::make_context(unsigned int flags) { context::prepare_context_switch(); CUcontext ctx; CUDAPP_CALL_GUARDED_THREADED(cuCtxCreate, (&ctx, flags, m_device)); - boost::shared_ptr result(new context(ctx)); + std::shared_ptr result(new context(ctx)); context_stack::get().push(result); return result; } #if CUDAPP_CUDA_VERSION >= 7000 - inline boost::shared_ptr device::retain_primary_context() + inline std::shared_ptr device::retain_primary_context() { CUcontext ctx; CUDAPP_CALL_GUARDED(cuDevicePrimaryCtxRetain, (&ctx, m_device)); - boost::shared_ptr result(new primary_context(ctx, m_device)); + std::shared_ptr result(new primary_context(ctx, m_device)); return result; } #endif @@ -879,7 +885,7 @@ namespace pycuda #if CUDAPP_CUDA_VERSION >= 2000 inline - void context_push(boost::shared_ptr ctx) + void context_push(std::shared_ptr ctx) { context::prepare_context_switch(); @@ -916,7 +922,7 @@ namespace pycuda class explicit_context_dependent { private: - boost::shared_ptr m_ward_context; + std::shared_ptr m_ward_context; public: void acquire_context() @@ -933,7 +939,7 @@ namespace pycuda m_ward_context.reset(); } - boost::shared_ptr get_context() + std::shared_ptr get_context() { return m_ward_context; } @@ -942,7 +948,7 @@ namespace pycuda class context_dependent : public explicit_context_dependent { private: - boost::shared_ptr m_ward_context; + std::shared_ptr m_ward_context; public: context_dependent() @@ -953,11 +959,11 @@ namespace pycuda class scoped_context_activation { private: - boost::shared_ptr m_context; + std::shared_ptr m_context; bool m_did_switch; public: - scoped_context_activation(boost::shared_ptr ctx) + scoped_context_activation(std::shared_ptr ctx) : m_context(ctx) { if (!m_context->is_valid()) @@ -967,7 +973,7 @@ namespace pycuda m_did_switch = context::current_context() != m_context; if (m_did_switch) { - if (boost::this_thread::get_id() != m_context->thread_id()) + if (std::this_thread::get_id() != m_context->thread_id()) throw pycuda::cannot_activate_out_of_thread_context( "cannot activate out-of-thread context"); #if CUDAPP_CUDA_VERSION >= 2000 @@ -994,7 +1000,7 @@ namespace pycuda // {{{ stream class event; - class stream : public boost::noncopyable, public context_dependent + class stream : public noncopyable, public context_dependent { private: CUstream m_stream; @@ -1046,7 +1052,7 @@ namespace pycuda // }}} // {{{ array - class array : public boost::noncopyable, public context_dependent + class array : public noncopyable, public context_dependent { private: CUarray m_array; @@ -1114,15 +1120,15 @@ namespace pycuda // {{{ texture reference class module; - class texture_reference : public 
boost::noncopyable + class texture_reference : public noncopyable { private: CUtexref m_texref; bool m_managed; // life support for array and module - boost::shared_ptr m_array; - boost::shared_ptr m_module; + std::shared_ptr m_array; + std::shared_ptr m_module; public: texture_reference() @@ -1141,13 +1147,13 @@ namespace pycuda } } - void set_module(boost::shared_ptr mod) + void set_module(std::shared_ptr mod) { m_module = mod; } CUtexref handle() const { return m_texref; } - void set_array(boost::shared_ptr ary) + void set_array(std::shared_ptr ary) { CUDAPP_CALL_GUARDED(cuTexRefSetArray, (m_texref, ary->handle(), CU_TRSA_OVERRIDE_FORMAT)); @@ -1236,27 +1242,27 @@ namespace pycuda #if CUDAPP_CUDA_VERSION >= 3010 class module; - class surface_reference : public boost::noncopyable + class surface_reference : public noncopyable { private: CUsurfref m_surfref; // life support for array and module - boost::shared_ptr m_array; - boost::shared_ptr m_module; + std::shared_ptr m_array; + std::shared_ptr m_module; public: surface_reference(CUsurfref sr) : m_surfref(sr) { } - void set_module(boost::shared_ptr mod) + void set_module(std::shared_ptr mod) { m_module = mod; } CUsurfref handle() const { return m_surfref; } - void set_array(boost::shared_ptr ary, unsigned int flags) + void set_array(std::shared_ptr ary, unsigned int flags) { CUDAPP_CALL_GUARDED(cuSurfRefSetArray, (m_surfref, ary->handle(), flags)); m_array = ary; @@ -1276,7 +1282,7 @@ namespace pycuda // {{{ module class function; - class module : public boost::noncopyable, public context_dependent + class module : public noncopyable, public context_dependent { private: CUmodule m_module; @@ -1319,7 +1325,7 @@ namespace pycuda inline texture_reference *module_get_texref( - boost::shared_ptr mod, const char *name) + std::shared_ptr mod, const char *name) { CUtexref tr; CUDAPP_CALL_GUARDED(cuModuleGetTexRef, (&tr, mod->handle(), name)); @@ -1332,7 +1338,7 @@ namespace pycuda #if CUDAPP_CUDA_VERSION >= 3010 inline surface_reference *module_get_surfref( - boost::shared_ptr mod, const char *name) + std::shared_ptr mod, const char *name) { CUsurfref sr; CUDAPP_CALL_GUARDED(cuModuleGetSurfRef, (&sr, mod->handle(), name)); @@ -1460,7 +1466,7 @@ namespace pycuda "too many grid dimensions in kernel launch"); for (unsigned i = 0; i < gd_length; ++i) - grid_dim[i] = py::extract(grid_dim_py[i]); + grid_dim[i] = py::cast(grid_dim_py[i]); pycuda_size_t bd_length = py::len(block_dim_py); if (bd_length > axis_count) @@ -1468,7 +1474,7 @@ namespace pycuda "too many block dimensions in kernel launch"); for (unsigned i = 0; i < bd_length; ++i) - block_dim[i] = py::extract(block_dim_py[i]); + block_dim[i] = py::cast(block_dim_py[i]); PYCUDA_PARSE_STREAM_PY; @@ -1561,7 +1567,7 @@ namespace pycuda } }; - class device_allocation : public boost::noncopyable, public context_dependent + class device_allocation : public noncopyable, public context_dependent { private: bool m_valid; @@ -1655,7 +1661,7 @@ namespace pycuda // {{{ ipc_mem_handle #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000 - class ipc_mem_handle : public boost::noncopyable, public context_dependent + class ipc_mem_handle : public noncopyable, public context_dependent { private: bool m_valid; @@ -1928,7 +1934,7 @@ namespace pycuda - struct host_pointer : public boost::noncopyable, public context_dependent + struct host_pointer : public noncopyable, public context_dependent { protected: bool m_valid; @@ -2112,7 +2118,7 @@ namespace pycuda // }}} // {{{ event - class event : public 
boost::noncopyable, public context_dependent + class event : public noncopyable, public context_dependent { private: CUevent m_event; diff --git a/src/cpp/cuda_gl.hpp b/src/cpp/cuda_gl.hpp index 04b7dd86..b7722783 100644 --- a/src/cpp/cuda_gl.hpp +++ b/src/cpp/cuda_gl.hpp @@ -8,7 +8,7 @@ #include #else /* __APPLE__ */ #include -#endif +#endif #include @@ -31,11 +31,11 @@ namespace pycuda { namespace gl { inline - boost::shared_ptr make_gl_context(device const &dev, unsigned int flags) + std::shared_ptr make_gl_context(device const &dev, unsigned int flags) { CUcontext ctx; CUDAPP_CALL_GUARDED(cuGLCtxCreate, (&ctx, flags, dev.handle())); - boost::shared_ptr result(new context(ctx)); + std::shared_ptr result(new context(ctx)); context_stack::get().push(result); return result; } @@ -91,18 +91,18 @@ namespace pycuda { namespace gl { class buffer_object_mapping : public context_dependent { private: - boost::shared_ptr m_buffer_object; + std::shared_ptr m_buffer_object; CUdeviceptr m_devptr; size_t m_size; bool m_valid; public: buffer_object_mapping( - boost::shared_ptr bobj, + std::shared_ptr bobj, CUdeviceptr devptr, size_t size) : m_buffer_object(bobj), m_devptr(devptr), m_size(size), m_valid(true) - { + { PyErr_Warn( PyExc_DeprecationWarning, "buffer_object_mapping has been deprecated since CUDA 3.0 " @@ -142,7 +142,7 @@ namespace pycuda { namespace gl { inline buffer_object_mapping *map_buffer_object( - boost::shared_ptr bobj) + std::shared_ptr bobj) { CUdeviceptr devptr; pycuda_size_t size; @@ -199,7 +199,7 @@ namespace pycuda { namespace gl { CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(registered_object); } else - throw pycuda::error("registered_object::unregister", + throw pycuda::error("registered_object::unregister", CUDA_ERROR_INVALID_HANDLE); } }; @@ -207,11 +207,11 @@ namespace pycuda { namespace gl { class registered_buffer : public registered_object { public: - registered_buffer(GLuint gl_handle, + registered_buffer(GLuint gl_handle, CUgraphicsMapResourceFlags flags=CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE) : registered_object(gl_handle) { - CUDAPP_CALL_GUARDED(cuGraphicsGLRegisterBuffer, + CUDAPP_CALL_GUARDED(cuGraphicsGLRegisterBuffer, (&m_resource, gl_handle, flags)); } }; @@ -219,11 +219,11 @@ namespace pycuda { namespace gl { class registered_image : public registered_object { public: - registered_image(GLuint gl_handle, GLenum target, + registered_image(GLuint gl_handle, GLenum target, CUgraphicsMapResourceFlags flags=CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE) : registered_object(gl_handle) { - CUDAPP_CALL_GUARDED(cuGraphicsGLRegisterImage, + CUDAPP_CALL_GUARDED(cuGraphicsGLRegisterImage, (&m_resource, gl_handle, target, flags)); } }; @@ -233,14 +233,14 @@ namespace pycuda { namespace gl { class registered_mapping : public context_dependent { private: - boost::shared_ptr m_object; - boost::shared_ptr m_stream; + std::shared_ptr m_object; + std::shared_ptr m_stream; bool m_valid; public: registered_mapping( - boost::shared_ptr robj, - boost::shared_ptr strm) + std::shared_ptr robj, + std::shared_ptr strm) : m_object(robj), m_stream(strm), m_valid(true) { } @@ -255,7 +255,7 @@ namespace pycuda { namespace gl { unmap(m_stream); } - void unmap(boost::shared_ptr const &strm) + void unmap(std::shared_ptr const &strm) { CUstream s_handle; if (!strm.get()) @@ -283,7 +283,7 @@ namespace pycuda { namespace gl { { CUdeviceptr devptr; pycuda_size_t size; - CUDAPP_CALL_GUARDED(cuGraphicsResourceGetMappedPointer, + CUDAPP_CALL_GUARDED(cuGraphicsResourceGetMappedPointer, (&devptr, &size, 
m_object->resource())); return py::make_tuple(devptr, size); } @@ -292,7 +292,7 @@ namespace pycuda { namespace gl { pycuda::array *array(unsigned int index, unsigned int level) const { CUarray devptr; - CUDAPP_CALL_GUARDED(cuGraphicsSubResourceGetMappedArray, + CUDAPP_CALL_GUARDED(cuGraphicsSubResourceGetMappedArray, (&devptr, m_object->resource(), index, level)); std::unique_ptr result( new pycuda::array(devptr, false)); @@ -304,11 +304,11 @@ namespace pycuda { namespace gl { inline registered_mapping *map_registered_object( - boost::shared_ptr const &robj, + std::shared_ptr const &robj, py::object strm_py) { CUstream s_handle; - boost::shared_ptr strm_sptr; + std::shared_ptr strm_sptr; if (strm_py.ptr() == Py_None) { @@ -316,7 +316,7 @@ namespace pycuda { namespace gl { } else { - strm_sptr = py::extract >(strm_py); + strm_sptr = py::extract >(strm_py); s_handle = strm_sptr->handle(); } diff --git a/src/cpp/mempool.hpp b/src/cpp/mempool.hpp index 44f0fd64..72f20569 100644 --- a/src/cpp/mempool.hpp +++ b/src/cpp/mempool.hpp @@ -50,12 +50,6 @@ namespace PYGPU_PACKAGE mp_noncopyable& operator=(const mp_noncopyable&) = delete; }; -#ifdef PYGPU_PYCUDA -#define PYGPU_SHARED_PTR boost::shared_ptr -#else -#define PYGPU_SHARED_PTR std::shared_ptr -#endif - template inline T signed_left_shift(T x, signed shift_amount) { @@ -400,14 +394,14 @@ namespace PYGPU_PACKAGE typedef typename Pool::size_type size_type; private: - PYGPU_SHARED_PTR m_pool; + std::shared_ptr m_pool; pointer_type m_ptr; size_type m_size; bool m_valid; public: - pooled_allocation(PYGPU_SHARED_PTR p, size_type size) + pooled_allocation(std::shared_ptr p, size_type size) : m_pool(p), m_ptr(p->allocate(size)), m_size(size), m_valid(true) { } diff --git a/src/wrapper/mempool.cpp b/src/wrapper/mempool.cpp index d889d516..155e51b3 100644 --- a/src/wrapper/mempool.cpp +++ b/src/wrapper/mempool.cpp @@ -6,12 +6,11 @@ #include "wrap_helpers.hpp" #include #include -#include -namespace py = boost::python; +namespace py = pybind11; @@ -128,7 +127,7 @@ namespace public: pooled_device_allocation( - boost::shared_ptr p, super::size_type s) + std::shared_ptr p, super::size_type s) : super(p, s) { } @@ -140,7 +139,7 @@ namespace pooled_device_allocation *device_pool_allocate( - boost::shared_ptr > pool, + std::shared_ptr > pool, context_dependent_memory_pool::size_type sz) { return new pooled_device_allocation(pool, sz); @@ -170,7 +169,7 @@ namespace public: pooled_host_allocation( - boost::shared_ptr p, super::size_type s) + std::shared_ptr p, super::size_type s) : super(p, s) { } }; @@ -179,7 +178,7 @@ namespace py::handle<> host_pool_allocate( - boost::shared_ptr > pool, + std::shared_ptr > pool, py::object shape, py::object dtype, py::object order_py) { PyArray_Descr *tp_descr; @@ -249,8 +248,8 @@ void pycuda_expose_tools() typedef context_dependent_memory_pool cl; py::class_< - cl, boost::noncopyable, - boost::shared_ptr > wrapper("DeviceMemoryPool"); + cl, noncopyable, + std::shared_ptr > wrapper("DeviceMemoryPool"); wrapper .def("allocate", device_pool_allocate, py::return_value_policy()) @@ -269,8 +268,8 @@ void pycuda_expose_tools() typedef pycuda::memory_pool cl; py::class_< - cl, boost::noncopyable, - boost::shared_ptr > wrapper( + cl, noncopyable, + std::shared_ptr > wrapper( "PageLockedMemoryPool", py::init >() ); @@ -284,7 +283,7 @@ void pycuda_expose_tools() { typedef pooled_device_allocation cl; - py::class_( + py::class_( "PooledDeviceAllocation", py::no_init) .DEF_SIMPLE_METHOD(free) .def("__int__", &cl::ptr) @@ -298,7 +297,7 @@ 
void pycuda_expose_tools() { typedef pooled_host_allocation cl; - py::class_( + py::class_( "PooledHostAllocation", py::no_init) .DEF_SIMPLE_METHOD(free) .def("__len__", &cl::size) diff --git a/src/wrapper/tools.hpp b/src/wrapper/tools.hpp index 98a7d8c3..ff277c31 100644 --- a/src/wrapper/tools.hpp +++ b/src/wrapper/tools.hpp @@ -5,7 +5,7 @@ #include -#include +#include #include #include @@ -28,7 +28,7 @@ namespace pycuda inline void run_python_gc() { - namespace py = boost::python; + namespace py = pybind11; py::object gc_mod( py::handle<>( @@ -46,7 +46,7 @@ namespace pycuda return pycuda::mem_alloc(bytes); } catch (pycuda::error &e) - { + { if (e.code() != CUDA_ERROR_OUT_OF_MEMORY) throw; } diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 01f09189..e1e97393 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -8,7 +8,6 @@ #include "tools.hpp" #include "wrap_helpers.hpp" -#include @@ -21,7 +20,6 @@ using namespace pycuda; -using boost::shared_ptr; @@ -291,14 +289,14 @@ namespace py::object dest_context_py, py::object src_context_py ) { - boost::shared_ptr dest_context = context::current_context(); - boost::shared_ptr src_context = dest_context; + std::shared_ptr dest_context = context::current_context(); + std::shared_ptr src_context = dest_context; if (dest_context_py.ptr() == Py_None) - dest_context = py::extract >(dest_context_py); + dest_context = py::extract >(dest_context_py); if (src_context_py.ptr() == Py_None) - src_context = py::extract >(src_context_py); + src_context = py::extract >(src_context_py); CUDAPP_CALL_GUARDED_THREADED(cuMemcpyPeer, ( dest, dest_context->handle(), @@ -311,14 +309,14 @@ namespace py::object dest_context_py, py::object src_context_py, py::object stream_py) { - boost::shared_ptr dest_context = context::current_context(); - boost::shared_ptr src_context = dest_context; + std::shared_ptr dest_context = context::current_context(); + std::shared_ptr src_context = dest_context; if (dest_context_py.ptr() == Py_None) - dest_context = py::extract >(dest_context_py); + dest_context = py::extract >(dest_context_py); if (src_context_py.ptr() == Py_None) - src_context = py::extract >(src_context_py); + src_context = py::extract >(src_context_py); PYCUDA_PARSE_STREAM_PY @@ -415,7 +413,7 @@ namespace // {{{ linker #if CUDAPP_CUDA_VERSION >= 5050 - class Linker : public boost::noncopyable + class Linker : public noncopyable { private: py::object m_message_handler; @@ -670,9 +668,9 @@ BOOST_PYTHON_MODULE(_driver) if (!import_numpy_helper()) throw py::error_already_set(); - py::def("get_version", cuda_version); + m.def("get_version", cuda_version); #if CUDAPP_CUDA_VERSION >= 2020 - py::def("get_driver_version", pycuda::get_driver_version); + m.def("get_driver_version", pycuda::get_driver_version); #endif // {{{ exceptions @@ -750,7 +748,7 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 3000 { - py::class_ cls("array3d_flags", py::no_init); + py::class_ cls(m, "array3d_flags", py::no_init); // deprecated cls.attr("ARRAY3D_2DARRAY") = CUDA_ARRAY3D_2DARRAY; #if CUDAPP_CUDA_VERSION >= 4000 @@ -1046,7 +1044,7 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 2020 { - py::class_ cls("host_alloc_flags", py::no_init); + py::class_ cls(m, "host_alloc_flags", py::no_init); cls.attr("PORTABLE") = CU_MEMHOSTALLOC_PORTABLE; cls.attr("DEVICEMAP") = CU_MEMHOSTALLOC_DEVICEMAP; cls.attr("WRITECOMBINED") = CU_MEMHOSTALLOC_WRITECOMBINED; @@ -1055,7 +1053,7 @@ #if CUDAPP_CUDA_VERSION >=
4000 { - py::class_ cls("mem_host_register_flags", py::no_init); + py::class_ cls(m, "mem_host_register_flags", py::no_init); cls.attr("PORTABLE") = CU_MEMHOSTREGISTER_PORTABLE; cls.attr("DEVICEMAP") = CU_MEMHOSTREGISTER_DEVICEMAP; } @@ -1109,13 +1107,13 @@ BOOST_PYTHON_MODULE(_driver) // }}} - py::def("init", init, + m.def("init", init, py::arg("flags")=0); // {{{ device { typedef device cl; - py::class_("Device", py::no_init) + py::class_(m, "Device", py::no_init) .def("__init__", py::make_constructor(make_device)) #if CUDAPP_CUDA_VERSION >= 4010 .def("__init__", py::make_constructor(make_device_from_pci_bus_id)) @@ -1133,13 +1131,13 @@ BOOST_PYTHON_MODULE(_driver) .def(py::self != py::self) .def("__hash__", &cl::hash) .def("make_context", &cl::make_context, - (py::args("self"), py::args("flags")=0)) + (py::arg("self"), py::arg("flags")=0)) #if CUDAPP_CUDA_VERSION >= 4000 .DEF_SIMPLE_METHOD(can_access_peer) #endif #if CUDAPP_CUDA_VERSION >= 7000 .def("retain_primary_context", &cl::retain_primary_context, - (py::args("self"))) + (py::arg("self"))) #endif ; } @@ -1148,7 +1146,7 @@ BOOST_PYTHON_MODULE(_driver) // {{{ context { typedef context cl; - py::class_, boost::noncopyable >("Context", py::no_init) + py::class_>(m, "Context", py::no_init) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cl::hash) @@ -1168,7 +1166,7 @@ BOOST_PYTHON_MODULE(_driver) .DEF_SIMPLE_METHOD(synchronize) .staticmethod("synchronize") - .def("get_current", (boost::shared_ptr (*)()) &cl::current_context) + .def("get_current", (std::shared_ptr (*)()) &cl::current_context) .staticmethod("get_current") #if CUDAPP_CUDA_VERSION >= 3010 @@ -1205,8 +1203,8 @@ BOOST_PYTHON_MODULE(_driver) // {{{ stream { typedef stream cl; - py::class_ > - ("Stream", py::init(py::arg("flags")=0)) + py::class_ > + (m, "Stream", py::init(py::arg("flags")=0)) .DEF_SIMPLE_METHOD(synchronize) .DEF_SIMPLE_METHOD(is_done) #if CUDAPP_CUDA_VERSION >= 3020 @@ -1220,24 +1218,24 @@ BOOST_PYTHON_MODULE(_driver) // {{{ module { typedef module cl; - py::class_ >("Module", py::no_init) - .def("get_function", &cl::get_function, (py::args("self", "name")), + py::class_ >(m, "Module", py::no_init) + .def("get_function", &cl::get_function, py::arg("self"), py::arg("name"), py::with_custodian_and_ward_postcall<0, 1>()) - .def("get_global", &cl::get_global, (py::args("self", "name"))) + .def("get_global", &cl::get_global, py::arg("self"), py::arg("name"))) .def("get_texref", module_get_texref, - (py::args("self", "name")), + (py::arg("self"), py::arg("name")), py::return_value_policy()) #if CUDAPP_CUDA_VERSION >= 3010 .def("get_surfref", module_get_surfref, - (py::args("self", "name")), + (py::arg("self"), py::arg("name")), py::return_value_policy()) #endif ; } - py::def("module_from_file", module_from_file, (py::arg("filename")), + m.def("module_from_file", module_from_file, (py::arg("filename")), py::return_value_policy()); - py::def("module_from_buffer", module_from_buffer, + m.def("module_from_buffer", module_from_buffer, (py::arg("buffer"), py::arg("options")=py::list(), py::arg("message_handler")=py::object()), @@ -1255,7 +1253,7 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); - py::class_ >("Linker") + py::class_ >(m, "Linker") .def(py::init()) .def(py::init()) .def(py::init()) @@ -1269,7 +1267,7 @@ BOOST_PYTHON_MODULE(_driver) // {{{ function { typedef function cl; - py::class_("Function", py::no_init) + py::class_(m, "Function", py::no_init) .def("_set_block_shape",
&cl::set_block_shape) .def("_set_shared_size", &cl::set_shared_size) .def("_param_set_size", &cl::param_set_size) @@ -1281,9 +1279,9 @@ BOOST_PYTHON_MODULE(_driver) .def("_launch", &cl::launch) .def("_launch_grid", &cl::launch_grid, - py::args("grid_width", "grid_height")) + py::arg("grid_width"), py::arg("grid_height")) .def("_launch_grid_async", &cl::launch_grid_async, - py::args("grid_width", "grid_height", "s")) + py::arg("grid_width"), py::arg("grid_height"), py::arg("s")) #if CUDAPP_CUDA_VERSION >= 2020 .DEF_SIMPLE_METHOD(get_attribute) @@ -1306,8 +1304,7 @@ BOOST_PYTHON_MODULE(_driver) { typedef pointer_holder_base cl; - py::class_( - "PointerHolderBase") + py::class_(m, "PointerHolderBase") .def("get_pointer", py::pure_virtual(&cl::get_pointer)) .def("as_buffer", &cl::as_buffer, (py::arg("size"), py::arg("offset")=0)) @@ -1321,7 +1318,7 @@ BOOST_PYTHON_MODULE(_driver) { typedef device_allocation cl; - py::class_("DeviceAllocation", py::no_init) + py::class_(m, "DeviceAllocation", py::no_init) .def("__int__", &cl::operator CUdeviceptr) .def("__long__", mem_obj_to_long) .def("__index__", mem_obj_to_long) @@ -1336,7 +1333,7 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000 { typedef ipc_mem_handle cl; - py::class_("IPCMemoryHandle", + py::class_(m, "IPCMemoryHandle", py::init >()) .def("__int__", &cl::operator CUdeviceptr) .def("__long__", mem_obj_to_long) @@ -1356,7 +1353,7 @@ BOOST_PYTHON_MODULE(_driver) { typedef host_pointer cl; - py::class_("HostPointer", py::no_init) + py::class_(m, "HostPointer", py::no_init) #if CUDAPP_CUDA_VERSION >= 2020 .DEF_SIMPLE_METHOD(get_device_pointer) #endif @@ -1365,8 +1362,8 @@ BOOST_PYTHON_MODULE(_driver) { typedef pagelocked_host_allocation cl; - py::class_ > wrp( - "PagelockedHostAllocation", py::no_init); + py::class_ > wrp( + m, "PagelockedHostAllocation", py::no_init); wrp .DEF_SIMPLE_METHOD(free) @@ -1380,8 +1377,8 @@ BOOST_PYTHON_MODULE(_driver) { typedef aligned_host_allocation cl; - py::class_ > wrp( - "AlignedHostAllocation", py::no_init); + py::class_ > wrp( + m, "AlignedHostAllocation", py::no_init); wrp .DEF_SIMPLE_METHOD(free) @@ -1391,8 +1388,8 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 6000 { typedef managed_allocation cl; - py::class_ > wrp( - "ManagedAllocation", py::no_init); + py::class_ > wrp( + m, "ManagedAllocation", py::no_init); wrp .DEF_SIMPLE_METHOD(get_device_pointer) @@ -1405,108 +1402,108 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 4000 { typedef registered_host_memory cl; - py::class_ >( - "RegisteredHostMemory", py::no_init) + py::class_ >( + m, "RegisteredHostMemory", py::no_init) .def("unregister", &cl::free) ; } #endif - py::def("pagelocked_empty", numpy_empty, + m.def("pagelocked_empty", numpy_empty, (py::arg("shape"), py::arg("dtype"), py::arg("order")="C", py::arg("mem_flags")=0)); - py::def("aligned_empty", numpy_empty, + m.def("aligned_empty", numpy_empty, (py::arg("shape"), py::arg("dtype"), py::arg("order")="C", py::arg("alignment")=4096)); #if CUDAPP_CUDA_VERSION >= 6000 - py::def("managed_empty", numpy_empty, + m.def("managed_empty", numpy_empty, (py::arg("shape"), py::arg("dtype"), py::arg("order")="C", py::arg("mem_flags")=0)); #endif #if CUDAPP_CUDA_VERSION >= 4000 - py::def("register_host_memory", register_host_memory, + m.def("register_host_memory", register_host_memory, (py::arg("ary"), py::arg("flags")=0)); #endif // }}} DEF_SIMPLE_FUNCTION(mem_get_info); - py::def("mem_alloc", mem_alloc_wrap, + m.def("mem_alloc", 
mem_alloc_wrap, py::return_value_policy()); - py::def("mem_alloc_pitch", mem_alloc_pitch_wrap, - py::args("width", "height", "access_size")); + m.def("mem_alloc_pitch", mem_alloc_pitch_wrap, + py::arg("width"), py::arg("height"), py::arg("access_size")); DEF_SIMPLE_FUNCTION(mem_get_address_range); // {{{ memset/memcpy - py::def("memset_d8", py_memset_d8, py::args("dest", "data", "size")); - py::def("memset_d16", py_memset_d16, py::args("dest", "data", "size")); - py::def("memset_d32", py_memset_d32, py::args("dest", "data", "size")); - - py::def("memset_d2d8", py_memset_d2d8, - py::args("dest", "pitch", "data", "width", "height")); - py::def("memset_d2d16", py_memset_d2d16, - py::args("dest", "pitch", "data", "width", "height")); - py::def("memset_d2d32", py_memset_d2d32, - py::args("dest", "pitch", "data", "width", "height")); - - py::def("memset_d8_async", py_memset_d8_async, - (py::args("dest", "data", "size"), py::arg("stream")=py::object())); - py::def("memset_d16_async", py_memset_d16_async, - (py::args("dest", "data", "size"), py::arg("stream")=py::object())); - py::def("memset_d32_async", py_memset_d32_async, - (py::args("dest", "data", "size"), py::arg("stream")=py::object())); - - py::def("memset_d2d8_async", py_memset_d2d8_async, - (py::args("dest", "pitch", "data", "width", "height"), + m.def("memset_d8", py_memset_d8, py::arg("dest"), py::arg("data"), py::arg("size")); + m.def("memset_d16", py_memset_d16, py::arg("dest"), py::arg("data"), py::arg("size")); + m.def("memset_d32", py_memset_d32, py::arg("dest"), py::arg("data"), py::arg("size")); + + m.def("memset_d2d8", py_memset_d2d8, + py::arg("dest"), py::arg("pitch"), py::arg("data"), py::arg("width"), py::arg("height")); + m.def("memset_d2d16", py_memset_d2d16, + py::arg("dest"), py::arg("pitch"), py::arg("data"), py::arg("width"), py::arg("height")); + m.def("memset_d2d32", py_memset_d2d32, + py::arg("dest"), py::arg("pitch"), py::arg("data"), py::arg("width"), py::arg("height")); + + m.def("memset_d8_async", py_memset_d8_async, + (py::arg("dest"), py::arg("data"), py::arg("size"), py::arg("stream")=py::object())); + m.def("memset_d16_async", py_memset_d16_async, + (py::arg("dest"), py::arg("data"), py::arg("size"), py::arg("stream")=py::object())); + m.def("memset_d32_async", py_memset_d32_async, + (py::arg("dest"), py::arg("data"), py::arg("size"), py::arg("stream")=py::object())); + + m.def("memset_d2d8_async", py_memset_d2d8_async, + (py::arg("dest"), py::arg("pitch"), py::arg("data"), py::arg("width"), py::arg("height"), py::arg("stream")=py::object())); - py::def("memset_d2d16_async", py_memset_d2d16_async, - (py::args("dest", "pitch", "data", "width", "height"), + m.def("memset_d2d16_async", py_memset_d2d16_async, + (py::arg("dest"), py::arg("pitch"), py::arg("data"), py::arg("width"), py::arg("height"), py::arg("stream")=py::object())); - py::def("memset_d2d32_async", py_memset_d2d32_async, - (py::args("dest", "pitch", "data", "width", "height"), + m.def("memset_d2d32_async", py_memset_d2d32_async, + (py::arg("dest"), py::arg("pitch"), py::arg("data"), py::arg("width"), py::arg("height"), py::arg("stream")=py::object())); - py::def("memcpy_htod", py_memcpy_htod, - (py::args("dest"), py::arg("src"))); - py::def("memcpy_htod_async", py_memcpy_htod_async, - (py::args("dest"), py::arg("src"), py::arg("stream")=py::object())); - py::def("memcpy_dtoh", py_memcpy_dtoh, - (py::args("dest"), py::arg("src"))); - py::def("memcpy_dtoh_async", py_memcpy_dtoh_async, - (py::args("dest"), py::arg("src"), py::arg("stream")=py::object())); +
m.def("memcpy_htod", py_memcpy_htod, + (py::arg("dest"), py::arg("src"))); + m.def("memcpy_htod_async", py_memcpy_htod_async, + (py::arg("dest"), py::arg("src"), py::arg("stream")=py::object())); + m.def("memcpy_dtoh", py_memcpy_dtoh, + (py::arg("dest"), py::arg("src"))); + m.def("memcpy_dtoh_async", py_memcpy_dtoh_async, + (py::arg("dest"), py::arg("src"), py::arg("stream")=py::object())); - py::def("memcpy_dtod", py_memcpy_dtod, py::args("dest", "src", "size")); + m.def("memcpy_dtod", py_memcpy_dtod, py::arg("dest"), py::arg("src"), py::arg("size")); #if CUDAPP_CUDA_VERSION >= 3000 - py::def("memcpy_dtod_async", py_memcpy_dtod_async, - (py::args("dest", "src", "size"), py::arg("stream")=py::object())); + m.def("memcpy_dtod_async", py_memcpy_dtod_async, + (py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("stream")=py::object())); #endif #if CUDAPP_CUDA_VERSION >= 4000 - py::def("memcpy_peer", py_memcpy_peer, - (py::args("dest", "src", "size"), + m.def("memcpy_peer", py_memcpy_peer, + (py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("dest_context")=py::object(), py::arg("src_context")=py::object())); - py::def("memcpy_peer_async", py_memcpy_peer_async, - (py::args("dest", "src", "size"), + m.def("memcpy_peer_async", py_memcpy_peer_async, + (py::args("dest"), py::arg("src"), py::arg("size"), py::arg("dest_context")=py::object(), py::arg("src_context")=py::object(), py::arg("stream")=py::object())); #endif - DEF_SIMPLE_FUNCTION_WITH_ARGS(memcpy_dtoa, - ("ary", "index", "src", "len")); - DEF_SIMPLE_FUNCTION_WITH_ARGS(memcpy_atod, - ("dest", "ary", "index", "len")); - DEF_SIMPLE_FUNCTION_WITH_ARGS(py_memcpy_htoa, - ("ary", "index", "src")); - DEF_SIMPLE_FUNCTION_WITH_ARGS(py_memcpy_atoh, - ("dest", "ary", "index")); + m.def("memcpy_dtoa",memcpy_dtoa, + py::arg("ary"), py::arg("index"), py::arg("src"), py::arg("len")); + m.def("memcpy_atod", memcpy_atod, + py::arg("dest"), py::arg("ary"), py::arg("index"), py::arg("len")); + m.def("memcpy_htoa", py_memcpy_htoa, + py::arg("ary"), py::arg("index"), py::arg("src")); + m.def("memcpy_atoh",py_memcpy_atoh, + py::arg("dest"), py::arg("ary"_, py::arg("index")); - DEF_SIMPLE_FUNCTION_WITH_ARGS(memcpy_atoa, - ("dest", "dest_index", "src", "src_index", "len")); + m.def("memcpy_atoa", memcpy_atoa, + py::arg("dest"), py::arg("dest_index"), py::arg("src"), py::arg("src_index"), py::arg("len")); #if CUDAPP_CUDA_VERSION >= 4000 #define WRAP_MEMCPY_2D_UNIFIED_SETTERS \ @@ -1544,7 +1541,7 @@ BOOST_PYTHON_MODULE(_driver) { typedef memcpy_2d cl; - py::class_("Memcpy2D") + py::class_(m, "Memcpy2D") WRAP_MEMCPY_2D_PROPERTIES .def("__call__", &cl::execute, py::args("self", "aligned")) @@ -1567,7 +1564,7 @@ BOOST_PYTHON_MODULE(_driver) { typedef memcpy_3d cl; - py::class_("Memcpy3D") + py::class_(m, "Memcpy3D") WRAP_MEMCPY_3D_PROPERTIES .def("__call__", &cl::execute) @@ -1578,7 +1575,7 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 4000 { typedef memcpy_3d_peer cl; - py::class_("Memcpy3DPeer") + py::class_(m, "Memcpy3DPeer") WRAP_MEMCPY_3D_PROPERTIES .DEF_SIMPLE_METHOD(set_src_context) @@ -1594,8 +1591,8 @@ BOOST_PYTHON_MODULE(_driver) // {{{ event { typedef event cl; - py::class_ - ("Event", py::init >(py::arg("flags"))) + py::class_ + (m, "Event", py::init >(py::arg("flags"))) .def("record", &cl::record, py::arg("stream")=py::object(), py::return_self<>()) .def("synchronize", &cl::synchronize, py::return_self<>()) @@ -1615,7 +1612,7 @@ BOOST_PYTHON_MODULE(_driver) // {{{ arrays { typedef CUDA_ARRAY_DESCRIPTOR cl; - 
py::class_("ArrayDescriptor") + py::class_(m, "ArrayDescriptor") .def_readwrite("width", &cl::Width) .def_readwrite("height", &cl::Height) .def_readwrite("format", &cl::Format) @@ -1626,7 +1623,7 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 2000 { typedef CUDA_ARRAY3D_DESCRIPTOR cl; - py::class_("ArrayDescriptor3D") + py::class_(m, "ArrayDescriptor3D") .def_readwrite("width", &cl::Width) .def_readwrite("height", &cl::Height) .def_readwrite("depth", &cl::Depth) @@ -1639,8 +1636,8 @@ BOOST_PYTHON_MODULE(_driver) { typedef array cl; - py::class_, boost::noncopyable> - ("Array", py::init()) + py::class_> + (m, "Array", py::init()) .DEF_SIMPLE_METHOD(free) .DEF_SIMPLE_METHOD(get_descriptor) #if CUDAPP_CUDA_VERSION >= 2000 @@ -1655,15 +1652,15 @@ BOOST_PYTHON_MODULE(_driver) // {{{ texture reference { typedef texture_reference cl; - py::class_("TextureReference") + py::class_(m, "TextureReference") .DEF_SIMPLE_METHOD(set_array) .def("set_address", &cl::set_address, - (py::arg("devptr"), py::arg("bytes"), py::arg("allow_offset")=false)) + py::arg("devptr"), py::arg("bytes"), py::arg("allow_offset")=false) #if CUDAPP_CUDA_VERSION >= 2020 - .DEF_SIMPLE_METHOD_WITH_ARGS(set_address_2d, ("devptr", "descr", "pitch")) + .def("set_address_2d", set_address_2d, py::arg("devptr", "descr", "pitch")) #endif - .DEF_SIMPLE_METHOD_WITH_ARGS(set_format, ("format", "num_components")) - .DEF_SIMPLE_METHOD_WITH_ARGS(set_address_mode, ("dim", "am")) + .def("set_format", set_format, py::arg("format"), py::arg9"num_components")) + .def("set_address_mode", set_address_mode, py::arg("dim"), py::arg("am")) .DEF_SIMPLE_METHOD(set_filter_mode) .DEF_SIMPLE_METHOD(set_flags) .DEF_SIMPLE_METHOD(get_address) @@ -1685,7 +1682,7 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 3010 { typedef surface_reference cl; - py::class_("SurfaceReference", py::no_init) + py::class_(m, "SurfaceReference", py::no_init) .def("set_array", &cl::set_array, (py::arg("array"), py::arg("flags")=0)) .def("get_array", &cl::get_array, diff --git a/src/wrapper/wrap_cudagl.cpp b/src/wrapper/wrap_cudagl.cpp index 47208867..4222007c 100644 --- a/src/wrapper/wrap_cudagl.cpp +++ b/src/wrapper/wrap_cudagl.cpp @@ -12,7 +12,6 @@ using namespace pycuda; using namespace pycuda::gl; -using boost::shared_ptr; @@ -56,7 +55,7 @@ void pycuda_expose_gl() { typedef registered_image cl; py::class_, py::bases >( - "RegisteredImage", + "RegisteredImage", py::init >()) ; } diff --git a/src/wrapper/wrap_helpers.hpp b/src/wrapper/wrap_helpers.hpp index 00e2f937..8a9896c3 100644 --- a/src/wrapper/wrap_helpers.hpp +++ b/src/wrapper/wrap_helpers.hpp @@ -4,19 +4,11 @@ -#include -#include -#include +#include -#define PYTHON_ERROR(TYPE, REASON) \ -{ \ - PyErr_SetString(PyExc_##TYPE, REASON); \ - throw boost::python::error_already_set(); \ -} - #define ENUM_VALUE(NAME) \ value(#NAME, NAME) @@ -27,10 +19,10 @@ def(#NAME, &cl::NAME, boost::python::args ARGS) #define DEF_SIMPLE_FUNCTION(NAME) \ - boost::python::def(#NAME, &NAME) + m.def(#NAME, &NAME) #define DEF_SIMPLE_FUNCTION_WITH_ARGS(NAME, ARGS) \ - boost::python::def(#NAME, &NAME, boost::python::args ARGS) + m.def(#NAME, &NAME, boost::python::args ARGS) #define DEF_SIMPLE_RO_MEMBER(NAME) \ def_readonly(#NAME, &cl::m_##NAME) @@ -38,22 +30,15 @@ #define DEF_SIMPLE_RW_MEMBER(NAME) \ def_readwrite(#NAME, &cl::m_##NAME) -#define PYTHON_FOREACH(NAME, ITERABLE) \ - BOOST_FOREACH(boost::python::object NAME, \ - std::make_pair( \ - boost::python::stl_input_iterator(ITERABLE), \ - 
boost::python::stl_input_iterator())) - namespace { template - inline boost::python::handle<> handle_from_new_ptr(T *ptr) + inline py::object handle_from_new_ptr(T *ptr) { - return boost::python::handle<>( - typename boost::python::manage_new_object::apply::type()(ptr)); + return py::cast(ptr, py::return_value_policy::take_ownership); } }
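Editorial note (illustrative, not part of the patch): the recurring substitution in the wrapper files is boost::python::def replaced by m.def on a module object, one py::arg per parameter instead of boost::python::args, py::cast instead of py::extract, and std::shared_ptr holders instead of boost::shared_ptr. A minimal, self-contained pybind11 sketch of that target shape follows; the names widget, make_widget and _demo are placeholders, not PyCUDA symbols.

    #include <pybind11/pybind11.h>

    #include <memory>

    namespace py = pybind11;

    struct widget
    {
      int value;
      explicit widget(int v) : value(v) { }
    };

    std::shared_ptr<widget> make_widget(int v)
    { return std::make_shared<widget>(v); }

    // PYBIND11_MODULE replaces BOOST_PYTHON_MODULE and provides the module object m.
    PYBIND11_MODULE(_demo, m)
    {
      // Free function: module-level def, one py::arg per parameter, defaults inline.
      m.def("make_widget", make_widget, py::arg("value") = 0);

      // Class held by std::shared_ptr, analogous to the Context/Stream wrappers.
      py::class_<widget, std::shared_ptr<widget>>(m, "Widget")
        .def(py::init<int>(), py::arg("value"))
        .def_readwrite("value", &widget::value);

      // Pulling a C++ value out of a py::object:
      //   int v = py::cast<int>(obj);   // replaces boost::python::extract<int>(obj)
    }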