diff --git a/Cargo.toml b/Cargo.toml
index 067e29d..b76c118 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,8 @@ members = [
 ]
 
 exclude = [
-    "crates/optix/examples/common"
+    "crates/optix/examples/common",
+    "crates/cuda_std_cg",
 ]
 
 [profile.dev.package.rustc_codegen_nvvm]
diff --git a/Justfile b/Justfile
new file mode 100644
index 0000000..af59104
--- /dev/null
+++ b/Justfile
@@ -0,0 +1,7 @@
+build_cuda_std_cg:
+    #!/usr/bin/env bash
+    set -euxo pipefail
+    nvcc --ptx -arch=sm_75 \
+        -I crates/cuda_std_cg/src -I${CUDA_ROOT}/include \
+        --device-c crates/cuda_std_cg/src/cg_bridge.cu \
+        -o crates/cuda_std_cg/cg_bridge.ptx
diff --git a/crates/cuda_std/src/cg.rs b/crates/cuda_std/src/cg.rs
new file mode 100644
index 0000000..5265798
--- /dev/null
+++ b/crates/cuda_std/src/cg.rs
@@ -0,0 +1,72 @@
+use crate::gpu_only;
+
+mod ffi {
+    use core::ffi::c_void;
+
+    pub type GridGroup = *mut c_void;
+    extern "C" {
+        pub(super) fn this_grid() -> GridGroup;
+        pub(super) fn GridGroup_destroy(gg: GridGroup);
+        pub(super) fn GridGroup_is_valid(gg: GridGroup) -> bool;
+        pub(super) fn GridGroup_sync(gg: GridGroup);
+        pub(super) fn GridGroup_size(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_thread_rank(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_num_threads(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_num_blocks(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_block_rank(gg: GridGroup) -> u64;
+        // dim3 GridGroup_group_dim(); // TODO: impl these.
+        // dim3 GridGroup_dim_blocks(); // TODO: impl these.
+        // dim3 GridGroup_block_index(); // TODO: impl these.
+    }
+}
+
+pub struct GridGroup(ffi::GridGroup);
+
+impl Drop for GridGroup {
+    fn drop(&mut self) {
+        unsafe { ffi::GridGroup_destroy(self.0) }
+    }
+}
+
+impl GridGroup {
+    #[gpu_only]
+    pub fn this_grid() -> Self {
+        let ptr = unsafe { ffi::this_grid() };
+        GridGroup(ptr)
+    }
+
+    #[gpu_only]
+    pub fn is_valid(&mut self) -> bool {
+        unsafe { ffi::GridGroup_is_valid(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn sync(&mut self) {
+        unsafe { ffi::GridGroup_sync(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn size(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_size(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn thread_rank(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_thread_rank(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn num_threads(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_num_threads(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn num_blocks(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_num_blocks(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn block_rank(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_block_rank(self.0) }
+    }
+}
diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
index 33e7b28..a2f38d4 100644
--- a/crates/cuda_std/src/lib.rs
+++ b/crates/cuda_std/src/lib.rs
@@ -46,6 +46,7 @@ pub mod misc;
 // pub mod rt;
 pub mod atomic;
 pub mod cfg;
+pub mod cg;
 pub mod ptr;
 pub mod shared;
 pub mod thread;
diff --git a/crates/cuda_std_cg/cg_bridge.ptx b/crates/cuda_std_cg/cg_bridge.ptx
new file mode 100644
index 0000000..dd84c8c
--- /dev/null
+++ b/crates/cuda_std_cg/cg_bridge.ptx
@@ -0,0 +1,304 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-31442593
+// Cuda compilation tools, release 11.7, V11.7.99
+// Based on NVVM 7.0.1
+//
+
+.version 7.7
+.target sm_75
+.address_size 64
+
+    // .globl this_grid
+.extern .func (.param .b64 func_retval0) malloc
+(
+    .param .b64 malloc_param_0
+)
+;
+.extern .func free
+(
+    .param .b64 free_param_0
+)
+;
+.weak .global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0};
+
+.visible .func (.param .b64 func_retval0) this_grid()
+{
+    .reg .pred %p<2>;
+    .reg .b32 %r<3>;
+    .reg .b64 %rd<9>;
+
+
+    // begin inline asm
+    mov.u32 %r1, %envreg2;
+    // end inline asm
+    cvt.u64.u32 %rd5, %r1;
+    // begin inline asm
+    mov.u32 %r2, %envreg1;
+    // end inline asm
+    cvt.u64.u32 %rd6, %r2;
+    bfi.b64 %rd1, %rd6, %rd5, 32, 32;
+    mov.u64 %rd7, 16;
+    { // callseq 0, 0
+    .reg .b32 temp_param_reg;
+    .param .b64 param0;
+    st.param.b64 [param0+0], %rd7;
+    .param .b64 retval0;
+    call.uni (retval0),
+    malloc,
+    (
+    param0
+    );
+    ld.param.b64 %rd2, [retval0+0];
+    } // callseq 0
+    setp.eq.s64 %p1, %rd2, 0;
+    mov.u64 %rd8, 0;
+    @%p1 bra $L__BB0_2;
+
+    st.u64 [%rd2], %rd1;
+    mov.u64 %rd8, %rd2;
+
+$L__BB0_2:
+    st.param.b64 [func_retval0+0], %rd8;
+    ret;
+
+}
+    // .globl GridGroup_destroy
+.visible .func GridGroup_destroy(
+    .param .b64 GridGroup_destroy_param_0
+)
+{
+    .reg .b64 %rd<2>;
+
+
+    ld.param.u64 %rd1, [GridGroup_destroy_param_0];
+    { // callseq 1, 0
+    .reg .b32 temp_param_reg;
+    .param .b64 param0;
+    st.param.b64 [param0+0], %rd1;
+    call.uni
+    free,
+    (
+    param0
+    );
+    } // callseq 1
+    ret;
+
+}
+    // .globl GridGroup_is_valid
+.visible .func (.param .b32 func_retval0) GridGroup_is_valid(
+    .param .b64 GridGroup_is_valid_param_0
+)
+{
+    .reg .pred %p<2>;
+    .reg .b32 %r<2>;
+    .reg .b64 %rd<3>;
+
+
+    ld.param.u64 %rd1, [GridGroup_is_valid_param_0];
+    ld.u64 %rd2, [%rd1];
+    setp.ne.s64 %p1, %rd2, 0;
+    selp.u32 %r1, 1, 0, %p1;
+    st.param.b32 [func_retval0+0], %r1;
+    ret;
+
+}
+    // .globl GridGroup_sync
+.visible .func GridGroup_sync(
+    .param .b64 GridGroup_sync_param_0
+)
+{
+    .reg .pred %p<5>;
+    .reg .b32 %r<24>;
+    .reg .b64 %rd<9>;
+
+
+    ld.param.u64 %rd1, [GridGroup_sync_param_0];
+    ld.u64 %rd8, [%rd1];
+    setp.ne.s64 %p1, %rd8, 0;
+    @%p1 bra $L__BB3_2;
+
+    // begin inline asm
+    trap;
+    // end inline asm
+    ld.u64 %rd8, [%rd1];
+
+$L__BB3_2:
+    mov.u32 %r2, %tid.y;
+    mov.u32 %r3, %tid.x;
+    add.s32 %r4, %r3, %r2;
+    mov.u32 %r5, %tid.z;
+    neg.s32 %r6, %r5;
+    setp.ne.s32 %p2, %r4, %r6;
+    bar.sync 0;
+    @%p2 bra $L__BB3_6;
+
+    add.s64 %rd6, %rd8, 4;
+    mov.u32 %r9, %ctaid.z;
+    neg.s32 %r10, %r9;
+    mov.u32 %r11, %ctaid.x;
+    mov.u32 %r12, %ctaid.y;
+    add.s32 %r13, %r11, %r12;
+    setp.eq.s32 %p3, %r13, %r10;
+    mov.u32 %r14, %nctaid.z;
+    mov.u32 %r15, %nctaid.x;
+    mov.u32 %r16, %nctaid.y;
+    mul.lo.s32 %r17, %r15, %r16;
+    mul.lo.s32 %r18, %r17, %r14;
+    mov.u32 %r19, -2147483647;
+    sub.s32 %r20, %r19, %r18;
+    selp.b32 %r8, %r20, 1, %p3;
+    membar.gl;
+    // begin inline asm
+    atom.add.release.gpu.u32 %r7,[%rd6],%r8;
+    // end inline asm
+
+$L__BB3_4:
+    ld.volatile.u32 %r21, [%rd6];
+    xor.b32 %r22, %r21, %r7;
+    setp.gt.s32 %p4, %r22, -1;
+    @%p4 bra $L__BB3_4;
+
+    // begin inline asm
+    ld.acquire.gpu.u32 %r23,[%rd6];
+    // end inline asm
+
+$L__BB3_6:
+    bar.sync 0;
+    ret;
+
+}
+    // .globl GridGroup_size
+.visible .func (.param .b64 func_retval0) GridGroup_size(
+    .param .b64 GridGroup_size_param_0
+)
+{
+    .reg .b32 %r<10>;
+    .reg .b64 %rd<4>;
+
+
+    mov.u32 %r1, %nctaid.x;
+    mov.u32 %r2, %nctaid.y;
+    mov.u32 %r3, %nctaid.z;
+    mul.lo.s32 %r4, %r2, %r3;
+    mul.wide.u32 %rd1, %r4, %r1;
+    mov.u32 %r5, %ntid.x;
+    mov.u32 %r6, %ntid.y;
+    mul.lo.s32 %r7, %r5, %r6;
+    mov.u32 %r8, %ntid.z;
+    mul.lo.s32 %r9, %r7, %r8;
+    cvt.u64.u32 %rd2, %r9;
+    mul.lo.s64 %rd3, %rd1, %rd2;
+    st.param.b64 [func_retval0+0], %rd3;
+    ret;
+
+}
+    // .globl GridGroup_thread_rank
+.visible .func (.param .b64 func_retval0) GridGroup_thread_rank(
+    .param .b64 GridGroup_thread_rank_param_0
+)
+{
+    .reg .b32 %r<16>;
+    .reg .b64 %rd<12>;
+
+
+    mov.u32 %r1, %ctaid.x;
+    mov.u32 %r2, %ctaid.y;
+    mov.u32 %r3, %ctaid.z;
+    mov.u32 %r4, %nctaid.x;
+    mov.u32 %r5, %nctaid.y;
+    mul.wide.u32 %rd1, %r5, %r3;
+    cvt.u64.u32 %rd2, %r4;
+    cvt.u64.u32 %rd3, %r2;
+    add.s64 %rd4, %rd1, %rd3;
+    mul.lo.s64 %rd5, %rd4, %rd2;
+    cvt.u64.u32 %rd6, %r1;
+    add.s64 %rd7, %rd5, %rd6;
+    mov.u32 %r6, %ntid.x;
+    mov.u32 %r7, %ntid.y;
+    mul.lo.s32 %r8, %r6, %r7;
+    mov.u32 %r9, %ntid.z;
+    mul.lo.s32 %r10, %r8, %r9;
+    cvt.u64.u32 %rd8, %r10;
+    mul.lo.s64 %rd9, %rd7, %rd8;
+    mov.u32 %r11, %tid.x;
+    mov.u32 %r12, %tid.y;
+    mov.u32 %r13, %tid.z;
+    mad.lo.s32 %r14, %r7, %r13, %r12;
+    mad.lo.s32 %r15, %r14, %r6, %r11;
+    cvt.u64.u32 %rd10, %r15;
+    add.s64 %rd11, %rd9, %rd10;
+    st.param.b64 [func_retval0+0], %rd11;
+    ret;
+
+}
+    // .globl GridGroup_num_threads
+.visible .func (.param .b64 func_retval0) GridGroup_num_threads(
+    .param .b64 GridGroup_num_threads_param_0
+)
+{
+    .reg .b32 %r<10>;
+    .reg .b64 %rd<4>;
+
+
+    mov.u32 %r1, %nctaid.x;
+    mov.u32 %r2, %nctaid.y;
+    mov.u32 %r3, %nctaid.z;
+    mul.lo.s32 %r4, %r2, %r3;
+    mul.wide.u32 %rd1, %r4, %r1;
+    mov.u32 %r5, %ntid.x;
+    mov.u32 %r6, %ntid.y;
+    mul.lo.s32 %r7, %r5, %r6;
+    mov.u32 %r8, %ntid.z;
+    mul.lo.s32 %r9, %r7, %r8;
+    cvt.u64.u32 %rd2, %r9;
+    mul.lo.s64 %rd3, %rd1, %rd2;
+    st.param.b64 [func_retval0+0], %rd3;
+    ret;
+
+}
+    // .globl GridGroup_num_blocks
+.visible .func (.param .b64 func_retval0) GridGroup_num_blocks(
+    .param .b64 GridGroup_num_blocks_param_0
+)
+{
+    .reg .b32 %r<5>;
+    .reg .b64 %rd<2>;
+
+
+    mov.u32 %r1, %nctaid.x;
+    mov.u32 %r2, %nctaid.y;
+    mov.u32 %r3, %nctaid.z;
+    mul.lo.s32 %r4, %r2, %r3;
+    mul.wide.u32 %rd1, %r4, %r1;
+    st.param.b64 [func_retval0+0], %rd1;
+    ret;
+
+}
+    // .globl GridGroup_block_rank
+.visible .func (.param .b64 func_retval0) GridGroup_block_rank(
+    .param .b64 GridGroup_block_rank_param_0
+)
+{
+    .reg .b32 %r<6>;
+    .reg .b64 %rd<8>;
+
+
+    mov.u32 %r1, %ctaid.x;
+    mov.u32 %r2, %ctaid.y;
+    mov.u32 %r3, %ctaid.z;
+    mov.u32 %r4, %nctaid.x;
+    mov.u32 %r5, %nctaid.y;
+    mul.wide.u32 %rd1, %r5, %r3;
+    cvt.u64.u32 %rd2, %r4;
+    cvt.u64.u32 %rd3, %r2;
+    add.s64 %rd4, %rd1, %rd3;
+    mul.lo.s64 %rd5, %rd4, %rd2;
+    cvt.u64.u32 %rd6, %r1;
+    add.s64 %rd7, %rd5, %rd6;
+    st.param.b64 [func_retval0+0], %rd7;
+    ret;
+
+}
+
diff --git a/crates/cuda_std_cg/src/cg_bridge.cu b/crates/cuda_std_cg/src/cg_bridge.cu
new file mode 100644
index 0000000..110d9f2
--- /dev/null
+++ b/crates/cuda_std_cg/src/cg_bridge.cu
@@ -0,0 +1,63 @@
+#include "cooperative_groups.h"
+#include "cg_bridge.cuh"
+// #include <cstdio>
+namespace cg = cooperative_groups;
+
+__device__ GridGroup this_grid()
+{
+    cg::grid_group gg = cg::this_grid();
+    GridGroupWrapper* ggp = new GridGroupWrapper { gg };
+    return ggp;
+}
+
+__device__ void GridGroup_destroy(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    delete g;
+}
+
+__device__ bool GridGroup_is_valid(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.is_valid();
+}
+
+__device__ void GridGroup_sync(GridGroup gg)
+{
+    // std::printf("calling sync from bridge");
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.sync();
+}
+
+__device__ unsigned long long GridGroup_size(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.size();
+}
+
+__device__ unsigned long long GridGroup_thread_rank(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.thread_rank();
+}
+
+__device__ unsigned long long GridGroup_num_threads(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.num_threads();
+}
+
+__device__ unsigned long long GridGroup_num_blocks(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.num_blocks();
+}
+
+__device__ unsigned long long GridGroup_block_rank(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.block_rank();
+}
+
+__host__ int main()
+{}
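Note on the bridge design: `this_grid()` above boxes the `cg::grid_group` on the
device heap with `new` (visible as the `malloc`/`free` calls in the generated
PTX), and the Rust `Drop` impl in cg.rs releases it through `GridGroup_destroy`,
so the handle is RAII on the Rust side. A minimal sketch of the intended
kernel-side usage, assuming the existing cuda_std `#[kernel]` convention (the
kernel name `step` and its parameters are hypothetical):

    use cuda_std::prelude::*;
    use cuda_std::cg;

    #[kernel]
    pub unsafe fn step(data: *mut f32, n: usize) {
        // Allocates the wrapper on the device heap; freed when `grid` drops.
        let mut grid = cg::GridGroup::this_grid();
        let rank = grid.thread_rank() as usize;
        if rank < n {
            *data.add(rank) *= 2.0;
        }
        // Grid-wide barrier: every thread of every block must reach this
        // point. Valid only when the kernel was launched cooperatively
        // (see the stream.rs change below).
        grid.sync();
    }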
diff --git a/crates/cuda_std_cg/src/cg_bridge.cuh b/crates/cuda_std_cg/src/cg_bridge.cuh
new file mode 100644
index 0000000..9f22f12
--- /dev/null
+++ b/crates/cuda_std_cg/src/cg_bridge.cuh
@@ -0,0 +1,21 @@
+#pragma once
+#include "cooperative_groups.h"
+namespace cg = cooperative_groups;
+
+typedef struct GridGroupWrapper {
+    cg::grid_group gg;
+} GridGroupWrapper;
+
+extern "C" typedef void* GridGroup;
+extern "C" __device__ GridGroup this_grid();
+extern "C" __device__ void GridGroup_destroy(GridGroup gg);
+extern "C" __device__ bool GridGroup_is_valid(GridGroup gg);
+extern "C" __device__ void GridGroup_sync(GridGroup gg);
+extern "C" __device__ unsigned long long GridGroup_size(GridGroup gg);
+extern "C" __device__ unsigned long long GridGroup_thread_rank(GridGroup gg);
+// extern "C" dim3 GridGroup_group_dim(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_num_threads(GridGroup gg);
+// extern "C" dim3 GridGroup_dim_blocks(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_num_blocks(GridGroup gg);
+// extern "C" dim3 GridGroup_block_index(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_block_rank(GridGroup gg);
diff --git a/crates/cust/src/error.rs b/crates/cust/src/error.rs
index 8994af2..8857c5b 100644
--- a/crates/cust/src/error.rs
+++ b/crates/cust/src/error.rs
@@ -78,6 +78,7 @@ pub enum CudaError {
     InvalidAddressSpace = 717,
     InvalidProgramCounter = 718,
     LaunchFailed = 719,
+    CooperativeLaunchTooLarge = 720,
     NotPermitted = 800,
     NotSupported = 801,
     UnknownError = 999,
@@ -209,9 +210,15 @@ impl ToResult for cudaError_enum {
             cudaError_enum::CUDA_ERROR_INVALID_ADDRESS_SPACE => Err(CudaError::InvalidAddressSpace),
             cudaError_enum::CUDA_ERROR_INVALID_PC => Err(CudaError::InvalidProgramCounter),
             cudaError_enum::CUDA_ERROR_LAUNCH_FAILED => Err(CudaError::LaunchFailed),
+            cudaError_enum::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE => {
+                Err(CudaError::CooperativeLaunchTooLarge)
+            }
             cudaError_enum::CUDA_ERROR_NOT_PERMITTED => Err(CudaError::NotPermitted),
             cudaError_enum::CUDA_ERROR_NOT_SUPPORTED => Err(CudaError::NotSupported),
-            _ => Err(CudaError::UnknownError),
+            err => {
+                println!("error encountered: {:?}", err);
+                Err(CudaError::UnknownError)
+            }
         }
     }
 }
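Note: surfacing CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE lets callers react to an
oversized cooperative grid instead of seeing UnknownError. A rough host-side
sketch using cust's existing `launch!` macro (the kernel name `step` and the
`retry_with_smaller_grid` helper are hypothetical):

    use cust::error::CudaError;
    use cust::prelude::*;

    // `module`, `stream`, grid/block sizing, and `buf` set up as usual.
    let result = unsafe {
        launch!(module.step<<<grid, block, 0, stream>>>(buf.as_device_ptr(), n))
    };
    match result {
        // More blocks were requested than can be simultaneously resident on
        // the device; shrink the grid and try again.
        Err(CudaError::CooperativeLaunchTooLarge) => retry_with_smaller_grid()?,
        other => other?,
    }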
diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs
index 25a4178..6c83703 100644
--- a/crates/cust/src/stream.rs
+++ b/crates/cust/src/stream.rs
@@ -262,7 +262,51 @@ impl Stream {
         let grid_size: GridSize = grid_size.into();
         let block_size: BlockSize = block_size.into();
 
-        cuda::cuLaunchKernel(
+        // cuda::cuLaunchKernel(
+        //     f: CUfunction,
+        //     gridDimX: ::std::os::raw::c_uint,
+        //     gridDimY: ::std::os::raw::c_uint,
+        //     gridDimZ: ::std::os::raw::c_uint,
+        //     blockDimX: ::std::os::raw::c_uint,
+        //     blockDimY: ::std::os::raw::c_uint,
+        //     blockDimZ: ::std::os::raw::c_uint,
+        //     sharedMemBytes: ::std::os::raw::c_uint,
+        //     hStream: CUstream,
+        //     kernelParams: *mut *mut ::std::os::raw::c_void,
+        //     extra: *mut *mut ::std::os::raw::c_void,
+        // ).to_result();
+
+        // cuda::cuLaunchCooperativeKernel(
+        //     f: CUfunction,
+        //     gridDimX: ::std::os::raw::c_uint,
+        //     gridDimY: ::std::os::raw::c_uint,
+        //     gridDimZ: ::std::os::raw::c_uint,
+        //     blockDimX: ::std::os::raw::c_uint,
+        //     blockDimY: ::std::os::raw::c_uint,
+        //     blockDimZ: ::std::os::raw::c_uint,
+        //     sharedMemBytes: ::std::os::raw::c_uint,
+        //     hStream: CUstream,
+        //     kernelParams: *mut *mut ::std::os::raw::c_void,
+        // ).to_result();
+
+        // cuda::cuLaunchKernel(
+        //     func.to_raw(),
+        //     grid_size.x,
+        //     grid_size.y,
+        //     grid_size.z,
+        //     block_size.x,
+        //     block_size.y,
+        //     block_size.z,
+        //     shared_mem_bytes,
+        //     self.inner,
+        //     args.as_ptr() as *mut _,
+        //     ptr::null_mut(),
+        // )
+        // .to_result()
+
+        // TODO: make this configurable based on invocation patterns. For now, just testing.
+
+        cuda::cuLaunchCooperativeKernel(
             func.to_raw(),
             grid_size.x,
             grid_size.y,
@@ -273,7 +317,6 @@ impl Stream {
             shared_mem_bytes,
             self.inner,
             args.as_ptr() as *mut _,
-            ptr::null_mut(),
         )
         .to_result()
     }
diff --git a/crates/cust_raw/bindgen.sh b/crates/cust_raw/bindgen.sh
old mode 100644
new mode 100755
index 83c8b7b..afc1ed3
--- a/crates/cust_raw/bindgen.sh
+++ b/crates/cust_raw/bindgen.sh
@@ -1,15 +1,20 @@
 #!/bin/bash
 set -exu
 
+if [ -z "${CUDA_PATH:-}" ]; then
+    echo 'env var ${CUDA_PATH} must be defined, and must point to the root directory of the target CUDA installation'
+    exit 1
+fi
+
 bindgen \
-  --whitelist-type="^CU.*" \
-  --whitelist-type="^cuuint(32|64)_t" \
-  --whitelist-type="^cudaError_enum" \
-  --whitelist-type="^cu.*Complex$" \
-  --whitelist-type="^cuda.*" \
-  --whitelist-type="^libraryPropertyType.*" \
-  --whitelist-var="^CU.*" \
-  --whitelist-function="^cu.*" \
+  --allowlist-type="^CU.*" \
+  --allowlist-type="^cuuint(32|64)_t" \
+  --allowlist-type="^cudaError_enum" \
+  --allowlist-type="^cu.*Complex$" \
+  --allowlist-type="^cuda.*" \
+  --allowlist-type="^libraryPropertyType.*" \
+  --allowlist-var="^CU.*" \
+  --allowlist-function="^cu.*" \
   --default-enum-style=rust \
   --no-doc-comments \
   --with-derive-default \
@@ -17,5 +22,7 @@ bindgen \
   --with-derive-hash \
   --with-derive-ord \
   --size_t-is-usize \
-  wrapper.h -- -I/opt/cuda/include \
-  > src/cuda.rs
\ No newline at end of file
+  wrapper.h \
+  -- \
+  -I${CUDA_PATH}/include \
+  > src/cuda.rs
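Note: cooperative launch has runtime preconditions that this patch does not yet
check: the device must support cooperative launch (compute capability 6.0+),
and, per the TODO in stream.rs, every launch currently goes through
cuLaunchCooperativeKernel until the cooperative path is made opt-in. A guard
like the following could gate a fallback; this sketch assumes cust exposes the
attribute as DeviceAttribute::CooperativeLaunch, mirroring the driver API's
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH:

    use cust::device::{Device, DeviceAttribute};

    let device = Device::get_device(0)?;
    // Non-zero means the device can run cooperative kernels.
    if device.get_attribute(DeviceAttribute::CooperativeLaunch)? == 0 {
        // Fall back to an ordinary (non-cooperative) launch path.
    }

To regenerate the bindings, point CUDA_PATH at the toolkit root, e.g.
`CUDA_PATH=/usr/local/cuda crates/cust_raw/bindgen.sh`.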