From b328b5b3bceaa8e2304c3fa06e1aa306d8786ecb Mon Sep 17 00:00:00 2001 From: Anthony Dodd Date: Thu, 15 Sep 2022 14:12:07 -0500 Subject: [PATCH] Add bridging PTX code for cooperative_groups API This appears to be a working solution. The PTX should be fairly portable as well. Overall, this system will work as follows: - Wrap a `cc::Builder` to compile the bridging code on demand as part of a `sys` crate. This will use nvcc under the hood and we will pass along any other needed flags. - Folks that need the bridging code will then use cust::link::Linker to link the bridging PTX code with their PTX. All of the above is tested and working, however it currently deadlocks invocations because we have not exposed the cooperative launch interface. This should be quite simple though, given that the generated cuda bindgen code already has this in place. LOTS TO DO STILL! - Update the new cuda_std_cg crate to use `cc` to compile the C++ bridging code on demand, and produce the PTX path as output. - Remove the Justfile. I was only using it for POC testing. - MAJOR: update cust to expose the `cuLaunchCooperativeKernel` in a nice interface. --- Cargo.toml | 3 +- Justfile | 7 + crates/cuda_std/src/cg.rs | 72 +++++++ crates/cuda_std/src/lib.rs | 1 + crates/cuda_std_cg/cg_bridge.ptx | 304 +++++++++++++++++++++++++++ crates/cuda_std_cg/src/cg_bridge.cu | 63 ++++++ crates/cuda_std_cg/src/cg_bridge.cuh | 21 ++ crates/cust/src/error.rs | 9 +- crates/cust/src/stream.rs | 47 ++++- crates/cust_raw/bindgen.sh | 27 ++- 10 files changed, 540 insertions(+), 14 deletions(-) create mode 100644 Justfile create mode 100644 crates/cuda_std/src/cg.rs create mode 100644 crates/cuda_std_cg/cg_bridge.ptx create mode 100644 crates/cuda_std_cg/src/cg_bridge.cu create mode 100644 crates/cuda_std_cg/src/cg_bridge.cuh mode change 100644 => 100755 crates/cust_raw/bindgen.sh diff --git a/Cargo.toml b/Cargo.toml index 067e29d..b76c118 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,8 @@ members = [ ] exclude = [ - "crates/optix/examples/common" + "crates/optix/examples/common", + "crates/cuda_std_cg", ] [profile.dev.package.rustc_codegen_nvvm] diff --git a/Justfile b/Justfile new file mode 100644 index 0000000..af59104 --- /dev/null +++ b/Justfile @@ -0,0 +1,7 @@ +build_cuda_std_cg: + #!/usr/bin/env bash + set -euxo pipefail + nvcc --ptx -arch=sm_75 \ + -I crates/cuda_std_cg/src -I${CUDA_ROOT}/include \ + --device-c crates/cuda_std_cg/src/cg_bridge.cu \ + -o crates/cuda_std_cg/cg_bridge.ptx diff --git a/crates/cuda_std/src/cg.rs b/crates/cuda_std/src/cg.rs new file mode 100644 index 0000000..5265798 --- /dev/null +++ b/crates/cuda_std/src/cg.rs @@ -0,0 +1,72 @@ +use crate::gpu_only; + +mod ffi { + use core::ffi::c_void; + + pub type GridGroup = *mut c_void; + extern "C" { + pub(super) fn this_grid() -> GridGroup; + pub(super) fn GridGroup_destroy(gg: GridGroup); + pub(super) fn GridGroup_is_valid(gg: GridGroup) -> bool; + pub(super) fn GridGroup_sync(gg: GridGroup); + pub(super) fn GridGroup_size(gg: GridGroup) -> u64; + pub(super) fn GridGroup_thread_rank(gg: GridGroup) -> u64; + pub(super) fn GridGroup_num_threads(gg: GridGroup) -> u64; + pub(super) fn GridGroup_num_blocks(gg: GridGroup) -> u64; + pub(super) fn GridGroup_block_rank(gg: GridGroup) -> u64; + // dim3 GridGroup_group_dim(); // TODO: impl these. + // dim3 GridGroup_dim_blocks(); // TODO: impl these. + // dim3 GridGroup_block_index(); // TODO: impl these. 
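+        // NOTE (sketch): wiring up the dim3 accessors above would also need a
+        // #[repr(C)] struct of three u32 fields on the Rust side to mirror
+        // CUDA's dim3; the name below is an assumption, not part of this patch:
+        // #[repr(C)] pub struct Dim3 { pub x: u32, pub y: u32, pub z: u32 }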
+ } +} + +pub struct GridGroup(ffi::GridGroup); + +impl Drop for GridGroup { + fn drop(&mut self) { + unsafe { ffi::GridGroup_destroy(self.0) } + } +} + +impl GridGroup { + #[gpu_only] + pub fn this_grid() -> Self { + let ptr = unsafe { ffi::this_grid() }; + GridGroup(ptr) + } + + #[gpu_only] + pub fn is_valid(&mut self) -> bool { + unsafe { ffi::GridGroup_is_valid(self.0) } + } + + #[gpu_only] + pub fn sync(&mut self) { + unsafe { ffi::GridGroup_sync(self.0) } + } + + #[gpu_only] + pub fn size(&mut self) -> u64 { + unsafe { ffi::GridGroup_size(self.0) } + } + + #[gpu_only] + pub fn thread_rank(&mut self) -> u64 { + unsafe { ffi::GridGroup_thread_rank(self.0) } + } + + #[gpu_only] + pub fn num_threads(&mut self) -> u64 { + unsafe { ffi::GridGroup_num_threads(self.0) } + } + + #[gpu_only] + pub fn num_blocks(&mut self) -> u64 { + unsafe { ffi::GridGroup_num_blocks(self.0) } + } + + #[gpu_only] + pub fn block_rank(&mut self) -> u64 { + unsafe { ffi::GridGroup_block_rank(self.0) } + } +} diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs index 33e7b28..a2f38d4 100644 --- a/crates/cuda_std/src/lib.rs +++ b/crates/cuda_std/src/lib.rs @@ -46,6 +46,7 @@ pub mod misc; // pub mod rt; pub mod atomic; pub mod cfg; +pub mod cg; pub mod ptr; pub mod shared; pub mod thread; diff --git a/crates/cuda_std_cg/cg_bridge.ptx b/crates/cuda_std_cg/cg_bridge.ptx new file mode 100644 index 0000000..dd84c8c --- /dev/null +++ b/crates/cuda_std_cg/cg_bridge.ptx @@ -0,0 +1,304 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-31442593 +// Cuda compilation tools, release 11.7, V11.7.99 +// Based on NVVM 7.0.1 +// + +.version 7.7 +.target sm_75 +.address_size 64 + + // .globl this_grid +.extern .func (.param .b64 func_retval0) malloc +( + .param .b64 malloc_param_0 +) +; +.extern .func free +( + .param .b64 free_param_0 +) +; +.weak .global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0}; + +.visible .func (.param .b64 func_retval0) this_grid() +{ + .reg .pred %p<2>; + .reg .b32 %r<3>; + .reg .b64 %rd<9>; + + + // begin inline asm + mov.u32 %r1, %envreg2; + // end inline asm + cvt.u64.u32 %rd5, %r1; + // begin inline asm + mov.u32 %r2, %envreg1; + // end inline asm + cvt.u64.u32 %rd6, %r2; + bfi.b64 %rd1, %rd6, %rd5, 32, 32; + mov.u64 %rd7, 16; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd7; + .param .b64 retval0; + call.uni (retval0), + malloc, + ( + param0 + ); + ld.param.b64 %rd2, [retval0+0]; + } // callseq 0 + setp.eq.s64 %p1, %rd2, 0; + mov.u64 %rd8, 0; + @%p1 bra $L__BB0_2; + + st.u64 [%rd2], %rd1; + mov.u64 %rd8, %rd2; + +$L__BB0_2: + st.param.b64 [func_retval0+0], %rd8; + ret; + +} + // .globl GridGroup_destroy +.visible .func GridGroup_destroy( + .param .b64 GridGroup_destroy_param_0 +) +{ + .reg .b64 %rd<2>; + + + ld.param.u64 %rd1, [GridGroup_destroy_param_0]; + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + call.uni + free, + ( + param0 + ); + } // callseq 1 + ret; + +} + // .globl GridGroup_is_valid +.visible .func (.param .b32 func_retval0) GridGroup_is_valid( + .param .b64 GridGroup_is_valid_param_0 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<2>; + .reg .b64 %rd<3>; + + + ld.param.u64 %rd1, [GridGroup_is_valid_param_0]; + ld.u64 %rd2, [%rd1]; + setp.ne.s64 %p1, %rd2, 0; + selp.u32 %r1, 1, 0, %p1; + st.param.b32 [func_retval0+0], %r1; + ret; + +} + // .globl GridGroup_sync 
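+// Note on the generated barrier below: after a block-level bar.sync, one thread
+// per block appears to bump an arrival counter in the driver-provided sync word
+// (block 0 contributing a value that flips the epoch bit once every block has
+// arrived), spin until that bit changes, and then release its block with a
+// second bar.sync, i.e. the usual cooperative_groups grid.sync() arrive-and-wait
+// pattern.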
+.visible .func GridGroup_sync( + .param .b64 GridGroup_sync_param_0 +) +{ + .reg .pred %p<5>; + .reg .b32 %r<24>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [GridGroup_sync_param_0]; + ld.u64 %rd8, [%rd1]; + setp.ne.s64 %p1, %rd8, 0; + @%p1 bra $L__BB3_2; + + // begin inline asm + trap; + // end inline asm + ld.u64 %rd8, [%rd1]; + +$L__BB3_2: + mov.u32 %r2, %tid.y; + mov.u32 %r3, %tid.x; + add.s32 %r4, %r3, %r2; + mov.u32 %r5, %tid.z; + neg.s32 %r6, %r5; + setp.ne.s32 %p2, %r4, %r6; + bar.sync 0; + @%p2 bra $L__BB3_6; + + add.s64 %rd6, %rd8, 4; + mov.u32 %r9, %ctaid.z; + neg.s32 %r10, %r9; + mov.u32 %r11, %ctaid.x; + mov.u32 %r12, %ctaid.y; + add.s32 %r13, %r11, %r12; + setp.eq.s32 %p3, %r13, %r10; + mov.u32 %r14, %nctaid.z; + mov.u32 %r15, %nctaid.x; + mov.u32 %r16, %nctaid.y; + mul.lo.s32 %r17, %r15, %r16; + mul.lo.s32 %r18, %r17, %r14; + mov.u32 %r19, -2147483647; + sub.s32 %r20, %r19, %r18; + selp.b32 %r8, %r20, 1, %p3; + membar.gl; + // begin inline asm + atom.add.release.gpu.u32 %r7,[%rd6],%r8; + // end inline asm + +$L__BB3_4: + ld.volatile.u32 %r21, [%rd6]; + xor.b32 %r22, %r21, %r7; + setp.gt.s32 %p4, %r22, -1; + @%p4 bra $L__BB3_4; + + // begin inline asm + ld.acquire.gpu.u32 %r23,[%rd6]; + // end inline asm + +$L__BB3_6: + bar.sync 0; + ret; + +} + // .globl GridGroup_size +.visible .func (.param .b64 func_retval0) GridGroup_size( + .param .b64 GridGroup_size_param_0 +) +{ + .reg .b32 %r<10>; + .reg .b64 %rd<4>; + + + mov.u32 %r1, %nctaid.x; + mov.u32 %r2, %nctaid.y; + mov.u32 %r3, %nctaid.z; + mul.lo.s32 %r4, %r2, %r3; + mul.wide.u32 %rd1, %r4, %r1; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %ntid.y; + mul.lo.s32 %r7, %r5, %r6; + mov.u32 %r8, %ntid.z; + mul.lo.s32 %r9, %r7, %r8; + cvt.u64.u32 %rd2, %r9; + mul.lo.s64 %rd3, %rd1, %rd2; + st.param.b64 [func_retval0+0], %rd3; + ret; + +} + // .globl GridGroup_thread_rank +.visible .func (.param .b64 func_retval0) GridGroup_thread_rank( + .param .b64 GridGroup_thread_rank_param_0 +) +{ + .reg .b32 %r<16>; + .reg .b64 %rd<12>; + + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ctaid.y; + mov.u32 %r3, %ctaid.z; + mov.u32 %r4, %nctaid.x; + mov.u32 %r5, %nctaid.y; + mul.wide.u32 %rd1, %r5, %r3; + cvt.u64.u32 %rd2, %r4; + cvt.u64.u32 %rd3, %r2; + add.s64 %rd4, %rd1, %rd3; + mul.lo.s64 %rd5, %rd4, %rd2; + cvt.u64.u32 %rd6, %r1; + add.s64 %rd7, %rd5, %rd6; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %ntid.y; + mul.lo.s32 %r8, %r6, %r7; + mov.u32 %r9, %ntid.z; + mul.lo.s32 %r10, %r8, %r9; + cvt.u64.u32 %rd8, %r10; + mul.lo.s64 %rd9, %rd7, %rd8; + mov.u32 %r11, %tid.x; + mov.u32 %r12, %tid.y; + mov.u32 %r13, %tid.z; + mad.lo.s32 %r14, %r7, %r13, %r12; + mad.lo.s32 %r15, %r14, %r6, %r11; + cvt.u64.u32 %rd10, %r15; + add.s64 %rd11, %rd9, %rd10; + st.param.b64 [func_retval0+0], %rd11; + ret; + +} + // .globl GridGroup_num_threads +.visible .func (.param .b64 func_retval0) GridGroup_num_threads( + .param .b64 GridGroup_num_threads_param_0 +) +{ + .reg .b32 %r<10>; + .reg .b64 %rd<4>; + + + mov.u32 %r1, %nctaid.x; + mov.u32 %r2, %nctaid.y; + mov.u32 %r3, %nctaid.z; + mul.lo.s32 %r4, %r2, %r3; + mul.wide.u32 %rd1, %r4, %r1; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %ntid.y; + mul.lo.s32 %r7, %r5, %r6; + mov.u32 %r8, %ntid.z; + mul.lo.s32 %r9, %r7, %r8; + cvt.u64.u32 %rd2, %r9; + mul.lo.s64 %rd3, %rd1, %rd2; + st.param.b64 [func_retval0+0], %rd3; + ret; + +} + // .globl GridGroup_num_blocks +.visible .func (.param .b64 func_retval0) GridGroup_num_blocks( + .param .b64 GridGroup_num_blocks_param_0 +) +{ + .reg .b32 %r<5>; + .reg .b64 %rd<2>; + + + mov.u32 %r1, 
%nctaid.x;
+    mov.u32     %r2, %nctaid.y;
+    mov.u32     %r3, %nctaid.z;
+    mul.lo.s32  %r4, %r2, %r3;
+    mul.wide.u32    %rd1, %r4, %r1;
+    st.param.b64    [func_retval0+0], %rd1;
+    ret;
+
+}
+    // .globl    GridGroup_block_rank
+.visible .func  (.param .b64 func_retval0) GridGroup_block_rank(
+    .param .b64 GridGroup_block_rank_param_0
+)
+{
+    .reg .b32   %r<6>;
+    .reg .b64   %rd<8>;
+
+
+    mov.u32     %r1, %ctaid.x;
+    mov.u32     %r2, %ctaid.y;
+    mov.u32     %r3, %ctaid.z;
+    mov.u32     %r4, %nctaid.x;
+    mov.u32     %r5, %nctaid.y;
+    mul.wide.u32    %rd1, %r5, %r3;
+    cvt.u64.u32     %rd2, %r4;
+    cvt.u64.u32     %rd3, %r2;
+    add.s64     %rd4, %rd1, %rd3;
+    mul.lo.s64  %rd5, %rd4, %rd2;
+    cvt.u64.u32     %rd6, %r1;
+    add.s64     %rd7, %rd5, %rd6;
+    st.param.b64    [func_retval0+0], %rd7;
+    ret;
+
+}
+
diff --git a/crates/cuda_std_cg/src/cg_bridge.cu b/crates/cuda_std_cg/src/cg_bridge.cu
new file mode 100644
index 0000000..110d9f2
--- /dev/null
+++ b/crates/cuda_std_cg/src/cg_bridge.cu
@@ -0,0 +1,63 @@
+#include "cooperative_groups.h"
+#include "cg_bridge.cuh"
+// #include <cstdio>
+namespace cg = cooperative_groups;
+
+__device__ GridGroup this_grid()
+{
+    cg::grid_group gg = cg::this_grid();
+    GridGroupWrapper* ggp = new GridGroupWrapper { gg };
+    return ggp;
+}
+
+__device__ void GridGroup_destroy(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    delete g;
+}
+
+__device__ bool GridGroup_is_valid(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.is_valid();
+}
+
+__device__ void GridGroup_sync(GridGroup gg)
+{
+    // std::printf("calling sync from bridge");
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.sync();
+}
+
+__device__ unsigned long long GridGroup_size(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.size();
+}
+
+__device__ unsigned long long GridGroup_thread_rank(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.thread_rank();
+}
+
+__device__ unsigned long long GridGroup_num_threads(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.num_threads();
+}
+
+__device__ unsigned long long GridGroup_num_blocks(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.num_blocks();
+}
+
+__device__ unsigned long long GridGroup_block_rank(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.block_rank();
+}
+
+__host__ int main()
+{}
diff --git a/crates/cuda_std_cg/src/cg_bridge.cuh b/crates/cuda_std_cg/src/cg_bridge.cuh
new file mode 100644
index 0000000..9f22f12
--- /dev/null
+++ b/crates/cuda_std_cg/src/cg_bridge.cuh
@@ -0,0 +1,21 @@
+#pragma once
+#include "cooperative_groups.h"
+namespace cg = cooperative_groups;
+
+typedef struct GridGroupWrapper {
+    cg::grid_group gg;
+} GridGroupWrapper;
+
+extern "C" typedef void* GridGroup;
+extern "C" __device__ GridGroup this_grid();
+extern "C" __device__ void GridGroup_destroy(GridGroup gg);
+extern "C" __device__ bool GridGroup_is_valid(GridGroup gg);
+extern "C" __device__ void GridGroup_sync(GridGroup gg);
+extern "C" __device__ unsigned long long GridGroup_size(GridGroup gg);
+extern "C" __device__ unsigned long long GridGroup_thread_rank(GridGroup gg);
+// extern "C" dim3 GridGroup_group_dim(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_num_threads(GridGroup gg);
+// extern "C" dim3 GridGroup_dim_blocks(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_num_blocks(GridGroup gg);
+// extern "C" dim3 GridGroup_block_index(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_block_rank(GridGroup gg); diff --git a/crates/cust/src/error.rs b/crates/cust/src/error.rs index 8994af2..8857c5b 100644 --- a/crates/cust/src/error.rs +++ b/crates/cust/src/error.rs @@ -78,6 +78,7 @@ pub enum CudaError { InvalidAddressSpace = 717, InvalidProgramCounter = 718, LaunchFailed = 719, + CooperativeLaunchTooLarge = 720, NotPermitted = 800, NotSupported = 801, UnknownError = 999, @@ -209,9 +210,15 @@ impl ToResult for cudaError_enum { cudaError_enum::CUDA_ERROR_INVALID_ADDRESS_SPACE => Err(CudaError::InvalidAddressSpace), cudaError_enum::CUDA_ERROR_INVALID_PC => Err(CudaError::InvalidProgramCounter), cudaError_enum::CUDA_ERROR_LAUNCH_FAILED => Err(CudaError::LaunchFailed), + cudaError_enum::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE => { + Err(CudaError::CooperativeLaunchTooLarge) + } cudaError_enum::CUDA_ERROR_NOT_PERMITTED => Err(CudaError::NotPermitted), cudaError_enum::CUDA_ERROR_NOT_SUPPORTED => Err(CudaError::NotSupported), - _ => Err(CudaError::UnknownError), + err => { + println!("error encountered: {:?}", err); + Err(CudaError::UnknownError) + } } } } diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs index 25a4178..6c83703 100644 --- a/crates/cust/src/stream.rs +++ b/crates/cust/src/stream.rs @@ -262,7 +262,51 @@ impl Stream { let grid_size: GridSize = grid_size.into(); let block_size: BlockSize = block_size.into(); - cuda::cuLaunchKernel( + // cuda::cuLaunchKernel( + // f: CUfunction, + // gridDimX: ::std::os::raw::c_uint, + // gridDimY: ::std::os::raw::c_uint, + // gridDimZ: ::std::os::raw::c_uint, + // blockDimX: ::std::os::raw::c_uint, + // blockDimY: ::std::os::raw::c_uint, + // blockDimZ: ::std::os::raw::c_uint, + // sharedMemBytes: ::std::os::raw::c_uint, + // hStream: CUstream, + // kernelParams: *mut *mut ::std::os::raw::c_void, + // extra: *mut *mut ::std::os::raw::c_void, + // ).to_result(); + + // cuda::cuLaunchCooperativeKernel( + // f: CUfunction, + // gridDimX: ::std::os::raw::c_uint, + // gridDimY: ::std::os::raw::c_uint, + // gridDimZ: ::std::os::raw::c_uint, + // blockDimX: ::std::os::raw::c_uint, + // blockDimY: ::std::os::raw::c_uint, + // blockDimZ: ::std::os::raw::c_uint, + // sharedMemBytes: ::std::os::raw::c_uint, + // hStream: CUstream, + // kernelParams: *mut *mut ::std::os::raw::c_void, + // ).to_result(); + + // cuda::cuLaunchKernel( + // func.to_raw(), + // grid_size.x, + // grid_size.y, + // grid_size.z, + // block_size.x, + // block_size.y, + // block_size.z, + // shared_mem_bytes, + // self.inner, + // args.as_ptr() as *mut _, + // ptr::null_mut(), + // ) + // .to_result() + + // TODO: make this configurable based on invocation patterns. For now, just testing. 
+ + cuda::cuLaunchCooperativeKernel( func.to_raw(), grid_size.x, grid_size.y, @@ -273,7 +317,6 @@ impl Stream { shared_mem_bytes, self.inner, args.as_ptr() as *mut _, - ptr::null_mut(), ) .to_result() } diff --git a/crates/cust_raw/bindgen.sh b/crates/cust_raw/bindgen.sh old mode 100644 new mode 100755 index 83c8b7b..afc1ed3 --- a/crates/cust_raw/bindgen.sh +++ b/crates/cust_raw/bindgen.sh @@ -1,15 +1,20 @@ #!/bin/bash set -exu +if [ -z ${CUDA_PATH} ]; then + echo 'env var ${CUDA_PATH} must be defined, and must point to the root directory of the target Cuda installation' + exit 1 +fi + bindgen \ - --whitelist-type="^CU.*" \ - --whitelist-type="^cuuint(32|64)_t" \ - --whitelist-type="^cudaError_enum" \ - --whitelist-type="^cu.*Complex$" \ - --whitelist-type="^cuda.*" \ - --whitelist-type="^libraryPropertyType.*" \ - --whitelist-var="^CU.*" \ - --whitelist-function="^cu.*" \ + --allowlist-type="^CU.*" \ + --allowlist-type="^cuuint(32|64)_t" \ + --allowlist-type="^cudaError_enum" \ + --allowlist-type="^cu.*Complex$" \ + --allowlist-type="^cuda.*" \ + --allowlist-type="^libraryPropertyType.*" \ + --allowlist-var="^CU.*" \ + --allowlist-function="^cu.*" \ --default-enum-style=rust \ --no-doc-comments \ --with-derive-default \ @@ -17,5 +22,7 @@ bindgen \ --with-derive-hash \ --with-derive-ord \ --size_t-is-usize \ - wrapper.h -- -I/opt/cuda/include \ - > src/cuda.rs \ No newline at end of file + wrapper.h \ + -- \ + -I${CUDA_PATH}/include \ + > src/cuda.rs
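
Illustrative usage sketch (assumptions, not part of the patch): a device kernel
written against the new cuda_std::cg::GridGroup API might look like the
following. The kernel name, the parameter layout, and the use of `#[kernel]`
from cuda_std follow the existing cuda_std examples and are illustrative only;
the kernel only makes progress if it is launched through the cooperative launch
path added to Stream above.

    use cuda_std::*;
    use cuda_std::cg::GridGroup;

    #[kernel]
    pub unsafe fn two_phase(data: *mut f32, n: usize) {
        // One handle per thread; this_grid() allocates a small wrapper on the
        // device heap and the Drop impl frees it via GridGroup_destroy.
        let mut grid = GridGroup::this_grid();
        let idx = grid.thread_rank() as usize;
        if idx < n {
            // Phase 1: every thread in the grid scales one element.
            *data.add(idx) *= 2.0;
        }
        // Grid-wide barrier: no thread continues until every block has arrived.
        grid.sync();
        if idx < n {
            // Phase 2: reads here observe all phase-1 writes from other blocks.
            *data.add(idx) += 1.0;
        }
    }

On the host side this goes through the cuLaunchCooperativeKernel path above;
the remaining work called out in the commit message is to expose that as an
explicit opt-in API rather than replacing the default launch path.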