diff --git a/Cargo.toml b/Cargo.toml
index 067e29d..b76c118 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,8 @@ members = [
 ]
 
 exclude = [
-    "crates/optix/examples/common"
+    "crates/optix/examples/common",
+    "crates/cuda_std_cg",
 ]
 
 [profile.dev.package.rustc_codegen_nvvm]
diff --git a/Justfile b/Justfile
new file mode 100644
index 0000000..af59104
--- /dev/null
+++ b/Justfile
@@ -0,0 +1,7 @@
+build_cuda_std_cg:
+    #!/usr/bin/env bash
+    set -euxo pipefail
+    nvcc --ptx -arch=sm_75 \
+        -I crates/cuda_std_cg/src -I${CUDA_ROOT}/include \
+        --device-c crates/cuda_std_cg/src/cg_bridge.cu \
+        -o crates/cuda_std_cg/cg_bridge.ptx
diff --git a/crates/cuda_std/src/cg.rs b/crates/cuda_std/src/cg.rs
new file mode 100644
index 0000000..5265798
--- /dev/null
+++ b/crates/cuda_std/src/cg.rs
@@ -0,0 +1,72 @@
+use crate::gpu_only;
+
+mod ffi {
+    use core::ffi::c_void;
+
+    pub type GridGroup = *mut c_void;
+    extern "C" {
+        pub(super) fn this_grid() -> GridGroup;
+        pub(super) fn GridGroup_destroy(gg: GridGroup);
+        pub(super) fn GridGroup_is_valid(gg: GridGroup) -> bool;
+        pub(super) fn GridGroup_sync(gg: GridGroup);
+        pub(super) fn GridGroup_size(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_thread_rank(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_num_threads(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_num_blocks(gg: GridGroup) -> u64;
+        pub(super) fn GridGroup_block_rank(gg: GridGroup) -> u64;
+        // dim3 GridGroup_group_dim(); // TODO: impl these.
+        // dim3 GridGroup_dim_blocks(); // TODO: impl these.
+        // dim3 GridGroup_block_index(); // TODO: impl these.
+    }
+}
+
+pub struct GridGroup(ffi::GridGroup);
+
+impl Drop for GridGroup {
+    fn drop(&mut self) {
+        unsafe { ffi::GridGroup_destroy(self.0) }
+    }
+}
+
+impl GridGroup {
+    #[gpu_only]
+    pub fn this_grid() -> Self {
+        let ptr = unsafe { ffi::this_grid() };
+        GridGroup(ptr)
+    }
+
+    #[gpu_only]
+    pub fn is_valid(&mut self) -> bool {
+        unsafe { ffi::GridGroup_is_valid(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn sync(&mut self) {
+        unsafe { ffi::GridGroup_sync(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn size(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_size(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn thread_rank(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_thread_rank(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn num_threads(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_num_threads(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn num_blocks(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_num_blocks(self.0) }
+    }
+
+    #[gpu_only]
+    pub fn block_rank(&mut self) -> u64 {
+        unsafe { ffi::GridGroup_block_rank(self.0) }
+    }
+}
diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
index 33e7b28..a2f38d4 100644
--- a/crates/cuda_std/src/lib.rs
+++ b/crates/cuda_std/src/lib.rs
@@ -46,6 +46,7 @@ pub mod misc;
 // pub mod rt;
 pub mod atomic;
 pub mod cfg;
+pub mod cg;
 pub mod ptr;
 pub mod shared;
 pub mod thread;
diff --git a/crates/cuda_std_cg/cg_bridge.ptx b/crates/cuda_std_cg/cg_bridge.ptx
new file mode 100644
index 0000000..dd84c8c
--- /dev/null
+++ b/crates/cuda_std_cg/cg_bridge.ptx
@@ -0,0 +1,304 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-31442593
+// Cuda compilation tools, release 11.7, V11.7.99
+// Based on NVVM 7.0.1
+//
+
+.version 7.7
+.target sm_75
+.address_size 64
+
+    // .globl this_grid
+.extern .func (.param .b64 func_retval0) malloc
+(
+    .param .b64 malloc_param_0
+)
+;
+.extern .func free
+(
+    .param .b64 free_param_0
+)
+;
+.weak .global .align 4 .b8 _ZZN4cuda3std3__48__detail21__stronger_order_cudaEiiE7__xform[16] = {3, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0};
+
+.visible .func (.param .b64 func_retval0) this_grid()
+{
+    .reg .pred %p<2>;
+    .reg .b32 %r<3>;
+    .reg .b64 %rd<9>;
+
+
+    // begin inline asm
+    mov.u32 %r1, %envreg2;
+    // end inline asm
+    cvt.u64.u32 %rd5, %r1;
+    // begin inline asm
+    mov.u32 %r2, %envreg1;
+    // end inline asm
+    cvt.u64.u32 %rd6, %r2;
+    bfi.b64 %rd1, %rd6, %rd5, 32, 32;
+    mov.u64 %rd7, 16;
+    { // callseq 0, 0
+    .reg .b32 temp_param_reg;
+    .param .b64 param0;
+    st.param.b64 [param0+0], %rd7;
+    .param .b64 retval0;
+    call.uni (retval0),
+    malloc,
+    (
+    param0
+    );
+    ld.param.b64 %rd2, [retval0+0];
+    } // callseq 0
+    setp.eq.s64 %p1, %rd2, 0;
+    mov.u64 %rd8, 0;
+    @%p1 bra $L__BB0_2;
+
+    st.u64 [%rd2], %rd1;
+    mov.u64 %rd8, %rd2;
+
+$L__BB0_2:
+    st.param.b64 [func_retval0+0], %rd8;
+    ret;
+
+}
+    // .globl GridGroup_destroy
+.visible .func GridGroup_destroy(
+    .param .b64 GridGroup_destroy_param_0
+)
+{
+    .reg .b64 %rd<2>;
+
+
+    ld.param.u64 %rd1, [GridGroup_destroy_param_0];
+    { // callseq 1, 0
+    .reg .b32 temp_param_reg;
+    .param .b64 param0;
+    st.param.b64 [param0+0], %rd1;
+    call.uni
+    free,
+    (
+    param0
+    );
+    } // callseq 1
+    ret;
+
+}
+    // .globl GridGroup_is_valid
+.visible .func (.param .b32 func_retval0) GridGroup_is_valid(
+    .param .b64 GridGroup_is_valid_param_0
+)
+{
+    .reg .pred %p<2>;
+    .reg .b32 %r<2>;
+    .reg .b64 %rd<3>;
+
+
+    ld.param.u64 %rd1, [GridGroup_is_valid_param_0];
+    ld.u64 %rd2, [%rd1];
+    setp.ne.s64 %p1, %rd2, 0;
+    selp.u32 %r1, 1, 0, %p1;
+    st.param.b32 [func_retval0+0], %r1;
+    ret;
+
+}
+    // .globl GridGroup_sync
+.visible .func GridGroup_sync(
+    .param .b64 GridGroup_sync_param_0
+)
+{
+    .reg .pred %p<5>;
+    .reg .b32 %r<24>;
+    .reg .b64 %rd<9>;
+
+
+    ld.param.u64 %rd1, [GridGroup_sync_param_0];
+    ld.u64 %rd8, [%rd1];
+    setp.ne.s64 %p1, %rd8, 0;
+    @%p1 bra $L__BB3_2;
+
+    // begin inline asm
+    trap;
+    // end inline asm
+    ld.u64 %rd8, [%rd1];
+
+$L__BB3_2:
+    mov.u32 %r2, %tid.y;
+    mov.u32 %r3, %tid.x;
+    add.s32 %r4, %r3, %r2;
+    mov.u32 %r5, %tid.z;
+    neg.s32 %r6, %r5;
+    setp.ne.s32 %p2, %r4, %r6;
+    bar.sync 0;
+    @%p2 bra $L__BB3_6;
+
+    add.s64 %rd6, %rd8, 4;
+    mov.u32 %r9, %ctaid.z;
+    neg.s32 %r10, %r9;
+    mov.u32 %r11, %ctaid.x;
+    mov.u32 %r12, %ctaid.y;
+    add.s32 %r13, %r11, %r12;
+    setp.eq.s32 %p3, %r13, %r10;
+    mov.u32 %r14, %nctaid.z;
+    mov.u32 %r15, %nctaid.x;
+    mov.u32 %r16, %nctaid.y;
+    mul.lo.s32 %r17, %r15, %r16;
+    mul.lo.s32 %r18, %r17, %r14;
+    mov.u32 %r19, -2147483647;
+    sub.s32 %r20, %r19, %r18;
+    selp.b32 %r8, %r20, 1, %p3;
+    membar.gl;
+    // begin inline asm
+    atom.add.release.gpu.u32 %r7,[%rd6],%r8;
+    // end inline asm
+
+$L__BB3_4:
+    ld.volatile.u32 %r21, [%rd6];
+    xor.b32 %r22, %r21, %r7;
+    setp.gt.s32 %p4, %r22, -1;
+    @%p4 bra $L__BB3_4;
+
+    // begin inline asm
+    ld.acquire.gpu.u32 %r23,[%rd6];
+    // end inline asm
+
+$L__BB3_6:
+    bar.sync 0;
+    ret;
+
+}
+    // .globl GridGroup_size
+.visible .func (.param .b64 func_retval0) GridGroup_size(
+    .param .b64 GridGroup_size_param_0
+)
+{
+    .reg .b32 %r<10>;
+    .reg .b64 %rd<4>;
+
+
+    mov.u32 %r1, %nctaid.x;
+    mov.u32 %r2, %nctaid.y;
+    mov.u32 %r3, %nctaid.z;
+    mul.lo.s32 %r4, %r2, %r3;
+    mul.wide.u32 %rd1, %r4, %r1;
+    mov.u32 %r5, %ntid.x;
+    mov.u32 %r6, %ntid.y;
+    mul.lo.s32 %r7, %r5, %r6;
+    mov.u32 %r8, %ntid.z;
+    mul.lo.s32 %r9, %r7, %r8;
+    cvt.u64.u32 %rd2, %r9;
+    mul.lo.s64 %rd3, %rd1, %rd2;
+    st.param.b64 [func_retval0+0], %rd3;
+    ret;
+
+}
+    // .globl GridGroup_thread_rank
+.visible .func (.param .b64 func_retval0) GridGroup_thread_rank(
+    .param .b64 GridGroup_thread_rank_param_0
+)
+{
+    .reg .b32 %r<16>;
+    .reg .b64 %rd<12>;
+
+
+    mov.u32 %r1, %ctaid.x;
+    mov.u32 %r2, %ctaid.y;
+    mov.u32 %r3, %ctaid.z;
+    mov.u32 %r4, %nctaid.x;
+    mov.u32 %r5, %nctaid.y;
+    mul.wide.u32 %rd1, %r5, %r3;
+    cvt.u64.u32 %rd2, %r4;
+    cvt.u64.u32 %rd3, %r2;
+    add.s64 %rd4, %rd1, %rd3;
+    mul.lo.s64 %rd5, %rd4, %rd2;
+    cvt.u64.u32 %rd6, %r1;
+    add.s64 %rd7, %rd5, %rd6;
+    mov.u32 %r6, %ntid.x;
+    mov.u32 %r7, %ntid.y;
+    mul.lo.s32 %r8, %r6, %r7;
+    mov.u32 %r9, %ntid.z;
+    mul.lo.s32 %r10, %r8, %r9;
+    cvt.u64.u32 %rd8, %r10;
+    mul.lo.s64 %rd9, %rd7, %rd8;
+    mov.u32 %r11, %tid.x;
+    mov.u32 %r12, %tid.y;
+    mov.u32 %r13, %tid.z;
+    mad.lo.s32 %r14, %r7, %r13, %r12;
+    mad.lo.s32 %r15, %r14, %r6, %r11;
+    cvt.u64.u32 %rd10, %r15;
+    add.s64 %rd11, %rd9, %rd10;
+    st.param.b64 [func_retval0+0], %rd11;
+    ret;
+
+}
+    // .globl GridGroup_num_threads
+.visible .func (.param .b64 func_retval0) GridGroup_num_threads(
+    .param .b64 GridGroup_num_threads_param_0
+)
+{
+    .reg .b32 %r<10>;
+    .reg .b64 %rd<4>;
+
+
+    mov.u32 %r1, %nctaid.x;
+    mov.u32 %r2, %nctaid.y;
+    mov.u32 %r3, %nctaid.z;
+    mul.lo.s32 %r4, %r2, %r3;
+    mul.wide.u32 %rd1, %r4, %r1;
+    mov.u32 %r5, %ntid.x;
+    mov.u32 %r6, %ntid.y;
+    mul.lo.s32 %r7, %r5, %r6;
+    mov.u32 %r8, %ntid.z;
+    mul.lo.s32 %r9, %r7, %r8;
+    cvt.u64.u32 %rd2, %r9;
+    mul.lo.s64 %rd3, %rd1, %rd2;
+    st.param.b64 [func_retval0+0], %rd3;
+    ret;
+
+}
+    // .globl GridGroup_num_blocks
+.visible .func (.param .b64 func_retval0) GridGroup_num_blocks(
+    .param .b64 GridGroup_num_blocks_param_0
+)
+{
+    .reg .b32 %r<5>;
+    .reg .b64 %rd<2>;
+
+
+    mov.u32 %r1, %nctaid.x;
+    mov.u32 %r2, %nctaid.y;
+    mov.u32 %r3, %nctaid.z;
+    mul.lo.s32 %r4, %r2, %r3;
+    mul.wide.u32 %rd1, %r4, %r1;
+    st.param.b64 [func_retval0+0], %rd1;
+    ret;
+
+}
+    // .globl GridGroup_block_rank
+.visible .func (.param .b64 func_retval0) GridGroup_block_rank(
+    .param .b64 GridGroup_block_rank_param_0
+)
+{
+    .reg .b32 %r<6>;
+    .reg .b64 %rd<8>;
+
+
+    mov.u32 %r1, %ctaid.x;
+    mov.u32 %r2, %ctaid.y;
+    mov.u32 %r3, %ctaid.z;
+    mov.u32 %r4, %nctaid.x;
+    mov.u32 %r5, %nctaid.y;
+    mul.wide.u32 %rd1, %r5, %r3;
+    cvt.u64.u32 %rd2, %r4;
+    cvt.u64.u32 %rd3, %r2;
+    add.s64 %rd4, %rd1, %rd3;
+    mul.lo.s64 %rd5, %rd4, %rd2;
+    cvt.u64.u32 %rd6, %r1;
+    add.s64 %rd7, %rd5, %rd6;
+    st.param.b64 [func_retval0+0], %rd7;
+    ret;
+
+}
+
diff --git a/crates/cuda_std_cg/src/cg_bridge.cu b/crates/cuda_std_cg/src/cg_bridge.cu
new file mode 100644
index 0000000..110d9f2
--- /dev/null
+++ b/crates/cuda_std_cg/src/cg_bridge.cu
@@ -0,0 +1,63 @@
+#include "cooperative_groups.h"
+#include "cg_bridge.cuh"
+// #include <cstdio>
+namespace cg = cooperative_groups;
+
+__device__ GridGroup this_grid()
+{
+    cg::grid_group gg = cg::this_grid();
+    GridGroupWrapper* ggp = new GridGroupWrapper { gg };
+    return ggp;
+}
+
+__device__ void GridGroup_destroy(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    delete g;
+}
+
+__device__ bool GridGroup_is_valid(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.is_valid();
+}
+
+__device__ void GridGroup_sync(GridGroup gg)
+{
+    // std::printf("calling sync from bridge");
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.sync();
+}
+
+__device__ unsigned long long GridGroup_size(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.size();
+}
+
+__device__ unsigned long long GridGroup_thread_rank(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.thread_rank();
+}
+
+__device__ unsigned long long GridGroup_num_threads(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.num_threads();
+}
+
+__device__ unsigned long long GridGroup_num_blocks(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.num_blocks();
+}
+
+__device__ unsigned long long GridGroup_block_rank(GridGroup gg)
+{
+    GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
+    return g->gg.block_rank();
+}
+
+__host__ int main()
+{}
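Note on the bridge design: `this_grid()` above boxes the `cg::grid_group` on the
device heap with `new` (visible as the `malloc`/`free` calls in the generated
PTX), and the Rust `Drop` impl in cg.rs releases it through `GridGroup_destroy`,
so the handle is RAII on the Rust side. A minimal sketch of the intended
kernel-side usage, assuming the existing cuda_std `#[kernel]` convention (the
kernel name `step` and its parameters are hypothetical):

    use cuda_std::prelude::*;
    use cuda_std::cg;

    #[kernel]
    pub unsafe fn step(data: *mut f32, n: usize) {
        // Allocates the wrapper on the device heap; freed when `grid` drops.
        let mut grid = cg::GridGroup::this_grid();
        let rank = grid.thread_rank() as usize;
        if rank < n {
            *data.add(rank) *= 2.0;
        }
        // Grid-wide barrier: every thread of every block must reach this
        // point. Valid only when the kernel was launched cooperatively
        // (see the stream.rs change below).
        grid.sync();
    }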
diff --git a/crates/cuda_std_cg/src/cg_bridge.cuh b/crates/cuda_std_cg/src/cg_bridge.cuh
new file mode 100644
index 0000000..9f22f12
--- /dev/null
+++ b/crates/cuda_std_cg/src/cg_bridge.cuh
@@ -0,0 +1,21 @@
+#pragma once
+#include "cooperative_groups.h"
+namespace cg = cooperative_groups;
+
+typedef struct GridGroupWrapper {
+    cg::grid_group gg;
+} GridGroupWrapper;
+
+extern "C" typedef void* GridGroup;
+extern "C" __device__ GridGroup this_grid();
+extern "C" __device__ void GridGroup_destroy(GridGroup gg);
+extern "C" __device__ bool GridGroup_is_valid(GridGroup gg);
+extern "C" __device__ void GridGroup_sync(GridGroup gg);
+extern "C" __device__ unsigned long long GridGroup_size(GridGroup gg);
+extern "C" __device__ unsigned long long GridGroup_thread_rank(GridGroup gg);
+// extern "C" dim3 GridGroup_group_dim(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_num_threads(GridGroup gg);
+// extern "C" dim3 GridGroup_dim_blocks(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_num_blocks(GridGroup gg);
+// extern "C" dim3 GridGroup_block_index(); // TODO: impl these.
+extern "C" __device__ unsigned long long GridGroup_block_rank(GridGroup gg);
diff --git a/crates/cust/src/error.rs b/crates/cust/src/error.rs
index 8994af2..8857c5b 100644
--- a/crates/cust/src/error.rs
+++ b/crates/cust/src/error.rs
@@ -78,6 +78,7 @@ pub enum CudaError {
     InvalidAddressSpace = 717,
     InvalidProgramCounter = 718,
     LaunchFailed = 719,
+    CooperativeLaunchTooLarge = 720,
     NotPermitted = 800,
     NotSupported = 801,
     UnknownError = 999,
@@ -209,9 +210,15 @@ impl ToResult for cudaError_enum {
             cudaError_enum::CUDA_ERROR_INVALID_ADDRESS_SPACE => Err(CudaError::InvalidAddressSpace),
             cudaError_enum::CUDA_ERROR_INVALID_PC => Err(CudaError::InvalidProgramCounter),
             cudaError_enum::CUDA_ERROR_LAUNCH_FAILED => Err(CudaError::LaunchFailed),
+            cudaError_enum::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE => {
+                Err(CudaError::CooperativeLaunchTooLarge)
+            }
             cudaError_enum::CUDA_ERROR_NOT_PERMITTED => Err(CudaError::NotPermitted),
             cudaError_enum::CUDA_ERROR_NOT_SUPPORTED => Err(CudaError::NotSupported),
-            _ => Err(CudaError::UnknownError),
+            err => {
+                println!("error encountered: {:?}", err);
+                Err(CudaError::UnknownError)
+            }
         }
     }
 }
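Note: surfacing CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE lets callers react to an
oversized cooperative grid instead of seeing UnknownError. A rough host-side
sketch using cust's existing `launch!` macro (the kernel name `step` and the
`retry_with_smaller_grid` helper are hypothetical):

    use cust::error::CudaError;
    use cust::prelude::*;

    // `module`, `stream`, grid/block sizing, and `buf` set up as usual.
    let result = unsafe {
        launch!(module.step<<<grid, block, 0, stream>>>(buf.as_device_ptr(), n))
    };
    match result {
        // More blocks were requested than can be simultaneously resident on
        // the device; shrink the grid and try again.
        Err(CudaError::CooperativeLaunchTooLarge) => retry_with_smaller_grid()?,
        other => other?,
    }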
diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs
index 25a4178..6c83703 100644
--- a/crates/cust/src/stream.rs
+++ b/crates/cust/src/stream.rs
@@ -262,7 +262,51 @@ impl Stream {
         let grid_size: GridSize = grid_size.into();
         let block_size: BlockSize = block_size.into();
 
-        cuda::cuLaunchKernel(
+        // cuda::cuLaunchKernel(
+        //     f: CUfunction,
+        //     gridDimX: ::std::os::raw::c_uint,
+        //     gridDimY: ::std::os::raw::c_uint,
+        //     gridDimZ: ::std::os::raw::c_uint,
+        //     blockDimX: ::std::os::raw::c_uint,
+        //     blockDimY: ::std::os::raw::c_uint,
+        //     blockDimZ: ::std::os::raw::c_uint,
+        //     sharedMemBytes: ::std::os::raw::c_uint,
+        //     hStream: CUstream,
+        //     kernelParams: *mut *mut ::std::os::raw::c_void,
+        //     extra: *mut *mut ::std::os::raw::c_void,
+        // ).to_result();
+
+        // cuda::cuLaunchCooperativeKernel(
+        //     f: CUfunction,
+        //     gridDimX: ::std::os::raw::c_uint,
+        //     gridDimY: ::std::os::raw::c_uint,
+        //     gridDimZ: ::std::os::raw::c_uint,
+        //     blockDimX: ::std::os::raw::c_uint,
+        //     blockDimY: ::std::os::raw::c_uint,
+        //     blockDimZ: ::std::os::raw::c_uint,
+        //     sharedMemBytes: ::std::os::raw::c_uint,
+        //     hStream: CUstream,
+        //     kernelParams: *mut *mut ::std::os::raw::c_void,
+        // ).to_result();
+
+        // cuda::cuLaunchKernel(
+        //     func.to_raw(),
+        //     grid_size.x,
+        //     grid_size.y,
+        //     grid_size.z,
+        //     block_size.x,
+        //     block_size.y,
+        //     block_size.z,
+        //     shared_mem_bytes,
+        //     self.inner,
+        //     args.as_ptr() as *mut _,
+        //     ptr::null_mut(),
+        // )
+        // .to_result()
+
+        // TODO: make this configurable based on invocation patterns. For now, just testing.
+
+        cuda::cuLaunchCooperativeKernel(
             func.to_raw(),
             grid_size.x,
             grid_size.y,
@@ -273,7 +317,6 @@ impl Stream {
             shared_mem_bytes,
             self.inner,
             args.as_ptr() as *mut _,
-            ptr::null_mut(),
         )
         .to_result()
     }
diff --git a/crates/cust_raw/bindgen.sh b/crates/cust_raw/bindgen.sh
old mode 100644
new mode 100755
index 83c8b7b..afc1ed3
--- a/crates/cust_raw/bindgen.sh
+++ b/crates/cust_raw/bindgen.sh
@@ -1,15 +1,20 @@
 #!/bin/bash
 set -exu
 
+if [ -z "${CUDA_PATH:-}" ]; then
+    echo 'env var ${CUDA_PATH} must be defined, and must point to the root directory of the target CUDA installation'
+    exit 1
+fi
+
 bindgen \
-  --whitelist-type="^CU.*" \
-  --whitelist-type="^cuuint(32|64)_t" \
-  --whitelist-type="^cudaError_enum" \
-  --whitelist-type="^cu.*Complex$" \
-  --whitelist-type="^cuda.*" \
-  --whitelist-type="^libraryPropertyType.*" \
-  --whitelist-var="^CU.*" \
-  --whitelist-function="^cu.*" \
+  --allowlist-type="^CU.*" \
+  --allowlist-type="^cuuint(32|64)_t" \
+  --allowlist-type="^cudaError_enum" \
+  --allowlist-type="^cu.*Complex$" \
+  --allowlist-type="^cuda.*" \
+  --allowlist-type="^libraryPropertyType.*" \
+  --allowlist-var="^CU.*" \
+  --allowlist-function="^cu.*" \
   --default-enum-style=rust \
   --no-doc-comments \
   --with-derive-default \
@@ -17,5 +22,7 @@ bindgen \
   --with-derive-hash \
   --with-derive-ord \
   --size_t-is-usize \
-  wrapper.h -- -I/opt/cuda/include \
-  > src/cuda.rs
\ No newline at end of file
+  wrapper.h \
+  -- \
+  -I${CUDA_PATH}/include \
+  > src/cuda.rs
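Note: cooperative launch has runtime preconditions that this patch does not yet
check: the device must support cooperative launch (compute capability 6.0+),
and, per the TODO in stream.rs, every launch currently goes through
cuLaunchCooperativeKernel until the cooperative path is made opt-in. A guard
like the following could gate a fallback; this sketch assumes cust exposes the
attribute as DeviceAttribute::CooperativeLaunch, mirroring the driver API's
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH:

    use cust::device::{Device, DeviceAttribute};

    let device = Device::get_device(0)?;
    // Non-zero means the device can run cooperative kernels.
    if device.get_attribute(DeviceAttribute::CooperativeLaunch)? == 0 {
        // Fall back to an ordinary (non-cooperative) launch path.
    }

To regenerate the bindings, point CUDA_PATH at the toolkit root, e.g.
`CUDA_PATH=/usr/local/cuda crates/cust_raw/bindgen.sh`.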