diff --git a/Cargo.toml b/Cargo.toml index 067e29d..1a25cae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ members = [ ] exclude = [ - "crates/optix/examples/common" + "crates/optix/examples/common", ] [profile.dev.package.rustc_codegen_nvvm] diff --git a/crates/cuda_builder/Cargo.toml b/crates/cuda_builder/Cargo.toml index 53de4fd..1ab6a51 100644 --- a/crates/cuda_builder/Cargo.toml +++ b/crates/cuda_builder/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cuda_builder" -version = "0.3.0" +version = "0.4.0" edition = "2021" authors = ["Riccardo D'Ambrosio ", "The rust-gpu Authors"] license = "MIT OR Apache-2.0" @@ -9,8 +9,16 @@ repository = "https://github.com/Rust-GPU/Rust-CUDA" readme = "../../README.md" [dependencies] +anyhow = "1" +thiserror = "1" +cc = { version = "1", default-features = false, optional = true } +cust = { path = "../cust", optional = true } rustc_codegen_nvvm = { version = "0.3", path = "../rustc_codegen_nvvm" } nvvm = { path = "../nvvm", version = "0.1" } serde = { version = "1.0.130", features = ["derive"] } serde_json = "1.0.68" find_cuda_helper = { version = "0.2", path = "../find_cuda_helper" } + +[features] +default = [] +cooperative_groups = ["cc", "cust"] diff --git a/crates/cuda_builder/cg/cg_bridge.cu b/crates/cuda_builder/cg/cg_bridge.cu new file mode 100644 index 0000000..51a5869 --- /dev/null +++ b/crates/cuda_builder/cg/cg_bridge.cu @@ -0,0 +1,61 @@ +#include "cooperative_groups.h" +#include "cg_bridge.cuh" +namespace cg = cooperative_groups; + +__device__ GridGroup this_grid() +{ + cg::grid_group gg = cg::this_grid(); + GridGroupWrapper* ggp = new GridGroupWrapper { gg }; + return ggp; +} + +__device__ void GridGroup_destroy(GridGroup gg) +{ + GridGroupWrapper* g = static_cast(gg); + delete g; +} + +__device__ bool GridGroup_is_valid(GridGroup gg) +{ + GridGroupWrapper* g = static_cast(gg); + return g->gg.is_valid(); +} + +__device__ void GridGroup_sync(GridGroup gg) +{ + GridGroupWrapper* g = static_cast(gg); + return g->gg.sync(); +} + +__device__ unsigned long long GridGroup_size(GridGroup gg) +{ + GridGroupWrapper* g = static_cast(gg); + return g->gg.size(); +} + +__device__ unsigned long long GridGroup_thread_rank(GridGroup gg) +{ + GridGroupWrapper* g = static_cast(gg); + return g->gg.thread_rank(); +} + +__device__ unsigned long long GridGroup_num_threads(GridGroup gg) +{ + GridGroupWrapper* g = static_cast(gg); + return g->gg.num_threads(); +} + +__device__ unsigned long long GridGroup_num_blocks(GridGroup gg) +{ + GridGroupWrapper* g = static_cast(gg); + return g->gg.num_blocks(); +} + +__device__ unsigned long long GridGroup_block_rank(GridGroup gg) +{ + GridGroupWrapper* g = static_cast(gg); + return g->gg.block_rank(); +} + +__host__ int main() +{} diff --git a/crates/cuda_builder/cg/cg_bridge.cuh b/crates/cuda_builder/cg/cg_bridge.cuh new file mode 100644 index 0000000..9f22f12 --- /dev/null +++ b/crates/cuda_builder/cg/cg_bridge.cuh @@ -0,0 +1,21 @@ +#pragma once +#include "cooperative_groups.h" +namespace cg = cooperative_groups; + +typedef struct GridGroupWrapper { + cg::grid_group gg; +} GridGroupWrapper; + +extern "C" typedef void* GridGroup; +extern "C" __device__ GridGroup this_grid(); +extern "C" __device__ void GridGroup_destroy(GridGroup gg); +extern "C" __device__ bool GridGroup_is_valid(GridGroup gg); +extern "C" __device__ void GridGroup_sync(GridGroup gg); +extern "C" __device__ unsigned long long GridGroup_size(GridGroup gg); +extern "C" __device__ unsigned long long GridGroup_thread_rank(GridGroup gg); +// extern "C" dim3 GridGroup_group_dim(); // TODO: impl these. +extern "C" __device__ unsigned long long GridGroup_num_threads(GridGroup gg); +// extern "C" dim3 GridGroup_dim_blocks(); // TODO: impl these. +extern "C" __device__ unsigned long long GridGroup_num_blocks(GridGroup gg); +// extern "C" dim3 GridGroup_block_index(); // TODO: impl these. +extern "C" __device__ unsigned long long GridGroup_block_rank(GridGroup gg); diff --git a/crates/cuda_builder/src/cg.rs b/crates/cuda_builder/src/cg.rs new file mode 100644 index 0000000..186708e --- /dev/null +++ b/crates/cuda_builder/src/cg.rs @@ -0,0 +1,174 @@ +//! Cooperative Groups compilation and linking. + +use std::path::{Path, PathBuf}; + +use anyhow::Context; + +use crate::{CudaBuilderError, CudaBuilderResult}; + +/// An artifact which may be linked together with the Cooperative Groups API bridge PTX code. +pub enum LinkableArtifact { + /// A PTX artifact. + Ptx(PathBuf), + /// A cubin artifact. + Cubin(PathBuf), + /// A fatbin artifact. + Fatbin(PathBuf), +} + +impl LinkableArtifact { + /// Add this artifact to the given linker. + fn link_artifact(&self, linker: &mut cust::link::Linker) -> CudaBuilderResult<()> { + match &self { + LinkableArtifact::Ptx(path) => { + let mut data = std::fs::read_to_string(&path).with_context(|| { + format!("error reading PTX file for linking, file={:?}", path) + })?; + if !data.ends_with('\0') { + // If the PTX is not null-terminated, then linking will fail. Only required for PTX. + data.push('\0'); + } + linker + .add_ptx(&data) + .with_context(|| format!("error linking PTX file={:?}", path))?; + } + LinkableArtifact::Cubin(path) => { + let data = std::fs::read(&path).with_context(|| { + format!("error reading cubin file for linking, file={:?}", path) + })?; + linker + .add_cubin(&data) + .with_context(|| format!("error linking cubin file={:?}", path))?; + } + LinkableArtifact::Fatbin(path) => { + let data = std::fs::read(&path).with_context(|| { + format!("error reading fatbin file for linking, file={:?}", path) + })?; + linker + .add_fatbin(&data) + .with_context(|| format!("error linking fatbin file={:?}", path))?; + } + } + Ok(()) + } +} + +/// A builder which will compile the Cooperative Groups API bridging code, and will then link it +/// together with any other artifacts provided to this builder. +/// +/// The result of this process will be a `cubin` file containing the linked Cooperative Groups +/// PTX code along with any other linked artifacts provided to this builder. The output `cubin` +/// may then be loaded via `cust::module::Module::from_cubin(..)` and used as normal. +#[derive(Default)] +pub struct CooperativeGroups { + /// Artifacts to be linked together with the Cooperative Groups bridge code. + artifacts: Vec, + /// Flags to pass to nvcc for Cooperative Groups API bridge compilation. + nvcc_flags: Vec, +} + +impl CooperativeGroups { + /// Construct a new instance. + pub fn new() -> Self { + Self::default() + } + + /// Add the artifact at the given path for linking. + /// + /// This only applies to linking with the Cooperative Groups API bridge code. Typically, + /// this will be the PTX of your main program which has already been built via `CudaBuilder`. + pub fn link(mut self, artifact: LinkableArtifact) -> Self { + self.artifacts.push(artifact); + self + } + + /// Add a flag to be passed along to `nvcc` during compilation of the Cooperative Groups API bridge code. + /// + /// This provides maximum flexibility for code generation. If needed, multiple architectures + /// may be generated by adding the appropriate flags to the `nvcc` call. + /// + /// By default, `nvcc` will generate code for `sm_52`. Override by specifying any of `--gpu-architecture`, + /// `--gpu-code`, or `--generate-code` flags. + /// + /// Regardless of the flags added via this method, this builder will always added the following flags: + /// - `-I/include`: ensuring `cooperative_groups.h` can be found. + /// - `-Icg`: ensuring the bridging header can be found. + /// - `--ptx`: forces the compiled output to be in PTX form. + /// - `--device-c`: to compile the bridging code as relocatable device code. + /// - `src/cg_bridge.cu` will be added as the code to be compiled, which generates the + /// Cooperative Groups API bridge. + /// + /// Docs: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#command-option-description + pub fn nvcc_flag(mut self, val: impl AsRef) -> Self { + self.nvcc_flags.push(val.as_ref().to_string()); + self + } + + /// Compile the Cooperative Groups API bridging code, and then link it together + /// with any other artifacts provided to this builder. + /// + /// - `cg_out` specifies the output location for the Cooperative Groups API bridge PTX. + /// - `cubin_out` specifies the output location for the fully linked `cubin`. + /// + /// ## Errors + /// - At least one artifact must be provided to this builder for linking. + /// - Any errors which take place from the `nvcc` compilation of the Cooperative Groups briding + /// code, or any errors which take place during module linking. + pub fn compile( + mut self, + cg_out: impl AsRef, + cubin_out: impl AsRef, + ) -> CudaBuilderResult<()> { + // Perform some initial validation. + if self.artifacts.is_empty() { + return Err(anyhow::anyhow!("must provide at least 1 ptx/cubin/fatbin artifact to be linked with the Cooperative Groups API bridge code").into()); + } + + // Find the cuda installation directory for compilation of CG API. + let cuda_root = + find_cuda_helper::find_cuda_root().ok_or(CudaBuilderError::CudaRootNotFound)?; + let cuda_include = cuda_root.join("include"); + let cg_src = std::path::Path::new(std::file!()) + .parent() + .context("error accessing parent dir cuda_builder/src")? + .parent() + .context("error accessing parent dir cuda_builder")? + .join("cg") + .canonicalize() + .context("error taking canonical path to cooperative groups API bridge code")?; + let cg_bridge_cu = cg_src.join("cg_bridge.cu"); + + // Build up the `nvcc` invocation and then build the bridging code. + let mut nvcc = std::process::Command::new("nvcc"); + nvcc.arg(format!("-I{:?}", &cuda_include).as_str()) + .arg(format!("-I{:?}", &cg_src).as_str()) + .arg("--ptx") + .arg("-o") + .arg(cg_out.as_ref().to_string_lossy().as_ref()) + .arg("--device-c") + .arg(cg_bridge_cu.to_string_lossy().as_ref()); + for flag in self.nvcc_flags.iter() { + nvcc.arg(flag.as_str()); + } + nvcc.status() + .context("error calling nvcc for Cooperative Groups API bridge compilation")?; + + // Link together the briding code with any given PTX/cubin/fatbin artifacts. + let _ctx = cust::quick_init().context("error building cuda context")?; + let mut linker = cust::link::Linker::new().context("error building cust linker")?; + self.artifacts + .push(LinkableArtifact::Ptx(cg_out.as_ref().to_path_buf())); + for artifact in self.artifacts.iter() { + artifact.link_artifact(&mut linker)?; + } + let linked_cubin = linker + .complete() + .context("error linking artifacts with Cooperative Groups API bridge PTX")?; + + // Write finalized cubin. + std::fs::write(&cubin_out, &linked_cubin) + .with_context(|| format!("error writing linked cubin to {:?}", cubin_out.as_ref()))?; + + Ok(()) + } +} diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index e5b1e60..8552484 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -1,36 +1,37 @@ //! Utility crate for easily building CUDA crates using rustc_codegen_nvvm. Derived from rust-gpu's spirv_builder. +#[cfg(feature = "cooperative_groups")] +pub mod cg; + pub use nvvm::*; use serde::Deserialize; use std::{ borrow::Borrow, env, ffi::OsString, - fmt, path::{Path, PathBuf}, process::{Command, Stdio}, }; -#[derive(Debug)] +/// Cuda builder result type. +pub type CudaBuilderResult = Result; + +/// Cuda builder error type. +#[derive(thiserror::Error, Debug)] #[non_exhaustive] pub enum CudaBuilderError { + #[error("crate path {0} does not exist")] CratePathDoesntExist(PathBuf), - FailedToCopyPtxFile(std::io::Error), + #[error("build failed")] BuildFailed, -} - -impl fmt::Display for CudaBuilderError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - CudaBuilderError::CratePathDoesntExist(path) => { - write!(f, "Crate path {} does not exist", path.display()) - } - CudaBuilderError::BuildFailed => f.write_str("Build failed"), - CudaBuilderError::FailedToCopyPtxFile(err) => { - f.write_str(&format!("Failed to copy PTX file: {:?}", err)) - } - } - } + #[error("failed to copy PTX file: {0:?}")] + FailedToCopyPtxFile(#[from] std::io::Error), + #[cfg(feature = "cooperative_groups")] + #[error("could not find cuda root installation dir")] + CudaRootNotFound, + #[cfg(feature = "cooperative_groups")] + #[error("compilation of the Cooperative Groups API bridge code failed: {0}")] + CGError(#[from] anyhow::Error), } #[derive(Debug, Clone, Copy, PartialEq)] diff --git a/crates/cuda_std/Cargo.toml b/crates/cuda_std/Cargo.toml index 0a25f7e..ec6893e 100644 --- a/crates/cuda_std/Cargo.toml +++ b/crates/cuda_std/Cargo.toml @@ -13,3 +13,10 @@ cuda_std_macros = { version = "0.2", path = "../cuda_std_macros" } half = "1.7.1" bitflags = "1.3.2" paste = "1.0.5" + +[features] +default = [] +cooperative_groups = [] + +[package.metadata.docs.rs] +all-features = true diff --git a/crates/cuda_std/src/cg.rs b/crates/cuda_std/src/cg.rs new file mode 100644 index 0000000..cb01e21 --- /dev/null +++ b/crates/cuda_std/src/cg.rs @@ -0,0 +1,74 @@ +//! Cuda Cooperative Groups API interface. + +use crate::gpu_only; + +mod ffi { + use core::ffi::c_void; + + pub type GridGroup = *mut c_void; + extern "C" { + pub(super) fn this_grid() -> GridGroup; + pub(super) fn GridGroup_destroy(gg: GridGroup); + pub(super) fn GridGroup_is_valid(gg: GridGroup) -> bool; + pub(super) fn GridGroup_sync(gg: GridGroup); + pub(super) fn GridGroup_size(gg: GridGroup) -> u64; + pub(super) fn GridGroup_thread_rank(gg: GridGroup) -> u64; + pub(super) fn GridGroup_num_threads(gg: GridGroup) -> u64; + pub(super) fn GridGroup_num_blocks(gg: GridGroup) -> u64; + pub(super) fn GridGroup_block_rank(gg: GridGroup) -> u64; + // dim3 GridGroup_group_dim(); // TODO: impl these. + // dim3 GridGroup_dim_blocks(); // TODO: impl these. + // dim3 GridGroup_block_index(); // TODO: impl these. + } +} + +pub struct GridGroup(ffi::GridGroup); + +impl Drop for GridGroup { + fn drop(&mut self) { + unsafe { ffi::GridGroup_destroy(self.0) } + } +} + +impl GridGroup { + #[gpu_only] + pub fn this_grid() -> Self { + let ptr = unsafe { ffi::this_grid() }; + GridGroup(ptr) + } + + #[gpu_only] + pub fn is_valid(&mut self) -> bool { + unsafe { ffi::GridGroup_is_valid(self.0) } + } + + #[gpu_only] + pub fn sync(&mut self) { + unsafe { ffi::GridGroup_sync(self.0) } + } + + #[gpu_only] + pub fn size(&mut self) -> u64 { + unsafe { ffi::GridGroup_size(self.0) } + } + + #[gpu_only] + pub fn thread_rank(&mut self) -> u64 { + unsafe { ffi::GridGroup_thread_rank(self.0) } + } + + #[gpu_only] + pub fn num_threads(&mut self) -> u64 { + unsafe { ffi::GridGroup_num_threads(self.0) } + } + + #[gpu_only] + pub fn num_blocks(&mut self) -> u64 { + unsafe { ffi::GridGroup_num_blocks(self.0) } + } + + #[gpu_only] + pub fn block_rank(&mut self) -> u64 { + unsafe { ffi::GridGroup_block_rank(self.0) } + } +} diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs index 33e7b28..e8c43a0 100644 --- a/crates/cuda_std/src/lib.rs +++ b/crates/cuda_std/src/lib.rs @@ -46,6 +46,8 @@ pub mod misc; // pub mod rt; pub mod atomic; pub mod cfg; +#[cfg(feature = "cooperative_groups")] +pub mod cg; pub mod ptr; pub mod shared; pub mod thread; diff --git a/crates/cust/src/error.rs b/crates/cust/src/error.rs index 8994af2..9e7224a 100644 --- a/crates/cust/src/error.rs +++ b/crates/cust/src/error.rs @@ -78,6 +78,7 @@ pub enum CudaError { InvalidAddressSpace = 717, InvalidProgramCounter = 718, LaunchFailed = 719, + CooperativeLaunchTooLarge = 720, NotPermitted = 800, NotSupported = 801, UnknownError = 999, @@ -209,9 +210,12 @@ impl ToResult for cudaError_enum { cudaError_enum::CUDA_ERROR_INVALID_ADDRESS_SPACE => Err(CudaError::InvalidAddressSpace), cudaError_enum::CUDA_ERROR_INVALID_PC => Err(CudaError::InvalidProgramCounter), cudaError_enum::CUDA_ERROR_LAUNCH_FAILED => Err(CudaError::LaunchFailed), + cudaError_enum::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE => { + Err(CudaError::CooperativeLaunchTooLarge) + } cudaError_enum::CUDA_ERROR_NOT_PERMITTED => Err(CudaError::NotPermitted), cudaError_enum::CUDA_ERROR_NOT_SUPPORTED => Err(CudaError::NotSupported), - _ => Err(CudaError::UnknownError), + err => Err(CudaError::UnknownError), } } } diff --git a/crates/cust/src/function.rs b/crates/cust/src/function.rs index 3f6faf2..07c06ca 100644 --- a/crates/cust/src/function.rs +++ b/crates/cust/src/function.rs @@ -545,3 +545,38 @@ macro_rules! launch { } }; } + +/// Launch a cooperative kernel function asynchronously. +/// +/// This macro is the same as `launch!`, except that it will launch kernels using the driver API +/// `cuLaunchCooperativeKernel` function. +#[macro_export] +macro_rules! launch_cooperative { + ($module:ident . $function:ident <<<$grid:expr, $block:expr, $shared:expr, $stream:ident>>>( $( $arg:expr),* $(,)?)) => { + { + let function = $module.get_function(stringify!($function)); + match function { + Ok(f) => launch_cooperative!(f<<<$grid, $block, $shared, $stream>>>( $($arg),* ) ), + Err(e) => Err(e), + } + } + }; + ($function:ident <<<$grid:expr, $block:expr, $shared:expr, $stream:ident>>>( $( $arg:expr),* $(,)?)) => { + { + fn assert_impl_devicecopy(_val: T) {} + if false { + $( + assert_impl_devicecopy($arg); + )* + }; + + $stream.launch_cooperative(&$function, $grid, $block, $shared, + &[ + $( + &$arg as *const _ as *mut ::std::ffi::c_void, + )* + ] + ) + } + }; +} diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs index 25a4178..aedb4a3 100644 --- a/crates/cust/src/stream.rs +++ b/crates/cust/src/stream.rs @@ -278,6 +278,38 @@ impl Stream { .to_result() } + // Hidden implementation detail function. Highly unsafe. Use the `launch!` macro instead. + #[doc(hidden)] + pub unsafe fn launch_cooperative( + &self, + func: &Function, + grid_size: G, + block_size: B, + shared_mem_bytes: u32, + args: &[*mut c_void], + ) -> CudaResult<()> + where + G: Into, + B: Into, + { + let grid_size: GridSize = grid_size.into(); + let block_size: BlockSize = block_size.into(); + + cuda::cuLaunchCooperativeKernel( + func.to_raw(), + grid_size.x, + grid_size.y, + grid_size.z, + block_size.x, + block_size.y, + block_size.z, + shared_mem_bytes, + self.inner, + args.as_ptr() as *mut _, + ) + .to_result() + } + // Get the inner `CUstream` from the `Stream`. If you use this handle elsewhere, // make sure not to use it after the stream has been dropped. Or ManuallyDrop the struct to be safe. pub fn as_inner(&self) -> CUstream { diff --git a/crates/cust_raw/bindgen.sh b/crates/cust_raw/bindgen.sh old mode 100644 new mode 100755 index 83c8b7b..afc1ed3 --- a/crates/cust_raw/bindgen.sh +++ b/crates/cust_raw/bindgen.sh @@ -1,15 +1,20 @@ #!/bin/bash set -exu +if [ -z ${CUDA_PATH} ]; then + echo 'env var ${CUDA_PATH} must be defined, and must point to the root directory of the target Cuda installation' + exit 1 +fi + bindgen \ - --whitelist-type="^CU.*" \ - --whitelist-type="^cuuint(32|64)_t" \ - --whitelist-type="^cudaError_enum" \ - --whitelist-type="^cu.*Complex$" \ - --whitelist-type="^cuda.*" \ - --whitelist-type="^libraryPropertyType.*" \ - --whitelist-var="^CU.*" \ - --whitelist-function="^cu.*" \ + --allowlist-type="^CU.*" \ + --allowlist-type="^cuuint(32|64)_t" \ + --allowlist-type="^cudaError_enum" \ + --allowlist-type="^cu.*Complex$" \ + --allowlist-type="^cuda.*" \ + --allowlist-type="^libraryPropertyType.*" \ + --allowlist-var="^CU.*" \ + --allowlist-function="^cu.*" \ --default-enum-style=rust \ --no-doc-comments \ --with-derive-default \ @@ -17,5 +22,7 @@ bindgen \ --with-derive-hash \ --with-derive-ord \ --size_t-is-usize \ - wrapper.h -- -I/opt/cuda/include \ - > src/cuda.rs \ No newline at end of file + wrapper.h \ + -- \ + -I${CUDA_PATH}/include \ + > src/cuda.rs diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs index e8bae63..c92c668 100644 --- a/crates/nvvm/src/lib.rs +++ b/crates/nvvm/src/lib.rs @@ -254,6 +254,8 @@ impl FromStr for NvvmOption { "72" => NvvmArch::Compute72, "75" => NvvmArch::Compute75, "80" => NvvmArch::Compute80, + "86" => NvvmArch::Compute86, + "87" => NvvmArch::Compute87, _ => return Err("unknown arch"), }; Self::Arch(arch) @@ -278,6 +280,8 @@ pub enum NvvmArch { Compute72, Compute75, Compute80, + Compute86, + Compute87, } impl Display for NvvmArch { @@ -432,6 +436,8 @@ mod tests { "-arch=compute_72", "-arch=compute_75", "-arch=compute_80", + "-arch=compute_86", + "-arch=compute_87", "-ftz=1", "-prec-sqrt=0", "-prec-div=0", @@ -453,6 +459,8 @@ mod tests { Arch(Compute72), Arch(Compute75), Arch(Compute80), + Arch(Compute86), + Arch(Compute87), Ftz, FastSqrt, FastDiv,