Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Cooperative Groups API integration #87

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ members = [
]

exclude = [
"crates/optix/examples/common"
"crates/optix/examples/common",
]

[profile.dev.package.rustc_codegen_nvvm]
Expand Down
10 changes: 9 additions & 1 deletion crates/cuda_builder/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "cuda_builder"
version = "0.3.0"
version = "0.4.0"
edition = "2021"
authors = ["Riccardo D'Ambrosio <[email protected]>", "The rust-gpu Authors"]
license = "MIT OR Apache-2.0"
Expand All @@ -9,8 +9,16 @@ repository = "https://github.com/Rust-GPU/Rust-CUDA"
readme = "../../README.md"

[dependencies]
anyhow = "1"
thiserror = "1"
cc = { version = "1", default-features = false, optional = true }
cust = { path = "../cust", optional = true }
rustc_codegen_nvvm = { version = "0.3", path = "../rustc_codegen_nvvm" }
nvvm = { path = "../nvvm", version = "0.1" }
serde = { version = "1.0.130", features = ["derive"] }
serde_json = "1.0.68"
find_cuda_helper = { version = "0.2", path = "../find_cuda_helper" }

[features]
default = []
cooperative_groups = ["cc", "cust"]
61 changes: 61 additions & 0 deletions crates/cuda_builder/cg/cg_bridge.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include "cooperative_groups.h"
#include "cg_bridge.cuh"
namespace cg = cooperative_groups;

__device__ GridGroup this_grid()
{
cg::grid_group gg = cg::this_grid();
GridGroupWrapper* ggp = new GridGroupWrapper { gg };
return ggp;
}

__device__ void GridGroup_destroy(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
delete g;
}

__device__ bool GridGroup_is_valid(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.is_valid();
}

__device__ void GridGroup_sync(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.sync();
}

__device__ unsigned long long GridGroup_size(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.size();
}

__device__ unsigned long long GridGroup_thread_rank(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.thread_rank();
}

__device__ unsigned long long GridGroup_num_threads(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.num_threads();
}

__device__ unsigned long long GridGroup_num_blocks(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.num_blocks();
}

__device__ unsigned long long GridGroup_block_rank(GridGroup gg)
{
GridGroupWrapper* g = static_cast<GridGroupWrapper*>(gg);
return g->gg.block_rank();
}

__host__ int main()
{}
21 changes: 21 additions & 0 deletions crates/cuda_builder/cg/cg_bridge.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once
#include "cooperative_groups.h"
namespace cg = cooperative_groups;

typedef struct GridGroupWrapper {
cg::grid_group gg;
} GridGroupWrapper;

extern "C" typedef void* GridGroup;
extern "C" __device__ GridGroup this_grid();
extern "C" __device__ void GridGroup_destroy(GridGroup gg);
extern "C" __device__ bool GridGroup_is_valid(GridGroup gg);
extern "C" __device__ void GridGroup_sync(GridGroup gg);
extern "C" __device__ unsigned long long GridGroup_size(GridGroup gg);
extern "C" __device__ unsigned long long GridGroup_thread_rank(GridGroup gg);
// extern "C" dim3 GridGroup_group_dim(); // TODO: impl these.
extern "C" __device__ unsigned long long GridGroup_num_threads(GridGroup gg);
// extern "C" dim3 GridGroup_dim_blocks(); // TODO: impl these.
extern "C" __device__ unsigned long long GridGroup_num_blocks(GridGroup gg);
// extern "C" dim3 GridGroup_block_index(); // TODO: impl these.
extern "C" __device__ unsigned long long GridGroup_block_rank(GridGroup gg);
174 changes: 174 additions & 0 deletions crates/cuda_builder/src/cg.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
//! Cooperative Groups compilation and linking.

use std::path::{Path, PathBuf};

use anyhow::Context;

use crate::{CudaBuilderError, CudaBuilderResult};

/// An artifact which may be linked together with the Cooperative Groups API bridge PTX code.
pub enum LinkableArtifact {
/// A PTX artifact.
Ptx(PathBuf),
/// A cubin artifact.
Cubin(PathBuf),
/// A fatbin artifact.
Fatbin(PathBuf),
}

impl LinkableArtifact {
/// Add this artifact to the given linker.
fn link_artifact(&self, linker: &mut cust::link::Linker) -> CudaBuilderResult<()> {
match &self {
LinkableArtifact::Ptx(path) => {
let mut data = std::fs::read_to_string(&path).with_context(|| {
format!("error reading PTX file for linking, file={:?}", path)
})?;
if !data.ends_with('\0') {
// If the PTX is not null-terminated, then linking will fail. Only required for PTX.
data.push('\0');
}
linker
.add_ptx(&data)
.with_context(|| format!("error linking PTX file={:?}", path))?;
}
LinkableArtifact::Cubin(path) => {
let data = std::fs::read(&path).with_context(|| {
format!("error reading cubin file for linking, file={:?}", path)
})?;
linker
.add_cubin(&data)
.with_context(|| format!("error linking cubin file={:?}", path))?;
}
LinkableArtifact::Fatbin(path) => {
let data = std::fs::read(&path).with_context(|| {
format!("error reading fatbin file for linking, file={:?}", path)
})?;
linker
.add_fatbin(&data)
.with_context(|| format!("error linking fatbin file={:?}", path))?;
}
}
Ok(())
}
}

/// A builder which will compile the Cooperative Groups API bridging code, and will then link it
/// together with any other artifacts provided to this builder.
///
/// The result of this process will be a `cubin` file containing the linked Cooperative Groups
/// PTX code along with any other linked artifacts provided to this builder. The output `cubin`
/// may then be loaded via `cust::module::Module::from_cubin(..)` and used as normal.
#[derive(Default)]
pub struct CooperativeGroups {
/// Artifacts to be linked together with the Cooperative Groups bridge code.
artifacts: Vec<LinkableArtifact>,
/// Flags to pass to nvcc for Cooperative Groups API bridge compilation.
nvcc_flags: Vec<String>,
}

impl CooperativeGroups {
/// Construct a new instance.
pub fn new() -> Self {
Self::default()
}

/// Add the artifact at the given path for linking.
///
/// This only applies to linking with the Cooperative Groups API bridge code. Typically,
/// this will be the PTX of your main program which has already been built via `CudaBuilder`.
pub fn link(mut self, artifact: LinkableArtifact) -> Self {
self.artifacts.push(artifact);
self
}

/// Add a flag to be passed along to `nvcc` during compilation of the Cooperative Groups API bridge code.
///
/// This provides maximum flexibility for code generation. If needed, multiple architectures
/// may be generated by adding the appropriate flags to the `nvcc` call.
///
/// By default, `nvcc` will generate code for `sm_52`. Override by specifying any of `--gpu-architecture`,
/// `--gpu-code`, or `--generate-code` flags.
///
/// Regardless of the flags added via this method, this builder will always added the following flags:
/// - `-I<cudaRoot>/include`: ensuring `cooperative_groups.h` can be found.
/// - `-Icg`: ensuring the bridging header can be found.
/// - `--ptx`: forces the compiled output to be in PTX form.
/// - `--device-c`: to compile the bridging code as relocatable device code.
/// - `src/cg_bridge.cu` will be added as the code to be compiled, which generates the
/// Cooperative Groups API bridge.
///
/// Docs: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#command-option-description
pub fn nvcc_flag(mut self, val: impl AsRef<str>) -> Self {
self.nvcc_flags.push(val.as_ref().to_string());
self
}

/// Compile the Cooperative Groups API bridging code, and then link it together
/// with any other artifacts provided to this builder.
///
/// - `cg_out` specifies the output location for the Cooperative Groups API bridge PTX.
/// - `cubin_out` specifies the output location for the fully linked `cubin`.
///
/// ## Errors
/// - At least one artifact must be provided to this builder for linking.
/// - Any errors which take place from the `nvcc` compilation of the Cooperative Groups briding
/// code, or any errors which take place during module linking.
pub fn compile(
mut self,
cg_out: impl AsRef<Path>,
cubin_out: impl AsRef<Path>,
) -> CudaBuilderResult<()> {
// Perform some initial validation.
if self.artifacts.is_empty() {
return Err(anyhow::anyhow!("must provide at least 1 ptx/cubin/fatbin artifact to be linked with the Cooperative Groups API bridge code").into());
}

// Find the cuda installation directory for compilation of CG API.
let cuda_root =
find_cuda_helper::find_cuda_root().ok_or(CudaBuilderError::CudaRootNotFound)?;
let cuda_include = cuda_root.join("include");
let cg_src = std::path::Path::new(std::file!())
.parent()
.context("error accessing parent dir cuda_builder/src")?
.parent()
.context("error accessing parent dir cuda_builder")?
.join("cg")
.canonicalize()
.context("error taking canonical path to cooperative groups API bridge code")?;
let cg_bridge_cu = cg_src.join("cg_bridge.cu");

// Build up the `nvcc` invocation and then build the bridging code.
let mut nvcc = std::process::Command::new("nvcc");
nvcc.arg(format!("-I{:?}", &cuda_include).as_str())
.arg(format!("-I{:?}", &cg_src).as_str())
.arg("--ptx")
.arg("-o")
.arg(cg_out.as_ref().to_string_lossy().as_ref())
.arg("--device-c")
.arg(cg_bridge_cu.to_string_lossy().as_ref());
for flag in self.nvcc_flags.iter() {
nvcc.arg(flag.as_str());
}
nvcc.status()
.context("error calling nvcc for Cooperative Groups API bridge compilation")?;

// Link together the briding code with any given PTX/cubin/fatbin artifacts.
let _ctx = cust::quick_init().context("error building cuda context")?;
let mut linker = cust::link::Linker::new().context("error building cust linker")?;
self.artifacts
.push(LinkableArtifact::Ptx(cg_out.as_ref().to_path_buf()));
for artifact in self.artifacts.iter() {
artifact.link_artifact(&mut linker)?;
}
let linked_cubin = linker
.complete()
.context("error linking artifacts with Cooperative Groups API bridge PTX")?;

// Write finalized cubin.
std::fs::write(&cubin_out, &linked_cubin)
.with_context(|| format!("error writing linked cubin to {:?}", cubin_out.as_ref()))?;

Ok(())
}
}
35 changes: 18 additions & 17 deletions crates/cuda_builder/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,36 +1,37 @@
//! Utility crate for easily building CUDA crates using rustc_codegen_nvvm. Derived from rust-gpu's spirv_builder.
#[cfg(feature = "cooperative_groups")]
pub mod cg;

pub use nvvm::*;
use serde::Deserialize;
use std::{
borrow::Borrow,
env,
ffi::OsString,
fmt,
path::{Path, PathBuf},
process::{Command, Stdio},
};

#[derive(Debug)]
/// Cuda builder result type.
pub type CudaBuilderResult<T> = Result<T, CudaBuilderError>;

/// Cuda builder error type.
#[derive(thiserror::Error, Debug)]
#[non_exhaustive]
pub enum CudaBuilderError {
#[error("crate path {0} does not exist")]
CratePathDoesntExist(PathBuf),
FailedToCopyPtxFile(std::io::Error),
#[error("build failed")]
BuildFailed,
}

impl fmt::Display for CudaBuilderError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
CudaBuilderError::CratePathDoesntExist(path) => {
write!(f, "Crate path {} does not exist", path.display())
}
CudaBuilderError::BuildFailed => f.write_str("Build failed"),
CudaBuilderError::FailedToCopyPtxFile(err) => {
f.write_str(&format!("Failed to copy PTX file: {:?}", err))
}
}
}
#[error("failed to copy PTX file: {0:?}")]
FailedToCopyPtxFile(#[from] std::io::Error),
#[cfg(feature = "cooperative_groups")]
#[error("could not find cuda root installation dir")]
CudaRootNotFound,
#[cfg(feature = "cooperative_groups")]
#[error("compilation of the Cooperative Groups API bridge code failed: {0}")]
CGError(#[from] anyhow::Error),
}

#[derive(Debug, Clone, Copy, PartialEq)]
Expand Down
7 changes: 7 additions & 0 deletions crates/cuda_std/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,10 @@ cuda_std_macros = { version = "0.2", path = "../cuda_std_macros" }
half = "1.7.1"
bitflags = "1.3.2"
paste = "1.0.5"

[features]
default = []
cooperative_groups = []

[package.metadata.docs.rs]
all-features = true
Loading