Skip to content

Commit

Permalink
ROCm support
Browse files Browse the repository at this point in the history
  • Loading branch information
nazar-pc committed Sep 26, 2024
1 parent 84ea07b commit bdc7d98
Show file tree
Hide file tree
Showing 15 changed files with 801 additions and 14 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,30 @@ jobs:
sub-packages: '["nvcc", "cudart"]'
if: runner.os == 'Linux' || runner.os == 'Windows'

- name: ROCm toolchain
# Registers AMD's ROCm apt repository (signing key, source list, pin priority 600), installs
# `rocm-hip-runtime-dev` and adds /opt/rocm/lib to the dynamic linker configuration.
# Everything is guarded by the `uname -p` check, so nothing is installed on non-x86-64 hosts.
# `printf` is used for the pin file because GitHub Actions runs `run:` steps on Linux with bash by
# default, and bash's builtin `echo` does not expand `\n` unless `-e` is given, which would write
# the three pin fields as a single line.
# NOTE(review): `apt-get` and the writes under /etc need root; confirm the runner executes this
# step as root.
run: |
ROCM_VERSION=6.2.1
if [ $(uname -p) = "x86_64" ]; then \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gpg && \
mkdir -p --mode=0755 /etc/apt/keyrings && \
curl -L https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" > /etc/apt/sources.list.d/rocm.list && \
printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' > /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-hip-runtime-dev && \
echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf && \
ldconfig \
; fi
if: runner.os == 'Linux'

- name: ROCm toolchain
# Download and install HIP SDK
# The installer is fetched with curl, run with `-install` and a `-log` file argument while the
# step waits for it to exit (`-Wait`), and then the downloaded executable is deleted.
run: |
curl -L -o HIP-SDK-Installer.exe https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
Start-Process HIP-SDK-Installer.exe -ArgumentList '-install','-log',"hip-sdk-installer_log.txt" -NoNewWindow -Wait
Remove-Item HIP-SDK-Installer.exe
if: runner.os == 'Windows'

- name: Configure cache
uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
Expand All @@ -124,6 +148,11 @@ jobs:
cargo -Zgitoxide -Zgit clippy --locked --all-targets --features runtime-benchmarks,cuda -- -D warnings
if: runner.os == 'Linux' || runner.os == 'Windows'

# Lints all targets with only the `rocm` feature enabled; `-D warnings` turns every warning into an
# error. NOTE(review): presumably kept separate from the CUDA clippy run because the two GPU
# features can't be enabled together; confirm.
- name: cargo clippy (ROCm)
run: |
cargo -Zgitoxide -Zgit clippy --locked --all-targets --features rocm -- -D warnings
if: runner.os == 'Linux' || runner.os == 'Windows'

cargo-docs:
runs-on: ${{ fromJson(github.repository_owner == 'autonomys' && '["self-hosted", "ubuntu-20.04-x86-64"]' || '"ubuntu-22.04"') }}
steps:
Expand Down
42 changes: 42 additions & 0 deletions .github/workflows/snapshot-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,30 @@ jobs:
sub-packages: '["nvcc", "cudart"]'
if: runner.os == 'Linux' || runner.os == 'Windows'

- name: ROCm toolchain
# Registers AMD's ROCm apt repository (signing key, source list, pin priority 600), installs
# `rocm-hip-runtime-dev` and adds /opt/rocm/lib to the dynamic linker configuration.
# Everything is guarded by the `uname -p` check, so nothing is installed on non-x86-64 hosts.
# `printf` is used for the pin file because GitHub Actions runs `run:` steps on Linux with bash by
# default, and bash's builtin `echo` does not expand `\n` unless `-e` is given, which would write
# the three pin fields as a single line.
# NOTE(review): `apt-get` and the writes under /etc need root; confirm the runner executes this
# step as root.
run: |
ROCM_VERSION=6.2.1
if [ $(uname -p) = "x86_64" ]; then \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gpg && \
mkdir -p --mode=0755 /etc/apt/keyrings && \
curl -L https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" > /etc/apt/sources.list.d/rocm.list && \
printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' > /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-hip-runtime-dev && \
echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf && \
ldconfig \
; fi
if: runner.os == 'Linux'

- name: ROCm toolchain
# Download and install HIP SDK
# The installer is fetched with curl, run with `-install` and a `-log` file argument while the
# step waits for it to exit (`-Wait`), and then the downloaded executable is deleted.
run: |
curl -L -o HIP-SDK-Installer.exe https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
Start-Process HIP-SDK-Installer.exe -ArgumentList '-install','-log',"hip-sdk-installer_log.txt" -NoNewWindow -Wait
Remove-Item HIP-SDK-Installer.exe
if: runner.os == 'Windows'

- name: Configure cache
uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
Expand All @@ -212,6 +236,22 @@ jobs:
cargo -Zgitoxide -Zgit build --locked -Z build-std --target ${{ matrix.build.target }} --profile production --bin subspace-farmer
if: runner.os == 'macOS' || !startsWith(matrix.build.target, 'x86_64')

# ROCm can't be enabled together with CUDA for now
# Builds `subspace-farmer` with only the `rocm` feature and renames the result to
# `subspace-farmer-rocm.exe`. NOTE(review): the rename is presumably what keeps this binary from
# being overwritten by the later `--features cuda` build of the same `--bin`; confirm.
- name: Build farmer (ROCm, Windows)
run: |
cargo -Zgitoxide -Zgit build --locked -Z build-std --target ${{ matrix.build.target }} --profile production --bin subspace-farmer --features rocm
move ${{ env.PRODUCTION_TARGET }}/subspace-farmer.exe ${{ env.PRODUCTION_TARGET }}/subspace-farmer-rocm.exe
# TODO: ROCm packages are only available for x86-64 for now
if: runner.os == 'Windows' && startsWith(matrix.build.target, 'x86_64')

# ROCm can't be enabled together with CUDA for now
# Builds `subspace-farmer` with only the `rocm` feature and renames the result to
# `subspace-farmer-rocm`. NOTE(review): the rename is presumably what keeps this binary from being
# overwritten by the later `--features cuda` build of the same `--bin`; confirm.
- name: Build farmer (ROCm, Ubuntu)
run: |
cargo -Zgitoxide -Zgit build --locked -Z build-std --target ${{ matrix.build.target }} --profile production --bin subspace-farmer --features rocm
mv ${{ env.PRODUCTION_TARGET }}/subspace-farmer ${{ env.PRODUCTION_TARGET }}/subspace-farmer-rocm
# TODO: ROCm packages are only available for x86-64 for now
if: runner.os == 'Linux' && startsWith(matrix.build.target, 'x86_64')

- name: Build farmer
run: |
cargo -Zgitoxide -Zgit build --locked -Z build-std --target ${{ matrix.build.target }} --profile production --bin subspace-farmer --features cuda
Expand Down Expand Up @@ -279,6 +319,7 @@ jobs:
run: |
mkdir executables
mv ${{ env.PRODUCTION_TARGET }}/subspace-farmer executables/subspace-farmer-${{ matrix.build.suffix }}
mv ${{ env.PRODUCTION_TARGET }}/subspace-farmer-rocm executables/subspace-farmer-rocm-${{ matrix.build.suffix }}
mv ${{ env.PRODUCTION_TARGET }}/subspace-node executables/subspace-node-${{ matrix.build.suffix }}
if: runner.os == 'Linux'

Expand All @@ -298,6 +339,7 @@ jobs:
run: |
mkdir executables
move ${{ env.PRODUCTION_TARGET }}/subspace-farmer.exe executables/subspace-farmer-${{ matrix.build.suffix }}.exe
move ${{ env.PRODUCTION_TARGET }}/subspace-farmer-rocm.exe executables/subspace-farmer-rocm-${{ matrix.build.suffix }}.exe
move ${{ env.PRODUCTION_TARGET }}/subspace-node.exe executables/subspace-node-${{ matrix.build.suffix }}.exe
if: runner.os == 'Windows'

Expand Down
3 changes: 1 addition & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 56 additions & 4 deletions Dockerfile-farmer
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,39 @@ RUN \
curl -OL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/$CUDA_ARCH/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cuda-minimal-build-12-4
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cuda-minimal-build-12-4 && \
echo "/usr/local/cuda/lib64" > /etc/ld.so.conf.d/cuda.conf && \
ldconfig

# ROCm is only used on x86-64 since they don't have other packages
# On x86-64 this registers AMD's ROCm apt repository for $ROCM_VERSION (signing key, source list,
# pin priority 600), installs `rocm-hip-runtime-dev` and adds /opt/rocm/lib to the dynamic linker
# configuration; on other architectures the `if` body is skipped.
# NOTE(review): the CUDA `PATH`/`LD_LIBRARY_PATH` exports below don't appear to be needed by any
# command in this RUN; confirm whether they can be dropped.
ARG ROCM_VERSION=6.2.1
RUN \
export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} && \
export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} && \
if [ $(uname -p) = "x86_64" ]; then \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gpg && \
mkdir -p --mode=0755 /etc/apt/keyrings && \
curl -L https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" > /etc/apt/sources.list.d/rocm.list && \
echo "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" > /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-hip-runtime-dev && \
echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf && \
ldconfig \
; fi

# TODO: Remove `NVCC=off` hack once `sppark` has proper features for CUDA and ROCm
# ROCm is only used on x86-64 since they don't have other packages
RUN \
export PATH=/usr/local/cuda/bin:/opt/rocm-$ROCM_VERSION/bin${PATH:+:${PATH}} && \
if [ $(uname -p) = "x86_64" ]; then \
NVCC=off /root/.cargo/bin/cargo -Zgitoxide -Zgit build \
--locked \
-Z build-std \
--profile $PROFILE \
--bin subspace-farmer \
--features rocm \
--target $(uname -p)-unknown-linux-gnu && \
mv target/*/*/subspace-farmer subspace-farmer-rocm \
; fi && \
/root/.cargo/bin/cargo -Zgitoxide -Zgit build \
--locked \
-Z build-std \
Expand All @@ -63,10 +91,34 @@ RUN \

FROM ubuntu:20.04

COPY --from=0 /code/subspace-farmer /subspace-farmer
# Next block is for ROCm support
# ROCm is only used on x86-64 since they don't have other packages
# On x86-64 this installs `hip-runtime-amd` from AMD's apt repository for $ROCM_VERSION.
# `curl`, `ca-certificates` and `gpg` are only needed to register the repository, so they are
# purged again in the same layer, and the apt package lists are removed afterwards.
# /opt/rocm/lib is added to the dynamic linker configuration.
ARG ROCM_VERSION=6.2.1
RUN \
if [ $(uname -p) = "x86_64" ]; then \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl ca-certificates gpg && \
mkdir -p --mode=0755 /etc/apt/keyrings && \
curl -L https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" > /etc/apt/sources.list.d/rocm.list && \
echo "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" > /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends hip-runtime-amd && \
DEBIAN_FRONTEND=noninteractive apt-get remove -y --purge --autoremove curl ca-certificates gpg && \
rm -rf /var/lib/apt/lists/* && \
echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf && \
ldconfig \
; fi

COPY --from=0 /code/subspace-farmer* /

RUN mkdir /var/subspace && chown nobody:nogroup /var/subspace

# `rocm` user for GPU access from the host
# Creates group `render` with fixed GID 151 and user `rocm` with `render` as its primary group.
# NOTE(review): GID 151 presumably has to match the host's `render` group for device access; confirm.
# The image still defaults to `nobody:nogroup` (see `USER` below), so `rocm` has to be selected
# explicitly when running the container.
RUN \
groupadd -g 151 render && \
useradd -g render rocm

VOLUME /var/subspace

USER nobody:nogroup
Expand Down
1 change: 1 addition & 0 deletions crates/subspace-farmer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ cluster = ["dep:async-nats"]
numa = ["dep:hwlocality"]
# Only Volta+ architectures are supported (GeForce RTX 20xx consumer GPUs and newer)
cuda = ["_gpu", "subspace-proof-of-space-gpu/cuda"]
rocm = ["_gpu", "subspace-proof-of-space-gpu/rocm"]
# Internal feature, shouldn't be used directly
_gpu = []

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ use subspace_farmer::cluster::plotter::plotter_service;
use subspace_farmer::plotter::cpu::CpuPlotter;
#[cfg(feature = "cuda")]
use subspace_farmer::plotter::gpu::cuda::CudaRecordsEncoder;
#[cfg(feature = "rocm")]
use subspace_farmer::plotter::gpu::rocm::RocmRecordsEncoder;
#[cfg(feature = "_gpu")]
use subspace_farmer::plotter::gpu::GpuPlotter;
use subspace_farmer::plotter::pool::PoolPlotter;
Expand Down Expand Up @@ -101,6 +103,24 @@ struct CudaPlottingOptions {
cuda_gpus: Option<String>,
}

// CLI options for the ROCm GPU plotter, flattened into `PlotterArgs` when the `rocm` feature is
// enabled. The doc comments on the fields are rendered by clap as `--help` text, so they are
// user-facing strings rather than internal notes.
#[cfg(feature = "rocm")]
#[derive(Debug, Parser)]
struct RocmPlottingOptions {
    /// Defines how many sectors farmer will download concurrently during plotting with ROCm GPU,
    /// allows to limit memory usage of the plotting process, defaults to number of ROCm GPUs found
    /// + 1 to download future sector ahead of time.
    ///
    /// Increase will result in higher memory usage.
    #[arg(long)]
    rocm_sector_downloading_concurrency: Option<NonZeroUsize>,
    /// Specify exact GPUs to be used for plotting instead of using all GPUs (default behavior).
    ///
    /// GPUs are comma-separated: `--rocm-gpus 0,1,3`. Empty string can be specified to disable
    /// ROCm GPU usage.
    #[arg(long)]
    rocm_gpus: Option<String>,
}

/// Arguments for plotter
#[derive(Debug, Parser)]
pub(super) struct PlotterArgs {
Expand All @@ -117,6 +137,10 @@ pub(super) struct PlotterArgs {
#[cfg(feature = "cuda")]
#[clap(flatten)]
cuda_plotting_options: CudaPlottingOptions,
/// Plotting options only used by ROCm GPU plotter
#[cfg(feature = "rocm")]
#[clap(flatten)]
rocm_plotting_options: RocmPlottingOptions,
/// Additional cluster components
#[clap(raw = true)]
pub(super) additional_components: Vec<String>,
Expand All @@ -136,6 +160,8 @@ where
cpu_plotting_options,
#[cfg(feature = "cuda")]
cuda_plotting_options,
#[cfg(feature = "rocm")]
rocm_plotting_options,
additional_components: _,
} = plotter_args;

Expand Down Expand Up @@ -167,6 +193,21 @@ where
modern_plotters.push(Box::new(cuda_plotter));
}
}
#[cfg(feature = "rocm")]
{
let maybe_rocm_plotter = init_rocm_plotter(
rocm_plotting_options,
piece_getter.clone(),
Arc::clone(&global_mutex),
kzg.clone(),
erasure_coding.clone(),
registry,
)?;

if let Some(rocm_plotter) = maybe_rocm_plotter {
modern_plotters.push(Box::new(rocm_plotter));
}
}
{
let cpu_sector_encoding_concurrency = cpu_plotting_options.cpu_sector_encoding_concurrency;
let maybe_cpu_plotters = init_cpu_plotters::<_, PosTableLegacy, PosTable>(
Expand Down Expand Up @@ -400,3 +441,85 @@ where
.map_err(|error| anyhow::anyhow!("Failed to initialize CUDA plotter: {error}"))?,
))
}

/// Initializes the ROCm GPU plotter from CLI options.
///
/// Returns `Ok(None)` when ROCm plotting was explicitly disabled with an empty `--rocm-gpus` value
/// or when no ROCm devices remain after the optional `--rocm-gpus` selection, and
/// `Ok(Some(plotter))` otherwise.
///
/// # Errors
///
/// Returns an error if `--rocm-gpus` contains an entry that is not a valid GPU index, or if
/// creating a records encoder or the plotter itself fails.
#[cfg(feature = "rocm")]
fn init_rocm_plotter<PG>(
    rocm_plotting_options: RocmPlottingOptions,
    piece_getter: PG,
    global_mutex: Arc<AsyncMutex<()>>,
    kzg: Kzg,
    erasure_coding: ErasureCoding,
    registry: &mut Registry,
) -> anyhow::Result<Option<GpuPlotter<PG, RocmRecordsEncoder>>>
where
    PG: PieceGetter + Clone + Send + Sync + 'static,
{
    use std::collections::BTreeSet;
    use subspace_proof_of_space_gpu::rocm::rocm_devices;
    use tracing::{debug, warn};

    let RocmPlottingOptions {
        rocm_sector_downloading_concurrency,
        rocm_gpus,
    } = rocm_plotting_options;

    let mut rocm_devices = rocm_devices();
    // By default every detected device is used; indices are only kept for logging
    let mut used_rocm_devices = (0..rocm_devices.len()).collect::<Vec<_>>();

    if let Some(rocm_gpus) = rocm_gpus {
        if rocm_gpus.is_empty() {
            info!("ROCm GPU plotting was explicitly disabled");
            return Ok(None);
        }

        // Tolerate whitespace around indices (`0, 1`) and name the offending entry on failure
        // instead of surfacing a bare integer parsing error
        let mut rocm_gpus_to_use = rocm_gpus
            .split(',')
            .map(|gpu_index| {
                gpu_index.trim().parse::<usize>().map_err(|error| {
                    anyhow::anyhow!(
                        "Invalid ROCm GPU index {gpu_index:?} in `--rocm-gpus`: {error}"
                    )
                })
            })
            .collect::<anyhow::Result<BTreeSet<usize>>>()?;

        // Keep only requested devices; every index that matches a device is removed from the set,
        // so whatever is left afterwards was requested but does not exist
        (used_rocm_devices, rocm_devices) = rocm_devices
            .into_iter()
            .enumerate()
            .filter(|(index, _rocm_device)| rocm_gpus_to_use.remove(index))
            .unzip();

        if !rocm_gpus_to_use.is_empty() {
            warn!(
                ?rocm_gpus_to_use,
                "Some ROCm GPUs were not found on the system"
            );
        }
    }

    if rocm_devices.is_empty() {
        debug!("No ROCm GPU devices found");
        return Ok(None);
    }

    info!(?used_rocm_devices, "Using ROCm GPUs");

    // Defaults to one permit per GPU plus one, so a future sector can be downloaded ahead of time
    let rocm_downloading_semaphore = Arc::new(Semaphore::new(
        rocm_sector_downloading_concurrency.map_or(rocm_devices.len() + 1, NonZeroUsize::get),
    ));

    Ok(Some(
        GpuPlotter::new(
            piece_getter,
            rocm_downloading_semaphore,
            rocm_devices
                .into_iter()
                .map(|rocm_device| RocmRecordsEncoder::new(rocm_device, Arc::clone(&global_mutex)))
                .collect::<Result<_, _>>()
                .map_err(|error| {
                    anyhow::anyhow!("Failed to create ROCm records encoder: {error}")
                })?,
            global_mutex,
            kzg,
            erasure_coding,
            Some(registry),
        )
        .map_err(|error| anyhow::anyhow!("Failed to initialize ROCm plotter: {error}"))?,
    ))
}
Loading

0 comments on commit bdc7d98

Please sign in to comment.