Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{ai}[foss/2023a] DeepSpeed v0.14.5, CUTLASS v3.4.0, DLPACK v0.8 w/ CUDA 12.1.1 #21438

Open
wants to merge 19 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# EasyBuild easyconfig: CUTLASS 3.5.1 built with CMake on foss/2023a against CUDA.
easyblock = 'CMakeMake'

name = 'CUTLASS'
version = '3.5.1'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://github.com/NVIDIA/cutlass'
description = """CUTLASS is a collection of CUDA C++ template
abstractions for implementing high-performance matrix-matrix
multiplication (GEMM) and related computations at all levels and scales
within CUDA. It incorporates strategies for hierarchical decomposition
and data movement similar to those used to implement cuBLAS and cuDNN.
CUTLASS decomposes these "moving parts" into reusable, modular software
components abstracted by C++ template classes. Primitives for different
levels of a conceptual parallelization hierarchy can be specialized and
tuned via custom tiling sizes, data types, and other algorithmic policy.
The resulting flexibility simplifies their use as building blocks within
custom kernels and applications."""

toolchain = {'name': 'foss', 'version': '2023a'}

# Upstream release tarball plus one local patch; each gets its own checksum entry.
github_account = 'NVIDIA'
source_urls = [GITHUB_LOWER_SOURCE]
sources = ['v%(version)s.tar.gz']
patches = ['CUTLASS-3.5.1_install_tools.patch']
checksums = [
    {'v%(version)s.tar.gz': '20b7247cda2d257cbf8ba59ba3ca40a9211c4da61a9c9913e32b33a2c5883a36'},
    {'CUTLASS-3.5.1_install_tools.patch': '18fa5361b15848d98435b8b08bd921130718b963ca4ad47fa0db96fbe815e509'},
]

builddependencies = [
    ('CMake', '3.26.3'),
    ('Python', '3.11.3'),
]

# CUDA and cuDNN are taken from the system toolchain level.
dependencies = [
    ('CUDA', '12.1.1', '', SYSTEM),
    ('cuDNN', '8.9.2.26', versionsuffix, SYSTEM),
]

# CMake flags, written as one space-separated string via adjacent literals.
configopts = (
    '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s" '
    '-DCUTLASS_ENABLE_CUBLAS=1 '
    '-DCUTLASS_ENABLE_CUDNN=1 '
    '-DCUTLASS_ENABLE_TOOLS=1'
)

# tools/util/include is checked because the install_tools patch adds it to the install tree.
sanity_check_paths = {
    'files': [
        'include/cutlass/cutlass.h',
        'lib/libcutlass.' + SHLIB_EXT,
    ],
    'dirs': [
        'lib/cmake',
        'tools/util/include',
    ],
}

moduleclass = 'lib'
33 changes: 33 additions & 0 deletions easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1_install_tools.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
From fd04f818d16431ee8979728d4725f63ab7f31a05 Mon Sep 17 00:00:00 2001
From: Viktor Rehnberg <[email protected]>
Date: Tue, 8 Oct 2024 08:24:23 +0000
Subject: [PATCH] Optionally install tools/util/include

DeepSpeed EvoformerAttn expects this directory, see
https://github.com/microsoft/DeepSpeed/blob/v0.14.5/op_builder/evoformer_attn.py#L76
---
CMakeLists.txt | 8 ++++++++
1 file changed, 8 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7419bdf5..1cee21ac 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -689,6 +689,14 @@ install(
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)

+if(CUTLASS_ENABLE_TOOLS)
+ install(
+ DIRECTORY ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}/
+ DESTINATION ${CMAKE_INSTALL_PREFIX}/tools/util/include
+ )
+endif()
+
+
################################################################################

# Doxygen is available. Generate documentation
--
2.39.3

26 changes: 26 additions & 0 deletions easybuild/easyconfigs/d/DLPack/DLPack-0.8-GCC-12.3.0.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# EasyBuild easyconfig: DLPack 0.8 built with CMake using GCC 12.3.0.
easyblock = 'CMakeMake'

name = 'DLPack'
version = '0.8'

homepage = 'https://dmlc.github.io/dlpack/latest/'
description = """DLPack is a stable in-memory data structure for an ndarray
system to interact with a variety of frameworks."""

toolchain = {'name': 'GCC', 'version': '12.3.0'}

# Upstream release tarball, fetched from the GitHub account named below.
github_account = 'dmlc'
sources = ['v%(version)s.tar.gz']
source_urls = [GITHUB_LOWER_SOURCE]
checksums = ['cf965c26a5430ba4cc53d61963f288edddcd77443aa4c85ce722aaf1e2f29513']

builddependencies = [('CMake', '3.26.3')]

# The install is verified by the public header and the CMake package config file.
sanity_check_paths = {
    'files': [
        'include/dlpack/dlpack.h',
        'lib/cmake/dlpack/dlpackConfig.cmake',
    ],
    'dirs': [],
}

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
Remove the dependency on the ninja Python package by falling back to
checking the return code of `ninja --version`.

Author: Viktor Rehnberg (Chalmers University of Technology)


diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
index 85a2f9b2..8bb64626 100644
--- a/deepspeed/env_report.py
+++ b/deepspeed/env_report.py
@@ -62,7 +62,7 @@ def ninja_installed():
try:
import ninja # noqa: F401 # type: ignore
except ImportError:
- return False
+ return (subprocess.run(["ninja", "--version"]).returncode == 0)
return True


diff --git a/op_builder/builder.py b/op_builder/builder.py
index 8dc825c7..970d18b2 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -487,7 +487,8 @@ class OpBuilder(ABC):
try:
import ninja # noqa: F401 # type: ignore
except ImportError:
- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+ if subprocess.run(["ninja", "--version"]).returncode != 0:
+ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")

if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
self.build_for_cpu = not torch.cuda.is_available()
diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
index 81b15f19..cf0a1cc0 100644
--- a/op_builder/xpu/builder.py
+++ b/op_builder/xpu/builder.py
@@ -89,7 +89,8 @@ class SYCLOpBuilder(OpBuilder):
try:
import ninja # noqa: F401
except ImportError:
- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+ if subprocess.run(["ninja", "--version"]).returncode != 0:
+ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")

self.jit_mode = True
from intel_extension_for_pytorch.xpu.cpp_extension import load
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 80c9f9b3..eed77fa3 100755
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,5 +1,4 @@
hjson
-ninja
numpy
packaging>=20.0
psutil
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# EasyBuild easyconfig: DeepSpeed 0.14.5 as a Python bundle on foss/2023a with CUDA.
easyblock = 'PythonBundle'

name = 'DeepSpeed'
version = '0.14.5'
versionsuffix = '-CUDA-%(cudaver)s'

# Use the HTTPS endpoint of the project site rather than plain HTTP.
homepage = "https://www.deepspeed.ai/"
description = """
DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective.
"""


toolchain = {'name': 'foss', 'version': '2023a'}

# NOTE(review): the no-ninja-dep patch makes DeepSpeed probe `ninja --version` at
# runtime, while Ninja is only a build dependency here; confirm that a ninja binary
# is still on PATH once the module is loaded.
builddependencies = [
    ('Ninja', '1.11.1'),
]

dependencies = [
    ('Python', '3.11.3'),
    ('CUDA', '12.1.1', '', SYSTEM),
    ('NCCL', '2.18.3', versionsuffix),
    ('CUTLASS', '3.5.1', versionsuffix),
    ('PyTorch', '2.1.2', versionsuffix),
    ('CuPy', '13.0.0', versionsuffix),
    ('Triton', '2.1.0', versionsuffix),
    ('accelerate', '0.33.0', versionsuffix),
    ('PyTorch-bundle', '2.1.2', versionsuffix),  # torchvision dependency for mup
    ('Seaborn', '0.13.2'),  # dependency for mup
    ('DLPack', '0.8'),
    ('py-cpuinfo', '9.0.0'),
    ('pydantic', '2.5.3'),
    ('tqdm', '4.66.1'),
    ('libaio', '0.3.113'),  # for async_io (builddep only?)
    ('Transformers', '4.39.3'),
]

use_pip = True

github_account = 'microsoft'
# Extensions are listed in install order; DeepSpeed itself comes last.
exts_list = [
    ('hjson', '3.1.0', {
        'checksums': ['55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75'],
    }),
    ('nvidia-ml-py', '12.535.161', {
        'checksums': ['2bcc31ff7a0ea291ed8d7fc39b149391a42c2fb1cb4256c935e692de488b4d17'],
        # the import name differs from the package name
        'modulename': 'pynvml',
    }),
    ('mup', '1.0.0', {
        'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'],
    }),
    (name, version, {
        # NOTE(review): presumably consumed by a DeepSpeed-specific easyblock that
        # exports DS_BUILD_<OP>=0 for each entry -- confirm against the easyblock.
        'ds_build_ops_to_skip': [
            # DS_BUILD_<OPT>=0 https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops
            'SPARSE_ATTN',  # requires PyTorch<2.0
            'FP_QUANTIZER',  # Untested triton version (2.1.0), only 2.3.0 and 2.3.1 are known to be compatible
            'CUTLASS_OPS',  # requires dskernels
            'RAGGED_DEVICE_OPS',  # requires dskernels
        ],
        'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"',
        'patches': [
            'DeepSpeed-0.14.2_no-ninja-dep.patch',
            'DeepSpeed-0.14.5_pic-compile.patch',
            'DeepSpeed-0.14.5_pdsh-env-vars.patch',
            'DeepSpeed-0.14.5_use-eb-cutlass.patch',
        ],
        # Adjacent string literals below are concatenated into one shell command.
        'runtest': (
            'PATH="$PATH:$PWD/bin"'  # deepspeed cli used in a lot of tests
            ' pytest tests/unit/'
            ' -k "not TestTensorBoard'  # requires tensorboard
            ' and not TestWandb'  # requires wandb
            ' and not TestCometMonitor"'  # requires comet
        ),
        # Test suite not available on pypi
        'source_urls': [GITHUB_SOURCE],
        'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}],
        'testinstall': True,
        'checksums': [
            {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
            {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'},
            {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'},
            {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
             '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'},
            {'DeepSpeed-0.14.5_use-eb-cutlass.patch':
             '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'},
        ],
    }),
]

sanity_pip_check = True

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
From aba7406021d9ea81f7c99e5d0143ed6509acc9e9 Mon Sep 17 00:00:00 2001
From: Viktor Rehnberg <[email protected]>
Date: Wed, 25 Sep 2024 09:29:23 +0000
Subject: [PATCH] Add software relevant environment variables

The multinode runner launches processes with pdsh. If LD_LIBRARY_PATH is
not included in these exports, the Python .so file may not be found.
Also included are the variables that seemed important among those set by
loading the DeepSpeed module. (Not everything could be added, because the
argument list then becomes too long.)

See
- https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098
for more details.
---
deepspeed/launcher/runner.py | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
index 07d1713e..e9cd61b8 100755
--- a/deepspeed/launcher/runner.py
+++ b/deepspeed/launcher/runner.py
@@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator

DLTS_HOSTFILE = "/job/hostfile"
EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX']
+EXPORT_ENVS += [ # Extra based on what's added by module load DeepSpeed
+ 'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA', # important
+ 'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', 'MPL', 'NCCL',
+ 'PKG_CONFIG_PATH', 'XDG_DATA_DIRS',
+]
Comment on lines +26 to +30
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm on the fence about whether all of these should be included. The alternative is to trim this list and add variables case by case with a .deepspeed_env file: https://www.deepspeed.ai/getting-started/#multi-node-environment-variables

EXPORT_ENVS += NEBULA_EXPORT_ENVS
DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env")
DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
--
2.39.3

Loading
Loading