Introduce hermetic CUDA in Google ML projects.
1) Hermetic CUDA rules allow building wheels with GPU support on a machine without GPUs, as well as running Bazel GPU tests on a machine that has only GPUs and the NVIDIA driver installed (no locally installed CUDA toolkit). When `--config=cuda` is passed in the Bazel options, Bazel downloads the CUDA, CUDNN and NCCL redistributions into its cache and uses them during the build and test phases; a typical invocation is sketched after the links below.

    [Default location of CUDNN redistributions](https://developer.download.nvidia.com/compute/cudnn/redist/)

    [Default location of CUDA redistributions](https://developer.download.nvidia.com/compute/cuda/redist/)

    [Default location of NCCL redistributions](https://pypi.org/project/nvidia-nccl-cu12/#history)
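
    A typical invocation might look like the following sketch (the target
    label and version values are illustrative; `--config=cuda` and the
    `HERMETIC_*` repo environment variables are the ones added to `.bazelrc`
    in this change):

    ```
    # Build GPU-enabled targets on a machine without GPUs; the CUDA, CUDNN
    # and NCCL redistributions are downloaded into the Bazel cache.
    bazel build --config=cuda //xla/...

    # Optionally pin different redistribution versions via repo env variables.
    bazel build --config=cuda \
      --repo_env=HERMETIC_CUDA_VERSION=12.3.2 \
      --repo_env=HERMETIC_CUDNN_VERSION=8.9.7.29 \
      //xla/...
    ```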

2) To include hermetic CUDA rules in your project, add the following to the WORKSPACE file of the downstream project that depends on XLA.

   Note: use `@local_tsl` instead of `@tsl` in the TensorFlow project.

   ```
   load(
      "@tsl//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
      "cuda_json_init_repository",
   )

   cuda_json_init_repository()

   load(
      "@cuda_redist_json//:distributions.bzl",
      "CUDA_REDISTRIBUTIONS",
      "CUDNN_REDISTRIBUTIONS",
   )
   load(
      "@tsl//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
      "cuda_redist_init_repositories",
      "cudnn_redist_init_repository",
   )

   cuda_redist_init_repositories(
      cuda_redistributions = CUDA_REDISTRIBUTIONS,
   )

   cudnn_redist_init_repository(
      cudnn_redistributions = CUDNN_REDISTRIBUTIONS,
   )

   load(
      "@tsl//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
      "cuda_configure",
   )

   cuda_configure(name = "local_config_cuda")

   load(
      "@tsl//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
      "nccl_redist_init_repository",
   )

   nccl_redist_init_repository()

   load(
      "@tsl//third_party/nccl/hermetic:nccl_configure.bzl",
      "nccl_configure",
   )

   nccl_configure(name = "local_config_nccl")
   ```
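
   With these repository rules in place, a `--config=cuda` build fetches the
   pinned redistributions automatically. To use locally installed libraries
   instead of the downloaded ones, the repositories can be pointed at local
   directories (a sketch; the paths below are hypothetical, while the
   `LOCAL_*` repo environment variables come from this change):

   ```
   # Hypothetical override: point the hermetic repositories at local
   # CUDA/CUDNN/NCCL installations instead of downloading redistributions.
   bazel build --config=cuda \
     --repo_env=LOCAL_CUDA_PATH=/opt/cuda-12.3 \
     --repo_env=LOCAL_CUDNN_PATH=/opt/cudnn \
     --repo_env=LOCAL_NCCL_PATH=/opt/nccl \
     //xla/...
   ```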

PiperOrigin-RevId: 616865795
tensorflower-gardener authored and copybara-github committed Aug 13, 2024
1 parent ed173a6 commit 052ebcd
Showing 67 changed files with 3,995 additions and 405 deletions.
28 changes: 14 additions & 14 deletions .bazelrc
@@ -219,11 +219,16 @@ build:mkl_aarch64_threadpool -c opt
build:cuda --repo_env TF_NEED_CUDA=1
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --@local_config_cuda//:enable_cuda
# Default CUDA and CUDNN versions.
build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
build:cuda --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29"
# This flag is needed to include hermetic CUDA libraries for bazel tests.
test:cuda --@local_config_cuda//cuda:include_hermetic_cuda_libs=true

# CUDA: This config refers to building CUDA op kernels with clang.
build:cuda_clang --config=cuda
build:cuda_clang --action_env=TF_CUDA_CLANG="1"
build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
build:cuda_clang --copt=-Qunused-arguments
# Select supported compute capabilities (supported graphics cards).
# This is the same as the official TensorFlow builds.
# See https://developer.nvidia.com/cuda-gpus#compute
@@ -232,22 +237,22 @@ build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
# release while SASS is only forward compatible inside the current
# major release. Example: sm_80 kernels can run on sm_89 GPUs but
# not on sm_90 GPUs. compute_80 kernels though can also run on sm_90 GPUs.
build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_89,compute_90"
build:cuda_clang --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_89,compute_90"
# Set lld as the linker.
build:cuda_clang --host_linkopt="-fuse-ld=lld"
build:cuda_clang --host_linkopt="-lm"
build:cuda_clang --linkopt="-fuse-ld=lld"
build:cuda_clang --linkopt="-lm"

# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
build:cuda_clang_official --config=cuda_clang
build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:cuda_clang_official --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
build:cuda_clang_official --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29"
build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-18/bin/clang"
build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

# Build with nvcc for CUDA and clang for host
build:nvcc_clang --config=cuda
# Unfortunately, cuda_configure.bzl demands this for using nvcc + clang
build:nvcc_clang --action_env=TF_CUDA_CLANG="1"
build:nvcc_clang --action_env=TF_NVCC_CLANG="1"
build:nvcc_clang --@local_config_cuda//:cuda_compiler=nvcc

@@ -543,9 +548,6 @@ build:rbe_linux_cuda --config=cuda_clang_official
build:rbe_linux_cuda --config=rbe_linux_cpu
# For Remote build execution -- GPU configuration
build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"

build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
build:rbe_linux_cuda_nvcc --config=nvcc_clang
Expand Down Expand Up @@ -630,7 +632,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-18/bin/cla
# Test-related settings below this point.
test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test:release_linux_base --local_test_jobs=HOST_CPUS
test:release_linux_base --test_env=LD_LIBRARY_PATH
# Give only the list of failed tests at the end of the log
test:release_linux_base --test_summary=short

@@ -644,7 +645,6 @@ build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
# Note that linux cpu and cuda builds share the same toolchain now.
build:release_gpu_linux --config=cuda_clang_official
test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# Local test jobs have to be 4 because parallel_gpu_execute is fragile, I think
test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute

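With the `.bazelrc` lines above, Bazel GPU tests link against the hermetic CUDA libraries because `test:cuda` sets `--@local_config_cuda//cuda:include_hermetic_cuda_libs=true`. A sketch of a test run on a machine that has only GPUs and the NVIDIA driver (the target label is illustrative):

```
# Run Bazel GPU tests without a locally installed CUDA toolkit.
bazel test --config=cuda //xla/tests:some_gpu_test
```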
47 changes: 47 additions & 0 deletions WORKSPACE
@@ -52,3 +52,50 @@ xla_workspace1()
load(":workspace0.bzl", "xla_workspace0")

xla_workspace0()

load(
"@tsl//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
"cuda_json_init_repository",
)

cuda_json_init_repository()

load(
"@cuda_redist_json//:distributions.bzl",
"CUDA_REDISTRIBUTIONS",
"CUDNN_REDISTRIBUTIONS",
)
load(
"@tsl//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
"cuda_redist_init_repositories",
"cudnn_redist_init_repository",
)

cuda_redist_init_repositories(
cuda_redistributions = CUDA_REDISTRIBUTIONS,
)

cudnn_redist_init_repository(
cudnn_redistributions = CUDNN_REDISTRIBUTIONS,
)

load(
"@tsl//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
"cuda_configure",
)

cuda_configure(name = "local_config_cuda")

load(
"@tsl//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
"nccl_redist_init_repository",
)

nccl_redist_init_repository()

load(
"@tsl//third_party/nccl/hermetic:nccl_configure.bzl",
"nccl_configure",
)

nccl_configure(name = "local_config_nccl")
10 changes: 10 additions & 0 deletions build_tools/build.py
@@ -411,6 +411,16 @@ def main():
r"s/@sigbuild-r2\.17-clang_/@sigbuild-r2.17-clang-cudnn9_/g",
"github/xla/.bazelrc",
],
check=True,
)
sh(
[
"sed",
"-i",
r"s/8\.9\.7\.29/9.1.1/g",
"github/xla/.bazelrc",
],
check=True,
)
sh(["nvidia-smi"])

1 change: 1 addition & 0 deletions build_tools/configure/BUILD
@@ -33,6 +33,7 @@ py_test(
data = [
"testdata/clang.bazelrc",
"testdata/cuda_clang.bazelrc",
"testdata/default_cuda_clang.bazelrc",
"testdata/gcc.bazelrc",
"testdata/nvcc_clang.bazelrc",
"testdata/nvcc_gcc.bazelrc",
157 changes: 61 additions & 96 deletions build_tools/configure/configure.py
@@ -27,11 +27,6 @@
the clang in your path. If that isn't the correct clang, you can override like
`./configure.py --backend=cpu --clang_path=<PATH_TO_YOUR_CLANG>`.
NOTE(ddunleavy): Lots of these things should probably be outside of configure.py
but are here because of complexity in `cuda_configure.bzl` and the TF bazelrc.
Once XLA has its own bazelrc, and cuda_configure.bzl is replaced or refactored,
we can probably make this file smaller.
TODO(ddunleavy): add more thorough validation.
"""
import argparse
@@ -45,18 +40,9 @@
import sys
from typing import Optional

_REQUIRED_CUDA_LIBRARIES = ["cublas", "cuda", "cudnn"]
_DEFAULT_BUILD_AND_TEST_TAG_FILTERS = ("-no_oss",)
# Assume we are being invoked from the symlink at the root of the repo
_XLA_SRC_ROOT = pathlib.Path(__file__).absolute().parent
_FIND_CUDA_CONFIG = str(
_XLA_SRC_ROOT
/ "third_party"
/ "tsl"
/ "third_party"
/ "gpus"
/ "find_cuda_config.py"
)
_XLA_BAZELRC_NAME = "xla_configure.bazelrc"
_KW_ONLY_IF_PYTHON310 = {"kw_only": True} if sys.version_info >= (3, 10) else {}

@@ -224,11 +210,12 @@ class DiscoverablePathsAndVersions:
ld_library_path: Optional[str] = None

# CUDA specific
cublas_version: Optional[str] = None
cuda_toolkit_path: Optional[str] = None
cuda_version: Optional[str] = None
cuda_compute_capabilities: Optional[list[str]] = None
cudnn_version: Optional[str] = None
nccl_version: Optional[str] = None
local_cuda_path: Optional[str] = None
local_cudnn_path: Optional[str] = None
local_nccl_path: Optional[str] = None

def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"):
"""Gets paths and versions as needed by the config.
@@ -247,7 +234,7 @@ def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"):
)

# Notably, we don't use `_find_executable_or_die` for lld, as it changes
# which commands it accepts based on it's name! ld.lld is symlinked to a
# which commands it accepts based on its name! ld.lld is symlinked to a
# different executable just called lld, which should not be invoked
# directly.
self.lld_path = self.lld_path or shutil.which("ld.lld")
@@ -261,64 +248,6 @@ def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"):
if not self.cuda_compute_capabilities:
self.cuda_compute_capabilities = _get_cuda_compute_capabilities_or_die()

self._get_cuda_libraries_paths_and_versions_if_needed(config)

def _get_cuda_libraries_paths_and_versions_if_needed(
self, config: "XLAConfigOptions"
):
"""Gets cuda paths and versions if user left any unspecified.
This uses `find_cuda_config.py` to find versions for all libraries in
`_REQUIRED_CUDA_LIBRARIES`.
Args:
config: config that determines which libraries should be found.
"""
should_find_nccl = config.using_nccl and self.nccl_version is None
any_cuda_config_unset = any([
self.cublas_version is None,
self.cuda_toolkit_path is None,
self.cudnn_version is None,
should_find_nccl,
])

maybe_nccl = ["nccl"] if should_find_nccl else []

if any_cuda_config_unset:
logging.info(
"Some CUDA config versions and paths were not provided, "
"so trying to find them using find_cuda_config.py"
)
try:
find_cuda_config_proc = subprocess.run(
[
sys.executable,
_FIND_CUDA_CONFIG,
*_REQUIRED_CUDA_LIBRARIES,
*maybe_nccl,
],
capture_output=True,
check=True,
text=True,
)
except subprocess.CalledProcessError as e:
logging.info("Command %s failed. Is CUDA installed?", e.cmd)
logging.info("Dumping %s ouptut:\n %s", e.cmd, e.output)
raise e

cuda_config = dict(
tuple(line.split(": "))
for line in find_cuda_config_proc.stdout.strip().split("\n")
)

self.cublas_version = self.cublas_version or cuda_config["cublas_version"]
self.cuda_toolkit_path = (
self.cuda_toolkit_path or cuda_config["cuda_toolkit_path"]
)
self.cudnn_version = self.cudnn_version or cuda_config["cudnn_version"]
if should_find_nccl:
self.nccl_version = self.nccl_version or cuda_config["nccl_version"]


@dataclasses.dataclass(frozen=True, **_KW_ONLY_IF_PYTHON310)
class XLAConfigOptions:
@@ -391,18 +320,31 @@ def to_bazelrc_lines(
)

# Lines needed for CUDA backend regardless of CUDA/host compiler
if dpav.cuda_version:
rc.append(
f"build:cuda --repo_env HERMETIC_CUDA_VERSION={dpav.cuda_version}"
)
rc.append(
f"build --action_env CUDA_TOOLKIT_PATH={dpav.cuda_toolkit_path}"
)
rc.append(f"build --action_env TF_CUBLAS_VERSION={dpav.cublas_version}")
rc.append(
"build --action_env"
f" TF_CUDA_COMPUTE_CAPABILITIES={','.join(dpav.cuda_compute_capabilities)}"
"build:cuda --repo_env"
f" HERMETIC_CUDA_COMPUTE_CAPABILITIES={','.join(dpav.cuda_compute_capabilities)}"
)
rc.append(f"build --action_env TF_CUDNN_VERSION={dpav.cudnn_version}")
if self.using_nccl:
rc.append(f"build --action_env TF_NCCL_VERSION={dpav.nccl_version}")
else:
if dpav.cudnn_version:
rc.append(
f"build:cuda --repo_env HERMETIC_CUDNN_VERSION={dpav.cudnn_version}"
)
if dpav.local_cuda_path:
rc.append(
f"build:cuda --repo_env LOCAL_CUDA_PATH={dpav.local_cuda_path}"
)
if dpav.local_cudnn_path:
rc.append(
f"build:cuda --repo_env LOCAL_CUDNN_PATH={dpav.local_cudnn_path}"
)
if dpav.local_nccl_path:
rc.append(
f"build:cuda --repo_env LOCAL_NCCL_PATH={dpav.local_nccl_path}"
)
if not self.using_nccl:
rc.append("build --config nonccl")
elif self.backend == Backend.ROCM:
pass
@@ -489,13 +431,35 @@ def _parse_args():
parser.add_argument("--lld_path", help=path_help)

# CUDA specific
find_cuda_config_help = (
"Optional: will be found using `find_cuda_config.py` if flag is not set."
parser.add_argument(
"--cuda_version",
help="Optional: CUDA will be downloaded by Bazel if the flag is set",
)
parser.add_argument(
"--cudnn_version",
help="Optional: CUDNN will be downloaded by Bazel if the flag is set",
)
parser.add_argument(
"--local_cuda_path",
help=(
"Optional: Local CUDA dir will be used in dependencies if the flag"
" is set"
),
)
parser.add_argument(
"--local_cudnn_path",
help=(
"Optional: Local CUDNN dir will be used in dependencies if the flag"
" is set"
),
)
parser.add_argument(
"--local_nccl_path",
help=(
"Optional: Local NCCL dir will be used in dependencies if the flag"
" is set"
),
)
parser.add_argument("--cublas_version", help=find_cuda_config_help)
parser.add_argument("--cuda_toolkit_path", help=find_cuda_config_help)
parser.add_argument("--cudnn_version", help=find_cuda_config_help)
parser.add_argument("--nccl_version", help=find_cuda_config_help)

return parser.parse_args()

@@ -523,11 +487,12 @@ def main():
gcc_path=args.gcc_path,
lld_path=args.lld_path,
ld_library_path=args.ld_library_path,
cublas_version=args.cublas_version,
cuda_compute_capabilities=args.cuda_compute_capabilities,
cuda_toolkit_path=args.cuda_toolkit_path,
cuda_version=args.cuda_version,
cudnn_version=args.cudnn_version,
nccl_version=args.nccl_version,
cuda_compute_capabilities=args.cuda_compute_capabilities,
local_cuda_path=args.local_cuda_path,
local_cudnn_path=args.local_cudnn_path,
local_nccl_path=args.local_nccl_path,
)
)

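The new `configure.py` flags replace the removed `find_cuda_config.py` discovery: CUDA and CUDNN versions are either pinned explicitly or left to the hermetic defaults. A sketch of an invocation (the flag names come from the diff above; the values are illustrative):

```
# Hypothetical: configure an XLA checkout for the hermetic CUDA backend.
./configure.py --backend=cuda \
    --cuda_version=12.3.2 \
    --cudnn_version=8.9.7.29
```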
(Diffs for the remaining changed files are not shown.)
