Introduce hermetic CUDA in Google ML projects.
1) Hermetic CUDA rules allow building wheels with GPU support on a machine without GPUs, as well as running Bazel GPU tests on a machine that has only GPUs and the NVIDIA driver installed (no locally installed CUDA toolkit). When `--config=cuda` is passed in the Bazel options, Bazel downloads the CUDA, CUDNN and NCCL redistributions into its cache and uses them during the build and test phases; a typical invocation is sketched after the links below.

    [Default location of CUDNN redistributions](https://developer.download.nvidia.com/compute/cudnn/redist/)

    [Default location of CUDA redistributions](https://developer.download.nvidia.com/compute/cuda/redist/)

    [Default location of NCCL redistributions](https://pypi.org/project/nvidia-nccl-cu12/#history)
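
    A typical invocation might look like the following sketch (the target
    label and version values are illustrative; `--config=cuda` and the
    `HERMETIC_*` repo environment variables are the ones added to `.bazelrc`
    in this change):

    ```
    # Build GPU-enabled targets on a machine without GPUs; the CUDA, CUDNN
    # and NCCL redistributions are downloaded into the Bazel cache.
    bazel build --config=cuda //xla/...

    # Optionally pin different redistribution versions via repo env variables.
    bazel build --config=cuda \
      --repo_env=HERMETIC_CUDA_VERSION=12.3.2 \
      --repo_env=HERMETIC_CUDNN_VERSION=8.9.7.29 \
      //xla/...
    ```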

2) To include hermetic CUDA rules in your project, add the following to the WORKSPACE file of the downstream project that depends on XLA.

   Note: use `@local_tsl` instead of `@tsl` in the TensorFlow project.

   ```
   load(
      "@tsl//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
      "cuda_json_init_repository",
   )

   cuda_json_init_repository()

   load(
      "@cuda_redist_json//:distributions.bzl",
      "CUDA_REDISTRIBUTIONS",
      "CUDNN_REDISTRIBUTIONS",
   )
   load(
      "@tsl//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
      "cuda_redist_init_repositories",
      "cudnn_redist_init_repository",
   )

   cuda_redist_init_repositories(
      cuda_redistributions = CUDA_REDISTRIBUTIONS,
   )

   cudnn_redist_init_repository(
      cudnn_redistributions = CUDNN_REDISTRIBUTIONS,
   )

   load(
      "@tsl//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
      "cuda_configure",
   )

   cuda_configure(name = "local_config_cuda")

   load(
      "@tsl//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
      "nccl_redist_init_repository",
   )

   nccl_redist_init_repository()

   load(
      "@tsl//third_party/nccl/hermetic:nccl_configure.bzl",
      "nccl_configure",
   )

   nccl_configure(name = "local_config_nccl")
   ```
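
   With these repository rules in place, a `--config=cuda` build fetches the
   pinned redistributions automatically. To use locally installed libraries
   instead of the downloaded ones, the repositories can be pointed at local
   directories (a sketch; the paths below are hypothetical, while the
   `LOCAL_*` repo environment variables come from this change):

   ```
   # Hypothetical override: point the hermetic repositories at local
   # CUDA/CUDNN/NCCL installations instead of downloading redistributions.
   bazel build --config=cuda \
     --repo_env=LOCAL_CUDA_PATH=/opt/cuda-12.3 \
     --repo_env=LOCAL_CUDNN_PATH=/opt/cudnn \
     --repo_env=LOCAL_NCCL_PATH=/opt/nccl \
     //xla/...
   ```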

PiperOrigin-RevId: 616865795
tensorflower-gardener authored and copybara-github committed Aug 13, 2024
1 parent ed173a6 commit 052ebcd
Showing 67 changed files with 3,995 additions and 405 deletions.
28 changes: 14 additions & 14 deletions .bazelrc
@@ -219,11 +219,16 @@ build:mkl_aarch64_threadpool -c opt
build:cuda --repo_env TF_NEED_CUDA=1
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --@local_config_cuda//:enable_cuda
# Default CUDA and CUDNN versions.
build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
build:cuda --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29"
# This flag is needed to include hermetic CUDA libraries for bazel tests.
test:cuda --@local_config_cuda//cuda:include_hermetic_cuda_libs=true

# CUDA: This config refers to building CUDA op kernels with clang.
build:cuda_clang --config=cuda
build:cuda_clang --action_env=TF_CUDA_CLANG="1"
build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
build:cuda_clang --copt=-Qunused-arguments
# Select supported compute capabilities (supported graphics cards).
# This is the same as the official TensorFlow builds.
# See https://developer.nvidia.com/cuda-gpus#compute
@@ -232,22 +237,22 @@ build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
# release while SASS is only forward compatible inside the current
# major release. Example: sm_80 kernels can run on sm_89 GPUs but
# not on sm_90 GPUs. compute_80 kernels though can also run on sm_90 GPUs.
build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_89,compute_90"
build:cuda_clang --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_89,compute_90"
# Set lld as the linker.
build:cuda_clang --host_linkopt="-fuse-ld=lld"
build:cuda_clang --host_linkopt="-lm"
build:cuda_clang --linkopt="-fuse-ld=lld"
build:cuda_clang --linkopt="-lm"

# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
build:cuda_clang_official --config=cuda_clang
build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:cuda_clang_official --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
build:cuda_clang_official --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29"
build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-18/bin/clang"
build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

# Build with nvcc for CUDA and clang for host
build:nvcc_clang --config=cuda
# Unfortunately, cuda_configure.bzl demands this for using nvcc + clang
build:nvcc_clang --action_env=TF_CUDA_CLANG="1"
build:nvcc_clang --action_env=TF_NVCC_CLANG="1"
build:nvcc_clang --@local_config_cuda//:cuda_compiler=nvcc

@@ -543,9 +548,6 @@ build:rbe_linux_cuda --config=cuda_clang_official
build:rbe_linux_cuda --config=rbe_linux_cpu
# For Remote build execution -- GPU configuration
build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"

build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
build:rbe_linux_cuda_nvcc --config=nvcc_clang
Expand Down Expand Up @@ -630,7 +632,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-18/bin/cla
# Test-related settings below this point.
test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test:release_linux_base --local_test_jobs=HOST_CPUS
test:release_linux_base --test_env=LD_LIBRARY_PATH
# Give only the list of failed tests at the end of the log
test:release_linux_base --test_summary=short

@@ -644,7 +645,6 @@ build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
# Note that linux cpu and cuda builds share the same toolchain now.
build:release_gpu_linux --config=cuda_clang_official
test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# Local test jobs have to be 4 because parallel_gpu_execute is fragile, I think
test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute

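With the `.bazelrc` lines above, Bazel GPU tests link against the hermetic CUDA libraries because `test:cuda` sets `--@local_config_cuda//cuda:include_hermetic_cuda_libs=true`. A sketch of a test run on a machine that has only GPUs and the NVIDIA driver (the target label is illustrative):

```
# Run Bazel GPU tests without a locally installed CUDA toolkit.
bazel test --config=cuda //xla/tests:some_gpu_test
```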
47 changes: 47 additions & 0 deletions WORKSPACE
@@ -52,3 +52,50 @@ xla_workspace1()
load(":workspace0.bzl", "xla_workspace0")

xla_workspace0()

load(
"@tsl//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
"cuda_json_init_repository",
)

cuda_json_init_repository()

load(
"@cuda_redist_json//:distributions.bzl",
"CUDA_REDISTRIBUTIONS",
"CUDNN_REDISTRIBUTIONS",
)
load(
"@tsl//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
"cuda_redist_init_repositories",
"cudnn_redist_init_repository",
)

cuda_redist_init_repositories(
cuda_redistributions = CUDA_REDISTRIBUTIONS,
)

cudnn_redist_init_repository(
cudnn_redistributions = CUDNN_REDISTRIBUTIONS,
)

load(
"@tsl//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
"cuda_configure",
)

cuda_configure(name = "local_config_cuda")

load(
"@tsl//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
"nccl_redist_init_repository",
)

nccl_redist_init_repository()

load(
"@tsl//third_party/nccl/hermetic:nccl_configure.bzl",
"nccl_configure",
)

nccl_configure(name = "local_config_nccl")
10 changes: 10 additions & 0 deletions build_tools/build.py
@@ -411,6 +411,16 @@ def main():
r"s/@sigbuild-r2\.17-clang_/@sigbuild-r2.17-clang-cudnn9_/g",
"github/xla/.bazelrc",
],
check=True,
)
sh(
[
"sed",
"-i",
r"s/8\.9\.7\.29/9.1.1/g",
"github/xla/.bazelrc",
],
check=True,
)
sh(["nvidia-smi"])

1 change: 1 addition & 0 deletions build_tools/configure/BUILD
@@ -33,6 +33,7 @@ py_test(
data = [
"testdata/clang.bazelrc",
"testdata/cuda_clang.bazelrc",
"testdata/default_cuda_clang.bazelrc",
"testdata/gcc.bazelrc",
"testdata/nvcc_clang.bazelrc",
"testdata/nvcc_gcc.bazelrc",
157 changes: 61 additions & 96 deletions build_tools/configure/configure.py
@@ -27,11 +27,6 @@
the clang in your path. If that isn't the correct clang, you can override like
`./configure.py --backend=cpu --clang_path=<PATH_TO_YOUR_CLANG>`.
NOTE(ddunleavy): Lots of these things should probably be outside of configure.py
but are here because of complexity in `cuda_configure.bzl` and the TF bazelrc.
Once XLA has its own bazelrc, and cuda_configure.bzl is replaced or refactored,
we can probably make this file smaller.
TODO(ddunleavy): add more thorough validation.
"""
import argparse
@@ -45,18 +40,9 @@
import sys
from typing import Optional

_REQUIRED_CUDA_LIBRARIES = ["cublas", "cuda", "cudnn"]
_DEFAULT_BUILD_AND_TEST_TAG_FILTERS = ("-no_oss",)
# Assume we are being invoked from the symlink at the root of the repo
_XLA_SRC_ROOT = pathlib.Path(__file__).absolute().parent
_FIND_CUDA_CONFIG = str(
_XLA_SRC_ROOT
/ "third_party"
/ "tsl"
/ "third_party"
/ "gpus"
/ "find_cuda_config.py"
)
_XLA_BAZELRC_NAME = "xla_configure.bazelrc"
_KW_ONLY_IF_PYTHON310 = {"kw_only": True} if sys.version_info >= (3, 10) else {}

@@ -224,11 +210,12 @@ class DiscoverablePathsAndVersions:
ld_library_path: Optional[str] = None

# CUDA specific
cublas_version: Optional[str] = None
cuda_toolkit_path: Optional[str] = None
cuda_version: Optional[str] = None
cuda_compute_capabilities: Optional[list[str]] = None
cudnn_version: Optional[str] = None
nccl_version: Optional[str] = None
local_cuda_path: Optional[str] = None
local_cudnn_path: Optional[str] = None
local_nccl_path: Optional[str] = None

def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"):
"""Gets paths and versions as needed by the config.
@@ -247,7 +234,7 @@ def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"):
)

# Notably, we don't use `_find_executable_or_die` for lld, as it changes
# which commands it accepts based on it's name! ld.lld is symlinked to a
# which commands it accepts based on its name! ld.lld is symlinked to a
# different executable just called lld, which should not be invoked
# directly.
self.lld_path = self.lld_path or shutil.which("ld.lld")
@@ -261,64 +248,6 @@ def get_relevant_paths_and_versions(self, config: "XLAConfigOptions"):
if not self.cuda_compute_capabilities:
self.cuda_compute_capabilities = _get_cuda_compute_capabilities_or_die()

self._get_cuda_libraries_paths_and_versions_if_needed(config)

def _get_cuda_libraries_paths_and_versions_if_needed(
self, config: "XLAConfigOptions"
):
"""Gets cuda paths and versions if user left any unspecified.
This uses `find_cuda_config.py` to find versions for all libraries in
`_REQUIRED_CUDA_LIBRARIES`.
Args:
config: config that determines which libraries should be found.
"""
should_find_nccl = config.using_nccl and self.nccl_version is None
any_cuda_config_unset = any([
self.cublas_version is None,
self.cuda_toolkit_path is None,
self.cudnn_version is None,
should_find_nccl,
])

maybe_nccl = ["nccl"] if should_find_nccl else []

if any_cuda_config_unset:
logging.info(
"Some CUDA config versions and paths were not provided, "
"so trying to find them using find_cuda_config.py"
)
try:
find_cuda_config_proc = subprocess.run(
[
sys.executable,
_FIND_CUDA_CONFIG,
*_REQUIRED_CUDA_LIBRARIES,
*maybe_nccl,
],
capture_output=True,
check=True,
text=True,
)
except subprocess.CalledProcessError as e:
logging.info("Command %s failed. Is CUDA installed?", e.cmd)
logging.info("Dumping %s ouptut:\n %s", e.cmd, e.output)
raise e

cuda_config = dict(
tuple(line.split(": "))
for line in find_cuda_config_proc.stdout.strip().split("\n")
)

self.cublas_version = self.cublas_version or cuda_config["cublas_version"]
self.cuda_toolkit_path = (
self.cuda_toolkit_path or cuda_config["cuda_toolkit_path"]
)
self.cudnn_version = self.cudnn_version or cuda_config["cudnn_version"]
if should_find_nccl:
self.nccl_version = self.nccl_version or cuda_config["nccl_version"]


@dataclasses.dataclass(frozen=True, **_KW_ONLY_IF_PYTHON310)
class XLAConfigOptions:
@@ -391,18 +320,31 @@ def to_bazelrc_lines(
)

# Lines needed for CUDA backend regardless of CUDA/host compiler
if dpav.cuda_version:
rc.append(
f"build:cuda --repo_env HERMETIC_CUDA_VERSION={dpav.cuda_version}"
)
rc.append(
f"build --action_env CUDA_TOOLKIT_PATH={dpav.cuda_toolkit_path}"
)
rc.append(f"build --action_env TF_CUBLAS_VERSION={dpav.cublas_version}")
rc.append(
"build --action_env"
f" TF_CUDA_COMPUTE_CAPABILITIES={','.join(dpav.cuda_compute_capabilities)}"
"build:cuda --repo_env"
f" HERMETIC_CUDA_COMPUTE_CAPABILITIES={','.join(dpav.cuda_compute_capabilities)}"
)
rc.append(f"build --action_env TF_CUDNN_VERSION={dpav.cudnn_version}")
if self.using_nccl:
rc.append(f"build --action_env TF_NCCL_VERSION={dpav.nccl_version}")
else:
if dpav.cudnn_version:
rc.append(
f"build:cuda --repo_env HERMETIC_CUDNN_VERSION={dpav.cudnn_version}"
)
if dpav.local_cuda_path:
rc.append(
f"build:cuda --repo_env LOCAL_CUDA_PATH={dpav.local_cuda_path}"
)
if dpav.local_cudnn_path:
rc.append(
f"build:cuda --repo_env LOCAL_CUDNN_PATH={dpav.local_cudnn_path}"
)
if dpav.local_nccl_path:
rc.append(
f"build:cuda --repo_env LOCAL_NCCL_PATH={dpav.local_nccl_path}"
)
if not self.using_nccl:
rc.append("build --config nonccl")
elif self.backend == Backend.ROCM:
pass
@@ -489,13 +431,35 @@ def _parse_args():
parser.add_argument("--lld_path", help=path_help)

# CUDA specific
find_cuda_config_help = (
"Optional: will be found using `find_cuda_config.py` if flag is not set."
parser.add_argument(
"--cuda_version",
help="Optional: CUDA will be downloaded by Bazel if the flag is set",
)
parser.add_argument(
"--cudnn_version",
help="Optional: CUDNN will be downloaded by Bazel if the flag is set",
)
parser.add_argument(
"--local_cuda_path",
help=(
"Optional: Local CUDA dir will be used in dependencies if the flag"
" is set"
),
)
parser.add_argument(
"--local_cudnn_path",
help=(
"Optional: Local CUDNN dir will be used in dependencies if the flag"
" is set"
),
)
parser.add_argument(
"--local_nccl_path",
help=(
"Optional: Local NCCL dir will be used in dependencies if the flag"
" is set"
),
)
parser.add_argument("--cublas_version", help=find_cuda_config_help)
parser.add_argument("--cuda_toolkit_path", help=find_cuda_config_help)
parser.add_argument("--cudnn_version", help=find_cuda_config_help)
parser.add_argument("--nccl_version", help=find_cuda_config_help)

return parser.parse_args()

@@ -523,11 +487,12 @@ def main():
gcc_path=args.gcc_path,
lld_path=args.lld_path,
ld_library_path=args.ld_library_path,
cublas_version=args.cublas_version,
cuda_compute_capabilities=args.cuda_compute_capabilities,
cuda_toolkit_path=args.cuda_toolkit_path,
cuda_version=args.cuda_version,
cudnn_version=args.cudnn_version,
nccl_version=args.nccl_version,
cuda_compute_capabilities=args.cuda_compute_capabilities,
local_cuda_path=args.local_cuda_path,
local_cudnn_path=args.local_cudnn_path,
local_nccl_path=args.local_nccl_path,
)
)

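The new `configure.py` flags replace the removed `find_cuda_config.py` discovery: CUDA and CUDNN versions are either pinned explicitly or left to the hermetic defaults. A sketch of an invocation (the flag names come from the diff above; the values are illustrative):

```
# Hypothetical: configure an XLA checkout for the hermetic CUDA backend.
./configure.py --backend=cuda \
    --cuda_version=12.3.2 \
    --cudnn_version=8.9.7.29
```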
(Diffs for the remaining changed files are not shown.)
