diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
index 8bbf4ff2c..15d0d5c61 100644
--- a/.github/scripts/utils_cuda.bash
+++ b/.github/scripts/utils_cuda.bash
@@ -70,6 +70,12 @@ install_cuda () {
   nm -gDC "${libcuda_path}"
   append_to_library_path "${env_name}" "$(dirname "$libcuda_path")"
 
+  # The symlink appears to be missing when we attempt to run FBGEMM_GPU on the
+  # `ubuntu-latest` runners on GitHub, so we have to manually add this in.
+  if [ "$ADD_LIBCUDA_SYMLINK" == "1" ]; then
+    print_exec ln "${libcuda_path}" -s "$(dirname "$libcuda_path")/libcuda.so.1"
+  fi
+
   echo "[INSTALL] Set environment variable NVML_LIB_PATH ..."
   # shellcheck disable=SC2155,SC2086
   local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash
index 3d07e437c..42ea6c0f6 100644
--- a/.github/scripts/utils_pytorch.bash
+++ b/.github/scripts/utils_pytorch.bash
@@ -127,7 +127,12 @@ install_pytorch_pip () {
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
-  # Install the package from PyTorch PIP (not PyPI)
+  # Install the main dependencies
+  # shellcheck disable=SC2086
+  (exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
+    numpy) || return 1
+
+  # Install the torch package from PyTorch PIP (not PyPI)
   install_from_pytorch_pip "${env_name}" torch "${pytorch_channel_version}" "${pytorch_variant_type_version}" || return 1
 
   # Check that PyTorch is importable
diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash
index ba5605a28..a81ece336 100644
--- a/.github/scripts/utils_system.bash
+++ b/.github/scripts/utils_system.bash
@@ -73,6 +73,57 @@ free_disk_space () {
   echo "[CLEANUP] Freed up some disk space"
 }
 
+free_disk_space_on_host () {
+  echo "################################################################################"
+  echo "# Free Disk Space On CI Host"
+  echo "################################################################################"
+
+  # NOTE: This is meant to be run from ** inside ** containers hosted on
+  # non-PyTorch-infra GitHub runners, where the hosts might be close to full
+  # disk from serving many CI jobs. When the container is set up properly, we
+  # can escape the container using nsenter to run commands on the host.
+  #
+  # On average, we see roughly 3GB of disk freed when running this cleanup,
+  # which appears to be sufficient to avoid the somewhat-frequent out-of-disk
+  # errors that we were previously running into.
+  #
+  # Frees up disk space on the ubuntu-latest host machine based on recommendations:
+  #   https://github.com/orgs/community/discussions/25678
+  #   https://github.com/apache/flink/blob/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh
+  #
+  # Escape the docker container to run the free disk operation on the host:
+  #   https://stackoverflow.com/questions/66160057/how-to-run-a-command-in-host-before-entering-docker-container-in-github-ci
+  #   https://stackoverflow.com/questions/32163955/how-to-run-shell-script-on-host-from-docker-container/63140387#63140387
+
+  nsenter -t 1 -m -u -n -i bash -c "
+    echo 'Listing 100 largest packages';
+    dpkg-query -Wf '\${Installed-Size}\t\${Package}\n' | sort -n | tail -n 100;
+    df -h;
+
+    echo 'Removing large packages';
+    sudo apt-get remove -y '^ghc-8.*';
+    sudo apt-get remove -y '^dotnet-.*';
+    sudo apt-get remove -y '^llvm-.*';
+    sudo apt-get remove -y 'php.*';
+    sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel;
+    sudo apt-get autoremove -y;
+    sudo apt-get clean;
+    df -h;
+
+    echo 'Removing large directories';
+    rm -rf /usr/local/android;
+    rm -rf /usr/share/dotnet;
+    rm -rf /usr/local/share/boost;
+    rm -rf /opt/ghc;
+    rm -rf /usr/local/share/chrom*;
+    rm -rf /usr/share/swift;
+    rm -rf /usr/local/julia*;
+    rm -rf /usr/local/lib/android;
+    rm -rf /opt/hostedtoolcache;
+    df -h;
+  "
+}
+
 
 ################################################################################
 # Info Functions
@@ -91,7 +142,7 @@ print_gpu_info () {
 
   (lspci -v | grep -e 'controller.*NVIDIA') || true
 
-  if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
+  if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
     # Ensure that nvidia-smi is available and returns GPU entries
     if ! nvidia-smi; then
       echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
diff --git a/.github/workflows/fbgemm_gpu_ci_genai.yml b/.github/workflows/fbgemm_gpu_ci_genai.yml
index 07f8f8059..3ffdf45c0 100644
--- a/.github/workflows/fbgemm_gpu_ci_genai.yml
+++ b/.github/workflows/fbgemm_gpu_ci_genai.yml
@@ -127,7 +127,6 @@ jobs:
 
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
-    # runs-on: linux.4xlarge.nvidia.gpu
     # Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
     runs-on: ${{ matrix.host-machine.instance }}
     defaults:
diff --git a/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
new file mode 100644
index 000000000..ef9f78ed6
--- /dev/null
+++ b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -0,0 +1,199 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This workflow is used for FBGEMM_GPU-GenAI CI, and is meant to be used for
+# copies of the FBGEMM repos hosted outside of the pytorch org.
+
+name: FBGEMM_GPU-GenAI CI (Generic Runner)
+
+on:
+  # PR Trigger
+  #
+  pull_request:
+    branches:
+      - main
+
+  # Push Trigger (enable to catch errors coming out of multiple merges)
+  #
+  push:
+    branches:
+      - main
+
+  # Manual Trigger
+  #
+  workflow_dispatch:
+
+concurrency:
+  # Cancel previous runs in the PR if a new commit is pushed
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Build on CPU hosts and upload to GHA
+  build_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    container:
+      image: amazonlinux:2023
+      options: --user root --privileged --pid=host
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        compiler: [ "gcc", "clang" ]
+
+    steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
+
+    - name: Checkout the Repository
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+
+    - name: Free Disk Space on Host
+      run: . $PRELUDE; free_disk_space_on_host
+
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info
+
+    - name: Display GPU Info
+      run: . $PRELUDE; print_gpu_info
+
+    - name: Setup Miniconda
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+    - name: Create Conda Environment
+      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+    - name: Install C/C++ Compilers
+      run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}
+
+    - name: Install Build Tools
+      run: . $PRELUDE; install_build_tools $BUILD_ENV
+
+    - name: Install CUDA
+      run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+    # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
+    - name: Install PyTorch Nightly
+      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+    - name: Install cuDNN
+      run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
+
+    - name: Prepare FBGEMM_GPU Build
+      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+    - name: Build FBGEMM_GPU Wheel
+      run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai
+
+    - name: Upload Built Wheel as GHA Artifact
+      # Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
+      uses: actions/upload-artifact@v3
+      with:
+        name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+        path: fbgemm_gpu/dist/*.whl
+        if-no-files-found: error
+
+
+  # Download the built artifact from GHA, test on GPU, and push to PyPI
+  test_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    container:
+      image: amazonlinux:2023
+      options: --user root --privileged --pid=host
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+      ENFORCE_CUDA_DEVICE: 0
+      CUDA_VISIBLE_DEVICES: -1
+      ADD_LIBCUDA_SYMLINK: 1
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        # Specify exactly ONE CUDA version for artifact publish
+        cuda-version-publish: [ "12.1.1" ]
+        compiler: [ "gcc", "clang" ]
+    needs: build_artifact
+
+    steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
+
+    - name: Checkout the Repository
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+
+    - name: Free Disk Space on Host
+      run: . $PRELUDE; free_disk_space_on_host
+
+    - name: Download Wheel Artifact from GHA
+      # Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
+      uses: actions/download-artifact@v3
+      with:
+        name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info; print_ec2_info
+
+    - name: Display GPU Info
+      run: . $PRELUDE; print_gpu_info
+
+    - name: Setup Miniconda
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+    - name: Create Conda Environment
+      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+    - name: Install C/C++ Compilers for Updated LIBGCC
+      run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang
+
+    - name: Install CUDA
+      run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+    - name: Install PyTorch Nightly
+      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+    - name: Prepare FBGEMM_GPU Build
+      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+    - name: Install FBGEMM_GPU Wheel
+      run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
+
+    - name: Test with PyTest
+      timeout-minutes: 30
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
diff --git a/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py b/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
index 43c1dafb4..8446e34e3 100755
--- a/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
+++ b/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
@@ -137,7 +137,7 @@ def gqa_reference(
 
 class Int4GQATest(unittest.TestCase):
     @unittest.skipIf(
-        not torch.version.cuda or torch.cuda.get_device_capability()[0] < 8,
+        not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 8,
         "Skip when CUDA is not available or CUDA compute capability is less than 8",
     )
     @settings(verbosity=VERBOSITY, max_examples=40, deadline=None)
@@ -243,7 +243,7 @@ def test_gqa(
     )
     # pyre-fixme[56]
     @unittest.skipIf(
-        not torch.version.cuda or not HAS_XFORMERS,
+        not torch.cuda.is_available() or not HAS_XFORMERS,
         "Skip when CUDA is not available or xformers is not available",
     )
     def test_mqa_main(  # noqa C901
diff --git a/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py b/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py
index 03021ad86..cbdf1b0bc 100644
--- a/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py
+++ b/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py
@@ -11,6 +11,7 @@
 from typing import List, Optional
 
 import torch
+
 from fbgemm_gpu.utils.loader import load_torch_module
 
 try: