Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fbgemm_gpu] Add workflow for running only on non-PyTorch infrastructure #3159

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ install_cuda () {
nm -gDC "${libcuda_path}"
append_to_library_path "${env_name}" "$(dirname "$libcuda_path")"

# The symlink appears to be missing when we attempt to run FBGEMM_GPU on the
# `ubuntu-latest` runners on GitHub, so we have to manually add this in.
if [ "$ADD_LIBCUDA_SYMLINK" == "1" ]; then
print_exec ln "${libcuda_path}" -s "$(dirname "$libcuda_path")/libcuda.so.1"
fi

echo "[INSTALL] Set environment variable NVML_LIB_PATH ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
Expand Down
7 changes: 6 additions & 1 deletion .github/scripts/utils_pytorch.bash
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,12 @@ install_pytorch_pip () {
# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

# Install the package from PyTorch PIP (not PyPI)
# Install the main dependencies
# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
numpy) || return 1

# Install the torch package from PyTorch PIP (not PyPI)
install_from_pytorch_pip "${env_name}" torch "${pytorch_channel_version}" "${pytorch_variant_type_version}" || return 1

# Check that PyTorch is importable
Expand Down
53 changes: 52 additions & 1 deletion .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,57 @@ free_disk_space () {
echo "[CLEANUP] Freed up some disk space"
}

# Best-effort cleanup of the *host* machine's disk from inside a CI container.
# Prints disk usage before/after each phase so the reclaimed space is visible
# in the job log.  No return-value contract: callers treat this as fire-and-
# forget, and individual failures inside the nsenter block do not fail the job.
free_disk_space_on_host () {
echo "################################################################################"
echo "# Free Disk Space On CI Host"
echo "################################################################################"

# NOTE: This is meant to be run from ** inside ** containers hosted on
# non-PyTorch-infra GitHub runners, where the hosts might be close to full
# disk from serving many CI jobs. When the container is set up properly, we
# can escape the container using nsenter to run commands on the host.
#
# On average, we see roughly 3GB of disk freed when running this cleanup,
# which appears to be sufficient to avoid the somewhat-frequent out-of-disk
# errors that we were previously running into.
#
# Frees up disk space on the ubuntu-latest host machine based on recommendations:
# https://github.com/orgs/community/discussions/25678
# https://github.com/apache/flink/blob/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh
#
# Escape the docker container to run the free disk operation on the host:
# https://stackoverflow.com/questions/66160057/how-to-run-a-command-in-host-before-entering-docker-container-in-github-ci
# https://stackoverflow.com/questions/32163955/how-to-run-shell-script-on-host-from-docker-container/63140387#63140387

# nsenter flags: `-t 1` targets the host's init process (PID 1 is only the
# host's init when the container runs with `--pid=host`, as the workflows in
# this PR do); `-m -u -n -i` enter the host's mount, UTS, network, and IPC
# namespaces, so the inner `bash -c` effectively runs on the host.
# NOTE(review): requires a privileged container (`--privileged`) — presumably
# this silently does nothing useful otherwise; confirm against the workflow
# container options.
# The `\${...}` escapes below are deliberate: the dpkg-query format
# placeholders must reach dpkg-query literally, not be expanded by the outer
# double-quoted string.
# Commands are separated with `;` (not `&&`) on purpose: removal of a package
# or directory that does not exist on a given runner image should not abort
# the rest of the cleanup.
nsenter -t 1 -m -u -n -i bash -c "
echo 'Listing 100 largest packages';
dpkg-query -Wf '\${Installed-Size}\t\${Package}\n' | sort -n | tail -n 100;
df -h;

echo 'Removing large packages';
sudo apt-get remove -y '^ghc-8.*';
sudo apt-get remove -y '^dotnet-.*';
sudo apt-get remove -y '^llvm-.*';
sudo apt-get remove -y 'php.*';
sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel;
sudo apt-get autoremove -y;
sudo apt-get clean;
df -h;

echo 'Removing large directories';
rm -rf /usr/local/android;
rm -rf /usr/share/dotnet;
rm -rf /usr/local/share/boost;
rm -rf /opt/ghc;
rm -rf /usr/local/share/chrom*;
rm -rf /usr/share/swift;
rm -rf /usr/local/julia*;
rm -rf /usr/local/lib/android;
rm -rf /opt/hostedtoolcache;
df -h;
"
}


################################################################################
# Info Functions
Expand All @@ -91,7 +142,7 @@ print_gpu_info () {

(lspci -v | grep -e 'controller.*NVIDIA') || true

if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
# Ensure that nvidia-smi is available and returns GPU entries
if ! nvidia-smi; then
echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/fbgemm_gpu_ci_genai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
# runs-on: linux.4xlarge.nvidia.gpu
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: ${{ matrix.host-machine.instance }}
defaults:
Expand Down
199 changes: 199 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This workflow is used for FBGEMM_GPU-GenAI CI, and is meant to be used for
# copies of the FBGEMM repos hosted outside of the pytorch org.
name: FBGEMM_GPU-GenAI CI (Generic Runner)

on:
# PR Trigger
#
pull_request:
branches:
- main

# Push Trigger (enable to catch errors coming out of multiple merges)
#
push:
branches:
- main

# Manual Trigger
#
workflow_dispatch:

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root --privileged --pid=host
volumes:
- /var/run/docker.sock:/var/run/docker.sock
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
BUILD_VARIANT: genai
continue-on-error: true
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "ubuntu-latest" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
compiler: [ "gcc", "clang" ]

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Free Disk Space on Host
run: . $PRELUDE; free_disk_space_on_host

- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install C/C++ Compilers
run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install CUDA
run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

# Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

- name: Install cuDNN
run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU Wheel
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
if-no-files-found: error

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_artifact:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root --privileged --pid=host
volumes:
- /var/run/docker.sock:/var/run/docker.sock
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
BUILD_VARIANT: genai
ENFORCE_CUDA_DEVICE: 0
CUDA_VISIBLE_DEVICES: -1
ADD_LIBCUDA_SYMLINK: 1
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "ubuntu-latest" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "12.1.1" ]
compiler: [ "gcc", "clang" ]
needs: build_artifact

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Free Disk Space on Host
run: . $PRELUDE; free_disk_space_on_host

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
uses: actions/download-artifact@v3
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install C/C++ Compilers for Updated LIBGCC
run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang

- name: Install CUDA
run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU Wheel
run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

- name: Test with PyTest
timeout-minutes: 30
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
4 changes: 2 additions & 2 deletions fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def gqa_reference(

class Int4GQATest(unittest.TestCase):
@unittest.skipIf(
not torch.version.cuda or torch.cuda.get_device_capability()[0] < 8,
not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 8,
"Skip when CUDA is not available or CUDA compute capability is less than 8",
)
@settings(verbosity=VERBOSITY, max_examples=40, deadline=None)
Expand Down Expand Up @@ -243,7 +243,7 @@ def test_gqa(
)
# pyre-fixme[56]
@unittest.skipIf(
not torch.version.cuda or not HAS_XFORMERS,
not torch.cuda.is_available() or not HAS_XFORMERS,
"Skip when CUDA is not available or xformers is not available",
)
def test_mqa_main( # noqa C901
Expand Down
1 change: 1 addition & 0 deletions fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import List, Optional

import torch

from fbgemm_gpu.utils.loader import load_torch_module

try:
Expand Down
Loading