From 32876f072c43215cbeac1a73b245d2edc957a9e2 Mon Sep 17 00:00:00 2001
From: Henry Tsang
Date: Thu, 5 Oct 2023 15:13:47 -0700
Subject: [PATCH] Fix validate nightly binaries (#1372)

Summary: Fix validate nightly binaries.

Differential Revision: D48928848
---
NOTE(review): line breaks, blank lines and indentation in this patch were
reconstructed from a whitespace-collapsed copy. The hunk line counts and the
diffstat totals were used as a cross-check and agree with the layout below,
but exact indentation widths are an assumption -- verify against the
repository before applying.

 .github/scripts/validate_binaries.sh          | 83 ++++++++++++++-----
 .github/workflows/validate-binaries.yml       |  4 +-
 .../workflows/validate-nightly-binaries.yml   | 10 +--
 3 files changed, 72 insertions(+), 25 deletions(-)

diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh
index 463aa1c4a..bffe7dc9c 100755
--- a/.github/scripts/validate_binaries.sh
+++ b/.github/scripts/validate_binaries.sh
@@ -8,40 +8,85 @@
 
 export PYTORCH_CUDA_PKG=""
 
+conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
+
+conda run -n build_binary python --version
+
 # Install pytorch, torchrec and fbgemm as per
 # installation instructions on following page
 # https://github.com/pytorch/torchrec#installations
+
+if [[ ${MATRIX_GPU_ARCH_TYPE} = 'rocm' ]]; then
+    echo "We don't support rocm"
+    exit 0
+fi
+
 if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
-    export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}"
+    export CUDA_VERSION="cu118"
+else
+    export CUDA_VERSION="cpu"
 fi
 
-if [[ ${MATRIX_CHANNEL} = 'nightly' ]]; then
-    # shellcheck disable=SC2086
-    conda install -y pytorch ${PYTORCH_CUDA_PKG} -c pytorch-nightly -c nvidia
-    pip install torchrec_nightly
+# figure out CUDA VERSION
+if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
+    if [[ ${MATRIX_GPU_ARCH_VERSION} = '11.8' ]]; then
+        export CUDA_VERSION="cu118"
+    else
+        export CUDA_VERSION="cu121"
+    fi
 else
-    # shellcheck disable=SC2086
-    conda install -y pytorch ${PYTORCH_CUDA_PKG} -c pytorch -c nvidia
-    pip install torchrec
+    export CUDA_VERSION="cpu"
 fi
 
-if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cpu' || ${MATRIX_GPU_ARCH_TYPE} = 'rocm' ]]; then
+if [[ ${MATRIX_CHANNEL} = 'pypi_release' ]]; then
+    echo "checking pypi release"
+    pip install torch
+    pip install fbgemm-gpu
+    pip install torchrec
+else
+    # figure out URL
     if [[ ${MATRIX_CHANNEL} = 'nightly' ]]; then
-        pip uninstall fbgemm-gpu-nightly -y
-        pip install fbgemm-gpu-nightly-cpu
-    else
-        pip uninstall fbgemm-gpu -y
-        pip install fbgemm-gpu-cpu
+        export PYTORCH_URL="https://download.pytorch.org/whl/nightly/${CUDA_VERSION}"
+    elif [[ ${MATRIX_CHANNEL} = 'test' ]]; then
+        export PYTORCH_URL="https://download.pytorch.org/whl/test/${CUDA_VERSION}"
+    elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then
+        export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}"
     fi
+
+    # install pytorch
+    # switch back to conda once torch nightly is fixed
+    # if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
+    #     export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}"
+    # fi
+    conda run -n build_binary pip install torch --index-url "$PYTORCH_URL"
+
+    # install fbgemm
+    conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL"
+
+    # install requirements from pypi
+    conda run -n build_binary pip install torchmetrics==1.0.3
+
+    # install torchrec
+    conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL"
+
+    # Run small import test
+    conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec"
 fi
 
-# Run small import test
-python -c "import torch; import fbgemm_gpu; import torchrec"
+# check directory
+ls -R
+
+# check if cuda available
+conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
+
+# check cuda version
+conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
 
 # Finally run smoke test
-pip install torchx
+# python 3.11 needs torchx-nightly
+conda run -n build_binary pip install torchx-nightly iopath
 if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
-    torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
+    conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
 else
-    torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
+    conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
 fi
diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml
index f25bfb0d1..c6f62a69d 100644
--- a/.github/workflows/validate-binaries.yml
+++ b/.github/workflows/validate-binaries.yml
@@ -16,12 +16,14 @@ on:
   workflow_dispatch:
     inputs:
       channel:
-        description: "Channel to use (nightly, release)"
+        description: "Channel to use (nightly, release, test, pypi_release)"
         required: true
         type: choice
         options:
           - release
           - nightly
+          - test
+          - pypi_release
       ref:
         description: 'Reference to checkout, defaults to empty'
         default: ""
diff --git a/.github/workflows/validate-nightly-binaries.yml b/.github/workflows/validate-nightly-binaries.yml
index 6d6369495..5a08c6885 100644
--- a/.github/workflows/validate-nightly-binaries.yml
+++ b/.github/workflows/validate-nightly-binaries.yml
@@ -15,12 +15,12 @@ on:
       - .github/workflows/validate-binaries.yml
       - .github/scripts/validate-binaries.sh
   pull_request:
-    paths:
-      - .github/workflows/validate-nightly-binaries.yml
-      - .github/workflows/validate-binaries.yml
-      - .github/scripts/validate-binaries.sh
+    # paths:
+    #   - .github/workflows/validate-nightly-binaries.yml
+    #   - .github/workflows/validate-binaries.yml
+    #   - .github/scripts/validate-binaries.sh
 jobs:
   nightly:
     uses: ./.github/workflows/validate-binaries.yml
     with:
-      channel: nightly
+      channel: pypi_release