From 7a1ef9ae24e49976d5423ea00db51dedfc58de53 Mon Sep 17 00:00:00 2001 From: Henry Tsang Date: Thu, 5 Oct 2023 14:47:45 -0700 Subject: [PATCH] Fix validate nightly binaries (#1372) Summary: Fix validate nightly binaries. Differential Revision: D48928848 --- .github/scripts/validate_binaries.sh | 70 ++++++++++++++----- .github/workflows/validate-binaries.yml | 3 +- .../workflows/validate-nightly-binaries.yml | 8 +-- 3 files changed, 57 insertions(+), 24 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 463aa1c4a..45824d959 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -8,40 +8,72 @@ export PYTORCH_CUDA_PKG="" +conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}" + +conda run -n build_binary python --version + # Install pytorch, torchrec and fbgemm as per # installation instructions on following page # https://github.com/pytorch/torchrec#installations -if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then - export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}" + +if [[ ${MATRIX_GPU_ARCH_TYPE} = 'rocm' ]]; then + echo "We don't support rocm" + exit 0 fi -if [[ ${MATRIX_CHANNEL} = 'nightly' ]]; then - # shellcheck disable=SC2086 - conda install -y pytorch ${PYTORCH_CUDA_PKG} -c pytorch-nightly -c nvidia - pip install torchrec_nightly +if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then + export CUDA_VERSION="cu118" else - # shellcheck disable=SC2086 - conda install -y pytorch ${PYTORCH_CUDA_PKG} -c pytorch -c nvidia - pip install torchrec + export CUDA_VERSION="cpu" fi -if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cpu' || ${MATRIX_GPU_ARCH_TYPE} = 'rocm' ]]; then - if [[ ${MATRIX_CHANNEL} = 'nightly' ]]; then - pip uninstall fbgemm-gpu-nightly -y - pip install fbgemm-gpu-nightly-cpu +# figure out CUDA VERSION +if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then + if [[ ${MATRIX_GPU_ARCH_VERSION} = '11.8' ]]; then + export CUDA_VERSION="cu118" else - pip uninstall fbgemm-gpu -y - pip install fbgemm-gpu-cpu + export CUDA_VERSION="cu121" fi +else + export CUDA_VERSION="cpu" fi +# figure out URL +if [[ ${MATRIX_CHANNEL} = 'nightly' ]]; then + export PYTORCH_URL="https://download.pytorch.org/whl/nightly/${CUDA_VERSION}" +elif [[ ${MATRIX_CHANNEL} = 'test' ]]; then + export PYTORCH_URL="https://download.pytorch.org/whl/test/${CUDA_VERSION}" +elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then + export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}" +fi + +# install pytorch +# switch back to conda once torch nightly is fixed +# if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then +# export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}" +# fi +conda run -n build_binary pip install torch --index-url "$PYTORCH_URL" + +# install fbgemm +conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL" + +# install requirements +conda run -n build_binary pip install torchmetrics==1.0.3 + +# install torchrec +conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL" + # Run small import test -python -c "import torch; import fbgemm_gpu; import torchrec" +conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec" + +# check directory +ls -R # Finally run smoke test -pip install torchx +# python 3.11 needs torchx-nightly +conda run -n build_binary pip install torchx-nightly iopath if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then - torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py + conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py else - torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only + conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only fi diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index f25bfb0d1..7f3cd8f69 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -16,12 +16,13 @@ on: workflow_dispatch: inputs: channel: - description: "Channel to use (nightly, release)" + description: "Channel to use (nightly, release, test)" required: true type: choice options: - release - nightly + - test ref: description: 'Reference to checkout, defaults to empty' default: "" diff --git a/.github/workflows/validate-nightly-binaries.yml b/.github/workflows/validate-nightly-binaries.yml index 6d6369495..866449e3b 100644 --- a/.github/workflows/validate-nightly-binaries.yml +++ b/.github/workflows/validate-nightly-binaries.yml @@ -15,10 +15,10 @@ on: - .github/workflows/validate-binaries.yml - .github/scripts/validate-binaries.sh pull_request: - paths: - - .github/workflows/validate-nightly-binaries.yml - - .github/workflows/validate-binaries.yml - - .github/scripts/validate-binaries.sh + # paths: + # - .github/workflows/validate-nightly-binaries.yml + # - .github/workflows/validate-binaries.yml + # - .github/scripts/validate-binaries.sh jobs: nightly: uses: ./.github/workflows/validate-binaries.yml