Skip to content
This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Commit

Permalink
ci: update GPU check (#35)
Browse files Browse the repository at this point in the history
  • Loading branch information
Borda authored Sep 2, 2023
1 parent 4949669 commit f4a9331
Showing 1 changed file with 42 additions and 22 deletions.
64 changes: 42 additions & 22 deletions .azure/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,56 +33,81 @@ jobs:
cancelTimeoutInMinutes: "2"
strategy:
matrix:
#'PL pkg': # todo: consider adding
# image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.12-cuda11.3.1"
# scope: ""
# PACKAGE_NAME: "pytorch"
'Lightning pkg':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
scope: ""
PACKAGE_NAME: "lightning"
#image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch1.13-cuda11.7.1"
image: "nvidia/cuda:11.7.1-devel-ubuntu20.04"
PYTHON_VERSION: '3.9'
PYTORCH_VERSION: '1.13'
CUDA_VERSION_MM: '116'
pool: 'lit-rtx-3090'
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
TORCH_HOME: "/var/tmp/torch"
PIP_CACHE_DIR: "/var/tmp/pip"
# HOROVOD_CUDA_HOME: $(CUDA_TOOLKIT_ROOT_DIR)
HOROVOD_GPU_OPERATIONS: NCCL
HOROVOD_WITH_PYTORCH: 1
HOROVOD_WITHOUT_TENSORFLOW: 1
HOROVOD_WITHOUT_MXNET: 1
HOROVOD_WITH_GLOO: 1
HOROVOD_WITH_MPI: 1
PL_USE_MOCKED_MNIST: 1
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--gpus=all --shm-size=2gb"
options: "--gpus=all --shm-size=6gb -v /usr/bin/docker:/tmp/docker:ro"
workspace:
clean: all
steps:

- script: |
container_id=$(head -1 /proc/self/cgroup|cut -d/ -f3)
echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
/tmp/docker exec -t -u 0 $container_id \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo"
echo "##vso[task.setvariable variable=CONTAINER_ID]$container_id"
displayName: 'Install Sudo in container (thanks Microsoft!)'
- bash: |
add-apt-repository ppa:deadsnakes/ppa
apt-get -y update -qq --fix-missing
apt-get -y install \
build-essential \
python$PYTHON_VERSION \
python$PYTHON_VERSION-dev \
python3-distutils \
cmake \
pciutils \
curl
update-alternatives --install /usr/bin/python python /usr/bin/python$PYTHON_VERSION 1
curl https://bootstrap.pypa.io/get-pip.py | python
displayName: 'Install sys & python' # CUDA image is completely blind
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
pytorch_ver=$(python -c "import torch; print(torch.__version__.split('+')[0][:4])")
echo "##vso[task.setvariable variable=PYTORCH_VERSION]$pytorch_ver"
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html"
displayName: 'set env. vars'
- bash: |
echo $CUDA_VISIBLE_DEVICES
echo $PYTORCH_VERSION
echo $TORCH_URL
lspci | egrep 'VGA|3D'
whereis nvidia
nvidia-smi
cmake --version
which python && which pip
python --version
pip --version
pip list
displayName: 'Image info & NVIDIA'
- bash: pip install -e . -r tests/requirements.txt -f ${TORCH_URL}
# Install the OpenMPI system packages (presumably required for Horovod's MPI
# build, see HOROVOD_WITH_MPI), then the pinned PyTorch wheel, and finally the
# package itself together with its test requirements.
- bash: |
    apt-get update --fix-missing
    apt-get install -y libopenmpi-dev openmpi-bin
    pip install "torch==${PYTORCH_VERSION}" -f ${TORCH_URL}
    pip install "pip<23.0" # HotFix for bad Horovod requirements
    pip install -e . -r tests/requirements.txt
  displayName: 'Install package & extras'
- bash: |
Expand All @@ -102,19 +127,14 @@ jobs:
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
displayName: 'Env details'
- bash: python -m coverage run --source lightning_horovod -m pytest -v --durations=50
workingDirectory: tests/
env:
PL_USE_MOCKED_MNIST: 1
- bash: pytest src/ tests/ --cov=lightning_horovod -v
displayName: 'Testing: standard'
timeoutInMinutes: "20"

- bash: |
python -m coverage report
python -m coverage xml
python -m coverage html
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
ls -l
workingDirectory: tests/
displayName: 'Statistics'

0 comments on commit f4a9331

Please sign in to comment.