Skip to content
This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Commit

Permalink
ci: GPU testing with lightning & PL (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
Borda authored Sep 2, 2023
1 parent f4a9331 commit 1fb604e
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 13 deletions.
18 changes: 18 additions & 0 deletions .azure/adjust-requirements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Simple package switch."""

import re
import sys


def main(pkg: str, req_file: str = "requirements.txt") -> None:
    """Swap the ``lightning`` requirement in *req_file* for *pkg*.

    Every line whose requirement name is exactly ``lightning`` gets that name
    replaced by ``pkg``; the version specifier and anything else on the line
    are kept untouched. All other lines, including ``pytorch-lightning`` ones,
    are written back as they were, so running the switch repeatedly gives the
    same result.

    Args:
        pkg: Package name to put in place of ``lightning``.
        req_file: Path of the requirements file, rewritten in place.
    """
    # Anchor at the start of the line so the ``lightning`` inside
    # ``pytorch-lightning`` is never rewritten. The lookahead accepts a
    # specifier/marker character or the end of the line (bare requirement)
    # without consuming it, so the rest of the line is preserved verbatim.
    pattern = re.compile(r"^(\s*)lightning(?=[ \t<=>~!;]|$)")

    def _swap(match):
        # A callable replacement keeps ``pkg`` literal: no backslash or
        # group-reference expansion is applied to it.
        return match.group(1) + pkg

    with open(req_file, encoding="utf-8") as fo:
        lines = [pattern.sub(_swap, ln) for ln in fo]
    with open(req_file, "w", encoding="utf-8") as fw:
        fw.writelines(lines)


if __name__ == "__main__":
    # Script entry point: the first command-line argument is the package name
    # handed to ``main``; any further arguments are ignored.
    # Validate explicitly instead of with ``assert``, which is stripped under
    # ``python -O`` and would leave a bare IndexError on a missing argument.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <package-name>")
    main(sys.argv[1])
22 changes: 14 additions & 8 deletions .azure/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,18 @@ jobs:
cancelTimeoutInMinutes: "2"
strategy:
matrix:
'Lightning pkg':
#image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch1.13-cuda11.7.1"
image: "nvidia/cuda:11.7.1-devel-ubuntu20.04"
PYTHON_VERSION: '3.9'
PYTORCH_VERSION: '1.13'
CUDA_VERSION_MM: '116'
'PL dep.':
dependency: "pytorch-lightning"
'Lightning dep.':
dependency: "lightning"
pool: 'lit-rtx-3090'
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
TORCH_HOME: "/var/tmp/torch"
PIP_CACHE_DIR: "/var/tmp/pip"
PYTHON_VERSION: '3.9'
PYTORCH_VERSION: '1.13'
CUDA_VERSION_MM: '117'
# HOROVOD_CUDA_HOME: $(CUDA_TOOLKIT_ROOT_DIR)
HOROVOD_GPU_OPERATIONS: NCCL
HOROVOD_WITH_PYTORCH: 1
Expand All @@ -53,7 +54,7 @@ jobs:
HOROVOD_WITH_MPI: 1
PL_USE_MOCKED_MNIST: 1
container:
image: $(image)
image: "nvidia/cuda:11.7.1-devel-ubuntu20.04"
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--gpus=all --shm-size=6gb -v /usr/bin/docker:/tmp/docker:ro"
Expand Down Expand Up @@ -102,6 +103,11 @@ jobs:
pip list
displayName: 'Image info & NVIDIA'
- script: |
python .azure/adjust-requirements.py $(dependency)
cat requirements.txt
displayName: 'Adjust dependencies'
- bash: |
apt-get update --fix-missing
apt-get install -y libopenmpi-dev openmpi-bin
Expand All @@ -127,7 +133,7 @@ jobs:
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
displayName: 'Env details'
- bash: pytest src/ tests/ --cov=lightning_horovod -v
- bash: pytest tests/ --cov=lightning_horovod -v
displayName: 'Testing: standard'
timeoutInMinutes: "20"

Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pytorch-lightning >=2.0.0
packaging # needed for horovod issue
lightning >=2.0.0
# no need to install with [pytorch] as pytorch is already installed
horovod >0.24.0, <=0.27.0
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def _load_readme(path_readme: str = _PATH_README) -> str:
zip_safe=False,
keywords=["deep learning", "pytorch", "AI"],
python_requires=">=3.8",
setup_requires=["wheel"],
install_requires=_load_requirements(_PATH_ROOT),
project_urls={
"Bug Tracker": "https://github.com/Lightning-AI/lightning-Horovod/issues",
Expand Down
4 changes: 2 additions & 2 deletions tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
coverage >=6.0
pytest >=7.0
coverage >6.0
pytest >7.0
pytest-cov
scikit-learn >=1.0.0
torchmetrics <0.11.0 # pin for compatibility

0 comments on commit 1fb604e

Please sign in to comment.