Log process group name instead of pg uid #338
name: CI
on: [push, pull_request]
env:
  OPENUCX_LINK: https://github.com/openucx/ucx.git
  UCC_LINK: https://github.com/openucx/ucc.git
  PARAM_LINK: https://github.com/facebookresearch/param
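# The three jobs below share the same container image and build action; they
# differ only in which suite they run: TorchUCC's own collective tests,
# PyTorch's distributed unit tests, and the PARAM comms benchmarks.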
jobs:
  torch-ucc-tests:
    runs-on: ubuntu-latest
    container:
      image: pytorch/pytorch:latest
    steps:
      - name: Checkout Action
        uses: actions/checkout@v1
      - name: Build TorchUCC
        uses: ./.github/actions/build
        with:
          ucx: ${OPENUCX_LINK}
          ucc: ${UCC_LINK}
      - name: Tests
        run: |
          export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/ucc/lib:$LD_LIBRARY_PATH
          /opt/ucx/bin/ucx_info -e -u t
          export UCX_LOG_LEVEL=info
          export TORCH_UCC_ENABLE_HEALTH_CHECK=1
          export TORCH_SHOW_CPP_STACKTRACES=1
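          # Run each collective test at communicator sizes 1 through 4.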
          for np in `seq 4`
          do
            echo "Test comm size $np"
            export TORCH_UCC_TEST_SIZE=$np
            echo "UCC barrier"
            /bin/bash ./test/start_test.sh ./test/torch_barrier_test.py --backend=gloo
            echo "UCC alltoall"
            /bin/bash ./test/start_test.sh ./test/torch_alltoall_test.py --backend=gloo
            echo "UCC alltoallv"
            /bin/bash ./test/start_test.sh ./test/torch_alltoallv_test.py --backend=gloo
            echo "UCC allgather"
            /bin/bash ./test/start_test.sh ./test/torch_allgather_test.py --backend=gloo
            echo "UCC allreduce"
            /bin/bash ./test/start_test.sh ./test/torch_allreduce_test.py --backend=gloo
            echo "UCC broadcast"
            /bin/bash ./test/start_test.sh ./test/torch_bcast_test.py --backend=gloo
            echo "UCC reduce"
            /bin/bash ./test/start_test.sh ./test/torch_reduce_test.py --backend=gloo
            # FIXME: disabled as UCC does not support gather on CPU tensors yet
            # echo "UCC gather"
            # /bin/bash ./test/start_test.sh ./test/torch_gather_test.py --backend=gloo
          done
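          # The remaining tests run once each; TORCH_UCC_TEST_SIZE is still 4
          # from the last loop iteration above.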
echo "UCC basic functionality test" | |
/bin/bash ./test/start_test.sh ./test/torch_work_test.py --backend=gloo | |
echo "UCC pt2pt" | |
/bin/bash ./test/start_test.sh ./test/torch_pt2pt_test.py --backend=gloo | |
echo "UCC timeout test" | |
/bin/bash ./test/start_test.sh ./test/torch_timeout_test.py --backend=gloo | |
echo "UCC multiple comms test" | |
TORCH_UCC_SHARED_COMM=0 UCX_TLS=tcp /bin/bash ./test/start_test.sh ./test/torch_multiple_comms_test.py | |
echo "UCC multiple comms test shared comm" | |
TORCH_UCC_SHARED_COMM=1 /bin/bash ./test/start_test.sh ./test/torch_multiple_comms_test.py | |
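  # Runs PyTorch's own distributed unit tests (test/torch_tests.py) against
  # the UCC backend at world sizes 1 through 4.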
  pytorch-unit-tests:
    runs-on: ubuntu-latest
    container:
      image: pytorch/pytorch:latest
    steps:
      - name: Checkout Action
        uses: actions/checkout@v1
      - name: Build TorchUCC
        uses: ./.github/actions/build
        with:
          ucx: ${OPENUCX_LINK}
          ucc: ${UCC_LINK}
      - name: PyTorch Unit Tests
        run: |
          export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/ucc/lib:$LD_LIBRARY_PATH
          /opt/ucx/bin/ucx_info -e -u t
          export UCX_LOG_LEVEL=info
          export TORCH_UCC_ENABLE_HEALTH_CHECK=1
          export TORCH_SHOW_CPP_STACKTRACES=1
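          # Test-only Python dependencies for PyTorch's unit-test runner.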
          pip3 install expecttest hypothesis xmlrunner unittest-xml-reporting
          cd test
          for np in `seq 4`
          do
            export BACKEND='ucc'
            export WORLD_SIZE=$np
            python torch_tests.py --subprocess
          done
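  # Exercises the UCC backend through Facebook's PARAM communication
  # benchmarks, cloned from PARAM_LINK at test time.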
  param-comm-tests:
    runs-on: ubuntu-latest
    container:
      image: pytorch/pytorch:latest
    steps:
      - name: Checkout Action
        uses: actions/checkout@v1
      - name: Build TorchUCC
        uses: ./.github/actions/build
        with:
          ucx: ${OPENUCX_LINK}
          ucc: ${UCC_LINK}
      - name: Test PARAM
        run: |
          git clone ${PARAM_LINK} /tmp/param
          export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/ucc/lib:$LD_LIBRARY_PATH
          export TORCH_UCC_TEST_SIZE=4
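          # comms.py sweeps message sizes from 4 bytes (--b) to 4 MB (--e);
          # --c 1 enables data validation and --z 0 selects non-blocking mode.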
echo "PARAM-Comms Reduce w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --collective reduce | |
echo "PARAM-Comms Allreduce w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --collective all_reduce | |
echo "PARAM-Comms Alltoall w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --collective all_to_all | |
echo "PARAM-Comms Alltoallv w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --collective all_to_allv | |
echo "PARAM-Comms Broadcast w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --collective broadcast | |
echo "PARAM-Comms Allgather w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --collective all_gather | |
echo "PARAM-Comms Allgather_base w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --collective all_gather_base | |
# FIXME: disabled as UCC does not support gather on CPU tensor yet | |
# echo "PARAM-Comms Gather w/ UCC" | |
# /bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --collective gather | |
echo "PARAM-Comms Quantized Allreduce w/ UCC (use of c10d future)" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --bitwidth 16 --collective all_reduce | |
echo "PARAM-Comms Non-blocking Allreduce w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --z 0 --collective all_reduce | |
echo "PARAM-Comms Non-blocking Alltoall w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --z 0 --collective all_to_all | |
echo "PARAM-Comms Non-blocking Alltoallv w/ UCC" | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --z 0 --collective all_to_allv | |
echo "PARAM-Comms Pt2pt w/ UCC" | |
export TORCH_UCC_TEST_SIZE=2 | |
/bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py --backend ucc --device cpu --b 4 --e 4M --c 1 --pt2pt one2one --src-ranks 0 --dst-ranks 1 |
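
For reference, each `start_test.sh` invocation above launches one Python process per rank. Below is a minimal, hedged sketch of what a test such as `torch_allreduce_test.py` boils down to, assuming `start_test.sh` exports the usual rendezvous variables (`RANK`, `WORLD_SIZE`, `MASTER_ADDR`, `MASTER_PORT`); the tensor size and the final assertion are illustrative, not the test's actual contents:

```python
# Hedged sketch, not the repository's actual test: every rank imports
# torch_ucc (which registers the "ucc" c10d backend as a side effect),
# joins a process group, and runs one blocking allreduce.
import os

import torch
import torch.distributed as dist
import torch_ucc  # noqa: F401  -- side effect: registers the "ucc" backend

dist.init_process_group(
    backend="ucc",
    rank=int(os.environ["RANK"]),
    world_size=int(os.environ["WORLD_SIZE"]),
)

t = torch.ones(16)   # identical payload on every rank
dist.all_reduce(t)   # default op is SUM; blocking across all ranks
assert torch.allclose(t, torch.full_like(t, float(dist.get_world_size())))

dist.destroy_process_group()
```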