# Workflow file for the CI run of pull request #338,
# "Log process group name instead of pg uid".

# CI workflow. Every job checks out the repository, runs the repository-local
# `./.github/actions/build` action with the UCX and UCC repository URLs, and
# then executes one test suite inside the `pytorch/pytorch:latest` container.
#
# `run` steps of container jobs are executed with `sh` by default, so the
# scripts below are kept POSIX-sh compatible.
name: CI

on: [push, pull_request]

# Workflow-level variables: exported to the shell of every `run` step and
# readable in expressions through the `env` context.
env:
  OPENUCX_LINK: https://github.com/openucx/ucx.git
  UCC_LINK: https://github.com/openucx/ucc.git
  PARAM_LINK: https://github.com/facebookresearch/param

jobs:
  # Runs the standalone test scripts under ./test through ./test/start_test.sh.
  torch-ucc-tests:
    runs-on: ubuntu-latest
    container:
      image: pytorch/pytorch:latest
    steps:
      - name: Checkout Action
        uses: actions/checkout@v4
      - name: Build TorchUCC
        uses: ./.github/actions/build
        with:
          # `with:` values are not processed by a shell, so `${VAR}` would be
          # handed to the action verbatim; the expression syntax passes the
          # actual URL.
          ucx: ${{ env.OPENUCX_LINK }}
          ucc: ${{ env.UCC_LINK }}
      # NOTE(review): every script is started with --backend=gloo although the
      # labels say "UCC" - presumably gloo is the reference backend the UCC
      # results are checked against; confirm in ./test/start_test.sh.
      - name: Tests
        run: |
          export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/ucc/lib:$LD_LIBRARY_PATH
          /opt/ucx/bin/ucx_info -e -u t
          export UCX_LOG_LEVEL=info
          export TORCH_UCC_ENABLE_HEALTH_CHECK=1
          export TORCH_SHOW_CPP_STACKTRACES=1
          # Collective tests are repeated for TORCH_UCC_TEST_SIZE = 1..4.
          for np in $(seq 4)
          do
            echo "Test comm size $np"
            export TORCH_UCC_TEST_SIZE=$np
            echo "UCC barrier"
            /bin/bash ./test/start_test.sh ./test/torch_barrier_test.py --backend=gloo
            echo "UCC alltoall"
            /bin/bash ./test/start_test.sh ./test/torch_alltoall_test.py --backend=gloo
            echo "UCC alltoallv"
            /bin/bash ./test/start_test.sh ./test/torch_alltoallv_test.py --backend=gloo
            echo "UCC allgather"
            /bin/bash ./test/start_test.sh ./test/torch_allgather_test.py --backend=gloo
            echo "UCC allreduce"
            /bin/bash ./test/start_test.sh ./test/torch_allreduce_test.py --backend=gloo
            echo "UCC broadcast"
            /bin/bash ./test/start_test.sh ./test/torch_bcast_test.py --backend=gloo
            echo "UCC reduce"
            /bin/bash ./test/start_test.sh ./test/torch_reduce_test.py --backend=gloo
            # FIXME: disabled as UCC does not support gather on CPU tensor yet
            # echo "UCC gather"
            # /bin/bash ./test/start_test.sh ./test/torch_gather_test.py --backend=gloo
          done
          # The remaining tests run once, with TORCH_UCC_TEST_SIZE still set
          # to the last loop value (4).
          echo "UCC basic functionality test"
          /bin/bash ./test/start_test.sh ./test/torch_work_test.py --backend=gloo
          echo "UCC pt2pt"
          /bin/bash ./test/start_test.sh ./test/torch_pt2pt_test.py --backend=gloo
          echo "UCC timeout test"
          /bin/bash ./test/start_test.sh ./test/torch_timeout_test.py --backend=gloo
          echo "UCC multiple comms test"
          TORCH_UCC_SHARED_COMM=0 UCX_TLS=tcp /bin/bash ./test/start_test.sh ./test/torch_multiple_comms_test.py
          echo "UCC multiple comms test shared comm"
          TORCH_UCC_SHARED_COMM=1 /bin/bash ./test/start_test.sh ./test/torch_multiple_comms_test.py

  # Runs test/torch_tests.py with BACKEND=ucc for WORLD_SIZE = 1..4.
  pytorch-unit-tests:
    runs-on: ubuntu-latest
    container:
      image: pytorch/pytorch:latest
    steps:
      - name: Checkout Action
        uses: actions/checkout@v4
      - name: Build TorchUCC
        uses: ./.github/actions/build
        with:
          # Expression syntax: `with:` values are never expanded by a shell.
          ucx: ${{ env.OPENUCX_LINK }}
          ucc: ${{ env.UCC_LINK }}
      - name: PyTorch Unit Tests
        run: |
          export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/ucc/lib:$LD_LIBRARY_PATH
          /opt/ucx/bin/ucx_info -e -u t
          export UCX_LOG_LEVEL=info
          export TORCH_UCC_ENABLE_HEALTH_CHECK=1
          export TORCH_SHOW_CPP_STACKTRACES=1
          pip3 install expecttest hypothesis xmlrunner unittest-xml-reporting
          cd test
          export BACKEND='ucc'
          for np in $(seq 4)
          do
            export WORLD_SIZE=$np
            python torch_tests.py --subprocess
          done

  # Clones PARAM and runs its comms.py benchmark with the ucc backend on CPU.
  param-comm-tests:
    runs-on: ubuntu-latest
    container:
      image: pytorch/pytorch:latest
    steps:
      - name: Checkout Action
        uses: actions/checkout@v4
      - name: Build TorchUCC
        uses: ./.github/actions/build
        with:
          # Expression syntax: `with:` values are never expanded by a shell.
          ucx: ${{ env.OPENUCX_LINK }}
          ucc: ${{ env.UCC_LINK }}
      - name: Test PARAM
        run: |
          git clone "${PARAM_LINK}" /tmp/param
          export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/ucc/lib:$LD_LIBRARY_PATH

          # Start PARAM comms.py with the options shared by every case below;
          # case-specific options are appended through "$@".
          run_comms() {
            /bin/bash ./test/start_test.sh /tmp/param/train/comms/pt/comms.py \
              --backend ucc --device cpu --b 4 --e 4M --c 1 "$@"
          }

          export TORCH_UCC_TEST_SIZE=4
          echo "PARAM-Comms Reduce w/ UCC"
          run_comms --collective reduce
          echo "PARAM-Comms Allreduce w/ UCC"
          run_comms --collective all_reduce
          echo "PARAM-Comms Alltoall w/ UCC"
          run_comms --collective all_to_all
          echo "PARAM-Comms Alltoallv w/ UCC"
          run_comms --collective all_to_allv
          echo "PARAM-Comms Broadcast w/ UCC"
          run_comms --collective broadcast
          echo "PARAM-Comms Allgather w/ UCC"
          run_comms --collective all_gather
          echo "PARAM-Comms Allgather_base w/ UCC"
          run_comms --collective all_gather_base
          # FIXME: disabled as UCC does not support gather on CPU tensor yet
          # echo "PARAM-Comms Gather w/ UCC"
          # run_comms --collective gather
          echo "PARAM-Comms Quantized Allreduce w/ UCC (use of c10d future)"
          run_comms --bitwidth 16 --collective all_reduce
          echo "PARAM-Comms Non-blocking Allreduce w/ UCC"
          run_comms --z 0 --collective all_reduce
          echo "PARAM-Comms Non-blocking Alltoall w/ UCC"
          run_comms --z 0 --collective all_to_all
          echo "PARAM-Comms Non-blocking Alltoallv w/ UCC"
          run_comms --z 0 --collective all_to_allv
          echo "PARAM-Comms Pt2pt w/ UCC"
          # The point-to-point case uses exactly two ranks (source 0, destination 1).
          export TORCH_UCC_TEST_SIZE=2
          run_comms --pt2pt one2one --src-ranks 0 --dst-ranks 1