Skip to content

Commit

Permalink
Enable UCX on TOT
Browse files Browse the repository at this point in the history
  • Loading branch information
mag1cp1n authored Oct 23, 2023
1 parent 356284d commit 0c7ff12
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 19 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/ci-gh-cpu-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ on:

jobs:
build-cpu:
strategy:
fail-fast: false
matrix:
ucx-config: [ucx, no-ucx]
if: ${{ github.repository_owner == 'nv-legate' }}
uses:
./.github/workflows/gh-build.yml
Expand All @@ -21,17 +25,17 @@ jobs:
runs-on: ${{ contains(github.repository, 'nv-legate/legate.core') && 'linux-amd64-cpu4' || 'ubuntu-latest' }}
sha: ${{ github.sha }}
build-type: ci
ucx-config: ${{ matrix.ucx-config }}

test-cpu:
needs:
- build-cpu
strategy:
fail-fast: false
matrix:
include:
- { name: Pytest Unit Tests, test-scope: unit }
- { name: mypy, test-scope: mypy }
name: ${{ matrix.name }}
ucx-config: [ucx, no-ucx]
test-config: [{ name: Pytest Unit Tests, test-scope: unit }, { name: mypy, test-scope: mypy }]
name: ${{ matrix.test-config.name }}
if: ${{ github.repository_owner == 'nv-legate' }}
uses:
./.github/workflows/gh-test.yml
Expand All @@ -40,6 +44,7 @@ jobs:
repos-name: ${{ github.event.repository.name }}
runs-on: ${{ contains(github.repository, 'nv-legate/legate.core') && 'linux-amd64-32cpu' || 'ubuntu-latest' }}
sha: ${{ github.sha }}
ucx-config: ${{ matrix.ucx-config }}
# test-scope lives inside each matrix.test-config entry, so it must be read
# through test-config; a bare matrix.test-scope is not defined by this matrix.
test-scope: ${{ matrix.test-config.test-scope }}

cleanup:
Expand Down
17 changes: 12 additions & 5 deletions .github/workflows/ci-gh-gpu-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ on:

jobs:
build-gpu:
strategy:
fail-fast: false
matrix:
ucx-config: [ucx, no-ucx]
if: ${{ github.repository_owner == 'nv-legate' }}
uses:
./.github/workflows/gh-build.yml
Expand All @@ -21,26 +25,29 @@ jobs:
runs-on: ${{ contains(github.repository, 'nv-legate/legate.core') && 'linux-amd64-32cpu' || 'ubuntu-latest' }}
sha: ${{ github.sha }}
build-type: ci
ucx-config: ${{ matrix.ucx-config }}

test-gpu:
needs:
- build-gpu
strategy:
fail-fast: false
matrix:
include:
- { name: Pytest Unit Tests, test-scope: unit, runner: linux-amd64-gpu-v100-latest-1 }
- { name: Pytest Unit Tests, test-scope: unit, runner: linux-amd64-2gpu }
- { name: mypy, test-scope: mypy, runner: 'linux-amd64-gpu-v100-latest-1' }
ucx-config: [ucx, no-ucx]
test-config: [{ name: Pytest Unit Tests, test-scope: unit, runner: linux-amd64-gpu-v100-latest-1 },
{ name: Pytest Unit Tests, test-scope: unit, runner: linux-amd64-2gpu },
{ name: mypy, test-scope: mypy, runner: 'linux-amd64-gpu-v100-latest-1' }]
name: ${{ matrix.test-config.name }}
name: ${{ matrix.name }}
if: ${{ github.repository_owner == 'nv-legate' }}
uses:
./.github/workflows/gh-test.yml
with:
build-target: gpu
repos-name: ${{ github.event.repository.name }}
runs-on: ${{ matrix.runner }}
runs-on: ${{ matrix.test-config.runner }}
sha: ${{ github.sha }}
ucx-config: ${{ matrix.ucx-config }}
# test-scope lives inside each matrix.test-config entry (alongside name and
# runner), so it must be read through test-config; a bare matrix.test-scope
# is not defined by this matrix.
test-scope: ${{ matrix.test-config.test-scope }}

cleanup:
Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/gh-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,21 @@ on:
required: true
type: string
description: One of ci / release
ucx-config:
  required: true
  type: string
  description: One of ucx / no-ucx

env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
BASE_IMAGE: rapidsai/devcontainers:23.06-cpp-mambaforge-ubuntu22.04
IMAGE_NAME: ${{ inputs.repos-name }}-${{ inputs.build-target }}
IMAGE_NAME: ${{ inputs.repos-name }}-${{ inputs.build-target }}-${{ inputs.ucx-config }}
USE_CUDA: ${{ (inputs.build-target == 'cpu' && 'OFF') || 'ON' }}
UCX_ENABLED: ${{ (inputs.ucx-config == 'ucx' && 'ON') || 'OFF' }}
PUSH_IMAGE: ${{ inputs.build-type == 'ci' && 'true' || 'false'}}

jobs:
build:
name: build-${{ inputs.build-target }}-sub-workflow
name: build-${{ inputs.build-target }}-${{ inputs.ucx-config }}-sub-workflow

permissions:
id-token: write # This is required for configure-aws-credentials
Expand Down Expand Up @@ -106,5 +110,5 @@ jobs:
- name: Upload build artifacts
uses: actions/upload-artifact@v3
with:
name: "${{ inputs.repos-name }}-${{ inputs.build-target }}-${{ inputs.sha }}"
name: "${{ inputs.repos-name }}-${{ inputs.build-target }}-${{ inputs.ucx-config }}-${{ inputs.sha }}"
path: artifacts
6 changes: 5 additions & 1 deletion .github/workflows/gh-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ on:
sha:
required: true
type: string
ucx-config:
  required: true
  type: string
  description: One of ucx / no-ucx
test-scope:
required: true
type: string
Expand All @@ -23,10 +26,11 @@ jobs:
test:
if: github.repository_owner == 'nv-legate'
name: test-${{ inputs.build-target }}-sub-workflow
name: test-${{ inputs.build-target }}-${{ inputs.ucx-config }}-sub-workflow
runs-on: ${{ inputs.runs-on }}
container:
options: -u root
image: ghcr.io/nv-legate/${{ inputs.repos-name }}-${{ inputs.build-target }}:${{ inputs.sha }}
image: ghcr.io/nv-legate/${{ inputs.repos-name }}-${{ inputs.build-target }}-${{ inputs.ucx-config }}:${{ inputs.sha }}
env:
PYTHONDONTWRITEBYTECODE: 1
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
Expand Down
7 changes: 7 additions & 0 deletions continuous_integration/build-docker-image
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ done

set -e

# When UCX_ENABLED is "ON", patch the CI helper scripts in place before the
# image is built. Every path is quoted so that a SOURCE_DIR containing spaces
# or glob characters is passed to sed as a single argument.
if [ "$UCX_ENABLED" = "ON" ]; then
    # Turn "DLegion_NETWORKS=" into "DLegion_NETWORKS=ucx".
    sed -i 's@DLegion_NETWORKS=@DLegion_NETWORKS=ucx@g' "$SOURCE_DIR/continuous_integration/home/coder/.local/bin/build-legate-cpp"
    # Append a ucx ">=1.14" entry after the line that mentions
    # conda-build/conda_build_config.yaml.
    sed -i '\/conda-build\/conda_build_config\.yaml/a ucx:\n\ - ">=1.14"' "$SOURCE_DIR/continuous_integration/home/coder/.local/bin/build-legate-conda"
    # Replace every "--no-ucx" with "--ucx ".
    sed -i 's/\-\-no\-ucx/\-\-ucx /g' "$SOURCE_DIR/continuous_integration/home/coder/.local/bin/conda-utils"
    # Append " ucx openmpi" to lines containing "-c conda-forge pytest".
    sed -i '/-c conda-forge pytest/s/$/ ucx openmpi/g' "$SOURCE_DIR/continuous_integration/home/coder/.local/bin/run-test-or-analysis"
fi

# Avoid build errors due to a missing .creds folder
mkdir -p "$SOURCE_DIR/.creds"

Expand Down
30 changes: 24 additions & 6 deletions tests/unit/legate/driver/test_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -1097,7 +1097,10 @@ def test_default_multi_rank(

result = m.cmd_bgwork(config, system, launcher)

assert result == ("-ll:bgwork", "2")
if "ucx" in install_info.networks:
assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1")
else:
assert result == ("-ll:bgwork", "2")

@pytest.mark.parametrize("rank_var", RANK_ENV_VARS)
@pytest.mark.parametrize("rank", ("0", "1", "2"))
Expand Down Expand Up @@ -1126,7 +1129,10 @@ def test_utility_1_multi_rank_no_launcher(

result = m.cmd_bgwork(config, system, launcher)

assert result == ("-ll:bgwork", "2")
if "ucx" in install_info.networks:
assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1")
else:
assert result == ("-ll:bgwork", "2")

@pytest.mark.parametrize("rank_var", RANK_ENV_VARS)
@pytest.mark.parametrize("rank", ("0", "1", "2"))
Expand Down Expand Up @@ -1154,7 +1160,10 @@ def test_utility_1_multi_rank_with_launcher(

result = m.cmd_bgwork(config, system, launcher)

assert result == ("-ll:bgwork", "2")
if "ucx" in install_info.networks:
assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1")
else:
assert result == ("-ll:bgwork", "2")

@pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun"))
def test_utility_1_multi_rank_with_launcher_and_ucx(
Expand Down Expand Up @@ -1183,7 +1192,10 @@ def test_utility_n_multi_rank_no_launcher(

result = m.cmd_bgwork(config, system, launcher)

assert result == ("-ll:bgwork", value)
if "ucx" in install_info.networks:
assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1")
else:
assert result == ("-ll:bgwork", value)

@pytest.mark.parametrize("rank_var", RANK_ENV_VARS)
@pytest.mark.parametrize("rank", ("0", "1", "2"))
Expand All @@ -1200,7 +1212,10 @@ def test_utility_n_multi_rank_no_launcher_and_ucx(
result = m.cmd_bgwork(config, system, launcher)
install_info.networks[:] = networks_orig[:]

assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1")
if "ucx" in install_info.networks:
assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1")
else:
assert result == ("-ll:bgwork", value)

@pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun"))
@pytest.mark.parametrize("value", ("2", "3", "10"))
Expand All @@ -1213,7 +1228,10 @@ def test_utility_n_multi_rank_with_launcher(

result = m.cmd_bgwork(config, system, launcher)

assert result == ("-ll:bgwork", value)
if "ucx" in install_info.networks:
assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1")
else:
assert result == ("-ll:bgwork", value)

@pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun"))
@pytest.mark.parametrize("value", ("2", "3", "10"))
Expand Down

0 comments on commit 0c7ff12

Please sign in to comment.