diff --git a/.github/workflows/ci-gh-cpu-build-and-test.yml b/.github/workflows/ci-gh-cpu-build-and-test.yml index 7ee174639..1c224e120 100644 --- a/.github/workflows/ci-gh-cpu-build-and-test.yml +++ b/.github/workflows/ci-gh-cpu-build-and-test.yml @@ -12,6 +12,10 @@ on: jobs: build-cpu: + strategy: + fail-fast: false + matrix: + ucx-config: [ucx, no-ucx] if: ${{ github.repository_owner == 'nv-legate' }} uses: ./.github/workflows/gh-build.yml @@ -21,6 +25,7 @@ jobs: runs-on: ${{ contains(github.repository, 'nv-legate/legate.core') && 'linux-amd64-cpu4' || 'ubuntu-latest' }} sha: ${{ github.sha }} build-type: ci + ucx-config: ${{ matrix.ucx-config }} test-cpu: needs: @@ -28,10 +33,9 @@ jobs: strategy: fail-fast: false matrix: - include: - - { name: Pytest Unit Tests, test-scope: unit } - - { name: mypy, test-scope: mypy } - name: ${{ matrix.name }} + ucx-config: [ucx, no-ucx] + test-config: [{ name: Pytest Unit Tests, test-scope: unit }, { name: mypy, test-scope: mypy }] + name: ${{ matrix.test-config.name }} if: ${{ github.repository_owner == 'nv-legate' }} uses: ./.github/workflows/gh-test.yml @@ -40,6 +44,7 @@ jobs: repos-name: ${{ github.event.repository.name }} runs-on: ${{ contains(github.repository, 'nv-legate/legate.core') && 'linux-amd64-32cpu' || 'ubuntu-latest' }} sha: ${{ github.sha }} + ucx-config: ${{ matrix.ucx-config }} test-scope: ${{ matrix.test-scope }} cleanup: diff --git a/.github/workflows/ci-gh-gpu-build-and-test.yml b/.github/workflows/ci-gh-gpu-build-and-test.yml index b0c32f1de..01b85ffc7 100644 --- a/.github/workflows/ci-gh-gpu-build-and-test.yml +++ b/.github/workflows/ci-gh-gpu-build-and-test.yml @@ -12,6 +12,10 @@ on: jobs: build-gpu: + strategy: + fail-fast: false + matrix: + ucx-config: [ucx, no-ucx] if: ${{ github.repository_owner == 'nv-legate' }} uses: ./.github/workflows/gh-build.yml @@ -21,6 +25,7 @@ jobs: runs-on: ${{ contains(github.repository, 'nv-legate/legate.core') && 'linux-amd64-32cpu' || 'ubuntu-latest' }} sha: ${{ github.sha }} build-type: ci + ucx-config: ${{ matrix.ucx-config }} test-gpu: needs: @@ -28,10 +33,11 @@ jobs: strategy: fail-fast: false matrix: - include: - - { name: Pytest Unit Tests, test-scope: unit, runner: linux-amd64-gpu-v100-latest-1 } - - { name: Pytest Unit Tests, test-scope: unit, runner: linux-amd64-2gpu } - - { name: mypy, test-scope: mypy, runner: 'linux-amd64-gpu-v100-latest-1' } + ucx-config: [ucx, no-ucx] + test-config: [{ name: Pytest Unit Tests, test-scope: unit, runner: linux-amd64-gpu-v100-latest-1 }, + { name: Pytest Unit Tests, test-scope: unit, runner: linux-amd64-2gpu }, + { name: mypy, test-scope: mypy, runner: 'linux-amd64-gpu-v100-latest-1' }] + name: ${{ matrix.test-config.name }} name: ${{ matrix.name }} if: ${{ github.repository_owner == 'nv-legate' }} uses: @@ -39,8 +45,9 @@ jobs: with: build-target: gpu repos-name: ${{ github.event.repository.name }} - runs-on: ${{ matrix.runner }} + runs-on: ${{ matrix.test-config.runner }} sha: ${{ github.sha }} + ucx-config: ${{ matrix.ucx-config }} test-scope: ${{ matrix.test-scope }} cleanup: diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index d4a0e216f..84df0ce79 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -21,17 +21,21 @@ on: required: true type: string description: One of ci / release + ucx-config: + required: true + type: string env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} BASE_IMAGE: rapidsai/devcontainers:23.06-cpp-mambaforge-ubuntu22.04 - IMAGE_NAME: ${{ inputs.repos-name }}-${{ inputs.build-target }} + IMAGE_NAME: ${{ inputs.repos-name }}-${{ inputs.build-target }}-${{ inputs.ucx-config }} USE_CUDA: ${{ (inputs.build-target == 'cpu' && 'OFF') || 'ON' }} + UCX_ENABLED: ${{ (inputs.ucx-config == 'ucx' && 'ON') || 'OFF' }} PUSH_IMAGE: ${{ inputs.build-type == 'ci' && 'true' || 'false'}} jobs: build: - name: build-${{ inputs.build-target }}-sub-workflow + name: build-${{ inputs.build-target }}-${{ inputs.ucx-config }}-sub-workflow permissions: id-token: write # This is required for configure-aws-credentials @@ -106,5 +110,5 @@ jobs: - name: Upload build artifacts uses: actions/upload-artifact@v3 with: - name: "${{ inputs.repos-name }}-${{ inputs.build-target }}-${{ inputs.sha }}" + name: "${{ inputs.repos-name }}-${{ inputs.build-target }}-${{ inputs.ucx-config }}-${{ inputs.sha }}" path: artifacts diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml index 5d9c49872..8452fd71c 100644 --- a/.github/workflows/gh-test.yml +++ b/.github/workflows/gh-test.yml @@ -15,6 +15,9 @@ on: sha: required: true type: string + ucx-config: + required: true + type: string test-scope: required: true type: string @@ -23,10 +26,11 @@ jobs: test: if: github.repository_owner == 'nv-legate' name: test-${{ inputs.build-target }}-sub-workflow + name: test-${{ inputs.build-target }}-${{ inputs.ucx-config }}-sub-workflow runs-on: ${{ inputs.runs-on }} container: options: -u root - image: ghcr.io/nv-legate/${{ inputs.repos-name }}-${{ inputs.build-target }}:${{ inputs.sha }} + image: ghcr.io/nv-legate/${{ inputs.repos-name }}-${{ inputs.build-target }}-${{ inputs.ucx-config }}:${{ inputs.sha }} env: PYTHONDONTWRITEBYTECODE: 1 NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} diff --git a/continuous_integration/build-docker-image b/continuous_integration/build-docker-image index e71c05f0b..3b7105026 100755 --- a/continuous_integration/build-docker-image +++ b/continuous_integration/build-docker-image @@ -34,6 +34,13 @@ done set -e +if [ "$UCX_ENABLED" = "ON" ]; then + sed -i 's@DLegion_NETWORKS=@DLegion_NETWORKS=ucx@g' $SOURCE_DIR/continuous_integration/home/coder/.local/bin/build-legate-cpp + sed -i '\/conda-build\/conda_build_config\.yaml/a ucx:\n\ - ">=1.14"' $SOURCE_DIR/continuous_integration/home/coder/.local/bin/build-legate-conda + sed -i 's/\-\-no\-ucx/\-\-ucx /g' $SOURCE_DIR/continuous_integration/home/coder/.local/bin/conda-utils + sed -i '/-c conda-forge pytest/s/$/ ucx openmpi/g' $SOURCE_DIR/continuous_integration/home/coder/.local/bin/run-test-or-analysis +fi + # Avoid build errors due to a missing .creds folder mkdir -p "$SOURCE_DIR/.creds" diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index 57aedc4fe..6e74005ad 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -1097,7 +1097,10 @@ def test_default_multi_rank( result = m.cmd_bgwork(config, system, launcher) - assert result == ("-ll:bgwork", "2") + if "ucx" in install_info.networks: + assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1") + else: + assert result == ("-ll:bgwork", "2") @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) @pytest.mark.parametrize("rank", ("0", "1", "2")) @@ -1126,7 +1129,10 @@ def test_utility_1_multi_rank_no_launcher( result = m.cmd_bgwork(config, system, launcher) - assert result == ("-ll:bgwork", "2") + if "ucx" in install_info.networks: + assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1") + else: + assert result == ("-ll:bgwork", "2") @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) @pytest.mark.parametrize("rank", ("0", "1", "2")) @@ -1154,7 +1160,10 @@ def test_utility_1_multi_rank_with_launcher( result = m.cmd_bgwork(config, system, launcher) - assert result == ("-ll:bgwork", "2") + if "ucx" in install_info.networks: + assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1") + else: + assert result == ("-ll:bgwork", "2") @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) def test_utility_1_multi_rank_with_launcher_and_ucx( @@ -1183,7 +1192,10 @@ def test_utility_n_multi_rank_no_launcher( result = m.cmd_bgwork(config, system, launcher) - assert result == ("-ll:bgwork", value) + if "ucx" in install_info.networks: + assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1") + else: + assert result == ("-ll:bgwork", value) @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) @pytest.mark.parametrize("rank", ("0", "1", "2")) @@ -1200,7 +1212,10 @@ def test_utility_n_multi_rank_no_launcher_and_ucx( result = m.cmd_bgwork(config, system, launcher) install_info.networks[:] = networks_orig[:] - assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1") + if "ucx" in install_info.networks: + assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1") + else: + assert result == ("-ll:bgwork", value) @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) @pytest.mark.parametrize("value", ("2", "3", "10")) @@ -1213,7 +1228,10 @@ def test_utility_n_multi_rank_with_launcher( result = m.cmd_bgwork(config, system, launcher) - assert result == ("-ll:bgwork", value) + if "ucx" in install_info.networks: + assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1") + else: + assert result == ("-ll:bgwork", value) @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) @pytest.mark.parametrize("value", ("2", "3", "10"))