From 3064bc4f1738e4965b5e370dc857d6e984995917 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:10:09 +0100 Subject: [PATCH 01/80] Test Running NVTabular GPU tests with rapids runner --- .github/workflows/gpu-tests.yml | 20 ++++++++++---------- tox.ini | 13 +++++++------ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index a12bdad57f..f0b2ebef7e 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -16,19 +16,19 @@ concurrency: jobs: gpu-tests: - runs-on: 2GPU - + runs-on: linux-amd64-gpu-v100-latest-1 + container: + image: nvidia/cuda:11.8.0-base-ubuntu22.04 + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: - uses: actions/checkout@v3 with: fetch-depth: 0 + - name: Get Branch name + id: get-branch-name + uses: NVIDIA-Merlin/.github/actions/branch-name@main - name: Run tests run: | - ref_type=${{ github.ref_type }} - branch=main - if [[ $ref_type == "tag"* ]] - then - raw=$(git branch -r --contains ${{ github.ref_name }}) - branch=${raw/origin\/} - fi - cd ${{ github.workspace }}; tox -e test-gpu -- $branch + merlin_branch="${{ steps.get-branch-name.outputs.branch }}" + RAPIDS_VERSION=23.04 MERLIN_BRANCH="$merlin_branch" tox -e test-gpu diff --git a/tox.ini b/tox.ini index da0c1239a3..fce6923e6d 100644 --- a/tox.ini +++ b/tox.ini @@ -28,9 +28,8 @@ commands = setenv = TF_GPU_ALLOCATOR=cuda_malloc_async passenv = - OPAL_PREFIX - NR_USER CUDA_VISIBLE_DEVICES + NVIDIA_VISIBLE_DEVICES sitepackages=true ; Runs in: Internal Jenkins ; Runs GPU-based tests. 
@@ -42,10 +41,12 @@ deps = pytest pytest-cov commands = - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/dataloader.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/models.git - python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{posargs:main} + python -m pip install rmm-cu11=={env:RAPIDS_VERSION} cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} --extra-index-url=https://pypi.nvidia.com + python -m pip install --upgrade \ + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} \ + git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH:main} \ + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ python -m pytest --cov-report term --cov merlin -rxs tests/unit [testenv:test-merlin] From 1c93bb4f36f3b4a7b21a21e51e1fee80d861e633 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:12:30 +0100 Subject: [PATCH 02/80] Update label for runner --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index f0b2ebef7e..171c8000ac 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -16,7 +16,7 @@ concurrency: jobs: gpu-tests: - runs-on: linux-amd64-gpu-v100-latest-1 + runs-on: linux-amd64-gpu-p100-latest-1 container: image: nvidia/cuda:11.8.0-base-ubuntu22.04 env: From 826168b6c019a1675236dd00810d68185a24667a Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:16:31 +0100 Subject: [PATCH 03/80] Remove pull_request trigger --- .github/workflows/gpu-tests.yml | 7 +++---- 1 file changed, 3 
insertions(+), 4 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 171c8000ac..72327cc819 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -3,12 +3,11 @@ name: GPU Tests on: workflow_dispatch: push: - branches: [main] + branches: + - main + - pull-request/* tags: - "v[0-9]+.[0-9]+.[0-9]+" - pull_request: - branches: [main] - types: [opened, synchronize, reopened, closed] concurrency: group: ${{ github.workflow }}-${{ github.ref }} From 9df8ed66bbb84a6a4241a71c00f50243169c36ef Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:16:44 +0100 Subject: [PATCH 04/80] Add ops-bot.yaml --- .github/ops-bot.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .github/ops-bot.yaml diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml new file mode 100644 index 0000000000..794526b9a0 --- /dev/null +++ b/.github/ops-bot.yaml @@ -0,0 +1,4 @@ +# This file controls which features from the `ops-bot` repository below are enabled. 
+# - https://github.com/rapidsai/ops-bot + +copy_prs: true \ No newline at end of file From 67030c0886bf1d9fdcc9ea10c0312cb8a7b3e049 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:27:44 +0100 Subject: [PATCH 05/80] Setup python --- .github/workflows/gpu-tests.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 72327cc819..9919240a82 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -24,6 +24,19 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Install Ubuntu packages + run: | + sudo apt-get update -y + sudo apt-get install -y protobuf-compiler + - name: Install and upgrade python packages + run: | + python -m pip install --upgrade pip setuptools==59.4.0 wheel tox pybind11 + python -m pip uninstall protobuf -y + python -m pip install --no-binary=protobuf protobuf - name: Get Branch name id: get-branch-name uses: NVIDIA-Merlin/.github/actions/branch-name@main From ac8d5bd9f904c99ae508df5fc31da7e9f7f9f187 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:31:26 +0100 Subject: [PATCH 06/80] remove sudo --- .github/workflows/gpu-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 9919240a82..9e3076ea2e 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -30,8 +30,8 @@ jobs: python-version: 3.8 - name: Install Ubuntu packages run: | - sudo apt-get update -y - sudo apt-get install -y protobuf-compiler + apt-get update -y + apt-get install -y protobuf-compiler - name: Install and upgrade python packages run: | python -m pip install --upgrade pip setuptools==59.4.0 
wheel tox pybind11 From d680c08c53dea7728e3e788e1c6da348a12f9690 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:34:09 +0100 Subject: [PATCH 07/80] add build-essential --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 9e3076ea2e..0aa02d4849 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -31,7 +31,7 @@ jobs: - name: Install Ubuntu packages run: | apt-get update -y - apt-get install -y protobuf-compiler + apt-get install -y build-essential protobuf-compiler - name: Install and upgrade python packages run: | python -m pip install --upgrade pip setuptools==59.4.0 wheel tox pybind11 From 31d6afa6eaa17e6f5524103cc25b98e83fa97729 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:56:01 +0100 Subject: [PATCH 08/80] Update branch-name action version --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 0aa02d4849..a2d6ca9a4a 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -39,7 +39,7 @@ jobs: python -m pip install --no-binary=protobuf protobuf - name: Get Branch name id: get-branch-name - uses: NVIDIA-Merlin/.github/actions/branch-name@main + uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" From a9ec8d2b196368d62f2836bf567afa4662653a96 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 18:03:51 +0100 Subject: [PATCH 09/80] remove trailing slash --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini 
b/tox.ini index fce6923e6d..aac818206b 100644 --- a/tox.ini +++ b/tox.ini @@ -46,7 +46,7 @@ commands = git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} \ git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH:main} \ - git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} python -m pytest --cov-report term --cov merlin -rxs tests/unit [testenv:test-merlin] From 475b96b45efb96cf6342b44b2a2adba20949fdd8 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 18:22:21 +0100 Subject: [PATCH 10/80] Add git to installed packages --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index a2d6ca9a4a..dd84c3c8bc 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -31,7 +31,7 @@ jobs: - name: Install Ubuntu packages run: | apt-get update -y - apt-get install -y build-essential protobuf-compiler + apt-get install -y build-essential protobuf-compiler git - name: Install and upgrade python packages run: | python -m pip install --upgrade pip setuptools==59.4.0 wheel tox pybind11 From fef5b4e61a76ddeeef57b92fe45ff87393763ace Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 20:08:37 +0100 Subject: [PATCH 11/80] Use devel image for nvvm --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index dd84c3c8bc..595e2c5d87 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -17,7 +17,7 @@ jobs: gpu-tests: runs-on: linux-amd64-gpu-p100-latest-1 container: - 
image: nvidia/cuda:11.8.0-base-ubuntu22.04 + image: nvidia/cuda:11.8.0-devel-ubuntu22.04 env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: From 8fb29a32ca92687d9f0de28cdd4d56b8fc3b3597 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 20:08:46 +0100 Subject: [PATCH 12/80] Combine install to one line --- tox.ini | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tox.ini b/tox.ini index aac818206b..97b368d208 100644 --- a/tox.ini +++ b/tox.ini @@ -41,12 +41,14 @@ deps = pytest pytest-cov commands = - python -m pip install rmm-cu11=={env:RAPIDS_VERSION} cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} --extra-index-url=https://pypi.nvidia.com - python -m pip install --upgrade \ - git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ - git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} \ - git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH:main} \ - git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} + python -m pip install \ + cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} \ + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} \ + git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH:main} \ + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ + --extra-index-url=https://pypi.nvidia.com + python -m pytest --cov-report term --cov merlin -rxs tests/unit [testenv:test-merlin] From 8cf1bce6e4902aa847e16c09987e9c651e53250f Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 20:24:33 +0100 Subject: [PATCH 13/80] Add pip cache --- .github/workflows/gpu-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 595e2c5d87..d61f248568 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -28,6 +28,8 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 + cache: 'pip' + cache-dependency-path: requirements/*.txt - name: Install Ubuntu packages run: | apt-get update -y From 018b34a2e459034d5bc7b6a9c2fd72ea87af7003 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 20:24:43 +0100 Subject: [PATCH 14/80] Update deps for gpu tox env --- tox.ini | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tox.ini b/tox.ini index 97b368d208..bb93a468ac 100644 --- a/tox.ini +++ b/tox.ini @@ -37,9 +37,7 @@ sitepackages=true ; and other gpu-specific libraries that we can enxpect will always exist. Thus, we don't need ; to install requirements.txt yet. As we get better at python environment isolation, we will ; need to add some back. 
-deps = - pytest - pytest-cov +deps = -rrequirements/test.txt commands = python -m pip install \ cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} \ From 31f8fb47d7ca02dad6dfc280d383e9418ff7d531 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 20:31:20 +0100 Subject: [PATCH 15/80] Move package install earlier --- .github/workflows/gpu-tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index d61f248568..43ecd71e7c 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -24,16 +24,16 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 + - name: Install Ubuntu packages + run: | + apt-get update -y + apt-get install -y build-essential protobuf-compiler git lsb-release - name: Set up Python 3.8 uses: actions/setup-python@v4 with: python-version: 3.8 cache: 'pip' cache-dependency-path: requirements/*.txt - - name: Install Ubuntu packages - run: | - apt-get update -y - apt-get install -y build-essential protobuf-compiler git - name: Install and upgrade python packages run: | python -m pip install --upgrade pip setuptools==59.4.0 wheel tox pybind11 From fd581896d88bd07c990f54f544811d7e8056076e Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 2 Jun 2023 20:40:06 +0100 Subject: [PATCH 16/80] disable cache --- .github/workflows/gpu-tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 43ecd71e7c..58eedfb677 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -32,8 +32,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - cache: 'pip' - cache-dependency-path: requirements/*.txt - name: Install and upgrade python packages run: | python -m pip install --upgrade pip 
setuptools==59.4.0 wheel tox pybind11 From dab79d205c3a543cb7b5ebce630e48352dfefa74 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:51:03 +0100 Subject: [PATCH 17/80] Pass keyword argument for axis in dataframe any method --- nvtabular/ops/categorify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py index 2f4285738f..7fbfa1de77 100644 --- a/nvtabular/ops/categorify.py +++ b/nvtabular/ops/categorify.py @@ -1025,7 +1025,7 @@ def _top_level_groupby(df, options: FitOptions = None, spill=True): del df_gb # Extract null groups into gb_null - isnull = gb.isnull().any(1) + isnull = gb.isnull().any(axis=1) gb_null = gb[~isnull] gb = gb[isnull] if not len(gb_null): From 12caac9ba943c3c6ccf5426b669972cab6aee188 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:54:18 +0100 Subject: [PATCH 18/80] Use Distributed helper for client fixture --- tests/conftest.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 2598ced640..0ca0ad8748 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -52,10 +52,10 @@ def assert_eq(a, b, *args, **kwargs): import pytest from asvdb import ASVDb, BenchmarkInfo, utils -from dask.distributed import Client, LocalCluster from numba import cuda import nvtabular +from merlin.core.utils import Distributed from merlin.dag.node import iter_nodes REPO_ROOT = Path(__file__).parent.parent @@ -97,8 +97,9 @@ def assert_eq(a, b, *args, **kwargs): @pytest.fixture(scope="module") def client(): - cluster = LocalCluster(n_workers=2) - client = Client(cluster) + distributed = Distributed(n_workers=2) + cluster = distributed.cluster + client = distributed.client yield client client.close() cluster.close() From b54daa414a084a8ce1c284422fc19692fabd668e Mon Sep 17 00:00:00 2001 
From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:54:37 +0100 Subject: [PATCH 19/80] Add nvidia-cudnn-cu11 to gpu test env --- tox.ini | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index bb93a468ac..756ae45a4a 100644 --- a/tox.ini +++ b/tox.ini @@ -37,7 +37,9 @@ sitepackages=true ; and other gpu-specific libraries that we can enxpect will always exist. Thus, we don't need ; to install requirements.txt yet. As we get better at python environment isolation, we will ; need to add some back. -deps = -rrequirements/test.txt +deps = + nvidia-cudnn-cu11==8.6.0.163 + -rrequirements/test.txt commands = python -m pip install \ cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} \ From 930e0358a6bbb73af89b6cd5081822f695c12841 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:54:52 +0100 Subject: [PATCH 20/80] Add posargs to gpu test env --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 756ae45a4a..4ad055a1d7 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,7 @@ commands = git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ --extra-index-url=https://pypi.nvidia.com - python -m pytest --cov-report term --cov merlin -rxs tests/unit + python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} [testenv:test-merlin] ; Runs in: Internal Jenkins From 0e775a77ab21998b6115660c5a395e6c00e24950 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:55:03 +0100 Subject: [PATCH 21/80] Add ops to NVT import --- nvtabular/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nvtabular/__init__.py b/nvtabular/__init__.py index 71d597b6cc..1759cde183 100644 --- a/nvtabular/__init__.py +++ b/nvtabular/__init__.py @@ -21,8 +21,7 @@ from 
merlin.core import dispatch, utils # noqa from merlin.dag import ColumnSelector from merlin.schema import ColumnSchema, Schema -from nvtabular import workflow # noqa -from nvtabular import _version +from nvtabular import _version, ops, workflow # noqa # suppress some warnings with cudf warning about column ordering with dlpack # and numba warning about deprecated environment variables From 89e980f8ea57a3bceeb09689718bb01741c11547 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:55:11 +0100 Subject: [PATCH 22/80] Use tmpdir for Categorify out_path in test_tf4rec --- tests/unit/test_tf4rec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_tf4rec.py b/tests/unit/test_tf4rec.py index 46dd66054a..737515e928 100644 --- a/tests/unit/test_tf4rec.py +++ b/tests/unit/test_tf4rec.py @@ -14,7 +14,7 @@ NUM_ROWS = 10000 -def test_tf4rec(): +def test_tf4rec(tmpdir): inputs = { "user_session": np.random.randint(1, 10000, NUM_ROWS), "product_id": np.random.randint(1, 51996, NUM_ROWS), @@ -29,7 +29,7 @@ def test_tf4rec(): cat_feats = ( ["user_session", "product_id", "category_id"] - >> nvt.ops.Categorify() + >> nvt.ops.Categorify(out_path=str(tmpdir)) >> nvt.ops.LambdaOp(lambda col: col + 1) ) From 19a59ea8fe3939b199b02fa7399d746d0bbe82c4 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:55:28 +0100 Subject: [PATCH 23/80] Install libcudnn8 for Tensorflow support on GPU --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 58eedfb677..c4d070bc2d 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -27,7 +27,7 @@ jobs: - name: Install Ubuntu packages run: | apt-get update -y - apt-get install -y build-essential protobuf-compiler git lsb-release + 
apt-get install -y build-essential protobuf-compiler git lsb-release libcudnn8 - name: Set up Python 3.8 uses: actions/setup-python@v4 with: From 469fb7d43d034de7cfd814ba2e7855a99dbb9817 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:56:44 +0100 Subject: [PATCH 24/80] Get visible devices from env var if set --- ...3-Running-on-multiple-GPUs-or-on-CPU.ipynb | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/examples/03-Running-on-multiple-GPUs-or-on-CPU.ipynb b/examples/03-Running-on-multiple-GPUs-or-on-CPU.ipynb index aba2647567..3c90574ff5 100644 --- a/examples/03-Running-on-multiple-GPUs-or-on-CPU.ipynb +++ b/examples/03-Running-on-multiple-GPUs-or-on-CPU.ipynb @@ -27,6 +27,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "77464844", "metadata": {}, @@ -53,6 +54,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1c5598ae", "metadata": {}, @@ -92,6 +94,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "63ac0cf2", "metadata": {}, @@ -100,6 +103,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4def0005", "metadata": {}, @@ -123,6 +127,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d7c3f9ea", "metadata": {}, @@ -148,6 +153,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "728c3009", "metadata": {}, @@ -176,11 +182,15 @@ "\n", "# Deploy a Single-Machine Multi-GPU Cluster\n", "protocol = \"tcp\" # \"tcp\" or \"ucx\"\n", + "\n", "if numba.cuda.is_available():\n", " NUM_GPUS = list(range(len(numba.cuda.gpus)))\n", "else:\n", " NUM_GPUS = []\n", - "visible_devices = \",\".join([str(n) for n in NUM_GPUS]) # Delect devices to place workers\n", + "try:\n", + " visible_devices = os.environ[\"CUDA_VISIBLE_DEVICES\"]\n", + "except KeyError:\n", + " visible_devices = \",\".join([str(n) for n in NUM_GPUS]) # Delect devices to place workers\n", "device_limit_frac = 0.7 # Spill 
GPU-Worker memory to host at this limit.\n", "device_pool_frac = 0.8\n", "part_mem_frac = 0.15\n", @@ -206,6 +216,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d14dc098", "metadata": {}, @@ -242,6 +253,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0576affe", "metadata": {}, @@ -589,6 +601,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "94ef0024", "metadata": {}, @@ -599,6 +612,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "768fc24e", "metadata": {}, @@ -622,6 +636,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "61785127", "metadata": {}, @@ -678,6 +693,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "01ea40bb", "metadata": {}, @@ -686,6 +702,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "987f3274", "metadata": {}, @@ -714,6 +731,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b06c962e", "metadata": {}, @@ -745,6 +763,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d28ae761", "metadata": {}, @@ -755,6 +774,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4e07864d", "metadata": {}, @@ -763,6 +783,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8f971a22", "metadata": {}, From e2fbc8ec6daeae51af76903168e4e14ba4929e37 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 12:54:28 +0100 Subject: [PATCH 25/80] Remove n_workers from Distributed in conftest --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0ca0ad8748..3c3ae4373b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -97,7 +97,7 @@ def assert_eq(a, b, *args, **kwargs): @pytest.fixture(scope="module") def client(): - distributed = Distributed(n_workers=2) + distributed = Distributed() cluster = distributed.cluster client = distributed.client yield client From 
5d4c6e7082b561d84c6b94d5a8e0769f4bcf57be Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Mon, 5 Jun 2023 14:38:04 +0100 Subject: [PATCH 26/80] install libcudnn8 for cuda 11.8 --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index c4d070bc2d..db5f4c7cea 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -27,7 +27,7 @@ jobs: - name: Install Ubuntu packages run: | apt-get update -y - apt-get install -y build-essential protobuf-compiler git lsb-release libcudnn8 + apt-get install -y build-essential protobuf-compiler git lsb-release 'libcudnn8=*cuda11.8' - name: Set up Python 3.8 uses: actions/setup-python@v4 with: From 95a615a04ac76615dbd96853ab5c44e0bd6b5e07 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 10:37:01 +0100 Subject: [PATCH 27/80] Run tests outside of tox --- .github/workflows/gpu-tests.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index db5f4c7cea..97ac17ff68 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -43,4 +43,13 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - RAPIDS_VERSION=23.04 MERLIN_BRANCH="$merlin_branch" tox -e test-gpu + rapids_version=23.04 + python -m pip instll -r requirements/test.txt + python -m pip install \ + nvidia-cudnn-cu11==8.6.0.163 \ + cudf-cu11==${rapids_version} dask-cudf-cu11==${rapids_version} \ + git+https://github.com/NVIDIA-Merlin/core.git@${merlin_branch} \ + git+https://github.com/NVIDIA-Merlin/dataloader.git@${merlin_branch} \ + git+https://github.com/NVIDIA-Merlin/models.git@${merlin_branch} \ + 
git+https://github.com/NVIDIA-Merlin/core.git@${merlin_branch} \ + --extra-index-url=https://pypi.nvidia.com From 71dfc22c77662208bf2572613491b5fd9f04e276 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 10:44:58 +0100 Subject: [PATCH 28/80] Correct install name --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 97ac17ff68..878f6dd0de 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -44,7 +44,7 @@ jobs: run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" rapids_version=23.04 - python -m pip instll -r requirements/test.txt + python -m pip install -r requirements/test.txt python -m pip install \ nvidia-cudnn-cu11==8.6.0.163 \ cudf-cu11==${rapids_version} dask-cudf-cu11==${rapids_version} \ From 78be2b652f718182c323bc74e597b51863eaece9 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 10:58:29 +0100 Subject: [PATCH 29/80] Add call to pytest --- .github/workflows/gpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 878f6dd0de..470e49c4f6 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -53,3 +53,4 @@ jobs: git+https://github.com/NVIDIA-Merlin/models.git@${merlin_branch} \ git+https://github.com/NVIDIA-Merlin/core.git@${merlin_branch} \ --extra-index-url=https://pypi.nvidia.com + python -m pytest -rsx tests/unit From bc89308d778d814b395ed1370f62e3cebda81f69 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 11:48:09 +0100 Subject: [PATCH 30/80] Add marker for loader --- pytest.ini | 3 +++ tests/conftest.py | 7 +++++++ 2 files changed, 10 insertions(+) create mode 
100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000..c94bf54283 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + loader: mark as testing the dataloader diff --git a/tests/conftest.py b/tests/conftest.py index 3c3ae4373b..f4826695d2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -371,3 +371,10 @@ def devices(request): @pytest.fixture def report(request): return request.config.getoption("--report") + +def pytest_collection_modifyitems(items): + for item in items: + path = item.location[0] + + if "/loader/" in path: + item.add_marker(getattr(pytest.mark, "loader")) From b3aebb79a4302d222512375d5e3c2343ffc8230a Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 11:48:24 +0100 Subject: [PATCH 31/80] Run loader tests separately --- .github/workflows/gpu-tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 470e49c4f6..00dbdf07fd 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -53,4 +53,6 @@ jobs: git+https://github.com/NVIDIA-Merlin/models.git@${merlin_branch} \ git+https://github.com/NVIDIA-Merlin/core.git@${merlin_branch} \ --extra-index-url=https://pypi.nvidia.com - python -m pytest -rsx tests/unit + python -m pytest -rsx tests/unit/loader/test_tf_dataloader.py + python -m pytest -rsx tests/unit/loader/test_torch_dataloader.py + python -m pytest -rsx -m 'not loader' tests/unit From 7b45ebc361114892f05a6bdab4f61eca167446a7 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 11:50:28 +0100 Subject: [PATCH 32/80] Remove newline from conftest.py --- tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/conftest.py b/tests/conftest.py index f4826695d2..2d89955fea 100644 --- a/tests/conftest.py +++ 
b/tests/conftest.py @@ -372,6 +372,7 @@ def devices(request): def report(request): return request.config.getoption("--report") + def pytest_collection_modifyitems(items): for item in items: path = item.location[0] From 73e058147bfe48c14d83c77665f5a92c048224f0 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 12:49:43 +0100 Subject: [PATCH 33/80] Add tensorflow marker --- pytest.ini | 1 + tests/conftest.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/pytest.ini b/pytest.ini index c94bf54283..a6ad5ce6c6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,4 @@ [pytest] markers = loader: mark as testing the dataloader + tensorflow: mark as using tensorflow \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 2d89955fea..c3c8dcd5b4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -379,3 +379,6 @@ def pytest_collection_modifyitems(items): if "/loader/" in path: item.add_marker(getattr(pytest.mark, "loader")) + + if "test_tf_" in path: + item.add_marker(getattr(pytest.mark, "tensorflow")) From 5793541d75ef91518c45ebd49a8ef065212144ab Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 12:49:49 +0100 Subject: [PATCH 34/80] Run test_tf_dataloader separately --- tox.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 4ad055a1d7..963d1a5013 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,8 @@ commands = git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ --extra-index-url=https://pypi.nvidia.com - python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} + python -m pytest --cov-report term --cov merlin -rxs tests/unit/loader/test_tf_dataloader.py + python -m pytest --cov-report term --cov merlin -rxs -m 'not (loader and tensorflow)' tests/unit [testenv:test-merlin] ; Runs in: Internal Jenkins From 
93a9932a263c871bd8b1e5accdcdf558d91ac65c Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 12:50:57 +0100 Subject: [PATCH 35/80] install current project --- .github/workflows/gpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 00dbdf07fd..dd027f5dfa 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -44,6 +44,7 @@ jobs: run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" rapids_version=23.04 + python -m pip install . python -m pip install -r requirements/test.txt python -m pip install \ nvidia-cudnn-cu11==8.6.0.163 \ From fcd1a4329b93673c61883b46ba17aec3f7b688e3 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 12:51:10 +0100 Subject: [PATCH 36/80] run torch tests alongside the rest --- .github/workflows/gpu-tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index dd027f5dfa..828e41d4bd 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -55,5 +55,4 @@ jobs: git+https://github.com/NVIDIA-Merlin/core.git@${merlin_branch} \ --extra-index-url=https://pypi.nvidia.com python -m pytest -rsx tests/unit/loader/test_tf_dataloader.py - python -m pytest -rsx tests/unit/loader/test_torch_dataloader.py - python -m pytest -rsx -m 'not loader' tests/unit + python -m pytest -rsx -m 'not (loader and tensorflow)' tests/unit From 615ec7bc8f1aad30057fcf7e6213e6d264f97cd6 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 12:56:38 +0100 Subject: [PATCH 37/80] Reformat conftest.py --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 
c3c8dcd5b4..5e8b48f102 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -380,5 +380,5 @@ def pytest_collection_modifyitems(items): if "/loader/" in path: item.add_marker(getattr(pytest.mark, "loader")) - if "test_tf_" in path: + if "test_tf_" in path: item.add_marker(getattr(pytest.mark, "tensorflow")) From a6e0d4900f8ffcfdfbf02d4ea8506a34de3f759f Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 20:17:51 +0100 Subject: [PATCH 38/80] Add marker for ops --- pytest.ini | 3 ++- tests/conftest.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index a6ad5ce6c6..acdc393ec2 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,5 @@ [pytest] markers = loader: mark as testing the dataloader - tensorflow: mark as using tensorflow \ No newline at end of file + tensorflow: mark as using tensorflow + ops: mark as tesing an operator diff --git a/tests/conftest.py b/tests/conftest.py index 5e8b48f102..1b3bf665f2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -380,5 +380,8 @@ def pytest_collection_modifyitems(items): if "/loader/" in path: item.add_marker(getattr(pytest.mark, "loader")) + if "/ops/" in path: + item.add_marker(getattr(pytest.mark, "ops")) + if "test_tf_" in path: item.add_marker(getattr(pytest.mark, "tensorflow")) From bf407592722e842338e197bc04663ee68d54e144 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 21:29:14 +0100 Subject: [PATCH 39/80] Set num thread env vars --- tox.ini | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 963d1a5013..1239125cc5 100644 --- a/tox.ini +++ b/tox.ini @@ -27,6 +27,10 @@ commands = [testenv:test-gpu] setenv = TF_GPU_ALLOCATOR=cuda_malloc_async + MKL_NUM_THREADS=1 + OMP_NUM_THREADS=1 + TF_NUM_INTEROP_THREADS=1 + TF_NUM_INTRAOP_THREADS=1 passenv = CUDA_VISIBLE_DEVICES 
NVIDIA_VISIBLE_DEVICES @@ -49,8 +53,9 @@ commands = git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ --extra-index-url=https://pypi.nvidia.com - python -m pytest --cov-report term --cov merlin -rxs tests/unit/loader/test_tf_dataloader.py - python -m pytest --cov-report term --cov merlin -rxs -m 'not (loader and tensorflow)' tests/unit + ; python -m pytest --cov-report term --cov merlin -rxs tests/unit/loader/test_tf_dataloader.py + ; python -m pytest --cov-report term --cov merlin -rxs -m '(loader and tensorflow) or ops' tests/unit + python -m pytest --cov-report term --cov merlin -rxs tests/unit [testenv:test-merlin] ; Runs in: Internal Jenkins From e00790b7258d8bdcb1fed4b521ccd06a538dfeb7 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 21:29:37 +0100 Subject: [PATCH 40/80] Run tests in tox --- .github/workflows/gpu-tests.yml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 828e41d4bd..05e39404a6 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -43,16 +43,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - rapids_version=23.04 - python -m pip install . 
- python -m pip install -r requirements/test.txt - python -m pip install \ - nvidia-cudnn-cu11==8.6.0.163 \ - cudf-cu11==${rapids_version} dask-cudf-cu11==${rapids_version} \ - git+https://github.com/NVIDIA-Merlin/core.git@${merlin_branch} \ - git+https://github.com/NVIDIA-Merlin/dataloader.git@${merlin_branch} \ - git+https://github.com/NVIDIA-Merlin/models.git@${merlin_branch} \ - git+https://github.com/NVIDIA-Merlin/core.git@${merlin_branch} \ - --extra-index-url=https://pypi.nvidia.com - python -m pytest -rsx tests/unit/loader/test_tf_dataloader.py - python -m pytest -rsx -m 'not (loader and tensorflow)' tests/unit + RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch tox -e test-gpu From 93fcbb10ea8e6837254d5d617ce46ecf1ee4756e Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 22 Jun 2023 11:03:19 +0100 Subject: [PATCH 41/80] Call stop on dataloader after each test --- tests/unit/loader/test_tf_dataloader.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/unit/loader/test_tf_dataloader.py b/tests/unit/loader/test_tf_dataloader.py index e1ba7bfe7c..bc544c0c15 100644 --- a/tests/unit/loader/test_tf_dataloader.py +++ b/tests/unit/loader/test_tf_dataloader.py @@ -80,6 +80,7 @@ def test_nested_list(): assert multihot_data2_col.shape == true_data2_col.shape assert np.allclose(multihot_data2_col.numpy(), true_data2_col.numpy()) + train_dataset.stop() def test_shuffling(): num_rows = 10000 @@ -99,6 +100,8 @@ def test_shuffling(): assert (first_batch != in_order).numpy().any() assert (tf.sort(first_batch) == in_order).numpy().all() + train_dataset.stop() + @pytest.mark.parametrize("batch_size", [10, 9, 8]) @pytest.mark.parametrize("drop_last", [True, False]) @@ -148,6 +151,8 @@ def test_tf_drp_reset(tmpdir, batch_size, drop_last, num_rows): else: assert num_rows == all_rows + data_itr.stop() + def test_tf_catname_ordering(tmpdir): df = make_df( @@ -184,6 +189,8 @@ def 
test_tf_catname_ordering(tmpdir): assert list(X["cont2"].numpy()) == [2.0] * 10 assert list(X["cont3"].numpy()) == [3.0] * 10 + data_itr.stop() + def test_tf_map(tmpdir): df = make_df( @@ -226,6 +233,8 @@ def add_sample_weight(features, labels, sample_weight_col_name="sample_weight"): assert list(sample_weight.numpy()) == [1.0] * 10 + data_itr.stop() + # TODO: include use_columns option # TODO: include parts_per_chunk test @@ -392,6 +401,7 @@ def test_mh_support(tmpdir, batch_size): idx += 1 assert idx == (3 // batch_size + 1) + data_itr.stop() @pytest.mark.parametrize("batch_size", [128, 256]) @@ -444,6 +454,8 @@ def test_validater(tmpdir, batch_size): print(estimated_auc) assert np.isclose(true_auc, estimated_auc, rtol=0.1) + dataloader.stop() + @pytest.mark.parametrize("engine", ["parquet"]) @pytest.mark.parametrize("batch_size", [1, 10, 100]) @@ -466,6 +478,7 @@ def test_multigpu_partitioning(datasets, engine, batch_size, global_rank): ) indices = data_loader._indices_for_process() assert indices == [global_rank] + data_loader.stop() @pytest.mark.parametrize("batch_size", [1000]) @@ -509,3 +522,5 @@ def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device): num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1 assert num_label_cols == len(label_name) + + data_loader.stop() From 15b1b69983a0005841477f18c6792e81ec7784eb Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 22 Jun 2023 11:56:37 +0100 Subject: [PATCH 42/80] Don't run torch tests in gpu tests --- pytest.ini | 1 + tests/conftest.py | 3 +++ tox.ini | 7 ++----- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pytest.ini b/pytest.ini index acdc393ec2..3d3be99f27 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,4 +2,5 @@ markers = loader: mark as testing the dataloader tensorflow: mark as using tensorflow + torch: mark as using torch ops: mark as tesing an operator diff --git a/tests/conftest.py 
b/tests/conftest.py index 1b3bf665f2..0fd310b877 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -385,3 +385,6 @@ def pytest_collection_modifyitems(items): if "test_tf_" in path: item.add_marker(getattr(pytest.mark, "tensorflow")) + + if "test_torch_" in path: + item.add_marker(getattr(pytest.mark, "torch")) diff --git a/tox.ini b/tox.ini index 1239125cc5..adbe4f2ae7 100644 --- a/tox.ini +++ b/tox.ini @@ -27,10 +27,6 @@ commands = [testenv:test-gpu] setenv = TF_GPU_ALLOCATOR=cuda_malloc_async - MKL_NUM_THREADS=1 - OMP_NUM_THREADS=1 - TF_NUM_INTEROP_THREADS=1 - TF_NUM_INTRAOP_THREADS=1 passenv = CUDA_VISIBLE_DEVICES NVIDIA_VISIBLE_DEVICES @@ -55,7 +51,8 @@ commands = ; python -m pytest --cov-report term --cov merlin -rxs tests/unit/loader/test_tf_dataloader.py ; python -m pytest --cov-report term --cov merlin -rxs -m '(loader and tensorflow) or ops' tests/unit - python -m pytest --cov-report term --cov merlin -rxs tests/unit + ; python -m pytest --cov-report term --cov merlin -rxs -m '(loader and torch) or ops' tests/unit + python -m pytest --cov-report term --cov merlin -rxs -m 'not torch' tests/unit [testenv:test-merlin] ; Runs in: Internal Jenkins From b23d9ab97de32cff52204cce38ab5de5c606841d Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 22 Jun 2023 13:17:30 +0100 Subject: [PATCH 43/80] Stop dataloader in test_dataloader_schema test --- tests/unit/loader/test_torch_dataloader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/loader/test_torch_dataloader.py b/tests/unit/loader/test_torch_dataloader.py index 2617fa0845..99026e2ba7 100644 --- a/tests/unit/loader/test_torch_dataloader.py +++ b/tests/unit/loader/test_torch_dataloader.py @@ -526,3 +526,5 @@ def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device): num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1 assert num_label_cols == len(label_name) + + data_loader.stop() From 
d32575b15f55929b0f6c6b4e37539f673bfd4a0c Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 22 Jun 2023 13:17:44 +0100 Subject: [PATCH 44/80] Remove torch constraint from tests --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index adbe4f2ae7..45d6b13de1 100644 --- a/tox.ini +++ b/tox.ini @@ -52,7 +52,7 @@ commands = ; python -m pytest --cov-report term --cov merlin -rxs tests/unit/loader/test_tf_dataloader.py ; python -m pytest --cov-report term --cov merlin -rxs -m '(loader and tensorflow) or ops' tests/unit ; python -m pytest --cov-report term --cov merlin -rxs -m '(loader and torch) or ops' tests/unit - python -m pytest --cov-report term --cov merlin -rxs -m 'not torch' tests/unit + python -m pytest --cov-report term --cov merlin -rxs tests/unit [testenv:test-merlin] ; Runs in: Internal Jenkins From 2a15374b050688ec3e063f4d1adf28e425d45f77 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 22 Jun 2023 14:52:22 +0100 Subject: [PATCH 45/80] Remove unnecessary dataloader stop commands from test_tf_dataloader.py --- tests/unit/loader/test_tf_dataloader.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/unit/loader/test_tf_dataloader.py b/tests/unit/loader/test_tf_dataloader.py index bc544c0c15..b51d068190 100644 --- a/tests/unit/loader/test_tf_dataloader.py +++ b/tests/unit/loader/test_tf_dataloader.py @@ -80,7 +80,6 @@ def test_nested_list(): assert multihot_data2_col.shape == true_data2_col.shape assert np.allclose(multihot_data2_col.numpy(), true_data2_col.numpy()) - train_dataset.stop() def test_shuffling(): num_rows = 10000 @@ -100,8 +99,6 @@ def test_shuffling(): assert (first_batch != in_order).numpy().any() assert (tf.sort(first_batch) == in_order).numpy().all() - train_dataset.stop() - @pytest.mark.parametrize("batch_size", [10, 9, 8]) @pytest.mark.parametrize("drop_last", 
[True, False]) @@ -151,8 +148,6 @@ def test_tf_drp_reset(tmpdir, batch_size, drop_last, num_rows): else: assert num_rows == all_rows - data_itr.stop() - def test_tf_catname_ordering(tmpdir): df = make_df( @@ -189,8 +184,6 @@ def test_tf_catname_ordering(tmpdir): assert list(X["cont2"].numpy()) == [2.0] * 10 assert list(X["cont3"].numpy()) == [3.0] * 10 - data_itr.stop() - def test_tf_map(tmpdir): df = make_df( @@ -233,8 +226,6 @@ def add_sample_weight(features, labels, sample_weight_col_name="sample_weight"): assert list(sample_weight.numpy()) == [1.0] * 10 - data_itr.stop() - # TODO: include use_columns option # TODO: include parts_per_chunk test @@ -401,7 +392,6 @@ def test_mh_support(tmpdir, batch_size): idx += 1 assert idx == (3 // batch_size + 1) - data_itr.stop() @pytest.mark.parametrize("batch_size", [128, 256]) @@ -454,8 +444,6 @@ def test_validater(tmpdir, batch_size): print(estimated_auc) assert np.isclose(true_auc, estimated_auc, rtol=0.1) - dataloader.stop() - @pytest.mark.parametrize("engine", ["parquet"]) @pytest.mark.parametrize("batch_size", [1, 10, 100]) @@ -478,7 +466,6 @@ def test_multigpu_partitioning(datasets, engine, batch_size, global_rank): ) indices = data_loader._indices_for_process() assert indices == [global_rank] - data_loader.stop() @pytest.mark.parametrize("batch_size", [1000]) From 92fd2620623aa30fbabbff7cf9f9bc84092219f3 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 22 Jun 2023 15:58:09 +0100 Subject: [PATCH 46/80] Remove commented commands from tox config --- tox.ini | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tox.ini b/tox.ini index 45d6b13de1..4ad055a1d7 100644 --- a/tox.ini +++ b/tox.ini @@ -49,10 +49,7 @@ commands = git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ --extra-index-url=https://pypi.nvidia.com - ; python -m pytest --cov-report term --cov merlin -rxs tests/unit/loader/test_tf_dataloader.py - ; python -m 
pytest --cov-report term --cov merlin -rxs -m '(loader and tensorflow) or ops' tests/unit - ; python -m pytest --cov-report term --cov merlin -rxs -m '(loader and torch) or ops' tests/unit - python -m pytest --cov-report term --cov merlin -rxs tests/unit + python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} [testenv:test-merlin] ; Runs in: Internal Jenkins From 4e8d7e257ba71723159bcf3e1736b8a79cd40066 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 22 Jun 2023 16:25:23 +0100 Subject: [PATCH 47/80] Revert changes to passenv in test-gpu --- tox.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 4ad055a1d7..c9aeffb9a0 100644 --- a/tox.ini +++ b/tox.ini @@ -28,8 +28,9 @@ commands = setenv = TF_GPU_ALLOCATOR=cuda_malloc_async passenv = + OPAL_PREFIX + NR_USER CUDA_VISIBLE_DEVICES - NVIDIA_VISIBLE_DEVICES sitepackages=true ; Runs in: Internal Jenkins ; Runs GPU-based tests. From f6864fba1728ea0ee37691fb89663e8b20f3da13 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 12:07:11 +0100 Subject: [PATCH 48/80] Move dependencies to deps section in test-gpu environment --- tox.ini | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tox.ini b/tox.ini index c9aeffb9a0..d6b110b2c1 100644 --- a/tox.ini +++ b/tox.ini @@ -39,17 +39,14 @@ sitepackages=true ; to install requirements.txt yet. As we get better at python environment isolation, we will ; need to add some back. 
deps = - nvidia-cudnn-cu11==8.6.0.163 -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_REF} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_REF} + git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_REF} + nvidia-cudnn-cu11==8.6.0.163 + cudf-cu11=={env:RAPIDS_VERSION} + dask-cudf-cu11=={env:RAPIDS_VERSION} commands = - python -m pip install \ - cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} \ - git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ - git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH:main} \ - git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH:main} \ - git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH:main} \ - --extra-index-url=https://pypi.nvidia.com - python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} [testenv:test-merlin] From 40554ec347d014be9b827aef0f665a34f7300b8a Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 12:08:13 +0100 Subject: [PATCH 49/80] Rename ENV var MERLIN_BRANCH to MERLIN_REF --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 05e39404a6..c7fd8d501b 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -43,4 +43,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch tox -e test-gpu + RAPIDS_VERSION=23.04 MERLIN_REF=$merlin_branch tox -e test-gpu From 577fce1a6b5de06338a3dfb8955d69cc271482c5 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 12:09:39 +0100 Subject: [PATCH 50/80] Rename MERLIN_REF to MERLIN_BRANCH --- .github/workflows/gpu-tests.yml | 2 +- tox.ini | 6 
+++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index c7fd8d501b..05e39404a6 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -43,4 +43,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - RAPIDS_VERSION=23.04 MERLIN_REF=$merlin_branch tox -e test-gpu + RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch tox -e test-gpu diff --git a/tox.ini b/tox.ini index d6b110b2c1..efba613bf1 100644 --- a/tox.ini +++ b/tox.ini @@ -40,9 +40,9 @@ sitepackages=true ; need to add some back. deps = -rrequirements/test.txt - git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_REF} - git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_REF} - git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_REF} + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH} nvidia-cudnn-cu11==8.6.0.163 cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} From 231d58947dbcea6671b5ad4d1a5657f7587870ff Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 12:45:30 +0100 Subject: [PATCH 51/80] Add nvidia pypi to extra index url env var --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index efba613bf1..36d75bf910 100644 --- a/tox.ini +++ b/tox.ini @@ -27,6 +27,7 @@ commands = [testenv:test-gpu] setenv = TF_GPU_ALLOCATOR=cuda_malloc_async + PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com passenv = OPAL_PREFIX NR_USER From f5ecdc65b9b163634716e63efd6b23a7e877adb1 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 12:51:12 +0100 Subject: [PATCH 52/80] Remove git version from merlin-models 
test.txt --- requirements/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/test.txt b/requirements/test.txt index 0a29deeed9..93b974e66a 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -19,7 +19,7 @@ s3fs>=2021.4 aiobotocore>=1.3.3 # required for synthetic data `merlin.datasets` and notebook tests using merlin models -merlin-models[tensorflow]@git+https://github.com/NVIDIA-Merlin/models.git +merlin-models[tensorflow] # needed to run notebook tests nest-asyncio From c1de4d8bb7fce23b7cdec213aab889ad9aaa5d06 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 15:15:45 +0100 Subject: [PATCH 53/80] Add separate env for CUDA 11 and CUDA 12 --- tox.ini | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 36d75bf910..19a085f087 100644 --- a/tox.ini +++ b/tox.ini @@ -24,7 +24,7 @@ commands = python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{posargs:main} python -m pytest --cov-report term --cov=merlin -rxs tests/unit -[testenv:test-gpu] +[testenv:test-gpu-cu11] setenv = TF_GPU_ALLOCATOR=cuda_malloc_async PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com @@ -50,6 +50,32 @@ deps = commands = python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} +[testenv:test-gpu-cu12] +setenv = + TF_GPU_ALLOCATOR=cuda_malloc_async + PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com + PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python +passenv = + OPAL_PREFIX + NR_USER + CUDA_VISIBLE_DEVICES +sitepackages=true +; Runs in: Internal Jenkins +; Runs GPU-based tests. +; The jenkins jobs run on an image based on merlin-hugectr. This will include all cudf configuration +; and other gpu-specific libraries that we can enxpect will always exist. Thus, we don't need +; to install requirements.txt yet. 
As we get better at python environment isolation, we will +; need to add some back. +deps = + -rrequirements/test.txt + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH} + cudf-cu12=={env:RAPIDS_VERSION} + dask-cudf-cu12=={env:RAPIDS_VERSION} +commands = + python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} + [testenv:test-merlin] ; Runs in: Internal Jenkins ; This runs the end-to-end tests from the NVIDIA-Merlin/Merlin repo on the jenkins machine. From 40a6062528d4e927889a1b47e9fc1dee91fc6a79 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 17:07:30 +0100 Subject: [PATCH 54/80] Set protobuf implementation env var --- tox.ini | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 19a085f087..df16c0681e 100644 --- a/tox.ini +++ b/tox.ini @@ -28,6 +28,7 @@ commands = setenv = TF_GPU_ALLOCATOR=cuda_malloc_async PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com + PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python passenv = OPAL_PREFIX NR_USER @@ -74,7 +75,8 @@ deps = cudf-cu12=={env:RAPIDS_VERSION} dask-cudf-cu12=={env:RAPIDS_VERSION} commands = - python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} + ; Latest TensorFlow PyPI package does not currently support CUDA 12 + python -m pytest --cov-report term --cov merlin -rxs -m 'not tensorflow' {posargs:tests/unit} [testenv:test-merlin] ; Runs in: Internal Jenkins From 781b6354d5f75e93224ebd7fd5d80f31bd5043a8 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 17:08:18 +0100 Subject: [PATCH 55/80] remove cudnn and protobuf package from install --- .github/workflows/gpu-tests.yml | 6 ++---- tox.ini | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) 
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 05e39404a6..a341cf0390 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -27,16 +27,14 @@ jobs: - name: Install Ubuntu packages run: | apt-get update -y - apt-get install -y build-essential protobuf-compiler git lsb-release 'libcudnn8=*cuda11.8' + apt-get install -y git lsb-release - name: Set up Python 3.8 uses: actions/setup-python@v4 with: python-version: 3.8 - name: Install and upgrade python packages run: | - python -m pip install --upgrade pip setuptools==59.4.0 wheel tox pybind11 - python -m pip uninstall protobuf -y - python -m pip install --no-binary=protobuf protobuf + python -m pip install --upgrade pip tox - name: Get Branch name id: get-branch-name uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request diff --git a/tox.ini b/tox.ini index df16c0681e..e759c3abe1 100644 --- a/tox.ini +++ b/tox.ini @@ -45,7 +45,7 @@ deps = git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH} - nvidia-cudnn-cu11==8.6.0.163 + nvidia-cudnn-cu11==8.6.0.163 ; required for tensorflow version cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} commands = From d7a304ede27bf135abcf3aff4b01ec2fc7f52369 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 17:10:31 +0100 Subject: [PATCH 56/80] Update tox env name in gpu-tests.yml --- .github/workflows/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index a341cf0390..8a26d009ff 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -41,4 +41,4 @@ jobs: - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" - 
RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch tox -e test-gpu + RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch tox -e test-gpu-cu11 From 9100b5b6e41013ce45e2e287c16109e4cd024eb9 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 18:35:42 +0100 Subject: [PATCH 57/80] Replace np.bool with bool --- nvtabular/ops/fill.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nvtabular/ops/fill.py b/nvtabular/ops/fill.py index 2bd9405c6a..b38383b7c3 100644 --- a/nvtabular/ops/fill.py +++ b/nvtabular/ops/fill.py @@ -75,7 +75,7 @@ def column_mapping(self, col_selector): def _compute_dtype(self, col_schema, input_schema): col_schema = super()._compute_dtype(col_schema, input_schema) if col_schema.name.endswith("_filled"): - col_schema = col_schema.with_dtype(np.bool) + col_schema = col_schema.with_dtype(bool) return col_schema transform.__doc__ = Operator.transform.__doc__ @@ -143,5 +143,5 @@ def column_mapping(self, col_selector): def _compute_dtype(self, col_schema, input_schema): col_schema = super()._compute_dtype(col_schema, input_schema) if col_schema.name.endswith("_filled"): - col_schema = col_schema.with_dtype(np.bool) + col_schema = col_schema.with_dtype(bool) return col_schema From 08619d8d95d456b2ef68f1dfe73ee61cd8c6de67 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 18:53:52 +0100 Subject: [PATCH 58/80] Replace np.long with int in data_gen --- nvtabular/tools/data_gen.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nvtabular/tools/data_gen.py b/nvtabular/tools/data_gen.py index 70dc2ca775..d6037ee6f2 100644 --- a/nvtabular/tools/data_gen.py +++ b/nvtabular/tools/data_gen.py @@ -114,13 +114,13 @@ def create_cats(self, size, cats_rep, entries=False): if col.multi_min and col.multi_max: if HAS_GPU: ser = dist.create_col( - col_size + 1, dtype=np.long, 
min_val=col.multi_min, max_val=col.multi_max + col_size + 1, dtype=int, min_val=col.multi_min, max_val=col.multi_max ) ser = make_series(np.ceil(ser)).astype(ser.dtype) _cumsum = xp.cumsum else: ser = dist.create_col( - col_size + 1, dtype=np.long, min_val=col.multi_min, max_val=col.multi_max + col_size + 1, dtype=int, min_val=col.multi_min, max_val=col.multi_max ) ser = make_df(np.ceil(ser))[0] _cumsum = np.cumsum @@ -130,12 +130,12 @@ def create_cats(self, size, cats_rep, entries=False): offs = offs.astype("int32") if HAS_GPU: ser = dist.create_col( - col_size, dtype=np.long, min_val=col.min_val, max_val=col.cardinality + col_size, dtype=int, min_val=col.min_val, max_val=col.cardinality ) ser = make_series(np.ceil(ser)).astype(ser.dtype) else: ser = dist.create_col( - col_size, dtype=np.long, min_val=col.min_val, max_val=col.cardinality + col_size, dtype=int, min_val=col.min_val, max_val=col.cardinality ) ser = make_df(np.ceil(ser))[0] ser = ser.astype("int32") From 5602f68d353ac0c0c620da22e8cc808aefe6b255 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 18:59:05 +0100 Subject: [PATCH 59/80] Remove invalid comment from deps --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index e759c3abe1..df16c0681e 100644 --- a/tox.ini +++ b/tox.ini @@ -45,7 +45,7 @@ deps = git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH} - nvidia-cudnn-cu11==8.6.0.163 ; required for tensorflow version + nvidia-cudnn-cu11==8.6.0.163 cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} commands = From 6ca1d5a1dbe151e89c9986bcb8dec08b809cce67 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 18:59:32 +0100 Subject: [PATCH 60/80] Run GPU 
tests with CUDA 12 --- .github/workflows/gpu-tests.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 8a26d009ff..38b03ee386 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -42,3 +42,32 @@ jobs: run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" RAPIDS_VERSION=23.04 MERLIN_BRANCH=$merlin_branch tox -e test-gpu-cu11 + + gpu-tests-cu12: + runs-on: linux-amd64-gpu-p100-latest-1 + container: + image: nvidia/cuda:12.1.1-devel-ubuntu22.04 + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Install Ubuntu packages + run: | + apt-get update -y + apt-get install -y git lsb-release + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Install and upgrade python packages + run: | + python -m pip install --upgrade pip tox + - name: Get Branch name + id: get-branch-name + uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request + - name: Run tests + run: | + merlin_branch="${{ steps.get-branch-name.outputs.branch }}" + RAPIDS_VERSION=23.06 MERLIN_BRANCH=$merlin_branch tox -e test-gpu-cu12 From 15d4ee2ec14b312bbd3849062b6fc5de9a40a4d3 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 23 Jun 2023 22:00:07 +0100 Subject: [PATCH 61/80] Install libcudnn8 package for tensorflow GPU support --- .github/workflows/gpu-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 38b03ee386..110a1a238d 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -27,7 +27,8 @@ jobs: - name: Install Ubuntu packages run: | apt-get update -y - apt-get install -y git lsb-release + # libcudnn8 installed for tensorflow 
GPU support + apt-get install -y git lsb-release 'libcudnn8=*cuda11.8' - name: Set up Python 3.8 uses: actions/setup-python@v4 with: From 98c3a3e2e86548ea3b01cbf908ff3a6e5acde2f7 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 28 Jun 2023 12:37:21 +0100 Subject: [PATCH 62/80] Remove unused numpy import from fill.py --- nvtabular/ops/fill.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nvtabular/ops/fill.py b/nvtabular/ops/fill.py index b38383b7c3..4d6511fda0 100644 --- a/nvtabular/ops/fill.py +++ b/nvtabular/ops/fill.py @@ -14,7 +14,6 @@ # limitations under the License. # import dask.dataframe as dd -import numpy as np from merlin.core.dispatch import DataFrameType, annotate from merlin.dag.ops.stat_operator import StatOperator From cede7d12600ffb6d65f2420471e178edb31c2faf Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 28 Jun 2023 12:38:38 +0100 Subject: [PATCH 63/80] Use Python 3.9 for cu12 gpu test environment --- .github/workflows/gpu-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 110a1a238d..3c176ce09e 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -58,10 +58,10 @@ jobs: run: | apt-get update -y apt-get install -y git lsb-release - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: 3.9 - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox From b92777ecd7057be8184d8c1eb7be19df71cc840f Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 10:23:04 +0100 Subject: [PATCH 64/80] Run compute before merge in test_embedding_cat_export_import --- tests/unit/workflow/test_workflow.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/tests/unit/workflow/test_workflow.py b/tests/unit/workflow/test_workflow.py index 77009da135..1f3c71b993 100755 --- a/tests/unit/workflow/test_workflow.py +++ b/tests/unit/workflow/test_workflow.py @@ -819,7 +819,7 @@ def test_embedding_cat_export_import(tmpdir, cpu): shuffle=False, device=cpu, ) - origin_df = train_res.to_ddf().merge(emb_res.to_ddf(), on="string_id", how="left").compute() + origin_df = train_res.compute().merge(emb_res.compute(), on="string_id", how="left") for idx, batch in enumerate(data_loader): batch b_df = batch[0].to_df() From a2d285e29828508ec1c7f6db233b8c5b95bcda0d Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 10:43:37 +0100 Subject: [PATCH 65/80] Add tensorflow pytest mark to example notebook 1 and 2 --- tests/unit/examples/test_01-Getting-started.py | 1 + tests/unit/examples/test_02-Advanced-NVTabular-workflow.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/unit/examples/test_01-Getting-started.py b/tests/unit/examples/test_01-Getting-started.py index cf28fbcf56..0781367497 100644 --- a/tests/unit/examples/test_01-Getting-started.py +++ b/tests/unit/examples/test_01-Getting-started.py @@ -23,6 +23,7 @@ nest_asyncio.apply() +@pytest.mark.tensorflow def test_example_01_getting_started(): with testbook( REPO_ROOT / "examples" / "01-Getting-started.ipynb", diff --git a/tests/unit/examples/test_02-Advanced-NVTabular-workflow.py b/tests/unit/examples/test_02-Advanced-NVTabular-workflow.py index 0e9bc2a962..6b231bc9a3 100644 --- a/tests/unit/examples/test_02-Advanced-NVTabular-workflow.py +++ b/tests/unit/examples/test_02-Advanced-NVTabular-workflow.py @@ -27,6 +27,7 @@ nest_asyncio.apply() +@pytest.mark.tensorflow def test_example_02_advanced_workflow(): with testbook( REPO_ROOT / "examples" / "02-Advanced-NVTabular-workflow.ipynb", From 2f9db246abf61a743e313549edf3a129324f9085 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy
<1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 10:55:38 +0100 Subject: [PATCH 66/80] Debug: run only workflow test --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index df16c0681e..b1369f431c 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,7 @@ deps = cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} commands = - python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} + python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit/workflow/test_workflow.py::test_embedding_cat_export_import} [testenv:test-gpu-cu12] setenv = @@ -76,7 +76,7 @@ deps = dask-cudf-cu12=={env:RAPIDS_VERSION} commands = ; Latest TensorFlow PyPI package does not currently support CUDA 12 - python -m pytest --cov-report term --cov merlin -rxs -m 'not tensorflow' {posargs:tests/unit} + python -m pytest --cov-report term --cov merlin -rxs -m 'not tensorflow' {posargs:tests/unit/workflow/test_workflow.py::test_embedding_cat_export_import} [testenv:test-merlin] ; Runs in: Internal Jenkins From 91f4de550fcab45dd7cf75d00cd2948385e3866e Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 13:42:22 +0100 Subject: [PATCH 67/80] Debug: print dataframes in test_workflow --- tests/unit/workflow/test_workflow.py | 2 ++ tox.ini | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/unit/workflow/test_workflow.py b/tests/unit/workflow/test_workflow.py index 1f3c71b993..aeb4c65a72 100755 --- a/tests/unit/workflow/test_workflow.py +++ b/tests/unit/workflow/test_workflow.py @@ -820,6 +820,8 @@ def test_embedding_cat_export_import(tmpdir, cpu): device=cpu, ) origin_df = train_res.compute().merge(emb_res.compute(), on="string_id", how="left") + print(train_res.compute()) + print(origin_df) for idx, batch in enumerate(data_loader): batch b_df = batch[0].to_df() diff --git a/tox.ini b/tox.ini index 
b1369f431c..a4571cdcf8 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,7 @@ deps = cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} commands = - python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit/workflow/test_workflow.py::test_embedding_cat_export_import} + python -m pytest --cov-report term --cov merlin -rxs -s {posargs:tests/unit/workflow/test_workflow.py::test_embedding_cat_export_import} [testenv:test-gpu-cu12] setenv = @@ -76,7 +76,7 @@ deps = dask-cudf-cu12=={env:RAPIDS_VERSION} commands = ; Latest TensorFlow PyPI package does not currently support CUDA 12 - python -m pytest --cov-report term --cov merlin -rxs -m 'not tensorflow' {posargs:tests/unit/workflow/test_workflow.py::test_embedding_cat_export_import} + python -m pytest --cov-report term --cov merlin -rxs -s -m 'not tensorflow' {posargs:tests/unit/workflow/test_workflow.py::test_embedding_cat_export_import} [testenv:test-merlin] ; Runs in: Internal Jenkins From afc0ce9a1ce6c1c699fa1cd2360cf7ece2c44f14 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 14:03:34 +0100 Subject: [PATCH 68/80] debug: print train_df --- tests/unit/workflow/test_workflow.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/unit/workflow/test_workflow.py b/tests/unit/workflow/test_workflow.py index aeb4c65a72..340fc1d4c1 100755 --- a/tests/unit/workflow/test_workflow.py +++ b/tests/unit/workflow/test_workflow.py @@ -805,9 +805,10 @@ def test_embedding_cat_export_import(tmpdir, cpu): df = make_df({"string_id": np.random.choice(string_ids, 30)}) graph2 = ["string_id"] >> cat_op train_res = Workflow(graph2).transform(Dataset(df, cpu=(cpu is not None))) + train_df = train_res.compute() data_loader = Loader( - train_res, + nvt.Dataset(train_df), batch_size=1, transforms=[ EmbeddingOperator( @@ -819,9 +820,13 @@ def test_embedding_cat_export_import(tmpdir, cpu): shuffle=False, 
device=cpu, ) - origin_df = train_res.compute().merge(emb_res.compute(), on="string_id", how="left") - print(train_res.compute()) + origin_df = train_df.merge(emb_res.compute(), on="string_id", how="left") + print("train_df:") + print(train_df) + print("origin_df:") print(origin_df) + print("emb_res:") + print(emb_res.compute()) for idx, batch in enumerate(data_loader): batch b_df = batch[0].to_df() From 4f4e7468c5bf21b31cbb9c33d78c8d386163ca41 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:08:12 +0100 Subject: [PATCH 69/80] Remove use of merge (results in non-deterministic row ordering) DataFrames merges in cuDF result in non-deterministic row ordering https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.dataframe.merge/ --- tests/unit/workflow/test_workflow.py | 35 +++++++++++++--------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/tests/unit/workflow/test_workflow.py b/tests/unit/workflow/test_workflow.py index 340fc1d4c1..2d5b8558bd 100755 --- a/tests/unit/workflow/test_workflow.py +++ b/tests/unit/workflow/test_workflow.py @@ -800,40 +800,37 @@ def test_embedding_cat_export_import(tmpdir, cpu): npy_path = str(tmpdir / "embeddings.npy") emb_res.to_npy(npy_path) - embeddings = np.load(npy_path) + ids_and_embeddings = np.load(npy_path) # second workflow that categorifies the embedding table data df = make_df({"string_id": np.random.choice(string_ids, 30)}) graph2 = ["string_id"] >> cat_op train_res = Workflow(graph2).transform(Dataset(df, cpu=(cpu is not None))) train_df = train_res.compute() + ids = ids_and_embeddings[:, 0].astype(int) + embeddings = ids_and_embeddings[:, 1:] + data_loader = Loader( nvt.Dataset(train_df), batch_size=1, transforms=[ EmbeddingOperator( - embeddings[:, 1:], - id_lookup_table=embeddings[:, 0].astype(int), + embeddings, + id_lookup_table=ids, lookup_key="string_id", ) ], shuffle=False, device=cpu, ) - origin_df = 
train_df.merge(emb_res.compute(), on="string_id", how="left") - print("train_df:") - print(train_df) - print("origin_df:") - print(origin_df) - print("emb_res:") - print(emb_res.compute()) + embeddings_by_id = dict(zip(ids, embeddings)) for idx, batch in enumerate(data_loader): - batch - b_df = batch[0].to_df() - org_df = origin_df.iloc[idx] - if not cpu: - assert (b_df["string_id"].to_numpy() == org_df["string_id"].to_numpy()).all() - assert (b_df["embeddings"].list.leaves == org_df["embeddings"].list.leaves).all() - else: - assert (b_df["string_id"].values == org_df["string_id"]).all() - assert b_df["embeddings"].values[0] == org_df["embeddings"].tolist() + x, _ = batch + b_df = x.to_df() + org_df = make_df( + { + "string_id": x["string_id"].values, + "embeddings": [embeddings_by_id[_id] for _id in x["string_id"].values.tolist()], + } + ) + assert_eq(b_df, org_df) From 8220231293463e275e8b4e4945cb43a1560dc13d Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:13:33 +0100 Subject: [PATCH 70/80] remove train_df --- tests/unit/workflow/test_workflow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/workflow/test_workflow.py b/tests/unit/workflow/test_workflow.py index 2d5b8558bd..31cc104988 100755 --- a/tests/unit/workflow/test_workflow.py +++ b/tests/unit/workflow/test_workflow.py @@ -805,13 +805,12 @@ def test_embedding_cat_export_import(tmpdir, cpu): df = make_df({"string_id": np.random.choice(string_ids, 30)}) graph2 = ["string_id"] >> cat_op train_res = Workflow(graph2).transform(Dataset(df, cpu=(cpu is not None))) - train_df = train_res.compute() ids = ids_and_embeddings[:, 0].astype(int) embeddings = ids_and_embeddings[:, 1:] data_loader = Loader( - nvt.Dataset(train_df), + train_res, batch_size=1, transforms=[ EmbeddingOperator( From 4604f639cce4632b8a620429e2401f1002a00f74 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy 
<1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:16:38 +0100 Subject: [PATCH 71/80] Restore posargs default in gpu tox envs --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index a4571cdcf8..71f579778b 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,7 @@ deps = cudf-cu11=={env:RAPIDS_VERSION} dask-cudf-cu11=={env:RAPIDS_VERSION} commands = - python -m pytest --cov-report term --cov merlin -rxs -s {posargs:tests/unit/workflow/test_workflow.py::test_embedding_cat_export_import} + python -m pytest --cov-report term --cov merlin -rxs -s {posargs:tests/unit} [testenv:test-gpu-cu12] setenv = @@ -76,7 +76,7 @@ deps = dask-cudf-cu12=={env:RAPIDS_VERSION} commands = ; Latest TensorFlow PyPI package does not currently support CUDA 12 - python -m pytest --cov-report term --cov merlin -rxs -s -m 'not tensorflow' {posargs:tests/unit/workflow/test_workflow.py::test_embedding_cat_export_import} + python -m pytest --cov-report term --cov merlin -rxs -s -m 'not tensorflow' {posargs:tests/unit} [testenv:test-merlin] ; Runs in: Internal Jenkins From ccfeda6ccbbeac8249c4ec75c41c09e78e17e684 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:16:55 +0100 Subject: [PATCH 72/80] Restore multi-gpu test job and tox environment --- .github/workflows/gpu-tests.yml | 23 +++++++++++++++++++++++ tox.ini | 23 +++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 3c176ce09e..50964a046e 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -14,7 +14,30 @@ concurrency: cancel-in-progress: true jobs: + + # Multi-GPU tests + gpu-tests: + runs-on: 2GPU + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Run tests + run: | + ref_type=${{ github.ref_type }} + branch=main + if [[ $ref_type == "tag"* ]] + then +
raw=$(git branch -r --contains ${{ github.ref_name }}) + branch=${raw/origin\/} + fi + cd ${{ github.workspace }}; MERLIN_BRANCH=$branch tox -e test-gpu + + # Single GPU tests + + gpu-tests-cu11: runs-on: linux-amd64-gpu-p100-latest-1 container: image: nvidia/cuda:11.8.0-devel-ubuntu22.04 diff --git a/tox.ini b/tox.ini index 71f579778b..fe581b2666 100644 --- a/tox.ini +++ b/tox.ini @@ -24,6 +24,29 @@ commands = python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git@{posargs:main} python -m pytest --cov-report term --cov=merlin -rxs tests/unit +[testenv:test-gpu] +setenv = + TF_GPU_ALLOCATOR=cuda_malloc_async +passenv = + OPAL_PREFIX + NR_USER + CUDA_VISIBLE_DEVICES +sitepackages=true +; Runs in: Internal Jenkins +; Runs GPU-based tests. +; The jenkins jobs run on an image based on merlin-hugectr. This will include all cudf configuration +; and other gpu-specific libraries that we can enxpect will always exist. Thus, we don't need +; to install requirements.txt yet. As we get better at python environment isolation, we will +; need to add some back. 
+deps = + pytest + pytest-cov + git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} + git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH} +commands = + python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} + [testenv:test-gpu-cu11] setenv = TF_GPU_ALLOCATOR=cuda_malloc_async From cc6102930d8b47540ce71001eac5fb2a47fa31f5 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:26:33 +0100 Subject: [PATCH 73/80] Remove sitepackages=true from test-gpu envs --- tox.ini | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tox.ini b/tox.ini index fe581b2666..30e48f6b1b 100644 --- a/tox.ini +++ b/tox.ini @@ -48,6 +48,8 @@ commands = python -m pytest --cov-report term --cov merlin -rxs {posargs:tests/unit} [testenv:test-gpu-cu11] +; Runs in: GitHub Actions +; Runs GPU-based tests. setenv = TF_GPU_ALLOCATOR=cuda_malloc_async PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com @@ -56,13 +58,6 @@ passenv = OPAL_PREFIX NR_USER CUDA_VISIBLE_DEVICES -sitepackages=true -; Runs in: Internal Jenkins -; Runs GPU-based tests. -; The jenkins jobs run on an image based on merlin-hugectr. This will include all cudf configuration -; and other gpu-specific libraries that we can enxpect will always exist. Thus, we don't need -; to install requirements.txt yet. As we get better at python environment isolation, we will -; need to add some back. deps = -rrequirements/test.txt git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} @@ -75,6 +70,8 @@ commands = python -m pytest --cov-report term --cov merlin -rxs -s {posargs:tests/unit} [testenv:test-gpu-cu12] +; Runs in: GitHub Actions +; Runs GPU-based tests. 
setenv = TF_GPU_ALLOCATOR=cuda_malloc_async PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com @@ -83,13 +80,6 @@ passenv = OPAL_PREFIX NR_USER CUDA_VISIBLE_DEVICES -sitepackages=true -; Runs in: Internal Jenkins -; Runs GPU-based tests. -; The jenkins jobs run on an image based on merlin-hugectr. This will include all cudf configuration -; and other gpu-specific libraries that we can enxpect will always exist. Thus, we don't need -; to install requirements.txt yet. As we get better at python environment isolation, we will -; need to add some back. deps = -rrequirements/test.txt git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} From 917827b21bab2d248d917172e2ffb5094396c19f Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:28:51 +0100 Subject: [PATCH 74/80] Remove unnecessary env vars from gpu-tests tox envs --- tox.ini | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tox.ini b/tox.ini index 30e48f6b1b..8f43e8650e 100644 --- a/tox.ini +++ b/tox.ini @@ -55,8 +55,6 @@ setenv = PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python passenv = - OPAL_PREFIX - NR_USER CUDA_VISIBLE_DEVICES deps = -rrequirements/test.txt @@ -73,12 +71,9 @@ commands = ; Runs in: GitHub Actions ; Runs GPU-based tests. 
setenv = - TF_GPU_ALLOCATOR=cuda_malloc_async PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python passenv = - OPAL_PREFIX - NR_USER CUDA_VISIBLE_DEVICES deps = -rrequirements/test.txt From d264997471a4d868cf7ea11c1948d4461e94fb63 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:30:59 +0100 Subject: [PATCH 75/80] remove blank line from gpu-tests.yml --- .github/workflows/gpu-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 50964a046e..9b5fc9ce76 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -14,7 +14,6 @@ concurrency: cancel-in-progress: true jobs: - # Multi-GPU tests gpu-tests: From 9c9f32fa3bcd7158eb9d6cbf395f16ee07e91790 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 27 Jul 2023 12:42:30 +0100 Subject: [PATCH 76/80] Add job for running NVTabular tests with conda --- .github/workflows/gpu-tests.yml | 39 +++++++++++++++++++++++++++++++++ tox.ini | 3 +-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 9b5fc9ce76..76ed3952c7 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -36,6 +36,45 @@ jobs: # Single GPU tests + gpu-tests-conda-cu12: + runs-on: linux-amd64-gpu-p100-latest-1 + container: + image: nvidia/cuda:12.1.1-devel-ubuntu22.04 + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Install Ubuntu packages + run: | + apt-get update -y + apt-get install -y git lsb-release + - uses: mamba-org/setup-micromamba@v1 + with: + environment-name: test-env + create-args: >- + python=3.10 + cudf=23.08 + cuda-version=12.0 + cuda-nvcc=12.0 + condarc: | + channels: + - 
rapidsai-nightly + - conda-forge + - nvidia + cache-downloads: true + - name: Install and upgrade python packages + run: | + python -m pip install --upgrade pip tox + - name: Get Branch name + id: get-branch-name + uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request + - name: Run tests + run: | + merlin_branch="${{ steps.get-branch-name.outputs.branch }}" + MERLIN_BRANCH=$merlin_branch tox -e test-gpu + gpu-tests-cu11: runs-on: linux-amd64-gpu-p100-latest-1 container: diff --git a/tox.ini b/tox.ini index 8f43e8650e..39afffd2e5 100644 --- a/tox.ini +++ b/tox.ini @@ -39,8 +39,7 @@ sitepackages=true ; to install requirements.txt yet. As we get better at python environment isolation, we will ; need to add some back. deps = - pytest - pytest-cov + -rrequirements/test.txt git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH} git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH} git+https://github.com/NVIDIA-Merlin/models.git@{env:MERLIN_BRANCH} From 32e39c34733e0a8d3c0a7fe865d617d5bd1e654c Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 27 Jul 2023 13:03:29 +0100 Subject: [PATCH 77/80] Use conda-incubator/setup-miniconda instead of setup-micromamba --- .github/workflows/gpu-tests.yml | 18 +++++------------- conda/environments/test-cu12.yaml | 11 +++++++++++ 2 files changed, 16 insertions(+), 13 deletions(-) create mode 100644 conda/environments/test-cu12.yaml diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 76ed3952c7..e11919a881 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -50,20 +50,12 @@ jobs: run: | apt-get update -y apt-get install -y git lsb-release - - uses: mamba-org/setup-micromamba@v1 + - uses: conda-incubator/setup-miniconda@v2 with: - environment-name: test-env - create-args: >- - python=3.10 - cudf=23.08 - cuda-version=12.0 - cuda-nvcc=12.0 - condarc: | - channels: - - 
rapidsai-nightly - - conda-forge - - nvidia - cache-downloads: true + miniforge-variant: Mambaforge + use-mamba: true + activate-environment: cu12-env + environment-file: conda/environments/test-cu12.yaml - name: Install and upgrade python packages run: | python -m pip install --upgrade pip tox diff --git a/conda/environments/test-cu12.yaml b/conda/environments/test-cu12.yaml new file mode 100644 index 0000000000..73dd1b284a --- /dev/null +++ b/conda/environments/test-cu12.yaml @@ -0,0 +1,11 @@ +name: cu12-env +channels: + - conda-forge + - rapidsai-nightly + - nvidia +dependencies: + - python=3.10 + - cuda-version=12.1 + - cuda-nvcc=12.1 + - cudf=23.08 + - dask-cudf=23.08 From 7d975dbbabf2ea72135176f655c19994d79c206f Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 27 Jul 2023 13:24:22 +0100 Subject: [PATCH 78/80] Update ref for branch-name-pull-request --- .github/workflows/gpu-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index e11919a881..e4990536b8 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -61,7 +61,7 @@ jobs: python -m pip install --upgrade pip tox - name: Get Branch name id: get-branch-name - uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request + uses: NVIDIA-Merlin/.github/actions/branch-name@9f82e25a18e2b4a3f4350e9f287c2c31e906d89e - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" @@ -91,7 +91,7 @@ jobs: python -m pip install --upgrade pip tox - name: Get Branch name id: get-branch-name - uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request + uses: NVIDIA-Merlin/.github/actions/branch-name@9f82e25a18e2b4a3f4350e9f287c2c31e906d89e - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" @@ -120,7 +120,7 @@ jobs: python -m pip install --upgrade pip tox - 
name: Get Branch name id: get-branch-name - uses: NVIDIA-Merlin/.github/actions/branch-name@branch-name-pull-request + uses: NVIDIA-Merlin/.github/actions/branch-name@9f82e25a18e2b4a3f4350e9f287c2c31e906d89e - name: Run tests run: | merlin_branch="${{ steps.get-branch-name.outputs.branch }}" From fdb65b03aa2a0877115d88b5931c6c6a30a300b9 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 27 Jul 2023 13:50:04 +0100 Subject: [PATCH 79/80] Update cuda dependencies to 12 --- conda/environments/test-cu12.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/environments/test-cu12.yaml b/conda/environments/test-cu12.yaml index 73dd1b284a..9fffdec9f2 100644 --- a/conda/environments/test-cu12.yaml +++ b/conda/environments/test-cu12.yaml @@ -5,7 +5,7 @@ channels: - nvidia dependencies: - python=3.10 - - cuda-version=12.1 - - cuda-nvcc=12.1 + - cuda-version=12 + - cuda-nvcc=12 - cudf=23.08 - dask-cudf=23.08 From 8bbd1ad3b666dba1132c91d09cda86bd503c9440 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Thu, 27 Jul 2023 14:12:39 +0100 Subject: [PATCH 80/80] Add tox to conda test env --- .github/workflows/gpu-tests.yml | 4 +--- conda/environments/test-cu12.yaml | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index e4990536b8..c6e075399e 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -56,9 +56,7 @@ jobs: use-mamba: true activate-environment: cu12-env environment-file: conda/environments/test-cu12.yaml - - name: Install and upgrade python packages - run: | - python -m pip install --upgrade pip tox + python-version: "3.10" - name: Get Branch name id: get-branch-name uses: NVIDIA-Merlin/.github/actions/branch-name@9f82e25a18e2b4a3f4350e9f287c2c31e906d89e diff --git a/conda/environments/test-cu12.yaml 
b/conda/environments/test-cu12.yaml index 9fffdec9f2..aa94329a09 100644 --- a/conda/environments/test-cu12.yaml +++ b/conda/environments/test-cu12.yaml @@ -4,8 +4,8 @@ channels: - rapidsai-nightly - nvidia dependencies: - - python=3.10 - cuda-version=12 - cuda-nvcc=12 - cudf=23.08 - dask-cudf=23.08 + - tox=4