From 2e4c50d38316858cba0d0a3915af41ebdb96824e Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 03:48:01 +0000 Subject: [PATCH 01/27] add unittest env with gpu --- .github/workflows/docker/docker-compose.yml | 56 +++++++++++++++++++++ .github/workflows/unittest-gpu.yml | 32 ++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 .github/workflows/docker/docker-compose.yml create mode 100644 .github/workflows/unittest-gpu.yml diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml new file mode 100644 index 000000000..06d2a78a1 --- /dev/null +++ b/.github/workflows/docker/docker-compose.yml @@ -0,0 +1,56 @@ +version: '3' +services: + ray-head: + image: data-juicer-unittest:v2 + pull_policy: never + command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block + environment: + - HF_HOME=/data/huggingface + - RAY_ADDRESS=auto + working_dir: /workspace + networks: + - ray-network + volumes: + - huggingface_cache:/data + - ../../..:/workspace + ports: + - "6379:6379" + - "8265:8265" + shm_size: "32G" + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0', '1'] + capabilities: [gpu] + + ray-worker: + image: data-juicer-unittest:v2 + pull_policy: never + command: ray start --address=ray-head:6379 --block + environment: + - HF_HOME=/data/huggingface + working_dir: /workspace + volumes: + - huggingface_cache:/data + - ../../..:/workspace + depends_on: + - ray-head + networks: + - ray-network + shm_size: "32G" + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['2', '3'] + capabilities: [gpu] + +networks: + ray-network: + driver: bridge + +volumes: + huggingface_cache: diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml new file mode 100644 index 000000000..6da6989d3 --- /dev/null +++ b/.github/workflows/unittest-gpu.yml @@ -0,0 +1,32 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: unittest-gpu + +on: + workflow_dispatch: + push: + branches: + - main + +permissions: + contents: read + +jobs: + unittest: + + runs-on: [self-hosted, linux, GPU] + steps: + - uses: actions/checkout@v3 + - name: Setup docker compose + working-directory: .github/workflows/docker + run: | + docker compose up -d + - name: Install data-juicer + run: | + docker compose exec ray-head cd workspace && pip install -v -e .[all] + + - name: Remove docker compose + working-directory: .github/workflows/docker + run: | + docker compose down --remove-orphans \ No newline at end of file From 3edd333d8c55fb10cb8022b2bc3e6aedb86d7f3b Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 04:12:59 +0000 Subject: [PATCH 02/27] fix unittest yml --- .github/workflows/unittest-gpu.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 6da6989d3..7a198c461 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -18,15 +18,17 @@ jobs: runs-on: [self-hosted, linux, GPU] steps: - uses: actions/checkout@v3 + - name: Setup docker compose working-directory: .github/workflows/docker run: | docker compose up -d + - name: Install data-juicer run: | - docker compose exec ray-head cd workspace && pip install -v -e .[all] - + docker compose exec ray-head cd /workspace && pip install -v -e .[all] + - name: Remove docker compose working-directory: .github/workflows/docker run: | - docker compose down --remove-orphans \ No newline at end of file + docker compose down --remove-orphans From aa5950930085c0ddcb7bbe8323abfa4ba3ab9c54 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 04:26:54 +0000 Subject: [PATCH 03/27] add environment for unittest --- .github/workflows/unittest-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 7a198c461..553eb94dd 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -14,8 +14,8 @@ permissions: jobs: unittest: - runs-on: [self-hosted, linux, GPU] + environment: testing steps: - uses: actions/checkout@v3 From 5d7c5ddae89f7ebf7c39ba147d8df68776f05611 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 04:28:36 +0000 Subject: [PATCH 04/27] update workflow trigger --- .github/workflows/unittest-gpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 553eb94dd..4822437fa 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -5,6 +5,7 @@ name: unittest-gpu on: workflow_dispatch: + pull_request: push: branches: - main @@ -15,7 +16,7 @@ permissions: jobs: unittest: runs-on: [self-hosted, linux, GPU] - environment: testing + environment: Testing steps: - uses: actions/checkout@v3 From b9e9f0ca0928d6f19067b4178b149e3dc144de6c Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 05:39:23 +0000 Subject: [PATCH 05/27] update install step --- .github/workflows/unittest-gpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 4822437fa..20607a73f 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -27,9 +27,10 @@ jobs: - name: Install data-juicer run: | - docker compose exec ray-head cd /workspace && pip install -v -e .[all] + docker compose exec ray-head pip install -v -e .[all] - name: Remove docker compose working-directory: .github/workflows/docker + if: always() run: | docker compose down --remove-orphans From fe7a71a853ba570b5fd91a1bddc71f3b2313100f Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 05:52:40 +0000 Subject: [PATCH 06/27] fix install command --- .github/workflows/unittest-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 20607a73f..50a11e214 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -27,7 +27,7 @@ jobs: - name: Install data-juicer run: | - docker compose exec ray-head pip install -v -e .[all] + docker compose exec ray-head pip install -v -e .\[all\] - name: Remove docker compose working-directory: .github/workflows/docker From 06141cab9ff7ba23de6a105df74046c1ae9e0776 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 05:58:06 +0000 Subject: [PATCH 07/27] update working dir --- .github/workflows/unittest-gpu.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 50a11e214..10a1360fb 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -26,6 +26,7 @@ jobs: docker compose up -d - name: Install data-juicer + working-directory: .github/workflows/docker run: | docker compose exec ray-head pip install -v -e .\[all\] From c89a92acd952ee25bd77d91503a1bdfd133576f1 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 07:14:41 +0000 Subject: [PATCH 08/27] update container --- .github/workflows/docker/docker-compose.yml | 8 ++++---- .github/workflows/unittest-gpu.yml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml index 06d2a78a1..2089e6798 100644 --- a/.github/workflows/docker/docker-compose.yml +++ b/.github/workflows/docker/docker-compose.yml @@ -1,7 +1,7 @@ version: '3' services: ray-head: - image: data-juicer-unittest:v2 + image: data-juicer-unittest:v0.2.0 pull_policy: never command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block environment: @@ -16,7 +16,7 @@ services: ports: - "6379:6379" - "8265:8265" - shm_size: "32G" + shm_size: "64G" deploy: resources: reservations: @@ -26,7 +26,7 @@ services: capabilities: [gpu] ray-worker: - image: data-juicer-unittest:v2 + image: data-juicer-unittest:v0.2.0 pull_policy: never command: ray start --address=ray-head:6379 --block environment: @@ -39,7 +39,7 @@ services: - ray-head networks: - ray-network - shm_size: "32G" + shm_size: "64G" deploy: resources: reservations: diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 10a1360fb..cf4633b84 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -28,7 +28,7 @@ jobs: - name: Install data-juicer working-directory: .github/workflows/docker run: | - docker compose exec ray-head pip install -v -e .\[all\] + docker compose exec ray-head pip install -e .\[all\] - name: Remove docker compose working-directory: .github/workflows/docker From 1f5f52207376c601851eedb39a77a0fc28c65ff5 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 10:24:46 +0000 Subject: [PATCH 09/27] update working dir --- .github/workflows/unittest-gpu.yml | 3 +++ environments/science_requires.txt | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index cf4633b84..7fe10c142 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -16,6 +16,9 @@ permissions: jobs: unittest: runs-on: [self-hosted, linux, GPU] + defaults: + run: + working-directory: ./${{ github.event.pull_request.head.ref }} environment: Testing steps: - uses: actions/checkout@v3 diff --git a/environments/science_requires.txt b/environments/science_requires.txt index 4faa330c8..2748d2ef9 100644 --- a/environments/science_requires.txt +++ b/environments/science_requires.txt @@ -1,3 +1,5 @@ +torch +torchaudio easyocr fasttext-wheel kenlm @@ -16,8 +18,6 @@ accelerate tiktoken opencc==1.1.6 imagededup -torch -torchaudio dlib spacy-pkuseg==0.0.32 diffusers From 9142f505991bc1cc24e9087fd72d63cd3c43c847 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 11:25:46 +0000 Subject: [PATCH 10/27] change working directory --- .github/workflows/unittest-gpu.yml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 7fe10c142..e5ff382b8 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -16,12 +16,15 @@ permissions: jobs: unittest: runs-on: [self-hosted, linux, GPU] - defaults: - run: - working-directory: ./${{ github.event.pull_request.head.ref }} environment: Testing steps: + - name: Set up unique working directory + run: mkdir ${{ runner.temp }}/dj-${{ github.run_id }} + shell: bash + - uses: actions/checkout@v3 + with: + path: ${{ runner.temp }}/dj-${{ github.run_id }} - name: Setup docker compose working-directory: .github/workflows/docker @@ -38,3 +41,8 @@ jobs: if: always() run: | docker compose down --remove-orphans + + - name: Cleanup workspace + if: always() + run: | + rm -rf ${{ runner.temp }}/dj-${{ github.run_id }} From 926fbb56d3775baf537281f278965b30156300a9 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 11:42:04 +0000 Subject: [PATCH 11/27] change working directory --- .github/workflows/unittest-gpu.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index e5ff382b8..3a785d36b 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -18,10 +18,6 @@ jobs: runs-on: [self-hosted, linux, GPU] environment: Testing steps: - - name: Set up unique working directory - run: mkdir ${{ runner.temp }}/dj-${{ github.run_id }} - shell: bash - - uses: actions/checkout@v3 with: path: ${{ runner.temp }}/dj-${{ github.run_id }} From 905488fcaf5cd55ad3ae465ec67eb6fde6ddbffc Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 11:49:06 +0000 Subject: [PATCH 12/27] change working directory --- .github/workflows/unittest-gpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 3a785d36b..3e4411b6b 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -20,7 +20,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - path: ${{ runner.temp }}/dj-${{ github.run_id }} + path: dj-${{ github.run_id }} - name: Setup docker compose working-directory: .github/workflows/docker @@ -41,4 +41,4 @@ jobs: - name: Cleanup workspace if: always() run: | - rm -rf ${{ runner.temp }}/dj-${{ github.run_id }} + rm -rf dj-${{ github.run_id }} From 74c34650fc6c92e4d5b5ab36cdbebca43d15d364 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 24 Apr 2024 11:53:14 +0000 Subject: [PATCH 13/27] change working directory --- .github/workflows/unittest-gpu.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml index 3e4411b6b..b01167aa7 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest-gpu.yml @@ -23,17 +23,17 @@ jobs: path: dj-${{ github.run_id }} - name: Setup docker compose - working-directory: .github/workflows/docker + working-directory: dj-${{ github.run_id }}/.github/workflows/docker run: | docker compose up -d - name: Install data-juicer - working-directory: .github/workflows/docker + working-directory: dj-${{ github.run_id }}/.github/workflows/docker run: | docker compose exec ray-head pip install -e .\[all\] - name: Remove docker compose - working-directory: .github/workflows/docker + working-directory: dj-${{ github.run_id }}/.github/workflows/docker if: always() run: | docker compose down --remove-orphans From 16ce4a8c41ea92397581c07eb9dd063d3030fd83 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 25 Apr 2024 04:13:52 +0000 Subject: [PATCH 14/27] change unittest --- data_juicer/utils/unittest_utils.py | 34 ++++++++++++++++++++ tests/ops/filter/test_alphanumeric_filter.py | 10 +++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index b9d18dbf1..0a585073f 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -1,8 +1,13 @@ import os import shutil import unittest +from datasets import Dataset +import ray.data as rd +from data_juicer.ops import Filter from data_juicer.utils.registry import Registry +from data_juicer.utils.constant import Fields + SKIPPED_TESTS = Registry('SkippedTests') @@ -32,3 +37,32 @@ def tearDownClass(cls, hf_model_name=None) -> None: if os.path.exists(transformers.TRANSFORMERS_CACHE): print('CLEAN all TRANSFORMERS_CACHE') shutil.rmtree(transformers.TRANSFORMERS_CACHE) + + @classmethod + def generate_dataset(cls, data, type="hf"): + """Generate dataset for a specific executor. + + Args: + type (str, optional): `hf` or `ray`. Defaults to "hf". + """ + if type == "hf": + return Dataset.from_list(data) + elif type == "ray": + return rd.from_items(data) + + @classmethod + def run_single_op(cls, dataset, op, type="hf"): + """Run operator in the specific executor.""" + if type == "hf": + if isinstance(op, Filter) and Fields.stats not in dataset.features: + # TODO: + # this is a temp solution, + # only add stats when calling filter op + dataset = dataset.add_column(name=Fields.stats, + column=[{}] * dataset.num_rows) + dataset = dataset.map(op.compute_stats) + dataset = dataset.filter(op.process) + dataset = dataset.select_columns(column_names=['text']) + return dataset.to_list() + elif type == "ray": + pass \ No newline at end of file diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py index efca696c2..41d282f75 100644 --- a/tests/ops/filter/test_alphanumeric_filter.py +++ b/tests/ops/filter/test_alphanumeric_filter.py @@ -50,9 +50,10 @@ def test_case(self): }, { 'text': 'emoji表情测试下😊,😸31231\n' }] - dataset = Dataset.from_list(ds_list) + dataset = DataJuicerTestCaseBase.generate_dataset(ds_list) op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9) - self._run_alphanumeric_filter(dataset, tgt_list, op) + result = DataJuicerTestCaseBase.run_single_op(dataset, op) + self.assertEqual(result, tgt_list) def test_token_case(self): @@ -76,9 +77,10 @@ def test_token_case(self): }, { 'text': 'Do you need a cup of coffee?' }] - dataset = Dataset.from_list(ds_list) + dataset = DataJuicerTestCaseBase.generate_dataset(ds_list) op = AlphanumericFilter(tokenization=True, min_ratio=1.5) - self._run_alphanumeric_filter(dataset, tgt_list, op) + result = DataJuicerTestCaseBase.run_single_op(dataset, op) + self.assertEqual(result, tgt_list) if __name__ == '__main__': From 4e96d29a6da886a02d80fa98353285bdda421953 Mon Sep 17 00:00:00 2001 From: panxuchen Date: Thu, 25 Apr 2024 19:15:54 +0800 Subject: [PATCH 15/27] use test tag --- .github/workflows/docker/docker-compose.yml | 11 +++- .github/workflows/unit-test.yml | 57 ------------------- .../{unittest-gpu.yml => unittest.yml} | 16 +++++- data_juicer/utils/unittest_utils.py | 8 +++ tests/ops/filter/test_alphanumeric_filter.py | 16 +----- tests/run.py | 36 +++++------- 6 files changed, 46 insertions(+), 98 deletions(-) delete mode 100644 .github/workflows/unit-test.yml rename .github/workflows/{unittest-gpu.yml => unittest.yml} (70%) diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml index 2089e6798..73b577937 100644 --- a/.github/workflows/docker/docker-compose.yml +++ b/.github/workflows/docker/docker-compose.yml @@ -1,11 +1,14 @@ version: '3' services: ray-head: - image: data-juicer-unittest:v0.2.0 + image: data-juicer-unittest:0.2.0 pull_policy: never command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block environment: - HF_HOME=/data/huggingface + - TORCH_HOME=/data/torch + - NLTK_DATA=/data/nltk + - DATA_JUICER_CACHE_HOME=/data/dj - RAY_ADDRESS=auto working_dir: /workspace networks: @@ -26,11 +29,14 @@ services: capabilities: [gpu] ray-worker: - image: data-juicer-unittest:v0.2.0 + image: data-juicer-unittest:0.2.0 pull_policy: never command: ray start --address=ray-head:6379 --block environment: - HF_HOME=/data/huggingface + - TORCH_HOME=/data/torch + - NLTK_DATA=/data/nltk + - DATA_JUICER_CACHE_HOME=/data/dj working_dir: /workspace volumes: - huggingface_cache:/data @@ -54,3 +60,4 @@ networks: volumes: huggingface_cache: + external: true diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml deleted file mode 100644 index 3eb21202d..000000000 --- a/.github/workflows/unit-test.yml +++ /dev/null @@ -1,57 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - -name: Unit Test - -on: [push, pull_request, workflow_dispatch] - -permissions: - contents: read - -jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Check disk space - run: | - df -h - - name: Set up Python 3.8 - uses: actions/setup-python@v3 - with: - python-version: "3.8" - - name: Check disk space - run: | - df -h - - name: Install dependencies - run: | - sudo apt-get install ffmpeg - python -m pip install --upgrade pip - pip install -v -e .[all] - - name: Increase swapfile - run: | - df -h - free -h - sudo swapoff -a - sudo fallocate -l 12G /mnt/swapfile - sudo chmod 600 /mnt/swapfile - sudo mkswap /mnt/swapfile - sudo swapon /mnt/swapfile - sudo swapon --show - - name: Clean data-juicer assets and models after cached - uses: webiny/action-post-run@3.1.0 - with: - run: rm -rf ~/.cache/data_juicer - - name: Cache data-juicer assets and models - uses: actions/cache@v3 - with: - path: ~/.cache/data_juicer - key: dj-assets-models - - name: Check disk space - run: | - df -h - - name: Run the test - run: | - python tests/run.py diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest.yml similarity index 70% rename from .github/workflows/unittest-gpu.yml rename to .github/workflows/unittest.yml index b01167aa7..052c58a95 100644 --- a/.github/workflows/unittest-gpu.yml +++ b/.github/workflows/unittest.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a single version of Python # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python -name: unittest-gpu +name: unittest on: workflow_dispatch: @@ -14,8 +14,8 @@ permissions: contents: read jobs: - unittest: - runs-on: [self-hosted, linux, GPU] + unittest-single: + runs-on: [self-hosted, linux] environment: Testing steps: - uses: actions/checkout@v3 @@ -32,6 +32,16 @@ jobs: run: | docker compose exec ray-head pip install -e .\[all\] + - name: Clean dataset cache + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head rm -rf /data/huggingface/dataset + + - name: Run unittest standalone + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head python tests/run.py --tag standalone-gpu + - name: Remove docker compose working-directory: dj-${{ github.run_id }}/.github/workflows/docker if: always() diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 0a585073f..2394a834e 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -11,6 +11,14 @@ SKIPPED_TESTS = Registry('SkippedTests') +def TEST_TAG(*tags): + """Tags for test case. + Currently, `standalone`, `ray`, `standalone-gpu`, `ray-gpu` are supported. + """ + def decorator(func): + setattr(func, "tags", tags) + return func + return decorator class DataJuicerTestCaseBase(unittest.TestCase): diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py index 41d282f75..ffe635258 100644 --- a/tests/ops/filter/test_alphanumeric_filter.py +++ b/tests/ops/filter/test_alphanumeric_filter.py @@ -4,24 +4,12 @@ from data_juicer.ops.filter.alphanumeric_filter import AlphanumericFilter from data_juicer.utils.constant import Fields -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG class AlphanumericFilterTest(DataJuicerTestCaseBase): - def _run_alphanumeric_filter(self, dataset: Dataset, target_list, op): - if Fields.stats not in dataset.features: - # TODO: - # this is a temp solution, - # only add stats when calling filter op - dataset = dataset.add_column(name=Fields.stats, - column=[{}] * dataset.num_rows) - dataset = dataset.map(op.compute_stats) - dataset = dataset.filter(op.process) - dataset = dataset.select_columns(column_names=['text']) - res_list = dataset.to_list() - self.assertEqual(res_list, target_list) - + @TEST_TAG("standalone-gpu") def test_case(self): ds_list = [{ diff --git a/tests/run.py b/tests/run.py index 8ff91e459..613c84693 100644 --- a/tests/run.py +++ b/tests/run.py @@ -19,6 +19,9 @@ sys.path.append(file_dir) parser = argparse.ArgumentParser('test runner') +parser.add_argument('--tag', choices=["standalone", "standalone-gpu", "ray", "ray-gpu"], + default="standalone", + help="the tag of tests being run") parser.add_argument('--list_tests', action='store_true', help='list all tests') parser.add_argument('--pattern', default='test_*.py', help='test file pattern') parser.add_argument('--test_dir', @@ -26,9 +29,8 @@ help='directory to be tested') args = parser.parse_args() - -def gather_test_cases(test_dir, pattern, list_tests): - test_suite = unittest.TestSuite() +def gather_test_cases(test_dir, pattern, list_tests, tag): + test_to_run = unittest.TestSuite() discover = unittest.defaultTestLoader.discover(test_dir, pattern=pattern, top_level_dir=None) @@ -36,32 +38,22 @@ def gather_test_cases(test_dir, pattern, list_tests): f'{SKIPPED_TESTS.modules}') for suite_discovered in discover: - for test_case in suite_discovered: - logger.info(f'Prepare for test [{test_case}]') - # filter out those tests that need to be skipped - filtered_test_suite = unittest.TestSuite() - for tc in test_case: - if type(tc) in SKIPPED_TESTS.modules.values(): + for test_suite in suite_discovered: + logger.info(f'Prepare for test suite [{test_suite}]') + for test_case in test_suite: + if type(test_case) in SKIPPED_TESTS.modules.values(): continue - filtered_test_suite.addTest(tc) - if filtered_test_suite.countTestCases() == 0: - continue - - test_suite.addTest(test_case) - if hasattr(test_case, '__iter__'): - for subcase in test_case: + if tag in getattr(test_case, "tags", ["standalone"]): + test_to_run.addTest(test_case) if list_tests: - print(subcase) - else: - if list_tests: - print(test_case) - return test_suite + logger.info(f"add test case [{test_case}]") + return test_to_run def main(): runner = unittest.TextTestRunner() test_suite = gather_test_cases(os.path.abspath(args.test_dir), - args.pattern, args.list_tests) + args.pattern, args.list_tests, args.tag) if not args.list_tests: res = runner.run(test_suite) if not res.wasSuccessful(): From 85db5d091404d954491c96bef64e97be0ee7667d Mon Sep 17 00:00:00 2001 From: root Date: Thu, 25 Apr 2024 12:14:21 +0000 Subject: [PATCH 16/27] finish tag support --- .github/workflows/unittest.yml | 2 +- data_juicer/utils/unittest_utils.py | 28 ++++++++++-------- tests/ops/filter/test_alphanumeric_filter.py | 3 +- tests/run.py | 31 ++++++++++++++------ 4 files changed, 41 insertions(+), 23 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 052c58a95..ed1a10718 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -40,7 +40,7 @@ jobs: - name: Run unittest standalone working-directory: dj-${{ github.run_id }}/.github/workflows/docker run: | - docker compose exec ray-head python tests/run.py --tag standalone-gpu + docker compose exec ray-head python tests/run.py --tag standalone - name: Remove docker compose working-directory: dj-${{ github.run_id }}/.github/workflows/docker diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 2394a834e..5806fa60b 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -1,25 +1,29 @@ import os import shutil import unittest -from datasets import Dataset + import ray.data as rd +from datasets import Dataset from data_juicer.ops import Filter -from data_juicer.utils.registry import Registry from data_juicer.utils.constant import Fields - +from data_juicer.utils.registry import Registry SKIPPED_TESTS = Registry('SkippedTests') + def TEST_TAG(*tags): """Tags for test case. Currently, `standalone`, `ray`, `standalone-gpu`, `ray-gpu` are supported. """ + def decorator(func): - setattr(func, "tags", tags) + setattr(func, '__test_tags__', tags) return func + return decorator + class DataJuicerTestCaseBase(unittest.TestCase): @classmethod @@ -47,30 +51,30 @@ def tearDownClass(cls, hf_model_name=None) -> None: shutil.rmtree(transformers.TRANSFORMERS_CACHE) @classmethod - def generate_dataset(cls, data, type="hf"): + def generate_dataset(cls, data, type='hf'): """Generate dataset for a specific executor. Args: type (str, optional): `hf` or `ray`. Defaults to "hf". """ - if type == "hf": + if type == 'hf': return Dataset.from_list(data) - elif type == "ray": + elif type == 'ray': return rd.from_items(data) @classmethod - def run_single_op(cls, dataset, op, type="hf"): + def run_single_op(cls, dataset, op, type='hf'): """Run operator in the specific executor.""" - if type == "hf": + if type == 'hf': if isinstance(op, Filter) and Fields.stats not in dataset.features: # TODO: # this is a temp solution, # only add stats when calling filter op dataset = dataset.add_column(name=Fields.stats, - column=[{}] * dataset.num_rows) + column=[{}] * dataset.num_rows) dataset = dataset.map(op.compute_stats) dataset = dataset.filter(op.process) dataset = dataset.select_columns(column_names=['text']) return dataset.to_list() - elif type == "ray": - pass \ No newline at end of file + elif type == 'ray': + pass diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py index ffe635258..1ef5ebe2a 100644 --- a/tests/ops/filter/test_alphanumeric_filter.py +++ b/tests/ops/filter/test_alphanumeric_filter.py @@ -9,7 +9,7 @@ class AlphanumericFilterTest(DataJuicerTestCaseBase): - @TEST_TAG("standalone-gpu") + @TEST_TAG("single") def test_case(self): ds_list = [{ @@ -43,6 +43,7 @@ def test_case(self): result = DataJuicerTestCaseBase.run_single_op(dataset, op) self.assertEqual(result, tgt_list) + @TEST_TAG("single") def test_token_case(self): ds_list = [{ diff --git a/tests/run.py b/tests/run.py index 613c84693..24402fa55 100644 --- a/tests/run.py +++ b/tests/run.py @@ -29,24 +29,37 @@ help='directory to be tested') args = parser.parse_args() + +class TaggedTestLoader(unittest.TestLoader): + def __init__(self, tag=None): + super().__init__() + self.tag = tag + + def loadTestsFromTestCase(self, testCaseClass): + + test_names = self.getTestCaseNames(testCaseClass) + loaded_suite = self.suiteClass() + for test_name in test_names: + test_case = testCaseClass(test_name) + test_method = getattr(test_case, test_name) + if hasattr(test_method, '__test_tags__') and self.tag in test_method.__test_tags__: + loaded_suite.addTest(test_case) + return loaded_suite + def gather_test_cases(test_dir, pattern, list_tests, tag): test_to_run = unittest.TestSuite() - discover = unittest.defaultTestLoader.discover(test_dir, - pattern=pattern, - top_level_dir=None) + test_loader = TaggedTestLoader(tag) + discover = test_loader.discover(test_dir, pattern=pattern, top_level_dir=None) print(f'These tests will be skipped due to some reasons: ' f'{SKIPPED_TESTS.modules}') for suite_discovered in discover: - for test_suite in suite_discovered: - logger.info(f'Prepare for test suite [{test_suite}]') for test_case in test_suite: if type(test_case) in SKIPPED_TESTS.modules.values(): continue - if tag in getattr(test_case, "tags", ["standalone"]): - test_to_run.addTest(test_case) - if list_tests: - logger.info(f"add test case [{test_case}]") + if list_tests: + logger.info(f'Add test case [{str(test_case)}]') + test_to_run.addTest(test_case) return test_to_run From ef4ab1d4433e0c2c5a745fa9ccfca1bf6132051d Mon Sep 17 00:00:00 2001 From: root Date: Fri, 26 Apr 2024 04:26:44 +0000 Subject: [PATCH 17/27] support run op with different executro --- data_juicer/utils/unittest_utils.py | 21 ++++++++++---------- tests/ops/filter/test_alphanumeric_filter.py | 6 +++--- tests/run.py | 19 +++++++++--------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 5806fa60b..d545dbac9 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -51,30 +51,31 @@ def tearDownClass(cls, hf_model_name=None) -> None: shutil.rmtree(transformers.TRANSFORMERS_CACHE) @classmethod - def generate_dataset(cls, data, type='hf'): + def generate_dataset(cls, data, type='standalone'): """Generate dataset for a specific executor. Args: type (str, optional): `hf` or `ray`. Defaults to "hf". """ - if type == 'hf': + if type.startswith('standalone'): return Dataset.from_list(data) - elif type == 'ray': + elif type.startswith('ray'): return rd.from_items(data) + else: + raise ValueError("Unsupported type") @classmethod - def run_single_op(cls, dataset, op, type='hf'): + def run_single_op(cls, dataset, op, type='standalone'): """Run operator in the specific executor.""" - if type == 'hf': + if type.startswith('standalone'): if isinstance(op, Filter) and Fields.stats not in dataset.features: - # TODO: - # this is a temp solution, - # only add stats when calling filter op dataset = dataset.add_column(name=Fields.stats, column=[{}] * dataset.num_rows) dataset = dataset.map(op.compute_stats) dataset = dataset.filter(op.process) dataset = dataset.select_columns(column_names=['text']) return dataset.to_list() - elif type == 'ray': - pass + elif type.startswith('ray'): + raise ValueError("Unsupported type") + else: + raise ValueError("Unsupported type") \ No newline at end of file diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py index 1ef5ebe2a..c0a64ffa7 100644 --- a/tests/ops/filter/test_alphanumeric_filter.py +++ b/tests/ops/filter/test_alphanumeric_filter.py @@ -9,7 +9,7 @@ class AlphanumericFilterTest(DataJuicerTestCaseBase): - @TEST_TAG("single") + @TEST_TAG("standalone") def test_case(self): ds_list = [{ @@ -40,10 +40,10 @@ def test_case(self): }] dataset = DataJuicerTestCaseBase.generate_dataset(ds_list) op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9) - result = DataJuicerTestCaseBase.run_single_op(dataset, op) + result = DataJuicerTestCaseBase.run_single_op(dataset, op, AlphanumericFilterTest.current_tag,) self.assertEqual(result, tgt_list) - @TEST_TAG("single") + @TEST_TAG("standalone") def test_token_case(self): ds_list = [{ diff --git a/tests/run.py b/tests/run.py index 24402fa55..a2fbeeb1c 100644 --- a/tests/run.py +++ b/tests/run.py @@ -22,7 +22,6 @@ parser.add_argument('--tag', choices=["standalone", "standalone-gpu", "ray", "ray-gpu"], default="standalone", help="the tag of tests being run") -parser.add_argument('--list_tests', action='store_true', help='list all tests') parser.add_argument('--pattern', default='test_*.py', help='test file pattern') parser.add_argument('--test_dir', default='tests', @@ -36,7 +35,8 @@ def __init__(self, tag=None): self.tag = tag def loadTestsFromTestCase(self, testCaseClass): - + # set tag to testcase class + setattr(testCaseClass, 'current_tag', self.tag) test_names = self.getTestCaseNames(testCaseClass) loaded_suite = self.suiteClass() for test_name in test_names: @@ -46,7 +46,7 @@ def loadTestsFromTestCase(self, testCaseClass): loaded_suite.addTest(test_case) return loaded_suite -def gather_test_cases(test_dir, pattern, list_tests, tag): +def gather_test_cases(test_dir, pattern, tag): test_to_run = unittest.TestSuite() test_loader = TaggedTestLoader(tag) discover = test_loader.discover(test_dir, pattern=pattern, top_level_dir=None) @@ -57,8 +57,8 @@ def gather_test_cases(test_dir, pattern, list_tests, tag): for test_case in test_suite: if type(test_case) in SKIPPED_TESTS.modules.values(): continue - if list_tests: - logger.info(f'Add test case [{str(test_case)}]') + logger.info(f'Add test case [{test_case._testMethodName}]' + f' from {test_case.__class__.__name__}') test_to_run.addTest(test_case) return test_to_run @@ -66,11 +66,10 @@ def gather_test_cases(test_dir, pattern, list_tests, tag): def main(): runner = unittest.TextTestRunner() test_suite = gather_test_cases(os.path.abspath(args.test_dir), - args.pattern, args.list_tests, args.tag) - if not args.list_tests: - res = runner.run(test_suite) - if not res.wasSuccessful(): - exit(1) + args.pattern, args.tag) + res = runner.run(test_suite) + if not res.wasSuccessful(): + exit(1) if __name__ == '__main__': From 2a698ca57c2c5f9e8f567af02ecde32d2dd8ed93 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 26 Apr 2024 04:28:31 +0000 Subject: [PATCH 18/27] fix pre-commit --- data_juicer/utils/unittest_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index d545dbac9..1465e9c6c 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -62,7 +62,7 @@ def generate_dataset(cls, data, type='standalone'): elif type.startswith('ray'): return rd.from_items(data) else: - raise ValueError("Unsupported type") + raise ValueError('Unsupported type') @classmethod def run_single_op(cls, dataset, op, type='standalone'): @@ -76,6 +76,6 @@ def run_single_op(cls, dataset, op, type='standalone'): dataset = dataset.select_columns(column_names=['text']) return dataset.to_list() elif type.startswith('ray'): - raise ValueError("Unsupported type") + raise ValueError('Unsupported type') else: - raise ValueError("Unsupported type") \ No newline at end of file + raise ValueError('Unsupported type') From 319c4504786f29f8cfa9d4b8b2da2d00a9659f34 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 26 Apr 2024 05:16:19 +0000 Subject: [PATCH 19/27] add hf mirror --- .github/workflows/docker/docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml index 73b577937..38dd63f1e 100644 --- a/.github/workflows/docker/docker-compose.yml +++ b/.github/workflows/docker/docker-compose.yml @@ -6,6 +6,7 @@ services: command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block environment: - HF_HOME=/data/huggingface + - HF_ENDPOINT="https://hf-mirror.com" - TORCH_HOME=/data/torch - NLTK_DATA=/data/nltk - DATA_JUICER_CACHE_HOME=/data/dj @@ -34,6 +35,7 @@ services: command: ray start --address=ray-head:6379 --block environment: - HF_HOME=/data/huggingface + - HF_ENDPOINT="https://hf-mirror.com" - TORCH_HOME=/data/torch - NLTK_DATA=/data/nltk - DATA_JUICER_CACHE_HOME=/data/dj From de318af64d2eb2d0984bcd5c01db906cc6c1fc66 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 26 Apr 2024 05:18:05 +0000 Subject: [PATCH 20/27] add hf mirror --- .github/workflows/docker/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml index 38dd63f1e..fb2aa7a3e 100644 --- a/.github/workflows/docker/docker-compose.yml +++ b/.github/workflows/docker/docker-compose.yml @@ -6,7 +6,7 @@ services: command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block environment: - HF_HOME=/data/huggingface - - HF_ENDPOINT="https://hf-mirror.com" + - HF_ENDPOINT=https://hf-mirror.com - TORCH_HOME=/data/torch - NLTK_DATA=/data/nltk - DATA_JUICER_CACHE_HOME=/data/dj @@ -35,7 +35,7 @@ services: command: ray start --address=ray-head:6379 --block environment: - HF_HOME=/data/huggingface - - HF_ENDPOINT="https://hf-mirror.com" + - HF_ENDPOINT=https://hf-mirror.com - TORCH_HOME=/data/torch - NLTK_DATA=/data/nltk - DATA_JUICER_CACHE_HOME=/data/dj From 6863cd004bf79a8ae0822d4b5f655c3fb4f644ba Mon Sep 17 00:00:00 2001 From: root Date: Fri, 26 Apr 2024 05:21:39 +0000 Subject: [PATCH 21/27] run all test in standalone mode by default --- tests/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/run.py b/tests/run.py index a2fbeeb1c..080404b8a 100644 --- a/tests/run.py +++ b/tests/run.py @@ -42,7 +42,7 @@ def loadTestsFromTestCase(self, testCaseClass): for test_name in test_names: test_case = testCaseClass(test_name) test_method = getattr(test_case, test_name) - if hasattr(test_method, '__test_tags__') and self.tag in test_method.__test_tags__: + if self.tag in getattr(test_method, '__test_tags__', ["standalone"]): loaded_suite.addTest(test_case) return loaded_suite From 1a251c9a58516b1bbb54e69ec8ccfd9577b09224 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 26 Apr 2024 06:02:09 +0000 Subject: [PATCH 22/27] ignore image face ratio --- tests/ops/filter/test_image_face_ratio_filter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ops/filter/test_image_face_ratio_filter.py b/tests/ops/filter/test_image_face_ratio_filter.py index 2a2327b8f..2c6cf5b40 100644 --- a/tests/ops/filter/test_image_face_ratio_filter.py +++ b/tests/ops/filter/test_image_face_ratio_filter.py @@ -5,9 +5,10 @@ from data_juicer.ops.filter.image_face_ratio_filter import ImageFaceRatioFilter from data_juicer.utils.constant import Fields -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS +@SKIPPED_TESTS.register_module() class ImageFaceRatioFilterTest(DataJuicerTestCaseBase): maxDiff = None From aef673966257bb5d95068231e2adbfb7501a3b1d Mon Sep 17 00:00:00 2001 From: panxuchen Date: Tue, 7 May 2024 15:43:26 +0800 Subject: [PATCH 23/27] update tags --- data_juicer/utils/unittest_utils.py | 6 ++---- tests/ops/filter/test_alphanumeric_filter.py | 8 ++++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 1465e9c6c..0b8acda58 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -50,7 +50,6 @@ def tearDownClass(cls, hf_model_name=None) -> None: print('CLEAN all TRANSFORMERS_CACHE') shutil.rmtree(transformers.TRANSFORMERS_CACHE) - @classmethod def generate_dataset(cls, data, type='standalone'): """Generate dataset for a specific executor. @@ -64,8 +63,7 @@ def generate_dataset(cls, data, type='standalone'): else: raise ValueError('Unsupported type') - @classmethod - def run_single_op(cls, dataset, op, type='standalone'): + def run_single_op(cls, dataset, op, column_names, type='standalone'): """Run operator in the specific executor.""" if type.startswith('standalone'): if isinstance(op, Filter) and Fields.stats not in dataset.features: @@ -73,7 +71,7 @@ def run_single_op(cls, dataset, op, type='standalone'): column=[{}] * dataset.num_rows) dataset = dataset.map(op.compute_stats) dataset = dataset.filter(op.process) - dataset = dataset.select_columns(column_names=['text']) + dataset = dataset.select_columns(column_names=column_names) return dataset.to_list() elif type.startswith('ray'): raise ValueError('Unsupported type') diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py index c0a64ffa7..ec0115942 100644 --- a/tests/ops/filter/test_alphanumeric_filter.py +++ b/tests/ops/filter/test_alphanumeric_filter.py @@ -9,7 +9,7 @@ class AlphanumericFilterTest(DataJuicerTestCaseBase): - @TEST_TAG("standalone") + @TEST_TAG("standalone", "ray") def test_case(self): ds_list = [{ @@ -40,10 +40,10 @@ def test_case(self): }] dataset = DataJuicerTestCaseBase.generate_dataset(ds_list) op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9) - result = DataJuicerTestCaseBase.run_single_op(dataset, op, AlphanumericFilterTest.current_tag,) + result = DataJuicerTestCaseBase.run_single_op(dataset, op, ["text"], self.current_tag) self.assertEqual(result, tgt_list) - @TEST_TAG("standalone") + @TEST_TAG("standalone", "ray") def test_token_case(self): ds_list = [{ @@ -68,7 +68,7 @@ def test_token_case(self): }] dataset = DataJuicerTestCaseBase.generate_dataset(ds_list) op = AlphanumericFilter(tokenization=True, min_ratio=1.5) - result = DataJuicerTestCaseBase.run_single_op(dataset, op) + result = DataJuicerTestCaseBase.run_single_op(dataset, op, ["text"], self.current_tag) self.assertEqual(result, tgt_list) From 6e6409daa3badf757eb5b3b7a7d468eec8c3d806 Mon Sep 17 00:00:00 2001 From: panxuchen Date: Tue, 7 May 2024 20:19:38 +0800 Subject: [PATCH 24/27] add ray testcase --- data_juicer/core/ray_executor.py | 127 +++++++++--------- data_juicer/utils/unittest_utils.py | 49 +++++-- tests/ops/filter/test_alphanumeric_filter.py | 12 +- .../ops/filter/test_audio_duration_filter.py | 47 ++++--- 4 files changed, 142 insertions(+), 93 deletions(-) diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py index d42d72f95..3cf7acbb3 100644 --- a/data_juicer/core/ray_executor.py +++ b/data_juicer/core/ray_executor.py @@ -102,6 +102,70 @@ def get_num_gpus(self, op, op_proc): proc_per_gpu = op_proc / cuda_device_count() return 1.0 / proc_per_gpu + def run_op(self, op, op_cfg, dataset): + op_name, op_args = list(op_cfg.items())[0] + op_cls = OPERATORS.modules[op_name] + op_proc = calculate_np(self.cfg.np, op, op_name) + num_gpus = self.get_num_gpus(op, op_proc) + use_actor = op.use_actor() or num_gpus + try: + if isinstance(op, Mapper): + if op.is_batched_op(): + if use_actor: + dataset = dataset.map_batches( + op_cls, + compute=ActorPoolStrategy(), + concurrency=op_proc, + fn_constructor_kwargs=op_args, + batch_format='pyarrow', + num_gpus=num_gpus, + batch_size=1) + # The batch size here is same as in data.py + else: + dataset = dataset.map_batches( + partial(ray_batch_mapper_wrapper, + fn=op.process), + batch_format='pyarrow', + num_gpus=num_gpus, + batch_size=1) + # The batch size here is same as in data.py + else: + if use_actor: + dataset = dataset.map( + op_cls, + compute=ActorPoolStrategy(), + concurrency=op_proc, + fn_constructor_kwargs=op_args, + num_gpus=num_gpus) + else: + dataset = dataset.map(op.process, + num_gpus=num_gpus) + + elif isinstance(op, Filter): + if use_actor: + dataset = dataset.map(op_cls, + compute=ActorPoolStrategy(), + concurrency=op_proc, + fn_constructor_kwargs=op_args, + num_gpus=num_gpus) + else: + dataset = dataset.map(op.compute_stats, + num_gpus=num_gpus) + if op.stats_export_path is not None: + dataset.write_json(op.stats_export_path, + force_ascii=False) + dataset = dataset.filter(op.process) + else: + logger.error( + 'Ray executor only support Filter and Mapper OPs for ' + 'now') + raise NotImplementedError + except: # noqa: E722 + logger.error(f'An error occurred during Op [{op_name}].') + import traceback + traceback.print_exc() + exit(1) + def run(self, load_data_np=None): """ Running the dataset process pipeline. @@ -140,68 +204,7 @@ def process_batch_arrow(table: pa.Table) -> pa.Table: logger.info('Processing data...') tstart = time.time() for op_cfg, op in zip(self.process_list, self.ops): - op_name, op_args = list(op_cfg.items())[0] - op_cls = OPERATORS.modules[op_name] - op_proc = calculate_np(self.cfg.np, op, op_name) - num_gpus = self.get_num_gpus(op, op_proc) - use_actor = op.use_actor() or num_gpus - try: - if isinstance(op, Mapper): - if op.is_batched_op(): - if use_actor: - dataset = dataset.map_batches( - op_cls, - compute=ActorPoolStrategy(), - concurrency=op_proc, - fn_constructor_kwargs=op_args, - batch_format='pyarrow', - num_gpus=num_gpus, - batch_size=1) - # The batch size here is same as in data.py - else: - dataset = dataset.map_batches( - partial(ray_batch_mapper_wrapper, - fn=op.process), - batch_format='pyarrow', - num_gpus=num_gpus, - batch_size=1) - # The batch size here is same as in data.py - else: - if use_actor: - dataset = dataset.map( - op_cls, - compute=ActorPoolStrategy(), - concurrency=op_proc, - fn_constructor_kwargs=op_args, - num_gpus=num_gpus) - else: - dataset = dataset.map(op.process, - num_gpus=num_gpus) - - elif isinstance(op, Filter): - if use_actor: - dataset = dataset.map(op_cls, - compute=ActorPoolStrategy(), - concurrency=op_proc, - fn_constructor_kwargs=op_args, - num_gpus=num_gpus) - else: - dataset = dataset.map(op.compute_stats, - num_gpus=num_gpus) - if op.stats_export_path is not None: - dataset.write_json(op.stats_export_path, - force_ascii=False) - dataset = dataset.filter(op.process) - else: - logger.error( - 'Ray executor only support Filter and Mapper OPs for ' - 'now') - raise NotImplementedError - except: # noqa: E722 - logger.error(f'An error occurred during Op [{op_name}].') - import traceback - traceback.print_exc() - exit(1) + dataset = self.run_op(op, op_cfg, dataset) # 4. data export logger.info('Exporting dataset to disk...') diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 0b8acda58..546a28dc3 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -2,7 +2,9 @@ import shutil import unittest +import numpy import ray.data as rd +import pyarrow as pa from datasets import Dataset from data_juicer.ops import Filter @@ -50,22 +52,32 @@ def tearDownClass(cls, hf_model_name=None) -> None: print('CLEAN all TRANSFORMERS_CACHE') shutil.rmtree(transformers.TRANSFORMERS_CACHE) - def generate_dataset(cls, data, type='standalone'): + def generate_dataset(self, data): """Generate dataset for a specific executor. Args: - type (str, optional): `hf` or `ray`. Defaults to "hf". + type (str, optional): "standalone" or "ray". Defaults to "standalone". """ - if type.startswith('standalone'): + if self.current_tag.startswith('standalone'): return Dataset.from_list(data) - elif type.startswith('ray'): - return rd.from_items(data) + elif self.current_tag.startswith('ray'): + dataset = rd.from_items(data) + if Fields.stats not in dataset.columns(fetch_if_missing=False): + def process_batch_arrow(table: pa.Table) -> pa.Table: + new_column_data = [{} for _ in range(len(table))] + new_talbe = table.append_column(Fields.stats, + [new_column_data]) + return new_talbe + + dataset = dataset.map_batches(process_batch_arrow, + batch_format='pyarrow') + return dataset else: raise ValueError('Unsupported type') - def run_single_op(cls, dataset, op, column_names, type='standalone'): + def run_single_op(self, dataset, op, column_names): """Run operator in the specific executor.""" - if type.startswith('standalone'): + if self.current_tag.startswith('standalone'): if isinstance(op, Filter) and Fields.stats not in dataset.features: dataset = dataset.add_column(name=Fields.stats, column=[{}] * dataset.num_rows) @@ -73,7 +85,26 @@ def run_single_op(cls, dataset, op, column_names, type='standalone'): dataset = dataset.filter(op.process) dataset = dataset.select_columns(column_names=column_names) return dataset.to_list() - elif type.startswith('ray'): - raise ValueError('Unsupported type') + elif self.current_tag.startswith('ray'): + dataset = dataset.map(op.compute_stats) + dataset = dataset.filter(op.process) + dataset = dataset.to_pandas().get(column_names) + if dataset is None: + return [] + return dataset.to_dict(orient="records") else: raise ValueError('Unsupported type') + + def assertDatasetEqual(self, first, second): + def convert_record(rec): + for key in rec.keys(): + # Convert incomparable `list` to comparable `tuple` + if isinstance(rec[key], numpy.ndarray) or isinstance(rec[key], list): + rec[key] = tuple(rec[key]) + return rec + + first = [convert_record(d) for d in first] + second = [convert_record(d) for d in second] + first = sorted(first, key=lambda x: tuple(sorted(x.items()))) + second = sorted(second, key=lambda x: tuple(sorted(x.items()))) + return self.assertEqual(first, second) \ No newline at end of file diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py index ec0115942..594432207 100644 --- a/tests/ops/filter/test_alphanumeric_filter.py +++ b/tests/ops/filter/test_alphanumeric_filter.py @@ -38,10 +38,10 @@ def test_case(self): }, { 'text': 'emoji表情测试下😊,😸31231\n' }] - dataset = DataJuicerTestCaseBase.generate_dataset(ds_list) + dataset = self.generate_dataset(ds_list) op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9) - result = DataJuicerTestCaseBase.run_single_op(dataset, op, ["text"], self.current_tag) - self.assertEqual(result, tgt_list) + result = self.run_single_op(dataset, op, ["text"]) + self.assertDatasetEqual(result, tgt_list) @TEST_TAG("standalone", "ray") def test_token_case(self): @@ -66,10 +66,10 @@ def test_token_case(self): }, { 'text': 'Do you need a cup of coffee?' }] - dataset = DataJuicerTestCaseBase.generate_dataset(ds_list) + dataset = self.generate_dataset(ds_list) op = AlphanumericFilter(tokenization=True, min_ratio=1.5) - result = DataJuicerTestCaseBase.run_single_op(dataset, op, ["text"], self.current_tag) - self.assertEqual(result, tgt_list) + result = self.run_single_op(dataset, op, ["text"]) + self.assertDatasetEqual(result, tgt_list) if __name__ == '__main__': diff --git a/tests/ops/filter/test_audio_duration_filter.py b/tests/ops/filter/test_audio_duration_filter.py index 91a39bfd8..f7363969d 100644 --- a/tests/ops/filter/test_audio_duration_filter.py +++ b/tests/ops/filter/test_audio_duration_filter.py @@ -5,7 +5,7 @@ from data_juicer.ops.filter.audio_duration_filter import AudioDurationFilter from data_juicer.utils.constant import Fields -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG class AudioDurationFilterTest(DataJuicerTestCaseBase): @@ -30,6 +30,7 @@ def _run_audio_duration_filter(self, res_list = dataset.to_list() self.assertEqual(res_list, target_list) + @TEST_TAG("standalone", "ray") def test_default_filter(self): ds_list = [{ @@ -46,10 +47,13 @@ def test_default_filter(self): }, { 'audios': [self.aud3_path] }] - dataset = Dataset.from_list(ds_list) + dataset = self.generate_dataset(ds_list) op = AudioDurationFilter() - self._run_audio_duration_filter(dataset, tgt_list, op) + result = self.run_single_op(dataset, op, [op.audio_key]) + self.assertDatasetEqual(result, tgt_list) + + @TEST_TAG("standalone", "ray") def test_filter_long_audios(self): ds_list = [{ @@ -60,10 +64,12 @@ def test_filter_long_audios(self): 'audios': [self.aud3_path] }] tgt_list = [{'audios': [self.aud1_path]}] - dataset = Dataset.from_list(ds_list) + dataset = self.generate_dataset(ds_list) op = AudioDurationFilter(max_duration=10) - self._run_audio_duration_filter(dataset, tgt_list, op) + result = self.run_single_op(dataset, op, [op.audio_key]) + self.assertDatasetEqual(result, tgt_list) + @TEST_TAG("standalone", "ray") def test_filter_short_audios(self): ds_list = [{ @@ -74,10 +80,12 @@ def test_filter_short_audios(self): 'audios': [self.aud3_path] }] tgt_list = [{'audios': [self.aud3_path]}] - dataset = Dataset.from_list(ds_list) + dataset = self.generate_dataset(ds_list) op = AudioDurationFilter(min_duration=60) - self._run_audio_duration_filter(dataset, tgt_list, op) + result = self.run_single_op(dataset, op, [op.audio_key]) + self.assertDatasetEqual(result, tgt_list) + @TEST_TAG("standalone", "ray") def test_filter_audios_within_range(self): ds_list = [{ @@ -88,12 +96,13 @@ def test_filter_audios_within_range(self): 'audios': [self.aud3_path] }] tgt_list = [{'audios': [self.aud2_path]}] - dataset = Dataset.from_list(ds_list) + dataset = self.generate_dataset(ds_list) op = AudioDurationFilter(min_duration=10, max_duration=20) - self._run_audio_duration_filter(dataset, tgt_list, op) + result = self.run_single_op(dataset, op, [op.audio_key]) + self.assertDatasetEqual(result, tgt_list) + @TEST_TAG("standalone", "ray") def test_any(self): - ds_list = [{ 'audios': [self.aud1_path, self.aud2_path] }, { @@ -106,12 +115,15 @@ def test_any(self): }, { 'audios': [self.aud2_path, self.aud3_path] }] - dataset = Dataset.from_list(ds_list) + dataset = self.generate_dataset(ds_list) op = AudioDurationFilter(min_duration=10, max_duration=20, any_or_all='any') - self._run_audio_duration_filter(dataset, tgt_list, op) + result = self.run_single_op(dataset, op, [op.audio_key]) + print(result) + self.assertDatasetEqual(result, tgt_list) + @TEST_TAG("standalone", "ray") def test_all(self): ds_list = [{ @@ -122,12 +134,14 @@ def test_all(self): 'audios': [self.aud1_path, self.aud3_path] }] tgt_list = [] - dataset = Dataset.from_list(ds_list) + dataset = self.generate_dataset(ds_list) op = AudioDurationFilter(min_duration=10, max_duration=20, any_or_all='all') - self._run_audio_duration_filter(dataset, tgt_list, op) + result = self.run_single_op(dataset, op, [op.audio_key]) + self.assertDatasetEqual(result, tgt_list) + @TEST_TAG("standalone", "ray") def test_filter_in_parallel(self): ds_list = [{ @@ -138,9 +152,10 @@ def test_filter_in_parallel(self): 'audios': [self.aud3_path] }] tgt_list = [{'audios': [self.aud2_path]}] - dataset = Dataset.from_list(ds_list) + dataset = self.generate_dataset(ds_list) op = AudioDurationFilter(min_duration=10, max_duration=20) - self._run_audio_duration_filter(dataset, tgt_list, op, np=2) + result = self.run_single_op(dataset, op, [op.audio_key]) + self.assertDatasetEqual(result, tgt_list) if __name__ == '__main__': From a2091d89c06f24027c68683ebc27bbf7fd7ccef7 Mon Sep 17 00:00:00 2001 From: panxuchen Date: Tue, 7 May 2024 20:23:58 +0800 Subject: [PATCH 25/27] add ray test in workflow --- .github/workflows/unittest.yml | 5 ++++ data_juicer/core/ray_executor.py | 39 +++++++++++++---------------- data_juicer/utils/unittest_utils.py | 18 +++++++------ tests/run.py | 4 +-- 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index ed1a10718..3ea19b3bb 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -42,6 +42,11 @@ jobs: run: | docker compose exec ray-head python tests/run.py --tag standalone + - name: Run unittest ray + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head python tests/run.py --tag ray + - name: Remove docker compose working-directory: dj-${{ github.run_id }}/.github/workflows/docker if: always() diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py index 3cf7acbb3..1c066ea02 100644 --- a/data_juicer/core/ray_executor.py +++ b/data_juicer/core/ray_executor.py @@ -122,38 +122,33 @@ def run_op(self, op, op_cfg, dataset): batch_size=1) # The batch size here is same as in data.py else: - dataset = dataset.map_batches( - partial(ray_batch_mapper_wrapper, - fn=op.process), - batch_format='pyarrow', - num_gpus=num_gpus, - batch_size=1) + dataset = dataset.map_batches(partial( + ray_batch_mapper_wrapper, fn=op.process), + batch_format='pyarrow', + num_gpus=num_gpus, + batch_size=1) # The batch size here is same as in data.py else: if use_actor: - dataset = dataset.map( - op_cls, - compute=ActorPoolStrategy(), - concurrency=op_proc, - fn_constructor_kwargs=op_args, - num_gpus=num_gpus) + dataset = dataset.map(op_cls, + compute=ActorPoolStrategy(), + concurrency=op_proc, + fn_constructor_kwargs=op_args, + num_gpus=num_gpus) else: - dataset = dataset.map(op.process, - num_gpus=num_gpus) + dataset = dataset.map(op.process, num_gpus=num_gpus) elif isinstance(op, Filter): if use_actor: dataset = dataset.map(op_cls, - compute=ActorPoolStrategy(), - concurrency=op_proc, - fn_constructor_kwargs=op_args, - num_gpus=num_gpus) + compute=ActorPoolStrategy(), + concurrency=op_proc, + fn_constructor_kwargs=op_args, + num_gpus=num_gpus) else: - dataset = dataset.map(op.compute_stats, - num_gpus=num_gpus) + dataset = dataset.map(op.compute_stats, num_gpus=num_gpus) if op.stats_export_path is not None: - dataset.write_json(op.stats_export_path, - force_ascii=False) + dataset.write_json(op.stats_export_path, force_ascii=False) dataset = dataset.filter(op.process) else: logger.error( diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 546a28dc3..d7ee7d87d 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -3,8 +3,8 @@ import unittest import numpy -import ray.data as rd import pyarrow as pa +import ray.data as rd from datasets import Dataset from data_juicer.ops import Filter @@ -16,7 +16,7 @@ def TEST_TAG(*tags): """Tags for test case. - Currently, `standalone`, `ray`, `standalone-gpu`, `ray-gpu` are supported. + Currently, `standalone`, `ray` are supported. """ def decorator(func): @@ -56,13 +56,15 @@ def generate_dataset(self, data): """Generate dataset for a specific executor. Args: - type (str, optional): "standalone" or "ray". Defaults to "standalone". + type (str, optional): "standalone" or "ray". + Defaults to "standalone". """ if self.current_tag.startswith('standalone'): return Dataset.from_list(data) elif self.current_tag.startswith('ray'): dataset = rd.from_items(data) if Fields.stats not in dataset.columns(fetch_if_missing=False): + def process_batch_arrow(table: pa.Table) -> pa.Table: new_column_data = [{} for _ in range(len(table))] new_talbe = table.append_column(Fields.stats, @@ -70,7 +72,7 @@ def process_batch_arrow(table: pa.Table) -> pa.Table: return new_talbe dataset = dataset.map_batches(process_batch_arrow, - batch_format='pyarrow') + batch_format='pyarrow') return dataset else: raise ValueError('Unsupported type') @@ -91,15 +93,17 @@ def run_single_op(self, dataset, op, column_names): dataset = dataset.to_pandas().get(column_names) if dataset is None: return [] - return dataset.to_dict(orient="records") + return dataset.to_dict(orient='records') else: raise ValueError('Unsupported type') def assertDatasetEqual(self, first, second): + def convert_record(rec): for key in rec.keys(): # Convert incomparable `list` to comparable `tuple` - if isinstance(rec[key], numpy.ndarray) or isinstance(rec[key], list): + if isinstance(rec[key], numpy.ndarray) or isinstance( + rec[key], list): rec[key] = tuple(rec[key]) return rec @@ -107,4 +111,4 @@ def convert_record(rec): second = [convert_record(d) for d in second] first = sorted(first, key=lambda x: tuple(sorted(x.items()))) second = sorted(second, key=lambda x: tuple(sorted(x.items()))) - return self.assertEqual(first, second) \ No newline at end of file + return self.assertEqual(first, second) diff --git a/tests/run.py b/tests/run.py index 080404b8a..81028ee01 100644 --- a/tests/run.py +++ b/tests/run.py @@ -19,7 +19,7 @@ sys.path.append(file_dir) parser = argparse.ArgumentParser('test runner') -parser.add_argument('--tag', choices=["standalone", "standalone-gpu", "ray", "ray-gpu"], +parser.add_argument('--tag', choices=["standalone", "ray"], default="standalone", help="the tag of tests being run") parser.add_argument('--pattern', default='test_*.py', help='test file pattern') @@ -30,7 +30,7 @@ class TaggedTestLoader(unittest.TestLoader): - def __init__(self, tag=None): + def __init__(self, tag="standalone"): super().__init__() self.tag = tag From be814892f07ec935a3b9044ffae377a6fe2f5800 Mon Sep 17 00:00:00 2001 From: panxuchen Date: Wed, 8 May 2024 09:49:00 +0800 Subject: [PATCH 26/27] update ray unittest workflow --- .github/workflows/unittest.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 3ea19b3bb..28b2e6f3f 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -31,6 +31,7 @@ jobs: working-directory: dj-${{ github.run_id }}/.github/workflows/docker run: | docker compose exec ray-head pip install -e .\[all\] + docker compose exec ray-worker pip install -e .\[all\] - name: Clean dataset cache working-directory: dj-${{ github.run_id }}/.github/workflows/docker From aa68b49f9bfa9cd083d0df48a8a13405fb8b214b Mon Sep 17 00:00:00 2001 From: panxuchen Date: Mon, 17 Jun 2024 14:07:13 +0800 Subject: [PATCH 27/27] delete old unittest --- .github/workflows/unittest.yml | 60 ---------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 .github/workflows/unittest.yml diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml deleted file mode 100644 index 28b2e6f3f..000000000 --- a/.github/workflows/unittest.yml +++ /dev/null @@ -1,60 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - -name: unittest - -on: - workflow_dispatch: - pull_request: - push: - branches: - - main - -permissions: - contents: read - -jobs: - unittest-single: - runs-on: [self-hosted, linux] - environment: Testing - steps: - - uses: actions/checkout@v3 - with: - path: dj-${{ github.run_id }} - - - name: Setup docker compose - working-directory: dj-${{ github.run_id }}/.github/workflows/docker - run: | - docker compose up -d - - - name: Install data-juicer - working-directory: dj-${{ github.run_id }}/.github/workflows/docker - run: | - docker compose exec ray-head pip install -e .\[all\] - docker compose exec ray-worker pip install -e .\[all\] - - - name: Clean dataset cache - working-directory: dj-${{ github.run_id }}/.github/workflows/docker - run: | - docker compose exec ray-head rm -rf /data/huggingface/dataset - - - name: Run unittest standalone - working-directory: dj-${{ github.run_id }}/.github/workflows/docker - run: | - docker compose exec ray-head python tests/run.py --tag standalone - - - name: Run unittest ray - working-directory: dj-${{ github.run_id }}/.github/workflows/docker - run: | - docker compose exec ray-head python tests/run.py --tag ray - - - name: Remove docker compose - working-directory: dj-${{ github.run_id }}/.github/workflows/docker - if: always() - run: | - docker compose down --remove-orphans - - - name: Cleanup workspace - if: always() - run: | - rm -rf dj-${{ github.run_id }}