From 2e4c50d38316858cba0d0a3915af41ebdb96824e Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 03:48:01 +0000
Subject: [PATCH 01/27] add unittest env with gpu

---
 .github/workflows/docker/docker-compose.yml | 56 +++++++++++++++++++++
 .github/workflows/unittest-gpu.yml          | 32 ++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 .github/workflows/docker/docker-compose.yml
 create mode 100644 .github/workflows/unittest-gpu.yml
diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
new file mode 100644
index 000000000..06d2a78a1
--- /dev/null
+++ b/.github/workflows/docker/docker-compose.yml
@@ -0,0 +1,56 @@
+version: '3'
+services:
+  ray-head:
+    image: data-juicer-unittest:v2
+    pull_policy: never
+    command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block
+    environment:
+      - HF_HOME=/data/huggingface
+      - RAY_ADDRESS=auto
+    working_dir: /workspace
+    networks:
+      - ray-network
+    volumes:
+      - huggingface_cache:/data
+      - ../../..:/workspace
+    ports:
+      - "6379:6379"
+      - "8265:8265"
+    shm_size: "32G"
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            device_ids: ['0', '1']
+            capabilities: [gpu]
+
+  ray-worker:
+    image: data-juicer-unittest:v2
+    pull_policy: never
+    command: ray start --address=ray-head:6379 --block
+    environment:
+      - HF_HOME=/data/huggingface
+    working_dir: /workspace
+    volumes:
+      - huggingface_cache:/data
+      - ../../..:/workspace
+    depends_on:
+      - ray-head
+    networks:
+      - ray-network
+    shm_size: "32G"
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            device_ids: ['2', '3']
+            capabilities: [gpu]
+
+networks:
+  ray-network:
+    driver: bridge
+
+volumes:
+  huggingface_cache:
diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
new file mode 100644
index 000000000..6da6989d3
--- /dev/null
+++ b/.github/workflows/unittest-gpu.yml
@@ -0,0 +1,32 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: unittest-gpu
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+
+permissions:
+  contents: read
+
+jobs:
+  unittest:
+
+    runs-on: [self-hosted, linux, GPU]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Setup docker compose
+      working-directory: .github/workflows/docker
+      run: |
+        docker compose up -d
+    - name: Install data-juicer
+      run: |
+        docker compose exec ray-head cd workspace && pip install -v -e .[all]
+      
+    - name: Remove docker compose
+      working-directory: .github/workflows/docker
+      run: |
+        docker compose down --remove-orphans
\ No newline at end of file

From 3edd333d8c55fb10cb8022b2bc3e6aedb86d7f3b Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 04:12:59 +0000
Subject: [PATCH 02/27] fix unittest yml

---
 .github/workflows/unittest-gpu.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 6da6989d3..7a198c461 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -18,15 +18,17 @@ jobs:
     runs-on: [self-hosted, linux, GPU]
     steps:
     - uses: actions/checkout@v3
+
     - name: Setup docker compose
       working-directory: .github/workflows/docker
       run: |
         docker compose up -d
+
     - name: Install data-juicer
       run: |
-        docker compose exec ray-head cd workspace && pip install -v -e .[all]
-      
+        docker compose exec ray-head cd /workspace && pip install -v -e .[all]
+
     - name: Remove docker compose
       working-directory: .github/workflows/docker
       run: |
-        docker compose down --remove-orphans
\ No newline at end of file
+        docker compose down --remove-orphans

From aa5950930085c0ddcb7bbe8323abfa4ba3ab9c54 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 04:26:54 +0000
Subject: [PATCH 03/27] add environment for unittest

---
 .github/workflows/unittest-gpu.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 7a198c461..553eb94dd 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -14,8 +14,8 @@ permissions:
 
 jobs:
   unittest:
-
     runs-on: [self-hosted, linux, GPU]
+    environment: testing
     steps:
     - uses: actions/checkout@v3
 

From 5d7c5ddae89f7ebf7c39ba147d8df68776f05611 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 04:28:36 +0000
Subject: [PATCH 04/27] update workflow trigger

---
 .github/workflows/unittest-gpu.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 553eb94dd..4822437fa 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -5,6 +5,7 @@ name: unittest-gpu
 
 on:
   workflow_dispatch:
+  pull_request:
   push:
     branches:
       - main
@@ -15,7 +16,7 @@ permissions:
 jobs:
   unittest:
     runs-on: [self-hosted, linux, GPU]
-    environment: testing
+    environment: Testing
     steps:
     - uses: actions/checkout@v3
 

From b9e9f0ca0928d6f19067b4178b149e3dc144de6c Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 05:39:23 +0000
Subject: [PATCH 05/27] update install step

---
 .github/workflows/unittest-gpu.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 4822437fa..20607a73f 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -27,9 +27,10 @@ jobs:
 
     - name: Install data-juicer
       run: |
-        docker compose exec ray-head cd /workspace && pip install -v -e .[all]
+        docker compose exec ray-head pip install -v -e .[all]
 
     - name: Remove docker compose
       working-directory: .github/workflows/docker
+      if: always()
       run: |
         docker compose down --remove-orphans

From fe7a71a853ba570b5fd91a1bddc71f3b2313100f Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 05:52:40 +0000
Subject: [PATCH 06/27] fix install command

---
 .github/workflows/unittest-gpu.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 20607a73f..50a11e214 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -27,7 +27,7 @@ jobs:
 
     - name: Install data-juicer
       run: |
-        docker compose exec ray-head pip install -v -e .[all]
+        docker compose exec ray-head pip install -v -e .\[all\]
 
     - name: Remove docker compose
       working-directory: .github/workflows/docker

From 06141cab9ff7ba23de6a105df74046c1ae9e0776 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 05:58:06 +0000
Subject: [PATCH 07/27] update working dir

---
 .github/workflows/unittest-gpu.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 50a11e214..10a1360fb 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -26,6 +26,7 @@ jobs:
         docker compose up -d
 
     - name: Install data-juicer
+      working-directory: .github/workflows/docker
       run: |
         docker compose exec ray-head pip install -v -e .\[all\]
 

From c89a92acd952ee25bd77d91503a1bdfd133576f1 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 07:14:41 +0000
Subject: [PATCH 08/27] update container

---
 .github/workflows/docker/docker-compose.yml | 8 ++++----
 .github/workflows/unittest-gpu.yml          | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
index 06d2a78a1..2089e6798 100644
--- a/.github/workflows/docker/docker-compose.yml
+++ b/.github/workflows/docker/docker-compose.yml
@@ -1,7 +1,7 @@
 version: '3'
 services:
   ray-head:
-    image: data-juicer-unittest:v2
+    image: data-juicer-unittest:v0.2.0
     pull_policy: never
     command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block
     environment:
@@ -16,7 +16,7 @@ services:
     ports:
       - "6379:6379"
       - "8265:8265"
-    shm_size: "32G"
+    shm_size: "64G"
     deploy:
       resources:
         reservations:
@@ -26,7 +26,7 @@ services:
             capabilities: [gpu]
 
   ray-worker:
-    image: data-juicer-unittest:v2
+    image: data-juicer-unittest:v0.2.0
     pull_policy: never
     command: ray start --address=ray-head:6379 --block
     environment:
@@ -39,7 +39,7 @@ services:
       - ray-head
     networks:
       - ray-network
-    shm_size: "32G"
+    shm_size: "64G"
     deploy:
       resources:
         reservations:
diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 10a1360fb..cf4633b84 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -28,7 +28,7 @@ jobs:
     - name: Install data-juicer
       working-directory: .github/workflows/docker
       run: |
-        docker compose exec ray-head pip install -v -e .\[all\]
+        docker compose exec ray-head pip install -e .\[all\]
 
     - name: Remove docker compose
       working-directory: .github/workflows/docker

From 1f5f52207376c601851eedb39a77a0fc28c65ff5 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 10:24:46 +0000
Subject: [PATCH 09/27] update working dir

---
 .github/workflows/unittest-gpu.yml | 3 +++
 environments/science_requires.txt  | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index cf4633b84..7fe10c142 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -16,6 +16,9 @@ permissions:
 jobs:
   unittest:
     runs-on: [self-hosted, linux, GPU]
+    defaults:
+      run:
+        working-directory: ./${{ github.event.pull_request.head.ref }}
     environment: Testing
     steps:
     - uses: actions/checkout@v3
diff --git a/environments/science_requires.txt b/environments/science_requires.txt
index 4faa330c8..2748d2ef9 100644
--- a/environments/science_requires.txt
+++ b/environments/science_requires.txt
@@ -1,3 +1,5 @@
+torch
+torchaudio
 easyocr
 fasttext-wheel
 kenlm
@@ -16,8 +18,6 @@ accelerate
 tiktoken
 opencc==1.1.6
 imagededup
-torch
-torchaudio
 dlib
 spacy-pkuseg==0.0.32
 diffusers

From 9142f505991bc1cc24e9087fd72d63cd3c43c847 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 11:25:46 +0000
Subject: [PATCH 10/27] change working directory

---
 .github/workflows/unittest-gpu.yml | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 7fe10c142..e5ff382b8 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -16,12 +16,15 @@ permissions:
 jobs:
   unittest:
     runs-on: [self-hosted, linux, GPU]
-    defaults:
-      run:
-        working-directory: ./${{ github.event.pull_request.head.ref }}
     environment: Testing
     steps:
+    - name: Set up unique working directory
+      run: mkdir ${{ runner.temp }}/dj-${{ github.run_id }}
+      shell: bash
+
     - uses: actions/checkout@v3
+      with:
+        path: ${{ runner.temp }}/dj-${{ github.run_id }}
 
     - name: Setup docker compose
       working-directory: .github/workflows/docker
@@ -38,3 +41,8 @@ jobs:
       if: always()
       run: |
         docker compose down --remove-orphans
+
+    - name: Cleanup workspace
+      if: always()
+      run: |
+        rm -rf ${{ runner.temp }}/dj-${{ github.run_id }}

From 926fbb56d3775baf537281f278965b30156300a9 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 11:42:04 +0000
Subject: [PATCH 11/27] change working directory

---
 .github/workflows/unittest-gpu.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index e5ff382b8..3a785d36b 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -18,10 +18,6 @@ jobs:
     runs-on: [self-hosted, linux, GPU]
     environment: Testing
     steps:
-    - name: Set up unique working directory
-      run: mkdir ${{ runner.temp }}/dj-${{ github.run_id }}
-      shell: bash
-
     - uses: actions/checkout@v3
       with:
         path: ${{ runner.temp }}/dj-${{ github.run_id }}

From 905488fcaf5cd55ad3ae465ec67eb6fde6ddbffc Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 11:49:06 +0000
Subject: [PATCH 12/27] change working directory

---
 .github/workflows/unittest-gpu.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 3a785d36b..3e4411b6b 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -20,7 +20,7 @@ jobs:
     steps:
     - uses: actions/checkout@v3
       with:
-        path: ${{ runner.temp }}/dj-${{ github.run_id }}
+        path: dj-${{ github.run_id }}
 
     - name: Setup docker compose
       working-directory: .github/workflows/docker
@@ -41,4 +41,4 @@ jobs:
     - name: Cleanup workspace
       if: always()
       run: |
-        rm -rf ${{ runner.temp }}/dj-${{ github.run_id }}
+        rm -rf  dj-${{ github.run_id }}

From 74c34650fc6c92e4d5b5ab36cdbebca43d15d364 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Wed, 24 Apr 2024 11:53:14 +0000
Subject: [PATCH 13/27] change working directory

---
 .github/workflows/unittest-gpu.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest-gpu.yml
index 3e4411b6b..b01167aa7 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest-gpu.yml
@@ -23,17 +23,17 @@ jobs:
         path: dj-${{ github.run_id }}
 
     - name: Setup docker compose
-      working-directory: .github/workflows/docker
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
         docker compose up -d
 
     - name: Install data-juicer
-      working-directory: .github/workflows/docker
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
         docker compose exec ray-head pip install -e .\[all\]
 
     - name: Remove docker compose
-      working-directory: .github/workflows/docker
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       if: always()
       run: |
         docker compose down --remove-orphans

From 16ce4a8c41ea92397581c07eb9dd063d3030fd83 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Thu, 25 Apr 2024 04:13:52 +0000
Subject: [PATCH 14/27] change unittest

---
 data_juicer/utils/unittest_utils.py          | 34 ++++++++++++++++++++
 tests/ops/filter/test_alphanumeric_filter.py | 10 +++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
index b9d18dbf1..0a585073f 100644
--- a/data_juicer/utils/unittest_utils.py
+++ b/data_juicer/utils/unittest_utils.py
@@ -1,8 +1,13 @@
 import os
 import shutil
 import unittest
+from datasets import Dataset
+import ray.data as rd
 
+from data_juicer.ops import Filter
 from data_juicer.utils.registry import Registry
+from data_juicer.utils.constant import Fields
+
 
 SKIPPED_TESTS = Registry('SkippedTests')
 
@@ -32,3 +37,32 @@ def tearDownClass(cls, hf_model_name=None) -> None:
             if os.path.exists(transformers.TRANSFORMERS_CACHE):
                 print('CLEAN all TRANSFORMERS_CACHE')
                 shutil.rmtree(transformers.TRANSFORMERS_CACHE)
+
+    @classmethod
+    def generate_dataset(cls, data, type="hf"):
+        """Generate dataset for a specific executor.
+
+        Args:
+            type (str, optional): `hf` or `ray`. Defaults to "hf".
+        """
+        if type == "hf":
+            return Dataset.from_list(data)
+        elif type == "ray":
+            return rd.from_items(data)
+
+    @classmethod
+    def run_single_op(cls, dataset, op, type="hf"):
+        """Run operator in the specific executor."""
+        if type == "hf":
+            if isinstance(op, Filter) and Fields.stats not in dataset.features:
+                # TODO:
+                # this is a temp solution,
+                # only add stats when calling filter op
+                dataset = dataset.add_column(name=Fields.stats,
+                                            column=[{}] * dataset.num_rows)
+            dataset = dataset.map(op.compute_stats)
+            dataset = dataset.filter(op.process)
+            dataset = dataset.select_columns(column_names=['text'])
+            return dataset.to_list()
+        elif type == "ray":
+            pass
\ No newline at end of file
diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py
index efca696c2..41d282f75 100644
--- a/tests/ops/filter/test_alphanumeric_filter.py
+++ b/tests/ops/filter/test_alphanumeric_filter.py
@@ -50,9 +50,10 @@ def test_case(self):
         }, {
             'text': 'emoji表情测试下😊，😸31231\n'
         }]
-        dataset = Dataset.from_list(ds_list)
+        dataset = DataJuicerTestCaseBase.generate_dataset(ds_list)
         op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9)
-        self._run_alphanumeric_filter(dataset, tgt_list, op)
+        result = DataJuicerTestCaseBase.run_single_op(dataset, op)
+        self.assertEqual(result, tgt_list)
 
     def test_token_case(self):
 
@@ -76,9 +77,10 @@ def test_token_case(self):
         }, {
             'text': 'Do you need a cup of coffee?'
         }]
-        dataset = Dataset.from_list(ds_list)
+        dataset = DataJuicerTestCaseBase.generate_dataset(ds_list)
         op = AlphanumericFilter(tokenization=True, min_ratio=1.5)
-        self._run_alphanumeric_filter(dataset, tgt_list, op)
+        result = DataJuicerTestCaseBase.run_single_op(dataset, op)
+        self.assertEqual(result, tgt_list)
 
 
 if __name__ == '__main__':

From 4e96d29a6da886a02d80fa98353285bdda421953 Mon Sep 17 00:00:00 2001
From: panxuchen <panxuchen.pxc@alibaba-inc.com>
Date: Thu, 25 Apr 2024 19:15:54 +0800
Subject: [PATCH 15/27] use test tag

---
 .github/workflows/docker/docker-compose.yml   | 11 +++-
 .github/workflows/unit-test.yml               | 57 -------------------
 .../{unittest-gpu.yml => unittest.yml}        | 16 +++++-
 data_juicer/utils/unittest_utils.py           |  8 +++
 tests/ops/filter/test_alphanumeric_filter.py  | 16 +-----
 tests/run.py                                  | 36 +++++-------
 6 files changed, 46 insertions(+), 98 deletions(-)
 delete mode 100644 .github/workflows/unit-test.yml
 rename .github/workflows/{unittest-gpu.yml => unittest.yml} (70%)

diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
index 2089e6798..73b577937 100644
--- a/.github/workflows/docker/docker-compose.yml
+++ b/.github/workflows/docker/docker-compose.yml
@@ -1,11 +1,14 @@
 version: '3'
 services:
   ray-head:
-    image: data-juicer-unittest:v0.2.0
+    image: data-juicer-unittest:0.2.0
     pull_policy: never
     command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block
     environment:
       - HF_HOME=/data/huggingface
+      - TORCH_HOME=/data/torch
+      - NLTK_DATA=/data/nltk
+      - DATA_JUICER_CACHE_HOME=/data/dj
       - RAY_ADDRESS=auto
     working_dir: /workspace
     networks:
@@ -26,11 +29,14 @@ services:
             capabilities: [gpu]
 
   ray-worker:
-    image: data-juicer-unittest:v0.2.0
+    image: data-juicer-unittest:0.2.0
     pull_policy: never
     command: ray start --address=ray-head:6379 --block
     environment:
       - HF_HOME=/data/huggingface
+      - TORCH_HOME=/data/torch
+      - NLTK_DATA=/data/nltk
+      - DATA_JUICER_CACHE_HOME=/data/dj
     working_dir: /workspace
     volumes:
       - huggingface_cache:/data
@@ -54,3 +60,4 @@ networks:
 
 volumes:
   huggingface_cache:
+    external: true
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
deleted file mode 100644
index 3eb21202d..000000000
--- a/.github/workflows/unit-test.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-
-name: Unit Test
-
-on: [push, pull_request, workflow_dispatch]
-
-permissions:
-  contents: read
-
-jobs:
-  build:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v3
-    - name: Check disk space
-      run: |
-        df -h
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v3
-      with:
-        python-version: "3.8"
-    - name: Check disk space
-      run: |
-        df -h
-    - name: Install dependencies
-      run: |
-        sudo apt-get install ffmpeg
-        python -m pip install --upgrade pip
-        pip install -v -e .[all]
-    - name: Increase swapfile
-      run: |
-        df -h
-        free -h
-        sudo swapoff -a
-        sudo fallocate -l 12G /mnt/swapfile
-        sudo chmod 600 /mnt/swapfile
-        sudo mkswap /mnt/swapfile
-        sudo swapon /mnt/swapfile
-        sudo swapon --show
-    - name: Clean data-juicer assets and models after cached
-      uses: webiny/action-post-run@3.1.0
-      with:
-        run: rm -rf ~/.cache/data_juicer
-    - name: Cache data-juicer assets and models
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/data_juicer
-        key: dj-assets-models
-    - name: Check disk space
-      run: |
-        df -h
-    - name: Run the test
-      run: |
-        python tests/run.py
diff --git a/.github/workflows/unittest-gpu.yml b/.github/workflows/unittest.yml
similarity index 70%
rename from .github/workflows/unittest-gpu.yml
rename to .github/workflows/unittest.yml
index b01167aa7..052c58a95 100644
--- a/.github/workflows/unittest-gpu.yml
+++ b/.github/workflows/unittest.yml
@@ -1,7 +1,7 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 
-name: unittest-gpu
+name: unittest
 
 on:
   workflow_dispatch:
@@ -14,8 +14,8 @@ permissions:
   contents: read
 
 jobs:
-  unittest:
-    runs-on: [self-hosted, linux, GPU]
+  unittest-single:
+    runs-on: [self-hosted, linux]
     environment: Testing
     steps:
     - uses: actions/checkout@v3
@@ -32,6 +32,16 @@ jobs:
       run: |
         docker compose exec ray-head pip install -e .\[all\]
 
+    - name: Clean dataset cache
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head rm -rf /data/huggingface/dataset
+
+    - name: Run unittest standalone
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head python tests/run.py --tag standalone-gpu
+
     - name: Remove docker compose
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       if: always()
diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
index 0a585073f..2394a834e 100644
--- a/data_juicer/utils/unittest_utils.py
+++ b/data_juicer/utils/unittest_utils.py
@@ -11,6 +11,14 @@
 
 SKIPPED_TESTS = Registry('SkippedTests')
 
+def TEST_TAG(*tags):
+    """Tags for test case.
+    Currently, `standalone`, `ray`, `standalone-gpu`, `ray-gpu` are supported.
+    """
+    def decorator(func):
+        setattr(func, "tags", tags)
+        return func
+    return decorator
 
 class DataJuicerTestCaseBase(unittest.TestCase):
 
diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py
index 41d282f75..ffe635258 100644
--- a/tests/ops/filter/test_alphanumeric_filter.py
+++ b/tests/ops/filter/test_alphanumeric_filter.py
@@ -4,24 +4,12 @@
 
 from data_juicer.ops.filter.alphanumeric_filter import AlphanumericFilter
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG
 
 
 class AlphanumericFilterTest(DataJuicerTestCaseBase):
 
-    def _run_alphanumeric_filter(self, dataset: Dataset, target_list, op):
-        if Fields.stats not in dataset.features:
-            # TODO:
-            # this is a temp solution,
-            # only add stats when calling filter op
-            dataset = dataset.add_column(name=Fields.stats,
-                                         column=[{}] * dataset.num_rows)
-        dataset = dataset.map(op.compute_stats)
-        dataset = dataset.filter(op.process)
-        dataset = dataset.select_columns(column_names=['text'])
-        res_list = dataset.to_list()
-        self.assertEqual(res_list, target_list)
-
+    @TEST_TAG("standalone-gpu")
     def test_case(self):
 
         ds_list = [{
diff --git a/tests/run.py b/tests/run.py
index 8ff91e459..613c84693 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -19,6 +19,9 @@
 sys.path.append(file_dir)
 
 parser = argparse.ArgumentParser('test runner')
+parser.add_argument('--tag', choices=["standalone", "standalone-gpu", "ray", "ray-gpu"],
+                    default="standalone",
+                    help="the tag of tests being run")
 parser.add_argument('--list_tests', action='store_true', help='list all tests')
 parser.add_argument('--pattern', default='test_*.py', help='test file pattern')
 parser.add_argument('--test_dir',
@@ -26,9 +29,8 @@
                     help='directory to be tested')
 args = parser.parse_args()
 
-
-def gather_test_cases(test_dir, pattern, list_tests):
-    test_suite = unittest.TestSuite()
+def gather_test_cases(test_dir, pattern, list_tests, tag):
+    test_to_run = unittest.TestSuite()
     discover = unittest.defaultTestLoader.discover(test_dir,
                                                    pattern=pattern,
                                                    top_level_dir=None)
@@ -36,32 +38,22 @@ def gather_test_cases(test_dir, pattern, list_tests):
           f'{SKIPPED_TESTS.modules}')
     for suite_discovered in discover:
 
-        for test_case in suite_discovered:
-            logger.info(f'Prepare for test [{test_case}]')
-            # filter out those tests that need to be skipped
-            filtered_test_suite = unittest.TestSuite()
-            for tc in test_case:
-                if type(tc) in SKIPPED_TESTS.modules.values():
+        for test_suite in suite_discovered:
+            logger.info(f'Prepare for test suite [{test_suite}]')
+            for test_case in test_suite:
+                if type(test_case) in SKIPPED_TESTS.modules.values():
                     continue
-                filtered_test_suite.addTest(tc)
-            if filtered_test_suite.countTestCases() == 0:
-                continue
-
-            test_suite.addTest(test_case)
-            if hasattr(test_case, '__iter__'):
-                for subcase in test_case:
+                if tag in getattr(test_case, "tags", ["standalone"]):
+                    test_to_run.addTest(test_case)
                     if list_tests:
-                        print(subcase)
-            else:
-                if list_tests:
-                    print(test_case)
-    return test_suite
+                        logger.info(f"add test case [{test_case}]")
+    return test_to_run
 
 
 def main():
     runner = unittest.TextTestRunner()
     test_suite = gather_test_cases(os.path.abspath(args.test_dir),
-                                   args.pattern, args.list_tests)
+                                   args.pattern, args.list_tests, args.tag)
     if not args.list_tests:
         res = runner.run(test_suite)
         if not res.wasSuccessful():

From 85db5d091404d954491c96bef64e97be0ee7667d Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Thu, 25 Apr 2024 12:14:21 +0000
Subject: [PATCH 16/27] finish tag support

---
 .github/workflows/unittest.yml               |  2 +-
 data_juicer/utils/unittest_utils.py          | 28 ++++++++++--------
 tests/ops/filter/test_alphanumeric_filter.py |  3 +-
 tests/run.py                                 | 31 ++++++++++++++------
 4 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 052c58a95..ed1a10718 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -40,7 +40,7 @@ jobs:
     - name: Run unittest standalone
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
-        docker compose exec ray-head python tests/run.py --tag standalone-gpu
+        docker compose exec ray-head python tests/run.py --tag standalone
 
     - name: Remove docker compose
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
index 2394a834e..5806fa60b 100644
--- a/data_juicer/utils/unittest_utils.py
+++ b/data_juicer/utils/unittest_utils.py
@@ -1,25 +1,29 @@
 import os
 import shutil
 import unittest
-from datasets import Dataset
+
 import ray.data as rd
+from datasets import Dataset
 
 from data_juicer.ops import Filter
-from data_juicer.utils.registry import Registry
 from data_juicer.utils.constant import Fields
-
+from data_juicer.utils.registry import Registry
 
 SKIPPED_TESTS = Registry('SkippedTests')
 
+
 def TEST_TAG(*tags):
     """Tags for test case.
     Currently, `standalone`, `ray`, `standalone-gpu`, `ray-gpu` are supported.
     """
+
     def decorator(func):
-        setattr(func, "tags", tags)
+        setattr(func, '__test_tags__', tags)
         return func
+
     return decorator
 
+
 class DataJuicerTestCaseBase(unittest.TestCase):
 
     @classmethod
@@ -47,30 +51,30 @@ def tearDownClass(cls, hf_model_name=None) -> None:
                 shutil.rmtree(transformers.TRANSFORMERS_CACHE)
 
     @classmethod
-    def generate_dataset(cls, data, type="hf"):
+    def generate_dataset(cls, data, type='hf'):
         """Generate dataset for a specific executor.
 
         Args:
             type (str, optional): `hf` or `ray`. Defaults to "hf".
         """
-        if type == "hf":
+        if type == 'hf':
             return Dataset.from_list(data)
-        elif type == "ray":
+        elif type == 'ray':
             return rd.from_items(data)
 
     @classmethod
-    def run_single_op(cls, dataset, op, type="hf"):
+    def run_single_op(cls, dataset, op, type='hf'):
         """Run operator in the specific executor."""
-        if type == "hf":
+        if type == 'hf':
             if isinstance(op, Filter) and Fields.stats not in dataset.features:
                 # TODO:
                 # this is a temp solution,
                 # only add stats when calling filter op
                 dataset = dataset.add_column(name=Fields.stats,
-                                            column=[{}] * dataset.num_rows)
+                                             column=[{}] * dataset.num_rows)
             dataset = dataset.map(op.compute_stats)
             dataset = dataset.filter(op.process)
             dataset = dataset.select_columns(column_names=['text'])
             return dataset.to_list()
-        elif type == "ray":
-            pass
\ No newline at end of file
+        elif type == 'ray':
+            pass
diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py
index ffe635258..1ef5ebe2a 100644
--- a/tests/ops/filter/test_alphanumeric_filter.py
+++ b/tests/ops/filter/test_alphanumeric_filter.py
@@ -9,7 +9,7 @@
 
 class AlphanumericFilterTest(DataJuicerTestCaseBase):
 
-    @TEST_TAG("standalone-gpu")
+    @TEST_TAG("single")
     def test_case(self):
 
         ds_list = [{
@@ -43,6 +43,7 @@ def test_case(self):
         result = DataJuicerTestCaseBase.run_single_op(dataset, op)
         self.assertEqual(result, tgt_list)
 
+    @TEST_TAG("single")
     def test_token_case(self):
 
         ds_list = [{
diff --git a/tests/run.py b/tests/run.py
index 613c84693..24402fa55 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -29,24 +29,37 @@
                     help='directory to be tested')
 args = parser.parse_args()
 
+
+class TaggedTestLoader(unittest.TestLoader):
+    def __init__(self, tag=None):
+        super().__init__()
+        self.tag = tag
+    
+    def loadTestsFromTestCase(self, testCaseClass):
+
+        test_names = self.getTestCaseNames(testCaseClass)
+        loaded_suite = self.suiteClass()
+        for test_name in test_names:
+            test_case = testCaseClass(test_name)
+            test_method = getattr(test_case, test_name)
+            if hasattr(test_method, '__test_tags__') and self.tag in test_method.__test_tags__:
+                loaded_suite.addTest(test_case)
+        return loaded_suite
+
 def gather_test_cases(test_dir, pattern, list_tests, tag):
     test_to_run = unittest.TestSuite()
-    discover = unittest.defaultTestLoader.discover(test_dir,
-                                                   pattern=pattern,
-                                                   top_level_dir=None)
+    test_loader = TaggedTestLoader(tag)
+    discover = test_loader.discover(test_dir, pattern=pattern, top_level_dir=None)
     print(f'These tests will be skipped due to some reasons: '
           f'{SKIPPED_TESTS.modules}')
     for suite_discovered in discover:
-
         for test_suite in suite_discovered:
-            logger.info(f'Prepare for test suite [{test_suite}]')
             for test_case in test_suite:
                 if type(test_case) in SKIPPED_TESTS.modules.values():
                     continue
-                if tag in getattr(test_case, "tags", ["standalone"]):
-                    test_to_run.addTest(test_case)
-                    if list_tests:
-                        logger.info(f"add test case [{test_case}]")
+                if list_tests:
+                    logger.info(f'Add test case [{str(test_case)}]')
+                test_to_run.addTest(test_case)
     return test_to_run
 
 

From ef4ab1d4433e0c2c5a745fa9ccfca1bf6132051d Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Fri, 26 Apr 2024 04:26:44 +0000
Subject: [PATCH 17/27] support run op with different executro

---
 data_juicer/utils/unittest_utils.py          | 21 ++++++++++----------
 tests/ops/filter/test_alphanumeric_filter.py |  6 +++---
 tests/run.py                                 | 19 +++++++++---------
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
index 5806fa60b..d545dbac9 100644
--- a/data_juicer/utils/unittest_utils.py
+++ b/data_juicer/utils/unittest_utils.py
@@ -51,30 +51,31 @@ def tearDownClass(cls, hf_model_name=None) -> None:
                 shutil.rmtree(transformers.TRANSFORMERS_CACHE)
 
     @classmethod
-    def generate_dataset(cls, data, type='hf'):
+    def generate_dataset(cls, data, type='standalone'):
         """Generate dataset for a specific executor.
 
         Args:
             type (str, optional): `hf` or `ray`. Defaults to "hf".
         """
-        if type == 'hf':
+        if type.startswith('standalone'):
             return Dataset.from_list(data)
-        elif type == 'ray':
+        elif type.startswith('ray'):
             return rd.from_items(data)
+        else:
+            raise ValueError("Unsupported type")
 
     @classmethod
-    def run_single_op(cls, dataset, op, type='hf'):
+    def run_single_op(cls, dataset, op, type='standalone'):
         """Run operator in the specific executor."""
-        if type == 'hf':
+        if type.startswith('standalone'):
             if isinstance(op, Filter) and Fields.stats not in dataset.features:
-                # TODO:
-                # this is a temp solution,
-                # only add stats when calling filter op
                 dataset = dataset.add_column(name=Fields.stats,
                                              column=[{}] * dataset.num_rows)
             dataset = dataset.map(op.compute_stats)
             dataset = dataset.filter(op.process)
             dataset = dataset.select_columns(column_names=['text'])
             return dataset.to_list()
-        elif type == 'ray':
-            pass
+        elif type.startswith('ray'):
+            raise ValueError("Unsupported type")
+        else:
+            raise ValueError("Unsupported type")
\ No newline at end of file
diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py
index 1ef5ebe2a..c0a64ffa7 100644
--- a/tests/ops/filter/test_alphanumeric_filter.py
+++ b/tests/ops/filter/test_alphanumeric_filter.py
@@ -9,7 +9,7 @@
 
 class AlphanumericFilterTest(DataJuicerTestCaseBase):
 
-    @TEST_TAG("single")
+    @TEST_TAG("standalone")
     def test_case(self):
 
         ds_list = [{
@@ -40,10 +40,10 @@ def test_case(self):
         }]
         dataset = DataJuicerTestCaseBase.generate_dataset(ds_list)
         op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9)
-        result = DataJuicerTestCaseBase.run_single_op(dataset, op)
+        result = DataJuicerTestCaseBase.run_single_op(dataset, op, AlphanumericFilterTest.current_tag,)
         self.assertEqual(result, tgt_list)
 
-    @TEST_TAG("single")
+    @TEST_TAG("standalone")
     def test_token_case(self):
 
         ds_list = [{
diff --git a/tests/run.py b/tests/run.py
index 24402fa55..a2fbeeb1c 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -22,7 +22,6 @@
 parser.add_argument('--tag', choices=["standalone", "standalone-gpu", "ray", "ray-gpu"],
                     default="standalone",
                     help="the tag of tests being run")
-parser.add_argument('--list_tests', action='store_true', help='list all tests')
 parser.add_argument('--pattern', default='test_*.py', help='test file pattern')
 parser.add_argument('--test_dir',
                     default='tests',
@@ -36,7 +35,8 @@ def __init__(self, tag=None):
         self.tag = tag
     
     def loadTestsFromTestCase(self, testCaseClass):
-
+        # set tag to testcase class
+        setattr(testCaseClass, 'current_tag', self.tag)
         test_names = self.getTestCaseNames(testCaseClass)
         loaded_suite = self.suiteClass()
         for test_name in test_names:
@@ -46,7 +46,7 @@ def loadTestsFromTestCase(self, testCaseClass):
                 loaded_suite.addTest(test_case)
         return loaded_suite
 
-def gather_test_cases(test_dir, pattern, list_tests, tag):
+def gather_test_cases(test_dir, pattern, tag):
     test_to_run = unittest.TestSuite()
     test_loader = TaggedTestLoader(tag)
     discover = test_loader.discover(test_dir, pattern=pattern, top_level_dir=None)
@@ -57,8 +57,8 @@ def gather_test_cases(test_dir, pattern, list_tests, tag):
             for test_case in test_suite:
                 if type(test_case) in SKIPPED_TESTS.modules.values():
                     continue
-                if list_tests:
-                    logger.info(f'Add test case [{str(test_case)}]')
+                logger.info(f'Add test case [{test_case._testMethodName}]'
+                            f' from {test_case.__class__.__name__}')
                 test_to_run.addTest(test_case)
     return test_to_run
 
@@ -66,11 +66,10 @@ def gather_test_cases(test_dir, pattern, list_tests, tag):
 def main():
     runner = unittest.TextTestRunner()
     test_suite = gather_test_cases(os.path.abspath(args.test_dir),
-                                   args.pattern, args.list_tests, args.tag)
-    if not args.list_tests:
-        res = runner.run(test_suite)
-        if not res.wasSuccessful():
-            exit(1)
+                                   args.pattern, args.tag)
+    res = runner.run(test_suite)
+    if not res.wasSuccessful():
+        exit(1)
 
 
 if __name__ == '__main__':

From 2a698ca57c2c5f9e8f567af02ecde32d2dd8ed93 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Fri, 26 Apr 2024 04:28:31 +0000
Subject: [PATCH 18/27] fix pre-commit

---
 data_juicer/utils/unittest_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
index d545dbac9..1465e9c6c 100644
--- a/data_juicer/utils/unittest_utils.py
+++ b/data_juicer/utils/unittest_utils.py
@@ -62,7 +62,7 @@ def generate_dataset(cls, data, type='standalone'):
         elif type.startswith('ray'):
             return rd.from_items(data)
         else:
-            raise ValueError("Unsupported type")
+            raise ValueError('Unsupported type')
 
     @classmethod
     def run_single_op(cls, dataset, op, type='standalone'):
@@ -76,6 +76,6 @@ def run_single_op(cls, dataset, op, type='standalone'):
             dataset = dataset.select_columns(column_names=['text'])
             return dataset.to_list()
         elif type.startswith('ray'):
-            raise ValueError("Unsupported type")
+            raise ValueError('Unsupported type')
         else:
-            raise ValueError("Unsupported type")
\ No newline at end of file
+            raise ValueError('Unsupported type')

From 319c4504786f29f8cfa9d4b8b2da2d00a9659f34 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Fri, 26 Apr 2024 05:16:19 +0000
Subject: [PATCH 19/27] add hf mirror

---
 .github/workflows/docker/docker-compose.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
index 73b577937..38dd63f1e 100644
--- a/.github/workflows/docker/docker-compose.yml
+++ b/.github/workflows/docker/docker-compose.yml
@@ -6,6 +6,7 @@ services:
     command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block
     environment:
       - HF_HOME=/data/huggingface
+      - HF_ENDPOINT="https://hf-mirror.com"
       - TORCH_HOME=/data/torch
       - NLTK_DATA=/data/nltk
       - DATA_JUICER_CACHE_HOME=/data/dj
@@ -34,6 +35,7 @@ services:
     command: ray start --address=ray-head:6379 --block
     environment:
       - HF_HOME=/data/huggingface
+      - HF_ENDPOINT="https://hf-mirror.com"
       - TORCH_HOME=/data/torch
       - NLTK_DATA=/data/nltk
       - DATA_JUICER_CACHE_HOME=/data/dj

From de318af64d2eb2d0984bcd5c01db906cc6c1fc66 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Fri, 26 Apr 2024 05:18:05 +0000
Subject: [PATCH 20/27] add hf mirror

---
 .github/workflows/docker/docker-compose.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
index 38dd63f1e..fb2aa7a3e 100644
--- a/.github/workflows/docker/docker-compose.yml
+++ b/.github/workflows/docker/docker-compose.yml
@@ -6,7 +6,7 @@ services:
     command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block
     environment:
       - HF_HOME=/data/huggingface
-      - HF_ENDPOINT="https://hf-mirror.com"
+      - HF_ENDPOINT=https://hf-mirror.com
       - TORCH_HOME=/data/torch
       - NLTK_DATA=/data/nltk
       - DATA_JUICER_CACHE_HOME=/data/dj
@@ -35,7 +35,7 @@ services:
     command: ray start --address=ray-head:6379 --block
     environment:
       - HF_HOME=/data/huggingface
-      - HF_ENDPOINT="https://hf-mirror.com"
+      - HF_ENDPOINT=https://hf-mirror.com
       - TORCH_HOME=/data/torch
       - NLTK_DATA=/data/nltk
       - DATA_JUICER_CACHE_HOME=/data/dj

From 6863cd004bf79a8ae0822d4b5f655c3fb4f644ba Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Fri, 26 Apr 2024 05:21:39 +0000
Subject: [PATCH 21/27] run all test in standalone mode by default

---
 tests/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/run.py b/tests/run.py
index a2fbeeb1c..080404b8a 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -42,7 +42,7 @@ def loadTestsFromTestCase(self, testCaseClass):
         for test_name in test_names:
             test_case = testCaseClass(test_name)
             test_method = getattr(test_case, test_name)
-            if hasattr(test_method, '__test_tags__') and self.tag in test_method.__test_tags__:
+            if self.tag in getattr(test_method, '__test_tags__', ["standalone"]):
                 loaded_suite.addTest(test_case)
         return loaded_suite
 

From 1a251c9a58516b1bbb54e69ec8ccfd9577b09224 Mon Sep 17 00:00:00 2001
From: root <panxuchen>
Date: Fri, 26 Apr 2024 06:02:09 +0000
Subject: [PATCH 22/27] ignore image face ratio

---
 tests/ops/filter/test_image_face_ratio_filter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/ops/filter/test_image_face_ratio_filter.py b/tests/ops/filter/test_image_face_ratio_filter.py
index 2a2327b8f..2c6cf5b40 100644
--- a/tests/ops/filter/test_image_face_ratio_filter.py
+++ b/tests/ops/filter/test_image_face_ratio_filter.py
@@ -5,9 +5,10 @@
 
 from data_juicer.ops.filter.image_face_ratio_filter import ImageFaceRatioFilter
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
 
 
+@SKIPPED_TESTS.register_module()
 class ImageFaceRatioFilterTest(DataJuicerTestCaseBase):
 
     maxDiff = None

From aef673966257bb5d95068231e2adbfb7501a3b1d Mon Sep 17 00:00:00 2001
From: panxuchen <panxuchen.pxc@alibaba-inc.com>
Date: Tue, 7 May 2024 15:43:26 +0800
Subject: [PATCH 23/27] update tags

---
 data_juicer/utils/unittest_utils.py          | 6 ++----
 tests/ops/filter/test_alphanumeric_filter.py | 8 ++++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
index 1465e9c6c..0b8acda58 100644
--- a/data_juicer/utils/unittest_utils.py
+++ b/data_juicer/utils/unittest_utils.py
@@ -50,7 +50,6 @@ def tearDownClass(cls, hf_model_name=None) -> None:
                 print('CLEAN all TRANSFORMERS_CACHE')
                 shutil.rmtree(transformers.TRANSFORMERS_CACHE)
 
-    @classmethod
     def generate_dataset(cls, data, type='standalone'):
         """Generate dataset for a specific executor.
 
@@ -64,8 +63,7 @@ def generate_dataset(cls, data, type='standalone'):
         else:
             raise ValueError('Unsupported type')
 
-    @classmethod
-    def run_single_op(cls, dataset, op, type='standalone'):
+    def run_single_op(cls, dataset, op, column_names, type='standalone'):
         """Run operator in the specific executor."""
         if type.startswith('standalone'):
             if isinstance(op, Filter) and Fields.stats not in dataset.features:
@@ -73,7 +71,7 @@ def run_single_op(cls, dataset, op, type='standalone'):
                                              column=[{}] * dataset.num_rows)
             dataset = dataset.map(op.compute_stats)
             dataset = dataset.filter(op.process)
-            dataset = dataset.select_columns(column_names=['text'])
+            dataset = dataset.select_columns(column_names=column_names)
             return dataset.to_list()
         elif type.startswith('ray'):
             raise ValueError('Unsupported type')
diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py
index c0a64ffa7..ec0115942 100644
--- a/tests/ops/filter/test_alphanumeric_filter.py
+++ b/tests/ops/filter/test_alphanumeric_filter.py
@@ -9,7 +9,7 @@
 
 class AlphanumericFilterTest(DataJuicerTestCaseBase):
 
-    @TEST_TAG("standalone")
+    @TEST_TAG("standalone", "ray")
     def test_case(self):
 
         ds_list = [{
@@ -40,10 +40,10 @@ def test_case(self):
         }]
         dataset = DataJuicerTestCaseBase.generate_dataset(ds_list)
         op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9)
-        result = DataJuicerTestCaseBase.run_single_op(dataset, op, AlphanumericFilterTest.current_tag,)
+        result = DataJuicerTestCaseBase.run_single_op(dataset, op, ["text"], self.current_tag)
         self.assertEqual(result, tgt_list)
 
-    @TEST_TAG("standalone")
+    @TEST_TAG("standalone", "ray")
     def test_token_case(self):
 
         ds_list = [{
@@ -68,7 +68,7 @@ def test_token_case(self):
         }]
         dataset = DataJuicerTestCaseBase.generate_dataset(ds_list)
         op = AlphanumericFilter(tokenization=True, min_ratio=1.5)
-        result = DataJuicerTestCaseBase.run_single_op(dataset, op)
+        result = DataJuicerTestCaseBase.run_single_op(dataset, op, ["text"], self.current_tag)
         self.assertEqual(result, tgt_list)
 
 

From 6e6409daa3badf757eb5b3b7a7d468eec8c3d806 Mon Sep 17 00:00:00 2001
From: panxuchen <panxuchen.pxc@alibaba-inc.com>
Date: Tue, 7 May 2024 20:19:38 +0800
Subject: [PATCH 24/27] add ray testcase

---
 data_juicer/core/ray_executor.py              | 127 +++++++++---------
 data_juicer/utils/unittest_utils.py           |  49 +++++--
 tests/ops/filter/test_alphanumeric_filter.py  |  12 +-
 .../ops/filter/test_audio_duration_filter.py  |  47 ++++---
 4 files changed, 142 insertions(+), 93 deletions(-)

diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py
index d42d72f95..3cf7acbb3 100644
--- a/data_juicer/core/ray_executor.py
+++ b/data_juicer/core/ray_executor.py
@@ -102,6 +102,70 @@ def get_num_gpus(self, op, op_proc):
         proc_per_gpu = op_proc / cuda_device_count()
         return 1.0 / proc_per_gpu
 
+    def run_op(self, op, op_cfg, dataset):
+        op_name, op_args = list(op_cfg.items())[0]
+        op_cls = OPERATORS.modules[op_name]
+        op_proc = calculate_np(self.cfg.np, op, op_name)
+        num_gpus = self.get_num_gpus(op, op_proc)
+        use_actor = op.use_actor() or num_gpus
+        try:
+            if isinstance(op, Mapper):
+                if op.is_batched_op():
+                    if use_actor:
+                        dataset = dataset.map_batches(
+                            op_cls,
+                            compute=ActorPoolStrategy(),
+                            concurrency=op_proc,
+                            fn_constructor_kwargs=op_args,
+                            batch_format='pyarrow',
+                            num_gpus=num_gpus,
+                            batch_size=1)
+                        # The batch size here is same as in data.py
+                    else:
+                        dataset = dataset.map_batches(
+                            partial(ray_batch_mapper_wrapper,
+                                    fn=op.process),
+                            batch_format='pyarrow',
+                            num_gpus=num_gpus,
+                            batch_size=1)
+                        # The batch size here is same as in data.py
+                else:
+                    if use_actor:
+                        dataset = dataset.map(
+                            op_cls,
+                            compute=ActorPoolStrategy(),
+                            concurrency=op_proc,
+                            fn_constructor_kwargs=op_args,
+                            num_gpus=num_gpus)
+                    else:
+                        dataset = dataset.map(op.process,
+                                                num_gpus=num_gpus)
+
+            elif isinstance(op, Filter):
+                if use_actor:
+                    dataset = dataset.map(op_cls,
+                                            compute=ActorPoolStrategy(),
+                                            concurrency=op_proc,
+                                            fn_constructor_kwargs=op_args,
+                                            num_gpus=num_gpus)
+                else:
+                    dataset = dataset.map(op.compute_stats,
+                                            num_gpus=num_gpus)
+                if op.stats_export_path is not None:
+                    dataset.write_json(op.stats_export_path,
+                                        force_ascii=False)
+                dataset = dataset.filter(op.process)
+            else:
+                logger.error(
+                    'Ray executor only support Filter and Mapper OPs for '
+                    'now')
+                raise NotImplementedError
+        except:  # noqa: E722
+            logger.error(f'An error occurred during Op [{op_name}].')
+            import traceback
+            traceback.print_exc()
+            exit(1)
+
     def run(self, load_data_np=None):
         """
         Running the dataset process pipeline.
@@ -140,68 +204,7 @@ def process_batch_arrow(table: pa.Table) -> pa.Table:
         logger.info('Processing data...')
         tstart = time.time()
         for op_cfg, op in zip(self.process_list, self.ops):
-            op_name, op_args = list(op_cfg.items())[0]
-            op_cls = OPERATORS.modules[op_name]
-            op_proc = calculate_np(self.cfg.np, op, op_name)
-            num_gpus = self.get_num_gpus(op, op_proc)
-            use_actor = op.use_actor() or num_gpus
-            try:
-                if isinstance(op, Mapper):
-                    if op.is_batched_op():
-                        if use_actor:
-                            dataset = dataset.map_batches(
-                                op_cls,
-                                compute=ActorPoolStrategy(),
-                                concurrency=op_proc,
-                                fn_constructor_kwargs=op_args,
-                                batch_format='pyarrow',
-                                num_gpus=num_gpus,
-                                batch_size=1)
-                            # The batch size here is same as in data.py
-                        else:
-                            dataset = dataset.map_batches(
-                                partial(ray_batch_mapper_wrapper,
-                                        fn=op.process),
-                                batch_format='pyarrow',
-                                num_gpus=num_gpus,
-                                batch_size=1)
-                            # The batch size here is same as in data.py
-                    else:
-                        if use_actor:
-                            dataset = dataset.map(
-                                op_cls,
-                                compute=ActorPoolStrategy(),
-                                concurrency=op_proc,
-                                fn_constructor_kwargs=op_args,
-                                num_gpus=num_gpus)
-                        else:
-                            dataset = dataset.map(op.process,
-                                                  num_gpus=num_gpus)
-
-                elif isinstance(op, Filter):
-                    if use_actor:
-                        dataset = dataset.map(op_cls,
-                                              compute=ActorPoolStrategy(),
-                                              concurrency=op_proc,
-                                              fn_constructor_kwargs=op_args,
-                                              num_gpus=num_gpus)
-                    else:
-                        dataset = dataset.map(op.compute_stats,
-                                              num_gpus=num_gpus)
-                    if op.stats_export_path is not None:
-                        dataset.write_json(op.stats_export_path,
-                                           force_ascii=False)
-                    dataset = dataset.filter(op.process)
-                else:
-                    logger.error(
-                        'Ray executor only support Filter and Mapper OPs for '
-                        'now')
-                    raise NotImplementedError
-            except:  # noqa: E722
-                logger.error(f'An error occurred during Op [{op_name}].')
-                import traceback
-                traceback.print_exc()
-                exit(1)
+            dataset = self.run_op(op, op_cfg, dataset)
 
         # 4. data export
         logger.info('Exporting dataset to disk...')
diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
index 0b8acda58..546a28dc3 100644
--- a/data_juicer/utils/unittest_utils.py
+++ b/data_juicer/utils/unittest_utils.py
@@ -2,7 +2,9 @@
 import shutil
 import unittest
 
+import numpy
 import ray.data as rd
+import pyarrow as pa
 from datasets import Dataset
 
 from data_juicer.ops import Filter
@@ -50,22 +52,32 @@ def tearDownClass(cls, hf_model_name=None) -> None:
                 print('CLEAN all TRANSFORMERS_CACHE')
                 shutil.rmtree(transformers.TRANSFORMERS_CACHE)
 
-    def generate_dataset(cls, data, type='standalone'):
+    def generate_dataset(self, data):
         """Generate dataset for a specific executor.
 
         Args:
-            type (str, optional): `hf` or `ray`. Defaults to "hf".
+            type (str, optional): "standalone" or "ray". Defaults to "standalone".
         """
-        if type.startswith('standalone'):
+        if self.current_tag.startswith('standalone'):
             return Dataset.from_list(data)
-        elif type.startswith('ray'):
-            return rd.from_items(data)
+        elif self.current_tag.startswith('ray'):
+            dataset = rd.from_items(data)
+            if Fields.stats not in dataset.columns(fetch_if_missing=False):
+                def process_batch_arrow(table: pa.Table) -> pa.Table:
+                    new_column_data = [{} for _ in range(len(table))]
+                    new_talbe = table.append_column(Fields.stats,
+                                                    [new_column_data])
+                    return new_talbe
+
+                dataset = dataset.map_batches(process_batch_arrow,
+                                            batch_format='pyarrow')
+            return dataset
         else:
             raise ValueError('Unsupported type')
 
-    def run_single_op(cls, dataset, op, column_names, type='standalone'):
+    def run_single_op(self, dataset, op, column_names):
         """Run operator in the specific executor."""
-        if type.startswith('standalone'):
+        if self.current_tag.startswith('standalone'):
             if isinstance(op, Filter) and Fields.stats not in dataset.features:
                 dataset = dataset.add_column(name=Fields.stats,
                                              column=[{}] * dataset.num_rows)
@@ -73,7 +85,26 @@ def run_single_op(cls, dataset, op, column_names, type='standalone'):
             dataset = dataset.filter(op.process)
             dataset = dataset.select_columns(column_names=column_names)
             return dataset.to_list()
-        elif type.startswith('ray'):
-            raise ValueError('Unsupported type')
+        elif self.current_tag.startswith('ray'):
+            dataset = dataset.map(op.compute_stats)
+            dataset = dataset.filter(op.process)
+            dataset = dataset.to_pandas().get(column_names)
+            if dataset is None:
+                return []
+            return dataset.to_dict(orient="records")
         else:
             raise ValueError('Unsupported type')
+
+    def assertDatasetEqual(self, first, second):
+        def convert_record(rec):
+            for key in rec.keys():
+                # Convert incomparable `list` to comparable `tuple`
+                if isinstance(rec[key], numpy.ndarray) or isinstance(rec[key], list):
+                    rec[key] = tuple(rec[key])
+            return rec
+
+        first = [convert_record(d) for d in first]
+        second = [convert_record(d) for d in second]
+        first = sorted(first, key=lambda x: tuple(sorted(x.items())))
+        second = sorted(second, key=lambda x: tuple(sorted(x.items())))
+        return self.assertEqual(first, second)
\ No newline at end of file
diff --git a/tests/ops/filter/test_alphanumeric_filter.py b/tests/ops/filter/test_alphanumeric_filter.py
index ec0115942..594432207 100644
--- a/tests/ops/filter/test_alphanumeric_filter.py
+++ b/tests/ops/filter/test_alphanumeric_filter.py
@@ -38,10 +38,10 @@ def test_case(self):
         }, {
             'text': 'emoji表情测试下😊，😸31231\n'
         }]
-        dataset = DataJuicerTestCaseBase.generate_dataset(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AlphanumericFilter(min_ratio=0.2, max_ratio=0.9)
-        result = DataJuicerTestCaseBase.run_single_op(dataset, op, ["text"], self.current_tag)
-        self.assertEqual(result, tgt_list)
+        result = self.run_single_op(dataset, op, ["text"])
+        self.assertDatasetEqual(result, tgt_list)
 
     @TEST_TAG("standalone", "ray")
     def test_token_case(self):
@@ -66,10 +66,10 @@ def test_token_case(self):
         }, {
             'text': 'Do you need a cup of coffee?'
         }]
-        dataset = DataJuicerTestCaseBase.generate_dataset(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AlphanumericFilter(tokenization=True, min_ratio=1.5)
-        result = DataJuicerTestCaseBase.run_single_op(dataset, op, ["text"], self.current_tag)
-        self.assertEqual(result, tgt_list)
+        result = self.run_single_op(dataset, op, ["text"])
+        self.assertDatasetEqual(result, tgt_list)
 
 
 if __name__ == '__main__':
diff --git a/tests/ops/filter/test_audio_duration_filter.py b/tests/ops/filter/test_audio_duration_filter.py
index 91a39bfd8..f7363969d 100644
--- a/tests/ops/filter/test_audio_duration_filter.py
+++ b/tests/ops/filter/test_audio_duration_filter.py
@@ -5,7 +5,7 @@
 
 from data_juicer.ops.filter.audio_duration_filter import AudioDurationFilter
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG
 
 
 class AudioDurationFilterTest(DataJuicerTestCaseBase):
@@ -30,6 +30,7 @@ def _run_audio_duration_filter(self,
         res_list = dataset.to_list()
         self.assertEqual(res_list, target_list)
 
+    @TEST_TAG("standalone", "ray")
     def test_default_filter(self):
 
         ds_list = [{
@@ -46,10 +47,13 @@ def test_default_filter(self):
         }, {
             'audios': [self.aud3_path]
         }]
-        dataset = Dataset.from_list(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AudioDurationFilter()
-        self._run_audio_duration_filter(dataset, tgt_list, op)
+        result = self.run_single_op(dataset, op, [op.audio_key])
+        self.assertDatasetEqual(result, tgt_list)
 
+
+    @TEST_TAG("standalone", "ray")
     def test_filter_long_audios(self):
 
         ds_list = [{
@@ -60,10 +64,12 @@ def test_filter_long_audios(self):
             'audios': [self.aud3_path]
         }]
         tgt_list = [{'audios': [self.aud1_path]}]
-        dataset = Dataset.from_list(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AudioDurationFilter(max_duration=10)
-        self._run_audio_duration_filter(dataset, tgt_list, op)
+        result = self.run_single_op(dataset, op, [op.audio_key])
+        self.assertDatasetEqual(result, tgt_list)
 
+    @TEST_TAG("standalone", "ray")
     def test_filter_short_audios(self):
 
         ds_list = [{
@@ -74,10 +80,12 @@ def test_filter_short_audios(self):
             'audios': [self.aud3_path]
         }]
         tgt_list = [{'audios': [self.aud3_path]}]
-        dataset = Dataset.from_list(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AudioDurationFilter(min_duration=60)
-        self._run_audio_duration_filter(dataset, tgt_list, op)
+        result = self.run_single_op(dataset, op, [op.audio_key])
+        self.assertDatasetEqual(result, tgt_list)
 
+    @TEST_TAG("standalone", "ray")
     def test_filter_audios_within_range(self):
 
         ds_list = [{
@@ -88,12 +96,13 @@ def test_filter_audios_within_range(self):
             'audios': [self.aud3_path]
         }]
         tgt_list = [{'audios': [self.aud2_path]}]
-        dataset = Dataset.from_list(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AudioDurationFilter(min_duration=10, max_duration=20)
-        self._run_audio_duration_filter(dataset, tgt_list, op)
+        result = self.run_single_op(dataset, op, [op.audio_key])
+        self.assertDatasetEqual(result, tgt_list)
 
+    @TEST_TAG("standalone", "ray")
     def test_any(self):
-
         ds_list = [{
             'audios': [self.aud1_path, self.aud2_path]
         }, {
@@ -106,12 +115,15 @@ def test_any(self):
         }, {
             'audios': [self.aud2_path, self.aud3_path]
         }]
-        dataset = Dataset.from_list(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AudioDurationFilter(min_duration=10,
                                  max_duration=20,
                                  any_or_all='any')
-        self._run_audio_duration_filter(dataset, tgt_list, op)
+        result = self.run_single_op(dataset, op, [op.audio_key])
+        print(result)
+        self.assertDatasetEqual(result, tgt_list)
 
+    @TEST_TAG("standalone", "ray")
     def test_all(self):
 
         ds_list = [{
@@ -122,12 +134,14 @@ def test_all(self):
             'audios': [self.aud1_path, self.aud3_path]
         }]
         tgt_list = []
-        dataset = Dataset.from_list(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AudioDurationFilter(min_duration=10,
                                  max_duration=20,
                                  any_or_all='all')
-        self._run_audio_duration_filter(dataset, tgt_list, op)
+        result = self.run_single_op(dataset, op, [op.audio_key])
+        self.assertDatasetEqual(result, tgt_list)
 
+    @TEST_TAG("standalone", "ray")
     def test_filter_in_parallel(self):
 
         ds_list = [{
@@ -138,9 +152,10 @@ def test_filter_in_parallel(self):
             'audios': [self.aud3_path]
         }]
         tgt_list = [{'audios': [self.aud2_path]}]
-        dataset = Dataset.from_list(ds_list)
+        dataset = self.generate_dataset(ds_list)
         op = AudioDurationFilter(min_duration=10, max_duration=20)
-        self._run_audio_duration_filter(dataset, tgt_list, op, np=2)
+        result = self.run_single_op(dataset, op, [op.audio_key])
+        self.assertDatasetEqual(result, tgt_list)
 
 
 if __name__ == '__main__':

From a2091d89c06f24027c68683ebc27bbf7fd7ccef7 Mon Sep 17 00:00:00 2001
From: panxuchen <panxuchen.pxc@alibaba-inc.com>
Date: Tue, 7 May 2024 20:23:58 +0800
Subject: [PATCH 25/27] add ray test in workflow

---
 .github/workflows/unittest.yml      |  5 ++++
 data_juicer/core/ray_executor.py    | 39 +++++++++++++----------------
 data_juicer/utils/unittest_utils.py | 18 +++++++------
 tests/run.py                        |  4 +--
 4 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index ed1a10718..3ea19b3bb 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -42,6 +42,11 @@ jobs:
       run: |
         docker compose exec ray-head python tests/run.py --tag standalone
 
+    - name: Run unittest ray
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head python tests/run.py --tag ray
+
     - name: Remove docker compose
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       if: always()
diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py
index 3cf7acbb3..1c066ea02 100644
--- a/data_juicer/core/ray_executor.py
+++ b/data_juicer/core/ray_executor.py
@@ -122,38 +122,33 @@ def run_op(self, op, op_cfg, dataset):
                             batch_size=1)
                         # The batch size here is same as in data.py
                     else:
-                        dataset = dataset.map_batches(
-                            partial(ray_batch_mapper_wrapper,
-                                    fn=op.process),
-                            batch_format='pyarrow',
-                            num_gpus=num_gpus,
-                            batch_size=1)
+                        dataset = dataset.map_batches(partial(
+                            ray_batch_mapper_wrapper, fn=op.process),
+                                                      batch_format='pyarrow',
+                                                      num_gpus=num_gpus,
+                                                      batch_size=1)
                         # The batch size here is same as in data.py
                 else:
                     if use_actor:
-                        dataset = dataset.map(
-                            op_cls,
-                            compute=ActorPoolStrategy(),
-                            concurrency=op_proc,
-                            fn_constructor_kwargs=op_args,
-                            num_gpus=num_gpus)
+                        dataset = dataset.map(op_cls,
+                                              compute=ActorPoolStrategy(),
+                                              concurrency=op_proc,
+                                              fn_constructor_kwargs=op_args,
+                                              num_gpus=num_gpus)
                     else:
-                        dataset = dataset.map(op.process,
-                                                num_gpus=num_gpus)
+                        dataset = dataset.map(op.process, num_gpus=num_gpus)
 
             elif isinstance(op, Filter):
                 if use_actor:
                     dataset = dataset.map(op_cls,
-                                            compute=ActorPoolStrategy(),
-                                            concurrency=op_proc,
-                                            fn_constructor_kwargs=op_args,
-                                            num_gpus=num_gpus)
+                                          compute=ActorPoolStrategy(),
+                                          concurrency=op_proc,
+                                          fn_constructor_kwargs=op_args,
+                                          num_gpus=num_gpus)
                 else:
-                    dataset = dataset.map(op.compute_stats,
-                                            num_gpus=num_gpus)
+                    dataset = dataset.map(op.compute_stats, num_gpus=num_gpus)
                 if op.stats_export_path is not None:
-                    dataset.write_json(op.stats_export_path,
-                                        force_ascii=False)
+                    dataset.write_json(op.stats_export_path, force_ascii=False)
                 dataset = dataset.filter(op.process)
             else:
                 logger.error(
diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py
index 546a28dc3..d7ee7d87d 100644
--- a/data_juicer/utils/unittest_utils.py
+++ b/data_juicer/utils/unittest_utils.py
@@ -3,8 +3,8 @@
 import unittest
 
 import numpy
-import ray.data as rd
 import pyarrow as pa
+import ray.data as rd
 from datasets import Dataset
 
 from data_juicer.ops import Filter
@@ -16,7 +16,7 @@
 
 def TEST_TAG(*tags):
     """Tags for test case.
-    Currently, `standalone`, `ray`, `standalone-gpu`, `ray-gpu` are supported.
+    Currently, `standalone`, `ray` are supported.
     """
 
     def decorator(func):
@@ -56,13 +56,15 @@ def generate_dataset(self, data):
         """Generate dataset for a specific executor.
 
         Args:
-            type (str, optional): "standalone" or "ray". Defaults to "standalone".
+            type (str, optional): "standalone" or "ray".
+            Defaults to "standalone".
         """
         if self.current_tag.startswith('standalone'):
             return Dataset.from_list(data)
         elif self.current_tag.startswith('ray'):
             dataset = rd.from_items(data)
             if Fields.stats not in dataset.columns(fetch_if_missing=False):
+
                 def process_batch_arrow(table: pa.Table) -> pa.Table:
                     new_column_data = [{} for _ in range(len(table))]
                     new_talbe = table.append_column(Fields.stats,
@@ -70,7 +72,7 @@ def process_batch_arrow(table: pa.Table) -> pa.Table:
                     return new_talbe
 
                 dataset = dataset.map_batches(process_batch_arrow,
-                                            batch_format='pyarrow')
+                                              batch_format='pyarrow')
             return dataset
         else:
             raise ValueError('Unsupported type')
@@ -91,15 +93,17 @@ def run_single_op(self, dataset, op, column_names):
             dataset = dataset.to_pandas().get(column_names)
             if dataset is None:
                 return []
-            return dataset.to_dict(orient="records")
+            return dataset.to_dict(orient='records')
         else:
             raise ValueError('Unsupported type')
 
     def assertDatasetEqual(self, first, second):
+
         def convert_record(rec):
             for key in rec.keys():
                 # Convert incomparable `list` to comparable `tuple`
-                if isinstance(rec[key], numpy.ndarray) or isinstance(rec[key], list):
+                if isinstance(rec[key], numpy.ndarray) or isinstance(
+                        rec[key], list):
                     rec[key] = tuple(rec[key])
             return rec
 
@@ -107,4 +111,4 @@ def convert_record(rec):
         second = [convert_record(d) for d in second]
         first = sorted(first, key=lambda x: tuple(sorted(x.items())))
         second = sorted(second, key=lambda x: tuple(sorted(x.items())))
-        return self.assertEqual(first, second)
\ No newline at end of file
+        return self.assertEqual(first, second)
diff --git a/tests/run.py b/tests/run.py
index 080404b8a..81028ee01 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -19,7 +19,7 @@
 sys.path.append(file_dir)
 
 parser = argparse.ArgumentParser('test runner')
-parser.add_argument('--tag', choices=["standalone", "standalone-gpu", "ray", "ray-gpu"],
+parser.add_argument('--tag', choices=["standalone", "ray"],
                     default="standalone",
                     help="the tag of tests being run")
 parser.add_argument('--pattern', default='test_*.py', help='test file pattern')
@@ -30,7 +30,7 @@
 
 
 class TaggedTestLoader(unittest.TestLoader):
-    def __init__(self, tag=None):
+    def __init__(self, tag="standalone"):
         super().__init__()
         self.tag = tag
     

From be814892f07ec935a3b9044ffae377a6fe2f5800 Mon Sep 17 00:00:00 2001
From: panxuchen <panxuchen.pxc@alibaba-inc.com>
Date: Wed, 8 May 2024 09:49:00 +0800
Subject: [PATCH 26/27] update ray unittest workflow

---
 .github/workflows/unittest.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 3ea19b3bb..28b2e6f3f 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -31,6 +31,7 @@ jobs:
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
         docker compose exec ray-head pip install -e .\[all\]
+        docker compose exec ray-worker pip install -e .\[all\]
 
     - name: Clean dataset cache
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker

From aa68b49f9bfa9cd083d0df48a8a13405fb8b214b Mon Sep 17 00:00:00 2001
From: panxuchen <panxuchen.pxc@alibaba-inc.com>
Date: Mon, 17 Jun 2024 14:07:13 +0800
Subject: [PATCH 27/27] delete old unittest

---
 .github/workflows/unittest.yml | 60 ----------------------------------
 1 file changed, 60 deletions(-)
 delete mode 100644 .github/workflows/unittest.yml

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
deleted file mode 100644
index 28b2e6f3f..000000000
--- a/.github/workflows/unittest.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-
-name: unittest
-
-on:
-  workflow_dispatch:
-  pull_request:
-  push:
-    branches:
-      - main
-
-permissions:
-  contents: read
-
-jobs:
-  unittest-single:
-    runs-on: [self-hosted, linux]
-    environment: Testing
-    steps:
-    - uses: actions/checkout@v3
-      with:
-        path: dj-${{ github.run_id }}
-
-    - name: Setup docker compose
-      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
-      run: |
-        docker compose up -d
-
-    - name: Install data-juicer
-      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
-      run: |
-        docker compose exec ray-head pip install -e .\[all\]
-        docker compose exec ray-worker pip install -e .\[all\]
-
-    - name: Clean dataset cache
-      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
-      run: |
-        docker compose exec ray-head rm -rf /data/huggingface/dataset
-
-    - name: Run unittest standalone
-      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
-      run: |
-        docker compose exec ray-head python tests/run.py --tag standalone
-
-    - name: Run unittest ray
-      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
-      run: |
-        docker compose exec ray-head python tests/run.py --tag ray
-
-    - name: Remove docker compose
-      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
-      if: always()
-      run: |
-        docker compose down --remove-orphans
-
-    - name: Cleanup workspace
-      if: always()
-      run: |
-        rm -rf  dj-${{ github.run_id }}