Add format check #142

Merged 12 commits on Jun 12, 2024
44 changes: 44 additions & 0 deletions .github/workflows/format.yml
@@ -0,0 +1,44 @@
name: format

on:
  pull_request:
    branches: [ "main" ]
    types: [opened, synchronize, reopened]

env:
  CHECK_FILES: >-
    flagscale/auto_tuner/*.py
    flagscale/auto_tuner/prune/*.py
    flagscale/auto_tuner/record/*.py
    flagscale/auto_tuner/search/*.py
    flagscale/launcher/*.py
    flagscale/logger.py
    flagscale/patches_utils.py
    flagscale/datasets/sft_dataset.py

jobs:
  format:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Code
        uses: actions/checkout@v2

      - name: Set up Python 3.10
        uses: actions/setup-python@v2
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          pip install black isort

      - name: Run Black
        run: >-
          black --check --diff --include $CHECK_FILES ./

      - name: Run Isort
        run: |
          isort --profile black --check --diff $CHECK_FILES


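The workflow above only checks formatting; it does not rewrite files. To reproduce the check locally before pushing, the same two tools can be invoked directly. A minimal sketch, assuming Black and isort are installed and reusing the file list from CHECK_FILES (the shell variable below is illustrative, not part of the PR); passing the paths straight to black is a slight simplification, since black's --include flag is interpreted as a regular expression rather than a list of globs:

CHECK_FILES="flagscale/auto_tuner/*.py flagscale/auto_tuner/prune/*.py flagscale/auto_tuner/record/*.py flagscale/auto_tuner/search/*.py flagscale/launcher/*.py flagscale/logger.py flagscale/patches_utils.py flagscale/datasets/sft_dataset.py"

# Install the same formatters the workflow uses
pip install black isort

# Black in check-only mode: show the diff of files that would be reformatted, change nothing
black --check --diff $CHECK_FILES

# isort with the Black-compatible profile, also check-only
isort --profile black --check --diff $CHECK_FILES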
102 changes: 60 additions & 42 deletions .github/workflows/test.yml
@@ -9,46 +9,64 @@ on:
   pull_request:
     branches: [ "main" ]
 
+env:
+  REPORT_ADDR: http://120.92.110.224:8081
+
 jobs:
-  test:
-    runs-on: self-hosted
-    container:
-      image: localhost:5000/flagscale_cicd:v1.3
-      ports:
-        - 80
-      volumes:
-        - /home/flagscale_cicd/flask/static:/workspace/report
-      options: --gpus all --hostname flagscale_cicd
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v2
-
-      - name: Megatron Unit Test
-        run: |
-          cd megatron
-          export PYTHONPATH=..:$PYTHONPATH
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/data
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/dist_checkpointing
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/fusions
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/models
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/pipeline_parallel
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/tensor_parallel
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/transformer
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/*.py
-
-      - name: Megatron Unit Test Coverage Online Report
-        run: |
-          echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-megatron/index.html"
-
-      - name: Flagscale Unit Test
-        run: |
-          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-flagscale --cov=flagscale -q -x tests/unit_tests/launcher
-
-      - name: Flagscale Unit Test Coverage Online Report
-        run: |
-          echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-flagscale/index.html"
-
-      - name: Flagscale Functional Test
-        run: |
-          python run.py --config-path tests/functional_tests/aquila/conf --config-name config action=test
-          pytest -s tests/functional_tests/test_result.py --test_reaults_path=./tests/functional_tests/aquila/test_result
+  megatron-unit-test:
+    runs-on: self-hosted
+    container:
+      image: localhost:5000/flagscale_cicd:v1.3
+      ports:
+        - 80
+      volumes:
+        - /home/flagscale_cicd/flask/static:/workspace/report
+      options: --gpus all --hostname flagscale_cicd
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v2
+
+      - name: Megatron Unit Test
+        run: tests/scripts/unit_test_megatron.sh ${{github.sha}}
+
+      - name: Megatron Unit Test Coverage Online Report
+        run: echo "You can access the test coverage report at the $REPORT_ADDR/${{github.sha}}/cov-report-megatron/index.html"
+
+
+  flagscale-unit-test:
+    runs-on: self-hosted
+    needs: megatron-unit-test
+    container:
+      image: localhost:5000/flagscale_cicd:v1.3
+      ports:
+        - 80
+      volumes:
+        - /home/flagscale_cicd/flask/static:/workspace/report
+      options: --gpus all --hostname flagscale_cicd
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v2
+
+      - name: Flagscale Unit Test
+        run: tests/scripts/unit_test_flagscale.sh ${{github.sha}}
+
+      - name: Flagscale Unit Test Coverage Online Report
+        run: echo "You can access the test coverage report at the $REPORT_ADDR/${{github.sha}}/cov-report-flagscale/index.html"
+
+
+  flagscale-functional-test:
+    runs-on: self-hosted
+    needs: flagscale-unit-test
+    container:
+      image: localhost:5000/flagscale_cicd:v1.3
+      ports:
+        - 80
+      volumes:
+        - /home/flagscale_cicd/flask/static:/workspace/report
+      options: --gpus all --hostname flagscale_cicd
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v2
+
+      - name: Flagscale Functional Test
+        run: tests/scripts/functional_test_flagscale.sh
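The three jobs above delegate the inline torchrun/pytest commands that were removed to scripts under tests/scripts/, which are not shown in this section of the diff. A hypothetical reconstruction of tests/scripts/unit_test_megatron.sh, pieced together from the commands it replaces (the real script may be organized differently):

#!/bin/bash
# Hypothetical sketch of tests/scripts/unit_test_megatron.sh -- the actual file is not shown in this diff.
set -e
SHA=$1  # commit SHA passed in by the workflow as ${{github.sha}}

cd megatron
export PYTHONPATH=..:$PYTHONPATH

# Run each Megatron unit-test suite on 8 GPUs, accumulating coverage into one HTML report
for suite in data dist_checkpointing fusions models pipeline_parallel tensor_parallel transformer; do
  torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append \
    --cov-report=html:/workspace/report/$SHA/cov-report-megatron --cov=megatron/core \
    -q -x tests/unit_tests/$suite
done

# Top-level unit-test files
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append \
  --cov-report=html:/workspace/report/$SHA/cov-report-megatron --cov=megatron/core \
  -q -x tests/unit_tests/*.py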
28 changes: 28 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,28 @@
check_files: &check_files |
  (?x)^(
    flagscale/auto_tuner/.*\.py|
    flagscale/auto_tuner/prune/\..*\.py|
    flagscale/auto_tuner/record/\..*\.py|
    flagscale/auto_tuner/search/\..*\.py|
    flagscale/launcher/\..*\.py|
    flagscale/logger\.py|
    flagscale/patches_utils\.py|
    flagscale/datasets/sft_dataset\.py
  )$

repos:
  - repo: local
    hooks:
      - id: black
        name: black
        entry: black
        language: system
        types: [python]
        files: *check_files
      - id: isort
        name: isort
        entry: isort
        language: system
        types: [python]
        files: *check_files
        args: ["--profile", "black"]
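Because both hooks use language: system with entry: black and entry: isort, pre-commit does not manage the tools itself; Black and isort must already be installed in the developer's environment. A minimal local setup might look like this (commands assumed from standard pre-commit usage, not part of the diff):

# Install the hook runner plus the two formatters the hooks shell out to
pip install pre-commit black isort

# Register the git pre-commit hook for this clone
pre-commit install

# Run the black and isort checks once over all tracked files matching the check_files pattern
pre-commit run --all-files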
14 changes: 7 additions & 7 deletions flagscale/auto_tuner/generate.py
@@ -1,5 +1,5 @@
-import os
 import copy
+import os
 
 
 class Generator:
@@ -16,8 +16,7 @@ def __init__(self, config):
             "tensor_model_parallel_size": "tensor_model_parallel_size",
             "sequence_parallel": "sequence_parallel",
             "pipeline_model_parallel_size": "pipeline_model_parallel_size",
-            "num_layers_per_virtual_pipeline_stage":
-            "num_layers_per_virtual_pipeline_stage",
+            "num_layers_per_virtual_pipeline_stage": "num_layers_per_virtual_pipeline_stage",
             "recompute_method": "recompute_method",
             "recompute_granularity": "recompute_granularity",
             "recompute_num_layers": "recompute_num_layers",
@@ -81,14 +80,15 @@ def gen(self, strategy):
         # Set train_iters of each task
         if "control" in config.experiment.auto_tuner:
             config.train.model.train_iters = config.experiment.auto_tuner.control.get(
-                "train_iters", 5)
+                "train_iters", 5
+            )
         else:
             config.train.model.train_iters = 5
 
         # log dir
-        config.experiment.exp_dir = os.path.join(config.experiment.exp_dir,
-                                                 "auto_tuner",
-                                                 f"task_{strategy['idx']}")
+        config.experiment.exp_dir = os.path.join(
+            config.experiment.exp_dir, "auto_tuner", f"task_{strategy['idx']}"
+        )
 
         return config
 
4 changes: 2 additions & 2 deletions flagscale/auto_tuner/prune/history.py
@@ -1,6 +1,6 @@
 import logging
-from ..utils import beside
-from ..utils import compare_by_recompute
+
+from ..utils import beside, compare_by_recompute
 
 _HISTORY_BASED_PRUNE_FUNC = []
 logger = logging.getLogger("FlagScale-AutoTuner")
1 change: 1 addition & 0 deletions flagscale/auto_tuner/prune/pruner.py
@@ -14,6 +14,7 @@ def prune(self, strategy, history=[]):
             if func(self.config, strategy, history):
                 not_run = True
                 break
+
         history.append(strategy)
         if not_run:
             self.pruned_count += 1
3 changes: 2 additions & 1 deletion flagscale/auto_tuner/record/recorder.py
@@ -1,7 +1,8 @@
+import logging
 import os
 import re
-import logging
 import subprocess
 
 import pandas as pd
 
+
12 changes: 8 additions & 4 deletions flagscale/auto_tuner/search/algorithm.py
@@ -24,15 +24,19 @@ def __init__(self, strategies, config):
     def checkout(self, mode):
         if mode == "memory":
             from ..utils import sort_by_memory
+
             if self.idx > 0 and self.idx < len(self.strategies):
-                self.strategies = self.strategies[:self.idx] + sorted(
-                    self.strategies[self.idx:], key=sort_by_memory)
+                self.strategies = self.strategies[: self.idx] + sorted(
+                    self.strategies[self.idx :], key=sort_by_memory
+                )
 
         elif mode == "performance":
             from ..utils import sort_by_performance
+
             if self.idx > 0 and self.idx < len(self.strategies):
-                self.strategies = self.strategies[:self.idx] + sorted(
-                    self.strategies[self.idx:], key=sort_by_performance)
+                self.strategies = self.strategies[: self.idx] + sorted(
+                    self.strategies[self.idx :], key=sort_by_performance
+                )
 
     def search(self):
         """Return a task iteratively."""
4 changes: 2 additions & 2 deletions flagscale/auto_tuner/search/searcher.py
@@ -1,8 +1,8 @@
-import time
 import copy
 import logging
-from ..utils import divisible
+import time
 
+from ..utils import divisible
 
 __BUILT_IN_STRATEGY_DIMS__ = [
     "data_parallel_size",