Add cicd #122

Closed · wants to merge 7 commits
104 changes: 104 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,104 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: flagscale-test

on:
  push:
    branches: [ "main", "add_CICD" ]
  pull_request:
    branches: [ "main" ]

jobs:
  container-test-job:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v1.1
      env:
        NODE_ENV: development
      ports:
        - 80
      options: --gpus all --hostname flagscale_cicd
    steps:
      - name: checkout-code
        uses: actions/checkout@v2

      - name: unit_test_megatron
        run: |
          export PYTHONPATH=./megatron:$PYTHONPATH
          export PYTHONPATH=./../../FlagScale/:$PYTHONPATH

          cd megatron

          # passed
          # slow
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_preprocess_data.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_preprocess_mmdata.py

          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_t5_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/test_async_save.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/test_mapping.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_gpt_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_retro_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_bert_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/test_fully_parallel.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/test_optimizer.py

          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/fusions

          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_bert_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_gpt_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_multimodal_projector.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_clip_vit_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_llava_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_t5_model.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_base_embedding.py

          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/pipeline_parallel

          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_cross_entropy.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_initialization.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_mappings.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_data.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_layers.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_random.py

          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_grouped_mlp.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_routers.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_sequential_mlp.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_token_dispatcher.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py

          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_transformer_block.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_attention.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_module.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_spec_customization.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_transformer_layer.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_mlp.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_retro_attention.py

          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_basic.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_imports.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_optimizer.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_training.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_utils.py
          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_parallel_state.py

          # unpassed
          # AssertionError: args is not initialized.
          # torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_builder.py
          # torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_gpt_dataset.py
          # torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_multimodal_dataset.py

      - name: unit_test_flagscale
        run: |
          export PYTHONPATH=./flagscale:$PYTHONPATH
          pytest -x tests/unit_tests/launcher/test_parse_hostfile.py

      - name: functional_test_flagscale
        run: |
          python run.py --config-path tests/functional_tests/aquila/conf --config-name config action=test
          pytest -s tests/functional_tests/test_result.py --test_reaults_path=./tests/functional_tests/aquila/test_result
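
A note on the torchrun --nproc_per_node=8 -m pytest pattern used throughout the unit-test step: torchrun launches eight identical pytest processes, one per GPU, and exports RANK, LOCAL_RANK, WORLD_SIZE, and MASTER_ADDR/MASTER_PORT so the distributed tests can form a process group. A minimal sketch of the per-rank setup such tests rely on (a hypothetical helper, not code from this PR):

import os

import torch
import torch.distributed as dist

def init_from_torchrun_env() -> int:
    """Join the process group using the environment torchrun exports."""
    if not dist.is_initialized():
        # "env://" reads RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT
        # from the environment variables set by torchrun.
        dist.init_process_group(backend="nccl", init_method="env://")
    # Bind each process to its own GPU.
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    return dist.get_rank()
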
30 changes: 16 additions & 14 deletions flagscale/launcher/runner.py
@@ -83,18 +83,20 @@ def get_host_name_or_ip():
     return IP


-def run_local_command(cmd, dryrun=False):
+def run_local_command(cmd, dryrun=False, with_test=False):
     logger.info(f"Run the local command: {cmd}")
     if dryrun:
         return
-    result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
+    if with_test:
+        result = subprocess.run(cmd, shell=True, check=True)
+    else:
+        result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
     if result.returncode != 0:
         print(f"Command {cmd} failed with return code {result.returncode}.")
         print(f"Output: {result.stdout}")
         print(f"Error: {result.stderr}")
         sys.exit(result.returncode)


 def run_ssh_command(host, cmd, port=None, dryrun=False):
     if port:
         ssh_cmd = f"ssh -f -n -p {port} {host} '{cmd}'"
@@ -483,10 +485,10 @@ def _run_each(

         cmd = shlex.join(export_cmd + runner_cmd + [self.user_script] + self.user_args)

-        if with_test:
-            exp_dir = self.config.experiment.exp_dir
-            test_cmd = f";python tests/functional_tests/check_result.py {exp_dir};rm -r {exp_dir}"
-            cmd = cmd + test_cmd
+        # if with_test:
+        #     exp_dir = self.config.experiment.exp_dir
+        #     test_cmd = f";python tests/functional_tests/check_result.py {exp_dir};rm -r {exp_dir}"
+        #     cmd = cmd + test_cmd

         host_run_script_file = _generate_run_script(
             self.config, host, node_rank, cmd, background=True, with_test=with_test
@@ -508,7 +510,7 @@ def _run_each(
             # Step 3: run the host_run_script_file on the remote host
             run_ssh_command(host, f"bash {host_run_script_file}", ssh_port, dryrun)
         else:
-            run_local_command(f"bash {host_run_script_file}", dryrun)
+            run_local_command(f"bash {host_run_script_file}", dryrun, with_test=with_test)

     def run(self, with_test=False, dryrun=False):
         self._prepare()
@@ -584,7 +586,7 @@ def _stop_each(self, host, node_rank):
             # Step 3: run the host_run_script_file on the remote host
             run_ssh_command(host, f"bash {host_stop_script_file}", ssh_port)
         else:
-            run_local_command(f"bash {host_stop_script_file}")
+            run_local_command(f"bash {host_stop_script_file}", with_test=with_test)

     def stop(self):
         if self.resources is None:
@@ -642,16 +644,16 @@ def _run_each(

         cmd = shlex.join(export_cmd + runner_cmd + [self.user_script] + self.user_args)

-        if with_test:
-            exp_dir = self.config.experiment.exp_dir
-            test_cmd = f";python tests/functional_tests/check_result.py {exp_dir};rm -r {exp_dir}"
-            cmd = cmd + test_cmd
+        # if with_test:
+        #     exp_dir = self.config.experiment.exp_dir
+        #     test_cmd = f";python tests/functional_tests/check_result.py {exp_dir};rm -r {exp_dir}"
+        #     cmd = cmd + test_cmd

         host_run_script_file = _generate_run_script(
             self.config, host, node_rank, cmd, background=False, with_test=with_test
         )

-        run_local_command(f"bash {host_run_script_file}", dryrun)
+        run_local_command(f"bash {host_run_script_file}", dryrun, with_test=with_test)

     def run(self, with_test=False, dryrun=False):
         self._prepare()
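
The run_local_command change above drops capture_output when with_test is set, so pytest output streams straight into the CI log instead of being buffered. Note that with check=True, subprocess.run raises CalledProcessError on a non-zero exit before the returncode check is ever reached. A sketch of an equivalent that streams under test and still reports failures itself (an illustration, not the PR's code):

import subprocess
import sys

def run_local_command_sketch(cmd, dryrun=False, with_test=False):
    if dryrun:
        return
    # Capture output for normal runs; stream it live for CI test runs.
    capture = not with_test
    result = subprocess.run(cmd, shell=True, capture_output=capture, text=True)
    if result.returncode != 0:
        print(f"Command {cmd} failed with return code {result.returncode}.")
        if capture:
            print(f"Output: {result.stdout}")
            print(f"Error: {result.stderr}")
        sys.exit(result.returncode)
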
6 changes: 3 additions & 3 deletions tests/conf/config.yaml
@@ -1,7 +1,7 @@
 unit_tests:
-  unit_test_megatron: true
-  unit_test_megatron_dist: true
-  unit_test_flagscale: true
+  unit_test_megatron: false
+  unit_test_megatron_dist: false
+  unit_test_flagscale: false

 functional_tests:
   functional_test_flagscale: ["aquila"]
18 changes: 10 additions & 8 deletions tests/functional_tests/aquila/conf/config.yaml
@@ -1,14 +1,16 @@
 defaults:
-  - train: test_train_aquila
   - _self_
+  - train: test_train_aquila

 experiment:
-  exp_name: unified-runner
+  exp_name: test_train_aquila
   exp_dir: ./tests/functional_tests/aquila/test_result
-  entrypoint: ./flagscale/train/train_aquila.py
-  task: train
-  hostfile:
-  backend: megatron
+  task:
+    type: train
+    backend: megatron
+    entrypoint: flagscale/train/train_aquila.py
   runner:
     backend: torchrun
     shell_cmds: null
     ssh_port: null
   envs:
@@ -19,4 +21,4 @@ action: run

 hydra:
   run:
-    dir: ${experiment.exp_dir}/hydra
\ No newline at end of file
+    dir: ${experiment.exp_dir}/hydra
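
The restructure groups everything the runner needs under experiment.task. A quick way to sanity-check the new layout (a sketch assuming OmegaConf, the config library underlying Hydra, which run.py's --config-path/--config-name interface uses; not part of the PR):

from omegaconf import OmegaConf

# Load the raw YAML (defaults composition aside) and inspect the task block.
cfg = OmegaConf.load("tests/functional_tests/aquila/conf/config.yaml")
assert cfg.experiment.task.type == "train"
assert cfg.experiment.task.backend == "megatron"
print(cfg.experiment.task.entrypoint)  # flagscale/train/train_aquila.py
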
tests/functional_tests/aquila/conf/train/test_train_aquila.yaml (file header lost in capture; path inferred from the train: test_train_aquila default above)
@@ -54,7 +54,7 @@ model:
   lr_decay_style: cosine

 data:
-  data_path: ./pile_wikipedia_demo/pile_wikipedia_demo
+  data_path: /home/gitlab-runner/data/pile_wikipedia_demo/pile_wikipedia_demo
   split: 1
   tokenizer:
     tokenizer_type: AquilaTokenizer
11 changes: 11 additions & 0 deletions tests/functional_tests/conftest.py
@@ -0,0 +1,11 @@
import pytest
from configparser import ConfigParser

def pytest_addoption(parser):
    parser.addoption(
        "--test_reaults_path", action="store", default="none", help="test result path"
    )

@pytest.fixture
def test_reaults_path(request):
    return request.config.getoption("--test_reaults_path")
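
pytest_addoption registers --test_reaults_path with pytest's command-line parser, and the fixture hands the parsed value to any test that names it as a parameter. A minimal consumer, for illustration only (hypothetical test, not part of the PR):

def test_path_was_provided(test_reaults_path):
    # The fixture returns the CLI value, or the default "none" if unset.
    assert test_reaults_path != "none", "pass --test_reaults_path=<dir>"
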
@@ -1,14 +1,15 @@
 import os, json, sys
 import numpy as np

-def compare_result_log(test_reaults_path:str):
+def test_result(test_reaults_path:str):

     host_path = test_reaults_path + "/logs/details/host_0_localhost"
     id_name = os.listdir(host_path)[0]
-    attempt_name = os.listdir(host_path + "/" + id_name)[0]
-    results_path = (os.listdir(host_path + "/" + id_name + "/" + attempt_name))
+    default_name = os.listdir(host_path + "/" + id_name)[0]
+    attempt_name = os.listdir(host_path + "/" + id_name + "/" + default_name)[0]
+    results_path = (os.listdir(host_path + "/" + id_name + "/" + default_name + "/" + attempt_name))
     results_path.sort()
-    result_path = host_path + "/" + id_name + "/" + attempt_name + "/" + results_path[-1] + "/stdout.log"
+    result_path = host_path + "/" + id_name + "/" + default_name + "/" + attempt_name + "/" + results_path[-1] + "/stdout.log"

     with open(result_path, 'r') as file:
         lines = file.readlines()
@@ -33,6 +34,9 @@ def compare_result_log(test_reaults_path:str):
     print("gold_result: ", gold_result_json)
     print("The results are basically equal: ", np.allclose(gold_result_json["lm loss:"]["values"], result_json["lm loss:"]["values"]))

+    assert np.allclose(gold_result_json["lm loss:"]["values"], result_json["lm loss:"]["values"]), "result not close to gold result"
+
+
 if __name__ == '__main__':
     test_reaults_path = sys.argv[1]
     compare_result_log(test_reaults_path)
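
The directory walk above descends four levels of torchrun log directories with repeated os.listdir calls. A more compact equivalent, sketched under the assumption that the layout is .../host_0_localhost/<id>/<default>/<attempt>/<run>/stdout.log (illustration only, not the PR's code):

import glob
import os

def latest_stdout_log(test_reaults_path: str) -> str:
    # One wildcard per directory level between the host dir and the log file.
    pattern = os.path.join(
        test_reaults_path,
        "logs/details/host_0_localhost", "*", "*", "*", "*", "stdout.log",
    )
    logs = glob.glob(pattern)
    if not logs:
        raise FileNotFoundError(f"no stdout.log found under {pattern}")
    return max(logs, key=os.path.getmtime)  # newest run wins
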
2 changes: 1 addition & 1 deletion tests/scripts/functional_test_flagscale.sh
@@ -10,4 +10,4 @@ if [ -d "$OUT_DIR" ]; then
   sleep 3s
 fi

-python run.py --config-path tests/functional_tests/$1/conf --config-name config action=test
\ No newline at end of file
+python run.py --config-path tests/functional_tests/$1/conf --config-name config action=test
76 changes: 67 additions & 9 deletions tests/scripts/unit_test_megatron.sh
@@ -1,11 +1,69 @@
 export PYTHONPATH=./megatron:$PYTHONPATH
 export PYTHONPATH=./../../FlagScale/:$PYTHONPATH

-pytest -x megatron/tests/unit_tests/test_basic.py \
-          megatron/tests/unit_tests/test_imports.py \
-          megatron/tests/unit_tests/test_utils.py \
-          megatron/tests/unit_tests/data/test_mock_gpt_dataset.py \
-          megatron/tests/unit_tests/data/test_multimodal_dataset.py \
-          megatron/tests/unit_tests/data/test_preprocess_mmdata.py \
-          megatron/tests/unit_tests/data/test_preprocess_data.py \
-          megatron/tests/unit_tests/fusions
+cd megatron
+
+# passed
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_preprocess_data.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_preprocess_mmdata.py
+
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_t5_model.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/test_async_save.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/test_mapping.py
+
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_bert_model.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_gpt_model.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_multimodal_projector.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_clip_vit_model.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_llava_model.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_t5_model.py
+
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/pipeline_parallel
+
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_cross_entropy.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_initialization.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_mappings.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_data.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_layers.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel/test_random.py
+
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_grouped_mlp.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_routers.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_sequential_mlp.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_token_dispatcher.py
+
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_transformer_block.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_attention.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_module.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_spec_customization.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_transformer_layer.py
+
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_basic.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_imports.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_optimizer.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_training.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_utils.py
+torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/test_parallel_state.py
+
+
+# unpassed
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_builder.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_gpt_dataset.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data/test_multimodal_dataset.py
+
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_gpt_model.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_retro_model.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_bert_model.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/test_fully_parallel.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing/test_optimizer.py
+
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/fusions
+
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models/test_base_embedding.py
+
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_mlp.py
+# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer/test_retro_attention.py
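
The script runs each suite as a separate torchrun invocation so that one crashed or hung suite cannot take the rest down with it. If the repetition becomes a maintenance burden, the same list could be driven from a loop; a sketch (not part of the PR, paths abridged from the list above):

import subprocess

PASSED_SUITES = [
    "tests/unit_tests/data/test_preprocess_data.py",
    "tests/unit_tests/data/test_preprocess_mmdata.py",
    # ... the remaining entries from the "passed" section above ...
    "tests/unit_tests/test_parallel_state.py",
]

for suite in PASSED_SUITES:
    # check=True stops at the first failing suite, mirroring pytest -x.
    subprocess.run(
        ["torchrun", "--nproc_per_node=8", "-m", "pytest", "-q", "-x", suite],
        check=True,
    )
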
15 changes: 0 additions & 15 deletions tests/scripts/unit_test_megatron_dist.sh

This file was deleted.

2 changes: 1 addition & 1 deletion tests/unit_tests/launcher/test_parse_hostfile.py
@@ -48,4 +48,4 @@ def test_parse_hostfile_empty(mock_os_path_isfile, mock_open):

     mock_open.return_value.readlines.return_value = hostfile_content
     with pytest.raises(ValueError):
-        parse_hostfile("/path/to/hostfile.txt")
\ No newline at end of file
+        parse_hostfile("/path/to/hostfile.txt")