From 4a51534e63ea6cd22d0db2ecae156ae406ab86e4 Mon Sep 17 00:00:00 2001
From: JingyaHuang
Date: Wed, 18 Oct 2023 14:57:40 +0000
Subject: [PATCH] update tests

---
 .../docker/Dockerfile-ort-nightly-cu118       |   3 +-
 .../docker/Dockerfile-ort1.14.1-cu116         |   2 +-
 .../docker/Dockerfile-ort1.15.1-cu118         |   2 +-
 .../docker/Dockerfile-ort1.16.1-cu118         |   2 +-
 .../onnxruntime/training/test_examples.py     | 174 -----
 .../training/text-classification/run_glue.py  |  97 +--
 .../docker/Dockerfile_onnxruntime_trainer     |   7 +-
 .../training/nightly_test_examples.py         |  70 ++
 .../{ => training}/nightly_test_trainer.py    | 693 +++++++-----------
 9 files changed, 388 insertions(+), 662 deletions(-)
 delete mode 100644 examples/onnxruntime/training/test_examples.py
 create mode 100644 tests/onnxruntime/training/nightly_test_examples.py
 rename tests/onnxruntime/{ => training}/nightly_test_trainer.py (54%)

diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
index 668e5e56695..3e6841453b5 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
@@ -22,6 +22,7 @@ CMD nvidia-smi
 ENV DEBIAN_FRONTEND noninteractive

 # Versions
+# available options 3.8, 3.9, 3.10, 3.11
 ARG PYTHON_VERSION=3.9
 ARG TORCH_CUDA_VERSION=cu118
 ARG TORCH_VERSION=2.0.0
@@ -34,7 +35,7 @@ SHELL ["/bin/bash", "-c"]
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
index db2219b5c62..15df7c352fe 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
@@ -33,7 +33,7 @@ ARG TORCHVISION_VERSION=0.14.1
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
index 51c9ec514c4..2d1306e1a35 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
@@ -34,7 +34,7 @@ ARG TORCHVISION_VERSION=0.15.1
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
index 3f6b8335923..482d495fcb4 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
@@ -34,7 +34,7 @@ SHELL ["/bin/bash", "-c"]
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/test_examples.py b/examples/onnxruntime/training/test_examples.py
deleted file mode 100644
index 8fe1de53d56..00000000000
--- a/examples/onnxruntime/training/test_examples.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import logging
-import os
-import sys
-import unittest
-from unittest.mock import patch
-
-import torch
-from transformers.file_utils import is_apex_available
-from transformers.testing_utils import TestCasePlus, get_gpu_count, slow, torch_device
-
-
-SRC_DIRS = [
-    os.path.join(os.path.dirname(__file__), dirname)
-    for dirname in [
-        "text-classification",
-        "token-classification",
-        "question-answering",
-        "translation",
-    ]
-]
-sys.path.extend(SRC_DIRS)
-if SRC_DIRS is not None:
-    import run_glue
-    import run_ner
-    import run_qa
-    import run_translation
-
-
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger()
-
-
-def get_results(output_dir):
-    results = {}
-    path = os.path.join(output_dir, "all_results.json")
-    if os.path.exists(path):
-        with open(path, "r") as f:
-            results = json.load(f)
-    else:
-        raise ValueError(f"can't find {path}")
-    return results
-
-
-def is_cuda_and_apex_available():
-    is_using_cuda = torch.cuda.is_available() and torch_device == "cuda"
-    return is_using_cuda and is_apex_available()
-
-
-class ExamplesTests(TestCasePlus):
-    # Text Classification Tests
-    def test_run_glue(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        tmp_dir = self.get_auto_remove_tmp_dir()
-        testargs = f"""
-            run_glue.py
-            --model_name_or_path bert-base-uncased
-            --task_name sst2
-            --do_train
-            --do_eval
-            --output_dir {tmp_dir}
-            --overwrite_output_dir
-            --learning_rate=1e-5
-            --per_device_train_batch_size=16
-            --per_device_eval_batch_size=16
-            """.split()
-
-        with patch.object(sys, "argv", testargs):
-            run_glue.main()
-            result = get_results(tmp_dir)
-            self.assertGreaterEqual(result["eval_accuracy"], 0.75)
-
-    # Token Classification Tests
-    def test_run_ner(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
-        epochs = 7 if get_gpu_count() > 1 else 2
-
-        tmp_dir = self.get_auto_remove_tmp_dir()
-        testargs = f"""
-            run_ner.py
-            --model_name_or_path bert-base-uncased
-            --dataset_name conll2003
-            --do_train
-            --do_eval
-            --output_dir {tmp_dir}
-            --overwrite_output_dir
-            --learning_rate=1e-5
-            --per_device_train_batch_size=16
-            --per_device_eval_batch_size=16
-            --num_train_epochs={epochs}
-            """.split()
-
-        with patch.object(sys, "argv", testargs):
-            run_ner.main()
-            result = get_results(tmp_dir)
-            self.assertGreaterEqual(result["eval_accuracy"], 0.75)
-            self.assertLess(result["eval_loss"], 0.5)
-
-    # Question Answering Tests
-    def test_run_qa(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        tmp_dir = self.get_auto_remove_tmp_dir()
-        testargs = f"""
-            run_qa.py
-            --model_name_or_path bert-base-uncased
-            --dataset_name squad
-            --do_train
-            --do_eval
-            --output_dir {tmp_dir}
-            --overwrite_output_dir
-            --learning_rate=1e-5
-            --per_device_train_batch_size=16
-            --per_device_eval_batch_size=16
-            """.split()
-
-        with patch.object(sys, "argv", testargs):
-            run_qa.main()
-            result = get_results(tmp_dir)
-            self.assertGreaterEqual(result["eval_f1"], 30)
-            self.assertGreaterEqual(result["eval_exact"], 30)
-
-    @slow
-    def test_run_translation(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        tmp_dir = self.get_auto_remove_tmp_dir()
-        testargs = f"""
-            run_translation.py
-            --model_name_or_path t5-large
-            --source_lang en
-            --target_lang ro
-            --dataset_name wmt16
-            --output_dir {tmp_dir}
-            --overwrite_output_dir
-            --max_steps=50
-            --warmup_steps=8
-            --do_train
-            --learning_rate=3e-3
-            --per_device_train_batch_size=2
-            --per_device_eval_batch_size=1
-            --predict_with_generate
-            """.split()
-
-        with patch.object(sys, "argv", testargs):
-            run_translation.main()
-            result = get_results(tmp_dir)
-            self.assertGreaterEqual(result["eval_bleu"], 30)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/examples/onnxruntime/training/text-classification/run_glue.py b/examples/onnxruntime/training/text-classification/run_glue.py
index 7a81a2ff156..f3f04657afb 100644
--- a/examples/onnxruntime/training/text-classification/run_glue.py
+++ b/examples/onnxruntime/training/text-classification/run_glue.py
@@ -21,6 +21,7 @@
 import os
 import random
 import sys
+import warnings
 from dataclasses import dataclass, field
 from typing import Optional

@@ -48,7 +49,7 @@


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.23.0")
+check_min_version("4.34.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -188,12 +189,28 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
+    token: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
     use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+        },
+    )
+    trust_remote_code: bool = field(
         default=False,
         metadata={
             "help": (
-                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-                "with private models)."
+                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                "execute code present on the Hub on your local machine."
             )
         },
     )
@@ -203,32 +220,24 @@ class ModelArguments:
     )


-@dataclass
-class InferenceArguments:
-    """
-    Arguments for inference(evaluate, predict).
-    """
-
-    inference_with_ort: bool = field(
-        default=False,
-        metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."},
-    )
-
-
 def main():
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.

-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments))
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
         # If we pass only one argument to the script and it's the path to a json file,
         # let's parse it to get our arguments.
-        model_args, data_args, training_args, inference_args = parser.parse_json_file(
-            json_file=os.path.abspath(sys.argv[1])
-        )
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
     else:
-        model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses()
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if model_args.use_auth_token is not None:
+        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        if model_args.token is not None:
+            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+        model_args.token = model_args.use_auth_token

     # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
     # information sent is the one passed as arguments along with your Python/PyTorch versions.
@@ -241,6 +250,10 @@ def main():
         handlers=[logging.StreamHandler(sys.stdout)],
     )

+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+
     log_level = training_args.get_process_log_level()
     logger.setLevel(log_level)
     datasets.utils.logging.set_verbosity(log_level)
@@ -291,7 +304,7 @@ def main():
             "glue",
             data_args.task_name,
             cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
         )
     elif data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
@@ -299,7 +312,7 @@ def main():
             data_args.dataset_name,
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
         )
     else:
         # Loading a dataset from your local files.
@@ -328,7 +341,7 @@ def main():
                 "csv",
                 data_files=data_files,
                 cache_dir=model_args.cache_dir,
-                use_auth_token=True if model_args.use_auth_token else None,
+                token=model_args.token,
             )
         else:
             # Loading a dataset from local json files
@@ -336,7 +349,7 @@ def main():
                 "json",
                 data_files=data_files,
                 cache_dir=model_args.cache_dir,
-                use_auth_token=True if model_args.use_auth_token else None,
+                token=model_args.token,
             )
     # See more about loading any type of standard or custom dataset at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
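Note: the hunks above and below swap the deprecated `use_auth_token` argument for `token` throughout run_glue.py. For readers migrating their own scripts, here is a minimal standalone sketch of the same back-compat pattern (the helper name `resolve_token` is illustrative and not part of this patch):

import warnings

def resolve_token(token=None, use_auth_token=None):
    # Prefer the new `token` argument; fall back to the deprecated
    # `use_auth_token`, mirroring the handling added to main() above.
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v4.34.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        token = use_auth_token
    return token

print(resolve_token(use_auth_token="hf_xxx"))  # warns, then returns "hf_xxx"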
@@ -371,14 +384,16 @@ def main():
         finetuning_task=data_args.task_name,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     model = AutoModelForSequenceClassification.from_pretrained(
         model_args.model_name_or_path,
@@ -386,7 +401,8 @@ def main():
         config=config,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )

@@ -440,7 +456,7 @@ def main():

     if data_args.max_seq_length > tokenizer.model_max_length:
         logger.warning(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
             f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
         )
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
@@ -496,6 +512,8 @@ def preprocess_function(examples):
     # Get the metric function
     if data_args.task_name is not None:
         metric = evaluate.load("glue", data_args.task_name)
+    elif is_regression:
+        metric = evaluate.load("mse")
     else:
         metric = evaluate.load("accuracy")

@@ -504,17 +522,12 @@
     def compute_metrics(p: EvalPrediction):
         preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
         preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
-        if data_args.task_name is not None:
-            result = metric.compute(predictions=preds, references=p.label_ids)
-            if len(result) > 1:
-                result["combined_score"] = np.mean(list(result.values())).item()
-            return result
-        elif is_regression:
-            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
-        else:
-            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
+        result = metric.compute(predictions=preds, references=p.label_ids)
+        if len(result) > 1:
+            result["combined_score"] = np.mean(list(result.values())).item()
+        return result

-    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to ORTTrainer, so we change it if
+    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
     # we already did the padding.
     if data_args.pad_to_max_length:
         data_collator = default_data_collator
@@ -532,7 +545,6 @@ def compute_metrics(p: EvalPrediction):
         compute_metrics=compute_metrics,
         tokenizer=tokenizer,
         data_collator=data_collator,
-        feature="text-classification",
     )

     # Training
@@ -550,6 +562,7 @@ def compute_metrics(p: EvalPrediction):
             metrics["train_samples"] = min(max_train_samples, len(train_dataset))

         trainer.save_model()  # Saves the tokenizer too for easy upload
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()

@@ -571,7 +584,7 @@ def compute_metrics(p: EvalPrediction):
         combined = {}

         for eval_dataset, task in zip(eval_datasets, tasks):
-            metrics = trainer.evaluate(eval_dataset=eval_dataset, inference_with_ort=inference_args.inference_with_ort)
+            metrics = trainer.evaluate(eval_dataset=eval_dataset)

             max_eval_samples = (
                 data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
@@ -599,9 +612,7 @@ def compute_metrics(p: EvalPrediction):
         for predict_dataset, task in zip(predict_datasets, tasks):
             # Removing the `label` columns because it contains -1 and Trainer won't like that.
             predict_dataset = predict_dataset.remove_columns("label")
-            predictions = trainer.predict(
-                predict_dataset, metric_key_prefix="predict", inference_with_ort=inference_args.inference_with_ort
-            ).predictions
+            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)

             output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer
index 62f7efc8178..7266ba224a8 100644
--- a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer
+++ b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer
@@ -34,7 +34,7 @@ ARG TORCHVISION_VERSION=0.15.1
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
@@ -65,7 +65,7 @@
 RUN $PYTHON_EXE -m pip install onnx ninja
 RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION}

 # ORT Module
-RUN $PYTHON_EXE -m pip install onnxruntime-training==1.15.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
+RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
 RUN $PYTHON_EXE -m pip install torch-ort
 ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX"
 RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2
@@ -76,4 +76,5 @@
 COPY . /workspace/optimum
 RUN pip install /workspace/optimum[tests]
 ENV TEST_LEVEL=1
-CMD RUN_SLOW=1 pytest -v -rs onnxruntime/nightly_test_trainer.py --durations=0
\ No newline at end of file
+CMD RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_trainer.py --durations=0 && \
+    RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_examples.py --durations=0
\ No newline at end of file
diff --git a/tests/onnxruntime/training/nightly_test_examples.py b/tests/onnxruntime/training/nightly_test_examples.py
new file mode 100644
index 00000000000..2318c9b47b2
--- /dev/null
+++ b/tests/onnxruntime/training/nightly_test_examples.py
@@ -0,0 +1,70 @@
+# coding=utf-8
+# Copyright 2023 the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test ONNX Runtime Training Examples in Optimum."""
+
+import subprocess
+import unittest
+
+from transformers.testing_utils import slow
+
+
+@slow
+class ORTTrainerExampleTest(unittest.TestCase):
+    def test_text_classification(self):
+        subprocess.run(
+            "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./",
+            shell=True,
+        )
+
+        subprocess.run(
+            "torchrun"
+            " --nproc_per_node=1"
+            " run_glue.py"
+            " --model_name_or_path distilbert-base-uncased"
+            " --task_name mnli"
+            " --max_seq_length 64"
+            " --learning_rate 3e-6"
+            " --do_train"
+            " --output_dir /tmp/distilbert"
+            " --overwrite_output_dir"
+            " --max_steps 50"
+            " --logging_steps 50"
+            " --per_device_train_batch_size 8"
+            " --fp16 --optim adamw_ort_fused"
+            " --max_train_samples 20",
+            shell=True,
+            check=True,
+        )
+
+    # TODO: Test all ORT training examples
+    def test_token_classification(self):
+        pass
+
+    def test_translation(self):
+        pass
+
+    def test_summarization(self):
+        pass
+
+    def test_stable_diffusion_txt2img(self):
+        pass
+
+    def test_question_answering(self):
+        pass
+
+    def test_language_modeling(self):
+        pass
+
+    def test_image_classification(self):
+        pass
diff --git a/tests/onnxruntime/nightly_test_trainer.py b/tests/onnxruntime/training/nightly_test_trainer.py
similarity index 54%
rename from tests/onnxruntime/nightly_test_trainer.py
rename to tests/onnxruntime/training/nightly_test_trainer.py
index 2eb3ca433f7..e24ee306178 100644
--- a/tests/onnxruntime/nightly_test_trainer.py
+++ b/tests/onnxruntime/training/nightly_test_trainer.py
@@ -12,11 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Test ONNX Runtime Training ORTTrainer in Optimum.""" import gc +import os import random -import subprocess -import sys import tempfile import unittest from itertools import chain @@ -25,7 +25,6 @@ import nltk import numpy as np -import pytest from datasets import load_dataset from evaluate import load from transformers import ( @@ -35,12 +34,16 @@ AutoModelForTokenClassification, AutoTokenizer, DataCollatorForSeq2Seq, - DataCollatorForTokenClassification, DataCollatorWithPadding, default_data_collator, is_torch_available, ) -from transformers.testing_utils import require_deepspeed, require_torch, slow +from transformers.testing_utils import ( + mockenv_context, + require_deepspeed, + require_torch, + slow, +) from transformers.training_args import OptimizerNames @@ -75,11 +78,11 @@ "data_collator": default_data_collator, "data_collator_class": DataCollatorWithPadding, }, - "token-classification": { - "dataset": ["conll2003"], - "metric": ["seqeval"], - "data_collator_class": DataCollatorForTokenClassification, - }, + # "token-classification": { + # "dataset": ["conll2003"], + # "metric": ["seqeval"], + # "data_collator_class": DataCollatorForTokenClassification, + # }, } _DECODER_TASKS_DATASETS_CONFIGS = { @@ -88,11 +91,6 @@ "metric": ["accuracy"], "data_collator": default_data_collator, }, - "text-generation-with-past": { - "dataset": ["wikitext", "wikitext-2-raw-v1"], - "metric": ["accuracy"], - "data_collator": default_data_collator, - }, } _SEQ2SEQ_TASKS_DATASETS_CONFIGS = { @@ -101,30 +99,37 @@ "metric": ["rouge"], "data_collator_class": DataCollatorForSeq2Seq, }, - "text2text-generation-with-past": { - "dataset": ["xsum"], - "metric": ["rouge"], - "data_collator_class": DataCollatorForSeq2Seq, - }, } +# List supported ORT optimizers to test +optim_test_params = [] +if is_torch_available(): + default_adam_kwargs = { + "betas": (ORTTrainingArguments.adam_beta1, ORTTrainingArguments.adam_beta2), + "eps": ORTTrainingArguments.adam_epsilon, + "lr": ORTTrainingArguments.learning_rate, + } -def _get_models_to_test(model_list, task_list, both_inf_backend=False, excluded: Optional[List[str]] = None): + optim_test_params = [ + ( + ORTOptimizerNames.ADAMW_ORT_FUSED, + onnxruntime.training.optim.FusedAdam, + default_adam_kwargs, + ), + ] + +# default torch.distributed port +DEFAULT_MASTER_PORT = "10999" + + +def _get_models_to_test(model_list, task_list, excluded: Optional[List[str]] = None): models_to_test = [] for name, model_name in model_list: - for feature, data_metric_config in task_list.items(): - if excluded and (name in excluded or feature in excluded): + for task, data_metric_config in task_list.items(): + if excluded and (name in excluded or task in excluded): continue - if both_inf_backend: - models_to_test.append( - (f"{name}_{feature}", model_name, feature, data_metric_config, True) - ) # inference_with_ort=True - models_to_test.append( - (f"{name}_{feature}", model_name, feature, data_metric_config, False) - ) # inference_with_ort=False - else: - models_to_test.append((f"{name}_{feature}", model_name, feature, data_metric_config)) + models_to_test.append((f"{name}_{task}", model_name, task, data_metric_config)) return sorted(models_to_test) @@ -151,17 +156,39 @@ def _get_data_collator(data_metric_config, tokenizer=None, model=None, training_ return data_collator -def get_ort_training_args(feature, **kwargs): - if feature in _ENCODER_TASKS_DATASETS_CONFIGS or feature in _DECODER_TASKS_DATASETS_CONFIGS: +def get_ort_training_args(task, **kwargs): + if task in 
_ENCODER_TASKS_DATASETS_CONFIGS or task in _DECODER_TASKS_DATASETS_CONFIGS: training_args = ORTTrainingArguments(**kwargs) - elif feature in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: + elif task in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: training_args = ORTSeq2SeqTrainingArguments(**kwargs) return training_args +def get_master_port(real_launcher=False): + """ + When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed) + the issue is that once the port is tied it can't be used anywhere else outside of this process, + since torch.dist doesn't free the port until the process exits. Therefore for the sake of being + able to run both emulated launcher and normal launcher tests we need 2 distinct ports. + + This function will give the right port in the right context. For real launcher it'll give the + base port, for emulated launcher it'll give the base port + 1. In both cases a string is + returned. + + Args: + `real_launcher`: whether a real launcher is going to be used, or the emulated one + + """ + + master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + if not real_launcher: + master_port_base = str(int(master_port_base) + 1) + return master_port_base + + def get_ort_trainer( model_name, - feature, + task, data_metric_config, training_args, max_seq_length=None, @@ -170,7 +197,7 @@ def get_ort_trainer( max_test_samples=None, **kwargs, ): - training_kwargs = load_and_prepare(feature)( + training_kwargs = load_and_prepare(task)( model_name, data_metric_config, max_seq_length, @@ -185,26 +212,25 @@ def get_ort_trainer( if getattr(training_args, "predict_with_generate", False) is not True: training_kwargs.pop("compute_metrics", None) - if feature in _ENCODER_TASKS_DATASETS_CONFIGS or feature in _DECODER_TASKS_DATASETS_CONFIGS: - trainer = ORTTrainer(feature=feature, args=training_args, **training_kwargs) - elif feature in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: - trainer = ORTSeq2SeqTrainer(feature=feature, args=training_args, **training_kwargs) + if task in _ENCODER_TASKS_DATASETS_CONFIGS or task in _DECODER_TASKS_DATASETS_CONFIGS: + trainer = ORTTrainer(args=training_args, **training_kwargs) + elif task in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: + trainer = ORTSeq2SeqTrainer(args=training_args, **training_kwargs) else: raise return trainer, test_dataset -def load_and_prepare(feature): +def load_and_prepare(task): preprocess_mapping = { "text-classification": load_and_prepare_glue, "token-classification": load_and_prepare_ner, "text-generation": load_and_prepare_clm, "text-generation-with-past": load_and_prepare_clm, "text2text-generation": load_and_prepare_xsum, - "text2text-generation-with-past": load_and_prepare_xsum, } - return preprocess_mapping[feature] + return preprocess_mapping[task] def load_and_prepare_glue(model_name, data_metric_config, max_seq_length, padding="max_length", **kwargs): @@ -520,212 +546,140 @@ class ORTTrainerIntegrationTest(unittest.TestCase): def setUp(self): super().setUp() args = ORTTrainingArguments("..") + master_port = get_master_port(real_launcher=False) + self.dist_env_1_gpu = { + "MASTER_ADDR": "localhost", + "MASTER_PORT": master_port, + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + } self.n_epochs = min(args.num_train_epochs, 1) - self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size + self.per_device_train_batch_size = min(args.per_device_train_batch_size, 2) + self.per_device_eval_batch_size = min(args.per_device_eval_batch_size, 2) 

         self.max_seq_length = 64
-        self.max_train_samples = 50
-        self.max_valid_samples = 20
-        self.max_test_samples = 10
+        self.max_train_samples = 10
+        self.max_valid_samples = 5
+        self.max_test_samples = 5

         self.warmup_steps = 10
         self.weight_decay = 0.01

     @parameterized.expand(
-        _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True)
-        # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True)  # Skip test for OOM bug
-        + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, both_inf_backend=True),
-        skip_on_empty=True,
-    )
-    def test_trainer_fp32(self, test_name, model_name, feature, data_metric_config, inference_with_ort):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.save_model()
-            trainer.evaluate(inference_with_ort=inference_with_ort)
-            trainer.predict(test_dataset, inference_with_ort=inference_with_ort)
-            gc.collect()
-
-    @parameterized.expand(
-        _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True)
-        # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True)  # Skip test for OOM bug
-        + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, both_inf_backend=True),
+        _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
+        + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
+        + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
         skip_on_empty=True,
     )
-    def test_trainer_fp32_with_label_smoothing(
-        self, test_name, model_name, feature, data_metric_config, inference_with_ort
-    ):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                label_smoothing_factor=0.1,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.save_model()
-            trainer.evaluate(inference_with_ort=inference_with_ort)
-            trainer.predict(test_dataset, inference_with_ort=inference_with_ort)
-            gc.collect()
+    def test_trainer_fp32(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                )
+
+                trainer, test_dataset = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                trainer.save_model()
+                trainer.evaluate()
+                trainer.predict(test_dataset)
+                gc.collect()

     @slow
     @parameterized.expand(
         _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-        # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)  # Skip test for OOM bug
+        + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
         + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
         skip_on_empty=True,
     )
-    def test_trainer_fp16_pt_inference(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                fp16=True,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.save_model()
-            trainer.evaluate()
-            trainer.predict(test_dataset)
-            gc.collect()
+    def test_trainer_fp32_with_label_smoothing(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    label_smoothing_factor=0.1,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                )
+
+                trainer, test_dataset = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                trainer.save_model()
+                trainer.evaluate()
+                trainer.predict(test_dataset)
+                gc.collect()

     @slow
     @parameterized.expand(
         _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-        # Exclude "with-past" tests as they fail for ORT inference after the mixed-precision training
-        # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, excluded=["text-generation-with-past"])  # Skip test for OOM bug
-        + _get_models_to_test(
-            _SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, excluded=["text2text-generation-with-past"]
-        ),
+        + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
+        + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
         skip_on_empty=True,
     )
-    def test_trainer_fp16_ort_inference(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                fp16=True,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.save_model()
-            trainer.evaluate(inference_with_ort=True)
-            trainer.predict(test_dataset, inference_with_ort=True)
-            gc.collect()
-
-    # Skip this test as a large amount of ops don't support bf16 yet.
-    # @unittest.skip("Skip BF16 test.")
-    # @slow
-    # @require_torch_bf16_gpu
-    # @parameterized.expand(
-    #     _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-    #     + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
-    #     + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
-    #     skip_on_empty=True,
-    # )
-    # def test_trainer_bf16(self, test_name, model_name, feature, data_metric_config):
-    #     with tempfile.TemporaryDirectory() as tmp_dir:
-    #         training_args = get_ort_training_args(
-    #             feature=feature,
-    #             output_dir=tmp_dir,
-    #             num_train_epochs=self.n_epochs,
-    #             per_device_train_batch_size=self.per_device_train_batch_size,
-    #             per_device_eval_batch_size=self.per_device_eval_batch_size,
-    #             warmup_steps=self.warmup_steps,
-    #             weight_decay=self.weight_decay,
-    #             logging_dir=tmp_dir,
-    #             bf16=True,
-    #         )
-
-    #         trainer, test_dataset = get_ort_trainer(
-    #             model_name,
-    #             feature,
-    #             data_metric_config,
-    #             training_args,
-    #             max_seq_length=self.max_seq_length,
-    #             max_train_samples=self.max_train_samples,
-    #             max_valid_samples=self.max_valid_samples,
-    #             max_test_samples=self.max_test_samples,
-    #         )
-
-    #         trainer.train()
-    #         trainer.save_model()
-    #         trainer.evaluate()
-    #         trainer.predict(test_dataset)
-    #         gc.collect()
+    def test_trainer_fp16(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                    fp16=True,
+                )
+
+                trainer, test_dataset = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                trainer.save_model()
+                trainer.evaluate()
+                trainer.predict(test_dataset)
+                gc.collect()


 @slow
@@ -734,14 +688,22 @@ class ORTTrainerIntegrationDeepSpeedTest(unittest.TestCase):
     def setUp(self):
         super().setUp()
         args = ORTTrainingArguments("..")
+        master_port = get_master_port(real_launcher=False)
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }
         self.n_epochs = min(args.num_train_epochs, 1)
-        self.per_device_train_batch_size = args.per_device_train_batch_size
-        self.per_device_eval_batch_size = args.per_device_eval_batch_size
+        self.per_device_train_batch_size = min(args.per_device_train_batch_size, 2)
+        self.per_device_eval_batch_size = min(args.per_device_eval_batch_size, 2)

         self.max_seq_length = 64
-        self.max_train_samples = 30
-        self.max_valid_samples = 10
-        self.max_test_samples = 10
+        self.max_train_samples = 10
+        self.max_valid_samples = 5
+        self.max_test_samples = 5

         self.warmup_steps = 10
         self.weight_decay = 0.01
@@ -749,126 +711,80 @@ def setUp(self):
     @parameterized.expand(
         random.sample(
             _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-            # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
+            + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
             + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
-            1,
+            1,  # only test one
         ),
         skip_on_empty=True,
     )
-    def test_trainer_fp16_ds_stage1(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                fp16=True,
-                deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json",
-            )
-
-            trainer, _ = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            gc.collect()
+    def test_trainer_fp16_ds_stage1(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                    fp16=True,
+                    deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json",
+                )
+
+                trainer, _ = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                gc.collect()

     @parameterized.expand(
         random.sample(
             _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-            # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
+            + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
             + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
             1,
         ),
         skip_on_empty=True,
     )
-    def test_trainer_fp16_ds_stage2(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                fp16=True,
-                deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json",
-            )
-
-            trainer, _ = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            gc.collect()
-
-
-@slow
-@pytest.mark.skip(reason="skip for now, server socket error")
-class ORTTrainerIntegrationDDPTest(unittest.TestCase):
-    def test_trainer_ddp_glue(self):
-        subprocess.run(
-            "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./",
-            shell=True,
-        )
-
-        subprocess.run(
-            f"{sys.executable} -m torch.distributed.launch"
-            " --nproc_per_node=1"
-            " run_glue.py"
-            " --model_name_or_path distilbert-base-uncased"
-            " --task_name mnli"
-            " --max_seq_length 128"
-            " --learning_rate 3e-6"
-            " --do_train"
-            " --output_dir /tmp/distilbert"
-            " --overwrite_output_dir"
-            " --max_steps 200"
-            " --logging_steps 20"
-            " --per_device_train_batch_size 32"
-            " --fp16 --optim adamw_ort_fused"
-            " --max_train_samples 500",
-            shell=True,
-            check=True,
-        )
-
-
-# List supported ORT optimizers to test
-optim_test_params = []
-if is_torch_available():
-    default_adam_kwargs = {
-        "betas": (ORTTrainingArguments.adam_beta1, ORTTrainingArguments.adam_beta2),
-        "eps": ORTTrainingArguments.adam_epsilon,
-        "lr": ORTTrainingArguments.learning_rate,
-    }
-
-    optim_test_params = [
-        (
-            ORTOptimizerNames.ADAMW_ORT_FUSED,
-            onnxruntime.training.optim.FusedAdam,
-            default_adam_kwargs,
-        ),
-    ]
+    def test_trainer_fp16_ds_stage2(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                    fp16=True,
+                    deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json",
+                )
+
+                trainer, _ = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                gc.collect()


 @slow
@@ -876,21 +792,6 @@ def test_trainer_ddp_glue(self):
 class ORTTrainerOptimizerChoiceTest(unittest.TestCase):
     def setUp(self):
         super().setUp()
-        args = ORTTrainingArguments("..")
-        self.n_epochs = min(args.num_train_epochs, 1)
-        self.per_device_train_batch_size = args.per_device_train_batch_size
-        self.per_device_eval_batch_size = args.per_device_eval_batch_size
-
-        self.max_seq_length = 64
-        self.max_train_samples = 50
-        self.max_valid_samples = 20
-        self.max_test_samples = 10
-
-        self.warmup_steps = 10
-        self.weight_decay = 0.01
-
-        self.model_name = "bert-base-cased"
-        self.feature = "text-classification"

     def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expected_cls):
         args = ORTTrainingArguments(optim=optim, output_dir="None")
@@ -903,37 +804,6 @@ def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expect
             actual_v = optim_kwargs[p]
             self.assertTrue(actual_v == v, f"Failed check for {p}. Expected {v}, but got {actual_v}.")

-    @parameterized.expand(optim_test_params, skip_on_empty=True)
-    def test_optim_supported(self, name: str, expected_cls, mandatory_kwargs):
-        # exercises all the valid --optim options
-        self.check_optim_and_kwargs(name, mandatory_kwargs, expected_cls)
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = ORTTrainingArguments(
-                optim=name,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-            )
-
-            trainer, _ = get_ort_trainer(
-                self.model_name,
-                self.feature,
-                _ENCODER_TASKS_DATASETS_CONFIGS[self.feature],
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            gc.collect()
-
     def test_ort_fused_adam(self):
         # Pretend that onnxruntime-training is installed and mock onnxruntime.training.optim.FusedAdam exists.
         # Trainer.get_optimizer_cls_and_kwargs does not use FusedAdam. It only has to return the
         with patch.dict("sys.modules", {"onnxruntime.training.optim": mock}):
             self.check_optim_and_kwargs(
                 ORTOptimizerNames.ADAMW_ORT_FUSED,
                 default_adam_kwargs,
                 mock.optimizers.FusedAdam,
             )
-
-
-class ORTSeq2SeqTrainerSpecificIntegrationTest(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        args = ORTTrainingArguments("..")
-        self.n_epochs = min(args.num_train_epochs, 1)
-        self.per_device_train_batch_size = args.per_device_train_batch_size
-        self.per_device_eval_batch_size = args.per_device_eval_batch_size
-
-        self.max_seq_length = 32
-        self.max_train_samples = 10
-        self.max_valid_samples = 10
-        self.max_test_samples = 10
-
-        self.warmup_steps = 10
-        self.weight_decay = 0.01
-
-    @parameterized.expand(
-        _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
-        skip_on_empty=True,
-    )
-    def test_predict_with_generate_ort(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                evaluation_strategy="epoch",
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                label_smoothing_factor=0.1,
-                predict_with_generate=True,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.evaluate(inference_with_ort=True)
-            trainer.predict(test_dataset, inference_with_ort=True)
-            gc.collect()
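Note: the rewritten trainer tests run each case inside an emulated single-process torch.distributed environment instead of spawning a real launcher. A standalone sketch of that mechanism, under the same assumptions as the patch (transformers installed; constants as defined in nightly_test_trainer.py above):

import os

from transformers.testing_utils import mockenv_context

DEFAULT_MASTER_PORT = "10999"  # default torch.distributed port, as in the patch

def get_master_port(real_launcher=False):
    # A real launcher keeps the base port; the emulated launcher uses base + 1,
    # so both kinds of tests can run without colliding on a bound socket.
    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    if not real_launcher:
        master_port_base = str(int(master_port_base) + 1)
    return master_port_base

dist_env_1_gpu = {
    "MASTER_ADDR": "localhost",
    "MASTER_PORT": get_master_port(real_launcher=False),
    "RANK": "0",
    "LOCAL_RANK": "0",
    "WORLD_SIZE": "1",
}

with mockenv_context(**dist_env_1_gpu):
    # Inside the context, os.environ carries the fake launcher variables, so
    # ORTTrainer behaves as if torchrun had started this process; the previous
    # environment is restored on exit.
    assert os.environ["WORLD_SIZE"] == "1"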
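Similarly, the consolidated compute_metrics in run_glue.py relies on every `evaluate` metric returning a dict, with `evaluate.load("mse")` now covering the regression case (STS-B) that was previously computed by hand. A quick illustration (assuming the `evaluate` and `numpy` packages are installed):

import evaluate
import numpy as np

metric = evaluate.load("mse")  # "glue" and "accuracy" expose the same interface
preds = np.array([0.5, 1.2, 2.9])
refs = np.array([0.0, 1.0, 3.0])

result = metric.compute(predictions=preds, references=refs)
print(result)  # {'mse': 0.1}
# When a metric returns more than one key (e.g. accuracy and F1 for GLUE MRPC),
# the script averages the values into a single "combined_score".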