From 4a51534e63ea6cd22d0db2ecae156ae406ab86e4 Mon Sep 17 00:00:00 2001
From: JingyaHuang
Date: Wed, 18 Oct 2023 14:57:40 +0000
Subject: [PATCH] update tests

---
 .../docker/Dockerfile-ort-nightly-cu118       |   3 +-
 .../docker/Dockerfile-ort1.14.1-cu116         |   2 +-
 .../docker/Dockerfile-ort1.15.1-cu118         |   2 +-
 .../docker/Dockerfile-ort1.16.1-cu118         |   2 +-
 .../onnxruntime/training/test_examples.py     | 174 -----
 .../training/text-classification/run_glue.py  |  97 +--
 .../docker/Dockerfile_onnxruntime_trainer     |   7 +-
 .../training/nightly_test_examples.py         |  70 ++
 .../{ => training}/nightly_test_trainer.py    | 693 +++++++-----------
 9 files changed, 388 insertions(+), 662 deletions(-)
 delete mode 100644 examples/onnxruntime/training/test_examples.py
 create mode 100644 tests/onnxruntime/training/nightly_test_examples.py
 rename tests/onnxruntime/{ => training}/nightly_test_trainer.py (54%)

diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
index 668e5e56695..3e6841453b5 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118
@@ -22,6 +22,7 @@ CMD nvidia-smi
 ENV DEBIAN_FRONTEND noninteractive

 # Versions
+# available options 3.8, 3.9, 3.10, 3.11
 ARG PYTHON_VERSION=3.9
 ARG TORCH_CUDA_VERSION=cu118
 ARG TORCH_VERSION=2.0.0
@@ -34,7 +35,7 @@ SHELL ["/bin/bash", "-c"]
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
index db2219b5c62..15df7c352fe 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116
@@ -33,7 +33,7 @@ ARG TORCHVISION_VERSION=0.14.1
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
index 51c9ec514c4..2d1306e1a35 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118
@@ -34,7 +34,7 @@ ARG TORCHVISION_VERSION=0.15.1
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
index 3f6b8335923..482d495fcb4 100644
--- a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
+++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.1-cu118
@@ -34,7 +34,7 @@ SHELL ["/bin/bash", "-c"]
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
diff --git a/examples/onnxruntime/training/test_examples.py b/examples/onnxruntime/training/test_examples.py
deleted file mode 100644
index 8fe1de53d56..00000000000
--- a/examples/onnxruntime/training/test_examples.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import logging
-import os
-import sys
-import unittest
-from unittest.mock import patch
-
-import torch
-from transformers.file_utils import is_apex_available
-from transformers.testing_utils import TestCasePlus, get_gpu_count, slow, torch_device
-
-
-SRC_DIRS = [
-    os.path.join(os.path.dirname(__file__), dirname)
-    for dirname in [
-        "text-classification",
-        "token-classification",
-        "question-answering",
-        "translation",
-    ]
-]
-sys.path.extend(SRC_DIRS)
-if SRC_DIRS is not None:
-    import run_glue
-    import run_ner
-    import run_qa
-    import run_translation
-
-
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger()
-
-
-def get_results(output_dir):
-    results = {}
-    path = os.path.join(output_dir, "all_results.json")
-    if os.path.exists(path):
-        with open(path, "r") as f:
-            results = json.load(f)
-    else:
-        raise ValueError(f"can't find {path}")
-    return results
-
-
-def is_cuda_and_apex_available():
-    is_using_cuda = torch.cuda.is_available() and torch_device == "cuda"
-    return is_using_cuda and is_apex_available()
-
-
-class ExamplesTests(TestCasePlus):
-    # Text Classification Tests
-    def test_run_glue(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        tmp_dir = self.get_auto_remove_tmp_dir()
-        testargs = f"""
-            run_glue.py
-            --model_name_or_path bert-base-uncased
-            --task_name sst2
-            --do_train
-            --do_eval
-            --output_dir {tmp_dir}
-            --overwrite_output_dir
-            --learning_rate=1e-5
-            --per_device_train_batch_size=16
-            --per_device_eval_batch_size=16
-            """.split()
-
-        with patch.object(sys, "argv", testargs):
-            run_glue.main()
-            result = get_results(tmp_dir)
-            self.assertGreaterEqual(result["eval_accuracy"], 0.75)
-
-    # Token Classification Tests
-    def test_run_ner(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
-        epochs = 7 if get_gpu_count() > 1 else 2
-
-        tmp_dir = self.get_auto_remove_tmp_dir()
-        testargs = f"""
-            run_ner.py
-            --model_name_or_path bert-base-uncased
-            --dataset_name conll2003
-            --do_train
-            --do_eval
-            --output_dir {tmp_dir}
-            --overwrite_output_dir
-            --learning_rate=1e-5
-            --per_device_train_batch_size=16
-            --per_device_eval_batch_size=16
-            --num_train_epochs={epochs}
-            """.split()
-
-        with patch.object(sys, "argv", testargs):
-            run_ner.main()
-            result = get_results(tmp_dir)
-            self.assertGreaterEqual(result["eval_accuracy"], 0.75)
-            self.assertLess(result["eval_loss"], 0.5)
-
-    # Question Answering Tests
-    def test_run_qa(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        tmp_dir = self.get_auto_remove_tmp_dir()
-        testargs = f"""
-            run_qa.py
-            --model_name_or_path bert-base-uncased
-            --dataset_name squad
-            --do_train
-            --do_eval
-            --output_dir {tmp_dir}
-            --overwrite_output_dir
-            --learning_rate=1e-5
-            --per_device_train_batch_size=16
-            --per_device_eval_batch_size=16
-            """.split()
-
-        with patch.object(sys, "argv", testargs):
-            run_qa.main()
-            result = get_results(tmp_dir)
-            self.assertGreaterEqual(result["eval_f1"], 30)
-            self.assertGreaterEqual(result["eval_exact"], 30)
-
-    @slow
-    def test_run_translation(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        tmp_dir = self.get_auto_remove_tmp_dir()
-        testargs = f"""
-            run_translation.py
-            --model_name_or_path t5-large
-            --source_lang en
-            --target_lang ro
-            --dataset_name wmt16
-            --output_dir {tmp_dir}
-            --overwrite_output_dir
-            --max_steps=50
-            --warmup_steps=8
-            --do_train
-            --learning_rate=3e-3
-            --per_device_train_batch_size=2
-            --per_device_eval_batch_size=1
-            --predict_with_generate
-            """.split()
-
-        with patch.object(sys, "argv", testargs):
-            run_translation.main()
-            result = get_results(tmp_dir)
-            self.assertGreaterEqual(result["eval_bleu"], 30)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/examples/onnxruntime/training/text-classification/run_glue.py b/examples/onnxruntime/training/text-classification/run_glue.py
index 7a81a2ff156..f3f04657afb 100644
--- a/examples/onnxruntime/training/text-classification/run_glue.py
+++ b/examples/onnxruntime/training/text-classification/run_glue.py
@@ -21,6 +21,7 @@
 import os
 import random
 import sys
+import warnings
 from dataclasses import dataclass, field
 from typing import Optional

@@ -48,7 +49,7 @@


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.23.0")
+check_min_version("4.34.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -188,12 +189,28 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
+    token: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
     use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+        },
+    )
+    trust_remote_code: bool = field(
         default=False,
         metadata={
             "help": (
-                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-                "with private models)."
+                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                "execute code present on the Hub on your local machine."
             )
         },
     )
@@ -203,32 +220,24 @@ class ModelArguments:
     )


-@dataclass
-class InferenceArguments:
-    """
-    Arguments for inference(evaluate, predict).
-    """
-
-    inference_with_ort: bool = field(
-        default=False,
-        metadata={"help": "Whether use ONNX Runtime as backend for inference. Default set to false."},
-    )
-
-
 def main():
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.

-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments, InferenceArguments))
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ORTTrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
         # If we pass only one argument to the script and it's the path to a json file,
         # let's parse it to get our arguments.
-        model_args, data_args, training_args, inference_args = parser.parse_json_file(
-            json_file=os.path.abspath(sys.argv[1])
-        )
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
     else:
-        model_args, data_args, training_args, inference_args = parser.parse_args_into_dataclasses()
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if model_args.use_auth_token is not None:
+        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        if model_args.token is not None:
+            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+        model_args.token = model_args.use_auth_token

     # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
     # information sent is the one passed as arguments along with your Python/PyTorch versions.
@@ -241,6 +250,10 @@ def main():
         handlers=[logging.StreamHandler(sys.stdout)],
     )

+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+
     log_level = training_args.get_process_log_level()
     logger.setLevel(log_level)
     datasets.utils.logging.set_verbosity(log_level)
@@ -291,7 +304,7 @@ def main():
             "glue",
             data_args.task_name,
             cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
         )
     elif data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
@@ -299,7 +312,7 @@ def main():
             data_args.dataset_name,
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
         )
     else:
         # Loading a dataset from your local files.
@@ -328,7 +341,7 @@ def main():
                 "csv",
                 data_files=data_files,
                 cache_dir=model_args.cache_dir,
-                use_auth_token=True if model_args.use_auth_token else None,
+                token=model_args.token,
             )
         else:
             # Loading a dataset from local json files
@@ -336,7 +349,7 @@ def main():
                 "json",
                 data_files=data_files,
                 cache_dir=model_args.cache_dir,
-                use_auth_token=True if model_args.use_auth_token else None,
+                token=model_args.token,
             )
     # See more about loading any type of standard or custom dataset at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
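Note: the hunks above and below swap the deprecated `use_auth_token` argument for `token` throughout run_glue.py. For readers migrating their own scripts, here is a minimal standalone sketch of the same back-compat pattern (the helper name `resolve_token` is illustrative and not part of this patch):

import warnings

def resolve_token(token=None, use_auth_token=None):
    # Prefer the new `token` argument; fall back to the deprecated
    # `use_auth_token`, mirroring the handling added to main() above.
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v4.34.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        token = use_auth_token
    return token

print(resolve_token(use_auth_token="hf_xxx"))  # warns, then returns "hf_xxx"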
@@ -371,14 +384,16 @@ def main():
         finetuning_task=data_args.task_name,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
     )
     model = AutoModelForSequenceClassification.from_pretrained(
         model_args.model_name_or_path,
@@ -386,7 +401,8 @@ def main():
         config=config,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        token=model_args.token,
+        trust_remote_code=model_args.trust_remote_code,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )

@@ -440,7 +456,7 @@ def main():

     if data_args.max_seq_length > tokenizer.model_max_length:
         logger.warning(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
             f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
         )
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
@@ -496,6 +512,8 @@ def preprocess_function(examples):
     # Get the metric function
     if data_args.task_name is not None:
         metric = evaluate.load("glue", data_args.task_name)
+    elif is_regression:
+        metric = evaluate.load("mse")
     else:
         metric = evaluate.load("accuracy")

@@ -504,17 +522,12 @@
     def compute_metrics(p: EvalPrediction):
         preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
         preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
-        if data_args.task_name is not None:
-            result = metric.compute(predictions=preds, references=p.label_ids)
-            if len(result) > 1:
-                result["combined_score"] = np.mean(list(result.values())).item()
-            return result
-        elif is_regression:
-            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
-        else:
-            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
+        result = metric.compute(predictions=preds, references=p.label_ids)
+        if len(result) > 1:
+            result["combined_score"] = np.mean(list(result.values())).item()
+        return result

-    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to ORTTrainer, so we change it if
+    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
     # we already did the padding.
     if data_args.pad_to_max_length:
         data_collator = default_data_collator
@@ -532,7 +545,6 @@ def compute_metrics(p: EvalPrediction):
         compute_metrics=compute_metrics,
         tokenizer=tokenizer,
         data_collator=data_collator,
-        feature="text-classification",
     )

     # Training
@@ -550,6 +562,7 @@ def compute_metrics(p: EvalPrediction):
             metrics["train_samples"] = min(max_train_samples, len(train_dataset))

         trainer.save_model()  # Saves the tokenizer too for easy upload
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()

@@ -571,7 +584,7 @@ def compute_metrics(p: EvalPrediction):
         combined = {}

         for eval_dataset, task in zip(eval_datasets, tasks):
-            metrics = trainer.evaluate(eval_dataset=eval_dataset, inference_with_ort=inference_args.inference_with_ort)
+            metrics = trainer.evaluate(eval_dataset=eval_dataset)

             max_eval_samples = (
                 data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
@@ -599,9 +612,7 @@ def compute_metrics(p: EvalPrediction):
         for predict_dataset, task in zip(predict_datasets, tasks):
             # Removing the `label` columns because it contains -1 and Trainer won't like that.
             predict_dataset = predict_dataset.remove_columns("label")
-            predictions = trainer.predict(
-                predict_dataset, metric_key_prefix="predict", inference_with_ort=inference_args.inference_with_ort
-            ).predictions
+            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)

             output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer
index 62f7efc8178..7266ba224a8 100644
--- a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer
+++ b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer
@@ -34,7 +34,7 @@ ARG TORCHVISION_VERSION=0.15.1
 # Install and update tools to minimize security vulnerabilities
 RUN apt-get update
 RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \
-    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev && \
+    bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \
     apt-get clean
 RUN unattended-upgrade
 RUN apt-get autoremove -y
@@ -65,7 +65,7 @@
 RUN $PYTHON_EXE -m pip install onnx ninja
 RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION}

 # ORT Module
-RUN $PYTHON_EXE -m pip install onnxruntime-training==1.15.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
+RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
 RUN $PYTHON_EXE -m pip install torch-ort
 ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX"
 RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2
@@ -76,4 +76,5 @@
 COPY . /workspace/optimum
 RUN pip install /workspace/optimum[tests]
 ENV TEST_LEVEL=1
-CMD RUN_SLOW=1 pytest -v -rs onnxruntime/nightly_test_trainer.py --durations=0
\ No newline at end of file
+CMD RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_trainer.py --durations=0 && \
+    RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_examples.py --durations=0
\ No newline at end of file
diff --git a/tests/onnxruntime/training/nightly_test_examples.py b/tests/onnxruntime/training/nightly_test_examples.py
new file mode 100644
index 00000000000..2318c9b47b2
--- /dev/null
+++ b/tests/onnxruntime/training/nightly_test_examples.py
@@ -0,0 +1,70 @@
+# coding=utf-8
+# Copyright 2023 the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test ONNX Runtime Training Examples in Optimum."""
+
+import subprocess
+import unittest
+
+from transformers.testing_utils import slow
+
+
+@slow
+class ORTTrainerExampleTest(unittest.TestCase):
+    def test_text_classification(self):
+        subprocess.run(
+            "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./",
+            shell=True,
+        )
+
+        subprocess.run(
+            "torchrun"
+            " --nproc_per_node=1"
+            " run_glue.py"
+            " --model_name_or_path distilbert-base-uncased"
+            " --task_name mnli"
+            " --max_seq_length 64"
+            " --learning_rate 3e-6"
+            " --do_train"
+            " --output_dir /tmp/distilbert"
+            " --overwrite_output_dir"
+            " --max_steps 50"
+            " --logging_steps 50"
+            " --per_device_train_batch_size 8"
+            " --fp16 --optim adamw_ort_fused"
+            " --max_train_samples 20",
+            shell=True,
+            check=True,
+        )
+
+    # TODO: Test all ORT training examples
+    def test_token_classification(self):
+        pass
+
+    def test_translation(self):
+        pass
+
+    def test_summarization(self):
+        pass
+
+    def test_stable_diffusion_txt2img(self):
+        pass
+
+    def test_question_answering(self):
+        pass
+
+    def test_language_modeling(self):
+        pass
+
+    def test_image_classification(self):
+        pass
diff --git a/tests/onnxruntime/nightly_test_trainer.py b/tests/onnxruntime/training/nightly_test_trainer.py
similarity index 54%
rename from tests/onnxruntime/nightly_test_trainer.py
rename to tests/onnxruntime/training/nightly_test_trainer.py
index 2eb3ca433f7..e24ee306178 100644
--- a/tests/onnxruntime/nightly_test_trainer.py
+++ b/tests/onnxruntime/training/nightly_test_trainer.py
@@ -12,11 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Test ONNX Runtime Training ORTTrainer in Optimum.""" import gc +import os import random -import subprocess -import sys import tempfile import unittest from itertools import chain @@ -25,7 +25,6 @@ import nltk import numpy as np -import pytest from datasets import load_dataset from evaluate import load from transformers import ( @@ -35,12 +34,16 @@ AutoModelForTokenClassification, AutoTokenizer, DataCollatorForSeq2Seq, - DataCollatorForTokenClassification, DataCollatorWithPadding, default_data_collator, is_torch_available, ) -from transformers.testing_utils import require_deepspeed, require_torch, slow +from transformers.testing_utils import ( + mockenv_context, + require_deepspeed, + require_torch, + slow, +) from transformers.training_args import OptimizerNames @@ -75,11 +78,11 @@ "data_collator": default_data_collator, "data_collator_class": DataCollatorWithPadding, }, - "token-classification": { - "dataset": ["conll2003"], - "metric": ["seqeval"], - "data_collator_class": DataCollatorForTokenClassification, - }, + # "token-classification": { + # "dataset": ["conll2003"], + # "metric": ["seqeval"], + # "data_collator_class": DataCollatorForTokenClassification, + # }, } _DECODER_TASKS_DATASETS_CONFIGS = { @@ -88,11 +91,6 @@ "metric": ["accuracy"], "data_collator": default_data_collator, }, - "text-generation-with-past": { - "dataset": ["wikitext", "wikitext-2-raw-v1"], - "metric": ["accuracy"], - "data_collator": default_data_collator, - }, } _SEQ2SEQ_TASKS_DATASETS_CONFIGS = { @@ -101,30 +99,37 @@ "metric": ["rouge"], "data_collator_class": DataCollatorForSeq2Seq, }, - "text2text-generation-with-past": { - "dataset": ["xsum"], - "metric": ["rouge"], - "data_collator_class": DataCollatorForSeq2Seq, - }, } +# List supported ORT optimizers to test +optim_test_params = [] +if is_torch_available(): + default_adam_kwargs = { + "betas": (ORTTrainingArguments.adam_beta1, ORTTrainingArguments.adam_beta2), + "eps": ORTTrainingArguments.adam_epsilon, + "lr": ORTTrainingArguments.learning_rate, + } -def _get_models_to_test(model_list, task_list, both_inf_backend=False, excluded: Optional[List[str]] = None): + optim_test_params = [ + ( + ORTOptimizerNames.ADAMW_ORT_FUSED, + onnxruntime.training.optim.FusedAdam, + default_adam_kwargs, + ), + ] + +# default torch.distributed port +DEFAULT_MASTER_PORT = "10999" + + +def _get_models_to_test(model_list, task_list, excluded: Optional[List[str]] = None): models_to_test = [] for name, model_name in model_list: - for feature, data_metric_config in task_list.items(): - if excluded and (name in excluded or feature in excluded): + for task, data_metric_config in task_list.items(): + if excluded and (name in excluded or task in excluded): continue - if both_inf_backend: - models_to_test.append( - (f"{name}_{feature}", model_name, feature, data_metric_config, True) - ) # inference_with_ort=True - models_to_test.append( - (f"{name}_{feature}", model_name, feature, data_metric_config, False) - ) # inference_with_ort=False - else: - models_to_test.append((f"{name}_{feature}", model_name, feature, data_metric_config)) + models_to_test.append((f"{name}_{task}", model_name, task, data_metric_config)) return sorted(models_to_test) @@ -151,17 +156,39 @@ def _get_data_collator(data_metric_config, tokenizer=None, model=None, training_ return data_collator -def get_ort_training_args(feature, **kwargs): - if feature in _ENCODER_TASKS_DATASETS_CONFIGS or feature in _DECODER_TASKS_DATASETS_CONFIGS: +def get_ort_training_args(task, **kwargs): + if task in 
_ENCODER_TASKS_DATASETS_CONFIGS or task in _DECODER_TASKS_DATASETS_CONFIGS: training_args = ORTTrainingArguments(**kwargs) - elif feature in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: + elif task in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: training_args = ORTSeq2SeqTrainingArguments(**kwargs) return training_args +def get_master_port(real_launcher=False): + """ + When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed) + the issue is that once the port is tied it can't be used anywhere else outside of this process, + since torch.dist doesn't free the port until the process exits. Therefore for the sake of being + able to run both emulated launcher and normal launcher tests we need 2 distinct ports. + + This function will give the right port in the right context. For real launcher it'll give the + base port, for emulated launcher it'll give the base port + 1. In both cases a string is + returned. + + Args: + `real_launcher`: whether a real launcher is going to be used, or the emulated one + + """ + + master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + if not real_launcher: + master_port_base = str(int(master_port_base) + 1) + return master_port_base + + def get_ort_trainer( model_name, - feature, + task, data_metric_config, training_args, max_seq_length=None, @@ -170,7 +197,7 @@ def get_ort_trainer( max_test_samples=None, **kwargs, ): - training_kwargs = load_and_prepare(feature)( + training_kwargs = load_and_prepare(task)( model_name, data_metric_config, max_seq_length, @@ -185,26 +212,25 @@ def get_ort_trainer( if getattr(training_args, "predict_with_generate", False) is not True: training_kwargs.pop("compute_metrics", None) - if feature in _ENCODER_TASKS_DATASETS_CONFIGS or feature in _DECODER_TASKS_DATASETS_CONFIGS: - trainer = ORTTrainer(feature=feature, args=training_args, **training_kwargs) - elif feature in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: - trainer = ORTSeq2SeqTrainer(feature=feature, args=training_args, **training_kwargs) + if task in _ENCODER_TASKS_DATASETS_CONFIGS or task in _DECODER_TASKS_DATASETS_CONFIGS: + trainer = ORTTrainer(args=training_args, **training_kwargs) + elif task in _SEQ2SEQ_TASKS_DATASETS_CONFIGS: + trainer = ORTSeq2SeqTrainer(args=training_args, **training_kwargs) else: raise return trainer, test_dataset -def load_and_prepare(feature): +def load_and_prepare(task): preprocess_mapping = { "text-classification": load_and_prepare_glue, "token-classification": load_and_prepare_ner, "text-generation": load_and_prepare_clm, "text-generation-with-past": load_and_prepare_clm, "text2text-generation": load_and_prepare_xsum, - "text2text-generation-with-past": load_and_prepare_xsum, } - return preprocess_mapping[feature] + return preprocess_mapping[task] def load_and_prepare_glue(model_name, data_metric_config, max_seq_length, padding="max_length", **kwargs): @@ -520,212 +546,140 @@ class ORTTrainerIntegrationTest(unittest.TestCase): def setUp(self): super().setUp() args = ORTTrainingArguments("..") + master_port = get_master_port(real_launcher=False) + self.dist_env_1_gpu = { + "MASTER_ADDR": "localhost", + "MASTER_PORT": master_port, + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + } self.n_epochs = min(args.num_train_epochs, 1) - self.per_device_train_batch_size = args.per_device_train_batch_size - self.per_device_eval_batch_size = args.per_device_eval_batch_size + self.per_device_train_batch_size = min(args.per_device_train_batch_size, 2) + self.per_device_eval_batch_size = min(args.per_device_eval_batch_size, 2) 

         self.max_seq_length = 64
-        self.max_train_samples = 50
-        self.max_valid_samples = 20
-        self.max_test_samples = 10
+        self.max_train_samples = 10
+        self.max_valid_samples = 5
+        self.max_test_samples = 5

         self.warmup_steps = 10
         self.weight_decay = 0.01

     @parameterized.expand(
-        _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True)
-        # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True)  # Skip test for OOM bug
-        + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, both_inf_backend=True),
-        skip_on_empty=True,
-    )
-    def test_trainer_fp32(self, test_name, model_name, feature, data_metric_config, inference_with_ort):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.save_model()
-            trainer.evaluate(inference_with_ort=inference_with_ort)
-            trainer.predict(test_dataset, inference_with_ort=inference_with_ort)
-            gc.collect()
-
-    @parameterized.expand(
-        _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True)
-        # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, both_inf_backend=True)  # Skip test for OOM bug
-        + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, both_inf_backend=True),
+        _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
+        + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
+        + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
         skip_on_empty=True,
     )
-    def test_trainer_fp32_with_label_smoothing(
-        self, test_name, model_name, feature, data_metric_config, inference_with_ort
-    ):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                label_smoothing_factor=0.1,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.save_model()
-            trainer.evaluate(inference_with_ort=inference_with_ort)
-            trainer.predict(test_dataset, inference_with_ort=inference_with_ort)
-            gc.collect()
+    def test_trainer_fp32(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                )
+
+                trainer, test_dataset = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                trainer.save_model()
+                trainer.evaluate()
+                trainer.predict(test_dataset)
+                gc.collect()

     @slow
     @parameterized.expand(
         _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-        # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)  # Skip test for OOM bug
+        + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
         + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
         skip_on_empty=True,
     )
-    def test_trainer_fp16_pt_inference(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                fp16=True,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.save_model()
-            trainer.evaluate()
-            trainer.predict(test_dataset)
-            gc.collect()
+    def test_trainer_fp32_with_label_smoothing(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    label_smoothing_factor=0.1,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                )
+
+                trainer, test_dataset = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                trainer.save_model()
+                trainer.evaluate()
+                trainer.predict(test_dataset)
+                gc.collect()

     @slow
     @parameterized.expand(
         _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-        # Exclude "with-past" tests as they fail for ORT inference after the mixed-precision training
-        # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS, excluded=["text-generation-with-past"])  # Skip test for OOM bug
-        + _get_models_to_test(
-            _SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS, excluded=["text2text-generation-with-past"]
-        ),
+        + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
+        + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
         skip_on_empty=True,
     )
-    def test_trainer_fp16_ort_inference(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                fp16=True,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.save_model()
-            trainer.evaluate(inference_with_ort=True)
-            trainer.predict(test_dataset, inference_with_ort=True)
-            gc.collect()
-
-    # Skip this test as a large amount of ops don't support bf16 yet.
-    # @unittest.skip("Skip BF16 test.")
-    # @slow
-    # @require_torch_bf16_gpu
-    # @parameterized.expand(
-    #     _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-    #     + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
-    #     + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
-    #     skip_on_empty=True,
-    # )
-    # def test_trainer_bf16(self, test_name, model_name, feature, data_metric_config):
-    #     with tempfile.TemporaryDirectory() as tmp_dir:
-    #         training_args = get_ort_training_args(
-    #             feature=feature,
-    #             output_dir=tmp_dir,
-    #             num_train_epochs=self.n_epochs,
-    #             per_device_train_batch_size=self.per_device_train_batch_size,
-    #             per_device_eval_batch_size=self.per_device_eval_batch_size,
-    #             warmup_steps=self.warmup_steps,
-    #             weight_decay=self.weight_decay,
-    #             logging_dir=tmp_dir,
-    #             bf16=True,
-    #         )
-
-    #         trainer, test_dataset = get_ort_trainer(
-    #             model_name,
-    #             feature,
-    #             data_metric_config,
-    #             training_args,
-    #             max_seq_length=self.max_seq_length,
-    #             max_train_samples=self.max_train_samples,
-    #             max_valid_samples=self.max_valid_samples,
-    #             max_test_samples=self.max_test_samples,
-    #         )
-
-    #         trainer.train()
-    #         trainer.save_model()
-    #         trainer.evaluate()
-    #         trainer.predict(test_dataset)
-    #         gc.collect()
+    def test_trainer_fp16(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                    fp16=True,
+                )
+
+                trainer, test_dataset = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                trainer.save_model()
+                trainer.evaluate()
+                trainer.predict(test_dataset)
+                gc.collect()


 @slow
@@ -734,14 +688,22 @@ class ORTTrainerIntegrationDeepSpeedTest(unittest.TestCase):
     def setUp(self):
         super().setUp()
         args = ORTTrainingArguments("..")
+        master_port = get_master_port(real_launcher=False)
+        self.dist_env_1_gpu = {
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": master_port,
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+        }
         self.n_epochs = min(args.num_train_epochs, 1)
-        self.per_device_train_batch_size = args.per_device_train_batch_size
-        self.per_device_eval_batch_size = args.per_device_eval_batch_size
+        self.per_device_train_batch_size = min(args.per_device_train_batch_size, 2)
+        self.per_device_eval_batch_size = min(args.per_device_eval_batch_size, 2)

         self.max_seq_length = 64
-        self.max_train_samples = 30
-        self.max_valid_samples = 10
-        self.max_test_samples = 10
+        self.max_train_samples = 10
+        self.max_valid_samples = 5
+        self.max_test_samples = 5

         self.warmup_steps = 10
         self.weight_decay = 0.01
@@ -749,126 +711,80 @@ def setUp(self):
     @parameterized.expand(
         random.sample(
             _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-            # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
+            + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
             + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
-            1,
+            1,  # only test one
         ),
         skip_on_empty=True,
     )
-    def test_trainer_fp16_ds_stage1(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                fp16=True,
-                deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json",
-            )
-
-            trainer, _ = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            gc.collect()
+    def test_trainer_fp16_ds_stage1(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                    fp16=True,
+                    deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json",
+                )
+
+                trainer, _ = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                gc.collect()

     @parameterized.expand(
         random.sample(
             _get_models_to_test(_ENCODERS_TO_TEST, _ENCODER_TASKS_DATASETS_CONFIGS)
-            # + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
+            + _get_models_to_test(_DECODERS_TO_TEST, _DECODER_TASKS_DATASETS_CONFIGS)
             + _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
             1,
         ),
         skip_on_empty=True,
     )
-    def test_trainer_fp16_ds_stage2(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                fp16=True,
-                deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json",
-            )
-
-            trainer, _ = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            gc.collect()
-
-
-@slow
-@pytest.mark.skip(reason="skip for now, server socket error")
-class ORTTrainerIntegrationDDPTest(unittest.TestCase):
-    def test_trainer_ddp_glue(self):
-        subprocess.run(
-            "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./",
-            shell=True,
-        )
-
-        subprocess.run(
-            f"{sys.executable} -m torch.distributed.launch"
-            " --nproc_per_node=1"
-            " run_glue.py"
-            " --model_name_or_path distilbert-base-uncased"
-            " --task_name mnli"
-            " --max_seq_length 128"
-            " --learning_rate 3e-6"
-            " --do_train"
-            " --output_dir /tmp/distilbert"
-            " --overwrite_output_dir"
-            " --max_steps 200"
-            " --logging_steps 20"
-            " --per_device_train_batch_size 32"
-            " --fp16 --optim adamw_ort_fused"
-            " --max_train_samples 500",
-            shell=True,
-            check=True,
-        )
-
-
-# List supported ORT optimizers to test
-optim_test_params = []
-if is_torch_available():
-    default_adam_kwargs = {
-        "betas": (ORTTrainingArguments.adam_beta1, ORTTrainingArguments.adam_beta2),
-        "eps": ORTTrainingArguments.adam_epsilon,
-        "lr": ORTTrainingArguments.learning_rate,
-    }
-
-    optim_test_params = [
-        (
-            ORTOptimizerNames.ADAMW_ORT_FUSED,
-            onnxruntime.training.optim.FusedAdam,
-            default_adam_kwargs,
-        ),
-    ]
+    def test_trainer_fp16_ds_stage2(self, test_name, model_name, task, data_metric_config):
+        with mockenv_context(**self.dist_env_1_gpu):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                training_args = get_ort_training_args(
+                    task=task,
+                    output_dir=tmp_dir,
+                    num_train_epochs=self.n_epochs,
+                    per_device_train_batch_size=self.per_device_train_batch_size,
+                    per_device_eval_batch_size=self.per_device_eval_batch_size,
+                    warmup_steps=self.warmup_steps,
+                    weight_decay=self.weight_decay,
+                    logging_dir=tmp_dir,
+                    fp16=True,
+                    deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json",
+                )
+
+                trainer, _ = get_ort_trainer(
+                    model_name,
+                    task,
+                    data_metric_config,
+                    training_args,
+                    max_seq_length=self.max_seq_length,
+                    max_train_samples=self.max_train_samples,
+                    max_valid_samples=self.max_valid_samples,
+                    max_test_samples=self.max_test_samples,
+                )
+
+                trainer.train()
+                gc.collect()


 @slow
@@ -876,21 +792,6 @@ def test_trainer_ddp_glue(self):
 class ORTTrainerOptimizerChoiceTest(unittest.TestCase):
     def setUp(self):
         super().setUp()
-        args = ORTTrainingArguments("..")
-        self.n_epochs = min(args.num_train_epochs, 1)
-        self.per_device_train_batch_size = args.per_device_train_batch_size
-        self.per_device_eval_batch_size = args.per_device_eval_batch_size
-
-        self.max_seq_length = 64
-        self.max_train_samples = 50
-        self.max_valid_samples = 20
-        self.max_test_samples = 10
-
-        self.warmup_steps = 10
-        self.weight_decay = 0.01
-
-        self.model_name = "bert-base-cased"
-        self.feature = "text-classification"

     def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expected_cls):
         args = ORTTrainingArguments(optim=optim, output_dir="None")
@@ -903,37 +804,6 @@ def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expect
             actual_v = optim_kwargs[p]
             self.assertTrue(actual_v == v, f"Failed check for {p}. Expected {v}, but got {actual_v}.")

-    @parameterized.expand(optim_test_params, skip_on_empty=True)
-    def test_optim_supported(self, name: str, expected_cls, mandatory_kwargs):
-        # exercises all the valid --optim options
-        self.check_optim_and_kwargs(name, mandatory_kwargs, expected_cls)
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = ORTTrainingArguments(
-                optim=name,
-                output_dir=tmp_dir,
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-            )
-
-            trainer, _ = get_ort_trainer(
-                self.model_name,
-                self.feature,
-                _ENCODER_TASKS_DATASETS_CONFIGS[self.feature],
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            gc.collect()
-
     def test_ort_fused_adam(self):
         # Pretend that onnxruntime-training is installed and mock onnxruntime.training.optim.FusedAdam exists.
         # Trainer.get_optimizer_cls_and_kwargs does not use FusedAdam. It only has to return the
         with patch.dict("sys.modules", {"onnxruntime.training.optim": mock}):
             self.check_optim_and_kwargs(
                 ORTOptimizerNames.ADAMW_ORT_FUSED,
                 default_adam_kwargs,
                 mock.optimizers.FusedAdam,
             )
-
-
-class ORTSeq2SeqTrainerSpecificIntegrationTest(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        args = ORTTrainingArguments("..")
-        self.n_epochs = min(args.num_train_epochs, 1)
-        self.per_device_train_batch_size = args.per_device_train_batch_size
-        self.per_device_eval_batch_size = args.per_device_eval_batch_size
-
-        self.max_seq_length = 32
-        self.max_train_samples = 10
-        self.max_valid_samples = 10
-        self.max_test_samples = 10
-
-        self.warmup_steps = 10
-        self.weight_decay = 0.01
-
-    @parameterized.expand(
-        _get_models_to_test(_SEQ2SEQ_MODELS_TO_TEST, _SEQ2SEQ_TASKS_DATASETS_CONFIGS),
-        skip_on_empty=True,
-    )
-    def test_predict_with_generate_ort(self, test_name, model_name, feature, data_metric_config):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            training_args = get_ort_training_args(
-                feature=feature,
-                output_dir=tmp_dir,
-                evaluation_strategy="epoch",
-                num_train_epochs=self.n_epochs,
-                per_device_train_batch_size=self.per_device_train_batch_size,
-                per_device_eval_batch_size=self.per_device_eval_batch_size,
-                warmup_steps=self.warmup_steps,
-                weight_decay=self.weight_decay,
-                logging_dir=tmp_dir,
-                label_smoothing_factor=0.1,
-                predict_with_generate=True,
-            )
-
-            trainer, test_dataset = get_ort_trainer(
-                model_name,
-                feature,
-                data_metric_config,
-                training_args,
-                max_seq_length=self.max_seq_length,
-                max_train_samples=self.max_train_samples,
-                max_valid_samples=self.max_valid_samples,
-                max_test_samples=self.max_test_samples,
-            )
-
-            trainer.train()
-            trainer.evaluate(inference_with_ort=True)
-            trainer.predict(test_dataset, inference_with_ort=True)
-            gc.collect()
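Note: the rewritten trainer tests run each case inside an emulated single-process torch.distributed environment instead of spawning a real launcher. A standalone sketch of that mechanism, under the same assumptions as the patch (transformers installed; constants as defined in nightly_test_trainer.py above):

import os

from transformers.testing_utils import mockenv_context

DEFAULT_MASTER_PORT = "10999"  # default torch.distributed port, as in the patch

def get_master_port(real_launcher=False):
    # A real launcher keeps the base port; the emulated launcher uses base + 1,
    # so both kinds of tests can run without colliding on a bound socket.
    master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    if not real_launcher:
        master_port_base = str(int(master_port_base) + 1)
    return master_port_base

dist_env_1_gpu = {
    "MASTER_ADDR": "localhost",
    "MASTER_PORT": get_master_port(real_launcher=False),
    "RANK": "0",
    "LOCAL_RANK": "0",
    "WORLD_SIZE": "1",
}

with mockenv_context(**dist_env_1_gpu):
    # Inside the context, os.environ carries the fake launcher variables, so
    # ORTTrainer behaves as if torchrun had started this process; the previous
    # environment is restored on exit.
    assert os.environ["WORLD_SIZE"] == "1"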
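Similarly, the consolidated compute_metrics in run_glue.py relies on every `evaluate` metric returning a dict, with `evaluate.load("mse")` now covering the regression case (STS-B) that was previously computed by hand. A quick illustration (assuming the `evaluate` and `numpy` packages are installed):

import evaluate
import numpy as np

metric = evaluate.load("mse")  # "glue" and "accuracy" expose the same interface
preds = np.array([0.5, 1.2, 2.9])
refs = np.array([0.0, 1.0, 3.0])

result = metric.compute(predictions=preds, references=refs)
print(result)  # {'mse': 0.1}
# When a metric returns more than one key (e.g. accuracy and F1 for GLUE MRPC),
# the script averages the values into a single "combined_score".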