From dbbe625d5b7beac0b4465f7b90441d9f4593bc57 Mon Sep 17 00:00:00 2001
From: zjgarvey <47986913+zjgarvey@users.noreply.github.com>
Date: Tue, 17 Sep 2024 17:18:43 -0700
Subject: [PATCH] [alt] Add a CL-based mode (#342)

Adds a `--mode=cl-onnx-iree` option which mirrors `--mode=onnx-iree` (the
default) but runs most stages via command-line scripts rather than Python
bindings. This seems to work better for a few reasons:

1. Easier command-line reproducers for failures (the failing script is
   printed in the corresponding log file).
2. Memory management per stage is more contained and doesn't need to rely
   on Python's garbage collector.
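
For example (run from the alt_e2eshark directory, with the device and
backend flags left at their defaults), the new mode is selected the same
way as the existing ones:

    python run.py --mode=cl-onnx-iree

Each stage writes the exact script it ran (import-onnx, torch-mlir-opt,
iree-compile, iree-run-module) to a `commands/` subdirectory of the test's
log directory, so a failure can be reproduced by re-running that script.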
---
 alt_e2eshark/e2e_testing/backends.py          |  55 ++++++-
 alt_e2eshark/e2e_testing/framework.py         |   6 +-
 alt_e2eshark/e2e_testing/onnx_utils.py        |   8 +-
 alt_e2eshark/e2e_testing/storage.py           |   3 +-
 .../e2e_testing/test_configs/onnxconfig.py    | 139 +++++++++++++++++-
 alt_e2eshark/onnx_tests/models/migraphx.py    |  37 +----
 alt_e2eshark/run.py                           |  23 ++-
 7 files changed, 218 insertions(+), 53 deletions(-)

diff --git a/alt_e2eshark/e2e_testing/backends.py b/alt_e2eshark/e2e_testing/backends.py
index 70a10385..3e1de6be 100644
--- a/alt_e2eshark/e2e_testing/backends.py
+++ b/alt_e2eshark/e2e_testing/backends.py
@@ -6,9 +6,11 @@
 import abc
 import onnxruntime as ort
 from typing import TypeVar, List
-from e2e_testing.storage import TestTensors
+from e2e_testing.storage import TestTensors, get_shape_string
 from e2e_testing.framework import CompiledOutput, ModelArtifact
 from onnx import ModelProto
+import os
+from pathlib import Path
 
 Invoker = TypeVar("Invoker")
 
@@ -72,7 +74,7 @@ def compile(self, module, *, save_to: str = None):
         )
         # log the vmfb
         if save_to:
-            with open(save_to + "compiled_model.vmfb", "wb") as f:
+            with open(os.path.join(save_to, "compiled_model.vmfb"), "wb") as f:
                 f.write(b)
         return b
 
@@ -94,6 +96,55 @@ def func(x):
 
         return func
 
+class CLIREEBackend(BackendBase):
+    '''This backend calls iree through the command line to compile and run MLIR modules'''
+    def __init__(self, *, device="local-task", hal_target_backend="llvm-cpu", extra_args : List[str] = None):
+        self.device = device
+        self.hal_target_backend = hal_target_backend
+        self.extra_args = []
+        if extra_args:
+            for a in extra_args:
+                if a[0:2] == "--":
+                    self.extra_args.append(a)
+                else:
+                    self.extra_args.append("--" + a)
+
+    def compile(self, module_path: str, *, save_to : str = None) -> str:
+        vmfb_path = os.path.join(save_to, "compiled_model.vmfb")
+        arg_string = f"--iree-hal-target-backends={self.hal_target_backend} "
+        for arg in self.extra_args:
+            arg_string += arg
+            arg_string += " "
+        command_error_dump = os.path.join(save_to, "detail", "compilation.detail.log")
+        commands_log = os.path.join(save_to, "commands", "compilation.commands.log")
+        script = f"iree-compile {module_path} {arg_string}-o {vmfb_path} 1> {command_error_dump} 2>&1"
+        with open(commands_log, "w") as file:
+            file.write(script)
+        # remove old vmfb if it exists
+        Path(vmfb_path).unlink(missing_ok=True)
+        os.system(script)
+        if not os.path.exists(vmfb_path):
+            error_message = f"failure executing command: \n{script}\n failed to produce a vmfb at {vmfb_path}.\n"
+            if os.path.exists(command_error_dump):
+                error_message += "Error Details:\n\n"
+                with open(command_error_dump, "r+") as file:
+                    error_message += file.read()
+            raise FileNotFoundError(error_message)
+        return vmfb_path
+
+    def load(self, vmfb_path: str, *, func_name=None):
+        """A bit hacky. func returns a script that would dump outputs to terminal output. Modified in config.run method"""
+        run_dir = Path(vmfb_path).parent
+        def func(x: TestTensors) -> str:
+            script = f"iree-run-module --module='{vmfb_path}' --device={self.device}"
+            if func_name:
+                script += f" --function='{func_name}'"
+            torch_inputs = x.to_torch().data
+            for index, input in enumerate(torch_inputs):
+                script += f" --input='{get_shape_string(input)}=@{run_dir}/input.{index}.bin'"
+            return script
+        return func
+
 class OnnxrtIreeEpBackend(BackendBase):
     '''This backend uses onnxrt iree-ep to compile and run onnx models for a specified hal_target_backend'''
 
diff --git a/alt_e2eshark/e2e_testing/framework.py b/alt_e2eshark/e2e_testing/framework.py
index 71e7c75f..28bc2014 100644
--- a/alt_e2eshark/e2e_testing/framework.py
+++ b/alt_e2eshark/e2e_testing/framework.py
@@ -87,11 +87,13 @@ def save_processed_output(self, output: TestTensors, save_to: str, name: str):
 
     # the following helper methods aren't meant to be overriden
 
-    def get_signature(self, *, from_inputs=True):
+    def get_signature(self, *, from_inputs=True, leave_dynamic=False):
         """Returns the input or output signature of self.model"""
         if not os.path.exists(self.model):
             self.construct_model()
-        return get_signature_for_onnx_model(self.model, from_inputs=from_inputs, dim_param_dict=self.dim_param_dict)
+        if not leave_dynamic:
+            self.update_dim_param_dict()
+        return get_signature_for_onnx_model(self.model, from_inputs=from_inputs, dim_param_dict=self.dim_param_dict, leave_dynamic=leave_dynamic)
 
     def load_inputs(self, dir_path):
         """computes the input signature of the onnx model and loads inputs from bin files"""
diff --git a/alt_e2eshark/e2e_testing/onnx_utils.py b/alt_e2eshark/e2e_testing/onnx_utils.py
index 26b6ff47..78c2833f 100644
--- a/alt_e2eshark/e2e_testing/onnx_utils.py
+++ b/alt_e2eshark/e2e_testing/onnx_utils.py
@@ -73,7 +73,7 @@ def get_sample_inputs_for_onnx_model(model_path, dim_param_dict = None):
     return sample_inputs
 
 
-def get_signature_for_onnx_model(model_path, *, from_inputs: bool = True, dim_param_dict: Optional[dict[str, int]] = None):
+def get_signature_for_onnx_model(model_path, *, from_inputs: bool = True, dim_param_dict: Optional[dict[str, int]] = None, leave_dynamic: bool = False):
     """A convenience funtion for retrieving the input or output shapes and dtypes"""
     s = onnxruntime.InferenceSession(model_path, None)
     if from_inputs:
@@ -83,7 +83,11 @@ def get_signature_for_onnx_model(model_path, *, from_inputs: bool = True, dim_pa
     shapes = []
     dtypes = []
     for i in nodes:
-        shapes.append(i.shape)
+        shape = i.shape
+        for index, s in enumerate(shape):
+            if not leave_dynamic and isinstance(s, str) and s in dim_param_dict.keys():
+                shape[index] = dim_param_dict[s]
+        shapes.append(shape)
         dtypes.append(dtype_from_ort_node(i))
     return shapes, dtypes
 
diff --git a/alt_e2eshark/e2e_testing/storage.py b/alt_e2eshark/e2e_testing/storage.py
index 28a0976e..a8f40d7e 100644
--- a/alt_e2eshark/e2e_testing/storage.py
+++ b/alt_e2eshark/e2e_testing/storage.py
@@ -9,6 +9,7 @@
 import torch
 from typing import Tuple, Optional, Dict, List, Any, Union
 from pathlib import Path
+import os
 
 def get_shape_string(torch_tensor):
     input_shape = list(torch_tensor.shape)
@@ -211,7 +212,7 @@ def load_from(shapes, torch_dtypes, dir_path: str, name: str = "input"):
         for i in range(len(shapes)):
             shape = shapes[i]
             dtype = torch_dtypes[i]
-            t = load_raw_binary_as_torch_tensor(dir_path + name + "." + str(i) + ".bin", shape, dtype)
+            t = load_raw_binary_as_torch_tensor(os.path.join(dir_path, name + "." + str(i) + ".bin"), shape, dtype)
+ str(i) + ".bin"), shape, dtype) tensor_list.append(t) return TestTensors(tuple(tensor_list)) diff --git a/alt_e2eshark/e2e_testing/test_configs/onnxconfig.py b/alt_e2eshark/e2e_testing/test_configs/onnxconfig.py index 05730548..8eb94bc9 100644 --- a/alt_e2eshark/e2e_testing/test_configs/onnxconfig.py +++ b/alt_e2eshark/e2e_testing/test_configs/onnxconfig.py @@ -11,8 +11,10 @@ from e2e_testing.framework import TestConfig, OnnxModelInfo, Module, CompiledArtifact from e2e_testing.storage import TestTensors from torch_mlir.passmanager import PassManager -from typing import Tuple +from typing import Tuple, Any from onnxruntime import InferenceSession +import os +from pathlib import Path REDUCE_TO_LINALG_PIPELINE = [ "torch-lower-to-backend-contract", @@ -81,7 +83,7 @@ def import_model(self, model_info: OnnxModelInfo, *, save_to: str = None) -> Tup imp.import_all() # log imported IR if save_to: - with open(save_to + "model.torch_onnx.mlir", "w") as f: + with open(os.path.join(save_to, "model.torch_onnx.mlir"), "w") as f: f.write(str(m)) return m, func_name @@ -96,13 +98,13 @@ def preprocess_model(self, mlir_module: Module, *, save_to: str = None) -> Modul pm0.run(mlir_module.operation) # log torch-mlir IR if save_to: - with open(save_to + "model.torch.mlir", "w") as f: + with open(os.path.join(save_to, "model.torch.mlir"), "w") as f: f.write(str(mlir_module)) pm1 = PassManager.parse(self.pass_pipeline) pm1.run(mlir_module.operation) # log modified IR if save_to: - with open(save_to + "model.modified.mlir", "w") as f: + with open(os.path.join(save_to, "model.modified.mlir"), "w") as f: f.write(str(mlir_module)) return mlir_module @@ -112,3 +114,132 @@ def compile(self, mlir_module: Module, *, save_to: str = None) -> CompiledArtifa def run(self, artifact: CompiledArtifact, inputs: TestTensors, *, func_name="main") -> TestTensors: func = self.backend.load(artifact, func_name=func_name) return func(inputs) + +class CLOnnxTestConfig(TestConfig): + '''This is parallel to OnnxTestConfig, but uses command-line scripts for each stage.''' + def __init__( + self, log_dir: str, backend: BackendBase, torch_mlir_pipeline: Tuple[str, ...] 
+    ):
+        super().__init__()
+        self.log_dir = log_dir
+        self.backend = backend
+        self.tensor_info_dict = dict()
+        if len(torch_mlir_pipeline) > 0:
+            self.pass_pipeline = "builtin.module(" + ",".join(torch_mlir_pipeline) + ")"
+        else:
+            self.pass_pipeline = None
+
+    def import_model(self, program: OnnxModelInfo, *, save_to: str) -> Tuple[str, str]:
+        if not save_to:
+            raise ValueError("CLOnnxTestConfig requires saving artifacts")
+        # setup a detail subdirectory
+        os.makedirs(os.path.join(save_to, "detail"), exist_ok=True)
+        # setup a commands subdirectory
+        os.makedirs(os.path.join(save_to, "commands"), exist_ok=True)
+        # set file paths
+        mlir_file = os.path.join(save_to, "model.torch_onnx.mlir")
+        detail_log = os.path.join(save_to, "detail", "import_model.detail.log")
+        commands_log = os.path.join(save_to, "commands", "import_model.commands.log")
+        # get a command line script
+        script = "python -m torch_mlir.tools.import_onnx "
+        script += str(program.model)
+        script += " -o "
+        script = script + mlir_file
+        script += f" 1> {detail_log} 2>&1"
+        # log the command
+        with open(commands_log, "w") as file:
+            file.write(script)
+        # remove old mlir_file if present
+        Path(mlir_file).unlink(missing_ok=True)
+        # run the command
+        os.system(script)
+        # check if a new mlir file was generated
+        if not os.path.exists(mlir_file):
+            error_msg = f"failure executing command: \n{script}\n failed to produce mlir file {mlir_file}.\n"
+            if os.path.exists(detail_log):
+                error_msg += "Error detail:\n\n"
+                with open(detail_log,"r+") as file:
+                    error_msg += file.read()
+            raise FileNotFoundError(error_msg)
+        # store output signatures for loading the outputs of iree-run-module
+        self.tensor_info_dict[program.name] = program.get_signature(from_inputs=False)
+        # get the func name
+        # TODO put this as an OnnxModelInfo attr?
+        model = onnx.load(program.model, load_external_data=False)
+        func_name = model.graph.name
+        return mlir_file, func_name
+
+    def preprocess_model(self, mlir_module: str, *, save_to: str = None) -> Module:
+        # if the pass pipeline is empty, return the original module
+        if not self.pass_pipeline:
+            return mlir_module
+        # convert imported torch-onnx ir to torch
+        onnx_to_torch_pipeline = "builtin.module(func.func(convert-torch-onnx-to-torch))"
+        # get paths
+        detail_log = os.path.join(save_to, "detail", "preprocessing.detail.log")
+        commands_log = os.path.join(save_to, "commands", "preprocessing.commands.log")
+        torch_ir = os.path.join(save_to, "model.torch.mlir")
+        linalg_ir = os.path.join(save_to, "model.modified.mlir")
+        # generate scripts
+        script0 = f"torch-mlir-opt -pass-pipeline='{onnx_to_torch_pipeline}' {mlir_module} -o {torch_ir} 1> {detail_log} 2>&1"
+        script1 = f"torch-mlir-opt -pass-pipeline='{self.pass_pipeline}' {torch_ir} -o {linalg_ir} 1> {detail_log} 2>&1"
+        # remove old torch_ir
+        Path(torch_ir).unlink(missing_ok=True)
+        with open(commands_log, "w") as file:
+            file.write(script0)
+            file.write(script1)
+        # run torch-onnx-to-torch
+        os.system(script0)
+        if not os.path.exists(torch_ir):
+            error_msg = f"failure executing command: \n{script0}\n failed to produce mlir file {torch_ir}.\n"
+            if os.path.exists(detail_log):
+                error_msg += "Error detail:\n\n"
+                with open(detail_log,"r+") as file:
+                    error_msg += file.read()
+            raise FileNotFoundError(error_msg)
+        # remove old linalg ir
+        Path(linalg_ir).unlink(missing_ok=True)
+        # run torch-to-linalg pipeline
+        os.system(script1)
+        if not os.path.exists(linalg_ir):
+            error_msg = f"failure executing command: \n{script1}\n failed to produce mlir file {linalg_ir}.\n"
+            if os.path.exists(detail_log):
+                error_msg += "Error detail:\n\n"
+                with open(detail_log,"r+") as file:
+                    error_msg += file.read()
+            raise FileNotFoundError(error_msg)
+        return linalg_ir
+
+    def compile(self, mlir_module: str, *, save_to: str = None) -> str:
+        return self.backend.compile(mlir_module, save_to=save_to)
+
+    def run(self, artifact: str, inputs: TestTensors, *, func_name=None) -> TestTensors:
+        run_dir = Path(artifact).parent
+        test_name = run_dir.name
+        detail_log = run_dir.joinpath("detail", "compiled_inference.detail.log")
+        commands_log = run_dir.joinpath("commands", "compiled_inference.commands.log")
+        func = self.backend.load(artifact, func_name=func_name)
+        script = func(inputs)
+        num_outputs = len(self.tensor_info_dict[test_name][0])
+        output_files = []
+        for i in range(num_outputs):
+            output_files.append(os.path.join(run_dir, f"output.{i}.bin"))
+            script += f" --output=@'{output_files[i]}'"
+            # remove existing output files if they already exist
+            # we use the existence of these files to check if the inference succeeded.
+            Path(output_files[i]).unlink(missing_ok=True)
+        # dump additional error messaging to the detail log.
+ script += f" 1> {detail_log} 2>&1" + with open(commands_log, "w") as file: + file.write(script) + os.system(script) + for file in output_files: + if not os.path.exists(file): + error_msg = f"failure executing command: \n{script}\n failed to produce output file {file}.\n" + if os.path.exists(detail_log): + error_msg += "Error detail:\n\n" + with open(detail_log,"r+") as file: + error_msg += file.read() + raise FileNotFoundError(error_msg) + return TestTensors.load_from(self.tensor_info_dict[test_name][0], self.tensor_info_dict[test_name][1], run_dir, "output") + diff --git a/alt_e2eshark/onnx_tests/models/migraphx.py b/alt_e2eshark/onnx_tests/models/migraphx.py index 103edf28..c3a6603e 100644 --- a/alt_e2eshark/onnx_tests/models/migraphx.py +++ b/alt_e2eshark/onnx_tests/models/migraphx.py @@ -13,38 +13,6 @@ # 3. setup dim params for other misc models # 4. reupload cadence model 1 -ALL_MODELS = [ - "migraphx_agentmodel__AgentModel", - "migraphx_bert__bert-large-uncased", - "migraphx_bert__bertsquad-12", - "migraphx_cadene__dpn92i1", - "migraphx_cadene__inceptionv4i16", - "migraphx_cadene__resnext101_64x4di1", - "migraphx_cadene__resnext101_64x4di16", - "migraphx_huggingface-transformers__bert_mrpc8", - "migraphx_mlperf__bert_large_mlperf", - "migraphx_mlperf__resnet50_v1", - "migraphx_onnx-misc__taau_low_res_downsample_d2s_for_infer_time_fp16_opset11", - "migraphx_onnx-model-zoo__gpt2-10", - "migraphx_ORT__bert_base_cased_1", - "migraphx_ORT__bert_base_uncased_1", - "migraphx_ORT__bert_large_uncased_1", - "migraphx_ORT__distilgpt2_1", - "migraphx_ORT__onnx_models__bert_base_cased_1_fp16_gpu", - "migraphx_ORT__onnx_models__bert_large_uncased_1_fp16_gpu", - "migraphx_ORT__onnx_models__distilgpt2_1_fp16_gpu", - "migraphx_pytorch-examples__wlang_gru", - "migraphx_pytorch-examples__wlang_lstm", - "migraphx_sd__unet__model", - "migraphx_sdxl__unet__model", - "migraphx_torchvision__densenet121i32", - "migraphx_torchvision__inceptioni1", - "migraphx_torchvision__inceptioni32", - "migraphx_torchvision__resnet50i1", - "migraphx_torchvision__resnet50i64", -] - - def dim_param_constructor(dim_param_dict): class AzureWithDimParams(AzureDownloadableModel): def __init__(self, *args, **kwargs): @@ -70,8 +38,7 @@ def update_dim_param_dict(self): ORT_model_names = [ "migraphx_ORT__bert_base_cased_1", # batch_size, seq_len "migraphx_ORT__bert_base_uncased_1", # batch_size, seq_len - # the following test currently crashes for some reason (maybe opset version related?) - # "migraphx_ORT__bert_large_uncased_1", # batch_size, seq_len + "migraphx_ORT__bert_large_uncased_1", # batch_size, seq_len "migraphx_ORT__distilgpt2_1", # batch_size, seq_len "migraphx_ORT__onnx_models__bert_base_cased_1_fp16_gpu", # batch_size, seq_len "migraphx_ORT__onnx_models__bert_large_uncased_1_fp16_gpu", # batch_size, seq_len @@ -129,7 +96,7 @@ def update_dim_param_dict(self): "migraphx_models__whisper-tiny-decoder" : {"batch_size" : 1, "decoder_sequence_length" : 64, "encoder_sequence_length / 2" : 32}, "migraphx_models__whisper-tiny-encoder" : {"batch_size" : 1, "feature_size" : 80, "encoder_sequence_length" : 64}, # this one crashes for some reason... 
- # "migraphx_sdxl__unet__model" : {"batch_size" : 1, "num_channels" : 4, "height" : 512, "width" : 512, "steps" : 2, "sequence_length" : 64} + "migraphx_sdxl__unet__model" : {"batch_size" : 1, "num_channels" : 4, "height" : 512, "width" : 512, "steps" : 2, "sequence_length" : 64} } for key, dim_param in misc_models.items(): diff --git a/alt_e2eshark/run.py b/alt_e2eshark/run.py index c7eb4505..ec707fdd 100644 --- a/alt_e2eshark/run.py +++ b/alt_e2eshark/run.py @@ -21,13 +21,14 @@ # import frontend test configs: from e2e_testing.test_configs.onnxconfig import ( + CLOnnxTestConfig, OnnxTestConfig, OnnxEpTestConfig, REDUCE_TO_LINALG_PIPELINE, ) # import backends -from e2e_testing.backends import SimpleIREEBackend, OnnxrtIreeEpBackend +from e2e_testing.backends import SimpleIREEBackend, OnnxrtIreeEpBackend, CLIREEBackend from e2e_testing.storage import load_test_txt_file, load_json_dict from utils.report import generate_report, save_dict @@ -83,6 +84,11 @@ def main(args): config = OnnxTestConfig( str(TEST_DIR), SimpleIREEBackend(device=args.device, hal_target_backend=args.backend, extra_args=args.iree_compile_args), pipeline ) + elif args.mode == "cl-onnx-iree": + pipeline = REDUCE_TO_LINALG_PIPELINE if args.torchtolinalg else [] + config = CLOnnxTestConfig( + str(TEST_DIR), CLIREEBackend(device=args.device, hal_target_backend=args.backend, extra_args=args.iree_compile_args), pipeline + ) elif args.mode == "ort-ep": # TODO: allow specifying provider explicitly from cl args. config = OnnxEpTestConfig( @@ -131,7 +137,6 @@ def run_tests( if not os.path.exists(parent_log_dir): os.makedirs(parent_log_dir) - num_passes = 0 warnings.filterwarnings("ignore") if verbose: @@ -246,7 +251,6 @@ def run_tests( test_passed = log_result(result, log_dir, [1e-3, 1e-3]) if test_passed: status_dict[t.unique_name] = "PASS" - num_passes+=1 else: status_dict[t.unique_name] = "Numerics" except Exception as e: @@ -254,11 +258,16 @@ def run_tests( log_exception(e, log_dir, "results-summary", t.unique_name, verbose) if verbose: - if t.unique_name not in status_dict.keys() or status_dict[t.unique_name] == "PASS": + # "PASS" is only recorded if a results-summary is generated + # if running a subset of ALL_STAGES, manually indicate "PASS". + if t.unique_name not in status_dict.keys(): + status_dict[t.unique_name] = "PASS" + if status_dict[t.unique_name] == "PASS": print(f"\tPASSED") else: print(f"\tFAILED ({status_dict[t.unique_name]})") + num_passes = list(status_dict.values()).count("PASS") print("\nTest Summary:") print(f"\tPASSES: {num_passes}\n\tTOTAL: {len(test_list)}") print(f"results stored in {parent_log_dir}") @@ -291,9 +300,9 @@ def log_exception(e: Exception, path: str, stage: str, name: str, verbose: bool) f.write(base_str) if verbose: print(f"\tFAILED ({stage})") + tb = e.__traceback__ import traceback - - traceback.print_exception(e, file=f) + traceback.print_tb(tb, file=f) else: print(f"FAILED: {name}") @@ -333,7 +342,7 @@ def _get_argparse(): parser.add_argument( "-m", "--mode", - choices=["onnx-iree", "ort-ep"], + choices=["onnx-iree", "cl-onnx-iree", "ort-ep"], default="onnx-iree", help="onnx-iree=onnx->torch-mlir->IREE, ort=onnx->run with custom ORT EP inference session", )