diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 000000000..6b4364d2a
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,63 @@
+[tool:pytest]
+minversion = 6.0
+addopts = -ra -q -s
+testpaths =
+    tests
+
+[yapf]
+based_on_style = google
+
+# The number of columns to use for indentation.
+indent_width = 4
+
+# The column limit.
+column_limit = 80
+
+# Place each dictionary entry onto its own line.
+each_dict_entry_on_separate_line = True
+
+# Put closing brackets on a separate line, dedented, if the bracketed
+# expression can't fit in a single line. Applies to all kinds of brackets,
+# including function definitions and calls. For example:
+#
+#   config = {
+#       'key1': 'value1',
+#       'key2': 'value2',
+#   }  # <--- this bracket is dedented and on a separate line
+#
+#   time_series = self.remote_client.query_entity_counters(
+#       entity='dev3246.region1',
+#       key='dns.query_latency_tcp',
+#       transform=Transformation.AVERAGE(window=timedelta(seconds=60)),
+#       start_ts=now()-timedelta(days=3),
+#       end_ts=now(),
+#   )  # <--- this bracket is dedented and on a separate line
+dedent_closing_brackets = True
+
+# Do not split consecutive brackets. Only relevant when
+# DEDENT_CLOSING_BRACKETS is set.
+coalesce_brackets = True
+
+# Align closing bracket with visual indentation.
+align_closing_bracket_with_visual_indent = False
+
+# Split named assignments onto individual lines.
+split_before_named_assigns = True
+
+# If an argument / parameter list is going to be split, then split before the
+# first argument.
+split_before_first_argument = True
+
+# Allow splitting before a default / named assignment in an argument list.
+allow_split_before_default_or_named_assigns = True
+
+# Join short lines into one line. E.g., single-line if statements.
+join_multiple_lines = False
+
+# Let spacing indicate operator precedence.
+arithmetic_precedence_indication = True
+
+# Do not include spaces around selected binary operators.
+# Example: 1 + 2 * 3 - 4 / 5 => 1 + 2*3 - 4/5
+no_spaces_around_selected_binary_operators = True
+
+# Allow lambdas to be formatted on more than one line.
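+# Illustrative example (added for clarity, not from the yapf documentation):
+# with this option enabled, yapf may keep a long lambda split across lines,
+# e.g. (`preprocess` is a placeholder name):
+#
+#   fn = lambda batch: (
+#       preprocess(batch) if batch is not None else None
+#   )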
+allow_multiline_lambdas = True
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 000000000..a5682fb09
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
\ No newline at end of file
diff --git a/tests/test_yapf_format.py b/tests/test_yapf_format.py
new file mode 100644
index 000000000..a34ded6dc
--- /dev/null
+++ b/tests/test_yapf_format.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import unittest
+
+import pygments
+from pygments import console
+
+from tests.utils import list_all_py_files
+from tests.utils import CustomTestCase
+
+from yapf.yapflib.yapf_api import FormatCode
+
+
+def _read_utf_8_file(filename):
+    if sys.version_info.major == 2:  ## Python 2 specific
+        with open(filename, 'rb') as f:
+            return unicode(f.read(), 'utf-8')
+    else:
+        with open(filename, encoding='utf-8') as f:
+            return f.read()
+
+
+def print_color(msg, color):
+    print(pygments.console.colorize(color, msg))
+
+
+class YAPF_Style_Test(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+
+        cls.badly_formatted_files = list()
+        cls.files_2_test = list_all_py_files()
+
+    def test_files_format(self):
+
+        total_analyzed_files = 0
+        for file in list_all_py_files():
+
+            total_analyzed_files += 1
+
+            try:
+
+                print(f"Testing: {file:100s}", end="")
+                code = _read_utf_8_file(file)
+
+                # https://pypi.python.org/pypi/yapf/0.20.2#example-as-a-module
+                diff, changed = FormatCode(
+                    code,
+                    filename=file,
+                    style_config='setup.cfg',
+                    print_diff=True
+                )
+
+                if changed:
+                    print_color("FAILURE", "red")
+                    self.badly_formatted_files.append(file)
+                else:
+                    print_color("SUCCESS", "green")
+
+            except Exception as e:
+                print_color("FAILURE", "red")
+                print(
+                    "Error while processing file: `%s`\n"
+                    "Error: %s" % (file, str(e))
+                )
+
+        str_err = ""
+
+        if self.badly_formatted_files:
+            for filename in self.badly_formatted_files:
+                str_err += f"yapf -i --style=setup.cfg {filename}\n"
+
+            str_err = "\n======================================================================================\n" \
+                      f"Bad Coding Style: {len(self.badly_formatted_files)} file(s) need to be formatted, run the following commands to fix: \n" \
+                      f"{str_err}" \
+                      "======================================================================================"
+
+        passing_files = total_analyzed_files - len(self.badly_formatted_files)
+        print_color(
+            f"\nPASSING: {passing_files} / {total_analyzed_files}",
+            "green" if str_err == "" else "red"
+        )
+
+        if str_err != "":
+            print_color(str_err, "red")
+
+        self.assertEqual(str_err, "")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 000000000..20739225c
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import unittest
+
+from contextlib import contextmanager
+from glob import glob, iglob
+
+__all__ = [
+    'CustomTestCase',
+    'list_all_py_files',
+]
+
+
+class CustomTestCase(unittest.TestCase):
+
+    @contextmanager
+    def assertNotRaises(self, exc_type):
+        try:
+            yield None
+        except exc_type:
+            raise self.failureException('{} raised'.format(exc_type.__name__))
+
+
+_excludes_paths = ["tftrt/blog_posts/", "tftrt/examples/third_party"]
+
+
+def list_all_py_files():
+    for _dir in ['tests', 'tftrt']:
+        for _file in iglob(f"{_dir}/**/*.py", recursive=True):
+            if any([path in _file for path in _excludes_paths]):
+                continue
+            yield _file
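+
+# Usage sketch (hypothetical example, not part of this patch): how the helpers
+# above can be combined in a test module.
+#
+#   from tests.utils import CustomTestCase, list_all_py_files
+#
+#   class SyntaxCheckTest(CustomTestCase):
+#
+#       def test_files_compile(self):
+#           for path in list_all_py_files():
+#               with self.assertNotRaises(SyntaxError):
+#                   with open(path, encoding="utf-8") as f:
+#                       compile(f.read(), path, "exec")
diff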
--git a/tftrt/examples/benchmark_args.py b/tftrt/examples/benchmark_args.py index b31fb4e12..93d379f94 100644 --- a/tftrt/examples/benchmark_args.py +++ b/tftrt/examples/benchmark_args.py @@ -1,119 +1,158 @@ #!/usr/bin/env python +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # -*- coding: utf-8 -*- import argparse -import copy import os -import time import numpy as np -import tensorflow as tf -from tensorflow.python.compiler.tensorrt import trt_convert as trt from tensorflow.python.compiler.tensorrt.trt_convert import \ DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES -from tensorflow.python.framework import convert_to_constants from tensorflow.python.saved_model import tag_constants from tensorflow.python.saved_model.signature_constants import \ DEFAULT_SERVING_SIGNATURE_DEF_KEY -from benchmark_runner import _print_dict +from benchmark_utils import print_dict class BaseCommandLineAPI(object): - ALLOWED_TFTRT_PRECISION_MODES = ['FP32', 'FP16', 'INT8'] + ALLOWED_TFTRT_PRECISION_MODES = ["FP32", "FP16", "INT8"] SAMPLES_IN_VALIDATION_SET = None def __init__(self): - self._parser = argparse.ArgumentParser(description='tftrt_benchmark') + + self._parser = argparse.ArgumentParser( + description="TF-TRT Inference Benchmark" + ) # ======================= SavedModel Directories ===================== # - self._parser.add_argument('--input_saved_model_dir', type=str, - default=None, - help='Directory containing the input saved ' - 'model.') + self._parser.add_argument( + "--input_saved_model_dir", + type=str, + default=None, + help="Directory containing the input saved model." + ) - self._parser.add_argument('--output_saved_model_dir', type=str, - default=None, - help='Directory in which the converted model ' - 'will be saved') + self._parser.add_argument( + "--output_saved_model_dir", + type=str, + default=None, + help="Directory in which the converted model will be saved" + ) # ======================== Dataset Directories ======================= # - self._parser.add_argument('--calib_data_dir', type=str, - help='Directory containing the dataset used ' - 'for INT8 calibration.') + self._parser.add_argument( + "--calib_data_dir", + type=str, + help="Directory containing the dataset used for INT8 calibration." + ) - self._parser.add_argument('--data_dir', type=str, default=None, - help='Directory containing the dataset used ' - 'for model validation.') + self._parser.add_argument( + "--data_dir", + type=str, + default=None, + help="Directory containing the dataset used for model validation." + ) # ======================= Generic Runtime Flags ====================== # - self._parser.add_argument('--batch_size', type=int, default=8, - help='Number of images per batch.') + self._parser.add_argument( + "--batch_size", + type=int, + default=8, + help="Number of images per batch." + ) - self._parser.add_argument('--display_every', type=int, default=50, - help='Number of iterations executed between' - 'two consecutive display of metrics') + self._parser.add_argument( + "--display_every", + type=int, + default=50, + help="Number of iterations executed between two consecutive " + "displays of metrics" + ) - self._parser.add_argument('--gpu_mem_cap', type=int, default=0, - help='Upper bound for GPU memory in MB. ' - 'Default is 0 which means allow_growth ' - 'will be used.') + self._parser.add_argument( + "--gpu_mem_cap", + type=int, + default=0, + help="Upper bound for GPU memory in MB. Default is 0 which means " + "allow_growth will be used." 
+        )

        default_sign_key = DEFAULT_SERVING_SIGNATURE_DEF_KEY

-        self._parser.add_argument('--input_signature_key', type=str,
-                                  default=default_sign_key,
-                                  help='SavedModel signature to use for '
-                                       'inference, defaults to: %s' % (
-                                           default_sign_key
-                                       ))
-
-        self._parser.add_argument('--output_tensor_names', type=str,
-                                  default=None,
-                                  help='Output tensors\' name, defaults to all '
-                                       'tensors available if not set. Will '
-                                       'only work with `--use_tftrt`.')
-
-        self._parser.add_argument('--output_tensor_indices', type=str,
-                                  default=None,
-                                  help='Output tensors\' index, defaults to '
-                                       'all tensors available if not set. Will '
-                                       'only work without `--use_tftrt`.')
-
-        self._parser.add_argument('--num_iterations', type=int, default=None,
-                                  help='How many iterations(batches) to '
-                                       'evaluate. If not supplied, the whole '
-                                       'set will be evaluated.')
-
-        self._parser.add_argument('--num_warmup_iterations', type=int,
-                                  default=100,
-                                  help='Number of initial iterations skipped '
-                                       'from timing')
+        self._parser.add_argument(
+            "--input_signature_key",
+            type=str,
+            default=default_sign_key,
+            help=f"SavedModel signature to use for inference, defaults to: "
+            f"`{default_sign_key}`"
+        )
+
+        default_tag = tag_constants.SERVING
+        self._parser.add_argument(
+            "--model_tag",
+            type=str,
+            default=default_tag,
+            help=f"SavedModel inference tag to use, defaults to: "
+            f"{default_tag}"
+        )
+
+        self._parser.add_argument(
+            "--output_tensors_name",
+            type=str,
+            default=None,
+            help="Output tensors' name, defaults to all tensors available if "
+            "not set. Will only work with `--use_tftrt`."
+        )
+
+        self._parser.add_argument(
+            "--num_iterations",
+            type=int,
+            default=None,
+            help="How many iterations (batches) to evaluate. If not supplied, "
+            "the whole set will be evaluated."
+        )
+
+        self._parser.add_argument(
+            "--num_warmup_iterations",
+            type=int,
+            default=100,
+            help="Number of initial iterations skipped from timing."
+        )
+
+        self._parser.add_argument(
+            "--total_max_samples",
+            type=int,
+            default=None,
+            required=True,
+            help="Preallocated size of the result numpy arrays. Shall be at "
+            "least as large as the number of samples in the dataset."
+        )

        self._add_bool_argument(
            name="use_xla",
            default=False,
            required=False,
-            help='If set to True, the benchmark will use XLA JIT Compilation'
+            help="If set to True, the benchmark will use XLA JIT Compilation."
        )

        self._add_bool_argument(
            name="skip_accuracy_testing",
            default=False,
            required=False,
-            help='If set to True, accuracy calculation will be skipped.'
+            help="If set to True, accuracy calculation will be skipped."
        )

        self._add_bool_argument(
-            name="use_synthetic_data",
-            default=False,
-            required=False,
-            help='If set to True, one unique batch of random batch of data is '
-                 'generated and used at every iteration.'
+            name="use_synthetic_data",
+            default=False,
+            required=False,
+            help="If set to True, a single batch of random data is "
+            "generated once and reused at every iteration."
        )

        # =========================== TF-TRT Flags ========================== #

@@ -122,8 +161,8 @@ def __init__(self):
            name="use_tftrt",
            default=False,
            required=False,
-            help='If set to True, the inference graph will be converted using '
-                 'TF-TRT graph converter.'
+            help="If set to True, the inference graph will be converted using "
+            "TF-TRT graph converter."
        )

        self._add_bool_argument(
@@ -133,40 +172,50 @@
            help="Whether to build TensorRT engines during runtime."
        )

-        self._parser.add_argument('--max_workspace_size', type=int,
-                                  default=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
-                                  help='The maximum GPU temporary memory which '
-                                       'the TRT engine can use at execution '
-                                       'time.')
+        self._parser.add_argument(
+            "--max_workspace_size",
+            type=int,
+            default=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+            help="The maximum GPU temporary memory which the TRT engine can "
+            "use at execution time."
+        )

-        self._parser.add_argument('--minimum_segment_size', type=int, default=5,
-                                  help='Minimum number of TensorFlow ops in a '
-                                       'TRT engine.')
+        self._parser.add_argument(
+            "--minimum_segment_size",
+            type=int,
+            default=5,
+            help="Minimum number of TensorFlow ops in a TRT engine."
+        )

-        self._parser.add_argument('--num_calib_inputs', type=int, default=500,
-                                  help='Number of inputs (e.g. images) used '
-                                       'for calibration (last batch is skipped '
-                                       'in case it is not full)')
+        self._parser.add_argument(
+            "--num_calib_batches",
+            type=int,
+            default=10,
+            help="Number of batches used for INT8 calibration (only useful if "
+            "--use_tftrt is set with --precision=INT8)."
+        )

        self._add_bool_argument(
            name="optimize_offline",
            default=True,
            required=False,
-            help='If set to True, TensorRT engines are built before runtime.'
+            help="If set to True, TensorRT engines are built before runtime."
        )

-        self._parser.add_argument('--precision', type=str,
-                                  choices=self.ALLOWED_TFTRT_PRECISION_MODES,
-                                  default='FP32',
-                                  help='Precision mode to use. FP16 and INT8 '
-                                       'modes only works if --use_tftrt is '
-                                       'used.')
+        self._parser.add_argument(
+            "--precision",
+            type=str,
+            choices=self.ALLOWED_TFTRT_PRECISION_MODES,
+            default="FP32",
+            help="Precision mode to use. FP16 and INT8 modes only work if "
+            "--use_tftrt is used."
+        )

        self._add_bool_argument(
            name="use_dynamic_shape",
            default=False,
            required=False,
-            help='Whether to use implicit batch mode or dynamic shape mode.'
+            help="Whether to use implicit batch mode or dynamic shape mode."
        )

        # =========================== DEBUG Flags ========================== #

@@ -175,26 +224,39 @@
            name="debug",
            default=False,
            required=False,
-            help='If set to True, will print additional information.'
+            help="If set to True, will print additional information."
+        )
+
+        self._add_bool_argument(
+            name="debug_performance",
+            default=False,
+            required=False,
+            help="If set to True, will print additional performance (timing) "
+            "information."
        )

-    def _add_bool_argument(self, name=None, default=False, required=False, help=None):
-        if not isinstance(default, bool):
-            raise ValueError()
+    def _add_bool_argument(
+        self, name=None, default=False, required=False, help=None
+    ):
+        if not isinstance(default, bool):
+            raise ValueError("`default` must be a boolean value.")

-        feature_parser = self._parser.add_mutually_exclusive_group(\
-            required=required
-        )
+        feature_parser = self._parser.add_mutually_exclusive_group(
+            required=required
+        )

-        feature_parser.add_argument('--' + name, dest=name,
-                                    action='store_true',
-                                    help=help,
-                                    default=default)
+        feature_parser.add_argument(
+            "--" + name,
+            dest=name,
+            action="store_true",
+            help=help,
+            default=default
+        )

-        feature_parser.add_argument('--no' + name, dest=name,
-                                    action='store_false')
+        feature_parser.add_argument(
+            "--no" + name, dest=name, action="store_false"
+        )

-        feature_parser.set_defaults(name=default)
+        feature_parser.set_defaults(**{name: default})

    def _validate_args(self, args):

@@ -202,75 +264,77 @@
        if args.data_dir is None:
            raise ValueError("--data_dir is required")

        elif not os.path.isdir(args.data_dir):
-            raise RuntimeError("The path --data_dir=`{}` doesn't exist or is "
-                               "not a directory".format(args.data_dir))
+            raise RuntimeError(
+                f"The path --data_dir=`{args.data_dir}` doesn't exist or is "
+                "not a directory"
+            )

-        if (
-            args.num_iterations is not None and
-            args.num_iterations <= args.num_warmup_iterations
-        ):
+        if (args.num_iterations is not None and
+                args.num_iterations <= args.num_warmup_iterations):
            raise ValueError(
-                '--num_iterations must be larger than --num_warmup_iterations '
-                '({} <= {})'.format(args.num_iterations,
-                                    args.num_warmup_iterations))
+                "--num_iterations must be larger than --num_warmup_iterations "
+                f"({args.num_iterations} <= {args.num_warmup_iterations})"
+            )

        if not args.use_tftrt:
            if args.use_dynamic_shape:
-                raise ValueError('TensorRT must be enabled for Dynamic Shape '
-                                 'support to be enabled (--use_tftrt).')
+                raise ValueError(
+                    "TensorRT must be enabled for Dynamic Shape support to be "
+                    "enabled (--use_tftrt)."
+                )

-            if args.precision != 'FP32':
-                raise ValueError('TensorRT must be enabled for FP16'
-                                 'or INT8 modes (--use_tftrt).')
+            if args.precision != "FP32":
+                raise ValueError(
+                    "TensorRT must be enabled for FP16 or INT8 modes "
+                    "(--use_tftrt)."
+                )

        else:
            if args.use_xla:
                raise ValueError("--use_xla flag is not supported with TF-TRT.")

            if args.precision not in self.ALLOWED_TFTRT_PRECISION_MODES:
-                raise ValueError("The received --precision={} is not supported."
-                                 " Allowed: {}".format(
-                                     args.precision,
-                                     self.ALLOWED_TFTRT_PRECISION_MODES
-                                 ))
+                raise ValueError(
+                    f"The received --precision={args.precision} is not "
+                    f"supported. 
Allowed: {self.ALLOWED_TFTRT_PRECISION_MODES}" + ) - if args.precision == 'INT8': + if args.precision == "INT8": if not args.calib_data_dir: - raise ValueError('--calib_data_dir is required for INT8 ' - 'precision mode') + raise ValueError( + "--calib_data_dir is required for INT8 precision mode" + ) elif not os.path.isdir(args.calib_data_dir): - raise RuntimeError("The path --calib_data_dir=`{}` doesn't " - "exist or is not a directory".format( - args.calib_data_dir)) + raise RuntimeError( + f"The path --calib_data_dir=`{args.calib_data_dir}` " + "doesn't exist or is not a directory" + ) if args.use_dynamic_shape: - raise ValueError('TF-TRT does not support dynamic shape ' - 'mode with INT8 calibration.') - - if args.num_calib_inputs <= args.batch_size: raise ValueError( - '--num_calib_inputs must not be smaller than ' - '--batch_size ({} <= {})'.format( - args.num_calib_inputs, args.batch_size)) + "TF-TRT does not support dynamic shape mode with INT8 " + "calibration." + ) def _post_process_args(self, args): if args.num_iterations is None: args.num_iterations = ( - self.SAMPLES_IN_VALIDATION_SET // args.batch_size + max((args.total_max_samples // args.batch_size) + 1, 1000) ) return args def parse_args(self): args = self._parser.parse_args() + args = self._post_process_args(args) self._validate_args(args) - print('\nBenchmark arguments:') - _print_dict(vars(args)) + print("\nBenchmark arguments:") + print_dict(vars(args)) print() return args diff --git a/tftrt/examples/benchmark_runner.py b/tftrt/examples/benchmark_runner.py index 40c699432..ee39840c5 100644 --- a/tftrt/examples/benchmark_runner.py +++ b/tftrt/examples/benchmark_runner.py @@ -1,144 +1,71 @@ #!/usr/bin/env python +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # -*- coding: utf-8 -*- + import os -import sys import abc -import argparse -import copy import logging +import sys import time -from collections import defaultdict -from contextlib import contextmanager -from functools import partial -from operator import itemgetter +from distutils.util import strtobool + +from benchmark_utils import DataAggregator +from benchmark_utils import force_gpu_resync +from benchmark_utils import print_dict +from benchmark_utils import timed_section +from benchmark_utils import timed_dataset import numpy as np import tensorflow as tf from tensorflow.python.compiler.tensorrt import trt_convert as trt -from tensorflow.python.compiler.tensorrt.trt_convert import \ - DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES -from tensorflow.python.framework import convert_to_constants -from tensorflow.python.saved_model import tag_constants -from tensorflow.python.saved_model.signature_constants import \ - DEFAULT_SERVING_SIGNATURE_DEF_KEY - - -def _print_dict(input_dict, prefix=' ', postfix=''): - for k, v in sorted(input_dict.items()): - print('{prefix}{arg_name}: {value}{postfix}'.format( - prefix=prefix, - arg_name=k, - value='%.1f' % v if isinstance(v, float) else v, - postfix=postfix - )) - - -@contextmanager -def _timed_section(msg): - print('\n[START] {}'.format(msg)) - start_time = time.time() - yield - print("[END] Duration: {:.1f}s".format(time.time() - start_time)) - print("=" * 80, "\n") +from tensorflow.python.saved_model import signature_constants +from tensorflow.python.saved_model import tag_constants -def _force_gpu_resync(func): - p = tf.constant(0.) 
# Create small tensor to force GPU resync - def wrapper(*args, **kwargs): - rslt = func(*args, **kwargs) - (p + 1.).numpy() # Sync the GPU - return rslt - return wrapper +__all__ = ["BaseBenchmarkRunner"] class BaseBenchmarkRunner(object, metaclass=abc.ABCMeta): - ACCURACY_METRIC_NAME = None - ############################################################################ # Methods expected to be overwritten by the subclasses ############################################################################ - def before_benchmark(self, **kwargs): - pass + @abc.abstractmethod + def get_dataset_batches(self): + raise NotImplementedError() + + @abc.abstractmethod + def preprocess_model_inputs(self, data_batch): + raise NotImplementedError() @abc.abstractmethod - def compute_accuracy_metric(self, predictions, expected, **kwargs): + def postprocess_model_outputs(self, predictions, expected): raise NotImplementedError() @abc.abstractmethod - def process_model_output(self, outputs, **kwargs): + def evaluate_model(self, predictions, expected, bypass_data_to_eval): raise NotImplementedError() ############################################################################ # Common methods for all the benchmarks ############################################################################ - def __init__( - self, - input_saved_model_dir, - output_saved_model_dir, - allow_build_at_runtime=False, - calibration_input_fn=None, - debug=False, - gpu_mem_cap=None, - input_signature_key=DEFAULT_SERVING_SIGNATURE_DEF_KEY, - max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES, - minimum_segment_size=5, - num_calib_inputs=None, - optimize_offline=False, - optimize_offline_input_fn=None, - output_tensor_indices=None, - output_tensor_names=None, - precision_mode=None, - use_dynamic_shape=False, - use_tftrt=False, - ): + def __init__(self, args): + self._args = args logging.getLogger("tensorflow").setLevel(logging.INFO) logging.disable(logging.WARNING) - self._debug = debug - # TensorFlow can execute operations synchronously or asynchronously. # If asynchronous execution is enabled, operations may return # "non-ready" handles. 
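        # Illustrative example (added, not from the original sources): with
        # asynchronous execution, a naive timing loop can under-measure, e.g.:
        #
        #   start = time.time()
        #   y = graph_func(x)              # may return a non-ready handle
        #   elapsed = time.time() - start  # kernels may still be running
        #
        # Forcing synchronous execution below keeps the benchmark timings
        # meaningful.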
        tf.config.experimental.set_synchronous_execution(True)

-        self._config_gpu_memory(gpu_mem_cap)
-
-        calibration_input_fn = (
-            None
-            if precision_mode != 'INT8' else
-            calibration_input_fn
-        )
-
-        optimize_offline_input_fn = (
-            None
-            if not optimize_offline and not use_dynamic_shape else
-            optimize_offline_input_fn
-        )
-
-        self._graph_func = self._get_graph_func(
-            input_saved_model_dir=input_saved_model_dir,
-            output_saved_model_dir=output_saved_model_dir,
-            allow_build_at_runtime=allow_build_at_runtime,
-            calibration_input_fn=calibration_input_fn,
-            input_signature_key=input_signature_key,
-            max_workspace_size_bytes=max_workspace_size_bytes,
-            minimum_segment_size=minimum_segment_size,
-            num_calib_inputs=num_calib_inputs,
-            optimize_offline=optimize_offline,
-            optimize_offline_input_fn=optimize_offline_input_fn,
-            precision_mode=precision_mode,
-            use_dynamic_shape=use_dynamic_shape,
-            use_tftrt=use_tftrt
-        )
-
-        self._set_output_tensor_name(output_tensor_indices, output_tensor_names)
+        self._config_gpu_memory(self._args.gpu_mem_cap)

    def _config_gpu_memory(self, gpu_mem_cap):
        gpus = tf.config.experimental.list_physical_devices('GPU')

@@ -146,9 +73,9 @@ def _config_gpu_memory(self, gpu_mem_cap):
        if not gpus:
            raise RuntimeError("No GPUs have been found.")

-        self.debug_print('Found the following GPUs:')
+        self._debug_print('Found the following GPUs:')
        for gpu in gpus:
-            self.debug_print(f"\t- {gpu}")
+            self._debug_print(f"\t- {gpu}")

        for gpu in gpus:
            try:
@@ -156,151 +83,130 @@ def _config_gpu_memory(self, gpu_mem_cap):
                    tf.config.experimental.set_memory_growth(gpu, True)
                else:
                    tf.config.experimental.set_virtual_device_configuration(
-                        gpu,
-                        [tf.config.experimental.VirtualDeviceConfiguration(
-                            memory_limit=gpu_mem_cap)])
+                        gpu, [
+                            tf.config.experimental.VirtualDeviceConfiguration(
+                                memory_limit=gpu_mem_cap
+                            )
+                        ]
+                    )
            except RuntimeError as e:
                print('Cannot set GPU memory config', e)

-    def _set_output_tensor_name(
-        self, output_tensor_indices, output_tensor_names
-    ):
-        structured_outputs = self._graph_func.structured_outputs
-
-        if isinstance(structured_outputs, (list, tuple)):
-            if output_tensor_indices is None:
-                output_tensor_indices = list(range(len(structured_outputs)))
-            else:
-                output_tensor_indices = [
-                    int(i) for i in output_tensor_indices.split(",")
-                ]
-
-            self._output_tensors = output_tensor_indices
-
-        elif isinstance(structured_outputs, dict):
-            structured_outputs = dict(sorted(structured_outputs.items()))
-            if output_tensor_names is None:
-                output_tensor_names = list(structured_outputs.keys())
-            else:
-                output_tensor_names = [n for n in output_tensor_names.split(",")]
-                for name in output_tensor_names:
-                    if name not in structured_outputs.keys():
-                        raise ValueError(
-                            f"Unknown output_tensor_names received: {name}. 
" \ - f"Authorized: {structured_outputs.keys()}") - - self._output_tensors = output_tensor_names + def _debug_print(self, msg): + if self._args.debug: + print(f"[DEBUG] {msg}") - else: - raise RuntimeError('Unknown structured_outputs format received:', - type(structured_outputs)) - - self.debug_print(f"Available Output Tensors: {structured_outputs}") - self.debug_print(f"Chosen Output Tensor: {self._output_tensors}") - - def _get_graph_func( - self, - input_saved_model_dir, - output_saved_model_dir, - allow_build_at_runtime=False, - calibration_input_fn=None, - input_signature_key=DEFAULT_SERVING_SIGNATURE_DEF_KEY, - max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES, - minimum_segment_size=5, - num_calib_inputs=None, - optimize_offline=False, - optimize_offline_input_fn=None, - precision_mode=None, - use_dynamic_shape=False, - use_tftrt=False): + def _get_graph_func(self): """Retreives a frozen SavedModel and applies TF-TRT use_tftrt: bool, if true use TensorRT precision: str, floating point precision (FP32, FP16, or INT8) returns: TF function that is ready to run for inference """ - if not use_tftrt: + def load_model_from_disk( + path, + tags=[tag_constants.SERVING], + signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + ): + saved_model_loaded = tf.saved_model.load(export_dir=path, tags=tags) - with _timed_section('Loading TensorFlow native model...'): - saved_model_loaded = tf.saved_model.load( - input_saved_model_dir, tags=[tag_constants.SERVING] - ) + graph_func = saved_model_loaded.signatures[signature_key] - graph_func = saved_model_loaded.signatures[input_signature_key] - graph_func = convert_to_constants.convert_variables_to_constants_v2( - graph_func - ) + # from tensorflow.python.framework import convert_to_constants + # graph_func = convert_to_constants.convert_variables_to_constants_v2( + # graph_func + # ) - else: + # Known TF Issue: https://github.com/tensorflow/tensorflow/issues/37615#issuecomment-767804930 + # it looks like if the original trackable object is released by + # the Python garbage collector once it goes out of scope, and + # the signature returned by the function does not maintain a + # back-reference to the original loaded object. + graph_func._backref_to_saved_model = saved_model_loaded - def get_trt_conversion_params( - allow_build_at_runtime, - max_workspace_size_bytes, - precision_mode, - minimum_segment_size): - - params = copy.deepcopy(trt.DEFAULT_TRT_CONVERSION_PARAMS) - - def get_trt_precision(): - if precision_mode == "FP32": - return trt.TrtPrecisionMode.FP32 - elif precision_mode == "FP16": - return trt.TrtPrecisionMode.FP16 - elif precision_mode == "INT8": - return trt.TrtPrecisionMode.INT8 - else: - raise RuntimeError("Unknown precision received: `{}`. 
Expected: " - "FP32, FP16 or INT8".format(precision)) - - params = params._replace( - allow_build_at_runtime=allow_build_at_runtime, - max_workspace_size_bytes=max_workspace_size_bytes, - minimum_segment_size=minimum_segment_size, - precision_mode=get_trt_precision(), - use_calibration=precision_mode == "INT8" + return graph_func + + if not self._args.use_tftrt: + + with timed_section("Loading TensorFlow native model"): + graph_func = load_model_from_disk( + path=self._args.input_saved_model_dir, + tags=self._args.model_tag.split(","), + signature_key=self._args.input_signature_key ) - print('\nTensorRT Conversion Params:') - _print_dict(dict(params._asdict())) + else: - return params + def get_trt_precision(precision): + if precision == "FP32": + return trt.TrtPrecisionMode.FP32 + elif precision == "FP16": + return trt.TrtPrecisionMode.FP16 + elif precision == "INT8": + return trt.TrtPrecisionMode.INT8 + else: + raise RuntimeError( + f"Unknown precision received: `{precision}`. " + f"Expected: FP32, FP16 or INT8" + ) - conversion_params = get_trt_conversion_params( - allow_build_at_runtime=allow_build_at_runtime, - max_workspace_size_bytes=max_workspace_size_bytes, - precision_mode=precision_mode, - minimum_segment_size=minimum_segment_size + tftrt_precision = get_trt_precision(self._args.precision) + + trt_converter_params = dict( + allow_build_at_runtime=self._args.allow_build_at_runtime, + enable_sparse_compute=True, + input_saved_model_dir=self._args.input_saved_model_dir, + input_saved_model_signature_key=self._args.input_signature_key, + input_saved_model_tags=self._args.model_tag.split(","), + max_workspace_size_bytes=self._args.max_workspace_size, + maximum_cached_engines=1, + minimum_segment_size=self._args.minimum_segment_size, + precision_mode=tftrt_precision, + use_calibration=(tftrt_precision == trt.TrtPrecisionMode.INT8), + use_dynamic_shape=self._args.use_dynamic_shape, ) - converter = trt.TrtGraphConverterV2( - input_saved_model_dir=input_saved_model_dir, - conversion_params=conversion_params, - input_saved_model_signature_key=input_signature_key, - use_dynamic_shape=use_dynamic_shape - ) + print("\n[*] TF-TRT Converter Parameters:") + print_dict(trt_converter_params) + + converter = trt.TrtGraphConverterV2(**trt_converter_params) - def _check_input_fn(func, name): - if func is None: - raise ValueError("The function `{}` is None.".format(name)) + def engine_build_input_fn(num_batches, model_phase): + dataset, _ = self.get_dataset_batches() - if not callable(func): - raise ValueError("The argument `{}` is not a function.".format( - name)) + for idx, data_batch in enumerate(dataset): + print( + f"* [{model_phase}] " + f"- step {(idx+1):04d}/{num_batches:04d}" + ) + x, _ = self.preprocess_model_inputs(data_batch) # x, y + + if not isinstance(x, (tuple, list, dict)): + x = [x] + + yield x - if conversion_params.precision_mode == 'INT8': + if (idx + 1) >= num_batches: + break - _check_input_fn(calibration_input_fn, "calibration_input_fn") + if tftrt_precision == trt.TrtPrecisionMode.INT8: - with _timed_section('TF-TRT graph conversion and INT8 ' - 'calibration ...'): + calibration_input_fn = lambda: engine_build_input_fn( + num_batches=self._args.num_calib_batches, + model_phase="Calibration" + ) + + with timed_section( + "TF-TRT graph conversion and INT8 calibration ..."): graph_func = converter.convert( - calibration_input_fn=tf.autograph.experimental.do_not_convert( - calibration_input_fn + calibration_input_fn=( + tf.autograph.experimental. 
+                        do_not_convert(calibration_input_fn)
+                    )
+                )

            else:
-                with _timed_section('TF-TRT graph conversion ...'):
+                with timed_section("TF-TRT graph conversion ..."):
                    graph_func = converter.convert()

            try:
@@ -312,188 +218,197 @@ def _check_input_fn(func, name):
            except AttributeError:
                pass

-        if optimize_offline or use_dynamic_shape:
+            if strtobool(os.environ.get("TF_TRT_BENCHMARK_QUIT_AFTER_SUMMARY",
+                                        "0")):
+                sys.exit(0)
+
+            if self._args.optimize_offline or self._args.use_dynamic_shape:

-            _check_input_fn(
-                optimize_offline_input_fn,
-                "optimize_offline_input_fn"
+                offline_opt_input_fn = lambda: engine_build_input_fn(
+                    num_batches=1, model_phase="Building"
                )

-            with _timed_section('Building TensorRT engines...'):
-                converter.build(input_fn=tf.autograph.experimental.do_not_convert(
-                    optimize_offline_input_fn
-                ))
+                with timed_section("Building TensorRT engines"):
+                    converter.build(
+                        input_fn=tf.autograph.experimental.
+                        do_not_convert(offline_opt_input_fn)
+                    )

-        if output_saved_model_dir is not None:
+            if self._args.output_saved_model_dir is not None:

-            with _timed_section('Saving converted graph with TF-TRT ...'):
-                converter.save(output_saved_model_dir)
-                print("Converted graph saved to `{}`".format(
-                    output_saved_model_dir))
+                with timed_section("Saving converted graph with TF-TRT"):
+                    converter.save(self._args.output_saved_model_dir)
+                    print(
+                        f"Converted graph saved to "
+                        f"`{self._args.output_saved_model_dir}`"
+                    )
+                    # Engine cache is cleared while saving, so we have to
+                    # reload. Failing to do so would force TF-TRT to rebuild.
+                    del converter
+                    del graph_func
+                    graph_func = load_model_from_disk(
+                        self._args.output_saved_model_dir
+                    )

-        return graph_func
+        if isinstance(graph_func.structured_outputs, (tuple, list)):
+            savedmodel_outputs = "\n - ".join([
+                str(t) for t in graph_func.structured_outputs
+            ])
+            savedmodel_outputs = f" - {savedmodel_outputs}"
+        else:
+            savedmodel_outputs = print_dict(
+                graph_func.structured_outputs, redirect_to_str=True
+            )
+        self._debug_print(f"Available Output Tensors:\n{savedmodel_outputs}")
+        print()  # visual spacing

-    def debug_print(self, msg):
-        if self._debug:
-            print(f"[DEBUG] {msg}")
+        # Guard added: `--output_tensors_name` defaults to None and would
+        # otherwise crash the `.split(",")` call below.
+        if self._args.output_tensors_name is not None:
+            chosen_outputs = "\n - ".join(
+                sorted(self._args.output_tensors_name.split(","))
+            )
+            self._debug_print(f"Chosen Output Tensor:\n - {chosen_outputs}")
+            print()  # visual spacing
+
+        return graph_func

-    def execute_benchmark(
-        self,
-        batch_size,
-        display_every,
-        get_benchmark_input_fn,
-        num_iterations,
-        num_warmup_iterations,
-        skip_accuracy_testing,
-        use_synthetic_data,
-        use_xla,
-        **kwargs):
+    def execute_benchmark(self):
        """Run the given graph_func on the data files provided.
        It consumes TFRecords with labels and reports accuracy.
        """

-        self.before_benchmark(**kwargs)
+        with timed_section("Model Loading"):
+            graph_func = self._get_graph_func()

-        results = {}
-        iter_times = []
-        steps_executed = 0
+        with timed_section("Model Inference"):
+            dataset, bypass_data_to_eval = self.get_dataset_batches()

-        dataset = get_benchmark_input_fn(
-            batch_size=batch_size,
-            use_synthetic_data=use_synthetic_data,
-        )
+            if self._args.use_synthetic_data:
+                old_ds = dataset
+                try:
+                    dataset = dataset.take(count=1)  # loop over 1 batch
+                    dataset = dataset.cache()
+                    dataset = dataset.repeat()
+                    dataset = dataset.prefetch(
+                        buffer_size=tf.data.experimental.AUTOTUNE
+                    )
+                    self._debug_print(
+                        "Model dataset has been replaced by a synthetic data "
+                        "loader to minimize data loading jitter."
+                    )
+
+                except Exception as e:
+                    dataset = old_ds
+                    print(
+                        f"[ERROR] Unable to transform the dataset into a "
+                        f"synthetic dataset. Performance numbers will be "
+                        f"impacted.\nError: {str(e)}."
+                    )

-        @_force_gpu_resync
-        @tf.function(jit_compile=use_xla)
-        def infer_step(_batch_x):
-            output = self._graph_func(_batch_x)
-            return itemgetter(*self._output_tensors)(output)
+            @force_gpu_resync
+            @tf.function(jit_compile=self._args.use_xla)
+            def infer_batch(x):
+                if isinstance(x, (tuple, list)):
+                    model_out = graph_func(*x)
+                elif isinstance(x, dict):
+                    model_out = graph_func(**x)
+                else:
+                    model_out = graph_func(x)

-        predicted_dict = defaultdict(lambda: [])
-        expected_arr = []
+                if self._args.output_tensors_name is not None:
+                    output_ts_name = self._args.output_tensors_name.split(",")
+                    if len(output_ts_name) == 1:
+                        return model_out[self._args.output_tensors_name]
+                    else:
+                        return {key: model_out[key] for key in output_ts_name}

-        def get_debug_output_shape_str(output):
-            if isinstance(output, (tuple, list)):
-                return [t.shape for t in output]
+                return model_out

-            elif isinstance(output, dict):
-                return {k: v.shape for k, v in output.items()}
+            if not self._args.use_synthetic_data:
+                data_aggregator = DataAggregator(
+                    self.postprocess_model_outputs, args=self._args
+                )

-            else:
-                return output.shape
+            iter_times = []

+            def log_step(step_idx, display_every, iter_time):
+                if step_idx % display_every == 0:
+                    print(
+                        f" step {step_idx:04d}, iter_time(ms)={iter_time:.3f}"
+                    )

-        print("\nStart inference ...")
-        for i, data_batch in enumerate(dataset):
+            dataset = timed_dataset(
+                dataset, activate=self._args.debug_performance
+            )
+
+            for step_idx, data_batch in enumerate(dataset):
+                x, y = self.preprocess_model_inputs(data_batch)
+
+                start_time = time.time()
+                y_pred = infer_batch(x)
+                iter_times.append(time.time() - start_time)

-            if isinstance(data_batch, (list, tuple)):
-                if len(data_batch) == 1:
-                    batch_x, batch_y = (data_batch, None)
-                elif len(data_batch) == 2:
-                    batch_x, batch_y = data_batch
+                if not self._args.debug_performance:
+                    log_step(
+                        step_idx + 1, self._args.display_every,
+                        np.mean(iter_times[-self._args.display_every:]) * 1000
+                    )
                else:
-                    raise RuntimeError("Error: The dataset function returned "
-                                       "%d elements." % len(data_batch))
-            # TF Tensor
-            else:
-                batch_x, batch_y = (data_batch, None)
-
-            start_time = time.time()
-            batch_preds = infer_step(batch_x)
-            iter_times.append(time.time() - start_time)
-
-            steps_executed += 1
-
-            if (i + 1) % display_every == 0 or (i + 1) == num_iterations:
-                print("  step %04d/%04d, iter_time(ms)=%.0f" % (
-                    i + 1,
-                    num_iterations,
-                    np.mean(iter_times[-display_every:]) * 1000
-                ))
-
-            if not skip_accuracy_testing:
-                if i == 0:
-                    self.debug_print("=========== BEFORE PROCESSING ==========")
-                    debug_batch_preds = get_debug_output_shape_str(batch_preds)
-                    self.debug_print(f"`batch_preds`: {debug_batch_preds}")
-                    if batch_y is not None:
-                        self.debug_print(f"`batch_y` shape: {batch_y.shape}")
-
-                batch_preds = self.process_model_output(batch_preds, **kwargs)
-
-                if not isinstance(batch_preds, dict):
-                    raise ValueError(
-                        f"`self.process_model_output` did not return a dict. 
" \ - f"Received: {type(batch_preds)}" + print(f"{'GPU Iteration Time':18s}: {iter_times[-1]:.4f}s") + + if not self._args.use_synthetic_data: + data_aggregator.aggregate_data(y_pred, y) + + if (self._args.num_iterations is not None and + step_idx + 1 >= self._args.num_iterations): + break + + if (not self._args.debug_performance and + step_idx % self._args.display_every != + 0): # avoids double printing + log_step( + step_idx + 1, + display_every=1, # force print + iter_time=( + np.mean(iter_times[-self._args.display_every:]) * 1000 ) + ) - if batch_y is not None: - batch_y = batch_y.numpy() - if batch_y.shape[-1] == 1: - batch_y = np.squeeze(batch_y, axis=-1) - - if i == 0: - self.debug_print("=========== AFTER PROCESSING ===========") - debug_batch_preds = get_debug_output_shape_str(batch_preds) - self.debug_print(f"`batch_preds`: {debug_batch_preds}") - if batch_y is not None: - self.debug_print(f"`batch_y` shape: {batch_y.shape}") - self.debug_print("========================================") - - for key, value in batch_preds.items(): - predicted_dict[key].append(value) - - if batch_y is not None: - expected_arr.append(batch_y) - - if (i + 1) >= num_iterations: - break - - if not skip_accuracy_testing: - predicted_dict = { - k: np.concatenate(v, axis=0) - for k, v in predicted_dict.items() - } - if expected_arr: - expected_arr = np.concatenate(expected_arr, axis=0) - else: - expected_arr = np.array(expected_arr) - - self.debug_print("=========== BEFORE METRIC COMPUTATION ==========") - debug_predicted_dict = get_debug_output_shape_str(predicted_dict) - self.debug_print(f"`predicted_dict`: {debug_predicted_dict}") - self.debug_print(f"`expected_arr` shape: {expected_arr.shape}") - self.debug_print("========================================") - - results['accuracy_metric'] = self.compute_accuracy_metric( - predictions=predicted_dict, - expected=expected_arr, - **kwargs + with timed_section("Metric Computation"): + + if not self._args.use_synthetic_data: + metric, metric_units = self.evaluate_model( + data_aggregator.predicted_dict, + data_aggregator.expected_dict, bypass_data_to_eval + ) + print(f"- {metric_units:35s}: {metric:.2f}") + + metrics = dict() + + if not self._args.use_synthetic_data: + metrics["Total Samples Processed"] = ( + data_aggregator.total_samples_processed + ) + + # Skipping last batch. 
Might have different batch_size
+            run_times = np.array(iter_times)
+            run_times = run_times[self._args.num_warmup_iterations:-1]
+
+            metrics['Total GPU Time (s)'] = int(np.ceil(np.sum(iter_times)))
+            metrics['Throughput (samples/sec)'] = np.mean(
+                self._args.batch_size / run_times
+            )
+            metrics['99th_percentile (ms)'] = np.percentile(
+                run_times, q=99, interpolation='lower'
+            ) * 1000
+            metrics['GPU Latency Mean (ms)'] = np.mean(run_times) * 1000
+            metrics['GPU Latency Median (ms)'] = np.median(run_times) * 1000
+            metrics['GPU Latency Min (ms)'] = np.min(run_times) * 1000
+            metrics['GPU Latency Max (ms)'] = np.max(run_times) * 1000
+
+            for key, val in sorted(metrics.items()):
+                if isinstance(val, int):
+                    print(f"- {key:35s}: {val}")
+                else:
+                    print(f"- {key:35s}: {val:.2f}")

-        iter_times = np.array(iter_times)
-        run_times = iter_times[num_warmup_iterations:]
-
-        results['total_time(s)'] = int(np.sum(iter_times))
-        results['samples/sec'] = int(np.mean(batch_size / run_times))
-        results['99th_percentile(ms)'] = np.percentile(
-            run_times, q=99, interpolation='lower'
-        ) * 1000
-        results['latency_mean(ms)'] = np.mean(run_times) * 1000
-        results['latency_median(ms)'] = np.median(run_times) * 1000
-        results['latency_min(ms)'] = np.min(run_times) * 1000
-        results['latency_max(ms)'] = np.max(run_times) * 1000
-
-        print('\n=============================================\n')
-        print('Results:\n')
-
-        if "accuracy_metric" in results:
-            print('  {}: {:.2f}'.format(
-                self.ACCURACY_METRIC_NAME, results['accuracy_metric'] * 100))
-            del results['accuracy_metric']
-
-        for key, val in sorted(results.items()):
-            if isinstance(val, float):
-                print("  {}: {:.2f}".format(key, val))
-            else:
-                print("  {}: {}".format(key, val))
+        print()  # visual spacing
diff --git a/tftrt/examples/benchmark_utils.py b/tftrt/examples/benchmark_utils.py
new file mode 100644
index 000000000..e381627a2
--- /dev/null
+++ b/tftrt/examples/benchmark_utils.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# -*- coding: utf-8 -*-
+
+import time
+
+import numpy as np
+import tensorflow as tf
+
+from contextlib import contextmanager
+
+__all__ = [
+    "DataAggregator", "force_gpu_resync", "print_dict", "timed_dataset",
+    "timed_section"
+]
+
+
+def force_gpu_resync(func):
+    p = tf.constant(0.)
# Create small tensor to force GPU resync + + def wrapper(*args, **kwargs): + rslt = func(*args, **kwargs) + (p + 1.).numpy() # Sync the GPU + return rslt + + return wrapper + + +def print_dict(input_dict, prefix=' ', postfix='', redirect_to_str=False): + rslt_str = "" + for key, val in sorted(input_dict.items()): + val = f"{val:.1f}" if isinstance(val, float) else val + tmp_str = f"{prefix}- {key}: {val}{postfix}" + if not redirect_to_str: + print(tmp_str) + else: + rslt_str += f"{tmp_str}\n" + + if redirect_to_str: + return rslt_str.rstrip() + + +@contextmanager +def timed_section(msg, activate=True, start_end_mode=True): + if activate: + + if start_end_mode: + print(f"\n[START] {msg} ...") + + start_time = time.time() + yield + total_time = time.time() - start_time + + if start_end_mode: + print(f"[END] `{msg}` - Duration: {total_time:.1f}s") + print("=" * 80, "\n") + else: + print(f"{msg:18s}: {total_time:.4f}s") + + else: + yield + + +def timed_dataset(ds, activate=True): + data_start_t = time.time() + + for idx, data_batch in enumerate(ds): + + if activate: + print(f"Step: {idx + 1}") + print( + f"{'Data Loading Time':18s}: {time.time() - data_start_t:.4f}s" + ) + + yield data_batch + + if activate: + print("===============") + data_start_t = time.time() + + +def _format_output_tensors(predictions, expected, batch_size): + + def dictionarize(data): + tmp_preds = dict() + if isinstance(data, (tuple, list)): + for idx, pred_i in enumerate(data): + tmp_preds[f"data_{idx:03d}"] = pred_i + elif not isinstance(data, dict): + tmp_preds["data"] = data + else: + tmp_preds = data + return tmp_preds + + def format(data): + + def _format(tensor): + if tensor.shape[0] != batch_size: + tensor = np.expand_dims(tensor, 0) + elif len(tensor.shape) == 1: + tensor = np.expand_dims(tensor, 1) + return tensor + + for key, val in data.items(): + data[key] = _format(val) + + return data + + predictions = format(dictionarize(predictions)) + expected = format(dictionarize(expected)) + + return predictions, expected + + +class DataAggregator(object): + + def __init__(self, postprocess_model_outputs_fn, args): + + self._args = args + + self._predicted = dict() + self._expected = dict() + + self._total_samples_processed = 0 + + self._postprocess_model_outputs_fn = postprocess_model_outputs_fn + + def _calc_step_batchsize(self, data_arr): + if isinstance(data_arr, (list, tuple)): + return data_arr[0].shape[0] + elif isinstance(data_arr, dict): + return list(data_arr.values())[0].shape[0] + else: # TF.Tensor or TF.EagerTensor + return data_arr.shape[0] + + @property + def predicted_dict(self): + tmp_data = dict() + for key, val in self._predicted.items(): + tmp_data[key] = val[:self._total_samples_processed] + return tmp_data + + @property + def expected_dict(self): + tmp_data = dict() + for key, val in self._expected.items(): + tmp_data[key] = val[:self._total_samples_processed] + return tmp_data + + @property + def total_samples_processed(self): + return self._total_samples_processed + + def aggregate_data(self, y_pred, y): + + with timed_section("Processing Time", + activate=self._args.debug_performance, + start_end_mode=False): + + step_batch_size = self._calc_step_batchsize(y_pred) + + y_pred, y = self._postprocess_model_outputs_fn( + predictions=y_pred, expected=y + ) + + y_pred, y = _format_output_tensors( + y_pred, y, batch_size=step_batch_size + ) + + if not self._predicted: # First call + for key, val in y_pred.items(): + self._predicted[key] = np.empty( + [self._args.total_max_samples] + 
list(val.shape[1:]),
+                    dtype=val.dtype
+                )
+
+        if not self._expected:  # First call
+            for key, val in y.items():
+                self._expected[key] = np.empty(
+                    [self._args.total_max_samples] + list(val.shape[1:]),
+                    dtype=val.dtype
+                )
+
+        idx_start = self._total_samples_processed
+
+        self._total_samples_processed += step_batch_size
+        idx_stop = self._total_samples_processed
+
+        with timed_section("Numpy Copy Time",
+                           activate=self._args.debug_performance,
+                           start_end_mode=False):
+            for key, val in self._predicted.items():
+                self._predicted[key][idx_start:idx_stop] = y_pred[key]
+            for key, val in self._expected.items():
+                self._expected[key][idx_start:idx_stop] = y[key]
diff --git a/tftrt/examples/image_classification/README.md b/tftrt/examples/image_classification/README.md
index b1c29ba66..b125dac9e 100644
--- a/tftrt/examples/image_classification/README.md
+++ b/tftrt/examples/image_classification/README.md
@@ -65,7 +65,7 @@ python image_classification.py \
    --saved_model_dir /models/resnet_v1.5_50_saved_model/ \
    --model resnet_v1.5_50_tfv2 \
    --num_warmup_iterations 50 \
-    --num_calib_inputs 128
+    --num_calib_batches 128 \
    --display_every 10 \
    --use_tftrt \
    --optimize_offline \
diff --git a/tftrt/examples/image_classification/image_classification.py b/tftrt/examples/image_classification/image_classification.py
index 0b51a2a50..338c7709b 100644
--- a/tftrt/examples/image_classification/image_classification.py
+++ b/tftrt/examples/image_classification/image_classification.py
@@ -18,20 +18,18 @@
 import os
 import sys

-import logging
-import multiprocessing
-import time
-
-from functools import partial
-
 import numpy as np
+
 import tensorflow as tf

 import preprocessing

 # Allow import of top level python files
 import inspect
+
-currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+currentdir = os.path.dirname(
+    os.path.abspath(inspect.getfile(inspect.currentframe()))
+)

 parentdir = os.path.dirname(currentdir)
 sys.path.insert(0, parentdir)

@@ -41,229 +39,200 @@ class CommandLineAPI(BaseCommandLineAPI):

-    SAMPLES_IN_VALIDATION_SET = 50000
-
    def __init__(self):
        super(CommandLineAPI, self).__init__()

-        self._parser.add_argument('--input_size', type=int, default=224,
-                                  help='Size of input images expected by the '
-                                       'model')
+        self._parser.add_argument(
+            '--input_size',
+            type=int,
+            default=224,
+            help='Size of input images expected by the '
+            'model'
+        )
+
+        self._parser.add_argument(
+            '--num_classes',
+            type=int,
+            default=1001,
+            help='Number of classes used when training '
+            'the model'
+        )
+
+        self._parser.add_argument(
+            '--preprocess_method',
+            type=str,
+            choices=['vgg', 'inception', 'resnet50_v1_5_tf1_ngc_preprocess'],
+            default='vgg',
+            help='The image preprocessing method used in '
+            'dataloading.'
+        )

-        self._parser.add_argument('--num_classes', type=int, default=1001,
-                                  help='Number of classes used when training '
-                                       'the model')
+    def _post_process_args(self, args):
+        args = super(CommandLineAPI, self)._post_process_args(args)
+        args.labels_shift = 1 if args.num_classes == 1001 else 0
+
+        return args
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #
+# %%%%%%%%%%%%%%%%% IMPLEMENT MODEL-SPECIFIC FUNCTIONS HERE %%%%%%%%%%%%%%%%%% #
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #

-        self._parser.add_argument('--preprocess_method', type=str,
-                                  choices=['vgg', 'inception',
-                                           'resnet50_v1_5_tf1_ngc_preprocess'
-                                  ],
-                                  default='vgg',
-                                  help='The image preprocessing method used in '
-                                       'dataloading.')

 class BenchmarkRunner(BaseBenchmarkRunner):

-    ACCURACY_METRIC_NAME = "accuracy"
-
-    def before_benchmark(self, **kwargs):
-        self._labels_shift = 1 if kwargs["num_classes"] == 1001 else 0
-
-    def compute_accuracy_metric(self, predictions, expected, **kwargs):
-        return np.mean(np.equal(predictions["outputs"], expected))
-
-    def process_model_output(self, outputs, **kwargs):
-        outputs = outputs.numpy()
-        if (len(outputs.shape) != 1):
-            outputs = np.argmax(outputs, axis=1).reshape(-1)
-        return {"outputs": outputs - self._labels_shift}
-
-
-def get_dataset(data_files, batch_size, use_synthetic_data, preprocess_method, input_size):
-
-    def deserialize_image_record(record):
-        feature_map = {
-            'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
-            'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
-            'image/class/text': tf.io.FixedLenFeature([], tf.string, ''),
-            'image/object/bbox/xmin': tf.io.VarLenFeature(dtype=tf.float32),
-            'image/object/bbox/ymin': tf.io.VarLenFeature(dtype=tf.float32),
-            'image/object/bbox/xmax': tf.io.VarLenFeature(dtype=tf.float32),
-            'image/object/bbox/ymax': tf.io.VarLenFeature(dtype=tf.float32)
-        }
-        with tf.compat.v1.name_scope('deserialize_image_record'):
-            obj = tf.io.parse_single_example(serialized=record,
-                                             features=feature_map)
-            imgdata = obj['image/encoded']
-            label = tf.cast(obj['image/class/label'], tf.int32)
-            return imgdata, label
-
-    def get_preprocess_fn(preprocess_method, input_size):
-        """Creates a function to parse and process a TFRecord
-
-        preprocess_method: string
-        input_size: int
-        returns: function, the preprocessing function for a record
+    def get_dataset_batches(self):
+        """Returns a list of batches of input samples.
+
+        Each batch should be in the form [x, y], where
+        x is a numpy array of the input samples for the batch, and
+        y is a numpy array of the expected model outputs for the batch
+
+        Returns:
+        - dataset: a TF Dataset object
+        - bypass_data_to_eval: any object type that will be passed unmodified to
+          `evaluate_model()`. 
If not necessary: `None` + + Note: script arguments can be accessed using `self._args.attr` """ - if preprocess_method == 'vgg': - preprocess_fn = preprocessing.vgg_preprocess - elif preprocess_method == 'inception': - preprocess_fn = preprocessing.inception_preprocess - elif preprocess_method == 'resnet50_v1_5_tf1_ngc_preprocess': - preprocess_fn = preprocessing.resnet50_v1_5_tf1_ngc_preprocess - else: - raise ValueError( - 'Invalid preprocessing method {}'.format(preprocess_method) - ) - - def preprocess_sample_fn(record): - # Parse TFRecord - imgdata, label = deserialize_image_record(record) - label -= 1 # Change to 0-based (don't use background class) - try: - image = tf.image.decode_jpeg( - imgdata, - channels=3, - fancy_upscaling=False, - dct_method='INTEGER_FAST' + + def get_files(data_dir, filename_pattern): + if data_dir is None: + return [] + + files = tf.io.gfile.glob(os.path.join(data_dir, filename_pattern)) + + if not files: + raise ValueError( + 'Can not find any files in {} with ' + 'pattern "{}"'.format(data_dir, filename_pattern) + ) + return files + + def deserialize_image_record(record): + feature_map = { + 'image/encoded': tf.io.FixedLenFeature([], tf.string, ''), + 'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1) + } + with tf.compat.v1.name_scope('deserialize_image_record'): + obj = tf.io.parse_single_example( + serialized=record, features=feature_map + ) + imgdata = obj['image/encoded'] + label = tf.cast(obj['image/class/label'], tf.int32) + return imgdata, label + + def get_preprocess_fn(preprocess_method, input_size): + """Creates a function to parse and process a TFRecord + input_size: int + returns: function, the preprocessing function for a record + """ + if preprocess_method == 'vgg': + preprocess_fn = preprocessing.vgg_preprocess + elif preprocess_method == 'inception': + preprocess_fn = preprocessing.inception_preprocess + elif preprocess_method == 'resnet50_v1_5_tf1_ngc_preprocess': + preprocess_fn = preprocessing.resnet50_v1_5_tf1_ngc_preprocess + else: + raise ValueError( + 'Invalid preprocessing method {}'.format(preprocess_method) ) - except: - image = tf.image.decode_png(imgdata, channels=3) - # Use model's preprocessing function - image = preprocess_fn(image, input_size, input_size) - return image, label - return preprocess_sample_fn + def preprocess_sample_fn(record): + # Parse TFRecord + imgdata, label = deserialize_image_record(record) + label -= 1 # Change to 0-based (don't use background class) + try: + image = tf.image.decode_jpeg( + imgdata, + channels=3, + fancy_upscaling=False, + dct_method='INTEGER_FAST' + ) + except: + image = tf.image.decode_png(imgdata, channels=3) + # Use model's preprocessing function + image = preprocess_fn(image, input_size, input_size) + return image, label + + return preprocess_sample_fn + + data_files = get_files(self._args.data_dir, 'validation*') + dataset = tf.data.Dataset.from_tensor_slices(data_files) + + dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=tf.data.experimental.AUTOTUNE, + block_length=max(self._args.batch_size, 32) + ) - dataset = tf.data.Dataset.from_tensor_slices(data_files) + # preprocess function for input data + preprocess_fn = get_preprocess_fn( + preprocess_method=self._args.preprocess_method, + input_size=self._args.input_size + ) - dataset = dataset.interleave( - tf.data.TFRecordDataset, - cycle_length=min(8, multiprocessing.cpu_count()), - block_length=max(batch_size, 32) - ) + dataset = dataset.map( + map_func=preprocess_fn, + 
num_parallel_calls=tf.data.experimental.AUTOTUNE,
+        )

-    # preprocess function for input data
-    preprocess_fn = get_preprocess_fn(
-        preprocess_method=preprocess_method,
-        input_size=input_size
-    )
+        dataset = dataset.batch(self._args.batch_size, drop_remainder=False)

-    dataset = dataset.map(
-        map_func=preprocess_fn,
-        num_parallel_calls=min(8, multiprocessing.cpu_count())
-    )
+        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

-    dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
+        return dataset, None

-    if use_synthetic_data:
-        dataset = dataset.take(count=1)  # loop over 1 batch
-        dataset = dataset.cache()
-        dataset = dataset.repeat()
+    def preprocess_model_inputs(self, data_batch):
+        """This function prepares the `data_batch` generated from the dataset.
+        Returns:
+            x: input of the model
+            y: data to be used for model evaluation

-    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+        Note: script arguments can be accessed using `self._args.attr`
+        """

-    return dataset
+        x, y = data_batch
+        return x, y

+    def postprocess_model_outputs(self, predictions, expected):
+        """Post-process the predictions and expected tensors if needed. At the
+        minimum, this function transforms all TF Tensors into numpy arrays.
+        Most models will not need to modify this function.

-if __name__ == '__main__':
+        Note: script arguments can be accessed using `self._args.attr`
+        """

-    cmdline_api = CommandLineAPI()
-    args = cmdline_api.parse_args()
+        predictions = predictions.numpy()

-    def get_files(data_dir, filename_pattern):
-        if data_dir is None:
-            return []
+        if len(predictions.shape) != 1:
+            predictions = tf.math.argmax(predictions, axis=1)
+            predictions = predictions.numpy().reshape(-1)

-        files = tf.io.gfile.glob(os.path.join(data_dir, filename_pattern))
-
-        if not files:
-            raise ValueError('Can not find any files in {} with '
-                             'pattern "{}"'.format(data_dir, filename_pattern))
-        return files
+        return predictions - self._args.labels_shift, expected.numpy()

-    data_files = get_files(args.data_dir, 'validation*')
+    def evaluate_model(self, predictions, expected, bypass_data_to_eval):
+        """Evaluate result predictions for the entire dataset.

-    calib_files = (
-        []
-        if args.precision != 'INT8' else
-        get_files(args.calib_data_dir, 'train*')
-    )
+        This computes overall accuracy, mAP, etc. Returns the
+        metric value and a metric_units string naming the metric. 
- def _input_fn(input_files, build_steps, model_phase): + Note: script arguments can be accessed using `self._args.attr` + """ - dataset = get_dataset( - data_files=input_files, - batch_size=args.batch_size, - # even when using synthetic data, we need to - # build and/or calibrate using real training data - # to be in a realistic scenario - use_synthetic_data=False, - preprocess_method=args.preprocess_method, - input_size=args.input_size + return ( + np.mean(predictions["data"] == expected["data"]) * 100.0, + "Top-1 Accuracy %" ) - for i, (batch_images, _) in enumerate(dataset): - if i >= build_steps: - break - - print("* [%s] - step %04d/%04d" % ( - model_phase, i + 1, build_steps - )) - yield batch_images, - - calibration_input_fn = partial( - _input_fn, - input_files=calib_files, - build_steps=args.num_calib_inputs // args.batch_size, - model_phase="Calibration" - ) - - optimize_offline_input_fn = partial( - _input_fn, - input_files=data_files, - build_steps=1, - model_phase="Building" - ) - - runner = BenchmarkRunner( - input_saved_model_dir=args.input_saved_model_dir, - output_saved_model_dir=args.output_saved_model_dir, - allow_build_at_runtime=args.allow_build_at_runtime, - calibration_input_fn=calibration_input_fn, - debug=args.debug, - gpu_mem_cap=args.gpu_mem_cap, - input_signature_key=args.input_signature_key, - max_workspace_size_bytes=args.max_workspace_size, - minimum_segment_size=args.minimum_segment_size, - num_calib_inputs=args.num_calib_inputs, - optimize_offline=args.optimize_offline, - optimize_offline_input_fn=optimize_offline_input_fn, - output_tensor_indices=args.output_tensor_indices, - output_tensor_names=args.output_tensor_names, - precision_mode=args.precision, - use_dynamic_shape=args.use_dynamic_shape, - use_tftrt=args.use_tftrt - ) - - get_benchmark_input_fn = partial( - get_dataset, - data_files=data_files, - input_size=args.input_size, - preprocess_method=args.preprocess_method - ) - - runner.execute_benchmark( - batch_size=args.batch_size, - display_every=args.display_every, - get_benchmark_input_fn=get_benchmark_input_fn, - num_iterations=args.num_iterations, - num_warmup_iterations=args.num_warmup_iterations, - skip_accuracy_testing=( - args.use_synthetic_data or args.skip_accuracy_testing - ), - use_synthetic_data=args.use_synthetic_data, - use_xla=args.use_xla, - ########### Additional Settings ############ - num_classes=args.num_classes, - ) + +if __name__ == '__main__': + + cmdline_api = CommandLineAPI() + args = cmdline_api.parse_args() + + runner = BenchmarkRunner(args) + + runner.execute_benchmark() diff --git a/tftrt/examples/image_classification/preprocessing.py b/tftrt/examples/image_classification/preprocessing.py index 8c63d99e2..aa90d6a52 100644 --- a/tftrt/examples/image_classification/preprocessing.py +++ b/tftrt/examples/image_classification/preprocessing.py @@ -7,8 +7,9 @@ _RESIZE_SIDE_MIN = 256 _RESIZE_SIDE_MAX = 512 + def _crop(image, offset_height, offset_width, crop_height, crop_width): - """Crops the given image using the provided offsets and sizes. + """Crops the given image using the provided offsets and sizes. Note that the method doesn't assume we know the input image size but it does assume we know the input image rank. Args: @@ -23,30 +24,32 @@ def _crop(image, offset_height, offset_width, crop_height, crop_width): InvalidArgumentError: if the rank is not 3 or if the image dimensions are less than the crop size.
""" - original_shape = tf.shape(image) + original_shape = tf.shape(image) + + rank_assertion = tf.Assert( + tf.equal(tf.rank(image), 3), ['Rank of image must be equal to 3.'] + ) + with tf.control_dependencies([rank_assertion]): + cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]]) - rank_assertion = tf.Assert( - tf.equal(tf.rank(image), 3), - ['Rank of image must be equal to 3.']) - with tf.control_dependencies([rank_assertion]): - cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]]) + size_assertion = tf.Assert( + tf.logical_and( + tf.greater_equal(original_shape[0], crop_height), + tf.greater_equal(original_shape[1], crop_width) + ), ['Crop size greater than the image size.'] + ) - size_assertion = tf.Assert( - tf.logical_and( - tf.greater_equal(original_shape[0], crop_height), - tf.greater_equal(original_shape[1], crop_width)), - ['Crop size greater than the image size.']) + offsets = tf.cast(tf.stack([offset_height, offset_width, 0]), tf.int32) - offsets = tf.cast(tf.stack([offset_height, offset_width, 0]), tf.int32) + # Use tf.slice instead of crop_to_bounding box as it accepts tensors to + # define the crop size. + with tf.control_dependencies([size_assertion]): + image = tf.slice(image, offsets, cropped_shape) + return tf.reshape(image, cropped_shape) - # Use tf.slice instead of crop_to_bounding box as it accepts tensors to - # define the crop size. - with tf.control_dependencies([size_assertion]): - image = tf.slice(image, offsets, cropped_shape) - return tf.reshape(image, cropped_shape) def _central_crop(image_list, crop_height, crop_width): - """Performs central crops of the given image list. + """Performs central crops of the given image list. Args: image_list: a list of image tensors of the same dimension but possibly varying channel. @@ -55,21 +58,22 @@ def _central_crop(image_list, crop_height, crop_width): Returns: the list of cropped images. """ - outputs = [] - for image in image_list: - image_height = tf.shape(image)[0] - image_width = tf.shape(image)[1] + outputs = [] + for image in image_list: + image_height = tf.shape(image)[0] + image_width = tf.shape(image)[1] - offset_height = (image_height - crop_height) / 2 - offset_width = (image_width - crop_width) / 2 + offset_height = (image_height-crop_height) / 2 + offset_width = (image_width-crop_width) / 2 - outputs.append(_crop(image, offset_height, offset_width, - crop_height, crop_width)) - return outputs + outputs.append( + _crop(image, offset_height, offset_width, crop_height, crop_width) + ) + return outputs def _mean_image_subtraction(image, means): - """Subtracts the given means from each image channel. + """Subtracts the given means from each image channel. For example: means = [123.68, 116.779, 103.939] image = _mean_image_subtraction(image, means) @@ -84,19 +88,20 @@ def _mean_image_subtraction(image, means): than three or if the number of channels in `image` doesn't match the number of values in `means`. 
""" - if image.get_shape().ndims != 3: - raise ValueError('Input must be of size [height, width, C>0]') - num_channels = image.get_shape().as_list()[-1] - if len(means) != num_channels: - raise ValueError('len(means) must match the number of channels') + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + num_channels = image.get_shape().as_list()[-1] + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image) + for i in range(num_channels): + channels[i] -= means[i] + return tf.concat(axis=2, values=channels) - channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image) - for i in range(num_channels): - channels[i] -= means[i] - return tf.concat(axis=2, values=channels) def _aspect_preserving_resize(image, smallest_side): - """Resize images preserving the original aspect ratio. + """Resize images preserving the original aspect ratio. Args: image: A 3-D image `Tensor`. smallest_side: A python integer or scalar `Tensor` indicating the size of @@ -104,20 +109,23 @@ def _aspect_preserving_resize(image, smallest_side): Returns: resized_image: A 3-D tensor containing the resized image. """ - smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32) - - shape = tf.shape(image) - height = shape[0] - width = shape[1] - new_height, new_width = _smallest_size_at_least(height, width, smallest_side) - image = tf.expand_dims(image, 0) - resized_image = tf.image.resize(image, [new_height, new_width]) - resized_image = tf.squeeze(resized_image) - resized_image.set_shape([None, None, 3]) - return resized_image + smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32) + + shape = tf.shape(image) + height = shape[0] + width = shape[1] + new_height, new_width = _smallest_size_at_least( + height, width, smallest_side + ) + image = tf.expand_dims(image, 0) + resized_image = tf.image.resize(image, [new_height, new_width]) + resized_image = tf.squeeze(resized_image) + resized_image.set_shape([None, None, 3]) + return resized_image + def _smallest_size_at_least(height, width, smallest_side): - """Computes new shape with the smallest side equal to `smallest_side`. + """Computes new shape with the smallest side equal to `smallest_side`. Computes new shape with the smallest side equal to `smallest_side` while preserving the original aspect ratio. Args: @@ -129,27 +137,29 @@ def _smallest_size_at_least(height, width, smallest_side): new_height: an int32 scalar tensor indicating the new height. new_width: and int32 scalar tensor indicating the new width. 
""" - smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32) - - height = tf.cast(height, tf.float32) - width = tf.cast(width, tf.float32) - smallest_side = tf.cast(smallest_side, tf.float32) - - scale = tf.cond(tf.greater(height, width), - lambda: smallest_side / width, - lambda: smallest_side / height) - new_height = tf.cast(tf.math.rint(height * scale), tf.int32) - new_width = tf.cast(tf.math.rint(width * scale), tf.int32) - return new_height, new_width - - - -def inception_preprocess(image, - height, - width, - central_fraction=0.875, - scope=None, - central_crop=False): + smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32) + + height = tf.cast(height, tf.float32) + width = tf.cast(width, tf.float32) + smallest_side = tf.cast(smallest_side, tf.float32) + + scale = tf.cond( + tf.greater(height, width), lambda: smallest_side / width, + lambda: smallest_side / height + ) + new_height = tf.cast(tf.math.rint(height * scale), tf.int32) + new_width = tf.cast(tf.math.rint(width * scale), tf.int32) + return new_height, new_width + + +def inception_preprocess( + image, + height, + width, + central_fraction=0.875, + scope=None, + central_crop=False +): """Prepare one image for evaluation. If height and width are specified it would output an image with that size by applying resize_bilinear. @@ -183,21 +193,24 @@ def inception_preprocess(image, image = tf.multiply(image, 2.0) return image + def vgg_preprocess(image, output_height, output_width): - image = _aspect_preserving_resize(image, _RESIZE_SIDE_MIN) - image = _central_crop([image], output_height, output_width)[0] - image.set_shape([output_height, output_width, 3]) - image = tf.cast(image, tf.float32) - return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN]) - - -def resnet50_v1_5_tf1_ngc_preprocess(image, - height, - width, - central_fraction=0.875, - scope=None, - central_crop=False): - """Prepare one image for evaluation. + image = _aspect_preserving_resize(image, _RESIZE_SIDE_MIN) + image = _central_crop([image], output_height, output_width)[0] + image.set_shape([output_height, output_width, 3]) + image = tf.cast(image, tf.float32) + return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN]) + + +def resnet50_v1_5_tf1_ngc_preprocess( + image, + height, + width, + central_fraction=0.875, + scope=None, + central_crop=False +): + """Prepare one image for evaluation. If height and width are specified it would output an image with that size by applying resize_bilinear. If central_fraction is specified it would crop the central fraction of the @@ -216,15 +229,15 @@ def resnet50_v1_5_tf1_ngc_preprocess(image, Returns: 3-D float Tensor of prepared image. """ - if image.dtype != tf.float32: - image = tf.image.convert_image_dtype(image, dtype=tf.float32) - # Crop the central region of the image with an area containing 87.5% of - # the original image. - image = tf.image.central_crop(image, central_fraction=central_fraction) - if height and width: - # Resize the image to the specified height and width. - image = tf.expand_dims(image, 0) - image = tf.image.resize(image, [height, width]) - image = tf.squeeze(image, [0]) - image = image * 255 - return image + if image.dtype != tf.float32: + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + # Crop the central region of the image with an area containing 87.5% of + # the original image. + image = tf.image.central_crop(image, central_fraction=central_fraction) + if height and width: + # Resize the image to the specified height and width. 
+ image = tf.expand_dims(image, 0) + image = tf.image.resize(image, [height, width]) + image = tf.squeeze(image, [0]) + image = image * 255 + return image diff --git a/tftrt/examples/image_classification/scripts/base_script.sh b/tftrt/examples/image_classification/scripts/base_script.sh index d8e1c8f3c..d6c63053d 100755 --- a/tftrt/examples/image_classification/scripts/base_script.sh +++ b/tftrt/examples/image_classification/scripts/base_script.sh @@ -33,11 +33,11 @@ do MODEL_DIR="${arg#*=}" shift # Remove --input_saved_model_dir= from processing ;; - --output_tensor_names=*) - shift # Remove --output_tensor_names= from processing + --total_max_samples=*) + shift # Remove --total_max_samples= from processing ;; - --output_tensor_indices=*) - shift # Remove --output_tensor_indices= from processing + --output_tensors_name=*) + shift # Remove --output_tensors_name= from processing ;; --use_xla_auto_jit) TF_AUTO_JIT_XLA_FLAG="TF_XLA_FLAGS=--tf_xla_auto_jit=2" @@ -54,8 +54,8 @@ done INPUT_SIZE=224 PREPROCESS_METHOD="vgg" NUM_CLASSES=1001 -OUTPUT_TENSOR_NAME_FLAG="" -OUTPUT_TENSOR_IDX_FLAG="" +MAX_SAMPLES=49920 +OUTPUT_TENSORS_NAME="logits" case ${MODEL_NAME} in "inception_v3" | "inception_v4") @@ -76,19 +76,24 @@ case ${MODEL_NAME} in PREPROCESS_METHOD="inception" ;; - "resnet_v1.5_50_tfv2" | "vgg_16" | "vgg_19" ) + "resnet_v1.5_50_tfv2" ) + NUM_CLASSES=1000 + OUTPUT_TENSORS_NAME="activation_49" + ;; + + "vgg_16" | "vgg_19" ) NUM_CLASSES=1000 ;; "resnet50-v1.5_tf1_ngc" ) NUM_CLASSES=1000 - OUTPUT_TENSOR_IDX_FLAG="--output_tensor_indices=0" - OUTPUT_TENSOR_NAME_FLAG="--output_tensor_names=classes" + OUTPUT_TENSORS_NAME="classes" PREPROCESS_METHOD="resnet50_v1_5_tf1_ngc_preprocess" ;; "resnet50v2_backbone" | "resnet50v2_sparse_backbone" ) INPUT_SIZE=256 + OUTPUT_TENSORS_NAME="outputs" ;; esac @@ -106,8 +111,8 @@ echo "" echo "[*] INPUT_SIZE: ${INPUT_SIZE}" echo "[*] PREPROCESS_METHOD: ${PREPROCESS_METHOD}" echo "[*] NUM_CLASSES: ${NUM_CLASSES}" -echo "[*] OUTPUT_TENSOR_IDX_FLAG: ${OUTPUT_TENSOR_IDX_FLAG}" -echo "[*] OUTPUT_TENSOR_NAME_FLAG: ${OUTPUT_TENSOR_NAME_FLAG}" +echo "[*] MAX_SAMPLES: ${MAX_SAMPLES}" +echo "[*] OUTPUT_TENSORS_NAME: ${OUTPUT_TENSORS_NAME}" echo "" echo "[*] TF_AUTO_JIT_XLA_FLAG: ${TF_AUTO_JIT_XLA_FLAG}" echo "[*] BYPASS_ARGUMENTS: $(echo \"${BYPASS_ARGUMENTS}\" | tr -s ' ')" @@ -163,11 +168,11 @@ COMMAND="${PREPEND_COMMAND} python image_classification.py \ --input_size ${INPUT_SIZE} \ --preprocess_method ${PREPROCESS_METHOD} \ --num_classes ${NUM_CLASSES} \ - ${OUTPUT_TENSOR_IDX_FLAG} \ - ${OUTPUT_TENSOR_NAME_FLAG} \ + --total_max_samples=${MAX_SAMPLES} \ + --output_tensors_name=${OUTPUT_TENSORS_NAME} \ ${BYPASS_ARGUMENTS}" -COMMAND=$(echo "${COMMAND}" | tr -s " ") +COMMAND=$(echo ${COMMAND} | sed 's/ *$//g') # Trimming whitespaces echo -e "**Executing:**\n\n${COMMAND}\n" sleep 5 diff --git a/tftrt/examples/object_detection/object_detection.py b/tftrt/examples/object_detection/object_detection.py index 39a67bb03..94aa9ff49 100644 --- a/tftrt/examples/object_detection/object_detection.py +++ b/tftrt/examples/object_detection/object_detection.py @@ -16,16 +16,12 @@ # ============================================================================= import os -import sys - -import logging -import time import shutil +import sys -from functools import partial +import numpy as np import ujson as json -import numpy as np import tensorflow as tf from pycocotools.coco import COCO @@ -33,7 +29,10 @@ # Allow import of top level python files import inspect -currentdir = 
os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + +currentdir = os.path.dirname( + os.path.abspath(inspect.getfile(inspect.currentframe())) +) parentdir = os.path.dirname(currentdir) sys.path.insert(0, parentdir) @@ -43,53 +42,128 @@ class CommandLineAPI(BaseCommandLineAPI): - SAMPLES_IN_VALIDATION_SET = 5000 - def __init__(self): super(CommandLineAPI, self).__init__() - self._parser.add_argument('--input_size', type=int, default=640, - help='Size of input images expected by the ' - 'model') + self._parser.add_argument( + '--input_size', + type=int, + default=640, + help='Size of input images expected by the ' + 'model' + ) + + self._parser.add_argument( + '--annotation_path', + type=str, + help='Path that contains COCO annotations' + ) + - self._parser.add_argument('--annotation_path', type=str, - help='Path that contains COCO annotations') +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # +# %%%%%%%%%%%%%%%%% IMPLEMENT MODEL-SPECIFIC FUNCTIONS HERE %%%%%%%%%%%%%%%%%% # +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # class BenchmarkRunner(BaseBenchmarkRunner): - ACCURACY_METRIC_NAME = "mAP" - - def before_benchmark(self, **kwargs): - self._output_name_map = ( - # - (0, 'boxes'), - # - (1, 'classes'), - # - (2, 'num_detections'), - # - (3, 'scores'), + def get_dataset_batches(self): + """Returns a list of batches of input samples. + + Each batch should be in the form [x, y], where + x is a numpy array of the input samples for the batch, and + y is a numpy array of the expected model outputs for the batch + + Returns: + - dataset: a TF Dataset object + - bypass_data_to_eval: any object type that will be passed unmodified to + `evaluate_model()`. If not necessary: `None` + + Note: script arguments can be accessed using `self._args.attr` + """ + + coco_api = COCO(annotation_file=self._args.annotation_path) + image_ids = coco_api.getImgIds() + + image_paths = [] + for image_id in image_ids: + coco_img = coco_api.imgs[image_id] + image_paths.append( + os.path.join(self._args.data_dir, coco_img['file_name']) + ) + + dataset = tf.data.Dataset.from_tensor_slices(image_paths) + + def load_image_op(path): + image = tf.io.read_file(path) + image = tf.image.decode_jpeg(image, channels=3) + + return tf.data.Dataset.from_tensor_slices([image]) + + dataset = dataset.interleave( + load_image_op, + cycle_length=tf.data.experimental.AUTOTUNE, + block_length=8, + num_parallel_calls=tf.data.experimental.AUTOTUNE ) - def compute_accuracy_metric(self, predictions, expected, **kwargs): - return self._eval_model( - predictions=predictions, - image_ids=kwargs["image_ids"], - annotation_path=kwargs["annotation_path"] + def preprocess_fn(image): + if self._args.input_size is not None: + image = tf.image.resize( + image, + size=(self._args.input_size, self._args.input_size) + ) + image = tf.cast(image, tf.uint8) + return image + + dataset = dataset.map( + map_func=preprocess_fn, + num_parallel_calls=tf.data.experimental.AUTOTUNE, ) - def _eval_model(self, predictions, image_ids, annotation_path): + dataset = dataset.batch(self._args.batch_size, drop_remainder=False) + + dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + return dataset, None + + def preprocess_model_inputs(self, data_batch): + """This function prepares the `data_batch` generated from the dataset.
+ Returns: + x: input of the model + y: data to be used for model evaluation + + Note: script arguments can be accessed using `self._args.attr` + """ + + return data_batch, np.array([]) - # for key in predictions: - # predictions[key] = np.vstack(predictions[key]) - # if key == 'num_detections': - # predictions[key] = predictions[key].ravel() + def postprocess_model_outputs(self, predictions, expected): + """Post-process the predictions and expected tensors if needed. At a + minimum, this function transforms all TF Tensors into numpy arrays. + Most models will not need to modify this function. + + Note: script arguments can be accessed using `self._args.attr` + """ + + predictions = {k: t.numpy() for k, t in predictions.items()} + + return predictions, expected + + def evaluate_model(self, predictions, expected, bypass_data_to_eval): + """Evaluate the result predictions for the entire dataset. + + This computes overall accuracy, mAP, etc. Returns the + metric value and a metric_units string naming the metric. + + Note: script arguments can be accessed using `self._args.attr` + """ + coco_api = COCO(annotation_file=self._args.annotation_path) + image_ids = coco_api.getImgIds() - coco = COCO(annotation_file=annotation_path) coco_detections = [] for i, image_id in enumerate(image_ids): - coco_img = coco.imgs[image_id] + coco_img = coco_api.imgs[image_id] image_width = coco_img['width'] image_height = coco_img['height'] @@ -99,8 +173,8 @@ def _eval_model(self, predictions, image_ids, annotation_path): bbox_coco_fmt = [ x1 * image_width, # x0 y1 * image_height, # x1 - (x2 - x1) * image_width, # width - (y2 - y1) * image_height, # height + (x2-x1) * image_width, # width + (y2-y1) * image_height, # height ] coco_detection = { 'image_id': image_id, @@ -117,83 +191,20 @@ def _eval_model(self, predictions, image_ids, annotation_path): coco_detections_path = os.path.join(tmp_dir, 'coco_detections.json') with open(coco_detections_path, 'w') as f: json.dump(coco_detections, f) - cocoDt = coco.loadRes(coco_detections_path) + + cocoDt = coco_api.loadRes(coco_detections_path) shutil.rmtree(tmp_dir) # compute coco metrics - eval = COCOeval(coco, cocoDt, 'bbox') + eval = COCOeval(coco_api, cocoDt, 'bbox') eval.params.imgIds = image_ids eval.evaluate() eval.accumulate() eval.summarize() - return eval.stats[0] - - def process_model_output(self, outputs, **kwargs): - # outputs = graph_func(batch_images) - if isinstance(outputs, dict): - outputs = {k:t.numpy() for k, t in outputs.items()} - else: - outputs = { - name: outputs[idx].numpy() - for idx, name in self._output_name_map - } - - return outputs - - -def get_dataset(batch_size, - images_dir, - image_ids, - input_size, - use_synthetic_data): - - image_paths = [] - - for image_id in image_ids: - coco_img = coco.imgs[image_id] - image_paths.append(os.path.join(images_dir, coco_img['file_name'])) - - dataset = tf.data.Dataset.from_tensor_slices(image_paths) - - def load_image_op(path): - image = tf.io.read_file(path) - image = tf.image.decode_jpeg(image, channels=3) - - return tf.data.Dataset.from_tensor_slices([image]) - - dataset = dataset.interleave( - lambda path: load_image_op(path), - cycle_length=tf.data.experimental.AUTOTUNE, - block_length=8, - num_parallel_calls=tf.data.experimental.AUTOTUNE - ) - - def preprocess_fn(image): - if input_size is not None: - image = tf.image.resize(image, size=(input_size, input_size)) - image = tf.cast(image, tf.uint8) - return image - - dataset = dataset.apply( - tf.data.experimental.map_and_batch( - map_func=preprocess_fn, -
batch_size=batch_size, - num_parallel_calls=tf.data.experimental.AUTOTUNE, - drop_remainder=True - ) - ) - - if use_synthetic_data: - dataset = dataset.take(count=1) # loop over 1 batch - dataset = dataset.cache() - dataset = dataset.repeat() - - dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) - - return dataset + return eval.stats[0] * 100, "mAP %" if __name__ == '__main__': @@ -201,83 +212,6 @@ def preprocess_fn(image): cmdline_api = CommandLineAPI() args = cmdline_api.parse_args() - coco = COCO(annotation_file=args.annotation_path) - image_ids = coco.getImgIds() - - def _input_fn(input_data_dir, build_steps, model_phase): - - dataset = get_dataset( - batch_size=args.batch_size, - images_dir=input_data_dir, - image_ids=image_ids, - input_size=args.input_size, - # even when using synthetic data, we need to - # build and/or calibrate using real training data - # to be in a realistic scenario - use_synthetic_data=False, - ) + runner = BenchmarkRunner(args) - for i, batch_images in enumerate(dataset): - if i >= build_steps: - break - - print("* [%s] - step %04d/%04d" % ( - model_phase, i + 1, build_steps - )) - yield batch_images, - - calibration_input_fn = partial( - _input_fn, - input_data_dir=args.calib_data_dir, - build_steps=args.num_calib_inputs // args.batch_size, - model_phase="Calibration" - ) - - optimize_offline_input_fn = partial( - _input_fn, - input_data_dir=args.data_dir, - build_steps=1, - model_phase="Building" - ) - - runner = BenchmarkRunner( - input_saved_model_dir=args.input_saved_model_dir, - output_saved_model_dir=args.output_saved_model_dir, - allow_build_at_runtime=args.allow_build_at_runtime, - calibration_input_fn=calibration_input_fn, - debug=args.debug, - gpu_mem_cap=args.gpu_mem_cap, - input_signature_key=args.input_signature_key, - max_workspace_size_bytes=args.max_workspace_size, - minimum_segment_size=args.minimum_segment_size, - num_calib_inputs=args.num_calib_inputs, - optimize_offline=args.optimize_offline, - optimize_offline_input_fn=optimize_offline_input_fn, - output_tensor_indices=args.output_tensor_indices, - output_tensor_names=args.output_tensor_names, - precision_mode=args.precision, - use_dynamic_shape=args.use_dynamic_shape, - use_tftrt=args.use_tftrt) - - get_benchmark_input_fn = partial( - get_dataset, - images_dir=args.data_dir, - image_ids=image_ids, - input_size=args.input_size - ) - - runner.execute_benchmark( - batch_size=args.batch_size, - display_every=args.display_every, - get_benchmark_input_fn=get_benchmark_input_fn, - num_iterations=args.num_iterations, - num_warmup_iterations=args.num_warmup_iterations, - skip_accuracy_testing=( - args.use_synthetic_data or args.skip_accuracy_testing - ), - use_synthetic_data=args.use_synthetic_data, - use_xla=args.use_xla, - ########### Additional Settings ############ - image_ids=image_ids, - annotation_path=args.annotation_path - ) + runner.execute_benchmark() diff --git a/tftrt/examples/object_detection/scripts/base_script.sh b/tftrt/examples/object_detection/scripts/base_script.sh index 005fcdfb3..aa849398b 100755 --- a/tftrt/examples/object_detection/scripts/base_script.sh +++ b/tftrt/examples/object_detection/scripts/base_script.sh @@ -10,12 +10,9 @@ MODEL_DIR="" # Default Argument Values NVIDIA_TF32_OVERRIDE="" -BATCH_SIZE=8 -MAX_WORKSPACE_SIZE=$((2 ** (32 + 1))) # + 1 necessary compared to python -INPUT_SIZE=640 - BYPASS_ARGUMENTS="" TF_AUTO_JIT_XLA_FLAG="" +BATCH_SIZE=8 # Loop through arguments and process them for arg in "$@" @@ -37,6 +34,12 @@ do 
DATA_DIR="${arg#*=}" shift # Remove --data_dir= from processing ;; + --total_max_samples=*) + shift # Remove --total_max_samples= from processing + ;; + --output_tensors_name=*) + shift # Remove --output_tensors_name= from processing + ;; --input_saved_model_dir=*) MODEL_DIR="${arg#*=}" shift # Remove --input_saved_model_dir= from processing @@ -53,6 +56,11 @@ done # ============== Set model specific parameters ============= # +INPUT_SIZE=640 +MAX_WORKSPACE_SIZE=$((2 ** (32 + 1))) # + 1 necessary compared to python +MAX_SAMPLES=5000 +OUTPUT_TENSORS_NAME="boxes,classes,num_detections,scores" + case ${MODEL_NAME} in "faster_rcnn_resnet50_coco" | "ssd_mobilenet_v1_fpn_coco") MAX_WORKSPACE_SIZE=$((2 ** (24 + 1))) # + 1 necessary compared to python @@ -73,6 +81,8 @@ echo "" echo "[*] BATCH_SIZE: ${BATCH_SIZE}" echo "[*] INPUT_SIZE: ${INPUT_SIZE}" echo "[*] MAX_WORKSPACE_SIZE: ${MAX_WORKSPACE_SIZE}" +echo "[*] MAX_SAMPLES: ${MAX_SAMPLES}" +echo "[*] OUTPUT_TENSORS_NAME: ${OUTPUT_TENSORS_NAME}" echo "" echo "[*] TF_AUTO_JIT_XLA_FLAG: ${TF_AUTO_JIT_XLA_FLAG}" echo "[*] BYPASS_ARGUMENTS: $(echo \"${BYPASS_ARGUMENTS}\" | tr -s ' ')" @@ -150,9 +160,11 @@ COMMAND="${PREPEND_COMMAND} python object_detection.py \ --batch_size ${BATCH_SIZE} \ --input_size ${INPUT_SIZE} \ --max_workspace_size ${MAX_WORKSPACE_SIZE} \ + --total_max_samples=${MAX_SAMPLES} \ + --output_tensors_name=${OUTPUT_TENSORS_NAME} \ ${BYPASS_ARGUMENTS}" -COMMAND=$(echo "${COMMAND}" | tr -s " ") +COMMAND=$(echo ${COMMAND} | sed 's/ *$//g') # Trimming whitespaces echo -e "**Executing:**\n\n${COMMAND}\n" sleep 5 diff --git a/tftrt/examples/transformers/generate_save_models_from_hf.py b/tftrt/examples/transformers/generate_save_models_from_hf.py index f01c9402b..f0335e616 100644 --- a/tftrt/examples/transformers/generate_save_models_from_hf.py +++ b/tftrt/examples/transformers/generate_save_models_from_hf.py @@ -13,19 +13,21 @@ from transformers import BertTokenizer, TFBertForPreTraining from transformers import BartTokenizer, TFBartForConditionalGeneration - USE_CACHE = False OUTPUT_ATTENTIONS = False OUTPUT_HIDDEN_STATES = False class HFModel(tf.Module): + def __init__(self, model, is_encoder_decoder): self._model = model self._is_encoder_decoder = is_encoder_decoder @tf.function( - input_signature=[tf.TensorSpec((None, None), tf.int32, name="input_ids")] + input_signature=[ + tf.TensorSpec((None, None), tf.int32, name="input_ids") + ] ) def serving(self, input_ids): if self._is_encoder_decoder: @@ -39,15 +41,10 @@ def serving(self, input_ids): if __name__ == "__main__": MODEL_NAMES = [ - "bert-base-uncased", - "bert-base-cased", - "bert-large-uncased", - "bert-large-cased", - 'facebook/bart-base', - 'facebook/bart-large' + "bert-base-uncased", "bert-base-cased", "bert-large-uncased", + "bert-large-cased", 'facebook/bart-base', 'facebook/bart-large' ] - # use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): # If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up # decoding (see :obj:`past_key_values`). 
Set to :obj:`False` during training, :obj:`True` during generation @@ -111,7 +108,9 @@ def serving(self, input_ids): ) print("Exporting Model to SavedModel at:", pb_model_dir) - hf_model = HFModel(model, is_encoder_decoder) # necessary to define a custom input signature + hf_model = HFModel( + model, is_encoder_decoder + ) # necessary to define a custom input signature tf.saved_model.save( hf_model, @@ -144,7 +143,7 @@ def serving(self, input_ids): print("saving:", key, "...") arr_save_path = os.path.join(numpy_asset_dir, '%s.npy' % key) np.save(arr_save_path, val) - assert(np.allclose(np.load(arr_save_path), val)) + assert (np.allclose(np.load(arr_save_path), val)) # Clearing the GPU memory @@ -152,5 +151,7 @@ def serving(self, input_ids): del tokenizer K.clear_session() - print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") + print( + "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" + ) print() diff --git a/tftrt/examples/transformers/scripts/base_script.sh b/tftrt/examples/transformers/scripts/base_script.sh index da7604e6a..9c9b29b1a 100755 --- a/tftrt/examples/transformers/scripts/base_script.sh +++ b/tftrt/examples/transformers/scripts/base_script.sh @@ -14,6 +14,8 @@ DATA_DIR="/tmp" BYPASS_ARGUMENTS="" TF_AUTO_JIT_XLA_FLAG="" +BATCH_SIZE=32 +SEQ_LEN=128 # Loop through arguments and process them for arg in "$@" do @@ -27,14 +29,18 @@ do NVIDIA_TF32_OVERRIDE="NVIDIA_TF32_OVERRIDE=0" shift # Remove --no_tf32 from processing ;; + --batch_size=*) + BATCH_SIZE="${arg#*=}" + shift # Remove --batch_size= from processing + ;; --data_dir=*) shift # Remove --data_dir= from processing ;; - --vocab_size=*) - shift # Remove --vocab_size= from processing + --total_max_samples=*) + shift # Remove --total_max_samples= from processing ;; - --minimum_segment_size=*) - shift # Remove --minimum_segment_size= from processing + --output_tensors_name=*) + shift # Remove --output_tensors_name= from processing ;; --input_saved_model_dir=*) MODEL_DIR="${arg#*=}" @@ -44,6 +50,13 @@ do TF_AUTO_JIT_XLA_FLAG="TF_XLA_FLAGS=--tf_xla_auto_jit=2" shift # Remove --use_xla_auto_jit from processing ;; + --vocab_size=*) + shift # Remove --vocab_size= from processing + ;; + --sequence_length=*) + SEQ_LEN="${arg#*=}" + shift # Remove --sequence_length= from processing + ;; *) BYPASS_ARGUMENTS=" ${BYPASS_ARGUMENTS} ${arg}" ;; @@ -54,6 +67,9 @@ done MIN_SEGMENT_SIZE=5 VOCAB_SIZE=-1 +MAX_WORKSPACE_SIZE=$((2 ** (32 + 1))) # + 1 necessary compared to python +MAX_SAMPLES=1 +OUTPUT_TENSORS_NAME="prediction_logits,seq_relationship_logits" case ${MODEL_NAME} in "bert_base_uncased" | "bert_large_uncased") @@ -67,6 +83,7 @@ case ${MODEL_NAME} in "bart_base" | "bart_large") VOCAB_SIZE=50265 MIN_SEGMENT_SIZE=90 + OUTPUT_TENSORS_NAME="encoder_last_hidden_state,logits" ;; esac @@ -80,9 +97,12 @@ echo "[*] MODEL_DIR: ${MODEL_DIR}" echo "" echo "[*] NVIDIA_TF32_OVERRIDE: ${NVIDIA_TF32_OVERRIDE}" echo "" -# Custom Transormers Task Flags -echo "[*] MIN_SEGMENT_SIZE: ${MIN_SEGMENT_SIZE}" +# Custom Transformer Task Flags echo "[*] VOCAB_SIZE: ${VOCAB_SIZE}" +echo "[*] SEQ_LEN: ${SEQ_LEN}" +echo "[*] MAX_WORKSPACE_SIZE: ${MAX_WORKSPACE_SIZE}" +echo "[*] MAX_SAMPLES: ${MAX_SAMPLES}" +echo "[*] OUTPUT_TENSORS_NAME: ${OUTPUT_TENSORS_NAME}" echo "" echo "[*] TF_AUTO_JIT_XLA_FLAG: ${TF_AUTO_JIT_XLA_FLAG}" echo "[*] BYPASS_ARGUMENTS: $(echo \"${BYPASS_ARGUMENTS}\" | tr -s ' ')" @@ -132,13 +152,19 @@ cd ${BENCH_DIR} PREPEND_COMMAND="${TF_AUTO_JIT_XLA_FLAG} ${NVIDIA_TF32_OVERRIDE}" COMMAND="${PREPEND_COMMAND} python
transformers.py \ - --input_saved_model_dir ${INPUT_SAVED_MODEL_DIR} \ --data_dir ${DATA_DIR} \ + --calib_data_dir ${DATA_DIR} \ + --input_saved_model_dir ${INPUT_SAVED_MODEL_DIR} \ + --output_saved_model_dir /tmp/$RANDOM \ + --batch_size ${BATCH_SIZE} \ --vocab_size ${VOCAB_SIZE} \ - --minimum_segment_size ${MIN_SEGMENT_SIZE} \ + --sequence_length=${SEQ_LEN} \ + --max_workspace_size ${MAX_WORKSPACE_SIZE} \ + --total_max_samples=${MAX_SAMPLES} \ + --output_tensors_name=${OUTPUT_TENSORS_NAME} \ ${BYPASS_ARGUMENTS}" -COMMAND=$(echo "${COMMAND}" | tr -s " ") +COMMAND=$(echo ${COMMAND} | sed 's/ *$//g') # Trimming whitespaces echo -e "**Executing:**\n\n${COMMAND}\n" sleep 5 diff --git a/tftrt/examples/transformers/transformers.py b/tftrt/examples/transformers/transformers.py index 5b014c033..fc4ae5403 100644 --- a/tftrt/examples/transformers/transformers.py +++ b/tftrt/examples/transformers/transformers.py @@ -18,20 +18,16 @@ import os import sys -import logging -import multiprocessing -import time - -from functools import partial - import numpy as np -import tensorflow as tf -from statistics import mean +import tensorflow as tf # Allow import of top level python files import inspect -currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + +currentdir = os.path.dirname( + os.path.abspath(inspect.getfile(inspect.currentframe())) +) parentdir = os.path.dirname(currentdir) sys.path.insert(0, parentdir) @@ -41,8 +37,6 @@ class CommandLineAPI(BaseCommandLineAPI): - # SAMPLES_IN_VALIDATION_SET = 50000 - ALLOWED_VOCAB_SIZES = [ 30522, # BERT Uncased 28996, # BERT Cased @@ -52,174 +46,130 @@ class CommandLineAPI(BaseCommandLineAPI): def __init__(self): super(CommandLineAPI, self).__init__() - self._parser.add_argument('--sequence_length', type=int, default=128, - help='Directory containing the input saved model.') - - self._parser.add_argument('--vocab_size', type=int, required=True, - choices=self.ALLOWED_VOCAB_SIZES, - help='Size of the vocabulory used for ' - 'training. Refer to huggingface ' - 'documentation.') + self._parser.add_argument( + "--sequence_length", + type=int, + default=128, + help="Input data sequence length." + ) - self._parser.add_argument('--validate_output', action='store_true', - help='Validates that the model returns the correct ' - 'value. This only works with batch_size =32.') + self._parser.add_argument( + "--vocab_size", + type=int, + required=True, + choices=self.ALLOWED_VOCAB_SIZES, + help="Size of the vocabulary used for training. Refer to " + "huggingface documentation." + ) + # self._parser.add_argument( + # "--validate_output", + # action="store_true", + # help="Validates that the model returns the correct value. This " + # "only works with batch_size =32."
+ # ) def _validate_args(self, args): super(CommandLineAPI, self)._validate_args(args) - if args.validate_output and args.batch_size != 32: - raise ValueError("Output validation only supports batch size 32.") + # if args.validate_output and args.batch_size != 32: + # raise ValueError("Output validation only supports batch size 32.") # TODO: Remove when proper dataloading is implemented if args.num_iterations is None: - raise ValueError("This benchmark does not currently support " - "--num_iterations=None") + raise ValueError( + "This benchmark does not currently support " + "--num_iterations=None" + ) - # TODO: Remove when proper dataloading is implemented def _post_process_args(self, args): + args = super(CommandLineAPI, self)._post_process_args(args) + return args +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # +# %%%%%%%%%%%%%%%%% IMPLEMENT MODEL-SPECIFIC FUNCTIONS HERE %%%%%%%%%%%%%%%%%% # +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # + + class BenchmarkRunner(BaseBenchmarkRunner): - ACCURACY_METRIC_NAME = "mAP" + def get_dataset_batches(self): + """Returns a list of batches of input samples. - def before_benchmark(self, **kwargs): - pass + Each batch should be in the form [x, y], where + x is a numpy array of the input samples for the batch, and + y is a numpy array of the expected model outputs for the batch - def compute_accuracy_metric(self, predictions, expected, **kwargs): - pass + Returns: + - dataset: a TF Dataset object + - bypass_data_to_eval: any object type that will be passed unmodified to + `evaluate_model()`. If not necessary: `None` - def process_model_output(self, outputs, **kwargs): - pass + Note: script arguments can be accessed using `self._args.attr` + """ -# def validate_model_artifacts(infer_func, model_dir, use_tftrt, precision): -# numpy_asset_dir = os.path.join(model_dir, "numpy_assets") -# -# input_data = np.load(os.path.join(numpy_asset_dir, 'input_data.npy')) -# input_data = tf.constant(input_data, dtype=tf.int32) -# -# output = infer_func(input_ids=input_data) -# -# if use_tftrt: -# if precision == "fp16": -# rtol=1e-2 -# atol=2e-1 -# else: -# rtol=1e-2 -# atol=5e-2 -# else: -# rtol=1e-5 -# atol=1e-8 -# -# for key in output.keys(): -# target = np.load(os.path.join(numpy_asset_dir, '%s.npy' % key)) -# np.testing.assert_allclose( -# target, output[key].numpy(), rtol=rtol, atol=atol -# ) -# print("\n*****************************************************************") -# print("Model was validated with success ...") -# print("*****************************************************************\n") + if not self._args.use_synthetic_data: + raise NotImplementedError() + + tf.random.set_seed(10) + + input_data = tf.random.uniform( + shape=(1, self._args.sequence_length), + maxval=self._args.vocab_size, + dtype=tf.int32 + ) + + dataset = tf.data.Dataset.from_tensor_slices(input_data) + dataset = dataset.repeat() + dataset = dataset.batch(self._args.batch_size) + dataset = dataset.take(count=1) # loop over 1 batch + dataset = dataset.cache() + dataset = dataset.repeat() + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + + return dataset, None + def preprocess_model_inputs(self, data_batch): + """This function prepares the `data_batch` generated from the dataset.
+ Returns: + x: input of the model + y: data to be used for model evaluation -def get_dataset(batch_size, seq_len, vocab_size, use_synthetic_data): + Note: script arguments can be accessed using `self._args.attr` + """ - if not use_synthetic_data: - raise NotImplementedError() + x = data_batch + return x, None - tf.random.set_seed(10) - input_data = tf.random.uniform(shape=(1, seq_len), maxval=vocab_size, - dtype=tf.int32) + def postprocess_model_outputs(self, predictions, expected): + """Post-process the predictions and expected tensors if needed. At a + minimum, this function transforms all TF Tensors into numpy arrays. + Most models will not need to modify this function. - dataset = tf.data.Dataset.from_tensor_slices(input_data) - dataset = dataset.repeat() - dataset = dataset.batch(batch_size) - dataset = dataset.take(count=1) # loop over 1 batch - dataset = dataset.cache() - dataset = dataset.repeat() - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + Note: script arguments can be accessed using `self._args.attr` + """ - return dataset + return predictions.numpy(), expected.numpy() + + def evaluate_model(self, predictions, expected, bypass_data_to_eval): + """Evaluate the result predictions for the entire dataset. + + This computes overall accuracy, mAP, etc. Returns the + metric value and a metric_units string naming the metric. + + Note: script arguments can be accessed using `self._args.attr` + """ + + return None, "Top-1 Accuracy %" if __name__ == '__main__': + cmdline_api = CommandLineAPI() args = cmdline_api.parse_args() - def _input_fn(build_steps, model_phase): - - dataset = get_dataset( - batch_size=args.batch_size, - seq_len=args.sequence_length, - vocab_size=args.vocab_size, - use_synthetic_data=args.use_synthetic_data - ) + runner = BenchmarkRunner(args) - for i, (input_batch) in enumerate(dataset): - if i >= build_steps: - break - - print("* [%s] - step %04d/%04d" % ( - model_phase, i + 1, build_steps - )) - yield input_batch, - - calibration_input_fn = partial( - _input_fn, - build_steps=args.num_calib_inputs // args.batch_size, - model_phase="Calibration" - ) - optimize_offline_input_fn = partial( - _input_fn, - build_steps=1, - model_phase="Building" - ) - - runner = BenchmarkRunner( - input_saved_model_dir=args.input_saved_model_dir, - output_saved_model_dir=args.output_saved_model_dir, - allow_build_at_runtime=args.allow_build_at_runtime, - calibration_input_fn=calibration_input_fn, - debug=args.debug, - gpu_mem_cap=args.gpu_mem_cap, - input_signature_key=args.input_signature_key, - max_workspace_size_bytes=args.max_workspace_size, - minimum_segment_size=args.minimum_segment_size, - num_calib_inputs=args.num_calib_inputs, - optimize_offline=args.optimize_offline, - optimize_offline_input_fn=optimize_offline_input_fn, - output_tensor_indices=args.output_tensor_indices, - output_tensor_names=args.output_tensor_names, - precision_mode=args.precision, - use_dynamic_shape=args.use_dynamic_shape, - use_tftrt=args.use_tftrt) - - # if args.validate_output: - # # artifacts only generated for BS == 32 - # validate_model_artifacts( - # graph_func, - # args.input_saved_model_dir, - # args.use_tftrt, - # args.precision.lower() - # ) - - get_benchmark_input_fn = partial( - get_dataset, - seq_len=args.sequence_length, - vocab_size=args.vocab_size - ) - - runner.execute_benchmark( - batch_size=args.batch_size, - display_every=args.display_every, - get_benchmark_input_fn=get_benchmark_input_fn, - num_iterations=args.num_iterations, - num_warmup_iterations=args.num_warmup_iterations,
- skip_accuracy_testing=( - args.use_synthetic_data or args.skip_accuracy_testing - ), - use_synthetic_data=args.use_synthetic_data, - use_xla=args.use_xla, - ) + runner.execute_benchmark()
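For reference, every example above now reduces to the same four hooks on `BaseBenchmarkRunner` (`get_dataset_batches`, `preprocess_model_inputs`, `postprocess_model_outputs`, `evaluate_model`) plus a two-line `__main__`. A minimal sketch of a new example built on this interface follows; the subclass name and the synthetic dataset are illustrative only, and the import path for `BaseBenchmarkRunner` is an assumption (the diff relies on the `sys.path` insertion shown above rather than a fixed module name):

import numpy as np
import tensorflow as tf

from benchmark_runner import BaseBenchmarkRunner  # assumed import path

class ToyBenchmarkRunner(BaseBenchmarkRunner):

    def get_dataset_batches(self):
        # Synthetic data: 64 random "images" with all-zero labels; a real
        # example would read files from `self._args.data_dir` instead.
        images = tf.random.uniform(shape=(64, 224, 224, 3))
        labels = tf.zeros(shape=(64,), dtype=tf.int32)
        dataset = tf.data.Dataset.from_tensor_slices((images, labels))
        dataset = dataset.batch(self._args.batch_size)
        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
        return dataset, None  # no bypass data needed by evaluate_model()

    def preprocess_model_inputs(self, data_batch):
        # Split the batch into model input x and evaluation target y.
        x, y = data_batch
        return x, y

    def postprocess_model_outputs(self, predictions, expected):
        # At a minimum, convert TF tensors to numpy arrays.
        return predictions.numpy(), expected.numpy()

    def evaluate_model(self, predictions, expected, bypass_data_to_eval):
        # Return (metric value, metric units), matching the examples above.
        return np.mean(predictions == expected) * 100.0, "Top-1 Accuracy %"

The `__main__` block would then be identical to the ones in this diff: parse the arguments with the example's `CommandLineAPI`, instantiate the runner with `args`, and call `runner.execute_benchmark()`.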