[CI][PTQ] benchmark (#2819)
### Changes

- Update conformance tests for benchmark jobs
- Add pytest-forked to run each test in its own process for more accurate memory monitoring (note: does not work for torch_cuda)
- Change the order of columns in results.csv (Status should be last)
- Add build_url to report.csv (for faster access to the log of a specific model)
- Catch exceptions when running the benchmark to avoid an error status for models with dynamic shapes (to be fixed later)
- Fix an incorrect input data shape for gpt2.


### Related tickets
111533

### Tests

manual/job/post_training_quantization_performance/
manual/job/post_training_weight_compression_performance/
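
For a local run that approximates these jobs, the benchmark-style invocation from the README example added in this PR can be used (the dataset path is a placeholder):

```bash
pytest --data=<path_to_datasets> --forked --no-eval --subset-size 300 --batch-size 1 --benchmark --extra-columns --memory-monitor tests/post_training/test_quantize_conformance.py::test_ptq_quantization
```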
AlexanderDokuchaev authored Aug 19, 2024
1 parent bdf8d27 commit b882992
Showing 6 changed files with 69 additions and 35 deletions.
1 change: 1 addition & 0 deletions constraints.txt
@@ -20,3 +20,4 @@ pytest-mock==3.12.0
pytest-dependency==0.6.0
pytest-ordering==0.6
pytest-xdist==3.5.0
pytest-forked==1.6.0
33 changes: 20 additions & 13 deletions tests/post_training/README.md
@@ -48,6 +48,9 @@ It's possible to run a suite of tests for the specific compression algorithm only
To do that, append `::test_weight_compression` or `::test_ptq_quantization` to `tests/post_training/test_quantize_conformance.py`.
For instance:

> [!WARNING]
> It is recommended to run the tests with a specific test function specified; running all tests at once has not been tested.

```bash
NUM_VAL_THREADS=8 pytest --data=<path_to_datasets> --output=./tmp tests/post_training/test_quantize_conformance.py::test_weight_compression
```
@@ -62,65 +65,69 @@ Additional arguments:
- `--subset-size=N` to force the subset_size of the calibration dataset
- `--batch-size=N` to use batch_size for calibration. Some models do not support `--batch-size > 1`; for such models, please use `--batch-size=1`.
- `--benchmark` to collect throughput statistics and add an `FPS` column to result.csv
- `--extra-columns` to add additional columns to reports.csv:
- `Stat. collection time` - time of statistic collection
- `Bias correction time` - time of bias correction
- `Validation time` - time of validation
- `--extra-columns` to add additional columns to reports.csv, such as the time for each algorithm
- `--memory-monitor` to use MemoryMonitor from tools/memory_monitor.py

### Examples

Run for only OV backend:

```bash
pytest --data=<path_to_datasets> -k backend_OV tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> -k backend_OV tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Run for only one model:

```bash
pytest --data=<path_to_datasets> -k timm/crossvit_9_240 tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> -k timm/crossvit_9_240 tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Run for only one model for OV backend:

```bash
pytest --data=<path_to_datasets> -k timm/crossvit_9_240_backend_OV tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> -k timm/crossvit_9_240_backend_OV tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Only dump models:

```bash
pytest --data=<path_to_datasets> --no-eval tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> --no-eval tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Fast dump models with `subset_size=1` for all models:

```bash
pytest --data=<path_to_datasets> --no-eval --subset-size 1 tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> --no-eval --subset-size 1 tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Run test with collection of throughput statistics:

```bash
pytest --data=<path_to_datasets> --benchmark tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> --benchmark tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Fast collection of throughput statistics:

```bash
pytest --data=<path_to_datasets> --benchmark --no-eval --subset-size 1 tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> --benchmark --no-eval --subset-size 1 tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Run test with additional columns:

```bash
pytest --data=<path_to_datasets> --extra-columns tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> --extra-columns tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Run test with calibration dataset having batch-size=10 for all models:

```bash
pytest --data=<path_to_datasets> --batch-size 10 tests/post_training/test_quantize_conformance.py
pytest --data=<path_to_datasets> --batch-size 10 tests/post_training/test_quantize_conformance.py::test_weight_compression
```

Run test as in benchmark jobs:

```bash
pytest --data=<path_to_datasets> --forked --no-eval --subset-size 300 --batch-size 1 --benchmark --extra-columns --memory-monitor tests/post_training/test_quantize_conformance.py::test_ptq_quantization
```

## Reference data
37 changes: 22 additions & 15 deletions tests/post_training/pipelines/base.py
@@ -166,6 +166,14 @@ def format_memory_usage(memory):
return int(memory)

def get_result_dict(self):
ram_data = {}
if self.compression_memory_usage_rss is None and self.compression_memory_usage_system is None:
ram_data["RAM MiB"] = self.format_memory_usage(self.compression_memory_usage)
if self.compression_memory_usage_rss is not None:
ram_data["RAM MiB"] = self.format_memory_usage(self.compression_memory_usage_rss)
if self.compression_memory_usage_system is not None:
ram_data["RAM MiB System"] = self.format_memory_usage(self.compression_memory_usage_system)

result = {
"Model": self.model,
"Backend": self.backend.value if self.backend else None,
@@ -179,16 +187,11 @@ def get_result_dict(self):
**self.stats_from_output.get_stats(),
"Total time": self.format_time(self.time_total),
"FPS": self.fps,
**ram_data,
"Status": self.status[:LIMIT_LENGTH_OF_STATUS] if self.status is not None else None,
"Build url": os.environ.get("BUILD_URL", ""),
}

if self.compression_memory_usage_rss is None and self.compression_memory_usage_system is None:
result["RAM MiB"] = self.format_memory_usage(self.compression_memory_usage)
if self.compression_memory_usage_rss is not None:
result["RAM MiB RSS"] = self.format_memory_usage(self.compression_memory_usage_rss)
if self.compression_memory_usage_system is not None:
result["RAM MiB System"] = self.format_memory_usage(self.compression_memory_usage_system)

return result


@@ -449,14 +452,18 @@ def run_bench(self) -> None:
"""
if not self.run_benchmark_app:
return
runner = Command(f"benchmark_app -m {self.path_compressed_ir}")
runner.run(stdout=False)
cmd_output = " ".join(runner.output)

match = re.search(r"Throughput\: (.+?) FPS", cmd_output)
if match is not None:
fps = match.group(1)
self.run_info.fps = float(fps)

try:
runner = Command(f"benchmark_app -m {self.path_compressed_ir}")
runner.run(stdout=False)
cmd_output = " ".join(runner.output)

match = re.search(r"Throughput\: (.+?) FPS", cmd_output)
if match is not None:
fps = match.group(1)
self.run_info.fps = float(fps)
except Exception as e:
print(e)

def cleanup_cache(self):
"""
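
As a quick illustration of the throughput parsing above, the regular expression extracts the FPS value from a benchmark_app summary line; the sample line below is an assumed output format, not captured from a real run:

```python
import re

# Assumed example of a benchmark_app summary line (format may vary by version).
cmd_output = "[ INFO ] Throughput: 123.45 FPS"

match = re.search(r"Throughput\: (.+?) FPS", cmd_output)
if match is not None:
    fps = float(match.group(1))
    print(fps)  # 123.45
```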
5 changes: 3 additions & 2 deletions tests/post_training/pipelines/gpt.py
@@ -64,10 +64,11 @@ def transform_func(data):
else:

def transform_func(data):
ids = np.expand_dims(data["input_ids"], axis=0)
inputs = {
"input_ids": np.expand_dims(data["input_ids"], axis=0),
"input_ids": ids,
"attention_mask": np.expand_dims(data["attention_mask"], axis=0),
"position_ids": np.ones((1, 128), dtype=np.int64),
"position_ids": np.ones(ids.shape, dtype=np.int64),
"beam_idx": np.zeros((1,), dtype=np.int64),
}
return inputs
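
For illustration only (the sample length below is hypothetical): with an input shorter than 128 tokens, the previously hardcoded `position_ids` shape no longer matches `input_ids`, while deriving it from the input keeps the shapes consistent:

```python
import numpy as np

# Hypothetical 100-token sample, batched to shape (1, 100).
input_ids = np.expand_dims(np.arange(100, dtype=np.int64), axis=0)

old_position_ids = np.ones((1, 128), dtype=np.int64)         # fixed shape: mismatch
new_position_ids = np.ones(input_ids.shape, dtype=np.int64)  # follows the input shape

assert old_position_ids.shape != input_ids.shape
assert new_position_ids.shape == input_ids.shape
```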
1 change: 1 addition & 0 deletions tests/post_training/requirements.txt
@@ -5,6 +5,7 @@ onnx
onnxruntime
openvino
pytest
pytest-forked

librosa==0.10.0
memory-profiler==0.61.0
27 changes: 22 additions & 5 deletions tests/post_training/test_quantize_conformance.py
@@ -139,7 +139,7 @@ def fixture_wc_reference_data():


@pytest.fixture(scope="session", name="ptq_result_data")
def fixture_ptq_report_data(output_dir, run_benchmark_app):
def fixture_ptq_report_data(output_dir, run_benchmark_app, pytestconfig):
data: Dict[str, RunInfo] = {}

yield data
@@ -151,22 +151,39 @@ def fixture_ptq_report_data(output_dir, run_benchmark_app):
df = df.drop(columns=["FPS"])

output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / "results.csv", index=False)
output_file = output_dir / "results.csv"

if pytestconfig.getoption("forked") and output_file.exists():
# When tests are run with --forked, each test runs in a separate process.
# Used in post_training_performance jobs.
df.to_csv(output_file, index=False, mode="a", header=False)
else:
df.to_csv(output_file, index=False)


@pytest.fixture(scope="session", name="wc_result_data")
def fixture_wc_report_data(output_dir):
def fixture_wc_report_data(output_dir, run_benchmark_app, pytestconfig):
data: Dict[str, RunInfo] = {}

yield data

if data:
test_results = OrderedDict(sorted(data.items()))
df = pd.DataFrame(v.get_result_dict() for v in test_results.values())
df = df.drop(columns=["FPS", "Num FQ"])
if not run_benchmark_app:
df = df.drop(columns=["FPS"])

df = df.drop(columns=["Num FQ"])

output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / "results.csv", index=False)
output_file = output_dir / "results.csv"

if pytestconfig.getoption("forked") and output_file.exists():
# When tests are run with --forked, each test runs in a separate process.
# Used in post_training_performance jobs.
df.to_csv(output_file, index=False, mode="a", header=False)
else:
df.to_csv(output_file, index=False)


def maybe_skip_test_case(test_model_param, run_fp32_backend, run_torch_cuda_backend, batch_size):