[EP Perf] Fix on EP Perf (#20683)
### Description

* Partially revert the [previous change](#19804), and
  * rework the concurrency_test_result parser outside of post.py
* Add support for syncing the memtest results to the database


### Motivation and Context

Fixes the error that occurs when CI runs on two model groups.
- When running on two model groups, the [previous change](#19804) wrongly navigated two directory levels up after finishing one model group, where only one level was needed. As a result, the script could not find the next model group (see the sketch below).
- Running on a single model group cannot reproduce the issue.
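A minimal sketch of the traversal bug, using hypothetical folder names (the real script iterates over the perf result folders):

```python
import os

# Hypothetical layout: result/<model_group>/*.csv
os.makedirs("result/group_a", exist_ok=True)
os.makedirs("result/group_b", exist_ok=True)

os.chdir("result")
for model_group in sorted(os.listdir(".")):
    os.chdir(model_group)  # enter result/<model_group>
    # ... parse this group's CSV files ...
    os.chdir("..")  # correct: one level up returns to result/
    # os.chdir("../..")  # buggy: two levels up leaves result/ entirely,
    # so the next group's folder can no longer be found
```

With a single model group there is no next iteration that needs to locate another folder, which is why the issue only reproduces with two groups.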
yf711 authored May 16, 2024
1 parent f5bfbd6 commit 47a178b
Showing 3 changed files with 173 additions and 59 deletions.
132 changes: 132 additions & 0 deletions onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py
@@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import csv
import datetime
import os
import re

import pandas as pd
from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.ingest import QueuedIngestClient
from post import get_identifier, parse_arguments, write_table


def parse_valgrind_log(input_path, output_path, keywords):
    is_definitely_lost = False
    is_ort_trt_related = False
    buffer = []
    leak_block = None
    leak_bytes = None
    keyword = None
    results = []

    with open(input_path) as file:
        lines = file.readlines()

    for line in lines:
        line = line.strip()  # noqa: PLW2901
        # Remove "==xxxxx==" pattern from the line
        line = line.split("==")[-1].strip()  # noqa: PLW2901

        if "blocks are definitely lost in loss" in line:
            is_definitely_lost = True
            # Extract LeakBlock and LeakBytes
            match = re.search(r"([\d,]+) byte[s]? in ([\d,]+) block[s]?", line)
            if match:
                leak_bytes = match.group(1).replace(",", "")
                leak_block = match.group(2).replace(",", "")
            continue

        if is_definitely_lost:
            if line:
                buffer.append(line)
                for word in keywords:
                    if word in line:
                        is_ort_trt_related = True
                        keyword = word
                        break

        # End of section
        if is_definitely_lost and not line:
            if is_ort_trt_related:
                results.append((keyword, leak_block, leak_bytes, "\n".join(buffer)))
            # Reset var
            is_definitely_lost = False
            is_ort_trt_related = False
            buffer = []
            leak_block = None
            leak_bytes = None
            keyword = None

    # Writing results to CSV
    with open(output_path, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["Keyword", "LeakBlock", "LeakBytes", "ValgrindMessage"])
        for entry in results:
            csvwriter.writerow([entry[0], entry[1], entry[2], entry[3]])


def parse_concurrency_test_log(input_path, output_path):
    with open(input_path) as log_file:
        log_content = log_file.read()

    failed_cases_section = log_content.split("Failed Test Cases:")[1]

    # passed = 1 if no failed test cases
    if failed_cases_section.strip() == "":
        passed = 1
    else:
        passed = 0

    with open(output_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["Passed", "Log"])
        csv_writer.writerow([passed, log_content])


if __name__ == "__main__":
    args = parse_arguments()

    # connect to database
    kcsb_ingest = KustoConnectionStringBuilder.with_az_cli_authentication(args.kusto_conn)
    ingest_client = QueuedIngestClient(kcsb_ingest)
    identifier = get_identifier(
        args.commit_datetime, args.commit_hash, args.trt_version, args.branch, args.use_tensorrt_oss_parser
    )
    upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)

    try:
        result_mem_test_path = args.report_folder
        os.chdir(result_mem_test_path)
        # Parse mem_test log
        logs = ["valgrind.log", "concurrency_test.log"]
        csv_paths = ["mem_test.csv", "concurrency_test.csv"]
        for log, csv_path in zip(logs, csv_paths):
            if os.path.exists(log):
                print(f"{identifier}: Parsing {log}")
                if log == logs[0]:
                    parse_valgrind_log(log, csv_path, ["TensorrtExecutionProvider", "TensorRT"])
                else:
                    parse_concurrency_test_log(log, csv_path)

        # Upload to db
        for csv_path, db_table_name in zip(csv_paths, ["ep_valgrind_record", "ep_concurrencytest_record"]):
            if os.path.exists(csv_path):
                table = pd.read_csv(csv_path)
                write_table(
                    ingest_client,
                    args.database,
                    table,
                    db_table_name,
                    upload_time,
                    identifier,
                    args.branch,
                    args.commit_hash,
                    args.commit_datetime,
                )
                print(f"{identifier}: {csv_path} is synced to db")

    except Exception as e:
        print(str(e))
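
For a quick sanity check of what the two parsers key on (illustrative only, not part of the commit; the sample lines are hypothetical but follow the format of valgrind output and the concurrency test log):

```python
import re

# parse_valgrind_log matches valgrind leak-summary lines; the regex captures
# bytes first and blocks second, which map to the LeakBytes/LeakBlock columns.
line = "16 bytes in 1 blocks are definitely lost in loss record 2 of 7"
match = re.search(r"([\d,]+) byte[s]? in ([\d,]+) block[s]?", line)
assert match and match.group(1) == "16"  # LeakBytes
assert match.group(2) == "1"  # LeakBlock

# parse_concurrency_test_log reports Passed=1 exactly when nothing follows
# the "Failed Test Cases:" marker in the log.
log_content = "Ran 5 concurrency cases\nFailed Test Cases:\n"
failed_cases_section = log_content.split("Failed Test Cases:")[1]
passed = 1 if failed_cases_section.strip() == "" else 0
assert passed == 1
```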
61 changes: 11 additions & 50 deletions onnxruntime/python/tools/tensorrt/perf/post.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 import argparse
-import csv
 import datetime
 import os
 import sys
@@ -421,11 +420,10 @@ def main():
     upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
 
     try:
-        # Load EP Perf test results from /result
         result_file = args.report_folder
-        result_perf_test_path = os.path.join(result_file, "result")
-        folders = os.listdir(result_perf_test_path)
-        os.chdir(result_perf_test_path)
+
+        folders = os.listdir(result_file)
+        os.chdir(result_file)
 
         tables = [
             fail_name,
@@ -448,26 +446,26 @@ def main():
         for model_group in folders:
             os.chdir(model_group)
             csv_filenames = os.listdir()
-            for csv_file in csv_filenames:
-                table = pd.read_csv(csv_file)
-                if session_name in csv_file:
+            for csv in csv_filenames:
+                table = pd.read_csv(csv)
+                if session_name in csv:
                     table_results[session_name] = pd.concat(
                         [table_results[session_name], get_session(table, model_group)], ignore_index=True
                     )
-                elif specs_name in csv_file:
+                elif specs_name in csv:
                     table_results[specs_name] = pd.concat(
                         [
                             table_results[specs_name],
                             get_specs(table, args.branch, args.commit_hash, args.commit_datetime),
                         ],
                         ignore_index=True,
                     )
-                elif fail_name in csv_file:
+                elif fail_name in csv:
                     table_results[fail_name] = pd.concat(
                         [table_results[fail_name], get_failures(table, model_group)],
                         ignore_index=True,
                     )
-                elif latency_name in csv_file:
+                elif latency_name in csv:
                     table_results[memory_name] = pd.concat(
                         [table_results[memory_name], get_memory(table, model_group)],
                         ignore_index=True,
@@ -477,11 +475,11 @@ def main():
                         [table_results[latency_name], get_latency(table, model_group)],
                         ignore_index=True,
                     )
-                elif status_name in csv_file:
+                elif status_name in csv:
                     table_results[status_name] = pd.concat(
                         [table_results[status_name], get_status(table, model_group)], ignore_index=True
                     )
-                elif op_metrics_name in csv_file:
+                elif op_metrics_name in csv:
                     table = table.assign(Group=model_group)
                     table_results[op_metrics_name] = pd.concat(
                         [table_results[op_metrics_name], table], ignore_index=True
@@ -515,43 +513,6 @@ def main():
                     args.commit_datetime,
                 )
 
-        # Load concurrency test results
-        result_mem_test_path = os.path.join(result_file, "result_mem_test")
-        os.chdir(result_mem_test_path)
-        log_path = "concurrency_test.log"
-        if os.path.exists(log_path):
-            print("Generating concurrency test report")
-            with open(log_path) as log_file:
-                log_content = log_file.read()
-
-            failed_cases_section = log_content.split("Failed Test Cases:")[1]
-
-            # passed = 1 if no failed test cases
-            if failed_cases_section.strip() == "":
-                passed = 1
-            else:
-                passed = 0
-
-            csv_path = "concurrency_test.csv"
-            with open(csv_path, "w", newline="") as csv_file:
-                csv_writer = csv.writer(csv_file)
-                csv_writer.writerow(["Passed", "Log"])
-                csv_writer.writerow([passed, log_content])
-
-            db_table_name = "ep_concurrencytest_record"
-            table = pd.read_csv(csv_path)
-            write_table(
-                ingest_client,
-                args.database,
-                table,
-                db_table_name,
-                upload_time,
-                identifier,
-                args.branch,
-                args.commit_hash,
-                args.commit_datetime,
-            )
-
     except BaseException as e:
         print(str(e))
         sys.exit(1)
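
Together with the pipeline changes below, this removal splits the work between the two scripts: post.py now handles only the perf results, while parse_mem_concurrency_test.py handles the memory and concurrency logs. The artifact layout implied by the `-r` arguments in the YAML (inferred, not spelled out in the commit) is:

```
Artifact/
  result/               # post.py -r points here; one subfolder per model group
    <model_group>/*.csv
  result_mem_test/      # parse_mem_concurrency_test.py -r points here
    valgrind.log
    concurrency_test.log
```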
39 changes: 30 additions & 9 deletions tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -1,7 +1,7 @@
 parameters:
 
 - name: PostToDashboard
-  displayName: Post to Dashboard
+  displayName: Post EP Perf results to Dashboard
   type: boolean
   default: true
 
@@ -30,7 +30,7 @@ parameters:
     - "partner-models"
 
 - name: MemTest
-  displayName: Run Memory Test and Concurrency Test
+  displayName: Run Memory and Concurrency Test
   type: boolean
   default: true
 
@@ -147,11 +147,27 @@ jobs:
         workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
         condition: always()
 
-      - task: PublishBuildArtifacts@1
-        inputs:
-          pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
-          artifactName: 'result-$(Build.BuildNumber)'
-
+      - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
+        displayName: 'Install dashboard dependencies'
+
+      - script: |
+          az --version || {
+            echo "Azure CLI not found, installing..."
+            curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+          }
+        displayName: 'Check and Install Azure CLI'
+
+      - task: AzureCLI@2
+        displayName: 'Parse Memory & Concurrency Test Records and Sync'
+        inputs:
+          azureSubscription: AIInfraBuildOnnxRuntimeOSS
+          scriptLocation: inlineScript
+          scriptType: bash
+          inlineScript: |
+            short_hash=$(git rev-parse --short HEAD) &&
+            commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
+            python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py -r $(Build.SourcesDirectory)/Artifact/result_mem_test -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
       - ${{ if eq(parameters.PostToDashboard, true) }}:
 
         - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
@@ -165,15 +181,20 @@ jobs:
          displayName: 'Check and Install Azure CLI'
        - task: AzureCLI@2
-          displayName: 'Post EP Perf Results to Dashboard'
+          displayName: 'Azure CLI Post to Dashboard'
          inputs:
            azureSubscription: AIInfraBuildOnnxRuntimeOSS
            scriptLocation: inlineScript
            scriptType: bash
            inlineScript: |
              short_hash=$(git rev-parse --short HEAD) &&
              commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
-              python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+              python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
+      - task: PublishBuildArtifacts@1
+        inputs:
+          pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
+          artifactName: 'result-$(Build.BuildNumber)'
 
   - template: templates/component-governance-component-detection-steps.yml
     parameters :
