[EP Perf] Fix on EP Perf (#20683)
### Description

* Partially revert the [previous change](#19804), and
  * rework the concurrency_test_result parser outside of post.py
* Add support for syncing the memtest results to the database


### Motivation and Context

Fixes the error that occurs when CI runs on two model groups.
- When running on two model groups, the [previous change](#19804) wrongly navigated two directory levels up after finishing one model group, where only one level was needed. As a result, the script could not find the next model group (see the sketch below).
- Running on a single model group cannot reproduce the issue.
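A minimal sketch of the traversal bug, using hypothetical folder names (the real script iterates over the perf result folders):

```python
import os

# Hypothetical layout: result/<model_group>/*.csv
os.makedirs("result/group_a", exist_ok=True)
os.makedirs("result/group_b", exist_ok=True)

os.chdir("result")
for model_group in sorted(os.listdir(".")):
    os.chdir(model_group)  # enter result/<model_group>
    # ... parse this group's CSV files ...
    os.chdir("..")  # correct: one level up returns to result/
    # os.chdir("../..")  # buggy: two levels up leaves result/ entirely,
    # so the next group's folder can no longer be found
```

With a single model group there is no next iteration that needs to locate another folder, which is why the issue only reproduces with two groups.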
yf711 authored May 16, 2024
1 parent f5bfbd6 commit 47a178b
Showing 3 changed files with 173 additions and 59 deletions.
132 changes: 132 additions & 0 deletions onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py
@@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import csv
import datetime
import os
import re

import pandas as pd
from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.ingest import QueuedIngestClient
from post import get_identifier, parse_arguments, write_table


def parse_valgrind_log(input_path, output_path, keywords):
    is_definitely_lost = False
    is_ort_trt_related = False
    buffer = []
    leak_block = None
    leak_bytes = None
    keyword = None
    results = []

    with open(input_path) as file:
        lines = file.readlines()

    for line in lines:
        line = line.strip()  # noqa: PLW2901
        # Remove "==xxxxx==" pattern from the line
        line = line.split("==")[-1].strip()  # noqa: PLW2901

        if "blocks are definitely lost in loss" in line:
            is_definitely_lost = True
            # Extract LeakBlock and LeakBytes
            match = re.search(r"([\d,]+) byte[s]? in ([\d,]+) block[s]?", line)
            if match:
                leak_bytes = match.group(1).replace(",", "")
                leak_block = match.group(2).replace(",", "")
            continue

        if is_definitely_lost:
            if line:
                buffer.append(line)
                for word in keywords:
                    if word in line:
                        is_ort_trt_related = True
                        keyword = word
                        break

        # End of section
        if is_definitely_lost and not line:
            if is_ort_trt_related:
                results.append((keyword, leak_block, leak_bytes, "\n".join(buffer)))
            # Reset var
            is_definitely_lost = False
            is_ort_trt_related = False
            buffer = []
            leak_block = None
            leak_bytes = None
            keyword = None

    # Writing results to CSV
    with open(output_path, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["Keyword", "LeakBlock", "LeakBytes", "ValgrindMessage"])
        for entry in results:
            csvwriter.writerow([entry[0], entry[1], entry[2], entry[3]])


def parse_concurrency_test_log(input_path, output_path):
    with open(input_path) as log_file:
        log_content = log_file.read()

    failed_cases_section = log_content.split("Failed Test Cases:")[1]

    # passed = 1 if no failed test cases
    if failed_cases_section.strip() == "":
        passed = 1
    else:
        passed = 0

    with open(output_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["Passed", "Log"])
        csv_writer.writerow([passed, log_content])


if __name__ == "__main__":
    args = parse_arguments()

    # connect to database
    kcsb_ingest = KustoConnectionStringBuilder.with_az_cli_authentication(args.kusto_conn)
    ingest_client = QueuedIngestClient(kcsb_ingest)
    identifier = get_identifier(
        args.commit_datetime, args.commit_hash, args.trt_version, args.branch, args.use_tensorrt_oss_parser
    )
    upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)

    try:
        result_mem_test_path = args.report_folder
        os.chdir(result_mem_test_path)
        # Parse mem_test log
        logs = ["valgrind.log", "concurrency_test.log"]
        csv_paths = ["mem_test.csv", "concurrency_test.csv"]
        for log, csv_path in zip(logs, csv_paths):
            if os.path.exists(log):
                print(f"{identifier}: Parsing {log}")
                if log == logs[0]:
                    parse_valgrind_log(log, csv_path, ["TensorrtExecutionProvider", "TensorRT"])
                else:
                    parse_concurrency_test_log(log, csv_path)

        # Upload to db
        for csv_path, db_table_name in zip(csv_paths, ["ep_valgrind_record", "ep_concurrencytest_record"]):
            if os.path.exists(csv_path):
                table = pd.read_csv(csv_path)
                write_table(
                    ingest_client,
                    args.database,
                    table,
                    db_table_name,
                    upload_time,
                    identifier,
                    args.branch,
                    args.commit_hash,
                    args.commit_datetime,
                )
                print(f"{identifier}: {csv_path} is synced to db")

    except Exception as e:
        print(str(e))
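
For a quick sanity check of what the two parsers key on (illustrative only, not part of the commit; the sample lines are hypothetical but follow the format of valgrind output and the concurrency test log):

```python
import re

# parse_valgrind_log matches valgrind leak-summary lines; the regex captures
# bytes first and blocks second, which map to the LeakBytes/LeakBlock columns.
line = "16 bytes in 1 blocks are definitely lost in loss record 2 of 7"
match = re.search(r"([\d,]+) byte[s]? in ([\d,]+) block[s]?", line)
assert match and match.group(1) == "16"  # LeakBytes
assert match.group(2) == "1"  # LeakBlock

# parse_concurrency_test_log reports Passed=1 exactly when nothing follows
# the "Failed Test Cases:" marker in the log.
log_content = "Ran 5 concurrency cases\nFailed Test Cases:\n"
failed_cases_section = log_content.split("Failed Test Cases:")[1]
passed = 1 if failed_cases_section.strip() == "" else 0
assert passed == 1
```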
61 changes: 11 additions & 50 deletions onnxruntime/python/tools/tensorrt/perf/post.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 import argparse
-import csv
 import datetime
 import os
 import sys
@@ -421,11 +420,10 @@ def main():
     upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
 
     try:
-        # Load EP Perf test results from /result
         result_file = args.report_folder
-        result_perf_test_path = os.path.join(result_file, "result")
-        folders = os.listdir(result_perf_test_path)
-        os.chdir(result_perf_test_path)
+
+        folders = os.listdir(result_file)
+        os.chdir(result_file)
 
         tables = [
             fail_name,
@@ -448,26 +446,26 @@ def main():
         for model_group in folders:
             os.chdir(model_group)
             csv_filenames = os.listdir()
-            for csv_file in csv_filenames:
-                table = pd.read_csv(csv_file)
-                if session_name in csv_file:
+            for csv in csv_filenames:
+                table = pd.read_csv(csv)
+                if session_name in csv:
                     table_results[session_name] = pd.concat(
                         [table_results[session_name], get_session(table, model_group)], ignore_index=True
                     )
-                elif specs_name in csv_file:
+                elif specs_name in csv:
                     table_results[specs_name] = pd.concat(
                         [
                             table_results[specs_name],
                             get_specs(table, args.branch, args.commit_hash, args.commit_datetime),
                         ],
                         ignore_index=True,
                     )
-                elif fail_name in csv_file:
+                elif fail_name in csv:
                     table_results[fail_name] = pd.concat(
                         [table_results[fail_name], get_failures(table, model_group)],
                         ignore_index=True,
                     )
-                elif latency_name in csv_file:
+                elif latency_name in csv:
                     table_results[memory_name] = pd.concat(
                         [table_results[memory_name], get_memory(table, model_group)],
                         ignore_index=True,
@@ -477,11 +475,11 @@ def main():
                         [table_results[latency_name], get_latency(table, model_group)],
                         ignore_index=True,
                     )
-                elif status_name in csv_file:
+                elif status_name in csv:
                     table_results[status_name] = pd.concat(
                         [table_results[status_name], get_status(table, model_group)], ignore_index=True
                     )
-                elif op_metrics_name in csv_file:
+                elif op_metrics_name in csv:
                     table = table.assign(Group=model_group)
                     table_results[op_metrics_name] = pd.concat(
                         [table_results[op_metrics_name], table], ignore_index=True
@@ -515,43 +513,6 @@ def main():
                     args.commit_datetime,
                 )
 
-        # Load concurrency test results
-        result_mem_test_path = os.path.join(result_file, "result_mem_test")
-        os.chdir(result_mem_test_path)
-        log_path = "concurrency_test.log"
-        if os.path.exists(log_path):
-            print("Generating concurrency test report")
-            with open(log_path) as log_file:
-                log_content = log_file.read()
-
-            failed_cases_section = log_content.split("Failed Test Cases:")[1]
-
-            # passed = 1 if no failed test cases
-            if failed_cases_section.strip() == "":
-                passed = 1
-            else:
-                passed = 0
-
-            csv_path = "concurrency_test.csv"
-            with open(csv_path, "w", newline="") as csv_file:
-                csv_writer = csv.writer(csv_file)
-                csv_writer.writerow(["Passed", "Log"])
-                csv_writer.writerow([passed, log_content])
-
-            db_table_name = "ep_concurrencytest_record"
-            table = pd.read_csv(csv_path)
-            write_table(
-                ingest_client,
-                args.database,
-                table,
-                db_table_name,
-                upload_time,
-                identifier,
-                args.branch,
-                args.commit_hash,
-                args.commit_datetime,
-            )
-
     except BaseException as e:
         print(str(e))
         sys.exit(1)
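
Together with the pipeline changes below, this removal splits the work between the two scripts: post.py now handles only the perf results, while parse_mem_concurrency_test.py handles the memory and concurrency logs. The artifact layout implied by the `-r` arguments in the YAML (inferred, not spelled out in the commit) is:

```
Artifact/
  result/               # post.py -r points here; one subfolder per model group
    <model_group>/*.csv
  result_mem_test/      # parse_mem_concurrency_test.py -r points here
    valgrind.log
    concurrency_test.log
```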
39 changes: 30 additions & 9 deletions tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -1,7 +1,7 @@
 parameters:
 
 - name: PostToDashboard
-  displayName: Post to Dashboard
+  displayName: Post EP Perf results to Dashboard
   type: boolean
   default: true
 
@@ -30,7 +30,7 @@ parameters:
     - "partner-models"
 
 - name: MemTest
-  displayName: Run Memory Test and Concurrency Test
+  displayName: Run Memory and Concurrency Test
   type: boolean
   default: true
 
@@ -147,11 +147,27 @@ jobs:
         workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
         condition: always()
 
-      - task: PublishBuildArtifacts@1
-        inputs:
-          pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
-          artifactName: 'result-$(Build.BuildNumber)'
-
+      - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
+        displayName: 'Install dashboard dependencies'
+
+      - script: |
+          az --version || {
+            echo "Azure CLI not found, installing..."
+            curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+          }
+        displayName: 'Check and Install Azure CLI'
+
+      - task: AzureCLI@2
+        displayName: 'Parse Memory & Concurrency Test Records and Sync'
+        inputs:
+          azureSubscription: AIInfraBuildOnnxRuntimeOSS
+          scriptLocation: inlineScript
+          scriptType: bash
+          inlineScript: |
+            short_hash=$(git rev-parse --short HEAD) &&
+            commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
+            python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py -r $(Build.SourcesDirectory)/Artifact/result_mem_test -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
       - ${{ if eq(parameters.PostToDashboard, true) }}:
 
         - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
@@ -165,15 +181,20 @@ jobs:
          displayName: 'Check and Install Azure CLI'
        - task: AzureCLI@2
-          displayName: 'Post EP Perf Results to Dashboard'
+          displayName: 'Azure CLI Post to Dashboard'
          inputs:
            azureSubscription: AIInfraBuildOnnxRuntimeOSS
            scriptLocation: inlineScript
            scriptType: bash
            inlineScript: |
              short_hash=$(git rev-parse --short HEAD) &&
              commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
-              python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+              python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
+      - task: PublishBuildArtifacts@1
+        inputs:
+          pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
+          artifactName: 'result-$(Build.BuildNumber)'
 
   - template: templates/component-governance-component-detection-steps.yml
     parameters :
