-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
tests: parametrize bench mark tests #4974
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
import json | ||
import logging | ||
import platform | ||
import re | ||
import shutil | ||
from pathlib import Path | ||
|
||
|
@@ -17,78 +18,114 @@ | |
LOGGER = logging.getLogger(__name__) | ||
|
||
|
||
def get_executables(): | ||
""" | ||
Get a list of binaries for benchmarking | ||
""" | ||
|
||
# Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the | ||
# usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we | ||
# extract the path to the compiled benchmark binary. | ||
_, stdout, _ = cargo( | ||
"bench", | ||
f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run", | ||
) | ||
|
||
executables = [] | ||
for line in stdout.split("\n"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: could be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fix in b95900e And I realized there are mamy same implementation. https://github.com/search?q=repo%3Afirecracker-microvm%2Ffirecracker%20split(%22%5Cn%22)&type=code |
||
if line: | ||
msg = json.loads(line) | ||
executable = msg.get("executable") | ||
if executable: | ||
executables.append(executable) | ||
|
||
return executables | ||
|
||
|
||
@pytest.mark.no_block_pr | ||
@pytest.mark.timeout(900) | ||
def test_no_regression_relative_to_target_branch(): | ||
@pytest.mark.timeout(600) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. from the buildkite run, it seems like the longest duration of one of these is 150s for the queue benchmarks, so I think we can actually drop this timeout marker altogether and just rely on the default timeout specified in pytest.ini (which is 300s) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fix in 215af23 |
||
@pytest.mark.parametrize("executable", get_executables()) | ||
def test_no_regression_relative_to_target_branch(executable): | ||
""" | ||
Run the microbenchmarks in this repository, comparing results from pull | ||
request target branch against what's achieved on HEAD | ||
""" | ||
run_criterion = get_run_criterion(executable) | ||
compare_results = get_compare_results(executable) | ||
git_ab_test(run_criterion, compare_results) | ||
|
||
|
||
def run_criterion(firecracker_checkout: Path, is_a: bool) -> Path: | ||
def get_run_criterion(executable): | ||
""" | ||
Executes all benchmarks by running "cargo bench --no-run", finding the executables, and running them pinned to some CPU | ||
Get function that executes specified benchmarks, and running them pinned to some CPU | ||
""" | ||
baseline_name = "a_baseline" if is_a else "b_baseline" | ||
|
||
with contextlib.chdir(firecracker_checkout): | ||
# Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the | ||
# usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we | ||
# extract the path to the compiled benchmark binary. | ||
_, stdout, _ = cargo( | ||
"bench", | ||
f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run", | ||
) | ||
|
||
executables = [] | ||
for line in stdout.split("\n"): | ||
if line: | ||
msg = json.loads(line) | ||
executable = msg.get("executable") | ||
if executable: | ||
executables.append(executable) | ||
def _run_criterion(firecracker_checkout: Path, is_a: bool) -> Path: | ||
baseline_name = "a_baseline" if is_a else "b_baseline" | ||
|
||
for executable in executables: | ||
with contextlib.chdir(firecracker_checkout): | ||
utils.check_output( | ||
f"CARGO_TARGET_DIR=build/cargo_target taskset -c 1 {executable} --bench --save-baseline {baseline_name}" | ||
) | ||
|
||
return firecracker_checkout / "build" / "cargo_target" / "criterion" | ||
return firecracker_checkout / "build" / "cargo_target" / "criterion" | ||
|
||
return _run_criterion | ||
|
||
|
||
def get_compare_results(executable): | ||
""" | ||
Get function that compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main | ||
""" | ||
|
||
def compare_results(location_a_baselines: Path, location_b_baselines: Path): | ||
"""Compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main""" | ||
for benchmark in location_b_baselines.glob("*"): | ||
data = json.loads( | ||
(benchmark / "b_baseline" / "estimates.json").read_text("utf-8") | ||
def _compare_results(location_a_baselines: Path, location_b_baselines: Path): | ||
|
||
list_result = utils.check_output( | ||
f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --list" | ||
) | ||
|
||
average_ns = data["mean"]["point_estimate"] | ||
# Format a string like `page_fault #2: benchmark` to a string like `page_fault_2`. | ||
# Because under `cargo_target/criterion/`, a directory like `page_fault_2` will create. | ||
bench_marks = [ | ||
re.sub(r"\s#(?P<sub_id>[1-9]+)", r"_\g<sub_id>", i.split(":")[0]) | ||
for i in list_result.stdout.split("\n") | ||
if i.endswith(": benchmark") | ||
] | ||
|
||
for benchmark in bench_marks: | ||
data = json.loads( | ||
( | ||
location_b_baselines / benchmark / "b_baseline" / "estimates.json" | ||
).read_text("utf-8") | ||
) | ||
|
||
LOGGER.info("%s mean: %iµs", benchmark.name, average_ns / 1000) | ||
average_ns = data["mean"]["point_estimate"] | ||
|
||
# Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here | ||
# to do the comparison | ||
for benchmark in location_a_baselines.glob("*"): | ||
shutil.copytree( | ||
benchmark / "a_baseline", | ||
location_b_baselines / benchmark.name / "a_baseline", | ||
LOGGER.info("%s mean: %iµs", benchmark, average_ns / 1000) | ||
|
||
# Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here | ||
# to do the comparison | ||
|
||
for benchmark in bench_marks: | ||
shutil.copytree( | ||
location_a_baselines / benchmark / "a_baseline", | ||
location_b_baselines / benchmark / "a_baseline", | ||
) | ||
|
||
bench_result = utils.check_output( | ||
f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --baseline a_baseline --load-baseline b_baseline", | ||
True, | ||
Path.cwd().parent, | ||
) | ||
|
||
_, stdout, _ = cargo( | ||
"bench", | ||
f"--all --target {platform.machine()}-unknown-linux-musl", | ||
"--baseline a_baseline --load-baseline b_baseline", | ||
) | ||
regressions_only = "\n\n".join( | ||
result | ||
for result in bench_result.stdout.split("\n\n") | ||
if "Performance has regressed." in result | ||
) | ||
|
||
regressions_only = "\n\n".join( | ||
result | ||
for result in stdout.split("\n\n") | ||
if "Performance has regressed." in result | ||
) | ||
# If this string is anywhere in stdout, then at least one of our benchmarks | ||
# is now performing worse with the PR changes. | ||
assert not regressions_only, "\n" + regressions_only | ||
|
||
# If this string is anywhere in stdout, then at least one of our benchmarks | ||
# is now performing worse with the PR changes. | ||
assert not regressions_only, "\n" + regressions_only | ||
return _compare_results |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mhh, I don't think this does what we want. We precompile the executables ones (from the PR branch), and then we use this precompiled executable for both A and B runs. What need to do though is compile each benchmark twice, ones from the main branch and once from the PR branch, so that this test does a meaningful comparison :/ That's why in #4832 I suggested to use
--list-only
or something: determine the names of the benchmarks here, and then compile them twice in_run_criterion
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I think I misunderstood a bit how it works.
Let me confirm the modifications.
First, run
cargo bench --all -- --list
to generate parameters to pass topytest.parametrize
.I will get the following output:
And then, I will get benchmark name...for example
queue_pop_16
.Finally run a command like
cargo bench --all -- queue_pop_16
in_run_criterion
.Is this correct?