Merge pull request #77 from FlorianDeconinck/qol/translate_report
[QOL] Break Translate error reports into terminal & files
FlorianDeconinck authored Sep 30, 2024
2 parents 4a4c0c8 + 8a719ef commit a667d29
Showing 2 changed files with 125 additions and 89 deletions.
19 changes: 15 additions & 4 deletions ndsl/stencils/testing/test_translate.py
@@ -210,13 +210,19 @@ def test_sequential_savepoint(
near_zero=case.testobj.near_zero,
)
if not metric.check:
os.makedirs(OUTDIR, exist_ok=True)
log_filename = os.path.join(
OUTDIR,
f"details-{case.savepoint_name}-{varname}-rank{case.rank}.log",
)
metric.report(log_filename)
pytest.fail(str(metric), pytrace=False)
passing_names.append(failing_names.pop())
ref_data_out[varname] = [ref_data]
if len(failing_names) > 0:
get_thresholds(case.testobj, input_data=original_input_data)
os.makedirs(OUTDIR, exist_ok=True)
out_filename = os.path.join(OUTDIR, f"translate-{case.savepoint_name}.nc")
nc_filename = os.path.join(OUTDIR, f"translate-{case.savepoint_name}.nc")
input_data_on_host = {}
for key, _input in input_data.items():
input_data_on_host[key] = gt_utils.asarray(_input)
@@ -226,7 +232,7 @@ def test_sequential_savepoint(
[output],
ref_data_out,
failing_names,
out_filename,
nc_filename,
)
if failing_names != []:
pytest.fail(
@@ -353,11 +359,16 @@ def test_parallel_savepoint(
near_zero=case.testobj.near_zero,
)
if not metric.check:
os.makedirs(OUTDIR, exist_ok=True)
log_filename = os.path.join(
OUTDIR, f"details-{case.savepoint_name}-{varname}.log"
)
metric.report(log_filename)
pytest.fail(str(metric), pytrace=False)
passing_names.append(failing_names.pop())
if len(failing_names) > 0:
os.makedirs(OUTDIR, exist_ok=True)
out_filename = os.path.join(
nct_filename = os.path.join(
OUTDIR, f"translate-{case.savepoint_name}-{case.grid.rank}.nc"
)
try:
@@ -370,7 +381,7 @@
[output],
ref_data,
failing_names,
out_filename,
nct_filename,
)
except Exception as error:
print(f"TestParallel SaveNetCDF Error: {error}")
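Both the sequential and parallel savepoint tests above now share the same failure path: create the output directory, write a per-variable detail log via the new metric.report(...), then fail the test with the short terminal summary. A condensed sketch of that pattern follows; OUTDIR is given an assumed value here, and case, varname, and metric stand in for the test fixtures:

import os

import pytest


OUTDIR = "./.translate_errors"  # assumed value; the real tests define their own OUTDIR


def fail_with_report(case, varname, metric):
    """Sketch of the failure path this commit adds to the translate tests."""
    if not metric.check:
        os.makedirs(OUTDIR, exist_ok=True)
        log_filename = os.path.join(
            OUTDIR,
            f"details-{case.savepoint_name}-{varname}-rank{case.rank}.log",
        )
        metric.report(log_filename)  # full, untruncated report written to disk
        pytest.fail(str(metric), pytrace=False)  # truncated report in the terminal
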
195 changes: 110 additions & 85 deletions ndsl/testing/comparison.py
@@ -1,4 +1,4 @@
from typing import Union
from typing import List, Optional, Union

import numpy as np
import numpy.typing as npt
@@ -20,6 +20,9 @@ def __str__(self) -> str:
def __repr__(self) -> str:
...

def report(self, file_path: Optional[str] = None) -> List[str]:
...


class LegacyMetric(BaseMetric):
"""Legacy (AI2) metric used for original FV3 port.
@@ -88,67 +91,78 @@ def _compute_errors(
)
return success

def __str__(self) -> str:
return self.__repr__()

def __repr__(self) -> str:
def report(self, file_path: Optional[str] = None) -> List[str]:
report = []
if self.check:
return "✅ No numerical differences"
report.append("✅ No numerical differences")
else:
report.append("❌ Numerical failures")

found_indices = np.logical_not(self.success).nonzero()
computed_failures = self.computed[found_indices]
reference_failures = self.references[found_indices]

# List all errors
bad_indices_count = len(found_indices[0])
# Determine worst result
worst_metric_err = 0.0
abs_errs = []
details = [
"All failures:",
"Index Computed Reference Absloute E Metric E",
]
for b in range(bad_indices_count):
full_index = tuple([f[b] for f in found_indices])

metric_err = self._calculated_metric[full_index]

absolute_distance = abs(computed_failures[b] - reference_failures[b])
abs_errs.append(absolute_distance)

details.append(
f"{full_index} {computed_failures[b]} "
f"{reference_failures[b]} {abs_errs[-1]:.3e} {metric_err:.3e}"
)

report = []
report.append("❌ Numerical failures")

found_indices = np.logical_not(self.success).nonzero()
computed_failures = self.computed[found_indices]
reference_failures = self.references[found_indices]

# List all errors
bad_indices_count = len(found_indices[0])
# Determine worst result
worst_metric_err = 0.0
abs_errs = []
details = [
"All failures:",
"Index Computed Reference Absloute E Metric E",
]
for b in range(bad_indices_count):
full_index = tuple([f[b] for f in found_indices])

metric_err = self._calculated_metric[full_index]

absolute_distance = abs(computed_failures[b] - reference_failures[b])
abs_errs.append(absolute_distance)

details.append(
f"{full_index} {computed_failures[b]} "
f"{reference_failures[b]} {abs_errs[-1]:.3e} {metric_err:.3e}"
if np.isnan(metric_err) or (abs(metric_err) > abs(worst_metric_err)):
worst_metric_err = metric_err
worst_full_idx = full_index
worst_abs_err = abs_errs[-1]
computed_worst = computed_failures[b]
reference_worst = reference_failures[b]
# Try to quantify noisy errors
unique_errors = len(np.unique(np.array(abs_errs)))
# Summary and worst result
fullcount = len(self.references.flatten())
report.append(
f"Failed count: {bad_indices_count}/{fullcount} "
f"({round(100.0 * (bad_indices_count / fullcount), 2)}%),\n"
f"Worst failed index {worst_full_idx}\n"
f" Computed:{computed_worst}\n"
f" Reference: {reference_worst}\n"
f" Absolute diff: {worst_abs_err:.3e}\n"
f" Metric diff: {worst_metric_err:.3e}\n"
f" Metric threshold: {self.eps}\n"
f" Noise quantification:\n"
f" Reference dtype: {type(reference_worst)}\n"
f" Unique errors: {unique_errors}/{bad_indices_count}"
)
report.extend(details)

if np.isnan(metric_err) or (abs(metric_err) > abs(worst_metric_err)):
worst_metric_err = metric_err
worst_full_idx = full_index
worst_abs_err = abs_errs[-1]
computed_worst = computed_failures[b]
reference_worst = reference_failures[b]
# Try to quantify noisy errors
unique_errors = len(np.unique(np.array(abs_errs)))
# Summary and worst result
fullcount = len(self.references.flatten())
report.append(
f"Failed count: {bad_indices_count}/{fullcount} "
f"({round(100.0 * (bad_indices_count / fullcount), 2)}%),\n"
f"Worst failed index {worst_full_idx}\n"
f" Computed:{computed_worst}\n"
f" Reference: {reference_worst}\n"
f" Absolute diff: {worst_abs_err:.3e}\n"
f" Metric diff: {worst_metric_err:.3e}\n"
f" Metric threshold: {self.eps}\n"
f" Noise quantification:\n"
f" Reference dtype: {type(reference_worst)}\n"
f" Unique errors: {unique_errors}/{bad_indices_count}"
)
report.extend(details)
if file_path:
with open(file_path, "w") as fd:
fd.write("\n".join(report))

return report

def __str__(self) -> str:
return self.__repr__()

def __repr__(self) -> str:
report = self.report()
if len(report) > 30:
report = report[:30] # ~10 first errors
report.append("...")
return "\n".join(report)


@@ -231,36 +245,47 @@ def _compute_all_metrics(
f"recieved data with unexpected dtype {self.references.dtype}"
)

def report(self, file_path: Optional[str] = None) -> List[str]:
report = []
if self.check:
report.append("✅ No numerical differences")
else:
report.append("❌ Numerical failures")

found_indices = np.logical_not(self.success).nonzero()
# List all errors to terminal and file
bad_indices_count = len(found_indices[0])
full_count = len(self.references.flatten())
failures_pct = round(100.0 * (bad_indices_count / full_count), 2)
report = [
f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n",
f"Index Computed Reference "
f"Absolute E(<{self.absolute_eps:.2e}) "
f"Relative E(<{self.relative_fraction * 100:.2e}%) "
f"ULP E(<{self.ulp_threshold})",
]
# Summary and worst result
for iBad in range(bad_indices_count):
fi = tuple([f[iBad] for f in found_indices])
report.append(
f"{str(fi)} {self.computed[fi]:.16e} {self.references[fi]:.16e} "
f"{self.absolute_distance[fi]:.2e} {'✅' if self.absolute_distance_metric[fi] else '❌'} "
f"{self.relative_distance[fi] * 100:.2e} {'✅' if self.relative_distance_metric[fi] else '❌'} "
f"{int(self.ulp_distance[fi]):02} {'✅' if self.ulp_distance_metric[fi] else '❌'} "
)

if file_path:
with open(file_path, "w") as fd:
fd.write("\n".join(report))

return report

def __str__(self) -> str:
return self.__repr__()

def __repr__(self) -> str:
if self.check:
return "✅ No numerical differences"

report = []
report.append("❌ Numerical failures")

found_indices = np.logical_not(self.success).nonzero()
# List all errors
bad_indices_count = len(found_indices[0])
full_count = len(self.references.flatten())
failures_pct = round(100.0 * (bad_indices_count / full_count), 2)
report = [
f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n",
f"Index Computed Reference "
f"Absolute E(<{self.absolute_eps:.2e}) "
f"Relative E(<{self.relative_fraction * 100:.2e}%) "
f"ULP E(<{self.ulp_threshold})",
]
# Summary and worst result
for iBad in range(bad_indices_count):
fi = tuple([f[iBad] for f in found_indices])
report.append(
f"({fi[0]:02}, {fi[1]:02}, {fi[2]:02}) {self.computed[fi]:.16e} {self.references[fi]:.16e} "
f"{self.absolute_distance[fi]:.2e} {'✅' if self.absolute_distance_metric[fi] else '❌'} "
f"{self.relative_distance[fi] * 100:.2e} {'✅' if self.relative_distance_metric[fi] else '❌'} "
f"{int(self.ulp_distance[fi]):02} {'✅' if self.ulp_distance_metric[fi] else '❌'} "
)

report = self.report()
if len(report) > 12:
report = report[:12] # ~10 first errors
report.append("...")
return "\n".join(report)
