Update perf script
tomvothecoder committed Oct 11, 2023
1 parent d061826 commit ddb509d
Showing 2 changed files with 133 additions and 53 deletions.
5 changes: 5 additions & 0 deletions docs/examples/parallel-computing-with-dask.ipynb
@@ -2534,6 +2534,11 @@
"source": [
"### Performance Metrics\n",
"\n",
"Hardware\n",
" - Memory (GiB): 1000\n",
" - CPU Model: Intel(R) Xeon(R) CPU E7-8890 v4 @ 2.20GHz\n",
" - CPUs: 192 (2 threads x 24 cores x 4 sockets)\n",
"\n",
"Factors\n",
"\n",
"- Data size\n",
181 changes: 128 additions & 53 deletions docs/examples/parallel-computing-with-dask.py
@@ -1,18 +1,13 @@
# %%
import collections
import timeit
from typing import List, Tuple
from typing import DefaultDict, Dict, List, Literal

import cdms2
import cdutil
import numpy as np
import xarray as xr
import xcdat as xc


# %%
def get_runtime(
    setup: str, stmt: str, repeat: int = 5, number: int = 1
) -> Tuple[float, float]:
def get_runtime(setup: str, stmt: str, repeat: int = 5, number: int = 1) -> float:
"""Get the runtime for a code statement using timeit.
This function takes the lowest performance value for each sample.
@@ -85,61 +80,141 @@ def get_runtime(
    return min
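
# Hypothetical sketch (not part of this commit): the body of `get_runtime` is
# collapsed in the diff above. A minimal implementation consistent with its
# signature and docstring could look like this:
def _get_runtime_sketch(setup: str, stmt: str, repeat: int = 5, number: int = 1) -> float:
    # `timeit.repeat` returns one total runtime (in seconds) per sample;
    # with number=1, each sample executes `stmt` exactly once.
    samples = timeit.repeat(setup=setup, stmt=stmt, repeat=repeat, number=number)
    # The minimum sample is the least noise-affected estimate of the runtime.
    return min(samples)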


# %% STATIC VARIABLES
# -------------------
DIR = (
    "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/HadGEM3-GC31-MM/historical/r2i1p1f3"
    "/day/ta/gn/v20191218"
)
VAR = "ta"
# %% xCDAT
# ----------
def get_xcdat_runtimes(
    dir_path: str,
    var: str,
    type: Literal["serial", "parallel"],
) -> Dict[str, float]:
"""Get the cdms2 runtimes for spatial and temporal averaging.
Parameters
----------
dir_path : str
The path to the directory containing `.nc` datasets.
var : str
The variable to operate on.
type : Literal["serial", "parallel"]
Whether to run the API serially or in parallel.
Returns
-------
Dict[str, float]
A dictionary mapping the API to the runtime.
"""
    if type == "serial":
        chunks = None
        use_flox = "with xr.set_options(use_flox=False): \n "
    elif type == "parallel":
        chunks = "auto"
        use_flox = "with xr.set_options(use_flox=True): \n "

    setup = (
        "import xcdat as xc\n"
        "import xarray as xr\n"
        f"ds = xc.open_mfdataset('{dir_path}/*.nc', chunks={chunks!r})\n"
    )
    api_calls = {
        "spatial_avg": f"ds.spatial.average('{var}')",
        "temporal_avg": f"ds.temporal.average('{var}', weighted=True)",
        "climatology": f"ds.temporal.climatology('{var}', freq='month', weighted=True)",
        # "departures": f"ds.temporal.departures('{var}', freq='month', weighted=True)",
    }

# xCDAT API Calls for `get_runtime`
XC_SPATIAL_AVG_STMT = f"ds.spatial.average('{VAR}')"
XC_TEMP_AVG_STMT = f"ds.temporal.average('{VAR}', weighted=True)"
XC_CLIMATOLOGY_STMT = f"ds.temporal.climatology('{VAR}', freq='month', weighted=True)"
XC_DEPARTURES_STMT = f"ds.temporal.departures('{VAR}', freq='month', weighted=True)"
    runtimes = {}
    for api, stmt in api_calls.items():
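        # Prepend the `xr.set_options` context manager; the statement is
        # indented (via `use_flox`) so it executes inside the `with` block.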
        stmt = use_flox + stmt
        runtimes[api] = get_runtime(setup, stmt, repeat=1)

    return runtimes

# %% xCDAT SERIAL
# -------------------
# Dataset object stored here for inspection. It is not actually used in the
# performance benchmarking.
# ds_serial = xc.open_mfdataset(f"{DIR}/*.nc", chunks=None)

XC_SERIAL_SETUP = (
    "import xcdat as xc\n" f"ds = xc.open_mfdataset(f'{DIR}/*.nc', chunks=None)\n"
)
serial_results = {
    "spatial_avg": get_runtime(XC_SERIAL_SETUP, XC_SPATIAL_AVG_STMT, repeat=5),
    "temporal_avg": get_runtime(XC_SERIAL_SETUP, XC_TEMP_AVG_STMT, repeat=5),
    "climatology": get_runtime(XC_SERIAL_SETUP, XC_CLIMATOLOGY_STMT, repeat=5),
    "departures": get_runtime(XC_SERIAL_SETUP, XC_DEPARTURES_STMT, repeat=5),
# %%
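# Local paths to the multi-file datasets under test, keyed by total size.
# Only the 7 GB dataset is active here; the larger runs are commented out.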
FILEPATHS = {
"7 GB": "/p/css03/esgf_publish/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/tas/gn/v20190308/",
# "17 GB": "/p/css03/cmip5_css01/data/cmip5/output1/CNRM-CERFACS/CNRM-CM5/historical/day/atmos/day/r1i1p1/v20120530/ta/",
# "12 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MRI/MRI-ESM2-0/amip/r1i1p1f1/3hr/tas/gn/v20190829/",
# "22 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/UKESM1-0-LL/historical/r5i1p1f3/day/ta/gn/v20191115/",
# "50 GB": "/p/css03/esgf_publish/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/ta/gn/v20190308/",
# "74 GB": "/p/css03/esgf_publish/CMIP6/CMIP/CCCma/CanESM5/historical/r1i1p2f1/CFday/ta/gn/v20190429/",
# "105 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/HadGEM3-GC31-MM/historical/r2i1p1f3/day/ta/gn/v20191218",
}

# %% xCDAT Parallel
# ---------------------
# 245 GB -> 125MB chunk sizes
# Dataset object stored here for inspection. It is not actually used in the
# performance benchmarking.
# ds_parallel = xc.open_mfdataset(f"{DIR}/*.nc", chunks="auto")
XC_PARALLEL_SETUP = (
    "import xcdat as xc\n" f"ds = xc.open_mfdataset(f'{DIR}/*.nc', chunks='auto')\n"
# %%
xcdat_serial_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict(
    dict
)
xcdat_parallel_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict(
    dict
)
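# Each maps file size (e.g., "7 GB") -> {API name: runtime in seconds}.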
parallel_results = {
    "spatial_avg": get_runtime(XC_PARALLEL_SETUP, XC_SPATIAL_AVG_STMT, repeat=5),
    "temporal_avg": get_runtime(XC_PARALLEL_SETUP, XC_TEMP_AVG_STMT, repeat=5),
    "climatology": get_runtime(XC_PARALLEL_SETUP, XC_CLIMATOLOGY_STMT, repeat=5),
    "departures": get_runtime(XC_PARALLEL_SETUP, XC_DEPARTURES_STMT, repeat=5),
}

for filesize, path in FILEPATHS.items():
    xcdat_serial_runtimes[filesize] = get_xcdat_runtimes(path, "tas", "serial")
    # xcdat_parallel_runtimes[filesize] = get_xcdat_runtimes(path, "tas", "parallel")


# %% CDMS2 (serial)
# ---------------------
C_SPATIAL_AVG_STMT = f"cdutil.averager(t_var, axis='xy')"
C_TEMP_AVG_STMT = f"cdutil.averager(t_var, axis='t')"
C_CLIMATOLOGY_STMT = f"cdutil.ANNUALCYCLE.climatology(t_var)"
C_DEPARTURES_STMT = f"cdutil.ANNUALCYCLE.departures(t_var)"
def get_cdms2_runtimes(
    cdml_filepath: str, var: str, repeat: int = 1
) -> Dict[str, float]:
"""Get the cdms2 runtimes for spatial and temporal averaging.
Parameters
----------
xml_path : str
The path to the CDML file that maps to a multi-file dataset.
var : str
The variable to operate on.
repeat : int
Number of samples to take for each API call, by default 1.
Returns
-------
Dict[str, float]
A dictionary mapping the API to the runtime.
"""
    setup = (
        "import cdms2\n"
        "import cdutil\n"
        f"ds = cdms2.open('{cdml_filepath}')\n"
        f"t_var = ds['{var}']"
    )
    api_calls = {
        "spatial_avg": "cdutil.averager(t_var, axis='xy')",
        "temporal_avg": "cdutil.averager(t_var, axis='t')",
        "climatology": "cdutil.ANNUALCYCLE.climatology(t_var)",
        # "departures": "cdutil.ANNUALCYCLE.departures(t_var)",
    }

    runtimes = {}
    for api, stmt in api_calls.items():
        runtimes[api] = get_runtime(setup, stmt, repeat=repeat)

    return runtimes


# %%
# CDML files for data on the climate machines are stored under /p/user_pub/xclim.
# You can also generate them from the command line: cdscan -x myxml.xml /full/path/to/files/*.nc
# Directory structure: /p/user_pub/xclim/$MIP_ERA/$ACTIVITY/$EXPERIMENT/$REALM/$FREQUENCY/$VARIABLE/
# Filename format: MIP_ERA.ACTIVITY.EXPERIMENT.INSTITUTION.MODEL.MEMBER.FREQUENCY.VARIABLE.REALM.GRID.VERSION.FLAGS.LATEST.xml
XML_FILEPATHS = {
"7 GB": "/home/vo13/xCDAT/xcdat/input/485-xml/CMIP6.CMIP.historical.NCAR.CESM2.r1i1p1f1.day.tas.gn.v20190308.0000000.0.xml",
# "105 GB": "/p/user_pub/xclim/CMIP6/CMIP/historical/atmos/day/ta/"
# "CMIP6.CMIP.historical.MOHC.HadGEM3-GC31-MM.r2i1p1f3.day.ta.atmos.glb-p8-gn.v20191218.0000000.0.xml",
}


# %%
cdms2_serial_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict(
    dict
)

for filesize, path in XML_FILEPATHS.items():
    cdms2_serial_runtimes[filesize] = get_cdms2_runtimes(path, "tas")

ds_cdms = cdms2.open(DIR)
t_var = ds_cdms(VAR)

# %%
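# Hypothetical follow-up (not part of this commit): print the collected
# serial runtimes side by side for a quick xCDAT vs. cdms2 comparison.
for size, xc_times in xcdat_serial_runtimes.items():
    print(f"{size}:")
    for api, xc_time in xc_times.items():
        c_time = cdms2_serial_runtimes.get(size, {}).get(api, float("nan"))
        print(f"  {api}: xcdat={xc_time:.2f}s, cdms2={c_time:.2f}s")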
