From ddb509d0cb5ab206539699decfd9726dc269347f Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Fri, 9 Jun 2023 13:46:33 -0700 Subject: [PATCH] Update perf script --- .../parallel-computing-with-dask.ipynb | 5 + docs/examples/parallel-computing-with-dask.py | 181 +++++++++++++----- 2 files changed, 133 insertions(+), 53 deletions(-) diff --git a/docs/examples/parallel-computing-with-dask.ipynb b/docs/examples/parallel-computing-with-dask.ipynb index 18ba7643..70cf950b 100644 --- a/docs/examples/parallel-computing-with-dask.ipynb +++ b/docs/examples/parallel-computing-with-dask.ipynb @@ -2534,6 +2534,11 @@ "source": [ "### Performance Metrics\n", "\n", + "Hardware\n", + " - Memory (GiB): 1000\n", + " - CPU Model: Intel(R) Xeon(R) CPU E7-8890 v4 @ 2.20GHz\n", + " - CPUs: 192 (2 threads x 24 cores x 4 sockets)\n", + "\n", "Factors\n", "\n", "- Data size\n", diff --git a/docs/examples/parallel-computing-with-dask.py b/docs/examples/parallel-computing-with-dask.py index 1c8c7cfc..abf561d3 100644 --- a/docs/examples/parallel-computing-with-dask.py +++ b/docs/examples/parallel-computing-with-dask.py @@ -1,18 +1,13 @@ # %% +import collections import timeit -from typing import List, Tuple +from typing import DefaultDict, Dict, List, Literal -import cdms2 -import cdutil import numpy as np -import xarray as xr -import xcdat as xc # %% -def get_runtime( - setup: str, stmt: str, repeat: int = 5, number: int = 1 -) -> Tuple[float, float]: +def get_runtime(setup: str, stmt: str, repeat: int = 5, number: int = 1) -> float: """Get the runtime for a code statement using timeit. This function takes the lowest performance value for each sample. 
@@ -85,61 +80,141 @@ def get_runtime(
     return min


-# %% STATIC VARIABLES
-# -------------------
-DIR = (
-    "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/HadGEM3-GC31-MM/historical/r2i1p1f3"
-    "/day/ta/gn/v20191218"
-)
-VAR = "ta"
+# %% xCDAT
+# ----------
+def get_xcdat_runtimes(
+    dir_path: str,
+    var: str,
+    type: Literal["serial", "parallel"],
+) -> Dict[str, float]:
+    """Get the xCDAT runtimes for spatial and temporal averaging.
+
+    Parameters
+    ----------
+    dir_path : str
+        The path to the directory containing `.nc` datasets.
+    var : str
+        The variable to operate on.
+    type : Literal["serial", "parallel"]
+        Whether to run the API serially or in parallel.
+
+    Returns
+    -------
+    Dict[str, float]
+        A dictionary mapping the API to the runtime.
+    """
+    if type == "serial":
+        chunks = None
+        use_flox = "with xr.set_options(use_flox=False): \n    "
+    elif type == "parallel":
+        chunks = "auto"
+        use_flox = "with xr.set_options(use_flox=True): \n    "
+
+    setup = (
+        "import xcdat as xc\n"
+        "import xarray as xr\n"
+        f"ds = xc.open_mfdataset(f'{dir_path}/*.nc', chunks={chunks})\n"
+    )
+    api_calls = {
+        "spatial_avg": f"ds.spatial.average('{var}')",
+        "temporal_avg": f"ds.temporal.average('{var}', weighted=True)",
+        "climatology": f"ds.temporal.climatology('{var}', freq='month', weighted=True)",
+        # "departures": f"ds.temporal.departures('{var}', freq='month', weighted=True)",
+    }

-# xCDAT API Calls for `get_runtime`
-XC_SPATIAL_AVG_STMT = f"ds.spatial.average('{VAR}')"
-XC_TEMP_AVG_STMT = f"ds.temporal.average('{VAR}', weighted=True)"
-XC_CLIMATOLOGY_STMT = f"ds.temporal.climatology('{VAR}', freq='month', weighted=True)"
-XC_DEPARTURES_STMT = f"ds.temporal.depatures('{VAR}', freq='month', weighted=True)"
+    runtimes = {}
+    for api, stmt in api_calls.items():
+        stmt = use_flox + stmt
+        runtimes[api] = get_runtime(setup, stmt, repeat=1)

+    return runtimes

-# %% xCDAT SERIAL
-# -------------------
-# Dataset object stored here for inspection. 
It is not actually used in the -# performance benchmarking. -# ds_serial = xc.open_mfdataset(f"{DIR}/*.nc", chunks="auto") -XC_SERIAL_SETUP = ( - "import xcdat as xc\n" f"ds = xc.open_mfdataset(f'{DIR}/*.nc', chunks=None)\n" -) -serial_results = { - "spatial_avg": get_runtime(XC_SERIAL_SETUP, XC_SPATIAL_AVG_STMT, repeat=5), - "temporal_avg": get_runtime(XC_SERIAL_SETUP, XC_TEMP_AVG_STMT, repeat=5), - "climatology": get_runtime(XC_SERIAL_SETUP, XC_CLIMATOLOGY_STMT, repeat=5), - "departures": get_runtime(XC_SERIAL_SETUP, XC_DEPARTURES_STMT, repeat=5), +# %% +FILEPATHS = { + "7 GB": "/p/css03/esgf_publish/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/tas/gn/v20190308/", + # "17 GB": "/p/css03/cmip5_css01/data/cmip5/output1/CNRM-CERFACS/CNRM-CM5/historical/day/atmos/day/r1i1p1/v20120530/ta/", + # "12 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MRI/MRI-ESM2-0/amip/r1i1p1f1/3hr/tas/gn/v20190829/", + # "22 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/UKESM1-0-LL/historical/r5i1p1f3/day/ta/gn/v20191115/", + # "50 GB": "/p/css03/esgf_publish/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/ta/gn/v20190308/", + # "74 GB": "/p/css03/esgf_publish/CMIP6/CMIP/CCCma/CanESM5/historical/r1i1p2f1/CFday/ta/gn/v20190429/", + # "105 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/HadGEM3-GC31-MM/historical/r2i1p1f3/day/ta/gn/v20191218", } -# %% xCDAT Parallel -# --------------------- -# 245 GB -> 125MB chunk sizes -# Dataset object stored here for inspection. It is not actually used in the -# performance benchmarking. 
-# ds_parallel = xc.open_mfdataset(f"{DIR}/*.nc", chunks="auto")
-XC_PARALLEL_SETUP = (
-    "import xcdat as xc\n" f"ds = xc.open_mfdataset(f'{DIR}/*.nc', chunks='auto')\n"
+# %%
+xcdat_serial_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict(
+    dict
+)
+xcdat_parallel_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict(
+    dict
 )
-parallel_results = {
-    "spatial_avg": get_runtime(XC_PARALLEL_SETUP, XC_SPATIAL_AVG_STMT, repeat=5),
-    "temporal_avg": get_runtime(XC_PARALLEL_SETUP, XC_TEMP_AVG_STMT, repeat=5),
-    "climatology": get_runtime(XC_PARALLEL_SETUP, XC_CLIMATOLOGY_STMT, repeat=5),
-    "departures": get_runtime(XC_PARALLEL_SETUP, XC_DEPARTURES_STMT, repeat=5),
-}
+
+for filesize, path in FILEPATHS.items():
+    xcdat_serial_runtimes[filesize] = get_xcdat_runtimes(path, "tas", "serial")
+    # xcdat_runtimes[filesize]["parallel"] = get_xcdat_runtimes(path, "tas", "parallel")
+

 # %% CDMS2 (serial)
 # ---------------------
-C_SPATIAL_AVG_STMT = f"cdutil.averager(t_var, axis='xy')"
-C_TEMP_AVG_STMT = f"cdutil.averager(t_var, axis='t')"
-C_CLIMATOLOGY_STMT = f"cdutil.ANNUALCYCLE.climatoloy(t_var)"
-C_DEPARTURES_STMT = f"cdutil.ANNUALCYCLE.depatures(t_var)"
+def get_cdms2_runtimes(
+    cdml_filepath: str, var: str, repeat: int = 1
+) -> Dict[str, float]:
+    """Get the cdms2 runtimes for spatial and temporal averaging.
+
+    Parameters
+    ----------
+    cdml_filepath : str
+        The path to the CDML file that maps to a multi-file dataset.
+    var : str
+        The variable to operate on.
+    repeat : int
+        Number of samples to take for each API call, by default 1.
+
+
+    Returns
+    -------
+    Dict[str, float]
+        A dictionary mapping the API to the runtime. 
+ """ + setup = ( + "import cdms2\n" + "import cdutil\n" + f"ds = cdms2.open('{cdml_filepath}')\n" + f"t_var = ds['{var}']" + ) + api_calls = { + "spatial_avg": "cdutil.averager(t_var, axis='xy')", + "temporal_avg": "cdutil.averager(t_var, axis='t')", + "climatology": "cdutil.ANNUALCYCLE.climatology(t_var)", + # "departures": "cdutil.ANNUALCYCLE.departures(t_var)", + } + + runtimes = {} + for api, stmt in api_calls.items(): + runtimes[api] = get_runtime(setup, stmt, repeat=repeat) + + return runtimes + + +# %% +# They are stored here for data on climate machines: /p/user_pub/xclim +# You can also generate them from the command line cdscan -x myxml.xml /full/path/to/file/*nc +# /p/user_pub/xclim/$MIP_ERA/$ACTIVITY/$EXPERIMENT/$REALM/$FREQUENCY/$VARIABLE/ +# filename: MIP_ERA.ACTIVITY.EXPERIMENT.INSTITUTION.MODEL.MEMBER.FREQUENCY.VARIABLE.REALM.GRID.VERSION.FLAGS.LATEST.xml +XML_FILEPATHS = { + "7 GB": "/home/vo13/xCDAT/xcdat/input/485-xml/CMIP6.CMIP.historical.NCAR.CESM2.r1i1p1f1.day.tas.gn.v20190308.0000000.0.xml", + # "105 GB": "/p/user_pub/xclim/CMIP6/CMIP/historical/atmos/day/ta/" + # "CMIP6.CMIP.historical.MOHC.HadGEM3-GC31-MM.r2i1p1f3.day.ta.atmos.glb-p8-gn.v20191218.0000000.0.xml", +} + + +# %% +cdms2_serial_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict( + dict +) + +for filesize, path in XML_FILEPATHS.items(): + cdms2_serial_runtimes[filesize] = get_cdms2_runtimes(path, "tas") -ds_cdms = cdms2(DIR) -t_var = ds_cdms(VAR) # %%