Update perf script
tomvothecoder committed Oct 11, 2023
1 parent d061826 commit ddb509d
Showing 2 changed files with 133 additions and 53 deletions.
5 changes: 5 additions & 0 deletions docs/examples/parallel-computing-with-dask.ipynb
@@ -2534,6 +2534,11 @@
"source": [
"### Performance Metrics\n",
"\n",
"Hardware\n",
" - Memory (GiB): 1000\n",
" - CPU Model: Intel(R) Xeon(R) CPU E7-8890 v4 @ 2.20GHz\n",
" - CPUs: 192 (2 threads x 24 cores x 4 sockets)\n",
"\n",
"Factors\n",
"\n",
"- Data size\n",
181 changes: 128 additions & 53 deletions docs/examples/parallel-computing-with-dask.py
@@ -1,18 +1,13 @@
# %%
import collections
import timeit
from typing import List, Tuple
from typing import DefaultDict, Dict, List, Literal

import cdms2
import cdutil
import numpy as np
import xarray as xr
import xcdat as xc


# %%
def get_runtime(
    setup: str, stmt: str, repeat: int = 5, number: int = 1
) -> Tuple[float, float]:
def get_runtime(setup: str, stmt: str, repeat: int = 5, number: int = 1) -> float:
"""Get the runtime for a code statement using timeit.
This function takes the lowest performance value for each sample.
@@ -85,61 +80,141 @@ def get_runtime(
    return min
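
# Hypothetical sketch (not part of this commit): the body of `get_runtime` is
# collapsed in the diff above. A minimal implementation consistent with its
# signature and docstring could look like this:
def _get_runtime_sketch(setup: str, stmt: str, repeat: int = 5, number: int = 1) -> float:
    # `timeit.repeat` returns one total runtime (in seconds) per sample;
    # with number=1, each sample executes `stmt` exactly once.
    samples = timeit.repeat(setup=setup, stmt=stmt, repeat=repeat, number=number)
    # The minimum sample is the least noise-affected estimate of the runtime.
    return min(samples)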


# %% STATIC VARIABLES
# -------------------
DIR = (
    "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/HadGEM3-GC31-MM/historical/r2i1p1f3"
    "/day/ta/gn/v20191218"
)
VAR = "ta"
# %% xCDAT
# ----------
def get_xcdat_runtimes(
    dir_path: str,
    var: str,
    type: Literal["serial", "parallel"],
) -> Dict[str, float]:
"""Get the cdms2 runtimes for spatial and temporal averaging.
Parameters
----------
dir_path : str
The path to the directory containing `.nc` datasets.
var : str
The variable to operate on.
type : Literal["serial", "parallel"]
Whether to run the API serially or in parallel.
Returns
-------
Dict[str, float]
A dictionary mapping the API to the runtime.
"""
    if type == "serial":
        chunks = None
        use_flox = "with xr.set_options(use_flox=False): \n "
    elif type == "parallel":
        chunks = "auto"
        use_flox = "with xr.set_options(use_flox=True): \n "

    setup = (
        "import xcdat as xc\n"
        "import xarray as xr\n"
        f"ds = xc.open_mfdataset('{dir_path}/*.nc', chunks={chunks!r})\n"
    )
    api_calls = {
        "spatial_avg": f"ds.spatial.average('{var}')",
        "temporal_avg": f"ds.temporal.average('{var}', weighted=True)",
        "climatology": f"ds.temporal.climatology('{var}', freq='month', weighted=True)",
        # "departures": f"ds.temporal.departures('{var}', freq='month', weighted=True)",
    }

# xCDAT API Calls for `get_runtime`
XC_SPATIAL_AVG_STMT = f"ds.spatial.average('{VAR}')"
XC_TEMP_AVG_STMT = f"ds.temporal.average('{VAR}', weighted=True)"
XC_CLIMATOLOGY_STMT = f"ds.temporal.climatology('{VAR}', freq='month', weighted=True)"
XC_DEPARTURES_STMT = f"ds.temporal.departures('{VAR}', freq='month', weighted=True)"
    runtimes = {}
    for api, stmt in api_calls.items():
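        # Prepend the `xr.set_options` context manager; the statement is
        # indented (via `use_flox`) so it executes inside the `with` block.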
        stmt = use_flox + stmt
        runtimes[api] = get_runtime(setup, stmt, repeat=1)

    return runtimes

# %% xCDAT SERIAL
# -------------------
# Dataset object stored here for inspection. It is not actually used in the
# performance benchmarking.
# ds_serial = xc.open_mfdataset(f"{DIR}/*.nc", chunks=None)

XC_SERIAL_SETUP = (
    "import xcdat as xc\n" f"ds = xc.open_mfdataset(f'{DIR}/*.nc', chunks=None)\n"
)
serial_results = {
    "spatial_avg": get_runtime(XC_SERIAL_SETUP, XC_SPATIAL_AVG_STMT, repeat=5),
    "temporal_avg": get_runtime(XC_SERIAL_SETUP, XC_TEMP_AVG_STMT, repeat=5),
    "climatology": get_runtime(XC_SERIAL_SETUP, XC_CLIMATOLOGY_STMT, repeat=5),
    "departures": get_runtime(XC_SERIAL_SETUP, XC_DEPARTURES_STMT, repeat=5),
# %%
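# Local paths to the multi-file datasets under test, keyed by total size.
# Only the 7 GB dataset is active here; the larger runs are commented out.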
FILEPATHS = {
"7 GB": "/p/css03/esgf_publish/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/tas/gn/v20190308/",
# "17 GB": "/p/css03/cmip5_css01/data/cmip5/output1/CNRM-CERFACS/CNRM-CM5/historical/day/atmos/day/r1i1p1/v20120530/ta/",
# "12 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MRI/MRI-ESM2-0/amip/r1i1p1f1/3hr/tas/gn/v20190829/",
# "22 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/UKESM1-0-LL/historical/r5i1p1f3/day/ta/gn/v20191115/",
# "50 GB": "/p/css03/esgf_publish/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/ta/gn/v20190308/",
# "74 GB": "/p/css03/esgf_publish/CMIP6/CMIP/CCCma/CanESM5/historical/r1i1p2f1/CFday/ta/gn/v20190429/",
# "105 GB": "/p/css03/esgf_publish/CMIP6/CMIP/MOHC/HadGEM3-GC31-MM/historical/r2i1p1f3/day/ta/gn/v20191218",
}

# %% xCDAT Parallel
# ---------------------
# 245 GB -> 125MB chunk sizes
# Dataset object stored here for inspection. It is not actually used in the
# performance benchmarking.
# ds_parallel = xc.open_mfdataset(f"{DIR}/*.nc", chunks="auto")
XC_PARALLEL_SETUP = (
    "import xcdat as xc\n" f"ds = xc.open_mfdataset(f'{DIR}/*.nc', chunks='auto')\n"
# %%
xcdat_serial_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict(
    dict
)
xcdat_parallel_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict(
    dict
)
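# Each maps file size (e.g., "7 GB") -> {API name: runtime in seconds}.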
parallel_results = {
    "spatial_avg": get_runtime(XC_PARALLEL_SETUP, XC_SPATIAL_AVG_STMT, repeat=5),
    "temporal_avg": get_runtime(XC_PARALLEL_SETUP, XC_TEMP_AVG_STMT, repeat=5),
    "climatology": get_runtime(XC_PARALLEL_SETUP, XC_CLIMATOLOGY_STMT, repeat=5),
    "departures": get_runtime(XC_PARALLEL_SETUP, XC_DEPARTURES_STMT, repeat=5),
}

for filesize, path in FILEPATHS.items():
    xcdat_serial_runtimes[filesize] = get_xcdat_runtimes(path, "tas", "serial")
    # xcdat_parallel_runtimes[filesize] = get_xcdat_runtimes(path, "tas", "parallel")


# %% CDMS2 (serial)
# ---------------------
C_SPATIAL_AVG_STMT = f"cdutil.averager(t_var, axis='xy')"
C_TEMP_AVG_STMT = f"cdutil.averager(t_var, axis='t')"
C_CLIMATOLOGY_STMT = f"cdutil.ANNUALCYCLE.climatology(t_var)"
C_DEPARTURES_STMT = f"cdutil.ANNUALCYCLE.departures(t_var)"
def get_cdms2_runtimes(
    cdml_filepath: str, var: str, repeat: int = 1
) -> Dict[str, float]:
"""Get the cdms2 runtimes for spatial and temporal averaging.
Parameters
----------
xml_path : str
The path to the CDML file that maps to a multi-file dataset.
var : str
The variable to operate on.
repeat : int
Number of samples to take for each API call, by default 1.
Returns
-------
Dict[str, float]
A dictionary mapping the API to the runtime.
"""
    setup = (
        "import cdms2\n"
        "import cdutil\n"
        f"ds = cdms2.open('{cdml_filepath}')\n"
        f"t_var = ds['{var}']"
    )
    api_calls = {
        "spatial_avg": "cdutil.averager(t_var, axis='xy')",
        "temporal_avg": "cdutil.averager(t_var, axis='t')",
        "climatology": "cdutil.ANNUALCYCLE.climatology(t_var)",
        # "departures": "cdutil.ANNUALCYCLE.departures(t_var)",
    }

    runtimes = {}
    for api, stmt in api_calls.items():
        runtimes[api] = get_runtime(setup, stmt, repeat=repeat)

    return runtimes


# %%
# CDML files for data on the climate machines are stored under /p/user_pub/xclim.
# You can also generate them from the command line: cdscan -x myxml.xml /full/path/to/files/*.nc
# Directory structure: /p/user_pub/xclim/$MIP_ERA/$ACTIVITY/$EXPERIMENT/$REALM/$FREQUENCY/$VARIABLE/
# Filename format: MIP_ERA.ACTIVITY.EXPERIMENT.INSTITUTION.MODEL.MEMBER.FREQUENCY.VARIABLE.REALM.GRID.VERSION.FLAGS.LATEST.xml
XML_FILEPATHS = {
"7 GB": "/home/vo13/xCDAT/xcdat/input/485-xml/CMIP6.CMIP.historical.NCAR.CESM2.r1i1p1f1.day.tas.gn.v20190308.0000000.0.xml",
# "105 GB": "/p/user_pub/xclim/CMIP6/CMIP/historical/atmos/day/ta/"
# "CMIP6.CMIP.historical.MOHC.HadGEM3-GC31-MM.r2i1p1f3.day.ta.atmos.glb-p8-gn.v20191218.0000000.0.xml",
}


# %%
cdms2_serial_runtimes: DefaultDict[str, Dict[str, float]] = collections.defaultdict(
    dict
)

for filesize, path in XML_FILEPATHS.items():
    cdms2_serial_runtimes[filesize] = get_cdms2_runtimes(path, "tas")

ds_cdms = cdms2.open(DIR)
t_var = ds_cdms(VAR)

# %%
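# Hypothetical follow-up (not part of this commit): print the collected
# serial runtimes side by side for a quick xCDAT vs. cdms2 comparison.
for size, xc_times in xcdat_serial_runtimes.items():
    print(f"{size}:")
    for api, xc_time in xc_times.items():
        c_time = cdms2_serial_runtimes.get(size, {}).get(api, float("nan"))
        print(f"  {api}: xcdat={xc_time:.2f}s, cdms2={c_time:.2f}s")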
