admin/4.0-release-prep-and-benchmark-upgrades (#244)
* Ignore presentations dir from language

* Try double star for ignore lang

* Add benchmark for chunk size variability

* Fix default and lif bench, add chunk compare

* Benchmark aicsimageio against other libs

* Use variance cfe instead of pipeline 4

* Configure better lib compare bench

* Remove extra deps from benchmark deps

* Cleanup lib compare

* Reduce the amount of files checked during benchs

* Fix benchmark params on TIFF like

* Fix comment in random sample

* Fix typo
Jackson Maxfield Brown authored May 30, 2021
1 parent 738ce92 commit 8c7d4f5
Showing 6 changed files with 175 additions and 18 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
presentations/** linguist-documentation
2 changes: 1 addition & 1 deletion asv.conf.json
@@ -7,7 +7,7 @@
"dvcs": "git",
"environment_type": "virtualenv",
"install_command": [
"in-dir={env_dir} python -mpip install {build_dir}[dev]"
"in-dir={env_dir} python -mpip install {build_dir}[benchmark]"
],
"show_commit_url": "http://github.com/AllenCellModeling/aicsimageio/commit/",
"pythons": ["3.9"],
93 changes: 93 additions & 0 deletions benchmarks/benchmark_chunk_sizes.py
@@ -0,0 +1,93 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import dask.array as da
import random
from pathlib import Path

from aicsimageio import AICSImage

from .benchmark_image_containers import _ImageContainerTimeSuite

###############################################################################

# We only benchmark against local files as remote files are covered by unit tests
# and are generally slower than local but scale at a similar rate.
LOCAL_RESOURCES_DIR = (
Path(__file__).parent.parent / "aicsimageio" / "tests" / "resources"
)

###############################################################################


class ChunkSuite(_ImageContainerTimeSuite):
# This suite measures the effect that changing the default chunk dims
# has on the duration of various reads.
# We would expect that processing speed can be optimized based off of the
# dimensions of the file and what the user is trying to do with said file.
# i.e. If the user wants to normalize each channel and make a max projection
# through Z, then the default of 'ZYX' is preferred over just 'YX'.
# During this suite we not only benchmark the above example but also
# file reading under the various chunk configurations as a monitor
# for general read performance.

params = (
[
str(LOCAL_RESOURCES_DIR / "pre-variance-cfe.ome.tiff"),
str(LOCAL_RESOURCES_DIR / "variance-cfe.ome.tiff"),
],
# We don't go above chunking by three dims because it would be rare
# to do so... if you can read four-plus dims in a single chunk why can't you
# just read in the whole image at once.
# We also use CYX here to show that chunking with the _wrong_ dimensions can
# result in longer processing times.
[
"YX",
"ZYX",
"CYX",
],
)

def time_norm_and_project(self, img_path, chunk_dims):
"""
Benchmark how long a norm and project through Z takes
under various chunk dims configurations.
"""
# Init image container
r = self.ImageContainer(img_path, chunk_dims=chunk_dims)

# Store all delayed projections
projs = []

# Only run a random sample of two channels instead of all
selected_channels = random.sample(r.channel_names, 2)
for i, channel_name in enumerate(r.channel_names):
if channel_name in selected_channels:
# Select each channel
data = r.get_image_dask_data("ZYX", C=i)

# Get percentile norm by values
min_px_val, max_px_val = da.percentile(
data.flatten(),
[50.0, 99.8],
).compute()

# Norm
normed = (data - min_px_val) / (max_px_val - min_px_val)

# Clip any values outside of 0 and 1
clipped = da.clip(normed, 0, 1)

# Scale them between 0 and 255
scaled = clipped * 255

# Create max project
projs.append(scaled.max(axis=0))

# Compute all projections
projs = da.stack(projs)
projs.compute()

def setup(self, img_path, chunk_dims):
random.seed(42)
self.ImageContainer = AICSImage
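
Editor's sketch (not part of the diff): the effect this suite measures shows up directly in the chunking of the delayed array. A minimal illustration, assuming aicsimageio 4.x and that the variance-cfe test resource exists locally; exact chunk shapes depend on the file's dimensions.

# Minimal sketch: how chunk_dims changes the delayed dask array's chunking.
# Assumes the repository's test resources are present on disk.
from pathlib import Path

from aicsimageio import AICSImage

resources = Path("aicsimageio") / "tests" / "resources"
img_path = str(resources / "variance-cfe.ome.tiff")

# Chunk by full Z-stacks: each dask chunk covers roughly one channel's ZYX volume,
# which suits the per-channel normalize-and-project workload timed above.
img_zyx = AICSImage(img_path, chunk_dims="ZYX")
print(img_zyx.dask_data.chunksize)

# Chunk by single planes: each chunk is roughly one YX plane, so the same
# workload issues many more, smaller reads.
img_yx = AICSImage(img_path, chunk_dims="YX")
print(img_yx.dask_data.chunksize)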
52 changes: 37 additions & 15 deletions benchmarks/benchmark_image_containers.py
Expand Up @@ -5,7 +5,7 @@
from pathlib import Path

from aicsimageio import AICSImage, readers
from aicsimageio.dimensions import DimensionNames
from aicsimageio.dimensions import DEFAULT_CHUNK_DIMS, DimensionNames

###############################################################################

@@ -61,25 +61,34 @@ class _ImageContainerTimeSuite:
DimensionNames.Samples,
]

def time_init(self, img_path):
def time_init(self, img_path, chunk_dims=None):
"""
Benchmark how long it takes to validate a file and finish general setup.
"""
self.ImageContainer(img_path)
if chunk_dims is None:
chunk_dims = DEFAULT_CHUNK_DIMS

def time_delayed_array_construct(self, img_path):
self.ImageContainer(img_path, chunk_dims=chunk_dims)

def time_delayed_array_construct(self, img_path, chunk_dims=None):
"""
Benchmark how long it takes to construct the delayed dask array for a file.
"""
self.ImageContainer(img_path).dask_data
if chunk_dims is None:
chunk_dims = DEFAULT_CHUNK_DIMS

self.ImageContainer(img_path, chunk_dims=chunk_dims).dask_data

def time_random_single_chunk_read(self, img_path):
def time_random_single_chunk_read(self, img_path, chunk_dims=None):
"""
Benchmark how long it takes to read a single chunk out of a file.
I.E. "Pull just the Brightfield channel z-stack.
"""
r = self.ImageContainer(img_path)
if chunk_dims is None:
chunk_dims = DEFAULT_CHUNK_DIMS

r = self.ImageContainer(img_path, chunk_dims=chunk_dims)

random_index_selections = {}
for dim, size in zip(r.dims.order, r.dims.shape):
@@ -91,13 +100,16 @@ def time_random_single_chunk_read(self, img_path):
)
r.get_image_dask_data(valid_dims_to_return, **random_index_selections).compute()

def time_random_many_chunk_read(self, img_path):
def time_random_many_chunk_read(self, img_path, chunk_dims=None):
"""
Open a file, get many chunks out of the file at once.
I.E. "Pull the DNA and Nucleus channel z-stacks, for the middle 50% timepoints".
"""
r = self.ImageContainer(img_path)
if chunk_dims is None:
chunk_dims = DEFAULT_CHUNK_DIMS

r = self.ImageContainer(img_path, chunk_dims=chunk_dims)

random_index_selections = {}
for dim, size in zip(r.dims.order, r.dims.shape):
@@ -133,27 +145,37 @@ class DefaultReaderSuite(_ImageContainerTimeSuite, _ImageContainerMemorySuite):

def setup(self, img_path):
random.seed(42)
self.ImageContainer = readers.DefaultReader
self.ImageContainer = readers.default_reader.DefaultReader


class TiffReaderSuite(_ImageContainerTimeSuite, _ImageContainerMemorySuite):
params = [
sorted([str(f) for f in LOCAL_RESOURCES_DIR.glob("*.tiff")]),
[
str(
LOCAL_RESOURCES_DIR
/ "image_stack_tpzc_50tp_2p_5z_3c_512k_1_MMStack_2-Pos001_000.ome.tif"
),
str(LOCAL_RESOURCES_DIR / "variance-cfe.ome.tiff"),
]
]

def setup(self, img_path):
random.seed(42)
self.ImageContainer = readers.TiffReader
self.ImageContainer = readers.tiff_reader.TiffReader


class OmeTiffReaderSuite(_ImageContainerTimeSuite, _ImageContainerMemorySuite):
params = [
sorted([str(f) for f in LOCAL_RESOURCES_DIR.glob("*.ome.tiff")]),
[
str(LOCAL_RESOURCES_DIR / "actk.ome.tiff"),
str(LOCAL_RESOURCES_DIR / "pre-variance-cfe.ome.tiff"),
str(LOCAL_RESOURCES_DIR / "variance-cfe.ome.tiff"),
]
]

def setup(self, img_path):
random.seed(42)
self.ImageContainer = readers.OmeTiffReader
self.ImageContainer = readers.ome_tiff_reader.OmeTiffReader


class LifReaderSuite(_ImageContainerTimeSuite, _ImageContainerMemorySuite):
@@ -163,7 +185,7 @@ class LifReaderSuite(_ImageContainerTimeSuite, _ImageContainerMemorySuite):

def setup(self, img_path):
random.seed(42)
self.ImageContainer = readers.LifReader
self.ImageContainer = readers.lif_reader.LifReader


class AICSImageSuite(_ImageContainerTimeSuite, _ImageContainerMemorySuite):
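
Editor's note (illustrative, not part of the diff): asv invokes setup and every time_* method once per combination of entries in params, so making chunk_dims an optional keyword lets the same time_* methods serve both the one-parameter reader suites above and the two-parameter ChunkSuite. A rough sketch of that driving loop; the import paths and the presence of the local test resources are assumptions.

# Rough driver loop mimicking what asv does with these parameterized suites.
import itertools

from benchmarks.benchmark_chunk_sizes import ChunkSuite
from benchmarks.benchmark_image_containers import OmeTiffReaderSuite

# Two parameter axes: file path and chunk_dims.
chunk_suite = ChunkSuite()
for img_path, chunk_dims in itertools.product(*ChunkSuite.params):
    chunk_suite.setup(img_path, chunk_dims)
    chunk_suite.time_random_single_chunk_read(img_path, chunk_dims)

# One parameter axis: file path only; chunk_dims falls back to DEFAULT_CHUNK_DIMS.
reader_suite = OmeTiffReaderSuite()
for (img_path,) in itertools.product(*OmeTiffReaderSuite.params):
    reader_suite.setup(img_path)
    reader_suite.time_random_single_chunk_read(img_path)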
39 changes: 37 additions & 2 deletions benchmarks/benchmark_lib.py
@@ -2,13 +2,48 @@
# -*- coding: utf-8 -*-

"""
Benchmarks for general library operations.
Benchmarks for general library operations and comparisons against other libraries.
"""

from functools import partial

class LibSuite:
from aicsimageio import imread_dask as aicsimageio_imread
from dask_image.imread import imread as dask_image_imread

from .benchmark_image_containers import LOCAL_RESOURCES_DIR

###############################################################################

ACTK_OME_TIFF = str(LOCAL_RESOURCES_DIR / "actk.ome.tiff")

###############################################################################


class LibInitSuite:
def time_base_import(self):
"""
Benchmark how long it takes to import the library as a whole.
"""
import aicsimageio # noqa: F401


class LibCompareSuite:
"""
Compare aicsimageio against other "just-in-time" image reading libs.
"""

FUNC_LOOKUP = {
"aicsimageio-default-chunks": partial(aicsimageio_imread, chunk_dims="ZYX"),
"aicsimageio-plane-chunks": partial(aicsimageio_imread, chunk_dims="YX"),
"dask-image-imread-default": dask_image_imread,
}

params = [
"aicsimageio-default-chunks",
"aicsimageio-plane-chunks",
"dask-image-imread-default",
]

def time_lib_config(self, func_name):
func = self.FUNC_LOOKUP[func_name]
func(ACTK_OME_TIFF).compute()
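
Editor's sketch (not part of the diff): the comparison timed above can be reproduced directly. The calls below mirror the FUNC_LOOKUP entries, assuming aicsimageio, dask-image, and the actk.ome.tiff test resource are available locally.

# Direct version of the library comparison: build each delayed read, then realize it.
from aicsimageio import imread_dask
from dask_image.imread import imread as dask_image_imread

path = "aicsimageio/tests/resources/actk.ome.tiff"  # local test resource (assumed present)

# aicsimageio, chunked by Z-stacks (labeled "default-chunks" in the suite above).
aics_default = imread_dask(path, chunk_dims="ZYX").compute()

# aicsimageio, chunked by single YX planes.
aics_planes = imread_dask(path, chunk_dims="YX").compute()

# dask-image, with its own default per-page chunking.
dask_image_arr = dask_image_imread(path).compute()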
6 changes: 6 additions & 0 deletions setup.py
@@ -53,6 +53,11 @@
"wheel>=0.34.2",
]

benchmark_requirements = [
*dev_requirements,
"dask-image~=0.6.0",
]

requirements = [
"dask[array]>=2021.4.1",
"fsspec>=2021.4.0",
@@ -70,6 +75,7 @@
"setup": setup_requirements,
"test": test_requirements,
"dev": dev_requirements,
"benchmark": benchmark_requirements,
**format_libs,
"all": all_formats,
}