From c9a8cfd68d23061963acde121815663e28b89f46 Mon Sep 17 00:00:00 2001 From: Miles Price Date: Thu, 25 Apr 2024 21:08:44 -0700 Subject: [PATCH] feat: adding updates for CVCUDA Release v0.7.0 Beta 5 --- .gitattributes | 1 + .github/ISSUE_TEMPLATE/bug_report.md | 16 + .github/ISSUE_TEMPLATE/feature_request.md | 16 + .github/ISSUE_TEMPLATE/submit-question.md | 16 + .gitignore | 1 + .pre-commit-config.yaml | 6 +- CMakeLists.txt | 2 +- CONTRIBUTING.md | 4 +- DEVELOPER_GUIDE.md | 21 +- README.md | 309 ++-- bench/BenchAdaptiveThreshold.cpp | 2 +- bench/BenchAverageBlur.cpp | 2 +- bench/BenchBilateralFilter.cpp | 2 +- bench/BenchBrightnessContrast.cpp | 2 +- bench/BenchColorTwist.cpp | 2 +- bench/BenchComposite.cpp | 2 +- bench/BenchCopyMakeBorder.cpp | 2 +- bench/BenchCvtColor.cpp | 2 +- bench/BenchErase.cpp | 2 +- bench/BenchFindContours.cpp | 126 -- bench/BenchFlip.cpp | 2 +- bench/BenchGaussian.cpp | 2 +- bench/BenchGaussianNoise.cpp | 2 +- bench/BenchHQResize.cpp | 2 +- bench/BenchHistogramEq.cpp | 2 +- bench/BenchInpaint.cpp | 2 +- bench/BenchJointBilateralFilter.cpp | 2 +- bench/BenchLabel.cpp | 23 +- bench/BenchLaplacian.cpp | 2 +- bench/BenchMedianBlur.cpp | 2 +- bench/BenchMinMaxLoc.cpp | 2 +- bench/BenchMorphology.cpp | 2 +- bench/BenchNormalize.cpp | 2 +- bench/BenchPillowResize.cpp | 2 +- bench/BenchRandomResizedCrop.cpp | 2 +- bench/BenchRemap.cpp | 2 +- bench/BenchResize.cpp | 2 +- bench/BenchRotate.cpp | 2 +- bench/BenchThreshold.cpp | 2 +- bench/BenchWarpAffine.cpp | 2 +- bench/BenchWarpPerspective.cpp | 2 +- bench/CMakeLists.txt | 1 - bench/python/all_ops/op_copymakeborder.py | 2 +- bench/python/all_ops/op_findcontours.py | 109 -- bench/python/all_ops/op_flip.py | 20 +- bench/python/assets/brooklyn_bboxes.pt | Bin 196520 -> 131 bytes bench/python/assets/brooklyn_nms_masks.pt | Bin 196620 -> 131 bytes bench/python/assets/brooklyn_scores.pt | Bin 98600 -> 130 bytes bench/python/run_bench.py | 19 +- ci/check_formatting.sh | 42 + cmake/ConfigCUDA.cmake | 7 +- 
docker/config | 2 +- docs/sphinx/conf.py | 4 +- docs/sphinx/content/cvcuda_oplist.csv | 9 +- docs/sphinx/index.rst | 13 +- docs/sphinx/installation.rst | 11 +- docs/sphinx/relnotes/v0.7.0-beta.rst | 69 + python/CMakeLists.txt | 2 +- python/mod_cvcuda/CMakeLists.txt | 1 - python/mod_cvcuda/Main.cpp | 1 - python/mod_cvcuda/OpFindContours.cpp | 134 -- python/mod_cvcuda/OpLabel.cpp | 56 +- python/mod_cvcuda/OpResize.cpp | 4 +- python/mod_cvcuda/Operators.hpp | 1 - python/mod_nvcv/CAPI.cpp | 10 +- python/mod_nvcv/Resource.cpp | 77 +- python/mod_nvcv/Resource.hpp | 65 +- python/mod_nvcv/Stream.cpp | 113 +- python/mod_nvcv/Stream.hpp | 23 +- python/mod_nvcv/include/nvcv/python/CAPI.hpp | 3 +- .../include/nvcv/python/ResourceGuard.hpp | 31 +- samples/CMakeLists.txt | 2 +- samples/classification/python/main.py | 8 +- samples/common/python/nvcodec_utils.py | 11 +- samples/label/python/label.py | 215 +++ samples/label/python/main.py | 8 +- samples/object_detection/python/main.py | 12 +- samples/scripts/benchmark.py | 2 +- samples/scripts/run_samples.sh | 83 +- samples/segmentation/python/main.py | 11 +- samples/segmentation/python/triton_client.py | 10 +- src/cvcuda/CMakeLists.txt | 3 +- src/cvcuda/OpFindContours.cpp | 56 - src/cvcuda/OpLabel.cpp | 9 +- src/cvcuda/include/cvcuda/OpFindContours.h | 124 -- src/cvcuda/include/cvcuda/OpFindContours.hpp | 86 -- src/cvcuda/include/cvcuda/OpFindHomography.h | 8 +- src/cvcuda/include/cvcuda/OpLabel.h | 49 +- src/cvcuda/include/cvcuda/OpLabel.hpp | 9 +- src/cvcuda/include/cvcuda/OpSIFT.h | 3 +- src/cvcuda/include/cvcuda/Types.h | 6 + src/cvcuda/priv/CMakeLists.txt | 1 - src/cvcuda/priv/OpBrightnessContrast.cu | 5 +- src/cvcuda/priv/OpFindContours.cpp | 69 - src/cvcuda/priv/OpFindContours.hpp | 55 - src/cvcuda/priv/OpFindHomography.cu | 148 +- src/cvcuda/priv/OpLabel.cu | 347 ++++- src/cvcuda/priv/OpLabel.hpp | 4 +- src/cvcuda/priv/OpRemap.cu | 3 +- src/cvcuda/priv/legacy/CMakeLists.txt | 1 - src/cvcuda/priv/legacy/CvCudaLegacy.h | 128 
-- src/cvcuda/priv/legacy/adaptive_threshold.cu | 5 - .../legacy/adaptive_threshold_var_shape.cu | 6 - src/cvcuda/priv/legacy/filter.cu | 12 - src/cvcuda/priv/legacy/filter_var_shape.cu | 10 - src/cvcuda/priv/legacy/find_contours.cu | 1238 ----------------- src/cvcuda/priv/legacy/min_area_rect.cu | 5 - src/cvcuda/priv/legacy/osd.cu | 2 +- src/cvcuda/priv/legacy/random_resized_crop.cu | 98 -- .../legacy/random_resized_crop_var_shape.cu | 98 -- src/cvcuda/priv/legacy/resize.cu | 336 +---- src/cvcuda/priv/legacy/resize_var_shape.cu | 454 +----- .../include/nvcv/cuda/ArrayWrap.hpp | 7 +- src/nvcv_types/include/nvcv/cuda/Atomics.hpp | 6 +- .../include/nvcv/cuda/BorderWrap.hpp | 19 +- src/nvcv_types/include/nvcv/cuda/DropCast.hpp | 8 +- .../include/nvcv/cuda/RangeCast.hpp | 8 +- .../include/nvcv/cuda/SaturateCast.hpp | 8 +- .../include/nvcv/cuda/StaticCast.hpp | 8 +- .../include/nvcv/cuda/TensorWrap.hpp | 26 +- .../include/nvcv/cuda/math/LinAlg.hpp | 576 ++++++-- tests/cvcuda/python/cvcuda_test_python.in | 29 +- tests/cvcuda/python/test_multi_stream.py | 163 +++ tests/cvcuda/python/test_opfindcontours.py | 53 - tests/cvcuda/python/test_opfindhomography.py | 7 +- tests/cvcuda/python/test_oplabel.py | 41 +- tests/cvcuda/system/CMakeLists.txt | 1 - tests/cvcuda/system/TestOpFindContours.cpp | 174 --- tests/cvcuda/system/TestOpLabel.cpp | 109 +- tests/cvcuda/system/TestOpOSD.cpp | 2 +- .../cudatools_system/TestLinAlg.cpp | 539 +++++-- .../cudatools_system/TestTensorWrap.cpp | 6 +- .../python/nvcv_test_types_python.in | 29 +- tests/nvcv_types/system/TestArray.cpp | 108 +- tests/nvcv_types/system/TestColorSpec.cpp | 2 +- tests/nvcv_types/system/TestImage.cpp | 51 + tests/nvcv_types/system/TestImageBatch.cpp | 30 + tests/nvcv_types/system/TestTensor.cpp | 50 + tests/nvcv_types/system/TestTensorBatch.cpp | 28 + tests/nvcv_types/unit/TestCheckError.cpp | 9 + 140 files changed, 2940 insertions(+), 4325 deletions(-) delete mode 100644 bench/BenchFindContours.cpp delete mode 100644 
bench/python/all_ops/op_findcontours.py create mode 100755 ci/check_formatting.sh create mode 100644 docs/sphinx/relnotes/v0.7.0-beta.rst delete mode 100644 python/mod_cvcuda/OpFindContours.cpp create mode 100644 samples/label/python/label.py delete mode 100644 src/cvcuda/OpFindContours.cpp delete mode 100644 src/cvcuda/include/cvcuda/OpFindContours.h delete mode 100644 src/cvcuda/include/cvcuda/OpFindContours.hpp delete mode 100644 src/cvcuda/priv/OpFindContours.cpp delete mode 100644 src/cvcuda/priv/OpFindContours.hpp delete mode 100644 src/cvcuda/priv/legacy/find_contours.cu create mode 100644 tests/cvcuda/python/test_multi_stream.py delete mode 100644 tests/cvcuda/python/test_opfindcontours.py delete mode 100644 tests/cvcuda/system/TestOpFindContours.cpp diff --git a/.gitattributes b/.gitattributes index 1255c68c..89205017 100644 --- a/.gitattributes +++ b/.gitattributes @@ -21,3 +21,4 @@ *.mp4 filter=lfs diff=lfs merge=lfs -text *.a filter=lfs diff=lfs merge=lfs -text *.hdf5 filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 540810c2..359a414f 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,6 +7,22 @@ assignees: '' --- +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." 
+[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + + + **Describe the bug** A clear and concise description of the bug. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index d4b540a7..ffd08012 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -7,6 +7,22 @@ assignees: '' --- +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + + + **Is your feature request related to a problem? Please describe.** A clear and concise description of the problem. Ex. I wish I could use CV-CUDA to do [...] diff --git a/.github/ISSUE_TEMPLATE/submit-question.md b/.github/ISSUE_TEMPLATE/submit-question.md index 72b2b74c..6900ea6b 100644 --- a/.github/ISSUE_TEMPLATE/submit-question.md +++ b/.github/ISSUE_TEMPLATE/submit-question.md @@ -7,4 +7,20 @@ assignees: '' --- +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved." 
+[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + + + **What is your question?** diff --git a/.gitignore b/.gitignore index 718aa64f..4b0a6a14 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ /build/ /build-*/ /install/ +/cvcuda-installer*/ # Visual Studio Code # ------------------ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f0db2240..f5197e2e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,8 +72,8 @@ repos: - id: copyright_check name: 'check copyright message' language: system - types: ['file', 'text'] - exclude_types: ['markdown', 'xml', 'json', 'csv'] + types: ['file', 'text', 'markdown'] + exclude_types: ['xml', 'json', 'csv'] entry: ./lint/copyright_check.sh exclude: 'models/.*' - id: lfs_check @@ -83,7 +83,7 @@ repos: require_serial: true - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook - rev: v9.0.0 + rev: v9.13.0 hooks: - id: commitlint stages: [commit-msg] diff --git a/CMakeLists.txt b/CMakeLists.txt index fccd9c7e..0f98aede 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.6.0 + VERSION 0.7.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d21011b9..37852a87 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,7 
+16,7 @@ # Contributing to CV-CUDA -**As of release v0.6.0-beta, CV-CUDA is not accepting outside contribution.** +**As of release v0.7.0-beta, CV-CUDA is not accepting outside contribution.** Contributions to CV-CUDA fall into the following categories: @@ -28,7 +28,7 @@ Contributions to CV-CUDA fall into the following categories: 1. To propose a new feature, please file a new feature request [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). Describe the intended feature and discuss the design and implementation with the team and - community. NOTE: Currently, as of release v0.6.0-beta, CV-CUDA is not accepting + community. NOTE: Currently, as of release v0.7.0-beta, CV-CUDA is not accepting outside contribution. 1. To ask a general question, please submit a question [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). If you need diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index a5f4bec5..83e42f22 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -30,7 +30,7 @@ CV-CUDA includes: - C, C++, and Python APIs - Batching support, with variable shape images - Zero-copy interfaces to PyTorch -- Sample applications: object classification and image segmentation +- Sample applications ## What Pre- and Post-Processing Operators Are Included? @@ -40,7 +40,7 @@ CV-CUDA includes: | Advanced Color Format Conversions | Performs color conversion from interleaved RGB/BGR <-> YUV/YVU and semi planar. Supported standards: BT.601. BT.709. 
BT.2020 | | AverageBlur | Reduces image noise using an average filter | | BilateralFilter | Reduces image noise while preserving strong edges | -| Bounding Box | Draws a rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image | +| Bounding Box | Draws a rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image | | Box Blurring | Overlays a blurred rectangle using the X-Y coordinates and dimensions that define the location and size of an object in an image | | Brightness_Contrast | Adjusts brightness and contrast of an image | | CenterCrop | Crops an image at its center | @@ -53,8 +53,6 @@ CV-CUDA includes: | CvtColor | Converts an image from one color space to another | | DataTypeConvert | Converts an image’s data type, with optional scaling | | Erase | Erases image regions | -| Find Contours | Extract closed contours from an input binary image | -| FindHomography | Calculates a perspective transform from four pairs of the corresponding points | | Flip | Flips a 2D image around its axis | | GammaContrast | Adjusts image contrast | | Gaussian | Applies a gaussian blur filter to the image | @@ -70,9 +68,9 @@ CV-CUDA includes: | MinArea Rect | Finds the minimum area rotated rectangle typically used to draw bounding rectangle with minimum area | | MinMaxLoc | Finds the maximum and minimum values in a given array | | Morphology | Performs morphological erode and dilate transformations | -| Morphology (close) | Performs a morphological operation that involves dilation followed by erosion on an image | -| Morphology (open) | Performs a morphological operation that involves erosion followed by dilation on an image | -| Non-max Suppression | Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection | +| Morphology (close) | Performs morphological 
operation that involves dilation followed by erosion on an image | +| Morphology (open) | Performs morphological operation that involves erosion followed by dilation on an image | +| Non-Maximum Suppression | Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection | | Normalize | Normalizes an image pixel’s range | | OSD (Polyline Line Text Rotated Rect Segmented Mask) | Displays an overlay on the image of different forms including polyline line text rotated rectangle segmented mask | | PadStack | Stacks several images into a tensor with border extension | @@ -83,20 +81,19 @@ CV-CUDA includes: | Remap | Maps pixels in an image with one projection to another projection in a new image. | | Resize | Changes the size and scale of an image | | Rotate | Rotates a 2D array in multiples of 90 degrees | -| SIFT | Identifies and matches features in images that are invariant to scale rotation and affine distortion. | -| Stack | Concatenates two input tensors into a single output tensor | +| SIFT | Identifies and describes features in images that are invariant to scale rotation and affine distortion. | | Thresholding | Chooses a global threshold value that is the same for all pixels across the image. | | WarpAffine | Applies an affine transformation to an image | | WarpPerspective | Applies a perspective transformation to an image | ## Where Are the Release Notes? -An awesome product requires excellent support. CV-CUDA release notes can be +CV-CUDA release notes can be found [here](https://github.com/CVCUDA/CV-CUDA/releases) ## Where Can I Get Help? -File requests for enhancements and bug reports +An awesome product requires excellent support. File requests for enhancements and bug reports [here](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). We are providing limited, direct, support to select enterprises using CV-CUDA. @@ -208,5 +205,5 @@ companies with which they are associated. 
Copyright -© 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +© 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. diff --git a/README.md b/README.md index 31ae4466..4eaf478d 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,10 @@ # CV-CUDA + [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.6.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.7.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) @@ -33,7 +34,7 @@ efficient pre- and post-processing pipelines. CV-CUDA originated as a collaborative effort between [NVIDIA][NVIDIA Develop] and [ByteDance][ByteDance]. Refer to our [Developer Guide](DEVELOPER_GUIDE.md) for more information on the -operators available as of release v0.6.0-beta. +operators available. ## Getting Started @@ -43,10 +44,10 @@ To get a local copy up and running follow these steps. |CV-CUDA Build|Platform|CUDA Version|CUDA Compute Capability|Hardware Architectures|Nvidia Driver|Python Versions|Supported Compilers (build from source)|API compatibility with prebuilt binaries|OS/Linux distributions tested with prebuilt packages| |-|-|-|-|-|-|-|-|-|-| -|x86_64_cu11|x86_64|11.7 or later|SM7 and later|Volta, Turing, Amper, Hopper, Ada Lovelace|r520 or later*** |3.8, 3.9, 3.10, 3.11|gcc>=9*
gcc>=11**|gcc>=9|Ubuntu>= 20.04
WSL2/Ubuntu>=20.04| -|x86_64_cu12|x86_64|12.2 or later|SM7 and later|Volta, Turing, Amper, Hopper, Ada Lovelace|r520 or later***|3.8, 3.9, 3.10, 3.11|gcc>=9*
gcc>=11**|gcc>=9|Ubuntu>= 20.04
WSL2/Ubuntu>=20.04| -|aarch64_cu11 (JetPack 5.1)|aarch64|11.4|SM7 and later|Jetson AGX Orin|JetPack 5.1|3.8|gcc>=9*
gcc>=11**|gcc>=9|Jetson Linux 35.x| -|aarch64_cu12 (JetPack 6.0)|aarch64|12.2|SM7 and later|Jetson AGX Orin|JetPack 6.0 DP|3.10|gcc>=9*
gcc>=11**|gcc>=9|Jetson Linux 36.2| +|x86_64_cu11|x86_64|11.7 or later|SM7 and later|Volta, Turing, Ampere, Hopper, Ada Lovelace|r525 or later*** |3.8, 3.9, 3.10, 3.11|gcc>=9*
gcc>=11**|gcc>=9|Ubuntu>= 20.04
WSL2/Ubuntu>=20.04| +|x86_64_cu12|x86_64|12.2 or later|SM7 and later|Volta, Turing, Ampere, Hopper, Ada Lovelace|r525 or later***|3.8, 3.9, 3.10, 3.11|gcc>=9*
gcc>=11**|gcc>=9|Ubuntu>= 20.04
WSL2/Ubuntu>=20.04| +|aarch64_cu11|aarch64|11.4|SM7 and later|Jetson AGX Orin|JetPack 5.1|3.8|gcc>=9*
gcc>=11**|gcc>=9|Jetson Linux 35.x| +|aarch64_cu12|aarch64|12.2|SM7 and later|Jetson AGX Orin, IGX Orin + Ampere RTX6000, IGX Orin + ADA RTX6000|JetPack 6.0 DP, r535 (IGX OS v0.6)|3.10|gcc>=9*
gcc>=11**|gcc>=9|Jetson Linux 36.2
IGX OS v0.6| \* partial build, no test module (see Known Limitations)
\** full build, including test module
@@ -58,7 +59,7 @@ To get a local copy up and running follow these steps. - The C++ test module cannot build with gcc<11 (requires specific C++-20 features). With gcc-9 or gcc-10, please build with option `-DBUILD_TESTS=0` - [CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12. - Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version. -- Test tarballs (cvcuda-tests-*.tar.xz) need to be unpacked at the root level to find existing tests. +- Documentation built with older toolchains (doxygen, sphinx, breathe, exhale) may be incomplete. We recommend using Ubuntu 22.04 or later. ### Installation @@ -66,211 +67,189 @@ For convenience, we provide pre-built packages for various combinations of CUDA The following steps describe how to install CV-CUDA from such pre-built packages. We support two main alternative pathways: -- DEB or Tar archive installation (C++/CUDA Libraries, Headers, Python bindings) - Standalone Python Wheels (containing C++/CUDA Libraries and Python bindings) +- DEB or Tar archive installation (C++/CUDA Libraries, Headers, Python bindings) Choose the installation method that meets your environment needs. -#### Tar File Installation +#### Python Wheel File Installation -- Installation of C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*): -```shell -tar -xvf cvcuda-lib-0.6.0_beta---linux.tar.xz -tar -xvf cvcuda-dev-0.6.0_beta---linux.tar.xz -``` -- Installation of Python bindings (cvcuda-python*) -```shell -tar -xvf cvcuda-python-0.6.0_beta---linux.tar.xz -``` -with `` the desired CUDA version, -`` the desired Python version and -`` the desired architecture +Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of current CV-CUDA release. 
Release information of all CV-CUDA releases can be found [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example: + ```shell + pip install cvcuda_-0.7.0b0-cp-cp-linux_.whl + ``` + +where `` is the desired CUDA version, `` is the desired Python version and `` is the desired architecture. + +Please note that the Python wheels are standalone, they include both the C++/CUDA libraries and the Python bindings. #### DEB File Installation -- Installation of C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*): +Install C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*) using `apt`: ```shell -sudo apt-get install -y ./cvcuda-lib-0.6.0_beta---linux.deb ./cvcuda-dev-0.6.0_beta---linux.deb +apt install -y ./cvcuda-lib----linux.deb ./cvcuda-dev----linux.deb ``` -- Installation of Python bindings (cvcuda-python*) + +Install Python bindings (cvcuda-python*) using `apt`: ```shell -sudo apt-get install -y cvcuda-python-0.6.0_beta---linux.deb +apt install -y ./cvcuda-python----linux.deb ``` -with `` the desired CUDA version, -`` the desired Python version and -`` the desired architecture - -#### Python Wheel File Installation - - -Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of current CV-CUDA release. Release information of all CV-CUDA releases can be accessed [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example: +where `` is the desired CUDA version, `` is the desired Python version and `` is the desired architecture. 
+#### Tar File Installation +Install C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*): ```shell -pip install cvcuda_-0.6.0b0-cp-cp-linux_.whl +tar -xvf cvcuda-lib----linux.tar.xz +tar -xvf cvcuda-dev----linux.tar.xz ``` -with `` the desired CUDA version, -`` the desired Python version and -`` the desired architecture - -Please note that the Python wheels provided are standalone, they include both the C++/CUDA libraries and the Python bindings. +Install Python bindings (cvcuda-python*) +```shell +tar -xvf cvcuda-python----linux.tar.xz +``` +where `` is the desired CUDA version, `` is the desired Python version and `` is the desired architecture. ### Build from Source Follow these instruction to build CV-CUDA from source: -1. Set up your local CV-CUDA repository - - a. Install prerequisites needed to setup up the repository. - - On Ubuntu >= 20.04, install the following packages: - - git-lfs: to retrieve binary files from remote repository - - ```shell - sudo apt-get install -y git git-lfs - ``` - - b. After cloning the repository (assuming it was cloned in `~/cvcuda`), - it needs to be properly configured by running the `init_repo.sh` script only once. - - ```shell - cd ~/cvcuda - ./init_repo.sh - ``` - -2. Build CV-CUDA - - a. Install the dependencies required for building CV-CUDA - - On Ubuntu >= 20.04, install the following packages: - - g++-11: compiler to be used - - cmake (>= 3.20), ninja-build (optional): manage build rules - - python3-dev: for python bindings - - libssl-dev: needed by the testsuite (MD5 hashing utilities) - - ```shell - sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev - ``` - - For CUDA Toolkit, any version of the 11.x or 12.x series should work. - CV-CUDA was tested with 11.7 and 12.2, thus those should be preferred. - - ```shell - sudo apt-get install -y cuda-11-7 - # or - sudo apt-get install -y cuda-12-2 - ``` - - b. Build the project +#### 1. 
Set up your local CV-CUDA repository - ```shell - ci/build.sh [release|debug] [output build tree path] [-DBUILD_TESTS=1|0] [-DPYTHON_VERSIONS='3.8;3.9;3.10;3.11'] [-DPUBLIC_API_COMPILERS='gcc-9;gcc-11;clang-11;clang-14'] - ``` +Install the dependencies needed to setup up the repository: +- git +- git-lfs: to retrieve binary files from remote repository - The default build type is 'release'. - - If output build tree path isn't specified, it will be `build-rel` for release - builds, and `build-deb` for debug. - - The library is in `build-rel/lib` and executables (tests, etc...) are in `build-rel/bin`. +On Ubuntu >= 20.04, install the following packages using `apt`: +```shell +apt install -y git git-lfs +``` - The `-DBUILD_TESTS` option can be used to disable/enable building the tests (enabled by default, see Known Limitations). +Clone the repository +```shell +git clone https://github.com/CVCUDA/CV-CUDA.git +``` - The `-DPYTHON_VERSIONS` option can be used to select Python versions to build bindings and Wheels for. - By default, only the default system Python3 version will be selected. +Assuming the repository was cloned in `~/cvcuda`, it needs to be properly configured by running the `init_repo.sh` script only once. - The `-DPUBLIC_API_COMPILERS` option can be used to select the compilers used to check public API compatibility. - By default, gcc-11, gcc-9, clang-11, and clang-14 is tried to be selected and checked. +```shell +cd ~/cvcuda +./init_repo.sh +``` -3. Build Documentation +#### 2. Build CV-CUDA - a. 
Install the dependencies required for building the documentation +Install the dependencies required to build CV-CUDA: +- g++-11: compiler to be used +- cmake (>= 3.20), ninja-build (optional): manage build rules +- python3-dev: for python bindings +- libssl-dev: needed by the testsuite (MD5 hashing utilities) +- CUDA toolkit - On Ubuntu >= 20.04, install the following packages: - - doxygen: parse header files for reference documentation - - python3, python3-pip: to install some python packages needed - - sphinx, breathe, exhale, recommonmark, graphiviz: to render the documentation - - sphinx-rtd-theme: documenation theme used +On Ubuntu >= 20.04, install the following packages using `apt`: +```shell +apt install -y g++-11 cmake ninja-build python3-dev libssl-dev +``` - ```shell - sudo apt-get install -y doxygen graphviz python3 python3-pip - sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme - ``` +Any version of the 11.x or 12.x CUDA toolkit should work. +CV-CUDA was tested with 11.7 and 12.2, these versions are thus recommended. - b. Build the documentation - ```shell - ci/build_docs.sh [build folder] - ``` +```shell +apt install -y cuda-11-7 +# or +apt install -y cuda-12-2 +``` - Example: - `ci/build_docs.sh build_docs` +Build the project: +```shell +ci/build.sh [release|debug] [output build tree path] [-DBUILD_TESTS=1|0] [-DPYTHON_VERSIONS='3.8;3.9;3.10;3.11'] [-DPUBLIC_API_COMPILERS='gcc-9;gcc-11;clang-11;clang-14'] +``` -4. Build and run Samples +- The default build type is 'release'. +- If output build tree path isn't specified, it will be `build-rel` for release + builds, and `build-deb` for debug. +- The library is in `build-rel/lib` and executables (tests, etc...) are in `build-rel/bin`. +- The `-DBUILD_TESTS` option can be used to disable/enable building the tests (enabled by default, see Known Limitations). 
+- The `-DPYTHON_VERSIONS` option can be used to select Python versions to build bindings and Wheels for. By default, only the default system Python3 version will be selected. +- The `-DPUBLIC_API_COMPILERS` option can be used to select the compilers used to check public API compatibility. By default, gcc-11, gcc-9, clang-11, and clang-14 is tried to be selected and checked. - For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation. +#### 3. Build Documentation -5. Run Tests +Known limitation: documentation built with older toolchains (doxygen, sphinx, breathe, exhale) may be incomplete. We recommend using Ubuntu 22.04 or later. - a. Install the dependencies required for running the tests +Install the dependencies required to build the documentation: +- doxygen: parse header files for reference documentation +- python3, python3-pip: to install some python packages needed +- sphinx, breathe, exhale, recommonmark, graphiviz: to render the documentation +- sphinx-rtd-theme: documentation theme used - On Ubuntu >= 20.04, install the following packages: - - python3, python3-pip: to run python bindings tests - - torch: dependencies needed by python bindings tests +On Ubuntu, install the following packages using `apt` and `pip`: +```shell +apt install -y doxygen graphviz python3 python3-pip +python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme +``` - ```shell - sudo apt-get install -y python3 python3-pip - sudo python3 -m pip install pytest torch - ``` +Build the documentation: +```shell +ci/build_docs.sh [build folder] +``` +Default build folder is 'build'. - b. Run the tests +#### 4. Build and run Samples - The tests are in `/bin`. You can run the script below to run all - tests at once. Here's an example when build tree is created in `build-rel` +For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation. 
- ```shell - build-rel/bin/run_tests.sh - ``` +#### 5. Run Tests -6. Package installers and Python Wheels +Install the dependencies required for running the tests: +- python3, python3-pip: to run python bindings tests +- torch: dependencies needed by python bindings tests - a. Package installers +On Ubuntu >= 20.04, install the following packages using `apt` and `pip`: +```shell +apt install -y python3 python3-pip +python3 -m pip install pytest torch +``` - Installers can be generated using the following cpack command once you have successfully built the project +The tests are in `/bin`. You can run the script below to run all tests at once. Here's an example when build tree is created in `build-rel`: +```shell +build-rel/bin/run_tests.sh +``` - ```shell - cd build-rel - cpack . - ``` +#### 6. Package installers and Python Wheels - This will generate in the build directory both Debian installers and tarballs - (\*.tar.xz), needed for integration in other distros. +Package installers - For a fine-grained choice of what installers to generate, the full syntax is: +Installers can be generated using the following cpack command once you have successfully built the project: +```shell +cd build-rel +cpack . +``` +This will generate in the build directory both Debian installers and tarballs (\*.tar.xz), needed for integration in other distros. - ```shell - cpack . -G [DEB|TXZ] - ``` +For a fine-grained choice of what installers to generate, the full syntax is: - - DEB for Debian packages - - TXZ for \*.tar.xz tarballs. +```shell +cpack . -G [DEB|TXZ] +``` +- DEB for Debian packages +- TXZ for \*.tar.xz tarballs. - b. Python Wheels +Python Wheels - By default during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python - version(s). 
The wheels are stored in `build-rel/pythonX.Y/wheel` folder, where `build-rel` is the build directory - used to build the release build and `X` and `Y` are Python major and minor versions. The built wheels can be installed using pip. - For example, to install the Python wheel built for CUDA 12.x, Python 3.10 on Linux x86_64 systems: +By default during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python version(s). The wheels are stored in `build-rel/pythonX.Y/wheel` folder, where `build-rel` is the build directory used to build the release build and `X` and `Y` are Python major and minor versions. - ```shell - pip install cvcuda_cu12-0.6.0b0-cp310-cp310-linux_x86_64.whl - ``` +The built wheels can be installed using pip. +For example, to install the Python wheel built for CUDA 12.x, Python 3.10 on Linux x86_64 systems: +```shell +pip install cvcuda_cu12--cp310-cp310-linux_x86_64.whl +``` ## Contributing CV-CUDA is an open source project. As part of the Open Source Community, we are committed to the cycle of learning, improving, and updating that makes this -community thrive. However, as of release v0.6.0-beta, CV-CUDA is not yet ready +community thrive. However, as of release v0.7.0-beta, CV-CUDA is not yet ready for external contributions. To understand the process for contributing the CV-CUDA, see our @@ -287,27 +266,27 @@ The `mkop.sh` script is a powerful tool for creating a scaffold for new operator 1. **Operator Stub Creation**: Generates no-op (no-operation) operator templates, which serve as a starting point for implementing new functionalities. -1. **File Customization**: Modifies template files to include the new operator's name, ensuring consistent naming conventions across the codebase. +2. **File Customization**: Modifies template files to include the new operator's name, ensuring consistent naming conventions across the codebase. -1. 
**CMake Integration**: Adds the new operator files to the appropriate CMakeLists, facilitating seamless compilation and integration into the build system. +3. **CMake Integration**: Adds the new operator files to the appropriate CMakeLists, facilitating seamless compilation and integration into the build system. -1. **Python Bindings**: Creates Python wrapper stubs for the new operator, allowing it to be used within Python environments. +4. **Python Bindings**: Creates Python wrapper stubs for the new operator, allowing it to be used within Python environments. -1. **Test Setup**: Generates test files for both C++ and Python, enabling immediate development of unit tests for the new operator. +5. **Test Setup**: Generates test files for both C++ and Python, enabling immediate development of unit tests for the new operator. #### How to Use `mkop.sh`: -Run the script with the desired operator name. The script assumes it's located in `/cvcuda/tools/mkop`. +Run the script with the desired operator name. The script assumes it's located in `~/cvcuda/tools/mkop`. - ```shell - ./mkop.sh [Operator Name] - ``` +```shell +./mkop.sh [Operator Name] +``` If the script is run from a different location, provide the path to the CV-CUDA root directory. - ```shell - ./mkop.sh [Operator Name] [CV-CUDA root] - ``` +```shell +./mkop.sh [Operator Name] [CV-CUDA root] +``` **NOTE**: The first letter of the new operator name is captitalized where needed to match the rest of the file structures. 
diff --git a/bench/BenchAdaptiveThreshold.cpp b/bench/BenchAdaptiveThreshold.cpp index 658281fd..10fe8570 100644 --- a/bench/BenchAdaptiveThreshold.cpp +++ b/bench/BenchAdaptiveThreshold.cpp @@ -92,5 +92,5 @@ using AdaptiveThresholdTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(AdaptiveThreshold, NVBENCH_TYPE_AXES(AdaptiveThresholdTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_int64_axis("blockSize", {7}); diff --git a/bench/BenchAverageBlur.cpp b/bench/BenchAverageBlur.cpp index fbfc9c4c..0736ccd4 100644 --- a/bench/BenchAverageBlur.cpp +++ b/bench/BenchAverageBlur.cpp @@ -88,6 +88,6 @@ using AverageBlurTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(AverageBlur, NVBENCH_TYPE_AXES(AverageBlurTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("kernelSize", {"7x7"}) .add_string_axis("border", {"REPLICATE"}); diff --git a/bench/BenchBilateralFilter.cpp b/bench/BenchBilateralFilter.cpp index 73875d8e..ff41b949 100644 --- a/bench/BenchBilateralFilter.cpp +++ b/bench/BenchBilateralFilter.cpp @@ -90,7 +90,7 @@ using BilateralFilterTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(BilateralFilter, NVBENCH_TYPE_AXES(BilateralFilterTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_int64_axis("diameter", {-1}) .add_float64_axis("sigmaSpace", {1.2}) .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchBrightnessContrast.cpp b/bench/BenchBrightnessContrast.cpp index 8e741169..ea79f5a1 100644 --- a/bench/BenchBrightnessContrast.cpp +++ b/bench/BenchBrightnessContrast.cpp @@ -88,4 +88,4 @@ using BrightnessContrastTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(BrightnessContrast, 
NVBENCH_TYPE_AXES(BrightnessContrastTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchColorTwist.cpp b/bench/BenchColorTwist.cpp index 67e90af8..1ade029f 100644 --- a/bench/BenchColorTwist.cpp +++ b/bench/BenchColorTwist.cpp @@ -82,4 +82,4 @@ using ColorTwistTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(ColorTwist, NVBENCH_TYPE_AXES(ColorTwistTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchComposite.cpp b/bench/BenchComposite.cpp index 2293ecab..f29f26ac 100644 --- a/bench/BenchComposite.cpp +++ b/bench/BenchComposite.cpp @@ -88,4 +88,4 @@ using CompositeTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Composite, NVBENCH_TYPE_AXES(CompositeTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchCopyMakeBorder.cpp b/bench/BenchCopyMakeBorder.cpp index 722c37d0..8d26487a 100644 --- a/bench/BenchCopyMakeBorder.cpp +++ b/bench/BenchCopyMakeBorder.cpp @@ -92,5 +92,5 @@ using CopyMakeBorderTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(CopyMakeBorder, NVBENCH_TYPE_AXES(CopyMakeBorderTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchCvtColor.cpp b/bench/BenchCvtColor.cpp index 05469e0f..abe1951e 100644 --- a/bench/BenchCvtColor.cpp +++ b/bench/BenchCvtColor.cpp @@ -80,4 +80,4 @@ using CvtColorTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(CvtColorTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - 
.add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchErase.cpp b/bench/BenchErase.cpp index 68419ad9..2bb504d2 100644 --- a/bench/BenchErase.cpp +++ b/bench/BenchErase.cpp @@ -91,5 +91,5 @@ using EraseTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Erase, NVBENCH_TYPE_AXES(EraseTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {0}) + .add_int64_axis("varShape", {-1, 0}) .add_int64_axis("numErase", {3}); diff --git a/bench/BenchFindContours.cpp b/bench/BenchFindContours.cpp deleted file mode 100644 index 06deb973..00000000 --- a/bench/BenchFindContours.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "BenchUtils.hpp" - -#include -#include -#include - -#include - -using CPUImage = std::vector; - -static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor = {0, 0}, - nvcv::Size2D size = {5, 5}, double angle = 0.0, bool fill = true, uint8_t setValue = 1); - -static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor, nvcv::Size2D size, - double angle, bool fill, uint8_t setValue) -{ - auto rad = angle * (M_PI / 180.0); - auto cosAngle = std::cos(rad); - auto sinAngle = std::sin(rad); - - auto transformed = anchor; - for (auto y = 0; y < size.h; ++y) - { - for (auto x = 0; x < size.w; ++x) - { - transformed.w = anchor.w + (x * cosAngle - y * sinAngle); - transformed.h = anchor.h + (x * sinAngle + y * cosAngle); - - if (fill || y == 0 || y == size.h - 1 || x == 0 || x == size.w - 1) - { - if (transformed.w >= 0 && transformed.w < boundary.w && transformed.h >= 0 - && transformed.h < boundary.h) - { - image[transformed.h * boundary.w + transformed.w] = setValue; - } - } - } - } -} - -template -inline void FindContours(nvbench::state &state, nvbench::type_list) -try -{ - srand(0U); // Use a fixed random seed - long3 shape = benchutils::GetShape<3>(state.get_string("shape")); - long varShape = state.get_int64("varShape"); - int numPoints = static_cast(state.get_int64("numPoints")); - - // R/W bandwidth rationale: - // Read image + connected components (S32) - // Write points + contours (U32) - state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) + sizeof(int))); - state.add_global_memory_writes(shape.x * numPoints * sizeof(int) * 2 + shape.x * 4 * sizeof(int)); - - cvcuda::FindContours op(nvcv::Size2D{(int)shape.z, (int)shape.y}, shape.x); - - // clang-format off - - nvcv::Tensor points({{shape.x, numPoints, 2}, "NCW"}, nvcv::TYPE_S32); - nvcv::Tensor counts({{shape.x, 4}, "NW"}, nvcv::TYPE_S32); - - if (varShape < 0) // negative var shape means use Tensor - { - 
nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); - auto inData = src.exportData(); - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); - - //Generate input - CPUImage srcVec(shape.y * shape.z, 0); - for (auto i = 0; i < 10; ++i) - { - int anchorX = rand() % shape.z; - int anchorY = rand() % shape.y; - int sizeX = rand() % (shape.z - anchorX); - int sizeY = rand() % (shape.y - anchorY); - generateRectangle(srcVec, {anchorX, anchorY}, {sizeX, sizeY}); - } - - for (auto i = 0; i < shape.x; ++i) - { - CUDA_CHECK_ERROR(cudaMemcpy2D(inAccess->sampleData(i), inAccess->rowStride(), srcVec.data(), shape.z, shape.z, - shape.y, cudaMemcpyHostToDevice)); - } - - state.exec(nvbench::exec_tag::sync, [&op, &src, &points, &counts](nvbench::launch &launch) - { - op(launch.get_stream(), src, points, counts); - }); - } - else // zero and positive var shape means use ImageBatchVarShape - { - throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); - } -} -catch (const std::exception &err) -{ - state.skip(err.what()); -} - -// clang-format on - -using FindContoursTypes = nvbench::type_list; - -NVBENCH_BENCH_TYPES(FindContours, NVBENCH_TYPE_AXES(FindContoursTypes)) - .set_type_axes_names({"InOutDataType"}) - .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) - .add_int64_axis("numPoints", {1024}); diff --git a/bench/BenchFlip.cpp b/bench/BenchFlip.cpp index 620eac7f..9c052f62 100644 --- a/bench/BenchFlip.cpp +++ b/bench/BenchFlip.cpp @@ -95,5 +95,5 @@ using FlipTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Flip, NVBENCH_TYPE_AXES(FlipTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("flipType", {"BOTH"}); diff --git a/bench/BenchGaussian.cpp b/bench/BenchGaussian.cpp index 8b4fc30d..a1976581 100644 --- a/bench/BenchGaussian.cpp +++ 
b/bench/BenchGaussian.cpp @@ -91,6 +91,6 @@ using GaussianTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Gaussian, NVBENCH_TYPE_AXES(GaussianTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_float64_axis("sigma", {1.2}) .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchGaussianNoise.cpp b/bench/BenchGaussianNoise.cpp index 68633a90..09dcd04e 100644 --- a/bench/BenchGaussianNoise.cpp +++ b/bench/BenchGaussianNoise.cpp @@ -84,4 +84,4 @@ using GaussianNoiseTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(GaussianNoise, NVBENCH_TYPE_AXES(GaussianNoiseTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchHQResize.cpp b/bench/BenchHQResize.cpp index 9d80963e..49ff4141 100644 --- a/bench/BenchHQResize.cpp +++ b/bench/BenchHQResize.cpp @@ -122,7 +122,7 @@ using HQResizeTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(HQResize, NVBENCH_TYPE_AXES(HQResizeTypes)) .set_type_axes_names({"InOutDataType"}) - .add_int64_axis("batch", {false}) + .add_int64_axis("batch", {false, true}) .add_string_axis("shape", {"1x1080x1920"}) .add_string_axis("interpolation", {"CUBIC"}) .add_int64_axis("antialias", {false, true}) diff --git a/bench/BenchHistogramEq.cpp b/bench/BenchHistogramEq.cpp index 54082d55..74bcb9d4 100644 --- a/bench/BenchHistogramEq.cpp +++ b/bench/BenchHistogramEq.cpp @@ -74,4 +74,4 @@ using HistogramEqTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(HistogramEq, NVBENCH_TYPE_AXES(HistogramEqTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchInpaint.cpp b/bench/BenchInpaint.cpp index 88a237b3..ed6dbd05 100644 --- a/bench/BenchInpaint.cpp +++ b/bench/BenchInpaint.cpp 
@@ -82,4 +82,4 @@ using InpaintTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Inpaint, NVBENCH_TYPE_AXES(InpaintTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchJointBilateralFilter.cpp b/bench/BenchJointBilateralFilter.cpp index 45c325bd..2aa74804 100644 --- a/bench/BenchJointBilateralFilter.cpp +++ b/bench/BenchJointBilateralFilter.cpp @@ -94,7 +94,7 @@ using JointBilateralFilterTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(JointBilateralFilter, NVBENCH_TYPE_AXES(JointBilateralFilterTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_int64_axis("diameter", {-1}) .add_float64_axis("sigmaSpace", {1.2}) .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchLabel.cpp b/bench/BenchLabel.cpp index 41005379..5e1870f5 100644 --- a/bench/BenchLabel.cpp +++ b/bench/BenchLabel.cpp @@ -32,15 +32,16 @@ try std::string runChoice = state.get_string("runChoice"); - // Use [BG][MIN][MAX][ISLAND][COUNT][STAT] in runChoice to run Label with: - // background; minThreshold; maxThreshold; island removal; count; statistics + // Use [BG][MIN][MAX][ISLAND][COUNT][STAT][MASK] in runChoice to run Label with: + // background; minThreshold; maxThreshold; island removal; count; statistics; mask - long3 staShape{srcShape.x, 10000, 6}; // using fixed 10K max. cap. and 2D problem + long3 staShape{srcShape.x, 10000, 7}; // using fixed 10K max. cap. 
and 2D problem - NVCVConnectivityType conn = NVCV_CONNECTIVITY_4_2D; - NVCVLabelType alab = NVCV_LABEL_FAST; + NVCVConnectivityType conn = NVCV_CONNECTIVITY_4_2D; + NVCVLabelType alab = NVCV_LABEL_FAST; + NVCVLabelMaskType mType = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY; - nvcv::Tensor bgT, minT, maxT, countT, statsT, mszT; + nvcv::Tensor bgT, minT, maxT, countT, statsT, mszT, maskT; cvcuda::Label op; @@ -81,16 +82,20 @@ try { statsT = nvcv::Tensor({{staShape.x, staShape.y, staShape.z}, "NMA"}, benchutils::GetDataType
()); } + if (runChoice.find("MASK") != std::string::npos) + { + maskT = nvcv::Tensor({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, nvcv::TYPE_U8); + } nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType
()); benchutils::FillTensor(src, benchutils::RandomValues()); - state.exec(nvbench::exec_tag::sync, - [&op, &src, &dst, &bgT, &minT, &maxT, &mszT, &countT, &statsT, &conn, &alab](nvbench::launch &launch) + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &bgT, &minT, &maxT, &mszT, &countT, &statsT, &maskT, &conn, + &alab, &mType](nvbench::launch &launch) { - op(launch.get_stream(), src, dst, bgT, minT, maxT, mszT, countT, statsT, conn, alab); + op(launch.get_stream(), src, dst, bgT, minT, maxT, mszT, countT, statsT, maskT, conn, alab, mType); }); } catch (const std::exception &err) diff --git a/bench/BenchLaplacian.cpp b/bench/BenchLaplacian.cpp index e685198e..7956d8c2 100644 --- a/bench/BenchLaplacian.cpp +++ b/bench/BenchLaplacian.cpp @@ -85,7 +85,7 @@ using LaplacianTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Laplacian, NVBENCH_TYPE_AXES(LaplacianTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_int64_axis("ksize", {1}) .add_float64_axis("scale", {1.0}) .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchMedianBlur.cpp b/bench/BenchMedianBlur.cpp index 45b2c1a6..0520f5f2 100644 --- a/bench/BenchMedianBlur.cpp +++ b/bench/BenchMedianBlur.cpp @@ -82,5 +82,5 @@ using MedianBlurTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(MedianBlur, NVBENCH_TYPE_AXES(MedianBlurTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("kernelSize", {"5x5"}); diff --git a/bench/BenchMinMaxLoc.cpp b/bench/BenchMinMaxLoc.cpp index 582348fd..40e8385b 100644 --- a/bench/BenchMinMaxLoc.cpp +++ b/bench/BenchMinMaxLoc.cpp @@ -88,5 +88,5 @@ using MinMaxLocTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(MinMaxLoc, NVBENCH_TYPE_AXES(MinMaxLocTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", 
{"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_int64_axis("maxLocations", {100000}); diff --git a/bench/BenchMorphology.cpp b/bench/BenchMorphology.cpp index d3947e78..f357dbff 100644 --- a/bench/BenchMorphology.cpp +++ b/bench/BenchMorphology.cpp @@ -128,7 +128,7 @@ using MorphologyTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Morphology, NVBENCH_TYPE_AXES(MorphologyTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_int64_axis("iteration", {1}) .add_string_axis("kernelSize", {"3x3"}) .add_string_axis("morphType", {"ERODE", "DILATE", "OPEN", "CLOSE"}) diff --git a/bench/BenchNormalize.cpp b/bench/BenchNormalize.cpp index 64eed3e3..9e7cc09e 100644 --- a/bench/BenchNormalize.cpp +++ b/bench/BenchNormalize.cpp @@ -96,4 +96,4 @@ using NormalizeTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Normalize, NVBENCH_TYPE_AXES(NormalizeTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchPillowResize.cpp b/bench/BenchPillowResize.cpp index 359480e2..1340a9f2 100644 --- a/bench/BenchPillowResize.cpp +++ b/bench/BenchPillowResize.cpp @@ -100,6 +100,6 @@ using PillowResizeTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(PillowResize, NVBENCH_TYPE_AXES(PillowResizeTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("resizeType", {"CONTRACT"}) .add_string_axis("interpolation", {"CUBIC"}); diff --git a/bench/BenchRandomResizedCrop.cpp b/bench/BenchRandomResizedCrop.cpp index b7f58c57..661a5e42 100644 --- a/bench/BenchRandomResizedCrop.cpp +++ b/bench/BenchRandomResizedCrop.cpp @@ -98,6 +98,6 @@ using RandomResizedCropTypes = nvbench::type_list; 
NVBENCH_BENCH_TYPES(RandomResizedCrop, NVBENCH_TYPE_AXES(RandomResizedCropTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("resizeType", {"EXPAND"}) .add_string_axis("interpolation", {"LINEAR"}); diff --git a/bench/BenchRemap.cpp b/bench/BenchRemap.cpp index 7fc20600..3f3825c8 100644 --- a/bench/BenchRemap.cpp +++ b/bench/BenchRemap.cpp @@ -116,5 +116,5 @@ using RemapTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Remap, NVBENCH_TYPE_AXES(RemapTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("mapType", {"DENSE"}); diff --git a/bench/BenchResize.cpp b/bench/BenchResize.cpp index 7446a6f8..b8fb517a 100644 --- a/bench/BenchResize.cpp +++ b/bench/BenchResize.cpp @@ -92,6 +92,6 @@ using ResizeTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Resize, NVBENCH_TYPE_AXES(ResizeTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("resizeType", {"EXPAND"}) .add_string_axis("interpolation", {"LINEAR"}); diff --git a/bench/BenchRotate.cpp b/bench/BenchRotate.cpp index 4f4af05c..bfd58527 100644 --- a/bench/BenchRotate.cpp +++ b/bench/BenchRotate.cpp @@ -87,5 +87,5 @@ using RotateTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(Rotate, NVBENCH_TYPE_AXES(RotateTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("interpolation", {"CUBIC"}); diff --git a/bench/BenchThreshold.cpp b/bench/BenchThreshold.cpp index 648a83ac..1c87a799 100644 --- a/bench/BenchThreshold.cpp +++ b/bench/BenchThreshold.cpp @@ -82,4 +82,4 @@ using ThresholdTypes = nvbench::type_list; 
NVBENCH_BENCH_TYPES(Threshold, NVBENCH_TYPE_AXES(ThresholdTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}); + .add_int64_axis("varShape", {-1, 0}); diff --git a/bench/BenchWarpAffine.cpp b/bench/BenchWarpAffine.cpp index 459c3b32..a028e28b 100644 --- a/bench/BenchWarpAffine.cpp +++ b/bench/BenchWarpAffine.cpp @@ -89,7 +89,7 @@ using WarpAffineTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(WarpAffine, NVBENCH_TYPE_AXES(WarpAffineTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("border", {"REFLECT"}) .add_string_axis("interpolation", {"CUBIC"}) .add_string_axis("inverseMap", {"Y"}); diff --git a/bench/BenchWarpPerspective.cpp b/bench/BenchWarpPerspective.cpp index 87498612..f18108e8 100644 --- a/bench/BenchWarpPerspective.cpp +++ b/bench/BenchWarpPerspective.cpp @@ -89,7 +89,7 @@ using WarpPerspectiveTypes = nvbench::type_list; NVBENCH_BENCH_TYPES(WarpPerspective, NVBENCH_TYPE_AXES(WarpPerspectiveTypes)) .set_type_axes_names({"InOutDataType"}) .add_string_axis("shape", {"1x1080x1920"}) - .add_int64_axis("varShape", {-1}) + .add_int64_axis("varShape", {-1, 0}) .add_string_axis("border", {"REFLECT"}) .add_string_axis("interpolation", {"CUBIC"}) .add_string_axis("inverseMap", {"Y"}); diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index e82bf3da..3ca00027 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -53,7 +53,6 @@ set(bench_sources BenchCropFlipNormalizeReformat.cpp BenchCustomCrop.cpp BenchErase.cpp - BenchFindContours.cpp BenchGammaContrast.cpp BenchGaussianNoise.cpp BenchHistogramEq.cpp diff --git a/bench/python/all_ops/op_copymakeborder.py b/bench/python/all_ops/op_copymakeborder.py index c0bca25b..2f57475d 100644 --- a/bench/python/all_ops/op_copymakeborder.py +++ b/bench/python/all_ops/op_copymakeborder.py @@ -24,7 +24,7 @@ 
class OpCopyMakeBorder(AbstractOpBase): def setup(self, input): self.border_mode = cvcuda.Border.CONSTANT - self.border_values = [255, 0, 0] # Border values for 3 channel input. + self.border_values = [255, 0, 0] # Border values for 3 channel RGB input. self.top = 30 self.left = 40 self.bottom = 50 diff --git a/bench/python/all_ops/op_findcontours.py b/bench/python/all_ops/op_findcontours.py deleted file mode 100644 index 7fe31cab..00000000 --- a/bench/python/all_ops/op_findcontours.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise -# things may throw unexpected errors. 
-import pycuda.driver as cuda # noqa: F401 - -from bench_utils import AbstractOpBase -import cvcuda -import torch -from torchvision.io import read_image -import matplotlib.pyplot as plt -import numpy as np -import os -import logging - -logger = logging.getLogger(__name__) - - -class OpFindContours(AbstractOpBase): - def setup(self, input): - grayscale_input = read_image( - os.path.join(self.assets_dir, "countour_lines.jpg") - ) - grayscale_input = grayscale_input.moveaxis( - 0, -1 - ).contiguous() # From CHW to HWC - # Binarize the grayscale_input - grayscale_input[grayscale_input <= 50] = 0 - grayscale_input[grayscale_input > 50] = 255 - - grayscale_input = [grayscale_input.clone() for _ in range(input.shape[0])] - grayscale_input = torch.stack(grayscale_input) - grayscale_input = grayscale_input.cuda(self.device_id) - self.grayscale_input = cvcuda.as_tensor(grayscale_input, "NHWC") - - def run(self, input): - return cvcuda.find_contours(self.grayscale_input) - - def visualize(self): - """ - Attempts to visualize the output produced by the operator as an image by writing it - down to the disk. May raise exceptions if visualization is not successful. - """ - output_dir = self._setup_clear_output_dir(filename_ends_with="_op_out.jpg") - # Convert the inputs and outputs to numpy arrays first. - # input shape: NHWC - # out[0] = points_info shape: NxMx2 (M == max points, 2 for x and y coordinates) - # out[1] = contours_info shape: NxC where - # (C == max contours, number of non-zero elements are number of contours) - input_npy = ( - torch.as_tensor( - self.grayscale_input.cuda(), device="cuda:%d" % self.device_id - ) - .cpu() - .numpy() - ) - points_npy = ( - torch.as_tensor(self.op_output[0].cuda(), device="cuda:%d" % self.device_id) - .cpu() - .numpy() - ) - num_contours_npy = ( - torch.as_tensor(self.op_output[1].cuda(), device="cuda:%d" % self.device_id) - .cpu() - .numpy() - ) - - # Loop over all the images... 
- for i, img in enumerate(input_npy): - - # Grab the information on the points and the contours of this image. - points_info = points_npy[i] - contours_info = num_contours_npy[i] - - # Keep only the non-zero entries from contours_info - contours_info = contours_info[np.nonzero(contours_info)] - # Use the num_points in contours_info to split the points_info - # Since the values in num_points are not start-stop indices of the points - # we need to use cumsum to fix it and use it inside the split function - valid_points = np.split(points_info, contours_info.cumsum()) - # Last element in valid_points is the remainder of the points so need to drop it. - all_contours = valid_points[:-1] # This list stores OpenCV style contours. - - plt.figure(figsize=(img.shape[1] / 100.0, img.shape[0] / 100.0)) - plt.gca().invert_yaxis() - - plt.plot(0, 0, color="white") - plt.plot(img.shape[1], img.shape[0], color="white") - for contour in all_contours: - x, y = contour[:, 0], contour[:, 1] - plt.plot(x, y, color="green", linewidth=2) - - # Save using PIL - out_file_name = "img_%d_op_out.jpg" % i - plt.savefig(os.path.join(output_dir, out_file_name)) - plt.close() diff --git a/bench/python/all_ops/op_flip.py b/bench/python/all_ops/op_flip.py index 962a1285..d93a1c14 100644 --- a/bench/python/all_ops/op_flip.py +++ b/bench/python/all_ops/op_flip.py @@ -21,9 +21,25 @@ import cvcuda -class OpFlip(AbstractOpBase): +class OpFlipX(AbstractOpBase): def setup(self, input): - self.flip_code = -1 # means flipping around both axes. + self.flip_code = 0 # means flipping around x axis. + + def run(self, input): + return cvcuda.flip(input, flipCode=self.flip_code) + + +class OpFlipY(AbstractOpBase): + def setup(self, input): + self.flip_code = 1 # means flipping around y axis. + + def run(self, input): + return cvcuda.flip(input, flipCode=self.flip_code) + + +class OpFlipXY(AbstractOpBase): + def setup(self, input): + self.flip_code = -1 # means flipping around x and y axis. 
def run(self, input): return cvcuda.flip(input, flipCode=self.flip_code) diff --git a/bench/python/assets/brooklyn_bboxes.pt b/bench/python/assets/brooklyn_bboxes.pt index 3261e47208b22be52416c8639207fe1bf06569e9..69bc4260cdd55b9f99c4c819248b0615ccf55051 100644 GIT binary patch literal 131 zcmWN{yAi`63;@vHQ?Njc0U^NK@DYK#B={oiBGbp!Yu-g)?&BlcoCohxKKp#UDzCTg zOlux*$p>Y5sb%ycxJ7Mox1g)DfF@YwLk1`=rj#}v75Jiy!$9aELrBpTLP&Q|&JY+x OsCwc4BcTFXy)A$44ksM| literal 196520 zcmZ_12Yi-A^Y1;o3rR@tz4wHYkU|M1^bVo--g^g8k)nXoR6qd>q9WKH6+{sP6-7a@ zAPNX5Ql$6ZVb6DOpZ_`Mec#WkA9Cm3Yp>banc3gY&dzSywlw8Le0=2p^IwpJN&o(1 zrVN^pH?ZHtepSYf7}=yjkaYil{x?Gpm^fzqfT30TOqw`s}cX#AjojfM@F=sV6Qr~SAJ6UX_sALm!4ZA)*TssDV|a}AW;zqbFsy$+D1|GU?D z`Ar(c{lEMCe@mcHc0Ft^N9C#wkoI!UD*45hmjIb3YwQH+EcfhjJ4|Awv0Sr3QeHO7 z1v^GUB$&Eg#8-UeruC=%YdK+`rCz#RvdzU?D#=}&DoXA{;Y;z7LiyeHlUlM$?%P6o z5U(ECol;Jk(_UYxC)?zKZ9sWG<-b$jR(9EOQYyEgQ%kfoRS)eSl=r0mBT`>Z${kx> zjI^ixFy(#ZCp%XfNQk;=3*e`hveefyT7Iz4(Mq_wZ7WmW2_@};uc`8feL-qSeRbC+ zK&Ksau24Q-j@o5XEZr$j=KXf+zO_7mNlx2WB}dBCJ?d*2OZh*vyGqX5S150yZrNDq zKg08Vl&_Uj(5WOn)OAn!XVgCqov-8=^{dI->V{3D-f{RjLBIc!-S$OkC|>%Gtpfdh zlpm-3w(PL;q=QtTydvd0C_l|-sQS@9CC%g_bhG*Xu7`ek$xy%B$D~O9rax)CKNr5f zkqT0v{zPw)@+6{G|)^X7c=L_-ZLX+SQc%NFef9VW&t7c|(OuU3r)C^Pci3 zwVC?Wamux!k?@_01HS{uE-?c~4XQn=FJAouk_$)$RKT-Yz<>~qw_UR`t>yy;4 zAa7B=gz`rEmK}mEKda9oCok!)@AF-tci;Aw>vowwV{@rDOkcKLk?$DoBOT>0yG|do zk&LHm`n>JP_xcY@(2kSGXW9>I``PZ* zN3E}f%6Rxqr`<1f6k~6n{Z9Xl{3B!r?We={4>}2X?zFq~U$%lo$x(fs@(S{YerPMn z5qn1e37r_JV}wzA-)_@4=|_1HbBJ~$r7h+4u$v#X$SApGBg`oq4BuVNE$Dd35$tsm z&s&)L*uR#B<|O?KkkinARBqTN<~n%dE%nVF8%6u&jFLfe(l#`g;Kx^*Qa^(Jg&HG$ zZenla-N6kaq8v2XP4I9Y&@0*o28cv>opFYsv z$ou}F-<$9shvb)&*MXC5(CZYZH7A3bI=PBN+0kUFa+QQCmrH7KhCjy8mYk<|N* zR@#B}{jldA*ia)Zb)rPeAN0Eivg(I^$MY{u(qtyce}sC&c&|V9@CY(Yp}pl`;{&jI z0x}tn9X&^R8tr{X{qkVebZlcRMmjuWMwEZ;txhfX>2m|lKq>2ibt?)Vr zIs#6HQQiccs-c49SNK?f-3Nk`p_C6$AyQL?$|+>K5ZaBvnP})QcK8LI2hdr7yxW5# z(eOJ}Rgl&yNg|b!*~n=Kcp8Ol3+YoMl_s^pvEj^VBf+0UB$TVli;s$7UI|3%SL^_A 
zHXgZU!FvVTcno^^@IH;t7<8Edo}ZBF@;ZEk^L;w+`SBUUd$-ZxNA%&gy<(qMn$`oM zbIl%NzWi2sBjq!84gEA&lfOE`eAGn#K&}t$9?Byq_oAPl+1*&lIrvYIFTw4H9y{D5 zvuunSEO(JhAU3j1{bTbbS2oB+`;^KS)I#r1*-zAQ+Y-5ch`r5#W<29Pigq_4_qMd~ zJ$jm^;w4Q-qv`MM9(3B05$ERf5h?+k4WU1~?PYMd8}^w(KWC^+>8Jy#wa0qvQ?@Dn z?51kVBgni==QA$8vsvI~Df9M0RTCL)x6O4U#@#!%F1TKa`6f{pSywbNd*1i7Blb)+I4|FFyi}&niaM~Na4A!~ydyl=N zTQOQbx7&3E^b;9s(R!QKcwFDv-=OP5`Qz|861xd9)sfUOdy4tZ2TR|iV;Ltm zs9%ho{BEyk#YhX6oy-$+8M|K8FGXKhn3w#if1A&~Qc=Q8w$!Bk2yhsbm5Sh26831} zqbC}uWGp!BFSX3K*mVtw0hgNcy#PG(;j@J~0Pa;uk6P_GxyqiDa6d4Sw6+9}{+1o9jU-e+K6!@%bn z=w~{(9Y+0^!T(fgB##>}topKDLc5_@-rMFi{Jz=p6!S=ZH2EL+3gpSV=6%W^mo@O8 zhVS|f?F1vA4dyiXUq`+$K4|qQxb5bnNZHT4k%A<58aJQbwkMf4!oa0#X1?_TrS_Q+ ztn3ea#oR$|F_^tC^UrEDoDQbmv|3i-KRe|8mUg>fv%yI9w!LOA*c?zc5Dck}om8d1 z0?+-ym=rM53rh7U@50k__6T$DA^hw9NIHelcm-dhC0^HFbg|72$9qkr4-IL54hVhS z{={5$mHDSGlzc(yTG(iJe*4A_KsNdGusW26purXR^f&D$I}vV+kxwJEIgpX^9%JFU z{m?#+?3>c&7}^_)y?urEP!1n@2{LFR*)jo-dXwz6-yzL-`P+WP&no=hg+5>9b3ar; zak~-P@z80@+t*kURR4Ffj=sWE}IKzT5)4LIQ^+if1d zC&B4*+HvE0lxj|E%WNHW$aa<~@ZDL;;afflo+|k14(^p8;a`zjwE7D?J!|iQ7`;_6 ztBDr!P)2}RhwLlh*HtWi9Pec_uF_O8K7JkfSw-X31u-8~roRzj5;%)aeuw`<_5tlA z(6125Q3dp|xBR1Wk>)q{b>+Ny5Lz)t)eO0(s^GP3uHTkC8c@P0h}x23EW`u!6+ zZz6{raO@epj3717#_KlfE_{A!Z`+|d4UJ?X=jLGkTorBFpxr}uxUPb>?%4@yKWmfy za$UDX8hhvS zO8eubx@m`gKeoGdb@XWMLOohuz!uX?J@9S^{&N-V$VV1~?*p-KF)>(Q27a_c>oMSb zvb+gSRW$j??JhX%yl*dgl6m1bCqI(`ojdk{PNm!*zuB$5)1;mWg3cuyXdJ3C?-+l^ zJTn1!RE9osO(fs^!C7y5QBI=4wZ^a7jX2{vRUmyGa2tI>A$G6RPTazBYYm(9G z9=p|OPyO}g80-5{vevk9^{f2>{C0Hqn`_J;9l>XZn+NP~l)K-qnAKqBvvSNt!1ZB! 
z)kM?Z7{+pQ>y6iVi&kn-eixgr0H&0e{j5hCz%#M{i>}!7tmFFf#!dX3f;YYEgl z&U#`dGRpzkZrBz`#~%rMF~)|2rgxbI2OzipAe1ljS1Khv@w9!gnuS>EYYU`rjS z)P?F|s2`&IA9}T%UUa0bK|I;SxT(PSJjP7485x&gXI;?74eZ&dG(7e&`4n5Iif2$j zuVU2`jQMUrV>67o7Myi_FYX-i23UggY`362NvQnL_GMV*Lm|LzfHw~qJWqJ}Q zPvDp16-hm%h)s=qqT$Tho4g@6JHat2>FNxdh*x|(dDUtcos+{2T35}6gsQ`RbH zC+F~=ZxL^4D_hhuMnfZe1{@A#ru|txfJzS>On)S|wRc z-oR>JVU1E%J&r$IUl!S&Hb@_2UA0sOs_)&Lqps=Z@`n8bJP(q7c&by>Q~37vm6!rq zZSUC5x;C?Bfm{Qh)70C<5g7IKtDnw8Iz>{Zkpk~|n+Eb6_Bj#L3xa<(M@Cr8f|4 zI?!?W+?nTYJz5>St_==#Fu9b5gXgJ?=w-hR9n$!`-(0@m&;Q%qIr zCCN17tUX$0K|h7hr_e(vh_;YyPWLO%l>TL>TFSlnCYF!;9&{ElIC`pPFJ2kxWr zn>RDgHp|DxdHZMW*Piki@cd=Q{UPENF5-O9?lcwQGg5vqJD{^hb~7*e$yWO->kKz9 zTu1J~@~G?qw_Pi6!;;(aK=I->^L2sa;e>!u@l31%mtSlGOrXMlP)}Q zK5-#>ZiCKZXtx0Abf^4LyrwF&KA7jxw30{f$53Yh*6L=DiFnSjv{T#R5|+A-Rgf?3 zOb2VzdC~}uo}lipjPVEffiHkb>Ad+ky%~k2SZw|r5r%ajEvtQyzq&ln@3E>9_?ga{ zv;nnKWiX>=3wE2YvWV^lW0Q?(H5K0b$S=@HRmJ#MMs^~LG}>s(^E0%Wsq&C=1$>q_^z*l+nGcO@7M&D zZ*#=uPy7DGi$0>WSI&DsoR6k z8J_Y7$Y2K+xlZ{|qZR&mGw7sAv5I7s)g63}17};pLtV-pZX2+^6>$vb_m|2v^0^a%}D)BRY9H= zm#dHrMA4!uMu&(RCzE4z$1 z{Bj$l;?!<+0InO$Yx*Vlf0udV72-7)k>eKo3)dVl=#OUdli4U9@d#8e#EVJ$s@8Ce|{%E%xIm!*vcdRkj$7s zFZjDB@n%NrNJdnNEYfjEN6WXm5u+tQ=JH-TV`8KZM9UfS7xRK)q`Uy0=OE9itREU6 z7a#CwH}tHBCI_T})#}Hb?~jooKI?smr%uaJUpJiXwho zgcT-%&(-)Ik00&uBaY{8HLUQPoqrdDzwP*|AQP#dhn`1M9>vI>Vd}tbD*kjMq*Vcb zxiRG$b)Pv7hIS$Sz$~W>C?f9Uv z^Uw)|9|bqw@VgIuF6MW|?Cau46&Pz3X#W~sdOb=M_!mU|6r>x)^M|aJ>mUziHohgI z`2yaKLlcd7?+(6q6r;Bh7#oEZHG^sy_H&!H>;*7#~hDL9R9rtMkUhnY%s*dp$1|DO~Cq3 zu--UmF7n`cE;Gg>pzznkHyZF(Fz=O7J44>U_Dbaody=Qsps*QVF&oZa0C5Mwa|Km^ z-s5B!9A(g>neaUu4>MR*VO+%{@m4&oEKk$Z7eLrVa5fQ(xWfAZv_Fwr*O&)WR4_PI zSFSUsg(A7R)Cy8lSeJ)jXN@FSHO4-p@%@YC5tu9n&S}Ya?uDuDqjZmqKv;NW?|1z0+72)qWdOlm>8_RHf z{su_4g!k?cBkC)wJanGL&uc=zGl&qS(SWfC!#RDK(+Dl&(KgQpddW-VEw z+VUAM^Hgm#_XU1$F?x%a_f%b^5-!_R87r_g*2O(85((Vm@n1t^vziPJ9kpjwP4K5S zc-oiV#)8LV(Bw6HgYS3o8xN~~SYrf!ZyD0PZ_~lMU+`cqt5Hx7VxB0&V|dqAVI6;! 
z9aZDO!&3bIJZ$ZV?W$)%?UFsJF0)_yvTS4@cn&>TqL;(Zr}lt)!~R8NV>5DCNPj-m zW3j@$Hd!0|!slo|pE*8CUev^I_^hiNz-2}7HjDBH_7Rqf10BU z(P|j`f^Hlo$V{DsU7WUSbte2)l8N}wE&?!L=h6=!{8JZac5&fStT91e*Ll>xX5Znv zi=Qrrek77vsxugO(ejCILN8)unQqS0Jefy3^@%v`*0mV3+47NY1Gn+=s4kGnSe`L? z@KJ|&Oh3l1uRNvywnNc-HtTCkr0BFBLAk&Dt$VW09Lhdi9&b00e{@T%qze9Z85aBj z_*@OU43!BzVs0%~SsjW=x!xK;@{6`IbB)?nfwLwKGg zn)N~}Ms|^DO{<~AnFlfs;w9O%gG*nj!KWYfJDc7}F&e+SDcXrAp3wpOk78f42S3Ak z?$*`*GJtl2_}&}(e&Eq4uZmsI1Rrao zpJk>!@^?5~lkfAu@fsjTdEyzJD38Ri-pW3&uX@QOf*{_q#dL+s7+Gz;1BbPGlX<5j zIQ$cIT)btq*#(~8qMq~4j)Kpv>2186G;cFYAC+CqU(u|9&mte^c?)lQ^ZY0_UV@z| znM8RsWnGZ6%gktwy*XcJ4E>E`toA`ZiC}GSwB=$YefZ~M*QHoN4z}6}Z9HI4uax$y zGDb@n-AQ14A23}Z(`>ZSh(1=LJPXX|#5*pJqdN6#U?DXqugLS})bs}p>(JXAY~RKC z+u$j;g=#c-?cyb2|Kb~S!&GpqA{1s*?)E!o!fA7;y^7Ql;Cun2BoQuNq7N0}`~~U{ zqQ)y=rQ>}8GU&$lH~80^-#!KJQfO%bR{j`zI0Noh1uwVrG>zY$qMa9!%YFE+#IyZy zl!&G$!v9+8xL9WveLjz89)TtwgU&C=ItARr3m`9JByx^qrBg?KfX7P84=MFw?{YY@ z52LJuyamEkRw1BiJ@zSwVMU?rPfmgQm$sIQ!-_gcW&A>40~q3zPu2%r8)`~%ejZa*O|P?OTivR?JX zhWw~kjD9ZK?^Hw3wT^tMx?^WP@{9TmD_BnbZfHA)cm-n}eg3T8LYwc~JK%FD>Cs2_(cH?vl_WoOw++Q}h^1ZDVc@YJ-)reX z*hyvStX;OzH+G<|L2E|p>7kx+F+z96zRr-RQITF1%2?eKeMQP_-2(oMOxDg1NyAU> zO#3nL(S&}27vOU+7BofogWm{wLl-etg}92F?^tQ;!IVdnH&T=L{N#J^Jp?){;U@zA zUMD`0k7W1g*7UOi_}q`$6`5CZ!I{gpoN3PUO7e|vh*h?b=d=%0@y1Mb{FW@?ag|_!(+Nf$P2C^O~(@Mk2Q$ z{Ncwv{jO=cG13BQe-PipDWA$)e(Vo6rk8#)h;rv|_cJ5-=|g;@C33lFM}n(v-*77P zN_F%<)${>xZrTO#=ltIJCY$jwf;e+~=pVI9O@DeFEw7mrQ0puF=Z08|KlnU^cB9DO z*k!x3ihdP2^aee*m|-6NR-50!?F6+F|Gz)}^G?d0zx^KNU5HSwHBpS}li>3R_>Ptf z<_A{utL0mhiIuqh!S=`}9@##|XDoB%D2D>foeMntkXe8{g84ogsk)fdBj`UFS$3vB zS-dk=+)i37vvC9#mP>gi<0+2SS%GH+xvac&>U8#$7Qnq*=hTO8EbqJ7HVaA(kWoe6 zZzIE?lf=_z)Gy?FPwHmzy+6`VX4c5(X$vrB5C|P5)4>kM-xTV`V-I5}b-R?a_|2`0 z=7PIfeVEH(rhevIRa^5R_Q1htm4L*;=A3kmy zs318K^|Wd&zk5_a4!?NZ&GeY^f>Y5 zMxay}xZWMP6@!ye@Rfw$+KSSi#4(<9@WQ9tFDjPZsy#SbMK%zV(Tw9y)Nu56*q&E? 
zu%Qm%uZxbPLFZ-q`7QqQ6p!5Zg3rmc`z{kBTk8p+ z87m<@iua3%_x8f#1V4BHV%=w_lZ9{ zKOvMj$56^ci92_vy*uFZSdU&F2e0!$pC|FVUHp4C`0VyIX3_t;jE^~{EAz!!yWEW9 z?Ra^`J_bf*(G;Kf)Z`ouR$q% zC)2xV`P`f#KPW=IYib~`)Akthada+FKbUd%y=g`-PJ_=~;NR^JK1csjz~^abvl1=8 z44pXIcpAQfh#)OQn;~fU5lXY@`%F;HMI;tO-w*7Z!gx#qW1nOkBq8}{=(jij=0PJK z3z|xKHY4O|-VS4AJwbF>meT%tEt*%-+!<(&$k@I15-j>LMR z$EN7y9Y)~=xP1gZW0)PBf8-+Ci)iN$^693)wcCS#l=d^AIvB3X)9QS<4WoP{_BIB; zxgV{EV}lFPdK_HVWZvJ1Z_tBrQHi+7BA&*h-u&+DT+=@8DVyH)hx@~xKGm*&Q zc$019Fccl$z6{>JEdK#V-MXS4dycQdhd=cCF(*7nyuoD`m9pQs0zUolM7lHAEF*ud z9kX~Td2=hFA3@xF0ld{>pT8|~sSZBBM*SfC;TM?+$RP(OOPIm(*mHF8D7Sa_0g@~Q zS6$?@nB0jdB=(Sq&SK=%4}UiTO3l?1)W&Cqk0M6OZ16LJnjKVs-fPHyWLtXSEzhgL zNcWx{sv4t@Zt|w8=~@4*BW~h_UZ$$9P-#v4x&&_S+xNlS1Sl@zy+jahovMYjtAOXj zkYY>X2+L_rgU2!SqPF}5{#r7wKcl=n^fswXtgtFL`xv&>oOtvg?6fs`er?frE^+DC zK#efsBXt-*)A8ebVJU4q^`oixH}gYZBrpfOc9s{XCqu0V<&T0IL-2>mMaORI>Bo>t zW%9=Rqs`^i>wq>c+7a5Vhij0RHx=tFmFoHttml-SphqLGo3^T6gq?(e+e46|NGIJH z?ZuNn;qaE&sUAu{ePn=kIjP0cRnI^#;W7|tS!dKlc4 zfZr>i6AA8)!{_8-c=Um5MC${grr^X9AIqqG0l^wa#jZa;#KKODcN zd(z87`48=9g10;M7`RObpWVFLR-VzFu=6p*oyViEOzPJ}J5xRTpil9hG0vVN!2QaM zq_yNp&u1hw5nuBJ-oOQWnEth8_4osPJjKkG0KPi>P{c1>{L>43bt{tyaIYeI3pdWM zAd+XMF{|X`54VNiT=Ftr;JsjQ`Eh!cNnB+p`VEzQvz+?mqnoEZa;w7o6|ut-v(O{A zK4v;nBIn9HkM+lpKQ|lsgn`dbI;~4xdCY^`vrQl5Q5AfigB5oqzitLkkJuH);d3f^cX?QwzZwZYy|9VsTTZv4MrO34yUB1QlBxt=s`%!4*HBd1bS-yre+|Judq?t~= z)o9J_1HOS=+zRa-ys!j*e+#(uhZTwaQhXk zJBH^j-cTEVb{6)0g3N`b^a=lj^+Ffgev{vVJ-u5*>&f7J26%gpmHh;G_91`zHCj)F z+emo0E+0{&J@vLi-$h8?06k{mC4Y{!ccJ_sHDi&?X0YHfJd9sxxfLy30R!AP*a3b{ zf%o%_g+%@@4-UI|`B%y((^@G0-!1lo!qk7@CkA|c3Q&cZ7w^b&NJPPZM0H0rn|2X2;FCqIAcD(9MEpiX>C-GkJo5x`V z=j=21yMgql19(^kBz#(p1Vyc#tH#oE_5r~4BIb~1`5uVHzXX0p(9`MQZYA`whW5O$ z|G8>DRO8qmblLHF#50_K5sTlwjM*iMIL3>NnhM}_N2sLmeIfJ%_W+90+sO|Zf`w)g zcUXjt*5^EqvyhMEs(PFi(|y}a_eE}Ha-a4av2qF4&tX$O_`6Nu8w3YGoqyI>e}T94 zIe8V6k+QY<`dzpUm6m!o_9f(13`R~x(pax%Y=nZ_v(Q=`csz@K#)8ide?p~~cDRsE z{b%T1Ii4?qPB{4dI&uv0)UVFW_ZaOu{9mdE@UukP;*a-4V~h1t=mdG<8D!mq+fRBT 
zlFOhUfaX_#=OOU>678nJ=Ni2dIz*PhUzcI_g?~<3PJBd;sxdQzN+B*Tg)^L)5{~S!u{LEUk zON`FE!)S z5cU&arFJg)drP1qVBeeh^eRUF z#!rvOr`u_UiaI<)Ot)I8Oy9$u>uFZ=k1q@OLcZa|C_e zfOqD@nD~JHCSno&z^{3{^Bwqqm7K^o;HDydn!x)r(b->&n?E^owUBXJf`9y;hkkob zl;mN-Pw*MZ8xx2`Bzsy}ht_Y}8S*W>#ejlidG8UBcsbhN&mPccp7O_-)rhvy+AQ9? zZC66qtw-E>nOBkLM&3wdXJ#$YH5adY3N5@uv~~ke-Ol;9%nQ-uS%rjSx$j_9!9O?7S1QFoo&&Xgh-uVUayp-?vkyAMI52L-Y`0W8|I`7vf z{=5c0*%PGqL!p!f4n=`b>FRy-0`l>^h&RzZ--^KJFDd853HaQYH_Fs3w3|&X$LI7Y zj`NGp@-u||+!v8jCUN99_y&%sXK05=I5{E(Skyr9%FXZ7p;Lhx18A={)-V=-J06KI zQVS?g;>^ZI^mCc~=O-LD?E5{-93M%(-CF2-b6(J`zhc4N6^y?y@cvEa;Y_}}%tSwr zAI<3%{N-0XssV^kUWgJyv2Dny_xY7O?+cD`tT#({3zDYP}=A% zd<&P>`fcbY6Zd!psUc_msfYh=;JeG<9th5+Qqos{?4dJO58-=PaC$D%^CAEG6Urlr z4?he2BI%|-WDOY$K5vA6Fu1*#rv>=o>!9|)z6_rZpI_1rKPnSHUJKWMg3li#%@A1+ zjs`KZKZYN-Z@x}{K==zWkvXoY`h+mHbZ=1;(p4X-4zv*S@y$bmn zUtzBiat}PN$;##`e7b$E2YlvYXI}W5wV~)`w!ja2+~#xI%K*=x#~yr%b9{kyVWH-0 zv{(dQuVB850G~gnl*~=@E#-;$&+kI50`ZM6;X9PL#t-P3oleSg(0qyc6zWmn@ejPs zdfa>nHJ1;)jh|7(nSY@?RoZ})#q?~Tar%ms@%YU_=w+0dORq!0*H5Sqx*6x+WyliB zA97-AuK5qovxz@1Ldq?{e{O1 z^eY>BTbYO4{@m-PHP%$D-ZD?~%}ahUZ$K}WIL2a6{|+!;IREA}^Nt%Y=ir0Wt$bg> zD2kF(o_Q*cvGpqY%pubG0XSNKZSCUySn9sTzg&3#5IX6+vxENUz|m@??lNrN;hT%G zevEeOfLJf1&uspEi=77(yLks(tjg~%W8b~t`6AEdalsg3=}WaR!g+ zIrP>5AN6UTE=E@8@ul}s{ygOki6p)bA5ZY_5_rCzT+l_7HwN=o(*McKZ9726pX_AD zVj{RVmHDDF?=HY24j`_%fY_>&&l9xPnE1mAP}k-Ay~&9)cOGdF-sB>@(jDNrJOBF` zp0U&S0J2Jd!)x0Ew&oHA+Kw+4Nxz>YCc1}H+I!eZYyh7JJ>`{|ePW0|)K>fCL+t$m z=T_Fz_Z%X8U1;M+RyJ?a?=x$!iXsX-(0>#J@o-uOXp7c<9Y z>%MzYbBU)PF5dAi)^*9Qpxnj(KT+TD)Yntq70Itv+i5otT;0K^7yE}_!lwbJcTzt9 z-2M*TmLng{`9n2iKkc}EpY_ZW4Y0Z^;Cy-P>~HlG^~1>1SdFx9b8clVPit{LW+9(V z$*+3}4U>1MXHu&V`-v~(6)xj^-{;`k6}BXP*0}bkCcaj&!16VRo;Z}I96b9>!1IN&+DL*15R&7PUQJh?&7rTm@iV0 z%U=B}xO#~5*;^=0I_L^gd>{pX>p>p8jW&-!YRhJVegXek0cDmzXbHeDVf- zWm0mEbwzC~x#_;hvSe#lG%h5 z%kZnerPpW4%iBPCCg-%bF$$`I+Z*7H++*rRQ_|1;g&adT|NRd3QbD?yujnWFiQsYw zeANe+o6zDI=(~Ap960CVM*YBdPNB*q%AFh@H=oguRPcBmm=^@@x^WjHi_JLX%Ulos 
zy4;AjSr-Js!8|jYsQOCq`bDg|Ir()?pV8#w4Q575|olM8{dL|4D}C zCTP1|#dXk`3*GC`e-+&Robq}^G2W-w&q4PLt*zqh%XVm=Wl}3+v(j;+zoOB zEhVzkH3{7Ofmz~Xz8?ag|A7A#+P`eK$fN2j_;qK~zmPrjGmf$OrhNb!{6W1+?8|*m zbrUUhRlce`NSvQptO+uIw$`WaQJ=tQ;!{nmua^Q|M_>y6WGW5m7jjh0IwpuGFDdm z(M34%iM>1zBo4Bl^;Rr?^qM-_&-$A+aUz)1FoJ_&zL(~r> ze)}iX0-%3|n9&XUggU?|Am5gHp;%0Jbpo8fVi)k;;q!cY8cz8XKI<}ny-c|`_c-7xdgd!e`76jj)Ex z>I_53pz>WC zqGvEu4kPbw9oCBfLENYvUO}i{LOXeqp|`+&oD_h=@leXcAI+w{0{s(Ql4-6_&`Kcj zldtG^u+-7pd741n<9DQfpZt&y;ETvP_#TJFcG1sZy?wyr@2D5S=MQ{V;&TP_S!41u zzC^y>@`!f&AfHoT@{FT?dOvo2&(70-QXUIFAEvw*Klw+vDg~EMAfE{GG!9W-Nj}iK z8I|Pe>T~cJO1^bpo({n;-iI}Xk(d1s?Szs4ag|KkgXGQK0c9@QUuegVcDHFS>~%kJ ziwoGpHG2ep0vO$!bfKH4UGgX0%!IoDXCF&aeIuDYjuh za4YiTE(>tR$tMhat%bdc*}?O>Hr;$r?PT@~H#6>Qlkf2())+)Q;{g5Ogf+PC;uS^s z$th4MHU}wB`J5o{&y3Qqz8EI>|_+U7|Lgf`5mn@KFx2)AsF0t{#O+K^iQ;= zh&%sAe}mZ<{K@m)RNARbPiL4t)DM$I<|7Y3E5J*a*}lf?_29q~zPt0dpPIL^*B0`j z`2#D6kQMmbO=XLGW|qNkU0Ls;8wpOYg1gCbgt&>Kf8W8ki+da~f1{-wIY2#MJn~cK z9P~M5Xa3}Ab@2HZdWr;}ukjhr2)YPYDV#++%y?-;&o99(@gup6{F3k>E;y7x1E-Ns zCQmL`c^b_KIR?E_G`kh6eVMWh)rI`5c5)PX)yLELh0!z{^h;M+@)-E+=#=6m|48eL zp!ASd7lY5o(0>Whiw~gpF?f6uzP_=~z@Ou13}a>{iywZr$XvoNkcqyO(%ND7dtP`N!GA6Y%sYD7ue%%;g9EB`2sK$I}Xo%)VtU%JFvu^tMEo3IOXCM+3FX5va(XE z=zl9d|A7;BR`^VXe-Cw! 
zS`N4G^HwDHh1`M41McOxi+u)hr-fFbP->`dAt&NU)OUGJb5%JNLQ6f>1*|p+T)sm& zXL%_P<8vl>T$hrUdGB9*_T_~6Jv&F8VN`k%N4`P1ANkG?urbX!zf<&Ev0r$}gC`EB zUB>-Z=(v66ufg*WH1P%VMKW}+fy>UHIS1e67#BO#k9aHT?B}_8kre>+T|WO|{BF0N ztU`RY5Nj!~&pMv za}I9{R_iYndEXb!G}Omwo!o!@8`9zKCveZLQ`&)#Zd|m3AD4I19=vvbQcwLU_E?|z z^dY#2#xMS!@?7E^>#>7w#4G-$9YdVs5Y&?Kqu)giEy*YO9qaKXANmShMick&R?fd# z3crE;zZe{M=Py=)r?IsA7P!f&dGPr%+$0d!c#_X1oDbZKmWe`A?)Jg|qZ8=warOuA z(+($v!2by7ey6)4t#`?fyUn=K;IzYuRPs1nHvCrn8APe)~c3q6tzyNQMi17|A5)5u7)ROceb#Jnv$E@D|+U5Ct}%{A{uT z&NCbB^TM8NV#G}h2k09{00=ODY9OD|&N@rit z&r{x0CG#HfKIpcDuO2FZ?;+&dC8+=?wN(!pr`!ple6f=V&WNWe=lAvmN1MR!VV7cgj_9f)2%Oh z>6^4ihCR6NPHc67u8UXn2T$F(lor&lg#C>ISDl><0FQH#@F403Fe8isKb@aEUyr7C zf0?20(at$L6FTmE+)`bKahJ-umRRKWyM12&i(~JA-?30$#k`RTg*Ey*T1+Iaalli) zNgwpg=N~X{WHIW#(o6W>Uw-t=OV)l%93{^H4aWUoTS9$U}PC~yN=URNglSIy^JVXl>$fLW5w7sN>u^u}dX)5p@kqqOn zTn0uv${qbtwBt@jS2Y^kC?Kv8j6DQ^&;Ha82cH8}Th;~RO$fNhc}eKF`|X}GO4Vds zPo>=9`66((GBfz=%ma71!DhV)<^5<`2|ja53w%C`_m;~2N1@nHJb3&u*4dl$C1=4( z&S}z~i-Ubl93=~jI|BdLxp!qV`0L_A`2O@0#zY1=P?_lY7s}}E&}A3jAU+M#pI2$hHs2r zrQn&l^Gc!kxqa~w!@&B4tw`4HUBC-Z9;IGhU~jp^wo995vdKXa#L zqH-rg7K@9IAD1~qdh;mf{E3P}pDsIO9H%#ygOqKN^crGEF3#+ByO&ExwH3r<^1M9)Vus zsF|QDF>gI&{wu~F?^EvLC-;#{T`blIoQy@P0V)D}jwMecUU_a5P}iZ8%O=dLz`Z(@H&bDh2Z6^Av4U9Q)IPMsT?WeWAI7=t`&e%x z@eN<B=>#X<{NoQDi%E^k@x28IS70{KrfmS z8H!N#v5x;!z8`CBM!rW7c=8+g{BQktIXmsnu`Oplse$EkFNcTDdX=a$837xq@9wAk zR{1l(gc9M`swWopm&#E0xC3S{{c!8Pck#cU;eT)KOFzSiZXBz-oz6JsMk#PF z8@iQsMao0S*9b$O?tTlyxDH}pF&h3k2LVnyJSzbAUEWS#T@HPPgU=6Wj|?N~a~m7? zf%##0!1c5xoP%+(8NrJFpVon1c!pQDgNF!^s=_}Q3qE5bPOB@Pk} zoe<(0{>ZZ_=USq%^E;eJybrF0a^I-4yEOdi+v4ug+@LK#gNZX=fgeBeJ5DghGKqhb zhYt77G9NT$bRK3taHoXN=tRoH$n)@Le75C0;YF+`P%glyTX!APfn+o`0LL#fhPmi-k@(GL7#9k^_$c^XpE!j-<06G~ z**B0&1h{*La^gYYyUUCDlz2ok?fnV;tK19mmGJ{tL)a(0NbhUOAIu9Hy=*q#(2wH$ z_H}y4*?n`LRynyue;uA*HBD%DsaTVNyoCFblCkqp)=;irkFZl6iGJOk3+dRIy8+b? 
z`M(4*XJE&FbMsgxbmFj#tN4YV(}N)VXLm<$p2yC8h%?N?SLEI!JWF@xya2h!(dK4( z8qCWAcUxl@4Y@_eohvL6cgil9HZ5Mzb^PaQ$d7x;h|9LchvnuEyqpWkhrImCfyuxA>H#GaG)u**A)U_UZq_j6}jRpcBG z)|N5aQ^^WAPW!|`!GE`2n+HcTsPhGURzU;X*pYMl0;_3ZBr%Yqc*-vC6feRfhYjrN z4W`e+I^3Oe`;+;^;rVvF@WIH<7o2f7Zk=EriNwNAK({mN1y(KeCj`zi89zx#EDAZ- zrTh-E3upGJNV}25H}ZJ?kb5@*R1x@+qN;fCszjwS9>eKX4b=j9R#7pu%iVnV!%okc zszjAw4^31!bX;Du^PgS5M>TMmGk_|^LuWX6=Z~E>=F{bo4pZ66?eC9O>Cg+|p1VBw z;yx4japQfFid62z;-i$u;lHj_HL*|fzQ9ki_{hI2rJv=&=MvAjSO(6M?Ia(nGWgg6 zKL1<3lKxa-T(1S^gYaiIt0E5`uBY7`?DiYv=JLe8Q-vP-U#e)v&p^4vd}X=kYM-hM zKe71FK5D+KkP9kDxf>(Sf$!X=2aboRNkpH*bSVBt2=V1~6@&fk1HWHkXCNBf%fmYT z!C$u?xW_!yQL5pG-$UC4#GjoW}^_kDQ%DFwaCn z@3O9mJ%!0`=+$!a^T@ozQvJv+W)z-LE45_}F-20LzRA{qbg9C$wck0kC9 zjJ)HCGy5a2EWYQ_|9I}vxXQSSC%$}xR&&AOOz1_)bNJ0J?mvV2#o+ED+T(U7_WQix zH=MjWZ|dccr*RkR6~XIZ#&eYH;M3hC|Dnf!PREZ<0RQ6fd&AUCZp8f>ejI*op&f2G z29MoM5BJHt%lC}W6UN0Yn}XXBssZEah$*6-1n{^o`kli)qZL&m^1WuNFpor_>C5mx znUjVo=*6vU>Ont}5waC~{e$ndq5rcxneAz}9QfNEADY}e#_x5`n&GLzpW8oN4kicS z2@YjlRv-=Or(3CR1QT7%Sn#{aoMxstNKA!|QGY zKF2dF6nW}5QO$XdM?$^k*k?oV*u|^6sg~#s|AhLj7%!bwb?QaQP}Kr^@kOik@i#cf z2JX7|C#v??i$WWX!L3a2xss>;B5*K?)lHddhaRJeQ|D4I2ERI;{4&P?!&7jM0 z@VP2-cKLPx&c9D;m+vrw{lZf4?T}r7AM1sk&L+Mcj9e6FjrUhyUFTOEzTBG;&U@Z+S68E6v;>(N*uf$0K?+eW zsu_&G>ipzf(}2$(ZIp>ZKIMpS6!1NTct#d@6wA4kROn}c)Ac+!PzpY|a~ds88sjKY zn(@BF;U4&x-puQ*!Mg(5FEdVG5i-J*fZNCHV3Q4w1#`Z;fd@C{!p}q2UE}amOX$xe z<8UU9yk~cF0{1XNuLQdDp%=wCTaMrD^81%S$K5pbDf~HHf7jH6o8L3|8sC%0(TMLtKD66c zs3teeAf)&>_awDO@AtT$KNGnUmt()rOG+pyLJqln@8Qv_#_tUW<04r(d$B$hJTd3D zJm8GLBzYB{+Nw_MyQPsq>-<0q!Y(D+m@Spn5bimsDjhv?a5wEQX0#IHSMKJTCeqa- z|3YFFqh$y>Nl=4my}h&pKi%2)t>9Z%WFM~DgHL_I$xOx}w~2!SMG9X=l@fm&0fM>v zmqxNv=3?1{@lzXckNRlhDn-a)HvBioYh4DNBJ}wT9%)4oayfa>^@&+8C*IKk`MtrM zk_!D5%rD)+=5_e9?xwY`KsL9p`Zje3lcTX4{Lf_;+DQhn%V%2;z7Iz}e?vEi@pXWc z)y@y!2{QL!#c&CaH-iy)lKIQU4cS#xb%=lZ;RnVeRqkv=-rQ8ni9r|FiN~M3P9A0? 
zIP2!Abo}Eua41r>f)4kasJb3J%mu%lo-)8`hyS_YvBTjeY6SMnO+A!XL5>YnC+bJj zi=iHztF0P1+3~$EdJL73Y834-lY__Auz-oGBjeG;m6_gPpW|TxRDtDK2D1H-j46la+>~y;)i^XJnA8@Kfraj+i}@b?$7t; zssu~D$onoI{6=R+ArH=E2Wt`Wo8m554*x{G88wZrwdyyEs=j_6uvMs*LOZ zDW9RekV|`Ty%uub_TO?p_WPQGE6MUKpGC<3Bi#o&0rHD(smjsbD&3gzfxYV99y$L` zxyyTgM|Yw;fWHS*>anZC9=`lJ&(fRn`{dWv^2q<3?g(Cy=>+|1;Kg;_8T$z#?%aX# z%HNLw=hNW(A>|GygTbe0_znU8va!Q3X7ti6|S(^gRxqJ1aOef0YIcHLd@uArV zETO+i;B|fU%MHx<(LwN6*A&w(C)U9&x9{1D__a6lahYkaTpoTa>N`Je6!>Vl=dqt@ ziTz))-DoFCs^Tv<0>8ZRs~ez4cMpfN+Zgic8X?aV{!U6`>T?r=ads2MK42r-Pvv`i z_>YrywD01&YfMx4567SGhW%CmpBuwp9C>u@s27Ic+zFhzY}cB$@EHR>m!S`D;vBu0 ze|Fgo&~fKMjv5y&|DO9qr+^##>>=hUmp6OIv;uX;6OSHCe^1#P%p3eQB+4c;3%v@y zPNsf2Mob~|QzRb4479KwiyMPp_Yr??hdsw3`L1}N^XWqe)t6mGY=(A=DDTLs{%OM-9uG|{wLseI<0<4 zoV^36y`3Gi7~&_}K)z(DPJ}0v{=7!?xh47&;v2ouON#6z0^D3C;%%3K3tsrm-T5RB z27kF4ew`n<5`czog6D20ojc}f-~Anr+Uh*#FXF&QhfArdGwq~;&z*T*6+9k{Jw$-p zov=GjWP+z|eV3z#fD2))de{-g|~aC#Y)B zt0evKM_pWPoSMXU&hD}novMqqzk z@SppGkA7IjSjwZp=b`Y|9XuWeo}IPt@xJ4KBk}Mk z{G>1*e){kHauB|g;A^YuhfRcm#~nO4d{XsNZ8~w7AjUVgI&*kU<{GI^L4K`BluG(bq+Tm$?{WO1lXN}Fk z|Ln#H$=BUIdh4P4gMX)NE&R)T`rktL0I$N??`uc-E)-;Vn^M$w-H_65gM?uQ?p4{o#tho>XQKf&RD%ctvN>bD|~ zVS*JHK4Joh8`M{%aT?3qEM;w2w zov8(Gmf{y@VOI$<*bGs1!0n;nwu=id#6RZL5$6cILMKw5h2In@;#^8y`kO|aqZ@k3 zN)h?B=^iB^nA`fWZgk@6B#TA<(~ncpF%F~ zUVXPtbAQR}1xCNi{eA)r-OUXsc7t@b!(2E_#0=r`LZYSu8Cj$0+c>d>;+@b zXV~Fv2ClZD=kMdSuR>;>RB!g<3P6q;_>l$dsPzLCBEZ8D&}qVo+~F{HlHuiyMYfH> z=aZhe2e;yalwZmKP@o99W%$oy;nUq^ItUN89r5as`0KT~7knn|yL(O+5O=5v8k2R5 z{T#DTdT_oyxVn?k)}0-sKB_wUS`EIsa~_`)%ON|JT#bJCkKTCM?a@a8QH)^px{w)f z7;@K0q#gCEgV!nG`$Brr6FlHvV*Khfy#EulQ;7d_2VCZqD!pk1zBVGeqrP(Y8TqOn z=rIGlZiqd$qh5XHw3<+BL;R`=s6PZcH6%%m$ByzPOig6oOk`G?hdqRY)8pwMH=5x8 z=Ah|{>M75-XhodE3w)}<^IWu98#?Y@{Xwdhy3764b*b;p!8HeeiKZg?cGTy_b^KR% zDzvYffKGG3-5!i5cYkR=^v+JO;%|KLd@gwJvcVryqp=_E1y-}5pT>Pi3!vABxb!pN zGo50|I$8*h^tt_RXCnQnRtIL1j4{TJnRi5m}vZ~nF37ceun;4khOXYc#%vT`7!E8 z;U7DEjwYXO8a_}U_lOQdzTB$@p1b_v?CC~SnXV0G5+1by|&d$!h0H2r9tH=?4E4wt-`iH=kFXegOEe*Rj 
zwiZ+no>XgJ-ETn6wM?5ggDA_R7aw=cV>&Y*DW<(<3{{j1gRg)w6GfptRez!lkk9Kz#%N+3xd$E*{pjQ=H>tX*5 zWaW3z&-=jVlZ_kkyDZ4=aP}Q`axalwABj+?M~W-W4;|>)k4Y`h&Ewy=6yGWQo~gbO zywPzB;EqW}xE1TcpLv2~KB}L_xsn6;H%|Avh#z>_orF82BYmI3emeMq>M!Y; zF2RRPzsmQg|J~tY;Nes_TqTir#`jo{k4;Y|+g7;kmqwh^5coa@Zcm1Dk)``E{)IwZ z+MM)6{Z$`M&!;B?-EUi6`62N6q;x!a9RTlx?jBctY~D*x(8rBPZwWk(eEhLLxv$Tw ze12P;ogPRIgV@KPYvn@m&4-Xf~23qPUDD?ARva?v{|N-0|63y)lj7 z`EdHskstaZxZpN5aS7FvJ=oF57(cCjzY;FgO4^E}s0q(U!@Uadx@~p={A$2&F_avB za!zou{%y){aTfh|f0B5JI3F-4I~%^b`!hQSU1mCVxz&wx*(aruxo=i>x&EllK0Xs) zqr^p@rCsg$H?CLSy@xFXH{UO$pAFg1C#rWE{0=^9C8uVW(r5V^(M1LG zKc9iGAKgEDHo3VM4?foQ+y!v9v-jR)9*K3$v*wEm#`CrCx_8K*z1)Eu-zn#r8vGqg z(xFK&_VY31+*Q1D_&qwwH@Xa;wZ&(gQNH&_a*1;_ucJpM)9QlQuYxc*IpWJqKT3!RJM3Pvay!uqWH)@17iv@9){q z^YJs?NY2uu)Y;UCo&e8BC41@TIZ0oB1GnMo>t*S#)^XcIhLD@H;r6rlC9D0|8{Zdu z@O8(!bqdRJHdLCzuhO4>YS`Vj|THT}uAeMt2-`s3sVi!}Uo zjnjzRmIExkoE)71gp09%D!PC^Oo!W-=)dN?^k>4u-pR*G_N?err38+kV_rT<;!$ylsiRrK~H;_2%1PsbcR0zNO) zFY=?OcbJ3MimzLuU*z_;A6dEfp4*cW4^iD^O0})#q4x(A*uicazov$@I_krhk826Fq`3!p7F4;Pd^BJ=} zcLlwwCa=Z{?QH~~yU>Rn+0U22&yDc;Cj8YFuMv7ROI%*)+n2=-`8CQngwO5Baa1x- z`Em4T9=kjCv(DmYsYrhh&CfPY3-Ql!)5Lh^xhJbn*3JAX@d+6}^un^=@8IvKpnd1# ze``2@oq8IR)BSn)tT-N)V!|M?qv^|TTD`ih9{STC{=SaeOPyP} ziodIdxV+7J^`~OPk~{wPC?oqu+$~t(hn@UL zIbMEa(blj6_<+S2=jMJ8ksJFdmD6_Qx&v={ly|-dKS6KAzO4c`HYq4xIcty+%7~Dt~_(8R=JPN6765^Fu9if5rDP^z#Sx#d(C9 z_SGs|e|)I?F2-XDCnK(`Zs~8fQ=HyNUz|M39w_PEv%ci72eZ4`ft!j}*?na_L$dCE z7fGJokN&&+Ua!YTL+7}Asehz+ySvJM8J(S9&R@G^J=08ojmhrRk46rBihP&z)n5sH zuJ2ay?pu9AJq_UR1k@bLR!mG!GB( z+!gd%oJ{sui9dImH)rE;kvdC&uZc*&*cA(z(K0(&6NAPxg@aHj?*f0es_E zH7`V-$=Bh2#8<51->FLe&zcV+F7AKklbC;g%#Kul2e^I%IagsX-%_@>BD*sDPOG$c z61`mu@7pE)l4k78D*TZf@XxDP)W|P4pE#kGw@edyULn7WerD|DTg>n73C-J(_x6$h z4!yQ}tTyZCmbr+1c3-FbMz~sseBgSqb@S^l?ty)()Q6S)COP}!9+`(n zlhz9?z{q#^IXk|(as4=)jT>-2f)wF@`;C7m=C$|W`3_0W-|?Dy-NkD^EBu_Fkjq^4 z6n4M@XBwnOqIVUwcz20owbGZ->+$d}pl^edD(M6C(8?-(zQom0{1~znnGGK)@oGRC z?{yc0_>3CtjgIM)#;H86>1(CFZ4Vcrmfg;$$g>hp(5v`qXxBIP!Db~R(udN8=zFOr 
z^z)c>TY52>{rq5QKOCx_@avv#zp$qI4{}554DDK~o_Oyy_T#FXRgc1FWX8NCeWNUY zz5T)(+PfScP9mS1($~xK&+*?>Nro$T4}VuBz4B67{uTR{vWl3c?;z(!{Lyb3hqaQg z(-YJa=T7cmw{-OVeb3imKQG4b_~cW3jxpYUO5eq|j7-+?RgKe6wRcx`T9&<9j+eUG z>F906QHMNv-m{m>@;yttaj5)w_pvW{XtVdx#qaa8R!e#%4YS+vU)R3hnw01UohIUhI!P*5|S;~ zzqn`Rl@fm*GjBwG>MP6xTbZx#h1;?2dyoAb>%JS!J3AWZFJ@00N8R9Y(0hr?y8|9( z&JjLm{%y~{e4_StvEO(<`Dgsk|78z|?9aE>?y3ABkJ5+r#UDN+Z)aWmbv2FR4cWu` z1+wKY;>QVwd>EYDk=^|$xjE5X(mTqRvAyyl?b*TGChqpjUoY*Y5$dm@{=V>UQr7f6`UHcY@pZ!}Z9I{&qgyd*s2*UlTvPvUtJ#5O&h4e5UtKch2xmc8Hy3&&7J@ zk9>m9+3t^h9KEsq!UyPiEu1~*bEr3d$gcgnD2k8O6Pe0x)Bl6)2!EKyO=#6boW}mF z6R2qm@{~ayUTzfs{I>q9lXN$7dn89|!7KDz8%Kq3}QU>Gy!sL*RxTQ1fxZgZz?yZ^v$a zD($YnjzG7&?=It4yf5ycoeKfRY79{8}5pQ55xU89|0m%b!|ApDc7%|nrg^H%t~3w!TV?ThyQ2(juK z2hZVSjySKikS63-*}6UaT5oIT&St$$$vfoRn0*}UvsU6fo;OZY-#=1MU3k36`?q&m z{Z+WPspx_p?;R~(BF-;Vf}4xk;r-M1l#BD-^ZhP|a{7Y)tqPw%Dfyqno~@$S`lTHUR+8(ldV=h(ymX7qu^xFPxoG@eTy4D;I5~`e!TkN4(qCd${&9wCqB@cC==*J}4`zh2UNW#@T+@8oyDD!e!tacT{agNa z_IRxO4$Hn^C;VF62A^Yof1G_B_ZFU`o@v&(4|#v^^RD?L@*|$F{CxWSTJ{2aqiyoO zc_`w%9?n*gkNeHDnBN=0IOvzJF7lxe~Ulo;`1}<#Xz|Iq;W0ICVxjgec|av`nQ*S9?$BRy8O|fmHZ#Y zuMzz?9gfdc&#`&T?=_uAIe{ET@o(G#dvkdG8G7LJ2jtVAhX0gyRX&l)JnZVaB@VST;wol? 
z?E7?pKhZ8H&QrHT`~8=+uXSs8coz0dSN@N-P`8DV(@FX6|CZm<4p3wta-xX+Z{NfS zsROSj!o`koU@*%i?*EcGh&*FQ^(A)0&OA3&vbeAAaQp<`&SqIn@t6EUPrfWRqKCavJ$>7I>he?n!oK~ks0nXk z9oLqhwVn8)JpBSaP`$P6 z|GkZWC;rlE*>t!x8%_p&0e&JMi)h|#f?l7e{^-ZebUPerkq!{|vNZjkURB|zI8Q(9 z!<)2*UTu{3=!X)ou1asEKU?ws|70BfP%KA}`fufLXlS)_Y`RIm{a)PWx$yTs&u*^C zzC9;hZ~j_aJf(chgD>*e)M5WTsGfds=C$zxr*NUspRwf^jiHa zBd>aU)BikMLocc%MY@6?Xog<=hM#N``i~{O61=Hv{BBCu(chnoCfVmD|GoGLY9um; zryI;`Yu$_WSvih^UNh+ihySLxpE^ev@~O@r{Q!9nVt@Z`9981)2)(KSPao931Dtc& zz&=d)qhs7v=YPImJ%i=z_>Ep|Dh|q4m;4``-KCzv{2)K+-=B+P@e^`BJ^PRxSTtFd zZ3zdKDIfF2^6XjiWC3ZgL${;}IBU&jXPl%I;n z@EPlici?N3za*=aZKwSEWxE2mLmxZB^?%at_SsjJ{MSgnHqX>%pE&=mUp^|ngWoaF z*3KXE*#yv451+sQ%ZQt#V~JnQG5;M00AKZkzq$R7VD-IxEcRsMZh z&oKT4S^44n59W*N@H*sHgI)cI`uf4~Kg^FG6Zvau z(zD8UmB(X4dZ6)sbp922R^^X~b!S)SzJFyO{$3oHe_iUsxp~;1BiY|;jHC7T2fr%i zd|}D|aQCHrZC-J|Qr;*Fy}Dlgb|st}T%|v2+6Qcyg*>Pt`LCB3L@2zPD3B28& zMpVz*$&D;imGZH2`5InNLvN6EfD_Zu!~ZtHj(cZT@>V>9b+vz%-@|^GEYfxXwA|lH zHS(hzB(qrL&DbVcp3XzR&<^|R;s-d~EDIc#Pg=j;1wXrGPL=R)G*>?C=STI+Uh-#D z&Eov#9XyO7pBwoj#FDUwYh+#E%w~FZ@fz2dUyc-K$0I2()Bn*Qg3qm>{|z!ZME>(f z$szD$75t6(`nz~Mx2Koi!s)=z)yntaKUtY{$~wW}pUF4u-ftjGr{pkkAAi8z^~LYz zrNH6p>7Vqgo_OgxS=^FPJ^dTsP5GN|AkU-uncHOb+1p*yoP1b1?ClO#3w_fj+ViP< z2n)~GVL#tv{O%|YuTnN#`5EXj?@WbLp`SgKAI85l3tom^?GOJN!G{rPx2z{V=cXG= z{Xd-lBJv&Wl(y9Gf574T^s_R2e%SoAkLMbYr~BH_L;ufAA6I_&1*qx#$hEuI4}I1+}|s@;y7du~}Q*jV$#&&te=^)=RIVLt6Fp)ZRR6h)c_cR4g6Gr&!3>nTadNJ|BvqXu2|9!g2VOEcffZy z@;_ER!T(9{tA5gjAEcUkzJ||LvWUYt9}YL+7dj%Vla1l`yGs2b|7+RDqAHy)te1_& z&)r!~@~O`MyvF!#@7zlL%=v)g3GWSkcq{u$zjt!3rAgMG-oER-C2qjqI6wCqdQIiO z%l<6&VO3V>m*)H))#&GU{LC9l{iy?A!>{sm)-u}#dZh3#)EH-`hC;d?4MS7UF+ zzCdH^^)~b~%Y*;bMfbLBr5zwviD z-)*V>mT;qEUWNSMD(TU#!{DrFcKe1Ml&{QAu35_Y@cb9_F7_2WWP|bZfAn+M(Y2GI z{Mjvy-__`~%JR$6WsUWG5BA=>?zj3!|25>7exH2CiI47>wPwGqfWu*@-H@*|?#99A zP3rrNAEOO<)=XYAZ!|-{Ew9hM-B7%eH_)!S_6%EaF=jF6J>r+l$<(yNK*(-3nS@L??1#0$z zKO3C-?k;mo@Y9?hp%XkD=0>)^#f=6%q`OT-^`qEl65t&CJWY*Wp%o`v6L6?#wfFSWm>d0dth 
z?XAaO{c0Mxy^#g?IWN*W?TGlFI%!SMbrsjqi5x^!!Nst*+oW%ld^SqkYge13OWHaM z`@Bv1dWqXk(`MhvKRFD32Yn0v0Hr9b9<--^qAQoCXvf6@FCc|u=Ldys$1 z3)Z#d^V{@y{LJS4@24L-CZEImz@_im!A;QrN;_m97w(tOMi^gJ+0Vb>XQt2Io-g3@ zh_ZYg_GG){Kz@!!+7suxd--(pIlQXJ57RW;k{n)v+d;3c_iE_B9?q3?$O`$#`eohA z@jC@hhyC2s_cruz^1t*s=&@h6mG6;vbgp^=Z|7yX@`J_Swa)B{ITzTq#M^_iG;64y z1zCUc+)x~y^(g7bDc=bGAC+~>+S1#Tv)(2BB6!&fpGRjS%{#voS7)uX&%Pms01sTbBLxhTmm5HEBm^M z=X&sKv@PZT4LiQE^6zCm;ok4X-|7kZuglhZZ*TbCA=|>bDS@LwZ(tqY0nU7*{;u%0 z0{b)M*@zw8-ukXyKEiue747oYF&M zkjrBnIBA96o}Q2OzPiTy^pai;J~trG?#ZOQT^8{u$L0e{eK?Gr+W>m)Q{rlyl3u~Q zc@8|SlMLmL?#Ukb13nKb%b$;47e1epkHg=$#pUoj^!z&XCQ$39d@y@pO>u3>e|>g! z(6^9pw0E|#_@r>ZpmE(4J}3Czg1@A%d9ay$qBZD8WB!YRK2P9h?nv+dU_aMHZz8Yg zm)^UpxV#qn(Yc?ziQnte;J`oU!re&U);v)m8A_iT`~OdAcXryZ#o#RBr5eDoz>Ov> z4bihk+}P}&;tx1Hv(yuJ|Di8Rwt+kUEmoEE7>{Gvi*b)hlVlv6{@OW|cKC@*dTaT) zE`bouGreO*98=P#u&d`m(LH(c+R=x7?92q-ZjBx|e5jF88$KUyCnolnxA~WT99(V& zS5GO+H;07ZrX!Ne#dFjnr#L4#s^tG}JNRLLo@eAm9^HGbSYv&04|+T8dOR7K{ZahN zPM$(Owf(-1J{%?sXy@#W;xqn@aV7qQ{oFG-j{jpgIW*>f9#hg+|4U!PV%k`IEqnGB z=;5#4Tt0)hD~oRm`8TvH^3+u@&gXf4NH)4zJzt5cw<}%JUx2&yjEmRP3FKB2 zK99g>HTH80`Zvq$Inns(oqVO981LVw&5iTfdS_VnqxP;V%U6fXEzoDMqlaX_xwo-$ zHleJiE<9<=4^|sqZzxDU+rF%)9^8xD%|^npSL6w4kDx5ZD~m%Av7c2xD*mHY=g zUl`BVhR0J&`f=!u(ND-i&h_~_CKz8e_#>LboiT8DB!2pc-xyxfFUuMmS7V${8KwWM zx4kFUFAu=SxbNsc$`3aB9?6E5{rg%rSicN%US*qXXp-9>9A1|Hz&y|_>FfMTeD>jo z2)R`kU)Lh*?E7cRH6@R~y(jqjHLII-H=q8PZ3R#MDYATIDd&dl%DDf%YCg%lmWbCF zUAC)HN#B`YV%zMm;urWlpwz2o?B$MdyOZzh=)+Jr8Tc@oUD*MBTt3!~0&W1#xAb0l zxl8(v@V0;U9(!P#_e6gk=6eJ39}WNe>6dx=FmkGgJ`BB=d>q5!e{b>56V+2w+*~L9 zGE`jNAo3d`-fnZd&WD?I_-`-G$K$_Nas~cF4%eXvKHunliFr40I^=wP-Wh$IIF8}y z8`;mHAA!$Z$Z-qztqir^?C&0xe&zUmUHN9}dCm8@SN(mTabEQ&_#Js=Up2opVu$>i z55#{1l8yOnf5;i15zjG2Jxx3zBZ2kxnqn-R?2MC1-WwUl8<`i6NGc^0w6}_px3&7? 
zCrNBfep`_U6O zbHLA-kJ_*+JCJ`fIGb3fw)eT&e%?%%)hK)4`lWxmb=falus0ji=i%rn{2ZT7E9E?e zecju}7xygD+{7whZ~PDZzHir?t9i_@d<{ZC={K)=-x*F0VOCL}ke zOUU&T`H`lozouQ_?Mi%pB|XIY>(}CqvV7q4_!6I=Nsl1+O~se$je7o%{%yM0{P;z& z4!sI}FSI}Kc};paK7Qi03p$S2hk?)4;7Q1*Q5O8zXJJnUy>B);`w%{~Q$GBwy|Z0> zc4Mu~%zly=Wgz^FdM0PHv}cgjtkC zHVMD=#czyPeu#a*N$|M8xVdf6tvum!H>=z?$*r4y`y<;4pS93MAevAA(4H8dX}%XZ z)L;kq)UR&L$!D7%Wsu6Fe`zlJIB>gTzC-q>bzeI;7wu}IeAvl-;p2Da_d)-bADws8 z-oWSSSp|0SR{7?3{U3clc=(oic3XHKIJ`KYfR9Q1%yac;Mfkm?@!Np^IrtkZ{|1j! z=ySeR`QCHAC+5w)^7-0ZKhAfT{kuGmaW+Dn^AVY>N5xg14?E#HzXv{Fn;%L4zbS4n z`K$?_gT4hn^!D1hj{Q8Xv;$v5Z>s#AKI2~1ck@}~*@*qT6**6Fu5b$b$GNtAJMU`( zp9dHhyW2k;VcfOikKPuak-u&h`dRGc?b&JZzun4qG-A(nhi<>Su_y8})J}Hw8FuwZ zIDC|r@1y=r;+%IW@vSM`?9BVn2M&J*$Kr;|==YKA+@A7#baZm67Qf;c^@qJWQT;tc zC6D20YiE5l9XY1RAlQaF84j|Ww^(@5_a-3BcKUC+b#HvdFNhsZqOeT zdAgzaiT%9mzqq}b{sw+l7H`7m9mt^}d>#+qj&%}fiGD5IJ~KzZ1P+fcaeIx(s6_t% zwd#p^=X3P9Ke1A}8@u;^;x|IBG#f59PPUgP={S1zMNvPUiynTAE$QbHr*f8(%U`m* z23|NJ^e=rxdNh0WL-zB|C4C%xbJji?1otZPgH1?xEyw9(<$D_!lhb3h^AG;#(9=5N zGlIUW)0lf^KbtQW{7XLqE;nUA?~bXclX{}|DS$o zdIp?-n|~wvw&u#5CpQW7{ zzw{p7=O$HlZQEpFQZZXX&(owqw(I|=w}a1tw;vUs7VWZ~%lG!qPWF6vTDs7BiiZ>~ z1`bTlj-?j^l|RY*JLuIN@WVfp9YeoG^=GHi%LcSD>Zu8jmw0Y)vR!tf{%Mu$>%Aej zMcGmGWk7PU-(~%#jhn~O$=Q+U1I1$;tp1MTnm6m)C0Vq080|b!zts~r7jmw}zp)2B z86{5RSoX|M_6PSU>wg5_b>Ytq-qXT5{W0$iynRnQ;Rb*t@~4ULl1o332tn~ zpS;*O$lQOmTdAKvqj%De&zg6J!jWIh8-aUk)zbqmeV!dn-#77Z>|TzGhH$7Iy!|sf zRljd28ls2&)+nD@;&ZF~Jmca6_gn2rE^f+FzD=@&eZ9rz<#(M=*`wrhG~DU{1%~8j zD)*!P!Oi-=nZ6Uh@uAt*#?j1w=?md(on$H;KL#HDT zJC*M}$Mbc_bN~EU^tgv`7xdcB6^0!!L>{}7@Kee8?#=pqeSWxl#<7Qwr#IF4n?wI= zJAZPB@>{^^Mfl~}#ec+^yq52)zh}Vbh2&9RT=Smf(9k`EyX(if?B;p&u#)pAdy!jH z_&k^X?=5aRaJC-%c|ZLd_`JmTrAG8I%CBMZ9jczje%YnOpRb+tmo2QQksN5ew)E!R z&7;TAs57-|jXX#Pmi^KMy$gEBWHEh<8!Z;2^NS`s>6f#4uusG9du~)dgdDQ&v)dnTd19llC6^+>CHkj$)WIhwR}fmk2HWh+hm|{jvB`Ju!a2K<~!x z{eZ>NPQ3bB^@JXNryU)<_ctroKa1C#PYF8DI{dB3p8Lm0>nX2PgLGg0;%4A9$}fTE z_j~^jMd$Qj`d~*s4Lvy8>B#Hh*n9B#Abi$M#-&G+?~e8Xuh&m*!1jFDOIs;#4iz7L 
zhjzas|8vw+n;m@+`Og)teYXBuRm}H2>OTUWHc58l_xO*`3eFiGS@N?Cy(#@!n6A+8 zMR8hsSjqnx=*{%Y1#o+vecOxKzfsRk_^4%EU#|XEwCrZ~Y}ipRr#G`lhVXk_2%mps zKOX^CBYxu=`ZAiI`4sQT;qzg}L2XeqSJKY_CM{fC%ICoBYW+5hm2(<0$6N@?l<{9f4juxj8$Ne4M|B&+Wx^+=?E0^;mX>@>{Tzmy-Xt_6LtF>kM+ z(XQW$Jm0LJjo?=c^8Y!z6dq)7d2_!vft#^ESTDcPbKlz^jQ846%}>SW6!!GF-v23l z4ms0={3z|&fq(i2_5E$%F!ahzBJi}8`lsfn8%JL|KN$7Y5x=p#tY;$p4?pU@=rN8C zS8x4fs&go(;lHl<<<0$aZhj?w8^RBAuJKyU{jc%fI^yB3(l0}Oo}=9|QT@yR&1Ly< z?om0L{MHsX!|^Epg7?-Wx9jp{#@Uy}6Y2@My>32eYFymr_cr?F4e|+GdfRg?q0-CR z9qXfy%}asLYx5I*-`DztZC5ylkBESKz0%_g+V1>wE8^-a9mTQl5>|OZr_h;?_;(Ik|G5`rnkN zF3L9)|9rlA_h6R(J>K_O@kGgg6Ml{h)W4VW-&bhQ^X~6HjvO2EXIyMtjFxTb2LAug ziZ7CoZ$o*Iu2g=m^WC?ZM_wv^RZonUO-^iyY86L!Ir%)#{~U5N)20s^*KO<@uF&ph z-C}bze(Lf!Ux<&zNyYRbdi$k3{U>_AdrH$Qv}-=U$hFFSQ;bYcQ%_y~=WFTxPVo6A z<$0;nGfVxP2)`O9bKv)F^yH&rFLq_vUHigcStINVUgx>ji$l_L&}%wpa*O(R zuK{OYFUx;q-iiBeHY#78T~j%G#^+de^=Z4G|H~vC;77I0`CGJ`eh;z;BEWtPUGz6qL=b9FU-h3FrN-!KR=`XhwRRrUeb5T z-mwnvWpx?tda0P^y-|L?@|ED~r0o7|O!8C_cpUW~lKrTBQ+D?4nLX~}2;~E>&dGkp zN0RK3U6svFo+wU-%Q0Rq&tA`RevQM>w=$1k&)yeS*W&Ux275cE&88;pxt$xHZ&d=6`~Wy+Zc zv+MotCL{Bb)AU?qQI8DV`Q7^0jmg=y{3I*c$)_tXP5?dPdw=)-?r^vVT#WLa^GESD zP(Hhx&9k4n7bWU)q6oX0*{!%mKRoao7K*Df<#eyMsk6jRj`@|l@G;d3;< z$Cc*g_r=$RKG$@w;3Ga~!|i*FyBCW6%kupC`2*^o!{2eAardtHy1<{>@b@Lyx4AffMfqXu>09X4D*lblHClzv=9LPg$MDMBL33faWd9l0pU1tvN4?kL@{}eB?&n9`_ zZoC}-wbmVn-Ut01^y#qwApNjjJ;TkzTc(?emFPPue|U^PZ5Ygc*$(|P^j+Pla=P_L z-SiJR@+z|b3gt7F_)G3indF>a+zmNVU`k!=dSu!K3 ziryXl1@v!y)=OWJBWf$VxEqVGQ*JKnmoYYdM~vYBcIGDaC;Y_gi)W12e(6X0%{@ls z+eSNgBc~Vj(}Z-59Il%R_p}yGm7DCAo*Rve-F=_PPPjDvrl?1sr{MQS{GXWq;1=z# zirdrgX^az#%AMf%Q}H#$_+5b>_VC2?N-;K*MJ>8R6~3*IJvHc!(?^vbB+j`JU)UMq z@z$04vnsjDT)DOORWsXOUL20U*4p*0oZj!VS0l<^E`VZ^ejI?0EA0jUV0=_I3R)z0 zy8XwUG1~PwO^%z2IzY2VWH(qnRP3(}w5UX&ykq5q1nA-p^t zJpXR`Z81o{%(r{6T7PyDWwO2bexlFF=ld%9OXwY}77l_Vmsw>GBDd?&&qp863g4G5 zpKTW=&Ry+bcVb`fneTHKcIL&rtlgB`Pc%@(e;;TTn3;SmF79)=Y^o)L>~_vH{?0YZ 
zd&A*}vQ^|?nO**iaTWG*HCFu#;+DS@wNk@s@%vI94;sxq`Bx$IN&ZzzAqcj(pOaAcQxxp+hLOI7&3nzy)vcC552>j9U(b) z-{kuS|Hd!qRq?aFh^%g-d`^E~Vx3H!7 z={(t4`Csx6)!%?5|Fh|z;!QPmGzxxD_G}p4g2vui?24bt{y)lih#Xd7pY-+jq5K>@ z$*!Hh<3`xHUuO%lIf<1VcNFx|kDZK@PD*W|%nDZZRM)(U@XLpQ+-+|BfA9F?Qa;}Hbi74mRB0#$G4Qw!%RAn7) zgvGI&)`}n5?J_XyIJ&XOcsmDAanI*R{%rtDmMM3%cRlX+M*d$lZ6^0nB_8oiZqm-O zLCaCA&2ektWSDV{qw7}bOXGNP3_XBb!5-`hcCRd zP_a#RF^q3GH6*Q_yorx9>3uCYSr2_rQ8hzFczr{*XQ_KUoN1!|<$+)7`A0oxlJ|CS zB%)`Qq*eKjTeG{xAd}O+@HAF6{k3U)+Fm?!6>|8CKlvy6(bl;ZQMe+`rlqw=eXV=C zf4AEugEZc&ddE@f>x1`WeKysX9pU*`*4O{RM+4uF^IWWdn|kg~Js79YoQfTCh?o1vczCt=Z~BF`S>&zyN#@%wiEh#QZ;JcUFN-PUdIx=r%%v@i_r~7)Fu70n zh8OkIv1to-Z|nxYSiGxGrxt@^~JncUQyhn2#E+&;r zyU*d5j@^fq#oBxWxizKs)yzLD^sqZRv@1)li(PD*{;c=TrrB`{=}$ZJYeXwe#ZzJx zT8-qw+W0{ok)^1s=QHzZtT0BPt}^$2!oJ>vq~(gIQ3uhMulP%wxcgx~>s7`_*xi($ z!C6p#zinXmL^DBGb<8wlpPlUBuV>^(YN9pGAO>4O4_?Wp|D$-gI0Ux?$!EOY+{;M0 z#aQ^QxPxE1wNh>Qsi(l@zltYW4;9$Z=aa!ie%{3_>W7o}iswkPMe=EJr+<6-d$uhlA&Lh<4ub;muK(>=Y;>fuMvrtFIM^v}wR<|}A zmo&=aG|mI;{P)e>JBt$+Rpgz0*oXfVZxt`4?iwZIUa8wl@V=w>CiLMG{4AxhgZ#cq zyN=Daf+n|$4{0nWv!8i~b(hXguFJN@|C{o7H0Dd2C@yg-Ur}4}Yt!`B-Ni_Fe=z@-$J?R1%#y{YM6_`F{n^}Nho>2$Y!Itk{>t({NRyPp*EvtD}Z z?_zQKfwjp8&PPunjraIN+R(M?NfYzX8*pGlwxyBzc5z35?gjR<+D+2bLK}WAeBi z?&qRb9;j;G$!i$%W&(ILO@DOD8~GOJP%3Lh1#ukzxcja>9G43LeS7#9_Xu=`pVN%2 z@p;_lS&JVzo|3mQucnZEcg|J=a|gCn_Itap%HMx3};|SA#kK6zAm5MB@LZ{82e8&QDJE{8;*ZCqBd9 zdt?4QU+ywE-A1pzTRfF#W%+ya&#mtdNbbsGT)kPm1iz!65At>9kD2f}aML*~^!RIS zz5-^RoqV1*#Pw&zueqE(_(_vVqFt7oiH4d-K63--TjsSkdhi45k;oiH7O-foJG=%Z zZUKZXk&nAO*)`M7ihQG!ShsP)JL2Rfla5;x{7UDb>0f5SJ;`kk2vyDe)edE+(trIc zdvsqj(pYc!ng4l!c65P0e4gq%4>$G5VK-wjMr_1#Eap$X$;^=YWg*n>f!>-;axw42 zv&yYClO6$EMyh3$T9>Q+6Z9{%>JqJ(>0NvKy6*(bDp+m zvAb`AF7#y?^wX+l=`qH9A1z%gZ&DRj^fty)UuXYzmzQ}Y%l|FsUp^Cqawds|ujB%D zYajf)T0HIT+iT5ex->04m!F`!RovG_W;{35#-SwGBRx+%Tb$zixTuS-hUWFX@LfCI zk$<8)i}P`L?ZSJvjd+w^@b&_@HA%1kx9FC>Y!15|c2wbWxlf!#uRx!o2i%)X>X&QH z)b!Li&u6qRh4=kPU=%<30CPxJr#_3~JNY;lq(kYejJEi?g`}Rr$4^$Ok&Q9({BOrj 
zexHBrDST#Ra}2vYPL+;HQ@*MF?c)t+g{<1G^IYCAZlZelI<}=$UTebme;M znd9vy?3i3sJXm~|UhFJro?dQ^(lMBe#r{=GR@+}olM)-!RPkrjr<5ky>_`zmUIS)1|K> z3hPbu*f$%(>$!$yTMI5$FfKpj?QhFs=xg2JE;jgCM}P0Y57H66DZlqbxx)T-zf^cd z+$ZL}vT7%7^Lb|LkBbp`Pk8)|bA_GN;v`ER<2!J;8uS|thok?U5XpOaE_L5Z1zIP+ zd0Aggc-slyS5ZE8bDHq8#H~`bl39L__2^)DJpvBR$h)D+#FCe{-ScC_Z7+WnyP=2O zySwKa`D<|=w-{7&UrJvizO{2Iq3uoN=WnJxP1wc#lxijpI{a?^e7choj-7>%$h&cU z{u%tc7=AZ3I;+V`*A@4T+0Bm{XOB9c64w6r#nUCdDLlW-xc`K|!cDC#k(cxC_-y1{ z$_whbK^#X!RX=g|8vHtU76jn(v5+`jeuBn*6`kn zbhaw=_>;9VnC~@Jwl0ptdTdkv9kS&9Q_;l>3XO)PEdWoUvWH3sgd@? z&cirQ{lMn`m4D*^5{T0U-Dtv=#_Qk3RDIUed;5`&`(bf96%zd^zAkPftpOAJvWZ6< zQ`5btVsZ$}xtaIGokWXh>fKgBv0r-}tnJ{J@i;n0**A^xf7pQ+z@!eIJX&iGpeY5N z|AB|$Zb%#VMIDQNuD_e5J>X{tYtnH_x8vcTEGPd?d<~s>_w3i2XX>NR=7oA)9qrPA z?5o^9VJoA89p(4$yt7l?AbEy%S&4sHnz~Jc5K<&d>mKt2kQ)mlrkih(@f-IA0aN~_hM*zxj1Bdl%iy&!rh?%Lk2I<$jF}byV>yoIMq-K;`Z}%^uRM|+~qz6 z52Mfnw{uv(H9tn!{qnPtM_v6iA1)uICGW$tn*0Tq;nyvz?#F6D&h?TiSyiiIJ11#B z_Luuh+2_9(FNs%n=FdodFzrR2-Ni-6-bDr3Rr(rTquA;FNv54sOpzt!=i*nk>_1k> z@A^IR`+S#*v1PAR%62two0}g7l5=P0T8=eCUSQvD5w3of7rHA+kK~WvQ~5hYi!SEf zdtX+@=gjN($OjUqX8EGBv+Vj@%nyG_Q9Es(zLowD*Yd7pm*W3s^Ty@;H9wK#W5w%w z`{8V=mEiB*(?H*Dr$3&M3uUc*6z<|NkH=}OIA8a6TERMKH@oQrar-NKcns}qEo%R3 z_VD52-Q0D;hFO@!Ee381%Gwwc`h~swHEU*J*2st&X7@Yp#*jxibMlrK@^sI2FeVmc zk?o?Re9d9iHL;r=x0-f`->qqIbGX-+H>ZtV^65si$e*k-ZiljGYqQU{_1sMDude(D zEVMb|DB?Esy_4^=Y5LtgIo>D785|Dpoea%zIDDNOvwgMVxMD^=4(|L?e9d0(PyXHW zv%PnD(Fd;8hX=pG|MqmPcYcu1)$S#1>G#0vt-LkjG~$+H9$~mQLk)fNi6wn@KEcR_ zMEKv{`-bPEe5(V8N7JIl{Ko@mVQct2$afJV=uNeEd$`sfitUz%eb0T%8~u{HBy^+p_h%=cZr&N7 zC+<{Fod3Npe}tdx8#rB^mSyf6oPgfU_j|2t*YI!H`(b&#q+NmMH|0;Ge*~Y`!_@DK z_wzCMY$i|Aqh`xz>>q};|5ov%dTQY(#Z6`Wz7L-V`a3oIh99Y_u@?7du4AQ-K}q3X z=H*CrMISMxyN{UT4t`PhTWB=GNA z7}d&~Plh4khuVt{Ooa9K(cb3jJg>y_Zf1M-st4<;%mPS zQp2Nc#V$Cz0OIbA=c;Kx{odSp?j4oxs|Az2>!+el+RM0VLE>BcwWoUkrg)~3_=<>j z?adOKPWvJfyCpx#&itKq;M6E_cH8O~c{`P=ojR?a$~B6|I_U)Z*@W#s&>bri`M$+= 
z@Cr49?Qt*Uy=3{c_eD0jzu@XXc643u{V(l)8|M??Lzfwo%|SV<77UYe;mm=ozEY8v|hNbtmkZY-bC$oV^8YtDw4Pz{`N`c zIHxk!(|yF-bvK7KfT!I_bGU3M(|xuO7d=AHJjae5dWT3vjf4uwA>;OTB!>s)Mlpq^Y;{OsrCvZZSKxmcb4V!!mVTy%3z67oX&-ZZWBKIxZ+wo*#;LzMyzPws zkNg^a^ag(^dTTA(lE1h=`{|Udzh13xUogf|OHoo?>BYD5=6Awx5AhsvTVE^DNZquf z3i~wt%;6UvO&eP}2iRNrb;V7-hyQfB`Xi6VMOjDVu8C+QCv%O2i?d07jr+3P=*i>s zRyN$2?c(0pskm(+N~&LS7|qM`-uM{FAG5o$TTR}NPm6i_y^HmA!Bg~Ywm*6eaUDBZ zXU?+^*W1(M$wfZ0eRrp@7;57I7_cJ@63e5uc{~fEi3Xj;Q4yx!a7?GpSRK47V`GrXqH|rzAjG9 zy;OWpK4Ez$o{szCSJUxf^ejydMQ?&Oo0KCmV-kG}TSx8!?P||%jQsbZt>d()iSIF! zcQOXL(3EC+sV(`p_r0B#4pnR96Cb18CNsuxr8?`!X{6ZEU)w0(%m|(YRpP$8-Tc=P zAJb5}c>j)iC%hX+!}iE;vnRak>y1a22=DxB;G4N&3Eb+ghLh1_zvBd)PEy+yq}Ec2 zGmVl~xVTgsy5sx;<@Z+Na#$I>pGgLDeZR(kyZGyVc-PjO&ZOl>>4)#&ZeMu$ykFb< z>m={FkX$z6yNBOCz)>qbeF*;VQYL(zowfN#mU%-xd6aryC+oIwj};&;V*xogveFsp zPL^4G9C6BeS9_Q9$-X|?47pB!wWh8MO~=@$+>aL3vp;!++Mg(fr_E{64Dob((%ugE z+bKClZ;ejtYu(ZC^gwd$Y2`f}XVuy0-N;m*!Rul8?INCH0c1WFeTKPaARL~L*DcZ) zj84&ssXM^UQ47)glhthdklQEw%cIdx>xa9IavS|)t(|uFJ+ie7*7`nfN!U-X#eG(j zaeE)Xz);klZdr)zn$?vXsDFMco=f9ap^>s7Mi#g#;+-F)1(zs4S8sRauQ1m2=WFTp zdh^y|1AMM${Qi*6@V0*N`54bJ=`FUBOl_i&%p7PNA*xISn1 z;Md@MFuSF*=ce<+%wga6(7$hsZ`+DAojb}}I3w_ab1V~iFJ(ScKJodm`C^&T^@;i8 zWNY5``4n1^i_6%#9C4N5>Qw#L5q?&trvu?{Wi9BNhaEmhe*WS9WohMm(w}j6uG?f8*Zn zDdZCOTZJ{=(m9nuM(z$icZD~96ldZu?A|l-F2=|H{LI^_e|2$DzK2#fkw>E~)OwQr zyajEk0iXBv-p1lL-f*`_yYvcj*adn%mhV^cw<3QBZnsRYX8-TOetsD}?Co37XY)|r zmCM=AK7Sv7;fGn1zhpJPLY})$v?TTiw^?FkFo;ZQ`mvYmb3gY?^&HTw8!Q=)Zf|c{u0B~Uc=ivZrShNC$ysVU-}`6a=SHYhPyMGVy`NuiSAM8> z-VN1a9dx1h&S0fK&#vx61E2F>6P#XXG@gL>4LBX*^FtK3liKg>D>u{4{0A{toeQ5Y zQMz(+HS6GtO8?4xw zTd8|cT0?*D?tYYk-rlc_EjhyK-~e`GoWnhXT{}XH_e!^bYe%z(BZg;)IKGMHn&?Ql z*NNO8PGd(@Tt0m4iH{55ZDYOuDE{K!nWxgNQFr3UI5FVGXY3aZl)Y*@IND1#!KH9J zcaO?^{q|Y$W4bFHngM^qJJMb~mucs-?B^p&xxWXW+j#G@eD*!83pS)vXxXmfp%3*r zjs3nHmk-Hvw7u^#f0Iu~Bk_BDHsPt*B0Exh`@rqFaOj96&Ej-HJ@(_l{_W~q;TE_% zl)ra7+_SB-#n8-3C_BLO1NeJ)rf~^7cu!-ax45@u`032gaJp99BR(T;8EGlrd8vMH 
zg+9+`9r14NOoexgvg1lS>ac7da%tiF9PJndpO5psseQM-=uK;%+u^^LsQ)89-H3fZ zn>14Kd~@*8NBrFuYVR*tlIOG7MeOGHF}{n(WtR`3qbt=Hc^)pxVx@b8==@Gb z%GUB`>_*N#+1bAqm&*Qfezpz0_(#0*49_=nDxxLZ;XUzr`_qz5;+2En55-rv;qUuH ze!Ru(haUFnB71K|=TYVvA+y|rwTCgcEx+-$q`(^rhhsgsA#>NF_IJp~>u(vG^TWug zw!9m2wW}{Xd7|3tz~>W<(;D(RpJ_BZkqOuO@E_Uf``8`Jcm1sK?JS54|40^A6s(0X{D(`{ihO-5dIx$nFmR_sQ@%_BEDj z|0v_*lzfi);``!~{2))aaE`DKH2IJ}WG4tz6gTAuzU`8vkki}_Ys@hyCw zjsMsmJV*Q5!slc4W)E+^Og+uL<5Yab35c`wW?emeIBI8YKLM(RM`F49tHaJCjK?@( zexh;En&eN@?#ljKrj8agXbE~3Bji+1$9a{r{o6uY7kOK6I&q?3V?>_gEs+Ur7g{-i zG&*C{N^k*5p+A>@Fv-YF5m1{dXH2jdg{LUr;&XIPdem^Kp~1BO|`PN_r(e ztFuIQFxOlxer;Q`xSK=LtJUxJ1D5*@YHrV8P?scH^HIyX%O1YaOdy6FP7X7J^|t32 z{;1fyyPYIQ!`1M74iq~P_g`${(OFJjyR&y2s%cVsjM8jod<-yBPKKXuLG*nO&uw8p za=KPjPR>pDA>F@=dFcfGF*mt3jnif+e|2PyugWhyM=jgV77giE|FgU4}7JI-+*K-ZeW>r3=bRXE>?7WRd+N6{_k*wVe}={Dm0 zrt9x+{L)uJjr#l}BaNTq*zr5ll&NKT*>m_q##=4jF0bB{QuCT;hpBBx^dq6hzU*PK z==63>b`+`f6gR$y-n=Afp3Tr3KNb6Du^x7VYqo@Tj+X84aN6@tu{c{uUVk`0cpg2e z54ZOv#l-o*S$eO9_=&(lY_?)EDdf`S4g&I2^vhdm1@) zwT#r8J7u*WZid0{E7j8o?jGd1y3QFMskD1CvlHomoadhAeeIm?TB4sC`@Xo;|Esc7 zz5Tc1PW9Bl@m1MQTG`*}sN-pYGX~j#xa|O+V_u!^9Lx6f{6K!^gY{P@<%jB>CFMTo zNuJB|(C3D5zo(IOmpJJ&jD(3vwfq>?z?$MS?VE1(_!2%&GPAXWuYn&a|3&yeE5X-T znKXoZ-Su08JnV`%-+ioEr4M^}I)1x}%ecUEb>Q;RTGfePWuE@6m-NXmQC>WHeo`s7 zUY_qx4~MxwHRQHyew=^BT;=D|ewlE+H_qkl>b-HF*fu_U=z(4HQ%X7R>bb_A+)sVY zM3BwXj`}2YI9w0=<=FhF61PvzcO{R$@c9&4JX3t#@qS$;zi8ldJ8^ftXB0Lh#-4J`8w#=<*^>`44;?Kzb}f{yeI5)`R&N}OXtGJ==FNZ zx1Ni2^v`))qo9rWj-7nVogX~h_eRFXx%llN(&Hj>9jGs__r0~cuJQgj!F?4}Y(th; zkw)yiT}_%Dlv|Ix@5Umf&BK5?2=)y? 
ziumcPyuBT6-GIwme5VrIDfg86!$Wcn^jOMDeuTEqM*mRBW+d|{EI5jV;od^yzYhQN zCAjYdhv(7vd)U(vTgbBxhhtuT6Fsti)MNjxw;xnLy$wH2;rl7(q=o#%2jH@+{l>f1 zCgLf52oA|In1+9_zqscW==K!T3(Rdj`H>Hx-3{UMfAHU&|N2t0|H?hDyC^00kUbgq zO)Tag$k`>Qv3K194xjfney2HSvH*$}#p&rHEw>K<*9VwG&h|adb6pBQ8*1rsaJPs4 zxWW4(|Jv#48ES2Ae=wrsd-G?6ebJ2FeX-f4F+awIMom>XJyR`heLqY6HGQ64j-w~w zepmec#?JmkB+OIzXsiEj#&`J0zE5v55AZLQ^MZSW%s)HQuLju-tekQD)H7KQ&x;S) zmxgxccQ}uZj&&X_biiWOG?zY}A9>OZ8fFc)V0Q<9_eJpL(*a z1TJ(?{xq$t;P>;?(*QpIhaBsc<>UU=qr5lp|Dr5z2^;1-_~|SX`w-bh>Zx7&Geq6P z?GsCnqd_3$jg--s}KJUa^BHFK_InSZm&hy64GeNVnQv8(2z4&?WtcG`*N-#Cfh9twXCMXe(5#?|=lhu_LBcr`He=C4Dk60^sEPbzD2)B{*9}>zb!lZ9JR)I z@9VUyuKmL+l

e?|ii+;_$8`g@*8W1v$6kf4&HA^z}XJkND<0=~GMoj*CdIIluF* zr95|0K2T}De5v36WIvy!)vegmC#a`ca$0_@*;jrUxEuGpoa*_Am|2z|NuG`QH7@jY zJLd*xvbfeI2f^*h>c1`z`>nOlen!Wzbg6bsG#@_6?vE_Y*X5)1-=OqV^U!oFuov?) zw5ni zbAN4pfOS^In7CbgThWN!;ny;XMk(vLE$@wR!} zQqTQaT&><%kHmSIE6DR9PqeZ#a~JPg`0Gy63zzcNI^T`=zvUOb481R#q7eynOCL7B z>>?uJeZRX4*Q_#?gjUHHeZBT=EPgZ-G=!V4k-=fC^EL2%wC@|qse$@G(|e29?KRWm zJU^EI`38K(dG8sJ`*bsK*ui$*(_8fmx$34kd!*{BVXei97#{}9-eS9BK>HTbKkxN?p|P=)#b0g6*?BhBKCF* z5=&RH>sHRoM1?ylKYQ9;Z( zN6a~aB1V#8M$Y@K2k-yA_akRdO`q=S>gr!tS65e~w{vIXLF2Pl{PRh{jdQK{h3E443s z2wYu6KbxyR^t%&(#3Q_rE%~J{!NoXs_i=P(MA9RB#_fpq{3GM6z~8akcd_T0B_GiO ztI}xqNzQ~jPdmSx%l;S%_jbfxysGnvRsEEtV>Ur;ySVok`;lE`qkX{md(kqxha5i= zhjFf7xv!QzV*b?dZ`^D=8uB+EPa7sC+h&jY)=c)nTh-g1-{TrmBj;>M!T%ii9ueGo z!`Y4~yJw4vdJfC>^?f#+K9}_BiNC%Ny^4SMBK1dhv4!@KE#dQn>Mw`em-*`ecKE$& z`>S*{J_Dbx$^t+7@E_lU>yP2{BGRl#7Q@jdB)tefac+KZc9+ss{LpcuOJrhplX~lk z2l`Bk+us27k zu}=Odew=aV4{NUrJimlKG~nlWjCRq`{3)Y22wq=jy=xAiA4V0qnLmr(mi>IY+GH@v zAIGI!I+mGi9)fVy4&-k~RpLr>INB6t`&B|;jFI|jR zm3!Z3_>_4iKUc4t!q@whhi-Y;cU|PEJJ;&inqT=&fA7x!d?_hUg~zA(`#AoK`_$f9 zT-|J=wUT{(3aLzVKjkL;@PFnL&ARF70{!Z)z9+3iaX$B-d>h&{IK3(Vhi`S{qXc%-&l8IzI=}lw_AO`&Md0-E@7-wk@@;s;~ANV?t-HO=+^6g-%{C| z{MQrj3)R!vZ(h*<9yq$0)FZ;?Hs8X>y2$8`hFDh{&+h(vmY$dM-P{Hj`}^Nl)9+pI zyxKf_M;zVdsN?9~1@P+(HLcN)Z&^$i8@CB;)PMNvWn}dOTlzipOVB6sB>o2%_(GYGy#yu`TlF8yw!2iF35}j?sxgO3eLa4+G*ro_d^99?w;OV==;RHZ|<$!_J+s@OTVMG7l5uI}-rrGX;l8qC-LHJu*H5F1FUpQKs(U(TypfNwQe4H8 z{%yy9{EYfL^OJ-=SMztgtA6jI`+eZ{e82Cjr&njMpjX1@2hjV=v+<(Ze=Xgbz4kXg z->aS;aQYE)66cQ|e%gDj7ahpu-Rupx`lBDTX{&yAM zahf^2uRI+0vroP(t+8H&fAVL1bw&B!zG4&&*_b_B=u<6t8D7gIzlGkq^9h$@A6Jz> z4PAEUCwWl6)|I-#;a23(E`Nv=r?IP_)$8xX<=v0o)jjQJ%z`0s`+nSsJyxy;Wsm$5 za`b*FJ3Y=lcg&yAPw|OxxdLC~;qqiH9;*IWrw)g6;Wyd?zPp9$y}`!|Ih>e3pdXR9 z?p~NDLtKI9E#$8|oP4eI@Hb*3ZnZCHf`jw&vv}424X^)6tG5(i7votY9&es`U6VeR zUu7JJc^Bm+^*SGC7f(w*miPG>_Qojjbk7>aw#ir8@d^_By@%9x@%gg)TZpe)thPbk z9eh*0jo|dl{*8mQ5Id`R7Ks(7z@&Qu9+7l2u}n@1Mfw_wnCB`)f;&%GUM-e&cTXW63-E 
z*~;9!L9V&?()X2<4Mxt2rG@+tWvhR0H|<>%?R;7K4F1Qvzl&-2_54(yC6Al$KP$7; zd~B=rbFH&C>D6L=ch6P%Sl53}WTN(89d>QtLU|hU?*NDI(4TQm7_QdtH1_k`=xvZKo9$6 zO8QC26h42dd}IFGkJVPD{&jptKbKBQKQs%(+bY+T#_W`S1?Sh5F7Q3@`FuTXhJKvS zG1jlE(Ia2(-RU>}UJE{dL<{95CaJ*f2h;EAoP9X@RP*IQ^#%Q5pWRslkD+%qBCo42 z&JiB4K5R}K)~1j7>nMJX56y$7{Lv5k%$)z-V7&JA_jj~gBme0C^mlowGP}X*wT-_& zsFv>HuvbFmKS~wZsaDGU#ogUYTiGA{Ml)CiO|#3j)6Z)+kK(?S{75g;^-6a0BEOcG zDti@|BF(d}w9=UW@iwV%5Fc_EzIe{zdn+2dV|EVh-3=Z;r(AQN zFZt~4^E&IQvAn{B56MoH}^Y~SzTJH zorc8J`q-axh$sJj5gNw~ab~X#N&C zl*8??zpB~OuWBvjcYaHM%bg3pTKsOlcKYb)srfs~S9<^T?t-7|;AKR%-;%#u;K1d+ z$9>!-`K|PIEAJ4#PYaqS*Ri*!B#$N!D183HI90*tpM179gI43JgS%*-n=hN|`5N5vA0=za zuN@0wRiK0s_=bGC`pYWxh-m+%z~?T?$3D6be2nq`yxGlJ~@q7;0x0Ds`RA2a_pNT%IRm(zVGt4G=)nqSbqXHPQud?DEgjtEaC${ zNxo9P%D+$X7#*K{$}S&nzJ8Aq@y{FB*W0kz*6^ncA)O7{2|TZt-cE0w!16oy^^3-E zD*P4Q)205kvr2E&|E+y~i4!L)K0D%gsQrI?HBL-_QGfU^e$#$K9DNNBo!tFR-#Pum z_=qI-eV+c$P5)FXaC?JaRmm6fyITH|m*Wrm*+fo@EN!Z$iRrK8#E+zW#5J9p)=De1 zv~#+O*0zJozoWa$i(cvTGqUF@{sfo9p7|c%wakkb(pOk3ot@{!x@ZTW z{)p%QoZTJ!$-exv18Ga`>|6C$vy+#xKQ@+H*uT#v&xYAI=vLrtHMw_6T4dMK{~lg9 z`wE}l>B!cZhnahLkJ4J1p#8pqk(rpS(7M?F>@Cuf`w8yFK4k`cjCnBwKM{B0wF2cj z$fCQ1K91*~{uCEA?BW;Ed-8KErU!HQD?Zas&d>3w+S;?DZzqST;z{17J$1!Huft^x zzen9P?5ls_H{SiZ435YBiyPr-OYPnSH{F_t&ueib&v|yb&&lowzH5|uA<^R;{LZY6 z{;qI7xIsH^3c>&8>U}PoO4P0L`QKP5e@$&JNa>ZbocqScH6PP|BL(i z@cBFK=WzE!?R19QFO%YE_VjZ4nzEah`&|oukI&WXRZ=)QSpC)c*XVNcz^B&qs|)+8PYFX--3!tJ5CB^~ZVNGwkJV>Ukso)VK2F!F*N0 z=a>0!xOEde{{bKU;c%@q{P*AHb<%jRc@6xI{IaX_CTTsAk-*2i8^Tik9bf%@ua$;f z%_d9ZeD9l5?Q{xGb35)hA7m4xagYBm*E4YEyOYYt?=?I>O;%7>l26P+5yU$MXP zjgqVRtLvp(da}8X(2?5ihaD)wp}Sw7pZr&E*7BnNhUeFv5B`dq9=yOgls`)} zzzR-5*T_+8Eo@cH+Djw#|&HED>H=6Uz?8Q3& zi^cJcK8xemR^ao`5O^#4{vP{#TN?fqO6*@6z{er}?mcfc_EyUV^p;xxNWa87uzLfS zD$-J+&mG`aJ92HqZXRf4I;AOoD&ce7JZwh_Uxn4dA6=U~n!wZgaMjCl=})B9*|}iD zg1&Rw-tYMP)H@YlbJH^4E5zHiOXD^6>FGxER8EMr0X=E%j(Ddu?B1Q>=w$q!2oKB2 z;ZnFrz`-68ow{SD0UFReqe$9c!_ 
z8|bhd9A3`bU|*=e4QRq^=}Ko6!}x{&qx>}a?aIidjV0j&EAuX1*?CxNVvnenq^`sO&?i(2AeCep+9D$HT8`<24#C#|^oWa5MH7yJWv; zPlP?(j~iP%syF-OOg!fUt%!+U*{0uw!{Ok~=_e@UB z{?yL*rPI_C_v8MRjW+K(y4TW@+}Ex*Qx1^uDy zqhdY3%ev9cta~E6-1j|`SBiD1RC<;lrBi|9&+-Fp!7sgz9;Wgz{igpd^Tlt?J13BEIniSCyaD;!Z|C39%hB#LE;GO7 z7|YAxjuV%>wtm$X*RfLlwcKl|2T$6%pR&;?)E7thCu!G7Cg-I>4-d-g`yEe4-YAU- zj2Y-r|H0aco6-IA3~mhI*C?kCW$?L?^4^2g-gNtdBk~Hk=f0$RBEQ`kd72J4ujiu& zo?i-Q`&hyMV;%TI2AjosQ@^i~KlEjC84aJ8v)($&f7F!zw1&rvXy>l(mwX2+-P6=w z_`#myN9ji6mf?TByep5u-|!24U|r}01wYj9ME`!UF2sCToxh38&Ea)z`d>%$RIkb<=J%u~ce~&uS>vI2~x^{GwEyj^sy`i+~*4lgzKQSlp;R{3LqSfn^r= zm7211cV{CugNL*HI~1aXoLlin&r9A+`m1k>c43F|J&f&l@L~bP=$uaEN1Oro`kT^-rdr$OAdge1CxK_sJ{NbCZ}cFG;W4m zmV|$NMRGPzZ$EVRPtt1R8POq!yS=dhQcfr7TlkK`&m5=S3%mmLB*ct-!q2EL?AtTR z@-lV2r(eC`%-cK=vfuKa><0T2=<|tEJ(AC1c${asajzGL;Pww_lK9@-xU^ETP1@VK zwbA-Fm_BYm5C6#qav4Ko>%mE{1UV;cPM_ml!q#b#Q6OEfp7vJ1HtM(EPOJ5!h3~b~ zQO2ncdf?<#cv?^Uz2V+y^exg#^d|6lBQ0pjPhC$tav7w-zYNlFx6JrVfy;42;$Y=t ze|WTdBA>x7zHg!2X=!8nQ6=BUZ@!63N?RJocH-VbpS`=pu5D$9cRaqw(~ke2{sC>= zLcB*4c=8r}-cv~Ake40ura$WQrf+Q|CSl!re{z0Ht( zl6Fb|lmq5n{fK?v?dFj8JNHu*ps7J9N@VfS|9=ZN`K zDbB7{F)rt37iwjWEa;)v8{u;`Ih2ckd(^*^+*?V^lLq`C&D2xDuhED+r?}TrL(hMA zA8`X*YnnWeh2C{yPyd!g9?ko+l7Gwjnb+Z`q4Pg$&)9c;ku?_4I025YHiu>W`kVhX`ajgXtjK@WPM4%S z@1%U>53QF@CiiOew&q3ayq$U1PF{@+u6XY;Uu)JjVn4S`tIWq)xyitXIF5bXNk6yB zThl|YhQQIVF9zj}>06C>jaCKy;Jm+2`O?ti=HKDI2i_i<_f+034efU^9!KW`jcX(4 zdLL_`g8w#D!n{q6O7=k9*KL91>q>2KiqkNITn z{wpcvUC1kuFR48}Z)y(}{W{i3b(QgroC}@lS!6(MNd8wr<}UR3GnvP_s;3pr_>o=s zupTsFKSz$8IR?aFh-ow*QFjdr%`TV56g&qE?+)G2!$kTTSOVVzG z9bJ?5;0Hd{nW@*I%+deR%iqo5h9Y7=kTb16Ik$$j)6Djcq5@WEpC1(d$A0ZR9G$An zWBBZA1fFps7v};uYvCY1kmW4ph&yFP6mq!Vsop$&PVB?up4-RPlfd)mS>f}@tpS{g zj9Xt>N2+MpSL)rte!;Fqe=2a+)%VQQnQRbP#;$ zkoGO`YDC)6d~Bvy!_uwDvv1l$yK?ifhhv<(rX$i3^kH(^Og$0T9QJd>dklfY?f|A8 zih2%!ca7-jB%cvSx_{a|jr0A(($4Cukms%+zT`8(Pt5mA)2cLXEFO>Eiv9ZUbU6JK z_XR&i@rl11tDm*u^N3A<4?$I}|~XK(MLw3f-~kJ5W-AN;grKi5moPp z73tBuPnB6y_C^K&a_2O%oWBRZuW>R^1NZvT&PMPz_5*)fhqh0Ku*3f}+5`BXV;ozH 
zw;QMZkG$JmI~`-3`eg$PIfs25>(0omsu+jqS)T&8_f}u56GO93={V(ggXbZ)eX@3c z*H=YGXu5fM#@;_Ytf#>ICw;I*i&Hyfg+7DcM0Dm$6uF6^)_ge6|v-#dAdCfWz=Z|+~9n+EI z_F~qvDF2~#BW^joYhCGXi+=${mjNy)|o2xt<8qhr^@6_^^FYq6>>}1&yfwc zTHd`8=>QVisGec!D}$@Ce`y16df=-SoEvETTIFpDyJ$4KH!?2`f@_`e-yfcaojIkj zZ!`IxgRd6wIqI*1&rQ>u9#7A!%>TFtzM1}4@q3h$cWZv;M&#Aa_kQ}{y6|I!pWsyO zMttG9c~|vSd!IWckGklM(!p@|Qtip^53egx&Rr9D)tvo0BPcoVnD`&2C;EAW(AKo?rnJF?u5`!@RqMm29wJx+Kuv$`TxE2_-jbp&-eG7 z+{O4lCz5V&wRr*qZgu7T9?0Hjzd0eC%t~*PCdRp))n^E~cERD#NyHa>mz%W)QPNT5 z(8{VbJSA(R(wmIBz}>Nh9*#;UlXoSndvo~Q+Nv<9C_g#f(%)Gm%56zMC&J_KSI`oPJETLEt4{Vyx1ujzkVwa~H{4@`yYc&@((UL=YBVOnt*-F7cTxUQIM~ka zW?DMKIM(n}_f)P0ySlUf%;29Mr$1p=??LYax7*U&9oWG`(@(vN{hxF%^v3+tZPK_? z`jGV|aF&>ndr^oYBze`+PPl^`3@J?F%{n}_?hd&@saH2 zO6?4jA87=+ba2l*@PDej%gyL#9q$qLP`;D6y_)nWJD$MhxH(e}C&Rzm3O)^1e;4$~ zzuzevUyScW_*89Q=k-#5cQT)iDd;<9-P5>tJ1T3z&S)Z@V~YBl@gq-3BMy9w`bL-? zyJg$Ji@0a9O)*{*wYN2W+Qaw2w==UxpvwX9xj#8aeBGw?_w+2{9S(F}I2OKrTDpK8 zTZf*W#lKz7uD&qqrN67apAs2o%i!@aIMKMzFSqcyB-=uN z_hLS z45B9+y|+?Dk7g#z^AV)+qBxGtefM6Qb)W_s{+#baiYJQW*v7d3DL;Q#a^;tE-d86X zfij33dip-M7}t#5+YH8eTakXe1#V^*cFSMx2ppYUi>K-7Eaz^WM9_vESc0(s;%)4n z^hx)X=b(SGeIbX)w11h|%EzBZ=9_Ywn`6>=c_BAGL(tWhPT`~iH#@+WP;oEGo zZ@vonZ65?Age)_5@^$bJbgwohL@) zP59r_D)h1l;)rKk4BrnRpU>3W!~FWt&FZkjUxv(c?HJauc)OT^E3IGQH*~6)j^_Vt z$PR2xs&Y1xw@kHe42EB)HG6WsILsgx@hvUEp-! 
ze@}QEI6OH$kp9ZlgWi`MC#18KuhEPB3Y;62P6*lgzPoX3njDlKtQ{*kJRU*=j!L&T zAH(0gr?G{&+8qNYPll5b_j@^f?*JdRNq2%L;z!|X;OOP)l(fAa$$WU<#{54!4f%vW zWW4fo+0Waif0fpIe{eT^$S?r6yRjc0hwByQ--GxGe10_@;CC_-v+r9agMFXjdrk6< z{xxNXyhtA7$m@N$9;Y2^3wk5pw=}<+xeqwNe75eT)BH}pOY6cA_5Q*RkABu;_r^W& z#_Yz(WLTMP4yQ!2XWJ^jUc5&y`>YH;Pry%W@fQcAwdJYl3O~oelfhZ6XX0PrVPgJ| zXQy^Dzb9uq7V_N(J^nrl4vsg|cFks*UrpTKp62gW{2)`5%zH3n3%&Zh;H#1QEW4s_5MMX4kpK7D z_V7xillq6ii}l$K^rwQ~d8Ya5{RlYU32tpf51cHAPc`_ifPdZTVPzilmh$2S?tNG4 z0OumFesw+zy|sHLz0HrzIp7HWZ3nN%8($fi+0jk#HzpsfT{qU@R-Ak8!hap_S5C{f zOCt_{YvsdEI~YEe_-~cmtNrln-jYwje>3*<9Qs=yK2O45D{*u)l&fSn&w^8H zOZVi{@!0}CPct5M`8js6{=8ngUp;XjWO*Lm@)zY3Jq~WXSNhaC6>+n_=UYRaBlx2a z)8DU4KU+7vhJ<#ERp?sydW`b5%$R}Jrz#f1sYYQjjXTu1zQ_MOgFd$+`PnSc^Yvk7 zy1P?FI-}h|=-b=Xe+C1`qzB2E*e%%#zY+Zqmg(izqCx2#>;6CyVMFZS2J%bqmmHbO ziIg4)hh^I`Zd0rzOKI&qGV2M?*NVp&Y}}@+X>qcL^Scp{N?ujC-IRTFIPCleZvV+c zvzdAEn%q8h==-sD3a_xm9}(Fx!hBp+It}iQRP!xj>$>9UCVab3FTSoP+}+Yy-+OlH zZ?UK6rhU|ZAKZ?2kr(o_&xG2~J27j)fAS3EYo83|;c2Qr*YiG4HGT>I#;(StL-H;U z@RrF@tnF!Vp$@xwC!gYA*vmuMbq(Q4nGyO0p2tbL9P`?bcRWU=Ke&I<3O)udbxe2A zPDl8>gTMEO$MfhzbGSW=-no$pPh)@ABi$D+RND185%f78K8GDYg1s91j@_&Wai4rk zc-PPGr>DCX>)?^;5cTy=_F#{OU+wVpXy4uIQE%)I_Jn&~@O4spobRoS;6C`7%`bfb zd)xVUdW?Rxkq_xW`nMzdc~AJ*kY*f)UIU-^!{1Kucz<~IUFm;*AN)VaA0F?JF0w8( zv!8mpu-99#ll#LH-Z1=hFdtt1dwux`-|g`AWV#2PsD#He3mjgV?u_rw;-z;pPs^Pj z&QYJ2M6Dkc_**q?LEkj=84igBBp?Fs+BE{$YgZl?cRWpm+GGv|FXwBG`b@5@do z=XaiKzQj9&+rTULg|cJq9M*Pd0;eUqw90%)98|Q+Dp*LhlZw5D}!{JlN z@nbl=iGE5pSoy8Rah#^yRsWCv?<`&h=pjyeHvRskbi49lC%=M^de*fC_>FUfCE0-m zt}fMHr2 z!kih-3BxAyE-yQ_8$bB=`4vi?B){EW=6UXYtZ~JB*(HC>Hzx-9{^>yb^vU@|^giwf zF30sSIPFB7JvJjB2{*^Gi@VXQHr^lHHysVP_k-Kv7ru~v?ADQcgtOICm0X114#^<* zQpV_SCw`7O#-*!#MhELp6Zg1lYo~+!8G9CTydyub(389KGm>7ga(R9-xqM!_&)=KT z%g6J$zjrpf_z1uM2yVwZzc+vLV71)`pZ`goc9!{0=2s`>0(~D!{>(2m&fk|d+IR47 zlEDHfe-3;;gIpr7e(clYC9f;Y{)qiP35LEY1IiI_ySegrn5D7j*qOcYoy?azllO2Z z@mHwn?$lW@eg4)7=N52vTRp#n)qW$H%}RH59yb7T3}Z(QaH6&cRA>$l4^+=&JMzF` 
zImuW#hmq|h_`I_4d&r6lDIZVvf&zooJB|H(7(OGH={_vj8T{1;vDZh*3x1mRBhSeN z{0_sRv8ZGE^F`_O0_V4bt1p{vJ2^qxEgfpSZicUM&*Ndf9MP#_HTGaX)@5aHXFLY* zVw4-#3$1wjlY2%YGn5|+uRFl^3-w}Wcpxt^ySfwW|10ebWdHmQm)%m;o2}vN=At{s zr;*R7K|0qsc7fLu>EjIL##_^dt8H8UtG-ZwU-b-2+NDR(qs@}a^eF3QTdUG}^r0D? zK0^OwG-3btFw(u!Gm3e!H9tok_%zJldm7DA>WRGi`=le&Kjfz#qkP<^xgTT;}xnErF-CX_!XZm;yb3fce0;;G!Un82swFA6Fz5t z_YfQ(Y@R=#9*b_L%-)tI$9=-1;A(yTjT7O9d@^bHBfGh`67^c|@Dn)Gh+SHje>l(f zF-}&<>_qb;;xyj!YmwpE-r5!EmK_Jj+!V<^K_A0!ydS>3EeS`%&Y!}bER~)pjZ@En z*j=KBUvaMee_j5vN!c;!C`hqow!iOjkNfCU6q)Zw;?Gq8`9QBMaQozJ4mnSQw`Y3^;7>TbprD_keE9Ph;rngx&io(! z+H8099`1SX!yfv>{gnCSAg453&FSk6_!|m`mt-;5I(p~z9Q~`G+>;#&FWU1XpP+m- z`#A8qMm*igY@o*Si0)6m@~Xk}h#!6tJ?!mOa6kM5CF@V@*Vd>f-s@PW{@BMg%i~;E zE~q>*mp4vo=T4Z7XKUwYtOxf7_rz}__f~eLADbnk@;b(0sQcYVvZKnJ6NdfKPh3V_ z_TxzRSB|$|tHr;uPtop9c}^bV{q7k2tmfy~w7%?>*D?;nFwoSg6{)D6SI-OZ`8;|RIl}K?m&RWID7dmhp6A`uuzPbSa+lhj_J&gv=!v~>fzK_SmmTYN zLKS@5lYGX)=>g<2h{s?zbvG9wum`-InH*f`@emg2u6D`Y;NE2R>OPR94SgRE{k>LB zclO}-c+cCYrU1{yLB{u^(pfyCv7b6szox?Ni^S6P;g`N3A`$R^zjML* z$?^OdrJ<3pxjt1@AF|H!L`S3XM@yGe(zCN$_`AzAf^iXmb!an{&FAovld=xoE zzR}b4r(5!bxQp<4-N!EPs~?Z0m-<}``7?%EFZ!{cPe`p!&I!*oF5UT!+nA>##LbE}>zvCp-Ei z^QWEf*TIFxNe6KhW$=6~+^YqzJ7ibDuV(xf!^vT_dxA$A-xmB97t(+4Cy9rM`vJ4F zi{PubKeLO`Wu{}7+ugXAeMTCY`*z5#F&<6W#~0vhjJW6v^{W&A#!bq5H!M5dc*^OP zU2T2r>H9zFXLI)R>FS*Zzk`p)UU|QYKFimLE-IM+`Fwo+;QiT)$j!TW@Ug=0u7I=M zw0oO%B=$AWS}$su&o{v9ej$JMa%XaUyI5x$@psHmhbMj6&nJ;{5An|7_vk9$=y~{T zB0l5%qTL_JCGOQMMUPCT-|26R*M`ClZ{>HV+qX7>&zC4)BR(VWaA)xw=b7h?*wqIX z?RLoDCpUS{(R;Dxs`9YAx00X#He6oK|8cc_*&n68@Gs&Uwt;(PJfrRME7BeLTc+iw zlSlJ-=hZkKXxOH@|F!ZptIFH%% zXgIu%^UMl%^^s{0oZZeJ9eGP%&Yz+Um&r%>4>%L?8&k~JneL(NZ!}+ZFXa;R(FwHv z20r|fUzpS(hn4wa^7oQhKAwKJ^uNO2KFf~aQuD64nSHwcHihNu$mc9AUPF(*;@=4S zq&+{mx?BixZhLD@P;m))7C9D0}8{d`or|HIj!&W+~(jxdD>-$@fEZ#xA+gcL&3}1r(v0i+|zHjL7PpN0J-+jr)9&yOO zI@hdY9OCZx7Nq%|oqoi@*H2gbHjh;Aq(5$Mut>vSSDi-Owj5yTHRR|9AY6>|Q_%(V zVLIGiVEkI~(q8}%`z52(mg?WmsNF`t>+wciXnu8p&tvg9RQY(}>5tMl^tcx}m*2Qn 
zGB6pMUPZ6?k*p7G@qfH{?1Ua|lU_&9|19kc4+DqKfWsO7=cfN5AMf*8A0ltzzr>fs zKJ)-MyuB6ftn^}bbq716cxiD4sa!`6&DqbF;J>4tRLN*;<9Ao&FaQSkW?<03zLy3iWDPJG>b<07}e^T-C~ z(<|xS{9T`UC*@e}c7?ljtUEK4W$B&P{hz!8a6+*jHP0H-bCOB#N=M+V2F^s>TDL6D z`*B6{9Q+}(-v)9wxa*Mqk+;c4Ur*dn{p-<^cAs~cOF z=CFf1k78&1a=Yjt%)`9uiZS-pz zJiduM%H^B8y(oWjc7xyV=p670a{Qxoy?WwY@*Z|_L;7(hKF9gK5Z;G>D8K2ZFXqfQtJ*6>%!ym{3i4D z*HR7tYft6I=IhMPdGLIpc^`S6&!@K?ldbc(pRuFgT}!VT$g6R){#L=~?(|_U`*{KU z+yI|%!(S8e8lhJ^h|3FoTT$96|F81R;d4iF9Fy#)`~>>5AG6&Mz`g zOX8p7rHRem=iW+vvTo+ri%-b-p${p>eIb8GE&aP3|69ZP8`V=yPD}Fe$H#r{OVc>- zyN|se{`1%KhG`Z1=}*b@5s5d)$|ApWbM^+iSH;nw!@x z(#7;#oOyZ&{#)|;1kUV0a*vxA@$S{V#k`)wo{sbXp2|%%ueV8WFX}%sZ9?C6C+$Vf zXPv8{2cEM!l;4@vzmR+~Cj|GPyLX({C_l%_d6$fLwcz-of_?-3>zilCrnjKW6QSK$ zPjBaMX<+=%vyQ~eFE^_vUy?>f`ephNa{J!;(3sp`@qHZq{9b)=AEC^-T3!2(kCflld`#hF#FbSP{$?k| z>8_P2T$$NAVeB)QOE<~QmrEovuE0*!^)F=OEtp^bo z_o4M9)}J4;TfkW_#pi%=g(fuZdoIa z;}UZ0&JI4(I_@p;{2_YsUFo9y-}=)`-nrA^Ko@Z$Po+nj@3-Xtq4)LSubm}aIW~Va zJ-~i{6TP1E!+($`yvZM-4=>3tdMn(om%N*I(C)$TdI_9gDgSf$3Ob0-s8Ie$@pq52 zLw@pp_jTHBp8QG<_29!B`P1f0E%x-4^ekSWxl(=MseU+(OzRs(q}J1)n1PMx;a}+E z^sAm7W{i8aov$b%JqcV*wfKVeB9p@y^Z8dJc~3fsJ`8pe6n-t9H}YDQ*a@g-rE|cA z#=X5A7aEqu;ZWeUcT{?-%|beW8=Hh$lS7OpSfe0=OG`cst9qg7@R9G9}2L@nN1;8^4IW%Q;z z{0Gsup-H{;etKwUl|EnK>KJ|uS&FQN4;OefD2;Z##UMVT5qqOc`hMO` zBGhu)`6PMP;R$*bKh2%`#yQvy$*A4`|9)^e9OpWA79@*{Y-y%W2a@=tHpe&$j(LYAdWiZ$@8AQT$Jxq*p0*G$GeYx z!9$z9hc14fpS3~KH))>Th5riYes2`?E#X)LIMgw_gMHD4ALMoQHsqJy6aT}!vvRj_ zZ!JIc8@`t%`(=Ch9PVCA*oUp1BQC|aI4|`yH1D?0oQN8)W3!OI_aoS;k@w<2^zlB= z%R)ZQo%6Zj1E-HeAB&#_`W54HZ}xBUY0Z9q#QckQR$eLa=TYlM-V<&Adk?8UrWFEjJ%x{&g;s|;@`7}j0Ol~m&?qfS@~n=4c+H{uApC&pQ?PvfcSy zZQ%V&`rFR;=L->py(`>a0@ouy`dj&Q?Z|_h zzb1b8{n888hp>~D<=bm_x_gFqvqPLT`(5mJe#s~K+|m27kD*sPFT9_gH^$imK8I`Z zBX;edrBdk=^+cw!JB|MkC&C}4@e*1C5vOrJ>jrAtnmlEYhnE|~Kfh)C%9CDZZtvto zJ$Qxw{0g7nBi~9p+%w_kPDbbx{R+?E>-dv@$_!-<8j%muQRd5R5maHH#LJ|;>G^ia zpT^~<($Vlg&gu7r)5G9~6Hx1M!h`&haqq}(elqQ4ypBV6y6-LHSF{&zP|u|AM2<&-5?)`vb(PFb|%?$82$4>mW_Yt*(80__f~B&t0r~ z8QZa?C^o< 
zyUNA=?gRW?4(0R(<69p-e_HTAn>}04s0~P8giF;OTX2fs@`3SFVXs3!EImKk{CBBK{vq-$!qZem?mRvr4~G>^I*~U&J%4>GGf}=t_Aq(0g3l|hU(3C( zy|keB%`Vk`zvO599AaL#&OSh|FJ9wv?e-GS@fLo5cRu)eF;Bza(I|0aE_;_8c)+ty z^+(<){^j6vv+RBQk=pS26ZF963($x1gS^RJ%6#8c{?P2)qWpCAPq(i*8lK1ccs@Lh zdvJ5IkM+xYK0#FQHgGHabtCih)icaF;?wED=D~n`d70TnQg`<8*2ZIU{tf=zi_b5h7lYyU z6XvxzoBVC{^oOVaGQNG~^LW;{RPaZCUhsbczebGXbT~doJtya}zL&X=avC{|;on#c zw+FG4SHXb>aQ0?$9w46MCG%i&add0Y>+?TqllW9hxB>tdnUWQ(1#>cvOubq9% zI46J6&-dkj{*)Xmd0IZF4=H>64|d4L(sJL!o>^&sAnFP}ueUCLRg!PRIMu+}$Mvr( zf5}>UvZAyBJ?xE!>08>V;HO^8zWugT25({?*MXn4qxhmc{SrOnpZHe&&G{oP^SLv; zUj{!ncr$usLEqf@UsLq9=_>QuxmNm-dK)|cdkg<={G|=D>2PUBI2rVV@Dur1MDu2o zjQW1+k8#{YcfygD(m~>0&P>0fSM~TQE;SDGc#~Gqs}1rVeP7_!_30h-XA9o{)#lOn zrOVKx{yX>^n%nK1oNhF3KbP+GyYTlu&u%Vb-(H;lYW-SQdQ$mV2Vdl`DQEvYpq>G6 z=C$;jqWm}fF3qg!cc$yext`HlUGVu`dV_J6kypL_=zpHAq8Ig&QhGB#&`hKFH9y%# z^dAa(9e7h={%%ZvqQ7fOHQ5&h|9$xh8YMD^r@vd*)_E7}^I{$ay)5YohySFvpSedE z@@dE)eLr~*VSoQ@9@XLR2)$|qPaiP8gWPlZoqd?_N5{Nt$p5@VJwxT|SWmAumJZLB z7yKWQ-J_nN{2;51@0!xd_z5|mn|(wMESfCKroe&ol#lh|vg}#%WC7{VXtVNqpCQlN z;ASQK6Hjei+=C#8ST`TH{y6QFuj6<7l+~q2@frJzx8ZA)Uy#+wwp0GSqF;gAp^shQ z`d?`;=j_W0{u?DLtusy7C+>e6myb)|!tYpTo8*uB>?NPansmPL`Xl>JxoY@)7dgmW zs=d9*voik_pZ0_K#q@J7d;IHkU;e|k`FBMyTR`cJ;&R8vw_D zu|9rW+5%pMUhS@YBlLFp2J6CS{LCM!r&2zoXO-_EkH_!nq2~LE`B&swpFbk@oju(9 z{*it7bLo_PWuXt3x%c4@~SNK>L&F&m2hux znei-h4%j>kdES+8B+qv6`O7roPM*nY74$dQ+ttn=Z?mq<#>caHmPNnb;(w`vClAB> z$m{!l{w@74cVF;A_3q_<;>QJUZ-DoK!)dYspZ(3;T3Ou7%w&!UIn*S-8m|`qS7j#Z z%Shw}y_K2Q#m$UvGK0l!7Cr*)mRUgpZ)eenhFM3sk!7kUjI(#G{3_}MdatAu}}mGWUfKVn?=mOrC@7WX$7@-T*cZsm^;))fw~A>XijzlJbflVikv`~r7>E&Xg=3LI{j{zku=ikB|W;+2Gk>7V#+!QXrf zd7jA6+&-&hZ}&)Z@?q()w>#S{^iON_=QHmRmi)e){k+KhohuHnPBv2cndq_ZYzwDC zKl>;@f`4fTco}+iApEO>52MnaSs#4PNjDVwe=Pq+AD6DM4g^18FSp`{JS**$ZN|TM8-C*6*UaI*J9c)pRa@ST zEcHFlVjk5sO0S|rTJ`kN-#lxMPnqc0kMX{61Gv=6JZ+iPE#^_5Y=!yWQ~AExO#ZP} z{vLKt-|Q>&UP z@KX;ye~K<|LDmlcKX~7}c0oTJ4p*Y@gzui@f3kXl|1;oMWzwA=q=9-?!smKf#9>?x zhimwSj?2okar}PQt3Tv_1N&H1rTc|Vv+?-3H!CBbhWyW~%->G#tyE_22b3PyZs^0C 
z*&oKet9vaq*+6>xj&=*&fWL8n?ltr><-g5-EA(MmRx&QF_&plX&u{see=qc>9KMEM z<*BSqwkz~V;a|wDDco)a2R_I5w&dJ^y&2~M)%NS{>1UP)|I16A^4jpBk^3pj$#bB4 zF3qy(=Ean}j`D%e%e6aDTwW{XzYxFiXR+Q*QGXk_(Iu}({%;oa=+`lDRy4bF!_Lar zWha*vay~X+i{9P2V&`lqe*TYsF*~|RGMqoVjrqGAy>U_gGIUvE{k}JQ?;Y=3{bl@` z^Gm-+J`==8cgfnZ-)@G(VW-`azi-}6fX^G%x1JxPJ$aTTuUR)*qTiWUvTuJcy^=T6 zuL|de9~eh@s`5r~@?-IJ89lEQSNEvj?+TxP_9tj%CR_|0n+jL=hk6Gk?Xs5E<=vgI$N9+C z&foS$pU?AGkG&yMCky*?8|C-qMZPdj?&yr_cP!wUK${q4kBe^$K-JzUHK8U4DCU&JY|TiZ>uwWSZ`h3-nuBGJ+N zs{eKJDKkI5ck?X#Hp|faps&#XGV8c3Df-)#zxvfQaC-v_?h9U|pY$W*f6CJ`zw05c zqboUxsDg`OZ?{ihFZirVJLy;ZqsZ@FZ#O$`@2Ol-QP!9FTP_x2fZPjZ31_O@kbA)Co=M+U5fI@rYq>xZtUr9#;=@z zqMz|^2%o=DkNpUG6@5B8?LiJrc)NnWcXB2AccnF@Yw;iby#wCIs9lu~^xZm!9{O|V z-}Fbd_l>yhC-f`U@fWQ>ktcL%+ME1SUa%eopWmcE<7Y?S{{hCaOY#N04_x|=9bAL{ zN7_02q~v}1Y?S#`pZ&ZZKim84=l3P}JgO*P&YtX;9Lmp8r9W}6yRT0#pTn!B{4g!D zDdg}9+zxs}qu0pz^>(kMb5@dnY(Unln7>oublA^*d~Z+xw)&es8$Hg;w(vdjj?Pg} z;O%}{uKZB(ckME#V(ta@DDd|1EX|s$=b&sLdH!BHG3#B>Pf@-K{-2Qb%sSB9v$B2# z{a^61Ej~}oMq78*mj0c!)j#K^?B=fQx{I>@aN}#|fScs`VAhztx_e)6Gq{wx&)udN zuh;!umP7vM0a@f1TAFRrlee<==snoiHGbEdU!y}I|F7Bc)ylt{^?`dom;O{w$p5G8 zSMBzL@13)W_Du;K4SF;C_|9qnw-%D)iwPc5XB1HLt+c_65C`b@O6)TAmE&kM6@B_ys->Dav1tUICvk&L`mS zo6qniJsM4u<|Gk2l)zp$Un&}-xsU7_9G#O1X%j_&>BHU3^fg9HEE3-=;@2kS(w zWH^1Q_W!HXUhK3VOGC4Wmud#b0yk<{8lq>-xbfLvrC;Fi_Jy8!`w#uFWE;5iOles` zkNG%`y%_J9)Fcz&^h)8<1ES^yzhWk$z}IIf^iWmoS9Mfc*#>qsBwIhhH( z-5Nb`_-Heu34A`*NlctCZ}T_(6u8_HuAW_#Zv_dzNk=8ui05cZPH|6gOu_%XPVmG2 zywuE#Ji3eRSYv;&2)(0zJ(i5lekuLPPM%6WP5k{Q`f!Xapxv_9OP}*^OepXt?B_no zDf}NJ$)TG6d0atX{x^LUi)lk?rR>=g(ZgT8seA@+-!FY#l7BqOMA3fZBaFZn6iX0k?!)Qp3yYoknRlaR9 zAAZNWBl;aZ@OcoqbWhGnhZOxiFRfqDoziBXIz_(vZ~7f@(`%jZc``Y7=A9bucV%$9 za~7ldl>U4rs@|z|L4N`6HZ?Dnrjy963_g#-X9M(=Yi-Ju%qq;lb{&d$J3yq9}e7X7I2vj_PPb-yrv-vl0SThLEIuSP#D3prQvcT6(B8u3T8 zf;;2j@M!#OCVpdNLBBeyHm}CHpEAby*>7tn_AmFt$9V7PKgtg^`yS4Q7vuX{Hq^Kb zabIPdYbuAIEF}CPeRYBi{Ut&h~M`7m5*a2{O>2;d9r%S#LaazF2lv;4I#f_;_Wu|>vFhR&VPGVzB&FIC)eUX zn0i6Xt@9p>P$zh&$o58Wjs2=!yy!z|o@|c1i2(XEM 
zx|tMq$eH+$c&!_f*2edwS&SZd>(vtLW{r4_sRjRwtrH#bA9%gN zY3L&?qu9R$J<7j{&-dxW3;G-VTBe@B=lA`7LwM(T5@TIk{x|(65wd@XhFK$%L)f>Q z=<@TxyEjX2hM~uKPh0kG7w3|)`_U6GbHLA7kJ_^^@+zV2sT-ag%vzKLu`4;*%zmH%QcJ9$jDs#GC* zWv23h&jabt9-?dJYPV7}&GI1LM|6T=R7Oo0Qy|&L`JTGU{q-&k6q-l*sQ=-;GAS|7hG{e)hRzL)er@Of2wEIwBA+65g)?8Ct42Jj^0 zQec z=ndG*JK}GU)%#GN_2G4_kActI7v*PW2OFol$pP98`r+Ah_&GE=I6K_;pS%ydZPBmO z&;z%R%l1(|70(fJmd6!;o#DVK*<5$PtR(xi?_%(_3QuW`@_RGt+O-WecHpWIy9z$b;B(L?@jm9@_F_-uWoVM@<}>W-(Qx<#J)fukjpCelE%2=c-0a5tuo)cw9FD~c zmoe_6*|~k>_vqs0RAYX{aq16yb+Y<t+oJEo+k9NIBfa~K{XCoAi@$}V)%-K3vQ%X2DqZ7y zyvwl2x)OHsd1gQjKimKC8SBnsc5cug5PAB0X*K(Kx4&_F6a97kykB|)KJP>h&EfOr z@a<$bf#w_6lGkTu8<)W0%?sRKB{C|J|9_o&V%_-yJ>E~OlkU#${ZRZy$dzWp#p-01 zJV~d}t1nBH=^XU%V@#o+^WDlhlU)9gHih!Mb_oW&%bNI=L>MWDf+AFS>{C(cJv+v{hRb~<2{|deZKbHYfERVC;ZX} z`M!hGjmzQf&!u(B2Y!~)L&$T6)8-5HGvk-uQ+r-gW!H8{4oPZf^XYk-G|P7TfAo&< zIq>%5(r2ZP*)B!9{j#(Cz85V$#D0p06fOo1Y?Yl%F9s`rhW0xf)t&IeKa`zBzeV+D z=g`Y$v@z-_gU9pzZfLSycDnIto9wIIklSI|3G`)9a)iIj`b`@*&7-rj2Cp{S>PUB?u%xdR@dlvOSjPDBgbBlIb+owOO z-N4&-^&@V3U7wwj+1q+QWq0(z=bhok7W~Oang^NpuXZo=a}9b|LYuyOk zTc@7haOsQeMEbswe`AkgUNnb89pUY7**}c?@1^GGVZT-7+ZXuUHow%o_`v&Cdy$Km zvXpP1?BrbUNbB-D?x*Zo@HrN4b%p}N@(Yyv!TI1O{oh32ncw*6Y^8a${onLM;B0xa zEgU}y9Yx_elmKzL%1t?6ZZ7`lhWyQ;|4rOKIZFA7aQZO(^6cV2;!Ix4_ch)#;qxKn zQ7Nu@FLG$^9l||~;~aMLe)O=8`zU*pTMPI+hyL#)ZaQ$bDf@YU;~V%q-}f`k=#!LR z#o{|!J=OlQYk@y2-Sn3&tkftu)O>BD1;9oy~JsM69g2R`X0b%#vtemVr$-XpsT=Gci z>Ll!uY8Lom^x<&7yV5@Bm(p#%N8I(|G9+{*E!%##>orVejoPI7Rp;g#YZpH@3-ZDj(VD~qYo$lIij^MGG5C{2lyWK z9|uoslHK_|{^PTjdxpms{G5m0g8m$m-fY}UrE}6_3jWVWZ)sevfZIPgx4n}68};0V zkH+TpHR^9m%Wh}Sh8^{CdOLe$7{A9q;qwpd=i}gN#BcnUzKrE(K3h9Ed_KlJXd;T{ zI(oa6D4;9hfqabW5Bm4%73Pa~J<_XvX7G3u|7muGao=7v{58fW{LhDJ*GuI-z2eN< zwuHW|Dm8;cn{fLu@;xYNo&7KSwe)JK1A8{)vsreR&p!MZm(rVeO8v8A3OQ`8T*Q@i z&Hm}!GiNU!UC?*VZdG6G8n4i<8@bt0MY{)NHujuTgJfyVGOfVdTTSCOZzjNpgF30r|Lp51%`U z>$n3w^y<;Y<|Tk+OK~3;(XV>q zFM`hs@f#=U|0MCy=aSQg(%PbYh4;G8vJQ=g&(W{-r99uHpH=XyHTkc}u7U>{T;4S9 
zHE=V|2b<=%`rUWV2cumls`)?gIh8$qiS|E(&mm`;ke{GGJMmB7qP{^U*0q>m*m&cw_*Gs zmzb{&y#E^QmWzkG-nb0+d9i-WMD;iSw-@Cnct_^HB9gau&7qnYOZa3!VnP)3X zkEOvhgtH~*r+E%}v>OfJnPD1R+}nrQb% z8e6H|qqRFcc|x9za|`-CGU8Sw`?@b@vFC{vz#tUV6OXzlNXV z3ia>p{`a-|^St-FPa((V{25o87h`2xx`qG$^U{||$hWyXNY^Pp$Nla*ts^g$epFA) zmyK?0iE0%`cMbVG&;J~9v(lyynAh!{8{Vwn&w9n?MEq3nH~$kKM<%t?2kGq!dHPS+ zzIRH~YxU~@evuoL`?@qbJx4ti{LeSg`<>zQZOZdfr56=|IbHn2A>Zt=&RVzo9LIQ zH%8@B^_5EBv7gT`>c2jHL;YR&J#MEb&zEk*PvGHW=}P|A&gc(VPoIL%=b($^NI#^1 zqxhrmvo62n9``u~Ki{S+l<)5r(F0k3Ta|itJzHc>q84ee`;*)9_7vR%Q!|`BIf#%TKbR)E(XhJ`Bi2Ai&#B z+1=*Z?WMlT$GR{x`@niSi2eMu`X6*Ub8bQ3HGA7WyszD5^y{V4H0?(D1C*}=SGUZT zWaE-2OM%Bx|54cw%C}%=-<3JzE*++StbK!E#mus@6SFHsn9KD_dyXTZ`|wAe%xyk7zgI!rsa5ppToo zXLwsSnw|5Gbs*@UWw)3YJ0vgRC-OP0%g$5II+)$$?_M&pF1bz5MHcnQz@6V~e7%^Q z-M~-sK0Enb<;4l0M||(k+V2I2d&9*j-z|RxUxVedyWKkbnRij5em8XT`~1DPIE_o$ zC#%@WQI8uv`Ca;DWy`Np&+nye)f4jBK7ZWjSbmS|tjq6-uM2%H^RD2-K6ixMi_E(h zN(UC@`StVr)jylR<39859r1O6KTY88OUjRQKV>modbf0$cEfLTW&VotBiPgbqgTuL zH#U|3U;dWgjr8|>(4Uac`Try8F2L?6l0Q&S&%|B7xVyW%lLR6JNFYEGG`PDv1eb;2 z?u)y-v$)IR8VJE5A({Ku+5f%w;o&-@8+u zjo1gu_f=d04|bwwKV%&z&&Bcj)dU`2rC*ET`d#q+t@0Z<9p%3kxA|Ny&(bC2?eB7J zajW*?W{Fof%%lze;!I{!^D&vL)%ABpc3ruj@mfb_cY(^&*j5^{>)elrdy$t$o?U3& zsYM5k^OGlx=^38uYHu-<{vfk4>l5!iH9Sg>XCW__>EG4X;34p%ZCYQxNI#pYy``)i zPuc5ERz6Vqo66_G{(X$YpXwQ79iEXklBX%F;s#O%I}^?HT^aXAI!{|Gi=WIA>iJpusdV-W$kA5H zJ9>Y!jmE%nL-paqwt{dsuPI}3j_owWj)t#HIE8kImJM6pw7Z#`A zD&_Zi{!8^dX2rXhK6QQiUGD5^^t?dnn^w6+`>XPn z^gW3YvFN#D{C*O;W}Cm)Di3{ldb*U4%`9Gv?ofqq*UFw6@sWR#`;&sYI|{c5c)dk*SB(dze8UQFMGKF%9+M-C^{~67QD**sA(3o&2Dr1 zk2_=Z>k*P1Hx+e-W>t7MLOoOQp%PtVH1aj=i{7rPxaJbzy4cn*@kjf@eldWBVpGq$WR%x16rW{a4BA|8+T~_z68S19-l<{xvBI+QR$l3HcGRTfB}RGJ_WD z@b^{rjWx<^qvy{uvbv4;Ia9v+9hArYg42q>=$5r3uM#m`38d-8qhLhZGyiP1Ju*@w-#Ph)=oPDW!}-=vkuv`@LTe`dx+o$m4!~ z@dx>PvwYd}k;nWr-`i{dCiTry&r8M6mHD!-&k5Rpw8*U*ua}<`KPqn`Pxa+&3p4mn z7uhzRUsHUb{$@CN#fJ6eYijCd7Oe8@88EsniM_2{f}WcG8*e^D4y({l2Kn~^>>T~@ 
zuCsr~jj(aQ&J?^kj+PvE6bv+u-OZEko|@vBYiZSUVE0I61v8T(7@4o(bI!43q%I3S?S`nw5f5PaDmvaXmAidZI{N{WjyFAJG{9-TN3~iH=Q~K0M_Pq6M;XSyD zZE%WVe#5DesbTgSI!+_^4d7%G<=gYB8O6iv8@xST-BaOAOZA@<_@$op>NyR+H-#hN zJ#%=f&34>@-pvOYpLT|)v8x%PPg7GD{>-)TVJ&;|YVy%ZTnjH;9%pk?16=>vJ>9=M z?UF$n?X|UWsQLz?{Yam!jAb`?{+0dp73gT@`%&JD{ckJp{bd9r`b<=0c~Z)?PA`J5 zA6qXYI<^|jjI0d>X{wh#wH{3u<J&();(`kvkfXg*-cuVrxNk5NH-LuUqTIgH!?QhB%`nhEq z>#X!=dhkH=;u&5}KbTj;c#)36qi4&R`Bd}nhH^Yy?`yqj&sy_@`zrRNm9{r`rsS{C zH|FvCYr&`f`|?^^U3KRRd*<)UCOQFMo^!+UB+|s9y@{S|4`Ez(%6segTjk&R z#%9Zd<+HRrcZfQLse-4g%bJB7FUn`iH`E*XDi1^F?&PLnA!5wA*f-Xed~4{!5k@|V`(yBc&_9{aC(#h2u=9slT9 zNB(5r2);FBkN$$(bar0&JALI3`8R%o?~%{{FY?hOTT-mY_toXW@G$Z`><#b3|7I8X zJkt2cb7!8m6PNNjy>hB^#2Tb}O?j?%!~5a-;xc;Q32~pS^6Kms)p5CrIF*}~AI1;5 zfIB~yPiiOPJMJyMh5zHSM~aBv{S0o$dQ?R}UL&`sm9dERl!eB+5&3_9Ely?aK9gNK zP9NSae=h#Uw^qcyj`inSBkYb2{mQee;}n~uKds$`Bs-#z{&F(^BX6ZSXv*wD>)>2y z8>`e2S&DjjKewL74r8p+Pp!Ql)33L~X}RJ_)IOx;EB4Zs?ta*n_A28e>~6)*AQqI} zZxh%(-Ad3)9h+IP&&am&uP5b4YN(LgQq&NQ_qIWYs<%I57p?<=iAGXn0C+Fv!MfZOZbn@0e)a;@VP0f8ugg9V zw;6Rn=}PUyyxt?t)Y=8?8y~X)9gOZRvTAh8W8rB+&sK16G!DOpYc119@br7xgeN=q zO z%dl_vN`v)rTV9U6aq~&{ScNtZp_4h9j^^dwpVo43c}MDI70A`|vo*A97GAg?$YXpS z{k)rT-#?9ja!;^dbi$R7og2PvKFy%t_9b=BQ`2m?ad~-H`po`Iyho~QZ#Frr$|D-* zUV8pJ*6wZiiSsJb=0N)4`ttSih2*YLJno;|UV`?;+Rezr$LKkp#18lSr}}kdz7aIJ znSV$NKAD59JG8sBFuOY682zuwwXWPRy=QOo zp7`k5xbZf7NJp|(H*0AfdKC`*ozE~c-zd+{d(gj{iZ*znOz9ElZ2K7Xdz}Ny73W|3 zvr3+fvJu66<9lcMI%{4tBXD}Yr&FKm;`t`RyXWZ7a`%}-=fL+J$ahw>B7NQH(aESg z5!U`yaH>xcw;ivA!=ZiG$d0uxyb=5>x~b>yay@Hf=<}bl4y-VbCKho!+)rhLB2d-3 zQ`9r(tpxCBuJPzyRQVQhC^hw>8b6Np?!M~+$K^s$zB&AhdjxvG&$(vS)FN*4Y`~5j zZ^_$O)HfPAdq*?%%VQ2ll&>8mL>x*aEa#z6V zjz;yZ^6?_CJbzd5x&8ef+3iKltJlgG;CIyXUh%8-V>9?1xG4@xdHn0=;#!z_M)rBp z9MzwczZ7!zpeJQBdAporB^qTN`NR#JuUpqT8o~GMMkGg*$Fd@GF^vrfaQ&+vD4I5UP&#tFw}=EB5PO=%YJZktS*3 zC-&!|`q2~muz9NQY}7Qtho$CX%-Hbd*pEH=1}j7I%brlbuksFbk_%WLp7z|&R?>rE z%Q&@+SL-=y|5*7KdbLt7Hq+LQetE#Wscn9KZY6pMk4KZUg+}9d@?4}YG$Ivruo57g72?k*(dtYI3JPM 
zF06N(@JHznZ_k5UGmZLFW$*N&HSA*8QH#yxZhjK|1AWTAaBmi_U#vHC(n%4|XSN>? z?+4?+cy{uk){tJJKFjhu`8an^qsXg_w&=PEr=CQ|YCF})#u$12H>D@P!#?&nI&-`^ zf?gg`r4v(Po7&Ae-WXcQuk6g>Q{T>c=r?7Dbdz(r;n}_V^{91c9Zlv~JR6c7=U$VL zlU4NRKiOO6i4VNMY?xYp2ZtM2Y5$Yft7V+*C1>F8g7UYthgtNj@{Dwjbj~;n|K2V6 zWk93GqK?*?Ka=Ijn?t*PfFH(EQuRUkZ2HZ*KVP<&ZOF@e{Mh&|l=4fU|1=ul3uz;q zTT`BtFU-yNSgsW<^jV5nfM!l5cFF$jT=?yDO4%<>5ncJG{mju$6BcLZm-m&Q zqzlA?7U^PpltJk*`tYRe?D9qVJwC9nI$vIr=VQ%DzvgnPmhZyzhS@Tjs&iPn{iKu? zgR^Vh%d`b6?kv*wD}3&fu4hLu>-EchvbZypHz+Q6uVyhhJCl4K4p+CQ4NTAfLw~-J z{dP9%4}aZ!D9t497ys2+`AzV7HcMq4@qgn;cr)jnVIO--T(dhA?66)_9_MBwSv`NG z**1WS)y&HeSo=HC7zWulxQh*bHZtB@v4eC|-iqCOx?ExF+%FYY5%-B{S61z;Q?cA? z{ZTo#=m(F#5m(qlEh1TpnBRfJb)eq_I2_|ILZs;Lz2v@?YNSqn^Gbd7;ca(#U(54x zn$wb64KsMe*V__(~@31*i)_fL5JOKuupeV!m-oQ5qUSRDL#RJ7r^gUW@jCF z>3X5Q1--cemSM}rN zYxWtjF8qe?jhx@T1-F~iAAiD^us`Q)+BNAN@47K?j2RHS&U)IdPG)OEkH2UuBiLS( zXB(q9w8xH>f5)x$?LGUQ^=7_4j#2)N{B(WQ8xd6Ru~!^TQ>xORIC+@ttyOgXU)VSH zz=4P^=uHx4n6K-~ImWD&b_e5*`(aT!2NM0ozbq=+*&(v zC(+&{^$xqBIIlel)^_#FR1_WI+1JeR_4L5=U{Y6a9;P>Ykd%_luVP`i1JcHQQAa92 z%fDNvzVNfFJ?UgmcV^+AB`5#ZYz;kF_ng;SXPPQs$O`p}Iy$Fe^sB--VSBTJ9_9D% zwAnokz~$O>)Zo56Wqun7duF6Tth6;@V_UuIOqXrQ#&H=tZa>ej=fBa?+SUobI?!D& zf!p0#3IBFJ*rCFm6SXrTodY)qv7$yE(eSmI#10<%+wuIBBWg>Al+?%bWBH~0UXDr^ z^Fww<$xCJq+>P?}c<>yFjO76Wna-k^B1_6o)Z_`91Rce3$sx(pPHa zOU&Cg)`wyE+(TT;kygm_oZIb3vkoyOLjs{u`|u7qi!_#>a=tSB&-p`5ZgJ-?h`s*lub( z9+eB_XZa}H#bX_hXsn2@dm~k|4_fLpeHdzgp$|_aogH}Xe@!1gh`*b=PUtXu=5dRG zn}YI==7e#f?|x02*)y*)qeeUZj=M4B5za;4vO=EXy{_iO?s;Up=q6utXmu@}X2-3j zec*Qo65Iyv4Pwpd=#+e(+064NuZh}G^x1~=^ZDM}O#kb8{yiGnB7PKc8~Tpf_xW7o z?w%a&$#DjU!+Iw}GaL?E=Rf(*dT~^_pqLDI)|6k<*N5PLpW+Pdo>LBlYfa(7Z}7hh zSsPI7-yQj}hgq^mlW()&+_!EkeI*Mc)qkYG?ab_Q+edeJ-Qf&+RpUp zskl{}zC0Ga@(UMZ@UMrw(UWKaU$buv&Kl#;_4+@Ao_vaRXQ+|5O+69+dv)<3JJ~mI zx-Kcp-8VQ*d28SAvakJ-eZ$!g&FcmI3Ov7|ctZJy@cB=e`hEFMF%g}u=-S@ULTKMz+=O zh4kf!foV(64Le?Snr>tT9|Er%;tKn#bs+G5Z*NC@?7nys_;(eIYOm!JU`W`Zb|3@O zVg22tw~ac_uHboZE5<=k_zCun$=;HG4PBkd_myOCLiR}cwO@y;;UT(WPn4Ypad$*> 
z?KIf9w-L|1*wcgbV3xL4myJ??^QtY5Z|v88?g5zXof`Zr!rOHKO>7?N3s3BJ>?GT; zcQ%4kMPs8hjeNGG`ww%+%5=7Gz8$PWtzmoI3wak_KB2wH zCbt%@4x>jm*6vfJ`wf&&gOej-cagTxqD&cZ?JD@TTk33{ZRdWgIeIl*mg<&pxU>8i zbmMGSI{##x6`ju>dzewUx>C;>^t|c%?Z%$u?kb$P1^y1q7Ku}t4Y)y+VUollD-1L+eD`LLB}U zdqU^P8WVUpf!-fpL7$e3^5fO^Q~6n1^}vmtSL`(O@rC-a(IMuon-Z zKb@ElF{(|S3&uQZ$4jagx%gJz{O;)O%bz1|>ub*&skeUAqMwGHIqbp{NMk#3fCD`L zYk7n3VLv@b{gFrG{Jfia*OE7q$Xqkwf_$c5&W}@-*UNe?`eNsvJ`!r@20#SzmBc#GnYGu8{qB9^7zjtCH?8w zb4l1L_wE@uoTsa- zJDb7bm(>&YmTQYw;rXAQ3+?PP_`HeUww1U4daLwj{OclW?uGJu{0YrF^LE@9{}~w{ zP0mtwfby2g7UFVvX3QjSq3g(9pkH0+jgkL8q;;|$we&q!^6ut9Pm4IOip^wj=hqZ8JR!(e;l zx7i-v4btKv6@(Z68u(^yI2>;EQNsz!%boqyrWKj`l3xA#!g z-bn9{{@Xnhw$2{<`~%Irxsg0nJ+I((C%8uo;FqyGK3Cc4jB_W;7Hk|5Wxd4N<*s<& zRM`T#PJgkdZVXK)I;Y%)6g6=^d9d0aEytubq-X(uy6s7CSM+V29bvR4q^5dz7(Cqz zU;Ej4k3m^oI(lzBHD>U7GOxF<9@%xQ%iX<72O#`uHB%T1M#oK({39V$|Y3t68YMn_Xa((td7Pi0qnm zJvYqwtS+BPajVcc*$^WOTrK|2_mP5?p5M}F_hPRw*Nx|Y(lti&=JId&+{FA{l@@4i zFnrzzUp}KhZ;X$VWfzFsxW>Wxi=pH3vK0isy2?Vejd5{KV?CM8KC(@IGHKw!md`eB zTdDs)a?frlPVc~qT-VLrOu;VkDZiHG=)NQ8-2^{}=9}T?H2xp0wQC;+kL&tACf|{q z^b?g8-bPQBJLDs^R*3UnUYQrW<%`JIcl?6PD&D2D&{y!63q7O@J$Vc5 zw?fBs{e&0rIqp|HIA4a|$m=}RT+P^3V!gA|&f~U%mi$KI)~UML^YFc~dM-y#G!_wvzkIGXYuP3y9d8U6eH*@J-j!M9cB^z zzOV6pgMZsbxGC-^Z!1RNIdLr0SubTi^L*y>0qeyIv+HB)#|ielU5eSHpx~FWO=ZT_ zgsXFmUpM$!lbjBNzcuw>P!W3gaQXSi_)F6&wkJPL=))&_ss&u%2Ay%Q%I^3kb7OH> zrQf59EzPtB+TYQ4o~4CYE_|3@Ow-G{;<a?%DVf_gjTF-cFp#a5Hx+ zpG)A)ALVK23%&Q$qNn+>8$0u+>i?`fzt~Q%Tgs!+32Hq?f1W~G>cQuowcCRK#;fiY z>6|XXhi#$f!^JKY`mQbBgxl@X<@EpU=+7@I4}JS4J zEKbr|TaDwMyWYyXn@?@*&f>n3!C83ab@j{^mD0hekA2Q0^|tnYthRB_;$%GPnUs``BPd(08zB+meU9l*TFL@%|P2qEq1R z0Ii&(K3OeT_V*&C@}HB*E!1|d_7|Y%dZ^Sy{U>>P7r)-(`BB=u1FFS7=wI4hKudp? 
zUOkWmKI8wEDE*h&cnsSAM(IeOA1HAk?eh_QbS@3zKr)y@H(+)1l_Munz zflG&a?`-YXk;i{aw2o$vh}$3c%9^Ls^{O5$--q?^vvPC_?NuMr3QyOB&%4uyzm&JG z1&XISpF2^x%yFoE2n8Fb@Y>jzJ>mrYW@}0_^{tA%AuL30u%c&fr0(IVp7CDhew1Na zA6)S*IoR%C4|-$7;hsvb9jnJXq$zOiF#2%#@QmcgH@&haIt1=@$M*+PoQU$vhmZZx zaUQ&FVbmW&U)($Mc-mO$*6bL^2E6!;^P*w0S8WPM`^zSHJlrnaqq3`U`=tCKEg?e- z;BQz*x~S(O{d}7Kd~k*DZ^P$~+I^bMzOQ}3-)S}}Tf!gu0H1T|@8_WMetC{I_g&_1 z{OM*Uevi(UEEQAoL-cnb+};un9h{{+q6?bPANTdQm$ELry^!MlWe~7oM==Te8Bk|{3gpPsy@204I z2(S6=wVq|q=5dPH+wT*7=Z{M-A4x{vRbS+JI6se_?!mnCyPGK+%bT$jpZn3Xe4ujVNpwF@N?y!HK0H5Pr<9PiaZ+@IuEV5pFU#=|n z@pfBrgae_;2kaqRL!h#}q1YGot+FeN)qKx-rPGT!P@|@MVfRpf*sIskliQ^;ij6)0 z6MQ~U@8m%#zT+R%Jl$A~#IJA5&+#Yf`O))@%)94{jr8JM_`DGPaXxsa{&j-SM;gt( zTE0j-ur5zbM2OQ<`?n-8%>?n;}S zBcI*H_*KV_c}xHQ8;-`lLdGws9Dba4YwujtBzk1{*VjsyqO&edWGidU1^n0MTgBZR zk}g-j+Ye~&*QvP+dqHEI=)guT>n?rxUseJ>NYr$FN7m+1)KTIRUPQ<#QO{ ziMan_1B=c%__d6_+gwdE(-EGgGoxdunQ{XBbPJ;I+j(z_^T>I6Q8PO$?S#AQ%H?UA z@z^rEDn+zeVy}*@@wM5d7pY|v-$$e7ANlt;B^yJs)$`64WJJW~xqLbT+`JuSdkKg!qMssD> zCSPDQR+qcwu^)DWYkoNCoFLobL8RxKa=&~}{Q5)u;MwG)Dcs%}7c=pJTNu5z{3il$ z*-rE2`dKU6J`bPNVWRzyHZCG})EE2vUEwjWa5#L4cB({Ot-$p*qO5j7&1m?2sd}p5 z?mpgYEY9!{PrD~GKbHJQJojAfbr#)qxN&OX`+gPqzbrpV>%Wz^siz)_FUz;q%ORqp zjwS_S4D!8D+Z8^?x;jrB%jV>KFLvjBjaPTik1{%kSI$9?^IlPeJU55?{mi7>`AMH@ zCQQ%j6i3hoek?!Hzj=0#FQDT%D_cAG8u*deFT(y=1HQ)2q&eK{W89h*p;ttF_mNhW zf%M^d=&*g3E{LRd;rk<;J&3Hn3RfdH(3daTUJx_kJI8INJTG!M7#FQU3C| zD$XMPGT~}B;_{YgH|`VL#Ajb4u%tM#!p9}vYvIjZ)YqB^*>e48ibDs%_0V6AEDo*U z_6fxj{1^nEPb9^g@vl4Dub;{<8u;9q-`xPxRxj<2o^8m>HO2l$w>|y2AXnqliP{Yh zvIjliNcq)8?8ke+=flb07v+Dn6Z*OQcKG|HxbTTay-D`1_hKLYQ_;yR=*YigYu^g- zg9rIuWp12>-o89N&d1kb#_}59JE-eQ?MDRnWl*srUS5V9aq@OKZg%zDwMH$@0k6cD z*rDA(3v1`!*Wg=yBXNOeyP)L~xZJ~tT$_zW&oq0WczzBXAFpJb=Lf520ep-8ZU;|> zm~R_Fw1GIz!NXBovIiqx&iEdi~saxTJMZnH{h}s-{y%; zJ@>f!!$NW;^f;cD{2*yxsQd#@w#J!l_Cc8t?vOFd2t>?}G;QCN&$QiyzJlBQrv$>ug1$X-zkL$D-`PWWKr>eD$^TF_r zAHbdw`bBGc_XSp$7VH@RGHYtX>CM#A$@kOMU*G2$m3j0i-0y|H-{{#N^MrXE9i5Ed 
zjpz;=+4t!N>j3*wWnFM@ko9LV`D&J5N6VSaPQ4k;;aUD6JCo2J><(v>(MjTQLPBnH zYr|o7O#hS{u@gQGmCfZ^bE65|KHjLcfyc+|XWZ`{_|rs|mB59roqF^)!Re zSKwpg%JXr5>!I2W{69aBTf#<*2S0^I;v6DBUp);g_6%P4aQoPbMe+jmNBrrf@Vqg4 zFVt=qbY7QVt{#>p_!}N(kK~8KyVl}bR`}bSJw2ke8na)VqP!VBIP|A!%C~2mWY6K=%ZI?RTO_pQ+Y}_r6-c8aqF{#M8a`@13ib zj33_BxX>IPUyIM}*`Lpc8-sk0`oq8ZHuBVtz2kh`Ys2n*bA_MVdOlETm*RN8|3!a3 zU9a2Ir;kxj>+Iy>NUN{>GH^HUc{$1Z;W4wKI0QdiuxtFw+nvP?Zbsw!HQNVn&r<)@ zMd)uGd=54{MyKQTW4iV5F?xSwVZOQ;Z~TU*ldMDY?7*HcPSvYc*?;NjZRvC`7Q52k zx@N1;QCB-ZC~pX)c=Xm+ne5AKON=$N$k(pp)=yW7eMY) ztiYiMJ9$qx8yCpu_o3c2^1cm)&yOk>aRQ&Gd1G!m-stw^%kiKdwGqE~hQE#Z=l%yJABnSHYhU&dxOyk~Y_9&0?=I{S53xeFW|uw-6%*;*2a}c2S+D#l(GeZlMd_*g)O5?&XXqn%Rk00>EIK^Ay)$)hTpO)+!*BXzy?9GRehAG*c{9)gk$v$|!dONavT#Re@ zoZnNSe@plt9^5;?+0IIq=66==**o9a_eF5}6x^%D|N3s_RqVrOs6Voc-D(fn8b05z z{zh>79KQ~xhu@*L_2otA41B&Y5B%uIetbKsKZMVB;%4LQE;!l{r|(2hoSWZ~-{R>i zcIb%c;+dFVquyHlLC)7o5BBSQ;O$1)cYYtQ=WpZ}82A4CH6A0c&E*wcK?>KFzo0YD zxz_vaP0ni2H^-{6M)5FuoN*TqYOfnSKZ`unVdr>+bdk{F38Od^USDdxYYv|uRLXO+ zcv^XT`tyxylfk5T6x}k46fYQOC!NZB8Trx0J!)?Ok6-Y%{djS!+QJ|DDSunAGryv| zv-s}IX_Z68C+aPnlPWQ}ntCe7)QA(5(pluA4k{r&t}^vMc}F?>*U{&&K81 z@c1adPh`KiTkT!>)h#kwU(>IT!j&oFQ?5Y|`)4u5tecls>Q_(oJ#HO}^SLXEO-R$Q zbWL%RZ#CrOxKe$s*)h(k^zVJ^Md&x*T6bc;tU`zAR^Kl*i(0ssFxIKae0_=Wj7&tg zz|p~E>s9Z!_UtwO?~V3b)zj5G&*^_J6kUt!;bC)wZ((D-)98+aSQi=3p8kKDoHt^- zxdAQ?^6!ht_cmx=ZJxcwkM3Nh6Up9b@as4={iGk?(U{IOZj)W%D8HdpDlj5zR$ z^)35c&koJ*X7^Z${@(QHkgF~{2#%%AZ3%ZT)tliy@5R+_?AO=?J`cT;ihxAetRlYFgi*Ww-^sJ{Ez}4q`)^L9KC^`!`{i5gV!|i8`Z8iRAp%0C8>srWRGkW;V z_}h`6-Bn~Y{N!Htd{cfJ*W%V+;#O|=>lpDdr~912zwSaK%4 zgK*8g#d+AZcHy5FmMFTr{d?T9)<56Y+jHUb^Y}{Q=+AKv{ZMv?@M#~CKY{v|{NG;G zn^om(dT4#m&B&kmzw%d%{MvH6{08XOAAY}JT-u2H+utnQS$3>@JsC68hSf4F zt|PZ%KH+lgA z??oNoW6!lzvaI+BAKhO{PmgoYg~em~$v+V;H%8YaxIA5pd#OLxseRyF*o~IKcTuSB z8+^3Fhr^2d^ds`t-2wAth^ydvEBWj8!Cz}V{0-lT>+K5~qF_aF0;~Gv@cLh*dM5w6 z7|)jc@pdw=Tc$^f3ytGQ_oBR@UgzWV;-x8vG2 zK3`OSEB`U_|Zq9y?lua;KuJ->*Nc5%s`iDF;A5eFkD&C=~ed2AuyW!CHt&H2;^r5GmcUwUMpZoeg z8y!p1C*+IXp*-}L8R?_oDSZCe^9|TWlIR 
zefFdcJfgg-5qVX8agK1G^sWS<_sxSQ?9uo8%$@&TWxRIw`&-&=DgWqA z`ukbgG{4H~wTa*FS4&TR*k42CwPoY{Xe;F|{O;}`t@ICeqxrOgCi%JA8Q`{?hf&`~ zexw)4dQ*Dyo!*w0Dt{T3Jk9g3w9_7v@mzDCFZk^1^HS^CH2xWn;%`m&cgJ|}=gfbq%%f%bd*pbX`>j4y-WooC zqI?)V`6X0MfXiRuPjh}6A1m)GAJXe)B|qJK6*`;nw_fVko$1Bzkfx^mvcJ$yQ~n>n z@TC2K-`w9&W_9_BcIs;Psk{bxeUIJZNAmEi`v>1L>RtF{42g9*TWuZafX>zVyL#;e zF8|!@@9F!UW?RY6`4jRVdiAZiILCS5iJKoWk1T+;9mzC#E*dzM&Mig>F$>wm4`?B9~FG==K0u1_k)k&U)DBTPb%YP zjxW94lg9I9Qzx>+L4EbDr6O8QRr1m0Z-O@t-S@W~c z@M{uH;}x^%1nAc_^~er|&r#0=TJlR;KU+;3w0aJF{@nPFX1%yYeRsm+uhI2nc{2Kf zp1sVKU1_Noz}wb(eRcMQ*6oY*_}T0jR)NoP~P% zwbG5`)(I@TbAN9%fKy?w=$T&e*Ulj%u8e{!o9|FaFYg zT@-x-51rioU-{N)t?}VW?E6mozc{V)RN(eM-l~!>)rujC@{{STk(h%jI0{=)Cf${Os* zzbmiH-+4V5%f}H3u`b#Hs6YJqKcRQWezHIN>=4pYJ^xPqE$GRg(mytot?b_y;b-0aTjf^Z zd^Nsz$y()?kpEt8H~R{m?&-*XHV<=gcn_0WnV`L2$H+{{ztlS4|NITyk^2em#Xe;| ze2jT9A3foB;)q$6e%LOZJNYYNuf5_*iWn>Crdh z!)*Q}Z<3yx{GoqFWlMICnkn?F^U)jk{+t8HBmUxQc-mUK*T79t^YD2MYUDZ3kM%iS zeBj$gnH5rboP+;6Z>PUsIv@N;J0gYPe{=OdlP|)p8~MlmW!~g)I&h*hKaTK*f1IBD zC^~xj{6@PS=-*eO{&4vGz4i;Z`@VL%!tEDvaU6a6GxD0~&7XO%6}!hL>UAqA937_q z7R5Kp<>Y}+ZOKsBwDPHq@2lk9#^rEBtPQLP<`xA<9t+Tz@ zI|6@N!{IN{F!20z z_&u0}Ukc|N!r|+R-@V;NzPlHp=UjHPCWGT+~zAdY#87M96xZ`|~PLLvwf4%H3;aP2uq27r1?v!Gk z#-kLm)#0HK_&yrq^f6QDM5#Z1%=diY&=mHI2{;_tW(JZIUIQucb}nR_OxGMjgIma7 zDV`5Kc`GtClD%RV&)3T?Vy~{1LJ!{y&h@5q@2ex##fR<2!=b0QPtC5>n>DQHf1&wR z=YxNurWY%40p(BN4e*22=xsUK%E~<2!sy%!jh@VY#`Vv|W!Ff7&pVkb5A%n4J^RJ` zt66^{<3@9qneA!k>uDU{>N7unZ3RC64uNNp_jl;ub4d7CN@D*~2R;t>yZgM=*jFw8 zC~vLx5A=(#1HCtJsc|YR^4tk-b->ql^yVQ(rb|lbX$qes@~{Igd=**;dvtaDXb4Yh z!&NuSrC)KYt8>A+mGZ4qNAI!ssdqNI7N`2YH|B5GA;oR%^U?~cGi$PavO zT4T2TDKEz=2_Bc#VV0 z5g|7QZpQv%oBR*$@vw*c5wW#Uy5cG=PSI_<0~k5Ny=$DN;#Gw(W!YiW&dZOK z-;82=c8&KduwJ!F2{*y{M7uGW2~qTcbvEs)%B}7zmBifUrk&~EqKyie98u+P@5m!I^3?2O)tud z9PVD!_8v<{Q7?rD#(d>b{~p?j$ml^u4mSp~YcwJc_2F|p&$|y&d-LoI_AMI2J@Jz2 ziTrlQ6)BA}uMbxqcz!mV?PmqQ!aA^42AjKzCf;u;f9Q+&G7dg}Mtkcl|4|e2(-t1z zNjkR`U-CVy6sM`Zu!B9pj?#m~eTx3yf&`& 
zPKDRi$$t(08!gSl7NJojS+7MJ)l5A|Vp%pvXA3CSAPuo{eqOdnW7tvUBEZLvIP)hd z<~M2&KaE^XyjTe_ zx~3`Yi1Xooe=~e0+?<=;3bzOH{CX7*_eRGwJza-8x5DXIH=g3--6MrwvMU@Nl3k9X z+WPw+IW60#hzvO=3;Xz&*@-N@1C)zDNiB?Lc!%sGdSfM|oQKobu^ok-IZnG*x&`WS zh#C2WpHg4wx5wkTBKFVEr3L9{*7u_LG0`Wju+k1t;AS6i)U~uDmq7~pWsruu z^^MOAxEv7@dw4$fhX<%9@)>O7`$nESCN&@*Rq}oO?9;P)2t?A|4MZ5unh zL(n~mbo~G2?~%rh_;YM;8B0rz`8@>4#Y z>RDf!u<+-}-3-}Bsaskn2h7|05&OOy>E9>&uV!9HKi%{K-y>tn!|BWHOk+Qoo>$Gx z&hl>B@5SD{&RN7!@=tt^Rs~LP=uRR_{o|lbHKG3=q zIayxOdwZ~R#C&SX&#p~nTu#YP*UFZ%pod&ac?}Q%_@d zje7VwLtINsa{jk?#DCyglkC1cS`mZeVXx^Kb{x)J~{uw=W`FDI}j&*1MsIDG) zXRQY_%+61&2N5OwUEW@KQ+^)x%+Gn^30ISz2HB73jD6SVc>^Aeli~Pkb6Cc&|D*pW z{U2doHZK0uPPeR4(Z%zTKeSewj_)m$w>K}^79GsH4)SW`aK(L(#Tv7=9{ss>sxlup zFH8nD#EJCdF8VpEXiE;=8UjZ{zZhCHAa5=CYqY79?@Ccg)cR)qXYq2w|pexq$`}PoFuoJ8}7pR{-l$`|gdcfUH z$i);{WFoF`bQV!|jkHhuqr#|`%1lA*e}?%=ucymbvJ)H;8+WM97(5$ z_Kg94-yrWs{WKK5bV~gzcr`k8HXobm)yOmpKl`Uv+LfD+J{;rRJ&jJI$;0&2Og-V( z9Qt$kdklxeVgOU8NZ-oR^4twTmwYDZiTQqZs!9>D zc!=^g^w-PNDDuhg3x4wA<9|0nKdZy%(Uo~|0i5UUl--)A~^F=>XQJrg;epuSiqM&w=6 zM9(jQ=fSrf^A7)4UlsZ7495plaBqM5ZzFVV3+J}L|9#XSc)lXP-l#5e4%pndC$~}b zsXTu!dvINHa!TF~KQ@#XT6Y3(@AEzK`yTH59QE9#{lKOB&=>RX!o00&%bZoh$x4*tt&l^+?VKy`rkwU0JHH&>r9pU*5sqeQ`78q z^^FPsRrr?CpCcP?3wifOr@=V1K|LeYS0Ap%{-qtf>4mN~aBhh4Yg4qV=tbk`y^(oo zC|v7`{z33G^voF*{Wh2HIp}HypQHXN_}nBF)}@q z-v{V_+ln0{>;y+^H~b4vDY~n#h5NY^e$-T6FAamcXKPP(C+B@N(A$h(T}{vR;-|4z z?S0^NBlEe1y!wq&WRkzNqCa<~N4JB2ZRovC(;qV8K7pRV&xf@m!!bOLOb`F?yBk=U zpI;SWx10gDo2SX<)2GEy?X-i(OO5Y|@{IOL?eX{5qQ7+{?zdV47Y-C@*vYuW&Ss+e ztIUwc$?Lnmk5&JFA~L(uZZ-J36&;#yp85NmSTmX$`r`a>B}}eQ6I^azHaMH2pRrTD zA0~#)IHPy1l|yi{`4UKcH5{9o`mhCW!@jY|zvKBS*^}z;od(ih7sKZrSxCdrQ06@H z@P!>fS9tQ0ywDTOk1lX|dK!{F!|u`9Jo>NP5yMiPR-T+)k&UCp%9(2&`Bs$8aP9V@ z_a0)Ms0p8c(@tGh>aoVXz4Bw6!&XbT)9YD*X~C0>dpj1ne?#cUSj$&u!|>%L?Z)$u z_-}hX{sz(x@ckVpcQL-t@TA*8ZLYw8Tisc|htT)wZ%zoO)6yHJ%s6+j`V7a{ZYcac z3;$yGa?{ozN*ap~ZLCV85?&jXzIfCP?oO!4;n*}Cznjv!r^4s9R)wLJ=clKcey5Rm zZYKGh0*}L9y=9th+~_Bs-`ITFB8~7|i)?wCMPA$>ktWeM#IeEMcz^%2DS1goV;bD* 
z4xjs0oAm?qJ`!&YjQT#e) zqq`M5b8R?2hThy%J0s;s8jUZV#CZq)&z5(&8TqW?KEhs}@4|1dW!m44CvZ6;XBxrD zu&=g(Ps7yTO?l+s?~+fdjPDfq)WW{b?WKNqGM|sHlrPMCrii;8o42B8G~~}QL;cOz zk!Pgv10S!x(Pqbzd=q#PaVDEo#%qf9HYQKYd=Gp(K7R`w;R!`PtW`4?`rp_M8?_r@OUJgYtJ4s(R^noLPy~0Q*bow zMwi3OjeOpdZ=}C*e@X|RZsg2I*ss)Kcb-t8=d-*QoUR9-`zm*irN2SvkJ=0S?x))8 zL;ioxhmwb8^yz-&x{kaV&AeALTbmE}Tub-CjwA;k%Y)Q7g;lc=eKqvt-o<2}FO@Bd z7Wh+*ePgiio!B#az#}(?7oFAL15Qt@;J^g<6mc4}i!SC-d-n=X)^3&az;62eYq<%$ z8?61gMT}Q-_Xv)L+n<*^6+PitBk|pnE4XnAdN$a1?L$u;tv?48firFRJxAsQc^K6f z_l2FRz82=$N_uzr_n)ht$TW2i`U2N)EJmrX9>0wd@J0MiMPKJPT1@b}n`F^nm@oGg z)9|6AIKo+pe(2tmq2y$P`&O#R(Sq!=Vl-|%&yQoO@9t}}4zz@ZzZW~=;$i$aHZkt& zuk3Qp`)XvPl?=s)-o7ucjB8HsZ3g4qtw=uJ05=ONddqq-0taN5plM#(+__s9 z9<-qcmJs%Q*2bP$zqGSF2ZOTBDtw4c`{$UgZ2T!Q*G$hARro)Itzyq?XJgVX?Wpxl zv-$8dV&5N!Z;SB0S=tmnErpZa%|qG5p}^2Ymq`=Y-*$swk$-7-CuL#F-ie+%SswL0 z*;R((!wKl0No&18JwuJ>$u!bF5OO6y=t*qWE7?01;NO+jl+NnE)cUd&Y`%j&8=2M~ zgKTlGdc9|N;-m38{O@fQdXWcl__N&w-*?5I|Ejl_`SrfY>d?bqgv^WW7=EJhb~6LN zwtj`((5Ye?$NpKD9@rLF=sc~$Q?Vk3~ zj+Go94<`Zpr_IdAus3gKY$2|8$HU1Z;bi#zo(tbQ!G}%KR`7)XC|nI3JvYrr9qmXC zhxhHw{}WU2C+s1UJinO!JSVL$e{=uf5_HHg0JnS4A0CD4jm^LN(G&RmavJPC8Hwrl zZL(p$&-cA$_O1Rkp@%$=9~1HGUAP{n9cwD(^?aXcel-&hIM{r)?xnfjlkd{HFkHQV z(8HsjwdlPO2i}0*7?}*4=2PJmPxgF{=YQkx(Z@b3htHGI)0Y3muBo~_HQnLoM0heR zkM)fI3p~us|4H=JF6Q_2d|`#ZJ1URg$HKu$X4K0oz;<#nC+&44Q% zvom}S#sAy#CF-f4y^_yP)wFkYKF$0fulcqWKEJ6v{GM;fH&KX$lU_a9I7S|h zklR-L>89dmC-(8s@z3Pd0kk5*+mOn3_=)rK~o$_Pr zRQS#QU2F_>_GORWTYtYPf46S94N2L7R-sGa>w%uHX2uM$K2^~ejy4K+k+{8#>pSev z^T~4?oL@u(JyjnTq@_+3$&7Y~D&Nem{=YDAeA->c#2(o!^oI9CXr|{{i-x8xt@}fG zgblZU8^SKVJlikHiIjGO!?JA|w;5KFS4iznc-9-9ui=j|%(%@{(_Psz=XawamAtBO zy9xbhAK3XV++N2*Gtj*FkK8^r$ooNd3NO*cAL7|D+I;-6JO=KLQS)_t>$;=q8gz@N z=U>+w?#^`9_l{lq8}#YLsh|4qhTCy3@~!Ob3!wJ1PRv@dpF9QmI%XqSc$(YBJ{5}XC??fJ&!|lzE!4@a+5YChrIR z_p^t`J*0P97n<2mJz3G$ThWsT!4uXn^mH;GUjD!J%N#f7&@0+*^5F zdh|fJF%J$Og^wS?;s2E%l@Ig$#{4*r@!W;~U-{*E+zil*pY$T~{YQDD=R;3^2_3bp zYb((k=Lq-YyH#-Y741dd`ao 
zhS;Z1FQzE(CqD2qRF8zyPQ>YB^NTTXa{|4%2f1qJ{=uEoIJmtWZiij?bo#NVBXNX_ z)Kit6f!$i-_MlkzTy7_dkH!vCi+n-aJfgcf;p@;isKtv6cDN#dCqa4`l0# zvyJnra)W&b>n0wogz_iD=i~4t^6JMvEpGBU&+HH1?;~L7>oTD13%8ql{${f@_8eQ& zH@=toa%=n^J;Z#PsSewEwTPGlc&0H(VB_ zsy7?M*QvZaCZ)({R3|Mqj@{t(6!JLVbCaxTqtrHs{i;9I-&s8)vkqxrax^e&n)bJD zwzn#+AP>#p^uGEhqY3@Dmyzz1j;qXzjoCSBz^9RZ?`<^4sweX5@0iAhB^S(rM~j!Y+L>oR)*s?_vL6nNC%{lwCS*q587C9ZrW`@#%_x$6RqI%k`rUKaD-{ z$$gseIrrWJaD13~{%kr(xt%h7Tb3O0g!{wQ+Uy&L!43IjQrIJVh+Bzzt#{}NoT*1I zt;s%IE4->!NAZ~Cw^wtx%vMu=-n2jxJ7>5z!yZ56<)pt%9 z`o{o%88zvTW5llV5AMC% z94-xKfBqv2Z?gT0!_#p1vz#6s@pNYvzZkVS^ys~fZ|=G272LkKct`J+z~_C8TT^+F zcGmw6aQb!9HJ3eOnfJS~cN|se@7?^E!@KysVxi|d(yy1QXB_?cY=4hpe?A(neNx_0 z3^op1(~l>S`rG+y{0mO}RK9N=>8wA06|#}i5`TuzgYZD+=v3RUOP*>^X-ACxDsjK+TIX#JWEx6j~9*Njf4#}B;hgTr0ZINxW}pU=W;D*(K*`^);J!_gtrujhx5|2}Xr@NfY$myT{cSTldm2#<&Xi4u{8)k3Y^QclLRi&u_~!(q8y5oPPYdUhc`i zd4GI}e51$cPmk;|eivc$x|?3!Uq2p6XM3-e{23#x7X#?ehbF6&bHY=MOHX#=cIN45 zesf2ovti`l$KSzlxw_}4^UFA#9@~=N-QoBucYOY{(U{Bbd=Po)#twZxTyUEjzl6Hk zcJ$~Y%%2XvUkVo*WS#g?)Q9I2;9fO&-6=m0el=sa7=;h3#R=|bd|R$ETrdJiqAE^{Wf}#x!W7Rtseg_>5-12@Qd6utHIj>;$=Tp)3gZpRCz&H2e!NwK3@f|2L%7=%U$vD&B{7ckGPn+!`pc8So_w7@cAsyx8$D@c(^tHjTPp3 zJ$m(Sm3BK7@8X+0=gRxg=BkR&yJyMIe*-EnV*j|vzHDvTAO3}Z!zOUAKFer_;=Htw zy=883EPgbPdtQy>?&4UEx3B#L4xeBg8`6(|Q_oiJT{$g{gNIwf#h@qroU7yKp4uH_ zJa@Nlgq`7>qM826#0$5>o^v$(5B>QndUaj=IJ!839$1SVdPn2Y+x@UTj9X`Mg&~L4 z`FSiNN2B0%oM$$sSMQg4q3lNX=*V08V(|oNI7dFZli*DFZ_F@X7l=dI#b~}PF6Auq z(FwHv20r{zoSxOdhp&sf@ZU{h#U%3G+P}iyzPTO2+2&nyGy7QmZ34@G#-9_kcriKp zihU#Wla443f5cBwdM3KQg3o)We&$J7U{7$$-zPf=-QUxncSX-UGdWL(s^L-0*(xby6t@oE?P#`ooRY~8VMolUb`2$g2Ds|>-P z<9J?$-@GONcXVobDgORi-UeSZUz#XYFESxX|H;S2CT){C#`_jUdLgnFiX?+Z5e z@I(I7xn>RH5V7AI;pX>t`r!v(JFWI@CtUqE`4PE6BMp09ixhs_a)70a@lgaIT#WNm z-UZ}g9^77O{MxY6p9T*HWaCn6^>1p_ZXn;aSffrizq-Na3FsW*`MBX}Z8=eS#6>P< zH?Ecq$ws6L$rU@2^`Slb58;oUk)uu0rR02Fxivft96k;X=jdOYuD~Dn^I9JwZ{hj; zOJX0oD;(a;3U@*}lV07)&M0nLoR2G);zM)#^I7QcY$sJ3jZM6Fk^VO37ai^Pqc1l_ 
zr##!~eB&E=H!dXqz2wQ*Prqsz?UnrLnzB#F8a)<1?`d4*M^Cp}gMa2bQ^ouf8CREkaoMn-5S=N1=*+R-`4$K-2-rFWj$)1*CppTlWs|)QPvX9gx^|^ zJkI-AAmL#*?Y4lgfqN73-^`bZ^x-qec^7_nhmlXaH~3!*-Y$Trkr!a2{A#~D38hyz zur6&$5AK3Lo96#ke-r+6VaFNn9`EavTZ!|_jC%`l-Ph`eQoI?iN4-6AQZ|?AZZh_gJv5@!dcwn+@VHTN zjrsa#*^>RWx927lKbxI9!Sh?q`^fWrD!J{PZCpfr#zOC1Layq_t8uOVR>9|<e7_y*tjcPG@JSPJRz4 z&M;3){?BpK#8mOzv(zW+W^oz+gq$6E&&s&p%HB~;|IS7K#&G^>^|ZjJdy25fM?Cl0 zDbD-urtgRS{MDjvs-mAfL{E-;5}z(Yeulu`S5SMQxRuM;yXx`F`=4C>B`$@HLVl#j zQ}}!T!%kBJ-!pg>(b!+goDlI}NoKIXXIaXP=RCh(W_*7uYtV>deC681?>kU-4Y^;< zzHw4ze8rZTKhxBG7J2+Zh8QRR^lCYSoSScDO}N$ke;7U=mWG(m!)SP$+v5-9X*J9} z8(W#tv5%O_{xK_S@A(VV^@6+g?jrx%!*nyffTBDtl6SC-}TA`B<49K>zPa zPL7oUKJ=wa(ceu3!wPsknEanZlZ$hNyU`;0z)Ak&@II_HH+bIuk7gMA`U~x_hp{g{ zz=JI0SROiZ8|Q@g8OMxO`)1{>!;?fk^XS*NqpJ};`N7J3`HeTdz z=P5N)XMEl2|KaH_;I}H4|AC*7?(XhxBn2r26i`sW?hX_KyD+fD!1fy0g{|0EMNIUf z*n)wHil~SR=l`CAzyG~2dA`p)dv<4MXFoe~YMDu$4^sci0*=a5MwijtXWPjGa3VPrwN6~+G-|KDoXy6=oPxTKMZ}(t1E+ewb%k^uItVfy|uaVgU#?j1yPm%BP z5cO9?pXs}|c=xS7tDgGs_ZiRa=-%*G(q80nZ1!B)-&3+z;j=v+{MFIFbF=5kt(o(J z7g~q=d+vIAElwtTy2PIctQ$3xuHvE>>X(}g)D!b`adr$jwQ#QR?sOQEsdJ^l;kJ$<8Zx7dRC+EuS<-5!NR$y0# z-)W`(j;FV4;eFerZ_<>VS(!g_1O9pSit71w))Obx@)l`A&$q}Qq@Njk`EKjGdqVTp z;#2hi|38J)+F8p)U(-|A2ugK)PT!#Bw=t&1VaW~NK8Oi?1SJv+(|DS%powJB1 zJk3mv_n@ocZs_4%EYmu?AJ-%wTQA)smb_f@`FL`KPdD}?Z&_b|aS!bCr9Q0SH_6!- z56L_{g0z;f03+Ys7wq_I=JhjhHg3TA7*d4)?Kl3NSl8Zz=es01f5)5Zbr-MwtnhPw zN-i_iQ`i9uoT;ClfZj#a;-V7AYNoHD*X7|qjJ|D|R8AkGhu*BxS4v#niXTIkBCFw3 zC0_MUqg{6~h|j3b-sq4%XP(OAn!Z`;+fHy1YT504o;)k^1igWu26la8A8hAjSo%bI z1p0pJ3H>}J-GN?AU_U=z+7HL7C;Yne>=)Kh|50v8ovL4R)f4UBY(K7=RrN%CMrO>b z)3?j=x7jbOuD|o);dt`7GkvQp|1$qgF=KGv@Cn0 zoG*2)4YK?2U&p@RvXVX? 
zj#Y(2ZL@pW7cKZf-d1lle(8PiKgc~R4;c66@0@?Cc@#6$TJ_OF~%I;oFZca3p^bYc6?4-O%dv@>+iM##sw@Q0ynEI=$zYn}7 zZSfoLpf?f!9P7*0&b2(n{;cW#!MDrx>tOa~=+CJ9A$U_&KBN~*eLI+DQF5KM32z{bo;Te1z zfAUY6p{zk8@DFD(sWEnY24S-!b{yxU4IVh5xZnzYm-q1UKw}T8|SRF zExY;ow43pogKl@+8|Ap)o$N-Q$B4UIS-xjI;Q5f- zqwMOM-Y;&mBQ@0i>BH>sz=sw56czO9X8jDibg2k}@K3I?4n-c$d*JUL?7h$QFZ%l< z#HwQ+yo`?-;=I;Envh#1@9p8&dRISpx9V+5{ztxz*vIjH)>53uE9PnH`^W031CM8E zer0@0Y%(T%7Md#P4z_r%Q})75MyV$^Q)YY-OX? zH+>Z@HL|<;VL6Wu=l`frZhgyoYV+=VqTbp@aW45x@V4@?etlM)2q%L+PrFXZ*;V+g zjBh_reL=rP`D*N({ggjef3HzK{JOWZ+pDYpEc^vdx<6dGT1G8!atQy(`{_yeU!1N$ zuZeyE`46&6zh1s?E>mB`GpyzZajJ~8SEPs1lkc=2_Q{&`B>eB>xliHVSMu1sQqog6 zQ^z{HOg|6MZVqRne68#$@@NL1S6jbUxnKLOlHN1BO#8i(-|=&VdEGqw0KJNMjmx#$ zO+3fD_}O58@bhw>hQFhF;>29`9y#!UXP@eiyixqi!DsJmh4+yP@c9$;z~>9mx8w(T zhrN{fzPbF7*}Ss+6!lN>UUMuwkM;2acpB&6W@aDjm-~R&*WtIh5Y9 z>acGki^Ff$znSJ;wftQ;-yZHpyy8@G(yyhntnV%0R`~0N<`<}EkbT4#)5Fb!zWJ(B z&xi0A*MLi%*~i-%k8%08_;W5kzmQ%GfZNZR*WzsQ|EZ@BJiW^J_LR@#CF4?uKl<~M z{}cH&VjQQy@tNv5HIMbZhVv+Aki%B|8xO*V=PwHi{hnkA5G#4*%9Y`B(T@Up$k4QR?mg_{Zx&zbo=(`t?ikN*=fx{^ql_ zJ5GE@ywBEUPk&SL^P6}SQS0aGpZ4%4`sKuV>ULe&{u>%~~lI0Tjf5{v~p0T64l-;m9&rRhl?yEZgfw--b}wO%YV!7Qs1h6Z@QM8D;uq!N_{lb*|5(y1!kaqg@1}Ge{r$OUoPAO9-;A|9|H0QNe|1(d+fn)V%YFrJhdy?I>l@Q<_SsjK{8vv_TW4yqPn`cYE*}@) z!S7gSYvoV->?WVb&*{m=>#yv4ZvCm(o4#Bk;h|0dZhV&QvNl0R^g9`_s%ZPegDcn{Jl6WUtQ|MrFqz&!`a_!%%k=8 z2fr@md}Yc1Q1_**wywBeDQ}pCUfrgCyAsX~t~8!C>;pE)LZ0{Ko5-^jeEu?xxRd|p zTa@%=?CnPOAMdoT%)rM>d6q@L-sOL32v44Z_mS6kMgATAukF0xMe5zx`NWS)-2MmN z2M(vnKltop=5CS2xy(%Fn2`(cr&;(2v|Gap z5_r24ji{Ejl^a>6D&^zNWi`B>j9x!$4<{z0hyQJ?9rsSG*)W^eh>R$f=JuL zpyi?7R3ksiQ8J4~-i+;&`RPIESK47;Rr~;ln`VK-@<|)l2jOSe%&8Lojb_S+{k+(? 
z>?eOll`PJ0-p|7r^0|XQLM#b;xO&zZ&TOVv6R&Zz_2mR{c07{uGW{R@3HaO+`rj^- zL*zeSOb&r3E8%a%*FVVPxf8wo7ET9#u2Q}`|H+D^W7ZK4|4hDN_kIImIwr@7`}hOy zt}lMKE(H!(OaG)_b;V29&f=DYYU$tjZo=PuJ9(bO&)hnz$KLLe=H$cDVQ;thw$LYS ztUsT*hp_N`ZT9m+=I^fJ@G53Qm7j_p>&_%N75dpj`62vEJHyM+t3%;mL-;T(?V9z# z=gjnYGwo=6m&w1eS#GbUPG+;KpHACmk?r=~ zw36Rz!{gV~KNGz{*2Vhgyr6zpNxo-iH?nHWyOE{7=UL37N=E4obV#e79{QVS4e%)w z9s4ov7p@AInwh6fvr6SW>XChAzIRo=XEv38teM}#&gq$bjovM3pVg;N%Zh#_y^45^ z&-9}|Eo)DnKNSh?K zux_?Beimgl$fp|r^BVKFopUSoGUo$|XSEyp@J{xZaqs9{OXI8`yIiYO$pmG4I?(lJ@4c z7^3}ATA2zL1IH%9)q|nlVM(j3sdafTJM6JPvYq|61JO_B`K!#{5UG=e{kgsJ2l66c z6g&6yb}s(IKXzmCPTB@e&a=Z)-#UJ;2-ncV2a?y}-ihKnTALRQ;r5`C{*eB*W39iW z-h>`L%mW$ydW2uZF0WJDb+ccJ59NjKNX{bB(R-@@ZStvMe*EC%S@>;MqW3`mO8;wE z$7M;;-@5$OZ=`|S|FGb`;6+-e9})jkJFVflF5)^ml7omUxES_!>-6oC&xUC`{c4?b zPFrPRpSMomDskIs+Uz^|Cx^iAppWJcaQam|NA$mDvLpP9{%*_uZjwy#`!MUp_w47O zSA(;);Lafa=mGRZMxL}oS^oI+D|)pjd%BbHtIa>r%lKD=&tIs=`v`hN`gBg(g&b<} zb_IREWC8m3#m~j{_>cbH1Mg$hu1Wj(ZXH7p{kivF`qSF`R$TUT`W5T=tJa^$6Z%%# zo%~Z?ur4K^-=@FgXFBhHU*p&z`2yYtE`84qZjAm{+CKZFaKC&u%>1gte*O(V(|q>w zd;y<_mE~)*C)*}R@^duQpE%du)2Exy;Z(ir zH|ti;--&QK?B^c7x2As+{-w`AkNvW3eUH4OGu0D#dr+1uzoq!QR+(Kf=K{NwczaZq zW)0MHSk{j`Hxwsj-Anpu$~T1nCuUu&`F5O<7C*vv107 z?#QmYIO_v9zOfIuS)Na1HOZ^9`vrT$rPO)u7Ug)o?RQxY`Jek{kzeSoY_p!co3%#o z!oF_ox$gWLZA$rn!;WvH{Cimsxc7VUw|YYU>$3IQ?FHZ4XQRD0C2%z8^}WZphcn-( zzYDzGg8dotY{-so=l!m3K1@3+i?(?y@~tc0c`f{$AkSU*Ec}*};am9aNAMeTU?=p- z2U<@>$mKB)oU}r3N6)v`ULEs&N=dH@pX-xnw`6?YHj8+aQ}h0%J{-r+tq;8pC~>uQ zN#DY{c?mqNoebuW?!g}T13nKd%U_OO2R>htkHX)##dYvI^!!%z#!&0dd`tGgn&Os{ z|9b4|ppTYsv{&{|@oC|HLG!u^d`|E^n!luvb+D;?qSfg~BmRqmK9A*R?m+MVU_aMD zZ!E9qSK8fETwZhI=-f}<*za{{aNwVF;cldFW1ZL{8BCuV`TtL8H+I^u#g-xUI8|#UF5ZTB#@Q{zIRYY!7$-TdXYUF&{^=7vmn2#>pr+z1lgHw)lxm zdTaT)u7(iJGNWTf99hyQva1h*qWkjXwWSXS*qI5u-3~o)_*gTe7JNS5PE71CZ~rg- zG`QRpuAWntZw3j!O@}4diRY+GPH|3ftCIgkcJRagyv)ptJh~5iV~zKVhtS*V*E7lR z?2qDCcJf5>spa=|^x-&JKs#k`7oYQQj4JUb?B^cIY5X5U$)OSd^T?9E>R-O(SywgCrQ!?g`IG^d)e&u6glR 
zI+omOz~^E3tjd0FPXBhcdX6)HdL>`0C+7S2X*2VDy3rYu{iwg|%JS9Va&z>l?C3$+ zZ|-fZl#MOxsRK{i@PpNY*Bc6w&vq*7sSEewcC+Dd>~(oUnrG9{7|zDMvpe7??#rE(ML(+e z>_YxqI$s#i*Mi5BO8RN&jnL1?LeBO0JI0z{)%hcu!JUzCcsPD~i{BVp(yz@LnO7s7 zPua@&dEeGfyuUmKALG8G|0utu+4od7xE$X%vn`FwK<8Dq&ju&C{lTGS`46lEO_M&( zuf%6>eu$7;HSu-LvrfK$u3QuH_**-{&#zgXtef@p$81}8va!hW;ia4#uq)&K_bT~# z>slgSV?^1nh9!M>eu*8jzlvYrbN^DWnzENW!0nE{ucHrx;bh>$2zF(A^ilcNZWM3> zaK4Rp<>fBvyTaRk*?a7P$=ZqWI?ng{~FBka)Yz{kj}(*5<#xCLe?Un#uL}4>{b59{7BR_7dxE;B?6Ow!9Pi zC~+J^(f?sThkgV;cP7Wt?pqn`eY2l?RQi_l_bug{s^?AL<6iamea3m!pWt`om3_nd z(vThUYd!$~^+`6?vqK?gd`3LSMD;ZGgp35QFy7F*I47x?jMd-D zX5Mz{kCR7@(2u7PLrXiVzli=)&iofWM7y<2c7~6u;PddZUt`tZi??^6?*riQWOx>K zwG+@;;PZZ-Z%qydXtzEbi;U`l&nKwA3NDW;>45;7>1UcrVTa7cf5dCuk~BBIrzEGE zX&tN_UVwO?7zXUzXzk|;e^kIqq zM!#07C-8ZN=l==sJWpb*YpedHuM;8rmuQ%uWpW7nb~9am9(eao;ba(k?Dw=}?{=^+ zDZ3v%aWe<}jP9QJS?|Xmgmu^>%%V_pyBl>e1m>f)NG z=-=4nj`U=5{ZxLWN$Rg*7kI}KpI=Xp@&5H|@pf50@Oey$&o8EP$bD1sm3pI||D%7K z&bB^&S*$~^Oy3Lr4}4ye9*>Woc zmhDPUI>F=Z%W=FG{>C}x1=$Sio_lJtsU>}pdYZzed$aB6MML;J)%>c%J|02uN4lST zW;ViT z|CS$-ch%p(=PB71?BZ?n&Heg6`oZw<9qa54@IG*Oc0Lv#xof?koAM0iT0Dnjd;6{anX>o?P02Z=yF*{sEtHuj=xAXYy>we%_XxCpuR+k^SRb zTfU?A8pG%Q=EdIj4~Lm|E%~E&z-Q#I+Zp{LcJfZ_wD{j%Wj`9S=ej|+-`&^~c^PUY zd-@E!dN>?DQO^%h|0Z$HdzARr1a5ZXedrB`KZj#+!)1*7aCUAF`8_%~IaQNiaisdg zULB|Y?xK=M^0c+}el!I=#%m(K!U4PvyDHzs>Tm!}4{ytMNxu@Gk4W~TzdyQXaCdTV zl1xJ1pSO8VGM(Oi#(thb@5SH3(MJ3;r?XUK>ng7EJ?>?A$hs1C^7&>!V}7=~@fqvR z!|dFkKQ8igL-7;)dCz}wdo%rQ{H!RJ!RKAbp#gjz1K&<{66j>(TDW~?hH(iT9#i7> z8j(?n{Qqm!6YI_w=y88y#dI%r?}y?yLasC$E;dSbk|*gjdi7;dFP(`VevEDC=gCgx z%q5qeGXi1!hYT(y_{bBQk-1Y6ZpIf`T_P2=fi>Z?jPJ+ zyEXqm{hIUwIR7sHMvQL_{^gVXzIAeAdKS4iDlH*Eq3(YCH>pHs3-i=hxxv<-HprP?eE1}BRu$&(eN-qq+-z_=mDn=(ni;>^yo|pEgE4HQ@2dp4&3nF+0=v zv`h}vZpdv`b|QW0pB(LXS-)xH=6Q5>b^`hU@fb&|zk|5u&H8qA7X2MUJI^$3b;Zqv zoNMxL>_bns5~p!0d*&zmgZq^AKZWl)@aJ~zH20qVv~~k;-_wsc?R8^zTISu>{V98) z2R`oxH@4+Zo^2jv?!Vfr)X$&MI~vEAtUH6@$S>B7z`eEV=?<5^$WEf~oA@{OF6Tu9 zIMf#2{+XR?+&2^r(8GRfm`^M5xn+KtdGUe!t@b4sH)Sc`I@!g(-fZjga_3X_Dft`$ 
zx7tI2LHUKs{b+x1v;J?U@5XO@Y_{4wn)WaK2sm3inFPmAfrmdA2gA$g?{WE|^k<^D zyp!pL`!Ms}%62dDd~NbPG(QzR?jhU*y_R!@VFwJ7$L=irRCK<3vp(OJAFrNK?BUbt zO*Q`J(EnP_pB$t7XgEC!zdXD6k2sSz^8=0dRQP-ZdDIivye~O4a1Y_$#&IUQ`5<~& z(Rq~p$gK%{o=N}r7dIU^TbKQOu<;FiKH2xVX7nk_uVL{WtDZ)F*`vgt)lT}$7FJYG zjx=9eXnAkz=qWVnLj77J57Lq4xHLxZjNT!cP2b{1i`nS>qRDQ?W`#i#MH52|CX@{H?&A+i0fskXNdHdZ2M} zGjJN^Plo4@YX66#Q+hOgup^&_9-QQK5J@|YSKIa#moHIP3eEyOBJO{2u{Kn1nWduL-IoiqL^Ks@uEm1T#(Ax>3fEK_5`54t7^dHhI%oq21 zq}Te);PGbu)9ec4K20?Ib;c+B&$G1argEQdaprA%l)kMg>cgSUxIK$}4@;V7cW3L1 zH;Ojw*^p1~>^`48_%SY{H_MAY*>R;Dwo@+R$~tCO+V{-a%g2`V-LgB>7jKPMXxE9{ z?3l9ML$ZbBG|=1CrQ}_M7S1l^b7XdtaT~(QIgh=W!Q-P!d0v#=;P)-z`Z?J^{k=fD zAEdC<+tRsHo6#XA79(o=*{Zsr>mapT! z?z62!!{KxE>$f7$H|u9Z_|=^Jf6lIf2N_)6JnoI*X6z5v&F}Es_x1;)T|27zx%iyO zp1xH3pTXylGfl`()Sq4Wr*Bu^-}VhduiPX8Pg|;gQa;Z-TJ8K`)Kgph#{9CLaqvI< zs0X6QJUU*z^^!@>p`3^RI^vf%kISX`4fJggKggx#YgPBZM!U7e!`*0H2K&53zh$EO zm;bxU@}u0NaxwX>E$)KjQGSVbYmnQm`T6GASH-jH3Aw#xJ!oQH-0Sz&#^r7D30!*D zb1k6KYx*7UN1s@i0-x9BXZn7i{ln;==s4vf&vRlVt3#^|c7!X-YmvZ@%{Y8c){H&* zMe(S8+n0FwPE}7MNOlgnorUv9eZDOt^|{)Oow{@2(Fs0pCeLNXV0aws@-}vsTd*@* z$&A(0eS%vl7yOSwcUsa6UPNw-itShi5q~~DIR)NMGxM(EH~X{L8IHBIzKw;~ab8YD zMz(c~hd0Ew?3=4-a8rJz*~w+uDCMumPc7}4cW`ikOv_VWd0{Wqq|)Zc;M<1Tvg zN^vWG0uP@_SM#^FM}ORU`T~4D4_zcj`XT)r#vlENb$O|C+~<}2e3yQud>^NX9?$w1 zuW!?H(W}GRx61M#TX*8Vn}3wA#;&Q9z36jmcJ;mRZDmo39T)wrlU+rghwxKBLa%?Z z8@`#(dfCvUa2VA<9>TtK+jrvysX3Mc{GNe@ynH@=e&;_ht6Di#f^% zUR{#?jE^MQC%ZA5o;+L3gUc~ruFKxaa(<2D(6_aYU!~oe>bX68%x5E7c1bqH{C(JG zHU5+PvipsDbMYM)X4{a@V(ms;*z)Wi`Z&fp!#lI#?40G+fuMhu-ELm&oGispth1lF7bWUvIIZ#|_ry_r=$RKG$%s;8Q-Q!|jL6yCub;WqE%6{4w><;O}_Eyjw25F7T%o{9UU2 zQ0G$~hD+}i^R*j(n+5sn$`4^r-%YPp@^5S|e|P?_=Z5)A{|6o%z|LN59jDLvT>YvGk8jbhUE%t}@O*jk4V(`8y8Hp32g$Q^ zJ$>6y{FUFYy|`K8?M>rZgTLC7+0lGV&D9F}dwz0zG23|kO=k~=%47LfYVqqlL54@t zm)gosx9(JCgU0^J3&!+f<=T6<7|(u?*_ii9^qv}?WykZ77klg9t=8av@S}OUv3QMr zHeP#s@pAmvyY49T-st~Bp91@jG7jt2Gt@e~O}eRAfxesa$H)BBhb`GJJEDJ%zNb4? 
z=6V09ll~z`UIq4Fp?t;?U+V6Z@y_YRtq0BxqzUXyS*9yw+#6&+?JrsUWR_6RTJ#Ip z>WX)SIJ7@*_W`7s&qdy2c z7m&iP>2D?dapk{K&vRD1>)2Cwq`%9Zy#+fjaf&j5_wN)}vUe)7(tE1^ee|!;|D8k??>!*x^Pp4Os?aueLrbBB4cx9{WF3D>0G6m{wIMEu@?|1;Ac z+@k$;abNm9jd5a8xikEJF1{w3zYEdB9v+w8AjW2bs6}_E!ncL8rv|-Ix>)&v;+z}u zgHR)?HKOe00w~5C$Nu=Z!CvqW=0_#7 zpn39u+ke~{qhHU^ zcAtULZjUKm&ORgm2J&}&hh7a1M|PRlinm3-RDtiSc#GTX#|m#{-Qm)AyggOn!|*&Y zYyT(x_{|9XS7Wt)l#`9JhOlOJ@o z{)vcC{|Wq2|LW#BYvDung#I|Oo4-RI zGK1!SlkeO78^54e!O!|4vbt^MbG&@@`=Q7Ef@AYP*_KrzuM$0Yt2iYOq^n_FSZ^Gv z$(ytV8}ilSnEVI3FtwfQcIP;`ose%TwpaiB{CB&AaX#fHyOc5p;b%|vKjHc6(C-%f zM;`a9@;~U`|B5%1k38lV`rcCecdKuLdS1`hmhA%7=gIEM|B`>C{`w^OpG_N!WoqhR7W|;>#W1=#jeVfl6F-&we=GALa#)3Z(#PM& z@^kbcySDz08)4&qozY}-7AraKDClh*JDMjQl^U(gLRR%8*gXhsI2;{~gTs`2S1UWf z`n~D%u5@S&O+AK>YlVKt+0(s^@6rCVJI2Cq&*!_*S`oXP>tS>v%DI&YkS=@!8>}T& zSV#ZB;@C}V$&c)I8JKk%-I!&*U4o~$=ksI#)`unMD|eE%p7DD_|F4oZm3ycnk9a0G zX(!pB<*3!?xV3Ns%(!0N7nw1ErLiy6KW$(wsbPO{LvgP0`PI90eY|a*+NDox%bvH9 zFT9gbu}yX{%x^d~D6N$&!^eg6z9yWki@vX@n!zHxz9HL-)IA2yG*~6N8gDiI^l41mPCRpEa`=ls`6v3(#<>T)h7_@!X$AFixL26oG%WX6t-YEW?e(BvT@>cyM^KIwEu;~3a#Utrg#YA$wpT0%r(&pxSBkev( z?o+hzs&P6sZO-nE-QZV?<;HZP85iq$WF{ELN)!cL`~rt(($6;fd3xG0nO0h%%h}uC z6x-t-aM7B`8b`xBmspz!u` zF^Mh9cGRnd#p!Uly7Ah9749yeNd~~cM>-jfAZngNMzoa*9?HB&eUim}*ji2FrP~ImSL>MOyt!ak+LQ`r(fJMt0xXai1)D1%8VPq+HiI zm3z=n5(k~boi)Xa+KKZWkLBOO|KZ6qd7R!|1-D~8YREqRMQ%?kV;<`%4~=yr^8c*M zFW~LIgkL&#A667=^9|(IgxXiN{wy@Y?!EV(syv1$5C+MQ0b;}p`LcIMZJR+@yT z#9L@pk_&6&2X#c2qAs4#tf%pYF${I3wf9r@^*$smS3He6ine^sU)tE+4~MW`WqgF) zP52p{1?BhK9(Iqj5_D0=G%NPS$u9nSQGTSxdeanQuoaBp4Q%>9iYJR>a65o}#u&~0 z%#^#$h2M(%`K4Pa)rOyXB3%Bfc#ie31snQuG8o6tJDWxQWb$6|GHEtXJ}VyZZ%=Upr`LIV_|dZ|yW)Lg7dDx$zyy4`@Dm3KPKH0%DupU(S}q$wQu-_`LqrDwmYq}pPEjD8#fgXr>newIq#8H z^KLdOX_&=noX6Pt?_0Zf7bh;NNSnRchZ~D`ilwQ$M#;Em>h==6@2cH|K75Lwxiog5 z-&g9_soA#Bq*>?C}CXYuWzO-@T5-0H$wGqEI*=Q{)hQs@# z@gG_8hZ^6H)pH9ubb({%u`+6<=Zo2!l>UX!N5xSel({RN?zT(E!+g25^GQbc)8de< zr_uVmn4Ny$-Q)x3qbHKad;B4-=~}g{-TxGd|${;KD+!3#y2K4YKl 
zD5L(EeL%V5{B;bkd?VrA685v)edf@7_`VXyfC$A7`$u)ZrMXIdBD4f*9A)U%=3$lDn9{7T+|^Ub4? zdE5^7b5S!7RJHEp)y;V;0X&*)Ji6u$eT#D_mGoi@aU2`neb)|-%Y}fx6a0&N1UkXb z$!6A=JZ|%>$&VaQ$=jIMFd7+uM}76nV-82-{_e?eHQwF#Mvr*lz4Ew)p)wp!try|% z2&lCx9~?-g*h9GOLAu-@qcH&Sk8*ele0a)HT`}7 zpW*MlBY%Z2_k1|r+Ndrsp3k$g{KNSd-tP}j9?WB2Ei0D7@2Ka4e4X`U8hj4ibPfwW z{#u(agqas7U*rvN{dw_gE@uyZ(qz17m-$wr!Pb$_-N5;db*;4#{J{H2WR4;WShUt1 zUIP=i0K%5Y$K8$Wn(Ai-zR~fl+c@DJadHz#$E^u|rE}2qFRS3bijwO&lq*8YBZ(!8l+etuykT1>`6=-G6m@jHFa z({?O&_f61+J}iU2det;N#hmZ0r)%X+s?3Vs-dyVA?BCwcj#7XoF^eMW-y$Pg# zo!(4J&yDkZX8T-t-uV<}0 zf3sxHB(r|WS?)CnJ^6wCyq>>hit~XB%!V<=cW}6-mG;eaqgsZ`UUD({PAz^*4>yZm zK~HGsAp49H$ZtgzklKZ7NH=CZ!FP^t9yjJ|>w9u+F&H~i8D{*jguYKY7(s@OXbhOiz z>%GsMZZ~1q{#^IiP66M28c>t_8~CUL*`I(y}J!{>=Sl~tYp8&1RP+wTnj*mCEZ z-J#$OYZ-d%n+@XiT*I=h2^Y68FF)e#Z^L5f-gRIt{6vOf!@c0|&3OlLANtQh3ci?bU=r;lm$M`!TlK1pn>b{jN zXr27#Wqmc^ZAW-tS^3z_Y0S?Ow@TGacJ_O`k8TOChryw#c~?}KSn~3=dwz_#?d7jx zFZ8f`_x4;vf6dP07K5tpOX*|Aw{lJ;w7s$X{7vFO>8q@cdfy{!{)6H?^`vUdy}T zv!Qb-OVo3_IF5*_e!f`dJtNkI-^jhT{k!FGy8-+0XL1SubH=A#iQTcnje$eWfOzYy zuH6cBwhHw4leMxX-)pLD9UO=C*t+~XZmn;r?043i9rST1`Zw~^byshkp!$Hn;&hf$ zL;Z=Jhf$vTfzAIb|Hk1Y5T^^e(u8fy*T0KN#;l2U`;w0PVR1SM68$N@E^Z^O4io#Z ziAR`IQ?yedIfmujR6B7e(JY#}$XihC*PaGz+xukqZ@pp9HfbMT+Dfpoxn8wp%huxKxRD>X zhw>Z6Z#1^HwIQ!oY}f1Ic1K>q4fY3HmC`v!J0sHhaI+6DYUB}(Sevc+!NY!=D_%KH zZOM?5c2jc|gT9drme9z^p7gub@o+fRKo8>f^Gl4ti)q~D zJ`oRFp$Bf~uzov!jIjIVXC;q1#_15ae4?Jb56^1w7hH>9x2(D!t2sH>O)6(qycOFy zN&B+D++WH*|Gii$UfG#HGxhPbCwX=g7ae;STga}`$L!jQo!*aR+B(G)SyFy3er3yU z^oG3L?~&i-cMaBFCqT zw~Y3a*(7g*ziX$yvE9*lJS!KWZ*$BTD!*9jZuh%9a~a8pp$+MF;h?A_I@nIp1>X4DY7-*GpFJi?iix4e+^JlEcw zI4p~77ainl4y&%Q-R!v4v>W_xMT48cy*|7-t?iOeF`GsHWR-9`m_1vIeZGU|rs;n* zY4*l=PPM&Ao z>2D++P*0ryy)}P|pX?hrU5%Dy?i(D7-qiPpz1Obc->~;52f#-MTFQ9)6pV!0G z?~C{Ik@##XPtszm<%{+Y!`gqR_)$GI@sr}F5`N!@&jb9OT7AQhRK;A2`!moJGj}DB3^^efrX6n4G#PhCJjN_s33;Y|SJSG1czS`37o9W(&+y{Sq zCo`N=+1k^+#oKkWhSi6s-AHq&Y$#KFHWwE?%*Z{%?j2ybdIbsfrlg5{Sg(kvY3(0RolAdXxj&ml3*ReRXKofEvCq@q*7!#9$?@O#+$tNT{%-KL 
z6Z%H^HF_Hj{!;W-dbACHaXPKbvY51ALFFb-awr~!xm-6e1JADuT>3sD^9*wKA4(45B(MV3_nh6WC@qUf_vfSv& z7vyRs|tBX`HI*MNMQdBFpk zpG_P8D4M{v?&Q=j-`Q-f%@1;*-#f|6G1v_13U?!GeO-R${mcXBwDKKr2`hX?9No_O z9;Dcq-Mx*`annpbmHvxR%6Ii|eZR+hb#-~}rs-dOanA>OZbNZ+9(cP>T*m}d(P;T@ zq!f1aPUb`G=f!$tKMTDNy*>jz4l_Or^2i<>I6T<*^WnI=X=%h=o(p^Uw)}bfyj9a{ zawqUe;^}+_E}O`E_c9#L(p#)M)8OzM>Ir|#!u)M`zFxVo&Q`(a?e(^~y#04rrB{is zi&Jw;i|@%NEbqkAabNr@IzEJ+rOC1AjnSr)azti~r*C2F$X%dc?bwZx|30*JlpZzq zJy!CL=0Inf($pxmA^x7E|ZYK?s2BbD1^#TcqoC*wGo6g&89d*z#&!Q-Jy+;_K^ z|2p7fGD>Id?`m|yyKxe1kNh_K!n;0NJfTE*=U)TgtPLl_t!`>K8$I?r&cNw-wOvnY zEtI&>ENO|0YxJQT&aY5@e?z{^+s z+Rk5RY3E9E`3K+K{q_NlS{mtN@c*DP;p^X(`vNIn?oU-22-sK@=Uk`1nT&KT!r>+A{N7|=6m=@KwKRHM3&lW?|X0&Lkc)ERQ zZ+rahmYia=Mx^!h?j(46B)RtR<~mm573qirCwLd+)TkQu~J;iLHJ&jiEe_Q{j+K0 zIad5fQ|)>WgU8i;ADZn?PkK0&712g77W-v`w3a*PeNZ_s4$WrJukXZZbknPTqU3io zUJXT;PQYI-UM}pAcI@Pxwci9EgMHuK_`WN?ZClcG?kH>SjKC7-SjO>Q%6z7L;`2%C#rbB} zr`C_Nz4NxqC(?pkT*mI@jH?7!CmFvE@Us#<9RPnT=|P`7?C^o|^AGheODo@({?ug; zpQltKxV}3+<6f1+$W7+P{G_tqgY%hYT21Zm@4Lv-+*vMi*dZUQm(`r--d{ap4Dv($ z8~1ikB$v3~Dy;Dq&Z!JEb9eE%C%pNixDbC~_g;{9Ha`yKXWmi$tBR}gee}AqJQ{7F z)^qIV(X^#Hd_GXSjl^%f?QW5_>GkBW2lRY8Ke*&?Vg5h3-6Fk-{l5?U`8D*gx9>)u z&O`ZNE@wOY{C)g|A7)Ly)Z6?*dG0#WlGq>Ik$jqy?TJ2{S|bk+6tH$)Sn?yXfIMpZ zegvt8PwIH%-^p56JGjC^cUrFC2qVn1+vvW!9i~D%b z*;>6#Js+!W+_N}}jJkP#lJc>B?F$Rq;!s4nvYmY2hxA)pHG2DX0*u)oLc5O*b_9QO zlxR95;c?CRb$@MZrs#_5%2;~9A0fYU)fKSFUksr}AAax>ks24p^OddypATaXesv$qhVT%% z!HS)^t-1%M)s6Sw?nfD*^}c0n$sBJ7hqD{w9PS0|+F^RUUpgADox~oF7@k4m_{Np* zL?^(#j^zGi8atxm^5J6-d|Ux<8yWS*_=|gIo=>+!-HjjP%zzi4v0pSm_NpD>XiwP$ z=fdsWJt~J7x6g|o(>>|XRQMa-k#_32RzF{2KhG)U{vLd8t=*US?7Mp}*pN=7WqXQ; zKGx@C_WOKXJ|WN1PQJ_hO+FpW#P9Lhn5SZNc7p!)hTAjY(3~XA;&ee>_T$n1?c!YF zXxtsk-@7C3+1A-?Xy#2QJKXdA`FnS#aS1zkUvs0ExVQ81(}|y9o?bj8J|k`!X(8Tu zu5oXPevr@F;@#Yt3h!oRr-*@pq%u-cQv0zFJR`SF+ee?CSTCzKh3Ymk*+&E7TWx9%E9kP;;UQp z_x&M1-fZ?mcl&gay|;q%Cyk%_8 zk0YmA@@~x3uRiSLacZj#pU*T;tIO+rq1o(2CS2>o6FDe99ld6chTMk_&lo|my_UiFX(e7yF2{fXT#^% 
z*O;sSTbUo{RZO8z@v9nERb$>^QUkaIm9=T$EDZ*zT}rL|sk;!MBBjJ!lEkqK=N zS~-^FuJVlh1?q|OFGrchsE9=XOE~a-h<99d-e+K+l4l7 z$B!^y?Tw-CBu_5pi9CcgIY~ad>&UCBH|ClCyBChedxeZ&P&wi_AJN|BxJlWO5no?9 zy#b%qSR%VvYZi!K+rcXC=8*Iz^}GFm<$k-G+wm9FA&FLe)Uxifhp)5}h#`lQL#$xE z>^X)%D)#Q~BgqkPH9VgK#7@Nh7n^u==9Aan?A-=x8lRq`G@BV8{mqoK;ip>=ec#7( zqwPmd(ThsSrRf2r`*(3rI@Wm1Om0czv{}ku9a-b6@Jr87%l5tx!OcJN@9#)A`a7p` zfbkfBK9sb3!t;YkaUws$)mqEoah%eQbKHm1wI=ZTYNJyH&Ud7RecxX9{ZkyLt!8{fxhUY#_{rW%c(ibJz_A9jOlb~5c8A=}~cwC9^*c6J1L{o(xJ zW%Q&T+&+*L6XyeWHhRs)PXylbon{B=XXRwyEMigzIPHJBadC1-eer&O2s{=Q4u`MT zPD7`z&L{O|PFWp{n<4P~2K6+AyGMDhj&p`5DD9ri>`eL}=eZ|qudUNvCmW|mzRxc8 z|HkZGt^Za$pq}bDzA@WPFZ($ibvi9@#vnTqx9#C`tgBO;W7&zGAIa~0wDIbw{9vPV za=8zBmgn+3^tl1t?_nlAC{FqUGhtj(H9v(lu%`H2|E73*T#ApgtZXgdYv4!9e-ZxA zitshwOd7zwZpN*C9(F~X?>^P4(wjXz1;1UzWnAI8+HmV_2|k}p|Gq5V)K1vv^4pQ`SI&ix zH0pJeZ#@_9(Ld*H%!1b9J9hIecYg4A-y516m*Tg(NRO+?b%3$F&G%O7x>@^ig8N3O z*qSVFB#qd4yNNX0E4R?7#XjK8|;t6?M-DBBJ%*TdybMr2_!3_oMN z14a2saC|G2;mY?_&s6vp{oMzi^fTYKg=oD|W~nFMM~@*nN3huhcD?k0W@f5R~L>|FZ%waA65^of1qePOz`Z}C^{vUithy)~Tg0B?Wr zW!@Y&( ze{KHft8w2E4j)9{A7W2OY$4A&9FBE)8G2;>sLTFaZ$GGBdM|#O!1r^kNk{M#ACAi| z_8S+eO~h0B1RRoOFb)4;KXK0s(d{XwS6JJ6@FO2iyBomg|KPtF|MfLw|FwHw_fSgg zA$v0Jo0!c%kh4q9WAC~J96s-F{!Vtzs4WAd6 z^XOT)-vxiav9muF3G+NY+8DpP@Ety~@6$W21N=+ny5Qa*>(8$AtA2JnD`ylx^)!~l zE8;^Aq@kVo9WJAzTRV>v8uEZ!8&2}Z^iQ#Ewimg%SycODyt&lQE}>Iytc71K0>R;D z*k^lX&#<@M8al@04``jv>QIDnk`%jUAs+^7q;=Nh$U@OZ9%#{J%b zKXqkU30!Ee{CRq}h2JkzPks3OA9AcymXG^ePtZw)w zGeq6P?K4Y{YBHzLfQ$xeiKO`U5w-@je?)8n*O z9sY}X==IsbVLy#U-6p{f8!?Yw_!(L zqSiR?eXD-ev442I(p|;xU9Ofy9Nw*@&;TAUBL`fts{erx5kuh}sqovRo9?yz5Y}7!66x%JwznD?{|0MT2&^I&`SBDZ_~ejiXW{64dCXRWN;kod<{Gw;rl=2RA2p{8@*ZV z_8RGFp5L1P`F4E9dGD!^d!7|I>|i_Z={?2;^7*~LCyl&s{o(Vo=uVu#=dqrcoX$16 zoyBrIrAN)2U%c4Ab;RdhNlG30k#F`5j-=;lhbLT&NM9P|+_!ImyH}WJb@;3AfsREn zi@n{P#4k%PL~rHH#uLWp7xB-h1UL5mF1E%u6mR#S`YSmv81G+A;qDFQUp;s)v#WL= zCBZoV6?Sx-&wet!uADbd@jJ)+?4#-*M6#>3FM9}FeTaTGP=DxmJN}5pypT=#r7y+B 
zNOt!ex-ul`lKt1|h}QfgBdx&Sv)gy~&a;zzL|0pthC5F(7w)`h|87_I$56O89d~i7 z&SI-Bhst_pWe55`15VE)y~^UR zA3<-(KYX$JBfHrB-XWX9=O@%(8*b0{YhQNwBDHNSuEl5I^EFxEM-Tqv2XXx|e13>D z>m(1u(OM+^5Po9cd{K6v(hd2c<3yLp#OyZpRu&I(l~y|OUmpc;w@tqD`&N4XZnnU< z_Y|-39DQvduju)-@UP-me8xW4MxR~jSw;5dFf~@ppT>_p?))k3b%5uW(ub=29M8}$ z8k)ag6#K*L8?1K?;PX?cA~*Aw&|9*f?^2r#Ci%1YmQf^MYMkwKqIWU!L-I$}-UuEq z^|bf#{C>4XJoJD4+lZg}E%dg|ci+UytR^pAj8{YFzR&V0^GZHXuj|6sN0f(ddDwRy zrm{^{U_g^HVsH`%g^<#qI?`TtFI|P#{9B>A6hTMzWLU=6Z7Qv&Fi1)k2iL*G5v_EU|GFIslcpm3}A7>fI zc<(E!^j%ila@g9H7w8qXSVR3jlTUQHQ`*bzB#y%8MF{gx32Uh%#qU+CG1$s_z8SL44cJ392Lod|+6S#vYt?hSe~(C1^M z+JXQ24)aHZCVXvU#`K485ic<^y-&Ye=)n!fu`@r$efnL`Irsbg+Yrt#VeM3RuKURn z5BEwRDD{1G`l9h!SL~m@P~zwL>BGjiro2k;!oS~&6O@lg&5P2&8#mLYZ(BKjk$>Y+ zT3407H~JC&j+bcH&u)Q#R6YDT>C1kVFARMK$v>FBqEt<|y2NKi`-jisGl$c!DPIF_ zzie!`5RVr2&>*+2g&x*thu=fKt;N~hLRTYB?hWPZiPKm}T7NpX@}OUbIv;b9&uzrl zU1NmB9cE`JwWs~WYgvU0obP)IuDQ233%}OE;`722#di1p9=ELZ%=Yl~Wcd6lxzafH zbKFDUpWh*3+WTcM;J&f=x7YOMhhic-w1#rqWH0|qf6K`KRqT`93Eg_a@1@42nR9=~ zn1u(*j`f)GVPC(9F1{!`)~N1dpYc{c#(Lr^p7(Dn{^S3uzdb)m=yN0fj`!5>UUbg~ zZlCP=o_cz1_Bwh!`209}A9*%jRr~tlj_l2U@p+MYy1?nhVsjq^D6^IuAGqD#=!my>E4Uqxdj;SpPeU?>NI8K2RQx$Ji%d7C&1r!aw;tzB;1( z;JsoP4cU~vRO-_f@G`uXNq#%Mb>-q$}Cq4}h`V^?vJ zdgiJ3H}zi+x1T~E$%}E1USFc#@3eNSvoEi}An~K@JozaJ+9~g`J{1fl_-3^Dn z?_%60rynb2zuO8D_}tz1iTKzv{hWTWJJ7>^*(UuYWD1`@RlX+w?Z;}Xq5ideM!y%Q zq#v3E;%$|yOJjCPzlQT`iwk`Ze7-!Z){xs~IhF{WW{5p)E<3sbHE`RhB zJ~R7&w-~Pj{k~kgjpZM`TYpy-^|D*6UfcWq3AJ<;|!sm_t-UwEgE}^#-7C3d+&lGR~r(jd|bidd(FdpE>*N&d$#McXoDm_6dLO!0+)Ysjm|sax1=g&f$AI8oOapWwTP8ZvG`cn~S&J*-iV(*LllZzIJmod-TfesWpBnUGHl_&S!pX2p~K{)_ygPbuBmdEtZR{W$R+ z&(cPjwTrj)!wqY=9{!49?Bp(L7+$=C-p#$g7wJ_dKhgtmc$j<{@0EJ8A6$$)#AAwA zjXHm5@d`OKgWF+$)v%|Z(^}$peno$qITw7c{M}OR4A9f#i`SHI;r-X!OMb3`ml4^1 zL-Bfv0~h)p_i^tn{z_kGc!%&!TF@%HioHEOdnmiFxCFlwvb)QAtKsyuxI4^y-DhZZ zT=pM)hu>pa@qtm2(Mvh6I>6^I)$f*F@vgs)gwG!tr)v28z0dY$&}v+DaTo1F^JNn~ z{~EXaN7*;z*NFwODp0}*d`3P!{bdz;M6~}{;&Tt>V;?;LK1O_5`|LMb8836J@ax_z 
zo)!K*++3;nYcDqRNAPTdp1w-UcJ}|D{k63j@{#%`8rjEa?d51a(zfEW=48vsYcfmY z1+(cC=+`~<%8r81QO`tH^0Qh$Urp<^dI5a?$oP-py|`6RVvRYu}r>qTeveh-(!p7|2rwakkr)3dCV z?#}aKU9V*vZS-AL}b^?BC~;XT$t6bSrSan%ui( zZSpJWe;==#eS%N#bmZTfhq-%r57Angp#8pqk=Zz3p>?tU`75L&_Y>TUeaam881rHd zej@I~YX!=6kwy1j`Z$??`h8rqW*0w&-j|=_ZhEkQzv2V!6#N|TtF1FT`X+LiDW2q2 z+EZ6N^tZTd&F@h+g?)7~e&gMr3*dO%zql5jw$<)+aMP`M`1}oS-?+lRj+@TV;t zUV$$s>BTbZ?I<|@rSWQlz7k#LNH{(kKL?|SJs{r(Z4~#4kID4^Sa=wCem49bLc_0u z^G)IK4aE@89g@Ug~+N zc;B~X+5N?;lFyHepK9d_$w=U1-i=_X zuEp12-)p6?tJ!2J&i6j6)K1fJTG(;N`5>Dh#XbJtWOs?kYI_Xz#(nQDDb{H+N^!S3 zA`}AO$3UEZW(u1q4J3~_$_EZ@%zrVFgd^L`M)X9~KnlE_1=*&sHHWg`wiT}w<-<iZM%m^3)%8-?;RnFEzHIJ;b)>fYVf%=1=vFPH%6YVn(fK`@wn)fF% zZnWZ=*^PDn8;j#JeHO>Bt-$9WAn*+O{s#MdOB((OO6*@6z{e5(?mcfc_E*a~^tM`m zN58~6uzLfSs#2xY=Pq!o6S;O^HxD&3-BQ9&3-}y24?B^f$=%`cbd*ceSM(@8|Dtz0u#+oBuRj0t7yjKe`;(nWZpe*= zo3X#xA^%Z(BJAOQ+}PSyz2UFe6`sbulo`tRr+@pw)zFjO;Bj9P-b4AuR)hoKXV}SS z=KCwXYj%A8t9HJuoS>e#A9ry+-n{GTUQ1hYd$n?2z5&N>gx_t|e64k(ry6d^*BHf) z{2FhU^nc{#@ zZ~qJQOrCAOk4zn3%#)`&p$B9c(Tfb_H>sYD&TJE*fgC`x`Pg!pi>WicMm9*<*(~3%|hx-=w z{f;N2Xp|xXV-9-Mzn^yEX7u19hZ{rqHJZ_f#_+k3^4^2g-fa7VgNrJ-=f0$RBEQ{9 zMM|U1>!s*{=jX%O0aoxoTL*rT!RGFwx!<>zKlB-L84sVAv);POf7G1*w1>xc(as&+ zFZlvix~HkV@Pqx6AEg(KTZaEN@~%7xf5R{ImUW>U6ntC1GyVI@x)AeWb@4JTH-Xo+ z>3<#Z8?DX5ny@I6tk-TD)lI!?7N#-Y7EN;{u zek#4{!ZM5dO3m51JF^j6!ozv~JrbgXoZIn7FUnrd2CHwnc43F|eT?lF@ZwU4(LHU< zk2nYJ4>ZF!gPXIm+u-&_BEMdQ!+r5FRZo8mh zc{wdRrnnh$K^Fe;71^mgy@Sx*KS?#lGonKda(m-aNI9FNZ{Rx$KXaURU+NX8#~^0p z6aKgQ!oEG3EH6;U8~W7`&b-P4A^R=w$xg68gFf#m)hGK9hR1o98~1v#KW=}4CYkT8 zj7vKuJEZ>Bt@YNw;q-AGdiYP)k;?=cTMtfpCCE8pEBYMo61GoeMuD_OJ)Nz79n^2X zooe)>jqkP7IO8+`J#ca+Jgukw{%~(R`lhJ`dQ*73o))y_r>>_RxeQY9FM~APZESp| z!{xXkv7hp>KRi@DkPw5 z+VQ{iw`k+0;ys$elULyLhVnPLJ017fF14PFqPg;Nl=a-5I;Ngx!2QZce#*yFBkN0Z z9{xOen<0BI^+>MY_o`6rf<(lq{#o~eTo{Zaed(zvnwvA>FM?3)iRf}Oo%348t0jE?O?x7Yl$VW7KG24xo-8iy z-d_A1F`rt9vujt5%jx;qT3H|qdg%3f_*_E{&BVVwTgl9m2K*o`)l- ztBRkr(<5tEbW=X^ht^Be$h`)=qj}N3=w#k?l2;>#E8cr7zA`FD`-fwu<~eU*1hL;F39 z$DzegSi%5_tXu 
zD|`{THGngbaqAQ7NHq=nM7`VCFW9x{PZiF3nm?ULtcIK?psyvnmR6sB+O6i_807Ek z<=tqUhQXIEX<&(0V^UZ1v87&(N;AlFU}~dXx%t?`G0r{Hm^6kyOiL}*6LHO9KS#XB z2srExVCquVvp2kJL{F#sj5yN0QtuS!`v<1(>Z_9HZV>H_w!SAiW`eZqPJte z{wa;7pW?pYrzk%0cN6uqHhdma&WlUnylAKFwloVL55eaR^grp(K=b+zINy{#^I#fQ z;^E!e?L%+>MQ#x%_MbGgtpAbJ+Wgtw`zY;Ya{8t6MjC*hw(RG6>CEgZ?+o@ykul;+ z_%53s9Ir};@;S%u}+N4yQN9W?*z|7 zZhPdNey^{Z{&t1qgG$^xg#FtLUpvCNZOH#1^#`7xk^j}G&UX&j%D5-5QS>XzU&tTa zke-~LcOZ}TmCLLYEt1D~V*YWUnd74&#^QEmRmJ@Ae6znb5p8F{zoXKqAZy?h^}|LsdZ zM)(Pi*KWiYo?i4+Uyb*<6M57{Zn56q)32EA8j*?C1{guN}L$Mfy=j+<)RH@be+<$Z!mgBh$k=ad#V)^YfD;{Fc+< zcB?eSd|Flv(@qC?ytDB=Ri4p)sU!J*UktR4#QRp?z=gxyH0)wrVrMf+{ncj3WAyb+ z-^Z!{`M8-~wp$DSZpVfen`i#6ao3EchQByJyc8xkW(h8~FB_6=te>${y&ooq&p2au z{UV3p6!RsJ_*yu&N$STJyaWHneE%KgH_ra0{=R7=_Ul6Uyax|y#2Lz*M;|`61LzJ< zo|PARqWRGcE>BBCv;XjWbTyAYkUL^{iqp!|vOj0zS+R2FT1P%}%Vvaj`>=bDv`*B8 z&ug^Pke7O#aqozJf^*nf=?->1FEA^3vT^UgBX&p`yY{XyQ|F;7;vj6@ApvlKKsoH;WSoy)07$KPF9~0|Jiw z8bnFs$f2E8X>=lMqtc&@dcfU@r5=t;)5yC8t9ujp+}^4%tSmn*ZRYPR66H3dpBuyD z@K-NLGmRVjMET9lmu=EW%VG99#2K_f@V9ySlsn%;BG&q(5O-??Uebw>#3?ZP>vh)BE1V{&U(5y$Szx zhZJ{8|6#q!`STxAU)X8)s&7Cx*1f^j>aFp+x#+FnYA<@;-nm~D`HmLXF%#cy_?hd& z@v-dY7TOslKhhX->EfPu;Qvf{ms`@$I^HAfqkK1Ud#%$Uc07U0adV~_oDBbJJNPtQ z{XNhl|9-c8ayh;m!>1biIUp34ZqQ!;N_-1 z@69*W-*`WzlTR;l<|FM_8t^+$EcscU_kq)m;B$X;=UDn1e6G@7_;;6SuOI#YHXlYG zTC%4H(CY^BX0-IXy4f%J2<2LP4|WticwZi*suW($X6)6lllvA^d_G&LDQd{47XQW& z-@EW<^nyoT3@^H>zZaaIRN}xy_!RdvW)|JdqmJGcoTA-o=Yc)+`}@ik@NS6qXB9DC zt-K>R25x^;Syc3fW6j*}o>Jn*>G)Z1-*pf>b&UQTS_ICt6ZagM6Xao3U%VG~ruu5k zvrF0C5#N8IdLq-*z4!}Uzo{6lzDD9UM#2~OcS?I*+-Nb;-@PP@|H6E^ub4^>o!ujx zk=TdcO&LZ{)_ZTInjX!~mKS44<0)|*oA~a%HtRrZX!t|12Pqycj$;es{;T}_J;{|{ z&Us&*Yz)dUa_H;(!g5@5c5h1<=WRv$@d~(^TiPwZxg&6Bb|s!>r+LoZx{06-JFo&_ zFXnCRoefBP%5yL{+q#rPWZJ*LY~|xmk-27CHouhrP`--&vptPThqQ;*x60k z7<`*g_AS#^@M&i_+0#6fO&kggOKh1mk^gNU2p0L5_H|MgzU)Qp)G6|)@6WF?j2uqE z|7NV#OVl&Wc%H^0?FS(*6$d?;&-zmSj=ALb7i&sa^#(0_A$TScC$L{@Mj?NLOX`9S-d^Wz?Igo@EbZ+ zOyl`K8?po2ld7DJ+fzO`N~E^%&Lni8_}eP`p?Jli+zryVOfJRU&< 
z4oO>^kKu3L)!0H@?M{G`$HK{o`@ImpcYzOEr0w8|_))kTIC^24o;ur+EQR+S%>PqU z$S3?Ela*h{e%>l6nRV{uQ%a(oOXOu(i{1{nfcYyeZV2+vvn`c@;mu1tqUX6`y)F%`dN?N z8~4DQup1+jVT*hdI3k zd?)oZ&R)o8rdry&HlJ#K(ARv&QqHfUN8Iy``8@Q--h-J~>eYuOUya;n*%5u6_`0#B z{J+e%hF2n;)IS1VtjV{bKUMtBbIn)pN5J`RaBDq!;AAuS)EeJa@UJ&LY*7Tgt-N@F zdtX$#z`4k)UsKFOZ||N-fAb@E4md`CJHhM8##aVrc63wxO(=$I*Nt_!73ZEi@L$LK zm9vViQpDkJu6)>O`@zSGJde8+GwJ`=l?B@AlnrJtZvwySva6>U$H>DGdfP@k-6rJO z#XBiem6wU32)(Is4!DW-yZXKr{@Z1DYCrtCHx$$H-;zDOfd1Bp&r|W&P8{7FCWD&l6>7Mnwz zgZZNm)Zb4lKUg=shJ<#QRp?6idYJOH%$T9pr)n0%@kZfp8h3zkeS`mb4t;J%^7C1s zXX?Y;w6jx1I-}iT=v&*>e*gm~qzOMm)xF<2GAOcW1jezZ(Oo!OqX% z_OCoN8<`i+%k5K#z8`L<@GM*WL6IF}%*R!g6X5PxHQyk%t|y+Z!?*kN;_Ldt-OZf! zycReT&Vqo6SFq_C;x_gowJcVJk9mzYToCW#xLXF*wMIj z$zJCH-Yh$WwLJ?i)L}Po=TjUEdwB%At|44$Y=nM<=W&uQ$GrCA9glJ8EB7zj!NCW(w=akgg%8F!X6F3+Ck}1-`(p`Z|o0tg?l~lbxbC;$Pkf|8wN$*iU~tI2Sw+y&*e#Be*de z4j)I3@512?^yBj3%5N@?;{@d{`ycwB@^~4bk2vZ1^!vxkP0ELz{474|S=TPbZ=56C zo9|QN>I>S7y!U_07b)M4-+33`tKApeTYKUR^8M*=?p!eRHqHYN#(x`rj(y-wP4=Dk z%5(61F}!Lb4tlO}>CC^pJ^6?f;%|+6==F*%$+MAs)jO#lf|l`X6WPf#%&RybTtE-| zh?Bb?=FD+U7&e)AdD*eO_`$a>E>h|k`R#Ty&kOHkO)BTh4#mU1IWZ{qN<;0_rxhEc z4{$$lIj%>+X(!_Bu{p(9xH*ws+>2gy@c!VQX*}Fs47bBCd^Y>ott0mc=c}hWI|sjA zvSIF}OwiwM{2U96OHcWX_S2uH?s3=FP8azzb}i+2bFojUC$|?TW&L2~^5R%>`LJ@2 zzqh29j}&o#?^Jg2!G8Y^+>UjAcmC$#YP$zM|B*cHEQ{^TuWrf(`u-#PwK(56e_2^? z-@&^{2A4wl)8O+-WWL;< zyhl5Uzer8DCuhO*`Ew_no5IyC_55a5`?X{?FYV|&ZV2QU#f}`}L~R$S&GY zYs zo|Ciq9Y#ZAQOES>qsoaT&Tj=*pE29EbAq%}8fm<)hp%zZ;{m=L(WzoJc40r(Wo2(= zJcjUMG&8PeTk-ZH_nbuLC_fTjcY*I`>&5o)Kwe^YbvM@kC)ydx{#gr`-BQ(?&Ee}N zqB|z1$Y<0bEi{fj;PuAzagK76t!bmxwk7}7K&Zc`dPZfP(!um-qpU?b#Jbths&oc@ zXbGne);}3d*uQ;@biZ^`IWIQn=cogpM)`YRqd87Jkyn3@G&cPrKlKFV;~v~*@Rv80 zlQug+H>Fdn-||^$tIQGmY4#b@_*j zd=KMfh0KpOKO#=!6~7i4p6{+*k#6}BaLi4S{5|vu{Kkv%?QKao8g~A4_GG2iYmrczNOXDELX8B&e$35;tlPEIZm*BHMyZGeNem)59MLvv;^OH*d z#$vc1{^4WbUwiXo|NI!^6Zt?dDRKMQd;vMng14u73E)>ad}&EPPWkZX--Yj2y*u+? 
z`jz?4=zZMt-h(~#i~A`{$w5wOxLVNHYw$M`4&R%{T&gXHMCS4)S?$6Jv+->2{EYSB-r%nIZRFm{j`U-rY+O;tIE-|^dkH(Lv2(((KL&}* zsLOsF>;B4-_G`8HH})vo-L5FeW3t~JhM(2^92?e`-HKYqVYGZm2b=F-dG~4_TpGdu z{9_i;WJ`*rX$1UP%#M!xbmtb|8MQ6h(FYpeT)Al_ZeLNnp?5pM=Yxz}3we?D)c;O! z`eoWRi$7x*zwg1{aa`Hod&DtEbn#oow#s*AU+=7*@$BdG{rfon=i}kphm|{vA;w{Q z_VHv|e}{ODKf;NxD{oszy6Vr*g>0m(#BbsAVEz3KJu)X}{0VhMi`8Wb1un@Os{et{ zex_4ZSv2`zE-_7n-3v} z$Txa|{`ATo6?YLnuY1_#1NGzKbiUuUkw0Uk^*(asQ+_z1Bh4E<0K0XUy6U0TItzX^vH?C9OyJ7i>##2tW z{BrAKU*G>oKU=Y%PgL(L_#J#S@yh#U^jW?}bWy?l&u8N6EAP*qLvG&1gO62ycM+WJ zrQI8?BeAb}+0EOO>&dx9K9cFuDS@jdxre{H{$Yg{*TM;%YLa0gntp=um#*}%rn}lxF~JQ-!iK> zkvv+(JFmuZU-wu}wy*sT4xeHio3fABsAoIxuAG&|!@~t|G5Co%=i21CzjntO&wZ^M z;b*v@XsN$4@xtx!=Nu3J!+ySoUEPp8jxUa32iD_<-otqG^*(Gb&NHjn)k{(zoZZA99eGQiDgH?tE|8Dz4{#>pH>R7fbKOJP%V<95UdnmqqZ4TT z4Se{qI6JFD4l9eh$=^$2#bo;3)_;Y+eV!e``Q}|KGy6pSZ4S%7C7)BYcm+NBgnuLK zlg>Dfc*JEmJr`e}z~=+g0P`d~u%|fX@0T5p?=RTTd*f%enVhGi)Y-<2o(j(=WV;*Z z`PpE81Fzv4>od|$_HpwdL&(kDaOc^3lGR@9jV~&@@^#0)btcPlTc|XXUu7uyoFwup z;uiLSlbb->10cx+^b@li=~H{>g8k{ox$yZ^>t-#H^EZ#xA-C7d)44;Pov0i+_zHjL7|5VR3zx$YvJ>rmm za;{m&IKvSSCb-cTMn>v1v$C_2p8l0RCED- zm<_itHGb`Q>Cb|PgR=3dt@^hzYB$pFdc09*n_oTP^F(}(R6bsK`lT`nJ?=#=gPv(z)#FE_O!o(&8Laxr!WGv7gVwe^)!HiqY7@?=IKhDsj=#?f~|33w+A6 zoh~-Mk$2-V`rk*Mj3xS2&uCvNo~{M|bga?i;Pd{*MSk>jn>F}b@pVg$i`@RsBO91c z&!*e?yFTzv%Hi7W33uyQcjjix(k<5g@4W+XRJk6t${W&il1aCwF*s`tXCiK`S03km zJdp6Phjwe=YvA6*e2w`si9LJ{J?|#&?r8dH_Xhv#!P~j;H1Yy$nqTYhPD0t$O{_}` z*umY%XRG`c^*0w!7k-=(-toQx-AbHaZQN_z>%Lw;oVw(98Q&V`fqyC2fu;G4^lKJ8 zzK%Sa$v1aXS^n7k8o%GxIp9U)_)Fz#^~Aa49qiq9c6h`GzRV7e_2y2$k9^Bd<;Sbn8*uFK2K4AZaCQhezK4&vKk|)zLfkuiOZ}s@ z^I3iydK-D}4lnz+j2x$G@4Ng;^6xHAqbGfv%ujtiz52RRn;jar)Oy3iy70JJah>`4 zQ>8WkYhUFi7T=nki{SZf=6&ROK9k;d%{DLMe#W+bcO|`QAg{*t`dba3d((%7?B`42 z=Q{X&BmSC-*9g7ZMqFO#+ltC|#b1A%~Vs>|&XPwQ@Qk(uBTAX8^ zR>VKYOB0*8&pktZvThbvi%-b;q4zJx{WkuNTKab({x^s7*Q%$6obD~cA0PL*&rfmQ zcMp3%{O2zg4O2DyTd()E> zWq=QR=_>s9aD(9tcs+#vU%--!bA)^F;`V`){Kw#Zcx!G{-u{ne82kFm?68OPFaARW 
zS?IAmbo4gP3GXwG8L#%u=xrmCL_M?F*LUEn89Vuba=xr#(KfOJc#Fjt_vXG7ksIeJ zbyHVz-Hx|B%Ddl#pP*N>Z|lL0^~z6#lTF}C)ZYWH*2QNt-{Zb~=X9;{?yS`wsW7kG zq`T?4IP-Kf{@e2U1kP+ja*vo7@$S`~<-A_No{sbXzRJxsueV4ymGv)4P3hatqT>zikXryJ1aiO_DW zr#JDpG%)^WSV!XJm+RFN@_Zuo!{k;T3!2(ca-1Jd`xgM;>xN@f3uU~ z^jG@g=28CllHM;LO#VhNzmpxfzS1$jr>tjG-p}7flIQoL|KPpX>+sRaJ???(A1~hS zj&fWk<``D-y~4kxvE*=9e!q6BuHQh;_1MdQEBjlUT^W9-W%|1rz5N#6cg==mZP}Ui_#@ZipI5Ka zsJOy<;)Yt$A!YQuR&fXY%-PF-v%Y&LwCGIU^CJHpdQ=y$ly~k#IM73!$UoDe=KBrBpXq&l_-khgR}L?p zOMBbzZ=lz6e)tFSgg3>5^xt(MOU9`I&yuKICuay5ed<9*^XH+S_MEu<& z?2zxh-+h&KTV+3yLp}KLQt>bIr51bo5_%Rd&|IRv@KisLBGdYM5vld`CuU$HdiWdq zB>k#qhZ*BuW9KVMNKXP+lNO)UUSx8ZU_Sq3B=1PW=)-U)LE+cpc_Xh?nVo=oRyqf~ z&A4~AquiSwd_r@dR?mC<%=eOiTQ?$)L%-jO@q~TH$KD@&&^UJH3%SI4znJ9}nS5W9 z;r<$W(!)vIRrGs)wnw(Y`u*hp&~I~c7V(6~nW?c4x)kn)9^S+XQ-EPK~_=^e4` zKP8{{W&8W|Vo&yh_4OO?z&=*$!<+mj1^ePInTIEm){`v2$anVj+UxLqyR6{vcwW8U;&q-Ce$MyFWr2DsPQU_ZTBak>`-ob+v&69`=^6Cq zJpB98w~<-B^mlq_XO*5PadiSehAc%^!v{*d8kVA6Z!w6^XvE&=ksdWq<#A2Vm-;pj zE9YK@&MjpXu}a@Y&eigeSCfM^aK6fiJg|`&z19~DnA{)i#Y0#C(nESOj&+FX*Z6RAMYOa z1rKfh2DpT`@K}sH-lph;8556X7)t~evlW{+mK&+SNxCi&dTk^ zy}kUKKxYq(P!lCta4u0*eqU~|HHUIw&E%LxWQ12fpZJl-4BwR8_gxX zhkP0Hlox5w4xW>F+pl<`w3o)IzmfU}!)wwOzwt798}ZMvzD#zn z4{q_i_~ADzPg)E4%rz)X#Vwf$ntQTgK07FW#V@OWzO4 z_Jz-0(eQ@RbLF`IE!&@Ti!A~n|e>D}z`z=t>aDQfH0 zU-UEV(x*ibgnx3mbtv+1-VA?tWbb{Tf6?EsAXb%m@E?547w7dYqzSpzwQmo<)+_qC zgH>;R_A2?-u#aPZ)=`|t6Xt31{ay7`!Q%t9KhJIT=iuJ@N^kUNccOTSxW7;bZZ2Vm z4^6Kt7x%mO_IEjy)04)xK74+^{i~@uMa9m!O5VXu3fj}oGN@!#{6|@(pDp*BFR3r$8CLRxxK&2l zZ>D|d$>-V+`(#x*8vpbB?tQqoLLR#(N_v7bRo2;;^mCDR3pf+yo8}LYM?3hu()zXB z``Ry*^nv;L+8>ntfS>)%>-PCu==H^GT&UfC;yGTy&syh$AC~hp{2h%lH|Fx!$$&MrpWj0dd_D_(BtOW@?4{iI4doBWPcO^QR{w1KnnU4vtdD2H z)3^t>Ab(fCya&X-4!_MG(Zf1AIRBuWUzh)0{#N#Q?E4POKV~O~rtee(>{}^&|2lUa0)u^!fSxN%lsU><#Ns#CiQAUq(LOH_u~zuZG9>s;@P> z`b~VUseF~cR@x_Z?dK!^@aulR$b9_CIu&t4RqWfy;;_d0x4^t>SiAz~yTjdxSDY(O z`k8c~^}Pe!3V+?$;!O37a*p_~w4Zq}q*z|+`56A<#&D@O`*?HXF|GIvf9}N>XVHt{ 
zaQjj7TAWSss(J>)(~FJoK>0i#H!fBD(I1xlAH}Z`<2W0RFHq01MXc|Q-A6fr947E@ z+zq#fv6EN9fd+8)dU75jp5tlrU=wk4U!&LOe|}59@k`@NeB*tiZ=l zm4}OuO1*uRf4mC%T~xfJU*A-oC<0f*-+YR8r-|=~{cLmg^rt01Ys907nwN+43F~`1 zc>N)I;PYGLGn9s}PJNWmWHJxCx~jyX4rW~Ke1vnK?(ir2<;HpPI<&uklLp(j_Je0( z&-CH{=mK@yn>js|@ArH8=}v$m`;Z$&?0@GbW=J!5H4QHIfCD30F7f`C%t7QCC#p}g z8+PEish7ulb%*08@OE34)hzys6SAL;a~=4+fxfr(Wr}^op)%O?&+dfJ-=*-2-jbb5 z4vQf18uO(hd+KMUe%^@Qk{y0Fe2kZ&Z!_a#-MrJzzHPRIhyFd|^{T9@pDN$TDdx4t z*E`PHZ}js``Jdk>#}+&-AJT`!9{+_Mvc9t1_poPH+8>Czg3oKL%b!%_+b~Y8;p`*& z*OR~G8+x*$vJO4$jfUwJ?Nsqof5X20ywVun#6GSIKWkUY*1BQEf{ zJ-lBAKi7FPdSyx9#Q9%y^p0tjdF@;)y`$bH&i`J)zZ-w4K|UKUZ3`!ZzAt_vAB$+- ze5z4jto|6s4Rj|Qsgj0?dpS9ML9goZQ=D%c7V##nqF3wWJ^Hf5tEbInC$+@1Z>(e#%d_ z9{sD5UI*S(nZN7P_w@JcO6&ZilK+AH1dTG8!_!*p+PB`t`mmfwL2sP(g~Pwn+Yj6$ z4EZ$VkN!J(k6?fQU>?=s?+CqW1W)fXzQf#eS<61m_@iUqHRON3S3M);>sUjt)>jV7 zmzVq>oZq3Ik^CU5jqlf$WAPJmK0SYj99T4Yo==AZXDA=*#Rd7} zH^R*p@J~Foad8iV9Ae#k#QNj3Q@)P1_9?3?58*TR7q7zCD1T{QC*Ml>H_CnmZihbh zfa|}de$LsKmHao#R$6D8vQOOqHZJd0K8N42&NeL`^4U*5kFV2Gp6U`Brj} zxm0_*lV^+KeSF#v7U$B>h3xT9)1Lf?9g8o@ddBcC$jT4jzqDR7gx4XrM(pYb)Hei< z|7d-Dx3Vd`3ccD{`9|oSignh75BQniR!>RLF9`d}kSWljv;Pb~R;!gfu)GFyOvA1iSKi+6vnU9ahi#(5hy~6)e4No3` z_mS84&Ej+V-^_i%v(>ws`-$(CxV;YE2M(uf9XbNM^Cfo3TZ9L0XLdM&ubi?06*QW%@7ue)!xG`rjavL*zeykQ@R}mcieMufK!Ga~{3?3{D4rE?2%k z|H+$K&%7rb{+fKl?)?L>(2+?A*}d)Gxqac=I=srcy;oz%FjiQb!SUB z75X_q`7!)U+rZ1vt9{^KHGCMC`sM@hxgf18_5X1Gi^zAhed=J`e}uy==x1H{{15Bb zBEM@%p5ALi5B)zYJ*@mL*{P`-pYK*KvJM14VK2Aihdd?q%QxcRyAeNe@9b9ewnq8J z)YJIBB>%<+xjmKK%w|_Vl)C1T?e>*a*Wa7L<7d>r0KHY-$NJ~KpnliSzF=q9SheNd z$dm6y9`mTKQF;y?(yC{G{uX&Fe9A<}evJ2p8^EP@=4so!ZaI$zrS4lD>L%?r5qN(^}v}ydDncraa@o$ z!cRT;{64z81$ig@f8~Ae+9mxUINSn#JAC&g|6|n?{GSBBT4cTXK^mxMC48=zM;yk5 zaJV(U&=GmFd=kIk)#?xVU&B5YRq1|V^L#RX?#vsLPecCaRpxIu_f}fu?gvyJ(QfF& z%lR+Hy{CIEt@EMu_BHL6xB-9T{@nBEjg|jA|GCtMWqHN8wBz?^KtDg{XI@+CPc!%$ zewBab9r7KaM}mJLx8`uW9US-&-&>M%1NLT|3)I-Jcc!0t5&SQ&bSrAZheqzFEGN&Q z?zyzgXPXz(i#o~&J}=kqP;q(fl>bQl#;@giH(mW5;6{(49{Imq(xYF8!CBGl&JDXO 
zUzeTSxRmqZ#W(1^ohx?FN8;zd^mEzKO|#Mb*&WQ^<>*by@)w}X8teD{*?X^f-|9Ex z*NR{I4f2^HKDtNViT!px91c6}hT={0ZVG%}uf8?>7@f(parV4*qb>R^MGN-r+RC#< zOZ}>HZupjQl&7j_1Sj7WUzgMK7UJq2^7|d(^LP9N-fJux7_Y$R4)Ag#{^!Tano=N>!pK|Id=boC(UV+lUPFS7kAZZzm2 z|0(dgH+!bLonm+VO?4w6=AE}l(nh=%W3)d-D|6vu;Mh#Kx);>jH|vzQwJz`Mggwqj zHh2EEC;C#Jzk2Kqkve(UpIa!uCol5ZadJ;@PsV@v$F9y^PF>*S=}vfBTE}k@;Tn2) zd-g2cJ4$>&FJCXJdn|^d-z41^18L%JpZQhw!F|i$yp>i z`at!+NIs3tk1yRk3%|`W^a1EA^uMunT$U95ZO&i)TngM?$AbHa7wLQbi1?posj=Vn z5!ca^97I&X#jv+Krx!~;t5Y}q>YVjXo$|2HJEs>)+;*Eb|6KmbG4MO+8}kRa{VJX# z`rjnm3jRfZH)Vgf$!7ceIP1k1?B}32gtJZI&M5xq;q*jCp46i(e|TC!uXbWj_cDIX z_$LM#|Az4SBlXyipjXqUQ&S&uXv*6a^xd;d(7&jBUAYqf(chcleT>>=X{hhkG4#-% zTYjfMq`l9?Wk0H4v5r4w{fRuGFQoqDpLoIglze`ce!$PRy#GUtV~^}3cptd*1v|Jk z`Y)+_{$9oV^7%OPt3LaA4Su%vImquT@OfNWz8QP6YjyxXN45UMz3zcNy?hR@n)Ac7 z$)}UUvv51;4UJwSzjFT0gwtU^5AeM+{hRSSeLi}emu>2MdmIp79)-k&!iuioAl+z2iu_qjWi zevf_!{M3&j-N0A1c49C*=Qq z{*!hG!T0X@#`aAa91VI)`}po~=2P|efw#5TpCQj`c62xUyXM6>?JTQwEjp2JbMemK z!p|A<-1X1HZ#fITh2MT6zd;Xn!k}V=^;Cph5%a)JEA(#ke6sec%=g(Py#ah~NuK?( z&5Evh#G@Qr3@i2FFm`TB=(VWC)y^fomUZ)7c-kx*%^y90J@6xZ9#NLR5WNaMpIc19 z-)EI8;CJZxwdk#(){VtT_Q0yjH6{No*wsPbSiaFg`MS#c74Hk0*KOc)hVPB}O9op9 z+sY@}hXIm%dQa5(VU#(p5TO%|2v)F zhy8iJnHPC<@3v!&{l#7AUG?kXY<&J>8n^w>nbZ{&)yh4{M8%E=kWH;%BL0iH}os=)YUT27yJFF ze42eiV|W(*l8>Xr=ay-_dEJbEVoW(MLGNyT8J#xD->Upj83A`=UE4V2=JgJuVus~k zy45*1ZDC$Ca{DIazoU4JE%U;B+16*!_eq15A16L~wE5sBNt!_pO~hd|B*&fkqbDlg zFD7cZo#6)E z#>25^{Y3QvBk&aK#$@&0@LVl(SnCSGG=*{|x7z5~C+ocx!{H}H8_saI{;%RS(B zPv5_%52N8^;KM|AWq0%`#bhrEcmcSWu3dS#OZq~1J2ZctJupikl+t9BI;bt@b+sleg@ZTi6 z68|BGYtaLr|Ej&rx*Iqha=xzUg+4_b$5`}r?B~#rz~|oNxUu(EM%!-=^^VGra{j)c zd|UNA?|Zzf{)W%Eueut3M_$?ItS{BTI@whHt!L(KuKu`rRD*svjTl?nQNu*^mvZL67$DlMW3~-^ zTn?Ydm;IWm{z1IGBYYnYhiAdFu&dpG&I6x!_xsM|ut>Wt;aFr;4}3mS{q=EqSV<2A z*g!wgObR>XWc)|G)-_ps<9keYteMuse&lNFNyLHNjUIUG)e`GwYw;R0OaAY+PISe8 z;PpDEp%1c*V*e8KDE~4(-=q&u>TmRGnR)`B-}L)+;hpD6jCF1K@AU6Q$o?W4=4+W8 z!oJ-=m!AjTy&kDhp$1AfMO)R|q`o&4Lv*~~t*o6imQ^9H)C 
zM)@1|FGJJj<+yCj-mIa|W6=}*+$7B^9Cn+P|6(CKc|yLb zQYCt2uJVD;L+Q^hqH7jvw}oiPnaX=V6P^b?ADXr?-`Y7ZoLSbtfxehmM@IU}_s$1r zpgT9kZ;aX{=_q{vR&jb@+!~2%o~?gVv%jXL7c9b*6WUFF5H zeBkpYB|iTv9YOBvD=X9+_57FqSz2O!{J8QxdOiAH(f`2bRq1eitmd@~I*!om10T97AO6)r`HnvOvR1avSIdhs9DYVUGxBZpXN2A4!St;q zdpYP0*vs4EZp*8Gc)c-`GU?QO*H3gU3U~&252h=Lwhl z+U34PZhej0kNNiaY=SNV(R%u${>1!D#qQ+Lh#fq@xOy?C*w*?egH#dYOLN)Bf!jTb z?ed@P`?|up=vQmy!%iLyA3wLgkNCa(#G*nso-ul}j?su2tdqENN zY@9gfBXU`fDp&h`*a_GAd*Jgm#gX*?)5@(SpN-*j&^P9Xo~NJRv!7>`cHr~qZIr*= zXS}QWTCojzRzi}S;A=3Phr=sEa|{B_%)pUqC5 z$4-lXb}su-&7SKA-G1<5Pvm82n(gE>?CSAw_$WPJr2h5doOdkotqt7l#rv=k9R3iF z#S51)?&I0H1LXJU;pS8me#J@Z4|{c*`umGYp2XAE)&6KUdW_dheuYK64hxmF-zG8Qg)~+hkj!@4?%AM7Axx`+)sCpWchVg`+k6Gsm-3Wb3M2 z;d{KxaF=x@?Bp}dfY$tMf5T_2J9o2lgMOdL)3ue=?B|_+$L$UD7xD9Es7(ITckQzg zaIZE$*wnOBIZtOOKghh8k&e~RANij{Pn(I)2>MQLWA2uJZN1p{clr@6q=Q-H_XX`BC&`Saz_#%lb_lH_W3`@*~lQi^n)v z{XN7rZ_u|(^XTsw+IgaJYc6gs&|F6@{M&P zaPM37^oL6yhf6o74+}BoGp@;oeU2I+AbI0O*^WrVoJ;Y51&NgR1?`3=gpO^Z6vKf7h z@~c>UhpMN>Uv@0HWcjwHxPv`;p6mZopqgf4{1%P*2R?kJ0Z)iUjAT)YgNqWn^Ley{ewtn^9;(+4N=DfHlIwp)*w076)rjBt3w@c$&wQ$O3iy1OdC*i8 z%~kYvhA5y*;DLOM>JR#t=_2#RyB_IspE*3B623`D~Qm>T>`;#`*N-waVc9uu=}2D;IHP zJ@Y?0_bk}UhnDp1^S`Puc8wQl*Nxo#kh0yq^XtiJgx%G7jD=HingR z8hbN`#|M@2JUhS2-$%mrQ}Yq}d!}|no>!>1iQVa8@G$aWUXdSx-ZZ-@KZ|_azlYD= z#C6<^9(wgqex~vpvy)FI|IeHc9$D5S{<*~AYxA?=?RxLS9*Qm#Q|={zY%_5m=g_Zu z;xB^FD)Ae~=>Jsl(5I8ry2>|Y`6};qpJE*v51*r7Ybr&tK|ibES9|jRI=>7aS-o^9H{at`@@Te%62NBJkU+nC(0EzU5{ zR#YBQPsr^B>p>gy;ue4JY+PO>pTMP8{H_C3dPcuvfApSpDe(E*;zZx~bbc896CI~q zprq&D*gjKdRj8-1c!EzGKx>1IbP$x07&wug@1{r2d0; z37J8Ta_(#bLAfOza&pxly4>e`9kaN!7Tl|wD)1yx)BsIkw`@xWv4eDBIEv{Qni(vO8Euuci&M|>>FYNz|@?FxDN zPt?A5O4F74wKu=WHOhTj8J|v5PZj_3HS~Ua_ryj$6w zT^V-Qp72-J2N=f60D6qtyR1(Obgj14{ZT_VWh% zdFqW(IbVI1$`|bCGt2s~PA{py2fxQn^yG=kwfG4auEpd zwp)I?d3IA}pz^UU%+24jo(^L_|4aS%JDoYbr0 z<@e^3vd1cc$5Hrm>XALboH=k|(-tDs?|H-ZSZN|O5_>Qyk>E!dEb|WtAwftuK zxQTm)H|FEnIj>m_M|@7?_qfWs{D%0t(C5b96@0+wws89{ 
z^X|#YK4p1+{o?QHpU>ZMk9qf+_`1NKrttS^<;S|8ayMLhy>fwe!*6p*@vQP=*wcTb zSIhV}HkAKc@rvJ#_4hl_AC=EJ@UWHq8jt$jRJeUNd+#oJHew$r-&gTxc(90_{h)Q6 zJ{KqJS95rLjeaeJ>vzNR*D9aF>7ajK-0pL+JWE&7x3!gDircj3WPfJedXLq8+EZ}0 z9sWOiW>G84>eVyF|Ia^7SW~<1dcYBTEE#mz9tSVk>wx{IP06yM4WfpZuikT>-&-#; zHKh3XpE{O)eBbt>(;`BRT>kGrzt_0?E{E(g@W6fc{l7o{|5*#?`2YSB=_CHX9}n8- z|Niv<=PBa6|M#ES>+}Eq^!J|3o>`|(SZDE1$6~;S|M2hP_y79u=RLoFzd#M~kKYOD b%Cc!ZyYWx*|JgHZ*NOiw{4dIHDEt2a1%sTJ diff --git a/bench/python/assets/brooklyn_nms_masks.pt b/bench/python/assets/brooklyn_nms_masks.pt index 2e13cef5e672532735d1d50c4b8b14704651af54..0b97d7a0dd4250be7c63fb17ea084e663f8370bb 100644 GIT binary patch literal 131 zcmWN?K@!3s3;@78uiyg~5?T`ao0bG&Mx`UQ2Vbvy*-PJ}^_Q*Zd5qP$dpmEBGN%9a zvroA{-8eM63ruffN8NCVYYc=X(hdOR1wrN@UbP@#UXmzo9H}soBIYnNa#jw2Njz!E NY)1R4g%l%t^8@S{Clvqy literal 196620 zcmeI!L2IU06#(FGCNq;5=}yqLATBHD%%Coc%Ry#g5MxCcW)m_=s3Do8Uy>;lx^PwS zPbl5`YpN@Mf;;ICsQP}~@o=YehXXFk#q-2B@0`5%+~c{=fpYivw(hJ}J3Fh_e%@Ve zuby39UY?zQa&d6+{QBVe;q}?|gX6=S!}}kdoj-oGwR-yW=^t(%-CSNB{r>*Ji<{H) z>)nH^lV>kZ&yNpoPA;x5uMR%`$?kgf-g{5Bm-^RB=HZ7Y>sxKh?xV}g^WU~_yX)Ie zZY?Pf*IQ5SE&m?9I6nOO!}XnCuD-gxx>?`-&E9J72fzBqcRyR-Ti$!OK3i}9@#yIC z{QTtT=JfL7diU2?$0t`O$G$-AwYlt0RjXF5ct0kaJ8Fm zj4_f3rfYH3P~UBfmv?(c>B+agT9TSAN2v(19o7jDAV7cs0RjXFR1k1Mt}tb1BtU=w z0RjYy2)No!H|Q8i1k<%RYN+qF#ml=rqx9rkUoA<^mZMYz*$(Rj2oNAZfB*pk1S$x) zAXk{OGZG*`fB*pkMFd>!rW6-8t*@4(X3J43f^3I%0t5&U zAV7cs0Rj~ST#zeF*%=8CAV7csfg%E~cGC?yMiRkvEsh%MyKV9EZqFz^`PNrUQnTeK z6+yPcIspO%2oNAZfB=CC0xrlErtFLa2oNAZfItxeSG(y39V3Zgx)w(b_1(63dADbj zo_y=8C8^nRl!_qRVVwX00t5&UAV7dX1pyc23R8AQ0t5&UAV8pqfUDhfgN~6zFkOqI zhWc(>yu8~pN>9G^)soa~IZ8#4?XXUO009C72oNAZpn`x4a)l{7BLM;g2oNApM8MT< zxXK!5-N0tAW(xY|uO=om=^)3rEisPDGL%ey_J^yFJ#ElJIm zqf`Xh4(kL65FkK+009C7DhRkBSD3Oh5+Fc;009C;1YGT=8+42$g6UcuHPm<8;^p0* zQF`*Nua=}{%TX$VY=?CM1PBlyK!5-N0u=;YkSk2t83_;|K!5;&A_A^<(+xUC62WvW zjvDH_ZSnGM&nP|l)>lhXv*joiLAJv>0RjXF5FkK+0D%euF31(8?2H5m5FkK+KoJ2~ zyXgiUBZ*+T7Do;B-L`mnw`Y`|eCw+vso8RriXhuzod5v>1PBlyK!8960T<*7Q+7rI 
z1PBlyK%j_#tKD>ij*&z#U5lfJ`fgjiyxTKMPrmillGJQDN=12xx$p4kpKY#1PBl)BH(H_-JoM65lq+OsG+{w7BBDijM9^DeYGSt zTaHo@WIL=AAV7cs0RjXF5U3#Ff?Q$B&Pad&0RjXF6cKQ>n{LoCk_e`2anw-XZHt$8 zdq(NWx4v4Enk`4E2(lg42@oJafB*pk1PD|Ra6zsxWoINnfB*pk1d0f_+D$j;7)b=v zwK!_1@3zIuyFH`yjVf8AV7cs0RjXn2)H0un6fhxAV7cs0Rlw? zT{DmiD0@GM-BDews?8BXOy0N>#HTH*>aSMAlqS` z009C72oNAZfItNS7vu_4c18jO2oNAZpooC0-E@PFkwh?Ei=&46Zd<&(+cQc}zV+3T z)NDCQMUd^VPJjRb0t5&UAV8pkfD3YkDLW$p0t5&UAW%fW)o!{$$4DZWuEkM9eYY)M z-t8HsC*S&NNouwnr6S07SSLV$009C72oNApLBIvM!jzqn009C72oNYD;A%JBpkpKv zOxNP5p}yM|FYoq@(vxp}wInrLj#3e1JFF8RK!5-N0t5&Us372iTw%)2NPqwV0t5&Y z5pcDeZqPB32&QXs)KK4Ti zbpiwk5FkK+009CO1YD3SOxYO;5FkK+0D&R`u6EN6Iz|$~bS;h=>bq_6@@~&4J^9vG zOH#AtC>24r!#V*11PBlyK!5;&3IZ<36{hTr1PBlyK!89I0av@}1|1`bV7eAZ4fWl& zczL&Hl%9O+t0k$~a+Hc7+hLsm0RjXF5FkK+Km`F8sBsE)(QW0c3tP}XA1@`y0 zcb@<&LNk t>t*!cfB*jS@4qe8Hb2Y0?ajY``FHz!^PiLby}S1|-?o z-?VUlnQ;i4^R9Z!denV&enV@$3%Ot^2YpkfJqM{=Gz1?}U^MXVY6TtpD(~sTnP2RkD z^PM-dv$M00+&=Z{GnSOZew0pQE!f1NV+A(IC1%;fQP^Xh<1`uKWz4dFWUreQ-u>ErA7iwvGLzCr&!tWW)Z<-eLfz9Ia7 zpE}dWpE7gk(6OUO4<9;dqy(%HT{wvXCK+xifwm(F)ONrT1?F~ff(BykT^ncHzKoy z;3)*(P4HTRzm?cB4Nt%IptQkkDf8pc37o;=VS%C8UMbJ~QYQ-&JAHmnKeI97UjdVp|E3 zu}Rsgn5kq=35TcS4}w|{?aH>_WYMI zKz*2z`8l27nl#BKep`titFE}GgRGjXD`81xSIz+Ou_7`L(`T8qTJq;8>UY)`mHGA> zGC#kM*pUqr+bVC2`pf@31HwfNvKV2&yn|*MKmesLRnoG12+Bs2Bd;O5h{? 
zAAYN(7&s~e#5WSc$KYy^Ik~dA^r-rNDpTkO8LFQV8|Xe;FX7K1Ka=3P1k-zpnExFU z1P0sf$M_n?ngG||avX0<_S+nJm1>y*(xFql&+G0P`ojS1fd9eWVNdOzAL$g96sXWLeAoZd9ViraBqunexk=QmP) z)otHNl&5`k63viRlYTS_r-ElGc;XTDF^X%;66)JIr0rz?*$PP?4!02VDR5RK^Vuhe zZ9lUHqRdW>C2d20zRF-_8#&G$5@e%ia%{Z?vJv$E9YkMkwePs;AwJVmjAf`Sg>ZP> z>aK?3F_KejyToiVvAG)RsxC_Sp@paqQ%q%Qf>$H|NpSUHd*K<>3ZhR!ou=o*FhB1{ z89Renb%bv-mWUxP53qp`pb!6DV;kZj*E(ykZvFAhOc(EJ5Mqzl;rYH~kj3|dRu192 zCT%)yWl0~=XY-}RhihvOs%W8zAXHEGwRwhUBV*;gMcXr25w4e<{UMvauZ$4i{;B#- z)Zbu!@lt)+MyrKv<4NP(^F7DHUU?>56I`!!0~*-`SEBMulFeb zPx?1a_OG=<`f&b*l85jMnD6w1iTMdXiwqa?kROsdk5b`@&!X$e2zeGOpq!t0v)md- z5F5bfYCU;=)Da4Ogf}t|NAiSEGPbF`Z11M&h{1;oS0T=W z=Uff(RE=5hi?IOyoO=S>LYwA{F)xue9whecAUH~7ymBW1q>%BRPVjVrp&b?4QUjY- z%YMmEl6~5%m6(&^$l^+wpEn=zt6<9k*B*ON8O+Bqm$#lv-T~hpj0GhKx0w9UdnBGD%_7O zE_3c3&~~RiCnNcYuihux2y3Bf!hW#p3Xu;6o=`gpC)HWi|0<(ysK&ML+sk z$~4*6eIFJ1@c4e8%#S`Vu%G4hK^eHe#dCRnY>&ap>+<~ar$fwo4Y^GOm<{`PGLiPm z%-TbgADB-?%AG0f2Vc#SII&4$Uw6bXugA(M{uU`D{o7pPl<5+WGo;TKMKK@vHpMd|-0Q|2l=9&*5}(kyB1ZZy zPx5JzJ$Ck%%r{ffPS^+XUBTOEBxkW<54zayTb#)XKXkFLLu!`1S8l7N{`twiC2bM! 
zzhLY}LTGbYfQ9QLcINLL*e<8fQ_wzVtn>ROx1ElWIMM%;c2wt&)VHfsuBx&`e?s&l zoRb&#R#0QklfKRVEU~>-Vs^d6kxtTA;{_6%WZp|zCiA`bDOEMOMslQ&U|-Tp)c^8H zK7ST~g;s!-%)>GJg3fXWxY+!5Zzdav_pc#ez?|(9`U$UICAADpVJFnJ4*PCgfE{jw zYbR5;{?oyI9%m`zL(IngeaqR48ieu(K!=#O9pAo4_yE|N5N)Zc-INQ8YpVCB%lo<# zAL}FileHslWP(G`6(zWk;36q+iv4O3x4)MS5md zbjNhkM#?kAR40|UX<1Tll;AwVxz`s^&Ukb5-*M z=RR07>&WeOx;^GocTO%^w>;bTpH{AKTRB6re0S7yag#OsVgFy;bD=D|huBQpUQBqP z?~H81RRzw8;TDl8BYNomhIfB~avsls4-)xOf@wdJswzNJ)>}FQeHa8%V~D|aGe!D& zg)y5>>KZ}tUnD=)V&cPB3k-FNxxFsP(zf3$%d*c2o9x9wU43m7qZTMZu$-es}Z|oY;MyNMN%I+X_QJp4{u?VivNtOH%zT{lLKz?9c zUcb#y3TDo*uw26P@0D0rKfQW(yhUy19hUO6PVcFB0K1B~A8A@A+a47GUiPWIjulb+JOy zN7PR&nIHL;>~~ApRt`E|K|RgEUh$2sa~01lj(u%NTj&`-`d#f#DPvzL>#{GBHk%Jf z8S8h59q#EnvgV{+bl&jZk7TfLW2wLVu+(1|5g25clwmIsonI6DqQq81X`4Ay;tKa5 z(N{lpjUneY*w7dIEocKR(>_7@{_0^23*1g`Q01|vj& zIrh-G#U^_0B=UD6C(bw!dkapzt4LqGPUOY2fTN$DBl6}$=MSVVGN!DplEWK2V%tGF zPsct28wzpH&C$YUlX~fWs=d_1NFTAQMO)w*b34aYJA+yGpuE$!qq$D2sm`ohr5=m4 z1>e`WCeU*uMS>q_Q*1HuC4*ox-r2jte(0l{L|uSWB{n}o3_hR7kO%OSydsIiOOY$c z&wo|=b8bY|>&uqdf47ur=k0%Hu(hj2KG1NX9rTeMFUY+21^(FBp_c{U(}nyQuaz2A30BDcxN#DEa@Dy4f-grktiErDZ2#r zvqv9A85`bfZ$aED5MF$`yB2lcd0KVOzpb5%D zKmHJrHeB-{VvriRR@eZ2u|Qy$CVwlj?;U|NSm%~fPyJUA7x`J8=MY0(tp=g|pTVES zy8{#UsNw9Ro1A;mX3@W}&J^zmfFG;A$Oqh2VDN`O2n_ww8uftAr5|8ju=|#=^}}9vZQ$ zzsQFfd=>HxoR6E2^#Uj9sZ!oP4fCO_>9rl&^^we9a~A4^wj`Ef*%`nU=yFo)r(~V; zdxuzoSkG7)*3&Pe4TS-AZ7Rw^yR1nGyg1?L=$D zRz{-aA;dm^w8t}uq25jHP$vUFzfW+T!BT;1H?<0|UO9-DYb@(6&(cv37&t>8zC#dE^n{1hPtr1*LI&-A2|UY zV&Z+bmBA9Dq|Md>QE!07y?NWu*1n4}A8Txn&F!DP0rKj&<2QQle$aiuy+77TKHWQ; zbo%r!y0O#ubNODYf`z50a6j3;C1f?t;I>7sG1!V0$VqR_-Ai@*1^5~9C(MU+-_TRA zEx`G^;+_lR{jE7Dvn>#w(}3%@n(5I(ep6BoTCYaKj%Y@sV zmu2NP=lQ9PL%E0MaBQ{G^IC|y<_GqV{2S^_3Yfe7ST762FGxi^E|7lbS6G)3C<;$S zd@x`y{u0}Gw`QKd5b;`#t(k-v&Ml0ciSl8cy*QC$D^q9ri;zQDGn{TBE)K9p8xZFO z*xG@J%|LX?d#I<2<~#pd#1KQJ^G@z3a5mT~6}~r9vk%YXHbhP_QZ5?9+j!|cp(#VI 
z;O!c}Eo5|JJpYW3Lb)ZIxXvcmg~CNcIgWJ?`qqj2=IVfXAc5@ye@Ns(E9>1}fYtO;;=Vj-$9>i@v_pGd?g9}`2zxQ>Pwf}ss)AxY}RMUt>sVrW+2zeHU`RKZzC=a9NOEOa}_BvjAz^M{FLvEzE{mf z-)Nv8=)RU?bF9&9!{0D}v3A}~4)LnQQa;wj<=@Q9zIgb3SayLPFM5(=*zf$) zzvGC<1=g%7MZ6*q>YK@N|=(tgYS(n2IoOax+2ddTI7DdALx9OWbXP3^V?{w??$c@o{8+Xw{Si1 zTw)HN%4OgTSM*Zkyz{Eyp{{kg&1X*wMGjoa@v~2a)^uC$VspTmSxsKJnBxr#GLxpm zxfv&=ObN!Wa`w|99fa>hx_-dxW%p|A`zDA#(^$?d^kF!I)_I-l8##}@mYtrO?3S&2 zLu&W%!%JQHM9U`Kt@qF2cBa97es!4p((HLb^X;d3{-Iu(1rrXtWzSrEf46v~ty^~V zmQmfT_NTfsBQ_b?o8H8+xu`)_BI|kA{@k?b*$0C6aGQVF88RO_hudjAYdkl-gty&X zs26?Knai*{bZgh?yk6EuJFms3T*f}6PuuuUlz%|~a_@s2r!5Z^EFH%2r+J~CkG#n7 z`a~$NWhTFuL_ahdru4!4N-M2gho=rx~Vy%z;n zw;bl?r>&ZqJ?^Yr7n^OnWNX*l@5)%OY|id{4e!Uo(pK4Rr_JYO<9&_E>%Qd}&gTyL zU1QX5YV+F)-`=DBhP}^ad`ZTL^UmNn`ni#`emUo-?B3w&FOkpmEkXMhydN1OgJ%EB zxlH;+!L%I21GfajZGxQh$lf6P_5GaN)cb?2?i$bKt)Omn?$7J8lJ&AxY1kM4(CWW* z0nhh;6O4UPkLTx}7s@-pcz&DSAzN?HaoJyj{^>YZG`ctRLBH>LK3k;a)q4zaj<&ha zGOnj^NH89Mk=s^OYV7#@9IW@CX8-XC^nvmKD_e;E++0sAeB71KnfTl6q7_SA`8OXw zw|o9MPq_Hc@uuC)rbWDL_$tGCsx`;C9fLV#mvS3yOTV?{dbG#X2EMS6W9RJj0RGGp zdFT49Nx#0)%}?)nZuZ^&XI*S%mS(Ohx|PefZJyaAI-29eGUJH>zvF#sonn~T^LReX z1b@B_+v3+!ibE*VUq8H~3&-XVqp&&p^HWcRuAln?FI&(kl=Rdj<#e6LzzYT!%MmU+pGLpW#@Bz z&)&M-!k?Yt$`rL;m-WHTyIgx(`Tv+5t@9MNYq^nnNrGdmk)Cly3$&r5p7!xunBPqA z@xdOBeS?jn8P{=~^0jd&C4*!86JyHbS93i7rC|I}Hur%Yj|L~LJCo;^vSN|33-KMo%6hk~u4`uH}+(WnecjINv1A3=h&?kLbaK={b zm%OE+WwYPl^1f${0Zl*V*t)>z*JA>2Z|p8(Xu;>GbEy{jwGVCB7tj}9kDP4N$~IoX zv30dE>hTq>yfvX=X3pCtlmGMZBJ_XvrwM((wE$hVvaEkUcq7Mu zEy~<8;~8x427Td$Um;G?HwTkMUHa%bt9ZWcH@0-MIWD?57`qJD)!{`(Y3wqdUkYo& z1?ea=EqHqQR^Gpb9fQSN-{shR$7r?iI*ylJ6)JraeQ#~Y(7|)RvR}#8~ugxpICXotoWVSa(e>cw$BoU56yK8tv^_S&XJXwRX*;^f_kztB!uBi4Zv zG^=?>l#d14bsT~+MS+oB#69$R?Qne@*DR(z*?9}fEYQBHe-m=pPanPp*Anb*&3ey9 zoeBR7=ZwUDZ?A0}A4d6l`Zn_s)YDV1*Iul3$LPL?|AG02_U5ZsW7$1gGz{y0Hx=nU zzE0y9?mP2GVL!6-w3Y3*V%e9quO1d<=W0D?;TT~7{p`=ipq@JVbFYp-yjY8_$9#K> zzO?rrc)poz?7UzB*Jc+|}^iIuiTzPkR2|B#zCy^aFeFS;M|7nAdV8&o65k 
zdVRx(T#xylp1L!LZJDF3nUv4-;k&NR?Z@+c>r^AU#W@EL3D)P4b?AF=F5CBdJuc5o zZBbSs;hBJM=~lZLRkkw_yGh zjs3fa_dLiJ`-M!P&rH1cu{GM-m+?7^-KphFc?HYv(axSYi{o%lV_*)h6YRtK_Bz7% zuGjlq^)D`C_cz|WKg6+hseZ!)c;B?L^hq6s{t_+qhuxejtAk$lIL?jiZ`wZN2`t-F z|6m5Li{QQJSzU3SH(%7x8=S*s*jDZE>EltpgFdRq6wI%q7u#aKdRo8X79Yx7tQT$R zgncw$kEFFm{Jj=iI+o*jmSK)K8~fsE&G#`r$FX#+s7UN}bkR2bfNK)?o-DHc3)J(H z)^6iav|+BcXh0q0xl}7Xa}mmfw7#P<(1+*f`l|hCTZ-Ot@cHGMwbDd?OA5r>E#`WxPI@d`_*<%OHMksQU|-z6NBA?dd}A@+ z&ah_OhBh1u#2?iV!=7QwYterW!k%OR%ffzh0>*}y-jVNLqBBH3oKG8r`e8q(U^ceP z)Qra)Vp|dcpSch7qZ-TZkMacpvrRY5FA12#bJ3pLbXEv4KHL-uf{EhcC+J7|y_^@B04>-K5r%t=EKiG^ zaUI%FstPMPA8?ZS zxSn>+71-WfJ@=-eh!1H?9}&DY(xdD7^A3Z1y;=4-a@9$*>a9i$-^Q)}5cz>K5qlRS zP71JBUPcUi)DL`#820BkUXB=iAkY@|nEvn|1?Pjb^pX~6PqJp+el@ny(0wh&VgB11 z+qeSTQli_L&vQLil5S7g&GXq1J>~i5xV&|fe$YPy%f6|x8zy7DupfTC*gv=Z=CVR8 z+gr2Gy%e#PVb&pm%;2RdO}fYXK7qYS)rIeV(ehxa7ny$QU(Nbs}4ZXd8l z;{6f)>?Hk}J%#zOKl~(oyW*y}-|Qv!f1>{U9%78=1lSq4r|q0AGXEuY7J28Z;r&z} zF-G8>ta#@Fd+C$KS(LEfYIZxecbk?U67$R;-7f2d`EY;9clOPll%vJ(d6?&0z4TUA zcjPvt#m-oW@^J3;Ht}9BO=IOgYzv$RNfz(sz-Km&!h9RPozF!a^|Npk@&15m3O_$f zwxz5`^{TwMuED!Xz9h>sPKdwrPra2mmtD?0XH;JMmO&ibM<-(ExJ z=l2mivSDJ|v1?Ki|FaATKLMXHgf9VxbI)F?FmHRg)dxONms6BtqDsZ*CuAsCeFngH zMM?-CgR4R2zT!n@@ck z-otr`e2c2$^LwliKzybpnG@jM@GmNj@!q2`6`vn^2F8$z&o4Zb_%MAor?FOQYY$3P zS<~x#WUP@m7%DzLCsxJhcX&|o`D;4N-0~IH8)UAi73H8r+&4`XpI=a_;`0k8j>8qs zCwU2cuGSNr1LH}>=NH_n`22#2TDqo?^-ns%(*=fky289&gL+qsX9tM+BjOc+Iu)Ov zbE@L=|6Jc9Yl-c`Z$Z~VF@FN4@$PEk&$g^f#pmb!rQ-7ozm>ira(>>G>LEU?;`9GJ zI%%mGS*PX6xkAO~2a=q5(I))F38oy@2vgs#k)lcp6a5L%zwoVxiqFsa+JKxSsrdYr zI8YU`iH|Xgt5CECA}T&Vl2=IXiR3((7t_+2$5ed&s(V_3%=6x!!;!@+^YhB`k1YS= z-zq*oa*{{<+3O47)q7?Etm5-?vYfGfv}6${SEpq-s2gR(dWccQno5{JK6VB z@%cr^kiG~LOt@9?`H?Fh;fMORH}*k}yq8q*`H?qz#!toP7XwdTV>pl5Dn7p;MaAcL zn1J&tDn7rDpY)a*B?}TevG{X&$}u< zKe&X7&(G&c6`$YbLB;3CI9e(`zv~YwK0lAOXsY=9u0N>w{M;W@e16V@iqDU+SXF#} zcYaXu`MEz7srdY^Kfu4UqvG=e6DmGG9~UYyoK5Ov*~( zsv%42RPp(%MwybT0t0m3@R9S(75uphda5fz>!sh>RacLaR}56f0QF%qe^w@5vwSkQ 
ztN8piqf(WY3{W3tWPYx#*l0HKTUl!bq~h~ephNwrHUq@RisU+$(Px=7;@j2MPYOg) zzq7um%(qp1enOan)n`EXF!-2?&tHA|REb|017v>ozI6p3H^`h^*<4!fmWZ(f1*v{U zY@qvW^xHiZpTF8{pWFhZPfwa5!Unj~XFwnE!)n_mbV5P8o>B4ntIgz(3eZ0K(H~Dn z2I*^)%!gu~c`{5AbvkQYkth~$-J4~wh{FVAsb^hS_9_O?=MXMw$?whdxz4R05!$nQ-+fq<6 zEJff9W{u!;xaG|2yWDwx#NC^+nbpM2@6cpackAM0_q((Gp*y+@6Gl8$zdN&jzGsR@D5Xt32ajrOC4&)(UwhM8)U-5oxuTF@mgb!RKl{d48qh^9ycOe15@1 zEnR8+Iu)P4=A^Eab8`48IS&^!5ceI%(w~&~CVPRLw^e-ppG!Zv=c@SpKbJ&Vj;_BaMXR%qK5IpO8NyX=f_Z#4AH{uv~uub@h z6YP;zQ$T&Yrn-(+p6FNc`8h%PWDZgB`H$u_)yyM4#wf0)nre!x`20v%A@OH&9!$mO zM}F*T`ecI4^WL7rkkHu3i_r~O#pmZ_Ict=oC5!lQ z8R3)eZ_sz~9?zicAdxR6xGHly+4ocN`9;T&z6cXcxK;7_kt-kJhx)cR_CZd?=jUX# zk$Yu~p7EpK)$Wus_LXvv!oEoExtR}08S8h59qwDmd`8b@(Rm{z_Cp}9V#;4pn@9hY}T^pN}ILPjke$ z16+pKnFnqln64i>3whAc1N{o*W8!^lT7a1k;&_K|t*mnpJM+gZ>4)B&;~3IR&J|h3 z0>d^;v6N?rWxa_P<$N;gS@BLGgZaJ_`F<9S3JlvjmlHkHq#kRYlu_~dp&#Hrtm5+n z4=O&tGb$N_Jq8t@A9zsl`CWHV@%iyy1JAoEK0mmGiqG$k3#Q`pyF94){3<^G?0G@+ z?WZ{pDn38Pl~wWi-T6Vq=jZ;Q;`4JJRD6EU15@$&-T6Vq=jZ;Q;`3u1auuK79TzG- zKOYw=KEI03&;F+EGoHZpgNo1Z&JXa;c2ob6xR&^fR=#%t;#ak`fqICSYt~8=pHGtl z@pg;3FIk=RSoVDMt5n@;a5>7fHkf^n_b$frjm3OB!iQ2$>wRxmr8x69Ow#~Y&kiGa`Chxt*BW%oz< zf`Hkk8|JI{{G2xxpP%!f;`4JJRD6EU1H1}W@%fPo6`x=DL^ZF~JcCqC?I%y1v@1f+ zWYuZ%q1jE!Ln^{jW1X zeVELj$A=a&j#Pa9<3sWPqE_m|jLgp}KL7usH~g2jnaqu_&*h~s@n<*>T`MVy`knQK zP3on}ReXNRhQifmK=?5Dmx|9{ZT3`wUm62so=gbehO0s5actRDAxLxSUtk^F1f$IQX`>Z^Hh!iqFqkSMm8h@2h<}pLS<^w4O9QwobGSXckv$2 zpyVLozm#BB1>ZJhy(&Jx=n%;X3lmJZRq^?eE4V*-iTbuT_TjH8K0l{O#pm~2<(Po) zk-F#E!+mjoH9x@GT!46nW=$S|aUP1DxZ=28GGPA8C3B!S|KL=(i>yQP1%~|1L_UdN zo8UW%{GIr$`P?F-GblV@k#6cRCLEOch+I`1}qd zE+;BJzsrP*&kr=foT1|L0}m=bzdI&We142QrQ-9u{-EOX;gXsqu>o^LhP?Y6h@7=z|Vx;cC*m$Ba0 zqnEbf`JGn<4|T1}V-=n~EfhI$CCAS`5n9u2xr@yKXJ$2d;bM+AEXYinekw1UG9?(h z%GpncbZ|(Jb^QSCxmRP~H$nWF#&TxiI|4Wh+qG)?hh(HKj%Tk=jS}A z`22j%RPp(_KdAWpxF%8Y`Q7`biqFr-g^JIQ<03u4%D%wT{J7nia78*knp;@G-Wzu|#%5ohU> zItu+ITIvtG`7?{vK`(neoy(~B{OAMAwZgx5;ryuh{K$if&+pzZn2OKOc~J5BIS(p6 
zKj#5X`u{(Q&!5}pl%#d3XDzaBufv@Ga{JV)4<8REiMGn?sjNPmICSiU;S;;wJbc2$ zkz>bP(gR-pCp#M6Sn~TT>w>8BUv8f|&c)Ml-`xERBkhD;jLW!xV| zA6b~1*0e&&1pFd}yQUvml-hJ^g`%-L>WHG)lH5KG8+JyS_Vq6Cr0XFP*VXvDxmCwS aQ0^25yo|q{+}u758ajFP;D3%hm;FCNa$>Lm diff --git a/bench/python/run_bench.py b/bench/python/run_bench.py index ae2c69b0..8c00b43c 100644 --- a/bench/python/run_bench.py +++ b/bench/python/run_bench.py @@ -68,22 +68,25 @@ def run_bench( logger = logging.getLogger("run_bench") logger.info("Benchmarking started.") + # Set up various CUDA stuff. + cuda_device = cuda.Device(device_id) + cuda_ctx = cuda_device.retain_primary_context() + cuda_ctx.push() + # Use the the default stream for cvcuda and torch + # Since we never created a stream current will be the CUDA default stream + cvcuda_stream = cvcuda.Stream().current + torch_stream = torch.cuda.default_stream(device=cuda_device) + # Create an image batch decoder to supply us the input test data. decoder = ImageBatchDecoder( input_path, batch_size, device_id, - cuda_ctx=None, + cuda_ctx, + cvcuda_stream, cvcuda_perf=cvcuda_perf, ) - # Set up various CUDA stuff. - cuda_device = cuda.Device(device_id) - cuda_ctx = cuda_device.retain_primary_context() - cuda_ctx.push() - cvcuda_stream = cvcuda.Stream() - torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) - # Get a list of (class names, class types) of all the ops that can be profiled. ops_info_list = get_benchmark_eligible_ops_info() logger.info("Found a total of %d operators for benchmarking." % len(ops_info_list)) diff --git a/ci/check_formatting.sh b/ci/check_formatting.sh new file mode 100755 index 00000000..b91d518c --- /dev/null +++ b/ci/check_formatting.sh @@ -0,0 +1,42 @@ +#!/bin/bash -e + +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [ $# = 0 ]; then + # No arguments? Lint all code. + echo "Linting all code in the repository ==========================" + pre-commit run -a +else + from=$1 + if [ $# = 1 ]; then + to=HEAD + elif [ $# = 2 ]; then + to=$2 + else + echo "Invalid arguments" + echo "Usage: $(basename "$0") [ref_from [ref_to]]" + exit 1 + fi + + echo "Linting files touched from commit $from to $to ==============" + echo "Files to be linted:" + git diff --stat $from..$to + if ! pre-commit run --from-ref $from --to-ref $to ; then + echo "Formatting errors:" + git diff + false + fi +fi diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake index 24bc2453..88a2707c 100644 --- a/cmake/ConfigCUDA.cmake +++ b/cmake/ConfigCUDA.cmake @@ -38,9 +38,14 @@ if(NOT USE_CMAKE_CUDA_ARCHITECTURES) if(ENABLE_TEGRA) list(APPEND CMAKE_CUDA_ARCHITECTURES 72-real # Volta - gv11b/Tegra (Jetson AGX Xavier) - 86-real # Ampere - Jetson IGX Orin + 86-real # Jetson IGX Orin with optional Ampere RTX A6000 87-real # Ampere - ga10b,ga10c/Tegra (Jetson AGX Orin) ) + if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") + list(APPEND CMAKE_CUDA_ARCHITECTURES + 89-real # Jetson IGX Orin with optional RTX 6000 Ada + ) + endif() else() # All architectures we build sass for list(APPEND CMAKE_CUDA_ARCHITECTURES diff --git a/docker/config b/docker/config index aa84ebf0..56df16cb 100644 --- a/docker/config +++ b/docker/config @@ -27,5 +27,5 @@ TAG_IMAGE_SAMPLES=6.1 TAG_IMAGE_TEST=5 VER_CUDA=11.7.1 -VER_UBUNTU=22.04 +VER_UBUNTU=20.04 VER_TRT=24.01 diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py 
index a32ec33c..f7fac8e6 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,7 +33,7 @@ import sys project = "CV-CUDA" -copyright = "2022-2023, NVIDIA." +copyright = "2022-2024, NVIDIA." author = "NVIDIA" version = "Beta" release = version diff --git a/docs/sphinx/content/cvcuda_oplist.csv b/docs/sphinx/content/cvcuda_oplist.csv index bc4aecd5..85e45f08 100644 --- a/docs/sphinx/content/cvcuda_oplist.csv +++ b/docs/sphinx/content/cvcuda_oplist.csv @@ -16,15 +16,13 @@ CustomCrop,Crops an image with a given region-of-interest CvtColor,Converts an image from one color space to another DataTypeConvert,Converts an image’s data type with optional scaling Erase,Erases image regions -Find Contours,Extract closed contours from an input binary image -FindHomography,Calculates a perspective transform from four pairs of the corresponding points Flip,Flips a 2D image around its axis GammaContrast,Adjusts image contrast Gaussian,Applies a gaussian blur filter to the image Gaussian Noise,Generates a statistical noise with a normal (Gaussian) distribution Histogram,Provides a grayscale value distribution showing the frequency of occurrence of each gray value. Histogram Equalizer,Allows effective spreading out the intensity range of the image typically used to improve contrast -HqResize,Performs advanced resizing supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling. 
+HqResize, "Performs advanced resizing supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling." Inpainting,Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood Joint Bilateral Filter,Reduces image noise while preserving strong edges based on a guidance image Label,Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels @@ -35,11 +33,11 @@ MinMaxLoc,Finds the maximum and minimum values in a given array Morphology,Performs morphological erode and dilate transformations Morphology (close), Performs morphological operation that involves dilation followed by erosion on an image Morphology (open), Performs morphological operation that involves erosion followed by dilation on an image -Non-max Suppression,Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection +Non-Maximum Suppression,Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection Normalize,Normalizes an image pixel’s range OSD (Polyline Line Text Rotated Rect Segmented Mask),Displays an overlay on the image of of different forms including polyline line text rotated rectangle segmented mask PadStack,Stacks several images into a tensor with border extension -PairwiseMatcher,Matches features computed separately (e.g. via the SIFT operator) in two images, e.g. using the brute force method +PairwiseMatcher,"Matches features computed separately (e.g. via the SIFT operator) in two images, e.g. 
using the brute force method" PillowResize,Changes the size and scale of an image using python-pillow algorithm RandomResizedCrop,Crops a random portion of an image and resizes it to a specified size. Reformat,Converts a planar image into non-planar and vice versa @@ -47,7 +45,6 @@ Remap,Maps pixels in an image with one projection to another projection in a new Resize,Changes the size and scale of an image Rotate,Rotates a 2D array in multiples of 90 degrees SIFT,Identifies and matches features in images that are invariant to scale rotation and affine distortion. -Stack,Concatenates two input tensors into a single output tensor Thresholding,Chooses a global threshold value that is the same for all pixels across the image. WarpAffine,Applies an affine transformation to an image WarpPerspective,Applies a perspective transformation to an image diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 890d4426..254a0bf6 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -38,10 +38,9 @@ CV-CUDA includes: CV-CUDA Pre- and Post-Processing Operators ------------------ -CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find the operator that is right for your workflow below. +CV-CUDA offers a comprehensive collection of Computer Vision and Image Processing operators, listed below. - -.. csv-table:: +.. csv-table:: List of operators :file: content/cvcuda_oplist.csv :widths: 30, 70 :header-rows: 1 @@ -50,12 +49,13 @@ CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find Where Are the Release Notes? ------------------ -An awesome product requires excellent support. 
CV-CUDA release notes can be found `here `_. +CV-CUDA release notes can be found `here `_. Where Can I Get Help? ------------------ +An awesome product requires excellent support. File requests for enhancements and bug reports `here `_. @@ -97,7 +97,7 @@ NVIDIA, the NVIDIA logo, NVIDIA CV-CUDA, and NVIDIA TensorRT are trademarks and/ Copyright -------------------- -© 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +© 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. @@ -124,6 +124,7 @@ Copyright :maxdepth: 1 :hidden: + Beta.5 Beta.4 Beta.3 Beta.2 diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index 5e213d53..6c05a33d 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -24,14 +24,15 @@ Pre-requisites This section describes the recommended dependencies to install CV-CUDA. -* Ubuntu >= 20.04 -* CUDA driver >= 11.7 +* Ubuntu >= 20.04 (22.04 recommended for building the documentation) +* CUDA >= 11.7 (cuda 12 required for samples) +* NVIDIA driver r525 or later (r535 required for samples) Setup ----- The following steps describe how to install CV-CUDA. Choose the installation method that meets your environment needs. -You can download the CV-CUDA tar, deb or wheel packages from `here `_ +You can download the CV-CUDA tar, deb or wheel packages from `the asset section `_ * Tar File Installation @@ -73,11 +74,11 @@ You can download the CV-CUDA tar, deb or wheel packages from `here `_ + Download the appropriate .whl file for your computer architecture, Python and CUDA version from `here `_ Execute the following command to install appropriate CV-CUDA Python wheel :: - pip install cvcuda_-0.6.0b0-cp-cp-linux_.whl + pip install cvcuda_-0.7.0b0-cp-cp-linux_.whl where is the desired CUDA version, the desired Python version and the desired architecture. 
diff --git a/docs/sphinx/relnotes/v0.7.0-beta.rst b/docs/sphinx/relnotes/v0.7.0-beta.rst new file mode 100644 index 00000000..5ad3ae43 --- /dev/null +++ b/docs/sphinx/relnotes/v0.7.0-beta.rst @@ -0,0 +1,69 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.7.0-beta: + +Beta.5 +====== + +CV-CUDA 0.7.0 introduces performance and support enhancements, along with bug fixes and new features. + +Release Highlights +------------------ + +CV-CUDA v0.7.0 includes the following improvements: + +* **New Features**: + + * Optimized Python bindings: near-zero overhead compared to C++ calls​ + + * Added masking option to Label operator: conditional island removal + + * Added IGX Orin support (with dGPU, Ampere or Ada RTX6000)​ + + * Added support of signed 32bits output datatype for Label operator​ + +* **Removed Operator**:​ + + * Removed Find Contours operator for troubleshooting of major limitations + +* **Bug Fixes**: + + * Fixed constraint on installation directory for Python tests​: tar test packages can now be used from any directory​ + + +Compatibility and Known Limitations +----------------------------------- + +See main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. 
`CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 7647d049..65f61d87 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.20.1) project(cvcuda_python CXX C) diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt index 45ecc94e..66b53d87 100644 --- a/python/mod_cvcuda/CMakeLists.txt +++ b/python/mod_cvcuda/CMakeLists.txt @@ -29,7 +29,6 @@ nvcv_python_add_module( OpLabel.cpp LabelType.cpp ConnectivityType.cpp - OpFindContours.cpp OpHistogramEq.cpp OpOSD.cpp OpAdvCvtColor.cpp diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp index 130d0168..aff67174 100644 --- a/python/mod_cvcuda/Main.cpp +++ b/python/mod_cvcuda/Main.cpp @@ -94,7 +94,6 @@ PYBIND11_MODULE(cvcuda, m) // CV-CUDA Operators ExportOpPairwiseMatcher(m); ExportOpLabel(m); - ExportOpFindContours(m); ExportOpOSD(m); ExportOpHistogramEq(m); ExportOpAdvCvtColor(m); diff --git a/python/mod_cvcuda/OpFindContours.cpp b/python/mod_cvcuda/OpFindContours.cpp deleted file mode 100644 index 137bf645..00000000 --- a/python/mod_cvcuda/OpFindContours.cpp +++ /dev/null @@ -1,134 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Operators.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cvcudapy { - -namespace { - -using TupleTensor2 = std::tuple; - -TupleTensor2 FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::optional pstream) -{ - if (!pstream) - { - pstream = Stream::Current(); - } - - nvcv::Size2D size{static_cast(input.shape()[2]), static_cast(input.shape()[1])}; - auto findContours = CreateOperator(size, static_cast(input.shape()[0])); - - ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_MODE_READ, {input}); - guard.add(LockMode::LOCK_MODE_WRITE, {points}); - guard.add(LockMode::LOCK_MODE_WRITE, {numPoints}); - guard.add(LockMode::LOCK_MODE_READWRITE, {*findContours}); - - findContours->submit(pstream->cudaHandle(), input, points, numPoints); - - return TupleTensor2(std::move(points), std::move(numPoints)); -} - -TupleTensor2 FindContours(Tensor &input, std::optional pstream) -{ - auto pointShape = nvcv::TensorShape{ - {input.shape()[0], cvcuda::FindContours::MAX_TOTAL_POINTS, 2}, - nvcv::TENSOR_NHW - }; - Tensor points = Tensor::Create(pointShape, nvcv::TYPE_S32); - - auto countShape = nvcv::TensorShape{ - {input.shape()[0], cvcuda::FindContours::MAX_NUM_CONTOURS}, - nvcv::TENSOR_NW - }; - Tensor numPoints = Tensor::Create(countShape, nvcv::TYPE_S32); - - 
return FindContoursInto(points, numPoints, input, pstream); -} - -} // namespace - -void ExportOpFindContours(py::module &m) -{ - using namespace pybind11::literals; - py::options options; - options.disable_function_signatures(); - - m.def("find_contours", &FindContours, "image"_a, "stream"_a = nullptr, R"pbdoc( - - cvcuda.find_contours(src : nvcv.Tensor, stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor - Executes the FindContours operation on the given cuda stream. - - See also: - Refer to the CV-CUDA C API reference for the FindContours operator - for more details and usage examples. - - Args: - src (Tensor): Input tensor containing one or more images. - stream (Stream, optional): CUDA Stream on which to perform the operation. - - Returns: - Tuple[Tensor, Tensor]: A tuple of two tensors. The first is the contour points tensor with dimensions NxMx2 - - where N is the batch size, M is the maximum number of points allowed. Each point of the contour is specified - in (x, y) coordinates. The second tensor specifies the number of valid contours per image and the number of - valid points in those contours. It has dimensions NxC where N is the batch size and C is the maximum number - of contours found. The actual number of contours can be calculated by counting the number of non-zero elements - in the C dimension and the actual number of points in each of those contours are the values stored in the C dimension. - - Caution: - Restrictions to several arguments may apply. Check the C - API references of the CV-CUDA operator. - )pbdoc"); - - m.def("find_contours_into", &FindContoursInto, "points"_a, "num_points"_a, "src"_a, "stream"_a = nullptr, R"pbdoc( - - cvcuda.find_contours_into(points : nvcv.Tensor, num_points : nvcv.Tensor, src : Tensor, stream: Optional[nvcv.cuda.Stream] = None) - Executes the FindContours operation on the given cuda stream. 
- - See also: - Refer to the CV-CUDA C API reference for the FindContours operator - for more details and usage examples. - - Args: - points (Tensor): Output tensor to store the coordinates of each contour point. - num_points (Tensor): Output tensor to store the number of points in a contour. - src (Tensor): Input tensor containing one or more images. - stream (Stream, optional): CUDA Stream on which to perform the operation. - - Returns: - None - - Caution: - Restrictions to several arguments may apply. Check the C - API references of the CV-CUDA operator. - )pbdoc"); -} - -} // namespace cvcudapy diff --git a/python/mod_cvcuda/OpLabel.cpp b/python/mod_cvcuda/OpLabel.cpp index 1d45618d..c93158ac 100644 --- a/python/mod_cvcuda/OpLabel.cpp +++ b/python/mod_cvcuda/OpLabel.cpp @@ -33,9 +33,9 @@ using TupleTensor3 = std::tuple, std::optional count, std::optional stats, Tensor &input, - NVCVConnectivityType connectivity, NVCVLabelType assignLabels, std::optional bgLabel, - std::optional minThresh, std::optional maxThresh, std::optional minSize, - std::optional pstream) + NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType, + std::optional bgLabel, std::optional minThresh, std::optional maxThresh, + std::optional minSize, std::optional mask, std::optional pstream) { if (!pstream) { @@ -73,20 +73,26 @@ TupleTensor3 LabelInto(Tensor &output, std::optional count, std::optiona { guard.add(LockMode::LOCK_MODE_READ, {*minSize}); } + if (mask) + { + guard.add(LockMode::LOCK_MODE_READ, {*mask}); + } op->submit(pstream->cudaHandle(), input, output, (bgLabel ? *bgLabel : nvcv::Tensor{nullptr}), (minThresh ? *minThresh : nvcv::Tensor{nullptr}), (maxThresh ? *maxThresh : nvcv::Tensor{nullptr}), (minSize ? *minSize : nvcv::Tensor{nullptr}), (count ? *count : nvcv::Tensor{nullptr}), - (stats ? *stats : nvcv::Tensor{nullptr}), connectivity, assignLabels); + (stats ? *stats : nvcv::Tensor{nullptr}), (mask ? 
*mask : nvcv::Tensor{nullptr}), connectivity, + assignLabels, maskType); return TupleTensor3(std::move(output), count, stats); } -TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelType assignLabels, bool count, bool stats, - int maxLabels, std::optional bgLabel, std::optional minThresh, - std::optional maxThresh, std::optional minSize, std::optional pstream) +TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelType assignLabels, + NVCVLabelMaskType maskType, bool count, bool stats, int maxLabels, std::optional bgLabel, + std::optional minThresh, std::optional maxThresh, std::optional minSize, + std::optional mask, std::optional pstream) { - constexpr nvcv::DataType outType = nvcv::TYPE_U32; + constexpr nvcv::DataType outType = nvcv::TYPE_S32; auto inputData = input.exportData(); if (!inputData) @@ -112,11 +118,11 @@ TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelTy int numStats = 1; if (connectivity == NVCV_CONNECTIVITY_4_2D || connectivity == NVCV_CONNECTIVITY_8_2D) { - numStats = 6; + numStats = 7; } if (connectivity == NVCV_CONNECTIVITY_6_3D || connectivity == NVCV_CONNECTIVITY_26_3D) { - numStats = 8; + numStats = 9; } statsTensor = Tensor::Create( @@ -127,8 +133,8 @@ TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelTy outType); } - return LabelInto(output, countTensor, statsTensor, input, connectivity, assignLabels, bgLabel, minThresh, maxThresh, - minSize, pstream); + return LabelInto(output, countTensor, statsTensor, input, connectivity, assignLabels, maskType, bgLabel, minThresh, + maxThresh, minSize, mask, pstream); } } // namespace @@ -137,9 +143,14 @@ void ExportOpLabel(py::module &m) { using namespace pybind11::literals; + py::enum_(m, "LabelMaskType", py::arithmetic()) + .value("REMOVE_ISLANDS_OUTSIDE_MASK_ONLY", NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY) + .export_values(); + m.def("label", &Label, "src"_a, "connectivity"_a = 
NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, - py::kw_only(), "count"_a = false, "stats"_a = false, "max_labels"_a = 10000, "bg_label"_a = nullptr, - "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, "stream"_a = nullptr, R"pbdoc( + "mask_type"_a = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, py::kw_only(), "count"_a = false, "stats"_a = false, + "max_labels"_a = 10000, "bg_label"_a = nullptr, "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, + "min_size"_a = nullptr, "mask"_a = nullptr, "stream"_a = nullptr, R"pbdoc( Executes the Label operation on the given cuda stream. @@ -152,6 +163,8 @@ void ExportOpLabel(py::module &m) default is cvcuda.CONNECTIVITY_4_2D. assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned, default is cvcuda.LABEL.FAST. + mask_type (cvcuda.LabelMaskType, optional): Choice on how the mask is used, + default is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY. count (bool, optional): Use True to return the count of valid labeled regions. stats (bool, optional): Use True to return the statistics of valid labeled regions. max_labels (Number, optional): Maximum number of labels to compute statistics for, default is 10000. @@ -161,6 +174,10 @@ void ExportOpLabel(py::module &m) max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1. min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of elements less than the minimum size. + mask (Tensor, optional): Mask tensor, its behavior is controlled by \ref mask_type. One choice is to + control island removal in addition to \ref min_size, i.e. regions with at + least one element inside the mask (non-zero values) are not removed in case + mask_type is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY. stream (Stream, optional): CUDA Stream on which to perform the operation. 
Returns: @@ -172,8 +189,9 @@ void ExportOpLabel(py::module &m) )pbdoc"); m.def("label_into", &LabelInto, "dst"_a, "count"_a = nullptr, "stats"_a = nullptr, "src"_a, - "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, py::kw_only(), - "bg_label"_a = nullptr, "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, + "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, + "mask_type"_a = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, py::kw_only(), "bg_label"_a = nullptr, + "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, "mask"_a = nullptr, "stream"_a = nullptr, R"pbdoc( Executes the Label operation on the given cuda stream. @@ -190,12 +208,18 @@ void ExportOpLabel(py::module &m) default is cvcuda.CONNECTIVITY_4_2D. assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned, default is cvcuda.LABEL.FAST. + mask_type (cvcuda.LabelMaskType, optional): Choice on how the mask is used, + default is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY. bg_label (Tensor, optional): Background tensor to define input values to be considered background labels and thus ignored. min_thresh (Tensor, optional): Minimum threshold tensor to mask input values below it to be 0, and others 1. max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1. min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of elements less than the minimum size. + mask (Tensor, optional): Mask tensor, its behavior is controlled by \ref mask_type. One choice is to + control island removal in addition to \ref min_size, i.e. regions with at + least one element inside the mask (non-zero values) are not removed in case + mask_type is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY. stream (Stream, optional): CUDA Stream on which to perform the operation. 
Returns: diff --git a/python/mod_cvcuda/OpResize.cpp b/python/mod_cvcuda/OpResize.cpp index 7d42dcce..a8e41fab 100644 --- a/python/mod_cvcuda/OpResize.cpp +++ b/python/mod_cvcuda/OpResize.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -173,7 +173,7 @@ void ExportOpResize(py::module &m) stream (Stream, optional): CUDA Stream on which to perform the operation. Returns: - cvcuda.Tensor: The output tensor. + cvcuda.ImageBatchVarShape: The output image batch. Caution: Restrictions to several arguments may apply. Check the C diff --git a/python/mod_cvcuda/Operators.hpp b/python/mod_cvcuda/Operators.hpp index b48f11fb..6197e43b 100644 --- a/python/mod_cvcuda/Operators.hpp +++ b/python/mod_cvcuda/Operators.hpp @@ -49,7 +49,6 @@ using nvcvpy::TensorBatch; namespace util = nvcvpy::util; namespace py = ::pybind11; -void ExportOpFindContours(py::module &m); void ExportOpReformat(py::module &m); void ExportOpResize(py::module &m); void ExportOpCustomCrop(py::module &m); diff --git a/python/mod_nvcv/CAPI.cpp b/python/mod_nvcv/CAPI.cpp index e15f6eff..b31fc27e 100644 --- a/python/mod_nvcv/CAPI.cpp +++ b/python/mod_nvcv/CAPI.cpp @@ -105,14 +105,9 @@ LockMode ToLockMode(PyObject *_mode) } } -extern "C" void ImplResource_SubmitSync(PyObject *res, PyObject *stream, PyObject *lockMode) +extern "C" void ImplResource_SubmitSync(PyObject *res, PyObject *stream) { - ToSharedObj(res)->submitSync(*ToSharedObj(stream), ToLockMode(lockMode)); -} - -extern "C" void ImplResource_SubmitSignal(PyObject *res, PyObject *stream, PyObject *lockMode) -{ - ToSharedObj(res)->submitSignal(*ToSharedObj(stream), ToLockMode(lockMode)); + ToSharedObj(res)->submitSync(*ToSharedObj(stream)); } extern "C" void 
ImplStream_HoldResources(PyObject *stream, PyObject *resourceList) @@ -294,7 +289,6 @@ void ExportCAPI(py::module &m) .ImageFormat_ToPython = &ImplImageFormat_ToPython, .ImageFormat_FromPython = &ImplImageFormat_FromPython, .Resource_SubmitSync = &ImplResource_SubmitSync, - .Resource_SubmitSignal = &ImplResource_SubmitSignal, .Stream_HoldResources = &ImplStream_HoldResources, .Stream_GetCurrent = &ImplStream_GetCurrent, .Stream_GetCudaHandle = &ImplStream_GetCudaHandle, diff --git a/python/mod_nvcv/Resource.cpp b/python/mod_nvcv/Resource.cpp index afe57156..a8d0fe67 100644 --- a/python/mod_nvcv/Resource.cpp +++ b/python/mod_nvcv/Resource.cpp @@ -30,24 +30,21 @@ Resource::Resource() m_id = idnext++; - m_readEvent = m_writeEvent = nullptr; + m_event = nullptr; try { - util::CheckThrow(cudaEventCreateWithFlags(&m_readEvent, cudaEventDisableTiming)); - util::CheckThrow(cudaEventCreateWithFlags(&m_writeEvent, cudaEventDisableTiming)); + util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); } catch (...) 
{ - cudaEventDestroy(m_readEvent); - cudaEventDestroy(m_writeEvent); + cudaEventDestroy(m_event); throw; } } Resource::~Resource() { - cudaEventDestroy(m_readEvent); - cudaEventDestroy(m_writeEvent); + cudaEventDestroy(m_event); } uint64_t Resource::id() const @@ -55,62 +52,29 @@ uint64_t Resource::id() const return m_id; } -void Resource::submitSignal(Stream &stream, LockMode mode) const +void Resource::submitSync(Stream &stream) { - doBeforeSubmitSignal(stream, mode); - - if (mode & LOCK_MODE_READ) - { - util::CheckThrow(cudaEventRecord(m_readEvent, stream.handle())); - } - if (mode & LOCK_MODE_WRITE) + //Check if we have a last stream, if not set it to the current stream + if (!m_lastStream.has_value()) { - util::CheckThrow(cudaEventRecord(m_writeEvent, stream.handle())); + m_lastStream.emplace(stream.shared_from_this()); //store a shared pointer to the stream } -} - -void Resource::submitSync(Stream &stream, LockMode mode) const -{ - doBeforeSubmitSync(stream, mode); - doSubmitSync(stream, mode); -} - -void Resource::doSubmitSync(Stream &stream, LockMode mode) const -{ - if (mode & LOCK_MODE_WRITE) - { - util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_writeEvent)); - util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_readEvent)); - } - else if (mode & LOCK_MODE_READ) + // if we are on the same stream we dont need to do anything + // as streams are sequential and we can assume that the last operation on the stream is done + if (m_lastStream.value()->handle() == stream.handle()) { - util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_writeEvent)); + return; } -} - -void Resource::sync(LockMode mode) const -{ - py::gil_scoped_release release; - doBeforeSync(mode); + // if we are on a different stream we need to wait for that stream to finish + // write event on the old stream, the new stream will have to wait for it to be done + util::CheckThrow(cudaEventRecord(m_event, m_lastStream.value()->handle())); + 
util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_event)); - doSync(mode); -} - -void Resource::doSync(LockMode mode) const -{ - NVCV_ASSERT(PyGILState_Check() == 0); - - if (mode & LOCK_MODE_WRITE) - { - util::CheckThrow(cudaEventSynchronize(m_writeEvent)); - util::CheckThrow(cudaEventSynchronize(m_readEvent)); - } - else if (mode & LOCK_MODE_READ) - { - util::CheckThrow(cudaEventSynchronize(m_writeEvent)); - } + // update the last stream since we changed streams + m_lastStream.reset(); + m_lastStream.emplace(stream.shared_from_this()); } std::shared_ptr Resource::shared_from_this() @@ -127,8 +91,7 @@ void Resource::Export(py::module &m) { py::class_>(m, "Resource") .def_property_readonly("id", &Resource::id, "Unique resource instance identifier") - .def("submitSync", &Resource::submitSync) - .def("submitSignal", &Resource::submitSignal); + .def("submitStreamSync", &Resource::submitSync, "Syncs object on new Stream"); } } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/Resource.hpp b/python/mod_nvcv/Resource.hpp index 21e7cc18..010c8b33 100644 --- a/python/mod_nvcv/Resource.hpp +++ b/python/mod_nvcv/Resource.hpp @@ -19,6 +19,7 @@ #define NVCV_PYTHON_PRIV_RESOURCE_HPP #include "Object.hpp" +#include "Stream.hpp" #include #include @@ -32,42 +33,64 @@ typedef struct CUevent_st *cudaEvent_t; namespace nvcvpy::priv { namespace py = pybind11; -class Stream; - +/** + * @brief A class representing a CUDA resource. + * + * This class encapsulates a CUDA resource and provides methods for synchronization + * with CUDA streams. + */ class PYBIND11_EXPORT Resource : public virtual Object { public: + /** + * @brief Destructor. + */ ~Resource(); + /** + * @brief Export the Resource class to Python. + * + * @param m The Python module to export the class to. + */ static void Export(py::module &m); + /** + * @brief Get the unique identifier of the resource. + * + * @return uint64_t The unique identifier of the resource. 
+ */ uint64_t id() const; - void submitSync(Stream &stream, LockMode mode) const; - void submitSignal(Stream &stream, LockMode mode) const; - - // Assumes GIL is locked (is in acquired state) - void sync(LockMode mode) const; - - std::shared_ptr shared_from_this(); + /** + * @brief Submit the resource for synchronization with a CUDA stream. + * + * This method synchronizes the resource with the specified CUDA stream. + * + * @param stream The CUDA stream to synchronize with. + */ + void submitSync(Stream &stream); + + /** + * @brief Get a shared pointer to this resource. + * + * @return std::shared_ptr A shared pointer to this resource. + */ + std::shared_ptr shared_from_this(); + + /** + * @brief Get a shared pointer to this const resource. + * + * @return std::shared_ptr A shared pointer to this const resource. + */ std::shared_ptr shared_from_this() const; protected: Resource(); - void doSubmitSync(Stream &stream, LockMode mode) const; - - // Assumes GIL is not locked (is in released state) - void doSync(LockMode mode) const; - private: - // To be overriden by children if they have their own requirements - virtual void doBeforeSync(LockMode mode) const {}; - virtual void doBeforeSubmitSync(Stream &stream, LockMode mode) const {}; - virtual void doBeforeSubmitSignal(Stream &stream, LockMode mode) const {}; - - uint64_t m_id; - cudaEvent_t m_readEvent, m_writeEvent; + uint64_t m_id; /**< The unique identifier of the resource. */ + cudaEvent_t m_event; /**< The CUDA event used for synchronization. */ + std::optional> m_lastStream; /**< Cache the last stream used for this resource. 
*/ }; } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/Stream.cpp b/python/mod_nvcv/Stream.cpp index 4c120499..bd3c1f9d 100644 --- a/python/mod_nvcv/Stream.cpp +++ b/python/mod_nvcv/Stream.cpp @@ -28,6 +28,11 @@ namespace nvcvpy::priv { +// Static members initialization +cudaStream_t Stream::m_auxStream = nullptr; +std::atomic Stream::m_instanceCount = 0; +std::mutex Stream::m_auxStreamMutex; + // Here we define the representation of external cuda streams. // It defines pybind11's type casters from the python object // to the corresponding ExternalStream. @@ -193,7 +198,18 @@ std::shared_ptr Stream::Create() Stream::Stream() : m_owns(true) { - util::CheckThrow(cudaStreamCreate(&m_handle)); + try + { + util::CheckThrow(cudaStreamCreateWithFlags(&m_handle, cudaStreamNonBlocking)); + incrementInstanceCount(); + GetAuxStream(); + util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); + } + catch (...) + { + destroy(); + throw; + } } Stream::Stream(IExternalStream &extStream) @@ -206,14 +222,72 @@ Stream::Stream(IExternalStream &extStream) { throw std::runtime_error("Invalid cuda stream"); } + + try + { + incrementInstanceCount(); + GetAuxStream(); // Make sure the singleton aux stream is created + util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); + } + catch (...) 
+ { + destroy(); + throw; + } +} + +void Stream::incrementInstanceCount() +{ + m_instanceCount.fetch_add(1, std::memory_order_relaxed); +} + +int Stream::decrementInstanceCount() +{ + return m_instanceCount.fetch_sub(1, std::memory_order_acq_rel) - 1; +} + +cudaStream_t &Stream::GetAuxStream() +{ + if (!m_auxStream) + { + std::lock_guard lock(m_auxStreamMutex); + if (!m_auxStream) + { + util::CheckThrow(cudaStreamCreateWithFlags(&m_auxStream, cudaStreamNonBlocking)); + } + } + return m_auxStream; } Stream::~Stream() +{ + destroy(); +} + +void Stream::destroy() { if (m_owns) { - util::CheckLog(cudaStreamSynchronize(m_handle)); - util::CheckLog(cudaStreamDestroy(m_handle)); + if (m_handle) + { + util::CheckLog(cudaStreamSynchronize(m_handle)); + util::CheckLog(cudaStreamDestroy(m_handle)); + m_handle = nullptr; + } + } + { + std::lock_guard lock(m_auxStreamMutex); + if (m_auxStream && decrementInstanceCount() == 0) + { + util::CheckThrow(cudaStreamSynchronize(m_auxStream)); + util::CheckThrow(cudaStreamDestroy(m_auxStream)); + m_auxStream = nullptr; + } + } + if (m_event) + { + util::CheckThrow(cudaEventDestroy(m_event)); + m_event = nullptr; } } @@ -240,7 +314,6 @@ intptr_t Stream::pyhandle() const void Stream::sync() { py::gil_scoped_release release; - util::CheckThrow(cudaStreamSynchronize(m_handle)); } @@ -283,8 +356,34 @@ void Stream::holdResources(LockResources usedResources) delete pclosure; }; - util::CheckThrow(cudaStreamAddCallback(m_handle, fn, closure.get(), 0)); - + // If we naively execute the callback in the main stream (m_handle), the GPU will wait until the callback + // is executed (on host). For correctness, GPU doesn't need to wait - it's the CPU that needs + // to wait for the work already scheduled to complete. 
+ // + // Naive timeline: + // + // stream GPU_kernel1 | Callback | GPU_kernel2 + // GPU activity xxxxxxxxxxx xxxxxxxxxxx + // CPU activity xxxxxxxx + // + // Optimized timeline + // + // + // event -----v + // stream GPU_kernel1 | GPU_kernel2 + // aux_stream waitEvent >| Callback + // + // GPU activity xxxxxxxxxxx xxxxxxxxxxx + // CPU activity xxxxxxxx + + util::CheckThrow(cudaEventRecord(m_event, m_handle)); // add async record the event in the main stream + util::CheckThrow( + cudaStreamWaitEvent(GetAuxStream(), m_event)); // add async wait for the event in the aux stream + // The callback will be executed in the singleton aux stream; there may be contention with other callbacks and waitEvents from + // other streams. However, the callback is used to release resources from the cache and should not be a performance bottleneck. + // This avoids opening a new aux stream for each stream object. + util::CheckThrow( + cudaStreamAddCallback(GetAuxStream(), fn, closure.get(), 0)); // add async callback in the aux stream closure.release(); } } @@ -322,6 +421,8 @@ void Stream::Export(py::module &m) ExportExternalStream(m); ExportExternalStream(m); + fflush(stdout); + stream.def("__enter__", &Stream::activate, "Activate the CUDA stream as the current stream for this thread.") .def("__exit__", &Stream::deactivate, "Deactivate the CUDA stream as the current stream for this thread.") .def("sync", &Stream::sync, "Wait for all preceding CUDA calls in the current stream to complete.") diff --git a/python/mod_nvcv/Stream.hpp b/python/mod_nvcv/Stream.hpp index 81a3fc9f..2dcceb72 100644 --- a/python/mod_nvcv/Stream.hpp +++ b/python/mod_nvcv/Stream.hpp @@ -24,8 +24,10 @@ #include #include +#include #include #include +#include #include #include @@ -51,7 +53,7 @@ class PYBIND11_EXPORT Stream : public CacheItem static std::shared_ptr Create(); - ~Stream(); + virtual ~Stream(); std::shared_ptr shared_from_this(); std::shared_ptr shared_from_this() const; @@ -75,6 +77,8 @@ class 
PYBIND11_EXPORT Stream : public CacheItem Stream(Stream &&) = delete; Stream(); + // Singleton access to the auxiliary CUDA stream + class Key final : public IKey { private: @@ -88,9 +92,22 @@ class PYBIND11_EXPORT Stream : public CacheItem return key; } - bool m_owns; - cudaStream_t m_handle; + void destroy(); + + bool m_owns = false; + cudaStream_t m_handle = nullptr; + cudaEvent_t m_event = nullptr; py::object m_wrappedObj; + + //singleton aux stream and protection. this is a bit overkill + //for now as Python is single threaded, but it is a good practice + static std::mutex m_auxStreamMutex; + static std::atomic m_instanceCount; + static cudaStream_t m_auxStream; + + static void incrementInstanceCount(); + static int decrementInstanceCount(); + static cudaStream_t &GetAuxStream(); }; } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/include/nvcv/python/CAPI.hpp b/python/mod_nvcv/include/nvcv/python/CAPI.hpp index db5f200a..664ed87b 100644 --- a/python/mod_nvcv/include/nvcv/python/CAPI.hpp +++ b/python/mod_nvcv/include/nvcv/python/CAPI.hpp @@ -44,8 +44,7 @@ struct CAPI PyObject *(*ImageFormat_ToPython)(NVCVImageFormat p); NVCVImageFormat (*ImageFormat_FromPython)(PyObject *obj); - void (*Resource_SubmitSync)(PyObject *res, PyObject *stream, PyObject *lockMode); - void (*Resource_SubmitSignal)(PyObject *res, PyObject *stream, PyObject *lockMode); + void (*Resource_SubmitSync)(PyObject *res, PyObject *stream); void (*Stream_HoldResources)(PyObject *stream, PyObject *resources); PyObject *(*Stream_GetCurrent)(); diff --git a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp index 40967a84..5ad2bae5 100644 --- a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp +++ b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp @@ -62,43 +62,16 @@ class ResourceGuard for (const std::reference_wrapper &r : resources) { py::object pyRes = r.get(); - - capi().Resource_SubmitSync(pyRes.ptr(),
m_pyStream.ptr(), pyLockMode.ptr()); + capi().Resource_SubmitSync(pyRes.ptr(), m_pyStream.ptr()); m_resourcesPerLockMode.append(std::make_pair(pyLockMode, std::move(pyRes))); } + return *this; } void commit() { capi().Stream_HoldResources(m_pyStream.ptr(), m_resourcesPerLockMode.ptr()); - - py::list newList; - - auto it = m_resourcesPerLockMode.begin(); - try - { - // Try to signal the resources, stop on the first that fails, or - // when all resources were signaled - for (; it != m_resourcesPerLockMode.end(); ++it) - { - py::tuple t = it->cast(); - - // resource, stream, lockmode - capi().Resource_SubmitSignal(t[1].ptr(), m_pyStream.ptr(), t[0].ptr()); - } - } - catch (...) - { - // Add all resources that weren't signaled to the newList. - for (; it != m_resourcesPerLockMode.end(); ++it) - { - newList.append(std::move(*it)); - } - throw; - } - - m_resourcesPerLockMode = std::move(newList); } private: diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index ca2ee0c2..806192fe 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.22) +cmake_minimum_required(VERSION 3.20.1) project(nvcv_samples LANGUAGES CXX) find_package(CUDAToolkit REQUIRED) diff --git a/samples/classification/python/main.py b/samples/classification/python/main.py index f12c95f5..cae6131e 100644 --- a/samples/classification/python/main.py +++ b/samples/classification/python/main.py @@ -81,8 +81,10 @@ def run_sample( cuda_device = cuda.Device(device_id) cuda_ctx = cuda_device.retain_primary_context() cuda_ctx.push() - cvcuda_stream = cvcuda.Stream() - torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) + # Use the default stream for cvcuda and torch + # Since we never created a stream, current will be the CUDA default stream + cvcuda_stream = cvcuda.Stream().current + torch_stream = torch.cuda.default_stream(device=cuda_device) # docs_tag: end_setup_gpu # docs_tag: begin_setup_stages @@ -96,6 +98,7 @@ def run_sample( batch_size, device_id, cuda_ctx, + cvcuda_stream, cvcuda_perf, ) @@ -106,6 +109,7 @@ def run_sample( batch_size, device_id, cuda_ctx, + cvcuda_stream, cvcuda_perf, ) diff --git a/samples/common/python/nvcodec_utils.py b/samples/common/python/nvcodec_utils.py index 2a300d38..420e15fe 100644 --- a/samples/common/python/nvcodec_utils.py +++ b/samples/common/python/nvcodec_utils.py @@ -68,6 +68,7 @@ def __init__( batch_size, device_id, cuda_ctx, + cuda_stream, cvcuda_perf, ): # docs_tag: begin_init_videobatchdecoder_pyvideocodec @@ -76,7 +77,7 @@ def __init__( self.batch_size = batch_size self.device_id = device_id self.cuda_ctx = cuda_ctx - self.cuda_stream = cvcuda.Stream().current + self.cuda_stream = cuda_stream self.cvcuda_perf = cvcuda_perf self.total_decoded = 0 self.batch_idx = 0 @@ -229,6 +230,7 @@ def __init__( fps, device_id, cuda_ctx, + cuda_stream, cvcuda_perf, ): self.logger = logging.getLogger(__name__) @@ -236,7 +238,7 @@ def __init__( self.fps = fps self.device_id = device_id self.cuda_ctx = cuda_ctx - self.cuda_stream = cvcuda.Stream().current +
self.cuda_stream = cuda_stream self.cvcuda_perf = cvcuda_perf self.encoder = None @@ -327,7 +329,7 @@ def start(self): pass def join(self): - self.encoder.flush() + # self.encoder.flush() self.logger.info("Wrote: %s" % self.output_file_name) @@ -482,6 +484,7 @@ def __init__( batch_size, device_id, cuda_ctx, + cuda_stream, cvcuda_perf, ): @@ -493,7 +496,7 @@ def __init__( self.total_decoded = 0 self.batch_idx = 0 self.cuda_ctx = cuda_ctx - self.cuda_stream = cvcuda.Stream().current + self.cuda_stream = cuda_stream self.cvcuda_perf = cvcuda_perf self.decoder = nvimgcodec.Decoder(device_id=device_id) diff --git a/samples/label/python/label.py b/samples/label/python/label.py new file mode 100644 index 00000000..0d8fcf21 --- /dev/null +++ b/samples/label/python/label.py @@ -0,0 +1,215 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import torch +import torchvision + +import cvcuda + + +def parse_arguments(): + """Parse this program script arguments.""" + + parser = argparse.ArgumentParser(prog="label", description="Labels an input image.") + + parser.add_argument("input", type=str, help="Input image png file path.") + parser.add_argument( + "output", + nargs="?", + default="out.png", + type=str, + help="Output image png file path. 
Defaults to out.png.", + ) + parser.add_argument( + "--max_labels", + default=1000, + type=int, + help="Maximum number of labels. Defaults to 1000.", + ) + parser.add_argument( + "--min_threshold", + default=None, + type=int, + help="Minimum threshold to binarize input. Defaults to no minimum threshold.", + ) + parser.add_argument( + "--max_threshold", + default=None, + type=int, + help="Maximum threshold to binarize input. Defaults to no maximum threshold.", + ) + parser.add_argument( + "--min_size", + default=None, + type=int, + help="Minimum size to prevent a region to be removed. Defaults to no minimum size (no removals).", + ) + parser.add_argument( + "--mask", + action=argparse.BooleanOptionalAction, + help="Apply mask to protect center islands (small regions). Defaults to no mask.", + ) + parser.add_argument( + "--background_label", + default=0, + type=int, + help="Background label. Defaults to zero.", + ) + + return parser.parse_args() + + +def color_labels( + h_labels_hw, + bgl, + bgc=torch.as_tensor([0, 0, 0], dtype=torch.uint8), + fgc=torch.as_tensor([255, 255, 255], dtype=torch.uint8), + cmap=None, +): + """Convert labels to colors + + Args: + h_labels_hw (Tensor): Tensor with labels + bgl (int): Background label + bgc (Tensor): Background color, this color is used for the background label + fgc (Tensor): Foreground color, this color is used when cmap is None + cmap (function): Colormap, e.g. 
matplotlib.colormaps["jet"] + + Returns: + Tensor: Tensor with colors + """ + # Create an empty Tensor with same height and width as the labels Tensor and Channel = 3 for RGB + h_out_hwc = torch.empty( + (h_labels_hw.shape[0], h_labels_hw.shape[1], 3), dtype=torch.uint8 + ) + + # Set all values to be the background color + h_out_hwc[:, :] = bgc + + # Get the unique set of labels except background label from the labels Tensor + h_uniq = torch.unique(h_labels_hw) + h_uniq = h_uniq[h_uniq != bgl] + + # Set the label RGB color to be the foreground color + label_rgb = fgc + + for i, label in enumerate(h_uniq): + if cmap is not None: + # If a color map was provided, use it to generate the label color + label_rgb = [int(c * 255) for c in cmap(i / h_uniq.shape[0])[:3]] + label_rgb = torch.as_tensor(label_rgb, dtype=torch.uint8) + + h_out_hwc[h_labels_hw == label] = label_rgb + + return h_out_hwc + + +if __name__ == "__main__": + + args = parse_arguments() + + print( + f"I Reading input image: {args.input}\nI Writing output image: {args.output}\n" + f"I Minimum threshold: {args.min_threshold}\nI Maximum threshold: {args.max_threshold}\n" + f"I Minimum size: {args.min_size}\nI Apply mask: {args.mask}\n" + f"I Background label: {args.background_label}" + ) + + # Use torchvision to read an input image, convert it to gray and store it as a CHW Tensor + h_in_chw = torchvision.io.read_image(args.input, torchvision.io.ImageReadMode.GRAY) + + # Convert the image read from Pytorch Tensor to CVCUDA Tensor with zero copy + d_in_chw = cvcuda.as_tensor(h_in_chw.cuda(), layout="CHW") + + # Reshape CVCUDA Tensor from CHW to HW (Channel is 1) with zero copy + d_in_hw = d_in_chw.reshape(d_in_chw.shape[1:], "HW") + + # Tensors are initialized first in host (h_) and then copied to device (d_), using Pytorch's .as_tensor() + # and .cuda() methods, and then converted to CVCUDA with zero copy, using CVCUDA's .as_tensor() method + h_bgl = torch.as_tensor([args.background_label], 
dtype=h_in_chw.dtype) + d_bgl = cvcuda.as_tensor(h_bgl.cuda(), layout="N") + + # Tensors for min/max thresholds min size and mask are optional + d_min_thrs = None + d_max_thrs = None + d_min_size = None + d_mask_hw = None + + if args.min_threshold: + h_min_thrs = torch.as_tensor([args.min_threshold], dtype=h_in_chw.dtype) + d_min_thrs = cvcuda.as_tensor(h_min_thrs.cuda(), layout="N") + + if args.max_threshold: + h_max_thrs = torch.as_tensor([args.max_threshold], dtype=h_in_chw.dtype) + d_max_thrs = cvcuda.as_tensor(h_max_thrs.cuda(), layout="N") + + if args.min_size: + h_min_size = torch.as_tensor([args.min_size], dtype=torch.int32) + d_min_size = cvcuda.as_tensor(h_min_size.cuda(), layout="N") + + if args.mask: + # Below are slices in between 10% and 90% (a center box) to be considered inside the mask + s_h_in_mask = slice(int(0.1 * h_in_chw.shape[1]), int(0.9 * h_in_chw.shape[1])) + s_w_in_mask = slice(int(0.1 * h_in_chw.shape[2]), int(0.9 * h_in_chw.shape[2])) + + # The mask in host is first initialized with zeros + h_mask_hw = torch.zeros(h_in_chw.shape[1:], dtype=h_in_chw.dtype) + + # Then the center of the mask defined by the slices is set to 1 + h_mask_hw[s_h_in_mask, s_w_in_mask] = 1 + + # The Pytorch Tensor mask is copied to CUDA and converted to CVCUDA Tensor + d_mask_hw = cvcuda.as_tensor(h_mask_hw.cuda(), layout="HW") + + # Call CVCUDA label operator using the arguments set above + d_out, d_count, d_stats = cvcuda.label( + src=d_in_hw, + connectivity=cvcuda.CONNECTIVITY_4_2D, + assign_labels=cvcuda.LABEL.SEQUENTIAL, + mask_type=cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, + count=True, + stats=True, + max_labels=args.max_labels, + bg_label=d_bgl, + min_thresh=d_min_thrs, + max_thresh=d_max_thrs, + min_size=d_min_size, + mask=d_mask_hw, + ) + + # Convert CVCUDA output Tensors to Pytorch with zero copy, using CVCUDA's .cuda() method, then copy the + # Pytorch Tensor to the CPU, using Pytorch's .cpu() method + h_out = torch.as_tensor(d_out.cuda()).cpu() + 
h_count = torch.as_tensor(d_count.cuda()).cpu() + h_stats = torch.as_tensor(d_stats.cuda()).cpu() + + print(f"I Number of labels found: {h_count[0]}") + + # The stats Tensor (with statistics) has a region mark at index 6 that is set to 1 for removed regions + # and set to 2 for regions in the mask that cannot be removed + num_removed = sum([1 if h_stats[0, si, 6] == 1 else 0 for si in range(h_count[0])]) + num_in_mask = sum([1 if h_stats[0, si, 6] == 2 else 0 for si in range(h_count[0])]) + + print(f"I Number of labeled regions removed: {num_removed}") + print(f"I Number of labeled regions in the mask: {num_in_mask}") + print(f"I Number of labeled regions kept: {h_count[0] - num_removed}") + + # Color the labels using default behavior: white foreground and black background + h_out_rgb_hwc = color_labels(h_out, h_bgl[0]) + + # Use torchvision to write the output image from a CHW Tensor + torchvision.io.write_png(h_out_rgb_hwc.permute(2, 0, 1), args.output) diff --git a/samples/label/python/main.py b/samples/label/python/main.py index aeff0f85..cb664f69 100644 --- a/samples/label/python/main.py +++ b/samples/label/python/main.py @@ -159,8 +159,10 @@ def run_sample( cuda_device = cuda.Device(device_id) cuda_ctx = cuda_device.retain_primary_context() cuda_ctx.push() - cvcuda_stream = cvcuda.Stream() - torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) + # Use the default stream for cvcuda and torch + # Since we never created a stream, current will be the CUDA default stream + cvcuda_stream = cvcuda.Stream().current + torch_stream = torch.cuda.default_stream(device=cuda_device) # docs_tag: end_setup_gpu # docs_tag: encoder_decoder setup @@ -168,7 +170,7 @@ def run_sample( if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): # Treat this as data modality of images decoder = ImageBatchDecoder( - input_path, batch_size, device_id, cuda_ctx, cvcuda_perf + input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf ) encoder =
ImageBatchEncoder( output_dir, diff --git a/samples/object_detection/python/main.py b/samples/object_detection/python/main.py index 935e121a..0741ea2c 100644 --- a/samples/object_detection/python/main.py +++ b/samples/object_detection/python/main.py @@ -85,8 +85,10 @@ def run_sample( cuda_device = cuda.Device(device_id) cuda_ctx = cuda_device.retain_primary_context() cuda_ctx.push() - cvcuda_stream = cvcuda.Stream() - torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) + # Use the the default stream for cvcuda and torch + # Since we never created a stream current will be the CUDA default stream + cvcuda_stream = cvcuda.Stream().current + torch_stream = torch.cuda.default_stream(device=cuda_device) # docs_tag: end_setup_gpu # docs_tag: begin_setup_stages @@ -96,7 +98,7 @@ def run_sample( if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): # Treat this as data modality of images decoder = ImageBatchDecoder( - input_path, batch_size, device_id, cuda_ctx, cvcuda_perf + input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf ) encoder = ImageBatchEncoder( @@ -107,11 +109,11 @@ def run_sample( else: # Treat this as data modality of videos decoder = VideoBatchDecoder( - input_path, batch_size, device_id, cuda_ctx, cvcuda_perf + input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf ) encoder = VideoBatchEncoder( - output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_perf + output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf ) # Define the post-processor diff --git a/samples/scripts/benchmark.py b/samples/scripts/benchmark.py index fe252d26..938f080c 100644 --- a/samples/scripts/benchmark.py +++ b/samples/scripts/benchmark.py @@ -195,7 +195,7 @@ def parse_nvtx_gpu_proj_trace_json(json_path): # Grab the necessary values from the JSON file. 
range_id = row["RangeId"] - if range_id == "None": + if not range_id or range_id == "None": continue flat_name = row["Name"] diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index dea98a58..7a8fc302 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -1,6 +1,6 @@ #!/bin/bash -e -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -36,6 +36,16 @@ echo "SEGMENTATION_OUT_DIR: $SEGMENTATION_OUT_DIR" echo "DETECTION_OUT_DIR: $DETECTION_OUT_DIR" echo "DISTANCE_LABEL_OUT_DIR: $DISTANCE_LABEL_OUT_DIR" +create_output_dir() { + local base_dir=$1 + local run_number=1 + while [[ -d "$base_dir/$run_number" ]]; do + let run_number++ + done + mkdir -p "$base_dir/$run_number" + echo "$base_dir/$run_number" +} + # Crop and Resize Sample # Batch size 2 LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/cropandresize/cvcuda_sample_cropandresize -i $SAMPLES_DIR/assets/images/ -b 2 @@ -45,20 +55,27 @@ LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/cropandresize/cvcuda_sample_ # Run the segmentation Python sample with default settings, without any command-line args. rm -rf "$CLASSIFICATION_OUT_DIR" mkdir "$CLASSIFICATION_OUT_DIR" -python3 $SAMPLES_DIR/classification/python/main.py -o "$CLASSIFICATION_OUT_DIR" +CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR") +python3 $SAMPLES_DIR/classification/python/main.py -o "$CLASSIFICATION_RUN_DIR" # Run it on a specific image with batch size 1 with PyTorch backend. 
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk pytorch -o "$CLASSIFICATION_OUT_DIR" +CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR") +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk pytorch -o "$CLASSIFICATION_RUN_DIR" # # Run it on a specific image with batch size 4 with PyTorch backend. Uses Same image multiple times -python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 4 -bk pytorch -o "$CLASSIFICATION_OUT_DIR" +CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR") +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 4 -bk pytorch -o "$CLASSIFICATION_RUN_DIR" # Run it on a folder worth of images with batch size 2 with PyTorch backend. -python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/ -b 2 -bk pytorch -o "$CLASSIFICATION_OUT_DIR" +CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR") +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/ -b 2 -bk pytorch -o "$CLASSIFICATION_RUN_DIR" # Run it on a specific image with batch size 1 with TensorRT backend with saving the output in a specific directory. - -python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR" +CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR") +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR" # Run it on a specific image with batch size 1 with TensorRT backend with saving the output in a specific directory. 
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 2 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR" +CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR") +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 2 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR" # Run it on a video with batch size 1 with TensorRT backend with saving the output in a specific directory. -python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 1 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR" +CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR") +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 1 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR" + # Run the classification C++ sample. Since the Python sample was already run, we can reuse the TensorRT model # and the labels file generated by it. # Batch size 1 @@ -66,44 +83,56 @@ LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample # Batch size 2 LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.2.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 2 - # Run the segmentation Python sample with default settings, without any command-line args. rm -rf "$SEGMENTATION_OUT_DIR" mkdir "$SEGMENTATION_OUT_DIR" -python3 $SAMPLES_DIR/segmentation/python/main.py -o "$SEGMENTATION_OUT_DIR" +SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR") +python3 $SAMPLES_DIR/segmentation/python/main.py -o "$SEGMENTATION_RUN_DIR" # Run the segmentation sample with default settings for PyTorch backend. 
-python3 $SAMPLES_DIR/segmentation/python/main.py -bk pytorch -o "$SEGMENTATION_OUT_DIR" +SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR") +python3 $SAMPLES_DIR/segmentation/python/main.py -bk pytorch -o "$SEGMENTATION_RUN_DIR" # Run it on a single image with high batch size for the background class writing to a specific directory with PyTorch backend -python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -o "$SEGMENTATION_OUT_DIR" -b 5 -c __background__ -bk pytorch +SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR") +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -o "$SEGMENTATION_RUN_DIR" -b 5 -c __background__ -bk pytorch # Run it on a folder worth of images with the default tensorrt backend -python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_OUT_DIR" -b 4 -c __background__ +SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR") +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_RUN_DIR" -b 4 -c __background__ # Run it on a folder worth of images with PyTorch -python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_OUT_DIR" -b 5 -c __background__ -bk pytorch +SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR") +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_RUN_DIR" -b 5 -c __background__ -bk pytorch # Run on a single image with custom resized input given to the sample for the dog class -python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/Weimaraner.jpg -o "$SEGMENTATION_OUT_DIR" -b 1 -c dog -th 512 -tw 512 +SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR") +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/Weimaraner.jpg -o "$SEGMENTATION_RUN_DIR" -b 1 -c 
dog -th 512 -tw 512 # Run it on a video for class background. -python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -o "$SEGMENTATION_OUT_DIR" +SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR") +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -o "$SEGMENTATION_RUN_DIR" # Run it on a video for class background with the PyTorch backend. -python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -bk pytorch -o "$SEGMENTATION_OUT_DIR" - +SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR") +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -bk pytorch -o "$SEGMENTATION_RUN_DIR" # Run the object detection Python sample with default settings, without any command-line args. 
rm -rf "$DETECTION_OUT_DIR" mkdir "$DETECTION_OUT_DIR" -python3 $SAMPLES_DIR/object_detection/python/main.py -o "$DETECTION_OUT_DIR" +DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR") +python3 $SAMPLES_DIR/object_detection/python/main.py -o "$DETECTION_RUN_DIR" # Run it with batch size 1 on a single image -python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DETECTION_OUT_DIR" +DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR") +python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DETECTION_RUN_DIR" # Run it with batch size 4 on a video -python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -o "$DETECTION_OUT_DIR" +DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR") +python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -o "$DETECTION_RUN_DIR" # Run it with batch size 2 on a folder of images -python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/ -b 3 -o "$DETECTION_OUT_DIR" +DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR") +python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/ -b 3 -o "$DETECTION_RUN_DIR" # RUn it with the TensorFlow backend -python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow -o "$DETECTION_OUT_DIR" - +DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR") +python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow -o "$DETECTION_RUN_DIR" # Run the distance label Python sample with default settings, without any command-line args. 
rm -rf "$DISTANCE_LABEL_OUT_DIR" mkdir "$DISTANCE_LABEL_OUT_DIR" -python3 $SAMPLES_DIR/label/python/main.py -o "$DISTANCE_LABEL_OUT_DIR" +DISTANCE_LABEL_RUN_DIR=$(create_output_dir "$DISTANCE_LABEL_OUT_DIR") +python3 $SAMPLES_DIR/label/python/main.py -o "$DISTANCE_LABEL_RUN_DIR" # Run it with batch size 1 on a single image -python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DISTANCE_LABEL_OUT_DIR" +DISTANCE_LABEL_RUN_DIR=$(create_output_dir "$DISTANCE_LABEL_OUT_DIR") +python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DISTANCE_LABEL_RUN_DIR" diff --git a/samples/segmentation/python/main.py b/samples/segmentation/python/main.py index 02c8a982..6ee5411a 100644 --- a/samples/segmentation/python/main.py +++ b/samples/segmentation/python/main.py @@ -85,8 +85,10 @@ def run_sample( cuda_device = cuda.Device(device_id) cuda_ctx = cuda_device.retain_primary_context() cuda_ctx.push() - cvcuda_stream = cvcuda.Stream() - torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) + # Use the default stream for cvcuda and torch + # Since we never created a stream, current will be the CUDA default stream + cvcuda_stream = cvcuda.Stream().current + torch_stream = torch.cuda.default_stream(device=cuda_device) # docs_tag: end_setup_gpu # docs_tag: begin_setup_stages @@ -100,6 +102,7 @@ def run_sample( batch_size, device_id, cuda_ctx, + cvcuda_stream, cvcuda_perf, ) @@ -115,6 +118,7 @@ def run_sample( batch_size, device_id, cuda_ctx, + cvcuda_stream, cvcuda_perf, ) @@ -123,6 +127,7 @@ def run_sample( decoder.fps, device_id, cuda_ctx, + cvcuda_stream, cvcuda_perf, ) @@ -169,7 +174,7 @@ def run_sample( batch_idx = 0 while True: cvcuda_perf.push_range("batch", batch_idx=batch_idx) - + # Make sure that cvcuda and torch are using the same stream with cvcuda_stream, torch.cuda.stream(torch_stream): # Stage 1: decode batch = decoder() diff --git a/samples/segmentation/python/triton_client.py
b/samples/segmentation/python/triton_client.py index 7802fec2..d6eff764 100644 --- a/samples/segmentation/python/triton_client.py +++ b/samples/segmentation/python/triton_client.py @@ -104,8 +104,10 @@ def run_sample( cuda_device = cuda.Device(device_id) cuda_ctx = cuda_device.retain_primary_context() cuda_ctx.push() - cvcuda_stream = cvcuda.Stream() - torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) + # Use the default stream for cvcuda and torch + # Since we never created a stream, current will be the CUDA default stream + cvcuda_stream = cvcuda.Stream().current + torch_stream = torch.cuda.default_stream(device=cuda_device) # docs_tag: end_stream_setup # docs_tag: begin_setup_triton_client @@ -128,6 +130,7 @@ def run_sample( batch_size, device_id, cuda_ctx, + cvcuda_stream, cvcuda_perf, ) @@ -166,11 +169,12 @@ def run_sample( batch_size, device_id, cuda_ctx, + cvcuda_stream, cvcuda_perf, ) encoder = VideoBatchEncoder( - output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_perf + output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf ) # Fire up encoder/decoder diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt index 4a21a4c5..202caf75 100644 --- a/src/cvcuda/CMakeLists.txt +++ b/src/cvcuda/CMakeLists.txt @@ -22,7 +22,6 @@ set(CV_CUDA_OP_FILES OpOSD.cpp OpHistogramEq.cpp OpAdvCvtColor.cpp - OpFindContours.cpp OpSIFT.cpp OpMinMaxLoc.cpp OpHistogram.cpp @@ -69,6 +68,7 @@ set(CV_CUDA_OP_FILES OpLabel.cpp OpPairwiseMatcher.cpp OpFindHomography.cpp + OpStack.cpp ) # filter only one that matches the patern (case insensitive), should be set on the global level @@ -91,7 +91,6 @@ else() endif() add_library(cvcuda SHARED - OpStack.cpp ${CV_CUDA_LIB_FILES} ) diff --git a/src/cvcuda/OpFindContours.cpp b/src/cvcuda/OpFindContours.cpp deleted file mode 100644 index 8c508090..00000000 --- a/src/cvcuda/OpFindContours.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION &
AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "priv/OpFindContours.hpp" - -#include "priv/SymbolVersioning.hpp" - -#include -#include -#include -#include - -namespace priv = cvcuda::priv; - -CVCUDA_DEFINE_API(0, 4, NVCVStatus, cvcudaFindContoursCreate, - (NVCVOperatorHandle * handle, int32_t maxWidth, int32_t maxHeight, int32_t maxBatchSize)) -{ - return nvcv::ProtectCall( - [&] - { - if (handle == nullptr) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "Pointer to NVCVOperator handle must not be NULL"); - } - - *handle = reinterpret_cast( - new priv::FindContours(nvcv::Size2D{maxWidth, maxHeight}, maxBatchSize)); - }); -} - -CVCUDA_DEFINE_API(0, 4, NVCVStatus, cvcudaFindContoursSubmit, - (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle points, - NVCVTensorHandle counts)) -{ - return nvcv::ProtectCall( - [&] - { - nvcv::TensorWrapHandle point(points), input(in), count(counts); - priv::ToDynamicRef(handle)(stream, input, point, count); - }); -} diff --git a/src/cvcuda/OpLabel.cpp b/src/cvcuda/OpLabel.cpp index 351cce2b..807c99e6 100644 --- a/src/cvcuda/OpLabel.cpp +++ b/src/cvcuda/OpLabel.cpp @@ -38,11 +38,11 @@ CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelCreate, (NVCVOperatorHandle * han }); } -CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelSubmit, +CVCUDA_DEFINE_API(0, 7, NVCVStatus, cvcudaLabelSubmit, 
(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh, NVCVTensorHandle maxThresh, - NVCVTensorHandle minSize, NVCVTensorHandle count, NVCVTensorHandle stats, - NVCVConnectivityType connectivity, NVCVLabelType assignLabels)) + NVCVTensorHandle minSize, NVCVTensorHandle count, NVCVTensorHandle stats, NVCVTensorHandle mask, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType)) { return nvcv::ProtectCall( [&] @@ -50,6 +50,7 @@ CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelSubmit, cvcuda::priv::ToDynamicRef(handle)( stream, nvcv::TensorWrapHandle{in}, nvcv::TensorWrapHandle{out}, nvcv::TensorWrapHandle{bgLabel}, nvcv::TensorWrapHandle{minThresh}, nvcv::TensorWrapHandle{maxThresh}, nvcv::TensorWrapHandle{minSize}, - nvcv::TensorWrapHandle{count}, nvcv::TensorWrapHandle{stats}, connectivity, assignLabels); + nvcv::TensorWrapHandle{count}, nvcv::TensorWrapHandle{stats}, nvcv::TensorWrapHandle{mask}, + connectivity, assignLabels, maskType); }); } diff --git a/src/cvcuda/include/cvcuda/OpFindContours.h b/src/cvcuda/include/cvcuda/OpFindContours.h deleted file mode 100644 index 78ea04e4..00000000 --- a/src/cvcuda/include/cvcuda/OpFindContours.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file OpFindContours.h - * - * @brief Defines types and functions to handle the resize operation. - * @defgroup NVCV_C_ALGORITHM_FIND_CONTOURS Find Contours - * @{ - */ - -#ifndef CVCUDA_FIND_CONTOURS_H -#define CVCUDA_FIND_CONTOURS_H - -#include "Operator.h" -#include "Types.h" -#include "detail/Export.h" - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" -{ -#endif - -/** Constructs and an instance of the resize operator. - * - * @param [out] handle Where the image instance handle will be written to. - * + Must not be NULL. - * - * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. - * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. - * @retval #NVCV_SUCCESS Operation executed successfully. - */ -CVCUDA_PUBLIC NVCVStatus cvcudaFindContoursCreate(NVCVOperatorHandle *handle, int32_t maxWidth, int32_t maxHeight, - int32_t maxBatchSize); - -/** - * Limitations: - * - * Input: - * Data Layout: [kNHWC, kHWC] - * Channels: [1] - * - * Data Type | Allowed - * -------------- | ------------- - * 8bit Unsigned | Yes - * 8bit Signed | No - * 16bit Unsigned | No - * 16bit Signed | No - * 32bit Unsigned | No - * 32bit Signed | No - * 32bit Float | No - * 64bit Float | No - * - * Output: - * Data Layout: [kNHWC, kHWC] - * Channels: [1, 3, 4] - * - * Data Type | Allowed - * -------------- | ------------- - * 8bit Unsigned | Yes - * 8bit Signed | No - * 16bit Unsigned | Yes - * 16bit Signed | No - * 32bit Unsigned | No - * 32bit Signed | Yes - * 32bit Float | Yes - * 64bit Float | No - * - * Input/Output dependency - * - * Property | Input == Output - * -------------- | ------------- - * Data Layout | Yes - * Data Type | Yes - * Number | Yes - * Channels | Yes - * Width | Yes - * Height | Yes - * - * @param [in] handle Handle to the operator. - * + Must not be NULL. 
- * @param [in] stream Handle to a valid CUDA stream. - * @param [in] in GPU pointer to input data. Represents an 8-bit, unsigned, - * single-channel image. Non-zero pixels are treated as 1's, and zero - * pixels remain as 0's, which makes the image binary. - * @param [out] points GPU pointer to output data. It contains the detected - * contours for the input image. The data is structured as: [x_c0_p0, - * y_c0_p0, ..., x_ci_pj, y_ci_pj, ...], where "ci" denotes a contour's - * index in the output array and "pj" is a point's index within a - * contour. - * @param [out] numPoints Holds the number of contour points for each image. - * Specifically, numPoints[i] gives the number of contours for the i-th - * image, while numPoints[i][j] gives the number of points in the j-th - * contour of i-th image. - */ -/** @{ */ -CVCUDA_PUBLIC NVCVStatus cvcudaFindContoursSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, - NVCVTensorHandle points, NVCVTensorHandle numPoints); -/** @} */ - -#ifdef __cplusplus -} -#endif - -#endif /* CVCUDA_FIND_CONTOURS_H */ diff --git a/src/cvcuda/include/cvcuda/OpFindContours.hpp b/src/cvcuda/include/cvcuda/OpFindContours.hpp deleted file mode 100644 index 29f84ffe..00000000 --- a/src/cvcuda/include/cvcuda/OpFindContours.hpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file OpFindContours.hpp - * - * @brief Defines the public C++ Class for the resize operation. - * @defgroup NVCV_CPP_ALGORITHM_FIND_CONTOURS Find Contours - * @{ - */ - -#ifndef CVCUDA_FIND_CONTOURS_HPP -#define CVCUDA_FIND_CONTOURS_HPP - -#include "IOperator.hpp" -#include "OpFindContours.h" - -#include -#include -#include -#include -#include - -namespace cvcuda { - -class FindContours final : public IOperator -{ -public: - static constexpr int32_t MAX_NUM_CONTOURS = 256; - static constexpr int32_t MAX_CONTOUR_POINTS = 4 * 1024; - static constexpr int32_t MAX_TOTAL_POINTS = MAX_NUM_CONTOURS * MAX_CONTOUR_POINTS; - - explicit FindContours() = delete; - explicit FindContours(nvcv::Size2D maxSize, int32_t maxBatchSize); - - ~FindContours(); - - void operator()(cudaStream_t stream, nvcv::Tensor &in, nvcv::Tensor &points, nvcv::Tensor &numPoints); - - virtual NVCVOperatorHandle handle() const noexcept override; - -private: - NVCVOperatorHandle m_handle; -}; - -inline FindContours::FindContours(nvcv::Size2D maxSize, int32_t maxBatchSize) -{ - nvcv::detail::CheckThrow(cvcudaFindContoursCreate(&m_handle, maxSize.w, maxSize.h, maxBatchSize)); - assert(m_handle); -} - -inline FindContours::~FindContours() -{ - nvcvOperatorDestroy(m_handle); - m_handle = nullptr; -} - -inline void FindContours::operator()(cudaStream_t stream, nvcv::Tensor &in, nvcv::Tensor &points, - nvcv::Tensor &numPoints) -{ - nvcv::detail::CheckThrow( - cvcudaFindContoursSubmit(m_handle, stream, in.handle(), points.handle(), numPoints.handle())); -} - -inline NVCVOperatorHandle FindContours::handle() const noexcept -{ - return m_handle; -} - -} // namespace cvcuda - -#endif // CVCUDA_FIND_CONTOURS_HPP diff --git a/src/cvcuda/include/cvcuda/OpFindHomography.h b/src/cvcuda/include/cvcuda/OpFindHomography.h index 6d5c5dcc..b1806254 100644 --- 
a/src/cvcuda/include/cvcuda/OpFindHomography.h +++ b/src/cvcuda/include/cvcuda/OpFindHomography.h @@ -105,15 +105,15 @@ CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographyCreate(NVCVOperatorHandle *handle, * from 0 to batch-1, j ranges from 4 to number of coordinates per image, and the data type being * float2 for (x=x, y=y) * + Number of coordinates must be >= 4 - * + Must have data type 2F32 - * + Must have rank 2 + * + Must have data type 2F32 or F32 + * + Must have rank 2 or 3 * * * @param [in] dstPts Input tensor, dstPts[i, j] is the set of coordinates for the destination image where i ranges * from 0 to batch-1, j ranges from 4 to number of coordinates per image, and the data type being * float2 for (x=x, y=y) * + Number of coordinates must be >= 4 - * + Must have data type 2F32 - * + Must have rank 2 + * + Must have data type 2F32 or F32 + * + Must have rank 2 or 3 * * @param [out] out Output tensor, models[i, j, k] is the output model tensor which maps the src points to dst points * in image i, where i ranges from 0 to batch-1, j ranges from 0 to 2 and k ranges from 0 to 2, and diff --git a/src/cvcuda/include/cvcuda/OpLabel.h b/src/cvcuda/include/cvcuda/OpLabel.h index 77f620a5..06a3a7ac 100644 --- a/src/cvcuda/include/cvcuda/OpLabel.h +++ b/src/cvcuda/include/cvcuda/OpLabel.h @@ -101,7 +101,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); * 16bit Unsigned | No * 16bit Signed | No * 32bit Unsigned | Yes - * 32bit Signed | No + * 32bit Signed | Yes * 32bit Float | No * 64bit Float | No * @@ -116,6 +116,8 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); * Height | Yes * Depth | Yes * + * @note The number of elements (pixels or voxels) in input and output tensors must be at most \f$ 2^31 - 1 \f$. + * * @param [in] handle Handle to the operator. * + Must not be NULL. * @param [in] stream Handle to a valid CUDA stream. 
@@ -177,7 +179,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); * done before this post-filter step, also known as island-removal step. * + It must have the same number of samples as input and output tensors. * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. - * + It must have U32 data type. + * + It must have S32 or U32 data type. * + It may be NULL to not apply minimum size regions removal as a post-filter. * + If not NULL, the \ref bgLabel and \ref stats tensors must not be NULL as well. * @@ -189,32 +191,50 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); * of \ref stats tensor, and regions potentially removed by \ref minSize tensor. * + It must have the same number of samples as input and output tensors. * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. - * + It must have U32 data type. + * + It must have S32 or U32 data type. * + It may be NULL to disregard counting the number of different labels found. * * @param [out] stats Statistics tensor. The expected layout is [NMA], meaning rank-3 tensor with first dimension * as the number of samples N, matching input and output tensors, second dimension M as maximum * number of different labels statistics to be computed, and a third dimension A as the amount - * of statistics to be computed per label (fixed as 6 for 2D or 8 for 3D). If present, this + * of statistics to be computed per label (fixed as 7 for 2D or 9 for 3D). If present, this * tensor is used by the operator to store information per connected-component label. The * background label is ignored and thus its statistics is not computed. * + It must have the same number of samples as input and output tensors. * + It must have a number of statistics M per sample N equal to the maximum allowed number of * label statistics that can be computed by the Label operator per sample image (or volume). 
* The actual number of labels found is stored in \ref count (see above). - * + For 2D labeling, it must have in the last dimension A=6 elements to store at: (0) the + * + For 2D labeling, it must have in the last dimension A=7 elements to store at: (0) the * original label number; (1) leftmost position; (2) topmost position; (3) width size; (4) - * height size; (5) count of pixels (i.e. size of the labeled region). And for 3D labeling, - * it must have in the last dimension A=8 elements to store at: (0) the original label number; - * (1) leftmost position; (2) topmost position; (3) shallowmost position; (4) width size; (5) - * height size; (6) depth size; (7) count of voxels (i.e. size of the labeled region). - * + It must have U32 data type. + * height size; (5) count of pixels (i.e. size of the labeled region); (6) region marks (0 + * means no marks, 1 means region was removed, 2 means region inside the \ref mask will not be + * removed). And for 3D labeling, it must have in the last dimension A=9 elements to store + * at: (0) the original label number; (1) leftmost position; (2) topmost position; (3) + * shallowmost position; (4) width size; (5) height size; (6) depth size; (7) count of voxels + * (i.e. size of the labeled region); (8) region marks (0 means no marks, 1 means region was + * removed, 2 means region inside the \ref mask will not be removed). + * + It must have S32 or U32 data type. * + It may be NULL to disregard computing statistics information on different labels found. * + It must not be NULL if \ref assignLabel is NVCV_LABEL_SEQUENTIAL, the index of each label * statistics is used as the new sequential label replacing the original label in the output, * the sequential labels are up to the maximum capacity M * + If not NULL, the \ref count tensor must not be NULL as well. * + * @param [in] mask Mask tensor. 
The expected layout is [HWC] or [NHWC] for 2D masking or [DHWC] or [NDHWC] for 3D + * masking, with either explicit C dimension or missing C with channels embedded in the data type. + * The N dimension is the number of samples, if missing it is considered to be N=1, in case N=1 + * and \ref in and \ref out tensors have N>1 the same mask is to be applied to all images (2D) or + * volumes (3D). A value of zero in the mask is considered to be outside the mask and non-zero is + * inside. The mask behavior is controlled by \ref maskType. + * + If number of samples N is present in the layout, it must be either 1 or equal to N in the + * \ref in \ref out tensors. + * + It must have the same height H and width W as \ref in and \ref out tensors. + * + It must have the same depth D as \ref in and \ref out tensors in case of 3D. + * + If channel C is present in the layout, it must be 1. + * + It must have S8 or U8 data type. + * + If not NULL and maskType is NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, the \ref minSize tensor + * must not be NULL as well. + * * @param [in] connectivity Specify connectivity of elements for the operator, see \ref NVCVConnectivityType. * + It must conform with \ref in and \ref out tensors, i.e. 3D labeling requires [DHWC] * or [NDHWC] tensor layouts and 2D labeling requires [HWC] or [NHWC], where the C @@ -224,6 +244,10 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); * NVCV_LABEL_FAST to do fast labeling, i.e. assign non-consecutive label numbers fast. * Use NCVC_LABEL_SEQUENTIAL to have consecutive label numbers instead. * + * @param [in] maskType Specify how the mask tensor affects this operator, see \ref NVCVLabelMaskType. Use + * NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY to only remove islands, i.e. regions with less than + * \ref minSize elements, that are outside the mask (defined by zeros in the mask). + * * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. 
* @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. * @retval #NVCV_SUCCESS Operation executed successfully. @@ -231,8 +255,9 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); CVCUDA_PUBLIC NVCVStatus cvcudaLabelSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh, NVCVTensorHandle maxThresh, NVCVTensorHandle minSize, NVCVTensorHandle count, - NVCVTensorHandle stats, NVCVConnectivityType connectivity, - NVCVLabelType assignLabels); + NVCVTensorHandle stats, NVCVTensorHandle mask, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels, + NVCVLabelMaskType maskType); #ifdef __cplusplus } diff --git a/src/cvcuda/include/cvcuda/OpLabel.hpp b/src/cvcuda/include/cvcuda/OpLabel.hpp index 54ebd54e..1b6997d9 100644 --- a/src/cvcuda/include/cvcuda/OpLabel.hpp +++ b/src/cvcuda/include/cvcuda/OpLabel.hpp @@ -45,8 +45,8 @@ class Label final : public IOperator void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize, - const nvcv::Tensor &count, const nvcv::Tensor &stats, NVCVConnectivityType connectivity, - NVCVLabelType assignLabels) const; + const nvcv::Tensor &count, const nvcv::Tensor &stats, const nvcv::Tensor &mask, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType) const; virtual NVCVOperatorHandle handle() const noexcept override; @@ -69,11 +69,12 @@ inline Label::~Label() inline void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, - NVCVConnectivityType connectivity, NVCVLabelType assignLabels) const + const 
nvcv::Tensor &mask, NVCVConnectivityType connectivity, NVCVLabelType assignLabels, + NVCVLabelMaskType maskType) const { nvcv::detail::CheckThrow(cvcudaLabelSubmit(m_handle, stream, in.handle(), out.handle(), bgLabel.handle(), minThresh.handle(), maxThresh.handle(), minSize.handle(), count.handle(), - stats.handle(), connectivity, assignLabels)); + stats.handle(), mask.handle(), connectivity, assignLabels, maskType)); } inline NVCVOperatorHandle Label::handle() const noexcept diff --git a/src/cvcuda/include/cvcuda/OpSIFT.h b/src/cvcuda/include/cvcuda/OpSIFT.h index 45fa7308..39e5142f 100644 --- a/src/cvcuda/include/cvcuda/OpSIFT.h +++ b/src/cvcuda/include/cvcuda/OpSIFT.h @@ -146,8 +146,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaSIFTCreate(NVCVOperatorHandle *handle, int3 maxSh * + It must have S32 data type to store number of features found. * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. * - * @param [in] numOctaveLayers Number of layers in each octave. Since the minimum number of layers is 3, the - * actual number is 3 + numOctaveLayers. One suggestion, given by the original + * @param [in] numOctaveLayers Number of layers in each octave. One suggestion, given by the original * algorithm description, is to use numOctaveLayers = 3. The number of octaves is * computed from the input image resolution WxH as \f$ log(min(W, H))/log(2) - 2 \f$. * + It must be positive. diff --git a/src/cvcuda/include/cvcuda/Types.h b/src/cvcuda/include/cvcuda/Types.h index 37eb2e0c..8dc5131f 100644 --- a/src/cvcuda/include/cvcuda/Types.h +++ b/src/cvcuda/include/cvcuda/Types.h @@ -402,6 +402,12 @@ typedef enum NVCV_LABEL_SEQUENTIAL, //!< Assigns consecutive numbers to labels. 
} NVCVLabelType; +// @brief Defines how mask affects label operation +typedef enum +{ + NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, //!< Prevent removing islands inside the mask +} NVCVLabelMaskType; + // @brief Defines pair-wise matcher algorithms of choice typedef enum { diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index 6b28a39f..fa0e8c39 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -18,7 +18,6 @@ add_subdirectory(legacy) set(CV_CUDA_PRIV_FILES IOperator.cpp) set(CV_CUDA_PRIV_OP_FILES - OpFindContours.cpp OpOSD.cpp OpHistogramEq.cpp OpAdvCvtColor.cu diff --git a/src/cvcuda/priv/OpBrightnessContrast.cu b/src/cvcuda/priv/OpBrightnessContrast.cu index f97f67b4..2e55c3df 100644 --- a/src/cvcuda/priv/OpBrightnessContrast.cu +++ b/src/cvcuda/priv/OpBrightnessContrast.cu @@ -72,8 +72,7 @@ struct BatchArgsWrap }; template -inline __host__ __device__ BT GetArg(const cuda::Tensor1DWrap &tensorArg, int argLen, int sampleIdx, - BT defaultVal) +inline __device__ BT GetArg(const cuda::Tensor1DWrap &tensorArg, int argLen, int sampleIdx, BT defaultVal) { if (argLen == 0) { @@ -90,7 +89,7 @@ inline __host__ __device__ BT GetArg(const cuda::Tensor1DWrap &tensorA } template -inline __host__ __device__ SampleArgs GetBrightnessContrastArg(const BatchArgsWrap &args, int sampleIdx) +inline __device__ SampleArgs GetBrightnessContrastArg(const BatchArgsWrap &args, int sampleIdx) { return {GetArg(args.brightness, args.brightnessLen, sampleIdx, BT{1}), GetArg(args.contrast, args.contrastLen, sampleIdx, BT{1}), diff --git a/src/cvcuda/priv/OpFindContours.cpp b/src/cvcuda/priv/OpFindContours.cpp deleted file mode 100644 index 51d253e1..00000000 --- a/src/cvcuda/priv/OpFindContours.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OpFindContours.hpp" - -#include "legacy/CvCudaLegacy.h" -#include "legacy/CvCudaLegacyHelpers.hpp" - -#include -#include - -namespace cvcuda::priv { - -namespace legacy = nvcv::legacy::cuda_op; - -FindContours::FindContours(nvcv::Size2D maxSize, int maxBatchSize) -{ - legacy::DataShape maxIn, maxOut; - // maxIn/maxOut not used by op. - maxIn.N = maxBatchSize; - maxIn.C = 1; - maxIn.H = maxSize.h; - maxIn.W = maxSize.w; - - m_legacyOp = std::make_unique(maxIn, maxOut); -} - -void FindContours::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &points, - const nvcv::Tensor &numPoints) const -{ - auto inData = in.exportData(); - if (inData == nullptr) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "Input must be cuda-accessible, pitch-linear tensor"); - } - - auto pointCoords = points.exportData(); - if (pointCoords == nullptr) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "Output must be cuda-accessible, pitch-linear tensor"); - } - - auto pointCounts = numPoints.exportData(); - if (pointCounts == nullptr) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "Output must be cuda-accessible, pitch-linear tensor"); - } - - NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *pointCoords, *pointCounts, stream)); -} - -} // namespace cvcuda::priv diff --git 
a/src/cvcuda/priv/OpFindContours.hpp b/src/cvcuda/priv/OpFindContours.hpp deleted file mode 100644 index ec4f2113..00000000 --- a/src/cvcuda/priv/OpFindContours.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file OpFindContours.hpp - * - * @brief Defines the private C++ Class for the find contours operation. 
- */ - -#ifndef CVCUDA_PRIV_FIND_CONTOURS_HPP -#define CVCUDA_PRIV_FIND_CONTOURS_HPP - -#include "IOperator.hpp" -#include "legacy/CvCudaLegacy.h" - -#include -#include - -#include - -namespace cvcuda::priv { - -namespace legacy = nvcv::legacy::cuda_op; - -class FindContours final : public IOperator -{ -public: - explicit FindContours() = delete; - - explicit FindContours(nvcv::Size2D maxSize, int maxBatchSize); - - void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &points, - const nvcv::Tensor &numPoints) const; - -private: - std::unique_ptr m_legacyOp; -}; - -} // namespace cvcuda::priv - -#endif // CVCUDA_PRIV_FIND_CONTOURS_HPP diff --git a/src/cvcuda/priv/OpFindHomography.cu b/src/cvcuda/priv/OpFindHomography.cu index d3e712cb..7e8beef8 100644 --- a/src/cvcuda/priv/OpFindHomography.cu +++ b/src/cvcuda/priv/OpFindHomography.cu @@ -295,12 +295,12 @@ __device__ void calculate_residual_and_jacobian_device(float2 *src, float2 *dst, } } -__host__ __device__ inline float myfabs(float val) +__device__ inline float myfabs(float val) { return fabsf(val); } -inline __host__ __device__ float2 myfabs2(float2 val) +inline __device__ float2 myfabs2(float2 val) { float2 ret; ret.x = fabsf(val.x); @@ -308,14 +308,14 @@ inline __host__ __device__ float2 myfabs2(float2 val) return ret; } -__host__ __device__ inline int getNumPoints(cuda::Tensor2DWrap src, int numPoints, int batch) +__device__ inline int getNumPoints(cuda::Tensor2DWrap src, int numPoints, int batch) { return numPoints; } struct MeanOp { - __host__ __device__ float2 eval(float2 val, int numPoints, int batch) + __device__ float2 eval(float2 val, int numPoints, int batch) { return val / numPoints; } @@ -323,7 +323,7 @@ struct MeanOp struct SquareOp { - __host__ __device__ float eval(float val, int batch) + __device__ float eval(float val, int batch) { return val * val; } @@ -336,11 +336,11 @@ private: public: // Constructor that takes a float* pointer as a parameter - __host__ __device__ 
AbsShiftOp(float2 *data) + __host__ AbsShiftOp(float2 *data) : _data(data){}; // Method to update the float value pointed to by the pointer - __host__ __device__ float2 eval(float2 newVal, int numPoints, int batch) + __device__ float2 eval(float2 newVal, int numPoints, int batch) { _data += batch; return myfabs2(newVal - _data[0]); @@ -353,7 +353,7 @@ private: float2 *cm, *cM, *sm, *sM; public: - __host__ __device__ LtLOp(float2 *srcMean, float2 *dstMean, float2 *srcShiftSum, float2 *dstShiftSum) + __host__ LtLOp(float2 *srcMean, float2 *dstMean, float2 *srcShiftSum, float2 *dstShiftSum) { cM = srcMean; sM = srcShiftSum; @@ -361,7 +361,7 @@ public: sm = dstShiftSum; } - __host__ __device__ float eval(float2 *src, float2 *dst, int batch, int numPoints, int tid, int j, int k) + __device__ float eval(float2 *src, float2 *dst, int batch, int numPoints, int tid, int j, int k) { cm += batch; cM += batch; @@ -1410,10 +1410,59 @@ void FindHomographyWrapper(SrcDstWrapper srcWrap, SrcDstWrapper dstWrap, ModelTy calc_buffer, modelWrap, numPoints, batchSize); } -template -void RunFindHomography(const SrcDstType &src, const SrcDstType &dst, const nvcv::TensorDataStridedCuda &models, - const BufferOffsets *bufferOffset, const cuSolver *cusolverData, cudaStream_t stream) +inline void RunFindHomography(const nvcv::TensorDataStridedCuda &src, const nvcv::TensorDataStridedCuda &dst, + const nvcv::TensorDataStridedCuda &models, const BufferOffsets *bufferOffset, + const cuSolver *cusolverData, cudaStream_t stream) { + // validation of input data + if ((src.rank() != 2 && src.rank() != 3) || (dst.rank() != 2 && dst.rank() != 3)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination points must have rank 2 or 3"); + } + + if (!(src.shape(0) == dst.shape(0) && src.shape(0) == models.shape(0))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model must have same batch size"); + } + + if (src.shape(1) != 
dst.shape(1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be same length to return a valid model"); + } + + if (src.shape(1) < 4 || dst.shape(1) < 4) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be >=4 to return a valid model"); + } + + if (!(models.rank() == 3 && models.shape(1) == 3 && models.shape(2) == 3 && models.dtype() == nvcv::TYPE_F32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "model tensor must be 2D with shape 3x3 and data type F32"); + } + + if (!((src.rank() == 2 && src.dtype() == nvcv::TYPE_2F32) + || (src.rank() == 3 && src.dtype() == nvcv::TYPE_F32 && src.shape(2) == 2))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source tensor must have data type 2F32 or F32 with last shape 2"); + } + if (!((dst.rank() == 2 && dst.dtype() == nvcv::TYPE_2F32) + || (dst.rank() == 3 && dst.dtype() == nvcv::TYPE_F32 && dst.shape(2) == 2))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "destination tensor must have data type 2F32 or F32 with last shape 2"); + } + if (!(src.stride(1) == sizeof(float2) && dst.stride(1) == sizeof(float2))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination tensors must have last dimensions packed"); + } + using SrcDstWrapper = cuda::Tensor2DWrap; SrcDstWrapper srcWrap(src); SrcDstWrapper dstWrap(dst); @@ -1498,42 +1547,6 @@ void FindHomography::operator()(cudaStream_t stream, const nvcv::Tensor &srcPoin "Input must be cuda-accessible, pitch-linear tensor"); } - // validation of input data - if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2))) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "source and destination points must have rank 2"); - } - - if (!(srcData->shape(0) == dstData->shape(0) && srcData->shape(0) == modelData->shape(0))) - { - throw 
nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "source, destination and model must have same batch size"); - } - - if (srcData->shape(1) != dstData->shape(1)) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "source and destination array length must be same length to return a valid model"); - } - - if (srcData->shape(1) < 4 || dstData->shape(1) < 4) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "source and destination array length must be >=4 to return a valid model"); - } - - if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3)) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3"); - } - - if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32 - && modelData->dtype() == nvcv::TYPE_F32)) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "source, destination and model tensors must have data type F32"); - } - RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream); } @@ -1569,45 +1582,6 @@ void FindHomography::operator()(cudaStream_t stream, const nvcv::TensorBatch &sr "model must be cuda-accessible, pitch-linear tensor"); } - // validation of input data - if (!((srcData->shape(0) == dstData->shape(0)) && (srcData->shape(0) == modelData->shape(0)) - && (srcData->shape(0) == 1))) - { - throw nvcv::Exception( - nvcv::Status::ERROR_INVALID_ARGUMENT, - "Invdividual samples (src, dst and model) in the batch must be tensors with batch size 1"); - } - - if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2))) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "source and destination tensors must have rank 2"); - } - - if (srcData->shape(1) != dstData->shape(1)) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "source and destination array length must be same length to return a valid model"); - } - - if 
(srcData->shape(1) < 4 || dstData->shape(1) < 4) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "source and destination array length must be >=4 to return a valid model"); - } - - if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3)) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3"); - } - - if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32 - && modelData->dtype() == nvcv::TYPE_F32)) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "source, destination and model tensors must have data type F32"); - } - RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream); } } diff --git a/src/cvcuda/priv/OpLabel.cu b/src/cvcuda/priv/OpLabel.cu index 8a1c5118..b552e865 100644 --- a/src/cvcuda/priv/OpLabel.cu +++ b/src/cvcuda/priv/OpLabel.cu @@ -68,6 +68,10 @@ namespace util = nvcv::util; namespace { +constexpr int REGION_NOT_MARKED = 0; +constexpr int REGION_REMOVED = 1; +constexpr int REGION_INSIDE_MASK = 2; + // CUDA kernels ---------------------------------------------------------------- template @@ -432,12 +436,13 @@ __global__ void CountLabels2D(cuda::Tensor1DWrap

count, cuda::Tensor3DWrap -__global__ void ComputeStats2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap
dst, cuda::Tensor1DWrap bgLabel, - int2 size, bool relabel) +template +__global__ void ComputeStats2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap mask, + cuda::Tensor1DWrap bgLabel, int2 size, int maskN, bool relabel) { int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -449,26 +454,45 @@ __global__ void ComputeStats2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap< return; } + bool hasMask = (mask.ptr(0) != nullptr); + bool isInsideMask = false; bool hasBgLabel = (bgLabel.ptr(0) != nullptr); ST backgroundLabel = hasBgLabel ? bgLabel[gc.z] : 0; DT endLabel = dst.strides()[0] / sizeof(DT); DT label = dst[gc]; + DT regionIdx = 0; + + if (hasMask) + { + int3 mc{gc.x, gc.y, maskN == 1 ? 0 : gc.z}; + + isInsideMask = mask[mc] == 0 ? false : true; // mask value = 0 means outside the mask + } if (hasBgLabel && label == (DT)backgroundLabel) { return; // do not compute statistics for background labels } + if (label & (DT)(1 << 31)) { + if (isInsideMask) + { + regionIdx = label & (DT) ~(1 << 31); + + *stats.ptr(gc.z, (int)regionIdx, 6) = REGION_INSIDE_MASK; // region is inside the mask, mark it as such + } + return; // label is marked as region index, its statistics is already computed } + if (hasBgLabel && label == endLabel) { // This is a special region marked with one-element-after-the-end label, its label was the backgroundLabel label = backgroundLabel; } - DT regionIdx = dst.ptr(gc.z)[label]; + regionIdx = dst.ptr(gc.z)[label]; if (regionIdx & (DT)(1 << 31)) { @@ -493,12 +517,18 @@ __global__ void ComputeStats2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap< atomicMax(stats.ptr(gc.z, (int)regionIdx, 3), (DT)bboxArea.x); atomicMax(stats.ptr(gc.z, (int)regionIdx, 4), (DT)bboxArea.y); atomicAdd(stats.ptr(gc.z, (int)regionIdx, 5), 1); + + if (isInsideMask) + { + *stats.ptr(gc.z, (int)regionIdx, 6) = REGION_INSIDE_MASK; // region is inside the mask, mark it as such + } } } template __global__ void RemoveIslands2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap
dst, - cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap
minSize, int2 size, bool relabel) + cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap
minSize, int2 size, bool relabel, + bool hasMask) { int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -557,7 +587,7 @@ __global__ void RemoveIslands2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap } else { - regionIdx = label & (DT) ~(1 << 31); + return; // should not remove first region element with 1st bit 1 so other elements are not lost } DT regionSize = *stats.ptr(gc.z, (int)regionIdx, 5); @@ -565,13 +595,18 @@ __global__ void RemoveIslands2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap // If region size is less than minimum size, it is an island and should be removed, i.e. set to background label if (regionSize < minSize[gc.z]) { - dst[gc] = backgroundLabel; + // If there is no mask or if there is a mask and the region mark is not 2, meaning the region is not + // inside the mask, the region should be removed + if (!hasMask || *stats.ptr(gc.z, (int)regionIdx, 6) != REGION_INSIDE_MASK) + { + dst[gc] = backgroundLabel; + } } } template __global__ void Relabel2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap
dst, cuda::Tensor1DWrap bgLabel, - int2 size, bool relabel) + cuda::Tensor1DWrap
minSize, int2 size, bool relabel, bool hasMask) { int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -583,6 +618,8 @@ __global__ void Relabel2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap
d return; } + bool removeIsland = minSize.ptr(0) != nullptr; + DT label = dst[gc]; if (label & (DT)(1 << 31)) @@ -608,6 +645,21 @@ __global__ void Relabel2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap
d { dst[gc] = *stats.ptr(gc.z, (int)regionIdx, 0); } + + if (removeIsland) + { + DT regionSize = *stats.ptr(gc.z, (int)regionIdx, 5); + + if (regionSize < minSize[gc.z]) + { + if (!hasMask || *stats.ptr(gc.z, (int)regionIdx, 6) != REGION_INSIDE_MASK) + { + dst[gc] = (DT)bgLabel[gc.z]; + + *stats.ptr(gc.z, (int)regionIdx, 6) = REGION_REMOVED; + } + } + } } } @@ -1104,13 +1156,14 @@ __global__ void CountLabels3D(cuda::Tensor1DWrap
count, cuda::Tensor3DWrap -__global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap
dst, cuda::Tensor1DWrap bgLabel, - int4 shape, bool relabel) +template +__global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap mask, + cuda::Tensor1DWrap bgLabel, int4 shape, int maskN, bool relabel) { int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -1122,8 +1175,11 @@ __global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap< return; } - bool hasBgLabel = (bgLabel.ptr(0) != nullptr); - DT endLabel = dst.strides()[0] / sizeof(DT); + bool hasMask = (mask.ptr(0) != nullptr); + bool isInsideMask = false; + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + DT endLabel = dst.strides()[0] / sizeof(DT); + DT regionIdx = 0; for (gc.w = 0; gc.w < shape.w; gc.w++) { @@ -1131,12 +1187,26 @@ __global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap< DT label = dst[gc]; + if (hasMask) + { + int4 mc{gc.x, gc.y, gc.z, maskN == 1 ? 0 : gc.w}; + + isInsideMask = mask[mc] == 0 ? false : true; // mask value = 0 means outside the mask + } + if (hasBgLabel && label == (DT)backgroundLabel) { continue; // do not compute statistics for background labels } if (label & (DT)(1 << 31)) { + if (isInsideMask) + { + regionIdx = label & (DT) ~(1 << 31); + + *stats.ptr(gc.w, (int)regionIdx, 8) = REGION_INSIDE_MASK; // region is inside the mask, mark it as such + } + continue; // label is marked as region index, its statistics is already computed } if (hasBgLabel && label == endLabel) @@ -1145,7 +1215,7 @@ __global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap< label = backgroundLabel; } - DT regionIdx = dst.ptr(gc.w)[label]; + regionIdx = dst.ptr(gc.w)[label]; if (regionIdx & (DT)(1 << 31)) { @@ -1172,6 +1242,11 @@ __global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap< atomicMax(stats.ptr(gc.w, (int)regionIdx, 5), (DT)bboxArea.y); atomicMax(stats.ptr(gc.w, (int)regionIdx, 6), (DT)bboxArea.z); atomicAdd(stats.ptr(gc.w, (int)regionIdx, 7), 1); + + if (isInsideMask) + { + *stats.ptr(gc.w, (int)regionIdx, 8) = REGION_INSIDE_MASK; // region is inside the mask, mark it as such + } } } } @@ -1179,7 +1254,7 @@ __global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap< template __global__ void RemoveIslands3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap
dst, cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap
minSize, int4 shape, - bool relabel) + bool relabel, bool hasMask) { int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -1234,13 +1309,13 @@ __global__ void RemoveIslands3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap } else { - return; // invalid region index + continue; // invalid region index } } } else { - regionIdx = label & (DT) ~(1 << 31); + continue; // should not remove first region element with 1st bit 1 so other elements are not lost } DT regionSize = *stats.ptr(gc.w, (int)regionIdx, 7); @@ -1248,14 +1323,19 @@ __global__ void RemoveIslands3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap // If region size is less than minimum size, it is an island and should be removed, i.e. set to background label if (regionSize < minSize[gc.w]) { - dst[gc] = backgroundLabel; + // If there is no mask or if there is a mask and the region mark is not 2, meaning the region is not + // inside the mask, the region should be removed + if (!hasMask || *stats.ptr(gc.w, (int)regionIdx, 8) != REGION_INSIDE_MASK) + { + dst[gc] = backgroundLabel; + } } } } template __global__ void Relabel3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap
dst, cuda::Tensor1DWrap bgLabel, - int4 shape, bool relabel) + cuda::Tensor1DWrap
minSize, int4 shape, bool relabel, bool hasMask) { int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -1267,6 +1347,8 @@ __global__ void Relabel3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap
d return; } + bool removeIsland = minSize.ptr(0) != nullptr; + for (gc.w = 0; gc.w < shape.w; gc.w++) { DT label = dst[gc]; @@ -1294,27 +1376,86 @@ __global__ void Relabel3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap
d { dst[gc] = *stats.ptr(gc.w, (int)regionIdx, 0); } + + if (removeIsland) + { + DT regionSize = *stats.ptr(gc.w, (int)regionIdx, 7); + + if (regionSize < minSize[gc.w]) + { + if (!hasMask || *stats.ptr(gc.w, (int)regionIdx, 8) != REGION_INSIDE_MASK) + { + dst[gc] = (DT)bgLabel[gc.w]; + + *stats.ptr(gc.w, (int)regionIdx, 8) = REGION_REMOVED; + } + } + } } } } // Run functions --------------------------------------------------------------- -template +template inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, const nvcv::TensorDataStridedCuda &dstData, const int4 &shapeWHDN, const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, - int numDim, bool relabel) + const nvcv::Tensor &mask, int numDim, bool relabel) { constexpr int BW = 32, BH = 4, BD = 2; // block width, height and depth int4 idsNDHW{srcData.layout().find('N'), srcData.layout().find('D'), srcData.layout().find('H'), srcData.layout().find('W')}; + // Although output tensors may have S32 or U32 data type, they are always considered U32 (DstT = uint32_t) as + // they are used as non-negative offset or position or size or count, or even as a 32-bit mask + + // Although mask tensor may have S8 or U8 data type, it is always considered U8 (MskT = uint8_t) as it is used + // as zero (outside mask) or non-zero (inside mask) + NVCV_ASSERT(srcData.stride(idsNDHW.w) == sizeof(SrcT)); NVCV_ASSERT(dstData.stride(idsNDHW.w) == sizeof(DstT)); + if ((srcData.stride(idsNDHW.z) > nvcv::cuda::TypeTraits::max) + || (idsNDHW.y != -1 && srcData.stride(idsNDHW.y) > nvcv::cuda::TypeTraits::max) + || (idsNDHW.x != -1 && srcData.stride(idsNDHW.x) > nvcv::cuda::TypeTraits::max)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big in tensor"); + } + if ((dstData.stride(idsNDHW.z) > nvcv::cuda::TypeTraits::max) + || (idsNDHW.y != -1 && 
dstData.stride(idsNDHW.y) > nvcv::cuda::TypeTraits::max) + || (idsNDHW.x != -1 && dstData.stride(idsNDHW.x) > nvcv::cuda::TypeTraits::max)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big out tensor"); + } + + nvcv::Optional mskData; + int4 mskIdsNDHW = {0, 0, 0, 0}; + int maskN = 0; + bool hasMask = (mask) ? true : false; + + if (hasMask) + { + mskData = mask.exportData(); // export data check already done + + mskIdsNDHW = int4{mskData->layout().find('N'), mskData->layout().find('D'), mskData->layout().find('H'), + mskData->layout().find('W')}; + + NVCV_ASSERT(mskData->stride(mskIdsNDHW.w) == sizeof(MskT)); + + if ((mskData->stride(mskIdsNDHW.z) > nvcv::cuda::TypeTraits::max) + || (mskIdsNDHW.y != -1 && mskData->stride(mskIdsNDHW.y) > nvcv::cuda::TypeTraits::max) + || (mskIdsNDHW.x != -1 && mskData->stride(mskIdsNDHW.x) > nvcv::cuda::TypeTraits::max) + || (mskIdsNDHW.x != -1 && mskData->shape()[mskIdsNDHW.x] > nvcv::cuda::TypeTraits::max)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big mask tensor"); + } + + maskN = mskIdsNDHW.x == -1 ? 1 : (int)mskData->shape()[mskIdsNDHW.x]; + } + cuda::Tensor1DWrap bgLabelWrap, minThreshWrap, maxThreshWrap; cuda::Tensor1DWrap minSizeWrap, countWrap; cuda::Tensor3DWrap statsWrap; @@ -1380,6 +1521,15 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu cuda::Tensor3DWrap srcWrap(srcData.basePtr(), srcStridesNH.x, srcStridesNH.y); cuda::Tensor3DWrap dstWrap(dstData.basePtr(), dstStridesNH.x, dstStridesNH.y); + cuda::Tensor3DWrap mskWrap; + + if (hasMask) + { + int2 mskStridesNH{0, (int)mskData->stride(mskIdsNDHW.z)}; + mskStridesNH.x = mskIdsNDHW.x == -1 ? 
mskStridesNH.y * shapeWHDN.y : (int)mskData->stride(mskIdsNDHW.x); + + mskWrap = cuda::Tensor3DWrap(mskData->basePtr(), mskStridesNH.x, mskStridesNH.y); + } BlockLabel2D <<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, sizeWH); @@ -1404,15 +1554,17 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu if (stats) { - ComputeStats2D<<>>(statsWrap, dstWrap, bgLabelWrap, sizeWH, relabel); + ComputeStats2D<<>>(statsWrap, dstWrap, mskWrap, bgLabelWrap, sizeWH, + maskN, relabel); if (minSize) { RemoveIslands2D<<>>(statsWrap, dstWrap, bgLabelWrap, minSizeWrap, - sizeWH, relabel); + sizeWH, relabel, hasMask); } - Relabel2D<<>>(statsWrap, dstWrap, bgLabelWrap, sizeWH, relabel); + Relabel2D<<>>(statsWrap, dstWrap, bgLabelWrap, minSizeWrap, sizeWH, + relabel, hasMask); } } } @@ -1432,6 +1584,15 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu cuda::Tensor4DWrap srcWrap(srcData.basePtr(), srcStridesNDH.x, srcStridesNDH.y, srcStridesNDH.z); cuda::Tensor4DWrap dstWrap(dstData.basePtr(), dstStridesNDH.x, dstStridesNDH.y, dstStridesNDH.z); + cuda::Tensor4DWrap mskWrap; + + if (hasMask) + { + int3 mskStridesNDH{0, (int)mskData->stride(mskIdsNDHW.y), (int)mskData->stride(mskIdsNDHW.z)}; + mskStridesNDH.x = mskIdsNDHW.x == -1 ? 
mskStridesNDH.y * shapeWHDN.z : (int)mskData->stride(mskIdsNDHW.x); + + mskWrap = cuda::Tensor4DWrap(mskData->basePtr(), mskStridesNDH.x, mskStridesNDH.y, mskStridesNDH.z); + } BlockLabel3D <<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, shapeWHDN); @@ -1459,16 +1620,17 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu if (stats) { - ComputeStats3D<<>>(statsWrap, dstWrap, bgLabelWrap, shapeWHDN, - relabel); + ComputeStats3D<<>>(statsWrap, dstWrap, mskWrap, bgLabelWrap, + shapeWHDN, maskN, relabel); if (minSize) { RemoveIslands3D<<>>(statsWrap, dstWrap, bgLabelWrap, minSizeWrap, - shapeWHDN, relabel); + shapeWHDN, relabel, hasMask); } - Relabel3D<<>>(statsWrap, dstWrap, bgLabelWrap, shapeWHDN, relabel); + Relabel3D<<>>(statsWrap, dstWrap, bgLabelWrap, minSizeWrap, shapeWHDN, + relabel, hasMask); } } } @@ -1477,15 +1639,15 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu inline void RunLabel(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, const nvcv::TensorDataStridedCuda &dstData, const int4 &srcShape, nvcv::DataType srcDataType, const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, - const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, int numDim, - bool relabel) + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, + const nvcv::Tensor &mask, int numDim, bool relabel) { switch (srcDataType) { #define CVCUDA_LABEL_CASE(DT, T) \ case nvcv::TYPE_##DT: \ RunLabelForType(stream, srcData, dstData, srcShape, bgLabel, minThresh, maxThresh, minSize, count, stats, \ - numDim, relabel); \ + mask, numDim, relabel); \ break CVCUDA_LABEL_CASE(U8, uint8_t); @@ -1515,7 +1677,8 @@ Label::Label() {} void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const 
nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, - NVCVConnectivityType connectivity, NVCVLabelType assignLabels) const + const nvcv::Tensor &mask, NVCVConnectivityType connectivity, NVCVLabelType assignLabels, + NVCVLabelMaskType maskType) const { if (!(in.shape().layout() == nvcv::TENSOR_HW || in.shape().layout() == nvcv::TENSOR_HWC || in.shape().layout() == nvcv::TENSOR_NHW || in.shape().layout() == nvcv::TENSOR_NHWC @@ -1532,9 +1695,10 @@ void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv:: throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input and output tensors must have the same shape and layout"); } - if (!(out.dtype() == nvcv::TYPE_U32)) + if (!(out.dtype() == nvcv::TYPE_S32 || out.dtype() == nvcv::TYPE_U32)) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output tensor data type must be U32"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output tensor data type (%s) must be S32 or U32", + nvcvDataTypeGetName(out.dtype())); } auto inData = in.exportData(); @@ -1549,12 +1713,6 @@ void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv:: throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output tensor must be cuda-accessible"); } - if (outData->stride(0) >= cuda::TypeTraits::max - || (uint32_t)outData->stride(0) / (uint32_t)sizeof(uint32_t) >= (uint32_t)(1 << 31)) - { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big input and output tensors"); - } - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); if (!inAccess) { @@ -1613,6 +1771,16 @@ void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv:: } } + // Various outputs assume the maximum range of values to be representable as int, for this to happen the number + // of elements (pixels or voxels) in the input must not be greater than maximum int32_t + int64_t numElems = inShape.x * inShape.y * inShape.z; + 
if (numElems > cuda::TypeTraits::max) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Too big input shape with %ld elements, must be smaller than or equal to %d", numElems, + cuda::TypeTraits::max); + } + if (bgLabel) { if (!((bgLabel.rank() == 1 && bgLabel.shape()[0] == inShape.w) @@ -1681,9 +1849,10 @@ void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv:: "Output count must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, oss.str().c_str()); } - if (!(count.dtype() == nvcv::TYPE_U32)) + if (!(count.dtype() == nvcv::TYPE_S32 || count.dtype() == nvcv::TYPE_U32)) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output count (%s) must have U32 data type", + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output count (%s) must have S32 or U32 data type", nvcvDataTypeGetName(count.dtype())); } } @@ -1694,17 +1863,18 @@ void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv:: { throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output stats requires count tensor"); } - if (!((stats.rank() == 3 && stats.shape()[0] == inShape.w && stats.shape()[2] == 2 + 2 * numDim))) + if (!((stats.rank() == 3 && stats.shape()[0] == inShape.w && stats.shape()[2] == 3 + 2 * numDim))) { std::ostringstream oss; oss << stats.shape(); throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output stats must be [NMA] tensor, with rank=3 N=%d A=%d, got %s", inShape.w, - 2 + 2 * numDim, oss.str().c_str()); + 3 + 2 * numDim, oss.str().c_str()); } - if (!(stats.dtype() == nvcv::TYPE_U32)) + if (!(stats.dtype() == nvcv::TYPE_S32 || stats.dtype() == nvcv::TYPE_U32)) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output stats (%s) must have U32 data type", + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output stats (%s) must have S32 or U32 data type", nvcvDataTypeGetName(stats.dtype())); } } @@ -1731,20 +1901,99 @@ void 
Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv:: "Input minSize must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, oss.str().c_str()); } - if (!(minSize.dtype() == nvcv::TYPE_U32)) + if (!(minSize.dtype() == nvcv::TYPE_S32 || minSize.dtype() == nvcv::TYPE_U32)) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input minSize (%s) must have U32 data type", + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input minSize (%s) must have S32 or U32 data type", nvcvDataTypeGetName(minSize.dtype())); } } + if (mask) + { + if (!minSize && maskType == NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input mask requires minSize tensor " + "when maskType is NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY"); + } + if (!(maskType == NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Mask type must be " + "NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY"); + } + + auto maskData = in.exportData(); + if (!maskData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Mask tensor must be cuda-accessible"); + } + + auto maskAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*maskData); + if (!maskAccess) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Mask tensor must have strided access"); + } + if (!(maskAccess->numChannels() == 1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Mask tensor must have a single channel"); + } + if (!(maskAccess->numPlanes() == 1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Mask tensor must have a single plane"); + } + if (!(maskAccess->numSamples() == 1 || maskAccess->numSamples() == inShape.w)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Mask tensor must have number of samples N=1 " + "or same N as input and output tensors"); + } + if (!(maskAccess->numCols() == inShape.x)) + 
{ + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Mask tensor must have the same width W " + "as input and output tensors"); + } + if (!(maskAccess->numRows() == inShape.y)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Mask tensor must have the same height H " + "as input and output tensors"); + } + + int maskDepthIdx = mask.shape().layout().find('D'); + if (maskDepthIdx != -1) + { + if (mask.shape()[maskDepthIdx] != inShape.z) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Mask tensor must have the same depth D " + "as input and output tensors"); + } + } + else + { + if (numDim == 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Tensors in and out are 3D and mask is 2D"); + } + } + + if (!(mask.dtype() == nvcv::TYPE_S8 || mask.dtype() == nvcv::TYPE_U8)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input mask (%s) must have S8 or U8 data type", + nvcvDataTypeGetName(mask.dtype())); + } + } + // TODO: Support full connectivity if (connectivity == NVCV_CONNECTIVITY_8_2D || connectivity == NVCV_CONNECTIVITY_26_3D) { throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Full neighborhood labeling not supported yet"); } - RunLabel(stream, *inData, *outData, inShape, in.dtype(), bgLabel, minThresh, maxThresh, minSize, count, stats, + RunLabel(stream, *inData, *outData, inShape, in.dtype(), bgLabel, minThresh, maxThresh, minSize, count, stats, mask, numDim, relabel); } diff --git a/src/cvcuda/priv/OpLabel.hpp b/src/cvcuda/priv/OpLabel.hpp index 08d34f33..d397d90e 100644 --- a/src/cvcuda/priv/OpLabel.hpp +++ b/src/cvcuda/priv/OpLabel.hpp @@ -39,8 +39,8 @@ class Label final : public IOperator void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize, - const nvcv::Tensor &count, const nvcv::Tensor &stats, 
NVCVConnectivityType connectivity, - NVCVLabelType assignLabels) const; + const nvcv::Tensor &count, const nvcv::Tensor &stats, const nvcv::Tensor &mask, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType) const; }; } // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpRemap.cu b/src/cvcuda/priv/OpRemap.cu index fc9a65db..427c5c59 100644 --- a/src/cvcuda/priv/OpRemap.cu +++ b/src/cvcuda/priv/OpRemap.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,6 +26,7 @@ #include #include #include +#include #include namespace cuda = nvcv::cuda; diff --git a/src/cvcuda/priv/legacy/CMakeLists.txt b/src/cvcuda/priv/legacy/CMakeLists.txt index 11a2a517..53bed6c4 100644 --- a/src/cvcuda/priv/legacy/CMakeLists.txt +++ b/src/cvcuda/priv/legacy/CMakeLists.txt @@ -16,7 +16,6 @@ set(CV_CUDA_PRIV_LEGACY_FILES CvCudaLegacyHelpers.cpp) set(CV_CUDA_PRIV_LEGACY_OP_FILES - find_contours.cu filter_utils.cu custom_crop.cu reformat.cu diff --git a/src/cvcuda/priv/legacy/CvCudaLegacy.h b/src/cvcuda/priv/legacy/CvCudaLegacy.h index f2919dd9..5d2f42c3 100644 --- a/src/cvcuda/priv/legacy/CvCudaLegacy.h +++ b/src/cvcuda/priv/legacy/CvCudaLegacy.h @@ -442,13 +442,6 @@ class MinAreaRect : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const TensorDataStridedCuda &numPointsInContour, const int totalContours, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be 
used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, int maxContourNum); private: int mMaxContourNum; @@ -1507,16 +1500,6 @@ class Gaussian : public CudaBaseOp ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, Size2D kernelSize, double2 sigma, NVCVBorderType borderMode, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - * @param maxKernelSize Maximum Gaussian kernel size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type, - Size2D maxKernelSize); - private: Size2D m_maxKernelSize = {0, 0}; Size2D m_curKernelSize = {0, 0}; @@ -1625,16 +1608,6 @@ class AverageBlur : public CudaBaseOp ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - * @param maxKernelSize Maximum average blur kernel size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type, - Size2D maxKernelSize); - private: Size2D m_maxKernelSize = {0, 0}; Size2D m_curKernelSize = {0, 0}; @@ -1924,13 +1897,6 @@ class GaussianVarShape : public CudaBaseOp const TensorDataStridedCuda &kernelSize, const TensorDataStridedCuda &sigma, NVCVBorderType borderMode, cudaStream_t stream); - /** - * @brief calculate the gpu buffer size needed by this 
operator - * @param maxKernelSize Maximum Gaussian kernel size that may be used - * @param maxBatchSize Maximum batch size that may be used - */ - size_t calBufferSize(Size2D maxKernelSize, int maxBatchSize); - private: Size2D m_maxKernelSize = {0, 0}; int m_maxBatchSize = 0; @@ -2005,13 +1971,6 @@ class AverageBlurVarShape : public CudaBaseOp const TensorDataStridedCuda &kernelSize, const TensorDataStridedCuda &kernelAnchor, NVCVBorderType borderMode, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param maxKernelSize Maximum Gaussian kernel size that may be used - * @param maxBatchSize Maximum batch size that may be used - */ - size_t calBufferSize(Size2D maxKernelSize, int maxBatchSize); - private: Size2D m_maxKernelSize = {0, 0}; int m_maxBatchSize = 0; @@ -2595,13 +2554,6 @@ class AdaptiveThreshold : public CudaBaseOp ErrorCode infer(const TensorDataStridedCuda &in, const TensorDataStridedCuda &out, const double maxValue, const NVCVAdaptiveThresholdType adaptiveMethod, const NVCVThresholdType thresholdType, const int32_t blockSize, const double c, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param maxInputShape maximum input DataShape that may be used - * @param maxOutputShape maximum output DataShape that may be used - * @param maxBlockSize maximum block size that may be used - */ - size_t calBufferSize(DataShape maxInputShape, DataShape maxOutputShape, int maxBlockSize); private: int m_blockSize = -1; @@ -2637,15 +2589,6 @@ class AdaptiveThresholdVarShape : public CudaBaseOp const NVCVThresholdType thresholdType, const TensorDataStridedCuda &blockSize, const TensorDataStridedCuda &c, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param maxInputShape maximum input DataShape that may be used - * @param maxOutputShape maximum output DataShape that may be used - * @param maxBlockSize maximum block 
size that may be used - * @param maxBatchSize maximum batch size that may be used - */ - size_t calBufferSize(DataShape maxInputShape, DataShape maxOutputShape, int maxBlockSize, int maxBatchSize); - private: const int m_maxBatchSize; const int m_maxBlockSize; @@ -2912,77 +2855,6 @@ class HistogramEqVarShape : public CudaBaseOp std::byte *m_histoArray; }; -class FindContours : public CudaBaseOp -{ -public: - static constexpr int32_t MAX_NUM_CONTOURS = 256; - static constexpr int32_t MAX_CONTOUR_POINTS = 4 * 1024; - static constexpr int32_t MAX_TOTAL_POINTS = MAX_NUM_CONTOURS * MAX_CONTOUR_POINTS; - - FindContours() = delete; - FindContours(DataShape max_input_shape, DataShape max_output_shape); - - ~FindContours(); - - /** - * Limitations: - * - * Input: - * Data Layout: [kNHWC, kHWC] - * Channels: [1] - * - * | Data Type | Allowed | - * |-----------------|-------------| - * | 8bit Unsigned | Yes | - * ... [other types] - * - * Output: - * Data Layout: [kNCW, CW] - * Width: [2] - * - * | Data Type | Allowed | - * |-----------------|-------------| - * | 32bit Signed | Yes | - * ... [other types] - * - * - Input/Output Dependency: - * | Property | Input == Output | - * |-----------------|-----------------| - * | Data Layout | Yes | - * ... [other properties] - * - * @brief Extracts contours from a binary image. - * - * @param inData GPU pointer to input data. Represents an 8-bit, unsigned, - * single-channel image. Non-zero pixels are treated as 1's, and zero - * pixels remain as 0's, which makes the image binary. - * @param outData GPU pointer to output data. It contains the detected - * contours for the input image. The data is structured as: [x_c0_p0, - * y_c0_p0, ..., x_ci_pj, y_ci_pj, ...], where "ci" denotes a contour's - * index in the output array and "pj" is a point's index within a - * contour. - * @param numPoints Holds the number of contour points for each image. 
- * Specifically, numPoints[i] gives the number of contours for the i-th - * image, while numPoints[i][j] gives the number of points in the j-th - * contour of i-th image. - * @param stream CUDA stream for asynchronous execution. - */ - ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const TensorDataStridedCuda &numPoints, cudaStream_t stream); - - /** - * @brief Computes the necessary GPU buffer size for the operation. - * - * @param max_input_shape The largest possible shape for input data. - * @param max_output_shape The largest possible shape for output data. - * @param max_data_type The data type of the maximum size that is used. - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - -private: - void *gpu_workspace{nullptr}; -}; - } // namespace nvcv::legacy::cuda_op #endif // CV_CUDA_LEGACY_H diff --git a/src/cvcuda/priv/legacy/adaptive_threshold.cu b/src/cvcuda/priv/legacy/adaptive_threshold.cu index b4274480..56bdb126 100644 --- a/src/cvcuda/priv/legacy/adaptive_threshold.cu +++ b/src/cvcuda/priv/legacy/adaptive_threshold.cu @@ -169,11 +169,6 @@ AdaptiveThreshold::~AdaptiveThreshold() NVCV_CHECK_LOG(cudaFree(m_kernel)); } -size_t AdaptiveThreshold::calBufferSize(DataShape maxInputShape, DataShape maxOutputShape, int maxBlockSize) -{ - return maxBlockSize * maxBlockSize * sizeof(float); -} - ErrorCode AdaptiveThreshold::infer(const TensorDataStridedCuda &in, const TensorDataStridedCuda &out, const double maxValue, const NVCVAdaptiveThresholdType adaptiveMethod, const NVCVThresholdType thresholdType, const int32_t blockSize, const double c, diff --git a/src/cvcuda/priv/legacy/adaptive_threshold_var_shape.cu b/src/cvcuda/priv/legacy/adaptive_threshold_var_shape.cu index eac9e667..8f372ea1 100644 --- a/src/cvcuda/priv/legacy/adaptive_threshold_var_shape.cu +++ b/src/cvcuda/priv/legacy/adaptive_threshold_var_shape.cu @@ -195,12 +195,6 @@ 
AdaptiveThresholdVarShape::~AdaptiveThresholdVarShape() NVCV_CHECK_LOG(cudaFree(m_kernel)); } -size_t AdaptiveThresholdVarShape::calBufferSize(DataShape maxInputShape, DataShape maxOutputShape, int maxBlockSize, - int maxBatchSize) -{ - return sizeof(float) * maxBatchSize * maxBlockSize * maxBlockSize; -} - ErrorCode AdaptiveThresholdVarShape::infer(const ImageBatchVarShapeDataStridedCuda &in, const ImageBatchVarShapeDataStridedCuda &out, const TensorDataStridedCuda &maxValue, diff --git a/src/cvcuda/priv/legacy/filter.cu b/src/cvcuda/priv/legacy/filter.cu index 105f9260..3ec059be 100644 --- a/src/cvcuda/priv/legacy/filter.cu +++ b/src/cvcuda/priv/legacy/filter.cu @@ -237,12 +237,6 @@ Gaussian::~Gaussian() NVCV_CHECK_LOG(cudaFree(m_kernel)); } -size_t Gaussian::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type, - Size2D maxKernelSize) -{ - return maxKernelSize.w * maxKernelSize.h * sizeof(float); -} - ErrorCode Gaussian::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, Size2D kernelSize, double2 sigma, NVCVBorderType borderMode, cudaStream_t stream) { @@ -360,12 +354,6 @@ AverageBlur::~AverageBlur() NVCV_CHECK_LOG(cudaFree(m_kernel)); } -size_t AverageBlur::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type, - Size2D maxKernelSize) -{ - return maxKernelSize.w * maxKernelSize.h * sizeof(float); -} - ErrorCode AverageBlur::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/filter_var_shape.cu b/src/cvcuda/priv/legacy/filter_var_shape.cu index a2069509..e8611423 100644 --- a/src/cvcuda/priv/legacy/filter_var_shape.cu +++ b/src/cvcuda/priv/legacy/filter_var_shape.cu @@ -480,11 +480,6 @@ GaussianVarShape::~GaussianVarShape() NVCV_CHECK_LOG(cudaFree(m_kernel)); } -size_t 
GaussianVarShape::calBufferSize(Size2D maxKernelSize, int maxBatchSize) -{ - return maxKernelSize.w * maxKernelSize.h * maxBatchSize * sizeof(float); -} - ErrorCode GaussianVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, const TensorDataStridedCuda &kernelSize, const TensorDataStridedCuda &sigma, @@ -689,11 +684,6 @@ AverageBlurVarShape::~AverageBlurVarShape() NVCV_CHECK_LOG(cudaFree(m_kernel)); } -size_t AverageBlurVarShape::calBufferSize(Size2D maxKernelSize, int maxBatchSize) -{ - return maxKernelSize.w * maxKernelSize.h * maxBatchSize * sizeof(float); -} - ErrorCode AverageBlurVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, const TensorDataStridedCuda &kernelSize, const TensorDataStridedCuda &kernelAnchor, diff --git a/src/cvcuda/priv/legacy/find_contours.cu b/src/cvcuda/priv/legacy/find_contours.cu deleted file mode 100644 index abcb798c..00000000 --- a/src/cvcuda/priv/legacy/find_contours.cu +++ /dev/null @@ -1,1238 +0,0 @@ -/* Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * SPDX-License-Identifier: Apache-2.0 - * - * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. - * Copyright (C) 2021-2022, Bytedance Inc. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "CvCudaLegacy.h" -#include "CvCudaLegacyHelpers.hpp" - -#include "CvCudaUtils.cuh" - -#include -#include -#include - -#include - -namespace cg = cooperative_groups; - -namespace nvcv::legacy::cuda_op { - -template -__forceinline__ __host__ __device__ std::enable_if_t::value, T> mod(const T &a, const T &b) -{ - T c = a % b; - return (c < 0) ? (c + b) : c; -} - -// Importing scope of the helpers namespace -using namespace nvcv::legacy::helpers; - -using CountType = uint32_t; -using IndexType = int32_t; -using PixelType = uint8_t; -using LabelType = IndexType; -using PointType = int2; -using CoordType = uint3; -using MaskType = uint32_t; - -template -using DeviceImage = cuda::FullTensorWrap; -using BoundaryLabel = Ptr2dNHWC; -using Neighborhood = Ptr2dNHWC; -using ConnectList = Ptr2dNHWC; -using CountList = Ptr2dNL; -using NodeList = cuda::FullTensorWrap; -using NodeCounts = cuda::FullTensorWrap; - -using KernelGrid = cg::grid_group; -using KernelBlock = cg::thread_block; -using KernelWarp = cg::thread_block_tile<32, KernelBlock>; -using ActiveWarp = cg::coalesced_group; - -template -class SharedHWWrapper -{ -public: - using type = SharedHWWrapper; - using value_type = ValueType; - using size_type = IndexType; - - __forceinline__ __device__ SharedHWWrapper() - : m_height{0} - , m_width{0} - , m_data{nullptr} - { - } - - __forceinline__ __device__ SharedHWWrapper(const type &other) - : m_height{other.m_height} - , m_width{other.m_width} - , m_data{other.m_data} - { - } - - __forceinline__ __device__ SharedHWWrapper(type &&other) - { - this->m_height = other.m_height; - other.m_height = 0; - this->m_width = other.m_width; - other.m_width = 0; - this->m_data = other.m_data; - other.m_data = nullptr; - } - - __forceinline__ __device__ SharedHWWrapper(size_type rows, size_type cols, ValueType *data) - : m_height{rows} - , m_width{cols} - , m_data{data} - { - } - - __forceinline__ __device__ ~SharedHWWrapper() - { - m_height = 0; - m_width = 0; - m_data = 
nullptr; - } - - __forceinline__ __device__ type &operator=(const type &other) - { - this->m_height = other.m_height; - this->m_width = other.m_width; - this->m_data = other.m_data; - return *this; - } - - __forceinline__ __device__ type &operator=(type &&other) - { - this->m_height = other.m_height; - other.m_height = 0; - this->m_width = other.m_width; - other.m_width = 0; - this->m_data = other.m_data; - other.m_data = nullptr; - return *this; - } - - __forceinline__ __device__ value_type &operator[](IndexType index) - { - assert(0 <= index && index < static_cast(m_width * m_height)); - return this->m_data[index]; - } - - const __forceinline__ __device__ value_type &operator[](IndexType index) const - { - assert(0 <= index && index < static_cast(m_width * m_height)); - return this->m_data[index]; - } - - __forceinline__ __device__ value_type &operator[](PointType pos) - { - using AxisType = decltype(pos.x); - assert(0 <= pos.x && pos.x < static_cast(m_width)); - assert(0 <= pos.y && pos.y < static_cast(m_height)); - return (*this)[this->pointToIndex(pos)]; - } - - const __forceinline__ __device__ value_type &operator[](PointType pos) const - { - using AxisType = decltype(pos.x); - assert(0 <= pos.x && pos.x < static_cast(m_width)); - assert(0 <= pos.y && pos.y < static_cast(m_height)); - return (*this)[this->pointToIndex(pos)]; - } - - const __forceinline__ __device__ value_type *ptr(IndexType index) const - { - return &((*this)[index]); - } - - const __forceinline__ __device__ value_type *ptr(PointType pos) const - { - return &((*this)[pos]); - } - - __forceinline__ __device__ IndexType pointToIndex(PointType pos) const - { - return static_cast(pos.y * this->m_width + pos.x); - } - - __forceinline__ __device__ PointType indexToPoint(IndexType index) const - { - return PointType{mod(index, m_width), index / m_width}; - } - - __forceinline__ __device__ size_type height() const - { - return m_height; - } - - __forceinline__ __device__ size_type width() const - { - 
return m_width; - } - - __forceinline__ __device__ size_type volume() const - { - return this->height() * this->width(); - } - -private: - size_type m_height{0}; - size_type m_width{0}; - - value_type *m_data{nullptr}; -}; - -// Representing the shared memory in int32 for to avoid bank conflicts -using SharedImage = SharedHWWrapper; -using SharedLabel = SharedHWWrapper; - -template -class LocalQueue -{ -public: - __device__ LocalQueue() - : m_front(0) - , m_back(0) - { - } - - // Push an item to the queue; returns false if the queue is full. - __device__ bool push(const ValueType &value) - { - if (mod(m_back + 1, MAX_SIZE) == m_front) - return false; // Queue is full - - m_data[m_back] = value; - m_back = mod(m_back + 1, MAX_SIZE); - - return true; - } - - __device__ IndexType pushOrDelete(const ValueType &value) - { - IndexType removeAt = MAX_SIZE; - - // Check if value is already in the queue - for (auto i = m_front; i != m_back; i = mod(i + 1, MAX_SIZE)) - { - if (m_data[i] == value) - { - // Remove the value by shifting everything to the left - removeAt = i; - this->remove(i); - i = m_back; - } - } - - // If we've reached here, value is not in the queue. Push it. - if (removeAt == MAX_SIZE) - { - push(value); - } - - return removeAt; - } - - __device__ void remove(IndexType index) - { - // Remove the value by shifting everything to the left - for (auto j = mod(index + 1, MAX_SIZE); j != m_back; j = mod(j + 1, MAX_SIZE)) - { - m_data[index] = m_data[j]; - index = j; - j = mod(j + 1, MAX_SIZE); - } - m_back = mod(m_back - 1, MAX_SIZE); - } - - // Pop an item from the queue; returns false if the queue is empty. - __device__ bool pop(ValueType &value) - { - if (m_front == m_back) - return false; // Queue is empty - - value = m_data[m_front]; - m_front = mod(m_front + 1, MAX_SIZE); - - return true; - } - - // Check if the queue is empty. - __device__ bool isEmpty() const - { - return m_front == m_back; - } - - // Check if the queue is full. 
- __device__ bool isFull() const - { - return mod(m_back + 1, MAX_SIZE) == m_front; - } - -private: - ValueType m_data[MAX_SIZE]; // Array to store the queue's elements. - IndexType m_front; // Index of the front element. - IndexType m_back; // Index where the next element will be pushed. -}; - -// Creating a list of point offsets for 8-point stencil which are specified -// in clock-wise rotating order from top-left -__constant__ PointType OFFSET[8] = { - {-1, -1}, // 0 ==> Left-Up - { 0, -1}, // 1 ==> Up - { 1, -1}, // 2 ==> Right-Up - { 1, 0}, // 3 ==> Right - { 1, 1}, // 4 ==> Right-Down - { 0, 1}, // 5 ==> Down - {-1, 1}, // 6 ==> Left-Down - {-1, 0} // 7 ==> Left -}; - -// Pre-declarations -/******************************************************************************/ -template -__device__ SharedValueType globalAt(const DeviceImage &global, PointType pos, IndexType batch, - GlobalValueType defaultValue = 0); - -template -__device__ void doCopyDirected(const DeviceImage &global, PointType globalPos, PointType localPos, - IndexType batch, PointType direction, GlobalValueType defaultValue, - SharedHWWrapper &shared); - -template -__device__ void copyGlobalToShared(const DeviceImage &global, PointType globalPos, PointType localPos, - IndexType batch, GlobalValueType defaultValue, - SharedHWWrapper &shared); - -__device__ MaskType getNeighborhoodMask(const SharedImage &sharedImage, PointType localPos); - -__device__ bool isEdgePixel(const SharedImage &sharedImage, PointType localPos); - -__device__ void setLabels(const SharedImage &sharedImage, PointType localPos, SharedLabel &sharedLabels, - PointType globalPos, IndexType width, IndexType height); - -__device__ LabelType findRoot(const BoundaryLabel &labels, CoordType pos, LabelType badLabel); - -__device__ LabelType minLabelInNeighborhood(const BoundaryLabel &labels, CoordType pos, LabelType badLabel, - Neighborhood &neighbors); - -__device__ void resolveRoots(BoundaryLabel &segments, CoordType pos, LabelType 
badLabel, Neighborhood &neighbors); - -__device__ IndexType nextDirectionNot(const Neighborhood &neighbors, IndexType from, IndexType lastDir, CoordType pos, - bool flipDir = true); - -__device__ LabelType findHead(const BoundaryLabel &labels, const BoundaryLabel &segments, const Neighborhood &neighbors, - CoordType pos, IndexType from, IndexType &lastDir); - -__device__ void traverseContour(const BoundaryLabel &labels, const BoundaryLabel &segments, - const BoundaryLabel &connectedComponents, Neighborhood &neighbors, CoordType pos, - ConnectList &connectList, CountList &nodeCount, CountType *contourCount, - LabelType badLabel); - -template -__host__ void findContours_impl(DeviceImage &dImage, LabelType *dLabels, LabelType *dSegments, - LabelType *dConnectedComponents, MaskType *dNeighbors, IndexType *dConnectList, - CountType *dNodeCount, CountType *dContourCount, NodeList &dNodeList, - NodeCounts &dPointCount, IndexType height, IndexType width, IndexType batchSize, - cudaStream_t stream); - -/******************************************************************************/ - -template -__forceinline__ __device__ SharedValueType globalAt(const DeviceImage &global, PointType pos, - IndexType batch, GlobalValueType defaultValue) -{ - SharedValueType result = static_cast(defaultValue); - - // Batch size is 1 - if (0 <= pos.x && pos.x < global.shapes()[2] && 0 <= pos.y && pos.y < global.shapes()[1] && 0 <= batch - && batch < global.shapes()[0]) - { - result = static_cast(*global.ptr(batch, pos.y, pos.x) > 0); - } - - return result; -} - -template -__forceinline__ __device__ void doCopyDirected(const DeviceImage &global, PointType globalPos, - PointType localPos, IndexType batch, PointType direction, - GlobalValueType defaultValue, SharedHWWrapper &shared) -{ - for (IndexType i = 1; i <= PAD_SIZE; ++i) - { - for (IndexType j = 1; j <= PAD_SIZE; ++j) - { - const auto offset = PointType{i * direction.x, j * direction.y}; - shared[localPos + offset] - = globalAt(global, 
globalPos + offset, batch, defaultValue); - } - } -} - -template -__device__ void copyGlobalToShared(const DeviceImage &global, PointType globalPos, PointType localPos, - IndexType batch, GlobalValueType defaultValue, - SharedHWWrapper &shared) -{ - // Copying over all data within the boundary - shared[localPos] = globalAt(global, globalPos, batch, defaultValue); - - if (localPos.x == PAD_SIZE) - { - doCopyDirected(global, globalPos, localPos, batch, OFFSET[7], defaultValue, shared); - } - if (localPos.x == (shared.width() - 1 - PAD_SIZE)) - { - doCopyDirected(global, globalPos, localPos, batch, OFFSET[3], defaultValue, shared); - } - if (localPos.y == PAD_SIZE) - { - doCopyDirected(global, globalPos, localPos, batch, OFFSET[1], defaultValue, shared); - } - if (localPos.y == (shared.height() - 1 - PAD_SIZE)) - { - doCopyDirected(global, globalPos, localPos, batch, OFFSET[5], defaultValue, shared); - } - if (localPos.x == PAD_SIZE && localPos.y == PAD_SIZE) - { - doCopyDirected(global, globalPos, localPos, batch, OFFSET[0], defaultValue, shared); - } - if (localPos.x == (shared.width() - 1 - PAD_SIZE) && localPos.y == PAD_SIZE) - { - doCopyDirected(global, globalPos, localPos, batch, OFFSET[2], defaultValue, shared); - } - if (localPos.x == PAD_SIZE && localPos.y == (shared.height() - 1 - PAD_SIZE)) - { - doCopyDirected(global, globalPos, localPos, batch, OFFSET[6], defaultValue, shared); - } - if (localPos.x == (shared.width() - 1 - PAD_SIZE) && localPos.y == (shared.height() - 1 - PAD_SIZE)) - { - doCopyDirected(global, globalPos, localPos, batch, OFFSET[4], defaultValue, shared); - } -} - -__device__ MaskType getNeighborhoodMask(const SharedImage &sharedImage, PointType localPos) -{ - MaskType neighborhoodMask = 0; - - for (auto dir = 0; dir < 8; ++dir) - { - const auto neighborPos = localPos + OFFSET[dir]; - - neighborhoodMask |= ((sharedImage[neighborPos] > 0) ? 
(1 << dir) : 0); - } - - return neighborhoodMask; -} - -__device__ bool isEdgePixel(const SharedImage &sharedImage, PointType localPos) -{ - // NOTE: This condition might need further thought. An edge pixel is a pixel - // with at least 1 zero pixel neighbor and at least two set neighbors - const auto neighborhood = getNeighborhoodMask(sharedImage, localPos); - return sharedImage[localPos] > 0 && __popc(neighborhood & 0xaa) < 4; -} - -__device__ void setLabels(const SharedImage &sharedImage, PointType localPos, SharedLabel &sharedLabels, - PointType globalPos, IndexType width, IndexType height) -{ - // Collecting boundary evaluation - const auto isBoundary = isEdgePixel(sharedImage, localPos); - const auto index = (localPos.y - 2) * sharedLabels.width() + (localPos.x - 2); - - // Collect boundary determination of neighbors - LabelType minIndex = isBoundary ? (globalPos.y * width + globalPos.x) : (width * height); - - for (auto i = 0; i < 4; ++i) - { - auto neighborPos = localPos + OFFSET[mod(i + 7, 8)]; - auto neighborIndex = (neighborPos.y - 2) * sharedLabels.width() + (neighborPos.x - 2); - auto neighborIsEdge = isBoundary && isEdgePixel(sharedImage, neighborPos); - - neighborPos = neighborPos - localPos + globalPos; - neighborIndex = neighborIsEdge ? (neighborPos.y * width + neighborPos.x) : (width * height); - - minIndex = min(minIndex, neighborIndex); - } - - sharedLabels[index] = minIndex; -} - -__device__ LabelType findRoot(const BoundaryLabel &labels, CoordType pos, LabelType badLabel) -{ - auto next = pos.y * labels.cols + pos.x; // Linearize the pixel position. - auto root = *labels.ptr(pos.z, pos.y, pos.x); - - // Keep finding the root until the root is a bad label or the next label is the root itself. - while (root != badLabel && next != root) - { - next = root; // Move on to the next label. - root = *(labels.ptr(pos.z, 0, 0) + root); // Fetch the next root label. - } - - return root; // Return the found root label. 
-} - -__device__ LabelType minLabelInNeighborhood(const BoundaryLabel &labels, CoordType pos, LabelType badLabel, - Neighborhood &neighbors) -{ - auto label = *labels.ptr(pos.z, pos.y, pos.x); - LabelType minLabel = badLabel; - - *neighbors.ptr(pos.z, pos.y, pos.x) = 0; - - // Loop through all 8 neighbors to find the smallest label. - for (auto dir = 1; dir < 8; dir += 2) - { - const auto neighborLabel = *labels.ptr(pos.z, pos.y + OFFSET[dir].y, pos.x + OFFSET[dir].x); - minLabel = min(neighborLabel, minLabel); - - // Update the edge neighbors in the flow structure based on valid neighbors. - *neighbors.ptr(pos.z, pos.y, pos.x) |= (neighborLabel != badLabel && label != badLabel) ? (1 << dir) : 0; - } - - return label == badLabel ? badLabel : minLabel; // Return the smallest label found. -} - -__device__ void resolveRoots(BoundaryLabel &segments, CoordType pos, LabelType badLabel, Neighborhood &neighbors) -{ - auto label1 = *segments.ptr(pos.z, pos.y, pos.x); - auto label2 = minLabelInNeighborhood(segments, pos, badLabel, neighbors); - auto label3 = badLabel; - - // Resolve the root for the label1 until it remains unchanged. - while (label1 != badLabel && label2 != badLabel && label1 != label3) - { - label3 = label1; - label1 = *(segments.ptr(pos.z, 0, 0) + label1); - } - - // Resolve the root for the label2 until it remains unchanged. - while (label1 != badLabel && label2 != badLabel && label2 != label3) - { - label3 = label2; - label2 = *(segments.ptr(pos.z, 0, 0) + label2); - } - - // Merge label1 and label2 if they are different and not bad labels. - while (label1 != badLabel && label2 != badLabel && label1 != label2) - { - label3 = atomicMin(segments.ptr(pos.z, 0, 0) + label1, label2); - label1 = label1 == label3 ? 
label2 : label3; - label2 = label3; - } -} - -__device__ IndexType nextDirectionNot(const Neighborhood &neighbors, IndexType from, IndexType lastDir, CoordType pos, - bool flipDir) -{ - // Start from the direction opposite (180 degrees) to the last direction. - // This is done by adding 4 (half of 8 directions) and taking modulo 8. - // This ensures the result lies between 0 and 7 (inclusive). - IndexType nextDirection = mod(lastDir + (flipDir ? 4 : 0), 8); - - // Loop to search for the next valid direction in a clockwise manner. - // The loop starts from 1 and iterates 7 times, covering all directions. - for (auto dir = 1; dir < 8; ++dir) - { - // Move in a clockwise manner by incrementing the direction - // and taking modulo 8 to ensure it stays in the valid range. - nextDirection = mod(nextDirection + dir, 8); - - // Check if the direction pointed by nextDirection is valid by inspecting - // the neighbors bitmask. If valid, break out of the loop. - if (((*(neighbors.ptr(pos.z, 0, 0) + from)) & (1 << nextDirection)) > 0) - { - break; - } - } - - // Return the determined valid direction. - return nextDirection; -} - -__device__ LabelType findHead(const BoundaryLabel &labels, const BoundaryLabel &segments, const Neighborhood &neighbors, - CoordType pos, IndexType from, IndexType &lastDir) -{ - // Begin at the starting position. - IndexType next = from; - - // If segments at this new pixel is equal to the pixel, end - while ((*(neighbors.ptr(pos.z, 0, 0) + next) & 0x87) != 0) - { - // Use nextDirectionNot to get the next direction to move in. - lastDir = nextDirectionNot(neighbors, next, lastDir - (next != from ? 0 : 1), pos, next != from); - - // Update the current position by moving in the direction provided by nextDirectionNot. - next += OFFSET[lastDir].y * labels.cols + OFFSET[lastDir].x; - } - - // Return the position that matches the condition. 
- return next; -} - -__device__ void traverseContour(const BoundaryLabel &labels, const BoundaryLabel &segments, - const BoundaryLabel &connectedComponents, Neighborhood &neighbors, CoordType pos, - ConnectList &connectList, CountList &nodeCount, CountType *contourCount, - LabelType badLabel) -{ - // Obtain the root label for the connected component. - // It represents the label assigned to this specific group of connected pixels. - auto root = *connectedComponents.ptr(pos.z, pos.y, pos.x); - - // The head is a reference pixel on the contour; essentially, our starting point. - auto head = *segments.ptr(pos.z, pos.y, pos.x); - - // The current pixel label we're working on. - auto next = *labels.ptr(pos.z, pos.y, pos.x); - auto curr = pos.y * labels.cols + pos.x; - - // Return early if the current pixel isn't the root pixel. - // This ensures we are only processing root pixels. - if (curr != root || root == badLabel) - return; - - // Get the contour neighbor data for the current pixel. - auto neighborhood = *neighbors.ptr(pos.z, pos.y, pos.x); - - // Calculate the first direction which has a neighbor on the edge. - auto nextDir = __ffs(static_cast(neighborhood)); - for (auto dir = nextDir; dir < 7; ++dir) - { - nextDir = (((1 << dir) & neighborhood) > 0) ? dir : nextDir; - } - auto lastDir = nextDir; - - // Prepare local queues for storing pixel labels and directions. - // These help manage which pixels/directions are processed next. - constexpr int MAX_SIZE = 64; - LocalQueue labelQueue; - LocalQueue dirQueue; - - // Initialize the queues with starting values. - labelQueue.push(root); - dirQueue.push(nextDir); - - // Adjust the next label based on the initial direction. - next += OFFSET[nextDir].y * labels.cols + OFFSET[nextDir].x; - - // Temporary variables for dequeuing operations. - LabelType frusLabel; - IndexType frusDir; - - // Keep processing pixels until there's nothing left in our queues. - auto &counts = labels.batches > 1 ? 
contourCount[pos.z] : *contourCount; - while (!labelQueue.isEmpty() && !dirQueue.isEmpty() && counts < FindContours::MAX_NUM_CONTOURS) - { - // Fetch the next label and direction from the front of our queues. - labelQueue.pop(frusLabel); - dirQueue.pop(frusDir); - - // Identify the contour's starting pixel for this segment. - head = findHead(labels, segments, neighbors, pos, frusLabel, frusDir); - neighborhood = *(neighbors.ptr(pos.z, 0, 0) + head); - if (neighborhood == 0) - { - continue; - } - - // Update tracking variables to work on the head pixel. - curr = head; - nextDir = mod(frusDir + (frusLabel == head ? 0 : 4), 8); - lastDir = mod(nextDirectionNot(neighbors, head, nextDir, pos) + 4, 8); - next = curr + OFFSET[nextDir].y * labels.cols + OFFSET[nextDir].x; - - // Increment the total count of contours. - IndexType contourIndex = atomicInc(&counts, FindContours::MAX_NUM_CONTOURS); - if (contourIndex == FindContours::MAX_NUM_CONTOURS) - { - atomicExch(&counts, FindContours::MAX_NUM_CONTOURS); - break; - } - *nodeCount.ptr(pos.z, contourIndex) = 0; - - // Traverse the contour until it loops back to the head pixel. - while (next != head && *nodeCount.ptr(pos.z, contourIndex) != FindContours::MAX_CONTOUR_POINTS) - { - // Register the current pixel to the contour. - IndexType pointIndex = (*nodeCount.ptr(pos.z, contourIndex))++; - *connectList.ptr(pos.z, contourIndex, pointIndex) = curr; - - // Update the next direction based on the neighbors. - for (auto dir = 1; dir < 8; ++dir) - { - nextDir = mod(lastDir + 4 - dir, 8); - next = curr + OFFSET[nextDir].y * labels.cols + OFFSET[nextDir].x; - - auto ccNext = *(connectedComponents.ptr(pos.z, 0, 0) + next); - if (ccNext == root) - { - break; - } - } - - // Move to the next pixel in the chosen direction. 
- curr = next; - lastDir = nextDir; - } - } -} - -template -__global__ void labelEdges(DeviceImage image, IndexType height, IndexType width, IndexType batchSize, - LabelType *dLabels) -{ - // NOTE: Potential for improvement to reduce thread divergences. - - // Shared memory buffer allocation. - extern __shared__ int32_t sharedBuffer[]; - - // Setting up the labels data structure. - BoundaryLabel labels{batchSize, height, width, 1, dLabels}; - - // Initializing cooperative groups for thread management. - auto grid = cg::this_grid(); - auto block = cg::this_thread_block(); - - // Deriving grid and block properties. - auto gridBlocks = grid.group_dim().x * grid.group_dim().y * grid.group_dim().z; - auto gridShape = grid.group_dim() * block.group_dim(); - auto blockHeight = block.group_dim().y; - auto blockWidth = block.group_dim().x; - auto blockRank = block.group_index().z * grid.group_dim().x * grid.group_dim().y - + block.group_index().y * grid.group_dim().x + block.group_index().x; - - // Get pointers to shared memory for image and labels. - auto sharedOffset = 0; - SharedImage sharedImage{static_cast(blockHeight + 4), static_cast(blockWidth + 4), - reinterpret_cast(&sharedBuffer[sharedOffset])}; - sharedOffset += sharedImage.volume() * sizeof(typename SharedImage::value_type) / sizeof(int32_t); - SharedLabel sharedLabels{static_cast(blockHeight), static_cast(blockWidth), - reinterpret_cast(&sharedBuffer[sharedOffset])}; - - // Computing block dimensions in terms of tiles. - auto blocksTileWidth = util::DivUp(width, blockWidth); - auto blocksTileHeight = util::DivUp(height, blockHeight); - auto numSteps = util::DivUp(blocksTileWidth * blocksTileHeight * batchSize, gridBlocks); - - // Thread positions within a block. - PointType threadBlockPos{static_cast(block.thread_rank() % blockWidth), - static_cast(block.thread_rank() / blockWidth)}; - - // Iterate through the steps to cover the entire image. 
- for (auto step = 0; step < numSteps; ++step) - { - // Compute block index. - auto blockIndex = blockRank + step * gridBlocks; - CoordType blockGridPos{blockIndex % blocksTileWidth, (blockIndex / blocksTileWidth) % blocksTileHeight, - blockIndex / (blocksTileWidth * blocksTileHeight)}; - - // Compute local and global positions. - PointType localPos{threadBlockPos.x + 2, threadBlockPos.y + 2}; - PointType globalPos{static_cast(threadBlockPos.x + blockWidth * blockGridPos.x), - static_cast(threadBlockPos.y + blockHeight * blockGridPos.y)}; - IndexType batchIndex = static_cast(blockGridPos.z); - - // Populate shared memory with image data. - copyGlobalToShared<2, PixelType, int32_t>(image, globalPos, localPos, batchIndex, 0, sharedImage); - block.sync(); - - // Assign labels to the edges. - if (batchIndex < batchSize) - { - setLabels(sharedImage, localPos, sharedLabels, globalPos, width, height); - } - block.sync(); - - // Copy labels from shared memory back to global memory. - const auto index = (localPos.y - 2) * sharedLabels.width() + (localPos.x - 2); - if (globalPos.x < width && globalPos.y < height && batchIndex < batchSize) - { - *labels.ptr(batchIndex, globalPos.y, globalPos.x) = sharedLabels[index]; - } - } -} - -__global__ void labelConnectedComponents(LabelType *dLabels, IndexType height, IndexType width, IndexType batchSize, - LabelType *dSegments, LabelType *dConnectedComponents, MaskType *dNeighbors) -{ - // Set up data structures to provide structure and ease of access to labels, segments, - // connected components, and neighbors. - BoundaryLabel labels{batchSize, height, width, 1, dLabels}; - BoundaryLabel segments{batchSize, height, width, 1, dSegments}; - BoundaryLabel connectedComponents{batchSize, height, width, 1, dConnectedComponents}; - Neighborhood neighbors{batchSize, height, width, 1, dNeighbors}; - - // Initialize cooperative groups, which provide synchronization primitives for CUDA threads. 
- auto grid = cg::this_grid(); - auto block = cg::this_thread_block(); - - // Calculate properties for the grid and blocks. - auto gridBlocks = grid.group_dim().x * grid.group_dim().y * grid.group_dim().z; - auto blockHeight = block.group_dim().y; - auto blockWidth = block.group_dim().x; - auto blockRank = block.group_index().z * grid.group_dim().x * grid.group_dim().y - + block.group_index().y * grid.group_dim().x + block.group_index().x; - - // Calculate the width and height of blocks in tiles and the total number of steps required. - auto blocksTileWidth = util::DivUp(width, blockWidth); - auto blocksTileHeight = util::DivUp(height, blockHeight); - auto numSteps = util::DivUp(blocksTileWidth * blocksTileHeight * batchSize, gridBlocks); - - // Determine block dimensions and thread's position within the block. - CoordType blockDims{blockWidth, blockHeight, 1}; - CoordType threadBlockPos{block.thread_rank() % blockWidth, block.thread_rank() / blockWidth, 0}; - - const auto badLabel = height * width; - - // Lambda function to encapsulate the repeated logic. It operates on the thread's position - // and performs the given action if the position is within the image boundaries. - auto performOperationOnThreadPos = [&](auto operation) - { - for (auto step = 0; step < numSteps; ++step) - { - // Calculate the block's position in the grid. - auto blockIndex = blockRank + step * gridBlocks; - CoordType blockGridPos{blockIndex % blocksTileWidth, (blockIndex / blocksTileWidth) % blocksTileHeight, - blockIndex / (blocksTileWidth * blocksTileHeight)}; - - // Calculate the global position of the thread. - auto threadPos = blockGridPos * blockDims + threadBlockPos; - - // Check if the thread position is within the boundaries of the image. - bool inLabels = (threadPos.x < width && threadPos.y < height && threadPos.z < batchSize); - - // If valid, perform the given operation. 
- if (inLabels) - { - operation(threadPos); - } - } - grid.sync(); // Synchronize threads in the grid to ensure they are all done. - }; - - // 1. Extract edge segments of contiguous edges. - performOperationOnThreadPos( - [&](const CoordType &threadPos) - { *segments.ptr(threadPos.z, threadPos.y, threadPos.x) = findRoot(labels, threadPos, badLabel); }); - - // 2. Resolve roots in the segments to connect neighboring components. - performOperationOnThreadPos([&](const CoordType &threadPos) - { resolveRoots(segments, threadPos, badLabel, neighbors); }); - - // 3. Label the connected components. - performOperationOnThreadPos( - [&](const CoordType &threadPos) - { *connectedComponents.ptr(threadPos.z, threadPos.y, threadPos.x) = findRoot(segments, threadPos, badLabel); }); -} - -__global__ void resolveContours(LabelType *dLabels, LabelType *dSegments, LabelType *dConnectedComponents, - MaskType *dNeighbors, IndexType height, IndexType width, IndexType batchSize, - IndexType *dConnectList, CountType *dNodeCount, CountType *contourCount) -{ - // Organize input/output data into structured objects for easier access. - BoundaryLabel labels{batchSize, height, width, 1, dLabels}; - BoundaryLabel segments{batchSize, height, width, 1, dSegments}; - BoundaryLabel connectedComponents{batchSize, height, width, 1, dConnectedComponents}; - Neighborhood neighbors{batchSize, height, width, 1, dNeighbors}; - ConnectList connectList{batchSize, FindContours::MAX_NUM_CONTOURS, FindContours::MAX_CONTOUR_POINTS, 1, - dConnectList}; - CountList nodeCount{batchSize, FindContours::MAX_NUM_CONTOURS, dNodeCount}; - - // Initialize cooperative groups for thread synchronization. - auto grid = cg::this_grid(); - auto block = cg::this_thread_block(); - - // Compute properties of the grid and blocks. 
- auto gridBlocks = grid.group_dim().x * grid.group_dim().y * grid.group_dim().z; - auto blockHeight = block.group_dim().y; - auto blockWidth = block.group_dim().x; - auto blockRank = block.group_index().z * grid.group_dim().x * grid.group_dim().y - + block.group_index().y * grid.group_dim().x + block.group_index().x; - - // Calculate block tile dimensions and total number of iterations needed. - auto blocksTileWidth = util::DivUp(width, blockWidth); - auto blocksTileHeight = util::DivUp(height, blockHeight); - auto numSteps = util::DivUp(blocksTileWidth * blocksTileHeight * batchSize, gridBlocks); - - // Calculate the thread's block dimensions and its position within the block. - CoordType blockDims{blockWidth, blockHeight, 1}; - CoordType threadBlockPos{block.thread_rank() % blockWidth, block.thread_rank() / blockWidth, 0}; - - const auto badLabel = height * width; - - // Traverse and label contours for each step. - for (auto step = 0; step < numSteps; ++step) - { - // Calculate block's position within the grid. - auto blockIndex = blockRank + step * gridBlocks; - CoordType blockGridPos{blockIndex % blocksTileWidth, (blockIndex / blocksTileWidth) % blocksTileHeight, - blockIndex / (blocksTileWidth * blocksTileHeight)}; - - // Calculate the thread's global position. - auto threadPos = blockGridPos * blockDims + threadBlockPos; - - // Check if thread's position is within image boundaries. - bool inLabels = (threadPos.x < width && threadPos.y < height && threadPos.z < batchSize); - - // If within boundaries, traverse and label the contour for the current position. 
- if (inLabels) - { - traverseContour(labels, segments, connectedComponents, neighbors, threadPos, connectList, nodeCount, - contourCount, badLabel); - } - } -} - -__global__ void flattenContours(IndexType *dConnectList, CountType *dNodeCount, CountType *contourCount, - IndexType width, IndexType batchSize, NodeList nodeList, NodeCounts pointCount) -{ - // Structuring the input/output data - ConnectList connectList{batchSize, FindContours::MAX_NUM_CONTOURS, FindContours::MAX_CONTOUR_POINTS, 1, - dConnectList}; - CountList nodeCount{batchSize, FindContours::MAX_NUM_CONTOURS, dNodeCount}; - - // Initialize cooperative groups for thread synchronization. - auto grid = cg::this_grid(); - auto block = cg::this_thread_block(); - auto warp = cg::tiled_partition<32>(block); - - // Compute properties of the grid and blocks. - auto gridBlocks = grid.group_dim().x * grid.group_dim().y * grid.group_dim().z; - auto blockRank = block.group_index().z * grid.group_dim().x * grid.group_dim().y - + block.group_index().y * grid.group_dim().x + block.group_index().x; - - // Calculate block tile dimensions and total number of iterations needed. - auto contourTile = util::DivUp(FindContours::MAX_NUM_CONTOURS, warp.meta_group_size()); - auto neededThreads = warp.size() * FindContours::MAX_NUM_CONTOURS * batchSize; - auto neededBlocks = (neededThreads + block.size() - 1) / block.size(); - auto numStepsBatchSize = ((batchSize * contourTile - blockRank) + gridBlocks - 1) / gridBlocks; - auto numSteps = max((neededBlocks + gridBlocks - 1) / gridBlocks, numStepsBatchSize); - - // Calculate the thread's block dimensions and its position within the block. - CoordType blockDims{warp.size(), warp.meta_group_size(), 1}; - CoordType threadBlockPos{warp.thread_rank(), warp.meta_group_rank(), 0}; - - // Traverse and label contours for each step. - for (auto step = 0; step < numSteps; ++step) - { - // Calculate block's position within the grid. 
- auto blockIndex = blockRank + step * gridBlocks; - CoordType blockGridPos{0, blockIndex % contourTile, blockIndex / contourTile}; - - // Calculate the thread's global position. - auto pos = blockGridPos * blockDims + threadBlockPos; - auto contourIndex = pos.y; - - // Make sure we're within the boundaries of our contour count - if (pos.z < batchSize && contourIndex < contourCount[pos.z]) - { - auto indexOffset = 0; - for (auto i = 0; i < contourIndex; ++i) - { - indexOffset += *nodeCount.ptr(pos.z, i); - } - - if ((indexOffset + *nodeCount.ptr(pos.z, contourIndex)) > FindContours::MAX_TOTAL_POINTS) - { - return; - } - - for (auto i = pos.x; i < *nodeCount.ptr(pos.z, contourIndex); i += blockDims.x) - { - auto point = *connectList.ptr(pos.z, contourIndex, i); - PointType node{mod(point, width), point / width}; - *nodeList.ptr(static_cast(pos.z), static_cast(indexOffset + i), 0) = node.x; - *nodeList.ptr(static_cast(pos.z), static_cast(indexOffset + i), 1) = node.y; - } - if (pos.x == 0) - { - *pointCount.ptr(static_cast(pos.z), static_cast(contourIndex)) - = *nodeCount.ptr(pos.z, contourIndex); - } - } - } -} - -namespace detail { -template -void forwardArgs(Lambda &&f, Args &&...args) -{ - // Create a lambda to capture each forwarded arg, then use pack expansion - // to expand and call the lambda for each arg. - auto forwarder = [&f](auto &&...a) - { - (f(&a), ...); - }; - forwarder(std::forward(args)...); -} - -template -inline void cooperativeLaunch(const KernelFunction &func, cudaStream_t stream, dim3 grid, dim3 block, size_t sharedMem, - KernelParameters... 
params) -{ - void *args[sizeof...(params)]; - int argIndex = 0; - - // Capture args by address into the args array - forwardArgs([&](auto p) { args[argIndex++] = p; }, params...); - - cudaLaunchCooperativeKernel(&func, grid, block, args, sharedMem, stream); -} -} // namespace detail - -template -__host__ void findContours_impl(DeviceImage &dImage, LabelType *dLabels, LabelType *dSegments, - LabelType *dConnectedComponents, MaskType *dNeighbors, IndexType *dConnectList, - CountType *dNodeCount, CountType *dContourCount, NodeList &dNodeList, - NodeCounts &dPointCount, IndexType height, IndexType width, IndexType batchSize, - cudaStream_t stream) -{ - // Determine shared memory size needed for labelEdges kernel, considering halo cells and storage. - auto labelEdgesSharedMem = [&](int blockSize) - { - int dimX = 32; - int dimY = static_cast((blockSize + dimX - 1) / dimX); - return (dimX + 4) * (dimY + 4) * sizeof(typename SharedImage::value_type) - + dimX * dimY * sizeof(typename SharedLabel::value_type); - }; - - // Parameters for kernel launches - dim3 block(1, 1, 1); - dim3 grid(1, 1, 1); - int maxGridSize = 1; - int maxBlockSize = 32; - - // 1. Labeling Image Edges: - // Query for optimal block size for the labelEdges kernel. - checkCudaErrors(cudaOccupancyMaxPotentialBlockSizeVariableSMem(&maxGridSize, &maxBlockSize, labelEdges, - labelEdgesSharedMem, 1024)); - block = dim3(32, (maxBlockSize + 31) / 32); - auto blocksTileWidth = util::DivUp(width, block.x); - auto blocksTileHeight = util::DivUp(height, block.y); - grid.x = std::min(blocksTileWidth * blocksTileHeight * batchSize, maxGridSize); - detail::cooperativeLaunch(labelEdges, stream, grid, block, labelEdgesSharedMem(block.x * block.y), - dImage, height, width, batchSize, dLabels); - - // 2. 
Labeling Connected Components: - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&maxGridSize, &maxBlockSize, labelConnectedComponents, 0, 1024)); - block = dim3(32, (maxBlockSize + 31) / 32); - blocksTileWidth = util::DivUp(width, block.x); - blocksTileHeight = util::DivUp(height, block.y); - grid.x = std::min(blocksTileWidth * blocksTileHeight * batchSize, maxGridSize); - detail::cooperativeLaunch(labelConnectedComponents, stream, grid, block, 0, dLabels, height, width, batchSize, - dSegments, dConnectedComponents, dNeighbors); - - // 3. Resolving Contours: - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&maxGridSize, &maxBlockSize, resolveContours, 0, 1024)); - block = dim3(32, (maxBlockSize + 31) / 32); - blocksTileWidth = util::DivUp(width, block.x); - blocksTileHeight = util::DivUp(height, block.y); - grid.x = std::min(blocksTileWidth * blocksTileHeight * batchSize, maxGridSize); - detail::cooperativeLaunch(resolveContours, stream, grid, block, 0, dLabels, dSegments, dConnectedComponents, - dNeighbors, height, width, batchSize, dConnectList, dNodeCount, dContourCount); - - // 4. 
Flattening Contours: - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&maxGridSize, &maxBlockSize, flattenContours, 0, 1024)); - auto idealThreads = 32 * FindContours::MAX_NUM_CONTOURS * batchSize; - auto bestBlockCount = util::DivUp(idealThreads, maxBlockSize); - block = dim3(32, (maxBlockSize + 31) / 32, 1); - grid.x = 1; - grid.y = std::min(bestBlockCount, maxGridSize); - grid.z = 1; - detail::cooperativeLaunch(flattenContours, stream, grid, block, 0, dConnectList, dNodeCount, dContourCount, width, - batchSize, dNodeList, dPointCount); -} - -// ============================================================================= -// FindContours Class Definition -// ============================================================================= - -FindContours::FindContours(DataShape max_input_shape, DataShape max_output_shape) - : CudaBaseOp(max_input_shape, max_output_shape) -{ - // Calculating the size of the workspace buffers - auto gpuBufferSize = this->calBufferSize(max_input_shape, max_output_shape, kCV_8U); - - // Allocating GPU memory - NVCV_CHECK_LOG(cudaMalloc(&gpu_workspace, gpuBufferSize)); -} - -FindContours::~FindContours() -{ - NVCV_CHECK_LOG(cudaFree(gpu_workspace)); -} - -size_t FindContours::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - // Number of images in the batch times... 
- return max_input_shape.N - * ( - // Size of labels buffer - max_input_shape.H * max_input_shape.W * sizeof(LabelType) + - - // Size of segments buffer - max_input_shape.H * max_input_shape.W * sizeof(LabelType) + - - // Size of connected components buffer - max_input_shape.H * max_input_shape.W * sizeof(LabelType) + - - // Size of neighborhood flag buffer - max_input_shape.H * max_input_shape.W * sizeof(MaskType) + - - // Size of maximum contours heads found - FindContours::MAX_TOTAL_POINTS * sizeof(IndexType) + - - FindContours::MAX_NUM_CONTOURS * sizeof(CountType) + - - // Size of contour counter - sizeof(CountType) - - // done... - ); -} - -ErrorCode FindContours::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &pointCoords, - const TensorDataStridedCuda &numPoints, cudaStream_t stream) -{ - // Testing inData for valid structure - auto format = GetLegacyDataFormat(inData.layout()); - if (format != kNHWC && format != kHWC) - { - LOG_ERROR("Invalid DataFormat for input image: " << format); - return ErrorCode::INVALID_DATA_FORMAT; - } - - auto data_type = GetLegacyDataType(inData.dtype()); - if (!(data_type == kCV_8U /*|| data_type == kCV_16U || data_type == kCV_16S || data_type == kCV_32F */)) - { - LOG_ERROR("Invalid DataType for input image: " << data_type); - return ErrorCode::INVALID_DATA_TYPE; - } - - // Creating a access overlay for the input data - auto inAccess = TensorDataAccessStridedImage::Create(inData); - NVCV_ASSERT(inAccess); - auto pointsAccess = TensorDataAccessStrided::Create(pointCoords); - NVCV_ASSERT(pointsAccess); - auto countAccess = TensorDataAccessStrided::Create(numPoints); - NVCV_ASSERT(countAccess); - - // Extracting input shape information - auto input_shape = GetLegacyDataShape(inAccess->infoShape()); - auto points_shape = pointsAccess->infoShape(); - auto counts_shape = countAccess->infoShape(); - - const auto nImage = input_shape.N; - const auto width = input_shape.W; - const auto height = input_shape.H; - 
const auto channels = input_shape.C; - - if (channels != 1) - { - LOG_ERROR("Invalid channel number " << channels); - return ErrorCode::INVALID_DATA_SHAPE; - } - - if (nImage != points_shape.shape()[0] || nImage != counts_shape.shape()[0]) - { - LOG_ERROR("Invalid INVALID_PARAMETER: batch size must be equal for all parameters"); - return ErrorCode::INVALID_PARAMETER; - } - if (points_shape.shape()[1] > FindContours::MAX_TOTAL_POINTS) - { - LOG_ERROR("Invalid INVALID_PARAMETER: points cannot be larger than the max total number of points"); - return ErrorCode::INVALID_PARAMETER; - } - if (points_shape.shape()[2] != 2) - { - LOG_ERROR("Invalid INVALID_PARAMETER: points shape can only hold xy coordinates"); - return ErrorCode::INVALID_PARAMETER; - } - if (counts_shape.shape()[1] > FindContours::MAX_NUM_CONTOURS) - { - LOG_ERROR("Invalid INVALID_PARAMETER: points cannot be larger than the max number of contours"); - return ErrorCode::INVALID_PARAMETER; - } - - DeviceImage dImage{ - reinterpret_cast(inAccess->sampleData(0)), - {static_cast(inAccess->sampleStride()), static_cast(inAccess->rowStride()), - static_cast(inAccess->colStride()) }, - { static_cast(nImage), static_cast(height), static_cast(width)} - }; - NodeList dNodeList{pointCoords}; - NodeCounts dPointCount{numPoints}; - - // Creating some temporaries - char *bufferBoundaryStart = (char *)gpu_workspace; - - // Initialize buffer for the GPU image. - - // Initialize buffer for storing neighborhood indices. - LabelType *dLabels = reinterpret_cast(bufferBoundaryStart); - bufferBoundaryStart += sizeof(LabelType) * input_shape.N * input_shape.H * input_shape.W; - - // Initialize buffer for storing segment boundaries. - LabelType *dSegments = reinterpret_cast(bufferBoundaryStart); - bufferBoundaryStart += sizeof(LabelType) * input_shape.N * input_shape.H * input_shape.W; - - // Initialize buffer for storing connected component data. 
- LabelType *dConnectedComponents = reinterpret_cast(bufferBoundaryStart); - bufferBoundaryStart += sizeof(LabelType) * input_shape.N * input_shape.H * input_shape.W; - - // Initialize buffer for storing neighbor mask data. - MaskType *dNeighbors = reinterpret_cast(bufferBoundaryStart); - bufferBoundaryStart += sizeof(MaskType) * input_shape.N * input_shape.H * input_shape.W; - - // Initialize buffer to keep track of contours. - IndexType *dConnectList = reinterpret_cast(bufferBoundaryStart); - bufferBoundaryStart += sizeof(IndexType) * input_shape.N * FindContours::MAX_TOTAL_POINTS; - - // Initialize buffer to keep track of contours. - CountType *dNodeCount = reinterpret_cast(bufferBoundaryStart); - bufferBoundaryStart += sizeof(CountType) * input_shape.N * FindContours::MAX_NUM_CONTOURS; - - // Initialize buffer for counting contours. - CountType *dContourCount = reinterpret_cast(bufferBoundaryStart); - bufferBoundaryStart += sizeof(CountType) * input_shape.N; - - // Clear GPU buffers to prepare for computation. - checkCudaErrors(cudaMemsetAsync(reinterpret_cast(dLabels), height * width, - nImage * sizeof(LabelType) * input_shape.H * input_shape.W, stream)); - checkCudaErrors(cudaMemsetAsync(reinterpret_cast(dSegments), height * width, - nImage * sizeof(LabelType) * input_shape.H * input_shape.W, stream)); - checkCudaErrors(cudaMemsetAsync(reinterpret_cast(dConnectedComponents), height * width, - nImage * sizeof(LabelType) * input_shape.H * input_shape.W, stream)); - checkCudaErrors(cudaMemsetAsync(reinterpret_cast(dNeighbors), 0, - nImage * sizeof(MaskType) * input_shape.H * input_shape.W, stream)); - checkCudaErrors(cudaMemsetAsync(reinterpret_cast(dContourCount), 0, nImage * sizeof(CountType), stream)); - - // get boundaries of the binary image, which is called contour. 
- findContours_impl(dImage, dLabels, dSegments, dConnectedComponents, dNeighbors, dConnectList, dNodeCount, - dContourCount, dNodeList, dPointCount, height, width, nImage, stream); - - return ErrorCode::SUCCESS; -} - -} // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/min_area_rect.cu b/src/cvcuda/priv/legacy/min_area_rect.cu index cdf46324..4554e2b5 100644 --- a/src/cvcuda/priv/legacy/min_area_rect.cu +++ b/src/cvcuda/priv/legacy/min_area_rect.cu @@ -273,11 +273,6 @@ MinAreaRect::~MinAreaRect() NVCV_CHECK_LOG(cudaFree(mRotatedPointsDev)); } -size_t MinAreaRect::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, int maxContourNum) -{ - return maxContourNum * (_MAX_ROTATE_DEGREES + 1) * _MIN_AREA_EACH_ANGLE_STRID * sizeof(int); -} - ErrorCode MinAreaRect::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const TensorDataStridedCuda &numPointsInContour, const int totalContours, cudaStream_t stream) diff --git a/src/cvcuda/priv/legacy/osd.cu b/src/cvcuda/priv/legacy/osd.cu index f979e688..81c0fcc0 100644 --- a/src/cvcuda/priv/legacy/osd.cu +++ b/src/cvcuda/priv/legacy/osd.cu @@ -41,7 +41,7 @@ using namespace cvcuda::priv; namespace nvcv::legacy::cuda_op { template -static __host__ __device__ unsigned char u8cast(_T value) +static __device__ unsigned char u8cast(_T value) { return value < 0 ? 0 : (value > 255 ? 
255 : value); } diff --git a/src/cvcuda/priv/legacy/random_resized_crop.cu b/src/cvcuda/priv/legacy/random_resized_crop.cu index 7f095962..4d3240b1 100644 --- a/src/cvcuda/priv/legacy/random_resized_crop.cu +++ b/src/cvcuda/priv/legacy/random_resized_crop.cu @@ -87,51 +87,6 @@ inline const __device__ ValueType *_cacheAlignedBufferedRead(SrcWrapper srcImage } } //_cacheAlignedBufferedRead -template -__global__ void resize_linear_v2(const SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, const int *top_, - const int *left_, const float *scale_x_, const float *scale_y_) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - int height = srcSize.y, width = srcSize.x, out_height = dstSize.y, out_width = dstSize.x; - if (dst_x >= out_width || dst_y >= out_height) - return; - - const float scale_x = scale_x_[batch_idx]; - const float scale_y = scale_y_[batch_idx]; - const int top = top_[batch_idx]; - const int left = left_[batch_idx]; - - const float src_x = dst_x * scale_x + left; - const float src_y = dst_y * scale_y + top; - - using work_type = cuda::ConvertBaseTypeTo; - work_type out = cuda::SetAll(0); - - const int x1 = __float2int_rd(src_x); - const int y1 = __float2int_rd(src_y); - const int x2 = x1 + 1; - const int y2 = y1 + 1; - const int x2_read = min(x2, width - 1); - const int y2_read = min(y2, height - 1); - - typename SrcWrapper::ValueType src_reg; - src_reg = *src.ptr(batch_idx, y1, x1); - out = out + src_reg * ((x2 - src_x) * (y2 - src_y)); - - src_reg = *src.ptr(batch_idx, y1, x2_read); - out = out + src_reg * ((src_x - x1) * (y2 - src_y)); - - src_reg = *src.ptr(batch_idx, y2_read, x1); - out = out + src_reg * ((x2 - src_x) * (src_y - y1)); - - src_reg = *src.ptr(batch_idx, y2_read, x2_read); - out = out + src_reg * ((src_x - x1) * (src_y - y1)); - - *dst.ptr(batch_idx, dst_y, dst_x) = cuda::SaturateCast(out); -} - template __global__ void 
resize_linear_v1(const SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, const int *top_, const int *left_, const float *scale_x_, const float *scale_y_) @@ -175,32 +130,6 @@ __global__ void resize_linear_v1(const SrcWrapper src, DstWrapper dst, int2 srcS } } -template -__global__ void resize_nearest_v2(const SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, const int *top_, - const int *left_, const float *scale_x_, const float *scale_y_) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - - int out_height = dstSize.y, out_width = dstSize.x; - if (dst_x >= out_width || dst_y >= out_height) - return; - - const float scale_x = scale_x_[batch_idx]; - const float scale_y = scale_y_[batch_idx]; - const int top = top_[batch_idx]; - const int left = left_[batch_idx]; - - const float src_x = dst_x * scale_x + left; - const float src_y = dst_y * scale_y + top; - - const int x1 = __float2int_rz(src_x); - const int y1 = __float2int_rz(src_y); - - *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, y1, x1); -} - template __global__ void resize_nearest_v1(const SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, const int *top_, const int *left_, const float *scale_x_, const float *scale_y_) @@ -223,29 +152,6 @@ __global__ void resize_nearest_v1(const SrcWrapper src, DstWrapper dst, int2 src } } -template -__global__ void resize_cubic_v2(const SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, const int *top_, - const int *left_, const float *scale_x_, const float *scale_y_) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - int out_height = dstSize.y, out_width = dstSize.x; - if (dst_x >= out_width || dst_y >= out_height) - return; - - const float scale_x = scale_x_[batch_idx]; - const float scale_y = scale_y_[batch_idx]; - const int top 
= top_[batch_idx]; - const int left = left_[batch_idx]; - - const float src_x = dst_x * scale_x + left; - const float src_y = dst_y * scale_y + top; - const float3 srcCoord{src_x, src_y, static_cast(batch_idx)}; - - *dst.ptr(batch_idx, dst_y, dst_x) = src[srcCoord]; -} - template __global__ void resize_cubic_v1(const SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, const int *top_, const int *left_, const float *scale_x_, const float *scale_y_) @@ -336,7 +242,6 @@ void resize(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &ou auto src = cuda::CreateTensorWrapNHW(inData); auto dst = cuda::CreateTensorWrapNHW(outData); - // v2 is original impl, v1 is aligned with new resize op if (interpolation == NVCV_INTERP_LINEAR) { resize_linear_v1<<>>(src, dst, srcSize, dstSize, top, left, scale_x, scale_y); @@ -349,9 +254,6 @@ void resize(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &ou } else { - // this commented code is for v2 - // auto src = cuda::CreateInterpolationWrapNHW(inData); - resize_cubic_v1<<>>(src, dst, srcSize, dstSize, top, left, scale_x, scale_y); checkKernelErrors(); } diff --git a/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu b/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu index 279b2c87..f759a1a0 100644 --- a/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu +++ b/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu @@ -87,52 +87,6 @@ inline __device__ T *_cacheAlignedBufferedReadVS(cuda::ImageBatchVarShapeWrap -__global__ void resize_linear_v2(const SrcWrapper src, DstWrapper dst, const int *top_, const int *left_, - const float *scale_x_, const float *scale_y_) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - int height = src.height(batch_idx), width = src.width(batch_idx); - int out_height = dst.height(batch_idx), out_width = dst.width(batch_idx); - if (dst_x >= out_width || 
dst_y >= out_height) - return; - - const float scale_x = scale_x_[batch_idx]; - const float scale_y = scale_y_[batch_idx]; - const int top = top_[batch_idx]; - const int left = left_[batch_idx]; - - const float src_x = dst_x * scale_x + left; - const float src_y = dst_y * scale_y + top; - - using work_type = cuda::ConvertBaseTypeTo; - work_type out = cuda::SetAll(0); - - const int x1 = __float2int_rd(src_x); - const int y1 = __float2int_rd(src_y); - const int x2 = x1 + 1; - const int y2 = y1 + 1; - const int x2_read = min(x2, width - 1); - const int y2_read = min(y2, height - 1); - - typename SrcWrapper::ValueType src_reg; - src_reg = *src.ptr(batch_idx, y1, x1); - out = out + src_reg * ((x2 - src_x) * (y2 - src_y)); - - src_reg = *src.ptr(batch_idx, y1, x2_read); - out = out + src_reg * ((src_x - x1) * (y2 - src_y)); - - src_reg = *src.ptr(batch_idx, y2_read, x1); - out = out + src_reg * ((x2 - src_x) * (src_y - y1)); - - src_reg = *src.ptr(batch_idx, y2_read, x2_read); - out = out + src_reg * ((src_x - x1) * (src_y - y1)); - - *dst.ptr(batch_idx, dst_y, dst_x) = cuda::SaturateCast(out); -} - template __global__ void resize_linear_v1(const SrcWrapper src, DstWrapper dst, const int *top_, const int *left_, const float *scale_x_, const float *scale_y_) @@ -180,32 +134,6 @@ __global__ void resize_linear_v1(const SrcWrapper src, DstWrapper dst, const int } } -template -__global__ void resize_nearest_v2(const SrcWrapper src, DstWrapper dst, const int *top_, const int *left_, - const float *scale_x_, const float *scale_y_) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - - int out_height = dst.height(batch_idx), out_width = dst.width(batch_idx); - if (dst_x >= out_width || dst_y >= out_height) - return; - - const float scale_x = scale_x_[batch_idx]; - const float scale_y = scale_y_[batch_idx]; - const int top = top_[batch_idx]; - const int left = left_[batch_idx]; - - 
const float src_x = dst_x * scale_x + left; - const float src_y = dst_y * scale_y + top; - - const int x1 = __float2int_rz(src_x); - const int y1 = __float2int_rz(src_y); - - *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, y1, x1); -} - template __global__ void resize_nearest_v1(const SrcWrapper src, DstWrapper dst, const int *top_, const int *left_, const float *scale_x_, const float *scale_y_) @@ -233,29 +161,6 @@ __global__ void resize_nearest_v1(const SrcWrapper src, DstWrapper dst, const in } } -template -__global__ void resize_cubic_v2(const SrcWrapper src, DstWrapper dst, const int *top_, const int *left_, - const float *scale_x_, const float *scale_y_) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - int out_height = dst.height(batch_idx), out_width = dst.width(batch_idx); - if (dst_x >= out_width || dst_y >= out_height) - return; - - const float scale_x = scale_x_[batch_idx]; - const float scale_y = scale_y_[batch_idx]; - const int top = top_[batch_idx]; - const int left = left_[batch_idx]; - - const float src_x = dst_x * scale_x + left; - const float src_y = dst_y * scale_y + top; - const float3 srcCoord{src_x, src_y, static_cast(batch_idx)}; - - *dst.ptr(batch_idx, dst_y, dst_x) = src[srcCoord]; -} - template __global__ void resize_cubic_v1(const SrcWrapper src, DstWrapper dst, const int *top_, const int *left_, const float *scale_x_, const float *scale_y_) @@ -355,9 +260,6 @@ void resize(const ImageBatchVarShapeDataStridedCuda &in, const ImageBatchVarShap } else { - // for v2, not used - // cuda::InterpolationVarShapeWrap src(in); - resize_cubic_v1<<>>(src, dst, top, left, scale_x, scale_y); checkKernelErrors(); } diff --git a/src/cvcuda/priv/legacy/resize.cu b/src/cvcuda/priv/legacy/resize.cu index 51721843..4c099618 100644 --- a/src/cvcuda/priv/legacy/resize.cu +++ b/src/cvcuda/priv/legacy/resize.cu @@ -82,18 +82,6 @@ inline const __device__ 
ValueType *_cacheAlignedBufferedRead(SrcWrapper srcImage } } //_cacheAlignedBufferedRead -template -inline void __device__ _alignedCudaMemcpyQuad(T *pDst, T *pSrc) -{ - //copy 4 T's, assuming 32-bit alignment for both pSrc and pDst - uint *uPtrSrc = (uint *)pSrc; - uint *uPtrDst = (uint *)pDst; - -#pragma unroll - for (int i = 0; i < sizeof(T); ++i) uPtrDst[i] = uPtrSrc[i]; - -} //_alignedCudaMemcpyQuad - //******************** NN = Nearest Neighbor template @@ -113,53 +101,6 @@ __global__ void resize_NN(SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dst } } //resize_NN -template -__global__ void resize_NN_quad_alignread(SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, - const float scale_x, const float scale_y) -{ - const float MAX_BUFFERED_X_SCALE = 4.0f; //probably more efficient all the way up to 4.0 - - const int dst_x = (blockIdx.x * blockDim.x + threadIdx.x) * 4; //quad - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - int out_height = dstSize.y, out_width = dstSize.x; - - //0 - quad-aligned so if one pixel is out, they're all out - if ((dst_x >= out_width) | (dst_y >= out_height)) - return; - - const int sx0 = cuda::min(cuda::round(dst_x * scale_x), srcSize.x - 1); - const int sx1 = cuda::min(cuda::round(dst_x * scale_x + scale_x), srcSize.x - 1); - const int sx2 = cuda::min(cuda::round((dst_x + 2) * scale_x), srcSize.x - 1); - const int sx3 = cuda::min(cuda::round((dst_x + 3) * scale_x), srcSize.x - 1); - const int sy = cuda::min(cuda::round(dst_y * scale_y), srcSize.y - 1); - - //1 - optimized case if scale_x < some finite limit - if ((scale_x <= MAX_BUFFERED_X_SCALE)) //local buffering is more efficient - { - uint readBuffer[MAX_BUFFER_WORDS]; - - //2 - copy out source data, 32-bit aligned aligned - const T *aPtr = _cacheAlignedBufferedRead(src, srcSize.x, &readBuffer[0], - MAX_BUFFER_WORDS, batch_idx, sy, sx0, sx3); - - //3 - NN sampling - T gather[4] = {aPtr[0], aPtr[sx1 - sx0], 
aPtr[sx2 - sx0], aPtr[sx3 - sx0]}; - - //4 - aligned write back out - _alignedCudaMemcpyQuad(dst.ptr(batch_idx, dst_y, dst_x), gather); - } - else //6 - standard sampling, no optimization - { - //sample all 4 points - - const T *aPtr = src.ptr(batch_idx, sy, sx0); - T gather[4] = {aPtr[0], aPtr[sx1 - sx0], aPtr[sx2 - sx0], aPtr[sx3 - sx0]}; - - _alignedCudaMemcpyQuad(dst.ptr(batch_idx, dst_y, dst_x), gather); - } -} //resize_NN_quad_alignread - //******************** Bilinear template @@ -200,112 +141,6 @@ __global__ void resize_bilinear(SrcWrapper src, DstWrapper dst, int2 srcSize, in } } //resize_bilinear -template -__global__ void resize_bilinear_quad_alignread(SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, - const float scale_x, const float scale_y) -{ - const float MAX_BUFFERED_X_SCALE = 4.0f; //probably more efficient all the way up to 4.0 - - const int dst_x = (blockIdx.x * blockDim.x + threadIdx.x) * 4; //quad - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - int height = srcSize.y, width = srcSize.x, out_height = dstSize.y, out_width = dstSize.x; - - //0 - quad-aligned so if one pixel is out, they're all out - if ((dst_x >= out_width) | (dst_y >= out_height)) - return; - - //float space for weighted addition - using work_type = cuda::ConvertBaseTypeTo; - - //y coordinate math is the same for all points - float fy = (float)((dst_y + 0.5f) * scale_y - 0.5f); - int sy = cuda::round(fy); - fy -= sy; - sy = cuda::max(0, cuda::min(sy, height - 2)); - - //sx0 - float fx0 = (float)((dst_x + 0.5f) * scale_x - 0.5f); - int sx0 = cuda::round(fx0); - fx0 -= sx0; - fx0 *= ((sx0 >= 0) && (sx0 < width - 1)); - sx0 = cuda::max(0, cuda::min(sx0, width - 2)); - - //sx1 - float fx1 = (float)((dst_x + 1.5) * scale_x - 0.5f); - int sx1 = cuda::round(fx1); - fx1 -= sx1; - fx1 *= ((sx1 >= 0) && (sx1 < width - 1)); - sx1 = cuda::max(0, cuda::min(sx1, width - 2)); - - //sx2 - float fx2 = (float)((dst_x + 2.5f) * 
scale_x - 0.5f); - int sx2 = cuda::round(fx2); - fx2 -= sx2; - fx2 *= ((sx2 >= 0) && (sx2 < width - 1)); - sx2 = cuda::max(0, cuda::min(sx2, width - 2)); - - //sx3 - float fx3 = (float)((dst_x + 3.5f) * scale_x - 0.5f); - int sx3 = cuda::round(fx3); - fx3 -= sx3; - fx3 *= ((sx3 >= 0) && (sx3 < width - 1)); - sx3 = cuda::max(0, cuda::min(sx3, width - 2)); - - uint readBuffer[MAX_BUFFER_WORDS]; - - T result[4]; - - //1 - optimized case if scale_x < some finite limit - if (scale_x <= MAX_BUFFERED_X_SCALE) //local buffering is more efficient - { - work_type accum[4]; - - //2 - aligned load a-row and add partial product - const T *aPtr = _cacheAlignedBufferedRead(src, srcSize.x, readBuffer, MAX_BUFFER_WORDS, - batch_idx, sy, sx0, sx3 + 1); - //const T * aPtr = src.ptr(batch_idx, sy, sx0); //start of upper row - - accum[0] = (1.0f - fy) * (aPtr[sx0 - sx0] * (1.0f - fx0) + aPtr[sx0 - sx0 + 1] * fx0); - accum[1] = (1.0f - fy) * (aPtr[sx1 - sx0] * (1.0f - fx1) + aPtr[sx1 - sx0 + 1] * fx1); - accum[2] = (1.0f - fy) * (aPtr[sx2 - sx0] * (1.0f - fx2) + aPtr[sx2 - sx0 + 1] * fx2); - accum[3] = (1.0f - fy) * (aPtr[sx3 - sx0] * (1.0f - fx3) + aPtr[sx3 - sx0 + 1] * fx3); - - //3 - aligned load b-row and add remaining partial product - const T *bPtr = _cacheAlignedBufferedRead(src, srcSize.x, readBuffer, MAX_BUFFER_WORDS, - batch_idx, sy + 1, sx0, sx3 + 1); - //const T * bPtr = src.ptr(batch_idx, sy+1, sx0); //start of lower row - - //$$$ only need to cast, not saturatecast - result[0] = cuda::SaturateCast(accum[0] + fy * (bPtr[sx0 - sx0] * (1.0f - fx0) + bPtr[sx0 - sx0 + 1] * fx0)); - result[1] = cuda::SaturateCast(accum[1] + fy * (bPtr[sx1 - sx0] * (1.0f - fx1) + bPtr[sx1 - sx0 + 1] * fx1)); - result[2] = cuda::SaturateCast(accum[2] + fy * (bPtr[sx2 - sx0] * (1.0f - fx2) + bPtr[sx2 - sx0 + 1] * fx2)); - result[3] = cuda::SaturateCast(accum[3] + fy * (bPtr[sx3 - sx0] * (1.0f - fx3) + bPtr[sx3 - sx0 + 1] * fx3)); - } - else //unbuffered - { - //row pointers - const T *aPtr = 
src.ptr(batch_idx, sy, 0); //start of upper row - const T *bPtr = src.ptr(batch_idx, sy + 1, 0); //start of lower row - - //$$$ only need to cast, not saturatecast - result[0] = cuda::SaturateCast(aPtr[sx0] * (1.0f - fx0) * (1.0f - fy) + bPtr[sx0] * (1.0f - fx0) * fy - + aPtr[sx0 + 1] * fx0 * (1.0f - fy) + bPtr[sx0 + 1] * fx0 * fy); - - result[1] = cuda::SaturateCast(aPtr[sx1] * (1.0f - fx1) * (1.0f - fy) + bPtr[sx1] * (1.0f - fx1) * fy - + aPtr[sx1 + 1] * fx1 * (1.0f - fy) + bPtr[sx1 + 1] * fx1 * fy); - - result[2] = cuda::SaturateCast(aPtr[sx2] * (1.0f - fx2) * (1.0f - fy) + bPtr[sx2] * (1.0f - fx2) * fy - + aPtr[sx2 + 1] * fx2 * (1.0f - fy) + bPtr[sx2 + 1] * fx2 * fy); - - result[3] = cuda::SaturateCast(aPtr[sx3] * (1.0f - fx3) * (1.0f - fy) + bPtr[sx3] * (1.0f - fx3) * fy - + aPtr[sx3 + 1] * fx3 * (1.0f - fy) + bPtr[sx3 + 1] * fx3 * fy); - } - - //aligned write 4 pixels - _alignedCudaMemcpyQuad(dst.ptr(batch_idx, dst_y, dst_x), result); -} //resize_bilinear_quad_alignread - //******************** Bicubic template @@ -372,138 +207,6 @@ __global__ void resize_bicubic(SrcWrapper src, DstWrapper dst, int2 srcSize, int } } //resize_bicubic -template -__global__ void resize_bicubic_quad_alignread(SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, - const float scale_x, const float scale_y) -{ //optimized for aligned read and write, plus buffering - const float MAX_BUFFERED_X_SCALE = 4.0f; //probably more efficient all the way up to 4.0 - - const int dst_x = (blockIdx.x * blockDim.x + threadIdx.x) * 4; //quad - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - int height = srcSize.y, width = srcSize.x, out_height = dstSize.y, out_width = dstSize.x; - - //0 - quad-aligned so if one pixel is out, they're all out - if ((dst_x >= out_width) | (dst_y >= out_height)) - return; - - uint readBuffer[MAX_BUFFER_WORDS]; - T result[4]; - - //float space for weighted addition - using work_type = cuda::ConvertBaseTypeTo; - 
- //y coordinate - float fy = (float)((dst_y + 0.5f) * scale_y - 0.5f); - int sy = cuda::round(fy); - fy -= sy; - sy = cuda::max(1, cuda::min(sy, height - 3)); - - const float A = -0.75f; - - float cY[4]; - cY[0] = ((A * (fy + 1) - 5 * A) * (fy + 1) + 8 * A) * (fy + 1) - 4 * A; - cY[1] = ((A + 2) * fy - (A + 3)) * fy * fy + 1; - cY[2] = ((A + 2) * (1 - fy) - (A + 3)) * (1 - fy) * (1 - fy) + 1; - cY[3] = 1.f - cY[0] - cY[1] - cY[2]; - - //1 - optimized case if scale_x < some finite limit - if (scale_x <= MAX_BUFFERED_X_SCALE) //local buffering - { //buffered read - work_type accum[4]; - float fx[4]; - int sx[4]; - float cX[4][4]; - - //initialize data for each pixel position -#pragma unroll - for (int pix = 0; pix < 4; ++pix) - { - accum[pix] = cuda::SetAll(0); - - //1 - precalc sx's ahead of time to get range from sx0-1..sx3+2 - fx[pix] = (float)((dst_x + pix + 0.5f) * scale_x - 0.5f); - sx[pix] = cuda::round(fx[pix]); - fx[pix] -= sx[pix]; - fx[pix] *= ((sx[pix] >= 1) && (sx[pix] < width - 3)); - sx[pix] = cuda::max(1, cuda::min(sx[pix], width - 3)); - - //2 - precalc cX[][] 2D array - cX[pix][0] - = ((A * (fx[pix] + 1.0f) - 5.0f * A) * (fx[pix] + 1.0f) + 8.0f * A) * (fx[pix] + 1.0f) - 4.0f * A; - cX[pix][1] = ((A + 2.0f) * fx[pix] - (A + 3.0f)) * fx[pix] * fx[pix] + 1.0f; - cX[pix][2] = ((A + 2.0f) * (1.0f - fx[pix]) - (A + 3.0f)) * (1.0f - fx[pix]) * (1.0f - fx[pix]) + 1.0f; - cX[pix][3] = 1.0f - cX[pix][0] - cX[pix][1] - cX[pix][2]; - } - const int rowOffset = sx[0] - 1; - - //contribute each row into 4 pixels -#pragma unroll - for (int row = 0; row < 4; ++row) - { - //1 - load each row from sx[0]-1 to sx[3]+2 inclusive, aligned - const T *aPtr = _cacheAlignedBufferedRead( - src, srcSize.x, readBuffer, MAX_BUFFER_WORDS, batch_idx, sy + row - 1, sx[0] - 1, sx[3] + 2); - -//2 - do each pixel's partial on this row -#pragma unroll - for (int pix = 0; pix < 4; ++pix) - { - accum[pix] - += cY[row] - * (cX[row][0] * aPtr[sx[pix] - rowOffset - 1] + cX[row][1] * 
aPtr[sx[pix] - rowOffset + 0] - + cX[row][2] * aPtr[sx[pix] - rowOffset + 1] + cX[row][3] * aPtr[sx[pix] - rowOffset + 2]); - } - } - - for (int pix = 0; pix < 4; ++pix) -#ifndef LEGACY_BICUBIC_MATH - result[pix] = cuda::SaturateCast(accum[pix]); -#else - result[pix] = cuda::SaturateCast(cuda::abs(accum[pix])); -#endif - } - else - { //partially buffered read 4 pixels at a time across each bicubic: 16 coalesced reads instead of 64 -#pragma unroll - for (int pix = 0; pix < 4; ++pix) - { - work_type accum = cuda::SetAll(0); - - float fx = (float)((dst_x + pix + 0.5f) * scale_x - 0.5f); - int sx = cuda::round(fx); - fx -= sx; - fx *= ((sx >= 1) && (sx < width - 3)); - sx = cuda::max(1, cuda::min(sx, width - 3)); - - float cX[4]; - cX[0] = ((A * (fx + 1.0f) - 5.0f * A) * (fx + 1.0f) + 8.0f * A) * (fx + 1.0f) - 4.0f * A; - cX[1] = ((A + 2.0f) * fx - (A + 3.0f)) * fx * fx + 1.0f; - cX[2] = ((A + 2.0f) * (1.0f - fx) - (A + 3.0f)) * (1.0f - fx) * (1.0f - fx) + 1.0f; - cX[3] = 1.0f - cX[0] - cX[1] - cX[2]; - - for (int row = 0; row < 4; ++row) - { - //1 - load each sub row from sx[pix]-1 to sx[pix]+2 inclusive, aligned - //const T * aPtr = src.ptr(batch_idx, sy + row - 1, sx-1); - const T *aPtr = _cacheAlignedBufferedRead( - src, srcSize.x, readBuffer, MAX_BUFFER_WORDS, batch_idx, sy + row - 1, sx - 1, sx + 2); - - //2 - do a pixel's partial on this row - accum += cY[row] * (cX[0] * aPtr[0] + cX[1] * aPtr[1] + cX[2] * aPtr[2] + cX[3] * aPtr[3]); - } //for row -#ifndef LEGACY_BICUBIC_MATH - result[pix] = cuda::SaturateCast(accum); -#else - result[pix] = cuda::SaturateCast(cuda::abs(accum)); -#endif - } //for pix - } - - //aligned write 4 pixels - _alignedCudaMemcpyQuad(dst.ptr(batch_idx, dst_y, dst_x), result); -} //resize_bicubic_quad_alignread - template __global__ void resize_area_ocv_align(SrcWrapper src, DstWrapper dst, int2 dstSize) { @@ -552,54 +255,21 @@ void resize(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &ou const dim3 blockSize(BLOCK_WIDTH, 
THREADS_PER_BLOCK / BLOCK_WIDTH, 1); const dim3 gridSize(divUp(out_width, blockSize.x), divUp(out_height, blockSize.y), batch_size); - //rationale for quad: aligned gather and aligned output where quad is possible: use different threading - const int out_quad_width = out_width / 4; - const dim3 quadGridSize(divUp(out_quad_width, blockSize.x), divUp(out_height, blockSize.y), batch_size); - - //bool can_quad = ((((size_t)dst_ptr) % sizeof(T)) == 0) && ((out_width % 4) == 0); //is the output buffer quad-pixel aligned? - //bool can_quad = ((out_width % 4) == 0); //is the output buffer quad-pixel aligned? - bool can_quad = false; // turning it off due to a reported regression - //Note: resize is fundamentally a gather memory operation, with a little bit of compute // our goals are to (a) maximize throughput, and (b) minimize occupancy for the same performance switch (interpolation) { case NVCV_INTERP_NEAREST: - - if (can_quad) - { //thread does 4 pixels horizontally for aligned read and write - resize_NN_quad_alignread<<>>(src, dst, srcSize, dstSize, scale_x, - scale_y); - } - else - { //generic single pixel per thread case - resize_NN<<>>(src, dst, srcSize, dstSize, scale_x, scale_y); - } + resize_NN<<>>(src, dst, srcSize, dstSize, scale_x, scale_y); break; case NVCV_INTERP_LINEAR: - if (can_quad) - { //thread does 4 pixels horizontally for aligned read and write - resize_bilinear_quad_alignread<<>>(src, dst, srcSize, dstSize, scale_x, - scale_y); - } - else - { //generic single pixel per thread case - resize_bilinear<<>>(src, dst, srcSize, dstSize, scale_x, scale_y); - } + resize_bilinear<<>>(src, dst, srcSize, dstSize, scale_x, scale_y); break; case NVCV_INTERP_CUBIC: - if (can_quad) - { //thread does 4 pixels horizontally for aligned read and write - resize_bicubic_quad_alignread<<>>(src, dst, srcSize, dstSize, scale_x, - scale_y); - } - else - { //generic single pixel per thread case - resize_bicubic<<>>(src, dst, srcSize, dstSize, scale_x, scale_y); - } + 
resize_bicubic<<>>(src, dst, srcSize, dstSize, scale_x, scale_y); break; case NVCV_INTERP_AREA: diff --git a/src/cvcuda/priv/legacy/resize_var_shape.cu b/src/cvcuda/priv/legacy/resize_var_shape.cu index 6d22bb6d..e4627ab5 100644 --- a/src/cvcuda/priv/legacy/resize_var_shape.cu +++ b/src/cvcuda/priv/legacy/resize_var_shape.cu @@ -86,18 +86,6 @@ inline __device__ T *_cacheAlignedBufferedReadVS(cuda::ImageBatchVarShapeWrap -inline void __device__ _alignedCudaMemcpyQuadVS(T *pDst, T *pSrc) -{ - //copy 4 T's, assuming 32-bit alignment for both pSrc and pDst - uint *uPtrSrc = (uint *)pSrc; - uint *uPtrDst = (uint *)pDst; - -#pragma unroll - for (int i = 0; i < sizeof(T); ++i) uPtrDst[i] = uPtrSrc[i]; - -} //_alignedCudaMemcpyQuadVS - //******************** NN = Nearest Neighbor template @@ -123,73 +111,6 @@ __global__ void resize_NN(cuda::ImageBatchVarShapeWrap src, cuda::Image } } //resize_NN -template -__global__ void resize_NN_quad_combo(cuda::ImageBatchVarShapeWrap src, cuda::ImageBatchVarShapeWrap dst) -{ - const float MAX_BUFFERED_X_SCALE = 4.0f; //probably more efficient all the way up to 4.0 - - const int dst_x = (blockIdx.x * blockDim.x + threadIdx.x) * 4; //quad - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - const int dstWidth = dst.width(batch_idx); - const int dstWidth4 = dstWidth & ~3; - const int dstHeight = dst.height(batch_idx); - - //0 - bail if out-of-range - if ((dst_x >= dstWidth) | (dst_y >= dstHeight)) - return; - - const int width = src.width(batch_idx); - const int height = src.height(batch_idx); - const float scale_x = static_cast(width) / dstWidth; - const float scale_y = static_cast(height) / dstHeight; - const int sy = cuda::min(cuda::round(dst_y * scale_y), height - 1); - - if (dstWidth != dstWidth4) //non-aligned case, up to 4 pixels - { //do up to 4 pixels, unoptimized - const int pixels = cuda::min(dstWidth - dst_x, 4); - for (int i = 0; i < pixels; ++i) - { - const int sxi = 
cuda::min(cuda::round((dst_x + i) * scale_x), width - 1); - - *dst.ptr(batch_idx, dst_y, dst_x + i) = *src.ptr(batch_idx, sy, sxi); - } - } - else //quad-case: memory is aligned, do 4 pixels - { - const int sx0 = cuda::min(cuda::round(dst_x * scale_x), width - 1); - const int sx1 = cuda::min(cuda::round(dst_x * scale_x + scale_x), width - 1); - const int sx2 = cuda::min(cuda::round((dst_x + 2) * scale_x), width - 1); - const int sx3 = cuda::min(cuda::round((dst_x + 3) * scale_x), width - 1); - - //1 - optimized case if scale_x < some finite limit - if ((scale_x <= MAX_BUFFERED_X_SCALE)) //local buffering is more efficient - { - uint readBuffer[MAX_BUFFER_WORDS_VS]; - - //2 - copy out source data, 32-bit aligned aligned - T *aPtr = _cacheAlignedBufferedReadVS( - src, width, &readBuffer[0], MAX_BUFFER_WORDS_VS, batch_idx, sy, sx0, sx3); - - //3 - NN sampling - T gather[4] = {aPtr[0], aPtr[sx1 - sx0], aPtr[sx2 - sx0], aPtr[sx3 - sx0]}; - - //4 - aligned write back out - _alignedCudaMemcpyQuadVS(dst.ptr(batch_idx, dst_y, dst_x), gather); - } - else //6 - standard sampling, no optimization - { - //sample all 4 points - - const T *aPtr = src.ptr(batch_idx, sy, sx0); - - T gather[4] = {aPtr[0], aPtr[sx1 - sx0], aPtr[sx2 - sx0], aPtr[sx3 - sx0]}; - - _alignedCudaMemcpyQuadVS(dst.ptr(batch_idx, dst_y, dst_x), gather); - } - } -} //resize_NN_quad_combo - //******************** Bilinear template @@ -236,147 +157,6 @@ __global__ void resize_bilinear(cuda::ImageBatchVarShapeWrap src, cuda: } } //resize_bilinear -template -__global__ void resize_bilinear_quad_combo(cuda::ImageBatchVarShapeWrap src, - cuda::ImageBatchVarShapeWrap dst) -{ - const float MAX_BUFFERED_X_SCALE = 4.0f; //probably more efficient all the way up to 4.0 - - const int dst_x = (blockIdx.x * blockDim.x + threadIdx.x) * 4; //quad - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - const int dstWidth = dst.width(batch_idx); - const int dstWidth4 = dstWidth & ~3; 
- const int dstHeight = dst.height(batch_idx); - - //0 - if one pixel is out, they're all out - if ((dst_x >= dstWidth) | (dst_y >= dstHeight)) - return; - - const int width = src.width(batch_idx); - const int height = src.height(batch_idx); - - const float scale_x = static_cast(width) / dstWidth; - const float scale_y = static_cast(height) / dstHeight; - - //y coordinate math is the same for all points - float fy = (float)((dst_y + 0.5f) * scale_y - 0.5f); - int sy = cuda::round(fy); - fy -= sy; - sy = cuda::max(0, cuda::min(sy, height - 2)); - - if (dstWidth != dstWidth4) //non-aligned case, up to 4 pixels - { - //row pointers - const T *aPtr = src.ptr(batch_idx, sy, 0); //start of upper row - const T *bPtr = src.ptr(batch_idx, sy + 1, 0); //start of lower row - - const int pixels = cuda::min(dstWidth - dst_x, 4); - for (int i = 0; i < pixels; ++i) - { //compute source data position and weight for [xi] components - float fxi = (float)((dst_x + 0.5f + i) * scale_x - 0.5f); - int sxi = cuda::round(fxi); - fxi -= sxi; - fxi *= ((sxi >= 0) && (sxi < width - 1)); - sxi = cuda::max(0, cuda::min(sxi, width - 2)); - - *dst.ptr(batch_idx, dst_y, dst_x + i) - = cuda::SaturateCast((1.0f - fxi) * (aPtr[sxi] * (1.0f - fy) + bPtr[sxi] * fy) - + fxi * (aPtr[sxi + 1] * (1.0f - fy) + bPtr[sxi + 1] * fy)); - } - } - else //quad-aligned case, 4 pixels - { - //float space for weighted addition - using work_type = cuda::ConvertBaseTypeTo; - - //sx0 - float fx0 = (float)((dst_x + 0.5f) * scale_x - 0.5f); - int sx0 = cuda::round(fx0); - fx0 -= sx0; - fx0 *= ((sx0 >= 0) && (sx0 < width - 1)); - sx0 = cuda::max(0, cuda::min(sx0, width - 2)); - - //sx1 - float fx1 = (float)((dst_x + 1.5) * scale_x - 0.5f); - int sx1 = cuda::round(fx1); - fx1 -= sx1; - fx1 *= ((sx1 >= 0) && (sx1 < width - 1)); - sx1 = cuda::max(0, cuda::min(sx1, width - 2)); - - //sx2 - float fx2 = (float)((dst_x + 2.5f) * scale_x - 0.5f); - int sx2 = cuda::round(fx2); - fx2 -= sx2; - fx2 *= ((sx2 >= 0) && (sx2 < width - 
1)); - sx2 = cuda::max(0, cuda::min(sx2, width - 2)); - - //sx3 - float fx3 = (float)((dst_x + 3.5f) * scale_x - 0.5f); - int sx3 = cuda::round(fx3); - fx3 -= sx3; - fx3 *= ((sx3 >= 0) && (sx3 < width - 1)); - sx3 = cuda::max(0, cuda::min(sx3, width - 2)); - - uint readBuffer[MAX_BUFFER_WORDS_VS]; - - T result[4]; - - //1 - optimized case if scale_x < some finite limit - if (scale_x <= MAX_BUFFERED_X_SCALE) //local buffering is more efficient - { - work_type accum[4]; - - //2 - aligned load a-row and add partial product - T *aPtr = _cacheAlignedBufferedReadVS( - src, width, readBuffer, MAX_BUFFER_WORDS_VS, batch_idx, sy, sx0, sx3 + 1); - //const T * aPtr = src.ptr(batch_idx, sy, sx0); //start of upper row - - accum[0] = (1.0f - fy) * (aPtr[sx0 - sx0] * (1.0f - fx0) + aPtr[sx0 - sx0 + 1] * fx0); - accum[1] = (1.0f - fy) * (aPtr[sx1 - sx0] * (1.0f - fx1) + aPtr[sx1 - sx0 + 1] * fx1); - accum[2] = (1.0f - fy) * (aPtr[sx2 - sx0] * (1.0f - fx2) + aPtr[sx2 - sx0 + 1] * fx2); - accum[3] = (1.0f - fy) * (aPtr[sx3 - sx0] * (1.0f - fx3) + aPtr[sx3 - sx0 + 1] * fx3); - - //3 - aligned load b-row and add remaining partial product - T *bPtr = _cacheAlignedBufferedReadVS( - src, width, readBuffer, MAX_BUFFER_WORDS_VS, batch_idx, sy + 1, sx0, sx3 + 1); - //const T * bPtr = src.ptr(batch_idx, sy+1, sx0); //start of lower row - - //$$$ only need to cast, not saturatecast - result[0] - = cuda::SaturateCast(accum[0] + fy * (bPtr[sx0 - sx0] * (1.0f - fx0) + bPtr[sx0 - sx0 + 1] * fx0)); - result[1] - = cuda::SaturateCast(accum[1] + fy * (bPtr[sx1 - sx0] * (1.0f - fx1) + bPtr[sx1 - sx0 + 1] * fx1)); - result[2] - = cuda::SaturateCast(accum[2] + fy * (bPtr[sx2 - sx0] * (1.0f - fx2) + bPtr[sx2 - sx0 + 1] * fx2)); - result[3] - = cuda::SaturateCast(accum[3] + fy * (bPtr[sx3 - sx0] * (1.0f - fx3) + bPtr[sx3 - sx0 + 1] * fx3)); - } - else //unbuffered - { - //row pointers - const T *aPtr = src.ptr(batch_idx, sy, 0); //start of upper row - const T *bPtr = src.ptr(batch_idx, sy + 1, 0); 
//start of lower row - - //$$$ only need to cast, not saturatecast - result[0] = cuda::SaturateCast(aPtr[sx0] * (1.0f - fx0) * (1.0f - fy) + bPtr[sx0] * (1.0f - fx0) * fy - + aPtr[sx0 + 1] * fx0 * (1.0f - fy) + bPtr[sx0 + 1] * fx0 * fy); - - result[1] = cuda::SaturateCast(aPtr[sx1] * (1.0f - fx1) * (1.0f - fy) + bPtr[sx1] * (1.0f - fx1) * fy - + aPtr[sx1 + 1] * fx1 * (1.0f - fy) + bPtr[sx1 + 1] * fx1 * fy); - - result[2] = cuda::SaturateCast(aPtr[sx2] * (1.0f - fx2) * (1.0f - fy) + bPtr[sx2] * (1.0f - fx2) * fy - + aPtr[sx2 + 1] * fx2 * (1.0f - fy) + bPtr[sx2 + 1] * fx2 * fy); - - result[3] = cuda::SaturateCast(aPtr[sx3] * (1.0f - fx3) * (1.0f - fy) + bPtr[sx3] * (1.0f - fx3) * fy - + aPtr[sx3 + 1] * fx3 * (1.0f - fy) + bPtr[sx3 + 1] * fx3 * fy); - } - - //aligned write 4 pixels - _alignedCudaMemcpyQuadVS(dst.ptr(batch_idx, dst_y, dst_x), result); - } -} //resize_bilinear_quad_combo - //******************** Bicubic template @@ -449,192 +229,6 @@ __global__ void resize_bicubic(cuda::ImageBatchVarShapeWrap src, cuda:: } } //resize_bicubic -template -__global__ void resize_bicubic_quad_combo(cuda::ImageBatchVarShapeWrap src, - cuda::ImageBatchVarShapeWrap dst) -{ //optimized for aligned read and write, plus buffering - const float MAX_BUFFERED_X_SCALE = 4.0f; //probably more efficient all the way up to 4.0 - - const int dst_x = (blockIdx.x * blockDim.x + threadIdx.x) * 4; //quad - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - const int dstWidth = dst.width(batch_idx); - const int dstWidth4 = dstWidth & ~3; - const int dstHeight = dst.height(batch_idx); - - //0 - quad-aligned so if one pixel is out, they're all out - if ((dst_x >= dstWidth) | (dst_y >= dstHeight)) - return; - - uint readBuffer[MAX_BUFFER_WORDS_VS]; - T result[4]; - - const int width = src.width(batch_idx); - const int height = src.height(batch_idx); - - const float scale_x = static_cast(width) / dstWidth; - const float scale_y = static_cast(height) 
/ dstHeight; - - //float space for weighted addition - using work_type = cuda::ConvertBaseTypeTo; - - //y coordinate - float fy = (float)((dst_y + 0.5f) * scale_y - 0.5f); - int sy = cuda::round(fy); - fy -= sy; - sy = cuda::max(1, cuda::min(sy, height - 3)); - - const float A = -0.75f; - - float cY[4]; - cY[0] = ((A * (fy + 1) - 5 * A) * (fy + 1) + 8 * A) * (fy + 1) - 4 * A; - cY[1] = ((A + 2) * fy - (A + 3)) * fy * fy + 1; - cY[2] = ((A + 2) * (1 - fy) - (A + 3)) * (1 - fy) * (1 - fy) + 1; - cY[3] = 1.f - cY[0] - cY[1] - cY[2]; - - if (dstWidth != dstWidth4) //non-aligned case, up to 4 pixels - { - uint readBuffer[MAX_BUFFER_WORDS_VS]; - - const int pixels = cuda::min(dstWidth - dst_x, 4); - for (int i = 0; i < pixels; ++i) - { - float fxi = (float)((dst_x + 0.5f + i) * scale_x - 0.5f); - int sxi = cuda::round(fxi); - fxi -= sxi; - fxi *= ((sxi >= 1) && (sxi < width - 3)); - sxi = cuda::max(1, cuda::min(sxi, width - 3)); - - float cX[4]; - cX[0] = ((A * (fxi + 1.0f) - 5.0f * A) * (fxi + 1.0f) + 8.0f * A) * (fxi + 1.0f) - 4.0f * A; - cX[1] = ((A + 2.0f) * fxi - (A + 3.0f)) * fxi * fxi + 1.0f; - cX[2] = ((A + 2.0f) * (1.0f - fxi) - (A + 3.0f)) * (1.0f - fxi) * (1.0f - fxi) + 1.0f; - cX[3] = 1.0f - cX[0] - cX[1] - cX[2]; - - work_type accum = cuda::SetAll(0); -#pragma unroll - for (int row = 0; row < 4; ++row) - { - //1 - load each sub row from sx-1 to sx+3 inclusive, aligned - //const T * aPtr = src.ptr(batch_idx, sy + row - 1, sx-1); - T *aPtr = _cacheAlignedBufferedReadVS( - src, width, readBuffer, MAX_BUFFER_WORDS_VS, batch_idx, sy + row - 1, sxi - 1, sxi + 2); - - //2 - do a pixel's partial on this row - accum += cY[row] * (cX[0] * aPtr[0] + cX[1] * aPtr[1] + cX[2] * aPtr[2] + cX[3] * aPtr[3]); - } //for row -#ifndef LEGACY_BICUBIC_MATH_VS - //correct math - *dst.ptr(batch_idx, dst_y, dst_x + i) = cuda::SaturateCast(accum); -#else - //abs() needed to match legacy operator. 
- *dst.ptr(batch_idx, dst_y, dst_x + i) = cuda::SaturateCast(cuda::abs(accum)); -#endif - } //for pixels - } - else //quad-aligned case, 4 pixels - { - //1 - optimized case if scale_x < some finite limit - if (scale_x <= MAX_BUFFERED_X_SCALE) //local buffering - { //buffered read - - work_type accum[4]; - float fx[4]; - int sx[4]; - float cX[4][4]; - - //initialize data for each pixel position -#pragma unroll - for (int pix = 0; pix < 4; ++pix) - { - accum[pix] = cuda::SetAll(0); - - //1 - precalc sx's ahead of time to get range from sx0-1..sx3+2 - fx[pix] = (float)((dst_x + pix + 0.5f) * scale_x - 0.5f); - sx[pix] = cuda::round(fx[pix]); - fx[pix] -= sx[pix]; - fx[pix] *= ((sx[pix] >= 1) && (sx[pix] < width - 3)); - sx[pix] = cuda::max(1, cuda::min(sx[pix], width - 3)); - - //2 - precalc cX[][] 2D array - cX[pix][0] - = ((A * (fx[pix] + 1.0f) - 5.0f * A) * (fx[pix] + 1.0f) + 8.0f * A) * (fx[pix] + 1.0f) - 4.0f * A; - cX[pix][1] = ((A + 2.0f) * fx[pix] - (A + 3.0f)) * fx[pix] * fx[pix] + 1.0f; - cX[pix][2] = ((A + 2.0f) * (1.0f - fx[pix]) - (A + 3.0f)) * (1.0f - fx[pix]) * (1.0f - fx[pix]) + 1.0f; - cX[pix][3] = 1.0f - cX[pix][0] - cX[pix][1] - cX[pix][2]; - } - const int rowOffset = sx[0] - 1; - - //contribute each row into 4 pixels -#pragma unroll - for (int row = 0; row < 4; ++row) - { - //1 - load each row from sx[0]-1 to sx[3]+3 inclusive, aligned - T *aPtr = _cacheAlignedBufferedReadVS( - src, width, readBuffer, MAX_BUFFER_WORDS_VS, batch_idx, sy + row - 1, sx[0] - 1, sx[3] + 2); - -//2 - do each pixel's partial on this row -#pragma unroll - for (int pix = 0; pix < 4; ++pix) - { - accum[pix] - += cY[row] - * (cX[row][0] * aPtr[sx[pix] - rowOffset - 1] + cX[row][1] * aPtr[sx[pix] - rowOffset + 0] - + cX[row][2] * aPtr[sx[pix] - rowOffset + 1] + cX[row][3] * aPtr[sx[pix] - rowOffset + 2]); - } - } - -#pragma unroll - for (int pix = 0; pix < 4; ++pix) -#ifndef LEGACY_BICUBIC_MATH_VS - result[pix] = cuda::SaturateCast(accum[pix]); -#else - result[pix] = 
cuda::SaturateCast(cuda::abs(accum[pix])); -#endif - } - else - { //partially buffered read 4 pixels at a time across each bicubic: 16 coalesced reads instead of 64 -#pragma unroll - for (int pix = 0; pix < 4; ++pix) - { - work_type accum = cuda::SetAll(0); - - float fx = (float)((dst_x + pix + 0.5f) * scale_x - 0.5f); - int sx = cuda::round(fx); - fx -= sx; - fx *= ((sx >= 1) && (sx < width - 3)); - sx = cuda::max(1, cuda::min(sx, width - 3)); - - float cX[4]; - cX[0] = ((A * (fx + 1.0f) - 5.0f * A) * (fx + 1.0f) + 8.0f * A) * (fx + 1.0f) - 4.0f * A; - cX[1] = ((A + 2.0f) * fx - (A + 3.0f)) * fx * fx + 1.0f; - cX[2] = ((A + 2.0f) * (1.0f - fx) - (A + 3.0f)) * (1.0f - fx) * (1.0f - fx) + 1.0f; - cX[3] = 1.0f - cX[0] - cX[1] - cX[2]; - -#pragma unroll - for (int row = 0; row < 4; ++row) - { - //1 - load each sub row from sx[pix]-1 to sx[pix]+2 inclusive, aligned - //const T * aPtr = src.ptr(batch_idx, sy + row - 1, sx-1); - const T *aPtr = _cacheAlignedBufferedReadVS( - src, width, readBuffer, MAX_BUFFER_WORDS_VS, batch_idx, sy + row - 1, sx - 1, sx + 2); - - //2 - do a pixel's partial on this row - accum += cY[row] * (cX[0] * aPtr[0] + cX[1] * aPtr[1] + cX[2] * aPtr[2] + cX[3] * aPtr[3]); - } //for row -#ifndef LEGACY_BICUBIC_MATH_VS - result[pix] = cuda::SaturateCast(accum); -#else - result[pix] = cuda::SaturateCast(cuda::abs(accum)); -#endif - } //for pix - } - - //aligned write 4 pixels - _alignedCudaMemcpyQuadVS(dst.ptr(batch_idx, dst_y, dst_x), result); - } -} //resize_bicubic_quad_combo - //******************** Integrate area template @@ -832,18 +426,6 @@ __global__ void resize_area_ocv_align(const cuda::ImageBatchVarShapeWrap -__global__ void resize_area_v2(const Filter src, cuda_op::Ptr2dVarShapeNHWC dst) -{ - int dst_x = blockDim.x * blockIdx.x + threadIdx.x; - int dst_y = blockDim.y * blockIdx.y + threadIdx.y; - const int batch_idx = get_batch_idx(); - if (dst_x >= dst.cols[batch_idx] || dst_y >= dst.rows[batch_idx]) - return; - - *dst.ptr(batch_idx, 
dst_y, dst_x) = src(batch_idx, dst_y, dst_x); -} - template void resize(const ImageBatchVarShapeDataStridedCuda &in, const ImageBatchVarShapeDataStridedCuda &out, const int interpolation, cudaStream_t stream) @@ -855,9 +437,6 @@ void resize(const ImageBatchVarShapeDataStridedCuda &in, const ImageBatchVarShap Size2D outMaxSize = out.maxSize(); - bool can_quad = false; // turning it off due to a reported regression - //bool can_quad = false; //<-- force single pixel per kernel mode, smaller register file - const int THREADS_PER_BLOCK = 256; //Performance degrades above 256 and below 16 (GMEM speed limited) const int BLOCK_WIDTH = 8; //as in 32x4 or 32x8 or 8x32. @@ -871,41 +450,22 @@ void resize(const ImageBatchVarShapeDataStridedCuda &in, const ImageBatchVarShap switch (interpolation) { case NVCV_INTERP_NEAREST: - if (can_quad) - { //thread does 4 pixels horizontally for aligned read and write - resize_NN_quad_combo<<>>(src_ptr, dst_ptr); - } - else - { //generic single pixel per thread case - resize_NN<<>>(src_ptr, dst_ptr); - } + resize_NN<<>>(src_ptr, dst_ptr); break; + case NVCV_INTERP_LINEAR: - if (can_quad) - { //thread does 4 pixels horizontally for aligned read and write - resize_bilinear_quad_combo<<>>(src_ptr, dst_ptr); - } - else - { //generic single pixel per thread case - resize_bilinear<<>>(src_ptr, dst_ptr); - } + resize_bilinear<<>>(src_ptr, dst_ptr); break; + case NVCV_INTERP_CUBIC: - if (can_quad) - { //thread does 4 pixels horizontally for aligned read and write - resize_bicubic_quad_combo<<>>(src_ptr, dst_ptr); - } - else - { //generic single pixel per thread case - resize_bicubic<<>>(src_ptr, dst_ptr); - } + resize_bicubic<<>>(src_ptr, dst_ptr); break; + case NVCV_INTERP_AREA: cuda::BorderVarShapeWrap brdSrc(in); - resize_area_ocv_align<<>>(src_ptr, brdSrc, dst_ptr); - break; + } //switch interpolation checkKernelErrors(); diff --git a/src/nvcv_types/include/nvcv/cuda/ArrayWrap.hpp b/src/nvcv_types/include/nvcv/cuda/ArrayWrap.hpp index 
aa768b41..2a6103d8 100644 --- a/src/nvcv_types/include/nvcv/cuda/ArrayWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/ArrayWrap.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,12 +20,11 @@ #include "TypeTraits.hpp" -#include #include #include -#include -#include +#include // for assert, etc. +#include // for iterator_traits, etc. namespace nvcv::cuda { diff --git a/src/nvcv_types/include/nvcv/cuda/Atomics.hpp b/src/nvcv_types/include/nvcv/cuda/Atomics.hpp index 7cc05f87..539c91a6 100644 --- a/src/nvcv_types/include/nvcv/cuda/Atomics.hpp +++ b/src/nvcv_types/include/nvcv/cuda/Atomics.hpp @@ -30,10 +30,12 @@ namespace nvcv::cuda { /** - * Metafunction to do a generic atomic operation in floating-point types. - * * @defgroup NVCV_CPP_CUDATOOLS_ATOMICS Atomic operations * @{ + */ + +/** + * Metafunction to do a generic atomic operation in floating-point types. * * @tparam T Type of the values used in the atomic operation. * @tparam OP Operation class that defines the operator call to be used as atomics. diff --git a/src/nvcv_types/include/nvcv/cuda/BorderWrap.hpp b/src/nvcv_types/include/nvcv/cuda/BorderWrap.hpp index 5c29c6e5..eb0009f4 100644 --- a/src/nvcv_types/include/nvcv/cuda/BorderWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/BorderWrap.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,7 +31,6 @@ #include // for NVCVBorderType, etc. 
#include // for TensorDataStridedCuda, etc. #include // for TensorDataAccessStridedImagePlanar, etc. -#include // for NVCV_ASSERT, etc. namespace nvcv::cuda { @@ -196,7 +195,7 @@ class BorderWrapImpl explicit __host__ BorderWrapImpl(const TensorDataStridedCuda &tensor) : m_tensorWrap(tensor) { - NVCV_ASSERT(tensor.rank() >= kNumDimensions); + assert(tensor.rank() >= kNumDimensions); int j = 0; #pragma unroll @@ -204,7 +203,7 @@ class BorderWrapImpl { if (kActiveDimensions[i]) { - NVCV_ASSERT(tensor.shape(i) <= TypeTraits::max); + assert(tensor.shape(i) <= TypeTraits::max); m_tensorShape[j++] = tensor.shape(i); } @@ -548,9 +547,9 @@ template>> __host__ auto CreateBorderWrapNHW(const TensorDataStridedCuda &tensor, T borderValue = {}) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); - NVCV_ASSERT(tensorAccess); - NVCV_ASSERT(tensorAccess->numRows() <= TypeTraits::max); - NVCV_ASSERT(tensorAccess->numCols() <= TypeTraits::max); + assert(tensorAccess); + assert(tensorAccess->numRows() <= TypeTraits::max); + assert(tensorAccess->numCols() <= TypeTraits::max); auto tensorWrap = CreateTensorWrapNHW(tensor); @@ -580,9 +579,9 @@ template>> __host__ auto CreateBorderWrapNHWC(const TensorDataStridedCuda &tensor, T borderValue = {}) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); - NVCV_ASSERT(tensorAccess); - NVCV_ASSERT(tensorAccess->numRows() <= TypeTraits::max); - NVCV_ASSERT(tensorAccess->numCols() <= TypeTraits::max); + assert(tensorAccess); + assert(tensorAccess->numRows() <= TypeTraits::max); + assert(tensorAccess->numCols() <= TypeTraits::max); auto tensorWrap = CreateTensorWrapNHWC(tensor); diff --git a/src/nvcv_types/include/nvcv/cuda/DropCast.hpp b/src/nvcv_types/include/nvcv/cuda/DropCast.hpp index 8e924d82..4caf13e7 100644 --- a/src/nvcv_types/include/nvcv/cuda/DropCast.hpp +++ b/src/nvcv_types/include/nvcv/cuda/DropCast.hpp @@ -28,6 +28,11 @@ namespace nvcv::cuda { +/** + * @defgroup NVCV_CPP_CUDATOOLS_DROPCAST 
Drop Cast + * @{ + */ + /** * Metafunction to drop components of a compound value. * @@ -37,9 +42,6 @@ namespace nvcv::cuda { * template argument (see example below). The type \p T is not needed as it is inferred from the argument \p v. * It is a requirement of the DropCast function that the type \p T has at least N components. * - * @defgroup NVCV_CPP_CUDATOOLS_DROPCAST Drop Cast - * @{ - * * @code * uint2 dstIdx = DropCast<2>(blockIdx * blockDim + threadIdx); * @endcode diff --git a/src/nvcv_types/include/nvcv/cuda/RangeCast.hpp b/src/nvcv_types/include/nvcv/cuda/RangeCast.hpp index a4c392db..a423d495 100644 --- a/src/nvcv_types/include/nvcv/cuda/RangeCast.hpp +++ b/src/nvcv_types/include/nvcv/cuda/RangeCast.hpp @@ -29,6 +29,11 @@ namespace nvcv::cuda { +/** + * @defgroup NVCV_CPP_CUDATOOLS_RANGECAST Range cast + * @{ + */ + /** * Metafunction to range cast (scale) all elements to a target range. * @@ -46,9 +51,6 @@ namespace nvcv::cuda { * | double | int | [-1, 1] | [-2147483648, 2147483647] | * | unsigned short | double | [0, 65535] | [0, 1] | * - * @defgroup NVCV_CPP_CUDATOOLS_RANGECAST Range cast - * @{ - * * @code * using DataType = MakeType; * using FloatDataType = ConvertBaseTypeTo; diff --git a/src/nvcv_types/include/nvcv/cuda/SaturateCast.hpp b/src/nvcv_types/include/nvcv/cuda/SaturateCast.hpp index 7bd337d4..41dace2a 100644 --- a/src/nvcv_types/include/nvcv/cuda/SaturateCast.hpp +++ b/src/nvcv_types/include/nvcv/cuda/SaturateCast.hpp @@ -29,6 +29,11 @@ namespace nvcv::cuda { +/** + * @defgroup NVCV_CPP_CUDATOOLS_SATURATECAST Saturate cast + * @{ + */ + /** * Metafunction to saturate cast all elements to a target type. * @@ -37,9 +42,6 @@ namespace nvcv::cuda { * casted to an uchar4 rounding-then-saturating each value to be in between 0 and 255 (see example below). It is a * requirement of SaturateCast that both types have the same number of components or \p T is a regular C type. 
* - * @defgroup NVCV_CPP_CUDATOOLS_SATURATECAST Saturate cast - * @{ - * * @code * using DataType = MakeType; * using FloatDataType = ConvertBaseTypeTo; diff --git a/src/nvcv_types/include/nvcv/cuda/StaticCast.hpp b/src/nvcv_types/include/nvcv/cuda/StaticCast.hpp index ab89de88..72f2929f 100644 --- a/src/nvcv_types/include/nvcv/cuda/StaticCast.hpp +++ b/src/nvcv_types/include/nvcv/cuda/StaticCast.hpp @@ -28,6 +28,11 @@ namespace nvcv::cuda { +/** + * @defgroup NVCV_CPP_CUDATOOLS_STATICCAST Static Cast + * @{ + */ + /** * Metafunction to static cast all values of a compound to a target type. * @@ -38,9 +43,6 @@ namespace nvcv::cuda { * type \p U is not needed as it is inferred from the argument \u. It is a requirement of the StaticCast function * that the type \p T is of regular C type and the type \p U is of CUDA compound type. * - * @defgroup NVCV_CPP_CUDATOOLS_STATICCAST Static Cast - * @{ - * * @code * int3 idx = StaticCast(blockIdx * blockDim + threadIdx); * @endcode diff --git a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp index 1cc7143b..b925ce60 100644 --- a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -29,9 +29,9 @@ #include // for ImageDataStridedCuda, etc. #include // for TensorDataStridedCuda, etc. #include // for TensorDataAccessStridedImagePlanar, etc. -#include // for NVCV_ASSERT, etc. -#include +#include // for assert, etc. +#include // for forward, etc. 
namespace nvcv::cuda { @@ -150,7 +150,7 @@ class TensorWrap { constexpr int kStride[] = {std::forward(Strides)...}; - NVCV_ASSERT(tensor.rank() >= kNumDimensions); + assert(tensor.rank() >= kNumDimensions); m_data = reinterpret_cast(tensor.basePtr()); @@ -159,11 +159,11 @@ class TensorWrap { if (kStride[i] != -1) { - NVCV_ASSERT(tensor.stride(i) == kStride[i]); + assert(tensor.stride(i) == kStride[i]); } else if (i < kVariableStrides) { - NVCV_ASSERT(tensor.stride(i) <= TypeTraits::max); + assert(tensor.stride(i) <= TypeTraits::max); m_strides[i] = tensor.stride(i); } @@ -447,9 +447,9 @@ template>> __host__ auto CreateTensorWrapNHW(const TensorDataStridedCuda &tensor) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); - NVCV_ASSERT(tensorAccess); - NVCV_ASSERT(tensorAccess->sampleStride() <= TypeTraits::max); - NVCV_ASSERT(tensorAccess->rowStride() <= TypeTraits::max); + assert(tensorAccess); + assert(tensorAccess->sampleStride() <= TypeTraits::max); + assert(tensorAccess->rowStride() <= TypeTraits::max); return Tensor3DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), static_cast(tensorAccess->rowStride())); @@ -474,10 +474,10 @@ template>> __host__ auto CreateTensorWrapNHWC(const TensorDataStridedCuda &tensor) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); - NVCV_ASSERT(tensorAccess); - NVCV_ASSERT(tensorAccess->sampleStride() <= TypeTraits::max); - NVCV_ASSERT(tensorAccess->rowStride() <= TypeTraits::max); - NVCV_ASSERT(tensorAccess->colStride() <= TypeTraits::max); + assert(tensorAccess); + assert(tensorAccess->sampleStride() <= TypeTraits::max); + assert(tensorAccess->rowStride() <= TypeTraits::max); + assert(tensorAccess->colStride() <= TypeTraits::max); return Tensor4DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), static_cast(tensorAccess->rowStride()), static_cast(tensorAccess->colStride())); diff --git a/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp 
b/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp index d91e3f5f..4287d485 100644 --- a/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp +++ b/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,14 +25,15 @@ #define NVCV_CUDA_MATH_LINALG_HPP #include // for cuda::max, etc. +#include // for cuda::Require, etc. -#include // for std::swap, etc. -#include // for assert, etc. -#include // for std::pow, etc. -#include // for std::size_t, etc. -#include // for std::ostream, etc. -#include // for std::enable_if_t, etc. -#include // for std::vector, etc. +#include // for std::swap, etc. +#include // for assert, etc. +#include // for std::pow, etc. +#include // for std::size_t, etc. +#include // for std::initializer_list, etc. +#include // for std::ostream, etc. +#include // for std::vector, etc. namespace nvcv::cuda::math { @@ -68,6 +69,16 @@ class Vector } } + /** + * Load values from a C++ initiliazer list into this vector. + * + * @param[in] l Input C++ initializer list to load values from. + */ + constexpr __host__ __device__ void load(std::initializer_list l) + { + load(std::data(l)); + } + /** * Store values to a C-array from this vector. * @@ -215,9 +226,12 @@ class Vector /** * Matrix class to represent small matrices. * + * It uses the Vector class to stores each row, storing elements in row-major order, i.e. it has M row vectors + * where each vector has N elements. + * * @tparam T Matrix value type. * @tparam M Number of rows. - * @tparam N Number of columns. Default is M. + * @tparam N Number of columns. Default is M (a square matrix). 
*/ template class Matrix @@ -245,6 +259,16 @@ class Matrix } } + /** + * Load values from a C++ initiliazer list into this matrix. + * + * @param[in] l Input C++ initializer list to load values from. + */ + constexpr __host__ __device__ void load(std::initializer_list l) + { + load(std::data(l)); + } + /** * Store values to a flatten array from this matrix. * @@ -356,6 +380,21 @@ class Matrix return c; } + /** + * Set column j of this matrix. + * + * @param[in] j Index of column to set. + * @param[in] v Value to place in matrix column. + */ + constexpr __host__ __device__ void set_col(int j, T v) + { +#pragma unroll + for (int i = 0; i < rows(); ++i) + { + m_data[i][j] = v; + } + } + /** * Set column j of this matrix. * @@ -516,7 +555,7 @@ template constexpr __host__ __device__ Vector &operator*=(Vector &lhs, const T &rhs) { #pragma unroll - for (int j = 0; j < lhs.size(); ++j) + for (int j = 0; j < N; ++j) { lhs[j] *= rhs; } @@ -540,7 +579,7 @@ template constexpr __host__ __device__ Vector &operator*=(Vector &lhs, const Vector &rhs) { #pragma unroll - for (int j = 0; j < lhs.size(); ++j) + for (int j = 0; j < N; ++j) { lhs[j] *= rhs[j]; } @@ -606,12 +645,12 @@ template std::ostream &operator<<(std::ostream &out, const Vector &v) { out << '['; - for (int i = 0; i < v.size(); ++i) + for (int i = 0; i < N; ++i) { out << v[i]; - if (i < v.size() - 1) + if (i < N - 1) { - out << ','; + out << ' '; } } return out << ']'; @@ -700,12 +739,12 @@ std::ostream &operator<<(std::ostream &out, const Matrix &m) out << m[i][j]; if (j < m.cols() - 1) { - out << ','; + out << ' '; } } if (i < m.rows() - 1) { - out << ";"; + out << "\n"; } } return out << ']'; @@ -835,7 +874,7 @@ constexpr __host__ __device__ Vector operator*(const Matrix &m, c return r; } -template 1)>> +template 1)>> constexpr __host__ __device__ Matrix operator*(const Matrix &m, const Vector &v) { Matrix r; @@ -966,107 +1005,79 @@ constexpr Matrix as_matrix(const T (&values)[N][M]) template constexpr __host__ 
__device__ Vector zeros() { - Vector v; - if constexpr (N > 0) + Vector v = {}; +#pragma unroll + for (int j = 0; j < N; ++j) { -#if __CUDA_ARCH__ -# pragma unroll - for (int j = 0; j < v.size(); ++j) - { - v[j] = T{0}; - } -#else - std::fill(&v[0], &v[N - 1] + 1, T{0}); -#endif + v[j] = T{0}; } - return v; // I'm hoping that RVO will kick in + return v; } template constexpr __host__ __device__ Matrix zeros() { - Matrix mat; - if constexpr (M > 0 && N > 0) + Matrix m = {}; +#pragma unroll + for (int i = 0; i < M; ++i) { -#if __CUDA_ARCH__ -# pragma unroll - for (int i = 0; i < mat.rows(); ++i) +#pragma unroll + for (int j = 0; j < N; ++j) { -# pragma unroll - for (int j = 0; j < mat.cols(); ++j) - { - mat[i][j] = T{0}; - } + m[i][j] = T{0}; } -#else - std::fill(&mat[0][0], &mat[M - 1][N - 1] + 1, T{0}); -#endif } - return mat; // I'm hoping that RVO will kick in + return m; } template constexpr __host__ __device__ Vector ones() { - Vector v; - if constexpr (N > 0) + Vector v = {}; +#pragma unroll + for (int j = 0; j < N; ++j) { -#if __CUDA_ARCH__ -# pragma unroll - for (int j = 0; j < v.size(); ++j) - { - v[j] = T{1}; - } -#else - std::fill(&v[0], &v[N - 1] + 1, T{1}); -#endif + v[j] = T{1}; } - return v; // I'm hoping that RVO will kick in + return v; } template constexpr __host__ __device__ Matrix ones() { - Matrix mat; - if constexpr (M > 0 && N > 0) + Matrix m = {}; +#pragma unroll + for (int i = 0; i < M; ++i) { -#if __CUDA_ARCH__ -# pragma unroll - for (int i = 0; i < mat.rows(); ++i) +#pragma unroll + for (int j = 0; j < N; ++j) { -# pragma unroll - for (int j = 0; j < mat.cols(); ++j) - { - mat[i][j] = T{1}; - } + m[i][j] = T{1}; } -#else - std::fill(&mat[0][0], &mat[M - 1][N - 1] + 1, T{1}); -#endif } - return mat; // I'm hoping that RVO will kick in + return m; } template constexpr __host__ __device__ Matrix identity() { - Matrix mat; - + Matrix m = {}; +#pragma unroll for (int i = 0; i < M; ++i) { +#pragma unroll for (int j = 0; j < N; ++j) { - mat[i][j] 
= i == j ? 1 : 0; + m[i][j] = i == j ? 1 : 0; } } - - return mat; + return m; } template constexpr __host__ __device__ Matrix vander(const Vector &v) { - Matrix m; + Matrix m = {}; for (int i = 0; i < M; ++i) { for (int j = 0; j < M; ++j) @@ -1074,7 +1085,6 @@ constexpr __host__ __device__ Matrix vander(const Vector &v) m[i][j] = cuda::pow(v[j], i); } } - return m; } @@ -1102,7 +1112,7 @@ constexpr __host__ __device__ Matrix compan(const Vector &a) template constexpr __host__ __device__ Matrix diag(const Vector &v) { - Matrix m; + Matrix m = {}; for (int i = 0; i < M; ++i) { for (int j = 0; j < M; ++j) @@ -1120,7 +1130,7 @@ constexpr __host__ __device__ T dot(const Vector &a, const Vector &b { T d = a[0] * b[0]; #pragma unroll - for (int j = 1; j < a.size(); ++j) + for (int j = 1; j < N; ++j) { d += a[j] * b[j]; } @@ -1130,23 +1140,23 @@ constexpr __host__ __device__ T dot(const Vector &a, const Vector &b template constexpr __host__ __device__ Vector reverse(const Vector &a) { - Vector r; + Vector r = {}; #pragma unroll - for (int j = 0; j < r.size(); ++j) + for (int j = 0; j < N; ++j) { - r[j] = a[a.size() - 1 - j]; + r[j] = a[N - 1 - j]; } return r; } -// Transposition --------------------------------------------------------------- +// Transformations ------------------------------------------------------------- template constexpr __host__ __device__ Matrix &transp_inplace(Matrix &m) { - for (int i = 0; i < m.rows(); ++i) + for (int i = 0; i < M; ++i) { - for (int j = i + 1; j < m.cols(); ++j) + for (int j = i + 1; j < M; ++j) { detail::swap(m[i][j], m[j][i]); } @@ -1157,12 +1167,12 @@ constexpr __host__ __device__ Matrix &transp_inplace(Matrix &m template constexpr __host__ __device__ Matrix transp(const Matrix &m) { - Matrix tm; + Matrix tm = {}; #pragma unroll - for (int i = 0; i < m.rows(); ++i) + for (int i = 0; i < M; ++i) { #pragma unroll - for (int j = 0; j < m.cols(); ++j) + for (int j = 0; j < N; ++j) { tm[j][i] = m[i][j]; } @@ -1173,7 +1183,7 @@ 
constexpr __host__ __device__ Matrix transp(const Matrix &m) template constexpr __host__ __device__ Matrix transp(const Vector &v) { - Matrix tv; + Matrix tv = {}; tv.set_col(0, v); return tv; } @@ -1181,12 +1191,12 @@ constexpr __host__ __device__ Matrix transp(const Vector &v) template constexpr __host__ __device__ Matrix flip_rows(const Matrix &m) { - Matrix f; + Matrix f = {}; #pragma unroll - for (int i = 0; i < m.rows(); ++i) + for (int i = 0; i < M; ++i) { #pragma unroll - for (int j = 0; j < m.cols(); ++j) + for (int j = 0; j < N; ++j) { f[i][j] = m[M - 1 - i][j]; } @@ -1197,12 +1207,12 @@ constexpr __host__ __device__ Matrix flip_rows(const Matrix &m template constexpr __host__ __device__ Matrix flip_cols(const Matrix &m) { - Matrix f; + Matrix f = {}; #pragma unroll - for (int i = 0; i < m.rows(); ++i) + for (int i = 0; i < M; ++i) { #pragma unroll - for (int j = 0; j < m.cols(); ++j) + for (int j = 0; j < N; ++j) { f[i][j] = m[i][N - 1 - j]; } @@ -1213,12 +1223,12 @@ constexpr __host__ __device__ Matrix flip_cols(const Matrix &m template constexpr __host__ __device__ Matrix flip(const Matrix &m) { - Matrix f; + Matrix f = {}; #pragma unroll - for (int i = 0; i < m.rows(); ++i) + for (int i = 0; i < M; ++i) { #pragma unroll - for (int j = 0; j < m.cols(); ++j) + for (int j = 0; j < N; ++j) { f[i][j] = m[M - 1 - i][N - 1 - j]; } @@ -1226,99 +1236,225 @@ constexpr __host__ __device__ Matrix flip(const Matrix &m) return f; } -// Determinant ----------------------------------------------------------------- - -template -constexpr __host__ __device__ T det(const Matrix &m) +template> +constexpr __host__ __device__ Matrix head(const Matrix &m) { - return T{1}; + Matrix h; + +#pragma unroll + for (int i = 0; i < R; ++i) + { +#pragma unroll + for (int j = 0; j < N; ++j) + { + h[i][j] = m[i][j]; + } + } + + return h; } -template -constexpr __host__ __device__ T det(const Matrix &m) +template> +constexpr __host__ __device__ Matrix tail(const Matrix &m) { - return 
m[0][0]; + Matrix t; + +#pragma unroll + for (int i = 0; i < R; ++i) + { +#pragma unroll + for (int j = 0; j < N; ++j) + { + t[i][j] = m[M - R + i][j]; + } + } + + return t; } -template -constexpr __host__ __device__ T det(const Matrix &m) +// Advanced operations --------------------------------------------------------- + +// Linear-time invariant (LTI) filtering is a fundamental operation in signal and image processing. Many +// applications use LTI filters that can be expressed as linear, constant-coefficient difference equations. + +// Functions below implement a convolution pass, i.e. finite impulse response (FIR), and a causal/anticausal +// combination of recursive filter passes, i.e. infinite impulse response (IIR), both defined by a set of weights. + +// Definitions: input single element x, block b, length N; filter weights w, order R; prologue p; epilogue e. +// Illustrative example of N=11 and R=3 showing a block b in between previous and next blocks. + +// |----------------- b -----------------| +// | [ e0 e1 e2 ] x x x x x [ p0 p1 p2 ] | + +// FIR + IIR filtering with causal combination is called forward; with anticausal combination is called reverse. 
+ +// Forward (fwd): y = w[0] * x - w[1] * p2 - w[2] * p1 - w[3] * p0 +// The y passed in is considered to be: y = w[0] * x + +// Reverse (rev): z = w[0] * y - w[1] * e0 - w[2] * e1 - w[3] * e2 +// The z passed in is considered to be: z = w[0] * y + +// Forward pass in a single element, updating prologue accordingly and returning result +template +constexpr __host__ __device__ T fwd1(Vector &p, T y, const Vector &w) { - return m[0][0] * m[1][1] - m[0][1] * m[1][0]; + y = y - p[R - 1] * w[1]; + +#pragma unroll + for (int k = R - 1; k >= 1; --k) + { + y = y - p[R - 1 - k] * w[k + 1]; + + p[R - 1 - k] = p[R - 1 - k + 1]; + } + + p[R - 1] = y; + + return y; } -template -constexpr __host__ __device__ T det(const Matrix &m) +// Forward pass in a block of N elements, updating prologue accordingly and in-place +template +constexpr __host__ __device__ void fwdN(Vector &p, Vector &b, const Vector &w) { - return m[0][0] * (m[1][1] * m[2][2] - m[1][2] * m[2][1]) + m[0][1] * (m[1][2] * m[2][0] - m[1][0] * m[2][2]) - + m[0][2] * (m[1][0] * m[2][1] - m[1][1] * m[2][0]); +#pragma unroll + for (int k = 0; k < N; ++k) + { + b[k] = fwd1(p, w[0] * b[k], w); + } } -template -constexpr __host__ __device__ T det(const Matrix &m) +// Forward-transpose pass over rows of a block of MxN elements, updating prologue accordingly and in-place +template +constexpr __host__ void fwdT(Matrix &p, Matrix &b, const Vector &w) { - T d = T{0}; #pragma unroll for (int i = 0; i < M; ++i) { - d += ((i % 2 == 0 ? 
1 : -1) * m[0][i] * det(m.subm(0, i))); + fwdN(p[i], b[i], w); } - return d; } -// Matrix Inverse -------------------------------------------------------------- +// Forward pass over columns of a block of MxN elements, returning result +template +constexpr __host__ Matrix fwd(const Matrix &p, const Matrix &b, const Vector &w) +{ + Matrix bout; -namespace detail { +#pragma unroll + for (int j = 0; j < N; ++j) + { + Vector pT = p.col(j); -template -constexpr __host__ __device__ void inv_inplace(Matrix &m, const T &d) +#pragma unroll + for (int i = 0; i < M; ++i) + { + bout[i][j] = fwd1(pT, b[i][j] * w[0], w); + } + } + + return bout; +} + +// Reverse pass in a single element, updating epilogue accordingly and returning result +template +constexpr __host__ __device__ T rev1(T z, Vector &e, const Vector &w) { - m[0][0] = T{1} / d; + z = z - e[0] * w[1]; + +#pragma unroll + for (int k = R - 1; k >= 1; --k) + { + z = z - e[k] * w[k + 1]; + + e[k] = e[k - 1]; + } + + e[0] = z; + + return z; } -template -constexpr __host__ __device__ void inv_inplace(Matrix &m, const T &d) +// Reverse pass in a block of N elements, updating prologue accordingly and in-place +template +constexpr __host__ __device__ void revN(Vector &b, Vector &e, const Vector &w) { - swap(m[0][0], m[1][1]); - m[0][0] /= d; - m[1][1] /= d; +#pragma unroll + for (int k = N - 1; k >= 0; --k) + { + b[k] = rev1(w[0] * b[k], e, w); + } +} - m[0][1] = -m[0][1] / d; - m[1][0] = -m[1][0] / d; +// Reverse-transpose pass over rows of a block of MxN elements, updating prologue accordingly and in-place +template +constexpr __host__ void revT(Matrix &b, Matrix &e, const Vector &w) +{ +#pragma unroll + for (int i = 0; i < M; ++i) + { + revN(b[i], e[i], w); + } +} + +// Reverse pass over columns of a block of MxN elements, returning result +template +constexpr __host__ Matrix rev(const Matrix &b, const Matrix &e, const Vector &w) +{ + Matrix bout; + +#pragma unroll + for (int j = 0; j < N; ++j) + { + Vector eT = e.col(j); 
+ +#pragma unroll + for (int i = M - 1; i >= 0; --i) + { + bout[i][j] = rev1(b[i][j] * w[0], eT, w); + } + } + + return bout; } +// Determinant ----------------------------------------------------------------- + template -constexpr __host__ __device__ void inv_inplace(Matrix &m, const T &d) +constexpr __host__ __device__ T det(const Matrix &m) { - Matrix A; - A[0][0] = (m[1][1] * m[2][2] - m[1][2] * m[2][1]) / d; - A[0][1] = -(m[0][1] * m[2][2] - m[0][2] * m[2][1]) / d; - A[0][2] = (m[0][1] * m[1][2] - m[0][2] * m[1][1]) / d; - A[1][0] = -(m[1][0] * m[2][2] - m[1][2] * m[2][0]) / d; - A[1][1] = (m[0][0] * m[2][2] - m[0][2] * m[2][0]) / d; - A[1][2] = -(m[0][0] * m[1][2] - m[0][2] * m[1][0]) / d; - A[2][0] = (m[1][0] * m[2][1] - m[1][1] * m[2][0]) / d; - A[2][1] = -(m[0][0] * m[2][1] - m[0][1] * m[2][0]) / d; - A[2][2] = (m[0][0] * m[1][1] - m[0][1] * m[1][0]) / d; + return T{1}; +} - m = A; +template +constexpr __host__ __device__ T det(const Matrix &m) +{ + return m[0][0]; } -} // namespace detail +template +constexpr __host__ __device__ T det(const Matrix &m) +{ + return m[0][0] * m[1][1] - m[0][1] * m[1][0]; +} -// Do inverse in-place of matrix m returning true if succeeded (m has determinant) -template> -constexpr __host__ __device__ bool inv_inplace(Matrix &m) +template +constexpr __host__ __device__ T det(const Matrix &m) { - T d = det(m); + return m[0][0] * (m[1][1] * m[2][2] - m[1][2] * m[2][1]) + m[0][1] * (m[1][2] * m[2][0] - m[1][0] * m[2][2]) + + m[0][2] * (m[1][0] * m[2][1] - m[1][1] * m[2][0]); +} - if (d == 0) +template +constexpr __host__ __device__ T det(const Matrix &m) +{ + T d = T{0}; +#pragma unroll + for (int i = 0; i < M; ++i) { - return false; + d += ((i % 2 == 0 ? 
1 : -1) * m[0][i] * det(m.subm(0, i))); } - - detail::inv_inplace(m, d); - - return true; + return d; } // LU decomposition & solve ---------------------------------------------------- @@ -1327,7 +1463,7 @@ constexpr __host__ __device__ bool inv_inplace(Matrix &m) template constexpr __host__ __device__ bool lu_inplace(Matrix &m, Vector &p) { - Vector v; + Vector v = {}; #pragma unroll for (int i = 0; i < N; ++i) @@ -1396,6 +1532,7 @@ constexpr __host__ __device__ bool lu_inplace(Matrix &m, Vector return true; } +// Solve in-place using given LU decomposition lu and pivot p, the result x is returned in b template constexpr __host__ __device__ void solve_inplace(const Matrix &lu, const Vector &p, Vector &b) { @@ -1439,10 +1576,11 @@ constexpr __host__ __device__ void solve_inplace(const Matrix &lu, cons } } +// Solve in-place m * x = b, where x is returned in b template constexpr __host__ __device__ bool solve_inplace(const Matrix &m, Vector &b) { - Vector p; + Vector p = {}; Matrix LU = m; if (!lu_inplace(LU, p)) @@ -1455,6 +1593,136 @@ constexpr __host__ __device__ bool solve_inplace(const Matrix &m, Vecto return true; } +// Matrix Inverse -------------------------------------------------------------- + +namespace detail { + +// In this detail, all inverse (and in-place) functions use determinant d of the input matrix m +template +constexpr __host__ __device__ void inv_inplace(Matrix &m, const T &d) +{ + m[0][0] = T{1} / d; +} + +template +constexpr __host__ __device__ Matrix inv(const Matrix &m, const T &d) +{ + Matrix A; + inv_inplace(A, d); + return A; +} + +template +constexpr __host__ __device__ void inv_inplace(Matrix &m, const T &d) +{ + detail::swap(m[0][0], m[1][1]); + m[0][0] /= d; + m[1][1] /= d; + + m[0][1] = -m[0][1] / d; + m[1][0] = -m[1][0] / d; +} + +template +constexpr __host__ __device__ Matrix inv(const Matrix &m, const T &d) +{ + Matrix A = m; + inv_inplace(A, d); + + return A; +} + +template +constexpr __host__ __device__ Matrix inv(const 
Matrix &m, const T &d) +{ + Matrix A; + A[0][0] = (m[1][1] * m[2][2] - m[1][2] * m[2][1]) / d; + A[0][1] = -(m[0][1] * m[2][2] - m[0][2] * m[2][1]) / d; + A[0][2] = (m[0][1] * m[1][2] - m[0][2] * m[1][1]) / d; + A[1][0] = -(m[1][0] * m[2][2] - m[1][2] * m[2][0]) / d; + A[1][1] = (m[0][0] * m[2][2] - m[0][2] * m[2][0]) / d; + A[1][2] = -(m[0][0] * m[1][2] - m[0][2] * m[1][0]) / d; + A[2][0] = (m[1][0] * m[2][1] - m[1][1] * m[2][0]) / d; + A[2][1] = -(m[0][0] * m[2][1] - m[0][1] * m[2][0]) / d; + A[2][2] = (m[0][0] * m[1][1] - m[0][1] * m[1][0]) / d; + + return A; +} + +template +constexpr __host__ __device__ void inv_inplace(Matrix &m, const T &d) +{ + m = inv(m, d); +} + +} // namespace detail + +// Do inverse of matrix m asserting its success +template> +constexpr __host__ __device__ Matrix inv(const Matrix &m) +{ + T d = det(m); + assert(d != 0); + return detail::inv(m, d); +} + +// Do inverse in-place of matrix m returning true if succeeded (m has determinant) +template> +constexpr __host__ __device__ bool inv_inplace(Matrix &m) +{ + T d = det(m); + + if (d == 0) + { + return false; + } + + detail::inv_inplace(m, d); + + return true; +} + +// Do inverse in-place of matrix m returning out using LU decomposition written to m +template +constexpr __host__ __device__ void inv_lu_inplace(Matrix &out, Matrix &m) +{ + Vector p = {}; + + bool validResult = lu_inplace(m, p); + assert(validResult); + if (!validResult) + { + return; + } + + out = identity(); + +#pragma unroll + for (int i = 0; i < M; ++i) + { + solve_inplace(m, p, out[i]); + } + + transp_inplace(out); +} + +// Do inverse in-place of matrix m using LU decomposition +template +constexpr __host__ __device__ void inv_lu_inplace(Matrix &m) +{ + Matrix res; + inv_lu_inplace(res, m); + m = res; +} + +// Do inverse using LU decomposition +template +constexpr __host__ __device__ Matrix inv_lu(Matrix m) +{ + inv_lu_inplace(m); + return m; +} + /**@}*/ } // namespace nvcv::cuda::math diff --git 
a/tests/cvcuda/python/cvcuda_test_python.in b/tests/cvcuda/python/cvcuda_test_python.in index f1242e37..94cc6651 100755 --- a/tests/cvcuda/python/cvcuda_test_python.in +++ b/tests/cvcuda/python/cvcuda_test_python.in @@ -15,11 +15,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -tests_dir="@PYTHON_TEST_DIR@" +tests_dir=${1:-"@PYTHON_TEST_DIR@"} python_versions_tentative="@PYTHON_TEST_VERSIONS@" python_versions="" +# Check if test scripts exist in test_dir +if [ ! -f "$tests_dir/cvcuda_util.py" ]; then + # try to find the scripts in relative path + echo "Python test scripts (E.g. cvcuda_util.py, test_op*.py) doesn't exist in $tests_dir" + echo "Trying to find python test scripts via relative path" + cvcuda_test_types_python_tar_dir=$(cd "$(dirname "$0")"; pwd)/../@PYTHON_TEST_INSTDIR@ # relative path in tarball + if [ -f "$cvcuda_test_types_python_tar_dir/cvcuda_util.py" ]; then + echo "Found python test scripts at $cvcuda_test_types_python_tar_dir via relative path" + tests_dir=$cvcuda_test_types_python_tar_dir + else + echo "Cannot find python test scripts in $tests_dir and $cvcuda_test_types_python_tar_dir" + echo "Please run ./cvcuda_test_python [python test scripts folder]" + exit 1 #hard exit + fi +fi + # Verify if correct package dependencies are installed -------- pip_depends="pytest torch" @@ -32,7 +48,10 @@ for ver in $python_versions_tentative; do echo "WARNING: Python version $ver not installed or missing proper dependencies" echo "Please install Python version $ver and run the following commands before running tests: sudo python$ver -m pip install $pip_depends" if [[ "$NVCV_FORCE_PYTHON" == 1 || "$NVCV_FORCE_PYTHON" == yes ]]; then - exit 1 #hard exit + echo "Exiting with FAILURE, as NVCV_FORCE_PYTHON=$NVCV_FORCE_PYTHON" + exit 2 #hard exit + else + echo "Continue and skipping python version $ver, as NVCV_FORCE_PYTHON=$NVCV_FORCE_PYTHON" fi else echo "Found Python version $ver installed with 
proper dependencies, adding to tests" @@ -67,13 +86,13 @@ for ver in $python_versions; do pubsyms=$(readelf -sWD $modfile | grep -v ' UND ' | grep ' GLOBAL ') if [[ $(echo "$pubsyms" | wc -l) != 1 ]]; then echo -e "cvcuda python $ver module is exposing too many symbols:\n$pubsyms" - exit 1 + exit 3 fi if ! echo "$pubsyms" | grep PyInit_cvcuda > /dev/null; then echo -e "cvcuda python $ver module must expose symbol PyInit_cvcuda, but instead exposes:\n$pubsyms" - exit 2 + exit 4 fi # Run python tests - NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" + NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -v --tb=line -o cache_dir="$tmpdir" "$@" "$tests_dir" done diff --git a/tests/cvcuda/python/test_multi_stream.py b/tests/cvcuda/python/test_multi_stream.py new file mode 100644 index 00000000..24a2bf83 --- /dev/null +++ b/tests/cvcuda/python/test_multi_stream.py @@ -0,0 +1,163 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import cvcuda +import pytest as t + + +def test_multiple_streams(): + stream1 = cvcuda.cuda.Stream() # create a new stream + stream2 = cvcuda.cuda.Stream() # create a new stream + stream3 = cvcuda.cuda.Stream() # create a new stream + assert stream1 is not stream2 + assert stream1 is not stream3 + assert cvcuda.cuda.Stream.current is cvcuda.cuda.Stream.default + assert cvcuda.cuda.Stream.current is not stream1 + assert cvcuda.cuda.Stream.current is not stream2 + assert cvcuda.cuda.Stream.current is not stream3 + + +def test_stream_context(): + stream1 = cvcuda.cuda.Stream() # create a new stream + stream2 = cvcuda.cuda.Stream() # create a new stream + with stream1: + assert cvcuda.cuda.Stream.current is stream1 + with stream2: + assert cvcuda.cuda.Stream.current is stream2 + assert cvcuda.cuda.Stream.current is cvcuda.cuda.Stream.default + + +def test_stream_context_nested(): + stream1 = cvcuda.cuda.Stream() # create a new stream + stream2 = cvcuda.cuda.Stream() # create a new stream + with stream1: + assert cvcuda.cuda.Stream.current is stream1 + with stream2: + assert cvcuda.cuda.Stream.current is stream2 + assert cvcuda.cuda.Stream.current is stream1 + assert cvcuda.cuda.Stream.current is cvcuda.cuda.Stream.default + with stream2: + assert cvcuda.cuda.Stream.current is stream2 + assert cvcuda.cuda.Stream.current is cvcuda.cuda.Stream.default + + +def test_stream_context_exception(): + stream1 = cvcuda.cuda.Stream() # create a new stream + stream2 = cvcuda.cuda.Stream() # create a new stream + with t.raises(Exception): + with stream1: + assert cvcuda.cuda.Stream.current is stream1 + with stream2: + assert cvcuda.cuda.Stream.current is stream2 + raise Exception() + assert cvcuda.cuda.Stream.current is stream1 + assert cvcuda.cuda.Stream.current is cvcuda.cuda.Stream.default + with stream2: + assert cvcuda.cuda.Stream.current is stream2 + assert cvcuda.cuda.Stream.current is cvcuda.cuda.Stream.default + + +def test_operator_stream(): + stream1 = 
cvcuda.cuda.Stream() # create a new stream + stream2 = cvcuda.cuda.Stream() # create a new stream + stream3 = cvcuda.cuda.Stream() # create a new stream + assert stream1 is not stream2 + assert stream1 is not stream3 + assert cvcuda.cuda.Stream.current is cvcuda.cuda.Stream.default + assert cvcuda.cuda.Stream.current is not stream1 + assert cvcuda.cuda.Stream.current is not stream2 + assert cvcuda.cuda.Stream.current is not stream3 + with stream1: + assert cvcuda.cuda.Stream.current is stream1 + img = torch.zeros(10, 10, 3, dtype=torch.uint8, device="cuda") + img = cvcuda.as_tensor(img, "HWC") + cvcuda.cvtcolor(img, cvcuda.ColorConversion.BGR2GRAY) + assert cvcuda.cuda.Stream.current is stream1 + with stream2: + assert cvcuda.cuda.Stream.current is stream2 + img = torch.zeros(10, 10, 3, dtype=torch.uint8, device="cuda") + img = cvcuda.as_tensor(img, "HWC") + cvcuda.cvtcolor(img, cvcuda.ColorConversion.BGR2GRAY) + assert cvcuda.cuda.Stream.current is stream2 + with stream3: + assert cvcuda.cuda.Stream.current is stream3 + img = torch.zeros(10, 10, 3, dtype=torch.uint8, device="cuda") + img = cvcuda.as_tensor(img, "HWC") + cvcuda.cvtcolor(img, cvcuda.ColorConversion.BGR2GRAY) + assert cvcuda.cuda.Stream.current is stream3 + assert cvcuda.cuda.Stream.current is cvcuda.cuda.Stream.default + + +def test_operator_changing_stream(): + + N = 10 + H = 1080 + W = 1080 + C = 3 + Loop = 50 + streams = [cvcuda.cuda.Stream() for _ in range(4)] # create a list of streams + + inputTensor = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() + outputTensor = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() + # Perform deep copy + inputTensor_copy = inputTensor.clone() + + inTensor = cvcuda.as_tensor(inputTensor.data, "NHWC") + outTensor = cvcuda.as_tensor(outputTensor.data, "NHWC") + + for _ in range(Loop): + for stream in streams: + cvcuda.flip_into(outTensor, inTensor, -1, stream=stream) # output x flipped + cvcuda.flip_into(inTensor, outTensor, -1, 
stream=stream) # output y flipped + + final_out = torch.as_tensor(inTensor.cuda()).cpu() + assert torch.equal(final_out, inputTensor_copy.cpu()) + + +def test_operator_changing_stream_loaded(): + + N = 10 + H = 1080 + W = 1080 + C = 3 + Loop = 50 + stream1 = cvcuda.cuda.Stream() + stream2 = cvcuda.cuda.Stream() + + inputTensor = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() + inputTensorTmp = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() + outputTensor = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() + # Perform deep copy + inputTensor_copy = inputTensor.clone() + + inTensor = cvcuda.as_tensor(inputTensor.data, "NHWC") + inTensorTmp = cvcuda.as_tensor(inputTensorTmp.data, "NHWC") + outTensor = cvcuda.as_tensor(outputTensor.data, "NHWC") + + for _ in range(Loop): + # put a bunch of work on stream 1 + for _ in range(Loop * 2): + cvcuda.flip(inTensorTmp, 0, stream=stream1) + # put a bunch of work on stream 1 this will happen after the above work on stream 1 + cvcuda.flip_into( + inTensorTmp, inTensor, -1, stream=stream1 + ) # output x/y flipped + cvcuda.flip_into( + outTensor, inTensorTmp, -1, stream=stream2 + ) # output y/y flipped + + final_out = torch.as_tensor(outTensor.cuda()).cpu() + assert torch.equal(final_out, inputTensor_copy.cpu()) diff --git a/tests/cvcuda/python/test_opfindcontours.py b/tests/cvcuda/python/test_opfindcontours.py deleted file mode 100644 index 90e1e89e..00000000 --- a/tests/cvcuda/python/test_opfindcontours.py +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import nvcv -import cvcuda -import pytest as t -import numpy as np -import cvcuda_util as util - -RNG = np.random.default_rng(0) - - -@t.mark.parametrize( - "shape, dtype, layout", - [((1, 16, 23, 1), np.uint8, "NHWC"), ((1, 32, 32, 1), np.uint8, "NHWC")], -) -def test_op_find_contours(shape, dtype, layout): - print(shape, dtype, layout) - image = util.create_tensor(shape, dtype, layout, 1, rng=RNG) - points, num_contours_and_points = cvcuda.find_contours(image) - assert points.shape[0] == image.shape[0] - assert points.shape[2] == 2 - assert points.shape[0] == num_contours_and_points.shape[0] - - stream = cvcuda.Stream() - points = cvcuda.Tensor( - (image.shape[0], 1024, 2), nvcv.Type.S32, nvcv.TensorLayout.NHW - ) - num_points = cvcuda.Tensor( - (image.shape[0], 32), nvcv.Type.U32, nvcv.TensorLayout.NW - ) - points_into, num_contours_and_points_into = cvcuda.find_contours_into( - src=image, - points=points, - num_points=num_points, - stream=stream, - ) - assert points_into is points - assert points_into.shape[0] == image.shape[0] - assert points_into.shape[2] == 2 - assert points_into.shape[0] == num_contours_and_points_into.shape[0] diff --git a/tests/cvcuda/python/test_opfindhomography.py b/tests/cvcuda/python/test_opfindhomography.py index 3f8d5faa..bbc57e9a 100644 --- a/tests/cvcuda/python/test_opfindhomography.py +++ b/tests/cvcuda/python/test_opfindhomography.py @@ -15,11 +15,10 @@ import nvcv import cvcuda +import cvcuda_util import pytest as t import numpy as np -RNG = np.random.default_rng(0) - @t.mark.parametrize( "num_samples, num_points", 
@@ -37,6 +36,10 @@ def test_op_findhomography(num_samples, num_points): assert out.shape == (num_samples, 3, 3) assert out.dtype == np.float32 + create_tensor_args = ((num_samples, num_points, 2), np.float32, "NWC") + src = cvcuda_util.create_tensor(*create_tensor_args) + dst = cvcuda_util.create_tensor(*create_tensor_args) + stream = cvcuda.Stream() out_tensor_args = ((num_samples, 3, 3), np.float32, "NHW") out = cvcuda.Tensor(*out_tensor_args) diff --git a/tests/cvcuda/python/test_oplabel.py b/tests/cvcuda/python/test_oplabel.py index 8a3eb92c..ec61fe23 100644 --- a/tests/cvcuda/python/test_oplabel.py +++ b/tests/cvcuda/python/test_oplabel.py @@ -18,12 +18,12 @@ import numpy as np -DEF_OUT_DTYPE = np.uint32 +DEF_OUT_DTYPE = np.int32 DEF_MAX_CAPACITY = 10000 def defaultNumStats(layout): - return 8 if "D" in layout else 6 + return 9 if "D" in layout else 7 @t.mark.parametrize( @@ -92,7 +92,7 @@ def test_op_label_api(src_args): out, count, stats = cvcuda.label( src, connectivity, - cvcuda.LABEL.SEQUENTIAL, + assign_labels=cvcuda.LABEL.SEQUENTIAL, count=True, stats=True, bg_label=bg_label, @@ -103,6 +103,40 @@ def test_op_label_api(src_args): assert out.shape == src.shape assert out.dtype == DEF_OUT_DTYPE + mask_layout = "".join([lc for lc in src_args[2] if lc != "N"]) + mask_shape = tuple([sv for sv, lc in zip(src_args[0], src_args[2]) if lc != "N"]) + mask = cvcuda.Tensor(mask_shape, np.int8, mask_layout) + + out, count, stats = cvcuda.label( + src, + connectivity, + cvcuda.LABEL.FAST, + mask_type=cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, + count=True, + stats=True, + bg_label=bg_label, + min_size=min_size, + mask=mask, + ) + assert count is not None and stats is not None + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + mask = cvcuda.Tensor(src.shape, np.uint8, src.layout) + + t_out, _, _ = cvcuda.label_into( + out, + count, + stats, + src, + connectivity, + bg_label=bg_label, + min_size=min_size, + mask=mask, 
+ ) + assert t_out is out + t_out, t_count, t_stats = cvcuda.label_into(out, count, stats, src, connectivity) assert t_out is out and t_count is count and t_stats is stats assert out.layout == src.layout @@ -126,6 +160,7 @@ def test_op_label_api(src_args): assert out.shape == src.shape assert out.dtype == DEF_OUT_DTYPE + out = cvcuda.Tensor(src.shape, np.uint32, src.layout) tmp, _, _ = cvcuda.label_into( dst=out, src=src, connectivity=connectivity, stream=stream ) diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index a1406096..e82b1707 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -34,7 +34,6 @@ add_executable(cvcuda_test_system TestOpPairwiseMatcher.cpp TestOpStack.cpp TestOpLabel.cpp - TestOpFindContours.cpp TestOpOSD.cpp TestOpHistogramEq.cpp TestOpAdvCvtColor.cpp diff --git a/tests/cvcuda/system/TestOpFindContours.cpp b/tests/cvcuda/system/TestOpFindContours.cpp deleted file mode 100644 index 8081e28f..00000000 --- a/tests/cvcuda/system/TestOpFindContours.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Definitions.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace gt = ::testing; -namespace test = nvcv::test; -namespace ttest = test::type; - -using CPUImage = std::vector; - -// clang-format off - -using Types = ttest::Concat< - // ttest::Combine, - // ttest::Values<32, 64>>, - // ttest::Values<1, 2, 4, 8, 16>>, - // ttest::Combine, - // ttest::Values<128, 256>>, - // ttest::Values<1, 2, 4>>, - // ttest::Combine, - // ttest::Values<512>>, - // ttest::Values<1, 2>>, - // ttest::Combine, - // ttest::Values<1024>>, - // ttest::Values<1>> - ttest::Combine, - ttest::Values<32, 64, 128, 256, 512, 1024>>, - ttest::Values<1, 2, 4, 8, 16, 32, 64, 128>>, - ttest::Combine, - ttest::Values<1080, 2160>>, - ttest::Values<1, 2, 4, 8>>, - ttest::Combine, - ttest::Values<4320>>, - ttest::Values<1, 2>> ->; -NVCV_TYPED_TEST_SUITE(OpFindContours, Types); - -void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor = {0, 0}, nvcv::Size2D size = {5, 5}, - double angle = 0.0, bool fill = true, uint8_t setValue = 1); - -void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor, nvcv::Size2D size, double angle, - bool fill, uint8_t setValue) -{ - auto rad = angle * (M_PI / 180.0); - auto cosAngle = std::cos(rad); - auto sinAngle = std::sin(rad); - - auto transformed = anchor; - for (auto y = 0; y < size.h; ++y) - { - for (auto x = 0; x < size.w; ++x) - { - transformed.w = anchor.w + (x * cosAngle - y * sinAngle); - transformed.h = anchor.h + (x * sinAngle + y * cosAngle); - - if (fill || y == 0 || y == size.h - 1 || x == 0 || x == size.w - 1) - { - if (transformed.w >= 0 && transformed.w < boundary.w && transformed.h >= 0 - && transformed.h < boundary.h) - { - image[transformed.h * boundary.w + transformed.w] = setValue; - } - } - } - } -} - -TYPED_TEST(OpFindContours, correct_output) -{ - cudaStream_t stream; - 
ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - - int width = ttest::GetValue; - int height = ttest::GetValue; - int numberOfImages = ttest::GetValue; - - nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - auto dtype = nvcv::TYPE_S32; - auto tshape_points = nvcv::TensorShape{ - {numberOfImages, 1024, 2}, - nvcv::TENSOR_NCW - }; - auto tshape_counts = nvcv::TensorShape{ - {numberOfImages, 4}, - nvcv::TENSOR_NW - }; - nvcv::Tensor points{tshape_points, dtype}; - nvcv::Tensor counts{tshape_counts, dtype}; - - auto inData = imgIn.exportData(); - ASSERT_NE(nullptr, inData); - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); - ASSERT_TRUE(inAccess); - ASSERT_EQ(numberOfImages, inAccess->numSamples()); - - auto imgPtr = make_cudaPitchedPtr(reinterpret_cast(inAccess->sampleData(0)), inAccess->rowStride(), width, height); - auto extent = make_cudaExtent(sizeof(uint8_t) * width, height, numberOfImages); - ASSERT_EQ(cudaSuccess, cudaMemset3DAsync(imgPtr, 0, extent)); - - //Generate input - CPUImage srcVec(height * width, 0); - - // Creating a 16-pixel contour (simple) - // Head Node at (5, 5) - generateRectangle(srcVec, {width, height}, {5, 5}); - - // Creating a 26-pixel contour (complex) - // Head Node at (17, 17) - generateRectangle(srcVec, {width, height}, {17, 17}); - generateRectangle(srcVec, {width, height}, {20, 20}); - - // Creating a 12-pixel contour (simple rotated) - // Head Node at (12, 12) - generateRectangle(srcVec, {width, height}, {12, 12}, {5, 5}, 45.0); - - for (auto i = 0; i < numberOfImages; ++i) - { - ASSERT_EQ(cudaSuccess, cudaMemcpy2D(inAccess->sampleData(i), inAccess->rowStride(), srcVec.data(), width, width, - height, cudaMemcpyHostToDevice)); - } - - // Creating contour validator - std::unordered_set expectedSizes{{0, 16, 26, 12}}; - - // run operator - cvcuda::FindContours findContoursOp(nvcv::Size2D{width, height}, numberOfImages); - EXPECT_NO_THROW(findContoursOp(stream, 
imgIn, points, counts)); - ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - - auto outData = counts.exportData(); - ASSERT_NE(nullptr, outData); - auto outAccess = nvcv::TensorDataAccessStrided::Create(*outData); - ASSERT_TRUE(outAccess); - - std::vector hcounts(4, 0); - for (auto i = 0; i < numberOfImages; ++i) - { - ASSERT_EQ(cudaSuccess, cudaMemcpy(hcounts.data(), outAccess->sampleData(i), - hcounts.size() * sizeof(int), cudaMemcpyDeviceToHost)); - - std::unordered_set resultSizes{hcounts.begin(), hcounts.end()}; - EXPECT_EQ(resultSizes, expectedSizes); - } -} diff --git a/tests/cvcuda/system/TestOpLabel.cpp b/tests/cvcuda/system/TestOpLabel.cpp index 12516ab4..3439a590 100644 --- a/tests/cvcuda/system/TestOpLabel.cpp +++ b/tests/cvcuda/system/TestOpLabel.cpp @@ -317,14 +317,16 @@ void SortStats(std::vector>> &stats, std::vector +template void ComputeStats(std::vector>> &stats, const RawBufferType &dstVec, - const RawBufferType &bglVec, const long4 &dstStrides, const long1 &bglStrides, - const std::vector> &labels, const long4 &shape, int numStats) + const RawBufferType &mskVec, const RawBufferType &bglVec, const long4 &dstStrides, + const long4 &mskStrides, const long1 &bglStrides, const std::vector> &labels, + const long4 &shape, long maskN, int numStats) { // One-element-after-the-end label is a special label assigned to a region which got the background label DT endLabel = dstStrides.x / sizeof(DT); + bool hasMask = mskStrides.x > 0; bool hasBgLabel = bglStrides.x > 0; for (long x = 0; x < shape.x; ++x) @@ -350,18 +352,26 @@ void ComputeStats(std::vector>> &stats, const RawBuf if ((hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) || label == posLabel) { - long regionIdx = std::distance(labels[x].cbegin(), fit); + long regionIdx = std::distance(labels[x].cbegin(), fit); + DT regionMark = 0; // region has no marks + + // If has mask and the element is inside the mask + if (hasMask 
&& util::ValueAt(mskVec, mskStrides, long4{maskN == 1 ? 0 : x, y, z, w}) != 0) + { + regionMark = 2; // mark the region as inside the mask (= 2) + } stats[x][regionIdx].resize(numStats); stats[x][regionIdx][0] = label; stats[x][regionIdx][1] = w; stats[x][regionIdx][2] = z; - if (numStats == 6) + if (numStats == 7) { stats[x][regionIdx][3] = 1; stats[x][regionIdx][4] = 1; stats[x][regionIdx][5] = 1; + stats[x][regionIdx][6] = regionMark; } else { @@ -370,6 +380,7 @@ void ComputeStats(std::vector>> &stats, const RawBuf stats[x][regionIdx][5] = 1; stats[x][regionIdx][6] = 1; stats[x][regionIdx][7] = 1; + stats[x][regionIdx][8] = regionMark; } } } @@ -399,7 +410,17 @@ void ComputeStats(std::vector>> &stats, const RawBuf DT bboxAreaW = std::abs(stats[x][regionIdx][1] - w) + 1; DT bboxAreaH = std::abs(stats[x][regionIdx][2] - z) + 1; - if (numStats == 6) + // If has mask and the region has no marks (it is no marked as inside mask) + if (hasMask && stats[x][regionIdx][numStats - 1] == 0) + { + // If element is inside mask + if (util::ValueAt(mskVec, mskStrides, long4{maskN == 1 ? 
0 : x, y, z, w}) != 0) + { + stats[x][regionIdx][numStats - 1] = 2; // mark the region as inside mask (= 2) + } + } + + if (numStats == 7) { stats[x][regionIdx][3] = std::max(stats[x][regionIdx][3], bboxAreaW); stats[x][regionIdx][4] = std::max(stats[x][regionIdx][4], bboxAreaH); @@ -424,7 +445,7 @@ void ComputeStats(std::vector>> &stats, const RawBuf template void RemoveIslands(std::vector> &labels, RawBufferType &dstVec, const RawBufferType &bglVec, const RawBufferType &mszVec, const long4 &dstStrides, const long1 &bglStrides, - const long1 &mszStrides, const std::vector>> &stats, const long4 &shape, + const long1 &mszStrides, std::vector>> &stats, const long4 &shape, int numStats) { for (long x = 0; x < shape.x; ++x) @@ -448,11 +469,15 @@ void RemoveIslands(std::vector> &labels, RawBufferType &dstVec, con } long regionIdx = std::distance(labels[x].cbegin(), fit); - DT regionSize = stats[x][regionIdx][numStats - 1]; + DT regionSize = stats[x][regionIdx][numStats - 2]; - if (regionSize < minSize) + // If region size is smaller than minimum size (it is an island) and the region is not marked + // as inside the mask (= 2), then remove the island and mark it as removed + if (regionSize < minSize && stats[x][regionIdx][numStats - 1] != 2) { util::ValueAt
(dstVec, dstStrides, curCoord) = backgroundLabel; + + stats[x][regionIdx][numStats - 1] = 1; } } } @@ -518,25 +543,29 @@ void Relabel(RawBufferType &dstVec, const RawBufferType &bglVec, const RawBuffer type::Types, type::Value, Type, type::Value, type::Value, \ type::Value, type::Value, type::Value> -// DoPostFilters: (0) none; (1) count regions; (2) + compute statistics; (3) + island removal. +// DoPostFilters: (0) none; (1) count regions; (2) + compute statistics; (3) + island removal; (4) + masked. NVCV_TYPED_TEST_SUITE(OpLabel, type::Types< NVCV_TEST_ROW(NVCV_SHAPE(33, 16, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, false, false, 0, false), NVCV_TEST_ROW(NVCV_SHAPE(23, 81, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, true, false, 1, false), NVCV_TEST_ROW(NVCV_SHAPE(13, 14, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, true, true, 2, false), NVCV_TEST_ROW(NVCV_SHAPE(32, 43, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, false, false, 3, false), + NVCV_TEST_ROW(NVCV_SHAPE(13, 52, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, false, false, 4, false), NVCV_TEST_ROW(NVCV_SHAPE(22, 12, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, false, true, 0, false), NVCV_TEST_ROW(NVCV_SHAPE(15, 16, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, false, true, 1, false), NVCV_TEST_ROW(NVCV_SHAPE(14, 26, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, true, false, 2, true), NVCV_TEST_ROW(NVCV_SHAPE(28, 73, 1, 3), NVCV_DATA_TYPE_U16, uint16_t, true, true, true, 3, true), + NVCV_TEST_ROW(NVCV_SHAPE(19, 61, 1, 3), NVCV_DATA_TYPE_U16, uint16_t, true, true, true, 4, true), NVCV_TEST_ROW(NVCV_SHAPE(23, 21, 12, 1), NVCV_DATA_TYPE_U32, uint32_t, false, false, false, 0, false), NVCV_TEST_ROW(NVCV_SHAPE(33, 41, 22, 1), NVCV_DATA_TYPE_U32, uint32_t, false, false, false, 1, false), NVCV_TEST_ROW(NVCV_SHAPE(25, 38, 13, 2), NVCV_DATA_TYPE_S8, int8_t, true, false, false, 2, false), NVCV_TEST_ROW(NVCV_SHAPE(25, 18, 13, 1), NVCV_DATA_TYPE_S8, int8_t, true, false, false, 3, false), + NVCV_TEST_ROW(NVCV_SHAPE(45, 17, 11, 1), 
NVCV_DATA_TYPE_S8, int8_t, true, false, false, 4, false), NVCV_TEST_ROW(NVCV_SHAPE(22, 37, 19, 2), NVCV_DATA_TYPE_S16, int16_t, true, true, false, 0, false), NVCV_TEST_ROW(NVCV_SHAPE(18, 27, 3, 1), NVCV_DATA_TYPE_S32, int32_t, true, false, true, 1, false), NVCV_TEST_ROW(NVCV_SHAPE(17, 29, 5, 2), NVCV_DATA_TYPE_U8, uint8_t, true, true, true, 2, false), - NVCV_TEST_ROW(NVCV_SHAPE(16, 28, 4, 3), NVCV_DATA_TYPE_U8, uint8_t, true, true, true, 3, true) + NVCV_TEST_ROW(NVCV_SHAPE(16, 28, 4, 3), NVCV_DATA_TYPE_U8, uint8_t, true, true, true, 3, true), + NVCV_TEST_ROW(NVCV_SHAPE(17, 27, 5, 2), NVCV_DATA_TYPE_U8, uint8_t, true, true, true, 4, true) >); // clang-format on @@ -547,10 +576,16 @@ TYPED_TEST(OpLabel, correct_output) int4 shape{type::GetValue}; nvcv::DataType srcDT{type::GetValue}; - nvcv::DataType dstDT{nvcv::TYPE_U32}; + nvcv::DataType dstDT{srcDT.dataKind() == nvcv::DataKind::SIGNED ? nvcv::TYPE_S32 : nvcv::TYPE_U32}; + nvcv::DataType mskDT{srcDT.dataKind() == nvcv::DataKind::SIGNED ? nvcv::TYPE_S8 : nvcv::TYPE_U8}; + + // Testing dstDT/mskDT with S32/S8 when srcDT is signed + // DstT must be U32 even though dstDT may be S32 (ref. code expects it as U32 since it treated it as a mask) + // MskT must be U8 even though mskDT may be S8 (ref. 
code only check if it is zero as outside the mask) using SrcT = type::GetType; using DstT = uint32_t; + using MskT = uint8_t; bool hasBgLabel = type::GetValue; bool hasMinThresh = type::GetValue; @@ -562,14 +597,20 @@ TYPED_TEST(OpLabel, correct_output) // labels (bgl), minimum threshold (min), maximum threshold (max), minimum size for islands removal (msz), // count of labeled regions (count) and statistics computed per labeled region (sta) - nvcv::Tensor srcTensor, dstTensor, bglTensor, minTensor, maxTensor, mszTensor, cntTensor, staTensor; + nvcv::Tensor srcTensor, dstTensor, bglTensor, minTensor, maxTensor, mszTensor, cntTensor, staTensor, mskTensor; - nvcv::Optional srcData, dstData, bglData, minData, maxData, mszData, cntData, staData; + nvcv::Optional srcData, dstData, bglData, minData, maxData, mszData, cntData, staData, + mskData; NVCVConnectivityType connectivity = (shape.z == 1) ? NVCV_CONNECTIVITY_4_2D : NVCV_CONNECTIVITY_6_3D; NVCVLabelType assignLabels = doRelabel ? NVCV_LABEL_SEQUENTIAL : NVCV_LABEL_FAST; + NVCVLabelMaskType maskType = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY; // this is the only mask type allowed + + long maskN{shape.w % 2 == 1 ? 1 : shape.w}; // test a single mask for all N when src/dst N is odd - long3 staShape{shape.w, 10000, (shape.z == 1) ? 6 : 8}; + long4 mskShape{maskN, shape.z, shape.y, shape.x}; // mskShape is NDHW whereas shape is WHDN + + long3 staShape{shape.w, 10000, (shape.z == 1) ? 
7 : 9}; // clang-format off @@ -631,13 +672,20 @@ TYPED_TEST(OpLabel, correct_output) staData = staTensor.exportData(); ASSERT_TRUE(staData); } - if (doPostFilters == 3) + if (doPostFilters >= 3) { mszTensor = nvcv::Tensor({{shape.w}, "N"}, dstDT); mszData = mszTensor.exportData(); ASSERT_TRUE(mszData); } + if (doPostFilters >= 4) + { + mskTensor = nvcv::Tensor({{mskShape.x, mskShape.y, mskShape.z, mskShape.w}, "NDHW"}, mskDT); + + mskData = mskTensor.exportData(); + ASSERT_TRUE(mskData); + } // clang-format on @@ -664,6 +712,16 @@ TYPED_TEST(OpLabel, correct_output) long1 mszStrides{(mszTensor) ? mszData->stride(0) : 0}; long1 cntStrides{(cntTensor) ? cntData->stride(0) : 0}; long3 staStrides = (staTensor) ? long3{staData->stride(0), staData->stride(1), staData->stride(2)} : long3{0, 0, 0}; + long4 mskStrides{0, 0, 0, 0}; + + if (mskTensor) + { + int4 maskIds{mskTensor.layout().find('N'), mskTensor.layout().find('D'), mskTensor.layout().find('H'), + mskTensor.layout().find('W')}; + + mskStrides = long4{mskData->stride(maskIds.x), mskData->stride(maskIds.y), mskData->stride(maskIds.z), + mskData->stride(maskIds.w)}; + } srcStrides.y = (ids.y == -1) ? srcStrides.z * srcShape.z : srcData->stride(ids.y); srcStrides.x = (ids.x == -1) ? 
srcStrides.y * srcShape.y : srcData->stride(ids.x); @@ -672,6 +730,7 @@ TYPED_TEST(OpLabel, correct_output) long srcBufSize = srcStrides.x * srcShape.x; long dstBufSize = dstStrides.x * srcShape.x; + long mskBufSize = mskStrides.x * mskShape.x; long bglBufSize = bglStrides.x * srcShape.x; long minBufSize = minStrides.x * srcShape.x; long maxBufSize = maxStrides.x * srcShape.x; @@ -682,6 +741,7 @@ TYPED_TEST(OpLabel, correct_output) // Third setup: generate raw buffer data and copy them into tensors RawBufferType srcVec(srcBufSize); + RawBufferType mskVec(mskBufSize); RawBufferType bglVec(bglBufSize); RawBufferType minVec(minBufSize); RawBufferType maxVec(maxBufSize); @@ -690,6 +750,7 @@ TYPED_TEST(OpLabel, correct_output) std::default_random_engine rng(0); std::uniform_int_distribution srcRandom(0, 6); + std::uniform_int_distribution mskRandom(0, 1); std::uniform_int_distribution bglRandom(0, (minTensor || maxTensor) ? 1 : 6); std::uniform_int_distribution minRandom(1, 3); std::uniform_int_distribution maxRandom(3, 5); @@ -732,6 +793,16 @@ TYPED_TEST(OpLabel, correct_output) ASSERT_EQ(cudaSuccess, cudaMemcpy(mszData->basePtr(), mszVec.data(), mszBufSize, cudaMemcpyHostToDevice)); } + if (mskTensor) + { + for (long x = 0; x < mskShape.x; ++x) + for (long y = 0; y < mskShape.y; ++y) + for (long z = 0; z < mskShape.z; ++z) + for (long w = 0; w < mskShape.w; ++w) + util::ValueAt(mskVec, mskStrides, long4{x, y, z, w}) = mskRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(mskData->basePtr(), mskVec.data(), mskBufSize, cudaMemcpyHostToDevice)); + } // clang-format on @@ -742,7 +813,7 @@ TYPED_TEST(OpLabel, correct_output) cvcuda::Label op; EXPECT_NO_THROW(op(stream, srcTensor, dstTensor, bglTensor, minTensor, maxTensor, mszTensor, cntTensor, staTensor, - connectivity, assignLabels)); + mskTensor, connectivity, assignLabels, maskType)); ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); @@ -800,8 +871,8 @@ 
TYPED_TEST(OpLabel, correct_output) { ASSERT_EQ(cudaSuccess, cudaMemcpy(staTestVec.data(), staData->basePtr(), staBufSize, cudaMemcpyDeviceToHost)); - ref::ComputeStats(goldStats, labGoldVec, bglVec, dstStrides, bglStrides, goldLabels, srcShape, - staShape.z); + ref::ComputeStats(goldStats, labGoldVec, mskVec, bglVec, dstStrides, mskStrides, bglStrides, + goldLabels, srcShape, maskN, staShape.z); ref::GetLabels(testLabels, cntTestVec, staTestVec, cntStrides, staStrides, srcShape.x); } diff --git a/tests/cvcuda/system/TestOpOSD.cpp b/tests/cvcuda/system/TestOpOSD.cpp index 5ef18eab..d401408f 100644 --- a/tests/cvcuda/system/TestOpOSD.cpp +++ b/tests/cvcuda/system/TestOpOSD.cpp @@ -298,7 +298,7 @@ static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; rb.bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - rb.interpolation = false; + rb.interpolation = (bool)randl(0, 1); element = std::make_shared(type, &rb); break; } diff --git a/tests/nvcv_types/cudatools_system/TestLinAlg.cpp b/tests/nvcv_types/cudatools_system/TestLinAlg.cpp index e820c66a..fcef2cb6 100644 --- a/tests/nvcv_types/cudatools_system/TestLinAlg.cpp +++ b/tests/nvcv_types/cudatools_system/TestLinAlg.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,8 +19,10 @@ #include // for StringLiteral #include // the object of this test +#include // for std::generate, etc. #include // for std::pow, etc. #include // for std::iota, etc. +#include // for std::random_device, etc. #include // for std::stringstream, etc. 
#include // for std::remove_reference_t, etc. @@ -34,6 +36,13 @@ using TStr = typename test::StringLiteral; using schar = signed char; using uchar = unsigned char; +static std::random_device rd; +static std::mt19937 mt(rd()); // to generate random input + +// Maximum absolute error expected given type T as either float or double +template +constexpr T MaxAbsErr = std::is_same_v ? 1e-5 : 1e-8; + #define SCALAR(T, V) ttype::Value #define VEC(T, N, ...) ttype::Value{{__VA_ARGS__}}> @@ -175,6 +184,36 @@ TYPED_TEST(LinAlgVectorTest, load_works) } } +TYPED_TEST(LinAlgVectorTest, load_initializer_list_works) +{ + using VectorType = ttype::GetType; + constexpr int N = ttype::GetValue; + + math::Vector vec; + + std::array arr; + + std::iota(arr.begin(), arr.end(), 0); + + if constexpr (N == 1) + { + vec.load({0}); + } + else if constexpr (N == 3) + { + vec.load({0, 1, 2}); + } + else if constexpr (N == 5) + { + vec.load({0, 1, 2, 3, 4}); + } + + for (int i = 0; i < N; ++i) + { + EXPECT_EQ(vec[i], i); + } +} + TYPED_TEST(LinAlgVectorTest, store_works) { using VectorType = ttype::GetType; @@ -364,6 +403,24 @@ TYPED_TEST(LinAlgMatrixTest, set_col_with_pointer_works) } } +TYPED_TEST(LinAlgMatrixTest, set_col_with_value_works) +{ + using MatrixType = ttype::GetType; + constexpr int M = ttype::GetValue; + constexpr int N = ttype::GetValue; + + math::Matrix mat{{1}}; + + MatrixType val = 1; + + mat.set_col(0, val); + + for (int i = 0; i < M; ++i) + { + EXPECT_EQ(mat[i][0], val); + } +} + TYPED_TEST(LinAlgMatrixTest, load_works) { using MatrixType = ttype::GetType; @@ -388,6 +445,41 @@ TYPED_TEST(LinAlgMatrixTest, load_works) } } +TYPED_TEST(LinAlgMatrixTest, load_initializer_list_works) +{ + using MatrixType = ttype::GetType; + constexpr int M = ttype::GetValue; + constexpr int N = ttype::GetValue; + + math::Matrix mat; + + std::array arr; + + std::iota(arr.begin(), arr.end(), 0); + + if constexpr (M * N == 1) + { + mat.load({0}); + } + else if constexpr (M * N == 6) + { + 
mat.load({0, 1, 2, 3, 4, 5}); + } + else if constexpr (M * N == 12) + { + mat.load({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + } + + int val = 0; + for (int i = 0; i < M; ++i) + { + for (int j = 0; j < N; ++j) + { + EXPECT_EQ(mat[i][j], val++); + } + } +} + TYPED_TEST(LinAlgMatrixTest, store_works) { using MatrixType = ttype::GetType; @@ -568,8 +660,8 @@ TYPED_TEST(LinAlgOperatorLessTest, correct_output) // clang-format off NVCV_TYPED_TEST_SUITE(LinAlgOutputStreamTest, ttype::Types< - ttype::Types>, - ttype::Types> + ttype::Types>, + ttype::Types> >); // clang-format on @@ -1073,10 +1165,10 @@ TYPED_TEST(LinAlgDotAndReverseVectorTest, correct_content_of_reverse) EXPECT_EQ(test, gold); } -// -------------------- Testing LinAlg transp* operations ---------------------- +// ------------------- Testing LinAlg matrix transformations ------------------- // clang-format off -NVCV_TYPED_TEST_SUITE(LinAlgTranspTest, ttype::Zip< +NVCV_TYPED_TEST_SUITE(LinAlgTransfTest, ttype::Zip< test::Types, test::Values<4, 9>, test::Values<8, 13> @@ -1084,11 +1176,11 @@ NVCV_TYPED_TEST_SUITE(LinAlgTranspTest, ttype::Zip< // clang-format on -TYPED_TEST(LinAlgTranspTest, correct_content_of_transp) +TYPED_TEST(LinAlgTransfTest, correct_content_of_transp) { using Type = ttype::GetType; constexpr int M = ttype::GetValue; - constexpr int N = ttype::GetValue; + constexpr int N = ttype::GetValue; math::Matrix mat; @@ -1117,7 +1209,7 @@ TYPED_TEST(LinAlgTranspTest, correct_content_of_transp) EXPECT_EQ(test, gold); } -TYPED_TEST(LinAlgTranspTest, correct_content_of_transp_inplace) +TYPED_TEST(LinAlgTransfTest, correct_content_of_transp_inplace) { using Type = ttype::GetType; constexpr int M = ttype::GetValue; @@ -1152,7 +1244,7 @@ TYPED_TEST(LinAlgTranspTest, correct_content_of_transp_inplace) EXPECT_EQ(test1, gold); } -TYPED_TEST(LinAlgTranspTest, correct_content_of_transp_vector) +TYPED_TEST(LinAlgTransfTest, correct_content_of_transp_vector) { using Type = ttype::GetType; constexpr int M = 
ttype::GetValue; @@ -1181,22 +1273,11 @@ TYPED_TEST(LinAlgTranspTest, correct_content_of_transp_vector) EXPECT_EQ(test, gold); } -// ------------------ Testing LinAlg flip* matrix operations ------------------- - -// clang-format off -NVCV_TYPED_TEST_SUITE(LinAlgFlipMatrixTest, ttype::Zip< - test::Types, - test::Values<4, 9>, - test::Values<8, 13> ->); - -// clang-format on - -TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip) +TYPED_TEST(LinAlgTransfTest, correct_content_of_flip) { using Type = ttype::GetType; constexpr int M = ttype::GetValue; - constexpr int N = ttype::GetValue; + constexpr int N = ttype::GetValue; math::Matrix mat; @@ -1212,7 +1293,7 @@ TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip) EXPECT_EQ(test.rows(), M); EXPECT_EQ(test.cols(), N); - math::Matrix gold; + math::Matrix gold; for (int i = 0; i < gold.rows(); ++i) { @@ -1225,11 +1306,11 @@ TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip) EXPECT_EQ(test, gold); } -TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip_rows) +TYPED_TEST(LinAlgTransfTest, correct_content_of_flip_rows) { using Type = ttype::GetType; constexpr int M = ttype::GetValue; - constexpr int N = ttype::GetValue; + constexpr int N = ttype::GetValue; math::Matrix mat; @@ -1245,7 +1326,7 @@ TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip_rows) EXPECT_EQ(test.rows(), M); EXPECT_EQ(test.cols(), N); - math::Matrix gold; + math::Matrix gold; for (int i = 0; i < gold.rows(); ++i) { @@ -1258,11 +1339,11 @@ TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip_rows) EXPECT_EQ(test, gold); } -TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip_cols) +TYPED_TEST(LinAlgTransfTest, correct_content_of_flip_cols) { using Type = ttype::GetType; constexpr int M = ttype::GetValue; - constexpr int N = ttype::GetValue; + constexpr int N = ttype::GetValue; math::Matrix mat; @@ -1278,7 +1359,7 @@ TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip_cols) EXPECT_EQ(test.rows(), M); 
EXPECT_EQ(test.cols(), N); - math::Matrix gold; + math::Matrix gold; for (int i = 0; i < gold.rows(); ++i) { @@ -1291,14 +1372,240 @@ TYPED_TEST(LinAlgFlipMatrixTest, correct_content_of_flip_cols) EXPECT_EQ(test, gold); } +TYPED_TEST(LinAlgTransfTest, correct_content_of_head) +{ + using Type = ttype::GetType; + constexpr int M = ttype::GetValue; + constexpr int N = ttype::GetValue; + + math::Matrix mat; + + for (int i = 0; i < mat.rows(); ++i) + { + std::iota(mat[i].begin(), mat[i].end(), 0); + } + + auto test = math::head<1>(mat); + + EXPECT_TRUE((std::is_same_v)); + + EXPECT_EQ(test.rows(), 1); + EXPECT_EQ(test.cols(), N); + + math::Matrix gold; + + for (int i = 0; i < gold.rows(); ++i) + { + for (int j = 0; j < gold.cols(); ++j) + { + gold[i][j] = mat[i][j]; + } + } + + EXPECT_EQ(test, gold); +} + +TYPED_TEST(LinAlgTransfTest, correct_content_of_tail) +{ + using Type = ttype::GetType; + constexpr int M = ttype::GetValue; + constexpr int N = ttype::GetValue; + + math::Matrix mat; + + for (int i = 0; i < mat.rows(); ++i) + { + std::iota(mat[i].begin(), mat[i].end(), 0); + } + + auto test = math::tail<1>(mat); + + EXPECT_TRUE((std::is_same_v)); + + EXPECT_EQ(test.rows(), 1); + EXPECT_EQ(test.cols(), N); + + math::Matrix gold; + + for (int i = 0; i < gold.rows(); ++i) + { + for (int j = 0; j < gold.cols(); ++j) + { + gold[i][j] = mat[M - 1 - i][j]; + } + } + + EXPECT_EQ(test, gold); +} + +// -------------------- Testing LinAlg advanced operations --------------------- + +// clang-format off +NVCV_TYPED_TEST_SUITE(LinAlgLTIFilterTest, ttype::Zip< + test::Types, + test::Values<5, 32>, + test::Values<3, 32>, + test::Values<1, 3> +>); + +// clang-format on + +TYPED_TEST(LinAlgLTIFilterTest, correct_content_of_fwd) +{ + using Type = ttype::GetType; + constexpr int M = ttype::GetValue; + constexpr int N = ttype::GetValue; + constexpr int R = ttype::GetValue; + + std::uniform_real_distribution d(-1, 1); + + math::Vector weights; + math::Matrix prologue; + math::Matrix 
block; + + std::generate(weights.begin(), weights.end(), [&]() { return d(mt); }); + for (int i = 0; i < R; ++i) + { + std::generate(prologue[i].begin(), prologue[i].end(), [&]() { return d(mt); }); + } + for (int i = 0; i < M; ++i) + { + std::generate(block[i].begin(), block[i].end(), [&]() { return d(mt); }); + } + + auto test = math::fwd(prologue, block, weights); + + EXPECT_TRUE((std::is_same_v)); + + EXPECT_EQ(test.rows(), M); + EXPECT_EQ(test.cols(), N); + + math::Matrix gold; + + for (int j = 0; j < N; ++j) + { + for (int i = 0; i < M; ++i) + { + Type y = block[i][j] * weights[0] - prologue[R - 1][j] * weights[1]; + + for (int r = R - 1; r >= 1; --r) + { + y = y - prologue[R - 1 - r][j] * weights[r + 1]; + + prologue[R - 1 - r][j] = prologue[R - 1 - r + 1][j]; + } + + gold[i][j] = prologue[R - 1][j] = y; + } + } + + for (int i = 0; i < M; ++i) + { + for (int j = 0; j < N; ++j) + { + EXPECT_NEAR(test[i][j], gold[i][j], MaxAbsErr); + } + } +} + +TYPED_TEST(LinAlgLTIFilterTest, correct_content_of_rev) +{ + using Type = ttype::GetType; + constexpr int M = ttype::GetValue; + constexpr int N = ttype::GetValue; + constexpr int R = ttype::GetValue; + + std::uniform_real_distribution d(-1, 1); + + math::Vector weights; + math::Matrix epilogue; + math::Matrix block; + + std::generate(weights.begin(), weights.end(), [&]() { return d(mt); }); + for (int i = 0; i < R; ++i) + { + std::generate(epilogue[i].begin(), epilogue[i].end(), [&]() { return d(mt); }); + } + for (int i = 0; i < M; ++i) + { + std::generate(block[i].begin(), block[i].end(), [&]() { return d(mt); }); + } + + auto test = math::rev(block, epilogue, weights); + + EXPECT_TRUE((std::is_same_v)); + + EXPECT_EQ(test.rows(), M); + EXPECT_EQ(test.cols(), N); + + math::Matrix gold; + + for (int j = 0; j < N; ++j) + { + for (int i = M - 1; i >= 0; --i) + { + Type z = block[i][j] * weights[0] - epilogue[0][j] * weights[1]; + + for (int r = R - 1; r >= 1; --r) + { + z = z - epilogue[r][j] * weights[r + 1]; + + 
epilogue[r][j] = epilogue[r - 1][j]; + } + + gold[i][j] = epilogue[0][j] = z; + } + } + + for (int i = 0; i < M; ++i) + { + for (int j = 0; j < N; ++j) + { + EXPECT_NEAR(test[i][j], gold[i][j], MaxAbsErr); + } + } +} + // ------------------- Testing LinAlg det matrix operations -------------------- // clang-format off NVCV_TYPED_TEST_SUITE(LinAlgDetMatrixTest, ttype::Zip< - test::Types, - test::Values<2, 3, 4> + test::Types, + test::Values<1, 2, 3, 4> >); +template +void GetTestInput(math::Matrix &input) +{ + if constexpr (M == 1) + { + input.load({0.999998682}); + } + else if constexpr (M == 2) + { + input.load( + {1.00034897, -0.000357094, + 0.000348814, 0.999643171}); + } + else if constexpr (M == 3) + { + input.load( + {1.01250394, -0.02495176, 0.01244351, + 0.01199532, 0.97607735, 0.01192297, + 0.01149353, -0.02290747, 1.01140953}); + } + else + { + static_assert(M == 4); + + input.load( + {1., 0.292789199, 0.384852709, 0.200596131, + 0., 0.941267619, 0.215589234, 0.344613902, + 0., -0.100899228, 0.808642026, 0.146461019, + 0., -0.042882204, -0.157265148, 0.779262512}); + } +} + // clang-format on template @@ -1330,7 +1637,9 @@ TYPED_TEST(LinAlgDetMatrixTest, correct_content_of_det) using Type = ttype::GetType; constexpr int M = ttype::GetValue; - math::Matrix mat = math::identity(); + math::Matrix mat; + + GetTestInput(mat); auto test = math::det(mat); @@ -1340,10 +1649,30 @@ TYPED_TEST(LinAlgDetMatrixTest, correct_content_of_det) Type gold = goldDet(mat); - EXPECT_EQ(test, gold); + EXPECT_NEAR(test, gold, MaxAbsErr); } -// --------------- Testing LinAlg inv_inplace matrix operations ---------------- +// --------------------- Testing LinAlg solve operations ----------------------- + +NVCV_TYPED_TEST_SUITE( + LinAlgSolveTest, + ttype::Types, + ttype::Types, + ttype::Types>); + +TYPED_TEST(LinAlgSolveTest, correct_solve) +{ + auto A = ttype::GetValue; + auto x = ttype::GetValue; + auto b = ttype::GetValue; + + math::solve_inplace(A, b); + + EXPECT_EQ(b, x); +} 
+ +// ------------- Testing LinAlg various inverse matrix operations -------------- // clang-format off NVCV_TYPED_TEST_SUITE(LinAlgInvMatrixTest, ttype::Zip< @@ -1351,98 +1680,110 @@ NVCV_TYPED_TEST_SUITE(LinAlgInvMatrixTest, ttype::Zip< test::Values<1, 2, 3> >); -// clang-format on - template -struct GoldInv +void GetTestInputAndGoldOutput(math::Matrix &input, math::Matrix &output) { - void operator()(math::Matrix &m) {} -}; + GetTestInput(input); -template -struct GoldInv -{ - void operator()(math::Matrix &m) + if constexpr (M == 1) { - m[0][0] = T{1} / m[0][0]; + output.load({1.000001318}); } -}; - -template -struct GoldInv -{ - void operator()(math::Matrix &m) + else if constexpr (M == 2) { - GoldDet goldDet; - T d = goldDet(m); - std::swap(m[0][0], m[1][1]); - m[0][0] /= d; - m[1][1] /= d; - - m[0][1] = -m[0][1] / d; - m[1][0] = -m[1][0] / d; + output.load( + { 0.999651028, 0.000357097, + -0.000348817, 1.000356831}); } -}; + else + { + static_assert(M == 3); -template -struct GoldInv + output.load( + { 0.98749612, 0.02495163, -0.01244344, + -0.01199525, 1.02392251, -0.0119229, + -0.01149346, 0.02290733, 0.98859054}); + } +} + +// clang-format on + +TYPED_TEST(LinAlgInvMatrixTest, correct_content_of_inv) { - void operator()(math::Matrix &m) + using Type = ttype::GetType; + constexpr int M = ttype::GetValue; + + math::Matrix mat, gold, test; + + GetTestInputAndGoldOutput(mat, gold); + + EXPECT_NO_THROW(test = math::inv(mat)); + + for (int i = 0; i < M; ++i) { - GoldDet goldDet; - T d = goldDet(m); - - math::Matrix A; - A[0][0] = (m[1][1] * m[2][2] - m[1][2] * m[2][1]) / d; - A[0][1] = -(m[0][1] * m[2][2] - m[0][2] * m[2][1]) / d; - A[0][2] = (m[0][1] * m[1][2] - m[0][2] * m[1][1]) / d; - A[1][0] = -(m[1][0] * m[2][2] - m[1][2] * m[2][0]) / d; - A[1][1] = (m[0][0] * m[2][2] - m[0][2] * m[2][0]) / d; - A[1][2] = -(m[0][0] * m[1][2] - m[0][2] * m[1][0]) / d; - A[2][0] = (m[1][0] * m[2][1] - m[1][1] * m[2][0]) / d; - A[2][1] = -(m[0][0] * m[2][1] - m[0][1] * 
m[2][0]) / d; - A[2][2] = (m[0][0] * m[1][1] - m[0][1] * m[1][0]) / d; - - m = A; + for (int j = 0; j < M; ++j) + { + EXPECT_NEAR(test[i][j], gold[i][j], MaxAbsErr); + } } -}; +} TYPED_TEST(LinAlgInvMatrixTest, correct_content_of_inv_inplace) { using Type = ttype::GetType; constexpr int M = ttype::GetValue; - math::Matrix mat = math::identity(); + math::Matrix mat, gold; - auto test = mat; + GetTestInputAndGoldOutput(mat, gold); - EXPECT_TRUE(math::inv_inplace(test)); + EXPECT_TRUE(math::inv_inplace(mat)); - GoldInv goldInv; + for (int i = 0; i < M; ++i) + { + for (int j = 0; j < M; ++j) + { + EXPECT_NEAR(mat[i][j], gold[i][j], MaxAbsErr); + } + } +} - auto gold = mat; +TYPED_TEST(LinAlgInvMatrixTest, correct_content_of_inv_lu) +{ + using Type = ttype::GetType; + constexpr int M = ttype::GetValue; - goldInv(gold); + math::Matrix mat, gold, test; - EXPECT_EQ(test, gold); -} + GetTestInputAndGoldOutput(mat, gold); -// --------------------- Testing LinAlg solve operations ----------------------- + EXPECT_NO_THROW(test = math::inv_lu(mat)); -NVCV_TYPED_TEST_SUITE( - LinAlgSolveTest, - ttype::Types, - ttype::Types, - ttype::Types>); + for (int i = 0; i < M; ++i) + { + for (int j = 0; j < M; ++j) + { + EXPECT_NEAR(test[i][j], gold[i][j], MaxAbsErr); + } + } +} -TYPED_TEST(LinAlgSolveTest, correct_solve) +TYPED_TEST(LinAlgInvMatrixTest, correct_content_of_inv_lu_inplace) { - auto A = ttype::GetValue; - auto x = ttype::GetValue; - auto b = ttype::GetValue; + using Type = ttype::GetType; + constexpr int M = ttype::GetValue; - math::solve_inplace(A, b); + math::Matrix mat, gold; - EXPECT_EQ(b, x); + GetTestInputAndGoldOutput(mat, gold); + + EXPECT_NO_THROW(math::inv_lu_inplace(mat)); + + for (int i = 0; i < M; ++i) + { + for (int j = 0; j < M; ++j) + { + EXPECT_NEAR(mat[i][j], gold[i][j], MaxAbsErr); + } + } } diff --git a/tests/nvcv_types/cudatools_system/TestTensorWrap.cpp b/tests/nvcv_types/cudatools_system/TestTensorWrap.cpp index cae960ee..427cf6db 100644 --- 
a/tests/nvcv_types/cudatools_system/TestTensorWrap.cpp +++ b/tests/nvcv_types/cudatools_system/TestTensorWrap.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -1199,7 +1199,7 @@ TEST_P(CreateTensorWrapNHWxTests, correct_properties_in_nhw) EXPECT_EQ(wrap.ptr(), reinterpret_cast(dev->basePtr())); auto tensorAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dev); - NVCV_ASSERT(tensorAccess); + ASSERT_TRUE(tensorAccess); EXPECT_EQ(wrap.strides()[0], tensorAccess->sampleStride()); EXPECT_EQ(wrap.strides()[1], tensorAccess->rowStride()); @@ -1226,7 +1226,7 @@ TEST_P(CreateTensorWrapNHWxTests, correct_properties_in_nhwc) EXPECT_EQ(wrap.ptr(), reinterpret_cast(dev->basePtr())); auto tensorAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dev); - NVCV_ASSERT(tensorAccess); + ASSERT_TRUE(tensorAccess); EXPECT_EQ(wrap.strides()[0], tensorAccess->sampleStride()); EXPECT_EQ(wrap.strides()[1], tensorAccess->rowStride()); diff --git a/tests/nvcv_types/python/nvcv_test_types_python.in b/tests/nvcv_types/python/nvcv_test_types_python.in index ee9d70f8..ee25eda7 100755 --- a/tests/nvcv_types/python/nvcv_test_types_python.in +++ b/tests/nvcv_types/python/nvcv_test_types_python.in @@ -16,11 +16,27 @@ # limitations under the License. -tests_dir="@PYTHON_TEST_DIR@" +tests_dir=${1:-"@PYTHON_TEST_DIR@"} python_versions_tentative="@PYTHON_TEST_VERSIONS@" python_versions="" +# Check if test scripts exist in test_dir +if [ ! -f "$tests_dir/nvcv_util.py" ]; then + # try to find the scripts in relative path + echo "Python test scripts (E.g. 
nvcv_util.py, test_image.py) doesn't exist in $tests_dir" + echo "Trying to find python test scripts via relative path" + nvcv_test_types_python_tar_dir=$(cd "$(dirname "$0")"; pwd)/../@PYTHON_TEST_INSTDIR@ # relative path in tarball + if [ -f "$nvcv_test_types_python_tar_dir/nvcv_util.py" ]; then + echo "Found python test scripts at $nvcv_test_types_python_tar_dir via relative path" + tests_dir=$nvcv_test_types_python_tar_dir + else + echo "Cannot find python test scripts in $tests_dir and $nvcv_test_types_python_tar_dir" + echo "Please run ./nvcv_test_types_python [python test scripts folder]" + exit 1 #hard exit + fi +fi + # Verify if correct package dependencies are installed -------- pip_depends="pytest torch" @@ -33,7 +49,10 @@ for ver in $python_versions_tentative; do echo "WARNING: Python version $ver not installed or missing proper dependencies" echo "Please install Python version $ver and run the following commands before running tests: sudo python$ver -m pip install $pip_depends" if [[ "$NVCV_FORCE_PYTHON" == 1 || "$NVCV_FORCE_PYTHON" == yes ]]; then - exit 1 #hard exit + echo "Exiting with FAILURE, as NVCV_FORCE_PYTHON=$NVCV_FORCE_PYTHON" + exit 2 #hard exit + else + echo "Continue and skipping python version $ver, as NVCV_FORCE_PYTHON=$NVCV_FORCE_PYTHON" fi else echo "Found Python version $ver installed with proper dependencies, adding to tests" @@ -69,13 +88,13 @@ for ver in $python_versions; do pubsyms=$(readelf -sWD $modfile | grep -v ' UND ' | grep ' GLOBAL ') if [[ $(echo "$pubsyms" | wc -l) != 1 ]]; then echo -e "nvcv python $ver module is exposing too many symbols:\n$pubsyms" - exit 1 + exit 3 fi if ! 
echo "$pubsyms" | grep PyInit_nvcv > /dev/null; then echo -e "nvcv python $ver module must expose symbol PyInit_nvcv, but instead exposes:\n$pubsyms" - exit 2 + exit 4 fi # Run python tests - NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" + NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -v --tb=line -o cache_dir="$tmpdir" "$@" "$tests_dir" done diff --git a/tests/nvcv_types/system/TestArray.cpp b/tests/nvcv_types/system/TestArray.cpp index cbc67ab5..dbf204ba 100644 --- a/tests/nvcv_types/system/TestArray.cpp +++ b/tests/nvcv_types/system/TestArray.cpp @@ -163,9 +163,26 @@ TEST_P(ArrayWrapTests, smoke_create) EXPECT_EQ(data->length(), access->length()); EXPECT_EQ(data->kind(), access->kind()); EXPECT_EQ(data->stride(), access->stride()); + EXPECT_EQ(data->rank(), 1); auto array = nvcv::ArrayWrapData(*data); ASSERT_NE(array.handle(), nullptr); + EXPECT_EQ(array.rank(), 1); + EXPECT_EQ(array.capacity(), capacity); + EXPECT_EQ(array.length(), data->length()); + EXPECT_EQ(array.dtype(), data->dtype()); + EXPECT_EQ(array.target(), baseArray.target()); + + auto arrayData = array.exportData(); + ASSERT_TRUE(arrayData); + auto arrayAccess = nvcv::ArrayDataAccess::Create(*arrayData); + ASSERT_TRUE(arrayAccess); + + EXPECT_EQ(arrayData->basePtr(), arrayAccess->ptr()); + EXPECT_EQ(arrayData->length(), arrayAccess->length()); + EXPECT_EQ(arrayData->kind(), arrayAccess->kind()); + EXPECT_EQ(arrayData->stride(), arrayAccess->stride()); + EXPECT_EQ(arrayData->rank(), 1); } INSTANTIATE_TEST_SUITE_P(_, ArrayWrapTests, @@ -402,9 +419,11 @@ TEST(ArrayTests, invalid_out_get_data_type) TEST(ArrayTests, valid_get_allocator) { + int tmp = 1; NVCVArrayHandle arrayHandle; NVCVArrayRequirements req; - NVCVAllocatorHandle alloc; + NVCVAllocatorHandle alloc = reinterpret_cast(&tmp); + EXPECT_NE(alloc, nullptr); EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); EXPECT_EQ(NVCV_SUCCESS, 
nvcvArrayConstruct(&req, nullptr, &arrayHandle)); @@ -473,3 +492,90 @@ TEST(ArrayTests, invalid_out_get_target) EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayGetTarget(arrayHandle, nullptr)); EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); } + +TEST(ArrayTests, validResize) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + int64_t length = 0; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayResize(arrayHandle, 8)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetLength(arrayHandle, &length)); + EXPECT_EQ(length, 8); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalidResize) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayResize(arrayHandle, 17)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayWrapTests, validResize) +{ + NVCVArrayHandle arrayHandle, arrayWrapHandle; + NVCVArrayData arrayData; + NVCVArrayRequirements req; + int64_t length = 0; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayExportData(arrayHandle, &arrayData)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayWrapDataConstruct(&arrayData, nullptr, nullptr, &arrayWrapHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayResize(arrayWrapHandle, 8)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetLength(arrayWrapHandle, &length)); + EXPECT_EQ(length, 8); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayWrapHandle, nullptr)); +} + 
+TEST(ArrayWrapTests, invalidResize) +{ + NVCVArrayHandle arrayHandle, arrayWrapHandle; + NVCVArrayData arrayData; + NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayExportData(arrayHandle, &arrayData)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayWrapDataConstruct(&arrayData, nullptr, nullptr, &arrayWrapHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayResize(arrayHandle, 17)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayWrapHandle, nullptr)); +} + +TEST(ArrayWrapTests, valid_get_allocator) +{ + int tmp = 1; + NVCVArrayHandle arrayHandle, arrayWrapHandle; + NVCVArrayData arrayData; + NVCVArrayRequirements req; + NVCVAllocatorHandle alloc = reinterpret_cast(&tmp); + EXPECT_NE(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayExportData(arrayHandle, &arrayData)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayWrapDataConstruct(&arrayData, nullptr, nullptr, &arrayWrapHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetAllocator(arrayWrapHandle, &alloc)); + EXPECT_EQ(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayWrapHandle, nullptr)); +} diff --git a/tests/nvcv_types/system/TestColorSpec.cpp b/tests/nvcv_types/system/TestColorSpec.cpp index 7d6d6a73..bc06a055 100644 --- a/tests/nvcv_types/system/TestColorSpec.cpp +++ b/tests/nvcv_types/system/TestColorSpec.cpp @@ -629,7 +629,7 @@ TEST(ColorSpaceTests, get_name) // White point =========================== -TEST(WhitePointTests, get_name) +TEST(WhitePointTests, get_name0) { EXPECT_STREQ("NVCV_WHITE_POINT_D65", 
nvcvWhitePointGetName(NVCV_WHITE_POINT_D65)); EXPECT_STREQ("NVCVWhitePoint(255)", nvcvWhitePointGetName(NVCV_WHITE_POINT_FORCE8)); diff --git a/tests/nvcv_types/system/TestImage.cpp b/tests/nvcv_types/system/TestImage.cpp index 98dd6f47..371f4831 100644 --- a/tests/nvcv_types/system/TestImage.cpp +++ b/tests/nvcv_types/system/TestImage.cpp @@ -247,6 +247,36 @@ TEST(Image, smoke_operator) } } +TEST(Image, valid_get_allocator) +{ + int tmp = 1; + NVCVImageHandle handle; + NVCVImageRequirements reqs; + NVCVAllocatorHandle alloc = reinterpret_cast(&tmp); + EXPECT_NE(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageCalcRequirements(224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageConstruct(&reqs, nullptr, &handle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageGetAllocator(handle, &alloc)); + EXPECT_EQ(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageDecRef(handle, nullptr)); +} + +TEST(Image, invalid_out_get_allocator) +{ + NVCVImageHandle handle; + NVCVImageRequirements reqs; + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageCalcRequirements(224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageConstruct(&reqs, nullptr, &handle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageGetAllocator(handle, nullptr)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageDecRef(handle, nullptr)); +} + TEST(ImageWrapData, smoke_cleanup) { nvcv::ImageDataStridedCuda::Buffer buf; @@ -300,6 +330,27 @@ TEST(ImageWrapData, smoke_mem_reqs) } } +TEST(ImageWrapData, valid_get_allocator) +{ + int tmp = 1; + NVCVImageHandle handle, warpHandle; + NVCVImageData imageData; + NVCVImageRequirements reqs; + NVCVAllocatorHandle alloc = reinterpret_cast(&tmp); + EXPECT_NE(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageCalcRequirements(224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageConstruct(&reqs, nullptr, &handle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageExportData(handle, &imageData)); + EXPECT_EQ(NVCV_SUCCESS, 
nvcvImageWrapDataConstruct(&imageData, nullptr, nullptr, &warpHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageGetAllocator(warpHandle, &alloc)); + EXPECT_EQ(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageDecRef(handle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageDecRef(warpHandle, nullptr)); +} + // Future API ideas #if 0 TEST(Image, smoke_image_managed_memory) diff --git a/tests/nvcv_types/system/TestImageBatch.cpp b/tests/nvcv_types/system/TestImageBatch.cpp index cc6b1250..83030ee3 100644 --- a/tests/nvcv_types/system/TestImageBatch.cpp +++ b/tests/nvcv_types/system/TestImageBatch.cpp @@ -628,6 +628,36 @@ TEST(ImageBatch, construct_null_parameters) EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeConstruct(&reqs, nullptr, nullptr)); } +TEST(ImageBatch, valid_get_allocator) +{ + int tmp = 1; + NVCVImageBatchHandle handle; + NVCVImageBatchVarShapeRequirements reqs; + NVCVAllocatorHandle alloc = reinterpret_cast(&tmp); + EXPECT_NE(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeCalcRequirements(5, &reqs)); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeConstruct(&reqs, nullptr, &handle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchGetAllocator(handle, &alloc)); + EXPECT_EQ(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchDecRef(handle, nullptr)); +} + +TEST(ImageBatch, invalid_out_get_allocator) +{ + NVCVImageBatchHandle handle; + NVCVImageBatchVarShapeRequirements reqs; + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeCalcRequirements(5, &reqs)); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeConstruct(&reqs, nullptr, &handle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetAllocator(handle, nullptr)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchDecRef(handle, nullptr)); +} + TEST_F(ImageBatchNullParamTest, get_user_pointer_null_output) { EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetUserPointer(handle, nullptr)); diff --git a/tests/nvcv_types/system/TestTensor.cpp 
b/tests/nvcv_types/system/TestTensor.cpp index 2f45bccc..1e31cb05 100644 --- a/tests/nvcv_types/system/TestTensor.cpp +++ b/tests/nvcv_types/system/TestTensor.cpp @@ -274,6 +274,51 @@ TEST(Tensor, smoke_user_pointer) ASSERT_EQ(nullptr, userPtr); } +TEST(Tensor, valid_get_allocator) +{ + int tmp = 1; + NVCVTensorHandle tensorHandle; + NVCVTensorRequirements reqs; + NVCVAllocatorHandle alloc = reinterpret_cast(&tmp); + EXPECT_NE(alloc, nullptr); + + ASSERT_EQ(NVCV_SUCCESS, nvcvTensorCalcRequirementsForImages(1, 224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); + ASSERT_EQ(NVCV_SUCCESS, nvcvTensorConstruct(&reqs, nullptr, &tensorHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorGetAllocator(tensorHandle, &alloc)); + EXPECT_EQ(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorDecRef(tensorHandle, nullptr)); +} + +TEST(Tensor, layout_ne_op) +{ + NVCVTensorLayout lLayout = NVCV_TENSOR_NHWC; + NVCVTensorLayout rLayout = NVCV_TENSOR_NCHW; + EXPECT_TRUE(lLayout != rLayout); +} + +TEST(TensorWrapData, valid_get_allocator) +{ + int tmp = 1; + NVCVTensorHandle tensorHandle, tensorWrapHandle; + NVCVTensorData tensorData; + NVCVTensorRequirements reqs; + NVCVAllocatorHandle alloc = reinterpret_cast(&tmp); + EXPECT_NE(alloc, nullptr); + + ASSERT_EQ(NVCV_SUCCESS, nvcvTensorCalcRequirementsForImages(1, 224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); + ASSERT_EQ(NVCV_SUCCESS, nvcvTensorConstruct(&reqs, nullptr, &tensorHandle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorExportData(tensorHandle, &tensorData)); + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorWrapDataConstruct(&tensorData, nullptr, nullptr, &tensorWrapHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorGetAllocator(tensorWrapHandle, &alloc)); + EXPECT_EQ(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorDecRef(tensorHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorDecRef(tensorWrapHandle, nullptr)); +} + TEST(TensorWrapData, smoke_create) { nvcv::ImageFormat fmt @@ -626,6 +671,11 @@ TEST_F(TensorTests_Negative, 
invalid_parameter_TensorShapePermute) nvcvTensorShapePermute(srcLayout, srcShape.data(), dstLayout, nullptr)); // null outShape } +TEST_F(TensorTests_Negative, invalid_out_get_allocator) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorGetAllocator(handle, nullptr)); +} + class TensorPermuteTests : public t::TestWithParam< std::tuple, test::Param<"srcShape", std::vector>, diff --git a/tests/nvcv_types/system/TestTensorBatch.cpp b/tests/nvcv_types/system/TestTensorBatch.cpp index 8cd1bf28..8a62d609 100644 --- a/tests/nvcv_types/system/TestTensorBatch.cpp +++ b/tests/nvcv_types/system/TestTensorBatch.cpp @@ -465,3 +465,31 @@ TEST(TensorBatch, set_tensor) EXPECT_EQ(tensorA.refCount(), 4); EXPECT_EQ(tensorB.refCount(), 3); } + +TEST(TensorBatch, valid_get_allocator) +{ + int tmp = 1; + NVCVTensorBatchHandle tensorBatchHandle; + NVCVTensorBatchRequirements req; + NVCVAllocatorHandle alloc = reinterpret_cast(&tmp); + EXPECT_NE(alloc, nullptr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorBatchCalcRequirements(16, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorBatchConstruct(&req, nullptr, &tensorBatchHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorBatchGetAllocator(tensorBatchHandle, &alloc)); + EXPECT_EQ(alloc, nullptr); + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorBatchDecRef(tensorBatchHandle, nullptr)); +} + +TEST(TensorBatch, invalid_out_get_allocator) +{ + NVCVTensorBatchHandle tensorBatchHandle; + NVCVTensorBatchRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorBatchCalcRequirements(16, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorBatchConstruct(&req, nullptr, &tensorBatchHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorBatchGetAllocator(tensorBatchHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvTensorBatchDecRef(tensorBatchHandle, nullptr)); +} diff --git a/tests/nvcv_types/unit/TestCheckError.cpp b/tests/nvcv_types/unit/TestCheckError.cpp index 8478c2e5..1c03441a 100644 --- a/tests/nvcv_types/unit/TestCheckError.cpp +++ 
b/tests/nvcv_types/unit/TestCheckError.cpp @@ -89,3 +89,12 @@ TEST_P(CheckStatusMacroTests, throw_return_something_else) return a; }) } + +TEST(CheckStatusMacroTests, throw_with_extra_string) +{ + const cudaError_t cudaErrCode = cudaErrorTextureFetchFailed; + const char *fmt = " Extra String: %s"; + const char *extraString = "abc\n\0"; + + NVCV_EXPECT_STATUS(NVCV_ERROR_INTERNAL, NVCV_CHECK_THROW(cudaErrCode, fmt, extraString)); +}