From 0765bc19481f771843e6340543b01fecd3f4967a Mon Sep 17 00:00:00 2001 From: Miles Price Date: Wed, 13 Dec 2023 17:39:45 -0800 Subject: [PATCH] feat: adding code for release v0.5.0 (beta-3) of CV-CUDA --- .github/workflows/codeql.yml | 89 + .gitmodules | 3 + 3rdparty/CMakeLists.txt | 8 + 3rdparty/nvbench | 1 + CMakeLists.txt | 9 +- CONTRIBUTING.md | 4 +- DEVELOPER_GUIDE.md | 13 +- LICENSE.md | 2 +- README.md | 238 ++- SECURITY.md | 17 + bench/BenchAdaptiveThreshold.cpp | 96 + bench/BenchAdvCvtColor.cpp | 74 + bench/BenchAverageBlur.cpp | 93 + bench/BenchBilateralFilter.cpp | 96 + bench/BenchBndBox.cpp | 97 + bench/BenchBoxBlur.cpp | 97 + bench/BenchBrightnessContrast.cpp | 91 + bench/BenchCenterCrop.cpp | 85 + bench/BenchChannelReorder.cpp | 73 + bench/BenchColorTwist.cpp | 85 + bench/BenchComposite.cpp | 91 + bench/BenchConv2D.cpp | 83 + bench/BenchConvertTo.cpp | 70 + bench/BenchCopyMakeBorder.cpp | 96 + bench/BenchCropFlipNormalizeReformat.cpp | 116 ++ bench/BenchCustomCrop.cpp | 70 + bench/BenchCvtColor.cpp | 83 + bench/BenchErase.cpp | 95 + bench/BenchFindContours.cpp | 126 ++ bench/BenchFlip.cpp | 99 + bench/BenchGammaContrast.cpp | 75 + bench/BenchGaussian.cpp | 96 + bench/BenchGaussianNoise.cpp | 87 + bench/BenchHistogram.cpp | 71 + bench/BenchHistogramEq.cpp | 77 + bench/BenchInpaint.cpp | 85 + bench/BenchJointBilateralFilter.cpp | 100 + bench/BenchLabel.cpp | 108 + bench/BenchLaplacian.cpp | 91 + bench/BenchMedianBlur.cpp | 86 + bench/BenchMinAreaRect.cpp | 70 + bench/BenchMinMaxLoc.cpp | 92 + bench/BenchMinMaxLoc.hpp | 121 ++ bench/BenchMorphology.cpp | 133 ++ bench/BenchNMS.cpp | 76 + bench/BenchNormalize.cpp | 99 + bench/BenchOSD.cpp | 94 + bench/BenchPadAndStack.cpp | 82 + bench/BenchPairwiseMatcher.cpp | 108 + bench/BenchPillowResize.cpp | 105 + bench/BenchRandomResizedCrop.cpp | 103 + bench/BenchReformat.cpp | 67 + bench/BenchRemap.cpp | 120 ++ bench/BenchResize.cpp | 97 + bench/BenchRotate.cpp | 91 + bench/BenchSIFT.cpp | 109 + bench/BenchStack.cpp | 68 + bench/BenchThreshold.cpp | 85 + bench/BenchUtils.hpp | 324 +++ bench/BenchWarpAffine.cpp | 95 + bench/BenchWarpPerspective.cpp | 95 + bench/CMakeLists.txt | 82 + bench/run_bench.py | 95 + ci/build.sh | 11 +- ci/check_formatting.sh | 42 - cmake/ConfigCompiler.cmake | 2 + cmake/PrintConfig.cmake | 6 + docker/build/Dockerfile | 6 +- docker/config | 3 +- docker/test/Dockerfile | 7 + docker/update_test_image.sh | 2 +- docs/sphinx/content/cvcuda_oplist.csv | 7 +- docs/sphinx/index.rst | 3 +- docs/sphinx/installation.rst | 2 +- docs/sphinx/relnotes/v0.5.0-beta.rst | 75 + .../samples/python_samples/classification.rst | 2 +- .../python_samples/object_detection.rst | 2 +- .../samples/python_samples/segmentation.rst | 2 +- python/mod_cvcuda/CMakeLists.txt | 18 + python/mod_cvcuda/ConnectivityType.cpp | 34 + python/mod_cvcuda/ConnectivityType.hpp | 30 + python/mod_cvcuda/InterpolationType.cpp | 4 +- python/mod_cvcuda/LabelType.cpp | 31 + python/mod_cvcuda/LabelType.hpp | 30 + python/mod_cvcuda/Main.cpp | 15 +- python/mod_cvcuda/NormType.cpp | 32 + python/mod_cvcuda/NormType.hpp | 30 + python/mod_cvcuda/OpAdvCvtColor.cpp | 2 - python/mod_cvcuda/OpCvtColor.cpp | 20 +- python/mod_cvcuda/OpFindContours.cpp | 18 +- python/mod_cvcuda/OpFindHomography.cpp | 330 ++++ python/mod_cvcuda/OpLabel.cpp | 210 ++ python/mod_cvcuda/OpPairwiseMatcher.cpp | 204 ++ python/mod_cvcuda/OpPillowResize.cpp | 99 +- python/mod_cvcuda/OpStack.cpp | 179 ++ python/mod_cvcuda/Operators.hpp | 8 + python/mod_cvcuda/OsdElement.cpp | 380 ++-- 
python/mod_cvcuda/OsdElement.hpp | 2 - python/mod_cvcuda/PairwiseMatcherType.cpp | 29 + python/mod_cvcuda/PairwiseMatcherType.hpp | 30 + python/mod_cvcuda/WorkspaceCache.cpp | 87 + python/mod_cvcuda/WorkspaceCache.hpp | 319 +++ python/mod_cvcuda/exports.ldscript | 22 + python/mod_nvcv/Array.cpp | 350 ++++ python/mod_nvcv/Array.hpp | 106 + python/mod_nvcv/CAPI.cpp | 48 + python/mod_nvcv/CMakeLists.txt | 9 + python/mod_nvcv/CastUtils.hpp | 47 + python/mod_nvcv/DLPackUtils.cpp | 56 +- python/mod_nvcv/DLPackUtils.hpp | 2 + python/mod_nvcv/ExternalBuffer.cpp | 6 +- python/mod_nvcv/Image.cpp | 16 +- python/mod_nvcv/ImageBatch.cpp | 23 + python/mod_nvcv/ImageBatch.hpp | 2 + python/mod_nvcv/Main.cpp | 2 + python/mod_nvcv/Resource.cpp | 2 - python/mod_nvcv/Tensor.cpp | 35 +- python/mod_nvcv/Tensor.hpp | 4 + python/mod_nvcv/TensorBatch.cpp | 261 +++ python/mod_nvcv/TensorBatch.hpp | 96 + python/mod_nvcv/exports.ldscript | 22 + python/mod_nvcv/include/nvcv/python/Array.hpp | 102 + python/mod_nvcv/include/nvcv/python/CAPI.hpp | 14 + python/mod_nvcv/include/nvcv/python/Fwd.hpp | 1 + python/mod_nvcv/include/nvcv/python/Shape.hpp | 25 + .../include/nvcv/python/TensorBatch.hpp | 113 ++ samples/CMakeLists.txt | 3 +- samples/README.md | 8 +- samples/common/python/interop_utils.py | 88 + samples/common/python/perf_utils.py | 10 +- samples/label/python/main.py | 316 +++ samples/object_detection/python/pipelines.py | 24 +- samples/scripts/benchmark.py | 15 +- samples/scripts/install_dependencies.sh | 2 +- samples/scripts/run_samples.sh | 6 + src/cvcuda/CMakeLists.txt | 6 +- src/cvcuda/OpFindHomography.cpp | 67 + src/cvcuda/OpLabel.cpp | 55 + src/cvcuda/OpPairwiseMatcher.cpp | 58 + src/cvcuda/OpPillowResize.cpp | 61 +- src/cvcuda/OpStack.cpp | 53 + src/cvcuda/include/cvcuda/OpFindHomography.h | 151 ++ .../include/cvcuda/OpFindHomography.hpp | 86 + src/cvcuda/include/cvcuda/OpLabel.h | 242 +++ src/cvcuda/include/cvcuda/OpLabel.hpp | 86 + src/cvcuda/include/cvcuda/OpPairwiseMatcher.h | 173 ++ .../include/cvcuda/OpPairwiseMatcher.hpp | 86 + src/cvcuda/include/cvcuda/OpPillowResize.h | 60 +- src/cvcuda/include/cvcuda/OpPillowResize.hpp | 49 +- src/cvcuda/include/cvcuda/OpSIFT.h | 4 +- src/cvcuda/include/cvcuda/OpStack.h | 121 ++ src/cvcuda/include/cvcuda/OpStack.hpp | 79 + src/cvcuda/include/cvcuda/Types.h | 136 +- src/cvcuda/include/cvcuda/Workspace.h | 104 + src/cvcuda/include/cvcuda/Workspace.hpp | 203 ++ src/cvcuda/priv/CMakeLists.txt | 9 +- src/cvcuda/priv/OpBndBox.cpp | 4 +- src/cvcuda/priv/OpBndBox.hpp | 2 +- .../priv/OpCropFlipNormalizeReformat.cu | 4 +- src/cvcuda/priv/OpFindHomography.cu | 1615 +++++++++++++++ src/cvcuda/priv/OpFindHomography.hpp | 78 + src/cvcuda/priv/OpLabel.cu | 1751 +++++++++++++++++ src/cvcuda/priv/OpLabel.hpp | 48 + src/cvcuda/priv/OpMinMaxLoc.cu | 24 +- src/cvcuda/priv/OpPairwiseMatcher.cu | 665 +++++++ src/cvcuda/priv/OpPairwiseMatcher.hpp | 49 + src/cvcuda/priv/OpPillowResize.cpp | 43 +- src/cvcuda/priv/OpPillowResize.hpp | 16 +- src/cvcuda/priv/OpSIFT.cu | 10 + src/cvcuda/priv/OpStack.cpp | 101 + src/cvcuda/priv/OpStack.hpp | 49 + src/cvcuda/priv/Types.hpp | 643 ++++++ src/cvcuda/priv/WorkspaceAllocator.hpp | 216 ++ src/cvcuda/priv/WorkspaceEstimator.hpp | 90 + src/cvcuda/priv/WorkspaceUtil.hpp | 24 + src/cvcuda/priv/legacy/CMakeLists.txt | 1 - src/cvcuda/priv/legacy/CvCudaLegacy.h | 178 +- src/cvcuda/priv/legacy/CvCudaOSD.hpp | 4 - src/cvcuda/priv/legacy/bnd_box.cu | 573 ------ src/cvcuda/priv/legacy/box_blur.cu | 25 +- src/cvcuda/priv/legacy/center_crop.cu | 5 - 
src/cvcuda/priv/legacy/convert_to.cu | 5 - src/cvcuda/priv/legacy/custom_crop.cu | 5 - src/cvcuda/priv/legacy/cvt_color.cu | 5 - src/cvcuda/priv/legacy/cvt_color_var_shape.cu | 14 +- src/cvcuda/priv/legacy/filter.cu | 5 - src/cvcuda/priv/legacy/find_contours.cu | 9 +- src/cvcuda/priv/legacy/flip.cu | 5 - .../priv/legacy/flip_or_copy_var_shape.cu | 5 - .../priv/legacy/histogram_eq_var_shape.cu | 4 +- src/cvcuda/priv/legacy/median_blur.cu | 5 - src/cvcuda/priv/legacy/min_area_rect.cu | 3 +- src/cvcuda/priv/legacy/normalize.cu | 5 - src/cvcuda/priv/legacy/osd.cu | 131 +- src/cvcuda/priv/legacy/pad_and_stack.cu | 5 - src/cvcuda/priv/legacy/pillow_resize.cu | 39 +- src/cvcuda/priv/legacy/pillow_resize.h | 5 +- .../priv/legacy/pillow_resize_var_shape.cu | 101 +- src/cvcuda/priv/legacy/reformat.cu | 5 - src/cvcuda/priv/legacy/resize.cu | 5 - src/cvcuda/priv/legacy/warp.cu | 7 +- src/cvcuda/priv/legacy/warp_var_shape.cu | 2 +- src/nvcv_types/Array.cpp | 17 + src/nvcv_types/CMakeLists.txt | 1 + src/nvcv_types/ImageBatch.cpp | 10 + src/nvcv_types/Tensor.cpp | 38 + src/nvcv_types/TensorBatch.cpp | 318 +++ src/nvcv_types/include/nvcv/Array.h | 14 + src/nvcv_types/include/nvcv/Array.hpp | 2 + src/nvcv_types/include/nvcv/Fwd.h | 9 +- src/nvcv_types/include/nvcv/ImageBatch.h | 6 +- src/nvcv_types/include/nvcv/ImageData.h | 2 +- src/nvcv_types/include/nvcv/Size.h | 42 + src/nvcv_types/include/nvcv/Size.hpp | 99 +- src/nvcv_types/include/nvcv/Tensor.h | 25 + src/nvcv_types/include/nvcv/Tensor.hpp | 6 + src/nvcv_types/include/nvcv/TensorBatch.h | 278 +++ src/nvcv_types/include/nvcv/TensorBatch.hpp | 244 +++ src/nvcv_types/include/nvcv/TensorBatchData.h | 65 + .../include/nvcv/TensorBatchData.hpp | 172 ++ .../include/nvcv/alloc/Allocator.hpp | 1 + .../include/nvcv/cuda/BorderVarShapeWrap.hpp | 16 +- .../nvcv/cuda/ImageBatchVarShapeWrap.hpp | 9 +- .../include/nvcv/cuda/math/LinAlg.hpp | 2 +- src/nvcv_types/include/nvcv/detail/Align.hpp | 106 + .../include/nvcv/detail/ArrayImpl.hpp | 7 + .../include/nvcv/detail/TensorBatchImpl.hpp | 266 +++ .../include/nvcv/detail/TensorImpl.hpp | 9 + src/nvcv_types/priv/Array.cpp | 9 + src/nvcv_types/priv/Array.hpp | 2 + src/nvcv_types/priv/ArrayWrapData.cpp | 8 + src/nvcv_types/priv/ArrayWrapData.hpp | 2 + src/nvcv_types/priv/CMakeLists.txt | 1 + src/nvcv_types/priv/Context.cpp | 5 +- src/nvcv_types/priv/Context.hpp | 14 +- src/nvcv_types/priv/HandleTraits.hpp | 6 + src/nvcv_types/priv/IArray.hpp | 2 + src/nvcv_types/priv/IContext.hpp | 15 +- src/nvcv_types/priv/ITensorBatch.hpp | 66 + src/nvcv_types/priv/TensorBatch.cpp | 339 ++++ src/nvcv_types/priv/TensorBatch.hpp | 107 + src/nvcv_types/priv/TensorBatchManager.hpp | 36 + src/nvcv_types/priv/TensorData.cpp | 164 +- src/nvcv_types/priv/TensorData.hpp | 5 +- src/util/CMakeLists.txt | 3 + src/util/Event.cpp | 56 + src/util/Event.hpp | 59 + src/util/PerStreamCache.hpp | 265 +++ src/util/PerStreamCacheImpl.hpp | 330 ++++ src/util/SimpleCache.hpp | 137 ++ src/util/Stream.cpp | 70 + src/util/Stream.hpp | 58 + src/util/StreamId.cpp | 150 ++ src/util/StreamId.hpp | 47 + src/util/TensorDataUtils.cpp | 150 +- src/util/TensorDataUtils.hpp | 105 +- src/util/UniqueHandle.hpp | 191 ++ tests/CMakeLists.txt | 4 +- tests/common/CheckStatus.hpp | 11 + tests/cvcuda/CMakeLists.txt | 5 +- tests/cvcuda/python/CMakeLists.txt | 2 +- tests/cvcuda/python/cvcuda_test_python.in | 22 +- tests/cvcuda/python/cvcuda_util.py | 24 +- .../python/test_adaptivethresholdtype.py | 1 + tests/cvcuda/python/test_bordertype.py | 3 +- 
tests/cvcuda/python/test_import_order.py | 25 + tests/cvcuda/python/test_interptype.py | 3 +- .../cvcuda/python/test_opadaptivethreshold.py | 1 + tests/cvcuda/python/test_opbndbox.py | 115 +- tests/cvcuda/python/test_opboxblur.py | 25 +- tests/cvcuda/python/test_opfindcontours.py | 12 +- tests/cvcuda/python/test_opfindhomography.py | 92 + tests/cvcuda/python/test_ophistogram.py | 2 +- tests/cvcuda/python/test_oplabel.py | 135 ++ tests/cvcuda/python/test_opmatch.py | 212 ++ tests/cvcuda/python/test_opmorphology.py | 2 +- tests/cvcuda/python/test_opnms.py | 2 +- tests/cvcuda/python/test_oposd.py | 193 +- tests/cvcuda/python/test_oppillowresize.py | 2 +- tests/cvcuda/python/test_opreformat.py | 2 +- tests/cvcuda/python/test_opremap.py | 2 +- tests/cvcuda/python/test_opstack.py | 103 + tests/cvcuda/python/test_opwarpperspective.py | 11 + tests/cvcuda/python/test_util.py | 2 +- tests/cvcuda/system/CMakeLists.txt | 8 +- tests/cvcuda/system/OsdUtils.cu | 16 +- tests/cvcuda/system/OsdUtils.cuh | 1 - tests/cvcuda/system/TestOpBndBox.cpp | 28 +- tests/cvcuda/system/TestOpBoxBlur.cpp | 30 +- tests/cvcuda/system/TestOpFindHomography.cpp | 394 ++++ tests/cvcuda/system/TestOpLabel.cpp | 835 ++++++++ tests/cvcuda/system/TestOpOSD.cpp | 365 ++-- tests/cvcuda/system/TestOpPairwiseMatcher.cpp | 442 +++++ tests/cvcuda/system/TestOpPillowResize.cpp | 16 +- tests/cvcuda/system/TestOpStack.cpp | 190 ++ tests/cvcuda/system/TestOpWarpPerspective.cpp | 2 +- tests/cvcuda/unit/CMakeLists.txt | 34 + tests/cvcuda/unit/Definitions.hpp | 26 + tests/cvcuda/unit/TestWorkspaceAllocator.cpp | 203 ++ tests/cvcuda/unit/TestWorkspaceEstimator.cpp | 69 + .../cudatools_system/CMakeLists.txt | 2 +- .../DeviceBorderVarShapeWrap.cu | 4 +- .../DeviceImageBatchVarShapeWrap.cu | 2 +- .../nvcv_types/cudatools_unit/CMakeLists.txt | 4 +- tests/nvcv_types/python/CMakeLists.txt | 2 +- .../python/nvcv_test_types_python.in | 20 +- tests/nvcv_types/python/test_image.py | 4 +- .../python/test_imgbatchvarshape.py | 55 +- tests/nvcv_types/python/test_import_order.py | 25 + tests/nvcv_types/python/test_stream.py | 2 +- tests/nvcv_types/python/test_tensor.py | 134 +- tests/nvcv_types/python/test_tensor_batch.py | 227 +++ tests/nvcv_types/system/CMakeLists.txt | 5 +- tests/nvcv_types/system/TestColorSpec.cpp | 145 +- tests/nvcv_types/system/TestDataLayout.cpp | 53 + tests/nvcv_types/system/TestImageBatch.cpp | 269 +++ tests/nvcv_types/system/TestSize.cpp | 16 +- tests/nvcv_types/system/TestTensorBatch.cpp | 467 +++++ .../nvcv_types/system/TestTensorDataUtils.cpp | 37 +- tests/nvcv_types/unit/CMakeLists.txt | 6 +- tests/nvcv_types/unit/TestPerStreamCache.cpp | 396 ++++ tests/nvcv_types/unit/TestSimpleCache.cpp | 92 + tests/nvcv_types/unit/TestStreamId.cpp | 128 ++ tests/run_tests.sh.in | 20 +- 324 files changed, 25820 insertions(+), 2246 deletions(-) create mode 100644 .github/workflows/codeql.yml create mode 160000 3rdparty/nvbench create mode 100644 bench/BenchAdaptiveThreshold.cpp create mode 100644 bench/BenchAdvCvtColor.cpp create mode 100644 bench/BenchAverageBlur.cpp create mode 100644 bench/BenchBilateralFilter.cpp create mode 100644 bench/BenchBndBox.cpp create mode 100644 bench/BenchBoxBlur.cpp create mode 100644 bench/BenchBrightnessContrast.cpp create mode 100644 bench/BenchCenterCrop.cpp create mode 100644 bench/BenchChannelReorder.cpp create mode 100644 bench/BenchColorTwist.cpp create mode 100644 bench/BenchComposite.cpp create mode 100644 bench/BenchConv2D.cpp create mode 100644 bench/BenchConvertTo.cpp create mode 100644 
bench/BenchCopyMakeBorder.cpp create mode 100644 bench/BenchCropFlipNormalizeReformat.cpp create mode 100644 bench/BenchCustomCrop.cpp create mode 100644 bench/BenchCvtColor.cpp create mode 100644 bench/BenchErase.cpp create mode 100644 bench/BenchFindContours.cpp create mode 100644 bench/BenchFlip.cpp create mode 100644 bench/BenchGammaContrast.cpp create mode 100644 bench/BenchGaussian.cpp create mode 100644 bench/BenchGaussianNoise.cpp create mode 100644 bench/BenchHistogram.cpp create mode 100644 bench/BenchHistogramEq.cpp create mode 100644 bench/BenchInpaint.cpp create mode 100644 bench/BenchJointBilateralFilter.cpp create mode 100644 bench/BenchLabel.cpp create mode 100644 bench/BenchLaplacian.cpp create mode 100644 bench/BenchMedianBlur.cpp create mode 100644 bench/BenchMinAreaRect.cpp create mode 100644 bench/BenchMinMaxLoc.cpp create mode 100644 bench/BenchMinMaxLoc.hpp create mode 100644 bench/BenchMorphology.cpp create mode 100644 bench/BenchNMS.cpp create mode 100644 bench/BenchNormalize.cpp create mode 100644 bench/BenchOSD.cpp create mode 100644 bench/BenchPadAndStack.cpp create mode 100644 bench/BenchPairwiseMatcher.cpp create mode 100644 bench/BenchPillowResize.cpp create mode 100644 bench/BenchRandomResizedCrop.cpp create mode 100644 bench/BenchReformat.cpp create mode 100644 bench/BenchRemap.cpp create mode 100644 bench/BenchResize.cpp create mode 100644 bench/BenchRotate.cpp create mode 100644 bench/BenchSIFT.cpp create mode 100644 bench/BenchStack.cpp create mode 100644 bench/BenchThreshold.cpp create mode 100644 bench/BenchUtils.hpp create mode 100644 bench/BenchWarpAffine.cpp create mode 100644 bench/BenchWarpPerspective.cpp create mode 100644 bench/CMakeLists.txt create mode 100644 bench/run_bench.py delete mode 100755 ci/check_formatting.sh create mode 100644 docs/sphinx/relnotes/v0.5.0-beta.rst create mode 100644 python/mod_cvcuda/ConnectivityType.cpp create mode 100644 python/mod_cvcuda/ConnectivityType.hpp create mode 100644 python/mod_cvcuda/LabelType.cpp create mode 100644 python/mod_cvcuda/LabelType.hpp create mode 100644 python/mod_cvcuda/NormType.cpp create mode 100644 python/mod_cvcuda/NormType.hpp create mode 100644 python/mod_cvcuda/OpFindHomography.cpp create mode 100644 python/mod_cvcuda/OpLabel.cpp create mode 100644 python/mod_cvcuda/OpPairwiseMatcher.cpp create mode 100644 python/mod_cvcuda/OpStack.cpp create mode 100644 python/mod_cvcuda/PairwiseMatcherType.cpp create mode 100644 python/mod_cvcuda/PairwiseMatcherType.hpp create mode 100644 python/mod_cvcuda/WorkspaceCache.cpp create mode 100644 python/mod_cvcuda/WorkspaceCache.hpp create mode 100644 python/mod_cvcuda/exports.ldscript create mode 100644 python/mod_nvcv/Array.cpp create mode 100644 python/mod_nvcv/Array.hpp create mode 100644 python/mod_nvcv/CastUtils.hpp create mode 100644 python/mod_nvcv/TensorBatch.cpp create mode 100644 python/mod_nvcv/TensorBatch.hpp create mode 100644 python/mod_nvcv/exports.ldscript create mode 100644 python/mod_nvcv/include/nvcv/python/Array.hpp create mode 100644 python/mod_nvcv/include/nvcv/python/TensorBatch.hpp create mode 100644 samples/common/python/interop_utils.py create mode 100644 samples/label/python/main.py create mode 100644 src/cvcuda/OpFindHomography.cpp create mode 100644 src/cvcuda/OpLabel.cpp create mode 100644 src/cvcuda/OpPairwiseMatcher.cpp create mode 100644 src/cvcuda/OpStack.cpp create mode 100644 src/cvcuda/include/cvcuda/OpFindHomography.h create mode 100644 src/cvcuda/include/cvcuda/OpFindHomography.hpp create mode 100644 
src/cvcuda/include/cvcuda/OpLabel.h create mode 100644 src/cvcuda/include/cvcuda/OpLabel.hpp create mode 100644 src/cvcuda/include/cvcuda/OpPairwiseMatcher.h create mode 100644 src/cvcuda/include/cvcuda/OpPairwiseMatcher.hpp create mode 100644 src/cvcuda/include/cvcuda/OpStack.h create mode 100644 src/cvcuda/include/cvcuda/OpStack.hpp create mode 100644 src/cvcuda/include/cvcuda/Workspace.h create mode 100644 src/cvcuda/include/cvcuda/Workspace.hpp create mode 100644 src/cvcuda/priv/OpFindHomography.cu create mode 100644 src/cvcuda/priv/OpFindHomography.hpp create mode 100644 src/cvcuda/priv/OpLabel.cu create mode 100644 src/cvcuda/priv/OpLabel.hpp create mode 100644 src/cvcuda/priv/OpPairwiseMatcher.cu create mode 100644 src/cvcuda/priv/OpPairwiseMatcher.hpp create mode 100644 src/cvcuda/priv/OpStack.cpp create mode 100644 src/cvcuda/priv/OpStack.hpp create mode 100644 src/cvcuda/priv/Types.hpp create mode 100644 src/cvcuda/priv/WorkspaceAllocator.hpp create mode 100644 src/cvcuda/priv/WorkspaceEstimator.hpp create mode 100644 src/cvcuda/priv/WorkspaceUtil.hpp delete mode 100644 src/cvcuda/priv/legacy/bnd_box.cu create mode 100644 src/nvcv_types/TensorBatch.cpp create mode 100644 src/nvcv_types/include/nvcv/Size.h create mode 100644 src/nvcv_types/include/nvcv/TensorBatch.h create mode 100644 src/nvcv_types/include/nvcv/TensorBatch.hpp create mode 100644 src/nvcv_types/include/nvcv/TensorBatchData.h create mode 100644 src/nvcv_types/include/nvcv/TensorBatchData.hpp create mode 100644 src/nvcv_types/include/nvcv/detail/Align.hpp create mode 100644 src/nvcv_types/include/nvcv/detail/TensorBatchImpl.hpp create mode 100644 src/nvcv_types/priv/ITensorBatch.hpp create mode 100644 src/nvcv_types/priv/TensorBatch.cpp create mode 100644 src/nvcv_types/priv/TensorBatch.hpp create mode 100644 src/nvcv_types/priv/TensorBatchManager.hpp create mode 100644 src/util/Event.cpp create mode 100644 src/util/Event.hpp create mode 100644 src/util/PerStreamCache.hpp create mode 100644 src/util/PerStreamCacheImpl.hpp create mode 100644 src/util/SimpleCache.hpp create mode 100644 src/util/Stream.cpp create mode 100644 src/util/Stream.hpp create mode 100644 src/util/StreamId.cpp create mode 100644 src/util/StreamId.hpp create mode 100644 src/util/UniqueHandle.hpp create mode 100644 tests/cvcuda/python/test_import_order.py create mode 100644 tests/cvcuda/python/test_opfindhomography.py create mode 100644 tests/cvcuda/python/test_oplabel.py create mode 100644 tests/cvcuda/python/test_opmatch.py create mode 100644 tests/cvcuda/python/test_opstack.py create mode 100644 tests/cvcuda/system/TestOpFindHomography.cpp create mode 100644 tests/cvcuda/system/TestOpLabel.cpp create mode 100644 tests/cvcuda/system/TestOpPairwiseMatcher.cpp create mode 100644 tests/cvcuda/system/TestOpStack.cpp create mode 100644 tests/cvcuda/unit/CMakeLists.txt create mode 100644 tests/cvcuda/unit/Definitions.hpp create mode 100644 tests/cvcuda/unit/TestWorkspaceAllocator.cpp create mode 100644 tests/cvcuda/unit/TestWorkspaceEstimator.cpp create mode 100644 tests/nvcv_types/python/test_import_order.py create mode 100644 tests/nvcv_types/python/test_tensor_batch.py create mode 100644 tests/nvcv_types/system/TestTensorBatch.cpp create mode 100644 tests/nvcv_types/unit/TestPerStreamCache.cpp create mode 100644 tests/nvcv_types/unit/TestSimpleCache.cpp create mode 100644 tests/nvcv_types/unit/TestStreamId.cpp diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000..2843346a --- /dev/null +++ 
b/.github/workflows/codeql.yml @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +name: "CodeQL" + +on: [push, pull_request] + +jobs: + analyze: + name: Analyze + runs-on: ${{ (matrix.language == 'c-cpp' && 'ubuntu-20.04') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'c-cpp', 'javascript-typescript', 'python' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + lfs: true + submodules: 'recursive' + + - if: matrix.language == 'c-cpp' + name: Setup environment + run: | + sudo apt update -y && sudo apt install -y --no-install-recommends \ + git git-lfs gcc-11 g++-11 ninja-build ccache libgtest-dev libgmock-dev \ + shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils + + - if: matrix.language == 'c-cpp' + name: Install Python Dependencies + run: | + python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 breathe exhale \ + recommonmark graphviz numpy==1.24.1 + + - if: matrix.language == 'c-cpp' + name: Install CUDA Toolkit + uses: Jimver/cuda-toolkit@v0.2.11 + id: cuda-toolkit + with: + cuda: '11.7.1' + linux-local-args: '["--toolkit"]' + + - if: matrix.language == 'c-cpp' + name: Verify CUDA installation + run: | + echo "Installed CUDA version is: ${{ steps.cuda-toolkit.outputs.cuda }}" + echo "CUDA install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}" + nvcc -V + + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - if: matrix.language != 'c-cpp' + name: Autobuild + uses: github/codeql-action/autobuild@v2 + + - if: matrix.language == 'c-cpp' + name: Build CMake project + run: | + echo "Running CMake project build script" + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_PYTHON=1" $* + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" diff --git a/.gitmodules b/.gitmodules index b09e10a1..9de0bf30 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,3 +22,6 @@ [submodule "3rdparty/dlpack"] path = 3rdparty/dlpack url = https://github.com/dmlc/dlpack.git +[submodule "3rdparty/nvbench"] + path = 3rdparty/nvbench + url = https://github.com/NVIDIA/nvbench.git diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 98933a1a..51e72f3c 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -38,3 +38,11 @@ set(DLPACK_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/dlpack" PARENT_SCOPE) # cuOSD ----------------------------- set(CUOSD_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cuOSD" PARENT_SCOPE) + +# NVBench -------------------------------- +set(NVBENCH_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/nvbench" 
PARENT_SCOPE) + +if(BUILD_BENCH) + set(NVBench_ENABLE_CUPTI off) + add_subdirectory(nvbench) +endif() diff --git a/3rdparty/nvbench b/3rdparty/nvbench new file mode 160000 index 00000000..75212298 --- /dev/null +++ b/3rdparty/nvbench @@ -0,0 +1 @@ +Subproject commit 75212298727e8f6e1df9215f2fcb47c8c721ffc9 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d2515cf..6256d837 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.20.1) # We need to check this variable before starting a CUDA project - otherwise it will appear # as set, with the default value pointing to the oldest supported architecture (52 as of CUDA 11.8) @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.4.0 + VERSION 0.5.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) @@ -48,6 +48,7 @@ endif() # Options to configure the build tree ======= option(BUILD_TESTS "Enable testsuite" OFF) option(BUILD_PYTHON "Build python bindings" OFF) +option(BUILD_BENCH "Build benchmark" OFF) option(ENABLE_SANITIZER "Enabled sanitized build" OFF) # Configure build tree ====================== @@ -85,6 +86,10 @@ if(BUILD_SAMPLES) add_subdirectory(samples) endif() +if(BUILD_BENCH) + add_subdirectory(bench) +endif() + # Must be done after build tree is defined include(ConfigCPack) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 31387c3c..89506b78 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ # Contributing to CV-CUDA -**As of release v0.4.0-beta, CV-CUDA is not accepting outside contribution.** +**As of release v0.5.0-beta, CV-CUDA is not accepting outside contribution.** Contributions to CV-CUDA fall into the following categories: @@ -12,7 +12,7 @@ Contributions to CV-CUDA fall into the following categories: 1. To propose a new feature, please file a new feature request [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). Describe the intended feature and discuss the design and implementation with the team and - community. NOTE: Currently, as of release v0.4.0-beta, CV-CUDA is not accepting + community. NOTE: Currently, as of release v0.5.0-beta, CV-CUDA is not accepting outside contribution. 1. To ask a general question, please sumbit a question [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). If you need diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 483b6e87..ae4f0923 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -24,7 +24,7 @@ CV-CUDA includes: | Advanced Color Format Conversions | Performs color conversion from interleaved RGB/BGR <-> YUV/YVU and semi planar. Supported standards: BT.601. BT.709. 
BT.2020 | | AverageBlur | Reduces image noise using an average filter | | BilateralFilter | Reduces image noise while preserving strong edges | -| Bounding Box | Draws an rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image | +| Bounding Box | Draws a rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image | | Box Blurring | Overlays a blurred rectangle using the X-Y coordinates and dimensions that define the location and size of an object in an image | | Brightness_Contrast | Adjusts brightness and contrast of an image | | CenterCrop | Crops an image at its center | @@ -38,6 +38,7 @@ CV-CUDA includes: | DataTypeConvert | Converts an image’s data type with optional scaling | | Erase | Erases image regions | | Find Contours | Extract closed contours from an input binary image | +| FindHomography | Calculates a perspective transform from four pairs of the corresponding points | | Flip | Flips a 2D image around its axis | | GammaContrast | Adjusts image contrast | | Gaussian | Applies a gaussian blur filter to the image | @@ -45,18 +46,20 @@ CV-CUDA includes: | Histogram | Provides a grayscale value distribution showing the frequency of occurrence of each gray value. | | Histogram Equalizer | Allows effective spreading out the intensity range of the image typically used to improve contrast | | Inpainting | Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood | -| Joint Bilateral Filter | Provides a edge-preserving denoising filter | +| Joint Bilateral Filter | Reduces image noise while preserving strong edges based on a guidance image | +| Label | Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels | | Laplacian | Applies a Laplace transform to an image | | MedianBlur | Reduces an image’s salt-and-pepper noise | | MinArea Rect | Finds the minimum area rotated rectangle typically used to draw bounding rectangle with minimum area | | MinMaxLoc | Finds the maximum and minimum values in a given array | | Morphology | Performs morphological erode and dilate transformations | -| Morphology (close) | Performs morphological operation that involves dilation followed by erosion on an image | -| Morphology (open) | Performs morphological operation that involves erosion followed by dilation on an image | +| Morphology (close) | Performs a morphological operation that involves dilation followed by erosion on an image | +| Morphology (open) | Performs a morphological operation that involves erosion followed by dilation on an image | | Non-max Suppression | Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection | | Normalize | Normalizes an image pixel’s range | -| OSD (Polyline Line Text Rotated Rect Segmented Mask) | Displays an overlay on the image of of different forms including polyline line text rotated rectangle segmented mask | +| OSD (Polyline Line Text Rotated Rect Segmented Mask) | Displays an overlay on the image of different forms including polyline line text rotated rectangle segmented mask | | PadStack | Stacks several images into a tensor with border extension | +| PairwiseMatcher | Matches features computed separately (e.g. 
via the SIFT operator) in two images using the brute force method | | PillowResize | Changes the size and scale of an image using python-pillow algorithm | | RandomResizedCrop | Crops a random portion of an image and resizes it to a specified size. | | Reformat | Converts a planar image into non-planar and vice versa | diff --git a/LICENSE.md b/LICENSE.md index 00ac932d..f0b0397a 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -74,7 +74,7 @@ END OF TERMS AND CONDITIONS To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index 155bcd8a..95dd9223 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,13 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.4.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.5.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2-gray) [![Cuda](https://img.shields.io/badge/CUDA-v11.7-%2376B900?logo=nvidia)](https://developer.nvidia.com/cuda-toolkit-archive) [![GCC](https://img.shields.io/badge/GCC-v11.0-yellow)](https://gcc.gnu.org/gcc-11/changes.html) -[![Python](https://img.shields.io/badge/python-v3.7_%7c_v3.8_%7c_v3.10-blue?logo=python)](https://www.python.org/) +[![Python](https://img.shields.io/badge/python-v3.7_%7c_v3.8_%7c_v3.9_%7c_v3.10-blue?logo=python)](https://www.python.org/) [![CMake](https://img.shields.io/badge/CMake-v3.22-%23008FBA?logo=cmake)](https://cmake.org/) CV-CUDA is an open-source project that enables building efficient cloud-scale @@ -18,7 +18,7 @@ efficient pre- and post-processing pipelines. CV-CUDA originated as a collaborative effort between [NVIDIA][NVIDIA Develop] and [ByteDance][ByteDance]. Refer to our [Developer Guide](DEVELOPER_GUIDE.md) for more information on the -operators available as of release v0.4.0-beta. +operators available as of release v0.5.0-beta. ## Getting Started @@ -45,170 +45,150 @@ packages. Choose the installation method that meets your environment needs. 
#### Tar File Installation ```shell -tar -xvf nvcv-lib-0.4.0-cuda11-x86_64-linux.tar.xz -tar -xvf nvcv-dev-0.4.0-cuda11-x86_64-linux.tar.xz +tar -xvf nvcv-lib-0.5.0-cuda11-x86_64-linux.tar.xz +tar -xvf nvcv-dev-0.5.0-cuda11-x86_64-linux.tar.xz ``` #### DEB File Installation ```shell -sudo apt-get install -y ./nvcv-lib-0.4.0-cuda11-x86_64-linux.deb ./nvcv-dev-0.4.0-cuda11-x86_64-linux.deb +sudo apt-get install -y ./nvcv-lib-0.5.0-cuda11-x86_64-linux.deb ./nvcv-dev-0.5.0-cuda11-x86_64-linux.deb ``` #### Python WHL File Installation ```shell -pip install nvcv_python-0.4.0-cp38-cp38-linux_x86_64.whl +pip install nvcv_python-0.5.0-cp38-cp38-linux_x86_64.whl ``` ### Build from Source -Follow these instruction to build CV-CUDA from source: +Building CV-CUDA from source allows for customization and is essential for contributing to the project. Here are detailed steps to guide you through the process: -1. Set up your local CV-CUDA repository +#### 1. Repository Setup - 1. Install prerequisites needed to setup up the repository. + Before you begin, ensure you have cloned the CV-CUDA repository to your local machine. Let's assume you've cloned it into `~/cvcuda`. - On Ubuntu 22.04, install the following packages: - - git-lfs: to retrieve binary files from remote repository + - **Initialize the Repository**: + After cloning, initialize the repository to configure it correctly. This setup is required only once. - ```shell - sudo apt-get install -y git git-lfs - ``` + ```shell + cd ~/cvcuda + ./init_repo.sh + ``` - 2. After cloning the repository (assuming it was cloned in `~/cvcuda`), - it needs to be properly configured by running the `init_repo.sh` script only once. +#### 2. Install Build Dependencies - ```shell - cd ~/cvcuda - ./init_repo.sh - ``` + CV-CUDA requires several dependencies to build from source. The following steps are based on Ubuntu 22.04, but similar packages can be found for other distributions. -1. Build CV-CUDA + - **Install Essential Packages**: + These include the compiler, build system, and necessary libraries. - 1. Install the dependencies required for building CV-CUDA + ```shell + sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev + ``` - On Ubuntu 22.04, install the following packages: - - g++-11: compiler to be used - - cmake, ninja-build (optional): manage build rules - - python3-dev: for python bindings - - libssl-dev: needed by the testsuite (MD5 hashing utilities) + - **CUDA Toolkit**: + The CUDA Toolkit is essential for GPU acceleration. Although any 11.x version is compatible, 11.7 is recommended. - ```shell - sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev - ``` + ```shell + sudo apt-get install -y cuda-minimal-build-11-7 + ``` - For CUDA Toolkit, any version of the 11.x series should work. - CV-CUDA was tested with 11.7, thus it should be preferred. +#### 3. Build Process - ```shell - sudo apt-get install -y cuda-minimal-build-11-7 - ``` + Once the dependencies are in place, you can proceed to build CV-CUDA. - 2. Build the project + - **Run Build Script**: + A build script is provided to simplify the compilation process. It creates a build tree and compiles the source code. - ```shell - ci/build.sh - ``` + ```shell + ci/build.sh + ``` - This will compile a x86 release build of CV-CUDA inside `build-rel` directory. - The library is in build-rel/lib, docs in build-rel/docs and executables - (tests, etc...) are in build-rel/bin. + This script creates a release build by default, placing output in `build-rel`. 
You can specify a debug build or a different output directory: - The script accepts some parameters to control the creation of the build tree: + ```shell + ci/build.sh [release|debug] [output build tree path] + ``` - ```shell - ci/build.sh [release|debug] [output build tree path] - ``` +#### 4. Build Documentation (Optional) - By default it builds for release. + If you need to build the documentation, additional dependencies are required: - If output build tree path isn't specified, it'll be `build-rel` for release - builds, and `build-deb` for debug. + - **Install Documentation Dependencies**: + These tools are used to generate and format the documentation. -1. Build Documentation + ```shell + sudo apt-get install -y doxygen graphviz python3 python3-pip + sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme + ``` - 1. Install the dependencies required for building the documentation + - **Generate Documentation**: + Use the provided script to build the documentation. - On Ubuntu 22.04, install the following packages: - - doxygen: parse header files for reference documentation - - python3, python3-pip: to install some python packages needed - - sphinx, breathe, exhale, recommonmark, graphiviz: to render the documentation - - sphinx-rtd-theme: documenation theme used + ```shell + ci/build_docs.sh [build folder] + ``` - ```shell - sudo apt-get install -y doxygen graphviz python3 python3-pip - sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme - ``` + For example: - 2. Build the documentation - ```shell - ci/build_docs.sh [build folder] - ``` + ```shell + ci/build_docs.sh build_docs + ``` - Example: - `ci/build_docs.sh build_docs` +#### 5. Build and Run Samples (Optional) -1. Build and run Samples + CV-CUDA comes with a variety of samples to demonstrate its capabilities. - 1. For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation. + - **See the Samples Documentation**: + Detailed instructions for building and running samples are available in the [Samples](samples/README.md) documentation. -1. Run Tests +#### 6. Running Tests - 1. Install the dependencies required for running the tests + To ensure everything is working as expected, you can run CV-CUDA's test suite. - On Ubuntu 22.04, install the following packages: - - python3, python3-pip: to run python bindings tests - - torch: dependencies needed by python bindings tests + - **Install Test Dependencies**: + These are necessary to run the Python binding tests. - ```shell - sudo apt-get install -y python3 python3-pip - sudo python3 -m pip install pytest torch - ``` + ```shell + sudo apt-get install -y python3 python3-pip + sudo python3 -m pip install pytest torch + ``` - 2. Run the tests + - **Execute Tests**: + Run the test scripts located in the build tree. - The tests are in `/bin`. You can run the script below to run all - tests at once. Here's an example when build tree is created in `build-rel` + ```shell + build-rel/bin/run_tests.sh + ``` - ```shell - build-rel/bin/run_tests.sh - ``` +#### 7. Packaging -1. Package installers + After a successful build, you can create installers using `cpack`. - Installers can be generated using the following cpack command once you have successfully built the project + - **Generate Installers**: + This step produces Debian packages and tarballs, suitable for distribution or installation on other systems. - ```shell - cd build-rel - cpack . 
- ``` + ```shell + cd build-rel + cpack . + ``` - This will generate in the build directory both Debian installers and tarballs - (\*.tar.xz), needed for integration in other distros. + For specific installer types: - For a fine-grained choice of what installers to generate, the full syntax is: + ```shell + cpack . -G [DEB|TXZ] + ``` - ```shell - cpack . -G [DEB|TXZ] - ``` - - - DEB for Debian packages - - TXZ for \*.tar.xz tarballs. - -## Tools - -1. CV-CUDA make operator tool - - This tool will create an noop operator; python bindings, and tests. - - This tool is located in 'tools/mkop'. To run it, navigate to the directory and execute the command './mkop.sh OperatorName', where 'OperatorName' is the desired name of the operator. + - `DEB` for Debian packages. + - `TXZ` for `.tar.xz` tarballs. ## Contributing CV-CUDA is an open source project. As part of the Open Source Community, we are committed to the cycle of learning, improving, and updating that makes this -community thrive. However, as of release v0.4.0-beta, CV-CUDA is not yet ready +community thrive. However, as of release v0.5.0-beta, CV-CUDA is not yet ready for external contributions. To understand the process for contributing the CV-CUDA, see our @@ -217,6 +197,48 @@ Source Community, and providing an environment that both supports and respects the efforts of all contributors, please read our [Code of Conduct](CODE_OF_CONDUCT.md). +### CV-CUDA Make Operator Tool + +The `mkop.sh` script is a powerful tool for creating a scaffold for new operators in the CV-CUDA library. It automates several tasks, ensuring consistency and saving time. + +#### Features of `mkop.sh`: + +1. **Operator Stub Creation**: Generates no-op (no-operation) operator templates, which serve as a starting point for implementing new functionalities. + +1. **File Customization**: Modifies template files to include the new operator's name, ensuring consistent naming conventions across the codebase. + +1. **CMake Integration**: Adds the new operator files to the appropriate CMakeLists, facilitating seamless compilation and integration into the build system. + +1. **Python Bindings**: Creates Python wrapper stubs for the new operator, allowing it to be used within Python environments. + +1. **Test Setup**: Generates test files for both C++ and Python, enabling immediate development of unit tests for the new operator. + +#### How to Use `mkop.sh`: + +Run the script with the desired operator name. The script assumes it's located in `/cvcuda/tools/mkop`. + + ```shell + ./mkop.sh [Operator Name] + ``` + +If the script is run from a different location, provide the path to the CV-CUDA root directory. + + ```shell + ./mkop.sh [Operator Name] [CV-CUDA root] + ``` + +**NOTE**: The first letter of the new operator name is captitalized where needed to match the rest of the file structures. + +#### Process Details: + +- **Initial Setup**: The script begins by validating the input and setting up necessary variables. It then capitalizes the first letter of the operator name to adhere to naming conventions. + +- **Template Modification**: It processes various template files (`Public.h`, `PrivateImpl.cpp`, etc.), replacing placeholders with the new operator name. This includes adjusting file headers, namespaces, and function signatures. + +- **CMake and Python Integration**: The script updates `CMakeLists.txt` files and Python module files to include the new operator, ensuring it's recognized by the build system and Python interface. 
+ +- **Testing Framework**: Finally, it sets up test files for both C++ and Python, allowing developers to immediately start writing tests for the new operator. + ## License CV-CUDA operates under the [Apache-2.0](LICENSE.md) license. diff --git a/SECURITY.md b/SECURITY.md index 695cf3fe..1bcc2896 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -18,3 +18,20 @@ To report a potential security vulnerability in any NVIDIA product: - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) - Please include the following information: - Product/Driver name and version/branch that contains the vulnerability + +## Code Static Analysis + +In our commitment to maintaining the highest standards of code quality and security, we have enabled GitHub's Code Static Analysis scanning on our repositories. Static Analysis is a powerful tool for analyzing the codebase for potential vulnerabilities. + +- Scope: CodeQL scanning is activated across all branches of this repository. +- Frequency: Scans are conducted regularly on new commits to ensure continuous integration and delivery are secure. +- Results Handling: Any identified vulnerabilities or code issues are reviewed and addressed promptly by our development team. +- Community Contribution: We welcome contributions to enhance our CodeQL queries. If you have suggestions or improvements, please submit a pull request or contact us via the outlined channels. + +## Secrets Scanning + +To further bolster our repository's security, we have implemented GitHub's secrets scanning feature. This feature helps detect and prevent accidental commits of sensitive information such as passwords, private keys, and API tokens. + +- Active Scanning: Secrets scanning is active on all branches of this repository. +- Alerts and Notifications: In the event that a potential secret is committed to the repository, an alert is generated. These alerts are reviewed and addressed swiftly by our security team. +- Prevention and Education: We continuously educate our contributors about best practices in handling secrets and sensitive data. We encourage the use of environment variables and secure vaults for managing secrets. diff --git a/bench/BenchAdaptiveThreshold.cpp b/bench/BenchAdaptiveThreshold.cpp new file mode 100644 index 00000000..658281fd --- /dev/null +++ b/bench/BenchAdaptiveThreshold.cpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void AdaptiveThreshold(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int blockSize = static_cast(state.get_int64("blockSize")); + + NVCVThresholdType threshType = NVCV_THRESH_BINARY; + NVCVAdaptiveThresholdType adaptType = NVCV_ADAPTIVE_THRESH_GAUSSIAN_C; + + double maxValue = 123.; + double c = -2.3; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::AdaptiveThreshold op(blockSize, shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &maxValue, &adaptType, &threshType, &blockSize, &c](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, maxValue, adaptType, threshType, blockSize, c); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor maxValueTensor({{shape.x}, "N"}, nvcv::TYPE_F64); + nvcv::Tensor blockSizeTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor cTensor({{shape.x}, "N"}, nvcv::TYPE_F64); + + benchutils::FillTensor(maxValueTensor, [&maxValue](const long4 &){ return maxValue; }); + benchutils::FillTensor(blockSizeTensor, [&blockSize](const long4 &){ return blockSize; }); + benchutils::FillTensor(cTensor, [&c](const long4 &){ return c; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &maxValueTensor, &adaptType, &threshType, &blockSizeTensor, &cTensor] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, maxValueTensor, adaptType, threshType, blockSizeTensor, cTensor); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using AdaptiveThresholdTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(AdaptiveThreshold, NVBENCH_TYPE_AXES(AdaptiveThresholdTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("blockSize", {7}); diff --git a/bench/BenchAdvCvtColor.cpp b/bench/BenchAdvCvtColor.cpp new file mode 100644 index 00000000..04459bdb --- /dev/null +++ b/bench/BenchAdvCvtColor.cpp @@ -0,0 +1,74 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void AdvCvtColor(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + NVCVColorConversionCode code = NVCV_COLOR_BGR2YUV; + nvcv::ColorSpec colorSpec = NVCV_COLOR_SPEC_BT2020; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::AdvCvtColor op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code, &colorSpec](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, code, colorSpec); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using AdvCvtColorTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(AdvCvtColor, NVBENCH_TYPE_AXES(AdvCvtColorTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchAverageBlur.cpp b/bench/BenchAverageBlur.cpp new file mode 100644 index 00000000..fbfc9c4c --- /dev/null +++ b/bench/BenchAverageBlur.cpp @@ -0,0 +1,93 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void AverageBlur(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int2 kernelSize = nvcv::cuda::StaticCast(benchutils::GetShape<2>(state.get_string("kernelSize"))); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + nvcv::Size2D kernelSize2d{kernelSize.x, kernelSize.y}; + int2 kernelAnchor{-1, -1}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::AverageBlur op(kernelSize2d, shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernelSize2d, &kernelAnchor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSize2d, kernelAnchor, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor kernelSizeTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor kernelAnchorTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + + benchutils::FillTensor(kernelSizeTensor, [&kernelSize](const long4 &){ return kernelSize; }); + benchutils::FillTensor(kernelAnchorTensor, [&kernelAnchor](const long4 &){ return kernelAnchor; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernelSizeTensor, &kernelAnchorTensor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSizeTensor, kernelAnchorTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using AverageBlurTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(AverageBlur, NVBENCH_TYPE_AXES(AverageBlurTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("kernelSize", {"7x7"}) + .add_string_axis("border", {"REPLICATE"}); diff --git a/bench/BenchBilateralFilter.cpp b/bench/BenchBilateralFilter.cpp new file mode 100644 index 00000000..73875d8e --- /dev/null +++ b/bench/BenchBilateralFilter.cpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void BilateralFilter(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int diameter = static_cast(state.get_int64("diameter")); + float sigmaSpace = static_cast(state.get_float64("sigmaSpace")); + float sigmaColor = -1.f; + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::BilateralFilter op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &diameter, &sigmaColor, &sigmaSpace, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, diameter, sigmaColor, sigmaSpace, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor diameterTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor sigmaSpaceTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor sigmaColorTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(diameterTensor, [&diameter](const long4 &){ return diameter; }); + benchutils::FillTensor(sigmaSpaceTensor, [&sigmaSpace](const long4 &){ return sigmaSpace; }); + benchutils::FillTensor(sigmaColorTensor, [&sigmaColor](const long4 &){ return sigmaColor; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &diameterTensor, &sigmaColorTensor, &sigmaSpaceTensor, &borderType] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, diameterTensor, sigmaColorTensor, sigmaSpaceTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using BilateralFilterTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(BilateralFilter, NVBENCH_TYPE_AXES(BilateralFilterTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("diameter", {-1}) + .add_float64_axis("sigmaSpace", {1.2}) + .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchBndBox.cpp b/bench/BenchBndBox.cpp new file mode 100644 index 00000000..9f714e0a --- /dev/null +++ b/bench/BenchBndBox.cpp @@ -0,0 +1,97 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
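The varShape axis follows one convention across these files: a negative value selects the uniform nvcv::Tensor path, while zero or a positive value selects ImageBatchVarShape with per-image sizes varied around the base width and height passed to FillImageBatch. The exact variation rule lives in BenchUtils.hpp (part of this patch, not shown here); the symmetric +/- window in the sketch below is an assumption for illustration only.

// Sketch of per-image size generation under an assumed +/- varShape window.
#include <cstdio>
#include <random>
#include <vector>

struct Size { long w, h; };

std::vector<Size> MakeVarShapeSizes(int numImages, Size base, long varShape, unsigned seed = 0)
{
    std::mt19937 rng(seed);
    std::uniform_int_distribution<long> jitter(-varShape, varShape);

    std::vector<Size> sizes;
    for (int i = 0; i < numImages; ++i)
    {
        sizes.push_back({base.w + jitter(rng), base.h + jitter(rng)});
    }
    return sizes;
}

int main()
{
    for (auto [w, h] : MakeVarShapeSizes(4, {1920, 1080}, 16))
    {
        std::printf("%ldx%ld\n", w, h);
    }
    return 0;
}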
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include <../priv/Types.hpp> +#include + +#include + +template +inline void BndBox(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numBoxes = static_cast(state.get_int64("numBoxes")); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + NVCVBndBoxI bndBox{ + {43, 21, 12, 34}, // box x, y position w, h size + 2, // box thickness + { 0, 0, 0, 255}, // box border color + { 0, 0, 0, 0} // box fill color + }; + + std::vector> bndBoxesVec; + + for (int i = 0; i < shape.x; i++) + { + std::vector curVec; + for (int j = 0; j < numBoxes; j++) + { + curVec.push_back(bndBox); + } + bndBoxesVec.push_back(curVec); + } + + std::shared_ptr bndBoxesImpl + = std::make_shared(bndBoxesVec); + NVCVBndBoxesI bndBoxes = (NVCVBndBoxesI)bndBoxesImpl.get(); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + shape.x * numBoxes * sizeof(NVCVBndBoxI)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::BndBox op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &bndBoxes](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, bndBoxes); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using BndBoxTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(BndBox, NVBENCH_TYPE_AXES(BndBoxTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("numBoxes", {10, 100}); diff --git a/bench/BenchBoxBlur.cpp b/bench/BenchBoxBlur.cpp new file mode 100644 index 00000000..031d74f9 --- /dev/null +++ b/bench/BenchBoxBlur.cpp @@ -0,0 +1,97 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include <../priv/Types.hpp> +#include + +#include + +template +inline void BoxBlur(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numBoxes = static_cast(state.get_int64("numBoxes")); + int kernelSize = static_cast(state.get_int64("kernelSize")); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + NVCVBlurBoxI blurBox{ + {43, 21, 12, 34}, // box x, y position w, h size + kernelSize // median filter kernel size + }; + + std::vector> blurBoxesVec; + + for (int i = 0; i < shape.x; i++) + { + std::vector curVec; + for (int j = 0; j < numBoxes; j++) + { + curVec.push_back(blurBox); + } + blurBoxesVec.push_back(curVec); + } + + std::shared_ptr blurBoxesImpl + = std::make_shared(blurBoxesVec); + NVCVBlurBoxesI blurBoxes = (NVCVBlurBoxesI)blurBoxesImpl.get(); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + shape.x * numBoxes * sizeof(NVCVBlurBoxI)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::BoxBlur op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &blurBoxes](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, blurBoxes); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using BoxBlurTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(BoxBlur, NVBENCH_TYPE_AXES(BoxBlurTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("numBoxes", {4}) + .add_int64_axis("kernelSize", {5}); diff --git a/bench/BenchBrightnessContrast.cpp b/bench/BenchBrightnessContrast.cpp new file mode 100644 index 00000000..8e741169 --- /dev/null +++ b/bench/BenchBrightnessContrast.cpp @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void BrightnessContrast(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + shape.x * sizeof(float) * 4); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::BrightnessContrast op; + + // clang-format off + + nvcv::Tensor brightness({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor contrast({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor brightnessShift({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor contrastCenter({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(brightness, benchutils::RandomValues(0.f, 1.f)); + benchutils::FillTensor(contrast, benchutils::RandomValues()); + benchutils::FillTensor(brightnessShift, benchutils::RandomValues()); + benchutils::FillTensor(contrastCenter, benchutils::RandomValues()); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &brightness, &contrast, &brightnessShift, &contrastCenter] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, brightness, contrast, brightnessShift, contrastCenter); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &brightness, &contrast, &brightnessShift, &contrastCenter] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, brightness, contrast, brightnessShift, contrastCenter); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using BrightnessContrastTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(BrightnessContrast, NVBENCH_TYPE_AXES(BrightnessContrastTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchCenterCrop.cpp b/bench/BenchCenterCrop.cpp new file mode 100644 index 00000000..3ebe7d13 --- /dev/null +++ b/bench/BenchCenterCrop.cpp @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CenterCrop(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + nvcv::Size2D cropSize; + + if (state.get_string("cropType") == "SAME") + { + cropSize = nvcv::Size2D{(int)srcShape.z, (int)srcShape.y}; + } + else if (state.get_string("cropType") == "QUARTER") + { + cropSize = nvcv::Size2D{(int)srcShape.z / 2, (int)srcShape.y / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + long3 dstShape{srcShape.x, cropSize.h, cropSize.w}; + + state.add_global_memory_reads(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::CenterCrop op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &cropSize](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, cropSize); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CenterCropTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CenterCrop, NVBENCH_TYPE_AXES(CenterCropTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("cropType", {"QUARTER"}); diff --git a/bench/BenchChannelReorder.cpp b/bench/BenchChannelReorder.cpp new file mode 100644 index 00000000..fed1a600 --- /dev/null +++ b/bench/BenchChannelReorder.cpp @@ -0,0 +1,73 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
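The CenterCrop benchmark only passes a crop size; the region it implies is centred in the source, so its origin is half the size difference. A small sketch of that arithmetic, using the "QUARTER" axis value (half width, half height of 1080p) as the example; the origin formula is the usual center-crop convention, written out for illustration.

// Sketch: ROI implied by a center crop of cropW x cropH from a srcW x srcH image.
#include <cstdio>

struct Rect { int x, y, w, h; };

constexpr Rect CenterCropRect(int srcW, int srcH, int cropW, int cropH)
{
    return {(srcW - cropW) / 2, (srcH - cropH) / 2, cropW, cropH};
}

int main()
{
    Rect r = CenterCropRect(1920, 1080, 1920 / 2, 1080 / 2);
    std::printf("x=%d y=%d w=%d h=%d\n", r.x, r.y, r.w, r.h); // x=480 y=270 w=960 h=540
    return 0;
}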
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void ChannelReorder(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::ChannelReorder op; + + // clang-format off + + nvcv::Tensor orders({{shape.x, 4}, "NC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(orders, benchutils::RandomValues(0, nvcv::cuda::NumElements)); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &orders](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, orders); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ChannelReorderTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(ChannelReorder, NVBENCH_TYPE_AXES(ChannelReorderTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}); diff --git a/bench/BenchColorTwist.cpp b/bench/BenchColorTwist.cpp new file mode 100644 index 00000000..67e90af8 --- /dev/null +++ b/bench/BenchColorTwist.cpp @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void ColorTwist(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::ColorTwist op; + + // clang-format off + + nvcv::Tensor twist({{shape.x, 3}, "NH"}, nvcv::TYPE_4F32); + + benchutils::FillTensor(twist, benchutils::RandomValues()); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &twist](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, twist); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &twist](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, twist); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ColorTwistTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(ColorTwist, NVBENCH_TYPE_AXES(ColorTwistTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchComposite.cpp b/bench/BenchComposite.cpp new file mode 100644 index 00000000..2293ecab --- /dev/null +++ b/bench/BenchComposite.cpp @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
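The ColorTwist benchmark fills an N x 3 tensor of float4 values, i.e. one 3x4 matrix per sample: a 3x3 linear part applied to the colour channels plus an additive offset in the fourth column. The exact channel ordering is the operator's convention; the sketch below shows the usual affine colour-twist arithmetic on a single pixel, for illustration only.

// Sketch: applying a 3x4 affine colour-twist matrix to one 3-channel pixel.
#include <array>
#include <cstdio>

using Mat3x4 = std::array<std::array<float, 4>, 3>;

std::array<float, 3> ApplyTwist(const Mat3x4 &m, const std::array<float, 3> &in)
{
    std::array<float, 3> out{};
    for (int r = 0; r < 3; ++r)
    {
        out[r] = m[r][0] * in[0] + m[r][1] * in[1] + m[r][2] * in[2] + m[r][3];
    }
    return out;
}

int main()
{
    // Identity linear part with a +10 offset on every channel.
    Mat3x4 m = {{{1.f, 0.f, 0.f, 10.f},
                 {0.f, 1.f, 0.f, 10.f},
                 {0.f, 0.f, 1.f, 10.f}}};

    auto out = ApplyTwist(m, {100.f, 50.f, 25.f});
    std::printf("%g %g %g\n", out[0], out[1], out[2]); // 110 60 35
    return 0;
}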
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Composite(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) * 2 + sizeof(M))); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Composite op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor fg({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor bg({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor mask({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(fg, benchutils::RandomValues()); + benchutils::FillTensor(bg, benchutils::RandomValues()); + benchutils::FillTensor(mask, [](const long4 &){ return 1; }); + + state.exec(nvbench::exec_tag::sync, [&op, &fg, &bg, &mask, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), fg, bg, mask, dst); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape fg(shape.x); + nvcv::ImageBatchVarShape bg(shape.x); + nvcv::ImageBatchVarShape mask(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(fg, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + bg.pushBack(fg.begin(), fg.end()); + dst.pushBack(fg.begin(), fg.end()); + + benchutils::FillImageBatch(mask, long2{shape.z, shape.y}, long2{varShape, varShape}, + [](const long4 &){ return 1; }); + + state.exec(nvbench::exec_tag::sync, [&op, &fg, &bg, &mask, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), fg, bg, mask, dst); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CompositeTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Composite, NVBENCH_TYPE_AXES(CompositeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchConv2D.cpp b/bench/BenchConv2D.cpp new file mode 100644 index 00000000..ba88b921 --- /dev/null +++ b/bench/BenchConv2D.cpp @@ -0,0 +1,83 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Conv2D(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int2 kernelSize = nvcv::cuda::StaticCast(benchutils::GetShape<2>(state.get_string("kernelSize"))); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Conv2D op; + + // clang-format off + + nvcv::Tensor kernelAnchor({{shape.x}, "N"}, nvcv::TYPE_2S32); + + benchutils::FillTensor(kernelAnchor, [](const long4 &){ return int2{-1, -1}; }); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + nvcv::ImageBatchVarShape kernel(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + benchutils::FillImageBatch(kernel, long2{kernelSize.x, kernelSize.y}, long2{0, 0}, + benchutils::RandomValues(0.f, 1.f)); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernel, &kernelAnchor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernel, kernelAnchor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using Conv2DTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Conv2D, NVBENCH_TYPE_AXES(Conv2DTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}) + .add_string_axis("kernelSize", {"7x7"}) + .add_string_axis("border", {"REPLICATE"}); diff --git a/bench/BenchConvertTo.cpp b/bench/BenchConvertTo.cpp new file mode 100644 index 00000000..2bbd74a7 --- /dev/null +++ b/bench/BenchConvertTo.cpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void ConvertTo(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + double alpha = 0.123; + double beta = 0.456; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::ConvertTo op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &alpha, &beta](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, alpha, beta); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ConvertToTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(ConvertTo, NVBENCH_TYPE_AXES(ConvertToTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchCopyMakeBorder.cpp b/bench/BenchCopyMakeBorder.cpp new file mode 100644 index 00000000..722c37d0 --- /dev/null +++ b/bench/BenchCopyMakeBorder.cpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
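ConvertTo is a per-element scale-and-shift: with the alpha and beta used above, each output value is the saturated result of alpha * src + beta. The saturating, rounding cast in the sketch below is the usual convention for this kind of operator and is shown here for a uint8_t destination as an assumption, not a statement of the exact device implementation.

// Sketch: dst = saturate(round(alpha * src + beta)) for a uint8_t destination.
#include <algorithm>
#include <cstdint>
#include <cstdio>

inline uint8_t ConvertToU8(float src, double alpha, double beta)
{
    double v = alpha * src + beta;
    return static_cast<uint8_t>(std::clamp(v, 0.0, 255.0) + 0.5); // round half up, then truncate
}

int main()
{
    std::printf("%d\n", ConvertToU8(200.f, 0.123, 0.456)); // 25
    return 0;
}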
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CopyMakeBorder(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + float4 borderValue{0.f, 0.f, 0.f, 0.f}; + + int top = srcShape.y / 2; + int left = srcShape.z / 2; + + long3 dstShape{srcShape.x, top + srcShape.y, left + srcShape.z}; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::CopyMakeBorder op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &top, &left, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, top, left, borderType, borderValue); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + nvcv::Tensor topTensor({{srcShape.x, 1, 1, 1}, "NHWC"}, nvcv::TYPE_S32); + nvcv::Tensor leftTensor({{srcShape.x, 1, 1, 1}, "NHWC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(topTensor, [&top](const long4 &){ return top; }); + benchutils::FillTensor(leftTensor, [&left](const long4 &){ return left; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &topTensor, &leftTensor, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, topTensor, leftTensor, borderType, borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CopyMakeBorderTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CopyMakeBorder, NVBENCH_TYPE_AXES(CopyMakeBorderTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchCropFlipNormalizeReformat.cpp b/bench/BenchCropFlipNormalizeReformat.cpp new file mode 100644 index 00000000..32f82737 --- /dev/null +++ b/bench/BenchCropFlipNormalizeReformat.cpp @@ -0,0 +1,116 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CropFlipNormalizeReformat(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long3 dstShape = srcShape; + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + float borderValue{0.f}; + + float globalScale = 1.234f; + float globalShift = 2.345f; + float epsilon = 12.34f; + uint32_t flags = 0; + + long3 baseShape{srcShape.x, 1, 1}; + long3 scaleShape{srcShape.x, 1, 1}; + long3 cropShape{srcShape.x, 1, 1}; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T) + + baseShape.x * baseShape.y * baseShape.z * sizeof(float) + + scaleShape.x * scaleShape.y * scaleShape.z * sizeof(float) + + cropShape.x * cropShape.y * cropShape.z * sizeof(int) * 4); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::CropFlipNormalizeReformat op; + + // clang-format off + + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + nvcv::Tensor flipCode({{srcShape.x}, "N"}, nvcv::TYPE_S32); + + nvcv::Tensor base({{baseShape.x, baseShape.y, baseShape.z, 1}, "NHWC"}, nvcv::TYPE_F32); + nvcv::Tensor scale({{scaleShape.x, scaleShape.y, scaleShape.z, 1}, "NHWC"}, nvcv::TYPE_F32); + + nvcv::Tensor crop({{cropShape.x, cropShape.y, cropShape.z, 4}, "NHWC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(flipCode, [](const long4 &){ return -1; }); + + benchutils::FillTensor(base, benchutils::RandomValues()); + benchutils::FillTensor(scale, benchutils::RandomValues(0.f, 1.f)); + + // Always crop entire source image for easy bandwidth calculations + benchutils::FillTensor(crop, [&srcShape](const long4 &c) + { + if (c.w == 2) + { + return (int)srcShape.z; + } + else if (c.w == 3) + { + return (int)srcShape.y; + } + return 0; + }); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &crop, &borderType, &borderValue, &flipCode, &base, &scale, &globalScale, + &globalShift, &epsilon, &flags](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, crop, borderType, borderValue, flipCode, base, scale, globalScale, + globalShift, epsilon, flags); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CropFlipNormalizeReformatTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CropFlipNormalizeReformat, NVBENCH_TYPE_AXES(CropFlipNormalizeReformatTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}) + .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchCustomCrop.cpp b/bench/BenchCustomCrop.cpp new file mode 100644 index 00000000..07478a39 --- /dev/null +++ b/bench/BenchCustomCrop.cpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CustomCrop(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + // Always crop entire source image for easy bandwidth calculations + NVCVRectI cropRect{0, 0, (int)shape.z, (int)shape.y}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::CustomCrop op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &cropRect](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, cropRect); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CustomCropTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CustomCrop, NVBENCH_TYPE_AXES(CustomCropTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchCvtColor.cpp b/bench/BenchCvtColor.cpp new file mode 100644 index 00000000..05469e0f --- /dev/null +++ b/bench/BenchCvtColor.cpp @@ -0,0 +1,83 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CvtColor(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + NVCVColorConversionCode code = ch == 3 ? 
NVCV_COLOR_BGR2RGB : NVCV_COLOR_BGRA2RGBA; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::CvtColor op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, code); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, code); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CvtColorTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(CvtColorTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchErase.cpp b/bench/BenchErase.cpp new file mode 100644 index 00000000..68419ad9 --- /dev/null +++ b/bench/BenchErase.cpp @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
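The BGR2RGB and BGRA2RGBA codes used by the CvtColor benchmark are pure channel permutations: the first and third channels swap and any alpha channel passes through unchanged, which is why the read and write byte counts are identical. A tiny standalone sketch of that permutation:

// Sketch: BGRA -> RGBA channel swap on a single pixel.
#include <cstdint>
#include <cstdio>

struct BGRA { uint8_t b, g, r, a; };
struct RGBA { uint8_t r, g, b, a; };

constexpr RGBA ToRGBA(BGRA p)
{
    return {p.r, p.g, p.b, p.a};
}

int main()
{
    RGBA q = ToRGBA({10, 20, 30, 255});
    std::printf("%d %d %d %d\n", q.r, q.g, q.b, q.a); // 30 20 10 255
    return 0;
}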
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Erase(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numErase = static_cast(state.get_int64("numErase")); + + bool random = true; + int seed = 0; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + + shape.x * (sizeof(int2) + sizeof(int3) + sizeof(float) + sizeof(int))); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Erase op(numErase); + + // clang-format off + + nvcv::Tensor anchor({{shape.x}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor erasing({{shape.x}, "N"}, nvcv::TYPE_3S32); + nvcv::Tensor values({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor imgIdx({{shape.x}, "N"}, nvcv::TYPE_S32); + + benchutils::FillTensor(anchor, [](const long4 &){ return int2{0, 0}; }); + benchutils::FillTensor(erasing, [](const long4 &){ return int3{10, 10, 1}; }); + benchutils::FillTensor(values, [](const long4 &){ return 1.f; }); + benchutils::FillTensor(imgIdx, [](const long4 &){ return 0; }); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &anchor, &erasing, &values, &imgIdx, &random, &seed](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, anchor, erasing, values, imgIdx, random, seed); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &anchor, &erasing, &values, &imgIdx, &random, &seed](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, anchor, erasing, values, imgIdx, random, seed); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using EraseTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Erase, NVBENCH_TYPE_AXES(EraseTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}) + .add_int64_axis("numErase", {3}); diff --git a/bench/BenchFindContours.cpp b/bench/BenchFindContours.cpp new file mode 100644 index 00000000..2beb2747 --- /dev/null +++ b/bench/BenchFindContours.cpp @@ -0,0 +1,126 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include +#include +#include + +#include + +using CPUImage = std::vector; + +static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor = {0, 0}, + nvcv::Size2D size = {5, 5}, double angle = 0.0, bool fill = true, uint8_t setValue = 1); + +static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor, nvcv::Size2D size, + double angle, bool fill, uint8_t setValue) +{ + auto rad = angle * (M_PI / 180.0); + auto cosAngle = std::cos(rad); + auto sinAngle = std::sin(rad); + + auto transformed = anchor; + for (auto y = 0; y < size.h; ++y) + { + for (auto x = 0; x < size.w; ++x) + { + transformed.w = anchor.w + (x * cosAngle - y * sinAngle); + transformed.h = anchor.h + (x * sinAngle + y * cosAngle); + + if (fill || y == 0 || y == size.h - 1 || x == 0 || x == size.w - 1) + { + if (transformed.w >= 0 && transformed.w < boundary.w && transformed.h >= 0 + && transformed.h < boundary.h) + { + image[transformed.h * boundary.w + transformed.w] = setValue; + } + } + } + } +} + +template +inline void FindContours(nvbench::state &state, nvbench::type_list) +try +{ + srand(0U); // Use a fixed random seed + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numPoints = static_cast(state.get_int64("numPoints")); + + // R/W bandwidth rationale: + // Read image + connected components (S32) + // Write points + contours (U32) + state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) + sizeof(int))); + state.add_global_memory_writes(shape.x * numPoints * sizeof(int) * 2 + shape.x * 4 * sizeof(int)); + + cvcuda::FindContours op(nvcv::Size2D{(int)shape.z, (int)shape.y}, shape.x); + + // clang-format off + + nvcv::Tensor points({{shape.x, numPoints, 2}, "NCW"}, nvcv::TYPE_S32); + nvcv::Tensor counts({{shape.x, 4}, "NW"}, nvcv::TYPE_S32); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + auto inData = src.exportData(); + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + + //Generate input + CPUImage srcVec(shape.y * shape.z, 0); + for (auto i = 0; i < 10; ++i) + { + auto anchorX = rand() % shape.z; + auto anchorY = rand() % shape.y; + auto sizeX = rand() % (shape.z - anchorX); + auto sizeY = rand() % (shape.y - anchorY); + generateRectangle(srcVec, {anchorX, anchorY}, {sizeX, sizeY}); + } + + for (auto i = 0; i < shape.x; ++i) + { + CUDA_CHECK_ERROR(cudaMemcpy2D(inAccess->sampleData(i), inAccess->rowStride(), srcVec.data(), shape.z, shape.z, + shape.y, cudaMemcpyHostToDevice)); + } + + state.exec(nvbench::exec_tag::sync, [&op, &src, &points, &counts](nvbench::launch &launch) + { + op(launch.get_stream(), src, points, counts); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using FindContoursTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(FindContours, NVBENCH_TYPE_AXES(FindContoursTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("numPoints", {1024}); diff --git a/bench/BenchFlip.cpp b/bench/BenchFlip.cpp new file mode 100644 index 00000000..620eac7f --- /dev/null +++ b/bench/BenchFlip.cpp @@ 
-0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Flip(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int flipCode; + + if (state.get_string("flipType") == "HORIZONTAL") + { + flipCode = 0; + } + else if (state.get_string("flipType") == "VERTICAL") + { + flipCode = 1; + } + else if (state.get_string("flipType") == "BOTH") + { + flipCode = -1; + } + else + { + throw std::invalid_argument("Invalid flipType = " + state.get_string("flipType")); + } + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Flip op; + + // clang-format off + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &flipCode](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, flipCode); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor flipCodeTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + + benchutils::FillTensor(flipCodeTensor, [&flipCode](const long4 &){ return flipCode; }); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &flipCodeTensor](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, flipCodeTensor); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using FlipTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Flip, NVBENCH_TYPE_AXES(FlipTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("flipType", {"BOTH"}); diff --git a/bench/BenchGammaContrast.cpp b/bench/BenchGammaContrast.cpp new file mode 100644 index 00000000..e1e16958 --- /dev/null +++ b/bench/BenchGammaContrast.cpp @@ -0,0 +1,75 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
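The Flip benchmark's flipType axis maps HORIZONTAL to 0, VERTICAL to 1 and BOTH to -1; which numeric code the operator interprets as which mirroring is the operator's convention. The sketch below only shows the two elementary index mirrorings such a flip composes, for illustration.

// Sketch: the two elementary mirrorings a flip composes.
#include <cstdio>

constexpr int MirrorX(int x, int width)  { return width - 1 - x; }
constexpr int MirrorY(int y, int height) { return height - 1 - y; }

int main()
{
    // Mirroring both axes sends the top-left pixel to the bottom-right.
    std::printf("%d %d\n", MirrorX(0, 1920), MirrorY(0, 1080)); // 1919 1079
    return 0;
}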
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void GammaContrast(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + int ch = nvcv::cuda::NumElements; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::GammaContrast op(shape.x, ch); + + // clang-format off + + nvcv::Tensor gamma({{shape.x * ch}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(gamma, benchutils::RandomValues(.5f, 1.f)); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &gamma](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, gamma); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using GammaContrastTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(GammaContrast, NVBENCH_TYPE_AXES(GammaContrastTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}); diff --git a/bench/BenchGaussian.cpp b/bench/BenchGaussian.cpp new file mode 100644 index 00000000..8b4fc30d --- /dev/null +++ b/bench/BenchGaussian.cpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Gaussian(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + double sigma = state.get_float64("sigma"); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + int kernelSize = (int)std::round(sigma * (std::is_same_v, uint8_t> ? 
3 : 4) * 2 + 1) | 1; + int2 ksize2{kernelSize, kernelSize}; + + nvcv::Size2D kernelSize2{kernelSize, kernelSize}; + double2 sigma2{sigma, sigma}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Gaussian op(kernelSize2, shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernelSize2, &sigma2, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSize2, sigma2, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor kernelSizeTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor sigmaTensor({{shape.x}, "N"}, nvcv::TYPE_2F64); + + benchutils::FillTensor(kernelSizeTensor, [&ksize2](const long4 &){ return ksize2; }); + benchutils::FillTensor(sigmaTensor, [&sigma2](const long4 &){ return sigma2; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernelSizeTensor, &sigmaTensor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSizeTensor, sigmaTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using GaussianTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Gaussian, NVBENCH_TYPE_AXES(GaussianTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_float64_axis("sigma", {1.2}) + .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchGaussianNoise.cpp b/bench/BenchGaussianNoise.cpp new file mode 100644 index 00000000..68633a90 --- /dev/null +++ b/bench/BenchGaussianNoise.cpp @@ -0,0 +1,87 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
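The Gaussian benchmark derives the kernel size from sigma using a 3-sigma radius for 8-bit data and a 4-sigma radius otherwise, with a final "| 1" to force an odd size. A standalone sketch of exactly that formula, with the default sigma = 1.2 axis value as the example:

// Sketch: kernel size from sigma, as computed in BenchGaussian.cpp.
#include <cmath>
#include <cstdio>

inline int GaussianKernelSize(double sigma, bool isU8)
{
    return static_cast<int>(std::round(sigma * (isU8 ? 3 : 4) * 2 + 1)) | 1;
}

int main()
{
    std::printf("%d\n", GaussianKernelSize(1.2, true));  // 9
    std::printf("%d\n", GaussianKernelSize(1.2, false)); // 11
    return 0;
}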
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void GaussianNoise(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + bool perCh = nvcv::cuda::NumElements > 1; + + unsigned long long int seed = 12345; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::GaussianNoise op(shape.x); + + // clang-format off + + nvcv::Tensor mu({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor sigma({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(mu, benchutils::RandomValues(.0f, 1.f)); + benchutils::FillTensor(sigma, benchutils::RandomValues(.05f, .1f)); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &mu, &sigma, &perCh, &seed](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, mu, sigma, perCh, seed); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &mu, &sigma, &perCh, &seed](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, mu, sigma, perCh, seed); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using GaussianNoiseTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(GaussianNoise, NVBENCH_TYPE_AXES(GaussianNoiseTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchHistogram.cpp b/bench/BenchHistogram.cpp new file mode 100644 index 00000000..73f00cda --- /dev/null +++ b/bench/BenchHistogram.cpp @@ -0,0 +1,71 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Histogram(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + int numBins = 256; + nvcv::Tensor mask{nullptr}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(numBins * sizeof(int)); + + cvcuda::Histogram op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor hist({{shape.x, numBins, 1}, "HWC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(src, benchutils::RandomValues()); + benchutils::FillTensor(hist, [](const long4 &){ return 0; }); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &mask, &hist](nvbench::launch &launch) + { + op(launch.get_stream(), src, mask, hist); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using HistogramTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Histogram, NVBENCH_TYPE_AXES(HistogramTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchHistogramEq.cpp b/bench/BenchHistogramEq.cpp new file mode 100644 index 00000000..54082d55 --- /dev/null +++ b/bench/BenchHistogramEq.cpp @@ -0,0 +1,77 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void HistogramEq(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::HistogramEq op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using HistogramEqTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(HistogramEq, NVBENCH_TYPE_AXES(HistogramEqTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchInpaint.cpp b/bench/BenchInpaint.cpp new file mode 100644 index 00000000..88a237b3 --- /dev/null +++ b/bench/BenchInpaint.cpp @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Inpaint(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + double inpaintRadius = 5.0; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) + sizeof(uint8_t))); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Inpaint op(shape.x, nvcv::Size2D{(int)shape.z, (int)shape.y}); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor mask({{shape.x, shape.y, shape.z, 1}, "NHWC"}, nvcv::TYPE_U8); + + benchutils::FillTensor(src, benchutils::RandomValues()); + benchutils::FillTensor(mask, benchutils::RandomValues(0, 1)); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &mask, &dst, &inpaintRadius](nvbench::launch &launch) + { + op(launch.get_stream(), src, mask, dst, inpaintRadius); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + nvcv::ImageBatchVarShape mask(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + benchutils::FillImageBatch(mask, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues(0, 1)); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &mask, &dst, &inpaintRadius](nvbench::launch &launch) + { + op(launch.get_stream(), src, mask, dst, inpaintRadius); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using InpaintTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Inpaint, NVBENCH_TYPE_AXES(InpaintTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchJointBilateralFilter.cpp b/bench/BenchJointBilateralFilter.cpp new file mode 100644 index 00000000..45c325bd --- /dev/null +++ b/bench/BenchJointBilateralFilter.cpp @@ -0,0 +1,100 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void JointBilateralFilter(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int diameter = static_cast(state.get_int64("diameter")); + float sigmaSpace = static_cast(state.get_float64("sigmaSpace")); + float sigmaColor = -1.f; + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::JointBilateralFilter op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor color({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + benchutils::FillTensor(color, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &color, &dst, &diameter, &sigmaColor, &sigmaSpace, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, color, dst, diameter, sigmaColor, sigmaSpace, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape color(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + color.pushBack(src.begin(), src.end()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor diameterTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor sigmaSpaceTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor sigmaColorTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(diameterTensor, [&diameter](const long4 &){ return diameter; }); + benchutils::FillTensor(sigmaSpaceTensor, [&sigmaSpace](const long4 &){ return sigmaSpace; }); + benchutils::FillTensor(sigmaColorTensor, [&sigmaColor](const long4 &){ return sigmaColor; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &color, &dst, &diameterTensor, &sigmaColorTensor, &sigmaSpaceTensor, &borderType] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, color, dst, diameterTensor, sigmaColorTensor, sigmaSpaceTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using JointBilateralFilterTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(JointBilateralFilter, NVBENCH_TYPE_AXES(JointBilateralFilterTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("diameter", {-1}) + .add_float64_axis("sigmaSpace", {1.2}) + .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchLabel.cpp b/bench/BenchLabel.cpp new file mode 100644 index 00000000..41005379 --- /dev/null +++ b/bench/BenchLabel.cpp @@ -0,0 +1,108 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Label(nvbench::state &state, nvbench::type_list) +try +{ + using DT = uint32_t; + + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long3 dstShape = srcShape; + + std::string runChoice = state.get_string("runChoice"); + + // Use [BG][MIN][MAX][ISLAND][COUNT][STAT] in runChoice to run Label with: + // background; minThreshold; maxThreshold; island removal; count; statistics + + long3 staShape{srcShape.x, 10000, 6}; // using fixed 10K max. cap. and 2D problem + + NVCVConnectivityType conn = NVCV_CONNECTIVITY_4_2D; + NVCVLabelType alab = NVCV_LABEL_FAST; + + nvcv::Tensor bgT, minT, maxT, countT, statsT, mszT; + + cvcuda::Label op; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(ST)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(DT)); + + // clang-format off + + if (runChoice.find("BG") != std::string::npos) + { + bgT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType()); + + benchutils::FillTensor(bgT, benchutils::RandomValues()); + } + if (runChoice.find("MIN") != std::string::npos) + { + minT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType()); + + benchutils::FillTensor(minT, benchutils::RandomValues()); + } + if (runChoice.find("MAX") != std::string::npos) + { + maxT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType()); + + benchutils::FillTensor(maxT, benchutils::RandomValues()); + } + if (runChoice.find("ISLAND") != std::string::npos) + { + mszT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType
<DT>()); + + benchutils::FillTensor<DT>(mszT, benchutils::RandomValues<DT>()); + } + if (runChoice.find("COUNT") != std::string::npos) + { + countT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType<DT>()); + } + if (runChoice.find("STAT") != std::string::npos) + { + statsT = nvcv::Tensor({{staShape.x, staShape.y, staShape.z}, "NMA"}, benchutils::GetDataType<DT>()); + } + + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType<ST>()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType<DT>
()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &bgT, &minT, &maxT, &mszT, &countT, &statsT, &conn, &alab](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, bgT, minT, maxT, mszT, countT, statsT, conn, alab); + }); +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using LabelTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Label, NVBENCH_TYPE_AXES(LabelTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_string_axis("runChoice", {""}); diff --git a/bench/BenchLaplacian.cpp b/bench/BenchLaplacian.cpp new file mode 100644 index 00000000..e685198e --- /dev/null +++ b/bench/BenchLaplacian.cpp @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Laplacian(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int ksize = static_cast(state.get_int64("ksize")); + float scale = static_cast(state.get_float64("scale")); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Laplacian op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &ksize, &scale, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, ksize, scale, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor ksizeTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor scaleTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(ksizeTensor, [&ksize](const long4 &){ return ksize; }); + benchutils::FillTensor(scaleTensor, [&scale](const long4 &){ return scale; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &ksizeTensor, &scaleTensor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, ksizeTensor, scaleTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + 
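+// A minimal usage sketch for the registration below, assuming the stock NVBench
+// command-line interface linked into these benchmarks; the binary name is a placeholder
+// for whatever the bench CMake target actually produces.
+//
+//   ./cvcuda_bench -b Laplacian -a "shape=[1x2160x3840]" -a "varShape=[0]" -a "ksize=[3]"
+//
+// Each add_*_axis call contributes one dimension to the benchmark sweep; -a/--axis
+// overrides the default values registered here, and -b/--benchmark selects one entry.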
+// clang-format on + +using LaplacianTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Laplacian, NVBENCH_TYPE_AXES(LaplacianTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("ksize", {1}) + .add_float64_axis("scale", {1.0}) + .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchMedianBlur.cpp b/bench/BenchMedianBlur.cpp new file mode 100644 index 00000000..45b2c1a6 --- /dev/null +++ b/bench/BenchMedianBlur.cpp @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void MedianBlur(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int2 kernelSize = nvcv::cuda::StaticCast(benchutils::GetShape<2>(state.get_string("kernelSize"))); + + nvcv::Size2D kernelSize2d{kernelSize.x, kernelSize.y}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::MedianBlur op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &kernelSize2d](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSize2d); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor kernelSizeTensor({{shape.x, 2}, "NW"}, nvcv::TYPE_S32); + + benchutils::FillTensor(kernelSizeTensor, + [&kernelSize](const long4 &c){ return nvcv::cuda::GetElement(kernelSize, c.y); }); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &kernelSizeTensor](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSizeTensor); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using MedianBlurTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(MedianBlur, NVBENCH_TYPE_AXES(MedianBlurTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("kernelSize", {"5x5"}); diff --git a/bench/BenchMinAreaRect.cpp b/bench/BenchMinAreaRect.cpp new file mode 100644 index 00000000..1eae7a3e --- /dev/null +++ 
b/bench/BenchMinAreaRect.cpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void MinAreaRect(nvbench::state &state, nvbench::type_list) +try +{ + long2 shape = benchutils::GetShape<2>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * sizeof(T)); + state.add_global_memory_writes(shape.x * 8 * sizeof(float) + shape.x * sizeof(int)); + + cvcuda::MinAreaRect op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, 2}, "NWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, 8}, "NW"}, nvcv::TYPE_F32); + nvcv::Tensor points({{1, shape.x}, "NW"}, nvcv::TYPE_S32); + + benchutils::FillTensor(src, benchutils::RandomValues()); + benchutils::FillTensor(dst, benchutils::RandomValues(0.f, 1.f)); + benchutils::FillTensor(points, benchutils::RandomValues(10, 100)); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &points, &shape](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, points, shape.x); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using MinAreaRectTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(MinAreaRect, NVBENCH_TYPE_AXES(MinAreaRectTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1024"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchMinMaxLoc.cpp b/bench/BenchMinMaxLoc.cpp new file mode 100644 index 00000000..582348fd --- /dev/null +++ b/bench/BenchMinMaxLoc.cpp @@ -0,0 +1,92 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchMinMaxLoc.hpp" + +#include + +#include + +template +inline void MinMaxLoc(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long maxLocs = state.get_int64("maxLocations"); + + // clang-format off + + nvcv::Tensor minVal({{shape.x}, "N"}, nvcv::TYPE_U32); + nvcv::Tensor minLoc({{shape.x, maxLocs}, "NM"}, nvcv::TYPE_2S32); + nvcv::Tensor numMin({{shape.x}, "N"}, nvcv::TYPE_S32); + + nvcv::Tensor maxVal({{shape.x}, "N"}, nvcv::TYPE_U32); + nvcv::Tensor maxLoc({{shape.x, maxLocs}, "NM"}, nvcv::TYPE_2S32); + nvcv::Tensor numMax({{shape.x}, "N"}, nvcv::TYPE_S32); + + // clang-format on + + // R/W bandwidth rationale: + // 1 read to find min/max + 1 read to collect their locations + // 2 writes of min/max values (U32), locations (2S32) and quantity (S32) + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) * 2); + state.add_global_memory_writes(shape.x * (sizeof(uint32_t) + maxLocs * sizeof(int2) + sizeof(int)) * 2); + + cvcuda::MinMaxLoc op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensorWithMinMax(src, maxLocs); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &minVal, &minLoc, &numMin, &maxVal, &maxLoc, &numMax](nvbench::launch &launch) + { + op(launch.get_stream(), src, minVal, minLoc, numMin, maxVal, maxLoc, numMax); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + + benchutils::FillImageBatchWithMinMax(src, long2{shape.z, shape.y}, long2{varShape, varShape}, maxLocs); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &minVal, &minLoc, &numMin, &maxVal, &maxLoc, &numMax](nvbench::launch &launch) + { + op(launch.get_stream(), src, minVal, minLoc, numMin, maxVal, maxLoc, numMax); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using MinMaxLocTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(MinMaxLoc, NVBENCH_TYPE_AXES(MinMaxLocTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("maxLocations", {100000}); diff --git a/bench/BenchMinMaxLoc.hpp b/bench/BenchMinMaxLoc.hpp new file mode 100644 index 00000000..cd7f6a27 --- /dev/null +++ b/bench/BenchMinMaxLoc.hpp @@ -0,0 +1,121 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CVCUDA_BENCH_MINMAXLOC_HPP +#define CVCUDA_BENCH_MINMAXLOC_HPP + +#include "BenchUtils.hpp" + +namespace benchutils { + +template, typename RE = typename R::RE, typename UD = typename R::UD, + class = nvcv::cuda::Require == 0 && std::is_integral_v>> +inline auto RandomValuesWithoutMinMax(VT min = nvcv::cuda::TypeTraits::min, + VT max = nvcv::cuda::TypeTraits::max, RE rng = DefaultGenerator()) +{ + return R{UD(min + 1, max - 1), rng}; +} + +template, typename RE = typename R::RE> +inline void RandomMinMax(std::vector &srcVec, const long3 &shape, const long3 &strides, long locs, + RE randomGenerator = DefaultGenerator()) +{ + long locsPerHeight = static_cast(std::ceil(locs / shape.y)); + if (locsPerHeight * 2 >= shape.z) + { + throw std::runtime_error("Locations is bigger than available pixels"); + } + + for (long x = 0; x < shape.x; ++x) + { + long countLocs = 0; + for (long y = 0; y < shape.y; ++y) + { + long numLocs = (countLocs + locsPerHeight > locs) ? (locs - countLocs) : locsPerHeight; + for (long z = 0; z < numLocs; ++z) + { + ValueAt(srcVec, strides, long3{x, y, z}) = nvcv::cuda::TypeTraits::min; + } + for (long z = numLocs; z < numLocs * 2; ++z) + { + ValueAt(srcVec, strides, long3{x, y, z}) = nvcv::cuda::TypeTraits::max; + } + std::shuffle(&ValueAt(srcVec, strides, long3{x, y, 0}), + &ValueAt(srcVec, strides, long3{x, y, shape.z}), randomGenerator); + + countLocs += locsPerHeight; + } + } +} + +template +inline void FillTensorWithMinMax(const nvcv::Tensor &tensor, long locations) +{ + auto tensorData = tensor.exportData(); + CVCUDA_CHECK_DATA(tensorData); + + if (tensor.rank() != 3 && tensor.rank() != 4) + { + throw std::invalid_argument("Tensor rank is not 3 or 4"); + } + + long3 strides{tensorData->stride(0), tensorData->stride(1), tensorData->stride(2)}; + long3 shape{tensorData->shape(0), tensorData->shape(1), tensorData->shape(2)}; + long bufSize{nvcv::cuda::GetElement(strides, 0) * nvcv::cuda::GetElement(shape, 0)}; + + std::vector tensorVec(bufSize); + + FillBuffer(tensorVec, shape, strides, RandomValuesWithoutMinMax()); + + RandomMinMax(tensorVec, shape, strides, locations); + + CUDA_CHECK_ERROR(cudaMemcpy(tensorData->basePtr(), tensorVec.data(), bufSize, cudaMemcpyHostToDevice)); +} + +template +inline void FillImageBatchWithMinMax(nvcv::ImageBatchVarShape &imageBatch, long2 size, long2 varSize, long locations) +{ + auto randomWidth = RandomValues(static_cast(size.x - varSize.x), static_cast(size.x)); + auto randomHeight = RandomValues(static_cast(size.y - varSize.y), static_cast(size.y)); + + for (int i = 0; i < imageBatch.capacity(); ++i) + { + nvcv::Image image(nvcv::Size2D{randomWidth(), randomHeight()}, GetFormat()); + + auto data = image.exportData(); + CVCUDA_CHECK_DATA(data); + + long2 strides{data->plane(0).rowStride, sizeof(VT)}; + long2 shape{data->plane(0).height, data->plane(0).width}; + long bufSize{strides.x * shape.x}; + + std::vector imageBuffer(bufSize); + + FillBuffer(imageBuffer, shape, strides, RandomValuesWithoutMinMax()); + + RandomMinMax(imageBuffer, long3{1, shape.x, shape.y}, long3{bufSize, strides.x, strides.y}, locations); + + CUDA_CHECK_ERROR(cudaMemcpy2D(data->plane(0).basePtr, strides.x, imageBuffer.data(), strides.x, strides.x, + data->plane(0).height, cudaMemcpyHostToDevice)); + + imageBatch.pushBack(image); + } +} + +} // namespace benchutils + +#endif // CVCUDA_BENCH_MINMAXLOC_HPP diff --git a/bench/BenchMorphology.cpp b/bench/BenchMorphology.cpp new file mode 100644 index 00000000..69ed2f97 --- /dev/null +++ 
b/bench/BenchMorphology.cpp @@ -0,0 +1,133 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Morphology(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int iteration = static_cast(state.get_int64("iteration")); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + NVCVMorphologyType morphType; + + if (state.get_string("morphType") == "ERODE") + { + morphType = NVCV_ERODE; + } + else if (state.get_string("morphType") == "DILATE") + { + morphType = NVCV_DILATE; + } + else if (state.get_string("morphType") == "OPEN") + { + morphType = NVCV_OPEN; + } + else if (state.get_string("morphType") == "CLOSE") + { + morphType = NVCV_CLOSE; + } + + nvcv::Size2D mask{3, 3}; + int2 anchor{-1, -1}; + + int bwIteration = (morphType == NVCV_OPEN || morphType == NVCV_CLOSE || iteration > 1) ? 2 * iteration : iteration; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) * bwIteration); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T) * bwIteration); + + cvcuda::Morphology op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + nvcv::Tensor workspace{nullptr}; + + if (morphType == NVCV_OPEN || morphType == NVCV_CLOSE || iteration > 1) + { + workspace = nvcv::Tensor({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + } + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &workspace, &morphType, &mask, &anchor, &iteration, &borderType] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, workspace, morphType, mask, anchor, iteration, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor maskTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor anchorTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + + benchutils::FillTensor(maskTensor, [&mask](const long4 &){ return int2{mask.w, mask.h}; }); + benchutils::FillTensor(anchorTensor, [&anchor](const long4 &){ return anchor; }); + + nvcv::ImageBatchVarShape workspace{nullptr}; + + if (morphType == NVCV_OPEN || morphType == NVCV_CLOSE || iteration > 1) + { + workspace = nvcv::ImageBatchVarShape(shape.x); + + 
workspace.pushBack(dst.begin(), dst.end()); + } + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &workspace, &morphType, &maskTensor, &anchorTensor, &iteration, &borderType] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, workspace, morphType, maskTensor, anchorTensor, iteration, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using MorphologyTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Morphology, NVBENCH_TYPE_AXES(MorphologyTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("iteration", {1}) + .add_string_axis("morphType", {"ERODE", "DILATE", "OPEN", "CLOSE"}) + .add_string_axis("border", {"REPLICATE"}); diff --git a/bench/BenchNMS.cpp b/bench/BenchNMS.cpp new file mode 100644 index 00000000..bad16a31 --- /dev/null +++ b/bench/BenchNMS.cpp @@ -0,0 +1,76 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void NMS(nvbench::state &state, nvbench::type_list) +try +{ + long2 shape = benchutils::GetShape<2>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + float scThr = static_cast(state.get_float64("scoreThreshold")); + float iouThr = static_cast(state.get_float64("iouThreshold")); + + // R/W bandwidth rationale: + // 1 read of scores (F32) to mask out lower scores boxes + 1 read of boxes (4S16) for IoU threshold + // 2 writes of masks (U8) by score and IoU thresholds + state.add_global_memory_reads(shape.x * shape.y * (sizeof(T) + sizeof(S))); + state.add_global_memory_writes(shape.x * shape.y * sizeof(M) * 2); + + cvcuda::NonMaximumSuppression op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor srcBB({{shape.x, shape.y}, "NB"}, benchutils::GetDataType()); + nvcv::Tensor srcSc({{shape.x, shape.y}, "NB"}, benchutils::GetDataType()); + nvcv::Tensor dstMk({{shape.x, shape.y}, "NB"}, benchutils::GetDataType()); + + benchutils::FillTensor(srcBB, benchutils::RandomValues(10, 50)); + benchutils::FillTensor(srcSc, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &srcBB, &dstMk, &srcSc, &scThr, &iouThr](nvbench::launch &launch) + { + op(launch.get_stream(), srcBB, dstMk, srcSc, scThr, iouThr); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using NMSTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(NMS, NVBENCH_TYPE_AXES(NMSTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1024", "32x1024"}) + 
.add_int64_axis("varShape", {-1}) + .add_float64_axis("scoreThreshold", {0.5}) + .add_float64_axis("iouThreshold", {0.75}); diff --git a/bench/BenchNormalize.cpp b/bench/BenchNormalize.cpp new file mode 100644 index 00000000..64eed3e3 --- /dev/null +++ b/bench/BenchNormalize.cpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Normalize(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long3 dstShape = srcShape; + + float globalScale = 1.234f; + float globalShift = 2.345f; + float epsilon = 12.34f; + uint32_t flags = CVCUDA_NORMALIZE_SCALE_IS_STDDEV; + + long3 baseShape{srcShape.x, 1, 1}; + long3 scaleShape{srcShape.x, 1, 1}; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T) + + baseShape.x * baseShape.y * baseShape.z * sizeof(float) + + scaleShape.x * scaleShape.y * scaleShape.z * sizeof(float)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::Normalize op; + + // clang-format off + + nvcv::Tensor base({{baseShape.x, baseShape.y, baseShape.z, 1}, "NHWC"}, nvcv::TYPE_F32); + nvcv::Tensor scale({{scaleShape.x, scaleShape.y, scaleShape.z, 1}, "NHWC"}, nvcv::TYPE_F32); + + benchutils::FillTensor(base, benchutils::RandomValues()); + benchutils::FillTensor(scale, benchutils::RandomValues(0.f, 1.f)); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &base, &scale, &dst, &globalScale, &globalShift, &epsilon, &flags] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, base, scale, dst, globalScale, globalShift, epsilon, flags); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &base, &scale, &dst, &globalScale, &globalShift, &epsilon, &flags] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, base, scale, dst, globalScale, globalShift, epsilon, flags); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using NormalizeTypes = nvbench::type_list; 
+ +NVBENCH_BENCH_TYPES(Normalize, NVBENCH_TYPE_AXES(NormalizeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchOSD.cpp b/bench/BenchOSD.cpp new file mode 100644 index 00000000..b5e9b0a6 --- /dev/null +++ b/bench/BenchOSD.cpp @@ -0,0 +1,94 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include <../priv/Types.hpp> +#include + +#include + +template +inline void OSD(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numElem = static_cast(state.get_int64("numElem")); + + int ch = nvcv::cuda::NumElements; + + using BT = nvcv::cuda::BaseType; + + std::vector>> elementVec; + + for (int n = 0; n < (int)shape.x; n++) + { + std::vector> curVec; + for (int i = 0; i < numElem; i++) + { + NVCVPoint point; + point.centerPos.x = shape.z / 2; + point.centerPos.y = shape.y / 2; + point.radius = std::min(shape.z, shape.y) / 2; + point.color = {0, 0, 0, 255}; + auto element = std::make_shared(NVCVOSDType::NVCV_OSD_POINT, &point); + curVec.push_back(element); + } + elementVec.push_back(curVec); + } + + std::shared_ptr ctx = std::make_shared(elementVec); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + numElem * sizeof(int) * 16); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::OSD op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &ctx](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, (NVCVElements)ctx.get()); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using OSDTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(OSD, NVBENCH_TYPE_AXES(OSDTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("numElem", {100}); diff --git a/bench/BenchPadAndStack.cpp b/bench/BenchPadAndStack.cpp new file mode 100644 index 00000000..18a36c38 --- /dev/null +++ b/bench/BenchPadAndStack.cpp @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void PadAndStack(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long3 dstShape = srcShape; + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + float borderValue{0.f}; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T) + srcShape.x * sizeof(int) * 2); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::PadAndStack op; + + // clang-format off + + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + nvcv::Tensor top({{srcShape.x, 1, 1, 1}, "NHWC"}, nvcv::TYPE_S32); + nvcv::Tensor left({{srcShape.x, 1, 1, 1}, "NHWC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(top, [&srcShape](const long4 &){ return srcShape.y / 2; }); + benchutils::FillTensor(left, [&srcShape](const long4 &){ return srcShape.z / 2; }); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &top, &left, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, top, left, borderType, borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using PadAndStackTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(PadAndStack, NVBENCH_TYPE_AXES(PadAndStackTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}) + .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchPairwiseMatcher.cpp b/bench/BenchPairwiseMatcher.cpp new file mode 100644 index 00000000..4d7ebaf2 --- /dev/null +++ b/bench/BenchPairwiseMatcher.cpp @@ -0,0 +1,108 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void PairwiseMatcher(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + + int matchesPerPoint = static_cast(state.get_int64("matchesPerPoint")); + + bool crossCheck = state.get_string("crossCheck") == "T"; + bool readNumSets = state.get_string("readNumSets") == "T"; + bool writeDistances = state.get_string("writeDistances") == "T"; + + NVCVNormType normType = benchutils::GetNormType(state.get_string("normType")); + + NVCVPairwiseMatcherType algoChoice; + + if (state.get_string("algoChoice") == "BRUTE_FORCE") + { + algoChoice = NVCV_BRUTE_FORCE; + } + else + { + throw std::invalid_argument("Unexpected algorithm choice = " + state.get_string("algoChoice")); + } + + int maxMatches = shape.y * matchesPerPoint; + + cvcuda::PairwiseMatcher op(algoChoice); + + state.add_global_memory_reads((crossCheck ? 3 : 2) * shape.x * shape.y * shape.z * sizeof(ST)); + state.add_global_memory_writes(shape.x * (sizeof(int) + maxMatches * (2 * sizeof(int) + sizeof(float)))); + + // clang-format off + + nvcv::Tensor set1({{shape.x, shape.y, shape.z}, "NMD"}, benchutils::GetDataType()); + nvcv::Tensor set2({{shape.x, shape.y, shape.z}, "NMD"}, benchutils::GetDataType()); + + nvcv::Tensor matches({{shape.x, maxMatches, 2}, "NMD"}, nvcv::TYPE_S32); + + nvcv::Tensor numMatches({{shape.x}, "N"}, nvcv::TYPE_S32); + + nvcv::Tensor numSet1, numSet2, distances; + + if (readNumSets) + { + numSet1 = nvcv::Tensor({{shape.x}, "N"}, nvcv::TYPE_S32); + numSet2 = nvcv::Tensor({{shape.x}, "N"}, nvcv::TYPE_S32); + + benchutils::FillTensor(numSet1, [&shape](const long4 &){ return shape.y; }); + benchutils::FillTensor(numSet2, [&shape](const long4 &){ return shape.y; }); + } + if (writeDistances) + { + distances = nvcv::Tensor({{shape.x, maxMatches}, "NM"}, nvcv::TYPE_F32); + } + + benchutils::FillTensor(set1, benchutils::RandomValues()); + benchutils::FillTensor(set2, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &set1, &set2, &numSet1, &numSet2, &matches, &numMatches, &distances, &crossCheck, + &matchesPerPoint, &normType](nvbench::launch &launch) + { + op(launch.get_stream(), set1, set2, numSet1, numSet2, matches, numMatches, distances, crossCheck, + matchesPerPoint, normType); + }); +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using PairwiseMatcherTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(PairwiseMatcher, NVBENCH_TYPE_AXES(PairwiseMatcherTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x10000x32"}) + .add_int64_axis("matchesPerPoint", {1}) + .add_string_axis("crossCheck", {"T"}) + .add_string_axis("readNumSets", {"F"}) + .add_string_axis("writeDistances", {"T"}) + .add_string_axis("normType", {"HAMMING"}) + .add_string_axis("algoChoice", {"BRUTE_FORCE"}); diff --git a/bench/BenchPillowResize.cpp b/bench/BenchPillowResize.cpp new file mode 100644 index 00000000..359480e2 --- /dev/null +++ b/bench/BenchPillowResize.cpp @@ -0,0 +1,105 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void PillowResize(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + long3 dstShape; + + if (state.get_string("resizeType") == "EXPAND") + { + dstShape = long3{srcShape.x, srcShape.y * 2, srcShape.z * 2}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + dstShape = long3{srcShape.x, srcShape.y / 2, srcShape.z / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + nvcv::Size2D srcSize{(int)srcShape.z, (int)srcShape.y}; + nvcv::Size2D dstSize{(int)dstShape.z, (int)dstShape.y}; + + nvcv::DataType dtype{benchutils::GetDataType()}; + nvcv::ImageFormat fmt(nvcv::MemLayout::PITCH_LINEAR, dtype.dataKind(), nvcv::Swizzle::S_X000, dtype.packing()); + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::PillowResize op; + cvcuda::UniqueWorkspace ws + = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements(srcShape.x, srcSize, dstSize, fmt)); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, dtype); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, dtype); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &ws, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), ws.get(), src, dst, interpType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &ws, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), ws.get(), src, dst, interpType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using PillowResizeTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(PillowResize, NVBENCH_TYPE_AXES(PillowResizeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("resizeType", {"CONTRACT"}) + .add_string_axis("interpolation", {"CUBIC"}); diff --git a/bench/BenchRandomResizedCrop.cpp b/bench/BenchRandomResizedCrop.cpp new file mode 100644 index 00000000..b7f58c57 --- /dev/null +++ b/bench/BenchRandomResizedCrop.cpp @@ -0,0 +1,103 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void RandomResizedCrop(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + long3 dstShape; + + if (state.get_string("resizeType") == "EXPAND") + { + dstShape = long3{srcShape.x, srcShape.y * 2, srcShape.z * 2}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + dstShape = long3{srcShape.x, srcShape.y / 2, srcShape.z / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + double minScale = 0.08; + double maxScale = 1.0; + double minRatio = 0.5; + double maxRatio = 2.0; + uint32_t seed = 1234; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::RandomResizedCrop op(minScale, maxScale, minRatio, maxRatio, srcShape.x, seed); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, interpType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, interpType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using RandomResizedCropTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(RandomResizedCrop, NVBENCH_TYPE_AXES(RandomResizedCropTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("resizeType", {"EXPAND"}) + .add_string_axis("interpolation", {"LINEAR"}); diff --git a/bench/BenchReformat.cpp b/bench/BenchReformat.cpp new file mode 100644 index 00000000..26e8f28c --- /dev/null +++ b/bench/BenchReformat.cpp @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Reformat(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Reformat op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, 1, shape.y, shape.z}, "NCHW"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ReformatTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Reformat, NVBENCH_TYPE_AXES(ReformatTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchRemap.cpp b/bench/BenchRemap.cpp new file mode 100644 index 00000000..7fc20600 --- /dev/null +++ b/bench/BenchRemap.cpp @@ -0,0 +1,120 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Remap(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long3 dstShape = srcShape; + long3 mapShape; + + NVCVInterpolationType srcInterp, mapInterp; + NVCVBorderType borderType; + NVCVRemapMapValueType mapValueType; + + bool alignCorners{true}; + float4 borderValue{0, 0, 0, 0}; + + if (state.get_string("mapType") == "DENSE") + { + srcInterp = NVCV_INTERP_NEAREST; + mapInterp = NVCV_INTERP_NEAREST; + borderType = NVCV_BORDER_CONSTANT; + mapValueType = NVCV_REMAP_ABSOLUTE_NORMALIZED; + mapShape = srcShape; + } + else if (state.get_string("mapType") == "RELATIVE") + { + srcInterp = NVCV_INTERP_CUBIC; + mapInterp = NVCV_INTERP_CUBIC; + borderType = NVCV_BORDER_REFLECT101; + mapValueType = NVCV_REMAP_RELATIVE_NORMALIZED; + mapShape = long3{srcShape.x, 4, 4}; + } + else + { + throw std::invalid_argument("Invalid mapType = " + state.get_string("mapType")); + } + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T) + + mapShape.x * mapShape.y * mapShape.z * sizeof(float2)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::Remap op; + + // clang-format off + + nvcv::Tensor map({{mapShape.x, mapShape.y, mapShape.z, 1}, "NHWC"}, nvcv::TYPE_2F32); + + benchutils::FillTensor(map, benchutils::RandomValues()); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &map, &srcInterp, &mapInterp, &mapValueType, &alignCorners, &borderType, + &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, map, srcInterp, mapInterp, mapValueType, alignCorners, borderType, + borderValue); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &map, &srcInterp, &mapInterp, &mapValueType, &alignCorners, &borderType, + &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, map, srcInterp, mapInterp, mapValueType, alignCorners, borderType, + borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using RemapTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Remap, NVBENCH_TYPE_AXES(RemapTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("mapType", {"DENSE"}); diff --git a/bench/BenchResize.cpp b/bench/BenchResize.cpp new file mode 100644 index 00000000..7446a6f8 --- /dev/null +++ b/bench/BenchResize.cpp @@ -0,0 +1,97 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Resize(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + long3 dstShape; + + if (state.get_string("resizeType") == "EXPAND") + { + dstShape = long3{srcShape.x, srcShape.y * 2, srcShape.z * 2}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + dstShape = long3{srcShape.x, srcShape.y / 2, srcShape.z / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::Resize op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, interpType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, interpType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ResizeTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Resize, NVBENCH_TYPE_AXES(ResizeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("resizeType", {"EXPAND"}) + .add_string_axis("interpolation", {"LINEAR"}); diff --git a/bench/BenchRotate.cpp b/bench/BenchRotate.cpp new file mode 100644 index 00000000..4f4af05c --- /dev/null +++ b/bench/BenchRotate.cpp @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Rotate(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + double angleDeg = 123.456; + double2 shift{12.34, 12.34}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Rotate op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &angleDeg, &shift, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, angleDeg, shift, interpType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor angleDegTensor({{shape.x}, "N"}, nvcv::TYPE_F64); + nvcv::Tensor shiftTensor({{shape.x, 2}, "NW"}, nvcv::TYPE_F64); + + benchutils::FillTensor(angleDegTensor, [&angleDeg](const long4 &){ return angleDeg; }); + benchutils::FillTensor(shiftTensor, + [&shift](const long4 &c){ return nvcv::cuda::GetElement(shift, c.y); }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &angleDegTensor, &shiftTensor, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, angleDegTensor, shiftTensor, interpType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using RotateTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Rotate, NVBENCH_TYPE_AXES(RotateTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("interpolation", {"CUBIC"}); diff --git a/bench/BenchSIFT.cpp b/bench/BenchSIFT.cpp new file mode 100644 index 00000000..804c3598 --- /dev/null +++ b/bench/BenchSIFT.cpp @@ -0,0 +1,109 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void SIFT(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int capacity = static_cast(state.get_int64("maxCapacity")); + int numOctLayers = static_cast(state.get_int64("numOctaveLayers")); + float contThr = static_cast(state.get_float64("contrastThreshold")); + float edgeThr = static_cast(state.get_float64("edgeThreshold")); + float initSigma = static_cast(state.get_float64("initSigma")); + + NVCVSIFTFlagType flags; + + int3 maxShape; + + if (state.get_string("expandInput") == "Y") + { + flags = NVCV_SIFT_USE_EXPANDED_INPUT; + maxShape = int3{(int)shape.z * 2, (int)shape.y * 2, (int)shape.x}; + } + else if (state.get_string("expandInput") == "N") + { + flags = NVCV_SIFT_USE_ORIGINAL_INPUT; + maxShape = int3{(int)shape.z, (int)shape.y, (int)shape.x}; + } + else + { + throw std::invalid_argument("Invalid expandInput = " + state.get_string("expandInput")); + } + + // Each pyramid has shape approximately (3 + L) * N * (2 HW size) * F32 + std::size_t pyrSize = (numOctLayers + 3) * shape.x * (maxShape.x * maxShape.y * 2) * sizeof(float); + + // R/W bandwidth rationale: + // 1 read of input (U8) to build (F32) pyramids, 1 read of Gauss and 1 read of DoG pyramids + // 1 write of Gauss and 1 write of DoG pyramids, 1 write of 4 output data + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + 2 * pyrSize); + state.add_global_memory_writes(2 * pyrSize + shape.x * sizeof(int) + + shape.x * capacity * (sizeof(float4) + sizeof(float3) + 128 * sizeof(T))); + + cvcuda::SIFT op(maxShape, numOctLayers); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, nvcv::TYPE_U8); + nvcv::Tensor dstC({{shape.x, capacity}, "NM"}, nvcv::TYPE_4F32); + nvcv::Tensor dstM({{shape.x, capacity}, "NM"}, nvcv::TYPE_3F32); + nvcv::Tensor dstD({{shape.x, capacity, 128}, "NMD"}, nvcv::TYPE_U8); + nvcv::Tensor dstN({{shape.x}, "N"}, nvcv::TYPE_S32); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dstC, &dstM, &dstD, &dstN, &numOctLayers, &contThr, &edgeThr, &initSigma, &flags] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dstC, dstM, dstD, dstN, numOctLayers, contThr, edgeThr, initSigma, flags); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using SIFTTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(SIFT, NVBENCH_TYPE_AXES(SIFTTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("maxCapacity", {10000}) + .add_int64_axis("numOctaveLayers", {3}) + .add_float64_axis("contrastThreshold", {0.04}) + 
.add_float64_axis("edgeThreshold", {10.0}) + .add_float64_axis("initSigma", {1.6}) + .add_string_axis("expandInput", {"Y"}); diff --git a/bench/BenchStack.cpp b/bench/BenchStack.cpp new file mode 100644 index 00000000..ebd3c079 --- /dev/null +++ b/bench/BenchStack.cpp @@ -0,0 +1,68 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Stack(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Stack op; + + // clang-format off + + nvcv::TensorBatch src(nvcv::TensorBatch::CalcRequirements(shape.x)); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + for (int i = 0 ; i < shape.x; i++) + { + nvcv::Tensor srcIn({{shape.y, shape.z, ch}, "HWC"}, benchutils::GetDataType()); + benchutils::FillTensor(srcIn, benchutils::RandomValues()); + src.pushBack(srcIn); + } + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst); + }); + +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using StackTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Stack, NVBENCH_TYPE_AXES(StackTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"10x1080x1920"}); diff --git a/bench/BenchThreshold.cpp b/bench/BenchThreshold.cpp new file mode 100644 index 00000000..648a83ac --- /dev/null +++ b/bench/BenchThreshold.cpp @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Threshold(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + uint32_t threshType = NVCV_THRESH_BINARY | (std::is_same_v ? 
NVCV_THRESH_OTSU : 0); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Threshold op(threshType, shape.x); + + // clang-format off + + nvcv::Tensor thresh({{shape.x}, "N"}, nvcv::TYPE_F64); + nvcv::Tensor maxval({{shape.x}, "N"}, nvcv::TYPE_F64); + + benchutils::FillTensor(thresh, benchutils::RandomValues()); + benchutils::FillTensor(maxval, benchutils::RandomValues()); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &thresh, &maxval](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, thresh, maxval); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &thresh, &maxval](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, thresh, maxval); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ThresholdTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Threshold, NVBENCH_TYPE_AXES(ThresholdTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchUtils.hpp b/bench/BenchUtils.hpp new file mode 100644 index 00000000..3875928e --- /dev/null +++ b/bench/BenchUtils.hpp @@ -0,0 +1,324 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CVCUDA_BENCH_UTILS_HPP +#define CVCUDA_BENCH_UTILS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define CVCUDA_CHECK_DATA(data) \ + if (!data) \ + { \ + throw std::runtime_error("Invalid data"); \ + } + +#define CUDA_CHECK_ERROR(RC) \ + { \ + benchutils::cudaCheckError((RC), __FILE__, __LINE__); \ + } + +namespace benchutils { + +inline void cudaCheckError(cudaError_t code, const char *file, int line) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "\nE In CUDA: %s %s %d\n", cudaGetErrorString(code), file, line); + exit(code); + } +} + +template> +inline RT GetShape(const std::string &shapeStr, const std::string &delimiter = "x") +{ + std::string str = shapeStr; + RT shape; + for (int i = 0; i < N; ++i) + { + size_t pos = str.find(delimiter); + + if ((pos == std::string::npos && i != (N - 1)) || (pos != std::string::npos && i == (N - 1))) + { + throw std::invalid_argument("Expecting " + std::to_string(N) + "-rank shape in " + shapeStr + + " (pass shape separated by " + delimiter + ")"); + } + + nvcv::cuda::GetElement(shape, i) = std::stoi(str.substr(0, pos)); + + str.erase(0, pos + delimiter.length()); + } + + return shape; +} + +template +inline nvcv::DataType GetDataType() +{ +#define CVCUDA_BENCH_GET_DATA_TYPE(TYPE, DATA_TYPE) \ + if constexpr (std::is_same_v) \ + { \ + return DATA_TYPE; \ + } + + CVCUDA_BENCH_GET_DATA_TYPE(uint8_t, nvcv::TYPE_U8); + CVCUDA_BENCH_GET_DATA_TYPE(uint16_t, nvcv::TYPE_U16); + CVCUDA_BENCH_GET_DATA_TYPE(uint32_t, nvcv::TYPE_U32); + + CVCUDA_BENCH_GET_DATA_TYPE(uchar3, nvcv::TYPE_3U8); + CVCUDA_BENCH_GET_DATA_TYPE(uchar4, nvcv::TYPE_4U8); + CVCUDA_BENCH_GET_DATA_TYPE(float, nvcv::TYPE_F32); + + CVCUDA_BENCH_GET_DATA_TYPE(float3, nvcv::TYPE_3F32); + CVCUDA_BENCH_GET_DATA_TYPE(float4, nvcv::TYPE_4F32); + + CVCUDA_BENCH_GET_DATA_TYPE(int, nvcv::TYPE_S32); + + CVCUDA_BENCH_GET_DATA_TYPE(short, nvcv::TYPE_S16); + + CVCUDA_BENCH_GET_DATA_TYPE(ushort3, nvcv::TYPE_3U16); + CVCUDA_BENCH_GET_DATA_TYPE(ushort4, nvcv::TYPE_4U16); + CVCUDA_BENCH_GET_DATA_TYPE(short4, nvcv::TYPE_4S16); + +#undef CVCUDA_BENCH_GET_DATA_TYPE + + throw std::invalid_argument("Unexpected data type"); +} + +template +inline nvcv::ImageFormat GetFormat() +{ + return nvcv::ImageFormat{GetDataType()}; +} + +inline NVCVBorderType GetBorderType(const std::string &border) +{ +#define CVCUDA_BENCH_GET_BORDER_TYPE(BORDER) \ + if (border == #BORDER) \ + { \ + return NVCV_BORDER_##BORDER; \ + } + + CVCUDA_BENCH_GET_BORDER_TYPE(CONSTANT); + CVCUDA_BENCH_GET_BORDER_TYPE(REPLICATE); + CVCUDA_BENCH_GET_BORDER_TYPE(REFLECT); + CVCUDA_BENCH_GET_BORDER_TYPE(WRAP); + CVCUDA_BENCH_GET_BORDER_TYPE(REFLECT101); + +#undef CVCUDA_BENCH_GET_BORDER_TYPE + + throw std::invalid_argument("Unexpected border type = " + border); +} + +inline NVCVNormType GetNormType(const std::string &normType) +{ +#define CVCUDA_BENCH_GET_NORM_TYPE(NORM) \ + if (normType == #NORM) \ + { \ + return NVCV_NORM_##NORM; \ + } + + CVCUDA_BENCH_GET_NORM_TYPE(HAMMING); + CVCUDA_BENCH_GET_NORM_TYPE(L1); + CVCUDA_BENCH_GET_NORM_TYPE(L2); + +#undef CVCUDA_BENCH_GET_NORM_TYPE + + throw std::invalid_argument("Unexpected norm type = " + normType); +} + +inline NVCVInterpolationType GetInterpolationType(const std::string &interpolation) +{ +#define CVCUDA_BENCH_GET_INTERPOLATION_TYPE(INTERP) \ + if (interpolation == #INTERP) \ + { \ + return NVCV_INTERP_##INTERP; \ + } + + 
CVCUDA_BENCH_GET_INTERPOLATION_TYPE(NEAREST); + CVCUDA_BENCH_GET_INTERPOLATION_TYPE(LINEAR); + CVCUDA_BENCH_GET_INTERPOLATION_TYPE(CUBIC); + CVCUDA_BENCH_GET_INTERPOLATION_TYPE(AREA); + +#undef CVCUDA_BENCH_GET_INTERPOLATION_TYPE + + throw std::invalid_argument("Unexpected interpolation type = " + interpolation); +} + +template, const T, T>> +inline RT &ValueAt(VecType &vec, const ST &strides, const ST &coord) +{ + return *reinterpret_cast(&vec[nvcv::cuda::dot(coord, strides)]); +} + +static std::default_random_engine DefaultGenerator(unsigned long int seed = 0) +{ + static std::default_random_engine defaultRandomGenerator{std::random_device{}()}; + + defaultRandomGenerator.seed(seed); + + return defaultRandomGenerator; +} + +template +struct Randomizer +{ + using BT = nvcv::cuda::BaseType; + using RE = std::default_random_engine; + using UD = std::conditional_t, std::uniform_real_distribution, + std::uniform_int_distribution>; + + VT operator()() + { + VT ret; + for (int i = 0; i < nvcv::cuda::NumElements; ++i) + { + nvcv::cuda::GetElement(ret, i) = uniformDistribution(randomGenerator); + } + return ret; + } + + VT operator()(const long4 &) + { + return operator()(); + } + + UD uniformDistribution; + RE randomGenerator; +}; + +template, typename BT = typename R::BT, typename RE = typename R::RE, + typename UD = typename R::UD> +inline auto RandomValues(BT min = std::is_integral_v ? nvcv::cuda::TypeTraits::min : -1, + BT max = std::is_integral_v ? nvcv::cuda::TypeTraits::max : +1, + RE rng = DefaultGenerator()) +{ + return R{UD(min, max), rng}; +} + +template +inline void FillBuffer(std::vector &vec, const ST &shape, const ST &strides, VG valuesGenerator) +{ + for (long x = 0; x < (nvcv::cuda::NumElements >= 1 ? nvcv::cuda::GetElement(shape, 0) : 1); ++x) + { + for (long y = 0; y < (nvcv::cuda::NumElements >= 2 ? nvcv::cuda::GetElement(shape, 1) : 1); ++y) + { + for (long z = 0; z < (nvcv::cuda::NumElements >= 3 ? nvcv::cuda::GetElement(shape, 2) : 1); ++z) + { + for (long w = 0; w < (nvcv::cuda::NumElements == 4 ? 
nvcv::cuda::GetElement(shape, 3) : 1); ++w) + { + long4 coord{x, y, z, w}; + ST stCoord = nvcv::cuda::DropCast>(coord); + + ValueAt(vec, strides, stCoord) = valuesGenerator(coord); + } + } + } + } +} + +template +inline void FillTensor(const nvcv::Tensor &tensor, VG valuesGenerator) +{ + using longR = nvcv::cuda::MakeType; + + auto tensorData = tensor.exportData(); + CVCUDA_CHECK_DATA(tensorData); + + longR strides, shape; + + for (int i = 0; i < RANK; ++i) + { + nvcv::cuda::GetElement(strides, i) = tensorData->stride(i); + nvcv::cuda::GetElement(shape, i) = tensorData->shape(i); + } + + long bufSize{nvcv::cuda::GetElement(strides, 0) * nvcv::cuda::GetElement(shape, 0)}; + + std::vector tensorVec(bufSize); + + FillBuffer(tensorVec, shape, strides, valuesGenerator); + + CUDA_CHECK_ERROR(cudaMemcpy(tensorData->basePtr(), tensorVec.data(), bufSize, cudaMemcpyHostToDevice)); +} + +template +inline void FillTensor(const nvcv::Tensor &tensor, VG valuesGenerator) +{ + switch (tensor.rank()) + { +#define CVCUDA_BENCH_FILL_TENSOR_CASE(RANK) \ +case RANK: \ + FillTensor(tensor, valuesGenerator); \ + break + + CVCUDA_BENCH_FILL_TENSOR_CASE(1); + CVCUDA_BENCH_FILL_TENSOR_CASE(2); + CVCUDA_BENCH_FILL_TENSOR_CASE(3); + CVCUDA_BENCH_FILL_TENSOR_CASE(4); + +#undef CVCUDA_BENCH_FILL_TENSOR_CASE + default: + throw std::invalid_argument("Tensor has rank not in [1, 4]"); + } +} + +template +inline void FillImageBatch(nvcv::ImageBatchVarShape &imageBatch, long2 size, long2 varSize, VG valuesGenerator) +{ + auto randomWidth = RandomValues(static_cast(size.x - varSize.x), static_cast(size.x)); + auto randomHeight = RandomValues(static_cast(size.y - varSize.y), static_cast(size.y)); + + for (int i = 0; i < imageBatch.capacity(); ++i) + { + nvcv::Image image(nvcv::Size2D{randomWidth(), randomHeight()}, GetFormat()); + + auto data = image.exportData(); + CVCUDA_CHECK_DATA(data); + + long2 strides{data->plane(0).rowStride, sizeof(VT)}; + long2 shape{data->plane(0).height, data->plane(0).width}; + + std::vector imageBuffer(strides.x * shape.x); + + FillBuffer(imageBuffer, shape, strides, valuesGenerator); + + CUDA_CHECK_ERROR(cudaMemcpy2D(data->plane(0).basePtr, strides.x, imageBuffer.data(), strides.x, strides.x, + data->plane(0).height, cudaMemcpyHostToDevice)); + + imageBatch.pushBack(image); + } +} + +} // namespace benchutils + +#endif // CVCUDA_BENCH_UTILS_HPP diff --git a/bench/BenchWarpAffine.cpp b/bench/BenchWarpAffine.cpp new file mode 100644 index 00000000..459c3b32 --- /dev/null +++ b/bench/BenchWarpAffine.cpp @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void WarpAffine(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + int flags = interpType | ((state.get_string("inverseMap") == "Y") ? NVCV_WARP_INVERSE_MAP : 0); + + float4 borderValue{0, 0, 0, 0}; + + NVCVAffineTransform transMatrix{2.f, 2.f, 0.f, 3.f, 1.f, 0.f}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + 6 * sizeof(float)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::WarpAffine op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &transMatrix, &flags, &borderType, &borderValue] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, transMatrix, flags, borderType, borderValue); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor transMatrixTensor({{shape.x, 6}, "NW"}, nvcv::TYPE_F32); + + benchutils::FillTensor(transMatrixTensor, [&transMatrix](const long4 &c){ return transMatrix[c.y]; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &transMatrixTensor, &flags, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, transMatrixTensor, flags, borderType, borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using WarpAffineTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(WarpAffine, NVBENCH_TYPE_AXES(WarpAffineTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("border", {"REFLECT"}) + .add_string_axis("interpolation", {"CUBIC"}) + .add_string_axis("inverseMap", {"Y"}); diff --git a/bench/BenchWarpPerspective.cpp b/bench/BenchWarpPerspective.cpp new file mode 100644 index 00000000..87498612 --- /dev/null +++ b/bench/BenchWarpPerspective.cpp @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void WarpPerspective(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + int flags = interpType | ((state.get_string("inverseMap") == "Y") ? NVCV_WARP_INVERSE_MAP : 0); + + float4 borderValue{0, 0, 0, 0}; + + NVCVPerspectiveTransform transMatrix{0.27, 0.16, 0.00, -0.11, 0.61, 0.65, -0.09, 0.06, 1.00}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + 9 * sizeof(float)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::WarpPerspective op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &transMatrix, &flags, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, transMatrix, flags, borderType, borderValue); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor transMatrixTensor({{shape.x, 9}, "NW"}, nvcv::TYPE_F32); + + benchutils::FillTensor(transMatrixTensor, [&transMatrix](const long4 &c){ return transMatrix[c.y]; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &transMatrixTensor, &flags, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, transMatrixTensor, flags, borderType, borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using WarpPerspectiveTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(WarpPerspective, NVBENCH_TYPE_AXES(WarpPerspectiveTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("border", {"REFLECT"}) + .add_string_axis("interpolation", {"CUBIC"}) + .add_string_axis("inverseMap", {"Y"}); diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt new file mode 100644 index 00000000..67fd8c5f --- /dev/null +++ b/bench/CMakeLists.txt @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
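+
+# Build notes (summary of the logic below): every Bench*.cpp listed in
+# bench_sources becomes its own nvbench executable, named cvcuda_bench_<name>
+# (lower-cased), linked against nvbench::main and cvcuda, and attached to the
+# bench_all metatarget. These benchmarks are intended to be configured with the
+# BUILD_BENCH CMake option enabled (ci/build.sh turns it on via the new
+# "profile" build type), and can then be run in one go with bench/run_bench.py.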
+ +project(cvcuda_bench) + +set(bench_sources + BenchMinMaxLoc.cpp + BenchNMS.cpp + BenchRemap.cpp + BenchGaussian.cpp + BenchLaplacian.cpp + BenchAverageBlur.cpp + BenchMedianBlur.cpp + BenchBilateralFilter.cpp + BenchSIFT.cpp + BenchReformat.cpp + BenchResize.cpp + BenchFlip.cpp + BenchRotate.cpp + BenchPillowResize.cpp + BenchCenterCrop.cpp + BenchWarpPerspective.cpp + BenchWarpAffine.cpp + BenchThreshold.cpp + BenchRandomResizedCrop.cpp + BenchNormalize.cpp + BenchMorphology.cpp + BenchAdaptiveThreshold.cpp + BenchCvtColor.cpp + BenchAdvCvtColor.cpp + BenchBndBox.cpp + BenchBoxBlur.cpp + BenchBrightnessContrast.cpp + BenchChannelReorder.cpp + BenchColorTwist.cpp + BenchComposite.cpp + BenchConv2D.cpp + BenchConvertTo.cpp + BenchCopyMakeBorder.cpp + BenchCropFlipNormalizeReformat.cpp + BenchCustomCrop.cpp + BenchErase.cpp + BenchFindContours.cpp + BenchGammaContrast.cpp + BenchGaussianNoise.cpp + BenchHistogramEq.cpp + BenchHistogram.cpp + BenchInpaint.cpp + BenchJointBilateralFilter.cpp + BenchMinAreaRect.cpp + BenchPadAndStack.cpp + BenchOSD.cpp + BenchLabel.cpp + BenchPairwiseMatcher.cpp + BenchStack.cpp +) + +# Metatarget for all benchmarks +add_custom_target(bench_all) + +foreach(bench_source IN LISTS bench_sources) + get_filename_component(bench_file_name "${bench_source}" NAME_WLE) + string(REPLACE "Bench" "cvcuda_bench_" algo_name ${bench_file_name}) + string(TOLOWER ${algo_name} bench_name) + add_executable(${bench_name} "${bench_source}") + target_include_directories(${bench_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}") + target_link_libraries(${bench_name} PRIVATE nvbench::main PUBLIC cvcuda) + set_target_properties(${bench_name} PROPERTIES COMPILE_FEATURES cuda_std_17) + add_dependencies(bench_all ${bench_name}) +endforeach() diff --git a/bench/run_bench.py b/bench/run_bench.py new file mode 100644 index 00000000..cc995ee8 --- /dev/null +++ b/bench/run_bench.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
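+
+# Helper that runs every cvcuda_bench_* executable found in the given folder
+# with "--csv", drops measurements marked as Skipped, concatenates the
+# per-benchmark tables into bench_output.csv, and prints the mean BWUtil
+# (achieved bandwidth utilization) per benchmark.
+# Usage: python3 run_bench.py <bench_folder> [extra args passed to each benchmark]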
+ +import os +import sys +import time +import subprocess +import pandas as pd + + +BENCH_PREFIX = "cvcuda_bench_" +BENCH_OUTPUT = "out.csv" +BENCH_COMMAND = "{} {} --csv {}" +BENCH_COLNAME = "Benchmark" +BENCH_RESULTS = "bench_output.csv" +BENCH_COLUMNS = {"Benchmark", "BWUtil", "Skipped"} +BANDWIDTH_COLNAME = "BWUtil" + + +if __name__ == "__main__": + if len(sys.argv) == 1: + print( + "E At least one argument must be provided: benchmark folder" + f"I Usage: {sys.argv[0]} bench_folder [extra args for benchmarks]" + ) + sys.exit(1) + + bench_args = " ".join(sys.argv[2:]) if len(sys.argv) > 2 else "" + bench_folder = sys.argv[1] + bench_files = [fn for fn in sorted(os.listdir(bench_folder)) if BENCH_PREFIX in fn] + + if len(bench_files) == 0: + print(f"E No benchmarks found in {bench_folder}") + sys.exit(1) + + print(f"I Found {len(bench_files)} benchmark(s) in {bench_folder} to run") + + l_df = [] + + for filename in bench_files: + filepath = os.path.join(bench_folder, filename) + + cmd = BENCH_COMMAND.format(filepath, bench_args, BENCH_OUTPUT) + + print(f'I Running "{cmd}"', end=" ") + + beg = time.time() + subprocess.run(cmd.split(), stdout=subprocess.PIPE) + end = time.time() + + print(f"took {end - beg:.03f} sec") + + if os.path.exists(BENCH_OUTPUT) is False or os.path.getsize(BENCH_OUTPUT) == 0: + print("W Skipping as benchmark output does not exist or is empty") + continue + + df = pd.read_csv(BENCH_OUTPUT) + + if not BENCH_COLUMNS.issubset(df.columns): + print(f"W Skipping as benchmark output does not have: {BENCH_COLUMNS}") + continue + + df = df[df["Skipped"] == "No"] + + os.remove(BENCH_OUTPUT) + + if len(df) > 0: + l_df.append(df) + + df = pd.concat(l_df, axis=0) + df = df.reset_index(drop=True) + + filepath = os.path.join(bench_folder, BENCH_RESULTS) + + df.to_csv(filepath) + + print(f"I Full results written to {filepath}") + + df = df.groupby("Benchmark")["BWUtil"].mean() + + pd.options.display.float_format = "{:.2%}".format + + print(f"I Summary results:\n{df}") diff --git a/ci/build.sh b/ci/build.sh index d13d719f..bcbbec23 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -e -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,7 +37,7 @@ source_dir="$SDIR/.." if [[ $# -ge 1 ]]; then case $1 in - debug|release) + debug|release|profile) build_type=$1 if [[ $# -ge 2 ]]; then build_dir=$2 @@ -77,10 +77,13 @@ else fi if [ "$PYTHON_VERSIONS" ]; then - cmake_args="-DPYTHON_VERSIONS=$PYTHON_VERSIONS" + cmake_args="$cmake_args -DPYTHON_VERSIONS=$PYTHON_VERSIONS" fi case $build_type in + profile) + cmake_args="$cmake_args -DCMAKE_BUILD_TYPE=Release -DBUILD_BENCH=1" + ;; release) cmake_args="$cmake_args -DCMAKE_BUILD_TYPE=Release" ;; @@ -132,7 +135,7 @@ cmake -B "$build_dir" "$source_dir" \ $user_args # Build CV-CUDA -cmake --build "$build_dir" -- $MAKE_OPTS +cmake --build "$build_dir" --parallel 8 -- $MAKE_OPTS # Show ccache status, if available! if [[ $has_ccache ]]; then diff --git a/ci/check_formatting.sh b/ci/check_formatting.sh deleted file mode 100755 index b91d518c..00000000 --- a/ci/check_formatting.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -e - -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ $# = 0 ]; then - # No arguments? Lint all code. - echo "Linting all code in the repository ==========================" - pre-commit run -a -else - from=$1 - if [ $# = 1 ]; then - to=HEAD - elif [ $# = 2 ]; then - to=$2 - else - echo "Invalid arguments" - echo "Usage: $(basename "$0") [ref_from [ref_to]]" - exit 1 - fi - - echo "Linting files touched from commit $from to $to ==============" - echo "Files to be linted:" - git diff --stat $from..$to - if ! pre-commit run --from-ref $from --to-ref $to ; then - echo "Formatting errors:" - git diff - false - fi -fi diff --git a/cmake/ConfigCompiler.cmake b/cmake/ConfigCompiler.cmake index 5c4f834c..898a7ee8 100644 --- a/cmake/ConfigCompiler.cmake +++ b/cmake/ConfigCompiler.cmake @@ -17,6 +17,8 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -ggdb") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 -ggdb") +# Use old behavior (before CMake 3.20) for Ninja DEPFILES generators +cmake_policy(SET CMP0116 OLD) if(WARNINGS_AS_ERRORS) set(C_WARNING_ERROR_FLAG "-Werror") diff --git a/cmake/PrintConfig.cmake b/cmake/PrintConfig.cmake index b43529cc..4c58c7e8 100644 --- a/cmake/PrintConfig.cmake +++ b/cmake/PrintConfig.cmake @@ -49,6 +49,12 @@ else() message(STATUS " ENABLE_SANITIZER : off") endif() +if(BUILD_BENCH) + message(STATUS " BUILD_BENCH : ON") +else() + message(STATUS " BUILD_BENCH : off") +endif() + if(ENABLE_TEGRA) message(STATUS " ENABLE_TEGRA : ON") else() diff --git a/docker/build/Dockerfile b/docker/build/Dockerfile index 306cd2ec..8fd5fb8e 100644 --- a/docker/build/Dockerfile +++ b/docker/build/Dockerfile @@ -36,9 +36,9 @@ RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ pre-commit shellcheck \ curl \ && rm -rf /var/lib/apt/lists/* \ - && curl -L https://cmake.org/files/v3.18/cmake-3.18.6-Linux-x86_64.tar.gz --output /tmp/cmake-3.18.6.tar.gz \ - && tar -xzf /tmp/cmake-3.18.6.tar.gz -C /tmp/ && cd /tmp/cmake-3.18.6-Linux-x86_64/ \ - && cp bin/ share/ doc/ /usr/local/ -r && rm -rf /tmp/cmake-3.18.6* + && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ + && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && cd /tmp/cmake-3.20.1-linux-x86_64/ \ + && cp bin/ share/ doc/ /usr/local/ -r && rm -rf /tmp/cmake-3.20.1* # Configure ccache RUN mkdir -p /cache diff --git a/docker/config b/docker/config index e7b31e24..56cc639d 100644 --- a/docker/config +++ b/docker/config @@ -22,8 +22,9 @@ IMAGE_URL_BASE='' # image versions must be upgraded whenever a breaking # change is done, such as removing some package, or updating # packaged versions that introduces incompatibilities. 
-TAG_IMAGE=5 +TAG_IMAGE=6 TAG_IMAGE_SAMPLES=5.1 +TAG_IMAGE_TEST=5 VER_CUDA=11.7.1 VER_UBUNTU=22.04 diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile index f6600b1e..0c55129e 100644 --- a/docker/test/Dockerfile +++ b/docker/test/Dockerfile @@ -47,3 +47,10 @@ RUN set -e \ python$ver -m pip install torch numpy torchvision; \ done \ && rm -rf /root/.cache/pip + +# Other dependencies of python tests +# binutils: for readelf +RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ + && apt-get install -y --no-install-recommends \ + binutils \ + && rm -rf /var/lib/apt/lists/* diff --git a/docker/update_test_image.sh b/docker/update_test_image.sh index adce0abf..d0cc5199 100755 --- a/docker/update_test_image.sh +++ b/docker/update_test_image.sh @@ -35,7 +35,7 @@ cd "$SDIR" cd test -image=$IMAGE_URL_BASE/test-linux-x64:$TAG_IMAGE +image=$IMAGE_URL_BASE/test-linux-x64:$TAG_IMAGE_TEST docker build --network=host \ --build-arg "VER_CUDA=$VER_CUDA" \ diff --git a/docs/sphinx/content/cvcuda_oplist.csv b/docs/sphinx/content/cvcuda_oplist.csv index cd564dbb..d6b48a6a 100644 --- a/docs/sphinx/content/cvcuda_oplist.csv +++ b/docs/sphinx/content/cvcuda_oplist.csv @@ -16,7 +16,8 @@ CustomCrop,Crops an image with a given region-of-interest CvtColor,Converts an image from one color space to another DataTypeConvert,Converts an image’s data type with optional scaling Erase,Erases image regions -Find Contours,Extract closed contours from an input binary image +FindContours,Extract closed contours from an input binary image +FindHomography,Calculates a perspective transform from four pairs of the corresponding points Flip,Flips a 2D image around its axis GammaContrast,Adjusts image contrast Gaussian,Applies a gaussian blur filter to the image @@ -24,7 +25,8 @@ Gaussian Noise,Generates a statistical noise with a normal (Gaussian) distributi Histogram,Provides a grayscale value distribution showing the frequency of occurrence of each gray value. Histogram Equalizer,Allows effective spreading out the intensity range of the image typically used to improve contrast Inpainting,Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood -Joint Bilateral Filter,Provides a edge-preserving denoising filter +Joint Bilateral Filter,Reduces image noise while preserving strong edges based on a guidance image +Label,Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels Laplacian,Applies a Laplace transform to an image MedianBlur,Reduces an image’s salt-and-pepper noise MinArea Rect,Finds the minimum area rotated rectangle typically used to draw bounding rectangle with minimum area @@ -36,6 +38,7 @@ Non-max Suppression,Enables selecting a single entity out of many overlapping on Normalize,Normalizes an image pixel’s range OSD (Polyline Line Text Rotated Rect Segmented Mask),Displays an overlay on the image of of different forms including polyline line text rotated rectangle segmented mask PadStack,Stacks several images into a tensor with border extension +PairwiseMatcher,Matches features computed separately (e.g. via the SIFT operator) in two images using the brute force method PillowResize,Changes the size and scale of an image using python-pillow algorithm RandomResizedCrop,Crops a random portion of an image and resizes it to a specified size. 
Reformat,Converts a planar image into non-planar and vice versa diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 69d72c9c..ec3667c6 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -50,7 +50,7 @@ CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find Where Are the Release Notes? ------------------ -An awesome product requires excellent support. CV-CUDA release notes can be found `here `_. +An awesome product requires excellent support. CV-CUDA release notes can be found `here `_. Where Can I Get Help? @@ -124,6 +124,7 @@ Copyright :maxdepth: 1 :hidden: + Beta.3 Beta.2 Beta.1 Beta diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index 2130f299..c37fd42f 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -32,7 +32,7 @@ Setup The following steps describe how to install cvcuda. Choose the installation method that meets your environment needs. -Download the cvcuda tar/deb package from `here `_ +Download the cvcuda tar/deb package from `here `_ * Tar File Installation diff --git a/docs/sphinx/relnotes/v0.5.0-beta.rst b/docs/sphinx/relnotes/v0.5.0-beta.rst new file mode 100644 index 00000000..bd363319 --- /dev/null +++ b/docs/sphinx/relnotes/v0.5.0-beta.rst @@ -0,0 +1,75 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.5.0-beta: + +Beta.3 +====== + +CV-CUDA 0.5.0 is a comprehensive update introducing new security, compliance, and performance enhancements, alongside bug fixes and new features. + +Release Highlights +------------------ + +CV-CUDA v0.5.0 includes significant improvements: + +* **New Operators**: + - FindHomography: Calculates a perspective transform from four pairs of the corresponding points + - Label: Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels + - PairwiseMatcher: Matches features computed separately (e.g. 
via the SIFT operator) in two images using the brute force method + +* **New Features**: + - Implemented Python class for ``TensorBatch``, a container type that can hold a list of non-uniformly shaped tensors + - Added support for RGBD image formats + - Enhanced documentation + +* **Bug Fixes**: + - Resolved memory leak in NvBlurBoxes + - Fixed segmentation fault issue in Python with certain imports + - Corrected typestr format issue in `__cuda_array_interface__` + - Addressed occasional hanging in OpBoxBlur on RGBA images + +Compatibility +------------- + +* Continues to support GPU Compute Capability: 7+.x +* Compatible with Ubuntu x86_64: 20.04, 22.04 +* CUDA Toolkit: 11.7+ (11.2+ for library build and run) +* GCC: 11.0+ (9.0 and 10.0 for APIs, with pre-built binary and run) +* Python: 3.7, 3.8, 3.10 + +Known Issues/Limitations +------------------------ + +* No new known issues or limitations are reported for this release. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/docs/sphinx/samples/python_samples/classification.rst b/docs/sphinx/samples/python_samples/classification.rst index 8356d46d..11961162 100644 --- a/docs/sphinx/samples/python_samples/classification.rst +++ b/docs/sphinx/samples/python_samples/classification.rst @@ -171,7 +171,7 @@ The top 5 classification results for the tabby_cat_tiger.jpg image is as follows user@machine:~/cvcuda/samples$ python3 classification/python/main.py -b 1 [perf_utils:85] 2023-07-27 22:27:17 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 22:27:17 INFO Using CV-CUDA version: 0.4.0-beta + [perf_utils:89] 2023-07-27 22:27:17 INFO Using CV-CUDA version: 0.5.0-beta [pipelines:35] 2023-07-27 22:27:17 INFO Using CVCUDA as preprocessor. [torch_utils:77] 2023-07-27 22:27:17 INFO Using torchnvjpeg as decoder. [pipelines:122] 2023-07-27 22:27:17 INFO Using CVCUDA as post-processor. diff --git a/docs/sphinx/samples/python_samples/object_detection.rst b/docs/sphinx/samples/python_samples/object_detection.rst index 3e2cad01..a2d05499 100644 --- a/docs/sphinx/samples/python_samples/object_detection.rst +++ b/docs/sphinx/samples/python_samples/object_detection.rst @@ -177,7 +177,7 @@ This sample takes as input one or more images or one video and generates the obj user@machine:~/cvcuda/samples$ python3 object_detection/python/main.py [perf_utils:85] 2023-07-27 23:15:34 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 23:15:34 INFO Using CV-CUDA version: 0.4.0-beta + [perf_utils:89] 2023-07-27 23:15:34 INFO Using CV-CUDA version: 0.5.0-beta [pipelines:30] 2023-07-27 23:15:36 INFO Using CVCUDA as preprocessor. [torch_utils:77] 2023-07-27 23:15:36 INFO Using torchnvjpeg as decoder. [torch_utils:151] 2023-07-27 23:15:36 INFO Using PyTorch/PIL as encoder.
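For reference, a minimal sketch of how the new Python bindings in this patch can be exercised, assuming PyTorch for device allocation and the cvcuda.as_tensor interop helper used by the samples (shapes and values are only illustrative):

    import cvcuda
    import torch

    # Wrap two identically shaped HWC CUDA buffers as CV-CUDA tensors.
    frame_a = cvcuda.as_tensor(torch.zeros(480, 640, 3, dtype=torch.uint8, device="cuda"), "HWC")
    frame_b = cvcuda.as_tensor(torch.zeros(480, 640, 3, dtype=torch.uint8, device="cuda"), "HWC")

    # cvcuda.stack combines HWC/CHW (or NHWC/NCHW) tensors into one N(HWC/CHW) tensor.
    batch = cvcuda.stack([frame_a, frame_b])

    # cvcuda.label returns (labels, count, stats); count and stats are None unless requested.
    mask = cvcuda.as_tensor(
        torch.randint(0, 2, (1, 480, 640, 1), dtype=torch.uint8, device="cuda"), "NHWC"
    )
    labels, count, stats = cvcuda.label(mask, connectivity=cvcuda.CONNECTIVITY_4_2D,
                                        count=True, stats=True)

Both calls follow the signatures defined in python/mod_cvcuda/OpStack.cpp and OpLabel.cpp below.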
diff --git a/docs/sphinx/samples/python_samples/segmentation.rst b/docs/sphinx/samples/python_samples/segmentation.rst index c062c03e..5dd4d194 100644 --- a/docs/sphinx/samples/python_samples/segmentation.rst +++ b/docs/sphinx/samples/python_samples/segmentation.rst @@ -182,7 +182,7 @@ This sample takes as input the one or more images or one video and generates the user@machine:~/cvcuda/samples$ python3 segmentation/python/main.py -b 5 -c __background__ -o /tmp -i assets/images/ [perf_utils:85] 2023-07-27 23:17:49 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 23:17:49 INFO Using CV-CUDA version: 0.4.0-beta + [perf_utils:89] 2023-07-27 23:17:49 INFO Using CV-CUDA version: 0.5.0-beta [pipelines:35] 2023-07-27 23:17:50 INFO Using CVCUDA as preprocessor. [torch_utils:60] 2023-07-27 23:17:50 INFO Found a total of 3 JPEG images. [torch_utils:77] 2023-07-27 23:17:50 INFO Using torchnvjpeg as decoder. diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt index 87ed892d..5db4089a 100644 --- a/python/mod_cvcuda/CMakeLists.txt +++ b/python/mod_cvcuda/CMakeLists.txt @@ -21,6 +21,14 @@ nvcv_python_add_module( OUTPUT_NAME cvcuda SOURCES Main.cpp + OpPairwiseMatcher.cpp + PairwiseMatcherType.cpp + NormType.cpp + OpStack.cpp + WorkspaceCache.cpp + OpLabel.cpp + LabelType.cpp + ConnectivityType.cpp OpFindContours.cpp OpHistogramEq.cpp OpOSD.cpp @@ -77,6 +85,7 @@ nvcv_python_add_module( OpGaussianNoise.cpp OpInpaint.cpp CvtColorUtil.cpp + OpFindHomography.cpp ) target_link_libraries(cvcuda_module_python @@ -86,7 +95,16 @@ target_link_libraries(cvcuda_module_python nvcv_util_compat cvcuda nvcv_python_common + nvcv_util + cuda -lrt ) +# use exports file to expose only the symbol dl-loaded by python, +# and nothing else. +target_link_options(cvcuda_module_python + PRIVATE + -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/exports.ldscript +) + set_target_properties(cvcuda_module_python PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/python) diff --git a/python/mod_cvcuda/ConnectivityType.cpp b/python/mod_cvcuda/ConnectivityType.cpp new file mode 100644 index 00000000..8cb5d41f --- /dev/null +++ b/python/mod_cvcuda/ConnectivityType.cpp @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConnectivityType.hpp" + +#include + +namespace cvcudapy { + +void ExportConnectivityType(py::module &m) +{ + py::enum_(m, "ConnectivityType", py::arithmetic()) + .value("CONNECTIVITY_4_2D", NVCV_CONNECTIVITY_4_2D) + .value("CONNECTIVITY_6_3D", NVCV_CONNECTIVITY_6_3D) + .value("CONNECTIVITY_8_2D", NVCV_CONNECTIVITY_8_2D) + .value("CONNECTIVITY_26_2D", NVCV_CONNECTIVITY_26_3D) + .export_values(); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/ConnectivityType.hpp b/python/mod_cvcuda/ConnectivityType.hpp new file mode 100644 index 00000000..a7cc8b0a --- /dev/null +++ b/python/mod_cvcuda/ConnectivityType.hpp @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_CONNECTIVITY_TYPE_HPP +#define NVCV_PYTHON_CONNECTIVITY_TYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportConnectivityType(py::module &m); + +} // namespace cvcudapy + +#endif // NVCV_PYTHON_CONNECTIVITY_TYPE_HPP diff --git a/python/mod_cvcuda/InterpolationType.cpp b/python/mod_cvcuda/InterpolationType.cpp index 65010bf3..7b6c0fa1 100644 --- a/python/mod_cvcuda/InterpolationType.cpp +++ b/python/mod_cvcuda/InterpolationType.cpp @@ -29,8 +29,10 @@ void ExportInterpolationType(py::module &m) .value("CUBIC", NVCV_INTERP_CUBIC, "Cubic interpolation") .value("AREA", NVCV_INTERP_AREA, "Area-based (resampling using pixels in area) interpolation") .value("LANCZOS", NVCV_INTERP_LANCZOS, "Lanczos interpolation") + .value("WARP_INVERSE_MAP", NVCV_WARP_INVERSE_MAP, "Inverse transformation") .value("HAMMING", NVCV_INTERP_HAMMING, "Hamming interpolation") - .value("BOX", NVCV_INTERP_BOX, "Box interpolation"); + .value("BOX", NVCV_INTERP_BOX, "Box interpolation") + .def("__or__", [](NVCVInterpolationType e1, NVCVInterpolationType e2) { return int(e1) | int(e2); }); } } // namespace cvcudapy diff --git a/python/mod_cvcuda/LabelType.cpp b/python/mod_cvcuda/LabelType.cpp new file mode 100644 index 00000000..04efc42d --- /dev/null +++ b/python/mod_cvcuda/LabelType.cpp @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "LabelType.hpp" + +#include + +namespace cvcudapy { + +void ExportLabelType(py::module &m) +{ + py::enum_(m, "LABEL", py::arithmetic()) + .value("FAST", NVCV_LABEL_FAST) + .value("SEQUENTIAL", NVCV_LABEL_SEQUENTIAL); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/LabelType.hpp b/python/mod_cvcuda/LabelType.hpp new file mode 100644 index 00000000..3260e2d2 --- /dev/null +++ b/python/mod_cvcuda/LabelType.hpp @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_LABEL_TYPE_HPP +#define NVCV_PYTHON_LABEL_TYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportLabelType(py::module &m); + +} // namespace cvcudapy + +#endif // NVCV_PYTHON_LABEL_TYPE_HPP diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp index 41034e5b..226336f2 100644 --- a/python/mod_cvcuda/Main.cpp +++ b/python/mod_cvcuda/Main.cpp @@ -18,10 +18,14 @@ #include "AdaptiveThresholdType.hpp" #include "BorderType.hpp" #include "ColorConversionCode.hpp" +#include "ConnectivityType.hpp" #include "InterpolationType.hpp" +#include "LabelType.hpp" #include "MorphologyType.hpp" +#include "NormType.hpp" #include "Operators.hpp" #include "OsdElement.hpp" +#include "PairwiseMatcherType.hpp" #include "RemapMapValueType.hpp" #include "SIFTFlagType.hpp" #include "ThresholdType.hpp" @@ -77,14 +81,19 @@ PYBIND11_MODULE(cvcuda, m) ExportMorphologyType(m); ExportColorConversionCode(m); ExportRemapMapValueType(m); - ExportBndBox(m); ExportBoxBlur(m); ExportOSD(m); ExportThresholdType(m); ExportAdaptiveThresholdType(m); ExportSIFTFlagType(m); + ExportConnectivityType(m); + ExportLabelType(m); + ExportNormType(m); + ExportPairwiseMatcherType(m); // CV-CUDA Operators + ExportOpPairwiseMatcher(m); + ExportOpLabel(m); ExportOpFindContours(m); ExportOpOSD(m); ExportOpHistogramEq(m); @@ -93,8 +102,8 @@ PYBIND11_MODULE(cvcuda, m) ExportOpMinMaxLoc(m); ExportOpHistogram(m); ExportOpMinAreaRect(m); - ExportOpBoxBlur(m); ExportOpBndBox(m); + ExportOpBoxBlur(m); ExportOpBrightnessContrast(m); ExportOpColorTwist(m); ExportOpRemap(m); @@ -131,4 +140,6 @@ PYBIND11_MODULE(cvcuda, m) ExportOpRandomResizedCrop(m); ExportOpGaussianNoise(m); ExportOpInpaint(m); + ExportOpStack(m); + ExportOpFindHomography(m); } diff --git a/python/mod_cvcuda/NormType.cpp b/python/mod_cvcuda/NormType.cpp new file mode 100644 index 00000000..0f53f820 --- /dev/null +++ b/python/mod_cvcuda/NormType.cpp @@ -0,0 +1,32 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "NormType.hpp" + +#include + +namespace cvcudapy { + +void ExportNormType(py::module &m) +{ + py::enum_(m, "Norm", py::arithmetic()) + .value("HAMMING", NVCV_NORM_HAMMING, "Hamming distance") + .value("L1", NVCV_NORM_L1, "Manhattan distance") + .value("L2", NVCV_NORM_L2, "Euclidean distance"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/NormType.hpp b/python/mod_cvcuda/NormType.hpp new file mode 100644 index 00000000..51d7b47e --- /dev/null +++ b/python/mod_cvcuda/NormType.hpp @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_NORM_TYPE_HPP +#define NVCV_PYTHON_NORM_TYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportNormType(py::module &m); + +} // namespace cvcudapy + +#endif // NVCV_PYTHON_NORM_TYPE_HPP diff --git a/python/mod_cvcuda/OpAdvCvtColor.cpp b/python/mod_cvcuda/OpAdvCvtColor.cpp index 0acbc637..f24337b7 100644 --- a/python/mod_cvcuda/OpAdvCvtColor.cpp +++ b/python/mod_cvcuda/OpAdvCvtColor.cpp @@ -88,14 +88,12 @@ Tensor AdvCvtColor(Tensor &input, NVCVColorConversionCode code, NVCVColorSpec sp { nvcv::TensorShape yuvCorrectedShape({outputShape[0], outputShape[1], outputShape[2], outputShape[3]}, "NHWC"); Tensor output = Tensor::Create(yuvCorrectedShape, input.dtype()); - std::cout << yuvCorrectedShape; return AdvCvtColorInto(output, input, code, spec, pstream); } else { nvcv::TensorShape yuvCorrectedShape({outputShape[0], outputShape[1], outputShape[2]}, "HWC"); Tensor output = Tensor::Create(yuvCorrectedShape, input.dtype()); - std::cout << yuvCorrectedShape; return AdvCvtColorInto(output, input, code, spec, pstream); } } diff --git a/python/mod_cvcuda/OpCvtColor.cpp b/python/mod_cvcuda/OpCvtColor.cpp index 051db2f9..3b8eb883 100644 --- a/python/mod_cvcuda/OpCvtColor.cpp +++ b/python/mod_cvcuda/OpCvtColor.cpp @@ -56,17 +56,25 @@ Tensor CvtColorInto(Tensor &output, Tensor &input, NVCVColorConversionCode code, Tensor CvtColor(Tensor &input, NVCVColorConversionCode code, std::optional pstream) { + int ndim = input.shape().size(); + auto layout = input.layout(); auto outFormat = GetOutputFormat(input.dtype(), code); - - if (input.shape().size() < 3) + auto out_dtype = outFormat.planeDataType(0).channelType(0); + if (ndim < 3) { throw std::runtime_error("Invalid input tensor shape"); } - int numImgs{static_cast(input.shape()[0])}; - nvcv::Size2D size{static_cast(input.shape()[2]), 
static_cast(input.shape()[1])}; - - Tensor output = Tensor::CreateForImageBatch(numImgs, size, outFormat); + std::array shape_data; + for (int d = 0; d < ndim; d++) + { + if (layout[d] == 'C') + shape_data[d] = outFormat.numChannels(); + else + shape_data[d] = input.shape()[d]; + } + nvcv::TensorShape out_shape(shape_data.data(), ndim, layout); + Tensor output = Tensor::Create(out_shape, out_dtype); return CvtColorInto(output, input, code, pstream); } diff --git a/python/mod_cvcuda/OpFindContours.cpp b/python/mod_cvcuda/OpFindContours.cpp index 8b50af16..5202905b 100644 --- a/python/mod_cvcuda/OpFindContours.cpp +++ b/python/mod_cvcuda/OpFindContours.cpp @@ -32,7 +32,10 @@ namespace cvcudapy { namespace { -Tensor FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::optional pstream) + +using TupleTensor2 = std::tuple; + +TupleTensor2 FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::optional pstream) { if (!pstream) { @@ -50,10 +53,10 @@ Tensor FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::o findContours->submit(pstream->cudaHandle(), input, points, numPoints); - return points; + return TupleTensor2(std::move(points), std::move(numPoints)); } -Tensor FindContours(Tensor &input, std::optional pstream) +TupleTensor2 FindContours(Tensor &input, std::optional pstream) { auto pointShape = nvcv::TensorShape{ {input.shape()[0], cvcuda::FindContours::MAX_TOTAL_POINTS, 2}, @@ -65,7 +68,7 @@ Tensor FindContours(Tensor &input, std::optional pstream) {input.shape()[0], cvcuda::FindContours::MAX_NUM_CONTOURS}, nvcv::TENSOR_NW }; - Tensor numPoints = Tensor::Create(countShape, nvcv::TYPE_U32); + Tensor numPoints = Tensor::Create(countShape, nvcv::TYPE_S32); return FindContoursInto(points, numPoints, input, pstream); } @@ -92,7 +95,12 @@ void ExportOpFindContours(py::module &m) stream (Stream, optional): CUDA Stream on which to perform the operation. Returns: - cvcuda.Tensor: The output tensor. + Tuple[Tensor, Tensor]: A tuple of two tensors. The first is the contour points tensor with dimensions NxMx2 - + where N is the batch size, M is the maximum number of points allowed. Each point of the contour is specified + in (x, y) coordinates. The second tensor specifies the number of valid contours per image and the number of + valid points in those contours. It has dimensions NxC where N is the batch size and C is the maximum number + of contours found. The actual number of contours can be calculated by counting the number of non-zero elements + in the C dimension and the actual number of points in each of those contours are the values stored in the C dimension. Caution: Restrictions to several arguments may apply. Check the C diff --git a/python/mod_cvcuda/OpFindHomography.cpp b/python/mod_cvcuda/OpFindHomography.cpp new file mode 100644 index 00000000..12553598 --- /dev/null +++ b/python/mod_cvcuda/OpFindHomography.cpp @@ -0,0 +1,330 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" +#include "WorkspaceCache.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cvcudapy { + +namespace { + +// Specialized class for cvcuda::FindHomography operator with a better cache Key. +// It allows for reusing an existing operator object from cache if its payload size is >= the required size. +// It also allows to fetch the biggest payload object to be reused while removing all others. +// This is more flexible than using the generic PyOperator class and its Key class. +class PyOpFindHomography : public nvcvpy::Container +{ +public: + // Define a Key class to be used by the cache to fetch similar items for potential reuse. + class Key : public nvcvpy::IKey + { + public: + // Arguments of the key constructor should match the corresponding cvcuda operator arguments. + Key(int batchSize, int maxNumPoints) {} + + size_t payloadSize() const + { + return 0; + } + + private: + size_t doGetHash() const override + { + return 0; + } + + // The comparison of keys is based on the payload size, the one in the cache is "that" key. + bool doIsCompatible(const nvcvpy::IKey &that_) const override + { + return dynamic_cast(&that_) != nullptr; + } + }; + + // Constructor instantiate the cache key and the operator object. + PyOpFindHomography(int batchSize, int maxNumPoints) + : m_key(batchSize, maxNumPoints) + , m_op(batchSize, maxNumPoints) + { + } + + inline void submit(cudaStream_t stream, const nvcv::Tensor &srcPts, const nvcv::Tensor &dstPts, + const nvcv::Tensor &models) + { + m_op(stream, srcPts, dstPts, models); + } + + inline void submit(cudaStream_t stream, const nvcv::TensorBatch &srcPts, const nvcv::TensorBatch &dstPts, + const nvcv::TensorBatch &models) + { + m_op(stream, srcPts, dstPts, models); + } + + // Required override to get the py object container. + py::object container() const override + { + return *this; + } + + // Required override to get the key as the base interface class. + const nvcvpy::IKey &key() const override + { + return m_key; + } + + // The static fetch function can be used to specialize the fetch of a specific object from the cache. + // It can be used to select the best object among a number of matched cache objects. + // It can also be used to remove other objects that are not needed in the cache anymore. + // Here, it fetches the biggest payload OP among cache items and remove all other OPs from the cache. + // It is ok to remove them since the biggest payload OP can be used to accomodate all of them, + // so they will never be reused and thus are no longer necessary. 
+ static std::shared_ptr fetch(std::vector> &cache) + { + assert(!cache.empty()); + + std::shared_ptr retItem = cache[0]; + size_t maxPayloadSize = 0; + + for (const auto &item : cache) + { + const Key &key = static_cast(item.get()->key()); + size_t keyPayloadSize = key.payloadSize(); + + if (keyPayloadSize > maxPayloadSize) + { + maxPayloadSize = keyPayloadSize; + retItem = item; + } + } + + cache.clear(); + + nvcvpy::Cache::removeAllNotInUseMatching(retItem.get()->key()); + + return retItem; + } + +private: + Key m_key; + cvcuda::FindHomography m_op; +}; + +Tensor FindHomographyInto(Tensor &models, Tensor &srcPts, Tensor &dstPts, std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + // Use CreateOperatorEx to use the extended create operator function passing the specialized PyOperator above + // as template type, instead of the regular cvcuda::OP class used in the CreateOperator function. + int32_t batchSize = srcPts.shape()[0]; + int32_t numPoints = srcPts.shape()[1]; + + auto findHomography = CreateOperatorEx(batchSize, numPoints); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {srcPts}); + guard.add(LockMode::LOCK_READ, {dstPts}); + guard.add(LockMode::LOCK_WRITE, {models}); + + findHomography->submit(pstream->cudaHandle(), srcPts, dstPts, models); + + return models; +} + +Tensor FindHomography(Tensor &srcPts, Tensor dstPts, std::optional pstream) +{ + Shape modelsShape(3); + modelsShape[0] = srcPts.shape()[0]; + modelsShape[1] = 3; + modelsShape[2] = 3; + + Tensor models = Tensor::Create(modelsShape, nvcv::TYPE_F32, nvcv::TENSOR_NHW); + + return FindHomographyInto(models, srcPts, dstPts, pstream); +} + +TensorBatch VarShapeFindHomographyInto(TensorBatch &models, TensorBatch &srcPts, TensorBatch &dstPts, + std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + // The same PyOpFindHomography class and CreateOperatorEx function can be used regardless of Tensors or VarShape. + int batchSize = srcPts.numTensors(); + int maxNumPoints = 0; + + for (int i = 0; i < batchSize; i++) + { + int numPoints = srcPts[i].shape()[1]; + if (numPoints > maxNumPoints) + maxNumPoints = numPoints; + } + + auto findHomography = CreateOperatorEx(batchSize, maxNumPoints); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {srcPts}); + guard.add(LockMode::LOCK_READ, {dstPts}); + guard.add(LockMode::LOCK_WRITE, {models}); + + findHomography->submit(pstream->cudaHandle(), srcPts, dstPts, models); + + return models; +} + +TensorBatch VarShapeFindHomography(TensorBatch &srcPts, TensorBatch &dstPts, std::optional pstream) +{ + TensorBatch models = TensorBatch::Create(srcPts.numTensors()); + + Shape modelsShape(3); + modelsShape[0] = 1; + modelsShape[1] = 3; + modelsShape[2] = 3; + + for (int i = 0; i < srcPts.numTensors(); i++) + { + Tensor outTensor = Tensor::Create(modelsShape, nvcv::TYPE_F32, nvcv::TENSOR_NHW); + models.pushBack(outTensor); + } + + return VarShapeFindHomographyInto(models, srcPts, dstPts, pstream); +} + +} // namespace + +void ExportOpFindHomography(py::module &m) +{ + using namespace pybind11::literals; + + py::options options; + options.disable_function_signatures(); + + m.def("findhomography", &FindHomography, "srcPts"_a, "dstPts"_a, "stream"_a = nullptr, R"pbdoc( + + cvcuda.findhomography(srcPts: nvcv.Tensor, dstPts: nvcv.Tensor, stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor + + Estimates the homography matrix between srcPts and dstPts coordinates on the given cuda stream. 
+ + See also: + Refer to the CV-CUDA C API reference for the Find Homography operator + for more details and usage examples. + + Args: + srcPts (Tensor): Input source coordinates tensor containing 2D coordinates in the source image. + dstPts (Tensor): Input destination coordinates tensor containing 2D coordinates in the target image. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.Tensor: The model homography matrix tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("findhomography_into", &FindHomographyInto, "models"_a, "srcPts"_a, "dstPts"_a, "stream"_a = nullptr, R"pbdoc( + + cvcuda.findhomography_into(models: nvcv.Tensor, srcPts: nvcv.Tensor, dstPts: nvcv.Tensor, stream: Optional[nvcv.cuda.Stream] = None) + + Executes the Find Homography operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Find Homography operator + for more details and usage examples. + + Args: + models (Tensor) : Output model tensor containing 3x3 homography matrices. + srcPts (Tensor): Input source coordinates tensor containing 2D coordinates in the source image. + dstPts (Tensor): Input destination coordinates tensor containing 2D coordinates in the target image. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.Tensor: The model homography matrix tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("findhomography", &VarShapeFindHomography, "srcPts"_a, "dstPts"_a, "stream"_a = nullptr, R"pbdoc( + + cvcuda.findhomography(srcPts: nvcv.TensorBatch, dstPts: nvcv.TensorBatch, stream: Optional[nvcv.cuda.Stream] = None) -> TensorBatch + + Executes the Find Homography operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Find Homography operator + for more details and usage examples. + + Args: + srcPts (TensorBatch): Input source coordinates tensor containing 2D coordinates in the source image. + dstPts (TensorBatch): Input destination coordinates tensor containing 2D coordinates in the target image. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.TensorBatch: The model homography matrix tensor batch. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("findhomography_into", &VarShapeFindHomographyInto, "models"_a, "srcPts"_a, "dstPts"_a, "stream"_a = nullptr, + R"pbdoc( + + cvcuda.findhomography(models: nvcv.TensorBatch, srcPts: nvcv.TensorBatch, dstPts: nvcv.TensorBatch, stream: Optional[nvcv.cuda.Stream] = None) + + Executes the Find Homography operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Find Homography operator + for more details and usage examples. + + Args: + models (TensorBatch) : Output model tensor containing 3x3 homography matrices. + srcPts (TensorBatch): Input source coordinates tensor containing 2D coordinates in the source image. + dstPts (TensorBatch): Input destination coordinates tensor containing 2D coordinates in the target image. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.TensorBatch: The model homography matrix tensor batch. + + + Caution: + Restrictions to several arguments may apply. 
Check the C + API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/OpLabel.cpp b/python/mod_cvcuda/OpLabel.cpp new file mode 100644 index 00000000..eb89d55b --- /dev/null +++ b/python/mod_cvcuda/OpLabel.cpp @@ -0,0 +1,210 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +namespace cvcudapy { + +using TupleTensor3 = std::tuple, std::optional>; + +namespace { + +TupleTensor3 LabelInto(Tensor &output, std::optional count, std::optional stats, Tensor &input, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels, std::optional bgLabel, + std::optional minThresh, std::optional maxThresh, std::optional minSize, + std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + auto op = CreateOperator(); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {input}); + guard.add(LockMode::LOCK_WRITE, {output}); + guard.add(LockMode::LOCK_NONE, {*op}); + + if (count) + { + guard.add(LockMode::LOCK_WRITE, {*count}); + } + if (stats) + { + guard.add(LockMode::LOCK_WRITE, {*stats}); + } + if (bgLabel) + { + guard.add(LockMode::LOCK_READ, {*bgLabel}); + } + if (minThresh) + { + guard.add(LockMode::LOCK_READ, {*minThresh}); + } + if (maxThresh) + { + guard.add(LockMode::LOCK_READ, {*maxThresh}); + } + if (minSize) + { + guard.add(LockMode::LOCK_READ, {*minSize}); + } + + op->submit(pstream->cudaHandle(), input, output, (bgLabel ? *bgLabel : nvcv::Tensor{nullptr}), + (minThresh ? *minThresh : nvcv::Tensor{nullptr}), (maxThresh ? *maxThresh : nvcv::Tensor{nullptr}), + (minSize ? *minSize : nvcv::Tensor{nullptr}), (count ? *count : nvcv::Tensor{nullptr}), + (stats ? 
*stats : nvcv::Tensor{nullptr}), connectivity, assignLabels); + + return TupleTensor3(std::move(output), count, stats); +} + +TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelType assignLabels, bool count, bool stats, + int maxLabels, std::optional bgLabel, std::optional minThresh, + std::optional maxThresh, std::optional minSize, std::optional pstream) +{ + constexpr nvcv::DataType outType = nvcv::TYPE_U32; + + auto inputData = input.exportData(); + if (!inputData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must be a valid CUDA strided tensor"); + } + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inputData); + if (!inAccess) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must be a valid image-based tensor"); + } + int numSamples = inAccess->numSamples(); + + Tensor output = Tensor::Create(input.shape(), outType); + std::optional countTensor, statsTensor; + + if (count) + { + countTensor = Tensor::Create({{numSamples}, "N"}, outType); + } + if (stats) + { + int numStats = 1; + if (connectivity == NVCV_CONNECTIVITY_4_2D || connectivity == NVCV_CONNECTIVITY_8_2D) + { + numStats = 6; + } + if (connectivity == NVCV_CONNECTIVITY_6_3D || connectivity == NVCV_CONNECTIVITY_26_3D) + { + numStats = 8; + } + + statsTensor = Tensor::Create( + { + {numSamples, maxLabels, numStats}, + "NMA" + }, + outType); + } + + return LabelInto(output, countTensor, statsTensor, input, connectivity, assignLabels, bgLabel, minThresh, maxThresh, + minSize, pstream); +} + +} // namespace + +void ExportOpLabel(py::module &m) +{ + using namespace pybind11::literals; + + m.def("label", &Label, "src"_a, "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, + py::kw_only(), "count"_a = false, "stats"_a = false, "max_labels"_a = 10000, "bg_label"_a = nullptr, + "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, "stream"_a = nullptr, R"pbdoc( + + Executes the Label operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Label operator for more details and usage examples. + + Args: + src (Tensor): Input tensor to label connected-component regions. + connectivity (cvcuda.ConnectivityType, optional): Choice to control connectivity of input elements, + default is cvcuda.CONNECTIVITY_4_2D. + assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned, + default is cvcuda.LABEL.FAST. + count (bool, optional): Use True to return the count of valid labeled regions. + stats (bool, optional): Use True to return the statistics of valid labeled regions. + max_labels (Number, optional): Maximum number of labels to compute statistics for, default is 10000. + bg_label (Tensor, optional): Background tensor to define input values to be considered background + labels and thus ignored. + min_thresh (Tensor, optional): Minimum threshold tensor to mask input values below it to be 0, and others 1. + max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1. + min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of + elements less than the minimum size. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + Tuple[Tensor, Tensor, Tensor]: A tuple wih output labels, count of regions and their statistics. + The count or stats tensors may be None if theirs arguments are False. 
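+
+        Example:
+            Assuming src is a U8 HWC or NHWC CUDA tensor (for instance wrapped from a torch
+            tensor with cvcuda.as_tensor), a call requesting the optional outputs looks like:
+
+                labels, count, stats = cvcuda.label(src, count=True, stats=True)
+
+            count and stats are returned as None when the corresponding flags are False.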
+ + Caution: + Restrictions to several arguments may apply. Check the C API references of the CV-CUDA operator. + )pbdoc"); + + m.def("label_into", &LabelInto, "dst"_a, "count"_a = nullptr, "stats"_a = nullptr, "src"_a, + "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, py::kw_only(), + "bg_label"_a = nullptr, "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, + "stream"_a = nullptr, R"pbdoc( + + Executes the Label operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Label operator for more details and usage examples. + + Args: + dst (Tensor): Output tensor with labels. + count (Tensor, optional): Output tensor with count number of labeled regions. + stats (Tensor, optional): Output tensor with statistics for each labeled region. + src (Tensor): Input tensor to label connected-component regions. + connectivity (cvcuda.ConnectivityType, optional): Choice to control connectivity of input elements, + default is cvcuda.CONNECTIVITY_4_2D. + assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned, + default is cvcuda.LABEL.FAST. + bg_label (Tensor, optional): Background tensor to define input values to be considered background + labels and thus ignored. + min_thresh (Tensor, optional): Minimum threshold tensor to mask input values below it to be 0, and others 1. + max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1. + min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of + elements less than the minimum size. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + Tuple[Tensor, Tensor, Tensor]: A tuple wih output labels, count of regions and their statistics. + The count or stats tensors may be None if theirs arguments are None. + + Caution: + Restrictions to several arguments may apply. Check the C API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/OpPairwiseMatcher.cpp b/python/mod_cvcuda/OpPairwiseMatcher.cpp new file mode 100644 index 00000000..2b9248d7 --- /dev/null +++ b/python/mod_cvcuda/OpPairwiseMatcher.cpp @@ -0,0 +1,204 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Operators.hpp" + +#include +#include +#include +#include +#include + +#include + +namespace cvcudapy { + +using TupleTensor3 = std::tuple, std::optional>; + +namespace { + +TupleTensor3 PairwiseMatcherInto(Tensor &matches, std::optional numMatches, std::optional distances, + Tensor &set1, Tensor &set2, std::optional numSet1, + std::optional numSet2, bool crossCheck, int matchesPerPoint, + std::optional normType, NVCVPairwiseMatcherType algoChoice, + std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + if (!normType) + { + normType = set1.dtype() == nvcv::TYPE_F32 ? NVCV_NORM_L2 : NVCV_NORM_HAMMING; + } + + auto op = CreateOperator(algoChoice); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {set1, set2}); + guard.add(LockMode::LOCK_WRITE, {matches}); + guard.add(LockMode::LOCK_NONE, {*op}); + + if (numSet1) + { + guard.add(LockMode::LOCK_READ, {*numSet1}); + } + if (numSet2) + { + guard.add(LockMode::LOCK_READ, {*numSet2}); + } + if (numMatches) + { + guard.add(LockMode::LOCK_WRITE, {*numMatches}); + } + if (distances) + { + guard.add(LockMode::LOCK_WRITE, {*distances}); + } + + op->submit(pstream->cudaHandle(), set1, set2, (numSet1 ? *numSet1 : nvcv::Tensor{nullptr}), + (numSet2 ? *numSet2 : nvcv::Tensor{nullptr}), matches, + (numMatches ? *numMatches : nvcv::Tensor{nullptr}), (distances ? *distances : nvcv::Tensor{nullptr}), + crossCheck, matchesPerPoint, *normType); + + return TupleTensor3(std::move(matches), numMatches, distances); +} + +TupleTensor3 PairwiseMatcher(Tensor &set1, Tensor &set2, std::optional numSet1, std::optional numSet2, + std::optional numMatches, bool distances, bool crossCheck, int matchesPerPoint, + std::optional normType, NVCVPairwiseMatcherType algoChoice, + std::optional pstream) +{ + nvcv::TensorShape set1Shape = set1.shape(); + nvcv::TensorShape set2Shape = set2.shape(); + + if (set1Shape.rank() != 3 || set2Shape.rank() != 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input sets must be rank-3 tensors"); + } + + int64_t numSamples = set1Shape[0]; + int64_t maxMatches = std::max(set1Shape[1], set2Shape[1]) * matchesPerPoint; + + if (!numMatches) + { + numMatches = crossCheck; + } + + // clang-format off + + Tensor matches = Tensor::Create({{numSamples, maxMatches, 2}, "NMA"}, nvcv::TYPE_S32); + + std::optional numMatchesTensor, distancesTensor; + + if (*numMatches) + { + numMatchesTensor = Tensor::Create({{numSamples}, "N"}, nvcv::TYPE_S32); + } + if (distances) + { + distancesTensor = Tensor::Create({{numSamples, maxMatches}, "NM"}, nvcv::TYPE_F32); + } + + // clang-format on + + return PairwiseMatcherInto(matches, numMatchesTensor, distancesTensor, set1, set2, numSet1, numSet2, crossCheck, + matchesPerPoint, normType, algoChoice, pstream); +} + +} // namespace + +void ExportOpPairwiseMatcher(py::module &m) +{ + using namespace pybind11::literals; + + m.def("match", &PairwiseMatcher, "set1"_a, "set2"_a, "num_set1"_a = nullptr, "num_set2"_a = nullptr, + "num_matches"_a = nullptr, "distances"_a = false, "cross_check"_a = false, "matches_per_point"_a = 1, + "norm_type"_a = nullptr, "algo_choice"_a = NVCV_BRUTE_FORCE, py::kw_only(), "stream"_a = nullptr, R"pbdoc( + + Executes the Pairwise matcher operation on the given CUDA stream. + + See also: + Refer to the CV-CUDA C API reference for this operator for more details and usage examples. + + Args: + set1 (Tensor): Input tensor with 1st set of points. + set2 (Tensor): Input tensor with 2nd set of points. 
+ num_set1 (Tensor, optional): Input tensor with number of valid points in the 1st set. If not provided, + consider the entire set1 containing valid points. + num_set2 (Tensor, optional): Input tensor with number of valid points in the 2nd set. If not provided, + consider the entire set2 containing valid points. + num_matches (bool, optional): Use True to return the number of matches. If not provided, it is set + to True if crossCheck=True and False otherwise. + distances (bool, optional): Use True to return the match distances. + cross_check (bool, optional): Use True to cross check best matches, a best match is only returned if it is + the best match (minimum distance) from 1st set to 2nd set and vice versa. + matches_per_point (Number, optional): Number of best matches to return per point. + norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2 + for float input and cvcuda.Norm.HAMMING for other input data types. + algo_choice (cvcuda.Matcher, optional): Choice of the algorithm to perform the match. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + Tuple[Tensor, Tensor, Tensor]: A tuple wih output matches, number of matches and their distances. + The number of matches tensor may be None if its argument is False. + The distances tensor may be None if its argument is False. + + Caution: + Restrictions to several arguments may apply. Check the C API references of the CV-CUDA operator. + )pbdoc"); + + m.def("match_into", &PairwiseMatcherInto, "matches"_a, "num_matches"_a = nullptr, "distances"_a = nullptr, "set1"_a, + "set2"_a, "num_set1"_a = nullptr, "num_set2"_a = nullptr, "cross_check"_a = false, "matches_per_point"_a = 1, + "norm_type"_a = nullptr, "algo_choice"_a = NVCV_BRUTE_FORCE, py::kw_only(), "stream"_a = nullptr, + R"pbdoc( + + Executes the Pairwise matcher operation on the given CUDA stream. + + See also: + Refer to the CV-CUDA C API reference for this operator for more details and usage examples. + + Args: + matches (Tensor): Output tensor with matches. + num_matches (Tensor, optional): Output tensor with number of matches. + distances (Tensor, optional): Output tensor with match distances. + set1 (Tensor): Input tensor with 1st set of points. + set2 (Tensor): Input tensor with 2nd set of points. + num_set1 (Tensor, optional): Input tensor with number of valid points in the 1st set. If not provided, + consider the entire set1 containing valid points. + num_set2 (Tensor, optional): Input tensor with number of valid points in the 2nd set. If not provided, + consider the entire set2 containing valid points. + cross_check (bool, optional): Use True to cross check best matches, a best match is only returned if it is + the best match (minimum distance) from 1st set to 2nd set and vice versa. + matches_per_point (Number, optional): Number of best matches to return per point. + norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2 + for float input and cvcuda.Norm.HAMMING for other input data types. + algo_choice (cvcuda.Matcher, optional): Choice of the algorithm to perform the match. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + Tuple[Tensor, Tensor, Tensor]: A tuple wih output matches, number of matches and their distances. + The number of matches tensor may be None if its argument is None. + The distances tensor may be None if its argument is None. 
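+
+        Example:
+            Assuming matches, num_matches and distances were pre-allocated with the shapes
+            described above (cvcuda.match allocates its outputs the same way), the in-place
+            form can be called as:
+
+                cvcuda.match_into(matches, num_matches, distances, set1, set2, cross_check=True)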
+ + Caution: + Restrictions to several arguments may apply. Check the C API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/OpPillowResize.cpp b/python/mod_cvcuda/OpPillowResize.cpp index 9c393398..75a5b908 100644 --- a/python/mod_cvcuda/OpPillowResize.cpp +++ b/python/mod_cvcuda/OpPillowResize.cpp @@ -16,6 +16,7 @@ */ #include "Operators.hpp" +#include "WorkspaceCache.hpp" #include #include @@ -47,52 +48,82 @@ class PyOpPillowResize : public nvcvpy::Container { public: // Arguments of the key constructor should match the corresponding cvcuda operator arguments. - Key(const nvcv::Size2D &maxSize, int maxBatchSize, nvcv::ImageFormat fmt) - : m_maxSize{maxSize} - , m_maxBatchSize{maxBatchSize} - , m_format{fmt} - { - } + Key() {} - // The payload size is an approximate function of the actual size of the payload. - // There is no need to be an exact value, it is just provide ordering inside cache. size_t payloadSize() const { - return m_maxSize.w * m_maxSize.h * m_maxBatchSize; + return 0; } private: - // The hash is based only on the image format used by the operator. - // (In addition to the OP type as defined by IKey). size_t doGetHash() const override { - return ComputeHash(m_format); + return 0; } // The comparison of keys is based on the payload size, the one in the cache is "that" key. bool doIsCompatible(const nvcvpy::IKey &that_) const override { - const Key &that = static_cast(that_); - return this->payloadSize() <= that.payloadSize(); + return dynamic_cast(&that_) != nullptr; } - - nvcv::Size2D m_maxSize; - int m_maxBatchSize; - nvcv::ImageFormat m_format; }; // Constructor instantiate the cache key and the operator object. - PyOpPillowResize(const nvcv::Size2D &maxSize, int maxBatchSize, nvcv::ImageFormat fmt) - : m_key(maxSize, maxBatchSize, fmt) - , m_op(maxSize, maxBatchSize, fmt) + PyOpPillowResize() + : m_key() + , m_op() { } - // The submit forwards its args to the OP's call operator. 
- template - void submit(AA &&...args) + inline void submit(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, nvcv::ImageFormat format, + NVCVInterpolationType interpolation) { - m_op(std::forward(args)...); + int batch_size = getBatchSize(in); + nvcv::Size2D in_size = imageSize(in); + nvcv::Size2D out_size = imageSize(out); + + auto req = m_op.getWorkspaceRequirements(batch_size, out_size, in_size, format); + auto ws = WorkspaceCache::instance().get(req, stream); + m_op(stream, ws.get(), in, out, interpolation); + } + + inline int getBatchSize(const nvcv::Tensor &tensor) + { + auto access = nvcv::TensorDataAccessStridedImagePlanar::Create(tensor.exportData()); + if (!access) + throw std::runtime_error("Incompatible tensor layout"); + + return access->numSamples(); + } + + static nvcv::Size2D imageSize(const nvcv::Tensor &tensor) + { + auto access = nvcv::TensorDataAccessStridedImagePlanar::Create(tensor.exportData()); + if (!access) + throw std::runtime_error("Incompatible tensor layout"); + + return access->size(); + } + + inline void submit(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, + const NVCVInterpolationType interpolation) + { + assert(in.numImages() == out.numImages()); + auto in_sizes = imageSizes(in); + auto out_sizes = imageSizes(out); + int N = in_sizes.size(); + auto req = m_op.getWorkspaceRequirements(N, in_sizes.data(), out_sizes.data(), in.uniqueFormat()); + auto ws = WorkspaceCache::instance().get(req, stream); + m_op(stream, ws.get(), in, out, interpolation); + } + + static std::vector imageSizes(const nvcv::ImageBatchVarShape &batch) + { + std::vector sizes(batch.numImages()); + + for (size_t i = 0; i < sizes.size(); i++) sizes[i] = batch[i].size(); + + return sizes; } // Required override to get the py object container. @@ -158,21 +189,16 @@ Tensor PillowResizeInto(Tensor &output, Tensor &input, nvcv::ImageFormat format, throw std::runtime_error("Incompatible input/output tensor layout"); } - nvcv::Size2D maxSize{std::max(in_access->numCols(), out_access->numCols()), - std::max(in_access->numRows(), out_access->numRows())}; - - int maxBatchSize = static_cast(in_access->numSamples()); - // Use CreateOperatorEx to use the extended create operator function passing the specialized PyOperator above // as template type, instead of the regular cvcuda::OP class used in the CreateOperator function. - auto pillowResize = CreateOperatorEx(maxSize, maxBatchSize, format); + auto pillowResize = CreateOperatorEx(); ResourceGuard guard(*pstream); guard.add(LockMode::LOCK_READ, {input}); guard.add(LockMode::LOCK_WRITE, {output}); guard.add(LockMode::LOCK_WRITE, {*pillowResize}); - pillowResize->submit(pstream->cudaHandle(), input, output, interp); + pillowResize->submit(pstream->cudaHandle(), input, output, format, interp); return output; } @@ -193,15 +219,8 @@ ImageBatchVarShape VarShapePillowResizeInto(ImageBatchVarShape &output, ImageBat pstream = Stream::Current(); } - nvcv::Size2D maxSrcSize = input.maxSize(); - nvcv::Size2D maxDstSize = output.maxSize(); - - nvcv::Size2D maxSize{std::max(maxSrcSize.w, maxDstSize.w), std::max(maxSrcSize.h, maxDstSize.h)}; - - int maxBatchSize = static_cast(input.capacity()); - // The same PyOpPillowResize class and CreateOperatorEx function can be used regardless of Tensors or VarShape. 
- auto pillowResize = CreateOperatorEx(maxSize, maxBatchSize, input.uniqueFormat()); + auto pillowResize = CreateOperatorEx(); ResourceGuard guard(*pstream); guard.add(LockMode::LOCK_READ, {input}); diff --git a/python/mod_cvcuda/OpStack.cpp b/python/mod_cvcuda/OpStack.cpp new file mode 100644 index 00000000..41c7b891 --- /dev/null +++ b/python/mod_cvcuda/OpStack.cpp @@ -0,0 +1,179 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" + +#include +#include +#include +#include +#include +#include + +namespace cvcudapy { + +namespace { + +void checkTensorList(std::vector &tensorList, int64_t (&outputShape)[4], nvcv::TensorLayout &layout, + nvcv::DataType &dtype) +{ + int32_t totalTensors = 0; + + if (tensorList.size() == 0) + { + throw std::runtime_error("Invalid input tensor list"); + } + + for (auto &tensor : tensorList) + { + if (tensor.shape().rank() < 3 || tensor.shape().rank() > 4) + { + throw std::runtime_error("Invalid input tensor shape"); + } + if (tensor.shape().rank() == 4) + { + totalTensors += tensor.shape()[0]; + outputShape[1] = tensor.shape()[1]; + outputShape[2] = tensor.shape()[2]; + outputShape[3] = tensor.shape()[3]; + } + else + { + totalTensors++; + outputShape[1] = tensor.shape()[0]; + outputShape[2] = tensor.shape()[1]; + outputShape[3] = tensor.shape()[2]; + } + + if (tensor.shape().layout() == nvcv::TENSOR_CHW || tensor.shape().layout() == nvcv::TENSOR_NCHW) + layout = nvcv::TENSOR_NCHW; + else + layout = nvcv::TENSOR_NHWC; + } + outputShape[0] = totalTensors; // set N to total number of tensors + dtype = tensorList[0].dtype(); +} + +Tensor StackIntoInternal(Tensor &output, std::vector &tensorList, std::optional pstream, + int32_t numberOfTensors) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + nvcvpy::TensorBatch inTensorBatch = nvcvpy::TensorBatch::Create(numberOfTensors); + + for (auto &tensor : tensorList) + { + inTensorBatch.pushBack(tensor); + } + + auto op = CreateOperator(); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {inTensorBatch}); + guard.add(LockMode::LOCK_WRITE, {output}); + guard.add(LockMode::LOCK_NONE, {*op}); + op->submit(pstream->cudaHandle(), inTensorBatch, output); + return std::move(output); +} + +Tensor StackInto(Tensor &output, std::vector &tensorList, std::optional pstream) +{ + int64_t outputShape[4] = {}; // NCHW/NHWC + nvcv::TensorLayout layout = nvcv::TENSOR_CHW; + nvcv::DataType dtype; + + checkTensorList(tensorList, outputShape, layout, dtype); + + if (output.shape().layout() != nvcv::TENSOR_NCHW && output.shape().layout() != nvcv::TENSOR_NHWC) + throw std::runtime_error("Invalid output tensor shape"); + + if (output.shape()[0] != outputShape[0]) + throw std::runtime_error("Invalid output tensor shape"); + + StackIntoInternal(output, tensorList, pstream, outputShape[0]); + return std::move(output); +} + +Tensor 
Stack(std::vector &tensorList, std::optional pstream) +{ + int64_t outputShape[4] = {}; // NCHW/NHWC + nvcv::TensorLayout layout = nvcv::TENSOR_CHW; + nvcv::DataType dtype; + checkTensorList(tensorList, outputShape, layout, dtype); + + //create new output tensor + Tensor output = Tensor::Create( + { + {outputShape[0], outputShape[1], outputShape[2], outputShape[3]}, + layout + }, + dtype); + return StackIntoInternal(output, tensorList, pstream, outputShape[0]); +} + +} // namespace + +void ExportOpStack(py::module &m) +{ + using namespace pybind11::literals; + + m.def("stack", &Stack, "src"_a, py::kw_only(), "stream"_a = nullptr, R"pbdoc( + + Executes the Stack operation on the given cuda stream. This takes input tensors and combines them into an N(HWC/CHW) tensor. + + See also: + Refer to the CV-CUDA C API reference for the Stack operator + for more details and usage examples. + + Args: + src (Tensor List): Input tensors, each containing one or more samples. All tensors must be N(HWC/CHW) or HWC/CHW and have the same data type and shape. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.Tensor: The output tensor containing the stacked input tensors. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("stack_into", &StackInto, "dst"_a, "src"_a, py::kw_only(), "stream"_a = nullptr, R"pbdoc( + + Executes the Stack operation on the given cuda stream. This takes input tensors and combines them into an N(HWC/CHW) tensor. + + See also: + Refer to the CV-CUDA C API reference for the Stack operator + for more details and usage examples. + + Args: + dst (Tensor): Output N(CHW/HWC) tensor to store the result of the operation. + src (Tensor List): Input tensors, each containing one or more samples. All tensors must be N(HWC/CHW) or HWC/CHW and have the same data type and shape. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + None + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/Operators.hpp b/python/mod_cvcuda/Operators.hpp index bd28d41b..2b8886b6 100644 --- a/python/mod_cvcuda/Operators.hpp +++ b/python/mod_cvcuda/Operators.hpp @@ -17,12 +17,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include @@ -32,6 +34,7 @@ namespace nvcvpy::util { namespace cvcudapy { +using nvcvpy::Array; using nvcvpy::CreateNVCVTensorShape; using nvcvpy::CreateShape; using nvcvpy::Image; @@ -41,6 +44,7 @@ using nvcvpy::ResourceGuard; using nvcvpy::Shape; using nvcvpy::Stream; using nvcvpy::Tensor; +using nvcvpy::TensorBatch; namespace util = nvcvpy::util; namespace py = ::pybind11; @@ -91,6 +95,10 @@ void ExportOpInpaint(py::module &m); void ExportOpHistogramEq(py::module &m); void ExportOpMinAreaRect(py::module &m); void ExportOpAdvCvtColor(py::module &m); +void ExportOpLabel(py::module &m); +void ExportOpPairwiseMatcher(py::module &m); +void ExportOpStack(py::module &m); +void ExportOpFindHomography(py::module &m); // Helper class that serves as generic python-side operator class.
// OP: native operator class diff --git a/python/mod_cvcuda/OsdElement.cpp b/python/mod_cvcuda/OsdElement.cpp index bf787218..e47730fe 100644 --- a/python/mod_cvcuda/OsdElement.cpp +++ b/python/mod_cvcuda/OsdElement.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -91,57 +92,12 @@ static NVCVColorRGBA pytocolor(py::tuple color) } // namespace -void ExportBndBox(py::module &m) -{ - using namespace py::literals; - - py::class_(m, "BndBoxI") - .def(py::init([]() { return NVCVBndBoxI{}; })) - .def(py::init( - [](py::tuple box, int thickness, py::tuple borderColor, py::tuple fillColor) - { - NVCVBndBoxI bndbox; - bndbox.box = pytobox(box); - bndbox.thickness = thickness; - bndbox.borderColor = pytocolor(borderColor); - bndbox.fillColor = pytocolor(fillColor); - return bndbox; - }), - "box"_a, "thickness"_a, "borderColor"_a, "fillColor"_a) - .def_readonly("box", &NVCVBndBoxI::box, "Tuple describing a box: x-coordinate, y-coordinate, width, height.") - .def_readonly("thickness", &NVCVBndBoxI::thickness, "Border thickness of bounding box.") - .def_readonly("borderColor", &NVCVBndBoxI::borderColor, "Border color of bounding box.") - .def_readonly("fillColor", &NVCVBndBoxI::fillColor, "Filled color of bounding box."); - - py::class_(m, "BndBoxesI") - .def(py::init([]() { return NVCVBndBoxesI{}; })) - .def(py::init( - [](std::vector numBoxes_vec, std::vector bndboxes_vec) - { - NVCVBndBoxesI bndboxes; - - bndboxes.batch = numBoxes_vec.size(); - bndboxes.numBoxes = new int[bndboxes.batch]; - memcpy(bndboxes.numBoxes, numBoxes_vec.data(), numBoxes_vec.size() * sizeof(int)); - - int total_box_num = bndboxes_vec.size(); - bndboxes.boxes = new NVCVBndBoxI[total_box_num]; - memcpy(bndboxes.boxes, bndboxes_vec.data(), bndboxes_vec.size() * sizeof(NVCVBndBoxI)); - - return bndboxes; - }), - "numBoxes"_a, "boxes"_a) - .def_readonly("batch", &NVCVBndBoxesI::batch, "Number of images in the image batch.") - .def_readonly("numBoxes", &NVCVBndBoxesI::numBoxes, "Number array of bounding boxes for image batch.") - .def_readonly("boxes", &NVCVBndBoxesI::boxes, "Bounding box array for image batch, \ref NVCVBndBoxI."); -} - void ExportBoxBlur(py::module &m) { using namespace py::literals; + using namespace cvcuda::priv; py::class_(m, "BlurBoxI") - .def(py::init([]() { return NVCVBlurBoxI{}; })) .def(py::init( [](py::tuple box, int kernelSize) { @@ -154,98 +110,65 @@ void ExportBoxBlur(py::module &m) .def_readonly("box", &NVCVBlurBoxI::box, "Tuple describing a box: x-coordinate, y-coordinate, width, height.") .def_readonly("kernelSize", &NVCVBlurBoxI::kernelSize, "Kernel sizes of mean filter."); - py::class_(m, "BlurBoxesI") - .def(py::init([]() { return NVCVBlurBoxesI{}; })) - .def(py::init( - [](std::vector numBoxes_vec, std::vector blurboxes_vec) - { - NVCVBlurBoxesI blurboxes; - - blurboxes.batch = numBoxes_vec.size(); - blurboxes.numBoxes = new int[blurboxes.batch]; - memcpy(blurboxes.numBoxes, numBoxes_vec.data(), numBoxes_vec.size() * sizeof(int)); - - int total_box_num = blurboxes_vec.size(); - blurboxes.boxes = new NVCVBlurBoxI[total_box_num]; - memcpy(blurboxes.boxes, blurboxes_vec.data(), blurboxes_vec.size() * sizeof(NVCVBlurBoxI)); - - return blurboxes; - }), - "numBoxes"_a, "boxes"_a) - .def_readonly("batch", &NVCVBlurBoxesI::batch, "Number of images in the image batch.") - .def_readonly("numBoxes", &NVCVBlurBoxesI::numBoxes, "Number array of blurring boxes for image batch.") - .def_readonly("boxes", &NVCVBlurBoxesI::boxes, "Blurring box array for image batch, \ref 
NVCVBlurBoxI."); + py::class_>(m, "BlurBoxesI") + .def(py::init([](const std::vector> &blurboxes_vec) + { return std::make_shared(blurboxes_vec); }), + "boxes"_a); } void ExportOSD(py::module &m) { using namespace py::literals; + using namespace cvcuda::priv; - py::class_(m, "Label") - .def(py::init([]() { return NVCVText{}; })) + py::class_(m, "BndBoxI") .def(py::init( - [](const char *utf8Text, int32_t fontSize, const char *fontName, py::tuple tlPos, py::tuple fontColor, - py::tuple bgColor) + [](py::tuple box, int thickness, py::tuple borderColor, py::tuple fillColor) { - NVCVText label; - label.utf8Text = (const char *)malloc(strlen(utf8Text)); - memcpy(const_cast(label.utf8Text), utf8Text, strlen(utf8Text) + 1); - label.fontName = (const char *)malloc(strlen(fontName)); - memcpy(const_cast(label.fontName), fontName, strlen(fontName) + 1); - label.fontSize = fontSize; - label.tlPos = pytopoint(tlPos); - label.fontColor = pytocolor(fontColor); - label.bgColor = pytocolor(bgColor); - return label; + NVCVBndBoxI bndbox; + bndbox.box = pytobox(box); + bndbox.thickness = thickness; + bndbox.borderColor = pytocolor(borderColor); + bndbox.fillColor = pytocolor(fillColor); + return bndbox; }), - "utf8Text"_a, "fontSize"_a, py::arg("fontName") = "DejaVuSansMono", "tlPos"_a, "fontColor"_a, "bgColor"_a) - .def_readonly("utf8Text", &NVCVText::utf8Text, "Label text in utf8 format.") - .def_readonly("fontSize", &NVCVText::fontSize, "Font size of label text.") - .def_readonly("fontName", &NVCVText::fontName, "Font name of label text, default: DejaVuSansMono.") - .def_readonly("tlPos", &NVCVText::tlPos, "Top-left corner point for label text.") - .def_readonly("fontColor", &NVCVText::fontColor, "Font color of label text.") - .def_readonly("bgColor", &NVCVText::bgColor, "Back color of label text."); + "box"_a, "thickness"_a, "borderColor"_a, "fillColor"_a) + .def_readonly("box", &NVCVBndBoxI::box, "Tuple describing a box: x-coordinate, y-coordinate, width, height.") + .def_readonly("thickness", &NVCVBndBoxI::thickness, "Border thickness of bounding box.") + .def_readonly("borderColor", &NVCVBndBoxI::borderColor, "Border color of bounding box.") + .def_readonly("fillColor", &NVCVBndBoxI::fillColor, "Filled color of bounding box."); + + py::class_>(m, "BndBoxesI") + .def(py::init([](const std::vector> &bndboxes_vec) + { return std::make_shared(bndboxes_vec); }), + "boxes"_a); + + py::class_>(m, "Label") + .def(py::init( + [](const char *utf8Text, int32_t fontSize, const char *fontName, py::tuple tlPos, py::tuple fontColor, + py::tuple bgColor) { + return NVCVText(utf8Text, fontSize, fontName, pytopoint(tlPos), pytocolor(fontColor), + pytocolor(bgColor)); + }), + "utf8Text"_a, "fontSize"_a, py::arg("fontName") = "DejaVuSansMono", "tlPos"_a, "fontColor"_a, "bgColor"_a); py::class_(m, "Segment") - .def(py::init([]() { return NVCVSegment{}; })) .def(py::init( [](py::tuple box, int32_t thickness, py::array_t segArray, float segThreshold, py::tuple borderColor, py::tuple segColor) { - NVCVSegment segment; - segment.box = pytobox(box); - segment.thickness = thickness; - py::buffer_info hSeg = segArray.request(); if (hSeg.ndim != 2) { throw std::runtime_error("segArray dims must be 2!"); } - segment.segWidth = hSeg.shape[0]; - segment.segHeight = hSeg.shape[1]; - - checkRuntime(cudaMalloc(&segment.dSeg, segment.segWidth * segment.segHeight * sizeof(float))); - checkRuntime(cudaMemcpy(segment.dSeg, hSeg.ptr, - segment.segWidth * segment.segHeight * sizeof(float), - cudaMemcpyHostToDevice)); - - 
segment.segThreshold = segThreshold; - segment.borderColor = pytocolor(borderColor); - segment.segColor = pytocolor(segColor); - return segment; + + return NVCVSegment(pytobox(box), thickness, (float *)hSeg.ptr, hSeg.shape[0], hSeg.shape[1], + segThreshold, pytocolor(borderColor), pytocolor(segColor)); }), - "box"_a, "thickness"_a, "segArray"_a, "segThreshold"_a, "borderColor"_a, "segColor"_a) - .def_readonly("box", &NVCVSegment::box, "Bounding box of segment.") - .def_readonly("thickness", &NVCVSegment::thickness, "Line thickness of segment outter rect.") - .def_readonly("dSeg", &NVCVSegment::dSeg, "Device pointer for segment mask.") - .def_readonly("segWidth", &NVCVSegment::segWidth, "Segment mask width.") - .def_readonly("segHeight", &NVCVSegment::segHeight, "Segment mask height.") - .def_readonly("segThreshold", &NVCVSegment::segThreshold, "Segment threshold.") - .def_readonly("borderColor", &NVCVSegment::borderColor, "Line color of segment outter rect.") - .def_readonly("segColor", &NVCVSegment::segColor, "Segment mask color."); + "box"_a, "thickness"_a, "segArray"_a, "segThreshold"_a, "borderColor"_a, "segColor"_a); py::class_(m, "Point") - .def(py::init([]() { return NVCVPoint{}; })) .def(py::init( [](py::tuple centerPos, int32_t radius, py::tuple color) { @@ -261,7 +184,6 @@ void ExportOSD(py::module &m) .def_readonly("color", &NVCVPoint::color, "Point color."); py::class_(m, "Line") - .def(py::init([]() { return NVCVLine{}; })) .def(py::init( [](py::tuple pos0, py::tuple pos1, int32_t thickness, py::tuple color, bool interpolation) { @@ -281,46 +203,22 @@ void ExportOSD(py::module &m) .def_readonly("interpolation", &NVCVLine::interpolation, "Default: true."); py::class_(m, "PolyLine") - .def(py::init([]() { return NVCVPolyLine{}; })) .def(py::init( [](py::array_t points, int32_t thickness, bool isClosed, py::tuple borderColor, py::tuple fillColor, bool interpolation) { - NVCVPolyLine pl; - py::buffer_info points_info = points.request(); if (points_info.ndim != 2 || points_info.shape[1] != 2) { throw std::runtime_error("points dims and shape[1] must be 2!"); } - pl.numPoints = points_info.shape[0]; - pl.hPoints = new int[pl.numPoints * 2]; - checkRuntime(cudaMalloc(&pl.dPoints, 2 * pl.numPoints * sizeof(int))); - - memcpy(pl.hPoints, points_info.ptr, 2 * pl.numPoints * sizeof(int)); - checkRuntime(cudaMemcpy(pl.dPoints, points_info.ptr, 2 * pl.numPoints * sizeof(int), - cudaMemcpyHostToDevice)); - - pl.thickness = thickness; - pl.isClosed = isClosed; - pl.borderColor = pytocolor(borderColor); - pl.fillColor = pytocolor(fillColor); - pl.interpolation = interpolation; - return pl; + return NVCVPolyLine((int32_t *)points_info.ptr, points_info.shape[0], thickness, isClosed, + pytocolor(borderColor), pytocolor(fillColor), interpolation); }), - "points"_a, "thickness"_a, "isClosed"_a, "borderColor"_a, "fillColor"_a, py::arg("interpolation") = true) - .def_readonly("hPoints", &NVCVPolyLine::hPoints, "Host pointer for polyline points.") - .def_readonly("dPoints", &NVCVPolyLine::dPoints, "Device pointer for polyline points.") - .def_readonly("numPoints", &NVCVPolyLine::numPoints, "Number of polyline points.") - .def_readonly("thickness", &NVCVPolyLine::thickness, "Polyline thickness.") - .def_readonly("isClosed", &NVCVPolyLine::isClosed, "Connect p(0) to p(n-1) or not.") - .def_readonly("borderColor", &NVCVPolyLine::borderColor, "Line color of polyline border.") - .def_readonly("fillColor", &NVCVPolyLine::fillColor, "Fill color of poly fill area.") - .def_readonly("interpolation", 
&NVCVPolyLine::interpolation, "Default: true."); + "points"_a, "thickness"_a, "isClosed"_a, "borderColor"_a, "fillColor"_a, py::arg("interpolation") = true); py::class_(m, "RotatedBox") - .def(py::init([]() { return NVCVRotatedBox{}; })) .def(py::init( [](py::tuple centerPos, int32_t width, int32_t height, float yaw, int32_t thickness, py::tuple borderColor, py::tuple bgColor, bool interpolation) @@ -348,7 +246,6 @@ void ExportOSD(py::module &m) .def_readonly("interpolation", &NVCVRotatedBox::interpolation, "Default: false."); py::class_(m, "Circle") - .def(py::init([]() { return NVCVCircle{}; })) .def(py::init( [](py::tuple centerPos, int32_t radius, int32_t thickness, py::tuple borderColor, py::tuple bgColor) { @@ -368,7 +265,6 @@ void ExportOSD(py::module &m) .def_readonly("bgColor", &NVCVCircle::bgColor, "Circle filled color."); py::class_(m, "Arrow") - .def(py::init([]() { return NVCVArrow{}; })) .def(py::init( [](py::tuple pos0, py::tuple pos1, int32_t arrowSize, int32_t thickness, py::tuple color, bool interpolation) @@ -396,144 +292,88 @@ void ExportOSD(py::module &m) .value("HHMMSS", NVCVClockFormat::HHMMSS); py::class_(m, "Clock") - .def(py::init([]() { return NVCVClock{}; })) .def(py::init( [](NVCVClockFormat clockFormat, long time, int32_t fontSize, const char *font, py::tuple tlPos, - py::tuple fontColor, py::tuple bgColor) - { - NVCVClock clock; - clock.clockFormat = clockFormat; - clock.time = time; - clock.fontSize = fontSize; - clock.font = (const char *)malloc(strlen(font)); - memcpy(const_cast(clock.font), font, strlen(font) + 1); - clock.tlPos = pytopoint(tlPos); - clock.fontColor = pytocolor(fontColor); - clock.bgColor = pytocolor(bgColor); - return clock; + py::tuple fontColor, py::tuple bgColor) { + return NVCVClock(clockFormat, time, fontSize, font, pytopoint(tlPos), pytocolor(fontColor), + pytocolor(bgColor)); }), "clockFormat"_a, "time"_a, "fontSize"_a, py::arg("font") = "DejaVuSansMono", "tlPos"_a, "fontColor"_a, - "bgColor"_a) - .def_readonly("clockFormat", &NVCVClock::clockFormat, "Pre-defined clock format.") - .def_readonly("time", &NVCVClock::time, "Clock time.") - .def_readonly("fontSize", &NVCVClock::fontSize, "Font size.") - .def_readonly("font", &NVCVClock::font, "Font name, default: DejaVuSansMono.") - .def_readonly("tlPos", &NVCVClock::tlPos, "Top-left corner point of the text.") - .def_readonly("fontColor", &NVCVClock::fontColor, "Font color of the text.") - .def_readonly("bgColor", &NVCVClock::bgColor, "Background color of text box."); - - py::class_(m, "Element") - .def(py::init([]() { return NVCVElement{}; })) - .def(py::init( - [](NVCVOSDType type, void *data) - { - NVCVElement element; - element.type = type; - element.data = data; - return element; - }), - "type"_a, "data"_a) - .def_readonly("type", &NVCVElement::type, "Element type.") - .def_readonly("data", &NVCVElement::data, "Element data pointer."); + "bgColor"_a); - py::class_(m, "Elements") - .def(py::init([]() { return NVCVElements{}; })) + py::class_>(m, "Elements") .def(py::init( - [](std::vector numElements_vec, py::tuple elements_list) + [](const std::vector &elements_list_vec) { - NVCVElements ctx; - - ctx.batch = numElements_vec.size(); - ctx.numElements = new int[ctx.batch]; - memcpy(ctx.numElements, numElements_vec.data(), numElements_vec.size() * sizeof(int)); - - int total_element_num = elements_list.size(); - ctx.elements = new NVCVElement[total_element_num]; - - for (size_t i = 0; i < elements_list.size(); ++i) + std::vector>> elements_vec; + for (const auto &elements_list : 
elements_list_vec) { - if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_RECT; - ctx.elements[i].data = new NVCVBndBoxI(); - auto bndbox = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &bndbox, sizeof(NVCVBndBoxI)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_TEXT; - ctx.elements[i].data = new NVCVText(); - auto text = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &text, sizeof(NVCVText)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_SEGMENT; - ctx.elements[i].data = new NVCVSegment(); - auto segment = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &segment, sizeof(NVCVSegment)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_POINT; - ctx.elements[i].data = new NVCVPoint(); - auto point = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &point, sizeof(NVCVPoint)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_LINE; - ctx.elements[i].data = new NVCVLine(); - auto line = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &line, sizeof(NVCVLine)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_POLYLINE; - ctx.elements[i].data = new NVCVPolyLine(); - auto pl = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &pl, sizeof(NVCVPolyLine)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_ROTATED_RECT; - ctx.elements[i].data = new NVCVRotatedBox(); - auto pl = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &pl, sizeof(NVCVRotatedBox)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_CIRCLE; - ctx.elements[i].data = new NVCVCircle(); - auto circle = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &circle, sizeof(NVCVCircle)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_ARROW; - ctx.elements[i].data = new NVCVArrow(); - auto arrow = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &arrow, sizeof(NVCVArrow)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_CLOCK; - ctx.elements[i].data = new NVCVClock(); - auto clock = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &clock, sizeof(NVCVClock)); - } - else + std::vector> curVec; + for (size_t i = 0; i < elements_list.size(); ++i) { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_NONE; + std::shared_ptr element; + if (pybind11::isinstance(elements_list[i])) + { + auto rect = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_RECT, &rect); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto text = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_TEXT, &text); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto segment = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_SEGMENT, &segment); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto point = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_POINT, &point); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto line = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_LINE, 
&line); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto pl = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_POLYLINE, &pl); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto rb = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_ROTATED_RECT, &rb); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto circle = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_CIRCLE, &circle); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto arrow = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_ARROW, &arrow); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto clock = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_CLOCK, &clock); + } + else + { + element = std::make_shared(NVCVOSDType::NVCV_OSD_NONE, nullptr); + } + curVec.emplace_back(element); } + elements_vec.emplace_back(curVec); } - return ctx; + return std::make_shared(elements_vec); }), - "numElements"_a, "elements"_a) - .def_readonly("batch", &NVCVElements::batch, "Number of images in the image batch.") - .def_readonly("numElements", &NVCVElements::numElements, "Number array of OSD elements for image batch.") - .def_readonly("elements", &NVCVElements::elements, "OSD elements array for image batch, \ref NVCVElement."); + "elements"_a); } } // namespace cvcudapy diff --git a/python/mod_cvcuda/OsdElement.hpp b/python/mod_cvcuda/OsdElement.hpp index 320eac80..c18b664a 100644 --- a/python/mod_cvcuda/OsdElement.hpp +++ b/python/mod_cvcuda/OsdElement.hpp @@ -24,8 +24,6 @@ namespace cvcudapy { namespace py = ::pybind11; -void ExportBndBox(py::module &m); - void ExportBoxBlur(py::module &m); void ExportOSD(py::module &m); diff --git a/python/mod_cvcuda/PairwiseMatcherType.cpp b/python/mod_cvcuda/PairwiseMatcherType.cpp new file mode 100644 index 00000000..deb97578 --- /dev/null +++ b/python/mod_cvcuda/PairwiseMatcherType.cpp @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "PairwiseMatcherType.hpp" + +#include + +namespace cvcudapy { + +void ExportPairwiseMatcherType(py::module &m) +{ + py::enum_(m, "Matcher", py::arithmetic()).value("BRUTE_FORCE", NVCV_BRUTE_FORCE); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/PairwiseMatcherType.hpp b/python/mod_cvcuda/PairwiseMatcherType.hpp new file mode 100644 index 00000000..c49d0097 --- /dev/null +++ b/python/mod_cvcuda/PairwiseMatcherType.hpp @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
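The refactored OSD bindings replace the old flat element arrays plus per-image counts with per-image Python lists of element objects. A short sketch of the new construction pattern, assuming the RGBA color tuples and coordinates shown are merely illustrative values:

    import cvcuda

    # One bounding box, one blur box and one text label for a single image.
    box = cvcuda.BndBoxI(box=(10, 10, 100, 80), thickness=2,
                         borderColor=(255, 0, 0, 255), fillColor=(0, 0, 0, 0))
    blur = cvcuda.BlurBoxI(box=(120, 10, 64, 64), kernelSize=7)
    text = cvcuda.Label(utf8Text="person: 0.93", fontSize=16, tlPos=(10, 5),
                        fontColor=(255, 255, 255, 255), bgColor=(0, 0, 0, 128))

    # Each container now takes one list of elements per image in the batch.
    bndboxes  = cvcuda.BndBoxesI(boxes=[[box]])
    blurboxes = cvcuda.BlurBoxesI(boxes=[[blur]])
    elements  = cvcuda.Elements(elements=[[box, text]])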
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_PAIRWISE_MATCHER_TYPE_HPP +#define NVCV_PYTHON_PAIRWISE_MATCHER_TYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportPairwiseMatcherType(py::module &m); + +} // namespace cvcudapy + +#endif // NVCV_PYTHON_PAIRWISE_MATCHER_TYPE_HPP diff --git a/python/mod_cvcuda/WorkspaceCache.cpp b/python/mod_cvcuda/WorkspaceCache.cpp new file mode 100644 index 00000000..989b9d51 --- /dev/null +++ b/python/mod_cvcuda/WorkspaceCache.cpp @@ -0,0 +1,87 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "WorkspaceCache.hpp" + +namespace cvcudapy { + +WorkspaceLease::WorkspaceLease(WorkspaceCache *owner, CachedWorkspaceMem &&host, + CachedWorkspaceMem &&pinned, + CachedWorkspaceMem &&cuda, + std::optional hostReleaseStream, + std::optional pinnedReleaseStream, + std::optional cudaReleaseStream) + : m_owner(owner) + , m_host(std::move(host)) + , m_pinned(std::move(pinned)) + , m_cuda(std::move(cuda)) + , m_hostReleaseStream(std::move(hostReleaseStream)) + , m_pinnedReleaseStream(std::move(pinnedReleaseStream)) + , m_cudaReleaseStream(std::move(cudaReleaseStream)) +{ +} + +WorkspaceLease::~WorkspaceLease() +{ + if (m_host) + m_owner->m_host.put(std::move(m_host), m_hostReleaseStream); + if (m_pinned) + m_owner->m_pinned.put(std::move(m_pinned), m_pinnedReleaseStream); + if (m_cuda) + m_owner->m_cuda.put(std::move(m_cuda), m_hostReleaseStream); +} + +WorkspaceCache::WorkspaceCache(nvcv::Allocator allocator) + : m_eventCache(std::make_shared()) + , m_host(allocator, m_eventCache) + , m_pinned(allocator, m_eventCache) + , m_cuda(allocator, m_eventCache) +{ +} + +WorkspaceCache::WorkspaceCache() + : WorkspaceCache(nvcv::CustomAllocator<>{}) +{ +} + +WorkspaceLease WorkspaceCache::get(cvcuda::WorkspaceRequirements req, std::optional hostAcquireStream, + std::optional hostReleaseStream, + std::optional pinnedAcquireStream, + std::optional pinnedReleaseStream, + std::optional cudaAcquireStream, + std::optional cudaReleaseStream) +{ + return WorkspaceLease(this, m_host.get(req.hostMem, hostAcquireStream), + m_pinned.get(req.pinnedMem, pinnedAcquireStream), m_cuda.get(req.cudaMem, cudaAcquireStream), + hostReleaseStream, pinnedReleaseStream, cudaReleaseStream); +} + +WorkspaceCache &WorkspaceCache::instance() +{ + static WorkspaceCache instance; + return instance; +} + +void WorkspaceCache::clear() +{ + m_cuda.clear(); + m_pinned.clear(); + m_host.clear(); + 
m_eventCache->purge(); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/WorkspaceCache.hpp b/python/mod_cvcuda/WorkspaceCache.hpp new file mode 100644 index 00000000..ffd56a21 --- /dev/null +++ b/python/mod_cvcuda/WorkspaceCache.hpp @@ -0,0 +1,319 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CVCUDA_PYTHON_WORKSPACE_CACHE_HPP +#define CVCUDA_PYTHON_WORKSPACE_CACHE_HPP + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cvcudapy { + +using WorkspaceMemDestructor_t = std::function; + +enum class MemoryKind +{ + Host, + Pinned, + Cuda +}; + +template +class CachedWorkspaceMem : public cvcuda::WorkspaceMem +{ +public: + CachedWorkspaceMem() + : cvcuda::WorkspaceMem({}) + { + assert(data == nullptr); + assert(ready == nullptr); + } + + CachedWorkspaceMem(const cvcuda::WorkspaceMem &mem, WorkspaceMemDestructor_t destructor) + : cvcuda::WorkspaceMem(mem) + , m_destructor(std::move(destructor)) + { + } + + CachedWorkspaceMem(CachedWorkspaceMem &&mem) + { + *this = std::move(mem); + } + + CachedWorkspaceMem &operator=(CachedWorkspaceMem &&mem) + { + std::swap(wsMem(), mem.wsMem()); + std::swap(m_destructor, mem.m_destructor); + mem.reset(); + return *this; + } + + ~CachedWorkspaceMem() + { + reset(); + } + + void reset() + { + if (m_destructor) + { + m_destructor(*this); + m_destructor = {}; + } + wsMem() = {}; + } + + explicit operator bool() const noexcept + { + return data != nullptr; + } + +private: + cvcuda::WorkspaceMem &wsMem() & + { + return static_cast(*this); + } + + const cvcuda::WorkspaceMem &wsMem() const & + { + return static_cast(*this); + } + + WorkspaceMemDestructor_t m_destructor; +}; + +template +inline size_t StreamCachePayloadSize(const CachedWorkspaceMem &mem) +{ + return mem.req.size; +} + +template +inline size_t StreamCachePayloadAlignment(const CachedWorkspaceMem &mem) +{ + return mem.req.alignment; +} + +template +class WorkspaceMemCache +{ +public: + using Mem = CachedWorkspaceMem; + using Base = nvcv::util::PerStreamCache; + + WorkspaceMemCache(nvcv::Allocator alloc, std::shared_ptr eventCache) + : m_alloc(std::move(alloc)) + , m_eventCache(std::move(eventCache)) + { + } + + ~WorkspaceMemCache() + { + assert(m_outstandingAllocs == 0); + } + + Mem get(cvcuda::WorkspaceMemRequirements req, std::optional stream) + { + if (req.size == 0) + return {}; + + ++m_outstandingAllocs; + auto opt = m_memCache.get(req.size, req.alignment, stream); + if (opt) + return std::move(opt).value(); + + return create(req); + } + + void put(Mem &&mem, std::optional stream) + { + m_memCache.put(std::move(mem), stream); + --m_outstandingAllocs; + } + + void clear() + { + assert(m_outstandingAllocs == 0); + m_memCache.purge(); + } + +private: + void *allocateMem(size_t size, size_t alignment) const + { + if constexpr (kind == 
MemoryKind::Host) + return m_alloc.hostMem().alloc(size, alignment); + else if constexpr (kind == MemoryKind::Pinned) + return m_alloc.hostPinnedMem().alloc(size, alignment); + else if constexpr (kind == MemoryKind::Cuda) + return m_alloc.cudaMem().alloc(size, alignment); + else + return nullptr; // should never happen + } + + void freeMem(void *mem, size_t size, size_t alignment) const + { + if constexpr (kind == MemoryKind::Host) + return m_alloc.hostMem().free(mem, size, alignment); + else if constexpr (kind == MemoryKind::Pinned) + return m_alloc.hostPinnedMem().free(mem, size, alignment); + else if constexpr (kind == MemoryKind::Cuda) + return m_alloc.cudaMem().free(mem, size, alignment); + } + + auto getMemDeleter() const + { + return [this](cvcuda::WorkspaceMem &mem) + { + // free the memory + freeMem(mem.data, mem.req.size, mem.req.alignment); + // return the event to the event cache + if (mem.ready) + { + m_eventCache->put(nvcv::util::CudaEvent(mem.ready)); + mem.ready = nullptr; + } + }; + } + + Mem create(cvcuda::WorkspaceMemRequirements req) + { + WorkspaceMemDestructor_t del = getMemDeleter(); + + auto evt = nvcv::util::CudaEvent::Create(); + void *data = allocateMem(req.size, req.alignment); + + cvcuda::WorkspaceMem wsmem = {req, data, evt.get()}; + + Mem mem(wsmem, std::move(del)); + evt.release(); // from now on, the event handle is managed by `mem`. + return mem; + } + + nvcv::Allocator m_alloc; + + std::shared_ptr m_eventCache; + + nvcv::util::PerStreamCache> m_memCache; + + std::atomic_int m_outstandingAllocs; +}; + +class WorkspaceCache; + +class WorkspaceLease +{ +public: + cvcuda::Workspace get() const + { + return {m_host, m_pinned, m_cuda}; + } + + ~WorkspaceLease(); + +private: + friend class WorkspaceCache; + WorkspaceLease(WorkspaceCache *owner, CachedWorkspaceMem &&host, + CachedWorkspaceMem &&pinned, CachedWorkspaceMem &&cuda, + std::optional hostReleaseStream, std::optional pinnedReleaseStream, + std::optional cudaReleaseStream); + + WorkspaceCache *m_owner; + CachedWorkspaceMem m_host; + CachedWorkspaceMem m_pinned; + CachedWorkspaceMem m_cuda; + + std::optional m_hostReleaseStream, m_pinnedReleaseStream, m_cudaReleaseStream; +}; + +class WorkspaceCache +{ +public: + WorkspaceCache(); + + WorkspaceCache(nvcv::Allocator allocator); + + /** Gets a workspace with custom stream semantics + * + * @param req The workspace memory sizes and alignments + * @param hostAcquireStream The stream on which regular host memory will be initialky used; typically nullopt + * @param hostReleaseStream The stream on which regular host memory usage will be completed; typically nullopt + * @param pinnedAcquireStream The stream on which pinned memory will be initialky used; typically nullopt + * @param pinnedReleaseStream The stream on which pinned memory usage will be completed; typically the main stream + * on which the operator is executed + * @param cudaAcquireStream The stream on which device memory will be initialky used + * @param cudaReleaseStream The stream on which device memory usage will be completed + */ + WorkspaceLease get(cvcuda::WorkspaceRequirements req, std::optional hostAcquireStream, + std::optional hostReleaseStream, std::optional pinnedAcquireStream, + std::optional pinnedReleaseStream, std::optional cudaAcquireStream, + std::optional cudaReleaseStream); + + /** Gets a workspace with default stream semantics + * + * The default stream semantics are: + * - host memory doesn't use any streams + * - pinned memory is used for h2d copy (released in stream order) + * 
- device memory is acquired and released on the same stream + * + * NOTE: If these semantics are not honored by the user, the code should still be correct, just less efficient. + */ + WorkspaceLease get(cvcuda::WorkspaceRequirements req, cudaStream_t stream) + { + return get(req, std::nullopt, std::nullopt, std::nullopt, stream, stream, stream); + } + + auto &host() + { + return m_host; + } + + auto &pinned() + { + return m_pinned; + } + + auto &cuda() + { + return m_cuda; + } + + static WorkspaceCache &instance(); + + void clear(); + +private: + std::shared_ptr m_eventCache; + WorkspaceMemCache m_host; + WorkspaceMemCache m_pinned; + WorkspaceMemCache m_cuda; + + friend class WorkspaceLease; +}; + +} // namespace cvcudapy + +#endif // CVCUDA_PYTHON_WORKSPACE_CACHE_HPP diff --git a/python/mod_cvcuda/exports.ldscript b/python/mod_cvcuda/exports.ldscript new file mode 100644 index 00000000..fb32f5a6 --- /dev/null +++ b/python/mod_cvcuda/exports.ldscript @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{ +# Restricts global symbols to the only one +# that needs to be exported by a python module. +global: + PyInit_cvcuda; +local: *; +}; diff --git a/python/mod_nvcv/Array.cpp b/python/mod_nvcv/Array.cpp new file mode 100644 index 00000000..5a39e91e --- /dev/null +++ b/python/mod_nvcv/Array.cpp @@ -0,0 +1,350 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Array.hpp" + +#include "DataType.hpp" +#include "ExternalBuffer.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace nvcvpy::priv { + +std::shared_ptr Array::CreateFromReqs(const nvcv::Array::Requirements &reqs) +{ + std::vector> vcont = Cache::Instance().fetch(Key{reqs}); + + // None found? 
+ if (vcont.empty()) + { + std::shared_ptr array(new Array(reqs)); + array->impl().resize(reqs.capacity); + Cache::Instance().add(*array); + return array; + } + else + { + // Get the first one + auto array = std::static_pointer_cast(vcont[0]); + NVCV_ASSERT(array->dtype() == reqs.dtype); + return array; + } +} + +std::shared_ptr Array::Create(int64_t length, nvcv::DataType dtype) +{ + nvcv::Array::Requirements reqs = nvcv::Array::CalcRequirements(length, dtype); + return CreateFromReqs(reqs); +} + +std::shared_ptr Array::Create(const Shape &shape, nvcv::DataType dtype) +{ + return Create(LengthIf1D(shape), dtype); +} + +namespace { + +NVCVArrayData FillNVCVArrayData(const DLTensor &tensor, NVCVArrayBufferType bufType) +{ + NVCVArrayData arrayData = {}; + + // dtype ------------ + arrayData.dtype = py::cast(ToDType(ToNVCVDataType(tensor.dtype))); + + // rank ------------ + { + // TODO: Add 0D support + int rank = tensor.ndim == 0 ? 1 : tensor.ndim; + if (rank != 1) + { + throw std::invalid_argument(util::FormatString("The tensor rank must be 1 not %d", rank)); + } + } + + // shape ------------ + arrayData.capacity = arrayData.length = tensor.shape[0]; + + // buffer type ------------ + if (IsCudaAccessible(tensor.device.device_type)) + { + arrayData.bufferType = NVCV_ARRAY_BUFFER_HOST; + } + else + { + throw std::runtime_error("Only CUDA-accessible arrays are supported for now"); + } + + NVCVArrayBufferStrided &dataStrided = arrayData.buffer.strided; + + // stride ------------ + int elemStrideBytes = (tensor.dtype.bits * tensor.dtype.lanes + 7) / 8; + for (int d = 0; d < tensor.ndim; ++d) + { + dataStrided.stride = tensor.strides[d] * elemStrideBytes; + } + + // Memory buffer ------------ + dataStrided.basePtr = reinterpret_cast(tensor.data) + tensor.byte_offset; + + return arrayData; +} + +NVCVArrayData FillNVCVArrayDataCUDA(const DLTensor &tensor) +{ + return FillNVCVArrayData(tensor, NVCV_ARRAY_BUFFER_HOST); +} + +} // namespace + +std::shared_ptr Array::Wrap(ExternalBuffer &buffer) +{ + const DLTensor &dlTensor = buffer.dlTensor(); + + nvcv::ArrayDataCuda data{FillNVCVArrayDataCUDA(dlTensor)}; + + // This is the key of a tensor wrapper. + // All tensor wrappers have the same key. + Array::Key key; + // We take this opportunity to remove from cache all wrappers that aren't + // being used. They aren't reusable anyway. + Cache::Instance().removeAllNotInUseMatching(key); + + auto array = std::shared_ptr(new Array(data, py::cast(buffer.shared_from_this()))); + + // Need to add wrappers to cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. + Cache::Instance().add(*array); + return array; +} + +std::shared_ptr Array::ResizeArray(Array &array, int64_t length) +{ + Array::Key key; + Cache::Instance().removeAllNotInUseMatching(key); + + auto array_impl = array.impl(); + array_impl.resize(length); + + auto new_array = std::shared_ptr(new Array(std::move(array_impl))); + + // Need to add wrappers to cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. 
+ Cache::Instance().add(*new_array); + return new_array; +} + +std::shared_ptr Array::ResizeArray(Array &array, Shape shape) +{ + return ResizeArray(array, LengthIf1D(shape)); +} + +std::shared_ptr Array::Resize(int64_t length) +{ + return ResizeArray(*this, length); +} + +std::shared_ptr Array::Resize(Shape shape) +{ + return ResizeArray(*this, shape); +} + +Array::Array(const nvcv::Array::Requirements &reqs) + : m_impl{reqs} + , m_key{reqs} +{ +} + +Array::Array(const nvcv::ArrayData &data, py::object wrappedObject) + : m_impl{nvcv::ArrayWrapData(data)} + , m_key{} + , m_wrappedObject(wrappedObject) +{ +} + +Array::Array(nvcv::Array &&array) + : m_impl{std::move(array)} + , m_key{} +{ +} + +std::shared_ptr Array::shared_from_this() +{ + return std::static_pointer_cast(Container::shared_from_this()); +} + +std::shared_ptr Array::shared_from_this() const +{ + return std::static_pointer_cast(Container::shared_from_this()); +} + +nvcv::Array &Array::impl() +{ + return m_impl; +} + +const nvcv::Array &Array::impl() const +{ + return m_impl; +} + +Shape Array::shape() const +{ + return CreateShape(m_impl.length()); +} + +nvcv::DataType Array::dtype() const +{ + return m_impl.dtype(); +} + +int Array::rank() const +{ + return m_impl.rank(); +} + +int64_t Array::length() const +{ + return m_impl.length(); +} + +Array::Key::Key(const nvcv::Array::Requirements &reqs) + : Key(reqs.capacity, static_cast(reqs.dtype)) +{ +} + +Array::Key::Key(int64_t length, nvcv::DataType dtype) + : m_length(std::move(length)) + , m_dtype(dtype) + , m_wrapper(false) +{ +} + +size_t Array::Key::doGetHash() const +{ + if (m_wrapper) + { + return 0; // all wrappers are equal wrt. the cache + } + else + { + using util::ComputeHash; + return ComputeHash(m_length, m_dtype); + } +} + +bool Array::Key::doIsCompatible(const IKey &that_) const +{ + const Key &that = static_cast(that_); + + // Wrapper key's all compare equal, are they can't be used + // and whenever we query the cache for wrappers, we really + // want to get them all (as long as they aren't being used). + if (m_wrapper && that.m_wrapper) + { + return true; + } + else if (m_wrapper || that.m_wrapper) // xor + { + return false; + } + else + { + return std::tie(m_length, m_dtype) == std::tie(that.m_length, that.m_dtype); + } +} + +auto Array::key() const -> const Key & +{ + return m_key; +} + +static py::object ToPython(const nvcv::ArrayData &arrayData, py::object owner) +{ + py::object out; + + auto data = arrayData.cast(); + if (!data) + { + throw std::runtime_error("Only tensors with pitch-linear data can be exported"); + } + + DLPackTensor dlTensor(*data); + return ExternalBuffer::Create(std::move(dlTensor), owner); +} + +py::object Array::cuda() const +{ + nvcv::ArrayData arrayData = m_impl.exportData(); + + // Note: we can't cache the returned ExternalBuffer because it is holding + // a reference to us. Doing so would lead to mem leaks. + return ToPython(arrayData, py::cast(this->shared_from_this())); +} + +std::ostream &operator<<(std::ostream &out, const Array &array) +{ + return out << "(&Array::Create)), "length"_a, "dtype"_a, + "Create a Array object with the given length and data type.") + .def(py::init(static_cast(&Array::Create)), "shape"_a, "dtype"_a, + "Create a Array object with the given shape and data type.") + .def_property_readonly("shape", &Array::shape, "The shape of the Array.") + .def_property_readonly("dtype", &Array::dtype, "The data type of the Array.") + // numpy and others use ndim, let's be consistent with them in python. 
+ // It's not a requirement to be consistent between NVCV Python and C/C++. + // Each language use whatever is appropriate (and expected) in their environment. + .def_property_readonly("ndim", &Array::rank, "The number of dimensions of the Array.") + .def("cuda", &Array::cuda, "Reference to the Array on the CUDA device.") + .def("resize", static_cast(&Array::Resize), "length"_a, + "Produces an array pointing to the same data but with a new length.") + .def("resize", static_cast(&Array::Resize), "shape"_a, + "Produces an array pointing to the same data but with a new shape.") + .def("__repr__", &util::ToString, "Return the string representation of the Array object."); + + m.def("as_array", &Array::Wrap, "buffer"_a, "Wrap an existing buffer into a Array object with the given layout."); + m.def("resize", static_cast(&Array::ResizeArray), "array"_a, "length"_a, + "Produces an array pointing to the same data but with a new length."); + m.def("resize", static_cast(&Array::ResizeArray), "array"_a, "shape"_a, + "Produces an array pointing to the same data but with a new shape."); +} + +} // namespace nvcvpy::priv diff --git a/python/mod_nvcv/Array.hpp b/python/mod_nvcv/Array.hpp new file mode 100644 index 00000000..1cb32b94 --- /dev/null +++ b/python/mod_nvcv/Array.hpp @@ -0,0 +1,106 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
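A sketch of the Array API exported above from the Python side; nvcv.Type.F32 as the float32 data-type name and the PyTorch tensor used as the wrapped CUDA buffer are assumptions for illustration:

    import nvcv
    import torch

    # Create a one-dimensional device array of 1000 float32 elements.
    arr = nvcv.Array(1000, nvcv.Type.F32)
    print(arr.shape, arr.dtype, arr.ndim)

    # Export the device buffer so other CUDA libraries can consume it.
    buf = arr.cuda()

    # Views over the same data with a new length, via the method or the free function.
    shorter = arr.resize(512)
    shorter = nvcv.resize(arr, 512)

    # Wrap an existing 1-D CUDA buffer without copying.
    wrapped = nvcv.as_array(torch.zeros(256, device="cuda"))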
+ */ + +#ifndef NVCV_PYTHON_PRIV_ARRAY_HPP +#define NVCV_PYTHON_PRIV_ARRAY_HPP + +#include "Container.hpp" +#include "Size.hpp" + +#include +#include +#include +#include +#include +#include + +namespace nvcvpy::priv { +namespace py = pybind11; + +class ExternalBuffer; + +class Array : public Container +{ +public: + static void Export(py::module &m); + + static std::shared_ptr Create(int64_t length, nvcv::DataType dtype); + static std::shared_ptr Create(const Shape &shape, nvcv::DataType dtype); + + static std::shared_ptr CreateFromReqs(const nvcv::Array::Requirements &reqs); + + static std::shared_ptr Wrap(ExternalBuffer &buffer); + static std::shared_ptr ResizeArray(Array &array, Shape shape); + static std::shared_ptr ResizeArray(Array &array, int64_t length); + + std::shared_ptr Resize(Shape shape); + std::shared_ptr Resize(int64_t length); + + std::shared_ptr shared_from_this(); + std::shared_ptr shared_from_this() const; + + Shape shape() const; + nvcv::DataType dtype() const; + int rank() const; + int64_t length() const; + + nvcv::Array &impl(); + const nvcv::Array &impl() const; + + class Key final : public IKey + { + public: + explicit Key() + : m_wrapper(true) + { + } + + explicit Key(const nvcv::Array::Requirements &reqs); + explicit Key(int64_t length, nvcv::DataType dtype); + + private: + int64_t m_length; + nvcv::DataType m_dtype; + bool m_wrapper; + + virtual size_t doGetHash() const override; + virtual bool doIsCompatible(const IKey &that) const override; + }; + + virtual const Key &key() const override; + + py::object cuda() const; + +private: + Array(const nvcv::Array::Requirements &reqs); + Array(const nvcv::ArrayData &data, py::object wrappedObject); + Array(nvcv::Array &&array); + + // m_impl must come before m_key + nvcv::Array m_impl; + Key m_key; + + mutable py::object m_cacheExternalObject; + + py::object m_wrappedObject; // null if not wrapping +}; + +std::ostream &operator<<(std::ostream &out, const Array &array); + +} // namespace nvcvpy::priv + +#endif // NVCV_PYTHON_PRIV_ARRAY_HPP diff --git a/python/mod_nvcv/CAPI.cpp b/python/mod_nvcv/CAPI.cpp index c59cf8ca..6c5f9cd9 100644 --- a/python/mod_nvcv/CAPI.cpp +++ b/python/mod_nvcv/CAPI.cpp @@ -17,12 +17,14 @@ #include "CAPI.hpp" +#include "Array.hpp" #include "Cache.hpp" #include "DataType.hpp" #include "Image.hpp" #include "ImageBatch.hpp" #include "Stream.hpp" #include "Tensor.hpp" +#include "TensorBatch.hpp" #include #include @@ -73,6 +75,11 @@ extern "C" NVCVTensorHandle ImplTensor_GetHandle(PyObject *obj) return ToSharedObj(obj)->impl().handle(); } +extern "C" NVCVArrayHandle ImplArray_GetHandle(PyObject *obj) +{ + return ToSharedObj(obj)->impl().handle(); +} + LockMode ToLockMode(PyObject *_mode) { std::string s = ToObj(_mode); @@ -156,6 +163,13 @@ extern "C" PyObject *ImplTensor_Create(int32_t ndim, const int64_t *shape, NVCVD return py::cast(std::move(tensor)).release().ptr(); } +extern "C" PyObject *ImplArray_Create(int64_t length, NVCVDataType dtype) +{ + std::shared_ptr array = Array::Create(length, nvcv::DataType{dtype}); + + return py::cast(std::move(array)).release().ptr(); +} + extern "C" PyObject *ImplImageBatchVarShape_Create(int32_t capacity) { std::shared_ptr varshape = ImageBatchVarShape::Create(capacity); @@ -191,6 +205,33 @@ extern "C" void ImplImageBatchVarShape_Clear(PyObject *varshape) return ToSharedObj(varshape)->clear(); } +extern "C" PyObject *ImplTensorBatch_Create(int32_t capacity) +{ + std::shared_ptr tensorBatch = TensorBatch::Create(capacity); + return 
py::cast(std::move(tensorBatch)).release().ptr(); +} + +extern "C" NVCVTensorBatchHandle ImplTensorBatch_GetHandle(PyObject *tensorBatch) +{ + return ToSharedObj(tensorBatch)->impl().handle(); +} + +extern "C" void ImplTensorBatch_PushBack(PyObject *tensorBatch, PyObject *tensor) +{ + auto ptensor = ToSharedObj(tensor); + ToSharedObj(tensorBatch)->pushBack(*ptensor); +} + +extern "C" void ImplTensorBatch_PopBack(PyObject *tensorBatch, uint32_t cnt) +{ + ToSharedObj(tensorBatch)->popBack(cnt); +} + +extern "C" void ImplTensorBatch_Clear(PyObject *tensorBatch) +{ + ToSharedObj(tensorBatch)->clear(); +} + extern "C" void ImplCache_Add(ICacheItem *extItem) { auto item = std::make_shared(extItem->shared_from_this()); @@ -260,6 +301,8 @@ void ExportCAPI(py::module &m) .Tensor_GetHandle = &ImplTensor_GetHandle, .Tensor_Create = &ImplTensor_Create, .Tensor_CreateForImageBatch = &ImplTensor_CreateForImageBatch, + .Array_GetHandle = &ImplArray_GetHandle, + .Array_Create = &ImplArray_Create, .ImageBatchVarShape_Create = &ImplImageBatchVarShape_Create, .ImageBatchVarShape_GetHandle = &ImplImageBatchVarShape_GetHandle, .ImageBatchVarShape_PushBack = &ImplImageBatchVarShape_PushBack, @@ -271,6 +314,11 @@ void ExportCAPI(py::module &m) .Image_GetHandle = &ImplImage_GetHandle, .Container_Create = &ImplContainer_Create, .Cache_RemoveAllNotInUseMatching = &ImplCache_RemoveAllNotInUseMatching, + .TensorBatch_Create = &ImplTensorBatch_Create, + .TensorBatch_GetHandle = &ImplTensorBatch_GetHandle, + .TensorBatch_PushBack = &ImplTensorBatch_PushBack, + .TensorBatch_PopBack = &ImplTensorBatch_PopBack, + .TensorBatch_Clear = &ImplTensorBatch_Clear, }; m.add_object("_C_API", py::capsule(&capi, "nvcv._C_API")); diff --git a/python/mod_nvcv/CMakeLists.txt b/python/mod_nvcv/CMakeLists.txt index 76e65e3c..d1eb3428 100644 --- a/python/mod_nvcv/CMakeLists.txt +++ b/python/mod_nvcv/CMakeLists.txt @@ -56,12 +56,14 @@ nvcv_python_add_module( Tensor.cpp Image.cpp ImageBatch.cpp + TensorBatch.cpp ExternalBuffer.cpp Rect.cpp Object.cpp CAPI.cpp DLPackUtils.cpp ColorSpec.cpp + Array.cpp ) target_include_directories(nvcv_module_python @@ -83,6 +85,13 @@ target_link_libraries(nvcv_module_python -lrt ) +# use exports file to expose only the symbol dl-loaded by python, +# and nothing else. +target_link_options(nvcv_module_python + PRIVATE + -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/exports.ldscript +) + add_library(pynvcv INTERFACE) target_include_directories(pynvcv INTERFACE include diff --git a/python/mod_nvcv/CastUtils.hpp b/python/mod_nvcv/CastUtils.hpp new file mode 100644 index 00000000..9614d967 --- /dev/null +++ b/python/mod_nvcv/CastUtils.hpp @@ -0,0 +1,47 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_PYTHON_PRIV_CAST_UTILS_HPP +#define NVCV_PYTHON_PRIV_CAST_UTILS_HPP + +#include +#include + +#include + +namespace nvcvpy::priv { +namespace py = pybind11; + +// pybind11 2.10.3 can't convert an item from the input list into another type +// automatically. It won't be able to match the call to current method definition. +// We have to accept std::vector and try to cast them manually. + +template +std::shared_ptr cast_py_object_as(py::object &obj) +{ + py::detail::type_caster caster; + if (!caster.load(obj, true)) + { + return {}; + } + std::shared_ptr buf = caster; + return buf; +} + +} // namespace nvcvpy::priv + +#endif // NVCV_PYTHON_PRIV_CAST_UTILS_HPP diff --git a/python/mod_nvcv/DLPackUtils.cpp b/python/mod_nvcv/DLPackUtils.cpp index 3f476347..e07cf75e 100644 --- a/python/mod_nvcv/DLPackUtils.cpp +++ b/python/mod_nvcv/DLPackUtils.cpp @@ -135,10 +135,9 @@ DLPackTensor::DLPackTensor(const nvcv::TensorDataStrided &tensorData) for (int i = 0; i < tensor.ndim; ++i) { int64_t stride = tensorData.cdata().buffer.strided.strides[i]; - if (stride % tensorData.dtype().strideBytes() != 0) { - throw std::runtime_error("Stride must be multiple of the element size in bytes"); + throw std::runtime_error("Stride must be a multiple of the element size in bytes"); } tensor.strides[i] = tensorData.cdata().buffer.strided.strides[i] / tensorData.dtype().strideBytes(); @@ -151,6 +150,57 @@ DLPackTensor::DLPackTensor(const nvcv::TensorDataStrided &tensorData) } } +DLPackTensor::DLPackTensor(const nvcv::ArrayData &arrayData) +{ + m_tensor = {}; + m_tensor.deleter = [](DLManagedTensor *self) + { + delete[] self->dl_tensor.shape; + delete[] self->dl_tensor.strides; + }; + + try + { + DLTensor &tensor = m_tensor.dl_tensor; + + // Set up device + if (arrayData.IsCompatible()) + { + // TODO: detect correct device_type from memory buffer + tensor.device.device_type = kDLCUDA; + // TODO: detect correct device_id from memory buffer (if possible) + tensor.device.device_id = 0; + } + else + { + throw std::runtime_error("Array buffer type not supported, must be either CUDA"); + } + + // Set up ndim + tensor.ndim = arrayData.rank(); + + // Set up data + tensor.data = arrayData.basePtr(); + tensor.byte_offset = 0; + + // Set up shape + tensor.shape = new int64_t[tensor.ndim]; + tensor.shape[0] = arrayData.capacity(); + + // Set up dtype + tensor.dtype = ToDLDataType(arrayData.dtype()); + + // Set up strides + tensor.strides = new int64_t[tensor.ndim]; + tensor.strides[0] = arrayData.stride(); + } + catch (...) 
+ { + m_tensor.deleter(&m_tensor); + throw; + } +} + DLPackTensor::DLPackTensor(DLPackTensor &&that) noexcept : m_tensor{std::move(that.m_tensor)} { @@ -216,7 +266,7 @@ bool IsCudaAccessible(DLDeviceType devType) nvcv::DataType ToNVCVDataType(const DLDataType &dtype) { nvcv::PackingParams pp; - pp.byteOrder = nvcv::ByteOrder::LSB; + pp.byteOrder = nvcv::ByteOrder::MSB; int lanes = dtype.lanes; int bits = dtype.bits; diff --git a/python/mod_nvcv/DLPackUtils.hpp b/python/mod_nvcv/DLPackUtils.hpp index 1ad8adfb..f7d9069f 100644 --- a/python/mod_nvcv/DLPackUtils.hpp +++ b/python/mod_nvcv/DLPackUtils.hpp @@ -19,6 +19,7 @@ #define NVCV_PYTHON_PRIV_DLPACKUTILS_HPP #include +#include #include #include @@ -34,6 +35,7 @@ class DLPackTensor final explicit DLPackTensor(DLManagedTensor &&tensor); explicit DLPackTensor(const py::buffer_info &info, const DLDevice &dev); explicit DLPackTensor(const nvcv::TensorDataStrided &tensorData); + explicit DLPackTensor(const nvcv::ArrayData &arrayData); DLPackTensor(DLPackTensor &&that) noexcept; ~DLPackTensor(); diff --git a/python/mod_nvcv/ExternalBuffer.cpp b/python/mod_nvcv/ExternalBuffer.cpp index 822e4178..c78ea225 100644 --- a/python/mod_nvcv/ExternalBuffer.cpp +++ b/python/mod_nvcv/ExternalBuffer.cpp @@ -263,9 +263,9 @@ std::optional ExternalBuffer::cudaArrayInterface() const nvcv::DataType dataType = ToNVCVDataType(m_dlTensor->dtype); - NVCV_ASSERT(dataType.strideBytes() * 8 == m_dlTensor->dtype.bits); NVCV_ASSERT(m_dlTensor->dtype.bits % 8 == 0); - int elemStrideBytes = m_dlTensor->dtype.bits / 8; + NVCV_ASSERT(dataType.strideBytes() * 8 == m_dlTensor->dtype.bits * m_dlTensor->dtype.lanes); + int elemStrideBytes = dataType.strideBytes(); py::object strides; @@ -369,7 +369,7 @@ void ExternalBuffer::Export(py::module &m) .def("__dlpack_device__", &ExternalBuffer::dlpackDevice, "Get the device associated with the buffer"); } -} // namespace nv::vpi::python +} // namespace nvcvpy::priv namespace pybind11::detail { diff --git a/python/mod_nvcv/Image.cpp b/python/mod_nvcv/Image.cpp index 8ce28b8d..703611db 100644 --- a/python/mod_nvcv/Image.cpp +++ b/python/mod_nvcv/Image.cpp @@ -18,6 +18,7 @@ #include "Image.hpp" #include "Cache.hpp" +#include "CastUtils.hpp" #include "DataType.hpp" #include "ImageFormat.hpp" #include "Stream.hpp" @@ -573,19 +574,12 @@ std::shared_ptr Image::WrapExternalBuffer(ExternalBuffer &buffer, nvcv::I std::shared_ptr Image::WrapExternalBufferVector(std::vector buffers, nvcv::ImageFormat fmt) { std::vector> spBuffers; - for (size_t i = 0; i < buffers.size(); ++i) + for (auto &obj : buffers) { - // pybind11 2.10.3 can't convert an item from the input list into an ExternalBuffer - // automatically. It won't be able to match the call to current method definition. - // We have to accept py::objects and try to convert them here. 
- py::detail::type_caster caster; - if (!caster.load(buffers[i], true)) - { + std::shared_ptr buffer = cast_py_object_as(obj); + if (!buffer) throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces"); - } - - std::shared_ptr spbuf = caster; - spBuffers.push_back(spbuf); + spBuffers.push_back(std::move(buffer)); } std::vector bufinfos; diff --git a/python/mod_nvcv/ImageBatch.cpp b/python/mod_nvcv/ImageBatch.cpp index e97bbd4c..58831fe9 100644 --- a/python/mod_nvcv/ImageBatch.cpp +++ b/python/mod_nvcv/ImageBatch.cpp @@ -17,6 +17,8 @@ #include "ImageBatch.hpp" +#include "CastUtils.hpp" +#include "ExternalBuffer.hpp" #include "Image.hpp" #include @@ -55,6 +57,23 @@ std::shared_ptr ImageBatchVarShape::Create(int capacity) } } +std::shared_ptr ImageBatchVarShape::WrapExternalBufferVector(std::vector buffers, + nvcv::ImageFormat fmt) +{ + auto batch = Create(buffers.size()); + for (auto &obj : buffers) + { + std::shared_ptr buffer = cast_py_object_as(obj); + if (!buffer) + { + throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces"); + } + auto image = Image::WrapExternalBuffer(*buffer, fmt); + batch->pushBack(*image); + } + return batch; +} + ImageBatchVarShape::ImageBatchVarShape(int capacity) : m_key(capacity) , m_impl(capacity) @@ -174,6 +193,10 @@ void ImageBatchVarShape::Export(py::module &m) .def("popback", &ImageBatchVarShape::popBack, "count"_a = 1, "Remove one or more images from the end of the ImageBatchVarShape.") .def("clear", &ImageBatchVarShape::clear, "Remove all images from the ImageBatchVarShape."); + + m.def("as_images", &ImageBatchVarShape::WrapExternalBufferVector, py::arg_v("buffers", std::vector{}), + "format"_a = nvcv::FMT_NONE, py::keep_alive<0, 1>(), + "Wrap a vector of external buffers as a batch of images, and tie the buffers lifetime to it"); } } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/ImageBatch.hpp b/python/mod_nvcv/ImageBatch.hpp index 7e7ff1d1..7006d4cf 100644 --- a/python/mod_nvcv/ImageBatch.hpp +++ b/python/mod_nvcv/ImageBatch.hpp @@ -38,6 +38,8 @@ class ImageBatchVarShape : public Container static void Export(py::module &m); static std::shared_ptr Create(int capacity); + static std::shared_ptr WrapExternalBufferVector(std::vector buffer, + nvcv::ImageFormat fmt); std::shared_ptr shared_from_this(); std::shared_ptr shared_from_this() const; diff --git a/python/mod_nvcv/Main.cpp b/python/mod_nvcv/Main.cpp index 13e1b459..d02bf389 100644 --- a/python/mod_nvcv/Main.cpp +++ b/python/mod_nvcv/Main.cpp @@ -28,6 +28,7 @@ #include "Resource.hpp" #include "Stream.hpp" #include "Tensor.hpp" +#include "TensorBatch.hpp" #include #include @@ -62,6 +63,7 @@ PYBIND11_MODULE(nvcv, m) Resource::Export(m); Container::Export(m); Tensor::Export(m); + TensorBatch::Export(m); Image::Export(m); ImageBatchVarShape::Export(m); ExportCAPI(m); diff --git a/python/mod_nvcv/Resource.cpp b/python/mod_nvcv/Resource.cpp index b5d283bf..b6b49476 100644 --- a/python/mod_nvcv/Resource.cpp +++ b/python/mod_nvcv/Resource.cpp @@ -22,8 +22,6 @@ #include #include -#include - namespace nvcvpy::priv { Resource::Resource() diff --git a/python/mod_nvcv/Tensor.cpp b/python/mod_nvcv/Tensor.cpp index 4a314103..ea21d08a 100644 --- a/python/mod_nvcv/Tensor.cpp +++ b/python/mod_nvcv/Tensor.cpp @@ -104,6 +104,7 @@ NVCVTensorData FillNVCVTensorData(const DLTensor &tensor, std::optional NVCV_TENSOR_MAX_RANK) { @@ -181,6 +182,28 @@ std::shared_ptr Tensor::WrapImage(Image &img) return tensor; } +std::shared_ptr 
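The new as_images helper wraps a list of external CUDA buffers as a variable-shape image batch in one call. A brief sketch, where the frame resolutions and the nvcv.Format.RGB8 format choice are illustrative assumptions:

    import nvcv
    import torch

    # Three differently sized HWC frames already resident in CUDA memory.
    frames = [torch.empty((h, w, 3), dtype=torch.uint8, device="cuda")
              for h, w in ((480, 640), (720, 1280), (1080, 1920))]

    # Wrap them as one variable-shape image batch; the py::keep_alive in the
    # binding ties the buffers' lifetime to the returned batch.
    batch = nvcv.as_images(frames, nvcv.Format.RGB8)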
Tensor::ReshapeTensor(Tensor &tensor, Shape shape, std::optional layout) +{ + Tensor::Key key; + Cache::Instance().removeAllNotInUseMatching(key); + + nvcv::Tensor tensor_impl = tensor.impl(); + auto new_tensor_shape = CreateNVCVTensorShape(shape, layout ? *layout : tensor_impl.layout()); + nvcv::Tensor new_tensor_impl = tensor_impl.reshape(std::move(new_tensor_shape)); + auto new_tensor = std::shared_ptr(new Tensor(std::move(new_tensor_impl))); + + // Need to add wrappers to cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. + Cache::Instance().add(*new_tensor); + return new_tensor; +} + +std::shared_ptr Tensor::Reshape(Shape shape, std::optional layout) +{ + return ReshapeTensor(*this, std::move(shape), std::move(layout)); +} + Tensor::Tensor(const nvcv::Tensor::Requirements &reqs) : m_impl{reqs} , m_key{reqs} @@ -201,6 +224,12 @@ Tensor::Tensor(Image &img) { } +Tensor::Tensor(nvcv::Tensor &&tensor) + : m_impl{std::move(tensor)} + , m_key{} +{ +} + std::shared_ptr Tensor::shared_from_this() { return std::static_pointer_cast(Container::shared_from_this()); @@ -373,12 +402,16 @@ void Tensor::Export(py::module &m) // It's not a requirement to be consistent between NVCV Python and C/C++. // Each language use whatever is appropriate (and expected) in their environment. .def_property_readonly("ndim", &Tensor::rank, "The number of dimensions of the Tensor.") - .def("cuda", &Tensor::cuda, "Referance to the Tensor on the CUDA device.") + .def("cuda", &Tensor::cuda, "Reference to the Tensor on the CUDA device.") + .def("reshape", &Tensor::Reshape, "shape"_a, "layout"_a = std::nullopt, + "Produces a tensor pointing to the same data but with a new shape and layout.") .def("__repr__", &util::ToString, "Return the string representation of the Tensor object."); m.def("as_tensor", &Tensor::Wrap, "buffer"_a, "layout"_a = std::nullopt, "Wrap an existing buffer into a Tensor object with the given layout."); m.def("as_tensor", &Tensor::WrapImage, "image"_a, "Wrap an existing image into a Tensor object."); + m.def("reshape", &Tensor::ReshapeTensor, "tensor"_a, "shape"_a, "layout"_a = std::nullopt, + "Produces a tensor pointing to the same data but with a new shape and layout."); } } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/Tensor.hpp b/python/mod_nvcv/Tensor.hpp index 27a4055a..87f1b7ba 100644 --- a/python/mod_nvcv/Tensor.hpp +++ b/python/mod_nvcv/Tensor.hpp @@ -47,6 +47,9 @@ class Tensor : public Container static std::shared_ptr Wrap(ExternalBuffer &buffer, std::optional layout); static std::shared_ptr WrapImage(Image &img); + static std::shared_ptr ReshapeTensor(Tensor &tensor, Shape shape, std::optional layout); + + std::shared_ptr Reshape(Shape shape, std::optional layout); std::shared_ptr shared_from_this(); std::shared_ptr shared_from_this() const; @@ -87,6 +90,7 @@ class Tensor : public Container Tensor(const nvcv::Tensor::Requirements &reqs); Tensor(const nvcv::TensorData &data, py::object wrappedObject); Tensor(Image &img); + Tensor(nvcv::Tensor &&tensor); // m_impl must come before m_key nvcv::Tensor m_impl; diff --git a/python/mod_nvcv/TensorBatch.cpp b/python/mod_nvcv/TensorBatch.cpp new file mode 100644 index 00000000..99d51497 --- /dev/null +++ b/python/mod_nvcv/TensorBatch.cpp @@ -0,0 +1,261 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TensorBatch.hpp" + +#include "CastUtils.hpp" +#include "DataType.hpp" +#include "ExternalBuffer.hpp" +#include "Tensor.hpp" + +#include + +namespace nvcvpy::priv { + +size_t TensorBatch::Key::doGetHash() const +{ + using util::ComputeHash; + return ComputeHash(m_capacity); +} + +bool TensorBatch::Key::doIsCompatible(const IKey &ithat) const +{ + auto &that = static_cast(ithat); + return m_capacity == that.m_capacity; +} + +std::shared_ptr TensorBatch::Create(int capacity) +{ + std::vector> vcont = Cache::Instance().fetch(Key{capacity}); + + // None found? + if (vcont.empty()) + { + std::shared_ptr batch(new TensorBatch(capacity)); + Cache::Instance().add(*batch); + return batch; + } + else + { + // Get the first one + auto batch = std::static_pointer_cast(vcont[0]); + batch->clear(); // make sure it's in pristine state + return batch; + } +} + +std::shared_ptr TensorBatch::WrapExternalBufferVector(std::vector buffers, + std::optional layout) +{ + TensorList list; + list.reserve(buffers.size()); + for (auto &obj : buffers) + { + std::shared_ptr buffer = cast_py_object_as(obj); + if (!buffer) + { + throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces."); + } + auto tensor = Tensor::Wrap(*buffer, layout); + list.push_back(tensor); + } + auto batch = Create(buffers.size()); + batch->pushBackMany(list); + return batch; +} + +TensorBatch::TensorBatch(int capacity) + : m_key(capacity) + , m_impl(capacity) +{ + m_list.reserve(capacity); +} + +const nvcv::TensorBatch &TensorBatch::impl() const +{ + return m_impl; +} + +nvcv::TensorBatch &TensorBatch::impl() +{ + return m_impl; +} + +int32_t TensorBatch::rank() const +{ + return m_impl.rank(); +} + +int32_t TensorBatch::capacity() const +{ + return m_impl.capacity(); +} + +int32_t TensorBatch::numTensors() const +{ + NVCV_ASSERT(m_impl.numTensors() == static_cast(m_list.size())); + return m_impl.numTensors(); +} + +std::optional TensorBatch::dtype() const +{ + auto dtype = m_impl.dtype(); + if (dtype != nvcv::DataType()) + { + return {dtype}; + } + else + { + return std::nullopt; + } +} + +std::optional TensorBatch::layout() const +{ + auto layout = m_impl.layout(); + if (layout != nvcv::TENSOR_NONE) + { + return {layout}; + } + else + { + return std::nullopt; + } +} + +void TensorBatch::pushBack(Tensor &tensor) +{ + m_impl.pushBack(tensor.impl()); + m_list.push_back(tensor.shared_from_this()); +} + +void TensorBatch::pushBackMany(std::vector> &tensorList) +{ + std::vector nvcvTensors; + nvcvTensors.reserve(tensorList.size()); + for (auto &tensor : tensorList) + { + m_list.push_back(tensor); + if (tensor) + nvcvTensors.push_back(tensor->impl()); + else + nvcvTensors.push_back(nvcv::Tensor()); + } + m_impl.pushBack(nvcvTensors.begin(), nvcvTensors.end()); +} + +void TensorBatch::popBack(int tensorCount) +{ + m_impl.popTensors(tensorCount); + m_list.erase(m_list.end() - tensorCount, 
m_list.end()); +} + +void TensorBatch::clear() +{ + m_impl.clear(); + m_list.clear(); +} + +std::shared_ptr TensorBatch::at(int64_t idx) const +{ + if (idx < 0) + { + throw std::runtime_error("Invalid index: " + std::to_string(idx)); + } + else if (idx >= static_cast(m_list.size())) + { + throw std::runtime_error("Cannot get tensor at index " + std::to_string(idx) + ". Batch has only " + + std::to_string(m_list.size()) + " elements."); + } + return m_list[idx]; +} + +void TensorBatch::set_at(int64_t idx, std::shared_ptr tensor) +{ + if (idx < 0) + { + throw std::runtime_error("Invalid index: " + std::to_string(idx)); + } + else if (idx >= static_cast(m_list.size())) + { + throw std::runtime_error("Cannot set tensor at index " + std::to_string(idx) + ". Batch has only " + + std::to_string(m_list.size()) + " elements."); + } + m_impl.setTensor(static_cast(idx), tensor->impl()); + m_list[idx] = tensor; +} + +auto TensorBatch::begin() const -> TensorList::const_iterator +{ + return m_list.begin(); +} + +auto TensorBatch::end() const -> TensorList::const_iterator +{ + return m_list.end(); +} + +std::shared_ptr TensorBatch::shared_from_this() +{ + return std::static_pointer_cast(Container::shared_from_this()); +} + +std::shared_ptr TensorBatch::shared_from_this() const +{ + return std::static_pointer_cast(Container::shared_from_this()); +} + +void TensorBatch::Export(py::module &m) +{ + using namespace py::literals; + + py::class_, Container>( + m, "TensorBatch", + "Container for a batch of tensors.\n" + "The capacity of the container must be specified upfront in the batch initialization.\n" + "The tensors in the batch may differ in shapes but they must have " + "a uniform dimensionality, data type and layout.") + .def(py::init(&TensorBatch::Create), + "capacity"_a, + "Create a new TensorBatch object with the specified capacity.") + .def_property_readonly("layout", &TensorBatch::layout, + "Layout of the tensors in the tensor batch." + " None if the batch is empty.") + .def_property_readonly("dtype", &TensorBatch::dtype, + "Data type of tensors in the tensor batch." 
+ " None if the batch is empty.") + .def_property_readonly("capacity", &TensorBatch::capacity, "Capacity of the tensor batch.") + .def_property_readonly("ndim", &TensorBatch::rank, + "Return the number of dimensions of the tensors or -1 for an empty batch") + .def("__len__", &TensorBatch::numTensors, "Return the number of tensors.") + .def( + "__iter__", [](const TensorBatch &batch) { return py::make_iterator(batch); }, + "Return an iterator over the tensors in the TensorBatch.") + .def("__setitem__", &TensorBatch::set_at, "Set tensor at a given index.") + .def("__getitem__", &TensorBatch::at, "Get a tensor at a given index.") + .def("pushback", &TensorBatch::pushBack, "Add a new image to the end of the TensorBatch.") + .def("pushback", &TensorBatch::pushBackMany, "Add multiple images to the end of the TensorBatch.") + .def("popback", &TensorBatch::popBack, "count"_a = 1, + "Remove one or more images from the end of the TensorBatch.") + .def("clear", &TensorBatch::clear, "Remove all images from the TensorBatch."); + + m.def("as_tensors", &TensorBatch::WrapExternalBufferVector, "buffers"_a = std::vector{}, + "layout"_a = std::nullopt, py::keep_alive<0, 1>(), + "Wrap a list of external buffers as a batch of tensors, and tie the buffers lifetime to it"); +} + +} // namespace nvcvpy::priv diff --git a/python/mod_nvcv/TensorBatch.hpp b/python/mod_nvcv/TensorBatch.hpp new file mode 100644 index 00000000..0edaebba --- /dev/null +++ b/python/mod_nvcv/TensorBatch.hpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_PYTHON_PRIV_TENSORBATCH_HPP +#define NVCV_PYTHON_PRIV_TENSORBATCH_HPP + +#include "Container.hpp" + +#include + +#include + +namespace nvcvpy::priv { +namespace py = pybind11; + +class Tensor; + +class TensorBatch : public Container +{ + using TensorList = std::vector>; + +public: + static void Export(py::module &m); + + static std::shared_ptr Create(int capacity); + static std::shared_ptr WrapExternalBufferVector(std::vector buffers, + std::optional layout); + + std::shared_ptr shared_from_this(); + std::shared_ptr shared_from_this() const; + + const nvcv::TensorBatch &impl() const; + nvcv::TensorBatch &impl(); + + int32_t numTensors() const; + int32_t capacity() const; + + int32_t rank() const; + std::optional dtype() const; + std::optional layout() const; + + void pushBack(Tensor &tensor); + void pushBackMany(std::vector> &tensorList); + void popBack(int tensorCount); + void clear(); + + std::shared_ptr at(int64_t idx) const; + void set_at(int64_t idx, std::shared_ptr tensor); + + TensorList::const_iterator begin() const; + TensorList::const_iterator end() const; + + class Key final : public IKey + { + public: + Key(int capacity) + : m_capacity(capacity) + { + } + + private: + int m_capacity; + + virtual size_t doGetHash() const override; + virtual bool doIsCompatible(const IKey &that) const override; + }; + + virtual const Key &key() const override + { + return m_key; + } + +private: + TensorBatch(int capacity); + Key m_key; + nvcv::TensorBatch m_impl; + TensorList m_list; +}; + +} // namespace nvcvpy::priv + +#endif // NVCV_PYTHON_PRIV_TENSORBATCH_HPP diff --git a/python/mod_nvcv/exports.ldscript b/python/mod_nvcv/exports.ldscript new file mode 100644 index 00000000..c3d107ff --- /dev/null +++ b/python/mod_nvcv/exports.ldscript @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{ +# Restricts global symbols to the only one +# that needs to be exported by a python module. +global: + PyInit_nvcv; +local: *; +}; diff --git a/python/mod_nvcv/include/nvcv/python/Array.hpp b/python/mod_nvcv/include/nvcv/python/Array.hpp new file mode 100644 index 00000000..5d3d2bcf --- /dev/null +++ b/python/mod_nvcv/include/nvcv/python/Array.hpp @@ -0,0 +1,102 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_ARRAY_HPP +#define NVCV_PYTHON_ARRAY_HPP + +#include "CAPI.hpp" +#include "DataType.hpp" +#include "Resource.hpp" +#include "Shape.hpp" + +#include +#include +#include + +#include + +namespace nvcvpy { + +namespace py = pybind11; + +class Array + : public Resource + , public nvcv::Array +{ +public: + static Array Create(int64_t length, nvcv::DataType dtype) + { + PyObject *oarray = capi().Array_Create(length, dtype); + + py::object pyarray = py::reinterpret_steal(oarray); + + return Array(pyarray); + } + + static Array Create(const Shape &shape, nvcv::DataType dtype) + { + return Create(LengthIf1D(shape), dtype); + } + +private: + friend struct py::detail::type_caster; + + Array() = default; + + explicit Array(py::object obj) + : Resource(obj) + , nvcv::Array(FromHandle(capi().Array_GetHandle(this->ptr()), true)) + { + } +}; + +} // namespace nvcvpy + +namespace pybind11::detail { + +namespace cvpy = nvcvpy; + +template<> +struct type_caster : type_caster_base +{ + PYBIND11_TYPE_CASTER(cvpy::Array, const_name("nvcv.Array")); + + bool load(handle src, bool) + { + // Does it have the correct object type? + PyTypeObject *srctype = Py_TYPE(src.ptr()); + if (strcmp(name.text, srctype->tp_name) == 0) + { + value = cvpy::Array(reinterpret_borrow(src)); + return true; + } + else + { + return false; + } + } + + static handle cast(cvpy::Array array, return_value_policy /* policy */, handle /*parent */) + { + array.inc_ref(); // for some reason this is needed + return array; + } +}; + +} // namespace pybind11::detail + +#endif // NVCV_PYTHON_ARRAY_HPP diff --git a/python/mod_nvcv/include/nvcv/python/CAPI.hpp b/python/mod_nvcv/include/nvcv/python/CAPI.hpp index 1a158abc..db5f200a 100644 --- a/python/mod_nvcv/include/nvcv/python/CAPI.hpp +++ b/python/mod_nvcv/include/nvcv/python/CAPI.hpp @@ -19,6 +19,7 @@ #define NVCV_PYTHON_CAPI_HPP #include +#include #include #include #include @@ -56,6 +57,9 @@ struct CAPI PyObject *(*Tensor_CreateForImageBatch)(int32_t numImages, int32_t width, int32_t height, NVCVImageFormat fmt, int32_t rowAlign); + NVCVArrayHandle (*Array_GetHandle)(PyObject *array); + PyObject *(*Array_Create)(int64_t length, NVCVDataType dtype); + PyObject *(*ImageBatchVarShape_Create)(int32_t capacity); NVCVImageBatchHandle (*ImageBatchVarShape_GetHandle)(PyObject *varshape); void (*ImageBatchVarShape_PushBack)(PyObject *varshape, PyObject *image); @@ -72,6 +76,16 @@ struct CAPI void (*Cache_RemoveAllNotInUseMatching)(const IKey *key); + PyObject *(*TensorBatch_Create)(int32_t capacity); + + NVCVTensorBatchHandle (*TensorBatch_GetHandle)(PyObject *tensorBatch); + + void (*TensorBatch_PushBack)(PyObject *tensorBatch, PyObject *tensor); + + void (*TensorBatch_PopBack)(PyObject *tensorBatch, uint32_t cnt); + + void (*TensorBatch_Clear)(PyObject *tensorBatch); + // always add new functions at the end, and never change the function prototypes above. 
}; diff --git a/python/mod_nvcv/include/nvcv/python/Fwd.hpp b/python/mod_nvcv/include/nvcv/python/Fwd.hpp index d6543c56..de280d8f 100644 --- a/python/mod_nvcv/include/nvcv/python/Fwd.hpp +++ b/python/mod_nvcv/include/nvcv/python/Fwd.hpp @@ -30,6 +30,7 @@ class Resource; class Image; class ImageBatchVarShape; class Tensor; +class Array; class Stream; class ResourceGuard; enum LockMode : uint8_t; diff --git a/python/mod_nvcv/include/nvcv/python/Shape.hpp b/python/mod_nvcv/include/nvcv/python/Shape.hpp index c2f7ed51..a6139a7c 100644 --- a/python/mod_nvcv/include/nvcv/python/Shape.hpp +++ b/python/mod_nvcv/include/nvcv/python/Shape.hpp @@ -37,6 +37,13 @@ inline Shape CreateShape(const nvcv::TensorShape &tshape) return s; } +inline Shape CreateShape(int64_t length) +{ + Shape s(1); + s[0] = length; + return s; +} + inline nvcv::TensorShape CreateNVCVTensorShape(const Shape &shape, nvcv::TensorLayout layout = nvcv::TENSOR_NONE) { std::vector dims; @@ -49,6 +56,24 @@ inline nvcv::TensorShape CreateNVCVTensorShape(const Shape &shape, nvcv::TensorL return nvcv::TensorShape(dims.data(), dims.size(), layout); } +inline int64_t LengthIf1D(const Shape &shape) +{ + int64_t largest = 1; + for (size_t i = 0; i < shape.size(); ++i) + { + if (shape[i].cast() > 1) + { + if (largest > 1) + { + throw std::invalid_argument("Non-supported array shape"); + } + largest = shape[i].cast(); + } + } + + return largest; +} + } // namespace nvcvpy #endif // NVCV_PYTHON_SHAPE_HPP diff --git a/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp b/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp new file mode 100644 index 00000000..b13e184f --- /dev/null +++ b/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp @@ -0,0 +1,113 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_PYTHON_TENSORBATCH_HPP +#define NVCV_PYTHON_TENSORBATCH_HPP + +#include "CAPI.hpp" +#include "Resource.hpp" + +#include +#include + +#include + +namespace nvcvpy { + +namespace py = pybind11; + +class TensorBatch + : public Resource + , public nvcv::TensorBatch +{ +public: + static TensorBatch Create(int capacity) + { + PyObject *tensorBatch = capi().TensorBatch_Create(capacity); + + py::object pytensorBatch = py::reinterpret_steal(tensorBatch); + + return TensorBatch(pytensorBatch); + } + + void pushBack(Tensor tensor) + { + capi().TensorBatch_PushBack(this->ptr(), tensor.ptr()); + } + + void popBack(int cnt) + { + capi().TensorBatch_PopBack(this->ptr(), cnt); + } + + void clear() + { + capi().TensorBatch_Clear(this->ptr()); + } + + using nvcv::TensorBatch::operator[]; + using nvcv::TensorBatch::begin; + using nvcv::TensorBatch::end; + +private: + friend struct py::detail::type_caster; + + TensorBatch() = default; + + explicit TensorBatch(py::object obj) + : Resource(obj) + , nvcv::TensorBatch(FromHandle(capi().TensorBatch_GetHandle(this->ptr()), true)) + { + } +}; + +} // namespace nvcvpy + +namespace pybind11::detail { + +namespace cvpy = nvcvpy; + +template<> +struct type_caster : type_caster_base +{ + PYBIND11_TYPE_CASTER(cvpy::TensorBatch, const_name("nvcv.TensorBatch")); + + bool load(handle src, bool) + { + // Does it have the correct object type? + PyTypeObject *srctype = Py_TYPE(src.ptr()); + if (strcmp(name.text, srctype->tp_name) == 0) + { + value = cvpy::TensorBatch(reinterpret_borrow(src)); + return true; + } + else + { + return false; + } + } + + static handle cast(cvpy::TensorBatch tensor, return_value_policy /* policy */, handle /*parent */) + { + tensor.inc_ref(); // for some reason this is needed + return tensor; + } +}; + +} // namespace pybind11::detail + +#endif // NVCV_PYTHON_TENSORBATCH_HPP diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 2c7813bd..ca2ee0c2 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -32,7 +32,8 @@ set(CPPSAMPLES common set(PYSAMPLES classification segmentation - object_detection) + object_detection + label) foreach(sample ${CPPSAMPLES}) add_subdirectory(${sample}) diff --git a/samples/README.md b/samples/README.md index c3b58468..a0c6a150 100644 --- a/samples/README.md +++ b/samples/README.md @@ -57,10 +57,10 @@ Setting up the following is only required if you want to setup and run the sampl 4. Install the CV-CUDA packages. Please note that since the above container comes with Python 3.8.10, we will install nvcv-python3.8-0 package as mentioned below. If you have any other Python distributions, you would need to use the appropriate nvcv-python Debian package below. ```bash - dpkg -i nvcv-lib-0.4.0_beta-cuda11-x86_64-linux.deb - dpkg -i nvcv-dev-0.4.0_beta-cuda11-x86_64-linux.deb - dpkg -i cvcuda-samples-0.4.0_beta-cuda11-x86_64-linux.deb - dpkg -i nvcv-python3.8-0.4.0_beta-cuda11-x86_64-linux.deb + dpkg -i nvcv-lib-0.5.0_beta-cuda11-x86_64-linux.deb + dpkg -i nvcv-dev-0.5.0_beta-cuda11-x86_64-linux.deb + dpkg -i cvcuda-samples-0.5.0_beta-cuda11-x86_64-linux.deb + dpkg -i nvcv-python3.8-0.5.0_beta-cuda11-x86_64-linux.deb ``` 5. Copy the samples folder to the target directory. diff --git a/samples/common/python/interop_utils.py b/samples/common/python/interop_utils.py new file mode 100644 index 00000000..1083452c --- /dev/null +++ b/samples/common/python/interop_utils.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import torch +import copy + + +class CudaBuffer: + __cuda_array_interface__ = None + obj = None + + +def to_torch_dtype(data_type): + """Convert a data type into one supported by torch + + Args: + data_type (numpy dtype): Original data type + + Returns: + dtype: A data type supported by torch + """ + if data_type == np.uint16: + return np.dtype(np.int16) + elif data_type == np.uint32: + return np.dtype(np.int32) + elif data_type == np.uint64: + return np.dtype(np.int64) + else: + return data_type + + +def to_cpu_numpy_buffer(cuda_buffer): + """Convert a CUDA buffer to host (CPU) nympy array + + Args: + cuda_buffer: CUDA buffer with __cuda_array_interface__ + + Returns: + numpy array: The CUDA buffer copied to the CPU + """ + torch_dtype = copy.copy(cuda_buffer.dtype) + torch_dtype = to_torch_dtype(torch_dtype) + + buf = CudaBuffer + buf.obj = cuda_buffer + buf.__cuda_array_interface__ = cuda_buffer.__cuda_array_interface__ + buf.__cuda_array_interface__["typestr"] = torch_dtype.str + + return torch.as_tensor(buf).cpu().numpy() + + +def to_cuda_buffer(host_data): + """Convert host data to a CUDA buffer + + Args: + host_data (numpy array): Host data + + Returns: + CudaBuffer: The converted CUDA buffer + """ + orig_dtype = copy.copy(host_data.dtype) + + host_data.dtype = to_torch_dtype(host_data.dtype) + + dev = torch.as_tensor(host_data, device="cuda").cuda() + host_data.dtype = orig_dtype # restore it + + # The cuda buffer only needs the cuda array interface. + # We can then set its dtype to whatever we want. + buf = CudaBuffer() + buf.__cuda_array_interface__ = dev.__cuda_array_interface__ + buf.__cuda_array_interface__["typestr"] = orig_dtype.str + buf.obj = dev # make sure it holds a reference to the torch buffer + + return buf diff --git a/samples/common/python/perf_utils.py b/samples/common/python/perf_utils.py index f9b290ac..1563afd9 100644 --- a/samples/common/python/perf_utils.py +++ b/samples/common/python/perf_utils.py @@ -76,7 +76,7 @@ def __init__( self.timing_info = {} self.batch_info = {} self.inside_batch_info = [] - self.is_inside_batch = False + self.is_inside_batch = 0 self.total_batches_processed = {} # Check if the benchmark.py script was used to run this. We do so # by checking whether an environment variable only set by that script is @@ -108,7 +108,7 @@ def push_range( """ if batch_idx is not None: message += "_%d" % batch_idx - self.is_inside_batch = True + self.is_inside_batch += 1 nvtx.push_range(message, color, domain, category) @@ -133,12 +133,12 @@ def pop_range(self, domain=None, total_items=None): # Actual timing information will be recorded and pulled from NSYS by a # script like benchmark.py. - if self.is_inside_batch: + if self.is_inside_batch > 0: self.inside_batch_info.append(self.stack_path) # Record the batch information if it was present. 
if total_items is not None: - if not self.is_inside_batch: + if self.is_inside_batch <= 0: raise ValueError( "Non zero value for total_items in pop_range can only be " "passed once inside a batch. No known batch was pushed previously. Please " @@ -146,7 +146,7 @@ def pop_range(self, domain=None, total_items=None): ) self.batch_info[self.stack_path] = (batch_idx, total_items) - self.is_inside_batch = False + self.is_inside_batch -= 1 if total_items > 0: batch_level_prefix = os.path.dirname(self.stack_path) diff --git a/samples/label/python/main.py b/samples/label/python/main.py new file mode 100644 index 00000000..575a2a2a --- /dev/null +++ b/samples/label/python/main.py @@ -0,0 +1,316 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# docs_tag: begin_python_imports + +import pycuda.driver as cuda +import os +import sys +import logging +import cvcuda +import torch +import numpy as np + +# Bring the commons folder from the samples directory into our path so that +# we can import modules from it. +common_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "common", + "python", +) +sys.path.insert(0, common_dir) + +from perf_utils import ( # noqa: E402 + CvCudaPerf, + get_default_arg_parser, + parse_validate_default_args, +) + +from torch_utils import ImageBatchDecoderPyTorch, ImageBatchEncoderPyTorch # noqa: E402 +from interop_utils import to_cpu_numpy_buffer, to_cuda_buffer # noqa: E402 + +# docs_tag: end_python_imports + + +def save_batch(images, label, encoder, batch): + """Save a batch of images to disk + + Args: + images (nvcv Tensor): Batch of images to save + + label : label value for output file name, + used to differentiate between outputs + appended to the original file name. 
+ + encoder : Encoder object to save the images + + batch : Batch object to save the images + + Returns: + None + """ + # Function to modify filenames in the batch + def modify_filenames(suffix): + modified_filenames = [] + for filename in batch.fileinfo: + name, extension = filename.rsplit(".", 1) + modified_filename = f"{name}_{suffix}.{extension}" + modified_filenames.append(modified_filename) + return modified_filenames + + # convert to NCHW + imagesNCHW = cvcuda.reformat(images, "NCHW") + + # Modify filenames with the label suffix + oldFileNames = batch.fileinfo + batch.fileinfo = modify_filenames(label) + batch.data = torch.as_tensor(imagesNCHW.cuda()) + encoder(batch) + batch.fileinfo = oldFileNames + + +def simple_cmap(label): + """Convert a label value to a random RGB color + + Args: + label : label value + + Returns: + numpy array: random RGB color for the given label + """ + np.random.seed(label) # Ensure consistent color for each label + return np.random.randint(0, 256, 3) # Random RGB color + + +def color_labels_nhwc(labels): + """Convert a label map to an RGB image + + Args: + labels : Output of cvcuda.label operator + + Returns: + nvcv Tensor: RGB image, with each label having a unique color + """ + npLabels = to_cpu_numpy_buffer(labels.cuda()) + # Initialize the output array with the same batch size, height, width, and RGB channels + a_rgb = np.zeros( + [npLabels.shape[0], npLabels.shape[1], npLabels.shape[2], 3], dtype=np.uint8 + ) + + # Iterate over each image in the batch + for n in range(npLabels.shape[0]): + # Extract unique labels for the current image + a_labels = np.unique(npLabels[n, :, :, :]) + + # Process each label in the current image + for label in a_labels: + rgb_label_color = simple_cmap(label) + # Create a mask for the current label + mask = npLabels[n] == label + # Use the mask to assign color to the corresponding pixels + a_rgb[n][mask[:, :, 0]] = rgb_label_color.astype(np.uint8) + + return cvcuda.as_tensor(to_cuda_buffer(a_rgb), "NHWC") + + +def run_sample( + input_path, + output_dir, + batch_size, + target_img_height, + target_img_width, + device_id, + cvcuda_perf, +): + logger = logging.getLogger("Distance_Label_Sample") + + logger.debug("Using batch size of %d" % batch_size) + logger.debug("Using CUDA device: %d" % device_id) + + # docs_tag: begin_setup_gpu + cvcuda_perf.push_range("run_sample") + + # Define the objects that handle the pipeline stages --- + image_size = (target_img_width, target_img_height) + logger.debug("Image size: %d %d" % image_size) + + # Define the cuda device, context and streams. + cuda_device = cuda.Device(device_id) + cuda_ctx = cuda_device.retain_primary_context() + cuda_ctx.push() + cvcuda_stream = cvcuda.Stream() + torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) + # docs_tag: end_setup_gpu + + # docs_tag: encoder_decoder setup + # Now define the object that will handle pre-processing + if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): + # Treat this as data modality of images + decoder = ImageBatchDecoderPyTorch( + input_path, batch_size, device_id, cuda_ctx, cvcuda_perf + ) + encoder = ImageBatchEncoderPyTorch( + output_dir, + fps=0, + device_id=device_id, + cuda_ctx=cuda_ctx, + cvcuda_perf=cvcuda_perf, + ) + else: + raise ValueError("Unknown data modality: %s." 
% input_path) + # docs_tag: encoder_decoder setup + + # docs_tag: begin_pipeline + # Define and execute the processing pipeline + cvcuda_perf.push_range("pipeline") + + # Fire up encoder/decoder + decoder.start() + encoder.start() + + # Loop through all input frames + batch_idx = 0 + while True: + cvcuda_perf.push_range("batch", batch_idx=batch_idx) + + # Execute everything inside the streams. + with cvcuda_stream, torch.cuda.stream(torch_stream): + # Stage 1: decode + batch = decoder() + if batch is None: + cvcuda_perf.pop_range(total_items=0) # for batch + break # No more frames to decode + assert batch_idx == batch.batch_idx + + logger.info("Processing batch %d" % batch_idx) + + # docs_tag: process_batch + # Stage 2: process + + # docs_tag: begin_tensor_conversion + # Need to check what type of input we have received: + # 1) CVCUDA tensor --> Nothing needs to be done. + # 2) Numpy Array --> Convert to torch tensor first and then CVCUDA tensor + # 3) Torch Tensor --> Convert to CVCUDA tensor + if isinstance(batch.data, torch.Tensor): + cvcudaTensorNHWC = cvcuda.as_tensor(batch.data, "NHWC") + elif isinstance(batch.data, np.ndarray): + cvcudaTensorNHWC = cvcuda.as_tensor( + torch.as_tensor(batch.data).to( + device="cuda:%d" % device_id, non_blocking=True + ), + "NHWC", + ) + # docs_tag: end_tensor_conversion + + # Convert to grayscale + out = cvcuda.cvtcolor(cvcudaTensorNHWC, cvcuda.ColorConversion.RGB2GRAY) + + save_batch(out, "grayscale", encoder, batch) + + # Histogram eq the image + out = cvcuda.histogrameq(out, cvcuda.Type.U8) + + save_batch(out, "histogrameq", encoder, batch) + + # Threshold the image + # Use torch tensor for this to take advantage of easy data manipulation + thParam = torch.zeros(out.shape[0], dtype=torch.float64).cuda() + maxParam = torch.zeros(out.shape[0], dtype=torch.float64).cuda() + + # The parameters below can be set per image. For now, we are setting them to a constant value. + # Proper threshold values must be determined by the input images and requirement. + thParam.fill_( + 128 + ) # Configure the threshold value for each image anything below this will be 0 in the output. + maxParam.fill_(255) # Value to set the areas meeting the threshold. + + thParam = cvcuda.as_tensor(thParam, "N") + maxParam = cvcuda.as_tensor(maxParam, "N") + out = cvcuda.threshold(out, thParam, maxParam, cvcuda.ThresholdType.BINARY) + + save_batch(out, "threshold", encoder, batch) + + # Create label map + ccLabels, _, _ = cvcuda.label(out) + + # Create and ARGB image from the label map, this is for visualization purposes only. + argbImage = color_labels_nhwc(ccLabels) + + save_batch(argbImage, "label", encoder, batch) + + batch_idx += 1 + # docs_tag: end_process + + cvcuda_perf.pop_range(total_items=batch.data.shape[0]) # for batch + + # Make sure encoder finishes any outstanding work + encoder.join() + + cvcuda_perf.pop_range() # for pipeline + + cuda_ctx.pop() + # docs_tag: end_pipeline + + cvcuda_perf.pop_range() # for this example. + + # Once everything is over, we need to finalize the perf-numbers. + cvcuda_perf.finalize() + + +# docs_tag: begin_main_func +def main(): + # docs_tag: begin_parse_args + assets_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "assets", + ) + parser = get_default_arg_parser( + "Label sample using CV-CUDA. This sample will execute the label operator on a " + "single or a batch of images (must be same size). 
Each step of the pipeline will " + "produce an *_stage.jpg output showing the processing done at that stage.", + input_path=os.path.join(assets_dir, "images", "peoplenet.jpg"), + target_img_height=544, + target_img_width=960, + ) + + args = parse_validate_default_args(parser) + + logging.basicConfig( + format="[%(name)s:%(lineno)d] %(asctime)s %(levelname)-6s %(message)s", + level=getattr(logging, args.log_level.upper()), + datefmt="%Y-%m-%d %H:%M:%S", + ) + # docs_tag: end_parse_args + + # Run the sample. + # docs_tag: start_call_run_sample + cvcuda_perf = CvCudaPerf("Distance_Label_sample", default_args=args) + run_sample( + args.input_path, + args.output_dir, + args.batch_size, + args.target_img_height, + args.target_img_width, + args.device_id, + cvcuda_perf, + ) + # docs_tag: end_call_run_sample + + +# docs_tag: end_main_func + +if __name__ == "__main__": + main() diff --git a/samples/object_detection/python/pipelines.py b/samples/object_detection/python/pipelines.py index 0e8d431e..00ea4685 100644 --- a/samples/object_detection/python/pipelines.py +++ b/samples/object_detection/python/pipelines.py @@ -303,19 +303,19 @@ def __call__(self, batch_bboxes_pyt, nms_masks_pyt, frame_nhwc): # # Once this is done, we can convert these lists to two CV-CUDA # structures that can be given to the blur and bndbox operators: - # 1) cvcuda.BndBoxesI : To store the bounding boxes for the batch + # 1) cvcuda.Elements : To store the bounding boxes for the batch # 2) cvcuda.BlurBoxesI : To store the bounding boxes as blur boxes for the batch. # self.cvcuda_perf.push_range("forloop") - num_boxes = [] - bounding_boxes = [] - blur_boxes = [] + bounding_boxes_list = [] + blur_boxes_list = [] # Create an array of bounding boxes with render settings. for current_boxes, current_masks in zip(batch_bboxes_pyt, nms_masks_pyt): filtered_boxes = current_boxes[current_masks] # Save the count of non-zero bounding boxes of this image. - num_boxes.append(filtered_boxes.shape[0]) + bounding_boxes = [] + blur_boxes = [] for box in filtered_boxes: bounding_boxes.append( @@ -329,13 +329,11 @@ def __call__(self, batch_bboxes_pyt, nms_masks_pyt, frame_nhwc): blur_boxes.append( cvcuda.BlurBoxI(box=tuple(box), kernelSize=self.kernel_size) ) + bounding_boxes_list.append(bounding_boxes) + blur_boxes_list.append(blur_boxes) - batch_bounding_boxes = cvcuda.BndBoxesI( - numBoxes=num_boxes, boxes=tuple(bounding_boxes) - ) - batch_blur_boxes = cvcuda.BlurBoxesI( - numBoxes=num_boxes, boxes=tuple(blur_boxes) - ) + batch_bounding_boxes = cvcuda.Elements(elements=bounding_boxes_list) + batch_blur_boxes = cvcuda.BlurBoxesI(boxes=blur_boxes_list) self.cvcuda_perf.pop_range() # for loop # Apply blur first. @@ -344,8 +342,8 @@ def __call__(self, batch_bboxes_pyt, nms_masks_pyt, frame_nhwc): self.cvcuda_perf.pop_range() # Render the bounding boxes. - self.cvcuda_perf.push_range("bndbox_into") - cvcuda.bndbox_into(frame_nhwc, frame_nhwc, batch_bounding_boxes) + self.cvcuda_perf.push_range("osd_into") + cvcuda.osd_into(frame_nhwc, frame_nhwc, batch_bounding_boxes) self.cvcuda_perf.pop_range() # docs_tag: end_call_cuosd_bboxes diff --git a/samples/scripts/benchmark.py b/samples/scripts/benchmark.py index 30cc5611..3bcb62eb 100644 --- a/samples/scripts/benchmark.py +++ b/samples/scripts/benchmark.py @@ -108,6 +108,10 @@ def parse_nvtx_pushpop_trace_json(json_path): # range_info = {} + # Check if the file was empty or not. Empty file means no ops were recorded. + if os.stat(json_path).st_size == 0: + return range_info + # Read the JSON. 
with open(json_path, "r") as f: json_data = json.loads(f.read()) @@ -1093,9 +1097,14 @@ def main(): # behaves correctly. proc_device_id = str(args.gpu_offset_id + gpu_idx) proc_args = args.args.copy() - proc_args.extend(["--device_id", proc_device_id]) - proc_args.extend(["--output_dir", proc_output_dir]) - + # The following will make sure that it inserts the additional args + # only at the beginning of the list so that it doesn't interfere with a + # potentially argparse.REMAINDER style arg present at the end. + + # Need to set this to 0 because once CUDA_VISIBLE_DEVICES is used, + # the process won't be able to see other gpus + proc_args[:0] = ["--device_id", "0"] + proc_args[:0] = ["--output_dir", proc_output_dir] # Start the pool. result = pool.apply_async( benchmark_script, diff --git a/samples/scripts/install_dependencies.sh b/samples/scripts/install_dependencies.sh index 1eb4e9e4..bb3a4f24 100755 --- a/samples/scripts/install_dependencies.sh +++ b/samples/scripts/install_dependencies.sh @@ -75,7 +75,7 @@ pip3 install /tmp/VideoProcessingFramework pip3 install /tmp/VideoProcessingFramework/src/PytorchNvCodec # Install tao-converter which parses the .etlt model file, and generates an optimized TensorRT engine -wget --content-disposition 'https://api.ngc.nvidia.com/v2/resources/nvidia/tao/tao-converter/versions/v4.0.0_trt8.5.1.7_x86/files/tao-converter' --directory-prefix=/usr/local/bin +wget 'https://api.ngc.nvidia.com/v2/resources/nvidia/tao/tao-converter/versions/v4.0.0_trt8.5.1.7_x86/files/tao-converter' --directory-prefix=/usr/local/bin chmod a+x /usr/local/bin/tao-converter # Install NVIDIA NSIGHT 2023.2.1 diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index 0c723340..2679e546 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -80,3 +80,9 @@ python3 ./object_detection/python/main.py -i ./assets/videos/pexels-chiel-slotma python3 ./object_detection/python/main.py -i ./assets/images/ -b 3 # RUn it with the TensorFlow backend python3 ./object_detection/python/main.py -i ./assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow + +# Run the label Python sample with default settings, without any command-line args. +find /tmp/ -maxdepth 1 -type f -delete +python3 ./label/python/main.py +# Run it with batch size 1 on a single image +python3 ./label/python/main.py -i ./assets/images/peoplenet.jpg -b 1 diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt index 4b6afe45..9da86508 100644 --- a/src/cvcuda/CMakeLists.txt +++ b/src/cvcuda/CMakeLists.txt @@ -27,8 +27,8 @@ set(CV_CUDA_OP_FILES OpMinMaxLoc.cpp OpHistogram.cpp OpMinAreaRect.cpp - OpBoxBlur.cpp OpBndBox.cpp + OpBoxBlur.cpp OpBrightnessContrast.cpp OpRemap.cpp OpColorTwist.cpp @@ -65,6 +65,9 @@ set(CV_CUDA_OP_FILES OpRandomResizedCrop.cpp OpGaussianNoise.cpp OpInpaint.cpp + OpLabel.cpp + OpPairwiseMatcher.cpp + OpFindHomography.cpp ) # filter only one that matches the patern (case insensitive), should be set on the global level @@ -87,6 +90,7 @@ else() endif() add_library(cvcuda SHARED + OpStack.cpp ${CV_CUDA_LIB_FILES} ) diff --git a/src/cvcuda/OpFindHomography.cpp b/src/cvcuda/OpFindHomography.cpp new file mode 100644 index 00000000..bb2d5062 --- /dev/null +++ b/src/cvcuda/OpFindHomography.cpp @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "priv/OpFindHomography.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaFindHomographyCreate, + (NVCVOperatorHandle * handle, int batchSize, int numPoints)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new priv::FindHomography(batchSize, numPoints)); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaFindHomographySubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle srcPts, NVCVTensorHandle dstPts, + NVCVTensorHandle models)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::TensorWrapHandle _srcPts(srcPts), _dstPts(dstPts), _models(models); + priv::ToDynamicRef(handle)(stream, _srcPts, _dstPts, _models); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaFindHomographyVarShapeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorBatchHandle srcPts, + NVCVTensorBatchHandle dstPts, NVCVTensorBatchHandle models)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::TensorBatchWrapHandle _srcPts(srcPts), _dstPts(dstPts); + nvcv::TensorBatchWrapHandle _models(models); + priv::ToDynamicRef(handle)(stream, _srcPts, _dstPts, _models); + }); +} diff --git a/src/cvcuda/OpLabel.cpp b/src/cvcuda/OpLabel.cpp new file mode 100644 index 00000000..351cce2b --- /dev/null +++ b/src/cvcuda/OpLabel.cpp @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "priv/OpLabel.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelCreate, (NVCVOperatorHandle * handle)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new cvcuda::priv::Label()); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, + NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh, NVCVTensorHandle maxThresh, + NVCVTensorHandle minSize, NVCVTensorHandle count, NVCVTensorHandle stats, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels)) +{ + return nvcv::ProtectCall( + [&] + { + cvcuda::priv::ToDynamicRef(handle)( + stream, nvcv::TensorWrapHandle{in}, nvcv::TensorWrapHandle{out}, nvcv::TensorWrapHandle{bgLabel}, + nvcv::TensorWrapHandle{minThresh}, nvcv::TensorWrapHandle{maxThresh}, nvcv::TensorWrapHandle{minSize}, + nvcv::TensorWrapHandle{count}, nvcv::TensorWrapHandle{stats}, connectivity, assignLabels); + }); +} diff --git a/src/cvcuda/OpPairwiseMatcher.cpp b/src/cvcuda/OpPairwiseMatcher.cpp new file mode 100644 index 00000000..07b0db91 --- /dev/null +++ b/src/cvcuda/OpPairwiseMatcher.cpp @@ -0,0 +1,58 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "priv/OpPairwiseMatcher.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaPairwiseMatcherCreate, + (NVCVOperatorHandle * handle, NVCVPairwiseMatcherType algoChoice)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new cvcuda::priv::PairwiseMatcher(algoChoice)); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaPairwiseMatcherSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle set1, NVCVTensorHandle set2, + NVCVTensorHandle numSet1, NVCVTensorHandle numSet2, NVCVTensorHandle matches, + NVCVTensorHandle numMatches, NVCVTensorHandle distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType)) +{ + return nvcv::ProtectCall( + [&] + { + cvcuda::priv::ToDynamicRef(handle)( + stream, nvcv::TensorWrapHandle{set1}, nvcv::TensorWrapHandle{set2}, nvcv::TensorWrapHandle{numSet1}, + nvcv::TensorWrapHandle{numSet2}, nvcv::TensorWrapHandle{matches}, nvcv::TensorWrapHandle{numMatches}, + nvcv::TensorWrapHandle{distances}, crossCheck, matchesPerPoint, normType); + }); +} diff --git a/src/cvcuda/OpPillowResize.cpp b/src/cvcuda/OpPillowResize.cpp index d599f39f..7ba00b31 100644 --- a/src/cvcuda/OpPillowResize.cpp +++ b/src/cvcuda/OpPillowResize.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,8 +15,9 @@ * limitations under the License. 
*/ -#include "priv/OpPillowResize.hpp" +#include "cvcuda/OpPillowResize.h" +#include "priv/OpPillowResize.hpp" #include "priv/SymbolVersioning.hpp" #include @@ -26,9 +27,7 @@ namespace priv = cvcuda::priv; -CVCUDA_DEFINE_API(0, 2, NVCVStatus, cvcudaPillowResizeCreate, - (NVCVOperatorHandle * handle, int32_t maxWidth, int32_t maxHeight, int32_t maxBatchSize, - NVCVImageFormat fmt)) +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeCreate, (NVCVOperatorHandle * handle)) { return nvcv::ProtectCall( [&] @@ -38,31 +37,63 @@ CVCUDA_DEFINE_API(0, 2, NVCVStatus, cvcudaPillowResizeCreate, throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Pointer to NVCVOperator handle must not be NULL"); } - *handle = reinterpret_cast( - new priv::PillowResize(nvcv::Size2D{maxWidth, maxHeight}, maxBatchSize, fmt)); + *handle = reinterpret_cast(new priv::PillowResize()); + }); +} + +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeGetWorkspaceRequirements, + (NVCVOperatorHandle handle, int maxBatchSize, int32_t maxInWidth, int32_t maxInHeight, + int32_t maxOutWidth, int32_t maxOutHeight, NVCVImageFormat fmt, NVCVWorkspaceRequirements *reqOut)) +{ + if (!reqOut) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + NVCVSize2D maxInSize = {maxInWidth, maxInHeight}; + NVCVSize2D maxOutSize = {maxOutWidth, maxOutHeight}; + *reqOut = priv::ToDynamicRef(handle).getWorkspaceRequirements(maxBatchSize, maxInSize, + maxOutSize, fmt); + }); +} + +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeVarShapeGetWorkspaceRequirements, + (NVCVOperatorHandle handle, int batchSize, const NVCVSize2D *inputSizes, + const NVCVSize2D *outputSizes, NVCVImageFormat fmt, NVCVWorkspaceRequirements *reqOut)) +{ + if (!reqOut) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + *reqOut = priv::ToDynamicRef(handle).getWorkspaceRequirements( + batchSize, static_cast(inputSizes), + static_cast(outputSizes), fmt); }); } -CVCUDA_DEFINE_API(0, 2, NVCVStatus, cvcudaPillowResizeSubmit, - (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, - const NVCVInterpolationType interpolation)) +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, const NVCVWorkspace *ws, NVCVTensorHandle in, + NVCVTensorHandle out, const NVCVInterpolationType interpolation)) { return nvcv::ProtectCall( [&] { nvcv::TensorWrapHandle input(in), output(out); - priv::ToDynamicRef(handle)(stream, input, output, interpolation); + priv::ToDynamicRef(handle)(stream, *ws, input, output, interpolation); }); } -CVCUDA_DEFINE_API(0, 2, NVCVStatus, nvcvopPillowResizeVarShapeSubmit, - (NVCVOperatorHandle handle, cudaStream_t stream, NVCVImageBatchHandle in, NVCVImageBatchHandle out, - const NVCVInterpolationType interpolation)) +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeVarShapeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, const NVCVWorkspace *ws, NVCVImageBatchHandle in, + NVCVImageBatchHandle out, const NVCVInterpolationType interpolation)) { return nvcv::ProtectCall( [&] { nvcv::ImageBatchVarShapeWrapHandle input(in), output(out); - priv::ToDynamicRef(handle)(stream, input, output, interpolation); + priv::ToDynamicRef(handle)(stream, *ws, input, output, interpolation); }); } diff --git a/src/cvcuda/OpStack.cpp b/src/cvcuda/OpStack.cpp new file mode 100644 index 00000000..b7a4a293 --- /dev/null +++ b/src/cvcuda/OpStack.cpp @@ -0,0 +1,53 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "priv/OpStack.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaStackCreate, (NVCVOperatorHandle * handle)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new priv::Stack()); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaStackSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorBatchHandle in, NVCVTensorHandle out)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::TensorWrapHandle output(out); + nvcv::TensorBatchWrapHandle input(in); + priv::ToDynamicRef(handle)(stream, input, output); + }); +} diff --git a/src/cvcuda/include/cvcuda/OpFindHomography.h b/src/cvcuda/include/cvcuda/OpFindHomography.h new file mode 100644 index 00000000..6d5c5dcc --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpFindHomography.h @@ -0,0 +1,151 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpFindHomography.h + * + * @brief Defines types and functions to handle the Find-Homography operation. + * @defgroup NVCV_C_ALGORITHM_FIND_HOMOGRAPHY Find-Homography + * @{ + */ + +#ifndef CVCUDA__FIND_HOMOGRAPHY_H +#define CVCUDA__FIND_HOMOGRAPHY_H + +#include "Operator.h" +#include "Types.h" +#include "detail/Export.h" + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Constructs an instance of the Find-Homography operator. + * + * @param [out] handle Where the image instance handle will be written to. + * + Must not be NULL. + * @param [in] batchSize number of samples in the batch + * @param [in] numPoints maximum number of coordinates that in the batch + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. 
+ */ +CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographyCreate(NVCVOperatorHandle *handle, int batchSize, int maxNumPoints); + +/** Executes the Find-Homography operation on the given cuda stream. + * + * Limitations: + * + * Input: + * Data Layout: [NW] + * Channel count: [1] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | No + * 8bit Signed | No + * 16bit Unsigned | No + * 16bit Signed | No + * 32bit Unsigned | No + * 32bit Signed | No + * 32bit Float | Yes + * 64bit Float | No + * + * Output: + * Data Layout: [NHW] + * Channel count: [1] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | No + * 8bit Signed | No + * 16bit Unsigned | No + * 16bit Signed | No + * 32bit Unsigned | No + * 32bit Signed | No + * 32bit Float | Yes + * 64bit Float | No + * + * Input/Output dependency + * + * Property | Input == Output + * -------------- | ------------- + * Data Layout | No + * Data Type | Yes + * Batches (N) | Yes + * Channels | No + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] srcPts Input tensor, where srcPts[i, j] is the set of coordinates for the source image, i ranges + * from 0 to batch-1, j ranges over the coordinates of each image (at least 4 per image), and the + * data type is float2 for (x, y) + * + Number of coordinates must be >= 4 + * + Must have data type 2F32 + * + Must have rank 2 + * + * @param [in] dstPts Input tensor, where dstPts[i, j] is the set of coordinates for the destination image, i ranges + * from 0 to batch-1, j ranges over the coordinates of each image (at least 4 per image), and the + * data type is float2 for (x, y) + * + Number of coordinates must be >= 4 + * + Must have data type 2F32 + * + Must have rank 2 + * + * @param [out] models Output tensor, where models[i, j, k] is the output model tensor which maps the src points to dst points + * in image i, i ranges from 0 to batch-1, j ranges from 0 to 2 and k ranges from 0 to 2, and + * the data type is F32. + * + Must have data type F32 + * + Must have rank 3 + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +/** @{ */ +CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographySubmit(NVCVOperatorHandle handle, cudaStream_t stream, + NVCVTensorHandle srcPts, NVCVTensorHandle dstPts, + NVCVTensorHandle models); + +/** + * Executes the FindHomography operation on a batch of point sets. + * + * Apart from the input and output tensor batches, all parameters are the same as \ref cvcudaFindHomographySubmit. + * + * @param [in] srcPts batch of coordinates in the source image. + * @param [in] dstPts batch of coordinates in the destination image. + * @param [out] models model tensor batch. + * + */ +CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographyVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + NVCVTensorBatchHandle srcPts, NVCVTensorBatchHandle dstPts, + NVCVTensorBatchHandle models); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA__FIND_HOMOGRAPHY_H */ diff --git a/src/cvcuda/include/cvcuda/OpFindHomography.hpp b/src/cvcuda/include/cvcuda/OpFindHomography.hpp new file mode 100644 index 00000000..7e7c807d --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpFindHomography.hpp @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file FindHomography.hpp + * + * @brief Defines the private C++ Class for the Find-Homography operation. + */ + +#ifndef CVCUDA__FIND_HOMOGRAPHY_HPP +#define CVCUDA__FIND_HOMOGRAPHY_HPP + +#include "IOperator.hpp" +#include "OpFindHomography.h" + +#include +#include +#include +#include + +namespace cvcuda { + +class FindHomography final : public IOperator +{ +public: + explicit FindHomography(int batchSize, int numPoints); + + ~FindHomography(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, const nvcv::Tensor &models); + + void operator()(cudaStream_t stream, const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst, + const nvcv::TensorBatch &models); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline FindHomography::FindHomography(int batchSize, int numPoints) +{ + nvcv::detail::CheckThrow(cvcudaFindHomographyCreate(&m_handle, batchSize, numPoints)); + assert(m_handle); +} + +inline FindHomography::~FindHomography() +{ + nvcvOperatorDestroy(m_handle); +} + +inline void FindHomography::operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const nvcv::Tensor &models) +{ + nvcv::detail::CheckThrow(cvcudaFindHomographySubmit(m_handle, stream, src.handle(), dst.handle(), models.handle())); +} + +inline void FindHomography::operator()(cudaStream_t stream, const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst, + const nvcv::TensorBatch &models) +{ + nvcv::detail::CheckThrow( + cvcudaFindHomographyVarShapeSubmit(m_handle, stream, src.handle(), dst.handle(), models.handle())); +} + +inline NVCVOperatorHandle FindHomography::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA__FIND_HOMOGRAPHY_HPP diff --git a/src/cvcuda/include/cvcuda/OpLabel.h b/src/cvcuda/include/cvcuda/OpLabel.h new file mode 100644 index 00000000..ad0b40aa --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpLabel.h @@ -0,0 +1,242 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpLabel.h + * + * @brief Defines types and functions to handle the Label operation. 
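To make the new interface above concrete, here is a minimal usage sketch of the FindHomography C++ wrapper (illustrative only, not part of the patch): the tensor shapes follow the OpFindHomography.h documentation, while the batch size, point count, and the omitted fill-in of the point data are assumptions.

    #include <cvcuda/OpFindHomography.hpp>
    #include <nvcv/DataType.hpp>
    #include <nvcv/Tensor.hpp>
    #include <cuda_runtime.h>

    void EstimateHomography()
    {
        constexpr int batchSize = 1; // illustrative sizes
        constexpr int numPoints = 4; // at least 4 correspondences are required

        // Rank-2 [N, numPoints] tensors of 2F32 coordinates and a rank-3 [N, 3, 3] F32
        // model tensor, matching the shapes documented in OpFindHomography.h.
        nvcv::Tensor srcPts({{batchSize, numPoints}, "NW"}, nvcv::TYPE_2F32);
        nvcv::Tensor dstPts({{batchSize, numPoints}, "NW"}, nvcv::TYPE_2F32);
        nvcv::Tensor models({{batchSize, 3, 3}, "NHW"}, nvcv::TYPE_F32);

        // ... fill srcPts/dstPts with point correspondences (omitted) ...

        cudaStream_t stream;
        cudaStreamCreate(&stream);

        cvcuda::FindHomography findHomography(batchSize, numPoints);
        findHomography(stream, srcPts, dstPts, models); // asynchronous submission

        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);
    }

The var-shape overload shown above takes nvcv::TensorBatch arguments instead, one point tensor per sample.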
+ * @defgroup NVCV_C_ALGORITHM_LABEL Label + * @{ + */ + +#ifndef CVCUDA_LABEL_H +#define CVCUDA_LABEL_H + +#include "Operator.h" +#include "Types.h" +#include "detail/Export.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * Constructs an instance of the Label operator. + * + * @param [out] handle Where the operator instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); + +/** + * Executes the Label operation on the given cuda stream. This operation does not wait for completion. + * + * This operation computes the connected-component labeling of one or more input images (in 2D) or volumes (in 3D) + * inside the input tensor, yielding labels in the output tensor with same rank and shape. Labels are numbers + * uniquely assigned to each connected region, for example: + * + * Input 0 0 0 0 Output 0 0 0 0 + * image: 1 1 0 1 labels: 4 4 0 7 + * 0 0 0 1 0 0 0 7 + * 0 1 1 1 0 7 7 7 + * + * In the above example, three distinct regions were identified and labeled as 0, 4 and 7. Note that the region + * labeled with 0 remained with the same value as the input, and label numbers 4 and 7 were assigned in + * non-consecutive ordering. Some values in the input may be ignored, i.e. not labeled, using the \ref bgLabel + * tensor to define those values as background, which usually is set to the value zero. For example: + * + * Input 0 0 1 0 Output 0 0 2 3 Zeros in 0 0 2 0 + * image: 0 1 0 1 labels: 0 5 6 7 bgLabel: 0 5 0 7 + * 0 0 1 1 0 0 7 7 0 0 7 7 + * 0 1 1 1 0 7 7 7 0 7 7 7 + * + * Limitations: + * + * Input: + * Data Layout: [HWC], [NHWC], [DHWC], [NDHWC] + * Channels: [1] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | Yes + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | Yes + * 32bit Signed | Yes + * 32bit Float | No + * 64bit Float | No + * + * Output: + * Data Layout: [HWC], [NHWC], [DHWC], [NDHWC] + * Channels: [1] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | No + * 8bit Signed | No + * 16bit Unsigned | No + * 16bit Signed | No + * 32bit Unsigned | Yes + * 32bit Signed | No + * 32bit Float | No + * 64bit Float | No + * + * Input/Output dependency + * + * Property | Input == Output + * -------------- | ------------- + * Data Layout | Yes + * Data Type | No + * Number | Yes + * Channels | Yes + * Width | Yes + * Height | Yes + * Depth | Yes + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] in Input tensor. The expected layout is [HWC] or [NHWC] for 2D labeling or [DHWC] or [NDHWC] for + * 3D labeling, with either explicit C dimension or missing C with channels embedded in the data type. + * The N dimension is the number of samples, i.e. either 2D images with height H and width W or + * 3D volumes with depth D and height H and width W, inside the tensor. This operator labels + * regions, i.e. connected components, of each input image or volume read from the \ref in tensor. + * + Check above limitations table to the input tensor data layout, number of channels and data type. + * + * @param [out] out Output tensor. 
The expected layout is [HWC] or [NHWC] for 2D labeling or [DHWC] or [NDHWC] for + * 3D labeling, with either explicit C dimension or missing C with channels embedded in the data type. + * The N dimension is the number of samples, i.e. either 2D images with height H and width W or + * 3D volumes with depth D and height H and width W, inside the tensor. This operator labels + * regions, i.e. connected components, on the input writing the labels to the \ref out tensor. + * + Check above limitations table to the output tensor data layout, number of channels and data type. + * + * @param [in] bgLabel Background label tensor. The expected layout is [N] or [NC], meaning rank-1 or rank-2 + * tensor with first dimension as the number of samples N, matching input and output tensors, + * and a potential last dimension C with number of channels. If present, this tensor is used + * by the operator to define background values in the input tensor to be ignored during + * labeling. If not present, all values in the input are labeled. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have data type the same as the input. + * + It may be NULL to consider all values in the input as valid values to be labeled. + * + * @param [in] minThresh Minimum-threshold value tensor. The expected layout is [N] or [NC], meaning rank-1 or + * rank-2 tensor with first dimension as the number of samples N, matching input and output + * tensors, and a potential last dimension C with number of channels. If present, this + * tensor is used by the operator as a pre-filter step to define minimum values in the input + * tensor to be thresholded into a binary image, i.e. values below it are set to 0 and above + * or equal it are set to 1. Labeling is done after this pre-filter step, where \ref + * bgLabel may be applied for instance to ignore zeroes as background. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have data type the same as the input. + * + It may be NULL to not apply minimum thresholding as a pre-filter. + * + * @param [in] maxThresh Maximum-threshold value tensor. The expected layout is [N] or [NC], meaning rank-1 or + * rank-2 tensor with first dimension as the number of samples N, matching input and output + * tensors, and a potential last dimension C with number of channels. If present, this + * tensor is used by the operator as a pre-filter step to define maximum values in the input + * tensor to be thresholded into a binary image, i.e. values above it are set to 0 and below + * or equal it are set to 1. Labeling is done after this pre-filter step, where \ref + * bgLabel may be applied for instance to ignore zeroes as background. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have data type the same as the input. + * + It may be NULL to not apply maximum thresholding as a pre-filter. + * + * @param [in] minSize Minimum-size value tensor. The expected layout is [N] or [NC], meaning rank-1 or rank-2 + * tensor with first dimension as the number of samples N, matching input and output tensors, + * and a potential last dimension C with number of channels. 
If present, this tensor is used + * by the operator as a post-filter step to define minimum-size regions in the output tensor to + * keep their labels, i.e. connected-component regions with less than this minimum number of + * elements are set to the background value defined in the \ref bgLabel value. Labeling is + * done before this post-filter step, also known as island-removal step. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have U32 data type. + * + It may be NULL to not apply minimum size regions removal as a post-filter. + * + If not NULL, the \ref bgLabel and \ref stats tensors must not be NULL as well. + * + * @param [out] count Count of labels tensor. The expected layout is [N] or [NC], meaning rank-1 or rank-2 tensor + * with first dimension as the number of samples N, matching input and output tensors, and a + * potential last dimension C with number of channels. If present, this tensor is used by the + * operator to store the number of connected regions, or components, labeled. The background + * label is ignored and thus not counted. It counts regions that may be beyond the maximum capacity + * of \ref stats tensor, and regions potentially removed by \ref minSize tensor. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have U32 data type. + * + It may be NULL to disregard counting the number of different labels found. + * + * @param [out] stats Statistics tensor. The expected layout is [NMA], meaning rank-3 tensor with first dimension + * as the number of samples N, matching input and output tensors, second dimension M as maximum + * number of different labels statistics to be computed, and a third dimension A as the amount + * of statistics to be computed per label (fixed as 6 for 2D or 8 for 3D). If present, this + * tensor is used by the operator to store information per connected-component label. The + * background label is ignored and thus its statistics is not computed. + * + It must have the same number of samples as input and output tensors. + * + It must have a number of statistics M per sample N equal to the maximum allowed number of + * label statistics that can be computed by the Label operator per sample image (or volume). + * The actual number of labels found is stored in \ref count (see above). + * + For 2D labeling, it must have in the last dimension A=6 elements to store at: (0) the + * original label number; (1) leftmost position; (2) topmost position; (3) width size; (4) + * height size; (5) count of pixels (i.e. size of the labeled region). And for 3D labeling, + * it must have in the last dimension A=8 elements to store at: (0) the original label number; + * (1) leftmost position; (2) topmost position; (3) shallowmost position; (4) width size; (5) + * height size; (6) depth size; (7) count of voxels (i.e. size of the labeled region). + * + It must have U32 data type. + * + It may be NULL to disregard computing statistics information on different labels found. + * + It must not be NULL if \ref assignLabel is NVCV_LABEL_SEQUENTIAL, the index of each label + * statistics is used as the new sequential label replacing the original label in the output, + * the sequential labels are up to the maximum capacity M + * + If not NULL, the \ref count tensor must not be NULL as well. 
+ * + * @param [in] connectivity Specify connectivity of elements for the operator, see \ref NVCVConnectivityType. + * + It must conform with \ref in and \ref out tensors, i.e. 3D labeling requires [DHWC] + * or [NDHWC] tensor layouts and 2D labeling requires [HWC] or [NHWC], where the C + * channel may be missing as embedded in data type. + * + * @param [in] assignLabels Specify how labels are assigned by the operator, see \ref NVCVLabelType. Use + * NVCV_LABEL_FAST to do fast labeling, i.e. assign non-consecutive label numbers fast. + * Use NCVC_LABEL_SEQUENTIAL to have consecutive label numbers instead. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaLabelSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, + NVCVTensorHandle out, NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh, + NVCVTensorHandle maxThresh, NVCVTensorHandle minSize, NVCVTensorHandle count, + NVCVTensorHandle stats, NVCVConnectivityType connectivity, + NVCVLabelType assignLabels); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA_LABEL_H */ diff --git a/src/cvcuda/include/cvcuda/OpLabel.hpp b/src/cvcuda/include/cvcuda/OpLabel.hpp new file mode 100644 index 00000000..54ebd54e --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpLabel.hpp @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpLabel.hpp + * + * @brief Defines the public C++ Class for the Label operation. 
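As a rough usage sketch of the C API documented above (illustrative, not part of the patch): the image size is an assumption, all optional tensors are passed as NULL as the parameter documentation permits, and checking of the returned NVCVStatus values is omitted for brevity.

    #include <cvcuda/OpLabel.h>
    #include <nvcv/DataType.hpp>
    #include <nvcv/Tensor.hpp>
    #include <cuda_runtime.h>

    void LabelRegions(cudaStream_t stream)
    {
        constexpr int64_t N = 1, H = 480, W = 640, C = 1; // illustrative sizes

        nvcv::Tensor in({{N, H, W, C}, "NHWC"}, nvcv::TYPE_U8);   // input image(s)
        nvcv::Tensor out({{N, H, W, C}, "NHWC"}, nvcv::TYPE_U32); // output labels
        nvcv::Tensor count({{N}, "N"}, nvcv::TYPE_U32);           // number of labels found

        NVCVOperatorHandle op = nullptr;
        cvcudaLabelCreate(&op);

        // bgLabel, minThresh, maxThresh, minSize and stats are optional and passed as NULL here.
        cvcudaLabelSubmit(op, stream, in.handle(), out.handle(),
                          /*bgLabel*/ nullptr, /*minThresh*/ nullptr, /*maxThresh*/ nullptr,
                          /*minSize*/ nullptr, count.handle(), /*stats*/ nullptr,
                          NVCV_CONNECTIVITY_4_2D, NVCV_LABEL_FAST);

        cudaStreamSynchronize(stream);
        nvcvOperatorDestroy(op);
    }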
+ * @defgroup NVCV_CPP_ALGORITHM_LABEL Label + * @{ + */ + +#ifndef CVCUDA__LABEL_HPP +#define CVCUDA__LABEL_HPP + +#include "IOperator.hpp" +#include "OpLabel.h" + +#include +#include +#include +#include + +namespace cvcuda { + +class Label final : public IOperator +{ +public: + explicit Label(); + + ~Label(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel, + const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize, + const nvcv::Tensor &count, const nvcv::Tensor &stats, NVCVConnectivityType connectivity, + NVCVLabelType assignLabels) const; + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline Label::Label() +{ + nvcv::detail::CheckThrow(cvcudaLabelCreate(&m_handle)); + assert(m_handle); +} + +inline Label::~Label() +{ + nvcvOperatorDestroy(m_handle); + m_handle = nullptr; +} + +inline void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, + const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels) const +{ + nvcv::detail::CheckThrow(cvcudaLabelSubmit(m_handle, stream, in.handle(), out.handle(), bgLabel.handle(), + minThresh.handle(), maxThresh.handle(), minSize.handle(), count.handle(), + stats.handle(), connectivity, assignLabels)); +} + +inline NVCVOperatorHandle Label::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA_LABEL_HPP diff --git a/src/cvcuda/include/cvcuda/OpPairwiseMatcher.h b/src/cvcuda/include/cvcuda/OpPairwiseMatcher.h new file mode 100644 index 00000000..02705857 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpPairwiseMatcher.h @@ -0,0 +1,173 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpPairwiseMatcher.h + * + * @brief Defines types and functions to handle the PairwiseMatcher operation. + * @defgroup NVCV_C_ALGORITHM_PAIRWISE_MATCHER Pairwise Matcher + * @{ + */ + +#ifndef CVCUDA_PAIRWISE_MATCHER_H +#define CVCUDA_PAIRWISE_MATCHER_H + +#include "Operator.h" +#include "Types.h" +#include "detail/Export.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Constructs an instance of the PairwiseMatcher operator. + * + * @param [out] handle Where the operator instance handle will be written to. + * + Must not be NULL. + * + * @param [in] algoChoice Choice of algorithm to find pair-wise matches. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully.
+ */ +CVCUDA_PUBLIC NVCVStatus cvcudaPairwiseMatcherCreate(NVCVOperatorHandle *handle, NVCVPairwiseMatcherType algoChoice); + +/** Executes the PairwiseMatcher operation on the given CUDA stream. This operation does not wait for completion. + * + * This operation computes the pair-wise matcher between two sets of n-dimensional points. For instance + * 128-dimensional descriptors as points. For each point $p1_i$, in the 1st set defined by \ref set1 with size + * \ref numSet1, the operator finds the best match (minimum distance) from $p1_i$ to a point in the 2nd set $p2_j$, + * defined by \ref set2 with size \ref numSet2. If \ref crossCheck is true, $p1_i$ must also be the best match + * from $p2_j$ considering all possible matches from the 2nd set to the 1st set, to return them as a match. + * + * @note This operation does not guarantee deterministic output. Each output tensor limits the number of matches + * found by the operator, that is the total number may be greater than this limitation and the order of + * matches returned might differ in different runs. + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * + * @param [in] stream Handle to a CUDA stream. + * + Must be a valid CUDA stream. + * + * @param [in] set1 Input 1st set of points tensor. The first set of points to calculate pair-wise matcher between + * this 1st set and the 2nd set. The expected layout is [NMD] meaning a rank-3 tensor with first + * dimension as number of samples N, second dimension M as maximum number of points, and a third + * dimension D as depth dimension of each point, e.g. the output of \ref NVCV_C_ALGORITHM_SIFT has + * 128-Byte descriptor or D=128 and U8 data type that can be used as a set of points. + * + It must have consistent number of samples N across input and output tensors. + * + The size of the depth dimension D and data type must be consistent across input + * set of points tensors. + * + It must have U8 or U32 or F32 data type. + * + * @param [in] set2 Input 2nd set of points tensor. The second set of points to calculate pair-wise matcher between + * this 2nd set and the 1st set. The expected layout is [NMD] meaning a rank-3 tensor with first + * dimension as number of samples N, second dimension M as maximum number of points, and a third + * dimension D as depth dimension of each point, e.g. the output of \ref NVCV_C_ALGORITHM_SIFT has + * 128-Byte descriptor or D=128 and U8 data type that can be used as a set of points. + * + It must have consistent number of samples N across input and output tensors. + * + The size of the depth dimension D and data type must be consistent across input + * set of points tensors. + * + It must have U8 or U32 or F32 data type. + * + * @param [in] numSet1 Input tensor storing the actual number of points in \ref set1 tensor. The expected layout + * is [N] or [NC], meaning rank-1 or rank-2 tensor with first dimension as number of samples N, + * and a potential last dimension C with number of channels. It expresses the total number of + * valid points in \ref set1 if less than its maximum capacity M, else uses all M points. + * + It must have consistent number of samples N across input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have S32 data type. + * + It may be NULL to use entire set1 maximum capacity M as valid points. + * + * @param [in] numSet2 Input tensor storing the actual number of points in \ref set2 tensor. 
The expected layout + * is [N] or [NC], meaning rank-1 or rank-2 tensor with first dimension as number of samples N, + * and a potential last dimension C with number of channels. It expresses the total number of + * valid points in \ref set2 if less than its maximum capacity M, else uses all M points. + * + It must have consistent number of samples N across input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have S32 data type. + * + It may be NULL to use entire set2 maximum capacity M as valid points. + * + * @param [out] matches Output tensor to store the matches of points between 1st set \ref set1 and 2nd set \ref + * set2. The expected layout is [NMA], meaning rank-3 tensor with first dimension as the + * number of samples N, same as other tensors, second dimension M as maximum number of + * matches, not necessarily the same as other tensors, and third dimension A as the attributes + * of each match, fixed to 2 attributes: set1 index and set2 index. + * + It must have consistent number of samples N across input and output tensors. + * + It must have a number of matches M per sample N equal to the maximum allowed number of + * matches to be found between \ref set1 and \ref set2. The actual number + * of matches found is stored in \ref numMatches. + * + It must have size of attributes dimension A equal 2. + * + It must have S32 data type. + * + * @param [out] numMatches Output tensor to store the number of matches found by the operator. The expected layout + * is [N] or [NC], meaning rank-1 or rank-2 tensor with first dimension as number of + * samples N, and a potential last dimension C with number of channels. It expresses the + * toal number of matches found, regardless of the maximum allowed number of matches M in + * output tensor \ref matches. Since matches are found randomly, they are discarded in a + * non-deterministic way when the number of matches found is bigger than M. + * + It must have consistent number of samples N across input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have S32 data type. + * + It may be NULL if \ref crossCheck is false to disregard storing number of matches. + * + * @param [out] distances Output tensor to store distances of matches found by the operator. The expected layout + * is [NM] or [NMC], meaning rank-2 or rank-3 tensor with first dimension as number of + * samples N, same as other tensors, second dimension M as maximum number of distances, same + * as \ref matches output tensors, and a potential last dimension C with number of channels. + * For each match found in \ref matches, the distance between matched points is stored. + * + It must have consistent number of samples N across input and output tensors. + * + It must have the same dimension M of the \ref matches tensor, meaning the maximum + * allowed number of distances must be equal to the maximum allowed number of matches. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NMC] tensor. + * + It must have F32 data type. + * + It may be NULL to disregard storing distances. + * + * @param [in] crossCheck Choice to do cross check. Use false to search only for matches from 1st set of points in + * \ref set1 to 2nd set of points in \ref set2. Use true to cross check best matches, a + * best match is only returned if it is the best match (minimum distance) from 1st set to + * 2nd set and vice versa. 
+ * + * @param [in] matchesPerPoint Number of best matches $k$ per point. The operator returns the top-$k$ best matches + * from 1st set to 2nd set. + * + It must be between 1 and 64. + * + It has to be 1 if \ref crossCheck is true. + * + * @param [in] normType Choice of norm type to normalize distances, used in points difference $|p1 - p2|$. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaPairwiseMatcherSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + NVCVTensorHandle set1, NVCVTensorHandle set2, + NVCVTensorHandle numSet1, NVCVTensorHandle numSet2, + NVCVTensorHandle matches, NVCVTensorHandle numMatches, + NVCVTensorHandle distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA_PAIRWISE_MATCHER_H */ diff --git a/src/cvcuda/include/cvcuda/OpPairwiseMatcher.hpp b/src/cvcuda/include/cvcuda/OpPairwiseMatcher.hpp new file mode 100644 index 00000000..13178ac7 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpPairwiseMatcher.hpp @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpPairwiseMatcher.hpp + * + * @brief Defines the public C++ Class for the PairwiseMatcher operation. 
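For illustration, a minimal sketch of calling the brute-force matcher through this C API (not part of the patch; the set sizes and the choice of 128-byte U8 descriptors are assumptions, and status checking is omitted):

    #include <cvcuda/OpPairwiseMatcher.h>
    #include <nvcv/DataType.hpp>
    #include <nvcv/Tensor.hpp>
    #include <cuda_runtime.h>

    void MatchDescriptors(cudaStream_t stream)
    {
        // Illustrative sizes: one sample with up to 1000 128-byte descriptors per set.
        constexpr int64_t N = 1, M = 1000, D = 128;

        nvcv::Tensor set1({{N, M, D}, "NMD"}, nvcv::TYPE_U8);
        nvcv::Tensor set2({{N, M, D}, "NMD"}, nvcv::TYPE_U8);
        nvcv::Tensor matches({{N, M, 2}, "NMA"}, nvcv::TYPE_S32);  // (set1 index, set2 index)
        nvcv::Tensor numMatches({{N}, "N"}, nvcv::TYPE_S32);
        nvcv::Tensor distances({{N, M}, "NM"}, nvcv::TYPE_F32);

        NVCVOperatorHandle op = nullptr;
        cvcudaPairwiseMatcherCreate(&op, NVCV_BRUTE_FORCE);

        // numSet1/numSet2 are NULL, so the full capacity M of each set is used;
        // Hamming norm suits binary descriptors.
        cvcudaPairwiseMatcherSubmit(op, stream, set1.handle(), set2.handle(),
                                    /*numSet1*/ nullptr, /*numSet2*/ nullptr,
                                    matches.handle(), numMatches.handle(), distances.handle(),
                                    /*crossCheck*/ true, /*matchesPerPoint*/ 1, NVCV_NORM_HAMMING);

        cudaStreamSynchronize(stream);
        nvcvOperatorDestroy(op);
    }

Note that with crossCheck set to true, matchesPerPoint must be 1 and numMatches must be provided, as required by the parameter documentation above.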
+ * @defgroup NVCV_CPP_ALGORITHM_PAIRWISE_MATCHER PairwiseMatcher + * @{ + */ + +#ifndef CVCUDA_PAIRWISE_MATCHER_HPP +#define CVCUDA_PAIRWISE_MATCHER_HPP + +#include "IOperator.hpp" +#include "OpPairwiseMatcher.h" + +#include +#include +#include + +namespace cvcuda { + +class PairwiseMatcher final : public IOperator +{ +public: + explicit PairwiseMatcher(NVCVPairwiseMatcherType algoChoice); + + ~PairwiseMatcher(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches, + const nvcv::Tensor &numMatches, const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline PairwiseMatcher::PairwiseMatcher(NVCVPairwiseMatcherType algoChoice) +{ + nvcv::detail::CheckThrow(cvcudaPairwiseMatcherCreate(&m_handle, algoChoice)); + assert(m_handle); +} + +inline PairwiseMatcher::~PairwiseMatcher() +{ + nvcvOperatorDestroy(m_handle); + m_handle = nullptr; +} + +inline void PairwiseMatcher::operator()(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, + const nvcv::Tensor &matches, const nvcv::Tensor &numMatches, + const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType) +{ + nvcv::detail::CheckThrow(cvcudaPairwiseMatcherSubmit( + m_handle, stream, set1.handle(), set2.handle(), numSet1.handle(), numSet2.handle(), matches.handle(), + numMatches.handle(), distances.handle(), crossCheck, matchesPerPoint, normType)); +} + +inline NVCVOperatorHandle PairwiseMatcher::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA_PAIRWISE_MATCHER_HPP diff --git a/src/cvcuda/include/cvcuda/OpPillowResize.h b/src/cvcuda/include/cvcuda/OpPillowResize.h index e3326f72..73bdb3a2 100644 --- a/src/cvcuda/include/cvcuda/OpPillowResize.h +++ b/src/cvcuda/include/cvcuda/OpPillowResize.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,10 +28,12 @@ #include "Operator.h" #include "Types.h" +#include "Workspace.h" #include "detail/Export.h" #include #include +#include #include #include @@ -44,17 +46,51 @@ extern "C" * * @param [out] handle Where the image instance handle will be written to. * + Must not be NULL. - * @param [in] maxWidth Maximum input and output image width. - * @param [in] maxHeight Maximum input and output image height. - * @param [in] maxBatchSize Maximum batchsize used in this operator. * @param [in] fmt Image format * * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. * @retval #NVCV_SUCCESS Operation executed successfully. */ -CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeCreate(NVCVOperatorHandle *handle, int32_t maxWidth, int32_t maxHeight, - int32_t maxBatchSize, NVCVImageFormat fmt); +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeCreate(NVCVOperatorHandle *handle); + +/** Calculates the upper bounds of buffer sizes required to run the operator + * + * @param [in] handle Where the image instance handle will be written to. 
+ * + Must not be NULL. + * @param [in] maxBatchSize Maximum batchsize used in this operator. + * @param [in] maxWidth Maximum input and output image width. + * @param [in] maxHeight Maximum input and output image height. + * @param [in] fmt Image format + * @param [out] reqOut Requirements for the operator's workspace + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null or one of the arguments is out of range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeGetWorkspaceRequirements(NVCVOperatorHandle handle, int maxBatchSize, + int32_t maxInWidth, int32_t maxInHeight, + int32_t maxOutWidth, int32_t maxOutHeight, + NVCVImageFormat fmt, + NVCVWorkspaceRequirements *reqOut); + +/** Calculates the buffer sizes required to run the operator + * + * @param [in] handle Where the image instance handle will be written to. + * + Must not be NULL. + * @param [in] batchSize The number of images + * @param [in] inputSizes The sizes of the input images + * @param [in] outputSizes The sizes of the output images + * @param [in] fmt Image format + * @param [out] reqOut Requirements for the operator's workspace + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null or one of the arguments is out of range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeVarShapeGetWorkspaceRequirements(NVCVOperatorHandle handle, int batchSize, + const NVCVSize2D *inputSizesWH, + const NVCVSize2D *outputSizesWH, + NVCVImageFormat fmt, + NVCVWorkspaceRequirements *reqOut); /** Executes the pillow resize operation on the given cuda stream. This operation does not * wait for completion. @@ -117,12 +153,14 @@ CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeCreate(NVCVOperatorHandle *handle, in * @retval #NVCV_SUCCESS Operation executed successfully. 
*/ /** @{ */ -CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, - NVCVTensorHandle out, const NVCVInterpolationType interpolation); +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + const NVCVWorkspace *workspace, NVCVTensorHandle in, + NVCVTensorHandle out, NVCVInterpolationType interpolation); -CVCUDA_PUBLIC NVCVStatus nvcvopPillowResizeVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, - NVCVImageBatchHandle in, NVCVImageBatchHandle out, - const NVCVInterpolationType interpolation); +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + const NVCVWorkspace *workspace, NVCVImageBatchHandle in, + NVCVImageBatchHandle out, + NVCVInterpolationType interpolation); /** @} */ #ifdef __cplusplus } diff --git a/src/cvcuda/include/cvcuda/OpPillowResize.hpp b/src/cvcuda/include/cvcuda/OpPillowResize.hpp index ff48c799..6d647503 100644 --- a/src/cvcuda/include/cvcuda/OpPillowResize.hpp +++ b/src/cvcuda/include/cvcuda/OpPillowResize.hpp @@ -28,6 +28,7 @@ #include "IOperator.hpp" #include "OpPillowResize.h" +#include "Workspace.hpp" #include #include @@ -41,26 +42,31 @@ namespace cvcuda { class PillowResize final : public IOperator { public: - explicit PillowResize(nvcv::Size2D maxSize, int32_t maxBatchSize, nvcv::ImageFormat fmt); + PillowResize(); ~PillowResize(); - void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, - const NVCVInterpolationType interpolation); + WorkspaceRequirements getWorkspaceRequirements(int batchSize, const nvcv::Size2D *in_sizes, + const nvcv::Size2D *out_sizes, nvcv::ImageFormat fmt); + + WorkspaceRequirements getWorkspaceRequirements(int maxBatchSize, nvcv::Size2D maxInSize, nvcv::Size2D maxOutSize, + nvcv::ImageFormat fmt); - void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVInterpolationType interpolation); + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &in, + const nvcv::ImageBatchVarShape &out, const NVCVInterpolationType interpolation); + virtual NVCVOperatorHandle handle() const noexcept override; private: NVCVOperatorHandle m_handle; }; -inline PillowResize::PillowResize(nvcv::Size2D maxSize, int32_t maxBatchSize, nvcv::ImageFormat fmt) +inline PillowResize::PillowResize() { - NVCVImageFormat cfmt = fmt.cvalue(); - nvcv::detail::CheckThrow(cvcudaPillowResizeCreate(&m_handle, maxSize.w, maxSize.h, maxBatchSize, cfmt)); + nvcv::detail::CheckThrow(cvcudaPillowResizeCreate(&m_handle)); assert(m_handle); } @@ -70,17 +76,36 @@ inline PillowResize::~PillowResize() m_handle = nullptr; } -inline void PillowResize::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, - const NVCVInterpolationType interpolation) +inline WorkspaceRequirements PillowResize::getWorkspaceRequirements(int batchSize, const nvcv::Size2D *in_sizes, + const nvcv::Size2D *out_sizes, + nvcv::ImageFormat fmt) +{ + WorkspaceRequirements req{}; + nvcv::detail::CheckThrow(cvcudaPillowResizeVarShapeGetWorkspaceRequirements(m_handle, batchSize, in_sizes, + out_sizes, fmt.cvalue(), &req)); + return req; +} + +inline WorkspaceRequirements PillowResize::getWorkspaceRequirements(int maxBatchSize, nvcv::Size2D maxInSize, + nvcv::Size2D maxOutSize, 
nvcv::ImageFormat fmt) +{ + WorkspaceRequirements req{}; + nvcv::detail::CheckThrow(cvcudaPillowResizeGetWorkspaceRequirements( + m_handle, maxBatchSize, maxInSize.w, maxInSize.h, maxOutSize.w, maxOutSize.h, fmt.cvalue(), &req)); + return req; +} + +inline void PillowResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &in, + const nvcv::Tensor &out, const NVCVInterpolationType interpolation) { - nvcv::detail::CheckThrow(cvcudaPillowResizeSubmit(m_handle, stream, in.handle(), out.handle(), interpolation)); + nvcv::detail::CheckThrow(cvcudaPillowResizeSubmit(m_handle, stream, &ws, in.handle(), out.handle(), interpolation)); } -inline void PillowResize::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, +inline void PillowResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, const NVCVInterpolationType interpolation) { nvcv::detail::CheckThrow( - nvcvopPillowResizeVarShapeSubmit(m_handle, stream, in.handle(), out.handle(), interpolation)); + cvcudaPillowResizeVarShapeSubmit(m_handle, stream, &ws, in.handle(), out.handle(), interpolation)); } inline NVCVOperatorHandle PillowResize::handle() const noexcept diff --git a/src/cvcuda/include/cvcuda/OpSIFT.h b/src/cvcuda/include/cvcuda/OpSIFT.h index e22e2f48..45fa7308 100644 --- a/src/cvcuda/include/cvcuda/OpSIFT.h +++ b/src/cvcuda/include/cvcuda/OpSIFT.h @@ -157,12 +157,12 @@ CVCUDA_PUBLIC NVCVStatus cvcudaSIFTCreate(NVCVOperatorHandle *handle, int3 maxSh * @param [in] contrastThreshold The contrast threshold used to remove features with low contrast. The larger this * threshold, the less features are extracted by the operator. One suggestion, given * by the original algorithm description, is to use \f$ 0.03 \f$. - * + It must be between 0 and 1. + * + It must be positive. * * @param [in] edgeThreshold The edge threshold used to remove features that are similar to edges. The larger this * threshold, the more features are extracted by the operator. One suggestion, given by * the original algorithm description, is to use \f$ 10.0 \f$. - * + It must be between 0 and 1. + * + It must be positive. * * @param [in] initSigma The initial sigma to be applied by the first Gaussian filter done at the first octave. * This sigma is progressively applied for each scale-space layer within each octave diff --git a/src/cvcuda/include/cvcuda/OpStack.h b/src/cvcuda/include/cvcuda/OpStack.h new file mode 100644 index 00000000..58e9bff1 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpStack.h @@ -0,0 +1,121 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpStack.h + * + * @brief Defines types and functions to handle the Stack operation. 
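To show how the reworked interface fits together, here is a hedged sketch of the new workspace-driven PillowResize flow (illustrative, not part of the patch): it combines the getWorkspaceRequirements and operator() members above with the AllocateWorkspace helper added in Workspace.hpp later in this patch; the image sizes and format are assumptions.

    #include <cvcuda/OpPillowResize.hpp>
    #include <cvcuda/Workspace.hpp>
    #include <nvcv/ImageFormat.hpp>
    #include <nvcv/Tensor.hpp>
    #include <cuda_runtime.h>

    void ResizeBatch(cudaStream_t stream)
    {
        constexpr int batch = 4; // illustrative sizes and format
        nvcv::ImageFormat fmt = nvcv::FMT_RGB8;

        nvcv::Tensor in(batch, {1024, 768}, fmt);
        nvcv::Tensor out(batch, {640, 480}, fmt);

        cvcuda::PillowResize pillowResize;

        // Query an upper bound for the scratch memory, then allocate it once and reuse it
        // for every submit that stays within these maximum sizes.
        cvcuda::WorkspaceRequirements req = pillowResize.getWorkspaceRequirements(
            batch, nvcv::Size2D{1024, 768}, nvcv::Size2D{640, 480}, fmt);
        cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace(req);

        pillowResize(stream, ws.get(), in, out, NVCV_INTERP_LINEAR);
        cudaStreamSynchronize(stream);
    }

For repeated calls with varying sizes, the MaxWorkspaceReq helper (also added in Workspace.hpp) can merge requirements so that a single workspace covers all of them.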
+ * @defgroup NVCV_C_ALGORITHM__STACK Stack + * @{ + */ + +#ifndef CVCUDA__STACK_H +#define CVCUDA__STACK_H + +#include "Operator.h" +#include "detail/Export.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Constructs an instance of the Stack operator. + * + * @param [out] handle Where the operator instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaStackCreate(NVCVOperatorHandle *handle); + +/** + * + * Executes the Stack operation on the given cuda stream. This operation does not + * wait for completion. The stack operation copies the source tensors from the input tensor batch into an output tensor. + * The output tensor is a concatenation of the source tensors, with each source tensor copied into + * the output tensor. All of the source tensors must have the same data type, number of channels, width and height. + * + * Limitations: + * + * Input: + * Data Layout: [NHWC, NCHW, CHW, HWC] + * Channels: [1,2,3,4] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | Yes + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | Yes + * 32bit Signed | Yes + * 32bit Float | Yes + * 64bit Float | Yes + * + * Output: + * Data Layout: [NHWC, NCHW] + * Channels: [1,2,3,4] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | Yes + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | Yes + * 32bit Signed | Yes + * 32bit Float | Yes + * 64bit Float | Yes + * + * Input/Output dependency + * + * Property | Input == Output + * -------------- | ------------- + * Data Layout | Yes + * Data Type | Yes + * Number | No + * Channels | Yes + * Width | Yes + * Height | Yes + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] in input tensor batch. + * + * @param [out] out output tensor NHWC/NCHW where N is equal to the number of all input tensors. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaStackSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorBatchHandle in, + NVCVTensorHandle out); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA__STACK_H */ diff --git a/src/cvcuda/include/cvcuda/OpStack.hpp b/src/cvcuda/include/cvcuda/OpStack.hpp new file mode 100644 index 00000000..8f85a736 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpStack.hpp @@ -0,0 +1,79 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpStack.hpp + * + * @brief Defines the public C++ Class for the Stack operation. + * @defgroup NVCV_CPP_ALGORITHM__STACK Stack + * @{ + */ + +#ifndef CVCUDA__STACK_HPP +#define CVCUDA__STACK_HPP + +#include "IOperator.hpp" +#include "OpStack.h" + +#include +#include +#include +#include +#include + +namespace cvcuda { + +class Stack final : public IOperator +{ +public: + explicit Stack(); + + ~Stack(); + + void operator()(cudaStream_t stream, const nvcv::TensorBatch &in, const nvcv::Tensor &out); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline Stack::Stack() +{ + nvcv::detail::CheckThrow(cvcudaStackCreate(&m_handle)); + assert(m_handle); +} + +inline Stack::~Stack() +{ + nvcvOperatorDestroy(m_handle); + m_handle = nullptr; +} + +inline void Stack::operator()(cudaStream_t stream, const nvcv::TensorBatch &in, const nvcv::Tensor &out) +{ + nvcv::detail::CheckThrow(cvcudaStackSubmit(m_handle, stream, in.handle(), out.handle())); +} + +inline NVCVOperatorHandle Stack::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA__STACK_HPP diff --git a/src/cvcuda/include/cvcuda/Types.h b/src/cvcuda/include/cvcuda/Types.h index 67b527cc..17bb8f62 100644 --- a/src/cvcuda/include/cvcuda/Types.h +++ b/src/cvcuda/include/cvcuda/Types.h @@ -385,6 +385,42 @@ typedef enum NVCV_SIFT_USE_EXPANDED_INPUT = 1 } NVCVSIFTFlagType; +// @brief Defines connectivity of elements +typedef enum +{ + NVCV_CONNECTIVITY_4_2D, //!< Immediate (cross) neighborhood of pixels. + NVCV_CONNECTIVITY_6_3D, //!< Immediate (cross) neighborhood of voxels. + NVCV_CONNECTIVITY_8_2D, //!< All direct (full) neighborhood of pixels. + NVCV_CONNECTIVITY_26_3D, //!< All direct (full) neighborhood of voxels. +} NVCVConnectivityType; + +// @brief Defines how labels are assigned in Label operator +typedef enum +{ + NVCV_LABEL_FAST, //!< Does not guarantee consecutive label numbers. + NVCV_LABEL_SEQUENTIAL, //!< Assigns consecutive numbers to labels. 
+} NVCVLabelType; + +// @brief Defines pair-wise matcher algorithms of choice +typedef enum +{ + NVCV_BRUTE_FORCE //!< Select brute-force algorithm as the matcher +} NVCVPairwiseMatcherType; + +// @brief Defines how a vector normalization should occur +typedef enum +{ + NVCV_NORM_HAMMING = 0, //!< Equivalent to the Hamming distance (or L_0 norm) + NVCV_NORM_L1 = 1, //!< Equivalent to the absolute distance = |x1-x2| + |y1-y2| (or L_1 norm) + NVCV_NORM_L2 = 2, //!< Equivalent to the Euclidean distance (or L_2 norm) + NVCV_NORM_C = 3, //!< distance = max(|x1-x2|,|y1-y2|) + NVCV_NORM_L12 = 4, //!< L1-L2 metric: distance = 2(sqrt(1+x*x/2) - 1)) + NVCV_NORM_FAIR = 5, //!< distance = c^2(|x|/c-log(1+|x|/c)), c = 1.3998 + NVCV_NORM_WELSCH = 6, //!< distance = c^2/2(1-exp(-(x/c)^2)), c = 2.9846 + NVCV_NORM_HUBER = 7, //!< distance = |x| +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Defines requirements for workspace memory + */ +typedef struct NVCVWorkspaceMemRequirementsRec +{ + /** Size, in bytes, of the required memory */ + size_t size; + /** Alignment, in bytes, of the required memory */ + size_t alignment; +} NVCVWorkspaceMemRequirements; + +/** Aggregates requirements for all resource kinds in a workspace + */ +typedef struct NVCVWorkspaceRequirementsRec +{ + /** Requirements for plain host memory */ + NVCVWorkspaceMemRequirements hostMem; + /** Requirements for GPU-accessible host memory (e.g. allocated with cudaHostAlloc) */ + NVCVWorkspaceMemRequirements pinnedMem; + /** Requirements for GPU memory */ + NVCVWorkspaceMemRequirements cudaMem; +} NVCVWorkspaceRequirements; + +/** Memory block for use in a workspace object. + * + * A workspace memory structure contains the requriements (these can be useful when obtaining memory from the workspace) + * a pointer to the memory object and an optional CUDA event object which notifies that the memory is ready to use. + * + */ +typedef struct NVCVWorkspaceMemRec +{ + /** The requirements that the memory pointed to by `data` must satisfy */ + NVCVWorkspaceMemRequirements req; + + /** The pointer to the workspace memory. + * + * @remark The accessibility of the memory may be restricted to the host or a specific device. + */ + void *data; + + /** The event which notifies that the memory is ready to use. + * + * The event object is used in two ways - the user (e.g. an operator) of the workspace memory should wait for the + * event in the context in which it will use the memory as well as record the event after it has scheduled all work + * that uses the memory object. + */ + cudaEvent_t ready; +} NVCVWorkspaceMem; + +/** Aggregates multiple resources into a single workspace objects */ +typedef struct NVCVWorkspaceRec +{ + /** Plain host memory. This should not be used in any GPU code. + * + * On systems with a discrete GPU, this kind of memory doesn't need a CUDA event. On systems with integrated GPU + * or HMM systems, there's no difference between plain and pinned host memory with respect to synchronization. + */ + NVCVWorkspaceMem hostMem; + + /** Pinned host memory. + * + * cudaXxxAsync operations on this kind of memory are performed truly asynchronously, which calls for + * synchronization. + * When used as a staging buffer for passing data to a CUDA kernel, a typical synchronization scheme would be to + * wait for the `ready` event on host (cudaEventSynchronize), issue H2D copy and record the `ready` event. 
+ */ + NVCVWorkspaceMem pinnedMem; + + /** GPU memory */ + NVCVWorkspaceMem cudaMem; +} NVCVWorkspace; + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDAERATORS_WORKSPACE_H */ diff --git a/src/cvcuda/include/cvcuda/Workspace.hpp b/src/cvcuda/include/cvcuda/Workspace.hpp new file mode 100644 index 00000000..65a9ddfd --- /dev/null +++ b/src/cvcuda/include/cvcuda/Workspace.hpp @@ -0,0 +1,203 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CVCUDAERATORS_WORKSPACE_HPP +#define CVCUDAERATORS_WORKSPACE_HPP + +#include "Workspace.h" + +#include +#include + +#include +#include +#include + +namespace cvcuda { + +using Workspace = NVCVWorkspace; +using WorkspaceMem = NVCVWorkspaceMem; +using WorkspaceRequirements = NVCVWorkspaceRequirements; +using WorkspaceMemRequirements = NVCVWorkspaceMemRequirements; + +/** Computes memory requirements that can cover both input requirements. + * + * The resulting memory requriements will have alignment and size that is not smaller than that of either + * of the arguments. + * + * alignment = max(a.alignment, b.alignment) + * size = align_up(max(a.size, b.size), alignment) + */ +inline WorkspaceMemRequirements MaxWorkspaceReq(WorkspaceMemRequirements a, WorkspaceMemRequirements b) +{ + WorkspaceMemRequirements ret; + assert(!a.size || a.alignment > 0); + assert(!b.size || b.alignment > 0); + ret.alignment = b.alignment > a.alignment ? b.alignment : a.alignment; + ret.size = b.size > a.size ? b.size : a.size; + assert((ret.alignment & (ret.alignment - 1)) == 0 && "Alignment must be a power of 2"); + ret.size = nvcv::detail::AlignUp(ret.size, ret.alignment); + return ret; +} + +/** Computes workspace requirements that can cover both input requirments. */ +inline NVCVWorkspaceRequirements MaxWorkspaceReq(const WorkspaceRequirements &a, const WorkspaceRequirements &b) +{ + WorkspaceRequirements ret; + ret.hostMem = MaxWorkspaceReq(a.hostMem, b.hostMem); + ret.pinnedMem = MaxWorkspaceReq(a.pinnedMem, b.pinnedMem); + ret.cudaMem = MaxWorkspaceReq(a.cudaMem, b.cudaMem); + return ret; +} + +/** A helper class that manages the lifetime of resources stored in a Workspace structure. + * + * This class works in a way similar to unique_ptr with a custom deleter. 
+ */ +class UniqueWorkspace +{ +public: + using DeleterFunc = void(NVCVWorkspace &); + using Deleter = std::function; + + UniqueWorkspace() = default; + + UniqueWorkspace(const UniqueWorkspace &) = delete; + + UniqueWorkspace(UniqueWorkspace &&ws) + { + swap(ws); + } + + UniqueWorkspace &operator=(const UniqueWorkspace &) = delete; + + UniqueWorkspace &operator=(UniqueWorkspace &&ws) noexcept + { + swap(ws); + ws.reset(); + return *this; + } + + UniqueWorkspace(Workspace workspace, Deleter del = {}) + : m_impl(workspace) + , m_del(std::move(del)) + { + } + + UniqueWorkspace(WorkspaceMem host, WorkspaceMem pinned, WorkspaceMem cuda, Deleter del = {}) + : m_impl{host, pinned, cuda} + , m_del(std::move(del)) + { + } + + ~UniqueWorkspace() + { + reset(); + } + + void reset() noexcept + { + if (m_del) + { + m_del(m_impl); + m_del = {}; + m_impl = {}; + } + } + + const Workspace &get() const + { + return m_impl; + } + +private: + void swap(UniqueWorkspace &ws) + { + std::swap(m_impl, ws.m_impl); + std::swap(m_del, ws.m_del); + } + + Workspace m_impl{}; + Deleter m_del{}; +}; + +/** Allocates a workspace with an allocator specified in `alloc` (or a default one). + * + * This function is meant as a simple helper to simplify the usage operators requiring a workspace, but its intense use + * may degrade performance due to excessive allocations and deallocations. + * For code used in tight loops, some workspace reuse scheme and/or resource pools are recommended. + */ +inline UniqueWorkspace AllocateWorkspace(WorkspaceRequirements req, nvcv::Allocator alloc = {}) +{ + if (!alloc) + { + nvcv::CustomAllocator<> cust{}; + alloc = std::move(cust); + } + auto del = [alloc](NVCVWorkspace &ws) + { + // TODO(michalz): Add proper CUDA error handling in public API + if (ws.hostMem.data) + { + if (ws.hostMem.ready) + if (cudaEventSynchronize(ws.hostMem.ready) != cudaSuccess) + throw std::runtime_error("cudaEventSynchronize failed"); + alloc.hostMem().free(ws.hostMem.data, ws.hostMem.req.size, ws.hostMem.req.alignment); + ws.hostMem.data = nullptr; + } + if (ws.pinnedMem.data) + { + if (ws.pinnedMem.ready) + if (cudaEventSynchronize(ws.pinnedMem.ready) != cudaSuccess) + throw std::runtime_error("cudaEventSynchronize failed"); + alloc.hostPinnedMem().free(ws.pinnedMem.data, ws.pinnedMem.req.size, ws.pinnedMem.req.alignment); + ws.pinnedMem.data = nullptr; + } + if (ws.cudaMem.data) + { + if (ws.cudaMem.ready) + if (cudaEventSynchronize(ws.cudaMem.ready) != cudaSuccess) + throw std::runtime_error("cudaEventSynchronize failed"); + alloc.cudaMem().free(ws.cudaMem.data, ws.cudaMem.req.size, ws.cudaMem.req.alignment); + ws.cudaMem.data = nullptr; + } + }; + NVCVWorkspace ws = {}; + try + { + ws.hostMem.req = req.hostMem; + ws.pinnedMem.req = req.pinnedMem; + ws.cudaMem.req = req.cudaMem; + + if (req.hostMem.size) + ws.hostMem.data = alloc.hostMem().alloc(req.hostMem.size, req.hostMem.alignment); + if (req.pinnedMem.size) + ws.pinnedMem.data = alloc.hostPinnedMem().alloc(req.pinnedMem.size, req.pinnedMem.alignment); + if (req.cudaMem.size) + ws.cudaMem.data = alloc.cudaMem().alloc(req.cudaMem.size, req.cudaMem.alignment); + return UniqueWorkspace(ws, std::move(del)); + } + catch (...) 
+ { + del(ws); + throw; + } +} + +} // namespace cvcuda + +#endif // CVCUDAERATORS_WORKSPACE_HPP diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index 419dc05a..cd3904c4 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -26,8 +26,8 @@ set(CV_CUDA_PRIV_OP_FILES OpMinMaxLoc.cu OpHistogram.cpp OpMinAreaRect.cpp - OpBoxBlur.cpp OpBndBox.cpp + OpBoxBlur.cpp OpBrightnessContrast.cu OpRemap.cu OpColorTwist.cu @@ -64,6 +64,10 @@ set(CV_CUDA_PRIV_OP_FILES OpRandomResizedCrop.cpp OpGaussianNoise.cpp OpInpaint.cpp + OpLabel.cu + OpPairwiseMatcher.cu + OpStack.cpp + OpFindHomography.cu ) # filter only one that matches the patern (case insensitive), should be set on the global level @@ -97,5 +101,8 @@ target_link_libraries(cvcuda_priv nvcv_util_sanitizer cvcuda_legacy CUDA::cudart_static + CUDA::cusolver + CUDA::cublas + CUDA::cublasLt -lrt ) diff --git a/src/cvcuda/priv/OpBndBox.cpp b/src/cvcuda/priv/OpBndBox.cpp index 05974888..36fa702f 100644 --- a/src/cvcuda/priv/OpBndBox.cpp +++ b/src/cvcuda/priv/OpBndBox.cpp @@ -30,7 +30,7 @@ namespace legacy = nvcv::legacy::cuda_op; BndBox::BndBox() { legacy::DataShape maxIn, maxOut; //maxIn/maxOut not used by op. - m_legacyOp = std::make_unique(maxIn, maxOut); + m_legacyOp = std::make_unique(maxIn, maxOut); } void BndBox::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, @@ -50,7 +50,7 @@ void BndBox::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv: "Output must be cuda-accessible, pitch-linear tensor"); } - NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *outData, bboxes, stream)); + NVCV_CHECK_THROW(m_legacyOp->inferBox(*inData, *outData, bboxes, stream)); } } // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpBndBox.hpp b/src/cvcuda/priv/OpBndBox.hpp index d2904917..00364f59 100644 --- a/src/cvcuda/priv/OpBndBox.hpp +++ b/src/cvcuda/priv/OpBndBox.hpp @@ -42,7 +42,7 @@ class BndBox final : public IOperator const NVCVBndBoxesI &bboxes) const; private: - std::unique_ptr m_legacyOp; + std::unique_ptr m_legacyOp; }; } // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu index 51ebf8d3..8c9426b7 100644 --- a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu +++ b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu @@ -100,7 +100,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( - (srcWrap[(int4){src_idx.x, src_idx.y, batchidx, c}] - base) * scale * global_scale + global_shift); + (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift); } } else @@ -110,7 +110,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( - (srcWrap[(int4){src_idx.x, src_idx.y, batchidx, c}] - base) * scale * global_scale + global_shift); + (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift); } } } diff --git a/src/cvcuda/priv/OpFindHomography.cu b/src/cvcuda/priv/OpFindHomography.cu new file mode 100644 index 00000000..d3e712cb --- /dev/null +++ 
b/src/cvcuda/priv/OpFindHomography.cu @@ -0,0 +1,1615 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OpFindHomography.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define BLOCK_SIZE 128 +#define PIPELINES 8 + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +typedef cuda::math::Vector vector8; +typedef cuda::math::Vector vector9; +typedef cuda::math::Vector intvector8; +typedef cuda::math::Vector vector32; +typedef cuda::math::Matrix matrix8x8; +typedef cuda::math::Matrix matrix8x32; +typedef cuda::math::Matrix dmatrix8x8; +typedef cuda::math::Vector dvector8; + +namespace { + +#define is_aligned(POINTER, BYTE_COUNT, msg) \ + do \ + { \ + if (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) != 0) \ + { \ + std::cerr << msg << " at line " << __LINE__ << " in " << __FILE__ << std::endl; \ + return; \ + } \ + } \ + while (0) + +#define CUDA_CHECK_ERROR(err, msg) \ + do \ + { \ + cudaError_t _err = (err); \ + if (_err != cudaSuccess) \ + { \ + std::cerr << "(" << cudaGetErrorString(_err) << ") at line " << __LINE__ << " in " << __FILE__ << " : " \ + << msg << std::endl; \ + return; \ + } \ + } \ + while (0) + +#define CUBLAS_CHECK_ERROR(err, msg) \ + do \ + { \ + cublasStatus_t _err = (err); \ + if (_err != CUBLAS_STATUS_SUCCESS) \ + { \ + std::cerr << "CUBLAS error (" << _err << ") at line " << __LINE__ << " in " << __FILE__ << " : " << msg \ + << std::endl; \ + return; \ + } \ + } \ + while (0) + +#define CUSOLVER_CHECK_ERROR(err, msg) \ + do \ + { \ + cusolverStatus_t _err = (err); \ + if (_err != CUSOLVER_STATUS_SUCCESS) \ + { \ + std::cerr << "CUSOLVER error (" << _err << ") at line " << __LINE__ << " in " << __FILE__ << " : " << msg \ + << std::endl; \ + return; \ + } \ + } \ + while (0) + +#ifdef DEBUG +template +__global__ void printKernel(T *data, int numPoints, int batchIdx) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < numPoints) + printf("Batch = %d, i = %d, val = %.9g,\n", batchIdx, i, (double)data[i]); +} + +__global__ void printKernelfloat2(float2 *data, int numPoints, int batchIdx) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < numPoints) + printf("Batch = %d, i = %d, val = %.9g,%.9g\n", batchIdx, i, (double)data[i].x, (double)data[i].y); +} +#endif + +#ifdef DEBUG_MODEL_KERNEL +template +__global__ void printMatrix(T *data, int M, int N) +{ + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + { + printf("%.9g, ", (double)data[i * N + j]); + } + printf("\n"); + } +} + +template +__global__ void printMatrixCols(T *data, int M, int N) +{ + for (int j = 0; j < N; j++) + { + printf("ROw %d\n", j); + for (int i = 0; i < M; i++) + { + printf("A[%d + %d * lda] = %g;\n", i, j, (double)data[i 
* N + j]); + } + printf("\n"); + } +} + +template +__device__ void printMatrixDevice(cuda::math::Matrix &A) +{ + for (int i = 0; i < N; i++) + { + printf("["); + for (int j = 0; j < N; j++) + { + printf("%.9g, ", A[i][j]); + } + printf("],\n"); + } +} + +template +__device__ void printMatrixDeviceParallel(cuda::math::Matrix &A) +{ + __threadfence(); + if (threadIdx.x < N) + { + for (int i = 0; i < N; i++) + { + printf("A[%d][%d] = %g\n", i, threadIdx.x, (double)A[i][threadIdx.x]); + } + } +} + +template +__device__ void printMatrixDeviceRaw(T *A, int M, int N, int batch) +{ + printf("Batch = %d\n", batch); + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + { + printf("%g, ", (double)A[i * N + j]); + } + printf("\n"); + } +} + +template +__device__ void printVectorDevice(cuda::math::Vector &x, int batch) +{ + printf("Batch = %d\n", batch); + for (int i = 0; i < N; i++) printf("%.9g, ", x[i]); +} + +template +__device__ void printVectorDeviceParallel(cuda::math::Vector &x, int batch) +{ + if (threadIdx.x < N) + printf("x[%d] = %g\n", threadIdx.x, (double)x[threadIdx.x]); +} + +template +__device__ void printVectorDeviceRaw(T *x, int N, int batch) +{ + printf("Batch = %d\n", batch); + for (int i = 0; i < N; i++) printf("%g, ", (double)x[i]); +} +#endif + +__device__ void calculate_residual_and_jacobian_device(float2 *src, float2 *dst, vector8 &h, int numPoints, float *Jptr, + float *errptr); +__device__ void calculate_residual_norm(float *r, float *r_norm_2, vector32 &warpSums, int numPoints); + +__device__ void calculate_Jtx_matvec(float *A, float *B, float *result, matrix8x32 &warpSums, int row, int numPoints); + +__device__ void calculate_JtJ(float *Jt, matrix8x8 &A, matrix8x32 &warpSums, float *reductionBuffer, int numPoints); + +__device__ void calculate_Jtr(float *Jt, float *r, vector8 &v, matrix8x32 &warpSums, float *reductionBuffer, + int numPoints); + +__device__ void fetch_diagonal(matrix8x8 &A, vector8 &D, int tid); + +__device__ void copy_A_to_Ap_App(matrix8x8 &A, matrix8x8 &Ap, matrix8x8 &App); + +__device__ void scale_diagonal8(vector8 &D, matrix8x8 &Ap, float lambda); + +__device__ void compute_qr8x8(matrix8x8 &sA, matrix8x8 &sQ); + +__device__ bool backsolve_inplace(matrix8x8 &A, vector8 &d); + +__device__ bool solve8x8(matrix8x8 &A, matrix8x8 &Q, vector8 &v, vector8 &d, int tid); + +__device__ bool invert8x8(matrix8x8 &A, matrix8x8 &Q, matrix8x8 &invA, int tid); + +__device__ void subtract8(vector8 &x, vector8 &d, vector8 &xd, int tid); + +__device__ void max_diag_val8(matrix8x8 &A, float *maxval); + +__device__ void max8(vector8 &v, float *maxval); + +__device__ static float atomicMax(float *address, float val); + +__device__ void calculate_temp_d(matrix8x8 &A, vector8 &x, vector8 &y, vector8 &z, float alpha, float beta, int tid); + +__device__ int compute_model_estimate(float2 cM, float2 cm, float2 sM, float2 sm, float *W, float *V, vector8 &x, + cuda::Tensor3DWrap model, int batch, int numPoints); + +__device__ void calculate_residual_and_jacobian_device(float2 *src, float2 *dst, vector8 &h, int numPoints, float *Jptr, + float *errptr) +{ + int idx = threadIdx.x; + + for (int tid = idx; tid < numPoints; tid += blockDim.x) + { + float2 M_i = src[tid]; + float2 m_i = dst[tid]; + float Mx = M_i.x, My = M_i.y; + float mx = m_i.x, my = m_i.y; + + float ww = h[6] * Mx + h[7] * My + 1.; + ww = fabs(ww) > FLT_EPSILON ? 1. 
/ ww : 0; + float xi = (h[0] * Mx + h[1] * My + h[2]) * ww; + float yi = (h[3] * Mx + h[4] * My + h[5]) * ww; + + errptr[tid * 2] = xi - mx; + errptr[tid * 2 + 1] = yi - my; + + if (Jptr) + { + // Column major format + Jptr[tid * 2 + numPoints * 0 + 0] = Mx * ww; + Jptr[tid * 2 + numPoints * 0 + 1] = 0; + Jptr[tid * 2 + numPoints * 2 + 0] = My * ww; + Jptr[tid * 2 + numPoints * 2 + 1] = 0; + Jptr[tid * 2 + numPoints * 4 + 0] = ww; + Jptr[tid * 2 + numPoints * 4 + 1] = 0; + Jptr[tid * 2 + numPoints * 6 + 0] = 0; + Jptr[tid * 2 + numPoints * 6 + 1] = Mx * ww; + Jptr[tid * 2 + numPoints * 8 + 0] = 0; + Jptr[tid * 2 + numPoints * 8 + 1] = My * ww; + Jptr[tid * 2 + numPoints * 10 + 0] = 0; + Jptr[tid * 2 + numPoints * 10 + 1] = ww; + Jptr[tid * 2 + numPoints * 12 + 0] = -Mx * ww * xi; + Jptr[tid * 2 + numPoints * 12 + 1] = -Mx * ww * yi; + Jptr[tid * 2 + numPoints * 14 + 0] = -My * ww * xi; + Jptr[tid * 2 + numPoints * 14 + 1] = -My * ww * yi; + } + } +} + +__host__ __device__ inline float myfabs(float val) +{ + return fabsf(val); +} + +inline __host__ __device__ float2 myfabs2(float2 val) +{ + float2 ret; + ret.x = fabsf(val.x); + ret.y = fabsf(val.y); + return ret; +} + +__host__ __device__ inline int getNumPoints(cuda::Tensor2DWrap src, int numPoints, int batch) +{ + return numPoints; +} + +struct MeanOp +{ + __host__ __device__ float2 eval(float2 val, int numPoints, int batch) + { + return val / numPoints; + } +}; + +struct SquareOp +{ + __host__ __device__ float eval(float val, int batch) + { + return val * val; + } +}; + +class AbsShiftOp +{ +private: + float2 *_data; + +public: + // Constructor that takes a float* pointer as a parameter + __host__ __device__ AbsShiftOp(float2 *data) + : _data(data){}; + + // Method to update the float value pointed to by the pointer + __host__ __device__ float2 eval(float2 newVal, int numPoints, int batch) + { + _data += batch; + return myfabs2(newVal - _data[0]); + } +}; + +class LtLOp +{ +private: + float2 *cm, *cM, *sm, *sM; + +public: + __host__ __device__ LtLOp(float2 *srcMean, float2 *dstMean, float2 *srcShiftSum, float2 *dstShiftSum) + { + cM = srcMean; + sM = srcShiftSum; + cm = dstMean; + sm = dstShiftSum; + } + + __host__ __device__ float eval(float2 *src, float2 *dst, int batch, int numPoints, int tid, int j, int k) + { + cm += batch; + cM += batch; + sm += batch; + sM += batch; + float X = (src[tid].x - cM[0].x) * (numPoints / sM[0].x); + float Y = (src[tid].y - cM[0].y) * (numPoints / sM[0].y); + float x = (dst[tid].x - cm[0].x) * (numPoints / sm[0].x); + float y = (dst[tid].y - cm[0].y) * (numPoints / sm[0].y); + float Lx[9] = {X, Y, 1, 0, 0, 0, -x * X, -x * Y, -x}; + float Ly[9] = {0, 0, 0, X, Y, 1, -y * X, -y * Y, -y}; + return Lx[j] * Lx[k] + Ly[j] * Ly[k]; + } +}; + +template +__device__ void reducef(float *data, cuda::math::Vector &warpSums, float *result, Func op, int numPoints, + int batch) +{ + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + float val = 0.0f; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + val += op.eval(data[idx], batch); + idx += gridDim.x * blockDim.x; + } + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val += __shfl_down_sync(mask, val, offset); + if (lane == 0) + warpSums[warpID] = val; + + __syncthreads(); + + if (warpID == 0) + { + val = (tid < blockDim.x / warpSize) ? 
warpSums[lane] : 0.0f; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val += __shfl_down_sync(mask, val, offset); + + if (tid == 0) + atomicAdd(result, val); + } +} + +template +__device__ void reducef2(float2 *data, cuda::math::Vector &warpSums, float2 *result, Func op, int numPoints, + int batch) +{ + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + float2 val = {0.0f, 0.0f}; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + val += op.eval(data[idx], numPoints, batch); + idx += gridDim.x * blockDim.x; + } + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) + { + val.x += __shfl_down_sync(mask, val.x, offset); + val.y += __shfl_down_sync(mask, val.y, offset); + } + if (lane == 0) + warpSums[warpID] = val; + + __syncthreads(); + + if (warpID == 0) + { + val = (tid < blockDim.x / warpSize) ? warpSums[lane] : float2{0.0f, 0.0f}; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) + { + val.x += __shfl_down_sync(mask, val.x, offset); + val.y += __shfl_down_sync(mask, val.y, offset); + } + + if (tid == 0) + { + atomicAdd(&result[0].x, val.x); + atomicAdd(&result[0].y, val.y); + } + } +} + +template +__device__ void reduceLtL(float2 *src, float2 *dst, cuda::math::Vector &warpSums, float *result, Func op, + int numPoints, int batch, int j, int k) +{ + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + float val = 0.0f; + ; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + // j < 9 and k < 9 are indices of the LtL matrix + val += op.eval(src, dst, batch, numPoints, idx, j, k); + idx += gridDim.x * blockDim.x; + } + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val += __shfl_down_sync(mask, val, offset); + if (lane == 0) + warpSums[warpID] = val; + __syncthreads(); + + if (warpID == 0) + { + val = (tid < blockDim.x / warpSize) ? warpSums[lane] : 0.0f; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val += __shfl_down_sync(mask, val, offset); + + if (tid == 0) + atomicAdd(result, val); + } +} + +__device__ void calculate_residual_norm(float *r, float *r_norm_2, vector32 &warpSums, int numPoints) +{ + SquareOp square_op; + reducef(r, warpSums, r_norm_2, square_op, numPoints, 0); + __syncthreads(); +} + +__device__ void calculate_Jtx_matvec(float *A, float *B, float *result, matrix8x32 &warpSums, int row, int numPoints) +{ + // NOTE : Jt has to be of dimension (8 x innerDim) where innerDim = numPoints x 2 + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (threadIdx.x < 8) + { + for (int i = 0; i < 8; i++) + { + warpSums[i][threadIdx.x] = 0; + } + } + __syncthreads(); + + float val[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + float src_data_val = A[row * numPoints + idx]; +#pragma unroll + for (int r = row; r < 8; r++) val[r] += src_data_val * B[r * numPoints + idx]; + idx += gridDim.x * blockDim.x; + } + + for (int r = row; r < 8; r++) + { + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val[r] += __shfl_down_sync(mask, val[r], offset); + if (lane == 0) + warpSums[r][warpID] = val[r]; + } + __syncthreads(); + + if (warpID == 0) + { +#pragma unroll + for (int r = row; r < 8; r++) + { + val[r] = (tid < blockDim.x / warpSize) ? 
warpSums[r][lane] : 0; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val[r] += __shfl_down_sync(mask, val[r], offset); + + if (tid == 0) + atomicAdd(&result[r], val[r]); + } + } +} + +__device__ void calculate_JtJ(float *Jt, matrix8x8 &A, matrix8x32 &warpSums, float *reductionBuffer, int numPoints) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int row = 0; row < 8; row++) + { + if (tid < 8) + reductionBuffer[tid] = 0; + calculate_Jtx_matvec(Jt, Jt, reductionBuffer, warpSums, row, numPoints); + __syncthreads(); + if (tid < 8) + { + A[row][tid] = reductionBuffer[tid]; + } + } + __syncwarp(); + + for (int row = 1; row < 8; row++) + { + if (tid < row) + A[row][tid] = A[tid][row]; + } + __syncwarp(); +} + +__device__ void calculate_Jtr(float *Jt, float *r, vector8 &v, matrix8x32 &warpSums, float *reductionBuffer, + int numPoints) +{ + if (threadIdx.x < 8) + reductionBuffer[threadIdx.x] = 0.0f; + calculate_Jtx_matvec(r, Jt, reductionBuffer, warpSums, 0, numPoints); + __syncthreads(); + if (threadIdx.x < 8) + v[threadIdx.x] = reductionBuffer[threadIdx.x]; + __syncwarp(); +} + +__device__ void fetch_diagonal(matrix8x8 &A, vector8 &D, int tid) +{ + if (tid < 8) + D[tid] = A[tid][tid]; + __syncwarp(); +} + +__device__ void copy_A_to_Ap_App(matrix8x8 &A, matrix8x8 &Ap, matrix8x8 &App) +{ + if (threadIdx.x < 8) + { + for (int i = 0; i < 8; i++) + { + Ap[i][threadIdx.x] = A[i][threadIdx.x]; + App[i][threadIdx.x] = A[i][threadIdx.x]; + } + } + __syncwarp(); +} + +__device__ void scale_diagonal8(vector8 &D, matrix8x8 &Ap, float lambda) +{ + if (threadIdx.x < 8) + Ap[threadIdx.x][threadIdx.x] += lambda * D[threadIdx.x]; + __syncwarp(); +} + +__device__ void compute_qr8x8(matrix8x8 &sA, matrix8x8 &sQ) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int N = 8; + if (tid < N) + { + for (int i = 0; i < N; i++) + { + sQ[i][tid] = 0; + if (i == tid) + sQ[i][tid] = 1; + } + } + __syncwarp(); + + float s[2]; + double temp[2]; + for (int j = 0; j < N; j++) + { + int pivot_row = j; + for (int i = j + 1; i < N; i++) + { + if (tid < N) + { + double theta = atan(-(double)sA[i][j] / (double)sA[pivot_row][j]); + double ctheta = cos(theta); + double stheta = sin(theta); + float sthetaf = (float)stheta; + float cthetaf = (float)ctheta; + + temp[0] = ctheta * sA[pivot_row][tid] - stheta * sA[i][tid]; + temp[1] = stheta * sA[pivot_row][tid] + ctheta * sA[i][tid]; + sA[pivot_row][tid] = temp[0]; + sA[i][tid] = temp[1]; + + s[0] = cthetaf * sQ[pivot_row][tid] - sthetaf * sQ[i][tid]; + s[1] = sthetaf * sQ[pivot_row][tid] + cthetaf * sQ[i][tid]; + sQ[pivot_row][tid] = s[0]; + sQ[i][tid] = s[1]; + } + __syncwarp(); + } + } + __syncwarp(); +} + +__device__ bool backsolve_inplace(matrix8x8 &A, vector8 &d) +{ + const int N = 8; + for (int j = N - 1; j >= 0; j--) + { + if (A[j][j] < FLT_EPSILON) + return false; + d[j] /= A[j][j]; + for (int i = j - 1; i >= 0; i--) + { + d[i] = d[i] - A[i][j] * d[j]; + } + } + return true; +} + +__device__ bool solve8x8(matrix8x8 &A, matrix8x8 &Q, vector8 &v, vector8 &d, int tid) +{ + // Do Q^T * d + if (tid < 8) + { + d[tid] = 0; + for (int i = 0; i < 8; i++) d[tid] += Q[tid][i] * v[i]; + } + + __syncwarp(); + + if (tid == 0) + { + if (!backsolve_inplace(A, d)) + return false; + } + + __syncwarp(); + + return true; +} + +__device__ bool invert8x8(matrix8x8 &A, matrix8x8 &Q, matrix8x8 &invA, int tid) +{ + if (tid < 8) + { + vector8 d = Q.col(tid); + if (!backsolve_inplace(A, d)) + return false; + invA.set_col(tid, d); + } + __syncwarp(); + return true; +} 
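+// Note on the 8x8 helpers above: compute_qr8x8 triangularizes the damped normal-equation matrix with Givens
+// rotations while accumulating the rotations in Q, backsolve_inplace performs the upper-triangular back
+// substitution, solve8x8 combines the two to solve A*d = v for the Levenberg-Marquardt step, and invert8x8
+// reuses the same factorization to recover A^-1 one column per thread.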
+ +__device__ void subtract8(vector8 &x, vector8 &d, vector8 &xd, int tid) +{ + if (tid < 8) + xd[tid] = x[tid] - d[tid]; + __syncwarp(); +} + +__device__ inline void dot8(vector8 &x, vector8 &y, float *r) +{ + *r = x[0] * y[0] + x[1] * y[1] + x[2] * y[2] + x[3] * y[3] + x[4] * y[4] + x[5] * y[5] + x[6] * y[6] + x[7] * y[7]; +} + +__device__ void max_diag_val8(matrix8x8 &A, float *maxval) +{ + *maxval = A[0][0]; + *maxval = fmaxf(A[1][1], *maxval); + *maxval = fmaxf(A[2][2], *maxval); + *maxval = fmaxf(A[3][3], *maxval); + *maxval = fmaxf(A[4][4], *maxval); + *maxval = fmaxf(A[5][5], *maxval); + *maxval = fmaxf(A[6][6], *maxval); + *maxval = fmaxf(A[7][7], *maxval); +} + +__device__ void max8(vector8 &v, float *maxval) +{ + *maxval = fabsf(v[0]); + *maxval = fmaxf(fabsf(v[1]), *maxval); + *maxval = fmaxf(fabsf(v[2]), *maxval); + *maxval = fmaxf(fabsf(v[3]), *maxval); + *maxval = fmaxf(fabsf(v[4]), *maxval); + *maxval = fmaxf(fabsf(v[5]), *maxval); + *maxval = fmaxf(fabsf(v[6]), *maxval); + *maxval = fmaxf(fabsf(v[7]), *maxval); +} + +__device__ static float atomicMax(float *address, float val) +{ + int *address_as_i = (int *)address; + int old = *address_as_i, assumed; + do + { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, __float_as_int(::fmaxf(val, __int_as_float(assumed)))); + } + while (assumed != old); + return __int_as_float(old); +} + +template +__device__ void max(float *data, vector32 &warpSums, float *result, int numPoints) +{ + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + float val = 0.0f; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + val = fmaxf(val, Func(data[idx])); + idx += gridDim.x * blockDim.x; + } + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val = fmaxf(val, __shfl_down_sync(mask, val, offset)); + if (lane == 0) + warpSums[warpID] = val; + __syncthreads(); + + if (warpID == 0) + { + val = (tid < blockDim.x / warpSize) ? 
warpSums[lane] : 0; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val = fmaxf(val, __shfl_down_sync(mask, val, offset)); + + if (tid == 0) + atomicMax(result, val); + } +} + +__device__ void calculate_temp_d(matrix8x8 &A, vector8 &x, vector8 &y, vector8 &z, float alpha, float beta, int tid) +{ + if (tid < 8) + { + z[tid] = beta * y[tid]; +#pragma unroll + for (int i = 0; i < 8; i++) z[tid] += alpha * A[tid][i] * x[i]; + } + __syncwarp(); +} + +__device__ int compute_model_estimate(float2 cM, float2 cm, float2 sM, float2 sm, float *W, float *V, vector8 &x, + cuda::Tensor3DWrap model, int batch, int numPoints) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if (sm.x < FLT_EPSILON || sm.y < FLT_EPSILON || sM.x < FLT_EPSILON || sM.y < FLT_EPSILON) + { + if (tid < 8) + x[tid] = 0; + __syncwarp(); + return 1; + } + + // compute model estimate + float2 _sm{numPoints / sm.x, numPoints / sm.y}; + float2 _sM{numPoints / sM.x, numPoints / sM.y}; + + int minIdx = 0; + float minEig = fabs(W[0]); + + for (int i = 1; i < 9; i++) + { + if (fabs(W[i]) < minEig) + { + minIdx = i; + minEig = fabs(W[i]); + } + } + + float *H0 = V + 9 * minIdx; + +#ifdef DEBUG_MODEL_ESTIMATE + if (tid == 0) + { + for (int i = 0; i < 9; i++) printf("H0[%d] = %.9g\n", i, H0[i]); + } +#endif + + cuda::math::Matrix tH0; + cuda::math::Matrix tHtemp1; + cuda::math::Matrix tHtemp2; + + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + tH0[i][j] = H0[i * 3 + j]; + } + } + + // load inv_Hnorm + tHtemp2[0][0] = 1.0f / _sm.x; + tHtemp2[0][1] = 0.0f; + tHtemp2[0][2] = cm.x; + tHtemp2[1][0] = 0.0f; + tHtemp2[1][1] = 1.0f / _sm.y; + tHtemp2[1][2] = cm.y; + tHtemp2[2][0] = 0.0f; + tHtemp2[2][1] = 0.0f; + tHtemp2[2][2] = 1.0f; + tHtemp1 = tHtemp2 * tH0; + +#ifdef DEBUG_MODEL_ESTIMATE + if (tid == 0) + { + printf("\n========================_Htemp=========================\n"); + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + printf("_Htemp[%d][%d] = %.9g,", i, j, tHtemp1[i][j]); + } + printf("\n"); + } + } +#endif + + // load Hnorm2 + tHtemp2[0][0] = _sM.x; + tHtemp2[0][1] = 0.0f; + tHtemp2[0][2] = -cM.x * _sM.x; + tHtemp2[1][0] = 0.0f; + tHtemp2[1][1] = _sM.y; + tHtemp2[1][2] = -cM.y * _sM.y; + tHtemp2[2][0] = 0.0f; + tHtemp2[2][1] = 0.0f; + tHtemp2[2][2] = 1.0f; + tH0 = tHtemp1 * tHtemp2; + +#ifdef DEBUG_MODEL_ESTIMATE + if (tid == 0) + { + printf("\n===============_H0====================\n"); + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + printf("_H0[%d][%d] = %.9g,", i, j, tH0[i][j]); + } + printf("\n"); + } + } +#endif + +#pragma unroll + for (int i = 0; i < 3; i++) +#pragma unroll + for (int j = 0; j < 3; j++) tH0[i][j] = tH0[i][j] / tH0[2][2]; + +#ifdef DEBUG_MODEL_ESTIMATE + if (tid == 0) + { + printf("\n===============_H0====================\n"); + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + printf("_H0[%d][%d] = %.9g,", i, j, tH0[i][j]); + } + printf("\n"); + } + } +#endif + + if (tid == 0) + { + x[0] = tH0[0][0]; + x[1] = tH0[0][1]; + x[2] = tH0[0][2]; + x[3] = tH0[1][0]; + x[4] = tH0[1][1]; + x[5] = tH0[1][2]; + x[6] = tH0[2][0]; + x[7] = tH0[2][1]; + } + __syncwarp(); + __syncthreads(); + return 0; +} + +template +__global__ void computeModel(SrcDstWrapper src, SrcDstWrapper dst, float2 *srcMean, float2 *dstMean, + float2 *srcShiftSum, float2 *dstShiftSum, float *V_batch, float *W_batch, float *r_batch, + float *J_batch, float *calc_buffer_batch, ModelWrapper model, int maxNumPoints, + int batchSize) +{ + int tid = 
threadIdx.x + blockIdx.x * blockDim.x; + int batch = blockIdx.y; + + if (batch < batchSize) + { + int numPoints = getNumPoints(src, maxNumPoints, batch); + float2 *srcPtr = src.ptr(batch); + float2 *dstPtr = dst.ptr(batch); + float2 cM = srcMean[batch]; + float2 sM = srcShiftSum[batch]; + float2 cm = dstMean[batch]; + float2 sm = dstShiftSum[batch]; + float *W = W_batch + 9 * batch; + float *V = V_batch + 81 * batch; + float *r = r_batch + 2 * numPoints * batch; + float *J = J_batch + 2 * numPoints * 8 * batch; + float *calc_buffer = calc_buffer_batch + numPoints * batch; + float *modelPtr = model.ptr(batch); + bool status = true; + + __shared__ matrix8x32 shared_mem; + __shared__ vector8 v; + __shared__ vector8 d; + __shared__ vector8 D; + __shared__ vector8 xd; + __shared__ vector8 x; + __shared__ vector8 temp_d; + __shared__ matrix8x8 A; + __shared__ matrix8x8 Ap; + __shared__ matrix8x8 App; + __shared__ matrix8x8 Q; + + int ret = compute_model_estimate(cM, cm, sM, sm, W, V, x, model, batch, numPoints); + if (!(ret || numPoints == 4)) + { +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0 && blockIdx.y == 0) + { + printf("Model estimated Matrix\n"); + printVectorDevice(x, blockIdx.y); + printf("\n"); + } +#endif + + // Begin refinement + calculate_residual_and_jacobian_device(srcPtr, dstPtr, x, numPoints, J, r); + + calculate_residual_norm(r, calc_buffer, shared_mem[0], numPoints * 2); + float S = calc_buffer[0]; + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n\n============Residual================\n"); + printVectorDeviceRaw(r, 2 * numPoints, blockIdx.y); + printf("\n\n============Jacobian================\n"); + printMatrixDeviceRaw(J, 8, 2 * numPoints, blockIdx.y); + printf("\n\n============Residual L2 norm==================\n"); + printf("S = %f\n", S); + } +#endif + + int nfJ = 2; + + if (tid < 8) + calc_buffer[tid] = 0; + calculate_JtJ(J, A, shared_mem, calc_buffer, numPoints * 2); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n================ J^T * J = A ================\n"); + printMatrixDevice(A); + printf("\n\n"); + } +#endif + + if (tid < 8) + calc_buffer[tid] = 0; + calculate_Jtr(J, r, v, shared_mem, calc_buffer, numPoints * 2); + + // only blockIdx.x == 0 needs to do this right now. 
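+            // D caches diag(J^T J); the loop below is a Levenberg-Marquardt iteration that solves the damped
+            // system (J^T J + lambda * diag(D)) d = J^T r with the 8x8 Givens-QR helpers and accepts or rejects
+            // each step based on the gain ratio R.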
+ fetch_diagonal(A, D, tid); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n=============== J^T * r = v ===================\n"); + printVectorDevice(v, blockIdx.y); + printf("\n================ D ========================\n"); + printVectorDevice(D, blockIdx.y); + printf("\n"); + } +#endif + + const float Rlo = 0.25, Rhi = 0.75; + float lambda = 1, lc = 0.75; + int iter = 0, maxIters = 10; + float epsx = 1.19209290e-7f; + float epsf = 1.19209290e-7f; + bool status; + + while (true) + { +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n========================================\n"); + printf("================== ITER = %d =============\n", iter); + printf("==========================================\n"); + printf("\n=============== A before copying ===================\n"); + printMatrixDevice(A); + } + +#endif + copy_A_to_Ap_App(A, Ap, App); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n=============== Ap before scaling of diagonal ===================\n"); + printMatrixDevice(Ap); + } +#endif + // blockIdx.x == 0 + scale_diagonal8(D, Ap, lambda); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n================ D ========================\n"); + printVectorDevice(D, blockIdx.y); + printf("\n=============== Ap after scaling of diagonal ===================\n"); + printMatrixDevice(Ap); + } +#endif + + compute_qr8x8(Ap, Q); + status = solve8x8(Ap, Q, v, d, tid); + if (!status) + break; + + subtract8(x, d, xd, tid); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n=============== d ====================\n"); + printVectorDevice(d, blockIdx.y); + printf("\n=============== xd ===================\n"); + printVectorDevice(xd, blockIdx.y); + } +#endif + + // calculate residual but not Jacobian + __syncthreads(); + calculate_residual_and_jacobian_device(srcPtr, dstPtr, xd, numPoints, nullptr, r); + + nfJ++; + + float Sd; + if (tid < 8) + calc_buffer[tid] = 0; + calculate_residual_norm(r, calc_buffer, shared_mem[0], numPoints * 2); + Sd = calc_buffer[0]; + + calculate_temp_d(A, d, v, temp_d, -1.0f, 2.0f, tid); + + float dS; + __syncthreads(); + dot8(d, temp_d, &dS); + + float R = (S - Sd) / (fabsf(dS) > FLT_EPSILON ? dS : 1); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n=============== r ====================\n"); + printVectorDeviceRaw(r, 2 * numPoints, blockIdx.y); + printf("\n============== || r || ==================\n"); + printf("||r||^2 = %f\n", Sd); + printf("\ndS = %f\n", dS); + printf("\nR = %f\n", R); + } +#endif + + if (R > Rhi) + { + lambda *= 0.5; + if (lambda < lc) + lambda = 0; + } + else if (R < Rlo) + { + float t; + dot8(d, v, &t); + + float nu = (Sd - S) / (fabsf(t) > FLT_EPSILON ? t : 1.0f) + 2.0f; + nu = fminf(fmaxf(nu, 2.0f), 10.0f); + + if (lambda == 0) + { + compute_qr8x8(App, Q); + status = invert8x8(App, Q, Ap, tid); + if (!status) + break; + + float maxval; + max_diag_val8(Ap, &maxval); + + lambda = lc = 1. 
/ maxval; + nu *= 0.5; + } + lambda *= nu; + } + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\nlambda = %f\n", lambda); + } +#endif + + if (Sd < S) + { + nfJ++; + S = Sd; + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n================== Before swapping =======================\n"); + printf("\n =================== X =======================\n"); + printVectorDevice(x, blockIdx.y); + printf("\n =================== Xd =======================\n"); + printVectorDevice(xd, blockIdx.y); + } +#endif + + if (tid < 8) + cuda::math::detail::swap(x[tid], xd[tid]); + __syncwarp(); + __syncthreads(); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n================== After swapping =======================\n"); + printf("\n =================== X =======================\n"); + printVectorDevice(x, blockIdx.y); + printf("\n =================== Xd =======================\n"); + printVectorDevice(xd, blockIdx.y); + } +#endif + + calculate_residual_and_jacobian_device(srcPtr, dstPtr, x, numPoints, J, r); + calculate_JtJ(J, A, shared_mem, calc_buffer, numPoints * 2); + calculate_Jtr(J, r, v, shared_mem, calc_buffer, numPoints * 2); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n =================== J =======================\n"); + printMatrixDeviceRaw(J, 8, 2 * numPoints, blockIdx.y); + printf("\n\n =================== r =======================\n"); + printVectorDeviceRaw(r, 2 * numPoints, blockIdx.y); + printf("\n\n==================== A ========================\n"); + printMatrixDevice(A); + printf("\n\n===================== v ========================\n"); + printVectorDevice(v, blockIdx.y); + printf("\n"); + } +#endif + } + + iter++; + + if (tid == 0) + calc_buffer[tid] = 0; + max(r, shared_mem[0], calc_buffer, numPoints * 2); + __syncthreads(); + float maxResidualValue = calc_buffer[0]; + float maxDvecValue; + max8(d, &maxDvecValue); + + bool proceed = maxDvecValue >= epsx && maxResidualValue >= epsf && iter < maxIters; + if (!proceed) + break; + } + } + + // Copy back the estimate to output buffer + if (tid == 0) + { + if (status) + { + *(model.ptr(batch, 0, 0)) = x[0]; + *(model.ptr(batch, 0, 1)) = x[1]; + *(model.ptr(batch, 0, 2)) = x[2]; + *(model.ptr(batch, 1, 0)) = x[3]; + *(model.ptr(batch, 1, 1)) = x[4]; + *(model.ptr(batch, 1, 2)) = x[5]; + *(model.ptr(batch, 2, 0)) = x[6]; + *(model.ptr(batch, 2, 1)) = x[7]; + *(model.ptr(batch, 2, 2)) = 1; + } + else + { + *(model.ptr(batch, 0, 0)) = 0; + *(model.ptr(batch, 0, 1)) = 0; + *(model.ptr(batch, 0, 2)) = 0; + *(model.ptr(batch, 1, 0)) = 0; + *(model.ptr(batch, 1, 1)) = 0; + *(model.ptr(batch, 1, 2)) = 0; + *(model.ptr(batch, 2, 0)) = 0; + *(model.ptr(batch, 2, 1)) = 0; + *(model.ptr(batch, 2, 2)) = 0; + } + } + } +} + +template +__global__ void compute_src_dst_mean(SrcDstWrapper src, SrcDstWrapper dst, float2 *srcMean, float2 *dstMean, + Func src_op, Func dst_op, int maxNumPoints, int batchSize) +{ + int batch = blockIdx.y; + __shared__ cuda::math::Vector warpSums; + if (batch < batchSize) + { + int numPoints = getNumPoints(src, maxNumPoints, batch); + float2 *srcMeanBatch = srcMean + batch; + float2 *dstMeanBatch = dstMean + batch; + float2 *srcPtr = src.ptr(batch); + float2 *dstPtr = dst.ptr(batch); + reducef2(srcPtr, warpSums, srcMeanBatch, src_op, numPoints, batch); + __syncthreads(); + reducef2(dstPtr, warpSums, dstMeanBatch, dst_op, numPoints, batch); + } +} + +template +__global__ void compute_LtL(SrcDstWrapper src, SrcDstWrapper dst, float *LtL, Func ltl_op, int maxNumPoints, + int batchSize) 
+{ + int batch = blockIdx.z; + int j = blockIdx.y / 9; // LtL row index + int k = blockIdx.y % 9; // LtL col index + __shared__ cuda::math::Vector warpSums; + if (batch < batchSize) + { + int numPoints = getNumPoints(src, maxNumPoints, batch); + float *LtLBatch = LtL + 81 * batch; + float2 *srcPtr = src.ptr(batch); + float2 *dstPtr = dst.ptr(batch); + reduceLtL(srcPtr, dstPtr, warpSums, &LtLBatch[j * 9 + k], ltl_op, numPoints, batch, j, k); + } +} + +/* numPoints should be maxNumPoints in the case of varshape. */ +template +void FindHomographyWrapper(SrcDstWrapper srcWrap, SrcDstWrapper dstWrap, ModelType &models, + const BufferOffsets *bufferOffset, const cuSolver *cusolverData, int numPoints, + cudaStream_t stream) +{ + dim3 block(256, 1, 1); + cuda::Tensor3DWrap modelWrap = cuda::CreateTensorWrapNHW(models); + int batchSize = models.shape(0); + + float2 *srcMean = bufferOffset->srcMean; + float2 *dstMean = bufferOffset->dstMean; + float2 *srcShiftSum = bufferOffset->srcShiftSum; + float2 *dstShiftSum = bufferOffset->dstShiftSum; + float *J = bufferOffset->J; + float *r = bufferOffset->r; + float *LtL = bufferOffset->LtL; + float *W = bufferOffset->W; + float *calc_buffer = bufferOffset->calc_buffer; + float *cusolverBuffer = cusolverData->cusolverBuffer; + int *cusolverInfo = cusolverData->cusolverInfo; + int lwork = cusolverData->lwork; + cusolverDnHandle_t cusolverH = cusolverData->cusolverH; + syevjInfo_t syevj_params = cusolverData->syevj_params; + + cudaMemsetAsync(reinterpret_cast(srcMean), 0, batchSize * sizeof(float2), stream); + cudaMemsetAsync(reinterpret_cast(dstMean), 0, batchSize * sizeof(float2), stream); + cudaMemsetAsync(reinterpret_cast(srcShiftSum), 0, batchSize * sizeof(float2), stream); + cudaMemsetAsync(reinterpret_cast(dstShiftSum), 0, batchSize * sizeof(float2), stream); + cudaMemsetAsync(reinterpret_cast(J), 0, 2 * numPoints * 8 * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(r), 0, 2 * numPoints * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(LtL), 0, 81 * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(W), 0, 9 * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(calc_buffer), 0, numPoints * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(cusolverBuffer), 0, lwork * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(cusolverInfo), 0, batchSize * sizeof(int), stream); + + dim3 grid((numPoints + block.x - 1) / block.x, batchSize, 1); + + MeanOp meanop; + compute_src_dst_mean<<>>(srcWrap, dstWrap, srcMean, dstMean, meanop, meanop, numPoints, + batchSize); +#ifdef DEBUG + int check_batch = 0; + printKernelfloat2<<<1, 1, 0, stream>>>(srcMean + check_batch, 1, 0); + printKernelfloat2<<<1, 1, 0, stream>>>(dstMean + check_batch, 1, 0); +#endif + + AbsShiftOp src_abs_shift_op(srcMean); + AbsShiftOp dst_abs_shift_op(dstMean); + compute_src_dst_mean<<>>(srcWrap, dstWrap, srcShiftSum, dstShiftSum, src_abs_shift_op, + dst_abs_shift_op, numPoints, batchSize); +#ifdef DEBUG + printKernelfloat2<<<1, 1, 0, stream>>>(srcShiftSum + check_batch, 1, 0); + printKernelfloat2<<<1, 1, 0, stream>>>(dstShiftSum + check_batch, 1, 0); +#endif + + grid.y = 81; + grid.z = batchSize; + LtLOp ltl_op(srcMean, dstMean, srcShiftSum, dstShiftSum); + compute_LtL<<>>(srcWrap, dstWrap, LtL, ltl_op, numPoints, batchSize); +#ifdef DEBUG + for (int b = 0; b < batchSize; b++) + { + std::cout << "==================== Batch " << b << " =======================" << std::endl; + 
printMatrix<<<1, 1, 0, stream>>>(LtL + 81 * b, 9, 9); + cudaStreamSynchronize(stream); + } +#endif + + // compute Eigen values + CUSOLVER_CHECK_ERROR(cusolverDnSetStream(cusolverH, stream), "Failed to set cuda stream in cusolver"); + CUSOLVER_CHECK_ERROR(cusolverDnSsyevjBatched(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER, 9, LtL, 9, + W, cusolverBuffer, lwork, cusolverInfo, syevj_params, batchSize), + "Failed to calculate eigen values using syevj"); +#ifdef DEBUG + CUDA_CHECK_ERROR(cudaStreamSynchronize(stream), "synchronization failed after eigen value solver"); + std::vector info(batchSize); + cudaMemcpyAsync((void *)info.data(), (void *)cusolverInfo, batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream); + CUDA_CHECK_ERROR(cudaStreamSynchronize(stream), "synchronization failed after copying back cusolverInfo"); + + for (int b = 0; b < batchSize; b++) + { + if (info[b] == 0) + { + std::cout << "cusolver converged for matrix " << b << std::endl; + printKernel<<<1, 9, 0, stream>>>(W + 9 * b, 9, 0); + printf("\n"); + } + else if (info[b] < 0) + { + std::cout << info[b] << "th parameter is wrong for image " << b << std::endl; + } + else + { + std::cout << "cusolver did not converge for image " << b << std::endl; + } + CUDA_CHECK_ERROR(cudaStreamSynchronize(stream), "failed to synchronize"); + } +#endif + + block.x = 256; + grid.x = 1; + grid.y = batchSize; + grid.z = 1; + computeModel<<>>(srcWrap, dstWrap, srcMean, dstMean, srcShiftSum, dstShiftSum, LtL, W, r, J, + calc_buffer, modelWrap, numPoints, batchSize); +} + +template +void RunFindHomography(const SrcDstType &src, const SrcDstType &dst, const nvcv::TensorDataStridedCuda &models, + const BufferOffsets *bufferOffset, const cuSolver *cusolverData, cudaStream_t stream) +{ + using SrcDstWrapper = cuda::Tensor2DWrap; + SrcDstWrapper srcWrap(src); + SrcDstWrapper dstWrap(dst); + int numPoints = src.shape(1); + FindHomographyWrapper(srcWrap, dstWrap, models, bufferOffset, cusolverData, numPoints, stream); +} + +} // namespace + +namespace cvcuda::priv { + +// Constructor ----------------------------------------------------------------- + +FindHomography::FindHomography(int batchSize, int maxNumPoints) +{ + cudaMalloc(reinterpret_cast(&(bufferOffset.srcMean)), sizeof(float2) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.dstMean)), sizeof(float2) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.srcShiftSum)), sizeof(float2) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.dstShiftSum)), sizeof(float2) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.LtL)), 81 * sizeof(float) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.W)), 9 * sizeof(float) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.r)), 2 * maxNumPoints * sizeof(float) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.J)), 2 * maxNumPoints * 8 * sizeof(float) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.calc_buffer)), maxNumPoints * sizeof(float) * batchSize); + CUSOLVER_CHECK_ERROR(cusolverDnCreate(&(cusolverData.cusolverH)), "Failed to create cusolver handle"); + CUSOLVER_CHECK_ERROR(cusolverDnCreateSyevjInfo(&(cusolverData.syevj_params)), "Failed to create syevj params"); + CUSOLVER_CHECK_ERROR(cusolverDnXsyevjSetTolerance(cusolverData.syevj_params, 1e-7), + "Failed to set tolerance for syevj"); + CUSOLVER_CHECK_ERROR(cusolverDnXsyevjSetMaxSweeps(cusolverData.syevj_params, 15), + "Failed to set max sweeps for syevj"); + 
CUSOLVER_CHECK_ERROR(cusolverDnXsyevjSetSortEig(cusolverData.syevj_params, 1), + "Failed to set sorting of eigen values in syevj"); + CUSOLVER_CHECK_ERROR( + cusolverDnSsyevjBatched_bufferSize(cusolverData.cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER, 9, + NULL, 9, NULL, &(cusolverData.lwork), cusolverData.syevj_params, batchSize), + "Failed to calculate buffer size for syevj"); + cudaMalloc(reinterpret_cast(&(cusolverData.cusolverBuffer)), cusolverData.lwork * sizeof(float)); + cudaMalloc(reinterpret_cast(&(cusolverData.cusolverInfo)), batchSize * sizeof(int)); +} + +FindHomography::~FindHomography() +{ + cudaFree(bufferOffset.srcMean); + cudaFree(bufferOffset.dstMean); + cudaFree(bufferOffset.srcShiftSum); + cudaFree(bufferOffset.dstShiftSum); + cudaFree(bufferOffset.LtL); + cudaFree(bufferOffset.W); + cudaFree(bufferOffset.r); + cudaFree(bufferOffset.J); + cudaFree(bufferOffset.calc_buffer); + cusolverDnDestroySyevjInfo(cusolverData.syevj_params); + cusolverDnDestroy(cusolverData.cusolverH); + cudaFree(cusolverData.cusolverBuffer); + cudaFree(cusolverData.cusolverInfo); +} + +// Operator -------------------------------------------------------------------- + +// Tensor input variant +void FindHomography::operator()(cudaStream_t stream, const nvcv::Tensor &srcPoints, const nvcv::Tensor &dstPoints, + const nvcv::Tensor &models) const +{ + auto srcData = srcPoints.exportData(); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must be cuda-accessible, pitch-linear tensor"); + } + + auto dstData = dstPoints.exportData(); + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be cuda-accessible, pitch-linear tensor"); + } + + auto modelData = models.exportData(); + if (!modelData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must be cuda-accessible, pitch-linear tensor"); + } + + // validation of input data + if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "source and destination points must have rank 2"); + } + + if (!(srcData->shape(0) == dstData->shape(0) && srcData->shape(0) == modelData->shape(0))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model must have same batch size"); + } + + if (srcData->shape(1) != dstData->shape(1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be same length to return a valid model"); + } + + if (srcData->shape(1) < 4 || dstData->shape(1) < 4) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be >=4 to return a valid model"); + } + + if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3"); + } + + if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32 + && modelData->dtype() == nvcv::TYPE_F32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model tensors must have data type F32"); + } + + RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream); +} + +void FindHomography::operator()(cudaStream_t stream, const nvcv::TensorBatch &srcPoints, + const nvcv::TensorBatch &dstPoints, const nvcv::TensorBatch &models) const +{ + if 
(!(srcPoints.numTensors() == dstPoints.numTensors() && srcPoints.numTensors() == models.numTensors())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model tensors must have same batch size"); + } + + for (int b = 0; b < srcPoints.numTensors(); b++) + { + auto srcData = srcPoints[b].exportData(); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input src points must be cuda-accessible, pitch-linear tensor"); + } + + auto dstData = dstPoints[b].exportData(); + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input dst points must be cuda-accessible, pitch-linear tensor"); + } + + auto modelData = models[b].exportData(); + if (!modelData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "model must be cuda-accessible, pitch-linear tensor"); + } + + // validation of input data + if (!((srcData->shape(0) == dstData->shape(0)) && (srcData->shape(0) == modelData->shape(0)) + && (srcData->shape(0) == 1))) + { + throw nvcv::Exception( + nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invdividual samples (src, dst and model) in the batch must be tensors with batch size 1"); + } + + if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination tensors must have rank 2"); + } + + if (srcData->shape(1) != dstData->shape(1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be same length to return a valid model"); + } + + if (srcData->shape(1) < 4 || dstData->shape(1) < 4) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be >=4 to return a valid model"); + } + + if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3"); + } + + if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32 + && modelData->dtype() == nvcv::TYPE_F32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model tensors must have data type F32"); + } + + RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream); + } +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpFindHomography.hpp b/src/cvcuda/priv/OpFindHomography.hpp new file mode 100644 index 00000000..c18d3ef4 --- /dev/null +++ b/src/cvcuda/priv/OpFindHomography.hpp @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file FindHomography.hpp + * + * @brief Defines the private C++ Class for the FindHomography operation. 
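+ *
+ * The class pre-allocates its per-sample scratch buffers (means, shift sums, LtL, eigenvalues, Jacobian and
+ * residual storage) and the cuSOLVER syevj handles in the constructor, so the batch size and number of points
+ * given at construction are expected to bound the tensors later passed to operator().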
+ */ + +#ifndef CVCUDA_PRIV__FIND_HOMOGRAPHY_HPP +#define CVCUDA_PRIV__FIND_HOMOGRAPHY_HPP +#include "IOperator.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + float2 *srcMean; + float2 *dstMean; + float2 *srcShiftSum; + float2 *dstShiftSum; + float *LtL; + float *W; + float *r; + float *J; + float *calc_buffer; +} BufferOffsets; + +typedef struct +{ + int *cusolverInfo; + float *cusolverBuffer; + cusolverDnHandle_t cusolverH; + syevjInfo_t syevj_params; + int lwork; +} cuSolver; + +namespace cvcuda::priv { + +class FindHomography final : public IOperator +{ +public: + explicit FindHomography(int batchSize, int numPoints); + ~FindHomography(); + void operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const nvcv::Tensor &models) const; + void operator()(cudaStream_t stream, const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst, + const nvcv::TensorBatch &models) const; + +private: + BufferOffsets bufferOffset; + cuSolver cusolverData; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV__FIND_HOMOGRAPHY_HPP diff --git a/src/cvcuda/priv/OpLabel.cu b/src/cvcuda/priv/OpLabel.cu new file mode 100644 index 00000000..8a1c5118 --- /dev/null +++ b/src/cvcuda/priv/OpLabel.cu @@ -0,0 +1,1751 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// MIT License + +// Copyright (c) 2018 - Daniel Peter Playne + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +/** + * @note The CUDA kernels implemented below are based on the paper: + * D. P. Playne and K. Hawick, + * "A New Algorithm for Parallel Connected-Component Labelling on GPUs," + * in IEEE Transactions on Parallel and Distributed Systems, + * vol. 29, no. 6, pp. 1217-1230, 1 June 2018. 
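+ *
+ * Following that paper, BlockLabel2D first labels each tile in shared memory (FindRoot/Reduction on a
+ * per-block label array), and the X/Y label-reduction kernels then merge equivalent labels across tile
+ * boundaries using atomicMin-based reductions on the global label image.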
+ */ + +#include "Assert.h" +#include "OpLabel.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +namespace { + +// CUDA kernels ---------------------------------------------------------------- + +template +__device__ DT FindRoot(DT *labels, DT label) +{ + DT next = labels[label]; + + while (label != next) + { + label = next; + next = labels[label]; + } + + return label; +} + +template +__device__ DT Reduction(DT *labels, DT label1, DT label2) +{ + DT next1 = (label1 != label2) ? labels[label1] : 0; + DT next2 = (label1 != label2) ? labels[label2] : 0; + + while ((label1 != label2) && (label1 != next1)) + { + label1 = next1; + next1 = labels[label1]; + } + + while ((label1 != label2) && (label2 != next2)) + { + label2 = next2; + next2 = labels[label2]; + } + + DT label3; + + while (label1 != label2) + { + if (label1 < label2) + { + label3 = label1; + label1 = label2; + label2 = label3; + } + + label3 = atomicMin(&labels[label1], label2); + label1 = (label1 == label3) ? label2 : label3; + } + + return label1; +} + +// -- 2D kernels -- + +template +__global__ void BlockLabel2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, cuda::Tensor1DWrap minThresh, + cuda::Tensor1DWrap maxThresh, int2 size) +{ + __shared__ DT labels[BW * BH]; + + int2 tc = cuda::StaticCast(cuda::DropCast<2>(threadIdx)); + int3 gc{(int)(blockIdx.x * BW) + tc.x, (int)(blockIdx.y * BH) + tc.y, (int)blockIdx.z}; + + bool nym1x, nyxm1, nym1xm1; + DT label1; + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + ST minThreshold = hasMinThresh ? minThresh[gc.z] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.z] : 0; + + if (gc.x < size.x && gc.y < size.y) + { + ST pyx = src[gc]; + ST pym1x = (tc.y > 0) ? *src.ptr(gc.z, gc.y - 1, gc.x) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pyx = pyx < minThreshold || pyx > maxThreshold ? 0 : 1; + pym1x = (tc.y > 0) ? (pym1x < minThreshold || pym1x > maxThreshold ? 0 : 1) : 0; + } + else if (hasMinThresh) + { + pyx = pyx < minThreshold ? 0 : 1; + pym1x = (tc.y > 0) ? (pym1x < minThreshold ? 0 : 1) : 0; + } + else if (hasMaxThresh) + { + pyx = pyx > maxThreshold ? 0 : 1; + pym1x = (tc.y > 0) ? (pym1x > maxThreshold ? 0 : 1) : 0; + } + + ST pyxm1 = __shfl_up_sync(__activemask(), pyx, 1); + ST pym1xm1 = __shfl_up_sync(__activemask(), pym1x, 1); + + nym1x = (tc.y > 0) ? (pyx == pym1x) : false; + nyxm1 = (tc.x > 0) ? (pyx == pyxm1) : false; + nym1xm1 = (tc.y > 0 && tc.x > 0) ? (pyx == pym1xm1) : false; + + label1 = (nyxm1) ? tc.y * BW + (tc.x - 1) : tc.y * BW + tc.x; + label1 = (nym1x) ? (tc.y - 1) * BW + tc.x : label1; + + labels[tc.y * BW + tc.x] = label1; + } + + __syncthreads(); + + if (gc.x < size.x && gc.y < size.y) + { + labels[tc.y * BW + tc.x] = FindRoot(labels, label1); + } + + __syncthreads(); + + if (gc.x < size.x && gc.y < size.y) + { + if (nym1x && nyxm1 && !nym1xm1) + { + DT label2 = labels[tc.y * BW + tc.x - 1]; + + label1 = Reduction(labels, label1, label2); + } + } + + __syncthreads(); + + if (gc.x < size.x && gc.y < size.y) + { + label1 = FindRoot(labels, label1); + + DT lx = label1 % BW; + DT ly = (label1 / BW) % BH; + + DT dstStrideH = dst.strides()[1] / sizeof(DT); + + dst[gc] = (blockIdx.y * BH + ly) * dstStrideH + blockIdx.x * BW + lx; + } +} + +template +__global__ void YLabelReduction2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int2 size) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = (blockIdx.y * blockDim.y + threadIdx.y) * blockDim.y + blockDim.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + ST minThreshold = hasMinThresh ? minThresh[gc.z] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.z] : 0; + + ST pyx = src[gc]; + ST pym1x = *src.ptr(gc.z, gc.y - 1, gc.x); + + if (hasMinThresh && hasMaxThresh) + { + pyx = pyx < minThreshold || pyx > maxThreshold ? 0 : 1; + pym1x = pym1x < minThreshold || pym1x > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pyx = pyx < minThreshold ? 0 : 1; + pym1x = pym1x < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pyx = pyx > maxThreshold ? 0 : 1; + pym1x = pym1x > maxThreshold ? 0 : 1; + } + + ST pyxm1 = __shfl_up_sync(0xffffffff, pyx, 1); + ST pym1xm1 = __shfl_up_sync(0xffffffff, pym1x, 1); + + if ((pyx == pym1x) && ((threadIdx.x == 0) || (pyx != pyxm1) || (pyx != pym1xm1))) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.z, gc.y - 1, gc.x); + + Reduction(dst.ptr(gc.z), label1, label2); + } +} + +template +__global__ void XLabelReduction2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int2 size) +{ + int3 gc; + gc.x = (blockIdx.y * blockDim.y + threadIdx.y) * blockDim.x + blockDim.x; + gc.y = blockIdx.x * blockDim.x + threadIdx.x; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + ST minThreshold = hasMinThresh ? minThresh[gc.z] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.z] : 0; + + ST pyx = src[gc]; + ST pyxm1 = *src.ptr(gc.z, gc.y, gc.x - 1); + + if (hasMinThresh && hasMaxThresh) + { + pyx = pyx < minThreshold || pyx > maxThreshold ? 0 : 1; + pyxm1 = pyxm1 < minThreshold || pyxm1 > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pyx = pyx < minThreshold ? 0 : 1; + pyxm1 = pyxm1 < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pyx = pyx > maxThreshold ? 0 : 1; + pyxm1 = pyxm1 > maxThreshold ? 0 : 1; + } + + bool thread_y = (gc.y % blockDim.y) == 0; + + ST pym1x = __shfl_up_sync(0xffffffff, pyx, 1); + ST pym1xm1 = __shfl_up_sync(0xffffffff, pyxm1, 1); + + if ((pyx == pyxm1) && (thread_y || (pyx != pym1x) || (pyx != pym1xm1))) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.z, gc.y, gc.x - 1); + + Reduction(dst.ptr(gc.z), label1, label2); + } +} + +template +__global__ void ResolveLabels2D(cuda::Tensor3DWrap
dst, int2 size) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + dst[gc] = FindRoot(dst.ptr(gc.z), dst[gc]); +} + +template +__global__ void ReplaceBgLabels2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, + cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap minThresh, + cuda::Tensor1DWrap maxThresh, int2 size) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + ST minThreshold = hasMinThresh ? minThresh[gc.z] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.z] : 0; + + ST pyx = src[gc]; + + if (hasMinThresh && hasMaxThresh) + { + pyx = pyx < minThreshold || pyx > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pyx = pyx < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pyx = pyx > maxThreshold ? 0 : 1; + } + + ST backgroundLabel = bgLabel[gc.z]; + + // If src has bg label, put it in dst; if dst has bg label, it means a wrong label was assigned to a region, + // replace its label by a label never assigned, the stride zero meaning one-element-after-the-end stride + + if (pyx == backgroundLabel) + { + dst[gc] = backgroundLabel; + } + else if (dst[gc] == (DT)backgroundLabel) + { + dst[gc] = dst.strides()[0] / sizeof(DT); + } +} + +template +__global__ void CountLabels2D(cuda::Tensor1DWrap
<DT> count, cuda::Tensor3DWrap<DT>
 stats, cuda::Tensor3DWrap<DT>
dst, + cuda::Tensor1DWrap bgLabel, int2 size, int maxCapacity) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + ST backgroundLabel = hasBgLabel ? bgLabel[gc.z] : 0; + + DT label = dst[gc]; + + if (hasBgLabel && label == (DT)backgroundLabel) + { + return; // do not count background labels + } + + DT posLabel = gc.y * dst.strides()[1] / sizeof(DT) + gc.x; + DT endLabel = dst.strides()[0] / sizeof(DT); + + DT regionIdx; + bool counted = false; + + if (hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) + { + // This is a special region marked with one-element-after-the-end label, count it + regionIdx = atomicAdd(count.ptr(gc.z), 1); + counted = true; + } + else if (label == posLabel) + { + // This is the first element of a regular region, count it + regionIdx = atomicAdd(count.ptr(gc.z), 1); + counted = true; + } + + // If statistics should be computed and the region index is inside the allowed storage (the M maximum + // capacity in stats tensor), replace the output label by the region index and store initial statistics + + if (counted && stats.ptr(0) != nullptr && regionIdx < maxCapacity) + { + // TODO: improve the mark of output label as region index with 1 in the 1st bit + dst[gc] = regionIdx | (DT)(1 << 31); + + *stats.ptr(gc.z, (int)regionIdx, 0) = label; + *stats.ptr(gc.z, (int)regionIdx, 1) = (DT)gc.x; + *stats.ptr(gc.z, (int)regionIdx, 2) = (DT)gc.y; + *stats.ptr(gc.z, (int)regionIdx, 3) = 1; + *stats.ptr(gc.z, (int)regionIdx, 4) = 1; + *stats.ptr(gc.z, (int)regionIdx, 5) = 1; + } +} + +template +__global__ void ComputeStats2D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor3DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, + int2 size, bool relabel) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + ST backgroundLabel = hasBgLabel ? bgLabel[gc.z] : 0; + DT endLabel = dst.strides()[0] / sizeof(DT); + DT label = dst[gc]; + + if (hasBgLabel && label == (DT)backgroundLabel) + { + return; // do not compute statistics for background labels + } + if (label & (DT)(1 << 31)) + { + return; // label is marked as region index, its statistics is already computed + } + if (hasBgLabel && label == endLabel) + { + // This is a special region marked with one-element-after-the-end label, its label was the backgroundLabel + label = backgroundLabel; + } + + DT regionIdx = dst.ptr(gc.z)[label]; + + if (regionIdx & (DT)(1 << 31)) + { + regionIdx = regionIdx & (DT) ~(1 << 31); + + if (relabel) + { + if (hasBgLabel && regionIdx >= (DT)backgroundLabel) + { + dst[gc] = regionIdx + 1; // skip one region index equals to background label when relabeling + } + else + { + dst[gc] = regionIdx; + } + } + + int2 cornerPos{(int)*stats.ptr(gc.z, (int)regionIdx, 1), (int)*stats.ptr(gc.z, (int)regionIdx, 2)}; + + int2 bboxArea = cuda::abs(cornerPos - cuda::DropCast<2>(gc)) + 1; + + atomicMax(stats.ptr(gc.z, (int)regionIdx, 3), (DT)bboxArea.x); + atomicMax(stats.ptr(gc.z, (int)regionIdx, 4), (DT)bboxArea.y); + atomicAdd(stats.ptr(gc.z, (int)regionIdx, 5), 1); + } +} + +template +__global__ void RemoveIslands2D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor3DWrap<DT>
 dst, + cuda::Tensor1DWrap<ST> bgLabel, cuda::Tensor1DWrap<DT>
minSize, int2 size, bool relabel) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + DT endLabel = dst.strides()[0] / sizeof(DT); + + DT label = dst[gc]; + + ST backgroundLabel = bgLabel[gc.z]; + + if (label == (DT)backgroundLabel) + { + return; + } + if (label == endLabel) + { + // This is a special region marked with one-element-after-the-end label, its label was the backgroundLabel + label = backgroundLabel; + } + + DT regionIdx = 0; + + if (!(label & (DT)(1 << 31))) + { + if (relabel) + { + if (label >= (DT)backgroundLabel + 1) + { + regionIdx = label - 1; // go back one region index to account for background label + } + else + { + regionIdx = label; + } + } + else + { + regionIdx = dst.ptr(gc.z)[label]; + + if (regionIdx & (DT)(1 << 31)) + { + regionIdx = regionIdx & (DT) ~(1 << 31); + } + else + { + return; // invalid region index + } + } + } + else + { + regionIdx = label & (DT) ~(1 << 31); + } + + DT regionSize = *stats.ptr(gc.z, (int)regionIdx, 5); + + // If region size is less than minimum size, it is an island and should be removed, i.e. set to background label + if (regionSize < minSize[gc.z]) + { + dst[gc] = backgroundLabel; + } +} + +template +__global__ void Relabel2D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor3DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, + int2 size, bool relabel) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + DT label = dst[gc]; + + if (label & (DT)(1 << 31)) + { + // Label is marked as region index, relabel it back to proper label + DT regionIdx = label & (DT) ~(1 << 31); + + if (relabel) + { + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + ST backgroundLabel = hasBgLabel ? bgLabel[gc.z] : 0; + + if (hasBgLabel && regionIdx >= (DT)backgroundLabel) + { + dst[gc] = regionIdx + 1; // skip one region index equals to background label when relabeling + } + else + { + dst[gc] = regionIdx; + } + } + else + { + dst[gc] = *stats.ptr(gc.z, (int)regionIdx, 0); + } + } +} + +// -- 3D kernels -- + +template +__global__ void BlockLabel3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, cuda::Tensor1DWrap minThresh, + cuda::Tensor1DWrap maxThresh, int4 shape) +{ + __shared__ DT labels[BW * BH * BD]; + + int3 tc = cuda::StaticCast(threadIdx); + int4 gc{(int)blockIdx.x * BW + tc.x, (int)blockIdx.y * BH + tc.y, (int)blockIdx.z * BD + tc.z, 0}; + + bool nzm1yx, nzym1x, nzyxm1, nzym1xm1, nzm1yxm1, nzm1ym1x; + DT label; + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + if (gc.x < shape.x && gc.y < shape.y && gc.z < shape.z) + { + ST pzyx = src[gc]; + ST pzym1x = (tc.y > 0) ? *src.ptr(gc.w, gc.z, gc.y - 1, gc.x) : 0; + ST pzm1yx = (tc.z > 0) ? *src.ptr(gc.w, gc.z - 1, gc.y, gc.x) : 0; + ST pzm1ym1x = (tc.z > 0 && tc.y > 0) ? *src.ptr(gc.w, gc.z - 1, gc.y - 1, gc.x) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + pzym1x = (tc.y > 0) ? (pzym1x < minThreshold || pzym1x > maxThreshold ? 0 : 1) : 0; + pzm1yx = (tc.z > 0) ? (pzm1yx < minThreshold || pzm1yx > maxThreshold ? 0 : 1) : 0; + pzm1ym1x = (tc.z > 0 && tc.y > 0) ? (pzm1ym1x < minThreshold || pzm1ym1x > maxThreshold ? 0 : 1) : 0; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + pzym1x = (tc.y > 0) ? (pzym1x < minThreshold ? 0 : 1) : 0; + pzm1yx = (tc.z > 0) ? (pzm1yx < minThreshold ? 0 : 1) : 0; + pzm1ym1x = (tc.z > 0 && tc.y > 0) ? (pzm1ym1x < minThreshold ? 0 : 1) : 0; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + pzym1x = (tc.y > 0) ? (pzym1x > maxThreshold ? 0 : 1) : 0; + pzm1yx = (tc.z > 0) ? (pzm1yx > maxThreshold ? 0 : 1) : 0; + pzm1ym1x = (tc.z > 0 && tc.y > 0) ? (pzm1ym1x > maxThreshold ? 0 : 1) : 0; + } + + ST pzyxm1 = __shfl_up_sync(__activemask(), pzyx, 1); + ST pzym1xm1 = __shfl_up_sync(__activemask(), pzym1x, 1); + ST pzm1yxm1 = __shfl_up_sync(__activemask(), pzm1yx, 1); + + nzm1yx = (tc.z > 0) && (pzyx == pzm1yx); + nzym1x = (tc.y > 0) && (pzyx == pzym1x); + nzyxm1 = (tc.x > 0) && (pzyx == pzyxm1); + + nzym1xm1 = ((tc.y > 0) && (tc.x > 0) && (pzyx == pzym1xm1)); + nzm1yxm1 = ((tc.z > 0) && (tc.x > 0) && (pzyx == pzm1yxm1)); + nzm1ym1x = ((tc.z > 0) && (tc.y > 0) && (pzyx == pzm1ym1x)); + + label = (nzyxm1) ? (tc.z * BW * BH + tc.y * BW + (tc.x - 1)) : (tc.z * BW * BH + tc.y * BW + tc.x); + label = (nzym1x) ? (tc.z * BW * BH + (tc.y - 1) * BW + tc.x) : label; + label = (nzm1yx) ? 
((tc.z - 1) * BW * BH + tc.y * BW + tc.x) : label; + + labels[tc.z * BW * BH + tc.y * BW + tc.x] = label; + } + + __syncthreads(); + + if (gc.x < shape.x && gc.y < shape.y && gc.z < shape.z) + { + labels[tc.z * BW * BH + tc.y * BW + tc.x] = FindRoot(labels, label); + } + + __syncthreads(); + + if (gc.x < shape.x && gc.y < shape.y && gc.z < shape.z) + { + if (nzym1x && nzm1yx && !nzm1ym1x) + { + Reduction(labels, label, labels[tc.z * BW * BH + (tc.y - 1) * BW + tc.x]); + } + + if (nzyxm1 && ((nzm1yx && !nzm1yxm1) || (nzym1x && !nzym1xm1))) + { + label = Reduction(labels, label, labels[tc.z * BW * BH + tc.y * BW + tc.x - 1]); + } + } + + __syncthreads(); + + if (gc.x < shape.x && gc.y < shape.y && gc.z < shape.z) + { + label = labels[tc.z * BW * BH + tc.y * BW + tc.x]; + + label = FindRoot(labels, label); + + DT lx = label % BW; + DT ly = (label / BW) % BH; + DT lz = (label / (BW * BH)) % BD; + + DT dstStrideD = dst.strides()[1] / sizeof(DT); + DT dstStrideH = dst.strides()[2] / sizeof(DT); + + dst[gc] = (blockIdx.z * BD + lz) * dstStrideD + (blockIdx.y * BH + ly) * dstStrideH + blockIdx.x * BW + lx; + } + } +} + +template +__global__ void ZLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +{ + int4 gc; + gc.x = ((blockIdx.x * blockDim.x) + threadIdx.x); + gc.y = ((blockIdx.y * blockDim.y) + threadIdx.y); + gc.z = ((blockIdx.z * blockDim.z) + threadIdx.z) * blockDim.z + blockDim.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + bool thread_x = (gc.x % blockDim.x) == 0; + bool thread_y = (gc.y % blockDim.y) == 0; + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + ST pzyx = src[gc]; + ST pzm1yx = *src.ptr(gc.w, gc.z - 1, gc.y, gc.x); + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + pzm1yx = pzm1yx < minThreshold || pzm1yx > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + pzm1yx = pzm1yx < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + pzm1yx = pzm1yx > maxThreshold ? 0 : 1; + } + + ST pzyxm1 = __shfl_up_sync(0xffffffff, pzyx, 1); + ST pzm1yxm1 = __shfl_up_sync(0xffffffff, pzm1yx, 1); + + if (pzyx == pzm1yx) + { + ST pzym1x = (!thread_y) ? *src.ptr(gc.w, gc.z, gc.y - 1, gc.x) : 0; + ST pzm1ym1x = (!thread_y) ? *src.ptr(gc.w, gc.z - 1, gc.y - 1, gc.x) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pzym1x = pzym1x < minThreshold || pzym1x > maxThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x < minThreshold || pzm1ym1x > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzym1x = pzym1x < minThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzym1x = pzym1x > maxThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x > maxThreshold ? 0 : 1; + } + + bool nzym1x = (!thread_y) ? (pzyx == pzym1x) : false; + bool nzm1ym1x = (!thread_y) ? (pzyx == pzm1ym1x) : false; + + if ((thread_x || (pzyx != pzyxm1) || (pzyx != pzm1yxm1)) && (!nzym1x || !nzm1ym1x)) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.w, gc.z - 1, gc.y, gc.x); + + Reduction(dst.ptr(gc.w), label1, label2); + } + } + } +} + +template +__global__ void YLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +{ + int4 gc; + gc.x = ((blockIdx.x * blockDim.x) + threadIdx.x); + gc.y = ((blockIdx.z * blockDim.z) + threadIdx.z) * blockDim.y + blockDim.y; + gc.z = ((blockIdx.y * blockDim.y) + threadIdx.y); + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + bool thread_x = (gc.x % blockDim.x) == 0; + bool thread_z = (gc.z % blockDim.z) == 0; + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + ST pzyx = src[gc]; + ST pzym1x = *src.ptr(gc.w, gc.z, gc.y - 1, gc.x); + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + pzym1x = pzym1x < minThreshold || pzym1x > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + pzym1x = pzym1x < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + pzym1x = pzym1x > maxThreshold ? 0 : 1; + } + + ST pzyxm1 = __shfl_up_sync(0xffffffff, pzyx, 1); + ST pzym1xm1 = __shfl_up_sync(0xffffffff, pzym1x, 1); + + if (pzyx == pzym1x) + { + ST pzm1yx = (!thread_z) ? *src.ptr(gc.w, gc.z - 1, gc.y, gc.x) : 0; + ST pzm1ym1x = (!thread_z) ? *src.ptr(gc.w, gc.z - 1, gc.y - 1, gc.x) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pzm1yx = pzm1yx < minThreshold || pzm1yx > maxThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x < minThreshold || pzm1ym1x > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzm1yx = pzm1yx < minThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzm1yx = pzm1yx > maxThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x > maxThreshold ? 0 : 1; + } + + bool nzm1yx = (!thread_z) ? (pzyx == pzm1yx) : false; + bool nzm1ym1x = (!thread_z) ? (pzyx == pzm1ym1x) : false; + + if ((!nzm1yx || !nzm1ym1x) && (thread_x || (pzyx != pzyxm1) || (pzyx != pzym1xm1))) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.w, gc.z, gc.y - 1, gc.x); + + Reduction(dst.ptr(gc.w), label1, label2); + } + } + } +} + +template +__global__ void XLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +{ + int4 gc; + gc.x = ((blockIdx.z * blockDim.z) + threadIdx.z) * blockDim.x + blockDim.x; + gc.y = ((blockIdx.y * blockDim.y) + threadIdx.y); + gc.z = ((blockIdx.x * blockDim.x) + threadIdx.x); + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + bool thread_y = (gc.y % blockDim.y) == 0; + bool thread_z = (gc.z % blockDim.z) == 0; + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + ST pzyx = src[gc]; + ST pzyxm1 = *src.ptr(gc.w, gc.z, gc.y, gc.x - 1); + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + pzyxm1 = pzyxm1 < minThreshold || pzyxm1 > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + pzyxm1 = pzyxm1 < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + pzyxm1 = pzyxm1 > maxThreshold ? 0 : 1; + } + + ST pzm1yx = __shfl_up_sync(0xffffffff, pzyx, 1); + ST pzm1yxm1 = __shfl_up_sync(0xffffffff, pzyxm1, 1); + + if (pzyx == pzyxm1) + { + ST pzym1x = (!thread_y) ? *src.ptr(gc.w, gc.z, gc.y - 1, gc.x) : 0; + ST pzym1xm1 = (!thread_y) ? *src.ptr(gc.w, gc.z, gc.y - 1, gc.x - 1) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pzym1x = pzym1x < minThreshold || pzym1x > maxThreshold ? 0 : 1; + pzym1xm1 = pzym1xm1 < minThreshold || pzym1xm1 > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzym1x = pzym1x < minThreshold ? 0 : 1; + pzym1xm1 = pzym1xm1 < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzym1x = pzym1x > maxThreshold ? 0 : 1; + pzym1xm1 = pzym1xm1 > maxThreshold ? 0 : 1; + } + + bool nzym1x = (!thread_y) ? (pzyx == pzym1x) : false; + bool nzym1xm1 = (!thread_y) ? (pzyx == pzym1xm1) : false; + + if ((thread_z || (pzyx != pzm1yx) || (pzyx != pzm1yxm1)) && (!nzym1x || !nzym1xm1)) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.w, gc.z, gc.y, gc.x - 1); + + Reduction(dst.ptr(gc.w), label1, label2); + } + } + } +} + +template +__global__ void ResolveLabels3D(cuda::Tensor4DWrap
dst, int4 shape) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + dst[gc] = FindRoot(dst.ptr(gc.w), dst[gc]); + } +} + +template +__global__ void ReplaceBgLabels3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, + cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap minThresh, + cuda::Tensor1DWrap maxThresh, int4 shape) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + ST pzyx = src[gc]; + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + } + + DT backgroundLabel = bgLabel[gc.w]; + + // If src has bg label, put it in dst; if dst has bg label, it means a wrong label was assigned to a + // region, replace its label by a label never assigned, i.e. one-element-after-the-end stride + + if (pzyx == backgroundLabel) + { + dst[gc] = backgroundLabel; + } + else if (dst[gc] == (DT)backgroundLabel) + { + dst[gc] = dst.strides()[0] / sizeof(DT); + } + } +} + +template +__global__ void CountLabels3D(cuda::Tensor1DWrap
<DT> count, cuda::Tensor3DWrap<DT>
 stats, cuda::Tensor4DWrap<DT>
dst, + cuda::Tensor1DWrap bgLabel, int4 shape, int maxCapacity) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + DT posLabel = gc.z * dst.strides()[1] / sizeof(DT) + gc.y * dst.strides()[2] / sizeof(DT) + gc.x; + DT endLabel = dst.strides()[0] / sizeof(DT); + + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST backgroundLabel = hasBgLabel ? bgLabel[gc.w] : 0; + + DT label = dst[gc]; + + if (hasBgLabel && label == (DT)backgroundLabel) + { + continue; // do not count background labels + } + + DT regionIdx; + bool counted = false; + + if (hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) + { + // This is a special region marked with one-element-after-the-end label, count it + regionIdx = atomicAdd(count.ptr(gc.w), 1); + counted = true; + } + else if (label == posLabel) + { + // This is the first element of a regular region, count it + regionIdx = atomicAdd(count.ptr(gc.w), 1); + counted = true; + } + + // If statistics should be computed and the region index is inside the allowed storage (the M maximum + // capacity in stats tensor), replace the output label by the region index and store initial statistics + + if (counted && stats.ptr(0) != nullptr && regionIdx < maxCapacity) + { + // TODO: improve the mark of output label as region index with 1 in the 1st bit + dst[gc] = regionIdx | (DT)(1 << 31); + + *stats.ptr(gc.w, (int)regionIdx, 0) = label; + *stats.ptr(gc.w, (int)regionIdx, 1) = (DT)gc.x; + *stats.ptr(gc.w, (int)regionIdx, 2) = (DT)gc.y; + *stats.ptr(gc.w, (int)regionIdx, 3) = (DT)gc.z; + *stats.ptr(gc.w, (int)regionIdx, 4) = 1; + *stats.ptr(gc.w, (int)regionIdx, 5) = 1; + *stats.ptr(gc.w, (int)regionIdx, 6) = 1; + *stats.ptr(gc.w, (int)regionIdx, 7) = 1; + } + } +} + +template +__global__ void ComputeStats3D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor4DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, + int4 shape, bool relabel) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + DT endLabel = dst.strides()[0] / sizeof(DT); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST backgroundLabel = hasBgLabel ? bgLabel[gc.w] : 0; + + DT label = dst[gc]; + + if (hasBgLabel && label == (DT)backgroundLabel) + { + continue; // do not compute statistics for background labels + } + if (label & (DT)(1 << 31)) + { + continue; // label is marked as region index, its statistics is already computed + } + if (hasBgLabel && label == endLabel) + { + // This is a special region marked with one-element-after-the-end label, its label was the bg label + label = backgroundLabel; + } + + DT regionIdx = dst.ptr(gc.w)[label]; + + if (regionIdx & (DT)(1 << 31)) + { + regionIdx = regionIdx & (DT) ~(1 << 31); + + if (relabel) + { + if (hasBgLabel && regionIdx >= (DT)backgroundLabel) + { + dst[gc] = regionIdx + 1; // skip one region index equals to background label when relabeling + } + else + { + dst[gc] = regionIdx; + } + } + + int3 cornerPos{(int)*stats.ptr(gc.w, (int)regionIdx, 1), (int)*stats.ptr(gc.w, (int)regionIdx, 2), + (int)*stats.ptr(gc.w, (int)regionIdx, 3)}; + + int3 bboxArea = cuda::abs(cornerPos - cuda::DropCast<3>(gc)) + 1; + + atomicMax(stats.ptr(gc.w, (int)regionIdx, 4), (DT)bboxArea.x); + atomicMax(stats.ptr(gc.w, (int)regionIdx, 5), (DT)bboxArea.y); + atomicMax(stats.ptr(gc.w, (int)regionIdx, 6), (DT)bboxArea.z); + atomicAdd(stats.ptr(gc.w, (int)regionIdx, 7), 1); + } + } +} + +template +__global__ void RemoveIslands3D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor4DWrap<DT>
 dst, + cuda::Tensor1DWrap<ST> bgLabel, cuda::Tensor1DWrap<DT>
minSize, int4 shape, + bool relabel) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + DT endLabel = dst.strides()[0] / sizeof(DT); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + DT label = dst[gc]; + + ST backgroundLabel = bgLabel[gc.w]; + + if (label == (DT)backgroundLabel) + { + continue; + } + if (label == endLabel) + { + // This is a special region marked with one-element-after-the-end label, its label was the backgroundLabel + label = backgroundLabel; + } + + DT regionIdx = 0; + + if (!(label & (DT)(1 << 31))) + { + if (relabel) + { + if (label >= (DT)backgroundLabel + 1) + { + regionIdx = label - 1; // go back one region index to account for background label + } + else + { + regionIdx = label; + } + } + else + { + regionIdx = dst.ptr(gc.w)[label]; + + if (regionIdx & (DT)(1 << 31)) + { + regionIdx = regionIdx & (DT) ~(1 << 31); + } + else + { + return; // invalid region index + } + } + } + else + { + regionIdx = label & (DT) ~(1 << 31); + } + + DT regionSize = *stats.ptr(gc.w, (int)regionIdx, 7); + + // If region size is less than minimum size, it is an island and should be removed, i.e. set to background label + if (regionSize < minSize[gc.w]) + { + dst[gc] = backgroundLabel; + } + } +} + +template +__global__ void Relabel3D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor4DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, + int4 shape, bool relabel) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + DT label = dst[gc]; + + if (label & (DT)(1 << 31)) + { + // Label is marked as region index, relabel it back to proper label + DT regionIdx = label & (DT) ~(1 << 31); + + if (relabel) + { + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + ST backgroundLabel = hasBgLabel ? bgLabel[gc.w] : 0; + + if (hasBgLabel && regionIdx >= (DT)backgroundLabel) + { + dst[gc] = regionIdx + 1; // skip one region index equals to background label when relabeling + } + else + { + dst[gc] = regionIdx; + } + } + else + { + dst[gc] = *stats.ptr(gc.w, (int)regionIdx, 0); + } + } + } +} + +// Run functions --------------------------------------------------------------- + +template +inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, const int4 &shapeWHDN, + const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, + int numDim, bool relabel) +{ + constexpr int BW = 32, BH = 4, BD = 2; // block width, height and depth + + int4 idsNDHW{srcData.layout().find('N'), srcData.layout().find('D'), srcData.layout().find('H'), + srcData.layout().find('W')}; + + NVCV_ASSERT(srcData.stride(idsNDHW.w) == sizeof(SrcT)); + NVCV_ASSERT(dstData.stride(idsNDHW.w) == sizeof(DstT)); + + cuda::Tensor1DWrap bgLabelWrap, minThreshWrap, maxThreshWrap; + cuda::Tensor1DWrap minSizeWrap, countWrap; + cuda::Tensor3DWrap statsWrap; + + int maxCapacity = 0; + +#define CVCUDA_LABEL_WRAP(TENSOR, WRAPPER, TENSORWRAP) \ + if (TENSOR) \ + { \ + auto data = TENSOR.exportData(); \ + if (!data) \ + { \ + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, #TENSOR " tensor must be cuda-accessible"); \ + } \ + TENSORWRAP = WRAPPER(data->basePtr()); \ + } + + CVCUDA_LABEL_WRAP(bgLabel, cuda::Tensor1DWrap, bgLabelWrap); + CVCUDA_LABEL_WRAP(minThresh, cuda::Tensor1DWrap, minThreshWrap); + CVCUDA_LABEL_WRAP(maxThresh, cuda::Tensor1DWrap, maxThreshWrap); + CVCUDA_LABEL_WRAP(minSize, cuda::Tensor1DWrap, minSizeWrap); + +#undef CVCUDA_LABEL_WRAP + + if (count) + { + auto data = count.exportData(); + if (!data) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "count tensor must be cuda-accessible"); + } + + countWrap = cuda::Tensor1DWrap(data->basePtr()); + + NVCV_CHECK_THROW(cudaMemsetAsync(data->basePtr(), 0, sizeof(DstT) * shapeWHDN.w, stream)); + } + if (stats) + { + auto data = stats.exportData(); + if (!data) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "stats tensor must be cuda-accessible"); + } + + statsWrap = cuda::Tensor3DWrap(data->basePtr(), (int)data->stride(0), (int)data->stride(1)); + + maxCapacity = data->shape(1); + } + + if (numDim == 2) + { + int2 sizeWH{shapeWHDN.x, shapeWHDN.y}; + int2 srcStridesNH{0, (int)srcData.stride(idsNDHW.z)}; + int2 dstStridesNH{0, (int)dstData.stride(idsNDHW.z)}; + + srcStridesNH.x = idsNDHW.x == -1 ? srcStridesNH.y * shapeWHDN.y : (int)srcData.stride(idsNDHW.x); + dstStridesNH.x = idsNDHW.x == -1 ? 
dstStridesNH.y * shapeWHDN.y : (int)dstData.stride(idsNDHW.x); + + dim3 larThreads(BW, BH, 1); + dim3 labBlocks(util::DivUp(sizeWH.x, BW), util::DivUp(sizeWH.y, BH), shapeWHDN.w); + dim3 redBlocksX(util::DivUp(sizeWH.y, BW), util::DivUp((int)labBlocks.x, BH), shapeWHDN.w); + dim3 redBlocksY(util::DivUp(sizeWH.x, BW), util::DivUp((int)labBlocks.y, BH), shapeWHDN.w); + + cuda::Tensor3DWrap srcWrap(srcData.basePtr(), srcStridesNH.x, srcStridesNH.y); + cuda::Tensor3DWrap dstWrap(dstData.basePtr(), dstStridesNH.x, dstStridesNH.y); + + BlockLabel2D + <<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, sizeWH); + + YLabelReduction2D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + sizeWH); + + XLabelReduction2D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + sizeWH); + + ResolveLabels2D<<>>(dstWrap, sizeWH); + + if (bgLabel) + { + ReplaceBgLabels2D<<>>(dstWrap, srcWrap, bgLabelWrap, minThreshWrap, + maxThreshWrap, sizeWH); + } + if (count) + { + CountLabels2D<<>>(countWrap, statsWrap, dstWrap, bgLabelWrap, sizeWH, + maxCapacity); + + if (stats) + { + ComputeStats2D<<>>(statsWrap, dstWrap, bgLabelWrap, sizeWH, relabel); + + if (minSize) + { + RemoveIslands2D<<>>(statsWrap, dstWrap, bgLabelWrap, minSizeWrap, + sizeWH, relabel); + } + + Relabel2D<<>>(statsWrap, dstWrap, bgLabelWrap, sizeWH, relabel); + } + } + } + else + { + int3 srcStridesNDH{0, (int)srcData.stride(idsNDHW.y), (int)srcData.stride(idsNDHW.z)}; + int3 dstStridesNDH{0, (int)dstData.stride(idsNDHW.y), (int)dstData.stride(idsNDHW.z)}; + + srcStridesNDH.x = idsNDHW.x == -1 ? srcStridesNDH.y * shapeWHDN.z : (int)srcData.stride(idsNDHW.x); + dstStridesNDH.x = idsNDHW.x == -1 ? dstStridesNDH.y * shapeWHDN.z : (int)dstData.stride(idsNDHW.x); + + dim3 larThreads(BW, BH, BD); + dim3 labBlocks(util::DivUp(shapeWHDN.x, BW), util::DivUp(shapeWHDN.y, BH), util::DivUp(shapeWHDN.z, BD)); + dim3 redBlocksX(util::DivUp(shapeWHDN.z, BW), util::DivUp(shapeWHDN.y, BH), util::DivUp((int)labBlocks.x, BD)); + dim3 redBlocksY(util::DivUp(shapeWHDN.x, BW), util::DivUp(shapeWHDN.z, BH), util::DivUp((int)labBlocks.y, BD)); + dim3 redBlocksZ(util::DivUp(shapeWHDN.x, BW), util::DivUp(shapeWHDN.y, BH), util::DivUp((int)labBlocks.z, BD)); + + cuda::Tensor4DWrap srcWrap(srcData.basePtr(), srcStridesNDH.x, srcStridesNDH.y, srcStridesNDH.z); + cuda::Tensor4DWrap dstWrap(dstData.basePtr(), dstStridesNDH.x, dstStridesNDH.y, dstStridesNDH.z); + + BlockLabel3D + <<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, shapeWHDN); + + ZLabelReduction3D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + shapeWHDN); + + YLabelReduction3D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + shapeWHDN); + + XLabelReduction3D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + shapeWHDN); + + ResolveLabels3D<<>>(dstWrap, shapeWHDN); + + if (bgLabel) + { + ReplaceBgLabels3D<<>>(dstWrap, srcWrap, bgLabelWrap, minThreshWrap, + maxThreshWrap, shapeWHDN); + } + if (count) + { + CountLabels3D<<>>(countWrap, statsWrap, dstWrap, bgLabelWrap, shapeWHDN, + maxCapacity); + + if (stats) + { + ComputeStats3D<<>>(statsWrap, dstWrap, bgLabelWrap, shapeWHDN, + relabel); + + if (minSize) + { + RemoveIslands3D<<>>(statsWrap, dstWrap, bgLabelWrap, minSizeWrap, + shapeWHDN, relabel); + } + + Relabel3D<<>>(statsWrap, dstWrap, bgLabelWrap, shapeWHDN, relabel); + } + } + } +} + +inline void RunLabel(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, const int4 &srcShape, nvcv::DataType srcDataType, + const nvcv::Tensor 
&bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, int numDim, + bool relabel) +{ + switch (srcDataType) + { +#define CVCUDA_LABEL_CASE(DT, T) \ + case nvcv::TYPE_##DT: \ + RunLabelForType(stream, srcData, dstData, srcShape, bgLabel, minThresh, maxThresh, minSize, count, stats, \ + numDim, relabel); \ + break + + CVCUDA_LABEL_CASE(U8, uint8_t); + CVCUDA_LABEL_CASE(U16, uint16_t); + CVCUDA_LABEL_CASE(U32, uint32_t); + CVCUDA_LABEL_CASE(S8, int8_t); + CVCUDA_LABEL_CASE(S16, int16_t); + CVCUDA_LABEL_CASE(S32, int32_t); + +#undef CVCUDA_LABEL_CASE + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid input data type"); + } +} + +} // anonymous namespace + +namespace cvcuda::priv { + +// Constructor ----------------------------------------------------------------- + +Label::Label() {} + +// Tensor operator ------------------------------------------------------------- + +void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, + const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels) const +{ + if (!(in.shape().layout() == nvcv::TENSOR_HW || in.shape().layout() == nvcv::TENSOR_HWC + || in.shape().layout() == nvcv::TENSOR_NHW || in.shape().layout() == nvcv::TENSOR_NHWC + || in.shape().layout() == nvcv::TENSOR_DHW || in.shape().layout() == nvcv::TENSOR_DHWC + || in.shape().layout() == nvcv::TENSOR_NDHW || in.shape().layout() == nvcv::TENSOR_NDHWC)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have [N][D]HW[C] layout"); + } + + // We expect input and output shape to be the same as TensorShape contains TensorLayout + + if (!(in.shape() == out.shape())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input and output tensors must have the same shape and layout"); + } + if (!(out.dtype() == nvcv::TYPE_U32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output tensor data type must be U32"); + } + + auto inData = in.exportData(); + if (!inData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must be cuda-accessible"); + } + + auto outData = out.exportData(); + if (!outData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output tensor must be cuda-accessible"); + } + + if (outData->stride(0) >= cuda::TypeTraits::max + || (uint32_t)outData->stride(0) / (uint32_t)sizeof(uint32_t) >= (uint32_t)(1 << 31)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big input and output tensors"); + } + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + if (!inAccess) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have strided access"); + } + if (!(inAccess->numChannels() == 1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have a single channel"); + } + if (!(inAccess->numPlanes() == 1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have a single plane"); + } + if (inAccess->numSamples() > cuda::TypeTraits::max) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Too big number of samples %ld, must be smaller than or equal to %d", + inAccess->numSamples(), 
cuda::TypeTraits::max); + } + + int4 inShape{inAccess->numCols(), inAccess->numRows(), 1, (int)inAccess->numSamples()}; // WHDN shape + + int inDepthIdx = in.shape().layout().find('D'); + + int numDim = (connectivity == NVCV_CONNECTIVITY_4_2D || connectivity == NVCV_CONNECTIVITY_8_2D) ? 2 : 3; + + bool relabel = (assignLabels == NVCV_LABEL_SEQUENTIAL); + + if (inDepthIdx != -1) + { + if (numDim == 2) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Connectivity 2D not allowed in tensors with depth D dimension"); + } + + NVCV_ASSERT(inDepthIdx >= 0 && inDepthIdx < in.shape().rank()); + + int64_t inDepth = in.shape()[inDepthIdx]; + + if (inDepth > cuda::TypeTraits::max) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Too big depth %ld, must be smaller than or equal to %d", inDepth, + cuda::TypeTraits::max); + } + + inShape.z = static_cast(inDepth); + } + else + { + if (numDim == 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Connectivity 3D not allowed in tensors without depth D dimension"); + } + } + + if (bgLabel) + { + if (!((bgLabel.rank() == 1 && bgLabel.shape()[0] == inShape.w) + || (bgLabel.rank() == 2 && bgLabel.shape()[0] == inShape.w && bgLabel.shape()[1] == 1))) + { + std::ostringstream oss; + oss << bgLabel.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input bgLabel must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(bgLabel.dtype() == in.dtype())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input (%s) and bgLabel (%s) tensors must have the same data type", + nvcvDataTypeGetName(in.dtype()), nvcvDataTypeGetName(bgLabel.dtype())); + } + } + + if (minThresh) + { + if (!((minThresh.rank() == 1 && minThresh.shape()[0] == inShape.w) + || (minThresh.rank() == 2 && minThresh.shape()[0] == inShape.w && minThresh.shape()[1] == 1))) + { + std::ostringstream oss; + oss << minThresh.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input minThresh must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(minThresh.dtype() == in.dtype())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input (%s) and minThresh (%s) tensors must have the same data type", + nvcvDataTypeGetName(in.dtype()), nvcvDataTypeGetName(minThresh.dtype())); + } + } + + if (maxThresh) + { + if (!((maxThresh.rank() == 1 && maxThresh.shape()[0] == inShape.w) + || (maxThresh.rank() == 2 && maxThresh.shape()[0] == inShape.w && maxThresh.shape()[1] == 1))) + { + std::ostringstream oss; + oss << maxThresh.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input maxThresh must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(maxThresh.dtype() == in.dtype())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input (%s) and maxThresh (%s) tensors must have the same data type", + nvcvDataTypeGetName(in.dtype()), nvcvDataTypeGetName(maxThresh.dtype())); + } + } + + if (count) + { + if (!((count.rank() == 1 && count.shape()[0] == inShape.w) + || (count.rank() == 2 && count.shape()[0] == inShape.w && count.shape()[1] == 1))) + { + std::ostringstream oss; + oss << count.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output count must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(count.dtype() == nvcv::TYPE_U32)) + { + 
throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output count (%s) must have U32 data type", + nvcvDataTypeGetName(count.dtype())); + } + } + + if (stats) + { + if (!count) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output stats requires count tensor"); + } + if (!((stats.rank() == 3 && stats.shape()[0] == inShape.w && stats.shape()[2] == 2 + 2 * numDim))) + { + std::ostringstream oss; + oss << stats.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output stats must be [NMA] tensor, with rank=3 N=%d A=%d, got %s", inShape.w, + 2 + 2 * numDim, oss.str().c_str()); + } + if (!(stats.dtype() == nvcv::TYPE_U32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output stats (%s) must have U32 data type", + nvcvDataTypeGetName(stats.dtype())); + } + } + else if (relabel) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output stats tensor must not be NULL to have sequential labels"); + } + + if (minSize) + { + if (!bgLabel || !stats) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input minSize requires bgLabel and stats tensors"); + } + + if (!((minSize.rank() == 1 && minSize.shape()[0] == inShape.w) + || (minSize.rank() == 2 && minSize.shape()[0] == inShape.w && minSize.shape()[1] == 1))) + { + std::ostringstream oss; + oss << minSize.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input minSize must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(minSize.dtype() == nvcv::TYPE_U32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input minSize (%s) must have U32 data type", + nvcvDataTypeGetName(minSize.dtype())); + } + } + + // TODO: Support full connectivity + if (connectivity == NVCV_CONNECTIVITY_8_2D || connectivity == NVCV_CONNECTIVITY_26_3D) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Full neighborhood labeling not supported yet"); + } + + RunLabel(stream, *inData, *outData, inShape, in.dtype(), bgLabel, minThresh, maxThresh, minSize, count, stats, + numDim, relabel); +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpLabel.hpp b/src/cvcuda/priv/OpLabel.hpp new file mode 100644 index 00000000..08d34f33 --- /dev/null +++ b/src/cvcuda/priv/OpLabel.hpp @@ -0,0 +1,48 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpLabel.hpp + * + * @brief Defines the private C++ Class for the Label operation. 
+ */ + +#ifndef CVCUDA_PRIV_LABEL_HPP +#define CVCUDA_PRIV_LABEL_HPP + +#include "IOperator.hpp" + +#include +#include +#include + +namespace cvcuda::priv { + +class Label final : public IOperator +{ +public: + explicit Label(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel, + const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize, + const nvcv::Tensor &count, const nvcv::Tensor &stats, NVCVConnectivityType connectivity, + NVCVLabelType assignLabels) const; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV_LABEL_HPP diff --git a/src/cvcuda/priv/OpMinMaxLoc.cu b/src/cvcuda/priv/OpMinMaxLoc.cu index d60f69c4..ba16fbe7 100644 --- a/src/cvcuda/priv/OpMinMaxLoc.cu +++ b/src/cvcuda/priv/OpMinMaxLoc.cu @@ -31,6 +31,8 @@ #include +#include + namespace { // Utilities for MinMaxLoc operator -------------------------------------------- @@ -764,11 +766,12 @@ inline void RunMinMaxLocDataIn(cudaStream_t stream, const DataStridedCuda &inDat if (!DataTypeMatches(inDataType, minValData->dtype())) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "Wrong output minVal data type %s for input tensor data type %s output minVal data " - "type must be S32/U32/F32/F64: for input data type S8/S16/S32 use S32; for " - "U8/U16/U32 use U32; for all other data types use same data type as input tensor", - nvcvDataTypeGetName(minValData->dtype()), nvcvDataTypeGetName(inDataType)); + std::ostringstream oss; + oss << "for minVal=" << nvcvDataTypeGetName(minValData->dtype()) + << " for input=" << nvcvDataTypeGetName(inDataType) + << "; output minVal data type must be S32/U32/F32/F64: for input " + << "data type S8/S16 use S32; for U8/U16 use U32; for all other data types use same as input tensor"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Wrong data types: %s", oss.str().c_str()); } if (!((minValData->rank() == 0 && inNumSamples == 1) || ((minValData->rank() == 1 || minValData->rank() == 2) && inNumSamples == minValData->shape(0)))) @@ -842,11 +845,12 @@ inline void RunMinMaxLocDataIn(cudaStream_t stream, const DataStridedCuda &inDat if (!DataTypeMatches(inDataType, maxValData->dtype())) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "Wrong output maxVal data type %s for input tensor data type %s output maxVal data " - "type must be S32/U32/F32/F64: for input data type S8/S16/S32 use S32; for " - "U8/U16/U32 use U32; for all other data types use same data type as input tensor", - nvcvDataTypeGetName(maxValData->dtype()), nvcvDataTypeGetName(inDataType)); + std::ostringstream oss; + oss << "for maxVal=" << nvcvDataTypeGetName(maxValData->dtype()) + << " for input=" << nvcvDataTypeGetName(inDataType) + << "; output maxVal data type must be S32/U32/F32/F64: for input " + << "data type S8/S16 use S32; for U8/U16 use U32; for all other data types use same as input tensor"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Wrong data types: %s", oss.str().c_str()); } if (!((maxValData->rank() == 0 && inNumSamples == 1) || ((maxValData->rank() == 1 || maxValData->rank() == 2) && inNumSamples == maxValData->shape(0)))) diff --git a/src/cvcuda/priv/OpPairwiseMatcher.cu b/src/cvcuda/priv/OpPairwiseMatcher.cu new file mode 100644 index 00000000..676ec865 --- /dev/null +++ b/src/cvcuda/priv/OpPairwiseMatcher.cu @@ -0,0 +1,665 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Assert.h" +#include "OpPairwiseMatcher.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace { + +// Utilities definitions ------------------------------------------------------- + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +constexpr int kIntMax = cuda::TypeTraits::max; // maximum number represented in an int + +constexpr int kNumThreads = 64; // number of threads per block + +// Key value pair type used in CUB (CUDA Unbound) sort and reduce-min operations +// The idea is to sort or get the minimum by distance (dist) and then by index (idx) +struct KeyValueT +{ + float dist; + int idx; +}; + +// Point class primary template is not intended to be used directly, instead only its partial specializations +// (below) are used, where: is the point type, i.e. the tensor type of the set storing the points; is the +// maximum number of bytes hold by the point class as a cache from global memory (GMEM) +template +class PointT; + +// Point class partial specialization for type T with NB = 0, a fall-through Point class meaning that n-dimensional +// points in a set are loaded directly from global memory (GMEM) without caching +template +class PointT +{ +public: + static constexpr int kMaxSize = 0; // maximum size in bytes of a single point stored by this class + + __device__ PointT() = default; + + inline __device__ void load(cuda::Tensor3DWrap set, int sampleIdx, int setIdx, int numDim) + { + data = set.ptr(sampleIdx, setIdx); + } + + inline __device__ T &operator[](int i) const + { + return data[i]; + } + +private: + T *data; +}; + +using RT = uint32_t; // resource type used in a cache inside the PointT class below + +// Point class partial specialization for type T with any NB, meaning that n-dimensional points in a set are loaded +// from global memory (GMEM) and stored in a cache (can be registers or local memory or shared memory), thus this +// class can only be used by points with up to NB size in bytes, i.e. 
n * sizeof(T) <= NB +template +class PointT +{ + static_assert(NB > 0, "Maximum number of bytes capacity in PointT class must be positive"); + +public: + static constexpr int kMaxSize = NB; // maximum size in bytes of a single point stored by this class + static constexpr int kNumElem = NB / sizeof(RT); // number of elements in array serving as a cache + static constexpr int kMaxDims = NB / sizeof(T); // maximum number of dimensions a single point may have + + __device__ PointT() = default; + + inline __device__ void load(cuda::Tensor3DWrap set, int sampleIdx, int setIdx, int numDim) + { +#pragma unroll + for (int i = 0; i < kNumElem && i < util::DivUp(numDim * (int)sizeof(T), (int)sizeof(RT)); ++i) + { + data[i] = *reinterpret_cast(set.ptr(sampleIdx, setIdx, i * (int)(sizeof(RT) / sizeof(T)))); + } + } + + inline __device__ T &operator[](int i) const + { + return reinterpret_cast(&data[0])[i]; + } + +private: + RT data[kNumElem]; +}; + +// Is compatible checks if a {numDim}-dimensional point fits in the corresponding Point T class (above) +template +inline __host__ bool isCompatible(int numDim) +{ + return (numDim * (int)sizeof(T)) <= NB; +} + +// Get minimum stride is used to check if a {numDim}-dimensional Point of type T smaller than RT (the cache +// resource type in PointT class) can be read in steps of RT, allowing overflow after the last T element +template +inline __host__ int getMinStride(int numDim) +{ + return util::DivUp(numDim * (int)sizeof(T), (int)sizeof(RT)) * (int)sizeof(RT); +} + +// CUDA functions -------------------------------------------------------------- + +// Reduce-min by key a key-value pair for CUB (CUDA Unbound) to do block-wide reduction to minimum in the first thread +inline __device__ KeyValueT minkey(const KeyValueT &a, const KeyValueT &b) +{ + return (a.dist < b.dist || (a.dist == b.dist && a.idx < b.idx)) ? a : b; +} + +// Absolute difference | a - b | for floating-point values +template>> +inline __device__ T absdiff(T a, T b) +{ + return cuda::abs(a - b); +} + +// Absolute difference for integral values, computing difference in unsigned types may lead to wrap around +template>> +inline __device__ std::make_unsigned_t absdiff(T a, T b) +{ + return a < b ? b - a : a - b; // wrapping around is fine! 
+} + +// Compute {distance} between elements {e1} and {e2} from n-dimensional points p1 and p2 +template +inline __device__ void ComputeDistance(float &distance, const T &e1, const T &e2) +{ + if constexpr (NORM == NVCV_NORM_HAMMING) + { + distance += __popc(e1 ^ e2); + } + else if constexpr (NORM == NVCV_NORM_L1) + { + distance += absdiff(e1, e2); + } + else + { + static_assert(NORM == NVCV_NORM_L2, "ComputeDistance accepts only HAMMING, L1 or L2 norms"); + + float d = absdiff(e1, e2); + + distance = fma(d, d, distance); // square-root is postponed as not needed to find best matches + } +} + +// Sort pairs of (distance, index) one per thread from a fixed point p1 to all points p2 in set2 with numDim +// dimensions, each point is an array with numDim elements of source type ST, each set is an array of points, and +// the tensor is an array of sets where the sampleIdx selects the current set within it with set2Size points +template +inline __device__ void SortKeyValue(float &sortedDist, int &sortedIdx, const Point &p1, const SetWrapper &set2, + int numDim, int matchesPerPoint, int sampleIdx, int set2Size) +{ + sortedDist = cuda::TypeTraits::max; + sortedIdx = -1; + + float curDist; + Point p2; + + for (int set2Idx = threadIdx.x; set2Idx < set2Size; set2Idx += kNumThreads) + { + p2.load(set2, sampleIdx, set2Idx, numDim); + + curDist = 0.f; + + if constexpr (Point::kMaxSize > 0) + { +#pragma unroll + for (int i = 0; i < Point::kMaxDims && i < numDim; ++i) + { + ComputeDistance(curDist, p1[i], p2[i]); + } + } + else + { + for (int i = 0; i < numDim; ++i) + { + ComputeDistance(curDist, p1[i], p2[i]); + } + } + + if (curDist < sortedDist) + { + sortedDist = curDist; + sortedIdx = set2Idx; + } + } + + __syncthreads(); // wait for all the threads to complete their local sorted (distance, index) pair + + if (matchesPerPoint == 1) // fast path for top-1 sort is reduce minimum + { + using BlockReduce = cub::BlockReduce; + + __shared__ typename BlockReduce::TempStorage cubTempStorage; + + KeyValueT keyValue{sortedDist, sortedIdx}; + + KeyValueT minKeyValue = BlockReduce(cubTempStorage).Reduce(keyValue, minkey); + + if (threadIdx.x == 0) + { + sortedDist = minKeyValue.dist; + sortedIdx = minKeyValue.idx; + } + } + else // normal path to get top-N where N > 1 requires block sort + { + using BlockSort = cub::BlockRadixSort; + + __shared__ typename BlockSort::TempStorage cubTempStorage; + + float keys[1] = {sortedDist}; + int values[1] = {sortedIdx}; + + BlockSort(cubTempStorage).Sort(keys, values); + + if (threadIdx.x < matchesPerPoint) + { + sortedDist = keys[0]; + sortedIdx = values[0]; + } + } +} + +// Write a match of (set1Idx, set2Idx) with (distance) found at matchIdx inside output matches and distances +template +inline __device__ void WriteMatch(int matchIdx, int set1Idx, int set2Idx, int sampleIdx, float &distance, + cuda::Tensor3DWrap matches, cuda::Tensor2DWrap distances) +{ + *matches.ptr(sampleIdx, matchIdx, 0) = set1Idx; + *matches.ptr(sampleIdx, matchIdx, 1) = set2Idx; + + if (distances.ptr(0) != nullptr) + { + if constexpr (NORM == NVCV_NORM_L2) + { + distance = cuda::sqrt(distance); // square-root was postpone for writing time, which is now + } + + *distances.ptr(sampleIdx, matchIdx) = distance; + } +} + +// Brute-force matcher finds closest pairs of n-dimensional points in set1 and set2, comparing all against all, it +// is instantiated by: an upper limit of each point size in bytes; type; and source type +template +__global__ void BruteForceMatcher(cuda::Tensor3DWrap set1, 
cuda::Tensor3DWrap set2, + cuda::Tensor1DWrap numSet1, cuda::Tensor1DWrap numSet2, + cuda::Tensor3DWrap matches, cuda::Tensor1DWrap numMatches, + cuda::Tensor2DWrap distances, int set1Capacity, int set2Capacity, + int outCapacity, int numDim, bool crossCheck, int matchesPerPoint) +{ + int sampleIdx = blockIdx.x; + int set1Idx = blockIdx.y; + int set1Size = set1Capacity; + + if (numSet1.ptr(0) != nullptr) + { + set1Size = numSet1[sampleIdx]; + set1Size = set1Size > set1Capacity ? set1Capacity : set1Size; + } + + if (set1Idx >= set1Size) + { + return; + } + + int set2Size = set2Capacity; + + if (numSet2.ptr(0) != nullptr) + { + set2Size = numSet2[sampleIdx]; + set2Size = set2Size > set2Capacity ? set2Capacity : set2Size; + } + + PointT p; + + p.load(set1, sampleIdx, set1Idx, numDim); + + float dist; + int set2Idx; + + SortKeyValue(dist, set2Idx, p, set2, numDim, matchesPerPoint, sampleIdx, set2Size); + + if (crossCheck) + { + __shared__ int set2Idx2; + + if (threadIdx.x == 0) + { + set2Idx2 = set2Idx; + } + + __syncthreads(); // wait the first thread to communicate the best match in set2 index + + p.load(set2, sampleIdx, set2Idx2, numDim); + + float dist2; + int set1Idx2; + + SortKeyValue(dist2, set1Idx2, p, set1, numDim, matchesPerPoint, sampleIdx, set1Size); + + if (threadIdx.x == 0 && set1Idx2 == set1Idx) + { + int matchIdx = atomicAdd(numMatches.ptr(sampleIdx), 1); + + if (matchIdx < outCapacity) + { + WriteMatch(matchIdx, set1Idx, set2Idx, sampleIdx, dist, matches, distances); + } + } + } + else + { + if (threadIdx.x < matchesPerPoint) + { + int matchIdx = set1Idx * matchesPerPoint + threadIdx.x; + + if (matchIdx < outCapacity) + { + WriteMatch(matchIdx, set1Idx, set2Idx, sampleIdx, dist, matches, distances); + } + } + } +} + +// Write number of matches in the case without cross check this number is set1 size times matches per point +__global__ void WriteNumMatches(cuda::Tensor1DWrap numSet1, cuda::Tensor1DWrap numMatches, + int set1Capacity, int matchesPerPoint) +{ + int sampleIdx = blockIdx.x; + int set1Size = (numSet1.ptr(0) == nullptr) ? 
set1Capacity : numSet1[sampleIdx]; + + numMatches[sampleIdx] = set1Size * matchesPerPoint; +} + +// Run functions --------------------------------------------------------------- + +// Run brute-force matcher, using NORM type for distance calculations and SrcT is the input source data type +template +inline void RunBruteForceMatcherForNorm(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, + const nvcv::Tensor &matches, const nvcv::Tensor &numMatches, + const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint) +{ + cuda::Tensor3DWrap w_set1, w_set2; // tensor wraps of set1 and set2 and other tensors + cuda::Tensor1DWrap w_numSet1, w_numSet2; + cuda::Tensor3DWrap w_matches; + cuda::Tensor1DWrap w_numMatches; + cuda::Tensor2DWrap w_distances; + +#define CVCUDA_BFM_WRAP(TENSOR) \ + if (TENSOR) \ + { \ + auto data = TENSOR.exportData(); \ + if (!data) \ + { \ + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, #TENSOR " tensor must be cuda-accessible"); \ + } \ + w_##TENSOR = decltype(w_##TENSOR)(*data); \ + } + + CVCUDA_BFM_WRAP(set1); + CVCUDA_BFM_WRAP(set2); + + CVCUDA_BFM_WRAP(numSet1); + CVCUDA_BFM_WRAP(numSet2); + + CVCUDA_BFM_WRAP(matches); + CVCUDA_BFM_WRAP(numMatches); + + CVCUDA_BFM_WRAP(distances); + +#undef CVCUDA_BFM_WRAP + + int numSamples = set1.shape()[0]; // number of samples, where each sample is a set of points + int set1Capacity = set1.shape()[1]; // set capacity is the maximum allowed number of points in set1 + int set2Capacity = set2.shape()[1]; // set capacity is the maximum allowed number of points in set2 + int numDim = set1.shape()[2]; // number of dimensions of each n-dimensional point in set1 and set2 + int outCapacity = matches.shape()[1]; // output capacity to store matches and distances + int minStride = getMinStride(numDim); // minimum stride in sets to allow the usage of PointT class + + dim3 threads(kNumThreads, 1, 1); + dim3 blocks1(numSamples, 1, 1); + dim3 blocks2(numSamples, set1Capacity, 1); + + if (crossCheck) + { + // Cross check returns a varying number of matches, as a match is only valid if it is the best (closest) + // match from set1 to set2 and back from set2 to set1, the numMatches output starts at zero and is + // atomically incremented in the BruteForceMatcher kernel + + NVCV_CHECK_THROW(cudaMemsetAsync(w_numMatches.ptr(0), 0, sizeof(int32_t) * numSamples, stream)); + } + else + { + // Without cross check has a fixed number of matches equal to the set1 size, meaning for every point in + // set1 there is (are) one (or more) matche(s) (up to matchesPerPoint) in set2 + + if (numMatches) + { + WriteNumMatches<<>>(w_numSet1, w_numMatches, set1Capacity, matchesPerPoint); + } + } + + // Cache-based kernel specialization: numDim and SrcT must fit a cache in PointT class; it works for 32B and + // 128B descriptors, such as ORB and SIFT. Even though it has 256 bytes spill loads/stores for NB = 128, it + // still gives almost 2x performance benefit. + + // TODO: The caveat of below kernel specializations is that it takes time to compile (~30sec) and it does not + // cover points bigger than 128B in size, incurring in low performance for big points. It may be better + // to use shared memory for those big points, given a certain maximum point dimension, and use threads to + // compute per element results instead of per point. 
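Before the kernel dispatch below, it is worth seeing how this host-side entry point is reached in practice. The following is a minimal sketch of a call into the private operator declared later in this patch (OpPairwiseMatcher.hpp); the tensors are assumed to be created by the caller with the shapes enforced by the checks further down: set1/set2 as [N, M, D], matches as [N, M', 2] S32, numSet1/numSet2/numMatches as [N] S32, and distances as [N, M'] F32.

    // Minimal sketch: cross-checked Hamming matching of binary (U8/U32) descriptors
    // using the private PairwiseMatcher. Tensor allocation is left to the caller.
    #include <cuda_runtime.h>
    #include <nvcv/Tensor.hpp>
    #include "OpPairwiseMatcher.hpp"

    void MatchBinaryDescriptors(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2,
                                const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches,
                                const nvcv::Tensor &numMatches, const nvcv::Tensor &distances)
    {
        cvcuda::priv::PairwiseMatcher matcher(NVCV_BRUTE_FORCE);

        // crossCheck keeps only mutual best matches, so matchesPerPoint must be 1 and
        // numMatches must be provided (it is filled atomically by the kernel).
        matcher(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances,
                /*crossCheck=*/true, /*matchesPerPoint=*/1, NVCV_NORM_HAMMING);
    }

Hamming distance is only meaningful for integral descriptor types, which is why the norm dispatch further down rejects float inputs for that norm.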
+ +#define CVCUDA_BFM_RUN(NB) \ + BruteForceMatcher<<>>( \ + w_set1, w_set2, w_numSet1, w_numSet2, w_matches, w_numMatches, w_distances, set1Capacity, set2Capacity, \ + outCapacity, numDim, crossCheck, matchesPerPoint); \ + return + + if (w_set1.strides()[1] >= minStride && w_set2.strides()[1] >= minStride) + { + if (isCompatible(numDim)) + { + CVCUDA_BFM_RUN(32); + } + else if (isCompatible(numDim)) + { + CVCUDA_BFM_RUN(128); + } + } + + CVCUDA_BFM_RUN(0); + +#undef CVCUDA_BFM_RUN +} + +template +inline void RunBruteForceMatcherForType(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, + const nvcv::Tensor &matches, const nvcv::Tensor &numMatches, + const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType) +{ + switch (normType) + { + case NVCV_NORM_HAMMING: + if constexpr (std::is_floating_point_v) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid norm Hamming with float input type"); + } + else + { + RunBruteForceMatcherForNorm(stream, set1, set2, numSet1, numSet2, matches, + numMatches, distances, crossCheck, matchesPerPoint); + } + break; + +#define CVCUDA_BFM_CASE(NORM) \ + case NORM: \ + RunBruteForceMatcherForNorm(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances, \ + crossCheck, matchesPerPoint); \ + break + + CVCUDA_BFM_CASE(NVCV_NORM_L1); + CVCUDA_BFM_CASE(NVCV_NORM_L2); + +#undef CVCUDA_BFM_CASE + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid norm type"); + } +} + +inline void RunBruteForceMatcher(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches, + const nvcv::Tensor &numMatches, const nvcv::Tensor &distances, bool crossCheck, + int matchesPerPoint, NVCVNormType normType) +{ + switch (set1.dtype()) + { +#define CVCUDA_BFM_CASE(DT, T) \ + case nvcv::TYPE_##DT: \ + RunBruteForceMatcherForType(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances, \ + crossCheck, matchesPerPoint, normType); \ + break + + CVCUDA_BFM_CASE(U8, uint8_t); + CVCUDA_BFM_CASE(U32, uint32_t); + CVCUDA_BFM_CASE(F32, float); + +#undef CVCUDA_BFM_CASE + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid input data type"); + } +} + +} // anonymous namespace + +namespace cvcuda::priv { + +// Constructor ----------------------------------------------------------------- + +PairwiseMatcher::PairwiseMatcher(NVCVPairwiseMatcherType algoChoice) + : m_algoChoice(algoChoice) +{ + // Support additional algorithms here (only brute force for now), they may require payload + if (algoChoice != NVCV_BRUTE_FORCE) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid algorithm choice"); + } +} + +// Tensor operator ------------------------------------------------------------- + +void PairwiseMatcher::operator()(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches, + const nvcv::Tensor &numMatches, const nvcv::Tensor &distances, bool crossCheck, + int matchesPerPoint, NVCVNormType normType) +{ + // Check each input and output tensor and their properties are conforming to what is expected + + if (!set1 || !set2 || !matches) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Required tensors: set1 set2 matches"); + } + if (set1.rank() != 
3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input set1 must be a rank-3 tensor"); + } + if (set2.rank() != 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input set2 must be a rank-3 tensor"); + } + if (set1.dtype() != set2.dtype()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input sets must have the same data type"); + } + + int64_t numSamples = set1.shape()[0]; + int64_t numDim = set1.shape()[2]; + + if (set2.shape()[0] != numSamples || set2.shape()[2] != numDim) + { + std::ostringstream oss; + oss << (set2 ? set2.shape() : nvcv::TensorShape()); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid set2 shape %s is not [NMD]: N=%ld D=%ld", + oss.str().c_str(), numSamples, numDim); + } + + if (numSamples > kIntMax || numDim > kIntMax || set1.shape()[1] > kIntMax || set2.shape()[1] > kIntMax) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big input tensors, shape > %d", kIntMax); + } + + if (numSet1 + && ((numSet1.rank() != 1 && numSet1.rank() != 2) || numSet1.shape()[0] != numSamples + || (numSet1.rank() == 2 && numSet1.shape()[1] != 1) || numSet1.dtype() != nvcv::TYPE_S32)) + { + std::ostringstream oss; + oss << numSet1.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid numSet1 shape %s dtype %s are not [N] or [NC]: N=%ld C=1 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(numSet1.dtype()), numSamples); + } + + if (numSet2 + && ((numSet2.rank() != 1 && numSet2.rank() != 2) || numSet2.shape()[0] != numSamples + || (numSet2.rank() == 2 && numSet2.shape()[1] != 1) || numSet2.dtype() != nvcv::TYPE_S32)) + { + std::ostringstream oss; + oss << numSet2.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid numSet2 shape %s dtype %s are not [N] or [NC]: N=%ld C=1 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(numSet2.dtype()), numSamples); + } + + if (matches.rank() != 3 || matches.shape()[0] != numSamples || matches.shape()[1] >= kIntMax + || matches.shape()[2] != 2 || matches.dtype() != nvcv::TYPE_S32) + { + std::ostringstream oss; + oss << matches.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid matches shape %s dtype %s are not [NMA]: N=%ld M<%d A=2 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(matches.dtype()), numSamples, kIntMax); + } + + if (numMatches + && ((numMatches.rank() != 1 && numMatches.rank() != 2) || numMatches.shape()[0] != numSamples + || (numMatches.rank() == 2 && numMatches.shape()[1] != 1) || numMatches.dtype() != nvcv::TYPE_S32)) + { + std::ostringstream oss; + oss << numMatches.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid numMatches shape %s dtype %s are not [N] or [NC]: N=%ld C=1 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(numMatches.dtype()), numSamples); + } + + int64_t outCapacity = matches.shape()[1]; + + if (distances + && ((distances.rank() != 2 && distances.rank() != 3) || distances.shape()[0] != numSamples + || distances.shape()[1] != outCapacity || (distances.rank() == 3 && distances.shape()[2] != 1) + || distances.dtype() != nvcv::TYPE_F32)) + { + std::ostringstream oss; + oss << distances.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid distances shape %s dtype %s are not [NM] or [NMC]: N=%ld M=%ld C=1 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(distances.dtype()), numSamples, outCapacity); + } + + if (matchesPerPoint <= 0 || matchesPerPoint > 
kNumThreads) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid matchesPerPoint %d is not in [1, %d]", + matchesPerPoint, kNumThreads); + } + if (crossCheck && matchesPerPoint != 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid matchesPerPoint %d for crossCheck=true is not 1", matchesPerPoint); + } + if (crossCheck && !numMatches) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid numMatches=NULL for crossCheck=true"); + } + + if (m_algoChoice == NVCV_BRUTE_FORCE) + { + RunBruteForceMatcher(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances, crossCheck, + matchesPerPoint, normType); + } +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpPairwiseMatcher.hpp b/src/cvcuda/priv/OpPairwiseMatcher.hpp new file mode 100644 index 00000000..208a4cc9 --- /dev/null +++ b/src/cvcuda/priv/OpPairwiseMatcher.hpp @@ -0,0 +1,49 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpPairwiseMatcher.hpp + * + * @brief Defines the private C++ Class for the PairwiseMatcher operation. + */ + +#ifndef CVCUDA_PRIV_PAIRWISE_MATCHER_HPP +#define CVCUDA_PRIV_PAIRWISE_MATCHER_HPP + +#include "IOperator.hpp" + +#include + +namespace cvcuda::priv { + +class PairwiseMatcher final : public IOperator +{ +public: + explicit PairwiseMatcher(NVCVPairwiseMatcherType algoChoice); + + void operator()(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches, + const nvcv::Tensor &numMatches, const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType); + +private: + NVCVPairwiseMatcherType m_algoChoice; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV_PAIRWISE_MATCHER_HPP diff --git a/src/cvcuda/priv/OpPillowResize.cpp b/src/cvcuda/priv/OpPillowResize.cpp index 73ce7ebc..a72fa7f9 100644 --- a/src/cvcuda/priv/OpPillowResize.cpp +++ b/src/cvcuda/priv/OpPillowResize.cpp @@ -28,7 +28,26 @@ namespace cvcuda::priv { namespace leg = nvcv::legacy; namespace legacy = nvcv::legacy::cuda_op; -PillowResize::PillowResize(nvcv::Size2D maxSize, int maxBatchSize, NVCVImageFormat fmt) +PillowResize::PillowResize() +{ + m_legacyOp = std::make_unique(); + m_legacyOpVarShape = std::make_unique(); +} + +WorkspaceRequirements PillowResize::getWorkspaceRequirements(int batchSize, const nvcv::Size2D *in_sizes, + const nvcv::Size2D *out_sizes, NVCVImageFormat fmt) +{ + nvcv::Size2D maxInSize{0, 0}, maxOutSize{0, 0}; + for (int i = 0; i < batchSize; i++) + { + maxInSize = nvcv::MaxSize(in_sizes[i], maxInSize); + maxOutSize = nvcv::MaxSize(out_sizes[i], maxOutSize); + } + return getWorkspaceRequirements(batchSize, maxInSize, maxOutSize, fmt); +} + +WorkspaceRequirements PillowResize::getWorkspaceRequirements(int maxBatchSize, 
nvcv::Size2D maxInSize, + nvcv::Size2D maxOutSize, NVCVImageFormat fmt) { int32_t bpc[4]; nvcvImageFormatGetBitsPerChannel(fmt, bpc); @@ -36,15 +55,17 @@ PillowResize::PillowResize(nvcv::Size2D maxSize, int maxBatchSize, NVCVImageForm nvcvImageFormatGetNumChannels(fmt, &maxChannel); NVCVDataKind dataKind; nvcvImageFormatGetDataKind(fmt, &dataKind); - nvcv::DataKind dkind = static_cast(dataKind); - leg::cuda_op::DataType data_type = leg::helpers::GetLegacyDataType(bpc[0], dkind); - leg::cuda_op::DataShape maxIn(maxBatchSize, maxChannel, maxSize.h, maxSize.w), - maxOut(maxBatchSize, maxChannel, maxSize.h, maxSize.w); - m_legacyOp = std::make_unique(maxIn, maxOut, data_type); - m_legacyOpVarShape = std::make_unique(maxIn, maxOut, data_type); + nvcv::DataKind dkind = static_cast(dataKind); + leg::cuda_op::DataType dataType = leg::helpers::GetLegacyDataType(bpc[0], dkind); + leg::cuda_op::DataShape maxIn(maxBatchSize, maxChannel, maxInSize.h, maxInSize.w); + leg::cuda_op::DataShape maxOut(maxBatchSize, maxChannel, maxOutSize.h, maxOutSize.w); + auto req = m_legacyOp->getWorkspaceRequirements(maxIn, maxOut, dataType); + auto reqVarShape = m_legacyOpVarShape->getWorkspaceRequirements(maxIn, maxOut, dataType); + + return MaxWorkspaceReq(req, reqVarShape); } -void PillowResize::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, +void PillowResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVInterpolationType interpolation) const { auto inData = in.exportData(); @@ -61,13 +82,13 @@ void PillowResize::operator()(cudaStream_t stream, const nvcv::Tensor &in, const "Output must be device-acessible, pitch-linear tensor"); } - NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *outData, interpolation, stream)); + NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *outData, interpolation, stream, ws)); } -void PillowResize::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, +void PillowResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, const NVCVInterpolationType interpolation) const { - NVCV_CHECK_THROW(m_legacyOpVarShape->infer(in, out, interpolation, stream)); + NVCV_CHECK_THROW(m_legacyOpVarShape->infer(in, out, interpolation, stream, ws)); } } // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpPillowResize.hpp b/src/cvcuda/priv/OpPillowResize.hpp index 7b2356fd..c6f743b1 100644 --- a/src/cvcuda/priv/OpPillowResize.hpp +++ b/src/cvcuda/priv/OpPillowResize.hpp @@ -25,6 +25,7 @@ #define CVCUDA_PRIV_PILLOW_RESIZE_HPP #include "IOperator.hpp" +#include "cvcuda/Workspace.hpp" #include "legacy/CvCudaLegacy.h" #include @@ -37,13 +38,20 @@ namespace cvcuda::priv { class PillowResize final : public IOperator { public: - explicit PillowResize(nvcv::Size2D maxSize, int maxBatchSize, NVCVImageFormat fmt); + PillowResize(); - void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, - const NVCVInterpolationType interpolation) const; - void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, + WorkspaceRequirements getWorkspaceRequirements(int batchSize, const nvcv::Size2D *in_sizes, + const nvcv::Size2D *out_sizes, NVCVImageFormat fmt); + + WorkspaceRequirements getWorkspaceRequirements(int batchSize, nvcv::Size2D maxInSize, nvcv::Size2D maxOutSize, + NVCVImageFormat fmt); + + void operator()(cudaStream_t stream, const Workspace &ws, const 
nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVInterpolationType interpolation) const; + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &in, + const nvcv::ImageBatchVarShape &out, const NVCVInterpolationType interpolation) const; + private: std::unique_ptr m_legacyOp; std::unique_ptr m_legacyOpVarShape; diff --git a/src/cvcuda/priv/OpSIFT.cu b/src/cvcuda/priv/OpSIFT.cu index 045b1cd7..cfe2499b 100644 --- a/src/cvcuda/priv/OpSIFT.cu +++ b/src/cvcuda/priv/OpSIFT.cu @@ -1237,6 +1237,16 @@ void SIFT::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::T throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Operator call arg. initSigma=%f must be positive", initSigma); } + if (contrastThreshold <= 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Argument contrastThreshold=%f must be positive", + contrastThreshold); + } + if (edgeThreshold <= 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Argument edgeThreshold=%f must be positive", + edgeThreshold); + } if (numOctaveLayers < 1 || numOctaveLayers > m_maxOctaveLayers) { diff --git a/src/cvcuda/priv/OpStack.cpp b/src/cvcuda/priv/OpStack.cpp new file mode 100644 index 00000000..3b1707a7 --- /dev/null +++ b/src/cvcuda/priv/OpStack.cpp @@ -0,0 +1,101 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OpStack.hpp" + +#include "nvcv/TensorDataAccess.hpp" + +#include +#include + +namespace cvcuda::priv { + +void Stack::operator()(cudaStream_t stream, const nvcv::TensorBatch &in, const nvcv::Tensor &out) const +{ + auto outData = out.exportData(); + if (outData == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be cuda-accessible, pitch-linear tensor"); + } + + // read out data N, H, W and C + if (out.rank() != 4) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output must be NCHW orNHWC tensor"); + } + + uint32_t outN = out.shape()[0]; + // this works for both NCHW and NHWC since we are just checking if H,W,C are the same + uint32_t outH = out.shape()[1]; + uint32_t outW = out.shape()[2]; + uint32_t outC = out.shape()[3]; + + uint32_t copyIndex = 0; + for (auto it = in.begin(); it != in.end(); ++it) + { + // check if output is large enough since we could have a combo of N and non N tensors on input. + if (copyIndex >= outN) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output tensor is not large enough to hold all input tensors"); + } + + //check if data layout and shape is are equal. + uint32_t isN = (it->rank() == 4) ? 
1 : 0; + if (outH != it->shape()[0 + isN] || outW != it->shape()[1 + isN] || outC != it->shape()[2 + isN]) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input tensors must have the same H, W, and C as output Tensor"); + } + + auto inData = it->exportData(); + if (inData == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be cuda-accessible, pitch-linear tensor"); + } + + copyIndex = copyTensorToNTensor(*outData, *inData, copyIndex, stream); + } +} + +// copies all samples from indata to out data, returns the next index in out data. +int Stack::copyTensorToNTensor(const nvcv::TensorDataStridedCuda &outData, const nvcv::TensorDataStridedCuda &inData, + uint32_t outIndex, cudaStream_t stream) const +{ + auto in = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(in); + auto out = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(out); + + for (uint32_t i = 0; i < in->numSamples(); ++i) + { + nvcv::Byte *inSampData = in->sampleData(i); + nvcv::Byte *outSampData = out->sampleData(outIndex); + for (int32_t p = 0; p < in->numPlanes(); ++p) + { + NVCV_CHECK_LOG(cudaMemcpy2DAsync( + out->planeData(p, outSampData), out->rowStride(), in->planeData(p, inSampData), in->rowStride(), + in->numCols() * in->colStride(), in->numRows(), cudaMemcpyDeviceToDevice, stream)); + } + outIndex++; + } + return outIndex; +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpStack.hpp b/src/cvcuda/priv/OpStack.hpp new file mode 100644 index 00000000..744af988 --- /dev/null +++ b/src/cvcuda/priv/OpStack.hpp @@ -0,0 +1,49 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpStack.hpp + * + * @brief Defines the private C++ Class for the Stack operation. + */ + +#ifndef CVCUDA_PRIV__STACK_HPP +#define CVCUDA_PRIV__STACK_HPP + +#include "IOperator.hpp" + +#include +#include +#include + +#include + +namespace cvcuda::priv { + +class Stack final : public IOperator +{ +public: + void operator()(cudaStream_t stream, const nvcv::TensorBatch &in, const nvcv::Tensor &out) const; + +private: + int copyTensorToNTensor(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + uint32_t outIndex, cudaStream_t stream) const; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV__STACK_HPP diff --git a/src/cvcuda/priv/Types.hpp b/src/cvcuda/priv/Types.hpp new file mode 100644 index 00000000..9580e6bc --- /dev/null +++ b/src/cvcuda/priv/Types.hpp @@ -0,0 +1,643 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CVCUDA_TYPES_HPP +#define CVCUDA_TYPES_HPP + +#include + +#include +#include + +namespace cvcuda::priv { + +#define checkERR(call) check_error(call, #call, __LINE__, __FILE__) + +inline static bool check_error(cudaError_t e, const char *call, int line, const char *file) +{ + if (e != cudaSuccess) + { + fprintf(stderr, "CUDA Runtime error %s # %s, code = %s [ %d ] in file %s:%d\n", call, cudaGetErrorString(e), + cudaGetErrorName(e), e, file, line); + return false; + } + return true; +} + +// Default font, user can install via below command: +// sudo apt-get update +// sudo apt-get install ttf-dejavu fonts-dejavu +#define DEFAULT_OSD_FONT "DejaVuSansMono" + +class NVCVText +{ +public: + const char *utf8Text = nullptr; // Text to draw in utf8 format. + int32_t fontSize; // Font size for the text. + const char *fontName = nullptr; // Font name for the text. + NVCVPointI tlPos; // Top-left corner point for label text, \ref NVCVPointI. + NVCVColorRGBA fontColor; // Font color of the text. + NVCVColorRGBA bgColor; // Background color of text box. + + NVCVText(const char *_utf8Text, int32_t _fontSize, const char *_fontName, NVCVPointI _tlPos, + NVCVColorRGBA _fontColor, NVCVColorRGBA _bgColor) + : fontSize(_fontSize) + , tlPos(_tlPos) + , fontColor(_fontColor) + , bgColor(_bgColor) + { + utf8Text = (const char *)malloc(strlen(_utf8Text) + 1); + memcpy(const_cast(utf8Text), _utf8Text, strlen(_utf8Text) + 1); + fontName = (const char *)malloc(strlen(_fontName) + 1); + memcpy(const_cast(fontName), _fontName, strlen(_fontName) + 1); + } + + NVCVText(const NVCVText &text) + : fontSize(text.fontSize) + , tlPos(text.tlPos) + , fontColor(text.fontColor) + , bgColor(text.bgColor) + { + utf8Text = (const char *)malloc(strlen(text.utf8Text) + 1); + memcpy(const_cast(utf8Text), text.utf8Text, strlen(text.utf8Text) + 1); + fontName = (const char *)malloc(strlen(text.fontName) + 1); + memcpy(const_cast(fontName), text.fontName, strlen(text.fontName) + 1); + } + + NVCVText &operator=(const NVCVText &text) + { + if (this != &text) + { + if (utf8Text != nullptr) + { + free((void *)utf8Text); + utf8Text = nullptr; + } + if (fontName != nullptr) + { + free((void *)fontName); + fontName = nullptr; + } + *this = NVCVText(text); + } + return *this; + }; + + ~NVCVText() + { + if (utf8Text != nullptr) + { + free((void *)utf8Text); + utf8Text = nullptr; + } + if (fontName != nullptr) + { + free((void *)fontName); + fontName = nullptr; + } + }; +}; + +class NVCVSegment +{ +public: + NVCVBoxI box; // Bounding box of segment, \ref NVCVBoxI. + int32_t thickness; // Line thickness of segment outter rect. + float *dSeg = nullptr; // Device pointer for segment mask, cannot be nullptr. + // Array length: segWidth * segHeight + // Format: + // Score_00, Score_01, ..., Score_0k, ... + // Score_10, Score_11, ..., Score_kk, ... + // ... , ... , ..., ... , ... + int32_t segWidth; // Segment mask width. + int32_t segHeight; // Segment mask height. + float segThreshold; // Segment threshold. + NVCVColorRGBA borderColor; // Line color of segment outter rect. 
+ NVCVColorRGBA segColor; // Segment mask color. + + NVCVSegment(NVCVBoxI _box, int32_t _thickness, float *_hSeg, int32_t _segWidth, int32_t _segHeight, + float _segThreshold, NVCVColorRGBA _borderColor, NVCVColorRGBA _segColor) + : box(_box) + , thickness(_thickness) + , segWidth(_segWidth) + , segHeight(_segHeight) + , segThreshold(_segThreshold) + , borderColor(_borderColor) + , segColor(_segColor) + { + checkERR(cudaMalloc(&dSeg, segWidth * segHeight * sizeof(float))); + checkERR(cudaMemcpy(dSeg, _hSeg, segWidth * segHeight * sizeof(float), cudaMemcpyHostToDevice)); + } + + NVCVSegment(const NVCVSegment &segment) + : box(segment.box) + , thickness(segment.thickness) + , segWidth(segment.segWidth) + , segHeight(segment.segHeight) + , segThreshold(segment.segThreshold) + , borderColor(segment.borderColor) + , segColor(segment.segColor) + { + checkERR(cudaMalloc(&dSeg, segWidth * segHeight * sizeof(float))); + checkERR(cudaMemcpy(dSeg, segment.dSeg, segWidth * segHeight * sizeof(float), cudaMemcpyDeviceToDevice)); + } + + NVCVSegment &operator=(const NVCVSegment &) = delete; + + ~NVCVSegment() + { + if (dSeg != nullptr) + { + checkERR(cudaFree(dSeg)); + dSeg = nullptr; + } + }; +}; + +class NVCVPolyLine +{ +public: + int32_t *hPoints = nullptr; // Host pointer for polyline points' xy, cannot be nullptr. + // Array length: 2 * numPoints. + // Format : X0, Y0, X1, Y1, ..., Xk, Yk, ... + int32_t *dPoints = nullptr; // Device pointer for polyline points' xy. + // Can be nullptr only if fillColor.a == 0. + // Array length: 2 * numPoints. + // Format: X0, Y0, X1, Y1, ..., Xk, Yk, ... + int32_t numPoints; // Number of polyline points. + int32_t thickness; // Polyline thickness. + bool isClosed; // Connect p(0) to p(n-1) or not. + NVCVColorRGBA borderColor; // Line color of polyline border. + NVCVColorRGBA fillColor; // Fill color of poly fill area. + bool interpolation; // Default: true + + NVCVPolyLine(int32_t *_hPoints, int32_t _numPoints, int32_t _thickness, bool _isClosed, NVCVColorRGBA _borderColor, + NVCVColorRGBA _fillColor, bool _interpolation) + : numPoints(_numPoints) + , thickness(_thickness) + , isClosed(_isClosed) + , borderColor(_borderColor) + , fillColor(_fillColor) + , interpolation(_interpolation) + { + hPoints = (int *)malloc(numPoints * 2 * sizeof(int)); + checkERR(cudaMalloc(&dPoints, 2 * numPoints * sizeof(int))); + + memcpy(hPoints, _hPoints, 2 * numPoints * sizeof(int)); + checkERR(cudaMemcpy(dPoints, _hPoints, 2 * numPoints * sizeof(int), cudaMemcpyHostToDevice)); + } + + NVCVPolyLine(const NVCVPolyLine &pl) + : numPoints(pl.numPoints) + , thickness(pl.thickness) + , isClosed(pl.isClosed) + , borderColor(pl.borderColor) + , fillColor(pl.fillColor) + , interpolation(pl.interpolation) + { + hPoints = (int *)malloc(numPoints * 2 * sizeof(int)); + checkERR(cudaMalloc(&dPoints, 2 * numPoints * sizeof(int))); + + memcpy(hPoints, pl.hPoints, 2 * numPoints * sizeof(int)); + checkERR(cudaMemcpy(dPoints, pl.dPoints, 2 * numPoints * sizeof(int), cudaMemcpyDeviceToDevice)); + } + + NVCVPolyLine &operator=(const NVCVPolyLine &) = delete; + + ~NVCVPolyLine() + { + if (hPoints != nullptr) + { + free(hPoints); + hPoints = nullptr; + } + if (dPoints != nullptr) + { + checkERR(cudaFree(dPoints)); + dPoints = nullptr; + } + }; +}; + +class NVCVClock +{ +public: + NVCVClockFormat clockFormat; // Pre-defined clock format. + long time; // Clock time. + int32_t fontSize; // Font size. + const char *font = nullptr; // Font name. + NVCVPointI tlPos; // Top-left corner point, \ref NVCVPointI. 
+ NVCVColorRGBA fontColor; // Font color of the text. + NVCVColorRGBA bgColor; // Background color of text box. + + NVCVClock(NVCVClockFormat _clockFormat, long _time, int32_t _fontSize, const char *_font, NVCVPointI _tlPos, + NVCVColorRGBA _fontColor, NVCVColorRGBA _bgColor) + : clockFormat(_clockFormat) + , time(_time) + , fontSize(_fontSize) + , tlPos(_tlPos) + , fontColor(_fontColor) + , bgColor(_bgColor) + { + font = (const char *)malloc(strlen(_font) + 1); + memcpy(const_cast(font), _font, strlen(_font) + 1); + } + + NVCVClock(const NVCVClock &clock) + : clockFormat(clock.clockFormat) + , time(clock.time) + , fontSize(clock.fontSize) + , tlPos(clock.tlPos) + , fontColor(clock.fontColor) + , bgColor(clock.bgColor) + { + font = (const char *)malloc(strlen(clock.font) + 1); + memcpy(const_cast(font), clock.font, strlen(clock.font) + 1); + } + + NVCVClock &operator=(const NVCVClock &clock) + { + if (this != &clock) + { + if (font != nullptr) + { + free((void *)font); + font = nullptr; + } + *this = NVCVClock(clock); + } + return *this; + }; + + ~NVCVClock() + { + if (font != nullptr) + { + free((void *)font); + font = nullptr; + } + }; +}; + +class NVCVElement +{ +public: + NVCVElement(NVCVOSDType osd_type, const void *src); + NVCVElement(const NVCVElement &) = delete; + NVCVElement &operator=(const NVCVElement &) = delete; + ~NVCVElement(); + + NVCVOSDType type(); + void *ptr(); + // void assign(const void* src); + +private: + /* + * type: + * NVCV_OSD_RECT - \ref NVCVBndBoxI. + * NVCV_OSD_TEXT - \ref NVCVText. + * NVCV_OSD_SEGMENT - \ref NVCVSegment. + * NVCV_OSD_POINT - \ref NVCVPoint. + * NVCV_OSD_LINE - \ref NVCVLine. + * NVCV_OSD_POLYLINE - \ref NVCVPolyLine. + * NVCV_OSD_ROTATED_RECT - \ref NVCVRotatedBox. + * NVCV_OSD_CIRCLE - \ref NVCVCircle. + * NVCV_OSD_ARROW - \ref NVCVArrow. + * NVCV_OSD_CLOCK - \ref NVCVClock. + */ + NVCVOSDType m_type; // OSD element type to draw. + void *m_data; // OSD element data pointer. 
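NVCVElement is a small hand-rolled tagged union: it deep-copies whichever OSD payload it is given and hands it back as a void pointer keyed by type(). A hedged sketch of the intended round trip with a bounding-box payload follows; the field values are purely illustrative and the NVCVBndBoxI/NVCVBoxI/NVCVColorRGBA member layout is assumed from the public cvcuda/Types.h.

    // Hedged sketch: building a type-tagged OSD element and reading it back through
    // the NVCVElement class defined in this header.
    #include <cvcuda/Types.h>

    void RoundTripRectElement()
    {
        NVCVBndBoxI box{};
        box.box         = NVCVBoxI{10, 20, 64, 48};     // x, y, width, height (illustrative)
        box.thickness   = 2;                             // outline thickness in pixels
        box.borderColor = NVCVColorRGBA{255, 0, 0, 255}; // opaque red outline
        box.fillColor   = NVCVColorRGBA{0, 0, 0, 0};     // transparent interior

        cvcuda::priv::NVCVElement element(NVCV_OSD_RECT, &box); // deep copy owned by the element

        if (element.type() == NVCV_OSD_RECT)
        {
            auto *rect = static_cast<NVCVBndBoxI *>(element.ptr());
            (void)rect; // same values as `box`, freed by the element's destructor
        }
    }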
+}; + +inline NVCVElement::NVCVElement(NVCVOSDType osd_type, const void *src) + : m_type(osd_type) +{ + switch (m_type) + { + case NVCVOSDType::NVCV_OSD_RECT: + { + auto rect = NVCVBndBoxI(*(NVCVBndBoxI *)src); + m_data = new NVCVBndBoxI(rect); + break; + } + case NVCVOSDType::NVCV_OSD_TEXT: + { + auto text = NVCVText(*(NVCVText *)src); + m_data = new NVCVText(text); + break; + } + case NVCVOSDType::NVCV_OSD_SEGMENT: + { + auto segment = NVCVSegment(*(NVCVSegment *)src); + m_data = new NVCVSegment(segment); + break; + } + case NVCVOSDType::NVCV_OSD_POINT: + { + auto point = NVCVPoint(*(NVCVPoint *)src); + m_data = new NVCVPoint(point); + break; + } + case NVCVOSDType::NVCV_OSD_LINE: + { + auto line = NVCVLine(*(NVCVLine *)src); + m_data = new NVCVLine(line); + break; + } + case NVCVOSDType::NVCV_OSD_POLYLINE: + { + auto pl = NVCVPolyLine(*(NVCVPolyLine *)src); + m_data = new NVCVPolyLine(pl); + break; + } + case NVCVOSDType::NVCV_OSD_ROTATED_RECT: + { + auto rb = NVCVRotatedBox(*(NVCVRotatedBox *)src); + m_data = new NVCVRotatedBox(rb); + break; + } + case NVCVOSDType::NVCV_OSD_CIRCLE: + { + auto circle = NVCVCircle(*(NVCVCircle *)src); + m_data = new NVCVCircle(circle); + break; + } + case NVCVOSDType::NVCV_OSD_ARROW: + { + auto arrow = NVCVArrow(*(NVCVArrow *)src); + m_data = new NVCVArrow(arrow); + break; + } + case NVCVOSDType::NVCV_OSD_CLOCK: + { + auto clock = NVCVClock(*(NVCVClock *)src); + m_data = new NVCVClock(clock); + break; + } + default: + break; + } +} + +inline NVCVElement::~NVCVElement() +{ + switch (m_type) + { + case NVCVOSDType::NVCV_OSD_RECT: + { + NVCVBndBoxI *bndBox = (NVCVBndBoxI *)m_data; + if (bndBox != nullptr) + { + delete (bndBox); + bndBox = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_TEXT: + { + NVCVText *label = (NVCVText *)m_data; + if (label != nullptr) + { + delete (label); + label = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_SEGMENT: + { + NVCVSegment *segment = (NVCVSegment *)m_data; + if (segment != nullptr) + { + delete (segment); + segment = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_POINT: + { + NVCVPoint *point = (NVCVPoint *)m_data; + if (point != nullptr) + { + delete (point); + point = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_LINE: + { + NVCVLine *line = (NVCVLine *)m_data; + if (line != nullptr) + { + delete (line); + line = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_POLYLINE: + { + NVCVPolyLine *pl = (NVCVPolyLine *)m_data; + if (pl != nullptr) + { + delete (pl); + pl = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_ROTATED_RECT: + { + NVCVRotatedBox *rb = (NVCVRotatedBox *)m_data; + if (rb != nullptr) + { + delete (rb); + rb = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_CIRCLE: + { + NVCVCircle *circle = (NVCVCircle *)m_data; + if (circle != nullptr) + { + delete (circle); + circle = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_ARROW: + { + NVCVArrow *arrow = (NVCVArrow *)m_data; + if (arrow != nullptr) + { + delete (arrow); + arrow = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_CLOCK: + { + NVCVClock *clock = (NVCVClock *)m_data; + if (clock != nullptr) + { + delete (clock); + clock = nullptr; + } + break; + } + default: + break; + } +} + +inline NVCVOSDType NVCVElement::type() +{ + return m_type; +} + +inline void *NVCVElement::ptr() +{ + return m_data; +} + +class NVCVBlurBoxesImpl +{ +public: + NVCVBlurBoxesImpl(const std::vector> &blurboxes_vec); + NVCVBlurBoxesImpl(const NVCVBlurBoxesImpl &) = delete; + NVCVBlurBoxesImpl &operator=(const 
NVCVBlurBoxesImpl &) = delete; + ~NVCVBlurBoxesImpl(); + + int32_t batch() const; + int32_t numBoxesAt(int32_t b) const; + NVCVBlurBoxI boxAt(int32_t b, int32_t i) const; + +private: + std::vector> m_blurboxes_vec; +}; + +inline NVCVBlurBoxesImpl::NVCVBlurBoxesImpl(const std::vector> &blurboxes_vec) +{ + m_blurboxes_vec = blurboxes_vec; +} + +inline NVCVBlurBoxesImpl::~NVCVBlurBoxesImpl() +{ + std::vector> tmp; + m_blurboxes_vec.swap(tmp); +} + +inline int32_t NVCVBlurBoxesImpl::batch() const +{ + return m_blurboxes_vec.size(); +} + +inline int32_t NVCVBlurBoxesImpl::numBoxesAt(int32_t b) const +{ + return m_blurboxes_vec[b].size(); +} + +inline NVCVBlurBoxI NVCVBlurBoxesImpl::boxAt(int32_t b, int32_t i) const +{ + return m_blurboxes_vec[b][i]; +} + +class NVCVBndBoxesImpl +{ +public: + NVCVBndBoxesImpl(const std::vector> &bndboxes_vec); + NVCVBndBoxesImpl(const NVCVBndBoxesImpl &) = delete; + NVCVBndBoxesImpl &operator=(const NVCVBndBoxesImpl &) = delete; + ~NVCVBndBoxesImpl(); + + int32_t batch() const; + int32_t numBoxesAt(int32_t b) const; + NVCVBndBoxI boxAt(int32_t b, int32_t i) const; + +private: + std::vector> m_bndboxes_vec; +}; + +inline NVCVBndBoxesImpl::NVCVBndBoxesImpl(const std::vector> &bndboxes_vec) +{ + m_bndboxes_vec = bndboxes_vec; +} + +inline NVCVBndBoxesImpl::~NVCVBndBoxesImpl() +{ + std::vector> tmp; + m_bndboxes_vec.swap(tmp); +} + +inline int32_t NVCVBndBoxesImpl::batch() const +{ + return m_bndboxes_vec.size(); +} + +inline int32_t NVCVBndBoxesImpl::numBoxesAt(int32_t b) const +{ + return m_bndboxes_vec[b].size(); +} + +inline NVCVBndBoxI NVCVBndBoxesImpl::boxAt(int32_t b, int32_t i) const +{ + return m_bndboxes_vec[b][i]; +} + +class NVCVElementsImpl +{ +public: + NVCVElementsImpl(const std::vector>> &elements_vec); + NVCVElementsImpl(const NVCVElementsImpl &) = delete; + NVCVElementsImpl &operator=(const NVCVElementsImpl &) = delete; + ~NVCVElementsImpl(); + + int32_t batch() const; + int32_t numElementsAt(int32_t b) const; + std::shared_ptr elementAt(int32_t b, int32_t i) const; + +private: + std::vector>> m_elements_vec; +}; + +inline NVCVElementsImpl::NVCVElementsImpl(const std::vector>> &elements_vec) +{ + m_elements_vec = elements_vec; +} + +inline NVCVElementsImpl::~NVCVElementsImpl() +{ + std::vector>> tmp; + m_elements_vec.swap(tmp); +} + +inline int32_t NVCVElementsImpl::batch() const +{ + return m_elements_vec.size(); +} + +inline int32_t NVCVElementsImpl::numElementsAt(int32_t b) const +{ + return m_elements_vec[b].size(); +} + +inline std::shared_ptr NVCVElementsImpl::elementAt(int32_t b, int32_t i) const +{ + return m_elements_vec[b][i]; +} + +} // namespace cvcuda::priv + +#endif // CVCUDA_TYPES_HPP diff --git a/src/cvcuda/priv/WorkspaceAllocator.hpp b/src/cvcuda/priv/WorkspaceAllocator.hpp new file mode 100644 index 00000000..bb5575fd --- /dev/null +++ b/src/cvcuda/priv/WorkspaceAllocator.hpp @@ -0,0 +1,216 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CVCUDA_PRIV_WORKSPACE_ALLOCATOR_HPP +#define CVCUDA_PRIV_WORKSPACE_ALLOCATOR_HPP + +#include + +#include + +namespace cvcuda { + +class WorkspaceMemAllocator +{ +public: + WorkspaceMemAllocator(const WorkspaceMemAllocator &) = delete; + WorkspaceMemAllocator &operator=(const WorkspaceMemAllocator &) = delete; + + /** + * @brief Construct a new workspace memory allocator + * + * The function constructs a new allocator. Subsequent calls to `get` will obtain memory pointers to + * workspace entries. + * + * This function sets the default acquire and release streams, but doesn't call acquire - this is deferred to the + * first call to `get`. + * The streams can be overriden in manual calls to `acquire` and `release`. + * + * @param mem Workspace memory + * @param acquireReleaseStream A stream on which the data will be used (or nullopt to denote host usage) + */ + WorkspaceMemAllocator(const WorkspaceMem &mem, std::optional acquireReleaseStream = std::nullopt) + : WorkspaceMemAllocator(mem, acquireReleaseStream, acquireReleaseStream) + { + } + + /** + * @brief Construct a new workspace memory allocator + * + * The function constructs a new allocator. Subsequent calls to `get` will obtain memory pointers to + * workspace entries. + * + * This function sets the default acquire and release streams, but doesn't call acquire - this is deferred to the + * first call to `get`. + * The streams can be overriden in manual calls to `acquire` and `release`. + * + * @param mem Workspace memory + * @param acquireStream A stream on which the data will be used first (or nullopt to denote host usage) + * @param acquireStream A stream on which the data will be used last (or nullopt to denote host usage) + */ + WorkspaceMemAllocator(const WorkspaceMem &mem, std::optional acquireStream, + std::optional releaseStream) + : m_mem(mem) + , m_acquireStream(acquireStream) + , m_releaseStream(releaseStream) + { + } + + ~WorkspaceMemAllocator() + { + if (!m_released) + release(m_releaseStream); + } + + /** + * @brief Allocates `count` elements of type `T` from the workspace memory. + * + * This function calls `acquire` if not called explicitly before. 
+ * + * @tparam T the type of the object to get + * @param count the number of objects to allocate + * @param alignment the extra alignment, must not be less than `alignof(T)` + * @return T* a pointer to the workspace buffer where the requested object is located + */ + template + T *get(size_t count = 1, size_t alignment = alignof(T)) + { + assert(alignment >= alignof(T)); + + if (m_released) + throw std::logic_error("This workspace memory has been released."); + + if (!m_acquired && count > 0) + acquire(m_acquireStream); + + if ((uintptr_t)m_mem.data & (alignment - 1)) + { + throw nvcv::Exception( + nvcv::Status::ERROR_INVALID_ARGUMENT, + "The workspace base pointer is not aligned to match the required alignment of a workspace entry."); + } + + size_t offset = nvcv::detail::AlignUp(m_offset, alignment); + T *ret = reinterpret_cast(static_cast(m_mem.data) + offset); + size_t real_size = nvcv::detail::AlignUp(count * sizeof(T), alignment); + offset += real_size; + if (offset > m_mem.req.size) + throw nvcv::Exception(nvcv::Status::ERROR_OUT_OF_MEMORY, "Operator workspace too small."); + m_offset = offset; + return ret; + } + + constexpr size_t capacity() const + { + return m_mem.req.size; + } + + constexpr size_t allocated() const + { + return m_offset; + } + + /** + * @brief Waits for the memory to become ready for use on the acquire stream, if specified, or on host. + */ + void acquire(std::optional stream) + { + if (m_acquired) + throw std::logic_error("Acquire called multiple times"); + + if (m_released) + throw std::logic_error("This workspace memory has been released."); + + if (m_mem.ready) + { + if (stream) + { + if (cudaStreamWaitEvent(*stream, m_mem.ready) != cudaSuccess) + throw nvcv::Exception(nvcv::Status::ERROR_INTERNAL, "cudaStreamWairEvent failed"); + } + else + { + if (cudaEventSynchronize(m_mem.ready) != cudaSuccess) + throw nvcv::Exception(nvcv::Status::ERROR_INTERNAL, "cudaEventSynchronize failed"); + } + } + m_acquired = true; + } + + /** + * @brief Declares that the memory is ready for reuse by the release stream (if specified) or any thread or stream. 
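Taken together, get(), acquire() and release() give an operator a simple linear-allocation pattern over caller-provided memory. Below is a minimal sketch of that pattern inside an operator's launch path; it assumes the caller sized the workspace from the operator's getWorkspaceRequirements(), and that Workspace exposes hostMem/pinnedMem/cudaMem entries as used by the WorkspaceAllocator helper further down in this header.

    // Minimal sketch: carving per-sample scratch buffers out of a caller-provided
    // workspace. `ws` is assumed to satisfy the operator's workspace requirements.
    #include <cuda_runtime.h>
    #include <cvcuda/Workspace.hpp>
    #include "WorkspaceAllocator.hpp"

    void LaunchWithWorkspace(cudaStream_t stream, const cvcuda::Workspace &ws, int numSamples)
    {
        // Acquire/release are tied to `stream`; acquire happens lazily on the first
        // get(), release is handled by the destructor at the end of this scope.
        cvcuda::WorkspaceMemAllocator cudaMem(ws.cudaMem, stream);

        int   *perSampleCount = cudaMem.get<int>(numSamples);       // first entry
        float *perSampleSum   = cudaMem.get<float>(numSamples, 16); // second entry, 16-byte aligned

        // ... enqueue kernels on `stream` that use perSampleCount / perSampleSum ...
        (void)perSampleCount;
        (void)perSampleSum;
    }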
+ */ + void release(std::optional stream) + { + if (m_released) + throw std::logic_error("Release called multiple times"); + + if (m_mem.ready && m_offset) + { + assert(m_acquired); + + if (stream) + if (cudaEventRecord(m_mem.ready, *stream) != cudaSuccess) + throw nvcv::Exception(nvcv::Status::ERROR_INTERNAL, "cudaEventRecord failed"); + } + m_released = true; + } + +private: + WorkspaceMem m_mem; + size_t m_offset = 0; + bool m_acquired = false, m_released = false; + + std::optional m_acquireStream, m_releaseStream; +}; + +struct WorkspaceAllocator +{ +public: + explicit WorkspaceAllocator(const Workspace &ws) + : hostMem(ws.hostMem) + , pinnedMem(ws.pinnedMem) + , cudaMem(ws.cudaMem) + { + } + + template + T *getHost(size_t count = 1, size_t alignment = alignof(T)) + { + return hostMem.get(count, alignment); + } + + template + T *getPinned(size_t count = 1, size_t alignment = alignof(T)) + { + return pinnedMem.get(count, alignment); + } + + template + T *getCuda(size_t count = 1, size_t alignment = alignof(T)) + { + return cudaMem.get(count, alignment); + } + + WorkspaceMemAllocator hostMem; + WorkspaceMemAllocator pinnedMem; + WorkspaceMemAllocator cudaMem; +}; + +} // namespace cvcuda + +#endif // CVCUDA_PRIV_WORKSPACE_ALLOCATOR_HPP diff --git a/src/cvcuda/priv/WorkspaceEstimator.hpp b/src/cvcuda/priv/WorkspaceEstimator.hpp new file mode 100644 index 00000000..08ec5936 --- /dev/null +++ b/src/cvcuda/priv/WorkspaceEstimator.hpp @@ -0,0 +1,90 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CVCUDA_PRIV_WORKSPACE_ESTIMATOR_HPP +#define CVCUDA_PRIV_WORKSPACE_ESTIMATOR_HPP + +#include + +namespace cvcuda { + +struct WorkspaceMemEstimator +{ + explicit WorkspaceMemEstimator(size_t initial_size = 0, size_t base_alignment = alignof(std::max_align_t)) + : req{initial_size, base_alignment} + { + } + + WorkspaceMemRequirements req; + + template + WorkspaceMemEstimator &add(size_t count = 1, size_t alignment = alignof(T)) + { + if (alignment > req.alignment) + req.alignment = alignment; + req.size = nvcv::detail::AlignUp(req.size, alignment); + req.size += nvcv::detail::AlignUp(count * sizeof(T), alignment); + return *this; + } +}; + +struct WorkspaceEstimator +{ + static constexpr size_t kDefaultPinnedAlignment = 256; + static constexpr size_t kDefaultDeviceAlignment = 256; + + WorkspaceMemEstimator hostMem; + WorkspaceMemEstimator pinnedMem{0, kDefaultPinnedAlignment}; + WorkspaceMemEstimator cudaMem{0, kDefaultDeviceAlignment}; + + template + WorkspaceEstimator &add(bool host, bool pinned, bool cuda, size_t count = 1, size_t alignment = alignof(T)) + { + if (host) + addHost(count, alignment); + if (pinned) + addPinned(count, alignment); + if (cuda) + addCuda(count, alignment); + return *this; + } + + template + WorkspaceEstimator &addHost(size_t count = 1, size_t alignment = alignof(T)) + { + hostMem.add(count, alignment); + return *this; + } + + template + WorkspaceEstimator &addPinned(size_t count = 1, size_t alignment = alignof(T)) + { + pinnedMem.add(count, alignment); + return *this; + } + + template + WorkspaceEstimator &addCuda(size_t count = 1, size_t alignment = alignof(T)) + { + cudaMem.add(count, alignment); + return *this; + } +}; + +} // namespace cvcuda + +#endif // CVCUDA_PRIV_WORKSPACE_ESTIMATOR_HPP diff --git a/src/cvcuda/priv/WorkspaceUtil.hpp b/src/cvcuda/priv/WorkspaceUtil.hpp new file mode 100644 index 00000000..6a174171 --- /dev/null +++ b/src/cvcuda/priv/WorkspaceUtil.hpp @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
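The estimator above is the requirements-side mirror of WorkspaceMemAllocator: an operator issues the same typed add<T>() calls at estimation time that it later issues as get<T>() calls at launch time, so sizes and alignments stay in sync by construction. A hedged sketch follows; it assumes WorkspaceRequirements exposes hostMem/pinnedMem/cudaMem fields matching the Workspace layout.

    // Hedged sketch: computing workspace requirements that pair with the
    // LaunchWithWorkspace sketch given alongside WorkspaceAllocator.hpp above.
    #include <cvcuda/Workspace.hpp>
    #include "WorkspaceEstimator.hpp"

    cvcuda::WorkspaceRequirements EstimateMyOpWorkspace(int numSamples)
    {
        cvcuda::WorkspaceEstimator est;
        est.addCuda<int>(numSamples);       // mirrors get<int>(numSamples) at launch time
        est.addCuda<float>(numSamples, 16); // mirrors get<float>(numSamples, 16)

        cvcuda::WorkspaceRequirements req{};
        req.hostMem   = est.hostMem.req;
        req.pinnedMem = est.pinnedMem.req;
        req.cudaMem   = est.cudaMem.req;
        return req;
    }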
+ */ + +#ifndef CVCUDA_PRIV_WORKSPACE_UTIL_HPP +#define CVCUDA_PRIV_WORKSPACE_UTIL_HPP + +#include "WorkspaceAllocator.hpp" +#include "WorkspaceEstimator.hpp" + +#endif // CVCUDA_PRIV_WORKSPACE_UTIL_HPP diff --git a/src/cvcuda/priv/legacy/CMakeLists.txt b/src/cvcuda/priv/legacy/CMakeLists.txt index 99c09da1..11a2a517 100644 --- a/src/cvcuda/priv/legacy/CMakeLists.txt +++ b/src/cvcuda/priv/legacy/CMakeLists.txt @@ -68,7 +68,6 @@ set(CV_CUDA_PRIV_LEGACY_OP_FILES adaptive_threshold_var_shape.cu threshold_var_shape.cu threshold_util.cu - bnd_box.cu box_blur.cu osd.cu textbackend/backend.cpp diff --git a/src/cvcuda/priv/legacy/CvCudaLegacy.h b/src/cvcuda/priv/legacy/CvCudaLegacy.h index 4b24a7f1..f2919dd9 100644 --- a/src/cvcuda/priv/legacy/CvCudaLegacy.h +++ b/src/cvcuda/priv/legacy/CvCudaLegacy.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,11 @@ namespace nvcv::legacy::cuda_op { +using cvcuda::Workspace; +using cvcuda::WorkspaceMem; +using cvcuda::WorkspaceMemRequirements; +using cvcuda::WorkspaceRequirements; + enum ErrorCode { SUCCESS = 0, @@ -294,13 +300,6 @@ class ConvertTo : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const double alpha, const double beta, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class CustomCrop : public CudaBaseOp @@ -374,13 +373,6 @@ class CustomCrop : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVRectI roi, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class MinAreaRect : public CudaBaseOp @@ -527,14 +519,6 @@ class Flip : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, const int32_t flipCode, cudaStream_t stream); - - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class FlipOrCopyVarShape : public CudaBaseOp @@ -600,12 +584,6 @@ class FlipOrCopyVarShape : public CudaBaseOp */ ErrorCode infer(const ImageBatchVarShapeDataStridedCuda &input, const ImageBatchVarShapeDataStridedCuda &output, const TensorDataStridedCuda &flipCode, cudaStream_t stream); - - /** - * @brief calculate the gpu buffer size needed by this operator - * @param maxBatchSize Maximum batch size that may be used - */ - size_t calBufferSize(int maxBatchSize); }; class Reformat : public CudaBaseOp @@ -677,14 +655,8 @@ class Reformat : public CudaBaseOp * @param stream for the 
asynchronous execution. */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - void checkDataFormat(DataFormat format); + + void checkDataFormat(DataFormat format); }; class Resize : public CudaBaseOp @@ -753,13 +725,6 @@ class Resize : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const NVCVInterpolationType interpolation, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class Morphology : public CudaBaseOp @@ -986,14 +951,8 @@ class Normalize : public CudaBaseOp const TensorDataStridedCuda &scaleData, const TensorDataStridedCuda &outData, const float global_scale, const float shift, const float epsilon, const uint32_t flags, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - void checkParamShape(DataShape input_shape, DataShape param_shape); + + void checkParamShape(DataShape input_shape, DataShape param_shape); }; class PadAndStack : public CudaBaseOp @@ -1059,8 +1018,6 @@ class PadAndStack : public CudaBaseOp ErrorCode infer(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, const NVCVBorderType borderMode, const float borderValue, cudaStream_t stream); - - size_t calBufferSize(int batch_size); }; class Rotate : public CudaBaseOp @@ -1125,13 +1082,6 @@ class MedianBlur : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const nvcv::Size2D ksize, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class NormalizeVarShape : public CudaBaseOp @@ -1386,13 +1336,6 @@ class CenterCrop : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int crop_rows, int crop_columns, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output 
DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class RotateVarShape : public CudaBaseOp @@ -1496,14 +1439,6 @@ class Laplacian : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int ksize, float scale, NVCVBorderType borderMode, cudaStream_t stream); - - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class Gaussian : public CudaBaseOp @@ -2227,43 +2162,22 @@ class OSD : public CudaBaseOp ~OSD(); /** - * @brief Converts an image from one color space to another. + * @brief Draw OSD elements onto input tensor, then return back output tensor. * @param inData Input tensor. * @param outData Output tensor. - * @param elements OSD elements, \ref NVCVElement. + * @param elements OSD elements, \ref NVCVElements. */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVElements elements, cudaStream_t stream); /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - -private: - nvcv::cuda::osd::cuOSDContext_t m_context; -}; - -class BndBox : public CudaBaseOp -{ -public: - BndBox() = delete; - - BndBox(DataShape max_input_shape, DataShape max_output_shape); - - ~BndBox(); - - /** - * @brief Converts an image from one color space to another. + * @brief Draw BndBox elements onto input tensor, then return back output tensor. * @param inData Input tensor. * @param outData Output tensor. * @param boxes Bounding box rectangle, \ref NVCVBndBoxesI. 
*/ - ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVBndBoxesI bboxes, - cudaStream_t stream); + ErrorCode inferBox(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVBndBoxesI bboxes, + cudaStream_t stream); /** * @brief calculate the cpu/gpu buffer size needed by this operator @@ -2326,14 +2240,6 @@ class CvtColor : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVColorConversionCode code, cudaStream_t stream); - - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class WarpAffine : public CudaBaseOp @@ -2394,13 +2300,6 @@ class WarpPerspective : public CudaBaseOp ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const float *transMatrix, const int32_t flags, const NVCVBorderType borderMode, const float4 borderValue, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class WarpPerspectiveVarShape : public CudaBaseOp @@ -2502,12 +2401,6 @@ class CvtColorVarShape : public CudaBaseOp */ ErrorCode infer(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, NVCVColorConversionCode code, cudaStream_t stream); - - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param batch_size maximum input batch size - */ - size_t calBufferSize(int batch_size); }; class Composite : public CudaBaseOp @@ -2602,12 +2495,6 @@ class CompositeVarShape : public CudaBaseOp class PillowResize : public CudaBaseOp { public: - PillowResize() = delete; - - PillowResize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - - ~PillowResize(); - /** * @brief Resizes the input images. The function resize resizes the image down to or up to the specified size. * @param inputs gpu pointer, inputs[0] are batched input images, whose shape is input_shape and type is data_type. 
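// Illustrative sketch, not part of the patch: with the standalone BndBox operator folded into
// the legacy OSD operator, bounding boxes are now drawn through OSD::inferBox() using the
// signature shown above. The wrapper name drawBoxes and the pre-built tensor data / box list
// are assumptions; the error handling mirrors the ErrorCode/LOG_ERROR convention used in these files.
static nvcv::legacy::cuda_op::ErrorCode drawBoxes(nvcv::legacy::cuda_op::OSD &osd,
                                                  const nvcv::TensorDataStridedCuda &in,
                                                  const nvcv::TensorDataStridedCuda &out,
                                                  NVCVBndBoxesI boxes, cudaStream_t stream)
{
    // Same layout rules as infer(): kNHWC/kHWC tensors with 3 or 4 channels.
    auto err = osd.inferBox(in, out, boxes, stream);
    if (err != nvcv::legacy::cuda_op::ErrorCode::SUCCESS)
    {
        LOG_ERROR("OSD::inferBox failed, err - " << err);
    }
    return err;
}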
@@ -2623,29 +2510,15 @@ class PillowResize : public CudaBaseOp * */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream); + const NVCVInterpolationType interpolation, cudaStream_t stream, const Workspace &workspace); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - -private: - void *gpu_workspace; + NVCVWorkspaceRequirements getWorkspaceRequirements(DataShape max_input_shape, DataShape max_output_shape, + DataType max_data_type); }; class PillowResizeVarShape : public CudaBaseOp { public: - PillowResizeVarShape() = delete; - - PillowResizeVarShape(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - - ~PillowResizeVarShape(); - /** * @brief Resizes the input images. The function resize resizes the image down to or up to the specified size. * @param inputs gpu pointer, inputs[0] are batched input images, whose shape is input_shape and type is data_type. @@ -2664,19 +2537,10 @@ class PillowResizeVarShape : public CudaBaseOp * */ ErrorCode infer(const ImageBatchVarShape &inData, const ImageBatchVarShape &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream); + const NVCVInterpolationType interpolation, cudaStream_t stream, const Workspace &workspace); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - -private: - void *gpu_workspace = nullptr; - void *cpu_workspace = nullptr; + NVCVWorkspaceRequirements getWorkspaceRequirements(DataShape max_input_shape, DataShape max_output_shape, + DataType max_data_type); }; class Threshold : public CudaBaseOp diff --git a/src/cvcuda/priv/legacy/CvCudaOSD.hpp b/src/cvcuda/priv/legacy/CvCudaOSD.hpp index 0a33a6dc..4b3a8303 100644 --- a/src/cvcuda/priv/legacy/CvCudaOSD.hpp +++ b/src/cvcuda/priv/legacy/CvCudaOSD.hpp @@ -320,10 +320,6 @@ struct cuOSDContext std::unique_ptr> gpu_commands; std::unique_ptr> gpu_commands_offset; - // For OpBndBox only, to be deprecated. - std::vector> rect_commands; - std::unique_ptr> gpu_rect_commands; - std::vector> blur_commands; std::unique_ptr> gpu_blur_commands; diff --git a/src/cvcuda/priv/legacy/bnd_box.cu b/src/cvcuda/priv/legacy/bnd_box.cu deleted file mode 100644 index 6d68507a..00000000 --- a/src/cvcuda/priv/legacy/bnd_box.cu +++ /dev/null @@ -1,573 +0,0 @@ -/* Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * SPDX-License-Identifier: Apache-2.0 - * - * Copyright (C) 2021-2022, Bytedance Inc. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
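// Illustrative sketch of the new caller-managed workspace flow for the Pillow resize operators
// declared above (scratch memory is no longer cudaMalloc'd in their constructors). Only
// getWorkspaceRequirements(), infer(..., workspace) and the cudaMem.size / .data / .ready fields
// are taken from this patch; the allocation and event handling below, and the assumption that
// Workspace aliases NVCVWorkspace, are one possible way for a caller to satisfy them.
static void resizeWithWorkspace(nvcv::legacy::cuda_op::PillowResize &op,
                                const nvcv::TensorDataStridedCuda   &inData,
                                const nvcv::TensorDataStridedCuda   &outData,
                                DataShape maxIn, DataShape maxOut, DataType maxType,
                                cudaStream_t stream)
{
    NVCVWorkspaceRequirements req = op.getWorkspaceRequirements(maxIn, maxOut, maxType);

    NVCVWorkspace ws{}; // this operator only needs device scratch memory
    NVCV_CHECK_LOG(cudaMalloc(&ws.cudaMem.data, req.cudaMem.size));
    NVCV_CHECK_LOG(cudaEventCreateWithFlags(&ws.cudaMem.ready, cudaEventDisableTiming));

    // infer() waits on cudaMem.ready before touching the buffer and records it again once its
    // kernels are enqueued, so the same workspace can be reused or released safely afterwards.
    op.infer(inData, outData, NVCV_INTERP_LINEAR, stream, ws);
}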
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -#include "CvCudaLegacy.h" -#include "CvCudaLegacyHelpers.hpp" - -#include "CvCudaUtils.cuh" - -#include -#include -#include - -#include - -using namespace nvcv::legacy::cuda_op; -using namespace nvcv::legacy::helpers; -using namespace nvcv::cuda::osd; - -namespace nvcv::legacy::cuda_op { - -template -static __host__ __device__ uint8_t u8cast(_T value) -{ - return value < 0 ? 0 : (value > 255 ? 255 : value); -} - -// inbox_single_pixel: -// check if given coordinate is in box -// a --- d -// | | -// b --- c -static __device__ __forceinline__ bool inbox_single_pixel(float ix, float iy, float ax, float ay, float bx, float by, - float cx, float cy, float dx, float dy) -{ - return ((bx - ax) * (iy - ay) - (by - ay) * (ix - ax)) < 0 && ((cx - bx) * (iy - by) - (cy - by) * (ix - bx)) < 0 - && ((dx - cx) * (iy - cy) - (dy - cy) * (ix - cx)) < 0 && ((ax - dx) * (iy - dy) - (ay - dy) * (ix - dx)) < 0; -} - -static __device__ void blend_single_color(uchar4 &color, uint8_t &c0, uint8_t &c1, uint8_t &c2, uint8_t a) -{ - int foreground_alpha = a; - int background_alpha = color.w; - int blend_alpha = ((background_alpha * (255 - foreground_alpha)) >> 8) + foreground_alpha; - color.x = u8cast((((color.x * background_alpha * (255 - foreground_alpha)) >> 8) + (c0 * foreground_alpha)) - / blend_alpha); - color.y = u8cast((((color.y * background_alpha * (255 - foreground_alpha)) >> 8) + (c1 * foreground_alpha)) - / blend_alpha); - color.z = u8cast((((color.z * background_alpha * (255 - foreground_alpha)) >> 8) + (c2 * foreground_alpha)) - / blend_alpha); - color.w = blend_alpha; -} - -// render_rectangle_fill: -// render filled rectangle with border msaa4x interpolation off -static __device__ void render_rectangle_fill(int ix, int iy, RectangleCommand *p, uchar4 color[4]) -{ - if (inbox_single_pixel(ix, iy, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[0], p->c0, p->c1, p->c2, p->c3); - } - if (inbox_single_pixel(ix + 1, iy, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[1], p->c0, p->c1, p->c2, p->c3); - } - if (inbox_single_pixel(ix, iy + 1, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[2], p->c0, p->c1, p->c2, p->c3); - } - if (inbox_single_pixel(ix + 1, iy + 1, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[3], p->c0, p->c1, p->c2, p->c3); - } -} - -// render_rectangle_border: -// render hollow rectangle with border msaa4x interpolation off -static __device__ void render_rectangle_border(int ix, int iy, RectangleCommand *p, uchar4 color[4]) -{ - if (!inbox_single_pixel(ix, iy, p->ax2, p->ay2, p->bx2, p->by2, p->cx2, p->cy2, p->dx2, p->dy2) - && inbox_single_pixel(ix, iy, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[0], p->c0, p->c1, p->c2, p->c3); - } - if (!inbox_single_pixel(ix + 1, iy, p->ax2, p->ay2, p->bx2, p->by2, p->cx2, p->cy2, p->dx2, p->dy2) - && inbox_single_pixel(ix + 1, iy, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, 
p->dx1, p->dy1)) - { - blend_single_color(color[1], p->c0, p->c1, p->c2, p->c3); - } - if (!inbox_single_pixel(ix, iy + 1, p->ax2, p->ay2, p->bx2, p->by2, p->cx2, p->cy2, p->dx2, p->dy2) - && inbox_single_pixel(ix, iy + 1, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[2], p->c0, p->c1, p->c2, p->c3); - } - if (!inbox_single_pixel(ix + 1, iy + 1, p->ax2, p->ay2, p->bx2, p->by2, p->cx2, p->cy2, p->dx2, p->dy2) - && inbox_single_pixel(ix + 1, iy + 1, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[3], p->c0, p->c1, p->c2, p->c3); - } -} - -static __device__ void do_rectangle_woMSAA(RectangleCommand *cmd, int ix, int iy, uchar4 context_color[4]) -{ - if (cmd->thickness == -1) - { - render_rectangle_fill(ix, iy, cmd, context_color); - } - else - { - render_rectangle_border(ix, iy, cmd, context_color); - } -} - -template -static __device__ void blending_rgb_pixel(SrcWrapper src, DstWrapper dst, int x, int y, uchar4 plot_colors[4]) -{ - const int batch_idx = get_batch_idx(); - - for (int i = 0; i < 2; ++i) - { - T *in = src.ptr(batch_idx, y + i, x, 0); - T *out = dst.ptr(batch_idx, y + i, x, 0); - for (int j = 0; j < 2; ++j, in += 3, out += 3) - { - uchar4 &rcolor = plot_colors[i * 2 + j]; - int foreground_alpha = rcolor.w; - int background_alpha = 255; - int blend_alpha = ((background_alpha * (255 - foreground_alpha)) >> 8) + foreground_alpha; - out[0] - = u8cast((((in[0] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.x * foreground_alpha)) - / blend_alpha); - out[1] - = u8cast((((in[1] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.y * foreground_alpha)) - / blend_alpha); - out[2] - = u8cast((((in[2] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.z * foreground_alpha)) - / blend_alpha); - } - } -} - -template -static __device__ void blending_rgba_pixel(SrcWrapper src, DstWrapper dst, int x, int y, uchar4 plot_colors[4]) -{ - const int batch_idx = get_batch_idx(); - - for (int i = 0; i < 2; ++i) - { - T *in = src.ptr(batch_idx, y + i, x, 0); - T *out = dst.ptr(batch_idx, y + i, x, 0); - for (int j = 0; j < 2; ++j, in += 4, out += 4) - { - uchar4 &rcolor = plot_colors[i * 2 + j]; - int foreground_alpha = rcolor.w; - int background_alpha = in[3]; - int blend_alpha = ((background_alpha * (255 - foreground_alpha)) >> 8) + foreground_alpha; - out[0] - = u8cast((((in[0] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.x * foreground_alpha)) - / blend_alpha); - out[1] - = u8cast((((in[1] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.y * foreground_alpha)) - / blend_alpha); - out[2] - = u8cast((((in[2] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.z * foreground_alpha)) - / blend_alpha); - out[3] = blend_alpha; - } - } -} - -template -static __global__ void render_bndbox_rgb_womsaa_kernel(SrcWrapper src, DstWrapper dst, int bx, int by, - const RectangleCommand *commands, int num_command, int width, - int height, bool inplace) -{ - int ix = ((blockDim.x * blockIdx.x + threadIdx.x) << 1) + bx; - int iy = ((blockDim.y * blockIdx.y + threadIdx.y) << 1) + by; - if (ix < 0 || iy < 0 || ix >= width - 1 || iy >= height - 1) - return; - - uchar4 context_color[4] = {0}; - const int batch_idx = get_batch_idx(); - - for (int i = 0; i < num_command; ++i) - { - RectangleCommand pcommand = commands[i]; - if (pcommand.batch_index != batch_idx) - continue; - do_rectangle_woMSAA(&pcommand, ix, iy, context_color); - } - - if 
(context_color[0].w == 0 && context_color[1].w == 0 && context_color[2].w == 0 && context_color[3].w == 0) - { - if (inplace) - return; - *(uchar3 *)(dst.ptr(batch_idx, iy, ix, 0)) = *(uchar3 *)(src.ptr(batch_idx, iy, ix, 0)); - *(uchar3 *)(dst.ptr(batch_idx, iy, ix + 1, 0)) = *(uchar3 *)(src.ptr(batch_idx, iy, ix + 1, 0)); - *(uchar3 *)(dst.ptr(batch_idx, iy + 1, ix, 0)) = *(uchar3 *)(src.ptr(batch_idx, iy + 1, ix, 0)); - *(uchar3 *)(dst.ptr(batch_idx, iy + 1, ix + 1, 0)) = *(uchar3 *)(src.ptr(batch_idx, iy + 1, ix + 1, 0)); - return; - } - - blending_rgb_pixel(src, dst, ix, iy, context_color); -} - -template -static __global__ void render_bndbox_rgba_womsaa_kernel(SrcWrapper src, DstWrapper dst, int bx, int by, - const RectangleCommand *commands, int num_command, int width, - int height, bool inplace) -{ - int ix = ((blockDim.x * blockIdx.x + threadIdx.x) << 1) + bx; - int iy = ((blockDim.y * blockIdx.y + threadIdx.y) << 1) + by; - if (ix < 0 || iy < 0 || ix >= width - 1 || iy >= height - 1) - return; - - uchar4 context_color[4] = {0}; - const int batch_idx = get_batch_idx(); - - for (int i = 0; i < num_command; ++i) - { - RectangleCommand pcommand = commands[i]; - if (pcommand.batch_index != batch_idx) - continue; - do_rectangle_woMSAA(&pcommand, ix, iy, context_color); - } - - if (context_color[0].w == 0 && context_color[1].w == 0 && context_color[2].w == 0 && context_color[3].w == 0) - { - if (inplace) - return; - *(uchar4 *)(dst.ptr(batch_idx, iy, ix, 0)) = *(uchar4 *)(src.ptr(batch_idx, iy, ix, 0)); - *(uchar4 *)(dst.ptr(batch_idx, iy, ix + 1, 0)) = *(uchar4 *)(src.ptr(batch_idx, iy, ix + 1, 0)); - *(uchar4 *)(dst.ptr(batch_idx, iy + 1, ix, 0)) = *(uchar4 *)(src.ptr(batch_idx, iy + 1, ix, 0)); - *(uchar4 *)(dst.ptr(batch_idx, iy + 1, ix + 1, 0)) = *(uchar4 *)(src.ptr(batch_idx, iy + 1, ix + 1, 0)); - return; - } - - blending_rgba_pixel(src, dst, ix, iy, context_color); -} - -static ErrorCode cuosd_draw_rectangle(cuOSDContext_t context, int width, int height, NVCVBndBoxesI bboxes) -{ - for (int n = 0; n < bboxes.batch; n++) - { - auto numBoxes = bboxes.numBoxes[n]; - - for (int i = 0; i < numBoxes; i++) - { - auto bbox = bboxes.boxes[i]; - int left = max(min(bbox.box.x, width - 1), 0); - int top = max(min(bbox.box.y, height - 1), 0); - int right = max(min(left + bbox.box.width - 1, width - 1), 0); - int bottom = max(min(top + bbox.box.height - 1, height - 1), 0); - - if (left == right || top == bottom || bbox.box.width <= 0 || bbox.box.height <= 0) - { - LOG_DEBUG("Skipped bnd_box(" << bbox.box.x << ", " << bbox.box.y << ", " << bbox.box.width << ", " - << bbox.box.height << ") in image(" << width << ", " << height << ")"); - continue; - } - - if (bbox.borderColor.a == 0) - continue; - if (bbox.fillColor.a || bbox.thickness == -1) - { - if (bbox.thickness == -1) - { - bbox.fillColor = bbox.borderColor; - } - - auto cmd = std::make_shared(); - cmd->batch_index = n; - cmd->thickness = -1; - cmd->interpolation = false; - cmd->c0 = bbox.fillColor.r; - cmd->c1 = bbox.fillColor.g; - cmd->c2 = bbox.fillColor.b; - cmd->c3 = bbox.fillColor.a; - - // a d - // b c - cmd->ax1 = left; - cmd->ay1 = top; - cmd->dx1 = right; - cmd->dy1 = top; - cmd->cx1 = right; - cmd->cy1 = bottom; - cmd->bx1 = left; - cmd->by1 = bottom; - cmd->bounding_left = left; - cmd->bounding_right = right; - cmd->bounding_top = top; - cmd->bounding_bottom = bottom; - context->rect_commands.emplace_back(cmd); - } - if (bbox.thickness == -1) - continue; - - auto cmd = std::make_shared(); - cmd->batch_index = n; - 
cmd->thickness = bbox.thickness; - cmd->interpolation = false; - cmd->c0 = bbox.borderColor.r; - cmd->c1 = bbox.borderColor.g; - cmd->c2 = bbox.borderColor.b; - cmd->c3 = bbox.borderColor.a; - - float half_thickness = bbox.thickness / 2.0f; - cmd->ax2 = left + half_thickness; - cmd->ay2 = top + half_thickness; - cmd->dx2 = right - half_thickness; - cmd->dy2 = top + half_thickness; - cmd->cx2 = right - half_thickness; - cmd->cy2 = bottom - half_thickness; - cmd->bx2 = left + half_thickness; - cmd->by2 = bottom - half_thickness; - - // a d - // b c - cmd->ax1 = left - half_thickness; - cmd->ay1 = top - half_thickness; - cmd->dx1 = right + half_thickness; - cmd->dy1 = top - half_thickness; - cmd->cx1 = right + half_thickness; - cmd->cy1 = bottom + half_thickness; - cmd->bx1 = left - half_thickness; - cmd->by1 = bottom + half_thickness; - - int int_half = ceil(half_thickness); - cmd->bounding_left = left - int_half; - cmd->bounding_right = right + int_half; - cmd->bounding_top = top - int_half; - cmd->bounding_bottom = bottom + int_half; - context->rect_commands.emplace_back(cmd); - } - - bboxes.boxes = (NVCVBndBoxI *)((uint8_t *)bboxes.boxes + numBoxes * sizeof(NVCVBndBoxI)); - } - return ErrorCode::SUCCESS; -} - -static void cuosd_apply(cuOSDContext_t context, int width, int height, cudaStream_t stream) -{ - context->bounding_left = width; - context->bounding_top = height; - context->bounding_right = 0; - context->bounding_bottom = 0; - - for (int i = 0; i < (int)context->rect_commands.size(); ++i) - { - auto &cmd = context->rect_commands[i]; - context->bounding_left = min(context->bounding_left, cmd->bounding_left); - context->bounding_top = min(context->bounding_top, cmd->bounding_top); - context->bounding_right = max(context->bounding_right, cmd->bounding_right); - context->bounding_bottom = max(context->bounding_bottom, cmd->bounding_bottom); - } - - if (context->gpu_rect_commands == nullptr) - { - context->gpu_rect_commands.reset(new Memory()); - } - - context->gpu_rect_commands->alloc_or_resize_to(context->rect_commands.size()); - - for (int i = 0; i < (int)context->rect_commands.size(); ++i) - { - auto &cmd = context->rect_commands[i]; - memcpy((void *)(context->gpu_rect_commands->host() + i), cmd.get(), sizeof(RectangleCommand)); - } - - context->gpu_rect_commands->copy_host_to_device(stream); -} - -inline ErrorCode ApplyBndBox_RGB(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - cuOSDContext_t context, cudaStream_t stream) -{ - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); - NVCV_ASSERT(inAccess); - - cuda_op::DataType inDataType = helpers::GetLegacyDataType(inData.dtype()); - cuda_op::DataShape inputShape = helpers::GetLegacyDataShape(inAccess->infoShape()); - - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); - NVCV_ASSERT(outAccess); - - cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); - cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - - if (outDataType != inDataType) - { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); - return ErrorCode::INVALID_DATA_TYPE; - } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != inputShape.C || outputShape.C != 3) - { - LOG_ERROR("Invalid output shape " << outputShape); - return ErrorCode::INVALID_DATA_SHAPE; - } - - cuosd_apply(context, inputShape.W, inputShape.H, stream); - - 
dim3 blockSize(16, 8); - dim3 gridSize(divUp(int((inputShape.W + 1) / 2), (int)blockSize.x), - divUp(int((inputShape.H + 1) / 2), (int)blockSize.y), inputShape.N); - - auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); - - render_bndbox_rgb_womsaa_kernel<<>>( - src, dst, 0, 0, context->gpu_rect_commands ? context->gpu_rect_commands->device() : nullptr, - context->rect_commands.size(), inputShape.W, inputShape.H, inData.basePtr() == outData.basePtr()); - checkKernelErrors(); - - return ErrorCode::SUCCESS; -} - -inline ErrorCode ApplyBndBox_RGBA(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - cuOSDContext_t context, cudaStream_t stream) -{ - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); - NVCV_ASSERT(inAccess); - - cuda_op::DataType inDataType = helpers::GetLegacyDataType(inData.dtype()); - cuda_op::DataShape inputShape = helpers::GetLegacyDataShape(inAccess->infoShape()); - - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); - NVCV_ASSERT(outAccess); - - cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); - cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - - if (outDataType != inDataType) - { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); - return ErrorCode::INVALID_DATA_TYPE; - } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != inputShape.C || outputShape.C != 4) - { - LOG_ERROR("Invalid output shape " << outputShape); - return ErrorCode::INVALID_DATA_SHAPE; - } - - cuosd_apply(context, inputShape.W, inputShape.H, stream); - - dim3 blockSize(16, 8); - dim3 gridSize(divUp(int((inputShape.W + 1) / 2), (int)blockSize.x), - divUp(int((inputShape.H + 1) / 2), (int)blockSize.y), inputShape.N); - - auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); - - render_bndbox_rgba_womsaa_kernel<<>>( - src, dst, 0, 0, context->gpu_rect_commands ? 
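// Side note on the launch geometry used by these render kernels (the surviving box_blur/osd
// kernels use the same scheme): each thread owns a 2x2 pixel quad, so the grid covers
// ceil(W/2) x ceil(H/2) positions per image and the thread index is shifted left by one bit to
// get the quad's top-left pixel. Minimal stand-in kernel, not the actual renderer:
__global__ void perQuadKernel(int width, int height)
{
    int ix = (blockDim.x * blockIdx.x + threadIdx.x) << 1; // top-left x of this thread's quad
    int iy = (blockDim.y * blockIdx.y + threadIdx.y) << 1; // top-left y
    if (ix >= width - 1 || iy >= height - 1)
        return;
    // ...process (ix, iy), (ix + 1, iy), (ix, iy + 1), (ix + 1, iy + 1)...
}
// Matching launch, with divUp as used in this file:
//   dim3 block(16, 8);
//   dim3 grid(divUp((width + 1) / 2, (int)block.x), divUp((height + 1) / 2, (int)block.y), batch);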
context->gpu_rect_commands->device() : nullptr, - context->rect_commands.size(), inputShape.W, inputShape.H, inData.basePtr() == outData.basePtr()); - checkKernelErrors(); - - return ErrorCode::SUCCESS; -} - -BndBox::BndBox(DataShape max_input_shape, DataShape max_output_shape) - : CudaBaseOp(max_input_shape, max_output_shape) -{ - m_context = new cuOSDContext(); - if (m_context->gpu_rect_commands == nullptr) - { - m_context->gpu_rect_commands.reset(new Memory()); - } - m_context->gpu_rect_commands->alloc_or_resize_to(PREALLOC_CMD_NUM * sizeof(RectangleCommand)); -} - -BndBox::~BndBox() -{ - if (m_context) - { - m_context->rect_commands.clear(); - cuOSDContext *p = (cuOSDContext *)m_context; - delete p; - } -} - -size_t BndBox::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - -ErrorCode BndBox::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - NVCVBndBoxesI bboxes, cudaStream_t stream) -{ - cuda_op::DataFormat input_format = GetLegacyDataFormat(inData.layout()); - cuda_op::DataFormat output_format = GetLegacyDataFormat(outData.layout()); - - if (!(input_format == kNHWC || input_format == kHWC) || !(output_format == kNHWC || output_format == kHWC)) - { - LOG_ERROR("Invliad DataFormat both Input and Output must be kNHWC or kHWC"); - return ErrorCode::INVALID_DATA_FORMAT; - } - - if (inData.dtype() != outData.dtype()) - { - LOG_ERROR("Input and Output formats must be same input format =" << inData.dtype() - << " output format = " << outData.dtype()); - return ErrorCode::INVALID_DATA_FORMAT; - } - - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); - if (!inAccess) - { - return ErrorCode::INVALID_DATA_FORMAT; - } - - int batch = inAccess->numSamples(); - int channels = inAccess->numChannels(); - int rows = inAccess->numRows(); - int cols = inAccess->numCols(); - - if (channels > 4 || channels < 1) - { - LOG_ERROR("Invalid channel number ch = " << channels); - return ErrorCode::INVALID_DATA_SHAPE; - } - - if (bboxes.batch != batch) - { - LOG_ERROR("Invalid bboxes batch = " << bboxes.batch); - return ErrorCode::INVALID_DATA_SHAPE; - } - - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); - if (!outAccess) - { - return ErrorCode::INVALID_DATA_FORMAT; - } - - auto ret = cuosd_draw_rectangle(m_context, cols, rows, bboxes); - if (ret != ErrorCode::SUCCESS) - { - return ret; - } - - typedef ErrorCode (*func_t)(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - cuOSDContext_t context, cudaStream_t stream); - - static const func_t funcs[] = { - ApplyBndBox_RGB, - ApplyBndBox_RGBA, - }; - - int type_idx = channels - 3; - funcs[type_idx](inData, outData, m_context, stream); - m_context->rect_commands.clear(); // Clear the command buffer so next render does not contain previous boxes. 
- m_context->blur_commands.clear(); - return ErrorCode::SUCCESS; -} - -} // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/box_blur.cu b/src/cvcuda/priv/legacy/box_blur.cu index 02c26103..aa3637d4 100644 --- a/src/cvcuda/priv/legacy/box_blur.cu +++ b/src/cvcuda/priv/legacy/box_blur.cu @@ -23,6 +23,7 @@ #include "CvCudaUtils.cuh" +#include #include #include #include @@ -32,6 +33,7 @@ using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; using namespace nvcv::cuda::osd; +using namespace cvcuda::priv; namespace nvcv::legacy::cuda_op { @@ -327,15 +329,15 @@ inline ErrorCode ApplyBoxBlur_RGBA(const nvcv::TensorDataStridedCuda &inData, return ErrorCode::SUCCESS; } -static ErrorCode cuosd_draw_boxblur(cuOSDContext_t context, int width, int height, NVCVBlurBoxesI bboxes) +static ErrorCode cuosd_draw_boxblur(cuOSDContext_t context, int width, int height, NVCVBlurBoxesImpl *bboxes) { - for (int n = 0; n < bboxes.batch; n++) + for (int n = 0; n < bboxes->batch(); n++) { - auto numBoxes = bboxes.numBoxes[n]; + auto numBoxes = bboxes->numBoxesAt(n); for (int i = 0; i < numBoxes; i++) { - auto bbox = bboxes.boxes[i]; + auto bbox = bboxes->boxAt(n, i); int left = max(min(bbox.box.x, width - 1), 0); int top = max(min(bbox.box.y, height - 1), 0); int right = max(min(left + bbox.box.width - 1, width - 1), 0); @@ -367,8 +369,6 @@ static ErrorCode cuosd_draw_boxblur(cuOSDContext_t context, int width, int heigh cmd->bounding_bottom = bottom; context->blur_commands.emplace_back(cmd); } - - bboxes.boxes = (NVCVBlurBoxI *)((uint8_t *)bboxes.boxes + numBoxes * sizeof(NVCVBlurBoxI)); } return ErrorCode::SUCCESS; } @@ -394,11 +394,6 @@ BoxBlur::~BoxBlur() } } -size_t BoxBlur::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode BoxBlur::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, NVCVBlurBoxesI bboxes, cudaStream_t stream) { @@ -435,9 +430,10 @@ ErrorCode BoxBlur::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv:: return ErrorCode::INVALID_DATA_SHAPE; } - if (bboxes.batch != batch) + NVCVBlurBoxesImpl *_bboxes = (NVCVBlurBoxesImpl *)bboxes; + if (_bboxes->batch() != batch) { - LOG_ERROR("Invalid bboxes batch = " << bboxes.batch); + LOG_ERROR("Invalid bboxes batch = " << _bboxes->batch()); return ErrorCode::INVALID_DATA_SHAPE; } @@ -447,7 +443,7 @@ ErrorCode BoxBlur::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv:: return ErrorCode::INVALID_DATA_FORMAT; } - auto ret = cuosd_draw_boxblur(m_context, cols, rows, bboxes); + auto ret = cuosd_draw_boxblur(m_context, cols, rows, _bboxes); if (ret != ErrorCode::SUCCESS) { return ret; @@ -464,7 +460,6 @@ ErrorCode BoxBlur::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv:: int type_idx = channels - 3; funcs[type_idx](inData, outData, m_context, stream); m_context->blur_commands.clear(); // Clear the command buffer so next render does not contain previous boxes. 
- m_context->rect_commands.clear(); return ErrorCode::SUCCESS; } diff --git a/src/cvcuda/priv/legacy/center_crop.cu b/src/cvcuda/priv/legacy/center_crop.cu index c7245dfe..aa8e6542 100644 --- a/src/cvcuda/priv/legacy/center_crop.cu +++ b/src/cvcuda/priv/legacy/center_crop.cu @@ -70,11 +70,6 @@ void center_crop(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDa namespace nvcv::legacy::cuda_op { -size_t CenterCrop::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode CenterCrop::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int crop_rows, int crop_columns, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/convert_to.cu b/src/cvcuda/priv/legacy/convert_to.cu index 9d68ba76..5e510d04 100644 --- a/src/cvcuda/priv/legacy/convert_to.cu +++ b/src/cvcuda/priv/legacy/convert_to.cu @@ -120,11 +120,6 @@ void convertToScale(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tenso namespace nvcv::legacy::cuda_op { -size_t ConvertTo::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode ConvertTo::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const double alpha, const double beta, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/custom_crop.cu b/src/cvcuda/priv/legacy/custom_crop.cu index fe725a68..eea4ebfe 100644 --- a/src/cvcuda/priv/legacy/custom_crop.cu +++ b/src/cvcuda/priv/legacy/custom_crop.cu @@ -64,11 +64,6 @@ void customCrop(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDat namespace nvcv::legacy::cuda_op { -size_t CustomCrop::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode CustomCrop::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVRectI roi, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/cvt_color.cu b/src/cvcuda/priv/legacy/cvt_color.cu index c3bd307e..8b794c63 100644 --- a/src/cvcuda/priv/legacy/cvt_color.cu +++ b/src/cvcuda/priv/legacy/cvt_color.cu @@ -1494,11 +1494,6 @@ inline ErrorCode BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const Tens return ErrorCode::SUCCESS; } -size_t CvtColor::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode CvtColor::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVColorConversionCode code, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu index 8dd369de..2dc01bbd 100644 --- a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu +++ b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu @@ -79,7 +79,7 @@ static constexpr int ITUR_BT_601_CBV = -74448; namespace nvcv::legacy::cuda_op { -__device__ inline bool checkShapeFromYUV420(int rows, int cols, NVCVColorConversionCode code) +inline __device__ bool checkShapeFromYUV420(int rows, int cols, NVCVColorConversionCode code) { int valid_row = 1, valid_col = 1; switch (code) @@ -210,7 +210,8 @@ __global__ void bgr_to_gray_float_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, T g = *src.ptr(batch_idx, dst_y, dst_x, 1); T r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - T gray = (T)(b * B2YF + g * G2YF + r * R2YF) * dst.ptr(batch_idx, dst_y, dst_x, 0) = gray; + T gray = (T)(b * B2YF + g * G2YF + r * R2YF); + *dst.ptr(batch_idx, dst_y, dst_x, 0) = gray; } template 
@@ -400,7 +401,7 @@ __global__ void bgr_to_hsv_float_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, c *dst.ptr(batch_idx, dst_y, dst_x, 2) = v; } -__device__ inline void HSV2RGB_native_var_shape(float h, float s, float v, float &b, float &g, float &r, +inline __device__ void HSV2RGB_native_var_shape(float h, float s, float v, float &b, float &g, float &r, const float hscale) { if (s == 0) @@ -980,7 +981,7 @@ inline ErrorCode BGR_to_GRAY(const ImageBatchVarShapeDataStridedCuda &inData, { cuda::ImageBatchVarShapeWrapNHWC src_ptr(inData, channels); cuda::ImageBatchVarShapeWrapNHWC dst_ptr(outData, dcn); - bgr_to_gray_char_nhwc<<>>(src_ptr, dst_ptr, bidx); + bgr_to_gray_float_nhwc<<>>(src_ptr, dst_ptr, bidx); checkKernelErrors(); } break; @@ -1596,11 +1597,6 @@ inline ErrorCode BGR_to_YUV420xp(const ImageBatchVarShapeDataStridedCuda &inData return ErrorCode::SUCCESS; } -size_t CvtColorVarShape::calBufferSize(int batch_size) -{ - return 0; -} - ErrorCode CvtColorVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, NVCVColorConversionCode code, cudaStream_t stream) diff --git a/src/cvcuda/priv/legacy/filter.cu b/src/cvcuda/priv/legacy/filter.cu index 2c84261a..105f9260 100644 --- a/src/cvcuda/priv/legacy/filter.cu +++ b/src/cvcuda/priv/legacy/filter.cu @@ -131,11 +131,6 @@ constexpr cuda::math::Vector kLaplacianKernel3{ // clang-format on -size_t Laplacian::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode Laplacian::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int ksize, float scale, NVCVBorderType borderMode, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/find_contours.cu b/src/cvcuda/priv/legacy/find_contours.cu index 64058378..abcb798c 100644 --- a/src/cvcuda/priv/legacy/find_contours.cu +++ b/src/cvcuda/priv/legacy/find_contours.cu @@ -923,10 +923,11 @@ __global__ void flattenContours(IndexType *dConnectList, CountType *dNodeCount, + block.group_index().y * grid.group_dim().x + block.group_index().x; // Calculate block tile dimensions and total number of iterations needed. - auto contourTile = util::DivUp(FindContours::MAX_NUM_CONTOURS, warp.meta_group_size()); - auto neededThreads = warp.size() * FindContours::MAX_NUM_CONTOURS * batchSize; - auto neededBlocks = (neededThreads + block.size() - 1) / block.size(); - auto numSteps = (neededBlocks + gridBlocks - 1) / gridBlocks; + auto contourTile = util::DivUp(FindContours::MAX_NUM_CONTOURS, warp.meta_group_size()); + auto neededThreads = warp.size() * FindContours::MAX_NUM_CONTOURS * batchSize; + auto neededBlocks = (neededThreads + block.size() - 1) / block.size(); + auto numStepsBatchSize = ((batchSize * contourTile - blockRank) + gridBlocks - 1) / gridBlocks; + auto numSteps = max((neededBlocks + gridBlocks - 1) / gridBlocks, numStepsBatchSize); // Calculate the thread's block dimensions and its position within the block. 
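// Worked example for the grayscale fixes above (the float branch now dispatches to the float
// kernel, and the garbled assignment is split into two statements): gray is a weighted sum of
// the channels using the standard BT.601 luma weights, which is what constants named
// B2YF/G2YF/R2YF conventionally encode. The helper name and constants here are illustrative.
__host__ __device__ inline float bgrToGray(float b, float g, float r)
{
    constexpr float kB2Y = 0.114f, kG2Y = 0.587f, kR2Y = 0.299f;
    return b * kB2Y + g * kG2Y + r * kR2Y; // e.g. (1,1,1) -> 1.0f, pure blue -> 0.114f
}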
CoordType blockDims{warp.size(), warp.meta_group_size(), 1}; diff --git a/src/cvcuda/priv/legacy/flip.cu b/src/cvcuda/priv/legacy/flip.cu index aa6e7a63..eaba1ccf 100644 --- a/src/cvcuda/priv/legacy/flip.cu +++ b/src/cvcuda/priv/legacy/flip.cu @@ -115,11 +115,6 @@ void flip(const TensorDataStridedCuda &input, const TensorDataStridedCuda &outpu #endif // CUDA_DEBUG_LOG } -size_t Flip::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode Flip::infer(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, const int32_t flipCode, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu b/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu index db4cb400..595e0672 100644 --- a/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu +++ b/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu @@ -86,11 +86,6 @@ void flip(const ImageBatchVarShapeDataStridedCuda &input, const ImageBatchVarSha #endif // CUDA_DEBUG_LOG } -size_t FlipOrCopyVarShape::calBufferSize(int maxBatchSize) -{ - return (sizeof(void *) * 2 + sizeof(int) * 3) * maxBatchSize; -} - ErrorCode FlipOrCopyVarShape::infer(const ImageBatchVarShapeDataStridedCuda &input, const ImageBatchVarShapeDataStridedCuda &output, const TensorDataStridedCuda &flipCode, cudaStream_t stream) diff --git a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu index 9df8d9e8..b9b4b4d3 100644 --- a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu +++ b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu @@ -51,7 +51,7 @@ __global__ void hist_kernel(const SrcWrapper src, DstWrapper histogram, int chan { for (int ch = 0; ch < channels; ch++) { - int4 coordImg{src_x, src_y, batch_idx, ch}; + int4 coordImg{batch_idx, src_y, src_x, ch}; uchar out = src[coordImg]; int idx = out + (256 * ch); atomicAdd(&shist[idx], 1); @@ -160,7 +160,7 @@ __global__ void lookup(const SrcWrapper src, DstWrapper dst, CdfWrapper cdf, int for (int ch = 0; ch < channels; ch++) { offset = 256 * ch; - int4 coordImg{src_x, src_y, batch_idx, ch}; + int4 coordImg{batch_idx, src_y, src_x, ch}; int2 coordHisto{src[coordImg] + offset, batch_idx}; dst[coordImg] = nvcv::cuda::SaturateCast((temp[src[coordImg] + offset])); } diff --git a/src/cvcuda/priv/legacy/median_blur.cu b/src/cvcuda/priv/legacy/median_blur.cu index 1db91fe7..3f05eeb1 100644 --- a/src/cvcuda/priv/legacy/median_blur.cu +++ b/src/cvcuda/priv/legacy/median_blur.cu @@ -363,11 +363,6 @@ void median(const nvcv::TensorDataAccessStridedImagePlanar &inData, namespace nvcv::legacy::cuda_op { -size_t MedianBlur::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode MedianBlur::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const nvcv::Size2D ksize, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/min_area_rect.cu b/src/cvcuda/priv/legacy/min_area_rect.cu index de01c690..384c011c 100644 --- a/src/cvcuda/priv/legacy/min_area_rect.cu +++ b/src/cvcuda/priv/legacy/min_area_rect.cu @@ -230,8 +230,7 @@ void minAreaRect(const TensorDataStridedCuda &inData, void *rotatedPointsDev, } MinAreaRect::MinAreaRect(DataShape max_input_shape, DataShape max_output_shape, int maxContourNum) - : CudaBaseOp(max_input_shape, max_output_shape) - , mMaxContourNum(maxContourNum) + : mMaxContourNum(maxContourNum) { NVCV_CHECK_THROW(cudaMalloc(&mRotateCoeffsBufDev, _MAX_ROTATE_DEGREES * 2 * sizeof(float))); 
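// Small sketch of the coordinate order the histogram-equalization fix above depends on: the 4D
// coordinate handed to the tensor wrappers in hist_kernel/lookup is consumed as
// (sample, row, column, channel), so it must be built in that order rather than
// (x, y, batch, channel). The helper name is illustrative.
__device__ inline int4 makeSampleRowColChannelCoord(int sample, int row, int col, int channel)
{
    return int4{sample, row, col, channel}; // matches the corrected {batch_idx, src_y, src_x, ch}
}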
NVCV_CHECK_THROW( diff --git a/src/cvcuda/priv/legacy/normalize.cu b/src/cvcuda/priv/legacy/normalize.cu index 3613f0fb..60eaf3cd 100644 --- a/src/cvcuda/priv/legacy/normalize.cu +++ b/src/cvcuda/priv/legacy/normalize.cu @@ -258,11 +258,6 @@ void Normalize::checkParamShape(DataShape input_shape, DataShape param_shape) NVCV_ASSERT(param_shape.W == input_shape.W || param_shape.W == 1); } -size_t Normalize::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode Normalize::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &baseData, const nvcv::TensorDataStridedCuda &scaleData, const nvcv::TensorDataStridedCuda &outData, const float global_scale, const float shift, const float epsilon, const uint32_t flags, diff --git a/src/cvcuda/priv/legacy/osd.cu b/src/cvcuda/priv/legacy/osd.cu index cf3221f0..f979e688 100644 --- a/src/cvcuda/priv/legacy/osd.cu +++ b/src/cvcuda/priv/legacy/osd.cu @@ -23,6 +23,7 @@ #include "CvCudaUtils.cuh" +#include #include #include #include @@ -35,6 +36,7 @@ using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; using namespace nvcv::cuda::osd; +using namespace cvcuda::priv; namespace nvcv::legacy::cuda_op { @@ -1358,7 +1360,7 @@ static ErrorCode cuosd_draw_rectangle(cuOSDContext_t context, int batch_idx, int } static ErrorCode cuosd_draw_segmentmask(cuOSDContext_t context, int batch_idx, int width, int height, - NVCVSegment segment) + const NVCVSegment &segment) { int left = segment.box.x; int top = segment.box.y; @@ -1508,7 +1510,7 @@ static ErrorCode cuosd_draw_line(cuOSDContext_t context, int batch_idx, NVCVLine return ErrorCode::SUCCESS; } -static ErrorCode cuosd_draw_polyline(cuOSDContext_t context, int batch_idx, NVCVPolyLine pl) +static ErrorCode cuosd_draw_polyline(cuOSDContext_t context, int batch_idx, const NVCVPolyLine &pl) { if (pl.numPoints < 2) return ErrorCode::INVALID_PARAMETER; @@ -1722,16 +1724,17 @@ static ErrorCode cuosd_draw_clock(cuOSDContext_t context, int batch_idx, NVCVClo return ErrorCode::SUCCESS; } -static ErrorCode cuosd_draw_elements(cuOSDContext_t context, int width, int height, NVCVElements ctx) +static ErrorCode cuosd_draw_elements(cuOSDContext_t context, int width, int height, NVCVElementsImpl *ctx) { - for (int n = 0; n < ctx.batch; n++) + for (int n = 0; n < ctx->batch(); n++) { - auto numElements = ctx.numElements[n]; + auto numElements = ctx->numElementsAt(n); for (int i = 0; i < numElements; i++) { - auto type = ctx.elements[i].type; - auto data = ctx.elements[i].data; + auto element = ctx->elementAt(n, i); + auto type = element->type(); + auto data = element->ptr(); switch (type) { case NVCVOSDType::NVCV_OSD_NONE: @@ -1792,7 +1795,21 @@ static ErrorCode cuosd_draw_elements(cuOSDContext_t context, int width, int heig break; } } - ctx.elements = (NVCVElement *)((unsigned char *)ctx.elements + numElements * sizeof(NVCVElement)); + } + return ErrorCode::SUCCESS; +} + +static ErrorCode cuosd_draw_bndbox(cuOSDContext_t context, int width, int height, NVCVBndBoxesImpl *bboxes) +{ + for (int n = 0; n < bboxes->batch(); n++) + { + auto numBoxes = bboxes->numBoxesAt(n); + + for (int i = 0; i < numBoxes; i++) + { + auto bbox = bboxes->boxAt(n, i); + cuosd_draw_rectangle(context, n, width, height, bbox); + } } return ErrorCode::SUCCESS; } @@ -1812,11 +1829,6 @@ OSD::~OSD() } } -size_t OSD::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode 
OSD::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, NVCVElements elements, cudaStream_t stream) { @@ -1853,9 +1865,10 @@ ErrorCode OSD::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tens return ErrorCode::INVALID_DATA_SHAPE; } - if (elements.batch != batch) + NVCVElementsImpl *_elements = (NVCVElementsImpl *)elements; + if (_elements->batch() != batch) { - LOG_ERROR("Invalid elements batch = " << elements.batch); + LOG_ERROR("Invalid elements batch = " << _elements->batch()); return ErrorCode::INVALID_DATA_SHAPE; } @@ -1884,7 +1897,7 @@ ErrorCode OSD::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tens return ErrorCode::INVALID_DATA_SHAPE; } - auto ret = cuosd_draw_elements(m_context, cols, rows, elements); + auto ret = cuosd_draw_elements(m_context, cols, rows, _elements); if (ret != ErrorCode::SUCCESS) { LOG_ERROR("cuosd_draw_elements failed, ret - " << ret); @@ -1911,4 +1924,90 @@ ErrorCode OSD::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tens return ErrorCode::SUCCESS; } +ErrorCode OSD::inferBox(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + NVCVBndBoxesI bboxes, cudaStream_t stream) +{ + cuda_op::DataFormat input_format = GetLegacyDataFormat(inData.layout()); + cuda_op::DataFormat output_format = GetLegacyDataFormat(outData.layout()); + + if (!(input_format == kNHWC || input_format == kHWC) || !(output_format == kNHWC || output_format == kHWC)) + { + LOG_ERROR("Invliad DataFormat both Input and Output must be kNHWC or kHWC"); + return ErrorCode::INVALID_DATA_FORMAT; + } + + if (inData.dtype() != outData.dtype()) + { + LOG_ERROR("Input and Output formats must be same input format =" << inData.dtype() + << " output format = " << outData.dtype()); + return ErrorCode::INVALID_DATA_FORMAT; + } + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); + if (!inAccess) + { + return ErrorCode::INVALID_DATA_FORMAT; + } + + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); + if (!outAccess) + { + return ErrorCode::INVALID_DATA_FORMAT; + } + + cuda_op::DataShape inputShape = helpers::GetLegacyDataShape(inAccess->infoShape()); + cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); + + if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N + || outputShape.C != inputShape.C) + { + LOG_ERROR("Invalid input/output shape " << inputShape << "/" << outputShape); + return ErrorCode::INVALID_DATA_SHAPE; + } + + int batch = inAccess->numSamples(); + int channels = inAccess->numChannels(); + int rows = inAccess->numRows(); + int cols = inAccess->numCols(); + + if (channels > 4 || channels < 1) + { + LOG_ERROR("Invalid channel number ch = " << channels); + return ErrorCode::INVALID_DATA_SHAPE; + } + + NVCVBndBoxesImpl *_bboxes = (NVCVBndBoxesImpl *)bboxes; + if (_bboxes->batch() != batch) + { + LOG_ERROR("Invalid bboxes batch = " << _bboxes->batch()); + return ErrorCode::INVALID_DATA_SHAPE; + } + + auto ret = cuosd_draw_bndbox(m_context, cols, rows, _bboxes); + if (ret != ErrorCode::SUCCESS) + { + LOG_ERROR("cuosd_draw_bndbox failed, ret - " << ret); + return ret; + } + + auto format = cuOSDImageFormat::RGBA; + if (inputShape.C == 3) + format = cuOSDImageFormat::RGB; + + cuosd_apply(m_context, inputShape.W, inputShape.H, format, stream); + + auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); + auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); + 
bool inplace = inData.basePtr() == outData.basePtr(); + + cuosd_launch(m_context, src, dst, inputShape.W, inputShape.C * inputShape.W, inputShape.H, format, inplace, + inputShape.N, stream); + + checkKernelErrors(); + + cuosd_clear(m_context); + + return ErrorCode::SUCCESS; +} + } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/pad_and_stack.cu b/src/cvcuda/priv/legacy/pad_and_stack.cu index f40bc0b2..623dd4a3 100644 --- a/src/cvcuda/priv/legacy/pad_and_stack.cu +++ b/src/cvcuda/priv/legacy/pad_and_stack.cu @@ -81,11 +81,6 @@ void padAndStack(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDa funcs[borderMode](inData, outData, top, left, borderValue, stream); } -size_t PadAndStack::calBufferSize(int batch_size) -{ - return 0; -} - ErrorCode PadAndStack::infer(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, const NVCVBorderType borderMode, const float borderValue, cudaStream_t stream) diff --git a/src/cvcuda/priv/legacy/pillow_resize.cu b/src/cvcuda/priv/legacy/pillow_resize.cu index 4a4cd9b3..fcf583e5 100644 --- a/src/cvcuda/priv/legacy/pillow_resize.cu +++ b/src/cvcuda/priv/legacy/pillow_resize.cu @@ -368,8 +368,8 @@ void pillow_resize_filter(const TensorDataAccessStridedImagePlanar &inData, } } -PillowResize::PillowResize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) - : CudaBaseOp(max_input_shape, max_output_shape) +WorkspaceRequirements PillowResize::getWorkspaceRequirements(DataShape max_input_shape, DataShape max_output_shape, + DataType max_data_type) { int max_support = 1; //3 size_t size @@ -381,35 +381,22 @@ PillowResize::PillowResize(DataShape max_input_shape, DataShape max_output_shape * (((1.0 * max_input_shape.W / max_output_shape.W + 1) * max_support * 2 + 1) * sizeof(work_type) + 2 * sizeof(int))) + max_input_shape.N * max_input_shape.C * max_input_shape.H * max_output_shape.W * DataSize(max_data_type); - NVCV_CHECK_LOG(cudaMalloc(&gpu_workspace, size)); -} - -PillowResize::~PillowResize() -{ - NVCV_CHECK_LOG(cudaFree(gpu_workspace)); -} - -size_t PillowResize::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - int max_support = 1; //3 - size_t size - = std::ceil( - max_output_shape.H - * (((1.0 * max_input_shape.H / max_output_shape.H + 1) * max_support * 2 + 1) * sizeof(work_type) - + 2 * sizeof(int)) - + max_output_shape.W - * (((1.0 * max_input_shape.W / max_output_shape.W + 1) * max_support * 2 + 1) * sizeof(work_type) - + 2 * sizeof(int))) - + max_input_shape.N * max_input_shape.C * max_input_shape.H * max_output_shape.W * DataSize(max_data_type); - return size; + WorkspaceRequirements req{}; + req.cudaMem = {size, 256}; + return req; } ErrorCode PillowResize::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream) + const NVCVInterpolationType interpolation, cudaStream_t stream, const Workspace &ws) { DataFormat format = GetLegacyDataFormat(inData.layout()); DataFormat output_format = GetLegacyDataFormat(outData.layout()); + if (ws.cudaMem.ready != nullptr) + checkCudaErrors(cudaStreamWaitEvent(stream, ws.cudaMem.ready)); + + void *gpu_workspace = ws.cudaMem.data; + if (format != output_format) { LOG_ERROR("Invalid DataFormat between input (" << format << ") and output (" << output_format << ")"); @@ -467,6 +454,10 @@ ErrorCode PillowResize::infer(const 
TensorDataStridedCuda &inData, const TensorD return ErrorCode::INVALID_PARAMETER; break; } + + if (ws.cudaMem.ready != nullptr) + checkCudaErrors(cudaEventRecord(ws.cudaMem.ready, stream)); + return ErrorCode::SUCCESS; } diff --git a/src/cvcuda/priv/legacy/pillow_resize.h b/src/cvcuda/priv/legacy/pillow_resize.h index 294018ed..37429a99 100644 --- a/src/cvcuda/priv/legacy/pillow_resize.h +++ b/src/cvcuda/priv/legacy/pillow_resize.h @@ -29,8 +29,9 @@ using namespace nvcv; using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; -#define work_type float -#define M_PI 3.14159265358979323846 /* pi */ +using work_type = float; + +#define M_PI 3.14159265358979323846 /* pi */ namespace nvcv::legacy::cuda_op { diff --git a/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu b/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu index f2ba65b8..e95cc20e 100644 --- a/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu +++ b/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu @@ -275,9 +275,18 @@ __global__ void vertical_pass_var_shape(const Ptr2dNHWC src, Ptr2dVarShapeNH template void pillow_resize_var_shape(const ImageBatchVarShape &inDataBase, const ImageBatchVarShape &outDataBase, - void *gpu_workspace, void *cpu_workspace, bool normalize_coeff, work_type init_buffer, - bool round_up, cudaStream_t stream) + const Workspace &ws, bool normalize_coeff, work_type init_buffer, bool round_up, + cudaStream_t stream) { + if (ws.hostMem.ready != nullptr) + checkCudaErrors(cudaEventSynchronize(ws.hostMem.ready)); + + if (ws.cudaMem.ready != nullptr) + checkCudaErrors(cudaStreamWaitEvent(stream, ws.cudaMem.ready)); + + void *cpu_workspace = ws.hostMem.data; + void *gpu_workspace = ws.cudaMem.data; + auto inDataPtr = inDataBase.exportData(stream); if (!inDataPtr) { @@ -419,6 +428,9 @@ void pillow_resize_var_shape(const ImageBatchVarShape &inDataBase, const ImageBa checkCudaErrors(cudaMemcpyAsync((void *)gpu_workspace, (void *)cpu_workspace, current_buffer_size, cudaMemcpyHostToDevice, stream)); + if (ws.hostMem.ready != nullptr) + checkCudaErrors(cudaEventRecord(ws.hostMem.ready, stream)); + Ptr2dVarShapeNHWC src_ptr(inData); Ptr2dVarShapeNHWC dst_ptr(outData); Ptr2dNHWC ptr_h_out(batch, max_input_height, max_width, channels, (work_type *)hori_gpu_data); @@ -479,39 +491,37 @@ void pillow_resize_var_shape(const ImageBatchVarShape &inDataBase, const ImageBa init_buffer, round_up, hv_use_share_mem); checkKernelErrors(); + + if (ws.cudaMem.ready != nullptr) + checkCudaErrors(cudaEventRecord(ws.cudaMem.ready, stream)); } } // namespace template void pillow_resize_filter_var_shape(const ImageBatchVarShape &inData, const ImageBatchVarShape &outData, - void *gpu_workspace, void *cpu_workspace, NVCVInterpolationType interpolation, - cudaStream_t stream) + const Workspace &ws, NVCVInterpolationType interpolation, cudaStream_t stream) { DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); switch (data_type) { case kCV_8U: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., false, - stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., false, stream); break; case kCV_8S: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., true, - stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., true, stream); break; case kCV_16U: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., false, - stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., false, stream); break; 
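// Minimal sketch of the workspace event protocol the Pillow resize hunks above follow
// (stageParams/launchKernels are stand-ins): wait on each buffer's `ready` event before reusing
// that buffer, and record the event again once the new work touching it has been enqueued.
static void useSharedWorkspace(const NVCVWorkspace &ws, size_t bytes, cudaStream_t stream)
{
    // Host buffer: the CPU is about to overwrite it, so block until the previous consumer
    // (an async H2D copy) is done with it.
    if (ws.hostMem.ready != nullptr)
        checkCudaErrors(cudaEventSynchronize(ws.hostMem.ready));
    // stageParams(ws.hostMem.data);

    // Device buffer: make this stream wait for the previous user's enqueued work.
    if (ws.cudaMem.ready != nullptr)
        checkCudaErrors(cudaStreamWaitEvent(stream, ws.cudaMem.ready));

    checkCudaErrors(cudaMemcpyAsync(ws.cudaMem.data, ws.hostMem.data, bytes,
                                    cudaMemcpyHostToDevice, stream));

    // The host buffer may be reused as soon as the copy has drained it.
    if (ws.hostMem.ready != nullptr)
        checkCudaErrors(cudaEventRecord(ws.hostMem.ready, stream));

    // launchKernels(ws.cudaMem.data, stream);

    // The device buffer becomes free for the next user once this point in the stream is reached.
    if (ws.cudaMem.ready != nullptr)
        checkCudaErrors(cudaEventRecord(ws.cudaMem.ready, stream));
}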
case kCV_16S: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., true, - stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., true, stream); break; case kCV_32S: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., true, stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., true, stream); break; case kCV_32F: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., false, stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., false, stream); break; case kCV_64F: default: @@ -519,40 +529,13 @@ void pillow_resize_filter_var_shape(const ImageBatchVarShape &inData, const Imag } } -PillowResizeVarShape::PillowResizeVarShape(DataShape max_input_shape, DataShape max_output_shape, - DataType max_data_type) - : CudaBaseOp(max_input_shape, max_output_shape) +WorkspaceRequirements PillowResizeVarShape::getWorkspaceRequirements(DataShape max_input_shape, + DataShape max_output_shape, DataType max_data_type) { - int max_support = 1; //3 - size_t size = std::ceil( - max_output_shape.H - * (((1.0 * max_input_shape.H / max_output_shape.H + 1) * max_support * 2 + 1) * sizeof(work_type) - + 2 * sizeof(int)) - + max_output_shape.W - * (((1.0 * max_input_shape.W / max_output_shape.W + 1) * max_support * 2 + 1) * sizeof(work_type) - + 2 * sizeof(int))); - size_t buffer_size = (sizeof(void *) * 3 + sizeof(int) * 12 + sizeof(work_type) * 6 + size) * max_input_shape.N; - buffer_size += max_input_shape.N * max_input_shape.C * max_input_shape.H * max_output_shape.W * sizeof(float); + constexpr size_t kDefaultDeviceAlignment = 256; - NVCV_CHECK_LOG(cudaMalloc(&gpu_workspace, buffer_size)); + WorkspaceRequirements req{}; - cpu_workspace = malloc(buffer_size); - if (!cpu_workspace) - { - LOG_ERROR("Memory allocation error of size: " << buffer_size); - throw std::runtime_error("Memory allocation error!"); - } -} - -PillowResizeVarShape::~PillowResizeVarShape() -{ - NVCV_CHECK_LOG(cudaFree(gpu_workspace)); - free(cpu_workspace); -} - -size_t PillowResizeVarShape::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, - DataType max_data_type) -{ int max_support = 1; //3 size_t size = std::ceil( max_output_shape.H @@ -561,15 +544,24 @@ size_t PillowResizeVarShape::calBufferSize(DataShape max_input_shape, DataShape + max_output_shape.W * (((1.0 * max_input_shape.W / max_output_shape.W + 1) * max_support * 2 + 1) * sizeof(work_type) + 2 * sizeof(int))); + size_t buffer_size = (sizeof(void *) * 3 + sizeof(int) * 12 + sizeof(work_type) * 6 + size) * max_input_shape.N; + + req.hostMem.size = buffer_size; + req.hostMem.alignment = alignof(std::max_align_t); + buffer_size += max_input_shape.N * max_input_shape.C * max_input_shape.H * max_output_shape.W * sizeof(float); - return buffer_size; + req.cudaMem.size = buffer_size; + req.cudaMem.alignment = kDefaultDeviceAlignment; + + return req; } ErrorCode PillowResizeVarShape::infer(const nvcv::ImageBatchVarShape &inDataBase, const nvcv::ImageBatchVarShape &outDataBase, - const NVCVInterpolationType interpolation, cudaStream_t stream) + const NVCVInterpolationType interpolation, cudaStream_t stream, + const NVCVWorkspace &ws) { if (!inDataBase.uniqueFormat() || !outDataBase.uniqueFormat()) { @@ -610,24 +602,19 @@ ErrorCode PillowResizeVarShape::infer(const nvcv::ImageBatchVarShape &inDataBase switch (interpolation) { case NVCV_INTERP_LINEAR: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, - 
interpolation, stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; case NVCV_INTERP_BOX: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, interpolation, - stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; case NVCV_INTERP_HAMMING: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, - interpolation, stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; case NVCV_INTERP_CUBIC: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, - interpolation, stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; case NVCV_INTERP_LANCZOS: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, - interpolation, stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; default: LOG_ERROR("Unsupported interpolation method " << interpolation); diff --git a/src/cvcuda/priv/legacy/reformat.cu b/src/cvcuda/priv/legacy/reformat.cu index 919f535f..826e0f0c 100644 --- a/src/cvcuda/priv/legacy/reformat.cu +++ b/src/cvcuda/priv/legacy/reformat.cu @@ -104,11 +104,6 @@ void Reformat::checkDataFormat(DataFormat format) NVCV_ASSERT(format == kNHWC || format == kHWC || format == kNCHW || format == kCHW); } -size_t Reformat::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode Reformat::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/resize.cu b/src/cvcuda/priv/legacy/resize.cu index c262bc59..51721843 100644 --- a/src/cvcuda/priv/legacy/resize.cu +++ b/src/cvcuda/priv/legacy/resize.cu @@ -624,11 +624,6 @@ void resize(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &ou #endif } //resize -size_t Resize::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} //Resize::calBufferSize - ErrorCode Resize::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const NVCVInterpolationType interpolation, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/warp.cu b/src/cvcuda/priv/legacy/warp.cu index eb5f06fd..e1dbc8a1 100644 --- a/src/cvcuda/priv/legacy/warp.cu +++ b/src/cvcuda/priv/legacy/warp.cu @@ -216,11 +216,6 @@ ErrorCode WarpAffine::infer(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::SUCCESS; } -size_t WarpPerspective::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 9 * sizeof(float); -} - ErrorCode WarpPerspective::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const float *transMatrix, const int32_t flags, const NVCVBorderType borderMode, const float4 borderValue, cudaStream_t stream) @@ -290,7 +285,7 @@ ErrorCode WarpPerspective::infer(const TensorDataStridedCuda &inData, const Tens PerspectiveTransform transform(transMatrix); - if (flags & NVCV_WARP_INVERSE_MAP) + if (!(flags & NVCV_WARP_INVERSE_MAP)) { cuda::math::Matrix tempMatrixForInverse; diff --git a/src/cvcuda/priv/legacy/warp_var_shape.cu b/src/cvcuda/priv/legacy/warp_var_shape.cu index 52023d82..e99dd18c 100644 --- a/src/cvcuda/priv/legacy/warp_var_shape.cu +++ 
b/src/cvcuda/priv/legacy/warp_var_shape.cu @@ -391,7 +391,7 @@ ErrorCode WarpPerspectiveVarShape::infer(const ImageBatchVarShapeDataStridedCuda cuda::Tensor2DWrap transMatrixInput(transMatrix); cuda::Tensor2DWrap transMatrixOutput(m_transformationMatrix, static_cast(sizeof(float) * 9)); - if (performInverse) + if (!performInverse) { inverseMatWarpPerspective<<<1, inData.numImages(), 0, stream>>>(inData.numImages(), transMatrixInput, transMatrixOutput); diff --git a/src/nvcv_types/Array.cpp b/src/nvcv_types/Array.cpp index d47af91a..be7d98a2 100644 --- a/src/nvcv_types/Array.cpp +++ b/src/nvcv_types/Array.cpp @@ -264,6 +264,23 @@ NVCV_DEFINE_API(0, 4, NVCVStatus, nvcvArrayGetCapacity, (NVCVArrayHandle handle, }); } +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvArrayResize, (NVCVArrayHandle handle, int64_t length)) +{ + return priv::ProtectCall( + [&] + { + auto &array = priv::ToStaticRef(handle); + + if (length > array.capacity()) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Cannot resize array to input length because greater than capacity"); + } + + array.resize(length); + }); +} + NVCV_DEFINE_API(0, 4, NVCVStatus, nvcvArrayGetTarget, (NVCVArrayHandle handle, NVCVResourceType *target)) { return priv::ProtectCall( diff --git a/src/nvcv_types/CMakeLists.txt b/src/nvcv_types/CMakeLists.txt index f7bbf2b5..2d561704 100644 --- a/src/nvcv_types/CMakeLists.txt +++ b/src/nvcv_types/CMakeLists.txt @@ -32,6 +32,7 @@ add_library(nvcv_types SHARED DataType.cpp ImageFormat.cpp Array.cpp + TensorBatch.cpp ) target_link_libraries(nvcv_types diff --git a/src/nvcv_types/ImageBatch.cpp b/src/nvcv_types/ImageBatch.cpp index fca335cc..0de96b23 100644 --- a/src/nvcv_types/ImageBatch.cpp +++ b/src/nvcv_types/ImageBatch.cpp @@ -39,6 +39,11 @@ NVCV_DEFINE_API(0, 0, NVCVStatus, nvcvImageBatchVarShapeCalcRequirements, throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output requirements must not be NULL"); } + if (capacity < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Capacity must >= 0"); + } + *reqs = priv::ImageBatchVarShape::CalcRequirements(capacity); }); } @@ -295,6 +300,11 @@ NVCV_DEFINE_API(0, 3, NVCVStatus, nvcvImageBatchVarShapeGetImages, return priv::ProtectCall( [&] { + if (outImages == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output handle cannot be NULL"); + } + auto &batch = priv::ToDynamicRef(handle); batch.getImages(begIndex, outImages, numImages); diff --git a/src/nvcv_types/Tensor.cpp b/src/nvcv_types/Tensor.cpp index 6db2ed95..89b35239 100644 --- a/src/nvcv_types/Tensor.cpp +++ b/src/nvcv_types/Tensor.cpp @@ -304,3 +304,41 @@ NVCV_DEFINE_API(0, 3, NVCVStatus, nvcvTensorGetUserPointer, (NVCVTensorHandle ha *outUserPtr = tensor.userPointer(); }); } + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorReshape, + (NVCVTensorHandle handle, int32_t rank, const int64_t *shape, NVCVTensorLayout layout, + NVCVTensorHandle *out_handle)) +{ + return priv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Tensor handle must not be NULL"); + } + + if (out_handle == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output handle must not be NULL"); + } + + auto tensor_ptr = priv::ToSharedObj(handle); // this will call incRef + + NVCVTensorData new_tensor_data; + tensor_ptr->exportData(new_tensor_data); + + // Modifies rank, shape, layout and strides + priv::ReshapeTensorData(new_tensor_data, rank, shape, layout); + + // The cleanup consists of dropping the 
reference to the handle we reference + auto cleanup = [](void *h, const NVCVTensorData *) + { + priv::CoreObjectDecRef(static_cast(h)); + }; + void *cleanup_ctx = handle; + + *out_handle = priv::CreateCoreObject(new_tensor_data, cleanup, cleanup_ctx); + + (void)tensor_ptr.release(); // we transferred ownership, we can release + }); +} diff --git a/src/nvcv_types/TensorBatch.cpp b/src/nvcv_types/TensorBatch.cpp new file mode 100644 index 00000000..a8f608d0 --- /dev/null +++ b/src/nvcv_types/TensorBatch.cpp @@ -0,0 +1,318 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "priv/TensorBatch.hpp" + +#include "priv/Status.hpp" +#include "priv/SymbolVersioning.hpp" +#include "priv/TensorBatchManager.hpp" + +#include + +namespace priv = nvcv::priv; + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchCalcRequirements, + (int32_t capacity, NVCVTensorBatchRequirements *reqs)) +{ + return priv::ProtectCall( + [&] + { + if (reqs == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output requirements must not be NULL"); + } + + *reqs = priv::TensorBatch::CalcRequirements(capacity); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchConstruct, + (const NVCVTensorBatchRequirements *reqs, NVCVAllocatorHandle halloc, NVCVTensorBatchHandle *outHandle)) +{ + return priv::ProtectCall( + [&] + { + if (reqs == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to requirements must not be NULL"); + } + if (outHandle == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output handle must not be NULL"); + } + + priv::IAllocator &alloc = priv::GetAllocator(halloc); + *outHandle = priv::CreateCoreObject(*reqs, alloc); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchClear, (NVCVTensorBatchHandle handle)) +{ + return priv::ProtectCall( + [&] + { + auto &tb = priv::ToStaticRef(handle); + tb.clear(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchPushTensors, + (NVCVTensorBatchHandle handle, const NVCVTensorHandle *tensors, int32_t numTensors)) +{ + return priv::ProtectCall( + [&] + { + if (tensors == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to tensors must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + tb.pushTensors(tensors, numTensors); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchDecRef, (NVCVTensorBatchHandle handle, int32_t *newRefCount)) +{ + return priv::ProtectCall( + [&] + { + int32_t newRef = priv::CoreObjectDecRef(handle); + if (newRefCount) + *newRefCount = newRef; + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchIncRef, (NVCVTensorBatchHandle handle, int32_t *newRefCount)) +{ + return priv::ProtectCall( + [&] + { + int32_t refCount = priv::CoreObjectIncRef(handle); + if (newRefCount) + *newRefCount = refCount; + }); +} + +NVCV_DEFINE_API(0, 5, 
NVCVStatus, nvcvTensorBatchRefCount, (NVCVTensorBatchHandle handle, int32_t *outRefCount)) +{ + return priv::ProtectCall( + [&] + { + if (outRefCount == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to reference count must not be NULL"); + } + *outRefCount = priv::CoreObjectRefCount(handle); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetCapacity, (NVCVTensorBatchHandle handle, int32_t *outCapacityPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outCapacityPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to capacity must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outCapacityPtr = tb.capacity(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetRank, (NVCVTensorBatchHandle handle, int32_t *outRankPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outRankPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to rank must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outRankPtr = tb.rank(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetDType, (NVCVTensorBatchHandle handle, NVCVDataType *outDTypePtr)) +{ + return priv::ProtectCall( + [&] + { + if (outDTypePtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to data type must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outDTypePtr = tb.dtype(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetLayout, + (NVCVTensorBatchHandle handle, NVCVTensorLayout *outLayoutPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outLayoutPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to layout must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outLayoutPtr = tb.layout(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetType, + (NVCVTensorBatchHandle handle, NVCVTensorBufferType *outTypePtr)) +{ + return priv::ProtectCall( + [&] + { + if (outTypePtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to buffer type must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outTypePtr = tb.type(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetNumTensors, + (NVCVTensorBatchHandle handle, int32_t *outNumTensorsPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outNumTensorsPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to tensors number must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outNumTensorsPtr = tb.numTensors(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetAllocator, + (NVCVTensorBatchHandle handle, NVCVAllocatorHandle *outAllocatorPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outAllocatorPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to allocator must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outAllocatorPtr = tb.alloc().release()->handle(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchExportData, + (NVCVTensorBatchHandle handle, CUstream stream, NVCVTensorBatchData *data)) +{ + return priv::ProtectCall( + [&] + { + if (data == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to tensor batch data must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + tb.exportData(stream, *data); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchPopTensors, (NVCVTensorBatchHandle handle, int32_t numTensors)) +{ + return 
priv::ProtectCall( + [&] + { + auto &tb = priv::ToStaticRef(handle); + tb.popTensors(numTensors); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetTensors, + (NVCVTensorBatchHandle handle, int32_t index, NVCVTensorHandle *outTensors, int32_t numTensors)) +{ + return priv::ProtectCall( + [&] + { + if (outTensors == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output tensors must not be NULL"); + } + if (index < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Index cannot be negative"); + } + if (numTensors < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Number of tensors cannot be negative"); + } + auto &tb = priv::ToStaticRef(handle); + tb.getTensors(index, outTensors, numTensors); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchSetTensors, + (NVCVTensorBatchHandle handle, int32_t index, const NVCVTensorHandle *tensors, int32_t numTensors)) +{ + return priv::ProtectCall( + [&] + { + if (tensors == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to tensors must not be NULL"); + } + if (index < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Index cannot be negative"); + } + if (numTensors < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Number of tensors cannot be negative"); + } + auto &tb = priv::ToStaticRef(handle); + tb.setTensors(index, tensors, numTensors); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchSetUserPointer, (NVCVTensorBatchHandle handle, void *userPointer)) +{ + return priv::ProtectCall( + [&] + { + auto &tb = priv::ToStaticRef(handle); + tb.setUserPointer(userPointer); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetUserPointer, (NVCVTensorBatchHandle handle, void **outUserPointer)) +{ + return priv::ProtectCall( + [&] + { + if (outUserPointer == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to user pointer must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outUserPointer = tb.userPointer(); + }); +} diff --git a/src/nvcv_types/include/nvcv/Array.h b/src/nvcv_types/include/nvcv/Array.h index cb4bdb83..077e6f09 100644 --- a/src/nvcv_types/include/nvcv/Array.h +++ b/src/nvcv_types/include/nvcv/Array.h @@ -301,6 +301,20 @@ NVCV_PUBLIC NVCVStatus nvcvArrayGetLength(NVCVArrayHandle handle, int64_t *lengt */ NVCV_PUBLIC NVCVStatus nvcvArrayGetCapacity(NVCVArrayHandle handle, int64_t *capacity); +/** + * Resizes the array length to the specified length up to the capacity. + * + * @param[in] handle Array to be queried. + * + Must not be NULL. + * + Must have been created by @ref nvcvArrayConstruct. + * + * @param[in] length The input length of the array. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvArrayResize(NVCVArrayHandle handle, int64_t length); + /** * Retrieve the array target.
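// Usage sketch for the nvcvArrayResize API declared above. The helper name GrowArrayIfPossible is
// hypothetical, not an NVCV API; it assumes an already-constructed NVCVArrayHandle. Resizing only
// adjusts the valid length within the preallocated capacity, so requests above the capacity
// reported by nvcvArrayGetCapacity are rejected.
#include <nvcv/Array.h>

static NVCVStatus GrowArrayIfPossible(NVCVArrayHandle array, int64_t newLength)
{
    int64_t capacity = 0;
    NVCVStatus status = nvcvArrayGetCapacity(array, &capacity);
    if (status != NVCV_SUCCESS)
        return status;

    // The resize call itself fails with NVCV_ERROR_INVALID_ARGUMENT in this case; checking first
    // just makes the contract explicit.
    if (newLength > capacity)
        return NVCV_ERROR_INVALID_ARGUMENT;

    return nvcvArrayResize(array, newLength);
}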
* diff --git a/src/nvcv_types/include/nvcv/Array.hpp b/src/nvcv_types/include/nvcv/Array.hpp index 5acd3387..81b99675 100644 --- a/src/nvcv_types/include/nvcv/Array.hpp +++ b/src/nvcv_types/include/nvcv/Array.hpp @@ -46,6 +46,8 @@ class Array : public CoreResource ArrayData exportData() const; + void resize(int64_t length); + template Optional exportData() const { diff --git a/src/nvcv_types/include/nvcv/Fwd.h b/src/nvcv_types/include/nvcv/Fwd.h index e3152414..c9702cb1 100644 --- a/src/nvcv_types/include/nvcv/Fwd.h +++ b/src/nvcv_types/include/nvcv/Fwd.h @@ -31,10 +31,11 @@ extern "C" { #endif -typedef struct NVCVImage *NVCVImageHandle; -typedef struct NVCVImageBatch *NVCVImageBatchHandle; -typedef struct NVCVTensor *NVCVTensorHandle; -typedef struct NVCVArray *NVCVArrayHandle; +typedef struct NVCVImage *NVCVImageHandle; +typedef struct NVCVImageBatch *NVCVImageBatchHandle; +typedef struct NVCVTensor *NVCVTensorHandle; +typedef struct NVCVTensorBatch *NVCVTensorBatchHandle; +typedef struct NVCVArray *NVCVArrayHandle; #ifdef __cplusplus } diff --git a/src/nvcv_types/include/nvcv/ImageBatch.h b/src/nvcv_types/include/nvcv/ImageBatch.h index 801d95ef..3673b25b 100644 --- a/src/nvcv_types/include/nvcv/ImageBatch.h +++ b/src/nvcv_types/include/nvcv/ImageBatch.h @@ -68,7 +68,7 @@ typedef struct NVCVImageBatchVarShapeRequirementsRec /** Calculates the resource requirements needed to create a varshape image batch. * * @param [in] capacity Maximum number of images that fits in the image batch. - * + Must be >= 1. + * + Must be >= 0. * * @param [out] reqs Where the image batch requirements will be written to. * + Must not be NULL. @@ -104,12 +104,10 @@ NVCV_PUBLIC NVCVStatus nvcvImageBatchVarShapeConstruct(const NVCVImageBatchVarSh /** Decrements the reference count of an existing image batch instance. * * The image batch is destroyed when its reference count reaches zero. - + * * If the image has type @ref NVCV_TYPE_IMAGEBATCH_TENSOR_WRAPDATA and has a cleanup function defined, * cleanup will be called. * - * @note The image batch object must not be in use in current and future operations. - * * @param [in] handle Image batch to be destroyed. * If NULL, no operation is performed, successfully. * + The handle must have been created with any of the nvcvImageBatchXXXConstruct functions. diff --git a/src/nvcv_types/include/nvcv/ImageData.h b/src/nvcv_types/include/nvcv/ImageData.h index f8a49889..c34fb226 100644 --- a/src/nvcv_types/include/nvcv/ImageData.h +++ b/src/nvcv_types/include/nvcv/ImageData.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/nvcv_types/include/nvcv/Size.h b/src/nvcv_types/include/nvcv/Size.h new file mode 100644 index 00000000..fe6db006 --- /dev/null +++ b/src/nvcv_types/include/nvcv/Size.h @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_SIZE_H +#define NVCV_SIZE_H + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @brief Struct representing a two-dimensional size. + * + * This structure is designed to represent a width and height in 2D space. + */ +typedef struct +{ + int32_t w, h; +} NVCVSize2D; + +#ifdef __cplusplus +} +#endif + +#endif // NVCV_SIZE_H diff --git a/src/nvcv_types/include/nvcv/Size.hpp b/src/nvcv_types/include/nvcv/Size.hpp index 9a3020be..cce55e04 100644 --- a/src/nvcv_types/include/nvcv/Size.hpp +++ b/src/nvcv_types/include/nvcv/Size.hpp @@ -24,6 +24,8 @@ #ifndef NVCV_SIZE_HPP #define NVCV_SIZE_HPP +#include "Size.h" + #include #include #include @@ -35,52 +37,75 @@ namespace nvcv { * @{ */ -/** - * @brief Struct representing a two-dimensional size. - * - * This structure is designed to represent a width and height in 2D space. - */ -struct Size2D +struct Size2D : NVCVSize2D { - int w, h; -}; + using NVCVSize2D::NVCVSize2D; -/** - * @brief Compares two Size2D structures for equality. - * - * @param a First size to compare. - * @param b Second size to compare. - * @return true if both width and height of `a` and `b` are equal, otherwise false. - */ -inline bool operator==(const Size2D &a, const Size2D &b) -{ - return std::tie(a.w, a.h) == std::tie(b.w, b.h); -} + constexpr Size2D(int32_t w, int32_t h) + : NVCVSize2D{w, h} + { + } -/** - * @brief Compares two Size2D structures for inequality. - * - * @param a First size to compare. - * @param b Second size to compare. - * @return true if width or height of `a` and `b` are not equal, otherwise false. - */ -inline bool operator!=(const Size2D &a, const Size2D &b) -{ - return !(a == b); -} + constexpr Size2D(const NVCVSize2D &s) + : NVCVSize2D{s.w, s.h} + { + } + + inline Size2D &operator=(const NVCVSize2D &s) + { + static_cast(*this) = s; + return *this; + } + + /** + * @brief Compares two Size2D structures for equality. + * + * @param a First size to compare. + * @param b Second size to compare. + * @return true if both width and height of `a` and `b` are equal, otherwise false. + */ + constexpr bool operator==(const Size2D &rhs) const + { + return w == rhs.w && h == rhs.h; + } + + /** + * @brief Compares two Size2D structures for inequality. + * + * @param a First size to compare. + * @param b Second size to compare. + * @return true if width or height of `a` and `b` are not equal, otherwise false. + */ + constexpr bool operator!=(const Size2D &rhs) const + { + return !(*this == rhs); + } + + /** + * @brief Compares two Size2D structures. + * + * The comparison is based on the width first, and then the height. + * + * @param a First size to compare. + * @param b Second size to compare. + * @return true if `a` is less than `b`, otherwise false. + */ + inline bool operator<(const nvcv::Size2D &rhs) const + { + return std::tie(w, h) < std::tie(rhs.w, rhs.h); + } +}; /** - * @brief Compares two Size2D structures. - * - * The comparison is based on the width first, and then the height. + * @brief Computes the maximum size in each dimension * * @param a First size to compare. 
* @param b Second size to compare. - * @return true if `a` is less than `b`, otherwise false. + * @return The size with `w` and `h` computed as a maximum of the respective fields in `a` and `b`. */ -inline bool operator<(const Size2D &a, const Size2D &b) +constexpr Size2D MaxSize(const Size2D &a, const Size2D &b) { - return std::tie(a.w, a.h) < std::tie(b.w, b.h); + return {b.w > a.w ? b.w : a.w, b.h > a.h ? b.h : a.h}; } /** @@ -92,7 +117,7 @@ inline bool operator<(const Size2D &a, const Size2D &b) * @param size Size2D structure to be output. * @return Reference to the modified output stream. */ -inline std::ostream &operator<<(std::ostream &out, const Size2D &size) +inline std::ostream &operator<<(std::ostream &out, const nvcv::Size2D &size) { return out << size.w << "x" << size.h; } diff --git a/src/nvcv_types/include/nvcv/Tensor.h b/src/nvcv_types/include/nvcv/Tensor.h index fc1f2385..fef89e67 100644 --- a/src/nvcv_types/include/nvcv/Tensor.h +++ b/src/nvcv_types/include/nvcv/Tensor.h @@ -371,6 +371,31 @@ NVCV_PUBLIC NVCVStatus nvcvTensorExportData(NVCVTensorHandle handle, NVCVTensorD */ NVCV_PUBLIC NVCVStatus nvcvTensorGetShape(NVCVTensorHandle handle, int32_t *rank, int64_t *shape); +/** + * Creates a view of a tensor with a different shape and layout. + * + * @param[in] handle Tensor to create a view from. + * + Must not be NULL. + * + * @param[in] rank Number of elements in the shape buffer argument. + * + Must be a number between 1 and NVCV_TENSOR_MAX_RANK + * + * @param[in] shape New shape. + * Must point to a buffer with @p rank elements. + * Elements above actual number of dimensions will be ignored. + * + * @param[in] layout New layout. + * Must have @p rank elements or be empty. + * + * @param [out] handle Where the tensor instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is invalid. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorReshape(NVCVTensorHandle handle, int32_t rank, const int64_t *shape, + NVCVTensorLayout layout, NVCVTensorHandle *out_handle); + #ifdef __cplusplus } #endif diff --git a/src/nvcv_types/include/nvcv/Tensor.hpp b/src/nvcv_types/include/nvcv/Tensor.hpp index 570ca8c1..acb7e5b9 100644 --- a/src/nvcv_types/include/nvcv/Tensor.hpp +++ b/src/nvcv_types/include/nvcv/Tensor.hpp @@ -106,6 +106,12 @@ class Tensor : public CoreResource */ void *userPointer() const; + /** + * @brief Creates a view of the tensor with a new shape and layout + * + */ + Tensor reshape(const TensorShape &new_shape); + /** * @brief Calculates the requirements for a tensor given its shape and data type. * diff --git a/src/nvcv_types/include/nvcv/TensorBatch.h b/src/nvcv_types/include/nvcv/TensorBatch.h new file mode 100644 index 00000000..597f8dc1 --- /dev/null +++ b/src/nvcv_types/include/nvcv/TensorBatch.h @@ -0,0 +1,278 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
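// Usage sketch for the reshape-as-view C API declared above (nvcvTensorReshape). The helper name
// FlattenTensor is hypothetical; it assumes the requested shape covers the same number of elements
// as the input, and that `layout` either has exactly `rank` entries or is empty, as the API
// requires. The returned handle is a new reference that views the same memory; release it with
// nvcvTensorDecRef when no longer needed.
#include <nvcv/Tensor.h>

static NVCVStatus FlattenTensor(NVCVTensorHandle input, int64_t numElements, NVCVTensorLayout layout,
                                NVCVTensorHandle *outView)
{
    // Rank-1 view over the tensor's elements; the source tensor is kept alive by the view.
    const int64_t shape[1] = {numElements};
    return nvcvTensorReshape(input, 1, shape, layout, outView);
}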
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file TensorBatch.h + * + * @brief Public C interface to NVCV representation of a batch of tensors. + */ + +#ifndef NVCV_TENSORBATCH_H +#define NVCV_TENSORBATCH_H + +#include "Export.h" +#include "Fwd.h" +#include "Image.h" +#include "Status.h" +#include "Tensor.h" +#include "TensorBatchData.h" +#include "TensorLayout.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +typedef struct NVCVTensorBatch *NVCVTensorBatchHandle; + +/** Stores the requirements of a tensor batch. */ +typedef struct NVCVTensorBatchRequirementsRec +{ + /*< Maximum number of tensors in the batch */ + int32_t capacity; + + /*< Alignment/block size in bytes */ + int32_t alignBytes; + + /*< Tensor resource requirements. */ + NVCVRequirements mem; +} NVCVTensorBatchRequirements; + +/** Calculates the resource requirements needed to create a tensor batch. + * + * @param [in] capacity Maximum number of tensors that fit in the tensor batch. + * + Must be >= 1. + * + * @param [out] reqs Where the tensor batch requirements will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchCalcRequirements(int32_t capacity, NVCVTensorBatchRequirements *reqs); + +NVCVStatus nvcvTensorBatchConstruct(const NVCVTensorBatchRequirements *req, NVCVAllocatorHandle alloc, + NVCVTensorBatchHandle *outHandle); + +NVCVStatus nvcvTensorBatchClear(NVCVTensorBatchHandle handle); + +NVCVStatus nvcvTensorBatchPushTensors(NVCVTensorBatchHandle handle, const NVCVTensorHandle *tensors, + int32_t numTensors); + +/** + * Pop tensors from the end of the tensor batch. + * + * @param[in] handle Tensor batch to be manipulated + * + Must not be NULL. + * + The handle must have been created with @ref nvcvTensorBatchConstruct. + * + * @param[in] numTensors Number of tensors to remove. + * + Must be >= 1. + * + Must be <= number of tensors in the batch. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. + * @retval #NVCV_ERROR_UNDERFLOW Tried to remove more tensors than there are in the batch. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCVStatus nvcvTensorBatchPopTensors(NVCVTensorBatchHandle handle, int32_t numTensors); + +/** Allocates multiple tensors and adds them to a TensorBatch + * + * This function allocates the storage for multiple tensors, creates the tensors and puts them in the batch. + * + * @param batch a handle to the batch object to which the new tensors will be added + * @param numTensors the number of tensors to add + * @param shapes the shapes of the tensors to be added + * @param strides the strides of the tensors to be added; if NULL, the tensors are densely packed + * @param tensorAlignment the alignment, in bytes, of the base pointer of each tensor in the batch + */ +NVCVStatus nvcvTensorBatchPopulate(NVCVTensorBatchHandle batch, int32_t numTensors, const int64_t **shapes, + const int64_t **strides /* optional, dense packing if NULL */, + int32_t tensorAlignment /* optional, use default if set to 0 */); + +/** Gets handles to a range of tensors in the batch + * + * This function creates new references to the Tensor handles. The caller must release them by calling + * nvcvTensorDecRef on all handles returned by this function.
+ * + * @param batch a handle to the batch object from which the tensors are extracted + * @param index the index of the first handle to get + * @param outTensors the array in which the handles are stored; it must have at least + * numTensors handles + * @param numTensors the number of tensors to get + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. + * @retval #NVCV_ERROR_OVERFLOW Tried to retrieve more tensors than there are in the batch. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCVStatus nvcvTensorBatchGetTensors(NVCVTensorBatchHandle batch, int32_t index, NVCVTensorHandle *outTensors, + int32_t numTensors); + +/** Sets a range of tensors in the batch + * + * TBD: Do we need/want it? + * Should it also extend the batch if index + numTensors > size (but within capacity)? + */ +NVCVStatus nvcvTensorBatchSetTensors(NVCVTensorBatchHandle batch, int32_t index, const NVCVTensorHandle *tensors, + int32_t numTensors); + +NVCVStatus nvcvTensorBatchGetAllocator(NVCVTensorBatchHandle batch, NVCVAllocatorHandle *alloc); + +NVCVStatus nvcvTensorBatchGetType(NVCVTensorBatchHandle batch, NVCVTensorBufferType *outType); + +/** + * Retrieve the tensor batch contents. + * + * @param[in] handle Tensor batch to be queried. + * + Must not be NULL. + * + * @param[in] stream CUDA stream where the export operation will execute. + * + * @param[out] data Where the tensor batch buffer information will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchExportData(NVCVTensorBatchHandle handle, CUstream stream, + NVCVTensorBatchData *data); + +NVCVStatus nvcvTensorBatchGetNumTensors(NVCVTensorBatchHandle batch, int32_t *outNumTensors); + +/** Decrements the reference count of an existing TensorBatch instance. + * + * The Tensor batch is destroyed when its reference count reaches zero. + * + * @param [in] handle Tensor batch to be destroyed. + * If NULL, no operation is performed, successfully. + * + The handle must have been created with any of the nvcvTensorBatchXXXConstruct functions. + * + * @param [out] newRefCount The decremented reference count. If the return value is 0, the object was destroyed. + * Can be NULL, if the caller isn't interested in the new reference count. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT The handle is invalid + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchDecRef(NVCVTensorBatchHandle handle, int32_t *newRefCount); + +/** Increments the reference count of a TensorBatch. + * + * @param [in] handle Tensor batch to be retained. + * + * @param [out] newRefCount The incremented reference count. + * Can be NULL, if the caller isn't interested in the new reference count. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT The handle is invalid + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchIncRef(NVCVTensorBatchHandle handle, int32_t *newRefCount); + +/** Returns the current reference count of a Tensor batch + * + * @param [in] handle The handle whose reference count is to be obtained. + * + * @param [out] outRefCount The reference count. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT The handle is invalid + * @retval #NVCV_SUCCESS Operation executed successfully.
+ */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchRefCount(NVCVTensorBatchHandle handle, int32_t *outRefCount); + +/** Associates a user pointer to the Tensor batch handle. + * + * This pointer can be used to associate any kind of data with the Tensor batch object. + * + * @param [in] handle Tensor batch to be associated with the user pointer. + * + * @param [in] userPtr User pointer. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchSetUserPointer(NVCVTensorBatchHandle handle, void *userPtr); + +/** Returns the user pointer associated with the Tensor batch handle. + * + * If no user pointer was associated, it'll return a pointer to NULL. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outUserPtr Pointer to where the user pointer will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetUserPointer(NVCVTensorBatchHandle handle, void **outUserPtr); + +/** Returns the capacity of the Tensor batch handle. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outCapacityPtr Pointer to where the capacity will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetCapacity(NVCVTensorBatchHandle handle, int32_t *outCapacityPtr); + +/** Returns the data type of the Tensor batch handle. + * + * Returns NVCV_DATA_TYPE_NONE for empty batches. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outDTypePtr Pointer to where the data type will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetDType(NVCVTensorBatchHandle handle, NVCVDataType *outDTypePtr); + +/** Returns the layout of the Tensor batch handle. + * + * Returns the empty layout for empty batches. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outLayoutPtr Pointer to where the layout will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetLayout(NVCVTensorBatchHandle handle, NVCVTensorLayout *outLayoutPtr); + +/** Returns the rank of tensors in the tensor batch or -1 for an empty batch. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outRankPtr Pointer to where the rank will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully.
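// Usage sketch of the tensor batch C API declared in this header: compute requirements, construct,
// push tensors, and hand ownership to the caller. `tensors` is assumed to hold valid NVCVTensorHandle
// values with a consistent rank, dtype and layout; passing NULL as the allocator is assumed to select
// the default allocator, as with the other NVCV containers. The helper name BuildTensorBatch is
// hypothetical.
#include <nvcv/TensorBatch.h>

#include <stddef.h>

static NVCVStatus BuildTensorBatch(const NVCVTensorHandle *tensors, int32_t numTensors,
                                   NVCVTensorBatchHandle *outBatch)
{
    NVCVTensorBatchRequirements reqs;
    NVCVStatus status = nvcvTensorBatchCalcRequirements(numTensors, &reqs);
    if (status != NVCV_SUCCESS)
        return status;

    status = nvcvTensorBatchConstruct(&reqs, NULL, outBatch);
    if (status != NVCV_SUCCESS)
        return status;

    status = nvcvTensorBatchPushTensors(*outBatch, tensors, numTensors);
    if (status != NVCV_SUCCESS)
    {
        // Roll back the construction on failure; the batch is destroyed when its refcount hits zero.
        nvcvTensorBatchDecRef(*outBatch, NULL);
        *outBatch = NULL;
    }
    return status;
}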
+ */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetRank(NVCVTensorBatchHandle handle, int32_t *outRankPtr); + +#ifdef __cplusplus +} +#endif + +#endif // NVCV_TENSORBATCH_H diff --git a/src/nvcv_types/include/nvcv/TensorBatch.hpp b/src/nvcv_types/include/nvcv/TensorBatch.hpp new file mode 100644 index 00000000..4aee9e14 --- /dev/null +++ b/src/nvcv_types/include/nvcv/TensorBatch.hpp @@ -0,0 +1,244 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_TENSORBATCH_HPP +#define NVCV_TENSORBATCH_HPP + +#include "CoreResource.hpp" +#include "TensorBatch.h" +#include "TensorBatchData.hpp" +#include "alloc/Allocator.hpp" + +#include + +#include + +namespace nvcv { + +NVCV_IMPL_SHARED_HANDLE(TensorBatch); + +/** + * @brief Handle to a tensor batch object. + * + * Tensor batch is a container type that can hold a list of non-uniformly shaped tensors. + * Rank, data type and layout must be consistent between the tensors. + */ +class TensorBatch : public CoreResource +{ +public: + using Base = CoreResource; + using Requirements = NVCVTensorBatchRequirements; + using HandleType = NVCVTensorBatchHandle; + + static Requirements CalcRequirements(int32_t capacity); + + NVCV_IMPLEMENT_SHARED_RESOURCE(TensorBatch, Base); + + TensorBatch(const Requirements &reqs, const Allocator &alloc = nullptr); + + TensorBatch(int32_t capacity, const Allocator &alloc = nullptr); + + /** + * @brief Return the maximal number of tensors the tensor batch can hold. + */ + int32_t capacity() const; + + /** + * @brief Return the rank of the tensors in the tensor batch or -1 for an empty batch. + */ + int32_t rank() const; + + /** + * @brief Return the number of tensors in the tensor batch. + */ + int32_t numTensors() const; + + /** + * @brief Return the data type of the tensors in the tensor batch. + */ + DataType dtype() const; + + /** + * @brief Return the layout of the tensors in the tensor batch. + */ + TensorLayout layout() const; + + /** + * @brief Return the buffer type of the tensors' data. + */ + NVCVTensorBufferType type() const; + + /** + * @brief Return the allocator used by the tensor batch. + */ + Allocator alloc() const; + + /** + * @brief Append tensors from the given range to the end of the batch. + * + * @param begin,end range of the tensors to append. + */ + template + void pushBack(It begin, It end); + + /** + * @brief Append the \a tensor to the end of the batch. + * + * @param tensor Appended tensor. + */ + void pushBack(const Tensor &tensor); + + /** + * @brief Truncate tensors from the end of the batch. + * + * @param numTensors Number of tensors to remove. + */ + void popTensors(int32_t numTensors); + + /** + * @brief Delete the last tensor from the batch. + */ + void popTensor(); + + /** + * @brief Generate the tensor batch data descriptor. + * + * The necessary copies to GPU are scheduled on the given stream. 
+ * The struct is valid after the scheduled work is finished. + * + * @param stream CUDA stream on which the buffers copy will be scheduled. + */ + TensorBatchData exportData(CUstream stream); + + void clear(); + + /** + * @brief Associates a user pointer to the tensor batch. + * + * @param ptr User pointer + */ + void setUserPointer(void *ptr); + + /** + * @brief Get the user pointer that was previously associated to the tensor batch + * with the setUserPointer(void*) method. Returns nullptr if no pointer was set. + */ + void *getUserPointer() const; + + /** + * @brief Return a handle to a tensor at a given position. + * + * @param idx Index of a tensor to return + */ + Tensor operator[](int32_t idx) const; + + /** + * @brief Replace the tensor on position \a index. + */ + void setTensor(int32_t index, const Tensor &tensor); + + class Iterator; + + Iterator begin() const; + + Iterator end() const; +}; + +class TensorBatch::Iterator +{ +public: + using value_type = Tensor; + using reference = const Tensor &; + using pointer = const Tensor *; + using iterator_category = std::random_access_iterator_tag; + using difference_type = int32_t; + + reference operator*() const; + pointer operator->() const; + + Iterator operator++(int); + Iterator &operator++(); + Iterator operator--(int); + Iterator &operator--(); + + Iterator operator+(difference_type diff) const; + Iterator operator-(difference_type diff) const; + + difference_type operator-(const Iterator &rhs) const; + + bool operator==(const Iterator &rhs) const; + bool operator!=(const Iterator &rhs) const; + bool operator<(const Iterator &rhs) const; + bool operator>(const Iterator &rhs) const; + bool operator<=(const Iterator &rhs) const; + bool operator>=(const Iterator &rhs) const; + + Iterator(Iterator &other) + : Iterator() + { + *this = other; + } + + Iterator(Iterator &&other) + : Iterator() + { + *this = std::move(other); + } + + Iterator &operator=(Iterator &other) + { + m_tensorBatch = other.m_tensorBatch; + m_idx = other.m_idx; + m_currentTensor = other.m_currentTensor; + return *this; + } + + Iterator &operator=(Iterator &&other) + { + m_tensorBatch = other.m_tensorBatch; + m_idx = other.m_idx; + m_currentTensor = std::move(other.m_currentTensor); + return *this; + } + +private: + friend class TensorBatch; + + Iterator() = default; + + Iterator(const TensorBatch *tensorBatch, int32_t idx) + : m_tensorBatch(tensorBatch) + , m_idx(idx) + , m_currentTensor{} + { + UpdateCurrentTensor(); + } + + void UpdateCurrentTensor(); + + const TensorBatch *m_tensorBatch = nullptr; + int32_t m_idx = 0; + mutable Tensor m_currentTensor = {}; +}; + +using TensorBatchWrapHandle = NonOwningResource; + +} // namespace nvcv + +#include "detail/TensorBatchImpl.hpp" + +#endif // NVCV_TENSORBATCH_HPP diff --git a/src/nvcv_types/include/nvcv/TensorBatchData.h b/src/nvcv_types/include/nvcv/TensorBatchData.h new file mode 100644 index 00000000..9b980e7c --- /dev/null +++ b/src/nvcv_types/include/nvcv/TensorBatchData.h @@ -0,0 +1,65 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
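// A small C++ sketch of the nvcv::TensorBatch container declared above: build a batch from existing
// tensors and return it by value (the handle is shared and reference counted). `tensors` is assumed
// to contain nvcv::Tensor objects with matching rank, dtype and layout; the helper name
// MakeTensorBatch is hypothetical. A TensorBatchData descriptor for device code can later be
// obtained with batch.exportData(stream).
#include <nvcv/Tensor.hpp>
#include <nvcv/TensorBatch.hpp>

#include <vector>

inline nvcv::TensorBatch MakeTensorBatch(const std::vector<nvcv::Tensor> &tensors)
{
    nvcv::TensorBatch batch(static_cast<int32_t>(tensors.size()));

    // pushBack(begin, end) appends a whole range; pushBack(tensor) appends a single element.
    batch.pushBack(tensors.begin(), tensors.end());
    return batch;
}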
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_TENSORBATCHDATA_H +#define NVCV_TENSORBATCHDATA_H + +#include "TensorData.h" +#include "TensorLayout.h" + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Describes a single tensor in a batch */ +typedef struct NVCVTensorBatchElementStridedRec +{ + alignas(128) NVCVByte *data; + int64_t shape[NVCV_TENSOR_MAX_RANK]; + int64_t stride[NVCV_TENSOR_MAX_RANK]; +} NVCVTensorBatchElementStrided; + +/** Describes a batch of tensors */ +typedef struct NVCVTensorBatchBufferStridedRec +{ + NVCVTensorBatchElementStrided *tensors; +} NVCVTensorBatchBufferStrided; + +typedef union NVCVTensorBatchBufferRec +{ + NVCVTensorBatchBufferStrided strided; +} NVCVTensorBatchBuffer; + +typedef struct NVCVTensorBatchDataRec +{ + NVCVDataType dtype; + NVCVTensorLayout layout; + int32_t rank; + int32_t numTensors; + + NVCVTensorBufferType type; + NVCVTensorBatchBuffer buffer; +} NVCVTensorBatchData; + +#ifdef __cplusplus +} +#endif + +#endif // NVCV_TENSORBATCHDATA_H diff --git a/src/nvcv_types/include/nvcv/TensorBatchData.hpp b/src/nvcv_types/include/nvcv/TensorBatchData.hpp new file mode 100644 index 00000000..b9c425d1 --- /dev/null +++ b/src/nvcv_types/include/nvcv/TensorBatchData.hpp @@ -0,0 +1,172 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_TENSORBATCHDATA_HPP +#define NVCV_TENSORBATCHDATA_HPP + +#include "Optional.hpp" +#include "TensorBatchData.h" +#include "TensorShape.hpp" + +#include +#include + +namespace nvcv { + +/** + * @brief General type representing data of any tensor batch. + */ +class TensorBatchData +{
+public: + TensorBatchData(const NVCVTensorBatchData &data) + : m_data(data) + { + } + + /** + * @brief Return rank of the tensors in the batch. + */ + int rank() const + { + return m_data.rank; + } + + /** + * @brief Return the layout of the tensors in the batch. + */ + TensorLayout layout() const + { + return m_data.layout; + } + + /** + * @brief Return the data type of the tensors in the batch. + */ + DataType dtype() const + { + return DataType(m_data.dtype); + } + + /** + * @brief Return the number of the tensors in the batch. + */ + int32_t numTensors() const + { + return m_data.numTensors; + } + + /** + * @brief Return underlying C struct representing the tensor batch data.
+ */ + NVCVTensorBatchData cdata() const + { + return m_data; + } + + static constexpr bool IsCompatibleKind(NVCVTensorBufferType kind) + { + return kind != NVCV_TENSOR_BUFFER_NONE; + } + + /** + * @brief Cast the tensor batch data to a derived type (e.g. TensorBatchDataStridedCuda) + * @tparam Derived target type + */ + template + Optional cast() const + { + static_assert(std::is_base_of::value, + "Cannot cast TensorBatchData to an unrelated type"); + static_assert(sizeof(Derived) == sizeof(TensorBatchData), "The derived type must not add new data members."); + + if (IsCompatible()) + { + return {Derived(m_data)}; + } + else + { + return {}; + } + } + + /** + * @brief Checks if data can be casted to a given derived type. + * @tparam Derived tested type + */ + template + bool IsCompatible() const + { + static_assert(std::is_base_of::value, + "TensorBatchData cannot be compatible with unrelated type"); + return Derived::IsCompatibleKind(m_data.type); + } + +protected: + TensorBatchData() = default; + + NVCVTensorBatchData &data() + { + return m_data; + } + +private: + NVCVTensorBatchData m_data{}; +}; + +/** + * @brief Data of batches of tensors with strides. + */ +class TensorBatchDataStrided : public TensorBatchData +{ +public: + using Buffer = NVCVTensorBatchBufferStrided; + + /** + * @brief Get the buffer with the tensors' descriptors. + */ + Buffer buffer() const + { + return cdata().buffer.strided; + } + + static constexpr bool IsCompatibleKind(NVCVTensorBufferType kind) + { + return kind == NVCV_TENSOR_BUFFER_STRIDED_CUDA; + } + +protected: + using TensorBatchData::TensorBatchData; +}; + +/** + * @brief Data of batches of CUDA tensors with strides. + */ +class TensorBatchDataStridedCuda : public TensorBatchDataStrided +{ +public: + using TensorBatchDataStrided::TensorBatchDataStrided; + + static constexpr bool IsCompatibleKind(NVCVTensorBufferType kind) + { + return kind == NVCV_TENSOR_BUFFER_STRIDED_CUDA; + } +}; + +} // namespace nvcv + +#endif // NVCV_TENSORBATCHDATA_HPP diff --git a/src/nvcv_types/include/nvcv/alloc/Allocator.hpp b/src/nvcv_types/include/nvcv/alloc/Allocator.hpp index f4a15a3c..8c725183 100644 --- a/src/nvcv_types/include/nvcv/alloc/Allocator.hpp +++ b/src/nvcv_types/include/nvcv/alloc/Allocator.hpp @@ -25,6 +25,7 @@ #include "Allocator.h" #include +#include #include #include diff --git a/src/nvcv_types/include/nvcv/cuda/BorderVarShapeWrap.hpp b/src/nvcv_types/include/nvcv/cuda/BorderVarShapeWrap.hpp index a20e975a..71f8e0cc 100644 --- a/src/nvcv_types/include/nvcv/cuda/BorderVarShapeWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/BorderVarShapeWrap.hpp @@ -412,25 +412,25 @@ class BorderVarShapeWrapNHWC : public detail::BorderVarShapeWrapNHWCImpl /** * Subscript operator for read-only or read-and-write access (depending on value type). * - * @param[in] c 4D coordinate (x column, y row, z sample, w channel) to be accessed. + * @param[in] c 4D coordinates (x sample, y row, z col, w channel) to be accessed. * * @return Accessed (const) reference. */ inline __host__ __device__ ValueType &operator[](int4 c) const { - return *doGetPtr(c.z, c.y, c.x, c.w); + return *doGetPtr(c.x, c.y, c.z, c.w); } /** * Subscript operator for read-only or read-and-write access (depending on value type, considering plane=0). * - * @param[in] c 3D coordinate (x column, y row, z sample) (first channel) to be accessed. + * @param[in] c 3D coordinates (x sample, y row, z col) (first channel) to be accessed. * * @return Accessed (const) reference. */ inline __host__ __device__ ValueType &operator[](int3 c) const { - return *doGetPtr(c.z, c.y, c.x, 0); + return *doGetPtr(c.x, c.y, c.z, 0); } /** @@ -523,13 +523,13 @@ class BorderVarShapeWrapNHWC /** * Subscript operator for read-only or read-and-write access (depending on value type). * - * @param[in] c 4D coordinate (x column, y row, z sample, w channel) to be accessed. + * @param[in] c 4D coordinate (x sample, y row, z column, w channel) to be accessed. * * @return Accessed (const) reference.
*/ inline __host__ __device__ ValueType &operator[](int3 c) const { - return *doGetPtr(c.z, c.y, c.x, 0); + return *doGetPtr(c.x, c.y, c.z, 0); } /** @@ -523,13 +523,13 @@ class BorderVarShapeWrapNHWC /** * Subscript operator for read-only or read-and-write access (depending on value type). * - * @param[in] c 4D coordinate (x column, y row, z sample, w channel) to be accessed. + * @param[in] c 4D coordinate (x sample, y row, z column, w channel) to be accessed. * * @return Accessed (const) reference. */ inline __host__ __device__ ValueType &operator[](int4 c) const { - ValueType *p = doGetPtr(c.z, c.y, c.x, c.w); + ValueType *p = doGetPtr(c.x, c.y, c.z, c.w); if (p == nullptr) { @@ -542,13 +542,13 @@ class BorderVarShapeWrapNHWC /** * Subscript operator for read-only or read-and-write access (depending on value type, considering plane=0). * - * @param[in] c 3D coordinate (x column, y row, z sample) (first channel) to be accessed. + * @param[in] c 3D coordinate (x sample, y row, z col) (first channel) to be accessed. * * @return Accessed (const) reference. */ inline __host__ __device__ ValueType &operator[](int3 c) const { - ValueType *p = doGetPtr(c.z, c.y, c.x, 0); + ValueType *p = doGetPtr(c.x, c.y, c.z, 0); if (p == nullptr) { diff --git a/src/nvcv_types/include/nvcv/cuda/ImageBatchVarShapeWrap.hpp b/src/nvcv_types/include/nvcv/cuda/ImageBatchVarShapeWrap.hpp index 7373b977..482f5b3a 100644 --- a/src/nvcv_types/include/nvcv/cuda/ImageBatchVarShapeWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/ImageBatchVarShapeWrap.hpp @@ -384,25 +384,26 @@ class ImageBatchVarShapeWrapNHWC : ImageBatchVarShapeWrap /** * Subscript operator for either read-only or read-and-write access. * - * @param[in] c 4D coordinates (x column, y row, z sample, w channel) to be accessed. + * @param[in] c 4D coordinates (x sample, y row, z col, w channel) to be accessed. + * * * @return Accessed reference. */ inline __host__ __device__ T &operator[](int4 c) const { - return *doGetPtr(c.z, c.y, c.x, c.w); + return *doGetPtr(c.x, c.y, c.z, c.w); } /** * Subscript operator for either read-only or read-and-write access. * - * @param[in] c 3D coordinates (x column, y row, z sample) (first channel) to be accessed. + * @param[in] c 3D coordinates (x sample, y row, z col) (first channel) to be accessed. * * @return Accessed reference. */ inline __host__ __device__ T &operator[](int3 c) const { - return *doGetPtr(c.z, c.y, c.x, 0); + return *doGetPtr(c.x, c.y, c.z, 0); } /** diff --git a/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp b/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp index b8a00432..d91e3f5f 100644 --- a/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp +++ b/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp @@ -209,7 +209,7 @@ class Vector } // On-purpose public data to allow POD-class direct initialization. - T m_data[N] = {}; + T m_data[N]; }; /** diff --git a/src/nvcv_types/include/nvcv/detail/Align.hpp b/src/nvcv_types/include/nvcv/detail/Align.hpp new file mode 100644 index 00000000..11e4d814 --- /dev/null +++ b/src/nvcv_types/include/nvcv/detail/Align.hpp @@ -0,0 +1,106 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
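// Illustration of the new subscript coordinate packing documented above for the NHWC var-shape
// wrappers: an int4/int3 index is now (x = sample, y = row, z = column, w = channel) rather than
// (x = column, ..., z = sample). The kernel below is a hypothetical fill used only to make the
// packing explicit; the per-sample bounds are passed in by the caller.
#include <nvcv/cuda/ImageBatchVarShapeWrap.hpp>

template<typename T>
__global__ void FillSample(nvcv::cuda::ImageBatchVarShapeWrapNHWC<T> dst, int sample, int rows, int cols, T value)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < cols && row < rows)
    {
        // x = sample index, y = row, z = column, w = channel.
        dst[int4{sample, row, col, 0}] = value;
    }
}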
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_DETAIL_ALIGN_HPP +#define NVCV_DETAIL_ALIGN_HPP + +#include +#include + +namespace nvcv { namespace detail { + +/** + * @brief Aligns the @p value down to a multiple of @p alignment_pow2 + * + * The function operates by masking the least significant bits of the value. + * If the alignment is not a power of two, the behavior is undefined. + * + * @remark Negative values are aligned down, not towards zero. + * + * @tparam T an integral type + * @param value a value to align + * @param alignment_pow2 the alignment, must be a positive power of 2 + * @return constexpr T the value aligned down to a multiple of @p alignment_pow2 + */ +template +constexpr T AlignDown(T value, T alignment_pow2) +{ + static_assert(std::is_integral::value, "Cannot align a value of a non-integral type"); + // Explanation: + // When alignment_pow2 is a power of 2 (for example 16), it has the form: + // 00010000 + // Negating it in two's complement gives: + // 11110000 + // We can use this as a mask to align a number _down_. + + // NOTE: This is much more efficient than (value/alignment) * alignment for run-time alignment values, where + // the compiler cannot replace the division/multiplication with bit shifts. + return value & -alignment_pow2; +} + +/** + * @brief Aligns the @p value up to a multiple of @p alignment_pow2 + * + * The function operates by adding alignment-1 to the value and masking the least significant bits. + * If the alignment is not a power of two, the behavior is undefined. + * + * @remark Negative values are aligned up, that is, towards zero. + * + * @tparam T an integral type + * @param value a value to align + * @param alignment_pow2 the alignment, must be a positive power of 2 + * @return constexpr T the value aligned up to a multiple of @p alignment_pow2 + */ +template +constexpr T AlignUp(T value, T alignment_pow2) +{ + static_assert(std::is_integral::value, "Cannot align a value of a non-integral type"); + return AlignDown(value + (alignment_pow2 - 1), alignment_pow2); +} + +/** + * @brief Checks if the value is a multiple of alignment + * + * @tparam T an integral type + * @param value the value whose alignment is checked + * @param alignment_pow2 the alignment, must be a power of 2 + * @return true if value is a multiple of alignment_pow2 + * @return false otherwise + */ +template +constexpr bool IsAligned(T value, T alignment_pow2) +{ + static_assert(std::is_integral::value, "Cannot check alignment of a value of a non-integral type"); + return (value & (alignment_pow2 - 1)) == 0; +} + +/** + * @brief Checks if a pointer is aligned to a multiple of @p alignment_pow2 bytes.
+ * + * @param ptr the pointer whose alignment is checked + * @param alignment_pow2 the alignment, must be a power of 2 + * @return true if value is a multiple of alignment_pow2 + * @return false otherwise + */ +inline bool IsAligned(const void *ptr, uintptr_t alignment_pow2) +{ + return IsAligned((uintptr_t)ptr, alignment_pow2); +} + +}} // namespace nvcv::detail + +#endif // NVCV_DETAIL_ALIGN_HPP diff --git a/src/nvcv_types/include/nvcv/detail/ArrayImpl.hpp b/src/nvcv_types/include/nvcv/detail/ArrayImpl.hpp index 13b4d60b..65775a47 100644 --- a/src/nvcv_types/include/nvcv/detail/ArrayImpl.hpp +++ b/src/nvcv_types/include/nvcv/detail/ArrayImpl.hpp @@ -36,6 +36,13 @@ inline int64_t Array::length() const return length; } +inline void Array::resize(int64_t length) +{ + NVCVArrayHandle harray = this->handle(); + + detail::CheckThrow(nvcvArrayResize(harray, length)); +} + inline int64_t Array::capacity() const { NVCVArrayHandle harray = this->handle(); diff --git a/src/nvcv_types/include/nvcv/detail/TensorBatchImpl.hpp b/src/nvcv_types/include/nvcv/detail/TensorBatchImpl.hpp new file mode 100644 index 00000000..1d028a65 --- /dev/null +++ b/src/nvcv_types/include/nvcv/detail/TensorBatchImpl.hpp @@ -0,0 +1,266 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_TENSORBATCH_IMPL_HPP +#define NVCV_TENSORBATCH_IMPL_HPP + +namespace nvcv { + +// TensorBatch + +inline TensorBatch::Requirements TensorBatch::CalcRequirements(int32_t capacity) +{ + TensorBatch::Requirements reqs = {}; + detail::CheckThrow(nvcvTensorBatchCalcRequirements(capacity, &reqs)); + return reqs; +} + +inline TensorBatch::TensorBatch(const TensorBatch::Requirements &reqs, const Allocator &alloc) +{ + NVCVTensorBatchHandle handle = nullptr; + detail::CheckThrow(nvcvTensorBatchConstruct(&reqs, alloc.handle(), &handle)); + reset(std::move(handle)); +} + +inline TensorBatch::TensorBatch(int32_t capacity, const Allocator &alloc) +{ + auto reqs = TensorBatch::CalcRequirements(capacity); + NVCVTensorBatchHandle handle = nullptr; + detail::CheckThrow(nvcvTensorBatchConstruct(&reqs, alloc.handle(), &handle)); + reset(std::move(handle)); +} + +inline int32_t TensorBatch::capacity() const +{ + int32_t output; + detail::CheckThrow(nvcvTensorBatchGetCapacity(handle(), &output)); + return output; +} + +inline int32_t TensorBatch::rank() const +{ + int32_t output; + detail::CheckThrow(nvcvTensorBatchGetRank(handle(), &output)); + return output; +} + +inline int32_t TensorBatch::numTensors() const +{ + int32_t output; + detail::CheckThrow(nvcvTensorBatchGetNumTensors(handle(), &output)); + return output; +} + +inline DataType TensorBatch::dtype() const +{ + NVCVDataType dataType = {}; + detail::CheckThrow(nvcvTensorBatchGetDType(handle(), &dataType)); + return DataType(dataType); +} + +inline TensorLayout TensorBatch::layout() const +{ + NVCVTensorLayout tensorLayout; + detail::CheckThrow(nvcvTensorBatchGetLayout(handle(), &tensorLayout)); + return TensorLayout(tensorLayout); +} + +inline NVCVTensorBufferType TensorBatch::type() const +{ + NVCVTensorBufferType bufferType; + detail::CheckThrow(nvcvTensorBatchGetType(handle(), &bufferType)); + return bufferType; +} + +inline Allocator TensorBatch::alloc() const +{ + NVCVAllocatorHandle halloc; + detail::CheckThrow(nvcvTensorBatchGetAllocator(handle(), &halloc)); + return Allocator(std::move(halloc)); +} + +template +inline void TensorBatch::pushBack(It begin, It end) +{ + std::vector handles; + handles.reserve(capacity() - numTensors()); + for (auto it = begin; it != end; ++it) + { + handles.push_back(it->handle()); + } + detail::CheckThrow(nvcvTensorBatchPushTensors(handle(), handles.data(), handles.size())); +} + +inline void TensorBatch::pushBack(const Tensor &tensor) +{ + auto hTensor = tensor.handle(); + detail::CheckThrow(nvcvTensorBatchPushTensors(handle(), &hTensor, 1)); +} + +inline void TensorBatch::popTensors(int32_t numTensors) +{ + detail::CheckThrow(nvcvTensorBatchPopTensors(handle(), numTensors)); +} + +inline void TensorBatch::popTensor() +{ + detail::CheckThrow(nvcvTensorBatchPopTensors(handle(), 1)); +} + +inline TensorBatchData TensorBatch::exportData(CUstream stream) +{ + NVCVTensorBatchData output = {}; + detail::CheckThrow(nvcvTensorBatchExportData(handle(), stream, &output)); + return TensorBatchData(output); +} + +inline void TensorBatch::clear() +{ + detail::CheckThrow(nvcvTensorBatchClear(handle())); +} + +inline void TensorBatch::setUserPointer(void *ptr) +{ + detail::CheckThrow(nvcvTensorBatchSetUserPointer(handle(), ptr)); +} + +inline void *TensorBatch::getUserPointer() const +{ + void *outPtr = nullptr; + detail::CheckThrow(nvcvTensorBatchGetUserPointer(handle(), &outPtr)); + return outPtr; +} + +inline Tensor TensorBatch::operator[](int32_t idx) const +{ + NVCVTensorHandle hTensor = nullptr; + 
detail::CheckThrow(nvcvTensorBatchGetTensors(handle(), idx, &hTensor, 1)); + return Tensor(std::move(hTensor)); +} + +inline void TensorBatch::setTensor(int32_t idx, const Tensor &tensor) +{ + auto hTensor = tensor.handle(); + detail::CheckThrow(nvcvTensorBatchSetTensors(handle(), idx, &hTensor, 1)); +} + +inline TensorBatch::Iterator TensorBatch::begin() const +{ + return Iterator(this, 0); +} + +inline TensorBatch::Iterator TensorBatch::end() const +{ + return Iterator(this, numTensors()); +} + +// TensorBatch::Iterator + +inline TensorBatch::Iterator::reference TensorBatch::Iterator::operator*() const +{ + return m_currentTensor; +} + +inline TensorBatch::Iterator::pointer TensorBatch::Iterator::operator->() const +{ + return &m_currentTensor; +} + +inline TensorBatch::Iterator TensorBatch::Iterator::operator++(int) +{ + Iterator output(*this); + ++(*this); + return output; +} + +inline TensorBatch::Iterator &TensorBatch::Iterator::operator++() +{ + ++m_idx; + UpdateCurrentTensor(); + return *this; +} + +inline TensorBatch::Iterator TensorBatch::Iterator::operator--(int) +{ + Iterator output(*this); + --(*this); + return output; +} + +inline TensorBatch::Iterator &TensorBatch::Iterator::operator--() +{ + --m_idx; + UpdateCurrentTensor(); + return *this; +} + +inline TensorBatch::Iterator TensorBatch::Iterator::operator+(difference_type diff) const +{ + return Iterator(m_tensorBatch, m_idx + diff); +} + +inline TensorBatch::Iterator TensorBatch::Iterator::operator-(difference_type diff) const +{ + return Iterator(m_tensorBatch, m_idx - diff); +} + +inline void TensorBatch::Iterator::UpdateCurrentTensor() +{ + if (m_idx < m_tensorBatch->numTensors() && m_idx >= 0) + { + m_currentTensor = (*m_tensorBatch)[m_idx]; + } +} + +inline TensorBatch::Iterator::difference_type TensorBatch::Iterator::operator-(const Iterator &rhs) const +{ + return m_idx - rhs.m_idx; +} + +inline bool TensorBatch::Iterator::operator==(const Iterator &rhs) const +{ + return m_tensorBatch == rhs.m_tensorBatch && m_idx == rhs.m_idx; +} + +inline bool TensorBatch::Iterator::operator!=(const Iterator &rhs) const +{ + return !(rhs == *this); +} + +inline bool TensorBatch::Iterator::operator<(const Iterator &rhs) const +{ + return std::make_pair(m_tensorBatch, m_idx) < std::make_pair(rhs.m_tensorBatch, rhs.m_idx); +} + +inline bool TensorBatch::Iterator::operator>(const Iterator &rhs) const +{ + return std::make_pair(m_tensorBatch, m_idx) > std::make_pair(rhs.m_tensorBatch, rhs.m_idx); +} + +inline bool TensorBatch::Iterator::operator<=(const Iterator &rhs) const +{ + return !(rhs < *this); +} + +inline bool TensorBatch::Iterator::operator>=(const Iterator &rhs) const +{ + return !(rhs > *this); +} + +} // namespace nvcv + +#endif // NVCV_TENSORBATCH_IMPL_HPP diff --git a/src/nvcv_types/include/nvcv/detail/TensorImpl.hpp b/src/nvcv_types/include/nvcv/detail/TensorImpl.hpp index 2e7b7cc1..f4ceab2a 100644 --- a/src/nvcv_types/include/nvcv/detail/TensorImpl.hpp +++ b/src/nvcv_types/include/nvcv/detail/TensorImpl.hpp @@ -91,6 +91,15 @@ inline void *Tensor::userPointer() const return ptr; } +inline Tensor Tensor::reshape(const TensorShape &new_shape) +{ + NVCVTensorHandle out_handle; + detail::CheckThrow( + nvcvTensorReshape(this->handle(), new_shape.rank(), &new_shape.shape()[0], new_shape.layout(), &out_handle)); + Tensor out_tensor(std::move(out_handle)); + return out_tensor; +} + inline auto Tensor::CalcRequirements(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign) -> Requirements { diff --git 
a/src/nvcv_types/priv/Array.cpp b/src/nvcv_types/priv/Array.cpp index ea0ab0cc..2eb02a11 100644 --- a/src/nvcv_types/priv/Array.cpp +++ b/src/nvcv_types/priv/Array.cpp @@ -157,6 +157,7 @@ Array::Array(NVCVArrayRequirements reqs, IAllocator &alloc, NVCVResourceType tar NVCV_ASSERT(m_memBuffer != nullptr); this->exportData(m_data); + m_data.length = 0; } Array::~Array() @@ -223,4 +224,12 @@ void Array::exportData(NVCVArrayData &data) const } } +void Array::resize(int64_t length) +{ + if (length <= this->capacity()) + { + m_data.length = length; + } +} + } // namespace nvcv::priv diff --git a/src/nvcv_types/priv/Array.hpp b/src/nvcv_types/priv/Array.hpp index 3d522fce..19c76abd 100644 --- a/src/nvcv_types/priv/Array.hpp +++ b/src/nvcv_types/priv/Array.hpp @@ -47,6 +47,8 @@ class Array final : public CoreObjectBase void exportData(NVCVArrayData &data) const override; + void resize(int64_t length) override; + private: SharedCoreObj m_alloc; NVCVArrayRequirements m_reqs; diff --git a/src/nvcv_types/priv/ArrayWrapData.cpp b/src/nvcv_types/priv/ArrayWrapData.cpp index 20bdc431..23b8d1c7 100644 --- a/src/nvcv_types/priv/ArrayWrapData.cpp +++ b/src/nvcv_types/priv/ArrayWrapData.cpp @@ -119,4 +119,12 @@ void ArrayWrapData::exportData(NVCVArrayData &data) const data = m_data; } +void ArrayWrapData::resize(int64_t length) +{ + if (length <= this->capacity()) + { + m_data.length = length; + } +} + } // namespace nvcv::priv diff --git a/src/nvcv_types/priv/ArrayWrapData.hpp b/src/nvcv_types/priv/ArrayWrapData.hpp index eee9b303..2b15565f 100644 --- a/src/nvcv_types/priv/ArrayWrapData.hpp +++ b/src/nvcv_types/priv/ArrayWrapData.hpp @@ -42,6 +42,8 @@ class ArrayWrapData final : public CoreObjectBase void exportData(NVCVArrayData &data) const override; + void resize(int64_t length) override; + private: NVCVArrayData m_data; NVCVResourceType m_target; diff --git a/src/nvcv_types/priv/CMakeLists.txt b/src/nvcv_types/priv/CMakeLists.txt index 0617c8f9..049f173e 100644 --- a/src/nvcv_types/priv/CMakeLists.txt +++ b/src/nvcv_types/priv/CMakeLists.txt @@ -36,6 +36,7 @@ add_library(nvcv_types_priv STATIC ImageFormat.cpp Array.cpp ArrayWrapData.cpp + TensorBatch.cpp ) target_include_directories(nvcv_types_priv diff --git a/src/nvcv_types/priv/Context.cpp b/src/nvcv_types/priv/Context.cpp index b3bc3a2d..c1505716 100644 --- a/src/nvcv_types/priv/Context.cpp +++ b/src/nvcv_types/priv/Context.cpp @@ -34,8 +34,10 @@ Context::Context() , m_imageManager("Image") , m_imageBatchManager("ImageBatch") , m_tensorManager("Tensor") + , m_tensorBatchManager("TensorBatch") , m_arrayManager("Array") - , m_managerList{m_allocatorManager, m_imageManager, m_imageBatchManager, m_tensorManager, m_arrayManager} + , m_managerList{m_allocatorManager, m_imageManager, m_imageBatchManager, + m_tensorManager, m_tensorBatchManager, m_arrayManager} { } @@ -59,5 +61,6 @@ template class HandleManager; template class HandleManager; template class HandleManager; template class HandleManager; +template class HandleManager; } // namespace nvcv::priv diff --git a/src/nvcv_types/priv/Context.hpp b/src/nvcv_types/priv/Context.hpp index 651e565e..b963a0ef 100644 --- a/src/nvcv_types/priv/Context.hpp +++ b/src/nvcv_types/priv/Context.hpp @@ -24,6 +24,7 @@ #include "IContext.hpp" #include "ImageBatchManager.hpp" #include "ImageManager.hpp" +#include "TensorBatchManager.hpp" #include "TensorManager.hpp" namespace nvcv::priv { @@ -39,12 +40,13 @@ class Context final : public IContext private: // Order is important due to inter-dependencies - DefaultAllocator 
m_allocDefault; - AllocatorManager m_allocatorManager; - ImageManager m_imageManager; - ImageBatchManager m_imageBatchManager; - TensorManager m_tensorManager; - ArrayManager m_arrayManager; + DefaultAllocator m_allocDefault; + AllocatorManager m_allocatorManager; + ImageManager m_imageManager; + ImageBatchManager m_imageBatchManager; + TensorManager m_tensorManager; + TensorBatchManager m_tensorBatchManager; + ArrayManager m_arrayManager; Managers m_managerList; }; diff --git a/src/nvcv_types/priv/HandleTraits.hpp b/src/nvcv_types/priv/HandleTraits.hpp index 246ff3a2..cba0036a 100644 --- a/src/nvcv_types/priv/HandleTraits.hpp +++ b/src/nvcv_types/priv/HandleTraits.hpp @@ -56,6 +56,12 @@ struct HandleTraits constexpr static bool hasManager = true; }; +template<> +struct HandleTraits +{ + constexpr static bool hasManager = true; +}; + template constexpr bool HasObjManager = HandleTraits::hasManager; diff --git a/src/nvcv_types/priv/IArray.hpp b/src/nvcv_types/priv/IArray.hpp index caab7e95..0b9127bd 100644 --- a/src/nvcv_types/priv/IArray.hpp +++ b/src/nvcv_types/priv/IArray.hpp @@ -42,6 +42,8 @@ class IArray : public ICoreObjectHandle virtual NVCVResourceType target() const = 0; virtual void exportData(NVCVArrayData &data) const = 0; + + virtual void resize(int64_t length) = 0; }; template<> diff --git a/src/nvcv_types/priv/IContext.hpp b/src/nvcv_types/priv/IContext.hpp index e1a14ebe..309d77a6 100644 --- a/src/nvcv_types/priv/IContext.hpp +++ b/src/nvcv_types/priv/IContext.hpp @@ -28,19 +28,20 @@ namespace nvcv::priv { template class CoreObjManager; -using ImageManager = CoreObjManager; -using ImageBatchManager = CoreObjManager; -using TensorManager = CoreObjManager; -using ArrayManager = CoreObjManager; -using AllocatorManager = CoreObjManager; +using ImageManager = CoreObjManager; +using ImageBatchManager = CoreObjManager; +using TensorManager = CoreObjManager; +using TensorBatchManager = CoreObjManager; +using ArrayManager = CoreObjManager; +using AllocatorManager = CoreObjManager; class IAllocator; class IContext { public: - using Managers - = std::tuple; + using Managers = std::tuple; template CoreObjManager &manager() diff --git a/src/nvcv_types/priv/ITensorBatch.hpp b/src/nvcv_types/priv/ITensorBatch.hpp new file mode 100644 index 00000000..14be328b --- /dev/null +++ b/src/nvcv_types/priv/ITensorBatch.hpp @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_CORE_PRIV_ITENSORBATCH_HPP +#define NVCV_CORE_PRIV_ITENSORBATCH_HPP + +#include "ICoreObject.hpp" +#include "SharedCoreObj.hpp" + +#include + +namespace nvcv::priv { + +class IAllocator; + +class ITensorBatch : public ICoreObjectHandle +{ +public: + virtual int32_t capacity() const = 0; + virtual int32_t rank() const = 0; + virtual NVCVDataType dtype() const = 0; + virtual int32_t numTensors() const = 0; + virtual NVCVTensorLayout layout() const = 0; + virtual NVCVTensorBufferType type() const = 0; + + virtual SharedCoreObj alloc() const = 0; + + virtual void clear() = 0; + + virtual void pushTensors(const NVCVTensorHandle *tensors, int32_t numTensors) = 0; + + virtual void popTensors(int32_t numTensors) = 0; + + virtual void getTensors(int32_t index, NVCVTensorHandle *tensors, int32_t numTensors) const = 0; + + virtual void setTensors(int32_t index, const NVCVTensorHandle *tensors, int32_t numTensors) = 0; + + virtual void exportData(CUstream stream, NVCVTensorBatchData &data) = 0; +}; + +template<> +class CoreObjManager : public HandleManager +{ + using Base = HandleManager; + +public: + using Base::Base; +}; + +} // namespace nvcv::priv + +#endif // NVCV_CORE_PRIV_TENSORBATCH_HPP diff --git a/src/nvcv_types/priv/TensorBatch.cpp b/src/nvcv_types/priv/TensorBatch.cpp new file mode 100644 index 00000000..356bdbe6 --- /dev/null +++ b/src/nvcv_types/priv/TensorBatch.cpp @@ -0,0 +1,339 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TensorBatch.hpp" + +#include "Requirements.hpp" +#include "TensorBatchManager.hpp" + +#include +#include + +namespace nvcv::priv { + +TensorBatch::TensorBatch(const NVCVTensorBatchRequirements &reqs, IAllocator &alloc) + : m_alloc(alloc) + , m_reqs(reqs) + , m_dirtyBegin(0) + , m_dirtyEnd(0) + , m_dtype(NVCV_DATA_TYPE_NONE) + , m_layout(NVCV_TENSOR_LAYOUT_MAKE("")) + , m_rank(-1) + , m_userPointer(nullptr) +{ + m_evPostFence = nullptr; + m_devTensorsBuffer = nullptr; + m_pinnedTensorsBuffer = nullptr; + m_Tensors = nullptr; + + int64_t bufferSize = m_reqs.capacity * sizeof(BatchElement); + + try + { + m_devTensorsBuffer = static_cast(m_alloc->allocCudaMem(bufferSize, m_reqs.alignBytes)); + NVCV_ASSERT(m_devTensorsBuffer != nullptr); + + m_pinnedTensorsBuffer = static_cast(m_alloc->allocHostPinnedMem(bufferSize, m_reqs.alignBytes)); + NVCV_ASSERT(m_pinnedTensorsBuffer != nullptr); + + m_Tensors = static_cast(m_alloc->allocHostMem(bufferSize, m_reqs.alignBytes)); + NVCV_ASSERT(m_Tensors != nullptr); + + NVCV_CHECK_THROW(cudaEventCreateWithFlags(&m_evPostFence, cudaEventDisableTiming)); + } + catch (...) 
+ { + cleanUp(); + throw; + } +} + +NVCVTensorBatchRequirements TensorBatch::CalcRequirements(int32_t capacity) +{ + NVCVTensorBatchRequirements reqs; + reqs.capacity = capacity; + reqs.mem = {}; + + reqs.alignBytes = alignof(BatchElement); + reqs.alignBytes = util::RoundUpNextPowerOfTwo(reqs.alignBytes); + + if (reqs.alignBytes > NVCV_MAX_MEM_REQUIREMENTS_BLOCK_SIZE) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Alignment requirement of %d is larger than the maximum allowed %ld", reqs.alignBytes, + NVCV_MAX_MEM_REQUIREMENTS_BLOCK_SIZE); + } + + AddBuffer(reqs.mem.cudaMem, capacity * sizeof(BatchElement), reqs.alignBytes); + AddBuffer(reqs.mem.hostPinnedMem, capacity * sizeof(BatchElement), reqs.alignBytes); + AddBuffer(reqs.mem.hostMem, capacity * sizeof(BatchElement), reqs.alignBytes); + + return reqs; +} + +TensorBatch::~TensorBatch() +{ + cleanUp(); +} + +void TensorBatch::cleanUp() +{ + if (m_evPostFence) + { + NVCV_CHECK_LOG(cudaEventDestroy(m_evPostFence)); + } + + for (int i = 0; i < m_numTensors; ++i) + { + CoreObjectDecRef(m_Tensors[i]); + } + + int64_t bufferSize = m_reqs.capacity * sizeof(BatchElement); + + m_alloc->freeCudaMem(m_devTensorsBuffer, bufferSize, m_reqs.alignBytes); + m_alloc->freeHostPinnedMem(m_pinnedTensorsBuffer, bufferSize, m_reqs.alignBytes); + m_alloc->freeHostMem(m_Tensors, bufferSize, m_reqs.alignBytes); +} + +void TensorBatch::exportData(CUstream stream, NVCVTensorBatchData &data) +{ + if (m_dirtyBegin < m_dirtyEnd) + { + // Block until the previous call to exportData finishes the buffer copy. + NVCV_CHECK_THROW(cudaEventSynchronize(m_evPostFence)); + + for (auto i = m_dirtyBegin; i < m_dirtyEnd; ++i) + { + auto &t = ToStaticRef(m_Tensors[i]); + NVCVTensorData tdata; + t.exportData(tdata); + auto &element = m_pinnedTensorsBuffer[i]; + element.data = tdata.buffer.strided.basePtr; + for (int d = 0; d < tdata.rank; ++d) + { + element.shape[d] = tdata.shape[d]; + element.stride[d] = tdata.buffer.strided.strides[d]; + } + } + + int64_t copySize = (m_dirtyEnd - m_dirtyBegin) * sizeof(BatchElement); + NVCV_CHECK_THROW(cudaMemcpyAsync(m_devTensorsBuffer + m_dirtyBegin, m_pinnedTensorsBuffer + m_dirtyBegin, + copySize, cudaMemcpyHostToDevice, stream)); + + // Signal the buffer copy is finished. 
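+        // Recording the event after the copy lets the next exportData() call, which does
+        // cudaEventSynchronize(m_evPostFence) before touching the pinned staging buffer,
+        // know when that buffer can be safely rewritten.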
+ NVCV_CHECK_THROW(cudaEventRecord(m_evPostFence, stream)); + m_dirtyBegin = m_dirtyEnd; + } + NVCVTensorBatchBuffer buffer; + buffer.strided = NVCVTensorBatchBufferStrided{m_devTensorsBuffer}; + data.buffer = buffer; + data.type = NVCV_TENSOR_BUFFER_STRIDED_CUDA; + data.rank = m_rank; + data.dtype = m_dtype; + data.layout = m_layout; + data.numTensors = m_numTensors; +} + +void TensorBatch::validateTensors(const NVCVTensorHandle *tensors, int32_t numTensors) +{ + for (int32_t i = 0; i < numTensors; ++i) + { + auto &t = ToStaticRef(tensors[i]); + if (m_rank != -1 && t.rank() != m_rank) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Trying to add a tensor to a tensor batch with an inconsistent rank."); + } + if (t.dtype().value() != m_dtype) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Trying to add a tensor to a tensor batch with an inconsistent type."); + } + if (nvcvTensorLayoutCompare(t.layout(), m_layout) != 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Trying to add a tensor to a tensor batch with an inconsistent layout."); + } + } +} + +void TensorBatch::setLayoutAndDType(const NVCVTensorHandle *tensors, int32_t numTensors) +{ + if (numTensors > 0 && m_numTensors == 0) + { + auto &t = ToStaticRef(tensors[0]); + m_rank = t.rank(); + m_dtype = t.dtype().value(); + m_layout = t.layout(); + } +} + +void TensorBatch::pushTensors(const NVCVTensorHandle *tensors, int32_t numTensors) +{ + if (numTensors == 0) + { + return; + } + if (numTensors < 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) << "Number of tensors cannot be nagative: " << numTensors; + } + if (m_numTensors + numTensors > capacity()) + { + throw Exception(NVCV_ERROR_OVERFLOW) + << "Adding " << numTensors << " tensors to a tensor batch would exceed its capacity (" << capacity() + << ") by " << m_numTensors + numTensors - capacity(); + } + setLayoutAndDType(tensors, numTensors); + validateTensors(tensors, numTensors); + for (int32_t i = 0; i < numTensors; ++i) + { + CoreObjectIncRef(tensors[i]); + m_Tensors[m_numTensors + i] = tensors[i]; + } + if (m_dirtyEnd == m_dirtyBegin) + { + m_dirtyBegin = m_numTensors; + } + m_numTensors += numTensors; + m_dirtyEnd = m_numTensors; +} + +void TensorBatch::popTensors(int32_t numTensors) +{ + if (numTensors < 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) << "Trying to pop a negative number of tensors: " << numTensors; + } + if (numTensors > m_numTensors) + { + throw Exception(NVCV_ERROR_UNDERFLOW) + << "Trying to pop " << numTensors << " tensors from a tensor batch with " << m_numTensors << " tensors."; + } + for (int i = m_numTensors - numTensors; i < m_numTensors; ++i) + { + CoreObjectDecRef(m_Tensors[i]); + } + m_numTensors -= numTensors; + m_dirtyEnd = std::min(m_dirtyEnd, m_numTensors); + m_dirtyBegin = std::min(m_dirtyBegin, m_dirtyEnd); + if (m_numTensors == 0) + { + m_dtype = NVCV_DATA_TYPE_NONE; + m_layout = NVCV_TENSOR_LAYOUT_MAKE(""); + m_rank = -1; + } +} + +void TensorBatch::getTensors(int32_t index, NVCVTensorHandle *tensors, int32_t numTensors) const +{ + if (index + numTensors > m_numTensors) + { + throw Exception(NVCV_ERROR_OVERFLOW) << "Trying to get a tensor on index " << index + numTensors + << " while the tensor batch contains only " << m_numTensors << " tensors."; + } + if (index < 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) << "Trying to get a tensor with negative index: " << index; + } + std::copy(m_Tensors + index, m_Tensors + index + numTensors, tensors); + for (int i = 0; i < numTensors; ++i) + { + 
CoreObjectIncRef(tensors[i]); + } +} + +void TensorBatch::setTensors(int32_t index, const NVCVTensorHandle *tensors, int32_t numTensors) +{ + if (index + numTensors > m_numTensors) + { + throw Exception(NVCV_ERROR_OVERFLOW) << "Trying to set a tensor on index " << index + numTensors + << " while the tensor batch contains only " << m_numTensors << " tensors."; + } + if (index < 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) << "Trying to set a tensor with negative index: " << index; + } + validateTensors(tensors, numTensors); + for (int32_t idx = 0; idx < numTensors; ++idx) + { + CoreObjectDecRef(m_Tensors[idx + index]); + CoreObjectIncRef(tensors[idx]); + m_Tensors[idx + index] = tensors[idx]; + } + if (m_dirtyBegin != m_dirtyEnd) + { + m_dirtyBegin = std::min(m_dirtyBegin, index); + m_dirtyEnd = std::max(m_dirtyEnd, index + numTensors); + } + else + { + m_dirtyBegin = index; + m_dirtyEnd = m_dirtyBegin + numTensors; + } +} + +SharedCoreObj TensorBatch::alloc() const +{ + return m_alloc; +} + +int32_t TensorBatch::capacity() const +{ + return m_reqs.capacity; +} + +int32_t TensorBatch::rank() const +{ + return m_rank; +} + +int32_t TensorBatch::numTensors() const +{ + return m_numTensors; +} + +NVCVDataType TensorBatch::dtype() const +{ + return m_dtype; +} + +NVCVTensorLayout TensorBatch::layout() const +{ + return m_layout; +} + +NVCVTensorBufferType TensorBatch::type() const +{ + return BUFFER_TYPE; +} + +void TensorBatch::clear() +{ + for (int i = 0; i < m_numTensors; ++i) + { + CoreObjectDecRef(m_Tensors[i]); + } + m_numTensors = 0; + m_dirtyBegin = 0; + m_dirtyEnd = 0; + m_dtype = NVCV_DATA_TYPE_NONE; + m_layout = NVCV_TENSOR_LAYOUT_MAKE(""); + m_rank = -1; +} + +} // namespace nvcv::priv diff --git a/src/nvcv_types/priv/TensorBatch.hpp b/src/nvcv_types/priv/TensorBatch.hpp new file mode 100644 index 00000000..16a41f53 --- /dev/null +++ b/src/nvcv_types/priv/TensorBatch.hpp @@ -0,0 +1,107 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_CORE_PRIV_TENSORBATCH_HPP +#define NVCV_CORE_PRIV_TENSORBATCH_HPP + +#include "DataType.hpp" +#include "IAllocator.hpp" +#include "ITensorBatch.hpp" +#include "SharedCoreObj.hpp" +#include "Tensor.hpp" + +#include + +namespace nvcv::priv { + +class TensorBatch final : public CoreObjectBase +{ +public: + using BatchElement = NVCVTensorBatchElementStrided; + static const NVCVTensorBufferType BUFFER_TYPE = NVCV_TENSOR_BUFFER_STRIDED_CUDA; + + static NVCVTensorBatchRequirements CalcRequirements(int32_t capacity); + + TensorBatch(const NVCVTensorBatchRequirements &reqs, IAllocator &alloc); + + ~TensorBatch(); + + SharedCoreObj alloc() const override; + + int32_t capacity() const override; + + NVCVDataType dtype() const override; + + NVCVTensorLayout layout() const override; + + int32_t numTensors() const override; + + NVCVTensorBufferType type() const override; + + int32_t rank() const override; + + void clear() override; + + void exportData(CUstream stream, NVCVTensorBatchData &data) override; + + void pushTensors(const NVCVTensorHandle *tensors, int32_t numTensors) override; + + void popTensors(int32_t numTensors) override; + + void getTensors(int32_t index, NVCVTensorHandle *tensors, int32_t numTensors) const override; + + void setTensors(int32_t index, const NVCVTensorHandle *tensors, int32_t numTensors) override; + +private: + SharedCoreObj m_alloc; + NVCVTensorBatchRequirements m_reqs; + + // Dirty begin and end describe a range containing all the tensors that have been modified + // since the previous exportData call and thus should be updated in the exported buffer. + int32_t m_dirtyBegin; + int32_t m_dirtyEnd; + + int32_t m_numTensors = 0; + + NVCVTensorHandle *m_Tensors; // host buffer for tensor handles + // Pinned buffer for the tensor data descriptors + // It's updated every time the user updates the tensor batch. + // Changes are tracked with the m_dirty flags. + NVCVTensorBatchElementStrided *m_pinnedTensorsBuffer; + // Device buffer for the tensor data descriptors. + // It's updated and returned when the exportData method is called. + NVCVTensorBatchElementStrided *m_devTensorsBuffer; + + NVCVDataType m_dtype; + NVCVTensorLayout m_layout; + int32_t m_rank; + + // TODO: must be retrieved from the resource allocator; + cudaEvent_t m_evPostFence; + + void *m_userPointer; + + void cleanUp(); + + void validateTensors(const NVCVTensorHandle *tensors, int32_t numTensors); + + void setLayoutAndDType(const NVCVTensorHandle *tensors, int32_t numTensors); +}; + +} // namespace nvcv::priv + +#endif // NVCV_CORE_PRIV_TENSORBATCH_HPP diff --git a/src/nvcv_types/priv/TensorBatchManager.hpp b/src/nvcv_types/priv/TensorBatchManager.hpp new file mode 100644 index 00000000..d9082a31 --- /dev/null +++ b/src/nvcv_types/priv/TensorBatchManager.hpp @@ -0,0 +1,36 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_PRIV_CORE_TENSORBATCHMANAGER_HPP +#define NVCV_PRIV_CORE_TENSORBATCHMANAGER_HPP + +#include "IContext.hpp" +#include "TensorBatch.hpp" + +namespace nvcv::priv { + +using TensorBatchManager = CoreObjManager; + +template<> +struct ResourceStorage +{ + using type = CompatibleStorage; +}; + +} // namespace nvcv::priv + +#endif // NVCV_PRIV_CORE_TENSORBATCHMANAGER_HPP diff --git a/src/nvcv_types/priv/TensorData.cpp b/src/nvcv_types/priv/TensorData.cpp index 142ad714..68edc8ce 100644 --- a/src/nvcv_types/priv/TensorData.cpp +++ b/src/nvcv_types/priv/TensorData.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,6 +23,8 @@ #include +#include + namespace nvcv::priv { NVCVTensorLayout GetTensorLayoutFor(ImageFormat fmt, int nbatches) @@ -186,4 +188,164 @@ void FillTensorData(IImage &img, NVCVTensorData &tensorData) tensorStrided.basePtr = imgStrided.planes[0].basePtr; } +/** + * @brief Simplifies a shape by collapsing dimensions that are not strided + * + * @param[in] rank number of dimensions + * @param[in] shape + * @param[in] stride + * @param[out] out_shape + * @param[out] out_strides + * @return int out_rank + */ +static int Simplify(int rank, int64_t *shape, int64_t *stride, int64_t *out_shape, int64_t *out_strides) +{ + if (rank <= 1) // Nothing to simplify + { + if (rank == 1) + { + out_shape[0] = shape[0]; + out_strides[0] = stride[0]; + } + return rank; + } + + int out_rank = 0; + int64_t vol = shape[0]; + for (int d = 1; d < rank; d++) + { + if (stride[d - 1] != shape[d] * stride[d]) + { + out_strides[out_rank] = stride[d - 1]; + out_shape[out_rank] = vol; + vol = shape[d]; + out_rank++; + } + else + { + vol *= shape[d]; + } + } + out_strides[out_rank] = stride[rank - 1]; + out_shape[out_rank] = vol; + out_rank++; + return out_rank; +} + +/** + * @brief Reshapes a simplified shape (non-strided dimensions are collapsed) to a target shape if possible. + * Calculates the output strides. 
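+ *
+ * Worked example: an input simplified to shape (307200, 3) with strides (4, 1), e.g. a
+ * (480, 640, 3) tensor with strides (2560, 4, 1) collapsed by Simplify, can be reshaped
+ * to the target shape (480, 640, 3); the strides computed for the target are (2560, 4, 1).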
+ * + * @param[in] in_rank + * @param[in] in_shape + * @param[in] in_strides + * @param[in] target_rank + * @param[in] target_shape + * @param[out] out_strides + * + * @return true if reshape is possible, false otherwise + */ +static bool ReshapeSimplified(int in_rank, const int64_t *in_shape, const int64_t *in_strides, int target_rank, + const int64_t *target_shape, int64_t *out_strides) +{ + int i = 0, j = 0; + for (; i < in_rank && j < target_rank; i++) + { + int64_t in_e = in_shape[i]; + int64_t out_v = 1; + int group_start = j; + while (j < target_rank && (out_v * target_shape[j]) <= in_e) out_v *= target_shape[j++]; + + if (out_v != in_e) + return false; // reshape is not possible + + int64_t s = in_strides[i]; + for (int d = j - 1; d >= group_start; d--) + { + out_strides[d] = s; + s *= target_shape[d]; + } + } + return true; +} + +static std::string ShapeStr(int rank, const int64_t *sh) +{ + std::stringstream ss; + ss << "("; + for (int d = 0; d < rank; d++) + { + if (d > 0) + ss << ", "; + ss << sh[d]; + } + ss << ")"; + return ss.str(); +} + +void ReshapeTensorData(NVCVTensorData &tensor_data, int new_rank, const int64_t *new_shape, NVCVTensorLayout new_layout) +{ + int64_t old_volume = 1; + for (int d = 0; d < tensor_data.rank; d++) old_volume *= tensor_data.shape[d]; + + // TODO: Add 0D tensor support, once it's supported accross the board + if (new_rank < 1 || new_rank > NVCV_TENSOR_MAX_RANK) + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) + << "Number of dimensions must be between 1 and " << NVCV_TENSOR_MAX_RANK << ", not " << new_rank; + + int64_t new_volume = 1; + for (int d = 0; d < new_rank; d++) new_volume *= new_shape[d]; + + if (new_volume != old_volume) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) + << "The volume (" << new_volume << ") of the provided shape " << ShapeStr(new_rank, new_shape) + << " does not match the size of the array (" << old_volume << ")"; + } + + // layout ------------ + if (new_layout.rank > 0) + { + if (new_layout.rank != new_rank) + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) + << "The number of dimensions of the provided layout and shape do not match. Got a " + "shape with " + << new_rank << " dimensions and a layout with " << new_layout.rank << " dimensions"; + } + tensor_data.layout = new_layout; + + // Check strides ------------ + + // right now is the only option supported + assert(tensor_data.bufferType == NVCV_TENSOR_BUFFER_STRIDED_CUDA); + + // Collapses non-strided dimensions into groups + // Example 1: + // A tensor with shape (480, 640, 3) and strides (2560, 4, 1) + // will be collapsed into (307200, 3) with strides (4, 1). + // Example 2: + // A tensor with shape (480, 640, 3) and strides (2560, 3, 1) + // will be collapsed into (921600,) with strides (1,). + int64_t simplified_shape[NVCV_TENSOR_MAX_RANK]; + int64_t simplified_strides[NVCV_TENSOR_MAX_RANK]; + int simplified_rank = Simplify(tensor_data.rank, tensor_data.shape, tensor_data.buffer.strided.strides, + simplified_shape, simplified_strides); + + // Calculate output strides (if reshape is possible) or throw an error + bool ret = ReshapeSimplified(simplified_rank, simplified_shape, simplified_strides, new_rank, new_shape, + tensor_data.buffer.strided.strides); + if (!ret) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) + << "Cannot reshape" + << ". 
Original shape: " << ShapeStr(tensor_data.rank, tensor_data.shape) + << ", Strides: " << ShapeStr(tensor_data.rank, tensor_data.buffer.strided.strides) + << ", Target shape: " << ShapeStr(new_rank, new_shape); + } + + // Set the new shape to the tensor data + tensor_data.rank = new_rank; + for (int d = 0; d < tensor_data.rank; d++) tensor_data.shape[d] = new_shape[d]; +} + } // namespace nvcv::priv diff --git a/src/nvcv_types/priv/TensorData.hpp b/src/nvcv_types/priv/TensorData.hpp index c131c2e8..6ad85915 100644 --- a/src/nvcv_types/priv/TensorData.hpp +++ b/src/nvcv_types/priv/TensorData.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,6 +30,9 @@ NVCVTensorLayout GetTensorLayoutFor(ImageFormat fmt, int nbatches); void FillTensorData(IImage &img, NVCVTensorData &data); +void ReshapeTensorData(NVCVTensorData &tensor_data, int new_rank, const int64_t *new_shape, + NVCVTensorLayout new_layout); + } // namespace nvcv::priv #endif // NVCV_CORE_PRIV_TENSORDATA_HPP diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index ffe90225..a889a14a 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -184,6 +184,9 @@ add_library(nvcv_util String.cpp Version.cpp TensorDataUtils.cpp + Event.cpp + Stream.cpp + StreamId.cpp ) target_include_directories(nvcv_util diff --git a/src/util/Event.cpp b/src/util/Event.cpp new file mode 100644 index 00000000..65ca3123 --- /dev/null +++ b/src/util/Event.cpp @@ -0,0 +1,56 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Event.hpp" + +#include "CheckError.hpp" + +#include + +namespace nvcv::util { + +CudaEvent CudaEvent::Create(int deviceId) +{ + return CreateWithFlags(cudaEventDisableTiming, deviceId); +} + +CudaEvent CudaEvent::CreateWithFlags(unsigned flags, int deviceId) +{ + cudaEvent_t event; + int prevDev = -1; + if (deviceId >= 0) + { + NVCV_CHECK_THROW(cudaGetDevice(&prevDev)); + NVCV_CHECK_THROW(cudaSetDevice(deviceId)); + } + auto err = cudaEventCreateWithFlags(&event, flags); + if (prevDev >= 0) + NVCV_CHECK_THROW(cudaSetDevice(prevDev)); + NVCV_CHECK_THROW(err); + return CudaEvent(event); +} + +void CudaEvent::DestroyHandle(cudaEvent_t event) +{ + auto err = cudaEventDestroy(event); + if (err != cudaSuccess && err != cudaErrorCudartUnloading) + { + NVCV_CHECK_THROW(err); + } +} + +} // namespace nvcv::util diff --git a/src/util/Event.hpp b/src/util/Event.hpp new file mode 100644 index 00000000..64e8adc5 --- /dev/null +++ b/src/util/Event.hpp @@ -0,0 +1,59 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef NVCV_UTIL_CUDA_EVENT_H_
+#define NVCV_UTIL_CUDA_EVENT_H_
+
+#include "UniqueHandle.hpp"
+
+#include
+
+#include
+
+namespace nvcv::util {
+
+/**
+ * @brief A wrapper class for a CUDA event handle (cudaEvent_t).
+ *
+ * The purpose of this class is to provide safe ownership and lifecycle management
+ * for CUDA event handles.
+ * The event object may be created using the factory functions @ref Create and @ref CreateWithFlags.
+ *
+ * The object may also assume ownership of a pre-existing handle via constructor or
+ * @link UniqueHandle::reset(handle_type) reset @endlink function.
+ */
+class CudaEvent : public UniqueHandle<cudaEvent_t, CudaEvent>
+{
+public:
+    NVCV_INHERIT_UNIQUE_HANDLE(cudaEvent_t, CudaEvent)
+    constexpr CudaEvent() = default;
+
+    /** @brief Creates an event on the specified device (or current device, if deviceId < 0) */
+    static CudaEvent Create(int deviceId = -1);
+
+    /** @brief Creates an event with specific flags on the specified device
+     * (or the current device, if deviceId < 0)
+     */
+    static CudaEvent CreateWithFlags(unsigned flags, int deviceId = -1);
+
+    /** @brief Calls cudaEventDestroy on the handle. */
+    static void DestroyHandle(cudaEvent_t);
+};
+
+} // namespace nvcv::util
+
+#endif // NVCV_UTIL_CUDA_EVENT_H_
diff --git a/src/util/PerStreamCache.hpp b/src/util/PerStreamCache.hpp
new file mode 100644
index 00000000..ce7bf460
--- /dev/null
+++ b/src/util/PerStreamCache.hpp
@@ -0,0 +1,265 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef NVCV_UTIL_PER_STREAM_CACHE_HPP +#define NVCV_UTIL_PER_STREAM_CACHE_HPP + +#include "CheckError.hpp" +#include "Event.hpp" +#include "SimpleCache.hpp" +#include "StreamId.hpp" + +#include +#include +#include +#include +#include + +namespace nvcv::util { + +class EventCache : public nvcv::util::SimpleCache +{ +public: + CudaEvent get() + { + return getOrCreate([]() { return CudaEvent::Create(); }); + } +}; + +template +auto StreamCachePayloadReady(const Payload &payload) +{ + return payload.ready; +} + +template +auto StreamCachePayloadSize(const Payload &payload) +{ + return payload.size; +} + +template +auto StreamCachePayloadAlignment(const Payload &payload) +{ + return payload.alignment; +} + +namespace detail { + +/** Cache item - stores a payload in a bidirectional list item. + * + * @tparam Payload The payload of the cache item. It must have the followin fields: + * size_t size + * cudaEvent_t ready + */ +template +struct StreamCacheItem +{ + StreamCacheItem *next = nullptr, *prev = nullptr; + + mutable bool wasReady = false; + + Payload payload{}; + + /** Gets a CUDA event that signifies that the payload is ready. + */ + cudaEvent_t readyEvent() const + { + return StreamCachePayloadReady(payload); + } + + size_t payloadSize() const + { + return StreamCachePayloadSize(payload); + } + + bool isReady() const + { + if (wasReady) + return true; + if (auto ev = readyEvent()) + { + auto err = cudaEventQuery(ev); + if (err == cudaErrorNotReady) + return false; + NVCV_CHECK_THROW(err); + } + wasReady = true; + return true; + } +}; + +template> +class StreamCacheItemAllocator +{ +public: + using item_t = Item; + + ~StreamCacheItemAllocator() + { + assert(m_allocated == 0); + while (m_head) + { + auto *next = m_head->next; + delete m_head; + m_head = next; + } + } + + item_t *allocate() + { + if (auto *p = m_head) + { + m_head = p->next; + p->next = nullptr; + assert(!p->prev); + m_allocated++; + m_free--; + + *p = {}; // clear the object + return p; + } + + auto *p = new item_t(); + m_allocated++; + return p; + } + + void deallocate(item_t *item) + { + if (!item) + return; + + assert(!item->next && !item->prev && "The item is still linked"); + item->payload = {}; + + item->next = m_head; + m_head = item; + m_allocated--; + m_free++; + } + +private: + item_t *m_head = nullptr; + + size_t m_allocated = 0, m_free = 0; +}; + +template> +class StreamOrderedCache +{ +public: + using item_t = Item; + + explicit StreamOrderedCache(StreamCacheItemAllocator *itemAlloc) + : m_itemAlloc(itemAlloc) + { + } + + ~StreamOrderedCache() + { + waitAndPurge(); + } + + void waitAndPurge(); + + template + void removeAllReady(PayloadCallback callback); + + item_t *findNewestReady(); + + void put(Payload &&payload); + + bool empty() const + { + return m_bySize.empty(); + } + + template + std::optional getIf(size_t minSize, Predicate &&pred); + + std::optional get(size_t minSize, size_t minAlignment) + { + return getIf( + minSize, [=](const Payload &p) + { return StreamCachePayloadSize(p) >= minSize && StreamCachePayloadAlignment(p) >= minAlignment; }); + } + +private: + void insert(item_t *item); + + void remove(size_t payloadSize, item_t *item) noexcept; + + StreamCacheItemAllocator *m_itemAlloc; + + std::set> m_bySize; + + item_t *m_head = nullptr, *m_tail = nullptr; +}; + +} // namespace detail + +template> +class PerStreamCache +{ + using StreamOrderedCache = detail::StreamOrderedCache; + +public: + template + std::optional getIf(size_t minSize, Predicate &&pred, std::optional stream); + + auto 
get(size_t minSize, size_t minAlignment, std::optional stream) + { + return getIf( + minSize, + [=](const Payload &p) + { return StreamCachePayloadSize(p) >= minSize && StreamCachePayloadAlignment(p) >= minAlignment; }, + stream); + } + + void put(Payload &&payload, std::optional stream); + + void purge() + { + std::lock_guard g(m_lock); + for (auto &[k, v] : m_perStreamCache) v.waitAndPurge(); + m_globalCache.clear(); + } + +private: + template + std::optional tryGetPerStream(size_t minSize, Predicate &&pred, cudaStream_t stream); + + template + std::optional tryGetGlobal(size_t minSize, Predicate &&pred); + + int moveReadyToGlobal(); + + detail::StreamCacheItemAllocator m_cacheItemAlloc; + + std::unordered_map m_perStreamCache; + + std::multimap m_globalCache; + + std::mutex m_lock; +}; + +} // namespace nvcv::util + +#include "PerStreamCacheImpl.hpp" + +#endif // NVCV_UTIL_PER_STREAM_CACHE_HPP diff --git a/src/util/PerStreamCacheImpl.hpp b/src/util/PerStreamCacheImpl.hpp new file mode 100644 index 00000000..8a152562 --- /dev/null +++ b/src/util/PerStreamCacheImpl.hpp @@ -0,0 +1,330 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_PER_STREAM_CACHE_IMPL_HPP +#define NVCV_UTIL_PER_STREAM_CACHE_IMPL_HPP + +#ifndef NVCV_UTIL_PER_STREAM_CACHE_HPP +# error "This file must not be included directly. Include StreamOrderedCache.hpp." +#endif + +namespace nvcv::util { +namespace detail { + +template +void StreamOrderedCache::waitAndPurge() +{ + bool ready = false; + size_t erased = 0; + while (m_tail) + { + if (!ready && m_tail->readyEvent()) + { + auto err = cudaEventSynchronize(m_tail->readyEvent()); + if (err != cudaErrorCudartUnloading) + NVCV_CHECK_THROW(err); + ready = true; + } + auto *curr = m_tail; + m_tail = m_tail->prev; + curr->prev = nullptr; + if (m_tail) + m_tail->next = nullptr; + m_itemAlloc->deallocate(curr); + erased++; + } + assert(erased == m_bySize.size()); + m_head = nullptr; + m_bySize.clear(); +} + +template +template +void StreamOrderedCache::removeAllReady(PayloadCallback callback) +{ + if (nvcv::util::IsCudaStreamIdHintUnambiguous()) + { + // If the stream id hint is unambiguous, we can find the newest item + // and all older items will naturally be ready. + + item_t *item = findNewestReady(); + // This item and all older items are ready + while (item) + { + item_t *prev = item->prev; + size_t payloadSize = item->payloadSize(); + callback(std::move(item->payload)); + remove(payloadSize, item); + item = prev; + } + } + else + { + // The system's stream id hint is ambiguous, so we may have a mixture + // of items actually coming from different streams. We need to + // chek them one by one, since the readiness order may be lost. 
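+        // (An ambiguous hint means GetCudaStreamIdHint typically falls back to the raw stream
+        // handle value, which can be reused by a later, different stream; one per-stream list
+        // may therefore hold work from several streams, and completion need not follow list order.)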
+ + item_t *item = m_tail; + while (item) + { + item_t *prev = item->prev; + if (item->isReady()) + { + size_t payloadSize = item->payloadSize(); + callback(std::move(item->payload)); + remove(payloadSize, item); + } + item = prev; + } + } +} + +template +auto StreamOrderedCache::findNewestReady() -> item_t * +{ + constexpr int kMaxItemsOnStack = 256; + item_t *tmp[kMaxItemsOnStack]; + item_t *sectionStart = m_tail; + // Process the items in blocks of up to kMaxItemsOnStack. On each block, a binary search is performed. + while (sectionStart) + { + if (sectionStart->isReady()) + return sectionStart; // everything elese is newer, hence also ready + + item_t *it = sectionStart->prev; // no point in re-checking the section start + int hi = 0; + for (; it && hi < kMaxItemsOnStack; hi++, it = it->prev) tmp[hi] = it; + + if (hi == 0) + return nullptr; + + // There are no ready elements in this range - move on + if (!tmp[hi - 1]->isReady()) + { + sectionStart = it; + continue; + } + + int lo = 0, m = (lo + hi) >> 1; + // After this loop, `m` is going to contain the index of the newest ready element + while (lo < hi) // exclusive upper bound + { + // halfway element is ready - maybe there are newer ones that are ready, too? + if (tmp[m]->isReady()) + { + hi = m; // exclusive upper bound + m = (lo + hi) >> 1; + } + else // halfway element isn't ready - move to `m+1` as a potential lower bound + { + lo = m + 1; + m = (lo + hi) >> 1; + } + } + assert(0 <= m && m <= hi); + assert(tmp[m]->wasReady); + return tmp[m]; + } + return nullptr; +} + +template +void StreamOrderedCache::put(Payload &&payload) +{ + item_t *item = m_itemAlloc->allocate(); + item->payload = std::move(payload); + payload = {}; + try + { + insert(item); + } + catch (...) + { + m_itemAlloc->deallocate(item); + throw; + } +} + +template +template +std::optional StreamOrderedCache::getIf(size_t minSize, Predicate &&pred) +{ + auto it = m_bySize.lower_bound({minSize, nullptr}); + for (; it != m_bySize.end(); ++it) + { + auto *item = it->second; + if (pred(item->payload)) + { + size_t payloadSize = item->payloadSize(); + Payload ret = std::move(item->payload); + remove(payloadSize, item); + return ret; + } + } + return std::nullopt; +} + +template +void StreamOrderedCache::insert(item_t *item) +{ + auto inserted = m_bySize.insert({item->payloadSize(), item}); +#ifdef NDEBUG + (void)inserted; +#endif + assert(inserted.second); + + if (!m_tail) + { + assert(!m_head); + m_head = m_tail = item; + } + else + { + assert(!m_tail->next); + item->prev = m_tail; + m_tail->next = item; + m_tail = item; + } +} + +template +void StreamOrderedCache::remove(size_t payloadSize, item_t *item) noexcept +{ + if (item == m_head) + m_head = m_head->next; + if (item == m_tail) + m_tail = m_tail->prev; + + if (item->prev) + item->prev->next = item->next; + if (item->next) + item->next->prev = item->prev; + item->prev = item->next = nullptr; + + size_t erased = m_bySize.erase({payloadSize, item}); +#ifdef NDEBUG + (void)erased; +#endif + assert(erased == 1); + + m_itemAlloc->deallocate(item); +} + +} // namespace detail + +template +template +std::optional PerStreamCache::getIf(size_t minSize, Predicate &&pred, + std::optional stream) +{ + std::optional ret; + + std::lock_guard guard(m_lock); + + if (stream) + { + ret = tryGetPerStream(minSize, pred, *stream); + if (ret) + return ret; + } + + do + { + ret = tryGetGlobal(minSize, pred); + if (ret) + return ret; + } + while (moveReadyToGlobal()); + + return std::nullopt; +} + +template +template +std::optional 
PerStreamCache::tryGetPerStream(size_t size, Predicate &&pred, + cudaStream_t stream) +{ + uint64_t streamId = GetCudaStreamIdHint(stream); + auto it = m_perStreamCache.find(streamId); + if (it == m_perStreamCache.end()) + return std::nullopt; + return it->second.getIf(size, std::forward(pred)); +} + +template +template +std::optional PerStreamCache::tryGetGlobal(size_t size, Predicate &&pred) +{ + for (auto it = m_globalCache.lower_bound(size); it != m_globalCache.end(); ++it) + { + if (pred(it->second)) + { + Payload ret = std::move(it->second); + m_globalCache.erase(it); + return ret; + } + } + return std::nullopt; +} + +template +int PerStreamCache::moveReadyToGlobal() +{ + int moved = 0; + for (auto it = m_perStreamCache.begin(); it != m_perStreamCache.end();) + { + it->second.removeAllReady( + [&](Payload &&payload) + { + m_globalCache.emplace(StreamCachePayloadSize(payload), std::move(payload)); + moved++; + }); + if (it->second.empty()) + it = m_perStreamCache.erase(it); + else + ++it; + } + return moved; +} + +template +void PerStreamCache::put(Payload &&payload, std::optional stream) +{ + cudaEvent_t readyEvent = StreamCachePayloadReady(payload); + bool per_stream = readyEvent != nullptr && cudaEventQuery(readyEvent) == cudaErrorNotReady; + + std::lock_guard guard(m_lock); + + if (per_stream) + { + uint64_t id = stream ? GetCudaStreamIdHint(*stream) : (uint64_t)-1ll; + auto cacheIt = m_perStreamCache.find(id); + if (cacheIt == m_perStreamCache.end()) + cacheIt = m_perStreamCache.emplace(id, &m_cacheItemAlloc).first; + + cacheIt->second.put(std::move(payload)); + } + else + { + size_t size = StreamCachePayloadSize(payload); + m_globalCache.emplace(size, std::move(payload)); + } +} + +} // namespace nvcv::util + +#endif // NVCV_UTIL_PER_STREAM_CACHE_IMPL_HPP diff --git a/src/util/SimpleCache.hpp b/src/util/SimpleCache.hpp new file mode 100644 index 00000000..f9091c5a --- /dev/null +++ b/src/util/SimpleCache.hpp @@ -0,0 +1,137 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_SIMPLE_CACHE_HPP +#define NVCV_UTIL_SIMPLE_CACHE_HPP + +#include +#include +#include + +namespace nvcv::util { + +/** A simple cache that stores objects of type T + * + * The cache stores objects of type T. Upon a call to `Get`, an object + * is moved out from the cache and returned as `optional`. If the cache + * is empty, `nullopt` is returned. + * `GetOrCreate` alwas returns an object (unless it throws) - if the cache is empty, + * a user-provided factory function is invoked and a new object is returned. + * Objects can be placed in the cache with a call to `Put` or `Emplace`. + * + * The cache is guarded with a lockable object (by default std::mutex). + * + * The cache is implemented as a unidirectional list of entries. + * Each entry holds an optional instance of type T. 
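+ *
+ * A minimal usage sketch (illustrative only; the element type is arbitrary):
+ * @code
+ *   SimpleCache<std::vector<int>> cache;
+ *   cache.emplace(1024);                                                  // construct a vector of 1024 ints in the cache
+ *   auto v = cache.getOrCreate([] { return std::vector<int>(1024); });    // reuses the cached vector
+ *   cache.put(std::move(v));                                              // return it to the cache when done
+ * @endcode
+ *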
+ * Once an object is removed, the cache entry is stored for reuse in an auxiliary list, + * reducing the number of dynamic allocations. + * + * @tparam T The type of itmes held in the cache + * @tparam LockType A lockable object + */ +template +class SimpleCache +{ +public: + std::optional get() + { + if (m_items) + { + std::lock_guard lg(m_lock); + if (m_items) + { + auto tmp = std::move(m_items); + m_items = std::move(tmp->next); + auto obj = std::move(tmp->payload); + tmp->next = std::move(m_empty); + m_empty = std::move(tmp); + return obj; + } + } + return std::nullopt; + } + + template + T getOrCreate(CreateFunc &&create) + { + auto cached = get(); + if (cached.has_value()) + return std::move(cached).value(); + else + return create(); + } + + void put(T &&payload) + { + emplace(std::move(payload)); + } + + template + void emplace(Args &&...args) + { + std::lock_guard lg(m_lock); + + std::unique_ptr item; + if (m_empty) + { + item = std::move(m_empty); + m_empty = std::move(item->next); + } + else + { + item = std::make_unique(); + } + item->payload.emplace(std::forward(args)...); + + item->next = std::move(m_items); + m_items = std::move(item); + } + + void purge() + { + std::lock_guard lg(m_lock); + + m_items.reset(); + m_empty.reset(); + } + +private: + struct CacheItem + { + ~CacheItem() + { + // Iterate through all subsequent elements to avoid deep recursion + while (next) + { + // detach the chain from the `next` + auto tmp = std::move(next->next); + // this will delete the next + next = std::move(tmp); + } + } + + std::unique_ptr next; + std::optional payload; + }; + + std::unique_ptr m_items, m_empty; + LockType m_lock; +}; + +} // namespace nvcv::util + +#endif // NVCV_UTIL_SIMPLE_CACHE_HPP diff --git a/src/util/Stream.cpp b/src/util/Stream.cpp new file mode 100644 index 00000000..a2aaab12 --- /dev/null +++ b/src/util/Stream.cpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Stream.hpp" + +#include "CheckError.hpp" + +#include +#include + +namespace nvcv::util { + +CudaStream CudaStream::Create(bool nonBlocking, int deviceId) +{ + cudaStream_t stream; + int flags = nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault; + int prevDev = -1; + if (deviceId >= 0) + { + NVCV_CHECK_THROW(cudaGetDevice(&prevDev)); + NVCV_CHECK_THROW(cudaSetDevice(deviceId)); + } + auto err = cudaStreamCreateWithFlags(&stream, flags); + if (prevDev >= 0) + NVCV_CHECK_THROW(cudaSetDevice(prevDev)); + NVCV_CHECK_THROW(err); + return CudaStream(stream); +} + +CudaStream CudaStream::CreateWithPriority(bool nonBlocking, int priority, int deviceId) +{ + cudaStream_t stream; + int flags = nonBlocking ? 
cudaStreamNonBlocking : cudaStreamDefault; + int prevDev = -1; + if (deviceId >= 0) + { + NVCV_CHECK_THROW(cudaGetDevice(&prevDev)); + NVCV_CHECK_THROW(cudaSetDevice(deviceId)); + } + auto err = cudaStreamCreateWithPriority(&stream, flags, priority); + if (prevDev >= 0) + NVCV_CHECK_THROW(cudaSetDevice(prevDev)); + NVCV_CHECK_THROW(err); + return CudaStream(stream); +} + +void CudaStream::DestroyHandle(cudaStream_t stream) +{ + auto err = cudaStreamDestroy(stream); + if (err != cudaSuccess && err != cudaErrorCudartUnloading) + { + NVCV_CHECK_THROW(err); + } +} + +} // namespace nvcv::util diff --git a/src/util/Stream.hpp b/src/util/Stream.hpp new file mode 100644 index 00000000..44399a3b --- /dev/null +++ b/src/util/Stream.hpp @@ -0,0 +1,58 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_CUDA_STREAM_H_ +#define NVCV_UTIL_CUDA_STREAM_H_ + +#include "UniqueHandle.hpp" + +#include + +#include + +namespace nvcv::util { + +/** + * @brief A wrapper class for CUDA stream handle (cudaStream_t), + * + * The purpose of this class is to provide safe ownership and lifecycle management + * for CUDA stream handles. + * The stream object may be created using the factory functions @ref Create and + * @ref CreateWithPriority. + * + * The object may also assume ownership of a pre-existing handle via constructor or + * @link UniqueHandle::reset(handle_type) reset @endlink function. + */ +class CudaStream : public UniqueHandle +{ +public: + NVCV_INHERIT_UNIQUE_HANDLE(cudaStream_t, CudaStream) + + /// @brief Creates a stream on specified device (or current device, if device_id < 0) + static CudaStream Create(bool nonBlocking, int deviceId = -1); + + /// @brief Creates a stream with given priority on specified device + /// (or current device, if device_id < 0) + static CudaStream CreateWithPriority(bool nonBlocking, int priority, int deviceId = -1); + + /// @brief Calls cudaStreamDestroy on the handle. + static void DestroyHandle(cudaStream_t stream); +}; + +} // namespace nvcv::util + +#endif // NVCV_UTIL_CUDA_STREAM_H_ diff --git a/src/util/StreamId.cpp b/src/util/StreamId.cpp new file mode 100644 index 00000000..27fc0f7e --- /dev/null +++ b/src/util/StreamId.cpp @@ -0,0 +1,150 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
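
The `SimpleCache` and `CudaStream` utilities added above compose naturally: a cache of ready-made CUDA streams avoids repeated `cudaStreamCreate`/`cudaStreamDestroy` calls. The sketch below is illustrative only and not part of this patch — the include paths, the file-scope cache, and the `AcquireStream`/`RecycleStream` names are assumptions.

```cpp
#include "SimpleCache.hpp" // illustrative include paths; adjust to the actual build setup
#include "Stream.hpp"

#include <utility>

// Minimal sketch: reuse non-blocking CUDA streams through SimpleCache instead of
// creating and destroying a stream for every request.
static nvcv::util::SimpleCache<nvcv::util::CudaStream> g_streamCache;

nvcv::util::CudaStream AcquireStream()
{
    // Pops a cached stream if one is available, otherwise invokes the factory lambda.
    return g_streamCache.getOrCreate([] { return nvcv::util::CudaStream::Create(/*nonBlocking=*/true); });
}

void RecycleStream(nvcv::util::CudaStream &&stream)
{
    // Hands the stream back so a later AcquireStream() can reuse it.
    g_streamCache.put(std::move(stream));
}
```

Because `CudaStream` is a move-only handle wrapper, it round-trips through the cache without any extra reference counting.
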
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "StreamId.hpp" + +#include +#include +#include + +using cuStreamGetId_t = CUresult(CUstream, unsigned long long *); + +#if CUDA_VERSION >= 12000 + +namespace { + +cuStreamGetId_t *_cuStreamGetId = cuStreamGetId; + +bool _hasPreciseHint() +{ + return true; +} + +} // namespace + +#else + +# include +# include +# include + +namespace { + +inline int getTID() +{ + return syscall(SYS_gettid); +} + +constexpr uint64_t MakeLegacyStreamId(int dev, int tid) +{ + return (uint64_t)dev << 32 | tid; +} + +CUresult cuStreamGetIdFallback(CUstream stream, unsigned long long *id) +{ + // If the stream handle is a pseudohandle, use some special treatment.... + if (stream == 0 || stream == CU_STREAM_LEGACY || stream == CU_STREAM_PER_THREAD) + { + int dev = -1; + if (cudaGetDevice(&dev) != cudaSuccess) + return CUDA_ERROR_INVALID_CONTEXT; + // If we use a per-thread stream, get TID; otherwise use -1 as a pseudo-tid + *id = MakeLegacyStreamId(dev, stream == CU_STREAM_PER_THREAD ? getTID() : -1); + return CUDA_SUCCESS; + } + else + { + // Otherwise just use the handle - it's not perfactly safe, but should do. + *id = (uint64_t)stream; + return CUDA_SUCCESS; + } +} + +cuStreamGetId_t *getRealStreamIdFunc() +{ + static cuStreamGetId_t *fn = []() + { + void *sym = nullptr; + // If it fails, we'll just return nullptr. + (void)cuGetProcAddress("cuStreamGetId", &sym, 12000, CU_GET_PROC_ADDRESS_DEFAULT); + return (cuStreamGetId_t *)sym; + }(); + return fn; +} + +bool _hasPreciseHint() +{ + static bool ret = getRealStreamIdFunc() != nullptr; + return ret; +} + +CUresult cuStreamGetIdBootstrap(CUstream stream, unsigned long long *id); + +cuStreamGetId_t *_cuStreamGetId = cuStreamGetIdBootstrap; + +CUresult cuStreamGetIdBootstrap(CUstream stream, unsigned long long *id) +{ + cuStreamGetId_t *realFunc = getRealStreamIdFunc(); + if (realFunc) + _cuStreamGetId = realFunc; + else + _cuStreamGetId = cuStreamGetIdFallback; + + return _cuStreamGetId(stream, id); +} + +} // namespace + +#endif + +namespace nvcv::util { + +bool IsCudaStreamIdHintUnambiguous() +{ + return _hasPreciseHint(); +} + +uint64_t GetCudaStreamIdHint(CUstream stream) +{ + static auto initResult = cuInit(0); + (void)initResult; + unsigned long long id; + CUresult err = _cuStreamGetId(stream, &id); + if (err != CUDA_SUCCESS) + { + switch (err) + { + case CUDA_ERROR_DEINITIALIZED: + // This is most likely to happen during process teardown, so likely in a destructor + // - we don't want to throw there and the stream equality is immaterial anyway at this point. + return -1; + case CUDA_ERROR_INVALID_VALUE: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid stream handle"); + default: + { + const char *msg = ""; + const char *name = "Unknown error"; + (void)cuGetErrorString(err, &msg); + (void)cuGetErrorName(err, &name); + throw nvcv::Exception(nvcv::Status::ERROR_INTERNAL, "CUDA error %s %i %s", name, err, msg); + } + } + } + return id; +} + +} // namespace nvcv::util diff --git a/src/util/StreamId.hpp b/src/util/StreamId.hpp new file mode 100644 index 00000000..dbed99d9 --- /dev/null +++ b/src/util/StreamId.hpp @@ -0,0 +1,47 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
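
`GetCudaStreamIdHint` above resolves to `cuStreamGetId` on CUDA 12+ drivers and to a device/thread-based fallback otherwise; this is the mechanism that lets code such as `PerStreamCache` bucket resources per stream. Below is a hedged sketch of keying simple bookkeeping on the hint — the map, the `RecordLaunch` name, and the include path are illustrative, not part of this patch.

```cpp
#include "StreamId.hpp" // illustrative include path

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdint>
#include <unordered_map>

// Illustrative per-stream bookkeeping keyed by the stream id hint. On drivers
// without cuStreamGetId the hint may alias after a stream is destroyed, so it
// should be treated as an optimization hint only, as the header warns.
static std::unordered_map<uint64_t, std::size_t> g_launchesPerStream;

void RecordLaunch(cudaStream_t stream)
{
    // cudaStream_t and CUstream are the same handle type, so the stream can be passed directly.
    uint64_t id = nvcv::util::GetCudaStreamIdHint(stream);

    if (!nvcv::util::IsCudaStreamIdHintUnambiguous())
    {
        // Fallback path (no cuStreamGetId): ids may be reused; this sketch accepts the imprecision.
    }

    ++g_launchesPerStream[id];
}
```
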
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_STREAM_ID_HPP +#define NVCV_UTIL_STREAM_ID_HPP + +#include + +#include + +namespace nvcv::util { + +/** Retrieves a value that identifies a stream. + * + * @warning On older drivers ID aliasing is possible, when a still-running stream is deleted + * and a new one is created before the one just deleted completes its work. + * + * @param stream CUDA stream handle (note that CUstram and cudaStream_t are one type) + * @return int64_t The ID of the stream within the system. + */ +uint64_t GetCudaStreamIdHint(CUstream stream); + +/** Checks whether the stream id hint is unique + * + * If the system supports cuStreamGetId, then the value returned by GetCudaStreamIdHint + * uniquely identifies a stream. This creates some optimization opportunities when managing + * stream-bound resources. + */ +bool IsCudaStreamIdHintUnambiguous(); + +} // namespace nvcv::util + +#endif // NVCV_UTIL_STREAM_ID_HPP diff --git a/src/util/TensorDataUtils.cpp b/src/util/TensorDataUtils.cpp index e716fecc..862f62aa 100644 --- a/src/util/TensorDataUtils.cpp +++ b/src/util/TensorDataUtils.cpp @@ -35,7 +35,7 @@ static void printPlane(const uint8_t *data, int width, int height, int rowStride std::cout << "]"; endB = true; } - else if (i % bytesPC == 0) + else if (x % bytesPC == 0) { if (x % (bytesPC * numC) == 0 && !endB) { @@ -46,6 +46,7 @@ static void printPlane(const uint8_t *data, int width, int height, int rowStride std::cout << ","; } } + if (i % rowStride == 0) { std::cout << "\n["; @@ -236,4 +237,151 @@ nvcv::Tensor CreateTensor(int numImages, int imgWidth, int imgHeight, const nvcv return nvcv::Tensor(numImages, {imgWidth, imgHeight}, imgFormat); } +static void GetImageByteVectorFromTensorPlanar(const TensorData &tensorData, int sample, + std::vector &outData) +{ + Optional tDataAc = nvcv::TensorDataAccessStridedImagePlanar::Create(tensorData); + + if (!tDataAc) + throw std::runtime_error("Tensor Data not compatible with planar access."); + + if (tDataAc->numSamples() <= sample || sample < 0) + throw std::runtime_error("Number of samples smaller than requested sample."); + + // in a planar tensor the dtype represents each plane so the total bytes per pixel must be calculated + int bytesPerC = tDataAc->dtype().bitsPerPixel() / 8; + int outputSizeBytes = tDataAc->numRows() * tDataAc->numCols() * bytesPerC * tDataAc->numChannels(); + + // Make sure we have the right size. 
+ outData.resize(outputSizeBytes); + Byte *basePtr = tDataAc->sampleData(sample); + size_t dstWidth = tDataAc->numCols() * bytesPerC; + for (int i = 0; i < tDataAc->numChannels(); ++i) + { + if (cudaSuccess + != cudaMemcpy2D(outData.data() + (i * (tDataAc->numCols() * tDataAc->numRows()) * bytesPerC), dstWidth, + basePtr, tDataAc->rowStride(), dstWidth, tDataAc->numRows(), cudaMemcpyDeviceToHost)) + { + throw std::runtime_error("CudaMemcpy failed on copy of channel plane from device to host."); + } + basePtr += tDataAc->planeStride(); + } + return; +} + +void GetImageByteVectorFromTensor(const TensorData &tensorData, int sample, std::vector &outData) +{ + Optional tDataAc = nvcv::TensorDataAccessStridedImage::Create(tensorData); + + if (!tDataAc) + throw std::runtime_error("Tensor Data not compatible with pitch access."); + if (tDataAc->infoLayout().isChannelFirst()) + return GetImageByteVectorFromTensorPlanar(tensorData, sample, outData); + + if (tDataAc->numSamples() <= sample || sample < 0) + throw std::runtime_error("Number of samples smaller than requested sample."); + + int bytesPerPixel = (tDataAc->dtype().bitsPerPixel() / 8) * tDataAc->numChannels(); + int outputSizeBytes = tDataAc->numRows() * tDataAc->numCols() * bytesPerPixel; + + // Make sure we have the right size. + outData.resize(outputSizeBytes); + + if (cudaSuccess + != cudaMemcpy2D(outData.data(), tDataAc->numCols() * bytesPerPixel, tDataAc->sampleData(sample), + tDataAc->rowStride(), tDataAc->numCols() * bytesPerPixel, tDataAc->numRows(), + cudaMemcpyDeviceToHost)) + { + throw std::runtime_error("CudaMemcpy failed"); + } + return; +} + +static void SetImageTensorFromByteVectorPlanar(const TensorData &tensorData, std::vector &data, int sample) +{ + Optional tDataAc = nvcv::TensorDataAccessStridedImagePlanar::Create(tensorData); + + if (!tDataAc) + throw std::runtime_error("Tensor Data not compatible with planar image access."); + + if (tDataAc->numSamples() <= sample) + throw std::runtime_error("Number of samples smaller than requested sample."); + + if ((int64_t)data.size() + != tDataAc->numCols() * tDataAc->numRows() * (tDataAc->dtype().bitsPerPixel() / 8) * tDataAc->numChannels()) + throw std::runtime_error("Data vector is incorrect size, size must be W*H*bytesPerPixel."); + + int bytesPerC = (tDataAc->dtype().bitsPerPixel() / 8); + + auto copyToGpu = [&](int j) + { + Byte *basePtr = tDataAc->sampleData(j); + + for (int i = 0; i < tDataAc->numChannels(); ++i) + { + Byte *srcPtr = data.data() + (i * (tDataAc->numCols() * tDataAc->numRows() * bytesPerC)); + size_t srcPitch = tDataAc->numCols() * bytesPerC; + size_t srcWidthBytes = tDataAc->numCols() * bytesPerC; + if (cudaSuccess + != cudaMemcpy2D(basePtr, tDataAc->rowStride(), srcPtr, srcPitch, srcWidthBytes, tDataAc->numRows(), + cudaMemcpyHostToDevice)) + { + throw std::runtime_error("CudaMemcpy failed for channel plane copy from host to device."); + } + basePtr += tDataAc->planeStride(); + } + }; + + if (sample < 0) + for (auto i = 0; i < tDataAc->numSamples(); ++i) + { + copyToGpu(i); + } + else + copyToGpu(sample); +} + +void SetImageTensorFromByteVector(const TensorData &tensorData, std::vector &data, int sample) +{ + Optional tDataAc = nvcv::TensorDataAccessStridedImage::Create(tensorData); + + if (!tDataAc) + throw std::runtime_error("Tensor Data not compatible with pitch access."); + + if (tDataAc->infoLayout().isChannelFirst()) // planar case + return SetImageTensorFromByteVectorPlanar(tensorData, data, sample); + + if (tDataAc->numSamples() <= sample) + throw 
std::runtime_error("Number of samples smaller than requested sample."); + + if ((int64_t)data.size() + != tDataAc->numCols() * tDataAc->numRows() * (tDataAc->dtype().bitsPerPixel() / 8) * tDataAc->numChannels()) + throw std::runtime_error("Data vector is incorrect size, size must be N*W*sizeof(pixel)."); + + int bytesPerC = (tDataAc->dtype().bitsPerPixel() / 8); + + auto copyToGpu = [&](int i) + { + Byte *basePtr = tDataAc->sampleData(i); + Byte *srcPtr = data.data(); + size_t srcPitch = tDataAc->numCols() * bytesPerC * tDataAc->numChannels(); + size_t srcWidthBytes = tDataAc->numCols() * bytesPerC * tDataAc->numChannels(); + + if (cudaSuccess + != cudaMemcpy2D(basePtr, tDataAc->rowStride(), srcPtr, srcPitch, srcWidthBytes, tDataAc->numRows(), + cudaMemcpyHostToDevice)) + { + throw std::runtime_error("CudaMemcpy failed on copy of image from host to device."); + } + }; + + if (sample < 0) + for (auto i = 0; i < tDataAc->numSamples(); ++i) + { + copyToGpu(i); + } + else + copyToGpu(sample); +} + } // namespace nvcv::util diff --git a/src/util/TensorDataUtils.hpp b/src/util/TensorDataUtils.hpp index 3a0ec6f0..fb641a07 100644 --- a/src/util/TensorDataUtils.hpp +++ b/src/util/TensorDataUtils.hpp @@ -243,8 +243,7 @@ static void SetTensorToRandomValue(const TensorData &tensorData, DT minVal, DT m /** * Writes over the Tensor data with an array of type DT array must be - * the size of sampleStride(). All samples will be overriden. - * Function does not do data type checking + * the size of H*W*C with DT matching the Tensor datatype. Function does not do data type checking. * * @param[in,out] tensorData created tensor object. * @@ -272,7 +271,20 @@ template static void SetImageTensorFromVector(const TensorData &tensorData, std::vector
&data, int sample = -1); /** - * Returns a vector contains the values of the provided sample. + * Writes over the Tensor data with an byte array, array must be + * the size of H*W*C*bytesPerC. Function does not do data type checking. + * + * @param[in,out] tensorData created tensor object. + * + * @param[in] data vector of bytes with the data to set the tensor to. + * + * @param[in] sample optional the sample to write to if -1 all samples are written + */ +void SetImageTensorFromByteVector(const TensorData &tensorData, std::vector &data, int sample = -1); + +/** + * Returns a vector contains the values of the provided sample including any padding. This function assumes that the DT data type + * matches the datatype in the TensorData. * * @param[in] tensorData created tensor object. * @@ -286,7 +298,8 @@ static void GetVectorFromTensor(const TensorData &tensorData, int sample, std::v /** * Returns a vector contains the values of the provided sample. This vector will only contain - * the values of the image and not any padding/stride. + * the values of the image and not any padding/stride. This function assumes that the DT data type + * matches the datatype in the TensorData. * * @param[in] tensorData created tensor object. * @@ -298,6 +311,20 @@ static void GetVectorFromTensor(const TensorData &tensorData, int sample, std::v template static void GetImageVectorFromTensor(const TensorData &tensorData, int sample, std::vector
&outData); +/** + * Returns a byte vector which contains the values of the specified sample. This vector will only contain + * the values of the image and not any padding/stride. Also this will return a byte array regardless of + * the DataType of the tensor. The byte vector returned will be the size of H*W*C*bytesPerC. + * + * @param[in] tensorData created tensor object. + * + * @param[in] sample the sample to copy to vector 0 index. + * + * @param[out] outData the data to set the tensor to. + * + */ +void GetImageByteVectorFromTensor(const TensorData &tensorData, int sample, std::vector &outData); + /** * Sets the TensorImageData to the value set by the data parameter * region defines the amount of image to set starting at 0,0 @@ -534,9 +561,9 @@ static void SetImageTensorFromVectorPlanar(const TensorData &tensorData, std::ve if ((int64_t)data.size() != tDataAc->numCols() * tDataAc->numRows() * tDataAc->numChannels()) throw std::runtime_error("Data vector is incorrect size, size must be W*C*sizeof(DT)*channels."); - auto copyToGpu = [&](int i) + auto copyToGpu = [&](int j) { - Byte *basePtr = tDataAc->sampleData(i); + Byte *basePtr = tDataAc->sampleData(j); for (int i = 0; i < tDataAc->numChannels(); ++i) { if (cudaSuccess @@ -672,37 +699,49 @@ void SetCvDataTo(TensorImageData &cvImg, DT data, Size2D region, uint8_t chFlags // Useful for debugging template -inline void PrintBuffer(const std::vector &vec, const ST &strides, const ST &shape, const char *name = "") +inline void PrintBuffer(const std::vector &vec, const ST &strides, const ST &shape, const char *name = "", + uint32_t endls = 0b1111) { + using BT = nvcv::cuda::BaseType; + using BT4 = nvcv::cuda::MakeType; + using CVT = std::conditional_t; + std::cout << "I Printing buffer " << name << " with:\nI\tSize = " << vec.size() << " Bytes\nI\tShape = " << shape - << "\nI\tStrides = " << strides << "\nI\tValues = " << std::flush; + << "\nI\tStrides = " << strides << "\nI\tValues = " << std::endl; - for (long w = 0; w < (nvcv::cuda::NumElements == 4 ? nvcv::cuda::GetElement(shape, 3) : 1); ++w) + for (BT x = 0; x < (nvcv::cuda::NumElements >= 1 ? nvcv::cuda::GetElement(shape, 0) : 1); ++x) { - if (w > 0) - std::cout << std::endl; - std::cout << "{" << std::flush; - for (long z = 0; z < (nvcv::cuda::NumElements >= 3 ? nvcv::cuda::GetElement(shape, 2) : 1); ++z) + if (endls & 0b1000) + std::cout << "{" << std::endl; + else + std::cout << "{" << std::flush; + for (BT y = 0; y < (nvcv::cuda::NumElements >= 2 ? nvcv::cuda::GetElement(shape, 1) : 1); ++y) { - std::cout << "[" << std::flush; - for (long y = 0; y < (nvcv::cuda::NumElements >= 2 ? nvcv::cuda::GetElement(shape, 1) : 1); ++y) + if (endls & 0b0100) + std::cout << " [" << std::endl; + else + std::cout << " [" << std::flush; + for (BT z = 0; z < (nvcv::cuda::NumElements >= 3 ? nvcv::cuda::GetElement(shape, 2) : 1); ++z) { - std::cout << "(" << std::flush; - for (long x = 0; x < (nvcv::cuda::NumElements >= 1 ? nvcv::cuda::GetElement(shape, 0) : 1); ++x) + std::cout << " " << std::flush; + for (BT w = 0; w < (nvcv::cuda::NumElements >= 4 ? 
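
The new byte-vector helpers documented above complement the typed `SetImageTensorFromVector`/`GetImageVectorFromTensor` templates when the element type is only known at run time. A minimal round-trip sketch follows; it assumes the caller already has a strided `TensorData` for an interleaved U8 image tensor (how that is exported from a `Tensor` is outside this sketch), and the include path and `nvcv::Byte` spelling mirror the declarations above but are not verified by this patch.

```cpp
#include "TensorDataUtils.hpp" // illustrative include path

#include <vector>

// Hedged sketch: upload the same host image into every sample of a tensor,
// then read sample 0 back. The vector size must be H*W*C*bytesPerChannel,
// as the documentation above requires (bytesPerChannel == 1 for U8).
void RoundTripBytes(const nvcv::TensorData &tensorData, int width, int height, int channels)
{
    std::vector<nvcv::Byte> host(static_cast<size_t>(width) * height * channels, nvcv::Byte{0});

    // sample == -1 writes the same data into all samples.
    nvcv::util::SetImageTensorFromByteVector(tensorData, host, /*sample=*/-1);

    // Reads back sample 0 without row padding; the helper resizes the vector as needed.
    nvcv::util::GetImageByteVectorFromTensor(tensorData, /*sample=*/0, host);
}
```
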
nvcv::cuda::GetElement(shape, 3) : 1); ++w) { - ST coord = nvcv::cuda::DropCast>(long4{x, y, z, w}); + ST coord = nvcv::cuda::DropCast>(BT4{x, y, z, w}); - if (x > 0) - std::cout << ", " << std::flush; - std::cout << ValueAt(vec, strides, coord) << std::flush; + std::cout << " " << static_cast(ValueAt(vec, strides, coord)) << std::flush; } - std::cout << ")" << std::flush; + if (endls & 0b0010) + std::cout << std::endl; + else + std::cout << std::flush; } - std::cout << "]" << std::flush; + if (endls & 0b0001) + std::cout << " ]" << std::endl; + else + std::cout << " ]" << std::flush; } - std::cout << "}" << std::flush; + std::cout << "}" << std::endl; } - std::cout << std::endl; } // Write images in *HW tensor buffer vec to PGM files. @@ -738,6 +777,18 @@ inline void WriteImagesToPGM(const char *filename, const std::vector &v return ST{coord.z, coord.w}; }; + auto convertValue = [](VT val) + { + if constexpr (std::is_same_v) + return val; + else if constexpr (std::is_integral_v && !std::is_signed_v) + return std::min((VT)255, std::max((VT)0, val)); + else if constexpr (std::is_integral_v && std::is_signed_v) + return std::min((VT)255, std::max((VT)0, (VT)std::abs(val))); + else + return std::min((VT)255, std::max((VT)0, (VT)std::round(std::abs(val)))); + }; + char fn[256]; for (long c0 = 0; c0 < c0size; ++c0) @@ -758,9 +809,7 @@ inline void WriteImagesToPGM(const char *filename, const std::vector &v VT val = util::ValueAt(vec, strides, coord); - int iVal = std::min(255, std::max(0, (int)std::round(std::abs(val)))); - - ofs << iVal << ((j == width - 1) ? "\n" : " "); + ofs << convertValue(val) << ((j == width - 1) ? "\n" : " "); } } diff --git a/src/util/UniqueHandle.hpp b/src/util/UniqueHandle.hpp new file mode 100644 index 00000000..dcfb4b10 --- /dev/null +++ b/src/util/UniqueHandle.hpp @@ -0,0 +1,191 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_UNIQUE_HANDLE_H_ +#define NVCV_UTIL_UNIQUE_HANDLE_H_ + +#include + +namespace nvcv::util { + +/** + * @brief This class is an analogue of `unique_ptr` for non-memory resource handles. + * + * UniqueHandle is a base class for implementing managed resources (files, OS handles, etc). + * This class provides construction, assigment and decay to underlying handle type as well + * as equality comparison operators. + * + * @tparam HandleType type of the handle, e.g. `int` for file descriptors or `FILE*` for buffers + * @tparam Actual derived class (if using CRTP) or a handle information class. + * + * The interface of the `Actual` type: + * ``` + * static void DestroyHandle(HandleType h); // free or un-reference the underlying resource + * + * static constexpr HandleType null_handle(); // return a null handle; when using CRTP it's + * // optional and default implementation returns + * // default-constructed handle value. 
+ * ``` + * + * The handle can be populated by either the explicit constructor or using + * @link reset(handle_type) reset @endlink + * function. The derived classes may provide other ways of constructing the handle or the entire + * handle wrapper object. + + */ +template +class UniqueHandle +{ +public: + using handle_type = HandleType; + + constexpr inline UniqueHandle() + : handle_(Actual::null_handle()) + { + } + + /// @brief Constructs a handle wrapper, assuming ownership of given handle. + constexpr explicit UniqueHandle(handle_type handle) + : handle_(handle) + { + } + + UniqueHandle(const UniqueHandle &) = delete; + + UniqueHandle &operator=(const UniqueHandle &) = delete; + + inline UniqueHandle(UniqueHandle &&other) + : handle_(other.handle_) + { + other.handle_ = Actual::null_handle(); + } + + inline UniqueHandle &operator=(UniqueHandle &&other) + { + std::swap(handle_, other.handle_); + other.reset(); + return *this; + } + + /** + * @brief Obtains the stored handle + * + * The value is valid as long as the owning unique handle object is not destroyed, reset + * or overwritten. + */ + constexpr handle_type get() const &noexcept + { + return handle_; + } + + /** + * @brief Cannot obtain a valid handle from a temporary UniqueHandle + * + * If this function was allowed, the returned handle would have been destroyed + * by the time it's available to the caller. + */ + constexpr handle_type get() && = delete; + + /// @brief Make the wrapper usable in most context in which the handle type can be used + constexpr operator handle_type() const &noexcept + { + return get(); + } + + /// @brief Cannot obtain a valid handle from a temporary UniqueHandle (see `get`) + constexpr operator handle_type() && = delete; + + /** + * @brief Destroys the underlying resource and resets the handle to null value. + * + * @remarks + * * If the handle is already null, this function is a no-op. + * * The null value to replace the handle with, is taken from `Actual::null_value()`. + */ + inline void reset() + { + if (!Actual::is_null_handle(handle_)) + { + Actual::DestroyHandle(handle_); + handle_ = Actual::null_handle(); + } + } + + /** + * @brief Replaces the managed handle by the new one and destroying the old handle. + * @remarks If `handle` is equal to the currently managed handle, this function is no-op + */ + inline void reset(handle_type handle) + { + if (handle != handle_) + { + reset(); + handle_ = handle; + } + } + + /** + * @brief Relinquishes the ownership of the handle. + * + * The function replaces the managed handle with a null value and returns the old value. + * + * @returns The old handle value, no longer managed by UniqueHandle + * @remarks The null value to replace the handle with, is taken from `Actual::null_value()`. + */ + inline handle_type release() noexcept + { + handle_type old = handle_; + handle_ = Actual::null_handle(); + return old; + } + + /// @brief Indicates whether the handle is non-null. 
+ constexpr explicit operator bool() const noexcept + { + return !Actual::is_null_handle(handle_); + } + + static constexpr handle_type null_handle() noexcept + { + return {}; + } + + static constexpr bool is_null_handle(const handle_type &handle) noexcept + { + return handle == Actual::null_handle(); + } + +protected: + inline ~UniqueHandle() + { + reset(); + } + + handle_type handle_; +}; + +/** + * A macro to inherit the common interface from UniqueHandle + * - useful when using UniqueHandle in CRTP + */ +#define NVCV_INHERIT_UNIQUE_HANDLE(HandleType, WrapperClass) \ + using nvcv::util::UniqueHandle::UniqueHandle; \ + using nvcv::util::UniqueHandle::operator=; + +} // namespace nvcv::util + +#endif // NVCV_UTIL_UNIQUE_HANDLE_H_ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index de95135c..88ceacc5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -42,7 +42,7 @@ if(UNIX) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run_tests.sh.in ${TESTS_DRIVER} @ONLY) - macro(nvcv_add_test TESTCMD) + macro(nvcv_add_test TESTCMD TESTGROUP) get_filename_component(TESTNAME "${TESTCMD}" NAME) add_test(NAME "${TESTNAME}" COMMAND "${TESTCMD}") @@ -57,7 +57,7 @@ if(UNIX) set_tests_properties(${TESTNAME} PROPERTIES TIMEOUT ${TIMEOUT_TERM}) - file(APPEND "${TESTS_DRIVER}" "run ${TESTNAME}\n") + file(APPEND "${TESTS_DRIVER}" "run ${TESTNAME} ${TESTGROUP}\n") if(TARGET ${TESTNAME}) install(TARGETS ${TESTNAME} diff --git a/tests/common/CheckStatus.hpp b/tests/common/CheckStatus.hpp index b409301a..2374e359 100644 --- a/tests/common/CheckStatus.hpp +++ b/tests/common/CheckStatus.hpp @@ -133,3 +133,14 @@ using nvcv::Exception; { \ ADD_FAILURE() << "Expected an exception of type " #E ", got an unknown exception"; \ } + +#define NVCV_EXPECT_THROW_STATUS(status, ...) \ + try \ + { \ + __VA_ARGS__; \ + FAIL() << "Expected an error with status " << status; \ + } \ + catch (nvcv::Exception & e) \ + { \ + EXPECT_EQ(e.code(), nvcv::Status(status)); \ + } diff --git a/tests/cvcuda/CMakeLists.txt b/tests/cvcuda/CMakeLists.txt index 5b2f54a6..6fcbd49b 100644 --- a/tests/cvcuda/CMakeLists.txt +++ b/tests/cvcuda/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
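
`UniqueHandle` in src/util/UniqueHandle.hpp above is the CRTP base that `CudaStream` builds on, and the same pattern extends to any handle with a C-style destroy call. Below is a hedged sketch of a wrapper for `cudaEvent_t`; the `CudaEvent` name and its `Create` factory are hypothetical and used only for illustration.

```cpp
#include "UniqueHandle.hpp" // illustrative include path

#include <cuda_runtime.h>

#include <stdexcept>

// Hedged sketch: a move-only cudaEvent_t wrapper built on the UniqueHandle CRTP base,
// mirroring how CudaStream is implemented. Only DestroyHandle must be provided;
// null_handle() defaults to a value-initialized (null) handle.
class CudaEvent : public nvcv::util::UniqueHandle<cudaEvent_t, CudaEvent>
{
public:
    NVCV_INHERIT_UNIQUE_HANDLE(cudaEvent_t, CudaEvent)

    static CudaEvent Create(unsigned int flags = cudaEventDisableTiming)
    {
        cudaEvent_t ev = nullptr;
        if (cudaEventCreateWithFlags(&ev, flags) != cudaSuccess)
            throw std::runtime_error("cudaEventCreateWithFlags failed");
        return CudaEvent(ev); // assumes ownership; released on reset() or destruction
    }

    // Called by UniqueHandle::reset() and the destructor to release the resource.
    static void DestroyHandle(cudaEvent_t ev)
    {
        (void)cudaEventDestroy(ev); // errors ignored here for brevity
    }
};
```
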
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,6 +18,9 @@ # System tests for cvcuda public API add_subdirectory(system) +# Unit tests for cvcuda utilities and operator internals +add_subdirectory(unit) + if(BUILD_PYTHON) # System tests for cvcuda python add_subdirectory(python) diff --git a/tests/cvcuda/python/CMakeLists.txt b/tests/cvcuda/python/CMakeLists.txt index 813cc21c..503f50f4 100644 --- a/tests/cvcuda/python/CMakeLists.txt +++ b/tests/cvcuda/python/CMakeLists.txt @@ -43,4 +43,4 @@ set(PYTHON_TEST_DIR ${CMAKE_INSTALL_PREFIX}/${PYTHON_TEST_INSTDIR}) set(PYTHON_MODULE_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) configure_file(cvcuda_test_python.in cvcuda_test_python @ONLY) -nvcv_add_test(${CMAKE_CURRENT_BINARY_DIR}/cvcuda_test_python) +nvcv_add_test(${CMAKE_CURRENT_BINARY_DIR}/cvcuda_test_python cvcuda) diff --git a/tests/cvcuda/python/cvcuda_test_python.in b/tests/cvcuda/python/cvcuda_test_python.in index c74bb94a..eb648d82 100755 --- a/tests/cvcuda/python/cvcuda_test_python.in +++ b/tests/cvcuda/python/cvcuda_test_python.in @@ -45,14 +45,30 @@ function on_exit() } trap 'on_exit' EXIT +export PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" + for ver in $python_versions; do if [[ "$NVCV_FORCE_PYTHON" != 1 && "$NVCV_FORCE_PYTHON" != yes ]]; then - if ! PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" python$ver -c 'import nvcv'; then - echo "Skipping python-$ver, NVCV python bindings not installed" + if ! python$ver -c 'import cvcuda'; then + echo "Skipping python-$ver, CV-CUDA python bindings not installed" continue fi fi - PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" + # Check if python module is exposing only PyInit_cvcuda. + # Also provide some helpful info is exposing too much. + modfile=$(python$ver -c "import cvcuda; print(cvcuda.__file__)") + pubsyms=$(readelf -sWD $modfile | grep -v ' UND ' | grep ' GLOBAL ') + if [[ $(echo "$pubsyms" | wc -l) != 1 ]]; then + echo -e "cvcuda python $ver module is exposing too many symbols:\n$pubsyms" + exit 1 + fi + if ! echo "$pubsyms" | grep PyInit_cvcuda > /dev/null; then + echo -e "cvcuda python $ver module must expose symbol PyInit_cvcuda, but instead exposes:\n$pubsyms" + exit 2 + fi + + # Run python tests + NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" done diff --git a/tests/cvcuda/python/cvcuda_util.py b/tests/cvcuda/python/cvcuda_util.py index 0761e47a..dcdf55f3 100644 --- a/tests/cvcuda/python/cvcuda_util.py +++ b/tests/cvcuda/python/cvcuda_util.py @@ -13,13 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math +import torch import numpy as np import numbers -import torch import nvcv import copy import colorsys -import math IMG_FORMAT_TO_TYPE = { @@ -96,7 +96,7 @@ def generate_data(shape, dtype, max_random=None, rng=None): class CudaBuffer: - __cuda_array_interface = None + __cuda_array_interface__ = None obj = None @@ -165,17 +165,27 @@ def to_cuda_buffer(host_data): return buf -def to_nvcv_tensor(host_data, layout): - """Convert a tensor in host data with layout to nvcv.Tensor +def to_nvcv_tensor(data, layout): + """Convert a tensor in host or CUDA data with layout to nvcv.Tensor Args: - host_data (numpy array): Tensor in host data + data (numpy array or CUDA array): Tensor in host data layout (string): Tensor layout (e.g. 
NC, HWC, NHWC) Returns: nvcv.Tensor: The converted tensor """ - return nvcv.as_tensor(to_cuda_buffer(host_data), layout=layout) + cuda_data = data + if "__cuda_array_interface__" not in dir(cuda_data): + cuda_data = to_cuda_buffer(data) + shape = cuda_data.__cuda_array_interface__["shape"] + if layout is not None: + if len(shape) < len(layout): + shape = (1,) * (len(layout) - len(shape)) + shape + elif len(shape) > len(layout): + raise ValueError("Layout smaller than shape of tensor data") + cuda_data.__cuda_array_interface__["shape"] = shape + return nvcv.as_tensor(cuda_data, layout=layout) def create_tensor(shape, dtype, layout, max_random=None, rng=None, transform_dist=None): diff --git a/tests/cvcuda/python/test_adaptivethresholdtype.py b/tests/cvcuda/python/test_adaptivethresholdtype.py index 8e0ff2db..cf51ea29 100644 --- a/tests/cvcuda/python/test_adaptivethresholdtype.py +++ b/tests/cvcuda/python/test_adaptivethresholdtype.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch # noqa(F401) import cvcuda diff --git a/tests/cvcuda/python/test_bordertype.py b/tests/cvcuda/python/test_bordertype.py index f2f230ee..2f650932 100644 --- a/tests/cvcuda/python/test_bordertype.py +++ b/tests/cvcuda/python/test_bordertype.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch # noqa(F401) import cvcuda diff --git a/tests/cvcuda/python/test_import_order.py b/tests/cvcuda/python/test_import_order.py new file mode 100644 index 00000000..7178768a --- /dev/null +++ b/tests/cvcuda/python/test_import_order.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Import order is important, +# torch must be loaded correctly even if cvcuda was imported first +import cvcuda +import torch +import numpy as np + + +def test_import_cvcuda_first_works(): + torch.as_tensor(np.ndarray((4, 6), dtype=np.uint8), device="cuda") + cvcuda.Tensor((4, 6), dtype=np.uint8) diff --git a/tests/cvcuda/python/test_interptype.py b/tests/cvcuda/python/test_interptype.py index bf8c7371..7abf1f32 100644 --- a/tests/cvcuda/python/test_interptype.py +++ b/tests/cvcuda/python/test_interptype.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch # noqa(F401) import cvcuda diff --git a/tests/cvcuda/python/test_opadaptivethreshold.py b/tests/cvcuda/python/test_opadaptivethreshold.py index 30fe7cb3..a21eabe5 100644 --- a/tests/cvcuda/python/test_opadaptivethreshold.py +++ b/tests/cvcuda/python/test_opadaptivethreshold.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch # noqa(F401) import cvcuda import pytest as t import numpy as np diff --git a/tests/cvcuda/python/test_opbndbox.py b/tests/cvcuda/python/test_opbndbox.py index 3165bfff..8efa3129 100644 --- a/tests/cvcuda/python/test_opbndbox.py +++ b/tests/cvcuda/python/test_opbndbox.py @@ -24,62 +24,67 @@ ( (((3, 224, 224, 4), np.uint8, "NHWC")), cvcuda.BndBoxesI( - numBoxes=[3, 3, 3], boxes=[ - cvcuda.BndBoxI( - box=(10, 10, 5, 5), - thickness=2, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(20, 10, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(30, 10, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(10, 20, 5, 5), - thickness=2, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(20, 20, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(30, 20, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(10, 20, 5, 5), - thickness=2, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(20, 20, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(30, 20, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), + [ + cvcuda.BndBoxI( + box=(10, 10, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(20, 10, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(30, 10, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + ], + [ + cvcuda.BndBoxI( + box=(10, 20, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(20, 20, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(30, 20, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + ], + [ + cvcuda.BndBoxI( + box=(10, 20, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(20, 20, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(30, 20, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + ], ], ), ), diff --git a/tests/cvcuda/python/test_opboxblur.py b/tests/cvcuda/python/test_opboxblur.py index b361893c..981b6c8a 100644 --- a/tests/cvcuda/python/test_opboxblur.py +++ b/tests/cvcuda/python/test_opboxblur.py @@ -24,17 +24,22 @@ ( (((3, 224, 224, 4), np.uint8, "NHWC")), cvcuda.BlurBoxesI( - 
numBoxes=[3, 3, 3], boxes=[ - cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), - cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), - cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), - cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), - cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), - cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), - cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), - cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), - cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), + [ + cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), + cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), + cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), + ], + [ + cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), + cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), + cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), + ], + [ + cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), + cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), + cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), + ], ], ), ), diff --git a/tests/cvcuda/python/test_opfindcontours.py b/tests/cvcuda/python/test_opfindcontours.py index 9da5fafc..90e1e89e 100644 --- a/tests/cvcuda/python/test_opfindcontours.py +++ b/tests/cvcuda/python/test_opfindcontours.py @@ -29,9 +29,10 @@ def test_op_find_contours(shape, dtype, layout): print(shape, dtype, layout) image = util.create_tensor(shape, dtype, layout, 1, rng=RNG) - points = cvcuda.find_contours(image) + points, num_contours_and_points = cvcuda.find_contours(image) assert points.shape[0] == image.shape[0] assert points.shape[2] == 2 + assert points.shape[0] == num_contours_and_points.shape[0] stream = cvcuda.Stream() points = cvcuda.Tensor( @@ -40,12 +41,13 @@ def test_op_find_contours(shape, dtype, layout): num_points = cvcuda.Tensor( (image.shape[0], 32), nvcv.Type.U32, nvcv.TensorLayout.NW ) - tmp = cvcuda.find_contours_into( + points_into, num_contours_and_points_into = cvcuda.find_contours_into( src=image, points=points, num_points=num_points, stream=stream, ) - assert tmp is points - assert points.shape[0] == image.shape[0] - assert points.shape[2] == 2 + assert points_into is points + assert points_into.shape[0] == image.shape[0] + assert points_into.shape[2] == 2 + assert points_into.shape[0] == num_contours_and_points_into.shape[0] diff --git a/tests/cvcuda/python/test_opfindhomography.py b/tests/cvcuda/python/test_opfindhomography.py new file mode 100644 index 00000000..3f8d5faa --- /dev/null +++ b/tests/cvcuda/python/test_opfindhomography.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import nvcv +import cvcuda +import pytest as t +import numpy as np + +RNG = np.random.default_rng(0) + + +@t.mark.parametrize( + "num_samples, num_points", + [ + (16, 1024), + (32, 1024), + (64, 1024), + ], +) +def test_op_findhomography(num_samples, num_points): + tensor_args = ((num_samples, num_points * 2), nvcv.Type._2F32, "NW") + src = cvcuda.Tensor(*tensor_args) + dst = cvcuda.Tensor(*tensor_args) + out = cvcuda.findhomography(src, dst) + assert out.shape == (num_samples, 3, 3) + assert out.dtype == np.float32 + + stream = cvcuda.Stream() + out_tensor_args = ((num_samples, 3, 3), np.float32, "NHW") + out = cvcuda.Tensor(*out_tensor_args) + tmp = cvcuda.findhomography_into( + models=out, + srcPts=src, + dstPts=dst, + stream=stream, + ) + assert tmp is out + assert out.shape == (num_samples, 3, 3) + assert out.dtype == nvcv.Type.F32 + + +@t.mark.parametrize( + "num_samples, num_points", + [ + (16, 1024), + (32, 1024), + (64, 1024), + ], +) +def test_op_findhomographyvarshape(num_samples, num_points): + tensor_args = ((1, num_points * 2), nvcv.Type._2F32, "NW") + srcBatch = cvcuda.TensorBatch(num_samples) + dstBatch = cvcuda.TensorBatch(num_samples) + for i in range(num_samples): + src = cvcuda.Tensor(*tensor_args) + dst = cvcuda.Tensor(*tensor_args) + srcBatch.pushback(src) + dstBatch.pushback(dst) + + outBatch = cvcuda.findhomography(srcPts=srcBatch, dstPts=dstBatch) + assert outBatch.dtype == nvcv.Type.F32 + assert outBatch.layout == "NHW" + + stream = cvcuda.Stream() + out_tensor_args = ((1, 3, 3), np.float32, "NHW") + outBatch = cvcuda.TensorBatch(num_samples) + for i in range(num_samples): + out = cvcuda.Tensor(*(out_tensor_args)) + outBatch.pushback(out) + + tmpBatch = cvcuda.findhomography_into( + models=outBatch, + srcPts=srcBatch, + dstPts=dstBatch, + stream=stream, + ) + assert tmpBatch is outBatch + assert outBatch.ndim == 3 + assert outBatch.dtype == nvcv.Type.F32 + assert outBatch.capacity == srcBatch.capacity diff --git a/tests/cvcuda/python/test_ophistogram.py b/tests/cvcuda/python/test_ophistogram.py index de1c2122..3a9f93e4 100644 --- a/tests/cvcuda/python/test_ophistogram.py +++ b/tests/cvcuda/python/test_ophistogram.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cvcuda import torch +import cvcuda import pytest as t import numpy as np import cvcuda_util as util diff --git a/tests/cvcuda/python/test_oplabel.py b/tests/cvcuda/python/test_oplabel.py new file mode 100644 index 00000000..8a3eb92c --- /dev/null +++ b/tests/cvcuda/python/test_oplabel.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cvcuda +import pytest as t +import numpy as np + + +DEF_OUT_DTYPE = np.uint32 +DEF_MAX_CAPACITY = 10000 + + +def defaultNumStats(layout): + return 8 if "D" in layout else 6 + + +@t.mark.parametrize( + "src_args", + [ + (((2, 11, 26, 32, 1), np.uint8, "NDHWC")), + (((3, 12, 29, 31), np.uint8, "NDHW")), + (((10, 22, 33, 1), np.uint8, "DHWC")), + (((14, 23, 34), np.uint8, "DHW")), + (((2, 15, 25, 1), np.uint8, "NHWC")), + (((3, 17, 24), np.uint8, "NHW")), + (((28, 37, 1), np.uint8, "HWC")), + (((18, 16), np.uint8, "HW")), + ], +) +def test_op_label_api(src_args): + src = cvcuda.Tensor(*src_args) + + if "D" not in src_args[2]: + dst, count, stats = cvcuda.label(src) + assert count is None and stats is None + assert dst.layout == src.layout + assert dst.shape == src.shape + assert dst.dtype == DEF_OUT_DTYPE + connectivity = cvcuda.CONNECTIVITY_4_2D + else: + connectivity = cvcuda.CONNECTIVITY_6_3D + dst, count, stats = cvcuda.label(src, connectivity) + assert count is None and stats is None + assert dst.layout == src.layout + assert dst.shape == src.shape + assert dst.dtype == DEF_OUT_DTYPE + + out = cvcuda.Tensor(src.shape, DEF_OUT_DTYPE, src.layout) + tmp, count, stats = cvcuda.label_into(out, src=src, connectivity=connectivity) + assert tmp is out and count is None and stats is None + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + num_samples = src_args[0][0] if "N" in src_args[2] else 1 + bg_label = cvcuda.Tensor((num_samples,), src.dtype, "N") + min_thresh = cvcuda.Tensor((num_samples,), src.dtype, "N") + max_thresh = cvcuda.Tensor((num_samples,), src.dtype, "N") + + out, count, stats = cvcuda.label( + src, + connectivity, + bg_label=bg_label, + min_thresh=min_thresh, + max_thresh=max_thresh, + ) + assert count is None and stats is None + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + out, count, stats = cvcuda.label(src, connectivity, count=True, stats=False) + assert count is not None and stats is None + + out, count, stats = cvcuda.label(src, connectivity, count=True, stats=True) + assert count is not None and stats is not None + + min_size = cvcuda.Tensor((num_samples,), DEF_OUT_DTYPE, "N") + + out, count, stats = cvcuda.label( + src, + connectivity, + cvcuda.LABEL.SEQUENTIAL, + count=True, + stats=True, + bg_label=bg_label, + min_size=min_size, + ) + assert count is not None and stats is not None + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + t_out, t_count, t_stats = cvcuda.label_into(out, count, stats, src, connectivity) + assert t_out is out and t_count is count and t_stats is stats + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + assert count.layout == "N" + assert count.shape[0] == num_samples + assert count.dtype == DEF_OUT_DTYPE + assert stats.layout == "NMA" + assert stats.shape == (num_samples, DEF_MAX_CAPACITY, defaultNumStats(src_args[2])) + assert stats.dtype == DEF_OUT_DTYPE + + out, count, stats = cvcuda.label( + src, connectivity, count=True, stats=True, max_labels=12345 + ) + assert stats.shape == (num_samples, 12345, defaultNumStats(src_args[2])) + + stream = cvcuda.Stream() + out, _, _ = cvcuda.label(src=src, connectivity=connectivity, stream=stream) + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + tmp, _, _ = cvcuda.label_into( + dst=out, src=src, connectivity=connectivity, 
stream=stream + ) + assert tmp is out + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == np.uint32 diff --git a/tests/cvcuda/python/test_opmatch.py b/tests/cvcuda/python/test_opmatch.py new file mode 100644 index 00000000..2d29fd81 --- /dev/null +++ b/tests/cvcuda/python/test_opmatch.py @@ -0,0 +1,212 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cvcuda +import pytest as t +import numpy as np +import cvcuda_util as util + + +RNG = np.random.default_rng(0) + + +class ref: + """Python reference class to store constants and test output content""" + + num_dtype = np.int32 + out_dtype = np.int32 + dist_dtype = np.float32 + + def absdiff(a, b): + if type(a) == float: + return abs(a - b) + else: + return b - a if a < b else a - b + + def distance(p1, p2, norm_type): + if norm_type == cvcuda.Norm.HAMMING: + return sum([bin(c1 ^ c2).count("1") for c1, c2 in zip(p1, p2)]) + elif norm_type == cvcuda.Norm.L1: + return sum([abs(ref.absdiff(c1, c2)) for c1, c2 in zip(p1, p2)]) + elif norm_type == cvcuda.Norm.L2: + return np.sqrt(sum([ref.absdiff(c1, c2) ** 2 for c1, c2 in zip(p1, p2)])) + + def brute_force_matcher(batch_set1, batch_set2, cross_check, norm_type): + batch_matches = [] + batch_num_matches = [] + batch_distances = [] + for set1, set2 in zip(batch_set1, batch_set2): + batch_matches.append([]) + batch_num_matches.append(0) + batch_distances.append([]) + for set1_idx, p1 in enumerate(set1): + dist1to2 = [] + for set2_idx, p2 in enumerate(set2): + dist1to2.append((ref.distance(p1, p2, norm_type), set2_idx)) + sorted_dist_ids = sorted(dist1to2) + if cross_check: + p2 = set2[sorted_dist_ids[0][1]] + dist2to1 = [] + for q1_idx, q1 in enumerate(set1): + dist2to1.append((ref.distance(q1, p2, norm_type), q1_idx)) + cc_sorted_dist_ids = sorted(dist2to1) + if cc_sorted_dist_ids[0][1] == set1_idx: + batch_matches[-1].append([set1_idx, sorted_dist_ids[0][1]]) + batch_distances[-1].append(sorted_dist_ids[0][0]) + batch_num_matches[-1] += 1 + else: + batch_matches[-1].append([set1_idx, sorted_dist_ids[0][1]]) + batch_distances[-1].append(sorted_dist_ids[0][0]) + batch_num_matches[-1] += 1 + return batch_matches, batch_num_matches, batch_distances + + def sort(matches, num_matches, distances): + output = [] + for sample_idx in range(len(matches)): + for match_idx in range(num_matches[sample_idx]): + set1_idx = matches[sample_idx][match_idx][0] + set2_idx = matches[sample_idx][match_idx][1] + distance = distances[sample_idx][match_idx] + output.append((sample_idx, set1_idx, set2_idx, distance)) + return sorted(output) + + +@t.mark.parametrize( + "set_shape, set_dtype", + [ + ((1, 11, 1), np.uint8), + ((2, 12, 2), np.uint32), + ((3, 22, 3), np.float32), + ((4, 123, 32), np.uint8), + ((3, 234, 26), np.uint32), + ((2, 345, 13), np.float32), + ], +) +def test_op_match_api(set_shape, set_dtype): + set1 = cvcuda.Tensor(set_shape, 
set_dtype, "NMD") + set2 = cvcuda.Tensor(set_shape, set_dtype, "NMD") + + matches, num_matches, distances = cvcuda.match(set1, set2) + assert num_matches is None and distances is None + assert matches.shape == (set_shape[0], set_shape[1], 2) + assert matches.layout == "NMA" + assert matches.dtype == ref.out_dtype + + _, num_matches, _ = cvcuda.match(set1, set2, num_matches=True) + assert num_matches.shape == (set_shape[0],) + assert num_matches.layout == "N" + assert num_matches.dtype == ref.out_dtype + + _, _, distances = cvcuda.match(set1, set2, distances=True) + assert distances.shape == (set_shape[0], set_shape[1]) + assert distances.layout == "NM" + assert distances.dtype == ref.dist_dtype + + _, num_matches, _ = cvcuda.match(set1, set2, cross_check=True) + assert num_matches is not None + + _, num_matches, distances = cvcuda.match( + set1, set2, num_matches=True, distances=True + ) + assert num_matches is not None and distances is not None + + num_set1 = cvcuda.Tensor(set_shape[:1], ref.num_dtype, "N") + num_set2 = cvcuda.Tensor(set_shape[:1], ref.num_dtype, "N") + + big_matches, _, _ = cvcuda.match( + set1, + set2, + num_set1, + num_set2, + cross_check=False, + norm_type=cvcuda.Norm.L2, + matches_per_point=64, + algo_choice=cvcuda.Matcher.BRUTE_FORCE, + ) + assert big_matches.shape == (set_shape[0], set_shape[1] * 64, 2) + + tmp = cvcuda.match_into( + matches, + num_matches, + distances, + set1, + set2, + num_set1, + num_set2, + ) + assert tmp[0] is matches and tmp[1] is num_matches and tmp[2] is distances + + stream = cvcuda.Stream() + matches, _, _ = cvcuda.match(set1, set2, num_set1, num_set2, stream=stream) + assert matches.shape == (set_shape[0], set_shape[1], 2) + assert matches.layout == "NMA" + assert matches.dtype == ref.out_dtype + + tmp = cvcuda.match_into( + matches, + None, + None, + set1, + set2, + None, + None, + False, + 1, + cvcuda.Norm.L1, + cvcuda.Matcher.BRUTE_FORCE, + stream=stream, + ) + assert tmp[0] is matches and tmp[1] is None and tmp[2] is None + + +@t.mark.parametrize( + "set_shape, set_dtype, cross_check, norm_type", + [ + ((1, 18, 32), np.uint8, False, cvcuda.Norm.HAMMING), + ((2, 28, 21), np.uint32, False, cvcuda.Norm.L1), + ((3, 36, 10), np.float32, False, cvcuda.Norm.L2), + ((2, 17, 33), np.uint8, True, cvcuda.Norm.L1), + ((3, 57, 13), np.float32, True, cvcuda.Norm.L2), + ], +) +def test_op_match_content(set_shape, set_dtype, cross_check, norm_type): + h_set1 = util.generate_data(set_shape, set_dtype, max_random=255, rng=RNG) + h_set2 = util.generate_data(set_shape, set_dtype, max_random=255, rng=RNG) + + set1 = util.to_nvcv_tensor(h_set1, "NMD") + set2 = util.to_nvcv_tensor(h_set2, "NMD") + + matches, num_matches, distances = cvcuda.match( + set1, + set2, + num_matches=True, + distances=True, + cross_check=cross_check, + norm_type=norm_type, + algo_choice=cvcuda.Matcher.BRUTE_FORCE, + ) + + h_test_matches = util.to_cpu_numpy_buffer(matches.cuda()) + h_test_num_matches = util.to_cpu_numpy_buffer(num_matches.cuda()) + h_test_distances = util.to_cpu_numpy_buffer(distances.cuda()) + + h_gold_matches, h_gold_num_matches, h_gold_distances = ref.brute_force_matcher( + h_set1, h_set2, cross_check, norm_type + ) + + h_test_output = ref.sort(h_test_matches, h_test_num_matches, h_test_distances) + h_gold_output = ref.sort(h_gold_matches, h_gold_num_matches, h_gold_distances) + + np.testing.assert_allclose(h_test_output, h_gold_output, rtol=1e-5, atol=1e-5) diff --git a/tests/cvcuda/python/test_opmorphology.py b/tests/cvcuda/python/test_opmorphology.py index 
94e17ee6..99f3543b 100644 --- a/tests/cvcuda/python/test_opmorphology.py +++ b/tests/cvcuda/python/test_opmorphology.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import cvcuda import pytest as t import numpy as np import cvcuda_util as util -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_opnms.py b/tests/cvcuda/python/test_opnms.py index 481c0b7c..708075d5 100644 --- a/tests/cvcuda/python/test_opnms.py +++ b/tests/cvcuda/python/test_opnms.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import cvcuda import cvcuda_util as util import pytest as t import numpy as np -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_oposd.py b/tests/cvcuda/python/test_oposd.py index 37be497c..d12ea49a 100644 --- a/tests/cvcuda/python/test_oposd.py +++ b/tests/cvcuda/python/test_oposd.py @@ -24,100 +24,109 @@ ( (((2, 224, 224, 4), np.uint8, "NHWC")), cvcuda.Elements( - numElements=[5, 5], - elements=( - cvcuda.BndBoxI( - box=(10, 10, 5, 5), - thickness=2, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.Label( - utf8Text="def", - fontSize=30, - tlPos=(50, 50), - fontColor=(255, 255, 0), - bgColor=(0, 128, 255, 128), - ), - cvcuda.Segment( - box=(20, 20, 30, 30), - thickness=1, - segArray=np.array( - [ - [0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0], - [0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0], - [0, 0, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0, 0], - [0, 0.2, 0.3, 0.4, 0.5, 0.5, 0.4, 0.3, 0.2, 0], - [0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2], - [0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2], - [0, 0.2, 0.3, 0.4, 0.5, 0.5, 0.4, 0.3, 0.2, 0], - [0, 0, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0, 0], - [0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0], - [0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0], - ] + elements=[ + [ + cvcuda.BndBoxI( + box=(10, 10, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ) + ], + [ + cvcuda.BndBoxI( + box=(10, 10, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), ), - segThreshold=0.2, - borderColor=(255, 255, 0), - segColor=(0, 128, 255, 128), - ), - cvcuda.Point( - centerPos=(30, 30), - radius=5, - color=(255, 255, 0), - ), - cvcuda.Line( - pos0=(50, 50), - pos1=(150, 50), - thickness=1, - color=(255, 0, 0), - ), - cvcuda.PolyLine( - points=np.array( - [ - [100, 100], - [600, 100], - [350, 300], - [600, 500], - [300, 500], - ] + cvcuda.Label( + utf8Text="def", + fontSize=30, + tlPos=(50, 50), + fontColor=(255, 255, 0), + bgColor=(0, 128, 255, 128), ), - thickness=1, - isClosed=True, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.RotatedBox( - centerPos=(30, 30), - width=5, - height=5, - yaw=0.3, - thickness=1, - borderColor=(255, 255, 0), - bgColor=(0, 128, 255, 128), - ), - cvcuda.Circle( - centerPos=(30, 30), - radius=5, - thickness=2, - borderColor=(255, 255, 0), - bgColor=(0, 128, 255, 128), - ), - cvcuda.Arrow( - pos0=(50, 50), - pos1=(150, 50), - arrowSize=3, - thickness=1, - color=(255, 0, 0), - ), - cvcuda.Clock( - clockFormat=cvcuda.ClockFormat.YYMMDD_HHMMSS, - time=0, - fontSize=10, - tlPos=(150, 50), - fontColor=(255, 255, 0), - bgColor=(0, 128, 255, 128), - ), - ), + cvcuda.Segment( + box=(20, 20, 30, 30), + thickness=1, + segArray=np.array( + [ + [0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0], + [0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0], + [0, 0, 0.2, 0.3, 
0.4, 0.4, 0.3, 0.2, 0, 0], + [0, 0.2, 0.3, 0.4, 0.5, 0.5, 0.4, 0.3, 0.2, 0], + [0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2], + [0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2], + [0, 0.2, 0.3, 0.4, 0.5, 0.5, 0.4, 0.3, 0.2, 0], + [0, 0, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0, 0], + [0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0], + [0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0], + ] + ), + segThreshold=0.2, + borderColor=(255, 255, 0), + segColor=(0, 128, 255, 128), + ), + cvcuda.Point( + centerPos=(30, 30), + radius=5, + color=(255, 255, 0), + ), + cvcuda.Line( + pos0=(50, 50), + pos1=(150, 50), + thickness=1, + color=(255, 0, 0), + ), + cvcuda.PolyLine( + points=np.array( + [ + [100, 100], + [600, 100], + [350, 300], + [600, 500], + [300, 500], + ] + ), + thickness=1, + isClosed=True, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ), + cvcuda.RotatedBox( + centerPos=(30, 30), + width=5, + height=5, + yaw=0.3, + thickness=1, + borderColor=(255, 255, 0), + bgColor=(0, 128, 255, 128), + ), + cvcuda.Circle( + centerPos=(30, 30), + radius=5, + thickness=2, + borderColor=(255, 255, 0), + bgColor=(0, 128, 255, 128), + ), + cvcuda.Arrow( + pos0=(50, 50), + pos1=(150, 50), + arrowSize=3, + thickness=1, + color=(255, 0, 0), + ), + cvcuda.Clock( + clockFormat=cvcuda.ClockFormat.YYMMDD_HHMMSS, + time=0, + fontSize=10, + tlPos=(150, 50), + fontColor=(255, 255, 0), + bgColor=(0, 128, 255, 128), + ), + ], + ], ), ), ], diff --git a/tests/cvcuda/python/test_oppillowresize.py b/tests/cvcuda/python/test_oppillowresize.py index 303e4a1c..bba37e5a 100644 --- a/tests/cvcuda/python/test_oppillowresize.py +++ b/tests/cvcuda/python/test_oppillowresize.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import cvcuda import pytest as t import numpy as np import cvcuda_util as util import threading -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_opreformat.py b/tests/cvcuda/python/test_opreformat.py index 0d507db7..2f478bb0 100644 --- a/tests/cvcuda/python/test_opreformat.py +++ b/tests/cvcuda/python/test_opreformat.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import cvcuda import pytest as t import numpy as np import threading -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_opremap.py b/tests/cvcuda/python/test_opremap.py index 2cd7ce66..2ff5d552 100644 --- a/tests/cvcuda/python/test_opremap.py +++ b/tests/cvcuda/python/test_opremap.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import nvcv import cvcuda import pytest as t import numpy as np import cvcuda_util as util -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_opstack.py b/tests/cvcuda/python/test_opstack.py new file mode 100644 index 00000000..43d86fbb --- /dev/null +++ b/tests/cvcuda/python/test_opstack.py @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cvcuda +import pytest as t +import numpy as np +import random + +random.seed(1) + + +@t.mark.parametrize( + "input, dtype, number", + [ + (((5, 16, 23, 4), np.uint8, "NHWC"), np.int8, 2), + (((1, 160, 221, 2), np.uint8, "NHWC"), np.int8, 3), + (((1, 60, 1, 1), np.uint8, "NHWC"), np.int8, 1), + (((6, 61, 12, 3), np.uint8, "NHWC"), np.int8, 5), + (((5, 161, 23, 4), np.uint8, "NCHW"), np.int8, 2), + (((1, 160, 221, 2), np.uint8, "NCHW"), np.int8, 3), + (((1, 1, 2, 1), np.uint8, "NCHW"), np.int8, 1), + (((6, 13, 1, 3), np.uint8, "NCHW"), np.int8, 5), + (((16, 23, 4), np.uint8, "HWC"), np.int8, 2), + (((160, 221, 2), np.uint8, "HWC"), np.int8, 3), + (((60, 1, 1), np.uint8, "HWC"), np.int8, 1), + (((61, 12, 3), np.uint8, "HWC"), np.int8, 5), + (((161, 23, 4), np.uint8, "CHW"), np.int8, 2), + (((160, 221, 2), np.uint8, "CHW"), np.int8, 3), + (((1, 2, 1), np.uint8, "CHW"), np.int8, 1), + (((13, 1, 3), np.uint8, "CHW"), np.int8, 5), + ], +) +def test_op_stack(input, dtype, number): + + input_tensors = [] + + numberOfTensors = 0 + + updated_input = list(input) + for _ in range(number): + if updated_input[2] == "NHWC" or updated_input[2] == "NCHW": + updated_input[0] = (random.randint(1, input[0][0]),) + input[0][ + 1: + ] # Update the first value + numberOfTensors += updated_input[0][0] + else: + numberOfTensors += 1 + input_tensor = cvcuda.Tensor(*updated_input) + input_tensors.append(input_tensor) + + out = cvcuda.stack(input_tensors) + + assert out.shape[0] == numberOfTensors + assert out.dtype == input_tensors[0].dtype + + if input_tensors[0].shape == 3: + assert out.shape[1] == input_tensors[0].shape[0] + assert out.shape[2] == input_tensors[0].shape[1] + assert out.shape[3] == input_tensors[0].shape[2] + if input_tensors[0].shape == 4: + assert out.layout == input_tensors[0].layout + assert out.shape[1] == input_tensors[0].shape[1] + assert out.shape[2] == input_tensors[0].shape[2] + assert out.shape[3] == input_tensors[0].shape[3] + + # check stack into + outputTensorDef = list(updated_input) + if updated_input[2] == "NHWC" or updated_input[2] == "NCHW": + outputTensorDef[0] = (numberOfTensors,) + input[0][1:] + else: + outputTensorDef[0] = (numberOfTensors,) + input[0][0:] + if updated_input[2] == "HWC": + outputTensorDef[2] = "NHWC" + else: + outputTensorDef[2] = "NCHW" + + output_tensor = cvcuda.Tensor(*outputTensorDef) + cvcuda.stack_into(output_tensor, input_tensors) + + assert output_tensor.shape[0] == numberOfTensors + assert output_tensor.dtype == input_tensors[0].dtype + + if input_tensors[0].shape == 3: + assert output_tensor.shape[1] == input_tensors[0].shape[0] + assert output_tensor.shape[2] == input_tensors[0].shape[1] + assert output_tensor.shape[3] == input_tensors[0].shape[2] + if input_tensors[0].shape == 4: + assert output_tensor.layout == input_tensors[0].layout + assert output_tensor.shape[1] == input_tensors[0].shape[1] + assert output_tensor.shape[2] == input_tensors[0].shape[2] + assert output_tensor.shape[3] == input_tensors[0].shape[3] diff --git a/tests/cvcuda/python/test_opwarpperspective.py b/tests/cvcuda/python/test_opwarpperspective.py index 
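The stacking test just added drives both entry points of the new stack operator. As a quick reference, here is a minimal sketch based solely on the calls used in that test (cvcuda.Tensor, cvcuda.stack, cvcuda.stack_into); shapes and dtypes are arbitrary placeholders. The test feeds either single images (HWC/CHW) or already-batched tensors (NHWC/NCHW), and the output batch size is the total number of samples across the inputs.

import numpy as np
import cvcuda

# Three single images with identical HWC shape and dtype.
images = [cvcuda.Tensor((64, 64, 3), np.uint8, "HWC") for _ in range(3)]

# Allocating form: the result is batched, with one slice per stacked sample.
out = cvcuda.stack(images)
assert out.shape[0] == len(images)
assert out.dtype == images[0].dtype

# Preallocated form: the destination is an NHWC tensor sized for all samples.
batched = cvcuda.Tensor((len(images), 64, 64, 3), np.uint8, "NHWC")
cvcuda.stack_into(batched, images)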
6410986d..2b111750 100644 --- a/tests/cvcuda/python/test_opwarpperspective.py +++ b/tests/cvcuda/python/test_opwarpperspective.py @@ -102,6 +102,17 @@ cvcuda.Border.REPLICATE, [1, 2, 3, 4], ), + ( + ((11, 21, 4), np.uint8, "HWC"), + [ + [1, 2, 0], + [2, 1, 1], + [0, 0, 1], + ], + cvcuda.Interp.LINEAR | cvcuda.Interp.WARP_INVERSE_MAP, + cvcuda.Border.REPLICATE, + [1, 2, 3, 4], + ), ], ) def test_op_warp_perspective(input_args, xform, flags, border_mode, border_value): diff --git a/tests/cvcuda/python/test_util.py b/tests/cvcuda/python/test_util.py index ff3a8726..bf9da883 100644 --- a/tests/cvcuda/python/test_util.py +++ b/tests/cvcuda/python/test_util.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import nvcv import numpy as np -import torch import cvcuda_util as util diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index 2da3ac5b..a9e1a648 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -31,6 +31,9 @@ endif() # system core ------------------------------------------------- add_executable(cvcuda_test_system + TestOpPairwiseMatcher.cpp + TestOpStack.cpp + TestOpLabel.cpp TestOpFindContours.cpp TestOpOSD.cpp TestOpHistogramEq.cpp @@ -39,8 +42,8 @@ add_executable(cvcuda_test_system TestOpMinMaxLoc.cpp TestOpHistogram.cpp TestOpMinAreaRect.cpp - TestOpBoxBlur.cpp TestOpBndBox.cpp + TestOpBoxBlur.cpp OsdUtils.cu TestOpSIFT.cpp TestOpMinMaxLoc.cpp @@ -84,6 +87,7 @@ add_executable(cvcuda_test_system TestOpGaussianNoise.cpp GaussianNoiseUtils.cu TestOpInpaint.cpp + TestOpFindHomography.cpp ) target_link_libraries(cvcuda_test_system @@ -93,7 +97,7 @@ target_link_libraries(cvcuda_test_system cuosd ) -nvcv_add_test(cvcuda_test_system) +nvcv_add_test(cvcuda_test_system cvcuda) # header compatibility tests --------------------------------------------- diff --git a/tests/cvcuda/system/OsdUtils.cu b/tests/cvcuda/system/OsdUtils.cu index 63e48103..1fbf4d00 100644 --- a/tests/cvcuda/system/OsdUtils.cu +++ b/tests/cvcuda/system/OsdUtils.cu @@ -105,7 +105,7 @@ Segment *create_segment() Segment *output = new Segment(); output->width = 10; output->height = 10; - checkRuntime(cudaMalloc(&output->data, output->width * output->height * sizeof(float))); + output->data = (float *)malloc(output->width * output->height * sizeof(float)); std::vector diamond; diamond.insert(diamond.end(), {0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0}); diamond.insert(diamond.end(), {0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0}); @@ -117,16 +117,16 @@ Segment *create_segment() diamond.insert(diamond.end(), {0, 0, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0, 0}); diamond.insert(diamond.end(), {0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0}); diamond.insert(diamond.end(), {0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0}); - checkRuntime(cudaMemcpy(output->data, diamond.data(), output->width * output->height * sizeof(float), - cudaMemcpyHostToDevice)); + memcpy(output->data, diamond.data(), output->width * output->height * sizeof(float)); return output; } void free_segment(Segment *segment) { - if (segment->data) + if (segment->data != nullptr) { - checkRuntime(cudaFree(segment->data)); + free(segment->data); + segment->data = nullptr; } segment->width = 0; segment->height = 0; @@ -146,17 +146,11 @@ Polyline *create_polyline() output->n_pts = points.size(); output->h_pts = (int *)malloc(output->n_pts * 2 * sizeof(int)); memcpy(output->h_pts, points.data(), output->n_pts * 2 * sizeof(int)); - 
checkRuntime(cudaMalloc(&output->d_pts, output->n_pts * 2 * sizeof(int))); - checkRuntime(cudaMemcpy(output->d_pts, points.data(), output->n_pts * 2 * sizeof(int), cudaMemcpyHostToDevice)); return output; } void free_polyline(Polyline *polyline) { - if (polyline->d_pts) - { - checkRuntime(cudaFree(polyline->d_pts)); - } if (polyline->h_pts) { free(polyline->h_pts); diff --git a/tests/cvcuda/system/OsdUtils.cuh b/tests/cvcuda/system/OsdUtils.cuh index 69d1a7ce..1686d44c 100644 --- a/tests/cvcuda/system/OsdUtils.cuh +++ b/tests/cvcuda/system/OsdUtils.cuh @@ -59,7 +59,6 @@ struct Point struct Polyline { int *h_pts = nullptr; - int *d_pts = nullptr; int n_pts = 0; }; diff --git a/tests/cvcuda/system/TestOpBndBox.cpp b/tests/cvcuda/system/TestOpBndBox.cpp index 45a1c3a6..e895347b 100644 --- a/tests/cvcuda/system/TestOpBndBox.cpp +++ b/tests/cvcuda/system/TestOpBndBox.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ namespace gt = ::testing; namespace test = nvcv::test; +using namespace cvcuda::priv; static int randl(int l, int h) { @@ -41,12 +43,12 @@ static int randl(int l, int h) } static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, - const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, NVCVBndBoxesI bboxes, - cudaStream_t stream) + const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, + std::shared_ptr bboxes, cudaStream_t stream) { auto context = cuosd_context_create(); - for (int n = 0; n < bboxes.batch; n++) + for (int n = 0; n < bboxes->batch(); n++) { test::osd::Image *image = test::osd::create_image( data.numCols(), data.numRows(), @@ -54,11 +56,11 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, int bufSize = data.numCols() * data.numRows() * data.numChannels(); EXPECT_EQ(cudaSuccess, cudaMemcpy(image->data0, inBuf + n * bufSize, bufSize, cudaMemcpyDeviceToDevice)); - auto numBoxes = bboxes.numBoxes[n]; + auto numBoxes = bboxes->numBoxesAt(n); for (int i = 0; i < numBoxes; i++) { - auto bbox = bboxes.boxes[i]; + auto bbox = bboxes->boxAt(n, i); int left = std::max(std::min(bbox.box.x, data.numCols() - 1), 0); int top = std::max(std::min(bbox.box.y, data.numRows() - 1), 0); @@ -80,7 +82,6 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, test::osd::cuosd_apply(context, image, stream); - bboxes.boxes = (NVCVBndBoxI *)((unsigned char *)bboxes.boxes + numBoxes * sizeof(NVCVBndBoxI)); EXPECT_EQ(cudaSuccess, cudaMemcpy(vect.data() + n * bufSize, image->data0, bufSize, cudaMemcpyDeviceToHost)); test::osd::free_image(image); @@ -93,14 +94,12 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, static void runOp(cudaStream_t &stream, cvcuda::BndBox &op, int &inN, int &inW, int &inH, int &num, int &sed, nvcv::ImageFormat &format) { - NVCVBndBoxesI bndBoxes; - std::vector numBoxVec; - std::vector bndBoxVec; + std::vector> bndBoxVec; srand(sed); for (int n = 0; n < inN; n++) { - numBoxVec.push_back(num); + std::vector curVec; for (int i = 0; i < num; i++) { NVCVBndBoxI bndBox; @@ -113,13 +112,12 @@ static void runOp(cudaStream_t &stream, cvcuda::BndBox &op, int &inN, int &inW, (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; bndBox.borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - bndBoxVec.push_back(bndBox); + curVec.push_back(bndBox); } + bndBoxVec.push_back(curVec); } - bndBoxes.batch = inN; - bndBoxes.numBoxes = 
numBoxVec.data(); - bndBoxes.boxes = bndBoxVec.data(); + std::shared_ptr bndBoxes = std::make_shared(bndBoxVec); nvcv::Tensor imgIn = nvcv::util::CreateTensor(inN, inW, inH, format); nvcv::Tensor imgOut = nvcv::util::CreateTensor(inN, inW, inH, format); @@ -145,7 +143,7 @@ static void runOp(cudaStream_t &stream, cvcuda::BndBox &op, int &inN, int &inW, EXPECT_EQ(cudaSuccess, cudaMemset(input->basePtr(), 0xFF, inSampleStride * inAccess->numSamples())); EXPECT_EQ(cudaSuccess, cudaMemset(output->basePtr(), 0xFF, outSampleStride * outAccess->numSamples())); - EXPECT_NO_THROW(op(stream, imgIn, imgOut, bndBoxes)); + EXPECT_NO_THROW(op(stream, imgIn, imgOut, (NVCVBndBoxesI)bndBoxes.get())); // check cdata std::vector test(outBufSize); diff --git a/tests/cvcuda/system/TestOpBoxBlur.cpp b/tests/cvcuda/system/TestOpBoxBlur.cpp index e18e73d0..e3efd566 100644 --- a/tests/cvcuda/system/TestOpBoxBlur.cpp +++ b/tests/cvcuda/system/TestOpBoxBlur.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -32,14 +33,15 @@ namespace gt = ::testing; namespace test = nvcv::test; +using namespace cvcuda::priv; static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, - NVCVBlurBoxesI bboxes, cudaStream_t stream) + std::shared_ptr bboxes, cudaStream_t stream) { auto context = cuosd_context_create(); - for (int n = 0; n < bboxes.batch; n++) + for (int n = 0; n < bboxes->batch(); n++) { test::osd::Image *image = test::osd::create_image( data.numCols(), data.numRows(), @@ -47,11 +49,11 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, int bufSize = data.numCols() * data.numRows() * data.numChannels(); EXPECT_EQ(cudaSuccess, cudaMemcpy(image->data0, inBuf + n * bufSize, bufSize, cudaMemcpyDeviceToDevice)); - auto numBoxes = bboxes.numBoxes[n]; + auto numBoxes = bboxes->numBoxesAt(n); for (int i = 0; i < numBoxes; i++) { - auto bbox = bboxes.boxes[i]; + auto bbox = bboxes->boxAt(n, i); int left = std::max(std::min(bbox.box.x, data.numCols() - 1), 0); int top = std::max(std::min(bbox.box.y, data.numRows() - 1), 0); @@ -70,7 +72,6 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, test::osd::cuosd_apply(context, image, stream); - bboxes.boxes = (NVCVBlurBoxI *)((unsigned char *)bboxes.boxes + numBoxes * sizeof(NVCVBlurBoxI)); EXPECT_EQ(cudaSuccess, cudaMemcpy(vect.data() + n * bufSize, image->data0, bufSize, cudaMemcpyDeviceToHost)); test::osd::free_image(image); @@ -82,13 +83,11 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, static void runOp(cudaStream_t &stream, cvcuda::BoxBlur &op, int &inN, int &inW, int &inH, int &cols, int &rows, int &wBox, int &hBox, int &ks, nvcv::ImageFormat &format) { - NVCVBlurBoxesI blurBoxes; - std::vector numBoxVec; - std::vector blurBoxVec; + std::vector> blurBoxVec; for (int n = 0; n < inN; n++) { - numBoxVec.push_back(cols * rows); + std::vector curVec; for (int i = 0; i < cols; i++) { int x = (inW / cols) * i + wBox / 2; @@ -100,14 +99,13 @@ static void runOp(cudaStream_t &stream, cvcuda::BoxBlur &op, int &inN, int &inW, blurBox.box.width = wBox; blurBox.box.height = hBox; blurBox.kernelSize = ks; - blurBoxVec.push_back(blurBox); + curVec.push_back(blurBox); } } + blurBoxVec.push_back(curVec); } - blurBoxes.batch = inN; - blurBoxes.numBoxes = numBoxVec.data(); - blurBoxes.boxes = blurBoxVec.data(); + std::shared_ptr blurBoxes = std::make_shared(blurBoxVec); nvcv::Tensor imgIn = 
nvcv::util::CreateTensor(inN, inW, inH, format); nvcv::Tensor imgOut = nvcv::util::CreateTensor(inN, inW, inH, format); @@ -140,7 +138,7 @@ static void runOp(cudaStream_t &stream, cvcuda::BoxBlur &op, int &inN, int &inW, EXPECT_EQ(cudaSuccess, cudaMemcpy(output->basePtr(), inVec.data(), outBufSize, cudaMemcpyHostToDevice)); // run operator - EXPECT_NO_THROW(op(stream, imgIn, imgOut, blurBoxes)); + EXPECT_NO_THROW(op(stream, imgIn, imgOut, (NVCVBlurBoxesI)blurBoxes.get())); // check cdata std::vector test(outBufSize); @@ -179,7 +177,6 @@ TEST_P(OpBoxBlur, BoxBlur_sanity) { cudaStream_t stream; ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - /* int inN = GetParamValue<0>(); int inW = GetParamValue<1>(); int inH = GetParamValue<2>(); @@ -189,9 +186,8 @@ TEST_P(OpBoxBlur, BoxBlur_sanity) int hBox = GetParamValue<6>(); int ks = GetParamValue<7>(); nvcv::ImageFormat format = GetParamValue<8>(); - cvcuda::BoxBlur op; + cvcuda::BoxBlur op; runOp(stream, op, inN, inW, inH, cols, rows, wBox, hBox, ks, format); - */ EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } diff --git a/tests/cvcuda/system/TestOpFindHomography.cpp b/tests/cvcuda/system/TestOpFindHomography.cpp new file mode 100644 index 00000000..a0ef4fb8 --- /dev/null +++ b/tests/cvcuda/system/TestOpFindHomography.cpp @@ -0,0 +1,394 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef PERFORMANCE_RUN +# define WARMUP_ITERATIONS 5 +# define PERF_ITERATIONS 50 +#endif + +namespace test = nvcv::test; +namespace util = nvcv::util; +namespace cuda = nvcv::cuda; + +static std::default_random_engine g_rng(std::random_device{}()); + +static void calculateDst(float x, float y, float *X, float *Y, float *model) +{ + *X = model[0] * x + model[1] * y + model[2] * 1; + *Y = model[3] * x + model[4] * y + model[5] * 1; +} + +static void calculateGoldModelMatrix(float *m, std::mt19937 &rng, std::uniform_int_distribution &dis) +{ + // random rotation angle between 0 and pi + float theta = (M_PI / 2.0) * dis(rng) / 100; + float Tx = (float)dis(rng) / 100; + float Ty = (float)dis(rng) / 100; + float sx = (float)dis(rng) / 100; + float sy = (float)dis(rng) / 100; + float p1 = (float)dis(rng) / 100; + float p2 = (float)dis(rng) / 100 * 2; + cuda::math::Matrix He; + He[0] = {cos(theta), -sin(theta), Tx}; + He[1] = {sin(theta), cos(theta), Ty}; + He[2] = {0, 0, 1}; + cuda::math::Matrix Ha; + Ha[0] = {1, sy, 0}; + Ha[1] = {sx, 1, 0}; + Ha[2] = {0, 0, 1}; + cuda::math::Matrix Hp; + Hp[0] = {1, 0, 0}; + Hp[1] = {0, 1, 0}; + Hp[2] = {p1, p2, 1}; + cuda::math::Matrix result = He * (Ha * Hp); + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) m[i * 3 + j] = result[i][j]; +} + +// clang-format off +NVCV_TEST_SUITE_P(OpFindHomography, test::ValueList +{ + // numSamples, numPoints} + {8, 16}, + {16, 20}, + {25, 40} +}); + +// clang-format on + +TEST_P(OpFindHomography, correct_output) +{ + int numSamples = GetParamValue<0>(); + int numPoints = GetParamValue<1>(); + numPoints *= numPoints; + + // clang-format off + nvcv::Tensor srcPoints({{numSamples, numPoints}, "NW"}, nvcv::TYPE_2F32); + nvcv::Tensor dstPoints({{numSamples, numPoints}, "NW"}, nvcv::TYPE_2F32); + nvcv::Tensor models({{numSamples, 3, 3}, "NHW"}, nvcv::TYPE_F32); + + // clang-format on + + auto srcData = srcPoints.exportData(); + auto dstData = dstPoints.exportData(); + auto modelsData = models.exportData(); + + ASSERT_EQ(srcData->shape(0), srcData->shape(0)); + ASSERT_EQ(srcData->shape(1), srcData->shape(1)); + + std::vector srcVec(2 * numSamples * numPoints); + std::vector dstVec(2 * numSamples * numPoints); + std::vector modelsVec(numSamples * 9); + std::vector estimatedModelsVec(numSamples * 9); + std::vector computedDstVec(2 * numSamples * numPoints); + + std::random_device rd; + std::mt19937 gen(rd()); // Mersenne Twister engine + std::uniform_int_distribution<> dis(0, 100); + + int numXPoints = static_cast(std::sqrt(numPoints)); + int numYPoints = numXPoints; + +#ifdef WRITE_COORDINATES_TO_FILE + std::string src_filename + = "src_coordinates_" + std::to_string(numSamples) + "x" + std::to_string(numPoints) + ".bin"; + std::string dst_filename + = "dst_coordinates_" + std::to_string(numSamples) + "x" + std::to_string(numPoints) + ".bin"; + + std::ofstream outSrcFile(src_filename.c_str(), std::ios::binary); + if (!outSrcFile.is_open()) + { + std::cerr << "Failed to open the src file for writing." << std::endl; + return; + } + + std::ofstream outDstFile(dst_filename.c_str(), std::ios::binary); + if (!outDstFile.is_open()) + { + std::cerr << "Failed to open the dst file for writing." 
<< std::endl; + return; + } +#endif + + // Fill gold models and src and dst points + for (int i = 0; i < numSamples; i++) + { +#pragma unroll + calculateGoldModelMatrix(&modelsVec[i * 9], gen, dis); + // generate src and dst points + for (int j = 0; j < numYPoints; j++) + { + for (int k = 0; k < numXPoints; k++) + { + int idx = j * numYPoints + k; + srcVec[i * numPoints * 2 + 2 * idx] = dis(gen); + srcVec[i * numPoints * 2 + 2 * idx + 1] = dis(gen); + + float dstx, dsty; + calculateDst(srcVec[i * numPoints * 2 + 2 * idx], srcVec[i * numPoints * 2 + 2 * idx + 1], &dstx, &dsty, + modelsVec.data() + i * 9); + dstVec[i * numPoints * 2 + 2 * idx] = dstx; + dstVec[i * numPoints * 2 + 2 * idx + 1] = dsty; + } + } + } + +#ifdef WRITE_COORDINATES_TO_FILE + outSrcFile.write(reinterpret_cast(srcVec.data()), srcVec.size() * sizeof(float)); + outDstFile.write(reinterpret_cast(dstVec.data()), dstVec.size() * sizeof(float)); + + outSrcFile.close(); + outDstFile.close(); +#endif + + ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec.data(), sizeof(float) * 2 * numPoints * numSamples, + cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(dstData->basePtr(), dstVec.data(), sizeof(float) * 2 * numPoints * numSamples, + cudaMemcpyHostToDevice)); + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + cvcuda::FindHomography fh(numSamples, numPoints); + +#ifdef PERFORMANCE_RUN + for (int it = 0; it < WARMUP_ITERATIONS; it++) + { + EXPECT_NO_THROW(fh(stream, srcPoints, dstPoints, models)); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start, stream); + for (int it = 0; it < PERF_ITERATIONS; it++) + { + EXPECT_NO_THROW(fh(stream, srcPoints, dstPoints, models)); + } + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time taken for " << numSamples << "x" << numPoints << " = " << milliseconds / PERF_ITERATIONS + << "ms\n"; + // std::cout << "Time taken per image = " << milliseconds / PERF_ITERATIONS / numSamples << "ms\n"; + + cudaEventDestroy(start); + cudaEventDestroy(stop); +#else + EXPECT_NO_THROW(fh(stream, srcPoints, dstPoints, models)); +#endif + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // copy back the estimated models into modelsVec + for (int i = 0; i < numSamples; i++) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(estimatedModelsVec.data() + i * 9, sizeof(float) * 3, + modelsData->basePtr() + i * modelsData->stride(0), modelsData->stride(1), + sizeof(float) * 3, 3, cudaMemcpyDeviceToHost)); + } + + // Compute dst vec based on model estimated +#ifndef PERFORMANCE_RUN + for (int i = 0; i < numSamples; i++) + { + for (int j = 0; j < numYPoints; j++) + { + for (int k = 0; k < numXPoints; k++) + { + int idx = j * numYPoints + k; + float dstx, dsty; + calculateDst(srcVec[i * numPoints * 2 + 2 * idx], srcVec[i * numPoints * 2 + 2 * idx + 1], &dstx, &dsty, + estimatedModelsVec.data() + i * 9); + computedDstVec[i * numPoints * 2 + 2 * idx] = dstx; + computedDstVec[i * numPoints * 2 + 2 * idx + 1] = dsty; + float A = dstVec[i * numPoints * 2 + 2 * idx]; + float B = computedDstVec[i * numPoints * 2 + 2 * idx]; + EXPECT_NEAR(A, B, 1e-03); + A = dstVec[i * numPoints * 2 + 2 * idx + 1]; + B = computedDstVec[i * numPoints * 2 + 2 * idx + 1]; + EXPECT_NEAR(A, B, 1e-03); + } + } + } +#endif +} + 
+TEST_P(OpFindHomography, varshape_correct_output) +{ + int numSamples = GetParamValue<0>(); + int maxPoints = GetParamValue<1>(); + std::vector numPoints(numSamples); + std::vector numXPoints(numSamples); + + std::mt19937 rng(12345); + std::uniform_int_distribution dis(0, 100); + std::uniform_int_distribution dis_num_points(4, maxPoints); + + auto reqs = nvcv::TensorBatch::CalcRequirements(numSamples); + nvcv::TensorBatch srcTensorBatch(reqs); + nvcv::TensorBatch dstTensorBatch(reqs); + nvcv::TensorBatch modelsTensorBatch(reqs); + + std::vector> srcVec(numSamples); + std::vector> dstVec(numSamples); + std::vector modelsVec(numSamples * 9); + std::vector estimatedModelsVec(numSamples * 9); + std::vector> computedDstVec(numSamples); + + int maxNumPoints = 0; + for (int i = 0; i < numSamples; i++) + { + numXPoints[i] = dis_num_points(rng); + numPoints[i] = numXPoints[i] * numXPoints[i]; + if (numPoints[i] > maxNumPoints) + maxNumPoints = numPoints[i]; + + // Fill gold models and src and dst points + calculateGoldModelMatrix(&modelsVec[i * 9], rng, dis); + for (int j = 0; j < numPoints[i]; j++) + { + int sx = dis(rng); + int sy = dis(rng); + srcVec[i].push_back(sx); + srcVec[i].push_back(sy); + + float dstx, dsty; + calculateDst(sx, sy, &dstx, &dsty, modelsVec.data() + i * 9); + dstVec[i].push_back(dstx); + dstVec[i].push_back(dsty); + } + + nvcv::Tensor srcPoints( + { + {1, numPoints[i]}, + "NW" + }, + nvcv::TYPE_2F32); + nvcv::Tensor dstPoints( + { + {1, numPoints[i]}, + "NW" + }, + nvcv::TYPE_2F32); + nvcv::Tensor models( + { + {1, 3, 3}, + "NHW" + }, + nvcv::TYPE_F32); + + auto srcData = srcPoints.exportData(); + auto dstData = dstPoints.exportData(); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec[i].data(), sizeof(float) * srcVec[i].size(), + cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(dstData->basePtr(), dstVec[i].data(), sizeof(float) * dstVec[i].size(), + cudaMemcpyHostToDevice)); + + srcTensorBatch.pushBack(srcPoints); + dstTensorBatch.pushBack(dstPoints); + modelsTensorBatch.pushBack(models); + } + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + cvcuda::FindHomography fh(numSamples, maxNumPoints); + +#ifdef PERFORMANCE_RUN + for (int it = 0; it < WARMUP_ITERATIONS; it++) + { + EXPECT_NO_THROW(fh(stream, batchSrc, batchDst, models)); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start, stream); + for (int it = 0; it < PERF_ITERATIONS; it++) + { + EXPECT_NO_THROW(fh(stream, batchSrc, batchDst, models)); + } + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time taken for " << numSamples << "x" << maxPoints << " = " << milliseconds / PERF_ITERATIONS + << "ms\n"; + cudaEventDestroy(start); + cudaEventDestroy(stop); +#else + EXPECT_NO_THROW(fh(stream, srcTensorBatch, dstTensorBatch, modelsTensorBatch)); +#endif + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // copy back the estimated models into modelsVec + for (int i = 0; i < numSamples; i++) + { + auto modelsData = modelsTensorBatch[i].exportData(); + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(estimatedModelsVec.data() + i * 9, sizeof(float) * 3, modelsData->basePtr(), + modelsData->stride(1), sizeof(float) * 3, 3, cudaMemcpyDeviceToHost)); + } + + // Compute dst vec based on model estimated +#ifndef 
PERFORMANCE_RUN + for (int i = 0; i < numSamples; i++) + { + for (int j = 0; j < numPoints[i]; j++) + { + float dstx, dsty; + float sx, sy; + sx = srcVec[i][2 * j + 0]; + sy = srcVec[i][2 * j + 1]; + calculateDst(sx, sy, &dstx, &dsty, estimatedModelsVec.data() + i * 9); + computedDstVec[i].push_back(dstx); + computedDstVec[i].push_back(dsty); + float A = dstVec[i][2 * j + 0]; + float B = computedDstVec[i][2 * j + 0]; + EXPECT_NEAR(A, B, 1e-03); + A = dstVec[i][2 * j + 1]; + B = computedDstVec[i][2 * j + 1]; + EXPECT_NEAR(A, B, 1e-03); + } + } +#endif +} diff --git a/tests/cvcuda/system/TestOpLabel.cpp b/tests/cvcuda/system/TestOpLabel.cpp new file mode 100644 index 00000000..12516ab4 --- /dev/null +++ b/tests/cvcuda/system/TestOpLabel.cpp @@ -0,0 +1,835 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Definitions.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ----------------------- Basic utility definitions --------------------------- + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; +namespace test = nvcv::test; +namespace type = nvcv::test::type; + +using U8 = uint8_t; + +using RawBufferType = std::vector; + +// --------------------- Reference (gold) computations ------------------------- + +namespace ref { + +// Pre-filter step is to binarize srcVec using threshold range [min, max] -> 1, zero otherwise +template +inline void Binarize(RawBufferType &srcVec, const RawBufferType &minVec, const RawBufferType &maxVec, + const long4 &srcStrides, const long1 &minStrides, const long1 &maxStrides, const long4 &shape) +{ + bool hasMinThresh = minStrides.x > 0; + bool hasMaxThresh = maxStrides.x > 0; + + for (long x = 0; x < shape.x; ++x) + { + ST minThresh = hasMinThresh ? util::ValueAt(minVec, minStrides, long1{x}) : 0; + ST maxThresh = hasMaxThresh ? util::ValueAt(maxVec, maxStrides, long1{x}) : 0; + + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + long4 curCoord{x, y, z, w}; + + ST value = util::ValueAt(srcVec, srcStrides, curCoord); + + if (hasMinThresh && hasMaxThresh) + { + value = (value < minThresh || value > maxThresh) ? 0 : 1; + } + else if (hasMinThresh) + { + value = (value < minThresh) ? 0 : 1; + } + else if (hasMaxThresh) + { + value = (value > maxThresh) ? 
0 : 1; + } + + util::ValueAt(srcVec, srcStrides, curCoord) = value; + } + } + } + } +} + +// Label each component with label in dstVec matching value in srcVec, marking labeled elements as 1 in tmpVec +// (since this function is called recursively, using big input sizes may lead to stack overflow) +template +inline void LabelComponent(RawBufferType &tmpVec, RawBufferType &dstVec, const RawBufferType &srcVec, + const long4 &tmpStrides, const long4 &dstStrides, const long4 &srcStrides, + const long4 &shape, const long4 &curCoord, ST value, DT label) +{ + if (util::ValueAt(tmpVec, tmpStrides, curCoord) == 1) + { + return; // The element was already labeled, skip it + } + if (value != util::ValueAt(srcVec, srcStrides, curCoord)) + { + return; // The element is not in the same labeled region, skip it + } + + // Set element label in dstVec and mark it as labeled in tmpVec + util::ValueAt
(dstVec, dstStrides, curCoord) = label; + util::ValueAt(tmpVec, tmpStrides, curCoord) = 1; + + // For each neighbor, recursively call label component to label each neighbor + if (curCoord.y > 0) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y - 1, curCoord.z, curCoord.w}, value, label); + } + if (curCoord.y < shape.y - 1) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y + 1, curCoord.z, curCoord.w}, value, label); + } + if (curCoord.z > 0) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y, curCoord.z - 1, curCoord.w}, value, label); + } + if (curCoord.z < shape.z - 1) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y, curCoord.z + 1, curCoord.w}, value, label); + } + if (curCoord.w > 0) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y, curCoord.z, curCoord.w - 1}, value, label); + } + if (curCoord.w < shape.w - 1) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y, curCoord.z, curCoord.w + 1}, value, label); + } +} + +// Label N volumes in NDHW tensor stored in srcVec yielding dstVec, with corresponding srcStrides/dstStrides +// - ST is the source type, the data type of the input tensor in srcVec +// - DT is the destination type, the data type of the output tensor in dstVec +template +void Label(RawBufferType &dstVec, const RawBufferType &srcVec, const long4 &dstStrides, const long4 &srcStrides, + const long4 &shape) +{ + // Use a temporary NDHW tensor stored in tmpVec to set elements already labeled, initially zeroes (all unlabeled) + RawBufferType tmpVec(shape.x * shape.y * shape.z * shape.w, 0); + + // The temporary tensor is packed and each element is a single byte, thus: + long4 tmpStrides{shape.y * shape.z * shape.w, shape.z * shape.w, shape.w, 1}; + + // For all elements in input tensor + for (long x = 0; x < shape.x; ++x) + { + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + long4 curCoord{x, y, z, w}; + + if (util::ValueAt(tmpVec, tmpStrides, curCoord) == 1) + { + continue; // The element was already labeled, skip it + } + + // Get current value from input tensor and set label as a 1D flattened (global) position + ST value = util::ValueAt(srcVec, srcStrides, curCoord); + DT label = y * dstStrides.y / sizeof(DT) + z * dstStrides.z / sizeof(DT) + w; + + // Recursively call to label component + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, curCoord, value, + label); + } + } + } + } +} + +// Replace labels assigned to regions marked as background in source, and fix a potential region labeled with +// background label in destination by another label (since background label is a reserved label) +template +void ReplaceBgLabels(RawBufferType &dstVec, const RawBufferType &srcVec, const RawBufferType &bglVec, + const long4 &dstStrides, const long4 &srcStrides, const long1 &bglStrides, const long4 &shape) +{ + for (long x = 0; x < shape.x; ++x) + { + ST backgroundLabel = util::ValueAt(bglVec, bglStrides, long1{x}); + + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + long4 curCoord{x, y, z, w}; + + ST 
value = util::ValueAt<ST>(srcVec, srcStrides, curCoord); + DT label = util::ValueAt<DT>
(dstVec, dstStrides, curCoord); + + if (value == backgroundLabel) + { + // The current value is a background label, write it to output + util::ValueAt<DT>
(dstVec, dstStrides, curCoord) = (DT)backgroundLabel; + } + else if (label == (DT)backgroundLabel) + { + // If the label assigned happens to be the same as the background label, replace it by + // another label that is never assigned outside the possible offsets + util::ValueAt
(dstVec, dstStrides, curCoord) = dstStrides.x / sizeof(DT); + } + } + } + } + } +} + +// Get the unique set of labels from output in dstVec, disregarding background labels +template +void GetLabels(std::vector> &labels, const RawBufferType &dstVec, const RawBufferType &bglVec, + const long4 &dstStrides, const long1 &bglStrides, const long4 &dstShape) +{ + bool hasBgLabel = bglStrides.x > 0; + + for (long x = 0; x < dstShape.x; ++x) + { + ST backgroundLabel = hasBgLabel ? util::ValueAt(bglVec, bglStrides, long1{x}) : 0; + + for (long y = 0; y < dstShape.y; ++y) + { + for (long z = 0; z < dstShape.z; ++z) + { + for (long w = 0; w < dstShape.w; ++w) + { + DT label = util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}); + + if (hasBgLabel && label == (DT)backgroundLabel) + { + continue; // ignore (do not get) background labels + } + + labels[x].insert(label); + } + } + } + } +} + +// Get the unique set of labels from statistics in staVec +template +void GetLabels(std::vector> &labels, const RawBufferType &cntVec, const RawBufferType &staVec, + const long1 &cntStrides, const long3 &staStrides, long numSamples) +{ + for (long x = 0; x < numSamples; ++x) + { + long numLabels = util::ValueAt
<DT>(cntVec, cntStrides, long1{x}); + + for (long y = 0; y < numLabels; ++y) + { + DT label = util::ValueAt<DT>
(staVec, staStrides, long3{x, y, 0}); + + labels[x].insert(label); + } + } +} + +// Count how many different labels were found +template +void CountLabels(RawBufferType &cntVec, const long1 &cntStrides, const std::vector> &labels, + long numSamples) +{ + for (long x = 0; x < numSamples; ++x) + { + util::ValueAt
(cntVec, cntStrides, long1{x}) = (DT)labels[x].size(); + } +} + +// Sort statistics according to region index as test stats have no imposed ordering, it allows comparing against gold +template +void SortStats(std::vector>> &stats, std::vector> &labels, + const RawBufferType &staVec, const long3 &staStrides, const long3 &staShape) +{ + for (long x = 0; x < staShape.x; ++x) + { + long numLabels = labels[x].size(); + + stats[x].resize(numLabels); + + for (long y = 0; y < numLabels; ++y) + { + DT label = util::ValueAt
(staVec, staStrides, long3{x, y, 0}); + auto fit = labels[x].find(label); + + long regionIdx = std::distance(labels[x].cbegin(), fit); + ASSERT_LE(regionIdx, numLabels) << "E idx " << regionIdx << " >= " << numLabels; + + stats[x][regionIdx].resize(staShape.z); + + for (long z = 0; z < staShape.z; ++z) + { + stats[x][regionIdx][z] = util::ValueAt
(staVec, staStrides, long3{x, y, z}); + } + } + } +} + +// Compute statistics of labeled regions +template +void ComputeStats(std::vector>> &stats, const RawBufferType &dstVec, + const RawBufferType &bglVec, const long4 &dstStrides, const long1 &bglStrides, + const std::vector> &labels, const long4 &shape, int numStats) +{ + // One-element-after-the-end label is a special label assigned to a region which got the background label + DT endLabel = dstStrides.x / sizeof(DT); + + bool hasBgLabel = bglStrides.x > 0; + + for (long x = 0; x < shape.x; ++x) + { + ST backgroundLabel = hasBgLabel ? util::ValueAt(bglVec, bglStrides, long1{x}) : 0; + + stats[x].resize(labels[x].size()); + + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + DT label = util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}); + auto fit = labels[x].find(label); // result of find iterator + if (fit == labels[x].end()) + { + continue; // this label is to be ignored + } + + DT posLabel = y * dstStrides.y / sizeof(DT) + z * dstStrides.z / sizeof(DT) + w; + + if ((hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) || label == posLabel) + { + long regionIdx = std::distance(labels[x].cbegin(), fit); + + stats[x][regionIdx].resize(numStats); + stats[x][regionIdx][0] = label; + stats[x][regionIdx][1] = w; + stats[x][regionIdx][2] = z; + + if (numStats == 6) + { + stats[x][regionIdx][3] = 1; + stats[x][regionIdx][4] = 1; + stats[x][regionIdx][5] = 1; + } + else + { + stats[x][regionIdx][3] = y; + stats[x][regionIdx][4] = 1; + stats[x][regionIdx][5] = 1; + stats[x][regionIdx][6] = 1; + stats[x][regionIdx][7] = 1; + } + } + } + } + } + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + DT label = util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}); + auto fit = labels[x].find(label); + if (fit == labels[x].end()) + { + continue; + } + + DT posLabel = y * dstStrides.y / sizeof(DT) + z * dstStrides.z / sizeof(DT) + w; + + if ((hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) || label == posLabel) + { + continue; // statistics for this element was already computed + } + + long regionIdx = std::distance(labels[x].cbegin(), fit); + DT bboxAreaW = std::abs(stats[x][regionIdx][1] - w) + 1; + DT bboxAreaH = std::abs(stats[x][regionIdx][2] - z) + 1; + + if (numStats == 6) + { + stats[x][regionIdx][3] = std::max(stats[x][regionIdx][3], bboxAreaW); + stats[x][regionIdx][4] = std::max(stats[x][regionIdx][4], bboxAreaH); + stats[x][regionIdx][5] += 1; + } + else + { + DT bboxAreaD = std::abs(stats[x][regionIdx][3] - y) + 1; + + stats[x][regionIdx][4] = std::max(stats[x][regionIdx][4], bboxAreaW); + stats[x][regionIdx][5] = std::max(stats[x][regionIdx][5], bboxAreaH); + stats[x][regionIdx][6] = std::max(stats[x][regionIdx][6], bboxAreaD); + stats[x][regionIdx][7] += 1; + } + } + } + } + } +} + +// Remove islands (regions with less than minimum size in mszVec) from dstVec based on statistics +template +void RemoveIslands(std::vector> &labels, RawBufferType &dstVec, const RawBufferType &bglVec, + const RawBufferType &mszVec, const long4 &dstStrides, const long1 &bglStrides, + const long1 &mszStrides, const std::vector>> &stats, const long4 &shape, + int numStats) +{ + for (long x = 0; x < shape.x; ++x) + { + ST backgroundLabel = util::ValueAt(bglVec, bglStrides, long1{x}); + DT minSize = util::ValueAt
<DT>(mszVec, mszStrides, long1{x}); + + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + long4 curCoord{x, y, z, w}; + + DT label = util::ValueAt<DT>
(dstVec, dstStrides, curCoord); + auto fit = labels[x].find(label); // result of find iterator + if (fit == labels[x].end()) + { + continue; // this label is to be ignored + } + + long regionIdx = std::distance(labels[x].cbegin(), fit); + DT regionSize = stats[x][regionIdx][numStats - 1]; + + if (regionSize < minSize) + { + util::ValueAt
(dstVec, dstStrides, curCoord) = backgroundLabel; + } + } + } + } + } +} + +// Relabel replaces index-based labels by consecutive region indices +template +void Relabel(RawBufferType &dstVec, const RawBufferType &bglVec, const RawBufferType &staVec, + const RawBufferType &cntVec, const long4 &dstStrides, const long1 &bglStrides, const long3 &staStrides, + const long1 &cntStrides, const long4 &shape) +{ + for (long x = 0; x < shape.x; ++x) + { + ST backgroundLabel = util::ValueAt(bglVec, bglStrides, long1{x}); + + std::map origLabelToRegionIdx; + + DT numLabels = util::ValueAt
<DT>(cntVec, cntStrides, long1{x}); + + for (DT y = 0; y < numLabels; ++y) + { + DT origLabel = util::ValueAt<DT>
(staVec, staStrides, long3{x, y, 0}); + origLabelToRegionIdx.insert({origLabel, y}); + } + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + DT label = util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}); + + if (label == (DT)backgroundLabel) + { + continue; + } + + DT regionIdx = origLabelToRegionIdx[label]; + + if (regionIdx >= (DT)backgroundLabel) + { + regionIdx += 1; // increment region indices to skip background labels + } + + util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}) = regionIdx; + } + } + } + } +} + +} // namespace ref + +// ----------------------------- Start tests ----------------------------------- + +// clang-format off + +#define NVCV_SHAPE(w, h, d, n) (int4{w, h, d, n}) + +#define NVCV_TEST_ROW(InShape, DataType, Type, HasBgLabel, HasMinThresh, HasMaxThresh, DoPostFilters, DoRelabel) \ + type::Types, type::Value, Type, type::Value, type::Value, \ + type::Value, type::Value, type::Value> + +// DoPostFilters: (0) none; (1) count regions; (2) + compute statistics; (3) + island removal. + +NVCV_TYPED_TEST_SUITE(OpLabel, type::Types< + NVCV_TEST_ROW(NVCV_SHAPE(33, 16, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, false, false, 0, false), + NVCV_TEST_ROW(NVCV_SHAPE(23, 81, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, true, false, 1, false), + NVCV_TEST_ROW(NVCV_SHAPE(13, 14, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, true, true, 2, false), + NVCV_TEST_ROW(NVCV_SHAPE(32, 43, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, false, false, 3, false), + NVCV_TEST_ROW(NVCV_SHAPE(22, 12, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, false, true, 0, false), + NVCV_TEST_ROW(NVCV_SHAPE(15, 16, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, false, true, 1, false), + NVCV_TEST_ROW(NVCV_SHAPE(14, 26, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, true, false, 2, true), + NVCV_TEST_ROW(NVCV_SHAPE(28, 73, 1, 3), NVCV_DATA_TYPE_U16, uint16_t, true, true, true, 3, true), + NVCV_TEST_ROW(NVCV_SHAPE(23, 21, 12, 1), NVCV_DATA_TYPE_U32, uint32_t, false, false, false, 0, false), + NVCV_TEST_ROW(NVCV_SHAPE(33, 41, 22, 1), NVCV_DATA_TYPE_U32, uint32_t, false, false, false, 1, false), + NVCV_TEST_ROW(NVCV_SHAPE(25, 38, 13, 2), NVCV_DATA_TYPE_S8, int8_t, true, false, false, 2, false), + NVCV_TEST_ROW(NVCV_SHAPE(25, 18, 13, 1), NVCV_DATA_TYPE_S8, int8_t, true, false, false, 3, false), + NVCV_TEST_ROW(NVCV_SHAPE(22, 37, 19, 2), NVCV_DATA_TYPE_S16, int16_t, true, true, false, 0, false), + NVCV_TEST_ROW(NVCV_SHAPE(18, 27, 3, 1), NVCV_DATA_TYPE_S32, int32_t, true, false, true, 1, false), + NVCV_TEST_ROW(NVCV_SHAPE(17, 29, 5, 2), NVCV_DATA_TYPE_U8, uint8_t, true, true, true, 2, false), + NVCV_TEST_ROW(NVCV_SHAPE(16, 28, 4, 3), NVCV_DATA_TYPE_U8, uint8_t, true, true, true, 3, true) +>); + +// clang-format on + +TYPED_TEST(OpLabel, correct_output) +{ + // First setup: get test parameters, create input and output tensors and get their data accesses + + int4 shape{type::GetValue}; + nvcv::DataType srcDT{type::GetValue}; + nvcv::DataType dstDT{nvcv::TYPE_U32}; + + using SrcT = type::GetType; + using DstT = uint32_t; + + bool hasBgLabel = type::GetValue; + bool hasMinThresh = type::GetValue; + bool hasMaxThresh = type::GetValue; + int doPostFilters = type::GetValue; + bool doRelabel = type::GetValue; + + // @note The tensors below are defined as: input or source (src), output or destination (dst), background + // labels (bgl), minimum threshold (min), maximum threshold (max), minimum size for islands removal (msz), + // count of labeled regions (count) and statistics computed per labeled region (sta) + + nvcv::Tensor srcTensor, dstTensor, bglTensor, minTensor, maxTensor, mszTensor, cntTensor, staTensor; + + nvcv::Optional srcData, dstData, bglData, minData, maxData, mszData, cntData, staData; + + NVCVConnectivityType connectivity = (shape.z == 1) ? NVCV_CONNECTIVITY_4_2D : NVCV_CONNECTIVITY_6_3D; + NVCVLabelType assignLabels = doRelabel ? NVCV_LABEL_SEQUENTIAL : NVCV_LABEL_FAST; + + long3 staShape{shape.w, 10000, (shape.z == 1) ? 
6 : 8}; + + // clang-format off + + if (shape.w == 1) // tensors without N in layout (single-sample problem) + { + if (shape.z == 1) // tensors without D in layout (2D problem) + { + srcTensor = nvcv::Tensor({{shape.y, shape.x}, "HW"}, srcDT); + } + else // tensors with D in layout (3D problem) + { + srcTensor = nvcv::Tensor({{shape.z, shape.y, shape.x}, "DHW"}, srcDT); + } + } + else // tensors with N in layout (batched problem) + { + if (shape.z == 1) // tensors without D in layout (2D problem) + { + srcTensor = nvcv::Tensor({{shape.w, shape.y, shape.x}, "NHW"}, srcDT); + } + else // tensors with D in layout (3D problem) + { + srcTensor = nvcv::Tensor({{shape.w, shape.z, shape.y, shape.x}, "NDHW"}, srcDT); + } + } + + if (hasBgLabel) + { + bglTensor = nvcv::Tensor({{shape.w}, "N"}, srcDT); + + bglData = bglTensor.exportData(); + ASSERT_TRUE(bglData); + } + if (hasMinThresh) + { + minTensor = nvcv::Tensor({{shape.w}, "N"}, srcDT); + + minData = minTensor.exportData(); + ASSERT_TRUE(minData); + } + if (hasMaxThresh) + { + maxTensor = nvcv::Tensor({{shape.w}, "N"}, srcDT); + + maxData = maxTensor.exportData(); + ASSERT_TRUE(maxData); + } + if (doPostFilters >= 1) + { + cntTensor = nvcv::Tensor({{shape.w}, "N"}, dstDT); + + cntData = cntTensor.exportData(); + ASSERT_TRUE(cntData); + } + if (doPostFilters >= 2) + { + staTensor = nvcv::Tensor({{staShape.x, staShape.y, staShape.z}, "NMA"}, dstDT); + + staData = staTensor.exportData(); + ASSERT_TRUE(staData); + } + if (doPostFilters == 3) + { + mszTensor = nvcv::Tensor({{shape.w}, "N"}, dstDT); + + mszData = mszTensor.exportData(); + ASSERT_TRUE(mszData); + } + + // clang-format on + + dstTensor = nvcv::Tensor(srcTensor.shape(), dstDT); + + srcData = srcTensor.exportData(); + ASSERT_TRUE(srcData); + + dstData = dstTensor.exportData(); + ASSERT_TRUE(dstData); + + // Second setup: get tensors shape, strides and buffer sizes + + int4 ids{srcTensor.layout().find('N'), srcTensor.layout().find('D'), srcTensor.layout().find('H'), + srcTensor.layout().find('W')}; + + long4 srcShape{shape.w, shape.z, shape.y, shape.x}; // srcShape is NDHW whereas shape is WHDN + + long4 srcStrides{0, 0, srcData->stride(ids.z), srcData->stride(ids.w)}; + long4 dstStrides{0, 0, dstData->stride(ids.z), dstData->stride(ids.w)}; + long1 bglStrides{(bglTensor) ? bglData->stride(0) : 0}; + long1 minStrides{(minTensor) ? minData->stride(0) : 0}; + long1 maxStrides{(maxTensor) ? maxData->stride(0) : 0}; + long1 mszStrides{(mszTensor) ? mszData->stride(0) : 0}; + long1 cntStrides{(cntTensor) ? cntData->stride(0) : 0}; + long3 staStrides = (staTensor) ? long3{staData->stride(0), staData->stride(1), staData->stride(2)} : long3{0, 0, 0}; + + srcStrides.y = (ids.y == -1) ? srcStrides.z * srcShape.z : srcData->stride(ids.y); + srcStrides.x = (ids.x == -1) ? srcStrides.y * srcShape.y : srcData->stride(ids.x); + dstStrides.y = (ids.y == -1) ? dstStrides.z * srcShape.z : dstData->stride(ids.y); + dstStrides.x = (ids.x == -1) ? 
dstStrides.y * srcShape.y : dstData->stride(ids.x); + + long srcBufSize = srcStrides.x * srcShape.x; + long dstBufSize = dstStrides.x * srcShape.x; + long bglBufSize = bglStrides.x * srcShape.x; + long minBufSize = minStrides.x * srcShape.x; + long maxBufSize = maxStrides.x * srcShape.x; + long mszBufSize = mszStrides.x * srcShape.x; + long cntBufSize = cntStrides.x * srcShape.x; + long staBufSize = staStrides.x * srcShape.x; + + // Third setup: generate raw buffer data and copy them into tensors + + RawBufferType srcVec(srcBufSize); + RawBufferType bglVec(bglBufSize); + RawBufferType minVec(minBufSize); + RawBufferType maxVec(maxBufSize); + RawBufferType mszVec(mszBufSize); + + std::default_random_engine rng(0); + + std::uniform_int_distribution srcRandom(0, 6); + std::uniform_int_distribution bglRandom(0, (minTensor || maxTensor) ? 1 : 6); + std::uniform_int_distribution minRandom(1, 3); + std::uniform_int_distribution maxRandom(3, 5); + + // clang-format off + + for (long x = 0; x < srcShape.x; ++x) + for (long y = 0; y < srcShape.y; ++y) + for (long z = 0; z < srcShape.z; ++z) + for (long w = 0; w < srcShape.w; ++w) + util::ValueAt(srcVec, srcStrides, long4{x, y, z, w}) = srcRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec.data(), srcBufSize, cudaMemcpyHostToDevice)); + + if (bglTensor) + { + for (long x = 0; x < srcShape.x; ++x) + util::ValueAt(bglVec, bglStrides, long1{x}) = bglRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(bglData->basePtr(), bglVec.data(), bglBufSize, cudaMemcpyHostToDevice)); + } + if (minTensor) + { + for (long x = 0; x < srcShape.x; ++x) + util::ValueAt(minVec, minStrides, long1{x}) = minRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(minData->basePtr(), minVec.data(), minBufSize, cudaMemcpyHostToDevice)); + } + if (maxTensor) + { + for (long x = 0; x < srcShape.x; ++x) + util::ValueAt(maxVec, maxStrides, long1{x}) = maxRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(maxData->basePtr(), maxVec.data(), maxBufSize, cudaMemcpyHostToDevice)); + } + if (mszTensor) + { + for (long x = 0; x < srcShape.x; ++x) + util::ValueAt(mszVec, mszStrides, long1{x}) = 2; + + ASSERT_EQ(cudaSuccess, cudaMemcpy(mszData->basePtr(), mszVec.data(), mszBufSize, cudaMemcpyHostToDevice)); + } + + // clang-format on + + // After all above setups are done, run the operator, synchronize the stream and copy its results back to host + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::Label op; + EXPECT_NO_THROW(op(stream, srcTensor, dstTensor, bglTensor, minTensor, maxTensor, mszTensor, cntTensor, staTensor, + connectivity, assignLabels)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // The operator's results are named as test that must be equal to gold, the three outputs are: labels (lab), + // count (cnt) and statistics (sta); gold statistics are not written as raw buffer, only in 3-vector form + + RawBufferType labTestVec(dstBufSize, 0); + RawBufferType labGoldVec(dstBufSize, 0); + RawBufferType cntTestVec(cntBufSize, 0); + RawBufferType cntGoldVec(cntBufSize, 0); + RawBufferType staTestVec(staBufSize, 0); + + std::vector> testLabels(srcShape.x); + std::vector> goldLabels(srcShape.x); + + std::vector>> testStats(srcShape.x); + std::vector>> goldStats(srcShape.x); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(labTestVec.data(), dstData->basePtr(), dstBufSize, cudaMemcpyDeviceToHost)); + + // To generate the gold data, the reference code (in ref namespace) 
is used in a specific sequence of steps: + // (1) pre-filter binarization uses min/max thresholds (if present) to replace input mask to binary; (2) the + // label operation itself; (3) background labels are replaced (if present); (4) get all original gold labels; + // (5) count the labels got; (6) compute statistics of the labeled regions; (7) get all original test labels; + // (8) remove islands as post-filter step (if minSize tensor is present); (9) relabel to replace non-sequential + // labels to consecutive region indices; (10) sort test statistics to be able to compare against gold. + + // In-between the generation of gold data, EXPECT_EQ is used to compare test data against gold. + + if (minTensor || maxTensor) + { + ref::Binarize(srcVec, minVec, maxVec, srcStrides, minStrides, maxStrides, srcShape); + } + + ref::Label(labGoldVec, srcVec, dstStrides, srcStrides, srcShape); + + if (bglTensor) + { + ref::ReplaceBgLabels(labGoldVec, srcVec, bglVec, dstStrides, srcStrides, bglStrides, srcShape); + } + + ref::GetLabels(goldLabels, labGoldVec, bglVec, dstStrides, bglStrides, srcShape); + + if (cntTensor) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy(cntTestVec.data(), cntData->basePtr(), cntBufSize, cudaMemcpyDeviceToHost)); + + ref::CountLabels(cntGoldVec, cntStrides, goldLabels, srcShape.x); + } + + EXPECT_EQ(cntTestVec, cntGoldVec); + + if (staTensor) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy(staTestVec.data(), staData->basePtr(), staBufSize, cudaMemcpyDeviceToHost)); + + ref::ComputeStats(goldStats, labGoldVec, bglVec, dstStrides, bglStrides, goldLabels, srcShape, + staShape.z); + + ref::GetLabels(testLabels, cntTestVec, staTestVec, cntStrides, staStrides, srcShape.x); + } + else + { + ref::GetLabels(testLabels, labTestVec, bglVec, dstStrides, bglStrides, srcShape); + } + + EXPECT_EQ(testLabels, goldLabels); + + if (mszTensor) + { + ref::RemoveIslands(goldLabels, labGoldVec, bglVec, mszVec, dstStrides, bglStrides, mszStrides, + goldStats, srcShape, staShape.z); + } + + if (doRelabel) + { + ref::Relabel(labGoldVec, bglVec, staTestVec, cntTestVec, dstStrides, bglStrides, staStrides, + cntStrides, srcShape); + } + + if (staTensor) + { + ref::SortStats(testStats, testLabels, staTestVec, staStrides, staShape); + } + + EXPECT_EQ(testStats, goldStats); + + EXPECT_EQ(labTestVec, labGoldVec); +} diff --git a/tests/cvcuda/system/TestOpOSD.cpp b/tests/cvcuda/system/TestOpOSD.cpp index fcfd9323..5ef18eab 100644 --- a/tests/cvcuda/system/TestOpOSD.cpp +++ b/tests/cvcuda/system/TestOpOSD.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ namespace gt = ::testing; namespace test = nvcv::test; +using namespace cvcuda::priv; static int randl(int l, int h) { @@ -44,12 +46,12 @@ static int randl(int l, int h) #pragma GCC optimize("O1") static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, - const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, NVCVElements ctx, - cudaStream_t stream) + const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, + std::shared_ptr ctx, cudaStream_t stream) { auto context = cuosd_context_create(); - for (int n = 0; n < ctx.batch; n++) + for (int n = 0; n < ctx->batch(); n++) { test::osd::Image *image = test::osd::create_image( data.numCols(), data.numRows(), @@ -57,16 +59,16 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, int bufSize = data.numCols() * data.numRows() * data.numChannels(); EXPECT_EQ(cudaSuccess, cudaMemcpy(image->data0, inBuf + n * bufSize, bufSize, 
cudaMemcpyDeviceToDevice)); - auto numElements = ctx.numElements[n]; + auto numElements = ctx->numElementsAt(n); for (int i = 0; i < numElements; i++) { - auto element = ctx.elements[i]; - switch (element.type) + auto element = ctx->elementAt(n, i); + switch (element->type()) { case NVCVOSDType::NVCV_OSD_RECT: { - auto bbox = *((NVCVBndBoxI *)element.data); + auto bbox = *((NVCVBndBoxI *)element->ptr()); int left = std::max(std::min(bbox.box.x, data.numCols() - 1), 0); int top = std::max(std::min(bbox.box.y, data.numRows() - 1), 0); @@ -86,7 +88,7 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_TEXT: { - auto text = *((NVCVText *)element.data); + auto text = *((NVCVText *)element->ptr()); cuOSDColor fontColor = *(cuOSDColor *)(&text.fontColor); cuOSDColor bgColor = *(cuOSDColor *)(&text.bgColor); cuosd_draw_text(context, text.utf8Text, text.fontSize, text.fontName, text.tlPos.x, text.tlPos.y, @@ -95,35 +97,34 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_SEGMENT: { - auto segment = *((NVCVSegment *)element.data); + auto segment = (NVCVSegment *)element->ptr(); - int left = segment.box.x; - int top = segment.box.y; - int right = left + segment.box.width - 1; - int bottom = top + segment.box.height - 1; + int left = segment->box.x; + int top = segment->box.y; + int right = left + segment->box.width - 1; + int bottom = top + segment->box.height - 1; - if (left == right || top == bottom || segment.box.width <= 0 || segment.box.height <= 0) + if (left == right || top == bottom || segment->box.width <= 0 || segment->box.height <= 0) { continue; } - - cuOSDColor borderColor = *(cuOSDColor *)(&segment.borderColor); - cuOSDColor segColor = *(cuOSDColor *)(&segment.segColor); - cuosd_draw_segmentmask(context, left, top, right, bottom, segment.thickness, segment.dSeg, - segment.segWidth, segment.segHeight, segment.segThreshold, borderColor, + cuOSDColor borderColor = *(cuOSDColor *)(&segment->borderColor); + cuOSDColor segColor = *(cuOSDColor *)(&segment->segColor); + cuosd_draw_segmentmask(context, left, top, right, bottom, segment->thickness, segment->dSeg, + segment->segWidth, segment->segHeight, segment->segThreshold, borderColor, segColor); break; } case NVCVOSDType::NVCV_OSD_POINT: { - auto point = *((NVCVPoint *)element.data); + auto point = *((NVCVPoint *)element->ptr()); cuOSDColor color = *(cuOSDColor *)(&point.color); cuosd_draw_point(context, point.centerPos.x, point.centerPos.y, point.radius, color); break; } case NVCVOSDType::NVCV_OSD_LINE: { - auto line = *((NVCVLine *)element.data); + auto line = *((NVCVLine *)element->ptr()); cuOSDColor color = *(cuOSDColor *)(&line.color); cuosd_draw_line(context, line.pos0.x, line.pos0.y, line.pos1.x, line.pos1.y, line.thickness, color, line.interpolation); @@ -131,16 +132,16 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_POLYLINE: { - auto pl = *((NVCVPolyLine *)element.data); - cuOSDColor borderColor = *(cuOSDColor *)(&pl.borderColor); - cuOSDColor fill_color = *(cuOSDColor *)(&pl.fillColor); - cuosd_draw_polyline(context, pl.hPoints, pl.dPoints, pl.numPoints, pl.thickness, pl.isClosed, - borderColor, pl.interpolation, fill_color); + auto pl = (NVCVPolyLine *)element->ptr(); + cuOSDColor borderColor = *(cuOSDColor *)(&pl->borderColor); + cuOSDColor fill_color = *(cuOSDColor *)(&pl->fillColor); + cuosd_draw_polyline(context, pl->hPoints, pl->dPoints, pl->numPoints, pl->thickness, 
pl->isClosed, + borderColor, pl->interpolation, fill_color); break; } case NVCVOSDType::NVCV_OSD_ROTATED_RECT: { - auto rb = *((NVCVRotatedBox *)element.data); + auto rb = *((NVCVRotatedBox *)element->ptr()); cuOSDColor borderColor = *(cuOSDColor *)(&rb.borderColor); cuOSDColor bgColor = *(cuOSDColor *)(&rb.bgColor); cuosd_draw_rotationbox(context, rb.centerPos.x, rb.centerPos.y, rb.width, rb.height, rb.yaw, @@ -149,7 +150,7 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_CIRCLE: { - auto circle = *((NVCVCircle *)element.data); + auto circle = *((NVCVCircle *)element->ptr()); cuOSDColor borderColor = *(cuOSDColor *)(&circle.borderColor); cuOSDColor bgColor = *(cuOSDColor *)(&circle.bgColor); cuosd_draw_circle(context, circle.centerPos.x, circle.centerPos.y, circle.radius, circle.thickness, @@ -158,7 +159,7 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_ARROW: { - auto arrow = *((NVCVArrow *)element.data); + auto arrow = *((NVCVArrow *)element->ptr()); cuOSDColor color = *(cuOSDColor *)(&arrow.color); cuosd_draw_arrow(context, arrow.pos0.x, arrow.pos0.y, arrow.pos1.x, arrow.pos1.y, arrow.arrowSize, arrow.thickness, color, arrow.interpolation); @@ -166,7 +167,7 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_CLOCK: { - auto clock = *((NVCVClock *)element.data); + auto clock = *((NVCVClock *)element->ptr()); cuOSDClockFormat clockFormat = (cuOSDClockFormat)(int)(clock.clockFormat); cuOSDColor fontColor = *(cuOSDColor *)(&clock.fontColor); cuOSDColor bgColor = *(cuOSDColor *)(&clock.bgColor); @@ -180,7 +181,6 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } test::osd::cuosd_apply(context, image, stream); - ctx.elements = (NVCVElement *)((unsigned char *)ctx.elements + numElements * sizeof(NVCVElement)); EXPECT_EQ(cudaSuccess, cudaMemcpy(vect.data() + n * bufSize, image->data0, bufSize, cudaMemcpyDeviceToHost)); test::osd::free_image(image); } @@ -191,85 +191,11 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, #pragma GCC pop_options -static void free_elements(std::vector &elementVec) -{ - for (auto element : elementVec) - { - switch (element.type) - { - case NVCVOSDType::NVCV_OSD_RECT: - { - NVCVBndBoxI *bndBox = (NVCVBndBoxI *)element.data; - delete (bndBox); - break; - } - case NVCVOSDType::NVCV_OSD_TEXT: - { - NVCVText *label = (NVCVText *)element.data; - delete (label); - break; - } - case NVCVOSDType::NVCV_OSD_SEGMENT: - { - NVCVSegment *segment = (NVCVSegment *)element.data; - delete (segment); - break; - } - case NVCVOSDType::NVCV_OSD_POINT: - { - NVCVPoint *point = (NVCVPoint *)element.data; - delete (point); - break; - } - case NVCVOSDType::NVCV_OSD_LINE: - { - NVCVLine *line = (NVCVLine *)element.data; - delete (line); - break; - } - case NVCVOSDType::NVCV_OSD_POLYLINE: - { - NVCVPolyLine *pl = (NVCVPolyLine *)element.data; - delete (pl); - break; - } - case NVCVOSDType::NVCV_OSD_ROTATED_RECT: - { - NVCVRotatedBox *rb = (NVCVRotatedBox *)element.data; - delete (rb); - break; - } - case NVCVOSDType::NVCV_OSD_CIRCLE: - { - NVCVCircle *circle = (NVCVCircle *)element.data; - delete (circle); - break; - } - case NVCVOSDType::NVCV_OSD_ARROW: - { - NVCVArrow *arrow = (NVCVArrow *)element.data; - delete (arrow); - break; - } - case NVCVOSDType::NVCV_OSD_CLOCK: - { - NVCVClock *clock = (NVCVClock *)element.data; - delete (clock); - break; - } - default: - break; - } - } -} - // 
run operator static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int &inH, int &num, int &sed, nvcv::ImageFormat &format) { - NVCVElements ctx; - std::vector numElementVec; - std::vector elementVec; + std::vector>> elementVec; test::osd::Segment *test_segment = test::osd::create_segment(); test::osd::Polyline *test_polyline = test::osd::create_polyline(); @@ -277,176 +203,158 @@ static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int srand(sed); for (int n = 0; n < inN; n++) { - numElementVec.push_back(num); + std::vector> curVec; for (int i = 0; i < num; i++) { - NVCVElement element; - element.type = (NVCVOSDType)randl(int(NVCV_OSD_NONE) + 1, int(NVCV_OSD_MAX) - 1); - switch (element.type) + NVCVOSDType type = (NVCVOSDType)randl(int(NVCV_OSD_NONE) + 1, int(NVCV_OSD_MAX) - 1); + std::shared_ptr element; + switch (type) { case NVCVOSDType::NVCV_OSD_RECT: { - NVCVBndBoxI *bndBox = new NVCVBndBoxI(); - bndBox->box.x = randl(0, inW - 1); - bndBox->box.y = randl(0, inH - 1); - bndBox->box.width = randl(1, inW); - bndBox->box.height = randl(1, inH); - bndBox->thickness = randl(-1, 30); - bndBox->fillColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - bndBox->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)bndBox; + NVCVBndBoxI bndBox; + bndBox.box.x = randl(0, inW - 1); + bndBox.box.y = randl(0, inH - 1); + bndBox.box.width = randl(1, inW); + bndBox.box.height = randl(1, inH); + bndBox.thickness = randl(-1, 30); + bndBox.fillColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + bndBox.borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + element = std::make_shared(type, &bndBox); break; } case NVCVOSDType::NVCV_OSD_TEXT: { - NVCVText *label = new NVCVText(); - label->utf8Text = "abcdefghijklmnopqrstuvwxyz"; - label->fontSize = 5 * randl(1, 10); - label->fontName = DEFAULT_OSD_FONT; - label->tlPos.x = randl(0, inW - 1); - label->tlPos.y = randl(0, inH - 1); - label->fontColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - label->bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)label; + NVCVText text = NVCVText("abcdefghijklmnopqrstuvwxyz", 5 * randl(1, 10), DEFAULT_OSD_FONT, + NVCVPointI({randl(0, inW - 1), randl(0, inH - 1)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)})); + element = std::make_shared(type, &text); break; } case NVCVOSDType::NVCV_OSD_SEGMENT: { - NVCVSegment *segment = new NVCVSegment(); - segment->box.x = randl(0, inW - 1); - segment->box.y = randl(0, inH - 1); - segment->box.width = randl(1, inW); - segment->box.height = randl(1, inH); - segment->thickness = randl(-1, 5); - segment->dSeg = test_segment->data; - segment->segWidth = test_segment->width; - segment->segHeight = test_segment->height; - segment->segThreshold = 0.1 * randl(1, 5); - 
segment->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - segment->segColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)segment; + NVCVSegment segment = NVCVSegment( + NVCVBoxI({randl(0, inW - 1), randl(0, inH - 1), randl(1, inW), randl(1, inH)}), randl(-1, 5), + test_segment->data, test_segment->width, test_segment->height, 0.1 * randl(1, 5), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)})); + element = std::make_shared(type, &segment); break; } case NVCVOSDType::NVCV_OSD_POINT: { - NVCVPoint *point = new NVCVPoint(); - point->centerPos.x = randl(0, inW - 1); - point->centerPos.y = randl(0, inH - 1); - point->radius = randl(1, 50); - point->color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)point; + NVCVPoint point; + point.centerPos.x = randl(0, inW - 1); + point.centerPos.y = randl(0, inH - 1); + point.radius = randl(1, 50); + point.color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255)}; + element = std::make_shared(type, &point); break; } case NVCVOSDType::NVCV_OSD_LINE: { - NVCVLine *line = new NVCVLine(); - line->pos0.x = randl(0, inW - 1); - line->pos0.y = randl(0, inH - 1); - line->pos1.x = randl(0, inW - 1); - line->pos1.y = randl(0, inH - 1); - line->thickness = randl(1, 5); - line->color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255)}; - line->interpolation = true; - element.data = (void *)line; + NVCVLine line; + line.pos0.x = randl(0, inW - 1); + line.pos0.y = randl(0, inH - 1); + line.pos1.x = randl(0, inW - 1); + line.pos1.y = randl(0, inH - 1); + line.thickness = randl(1, 5); + line.color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255)}; + line.interpolation = true; + element = std::make_shared(type, &line); break; } case NVCVOSDType::NVCV_OSD_POLYLINE: { - NVCVPolyLine *pl = new NVCVPolyLine(); - pl->hPoints = test_polyline->h_pts; - pl->dPoints = test_polyline->d_pts; - pl->numPoints = test_polyline->n_pts; - pl->thickness = randl(1, 5); - pl->isClosed = randl(0, 1); - pl->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - pl->fillColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - pl->interpolation = true; - element.data = (void *)pl; + NVCVPolyLine pl + = NVCVPolyLine(test_polyline->h_pts, test_polyline->n_pts, randl(1, 5), randl(0, 1), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + true); + element = std::make_shared(type, &pl); break; } case NVCVOSDType::NVCV_OSD_ROTATED_RECT: { - NVCVRotatedBox *rb = new 
NVCVRotatedBox(); - rb->centerPos.x = randl(0, inW - 1); - rb->centerPos.y = randl(0, inH - 1); - rb->width = randl(1, inW); - rb->height = randl(1, inH); - rb->yaw = 0.02 * randl(1, 314); - rb->thickness = randl(1, 5); - rb->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - rb->bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255)}; - rb->interpolation = false; - element.data = (void *)rb; + NVCVRotatedBox rb; + rb.centerPos.x = randl(0, inW - 1); + rb.centerPos.y = randl(0, inH - 1); + rb.width = randl(1, inW); + rb.height = randl(1, inH); + rb.yaw = 0.02 * randl(1, 314); + rb.thickness = randl(1, 5); + rb.borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + rb.bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255)}; + rb.interpolation = false; + element = std::make_shared(type, &rb); break; } case NVCVOSDType::NVCV_OSD_CIRCLE: { - NVCVCircle *circle = new NVCVCircle(); - circle->centerPos.x = randl(0, inW - 1); - circle->centerPos.y = randl(0, inH - 1); - circle->radius = randl(1, 50); - circle->thickness = randl(1, 5); - circle->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - circle->bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)circle; + NVCVCircle circle; + circle.centerPos.x = randl(0, inW - 1); + circle.centerPos.y = randl(0, inH - 1); + circle.radius = randl(1, 50); + circle.thickness = randl(1, 5); + circle.borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + circle.bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + element = std::make_shared(type, &circle); break; } case NVCVOSDType::NVCV_OSD_ARROW: { - NVCVArrow *arrow = new NVCVArrow(); - arrow->pos0.x = randl(0, inW - 1); - arrow->pos0.y = randl(0, inH - 1); - arrow->pos1.x = randl(0, inW - 1); - arrow->pos1.y = randl(0, inH - 1); - arrow->arrowSize = randl(1, 5); - arrow->thickness = randl(1, 5); - arrow->color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - arrow->interpolation = false; - element.data = (void *)arrow; + NVCVArrow arrow; + arrow.pos0.x = randl(0, inW - 1); + arrow.pos0.y = randl(0, inH - 1); + arrow.pos1.x = randl(0, inW - 1); + arrow.pos1.y = randl(0, inH - 1); + arrow.arrowSize = randl(1, 5); + arrow.thickness = randl(1, 5); + arrow.color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255)}; + arrow.interpolation = false; + element = std::make_shared(type, &arrow); break; } case NVCVOSDType::NVCV_OSD_CLOCK: { - NVCVClock *clock = new NVCVClock(); - clock->clockFormat = (NVCVClockFormat)(randl(1, 3)); - clock->time = time(0); - clock->fontSize = 5 * randl(1, 10); - clock->font = DEFAULT_OSD_FONT; - clock->tlPos.x = randl(0, inW - 1); - clock->tlPos.y = randl(0, inH - 1); - clock->fontColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned 
char)randl(0, 255), (unsigned char)randl(0, 255)}; - clock->bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)clock; + NVCVClock clock + = NVCVClock{(NVCVClockFormat)(randl(1, 3)), + time(0), + 5 * randl(1, 10), + DEFAULT_OSD_FONT, + NVCVPointI({randl(0, inW - 1), randl(0, inH - 1)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)})}; + element = std::make_shared(type, &clock); break; } default: break; } - - elementVec.push_back(element); + curVec.push_back(element); } + elementVec.push_back(curVec); } - ctx.batch = inN; - ctx.numElements = numElementVec.data(); - ctx.elements = elementVec.data(); + std::shared_ptr ctx = std::make_shared(elementVec); nvcv::Tensor imgIn = nvcv::util::CreateTensor(inN, inW, inH, format); nvcv::Tensor imgOut = nvcv::util::CreateTensor(inN, inW, inH, format); @@ -472,7 +380,7 @@ static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int EXPECT_EQ(cudaSuccess, cudaMemset(input->basePtr(), 0xFF, inSampleStride * inAccess->numSamples())); EXPECT_EQ(cudaSuccess, cudaMemset(output->basePtr(), 0xFF, outSampleStride * outAccess->numSamples())); - EXPECT_NO_THROW(op(stream, imgIn, imgOut, ctx)); + EXPECT_NO_THROW(op(stream, imgIn, imgOut, (NVCVElements)ctx.get())); // check cdata std::vector test(outBufSize); @@ -487,7 +395,6 @@ static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int test::osd::free_segment(test_segment); test::osd::free_polyline(test_polyline); - free_elements(elementVec); EXPECT_EQ(gold, test); } diff --git a/tests/cvcuda/system/TestOpPairwiseMatcher.cpp b/tests/cvcuda/system/TestOpPairwiseMatcher.cpp new file mode 100644 index 00000000..c2742461 --- /dev/null +++ b/tests/cvcuda/system/TestOpPairwiseMatcher.cpp @@ -0,0 +1,442 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Definitions.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// ----------------------- Basic utility definitions --------------------------- + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; +namespace type = nvcv::test::type; + +using RawBufferType = std::vector; + +template +using uniform_distribution + = std::conditional_t, std::uniform_int_distribution, std::uniform_real_distribution>; + +template +constexpr nvcv::DataType ToDataType() +{ + if constexpr (std::is_same_v) + { + return nvcv::TYPE_U8; + } + else if constexpr (std::is_same_v) + { + return nvcv::TYPE_U32; + } + else if constexpr (std::is_same_v) + { + return nvcv::TYPE_F32; + } +} + +// --------------------- Reference (gold) computations ------------------------- + +namespace ref { + +template +T absdiff(T a, T b) +{ + if constexpr (std::is_floating_point_v) + { + return std::abs(a - b); + } + else + { + return a < b ? b - a : a - b; + } +} + +template +void ComputeDistance(DT &dist, ST p1, ST p2, NVCVNormType normType) +{ + if (normType == NVCV_NORM_HAMMING) + { + if constexpr (!std::is_floating_point_v) + { + dist += std::bitset(p1 ^ p2).count(); + } + } + else if (normType == NVCV_NORM_L1) + { + dist += absdiff(p1, p2); + } + else if (normType == NVCV_NORM_L2) + { + dist += std::pow(absdiff(p1, p2), 2); + } +} + +template +void BruteForceMatcher(RawBufferType &mchVec, RawBufferType &nmVec, RawBufferType &dVec, const RawBufferType &set1Vec, + const RawBufferType &set2Vec, const long3 &mchStrides, const long1 &nmStrides, + const long2 &dStrides, const long3 &set1Strides, const long3 &set2Strides, int numSamples, + int numDim, int set1Size, int set2Size, bool crossCheck, int matchesPerPoint, + NVCVNormType normType) +{ + std::vector> distIdx(set2Size); + std::vector> cckDistIdx(set1Size); + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + int mchIdx = 0; + + for (int set1Idx = 0; set1Idx < set1Size; set1Idx++) + { + for (int set2Idx = 0; set2Idx < set2Size; set2Idx++) + { + float dist = 0.f; + + for (int coordIdx = 0; coordIdx < numDim; coordIdx++) + { + ST p1 = util::ValueAt(set1Vec, set1Strides, long3{sampleIdx, set1Idx, coordIdx}); + ST p2 = util::ValueAt(set2Vec, set2Strides, long3{sampleIdx, set2Idx, coordIdx}); + + ComputeDistance(dist, p1, p2, normType); + } + if (normType == NVCV_NORM_L2) + { + dist = std::sqrt(dist); + } + + distIdx[set2Idx] = std::tie(dist, set2Idx); + } + + std::sort(distIdx.begin(), distIdx.end()); + + if (crossCheck) + { + int set2Idx = std::get<1>(distIdx[0]); + + for (int cck1Idx = 0; cck1Idx < set1Size; cck1Idx++) + { + float dist = 0.f; + + for (int coordIdx = 0; coordIdx < numDim; coordIdx++) + { + ST p1 = util::ValueAt(set1Vec, set1Strides, long3{sampleIdx, cck1Idx, coordIdx}); + ST p2 = util::ValueAt(set2Vec, set2Strides, long3{sampleIdx, set2Idx, coordIdx}); + + ComputeDistance(dist, p1, p2, normType); + } + if (normType == NVCV_NORM_L2) + { + dist = std::sqrt(dist); + } + + cckDistIdx[cck1Idx] = std::tie(dist, cck1Idx); + } + + std::sort(cckDistIdx.begin(), cckDistIdx.end()); + + if (std::get<1>(cckDistIdx[0]) == set1Idx) + { + util::ValueAt(mchVec, mchStrides, long3{sampleIdx, mchIdx, 0}) = set1Idx; + util::ValueAt(mchVec, mchStrides, long3{sampleIdx, mchIdx, 1}) = std::get<1>(distIdx[0]); + if (dStrides.x > 0) + { + util::ValueAt(dVec, dStrides, long2{sampleIdx, mchIdx}) = std::get<0>(distIdx[0]); + } + + mchIdx++; + if (nmStrides.x > 0) + { + 
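+ // cross-check accepted this match: store the running match count for this sample
+ // (mchIdx was just incremented, so it is the cumulative number of matches so far)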
util::ValueAt(nmVec, nmStrides, long1{sampleIdx}) = mchIdx; + } + } + } + else + { + for (int m = 0; m < matchesPerPoint; m++) + { + util::ValueAt(mchVec, mchStrides, long3{sampleIdx, mchIdx, 0}) = set1Idx; + util::ValueAt(mchVec, mchStrides, long3{sampleIdx, mchIdx, 1}) = std::get<1>(distIdx[m]); + if (dStrides.x > 0) + { + util::ValueAt(dVec, dStrides, long2{sampleIdx, mchIdx}) = std::get<0>(distIdx[m]); + } + + mchIdx++; + if (nmStrides.x > 0) + { + util::ValueAt(nmVec, nmStrides, long1{sampleIdx}) = mchIdx; + } + } + } + } + } +} + +template +void PairwiseMatcher(NVCVPairwiseMatcherType algoChoice, RawBufferType &mchVec, RawBufferType &nmVec, + RawBufferType &dVec, const RawBufferType &set1Vec, const RawBufferType &set2Vec, + const long3 &mchStrides, const long1 &nmStrides, const long2 &dStrides, const long3 &set1Strides, + const long3 &set2Strides, int numSamples, int numDim, int set1Size, int set2Size, bool crossCheck, + int matchesPerPoint, NVCVNormType normType) +{ + if (algoChoice == NVCV_BRUTE_FORCE) + { + BruteForceMatcher(mchVec, nmVec, dVec, set1Vec, set2Vec, mchStrides, nmStrides, dStrides, set1Strides, + set2Strides, numSamples, numDim, set1Size, set2Size, crossCheck, matchesPerPoint, + normType); + } +} + +inline void SortOutput(std::vector> &outIdsDist, const RawBufferType &mchVec, + const RawBufferType &nmVec, const RawBufferType &dVec, const long3 &mchStrides, + const long1 &nmStrides, const long2 &dStrides, int numSamples, int set1Size, int matchesPerPoint, + int maxMatches) +{ + int totalMatches = set1Size * matchesPerPoint; + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + if (nmStrides.x > 0) + { + totalMatches = util::ValueAt(nmVec, nmStrides, long1{sampleIdx}); + } + + for (int matchIdx = 0; matchIdx < totalMatches && matchIdx < maxMatches; matchIdx++) + { + int set1Idx = util::ValueAt(mchVec, mchStrides, long3{sampleIdx, matchIdx, 0}); + int set2Idx = util::ValueAt(mchVec, mchStrides, long3{sampleIdx, matchIdx, 1}); + float distance = (dStrides.x > 0) ? 
util::ValueAt(dVec, dStrides, long2{sampleIdx, matchIdx}) : 0.f; + + outIdsDist.emplace_back(sampleIdx, set1Idx, set2Idx, distance); + } + } + + std::sort(outIdsDist.begin(), outIdsDist.end()); +} + +} // namespace ref + +// ----------------------------- Start tests ----------------------------------- + +// clang-format off + +#define NVCV_TEST_ROW(NumSamples, Set1Size, Set2Size, NumDim, MatchesPerPoint, CrossCheck, StoreDistances, \ + AlgoChoice, NormType, Type) \ + type::Types, type::Value, type::Value, type::Value, \ + type::Value, type::Value, type::Value, \ + type::Value, type::Value, Type> + +NVCV_TYPED_TEST_SUITE(OpPairwiseMatcher, type::Types< + NVCV_TEST_ROW(1, 2, 2, 1, 1, false, false, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint8_t), + NVCV_TEST_ROW(2, 3, 4, 5, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint8_t), + NVCV_TEST_ROW(3, 4, 3, 32, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint32_t), + NVCV_TEST_ROW(4, 11, 12, 128, 2, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint8_t), + NVCV_TEST_ROW(3, 17, 16, 128, 3, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint8_t), + NVCV_TEST_ROW(2, 3, 4, 32, 1, true, false, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint32_t), + NVCV_TEST_ROW(1, 5, 6, 7, 1, false, false, NVCV_BRUTE_FORCE, NVCV_NORM_L1, uint8_t), + NVCV_TEST_ROW(2, 18, 19, 17, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, uint32_t), + NVCV_TEST_ROW(3, 98, 17, 32, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, float), + NVCV_TEST_ROW(2, 54, 65, 32, 2, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, uint8_t), + NVCV_TEST_ROW(3, 68, 37, 1025, 1, true, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, float), + NVCV_TEST_ROW(2, 14, 24, 32, 3, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, uint8_t), + NVCV_TEST_ROW(3, 48, 37, 8, 1, true, false, NVCV_BRUTE_FORCE, NVCV_NORM_L1, float), + NVCV_TEST_ROW(4, 8, 9, 1025, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, uint8_t), + NVCV_TEST_ROW(3, 27, 16, 8, 1, false, false, NVCV_BRUTE_FORCE, NVCV_NORM_L2, uint32_t), + NVCV_TEST_ROW(2, 73, 132, 64, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, float), + NVCV_TEST_ROW(3, 87, 98, 19, 2, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, uint8_t), + NVCV_TEST_ROW(4, 43, 32, 26, 1, true, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, float), + NVCV_TEST_ROW(3, 67, 58, 32, 3, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, uint8_t), + NVCV_TEST_ROW(2, 73, 62, 8, 1, true, false, NVCV_BRUTE_FORCE, NVCV_NORM_L2, float) +>); + +// clang-format on + +TYPED_TEST(OpPairwiseMatcher, CorrectOutput) +{ + int numSamples = type::GetValue; + int set1Size = type::GetValue; + int set2Size = type::GetValue; + int numDim = type::GetValue; + int matchesPerPoint = type::GetValue; + bool crossCheck = type::GetValue; + bool storeDistances = type::GetValue; + + NVCVPairwiseMatcherType algoChoice{type::GetValue}; + + NVCVNormType normType{type::GetValue}; + + using SrcT = type::GetType; + + constexpr nvcv::DataType srcDT{ToDataType()}; + + int maxSet1 = set1Size + 12; + int maxSet2 = set2Size + 23; // adding extra sizes to test different capacities on set 1 and 2 + int maxMatches = maxSet1 * matchesPerPoint; + + // clang-format off + + nvcv::Tensor set1({{numSamples, maxSet1, numDim}, "NMD"}, srcDT); + nvcv::Tensor set2({{numSamples, maxSet2, numDim}, "NMD"}, srcDT); + + nvcv::Tensor numSet1({{numSamples}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor numSet2({{numSamples}, "N"}, nvcv::TYPE_S32); + + nvcv::Tensor matches({{numSamples, maxMatches, 2}, "NMD"}, nvcv::TYPE_S32); + + nvcv::Tensor numMatches; + nvcv::Optional nmData; 
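+ // numMatches and distances are optional outputs: numMatches is allocated only when
+ // crossCheck is enabled (the surviving match count becomes data dependent), and
+ // distances only when storeDistances is set; otherwise the tensors stay empty and
+ // their strides below are left at zero so the reference checks skip them.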
+ + nvcv::Tensor distances; + nvcv::Optional dData; + + if (crossCheck) + { + numMatches = nvcv::Tensor({{numSamples}, "N"}, nvcv::TYPE_S32); + + nmData = numMatches.exportData(); + ASSERT_TRUE(nmData); + } + if (storeDistances) + { + distances = nvcv::Tensor({{numSamples, maxMatches}, "NM"}, nvcv::TYPE_F32); + + dData = distances.exportData(); + ASSERT_TRUE(dData); + } + + // clang-format on + + auto set1Data = set1.exportData(); + ASSERT_TRUE(set1Data); + + auto set2Data = set2.exportData(); + ASSERT_TRUE(set2Data); + + auto ns1Data = numSet1.exportData(); + ASSERT_TRUE(ns1Data); + + auto ns2Data = numSet2.exportData(); + ASSERT_TRUE(ns2Data); + + auto mchData = matches.exportData(); + ASSERT_TRUE(mchData); + + long3 set1Strides{set1Data->stride(0), set1Data->stride(1), set1Data->stride(2)}; + long3 set2Strides{set2Data->stride(0), set2Data->stride(1), set2Data->stride(2)}; + long1 ns1Strides{ns1Data->stride(0)}; + long1 ns2Strides{ns2Data->stride(0)}; + long3 mchStrides{mchData->stride(0), mchData->stride(1), mchData->stride(2)}; + long1 nmStrides = (numMatches) ? long1{nmData->stride(0)} : long1{0}; + long2 dStrides = (distances) ? long2{dData->stride(0), dData->stride(1)} : long2{0, 0}; + + long set1BufSize = set1Strides.x * numSamples; + long set2BufSize = set2Strides.x * numSamples; + long ns1BufSize = ns1Strides.x * numSamples; + long ns2BufSize = ns2Strides.x * numSamples; + long mchBufSize = mchStrides.x * numSamples; + long nmBufSize = nmStrides.x * numSamples; + long dBufSize = dStrides.x * numSamples; + + RawBufferType set1Vec(set1BufSize); + RawBufferType set2Vec(set2BufSize); + RawBufferType ns1Vec(ns1BufSize); + RawBufferType ns2Vec(ns2BufSize); + + std::default_random_engine rng(12345u); + + SrcT minV = std::is_integral_v ? cuda::TypeTraits::min : -1; + SrcT maxV = std::is_integral_v ? 
cuda::TypeTraits::max : +1; + + uniform_distribution rand(minV, maxV); + + for (int x = 0; x < numSamples; ++x) + { + for (int z = 0; z < numDim; ++z) + { + for (int y = 0; y < set1Size; ++y) + { + util::ValueAt(set1Vec, set1Strides, long3{x, y, z}) = rand(rng); + } + for (int y = 0; y < set2Size; ++y) + { + util::ValueAt(set2Vec, set2Strides, long3{x, y, z}) = rand(rng); + } + } + + util::ValueAt(ns1Vec, ns1Strides, long1{x}) = set1Size; + util::ValueAt(ns2Vec, ns2Strides, long1{x}) = set2Size; + } + + ASSERT_EQ(cudaSuccess, cudaMemcpy(set1Data->basePtr(), set1Vec.data(), set1BufSize, cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(set2Data->basePtr(), set2Vec.data(), set2BufSize, cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(ns1Data->basePtr(), ns1Vec.data(), ns1BufSize, cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(ns2Data->basePtr(), ns2Vec.data(), ns2BufSize, cudaMemcpyHostToDevice)); + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::PairwiseMatcher op(algoChoice); + + op(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances, crossCheck, matchesPerPoint, normType); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + RawBufferType nmTestVec(nmBufSize, 0); + RawBufferType nmGoldVec(nmBufSize, 0); + RawBufferType mchTestVec(mchBufSize, 0); + RawBufferType mchGoldVec(mchBufSize, 0); + RawBufferType dTestVec(dBufSize, 0); + RawBufferType dGoldVec(dBufSize, 0); + + // Treated output is a vector of (sampleIdx, set1Idx, set2Idx, distance) + std::vector> testIdsDist; + std::vector> goldIdsDist; + + ASSERT_EQ(cudaSuccess, cudaMemcpy(mchTestVec.data(), mchData->basePtr(), mchBufSize, cudaMemcpyDeviceToHost)); + + if (numMatches) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy(nmTestVec.data(), nmData->basePtr(), nmBufSize, cudaMemcpyDeviceToHost)); + } + if (distances) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy(dTestVec.data(), dData->basePtr(), dBufSize, cudaMemcpyDeviceToHost)); + } + + ref::SortOutput(testIdsDist, mchTestVec, nmTestVec, dTestVec, mchStrides, nmStrides, dStrides, numSamples, set1Size, + matchesPerPoint, maxMatches); + + ref::PairwiseMatcher(algoChoice, mchGoldVec, nmGoldVec, dGoldVec, set1Vec, set2Vec, mchStrides, nmStrides, + dStrides, set1Strides, set2Strides, numSamples, numDim, set1Size, set2Size, crossCheck, + matchesPerPoint, normType); + + ref::SortOutput(goldIdsDist, mchGoldVec, nmGoldVec, dGoldVec, mchStrides, nmStrides, dStrides, numSamples, set1Size, + matchesPerPoint, maxMatches); + + EXPECT_EQ(testIdsDist, goldIdsDist); +} diff --git a/tests/cvcuda/system/TestOpPillowResize.cpp b/tests/cvcuda/system/TestOpPillowResize.cpp index 78316d5b..4f276dba 100644 --- a/tests/cvcuda/system/TestOpPillowResize.cpp +++ b/tests/cvcuda/system/TestOpPillowResize.cpp @@ -1010,9 +1010,11 @@ void StartTest(int srcWidth, int srcHeight, int dstWidth, int dstHeight, NVCVInt // Generate test result nvcv::Tensor imgDst(numberOfImages, {dstWidth, dstHeight}, fmt); - cvcuda::PillowResize pillowResizeOp(nvcv::Size2D{std::max(srcWidth, dstWidth), std::max(srcHeight, dstHeight)}, - numberOfImages, fmt); - EXPECT_NO_THROW(pillowResizeOp(stream, imgSrc, imgDst, interpolation)); + cvcuda::PillowResize pillowResizeOp; + + cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace( + pillowResizeOp.getWorkspaceRequirements(numberOfImages, {srcWidth, srcHeight}, {dstWidth, dstHeight}, fmt)); + EXPECT_NO_THROW(pillowResizeOp(stream, ws.get(), imgSrc, 
imgDst, interpolation)); EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); @@ -1144,9 +1146,11 @@ void StartVarShapeTest(int srcWidthBase, int srcHeightBase, int dstWidthBase, in nvcv::Size2D maxDstSize = batchDst.maxSize(); // Generate test result - cvcuda::PillowResize pillowResizeOp( - nvcv::Size2D{std::max(maxSrcSize.w, maxDstSize.w), std::max(maxSrcSize.h, maxDstSize.h)}, numberOfImages, fmt); - EXPECT_NO_THROW(pillowResizeOp(stream, batchSrc, batchDst, interpolation)); + cvcuda::PillowResize pillowResizeOp; + + cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace( + pillowResizeOp.getWorkspaceRequirements(numberOfImages, maxSrcSize, maxDstSize, fmt)); + EXPECT_NO_THROW(pillowResizeOp(stream, ws.get(), batchSrc, batchDst, interpolation)); // Get test data back EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); diff --git a/tests/cvcuda/system/TestOpStack.cpp b/tests/cvcuda/system/TestOpStack.cpp new file mode 100644 index 00000000..b8f2bff2 --- /dev/null +++ b/tests/cvcuda/system/TestOpStack.cpp @@ -0,0 +1,190 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Definitions.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace gt = ::testing; +namespace test = nvcv::test; +namespace util = nvcv::util; + +// clang-format off +NVCV_TEST_SUITE_P(OpStack, test::ValueList +{ + //inWidth, inHeight, format, numberOfTensors, maxNumberInBatch + { 320, 240, nvcv::FMT_U8, 5, 2}, + { 40, 81, nvcv::FMT_RGB8, 1, 3}, + { 800, 600, nvcv::FMT_BGR8, 1, 4}, + { 1024, 768, nvcv::FMT_RGBA8, 2, 1}, + { 12, 720, nvcv::FMT_BGRA8, 3, 5}, + { 160, 121, nvcv::FMT_BGR8p, 2, 2}, + { 920, 80, nvcv::FMT_RGB8p, 1, 3}, + { 41, 536, nvcv::FMT_RGBA8p, 1, 4}, + { 592, 944, nvcv::FMT_BGRA8p, 2, 5}, + { 1, 2, nvcv::FMT_U32, 1, 1}, + { 48, 36, nvcv::FMT_RGBf32, 1, 2}, + { 192, 1944, nvcv::FMT_BGRf32, 1, 3}, + { 1920, 1080, nvcv::FMT_RGBAf32, 4, 4}, + { 2048, 1536, nvcv::FMT_BGRAf32, 1, 5}, + { 1024, 768, nvcv::FMT_RGBA8p, 3, 1}, + { 1280, 720, nvcv::FMT_RGBf32p, 1, 2}, + { 192, 80, nvcv::FMT_BGRf32p, 1, 3}, + { 2048, 536, nvcv::FMT_RGBAf32p, 1, 4}, + { 259, 194, nvcv::FMT_BGRAf32p, 1, 5}, + { 1921, 1080, nvcv::FMT_F64, 1, 1}, + { 1920, 1080, nvcv::FMT_F16, 2, 2}, + { 48, 36, nvcv::FMT_BGRAf32, 1, 3}, +}); + +// clang-format on +TEST_P(OpStack, test_NCHW_tensors) +{ + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + nvcv::ImageFormat format = GetParamValue<2>(); + int numberOfTensors = GetParamValue<3>(); + int maxNumberInBatch = GetParamValue<4>(); + + int numChannels = format.numChannels(); + int bytesPerPixel = 0; + int totalNumberOfTensors = 0; + + for (int32_t i = 0; i < numChannels; i++) + { + bytesPerPixel += format.bitsPerChannel()[i] / 8; + } + + // generate the output tensor to contain all of the input tensors + + auto reqs = nvcv::TensorBatch::CalcRequirements(numberOfTensors); + nvcv::TensorBatch inTensorBatch(reqs); + std::vector> inputVecs; + + // generate random input images + std::default_random_engine randEng(0); + std::uniform_int_distribution rand(0u, 255u); + std::uniform_int_distribution distribution(1, maxNumberInBatch); + int numberInBatch = distribution(randEng); + + for (int i = 0; i < numberOfTensors; ++i) + { + nvcv::Tensor inTensor(numberInBatch, {width, height}, format); + totalNumberOfTensors += numberInBatch; // include individual tensors and tensors in N > 1 tensor(s) + + for (int j = 0; j < numberInBatch; j++) + { + // generate random input image in bytes + std::vector imageVec((width * height) * bytesPerPixel); + std::generate(imageVec.begin(), imageVec.end(), [&]() { return (nvcv::Byte)rand(randEng); }); + // copy random input to device tensor + EXPECT_NO_THROW(util::SetImageTensorFromByteVector(inTensor.exportData(), imageVec, j)); + // add tensor to batch and input vector + inputVecs.push_back(imageVec); + } + inTensorBatch.pushBack(inTensor); + } + + nvcv::Tensor outTensor(totalNumberOfTensors, {width, height}, format); + // run operator + cvcuda::Stack op; + EXPECT_NO_THROW(op(stream, inTensorBatch, outTensor)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // go through each sample of the output tensor and compare vals. 
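+ // cvcuda::Stack is expected to concatenate the batch samples along N in push-back
+ // order, so output sample i should be byte-identical to the i-th generated input image.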
+ for (int i = 0; i < totalNumberOfTensors; ++i) + { + // generate random input image in bytes + std::vector outSample; + EXPECT_NO_THROW(util::GetImageByteVectorFromTensor(outTensor.exportData(), i, outSample)); + // Compare the computed histogram with the output histogram + ASSERT_EQ(inputVecs[i], outSample); + } +} + +TEST_P(OpStack, test_CHW_tensors) +{ + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + nvcv::ImageFormat format = GetParamValue<2>(); + int numberOfTensors = GetParamValue<3>(); + + int numChannels = format.numChannels(); + int bytesPerPixel = 0; + + for (int32_t i = 0; i < numChannels; i++) + { + bytesPerPixel += format.bitsPerChannel()[i] / 8; + } + + // generate the output tensor to contain all of the input tensors + + auto reqs = nvcv::TensorBatch::CalcRequirements(numberOfTensors); + nvcv::TensorBatch inTensorBatch(reqs); + + // generate random input images + std::default_random_engine randEng(0); + std::uniform_int_distribution rand(0u, 255u); + std::vector> inputVecs; + + for (int i = 0; i < numberOfTensors; ++i) + { + nvcv::Tensor inTensor = nvcv::util::CreateTensor(1, width, height, format); //this will create a CHW/HWC tensor + // generate random input image in bytes + std::vector imageVec((width * height) * bytesPerPixel); + std::generate(imageVec.begin(), imageVec.end(), [&]() { return (nvcv::Byte)rand(randEng); }); + // copy random input to device tensor + EXPECT_NO_THROW(util::SetImageTensorFromByteVector(inTensor.exportData(), imageVec)); + // add tensor to batch and input vector + inputVecs.push_back(imageVec); + inTensorBatch.pushBack(inTensor); + } + + nvcv::Tensor outTensor(numberOfTensors, {width, height}, format); + // run operator + cvcuda::Stack op; + EXPECT_NO_THROW(op(stream, inTensorBatch, outTensor)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // go through each sample of the output tensor and compare vals. + for (int i = 0; i < numberOfTensors; ++i) + { + // generate random input image in bytes + std::vector outSample; + EXPECT_NO_THROW(util::GetImageByteVectorFromTensor(outTensor.exportData(), i, outSample)); + // Compare the computed histogram with the output histogram + ASSERT_EQ(inputVecs[i], outSample); + } +} diff --git a/tests/cvcuda/system/TestOpWarpPerspective.cpp b/tests/cvcuda/system/TestOpWarpPerspective.cpp index 59b45e9f..04c6d647 100644 --- a/tests/cvcuda/system/TestOpWarpPerspective.cpp +++ b/tests/cvcuda/system/TestOpWarpPerspective.cpp @@ -129,7 +129,7 @@ static void WarpPerspectiveGold(std::vector &hDst, const int dstRowStri NVCVPerspectiveTransform finalTransformMatrix; - if (flags & NVCV_WARP_INVERSE_MAP) + if (!(flags & NVCV_WARP_INVERSE_MAP)) { cuda::math::Matrix tempMatrixForInverse; diff --git a/tests/cvcuda/unit/CMakeLists.txt b/tests/cvcuda/unit/CMakeLists.txt new file mode 100644 index 00000000..53e5aba1 --- /dev/null +++ b/tests/cvcuda/unit/CMakeLists.txt @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(cvcuda_test_unit + TestWorkspaceAllocator.cpp + TestWorkspaceEstimator.cpp +) + +target_compile_definitions(cvcuda_test_unit + PRIVATE + -DNVCV_UNIT_TEST=1 +) + +target_link_libraries(cvcuda_test_unit + PRIVATE + nvcv_test_main + nvcv_util + nvcv_test_common + cvcuda_priv +) + +nvcv_add_test(cvcuda_test_unit cvcuda) diff --git a/tests/cvcuda/unit/Definitions.hpp b/tests/cvcuda/unit/Definitions.hpp new file mode 100644 index 00000000..76fd4cd3 --- /dev/null +++ b/tests/cvcuda/unit/Definitions.hpp @@ -0,0 +1,26 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_TEST_SYSTEM_CORE_OP_DEFINITIONS_HPP +#define NVCV_TEST_SYSTEM_CORE_OP_DEFINITIONS_HPP + +#include +#include +#include +#include + +#endif // NVCV_TEST_SYSTEM_CORE_OP_DEFINITIONS_HPP diff --git a/tests/cvcuda/unit/TestWorkspaceAllocator.cpp b/tests/cvcuda/unit/TestWorkspaceAllocator.cpp new file mode 100644 index 00000000..805752f0 --- /dev/null +++ b/tests/cvcuda/unit/TestWorkspaceAllocator.cpp @@ -0,0 +1,203 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Definitions.hpp" + +#include + +#define EXPECT_PTR_EQ(a, b) EXPECT_EQ((const void *)(a), (const void *)(b)) + +TEST(WorkspaceMemAllocatorTest, Get) +{ + alignas(64) char base[64]; + cvcuda::WorkspaceMem wm{}; + wm.req = {64, 64}; + wm.data = base; + + cvcuda::WorkspaceMemAllocator wa(wm); + EXPECT_PTR_EQ(wa.get(3), base + 0); + EXPECT_PTR_EQ(wa.get(3), base + 4); + EXPECT_PTR_EQ(wa.get(), base + 16); + EXPECT_PTR_EQ(wa.get(1, 16), base + 32); + EXPECT_PTR_EQ(wa.get(4), base + 48); +} + +TEST(WorkspaceMemAllocatorTest, ExceedWorkspaceSize) +{ + alignas(64) char base[64]; + cvcuda::WorkspaceMem wm{}; + wm.req = {64, 64}; + wm.data = base; + + cvcuda::WorkspaceMemAllocator wa(wm); + EXPECT_PTR_EQ(wa.get(4), base + 0); + EXPECT_PTR_EQ(wa.get(7), base + 32); + EXPECT_PTR_EQ(wa.allocated(), 60); + EXPECT_THROW(wa.get(2), nvcv::Exception); + EXPECT_PTR_EQ(wa.get(1), base + 60); + EXPECT_THROW(wa.get(1), nvcv::Exception); +} + +TEST(WorkspaceAllocatorTest, Get) +{ + alignas(64) char base[64]; + alignas(64) char pinnedBase[64]; + cvcuda::Workspace ws{}; + ws.hostMem.req = {64, 64}; + ws.hostMem.data = base; + ws.pinnedMem.req = {64, 64}; + ws.pinnedMem.data = pinnedBase; + + cvcuda::WorkspaceAllocator wa(ws); + EXPECT_PTR_EQ(wa.getHost(4), base + 0); + EXPECT_PTR_EQ(wa.getHost(7), base + 32); + EXPECT_PTR_EQ(wa.getPinned(4), pinnedBase + 0); + EXPECT_EQ(wa.hostMem.allocated(), 60); + EXPECT_EQ(wa.pinnedMem.allocated(), 32); + EXPECT_THROW(wa.getHost(2), nvcv::Exception); + EXPECT_PTR_EQ(wa.getHost(1), base + 60); + EXPECT_THROW(wa.getHost(1), nvcv::Exception); +} + +TEST(WorkspaceMemAllocatorTest, AcquireRelease) +{ + alignas(64) char base[64]; + cvcuda::WorkspaceMem wm{}; + wm.req = {64, 64}; + wm.data = base; + ASSERT_EQ(cudaEventCreateWithFlags(&wm.ready, cudaEventDisableTiming), cudaSuccess); + + EXPECT_NO_THROW({ + cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); + EXPECT_PTR_EQ(wa.get(32), base); + }); + + EXPECT_NO_THROW({ cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); }); + + EXPECT_NO_THROW({ + cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); + wa.acquire(std::nullopt); + EXPECT_PTR_EQ(wa.get(32), base); + }); + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + EXPECT_PTR_EQ(wa.get(32), base); + wa.acquire(std::nullopt); + }, + std::logic_error) + << "acquire after get should be an error"; + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + wa.acquire(std::nullopt); + wa.acquire(std::nullopt); + }, + std::logic_error) + << "double acquire should be an error"; + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + wa.release(std::nullopt); + EXPECT_PTR_EQ(wa.get(32), base); + }, + std::logic_error) + << "get after release should be an error"; + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + wa.release(std::nullopt); + wa.acquire(std::nullopt); + }, + std::logic_error) + << "acquire after release should be an error"; + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + wa.release(std::nullopt); + wa.release(std::nullopt); + }, + std::logic_error) + << "double release should be an error"; + + ASSERT_EQ(cudaEventDestroy(wm.ready), cudaSuccess); +} + +TEST(WorkspaceMemAllocatorTest, Sync) +{ + void *_junk; + size_t junk_size = 100 << 20; + ASSERT_EQ(cudaMalloc(&_junk, junk_size), cudaSuccess); + std::unique_ptr junk(_junk, [](void *p) { EXPECT_EQ(cudaFree(p), 
cudaSuccess); }); + + alignas(64) char base[64]; + cvcuda::WorkspaceMem wm{}; + wm.req = {64, 64}; + wm.data = base; + ASSERT_EQ(cudaEventCreateWithFlags(&wm.ready, cudaEventDisableTiming), cudaSuccess); + + // this is supposed to last long enough to be reliably "not ready" + auto hog = [&]() + { + for (int i = 0; i < 256; i++) + { + ASSERT_EQ(cudaMemset(junk.get(), i, junk_size), cudaSuccess); + } + }; + + EXPECT_NO_THROW({ + hog(); + ASSERT_EQ(cudaEventRecord(wm.ready, 0), cudaSuccess); + { + cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); + EXPECT_EQ(cudaEventQuery(wm.ready), cudaErrorNotReady); // no sync yet + } + EXPECT_EQ(cudaEventQuery(wm.ready), cudaErrorNotReady); // no sync necessary + }) << "No memory was requested, no sync is necessary, no error expected"; + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + EXPECT_NO_THROW({ + ASSERT_EQ(cudaEventRecord(wm.ready, 0), cudaSuccess); + { + cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); + EXPECT_PTR_EQ(wa.get(32), base); + hog(); + } + EXPECT_EQ(cudaEventQuery(wm.ready), cudaErrorNotReady); // device sync only + }) << "Acquire and release properly called, no exception should be raised"; + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + EXPECT_NO_THROW({ + hog(); + ASSERT_EQ(cudaEventRecord(wm.ready, 0), cudaSuccess); + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + EXPECT_EQ(cudaEventQuery(wm.ready), cudaErrorNotReady); // no sync yet + EXPECT_PTR_EQ(wa.get(32), base); + EXPECT_EQ(cudaEventQuery(wm.ready), cudaSuccess); // sync in get + } + }) << "Acquire and release properly called, no exception should be raised"; + + ASSERT_EQ(cudaEventDestroy(wm.ready), cudaSuccess); +} diff --git a/tests/cvcuda/unit/TestWorkspaceEstimator.cpp b/tests/cvcuda/unit/TestWorkspaceEstimator.cpp new file mode 100644 index 00000000..a950e073 --- /dev/null +++ b/tests/cvcuda/unit/TestWorkspaceEstimator.cpp @@ -0,0 +1,69 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Definitions.hpp" + +#include + +TEST(WorkspaceMemEstimatorTest, Add) +{ + // set the alignment to 1 to see if element alignment gets propagated to the base alignment + cvcuda::WorkspaceMemEstimator est(0, 1); + est.add(3); + EXPECT_EQ(est.req.alignment, 1); + EXPECT_EQ(est.req.size, 3); + est.add(3); + EXPECT_EQ(est.req.size, 16); + EXPECT_EQ(est.req.alignment, 4); + est.add(); + EXPECT_EQ(est.req.size, 20); + est.add(1, 16); + EXPECT_EQ(est.req.size, 48); + EXPECT_EQ(est.req.alignment, 16); +} + +TEST(WorkspaceEstimatorTest, Add) +{ + cvcuda::WorkspaceEstimator est; + EXPECT_EQ(est.hostMem.req.alignment, 16); + EXPECT_EQ(est.pinnedMem.req.alignment, 256); + EXPECT_EQ(est.cudaMem.req.alignment, 256); + + // set the alignment to 1 to see if element alignment gets propagated to the base alignment for each memory type + est.hostMem.req.alignment = 1; + est.pinnedMem.req.alignment = 1; + est.cudaMem.req.alignment = 1; + + est.add(true, false, true, 3); + EXPECT_EQ(est.hostMem.req.size, 3); + EXPECT_EQ(est.pinnedMem.req.size, 0); + EXPECT_EQ(est.cudaMem.req.size, 3); + + // clang-format off + est.add(true, false, false, 4) + .add(false, true, true, 2); + // clang-format on + + EXPECT_EQ(est.hostMem.req.size, 7); // 7 chars + EXPECT_EQ(est.hostMem.req.alignment, 1); // no change + + EXPECT_EQ(est.pinnedMem.req.size, 8); // just the 2 integers + EXPECT_EQ(est.pinnedMem.req.alignment, 4); // alignment for int32 + + EXPECT_EQ(est.cudaMem.req.size, 12); // 3 chars, padding, 2 ints + EXPECT_EQ(est.cudaMem.req.alignment, 4); // alignment for int32 +} diff --git a/tests/nvcv_types/cudatools_system/CMakeLists.txt b/tests/nvcv_types/cudatools_system/CMakeLists.txt index b0a0bb06..fd97891b 100644 --- a/tests/nvcv_types/cudatools_system/CMakeLists.txt +++ b/tests/nvcv_types/cudatools_system/CMakeLists.txt @@ -59,4 +59,4 @@ target_include_directories(nvcv_test_cudatools_system ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ) -nvcv_add_test(nvcv_test_cudatools_system) +nvcv_add_test(nvcv_test_cudatools_system nvcv) diff --git a/tests/nvcv_types/cudatools_system/DeviceBorderVarShapeWrap.cu b/tests/nvcv_types/cudatools_system/DeviceBorderVarShapeWrap.cu index 22af91da..add11639 100644 --- a/tests/nvcv_types/cudatools_system/DeviceBorderVarShapeWrap.cu +++ b/tests/nvcv_types/cudatools_system/DeviceBorderVarShapeWrap.cu @@ -66,8 +66,8 @@ __global__ void FillBorderNHWC(DstWrapper dst, SrcWrapper src, int numSamples, i } for (int c = 0; c < numChannels; ++c) { - int4 srcCoord = {dstCoord.x - borderSize.x, dstCoord.y - borderSize.y, dstCoord.z, c}; - dst[{dstCoord.x, dstCoord.y, dstCoord.z, c}] = src[srcCoord]; + int4 srcCoord = {dstCoord.z, dstCoord.y - borderSize.y, dstCoord.x - borderSize.x, c}; + dst[{dstCoord.z, dstCoord.y, dstCoord.x, c}] = src[srcCoord]; } } diff --git a/tests/nvcv_types/cudatools_system/DeviceImageBatchVarShapeWrap.cu b/tests/nvcv_types/cudatools_system/DeviceImageBatchVarShapeWrap.cu index 1cc25603..a749c1be 100644 --- a/tests/nvcv_types/cudatools_system/DeviceImageBatchVarShapeWrap.cu +++ b/tests/nvcv_types/cudatools_system/DeviceImageBatchVarShapeWrap.cu @@ -71,7 +71,7 @@ __global__ void SetTwos(cuda::ImageBatchVarShapeWrapNHWC dst, int n { *dst.ptr(coord.z, coord.y, coord.x, ch) = cuda::SetAll(1); - int4 dstCoord{coord.x, coord.y, coord.z, ch}; + int4 dstCoord{coord.z, coord.y, coord.x, ch}; dst[dstCoord] += cuda::SetAll(1); } } diff --git a/tests/nvcv_types/cudatools_unit/CMakeLists.txt b/tests/nvcv_types/cudatools_unit/CMakeLists.txt index 80adee9f..14477b51 100644 --- 
a/tests/nvcv_types/cudatools_unit/CMakeLists.txt +++ b/tests/nvcv_types/cudatools_unit/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -32,4 +32,4 @@ target_include_directories(nvcv_test_cudatools_unit ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ) -nvcv_add_test(nvcv_test_cudatools_unit) +nvcv_add_test(nvcv_test_cudatools_unit nvcv) diff --git a/tests/nvcv_types/python/CMakeLists.txt b/tests/nvcv_types/python/CMakeLists.txt index a08a98ca..abd39cb5 100644 --- a/tests/nvcv_types/python/CMakeLists.txt +++ b/tests/nvcv_types/python/CMakeLists.txt @@ -43,4 +43,4 @@ set(PYTHON_TEST_DIR ${CMAKE_INSTALL_PREFIX}/${PYTHON_TEST_INSTDIR}) set(PYTHON_MODULE_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) configure_file(nvcv_test_types_python.in nvcv_test_types_python @ONLY) -nvcv_add_test(${CMAKE_CURRENT_BINARY_DIR}/nvcv_test_types_python) +nvcv_add_test(${CMAKE_CURRENT_BINARY_DIR}/nvcv_test_types_python nvcv) diff --git a/tests/nvcv_types/python/nvcv_test_types_python.in b/tests/nvcv_types/python/nvcv_test_types_python.in index c74bb94a..2abb0fc1 100755 --- a/tests/nvcv_types/python/nvcv_test_types_python.in +++ b/tests/nvcv_types/python/nvcv_test_types_python.in @@ -45,14 +45,30 @@ function on_exit() } trap 'on_exit' EXIT +export PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" + for ver in $python_versions; do if [[ "$NVCV_FORCE_PYTHON" != 1 && "$NVCV_FORCE_PYTHON" != yes ]]; then - if ! PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" python$ver -c 'import nvcv'; then + if ! python$ver -c 'import nvcv'; then echo "Skipping python-$ver, NVCV python bindings not installed" continue fi fi - PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" + # Check if python module is exposing only PyInit_cvcuda. + # Also provide some helpful info is exposing too much. + modfile=$(python$ver -c "import nvcv; print(nvcv.__file__)") + pubsyms=$(readelf -sWD $modfile | grep -v ' UND ' | grep ' GLOBAL ') + if [[ $(echo "$pubsyms" | wc -l) != 1 ]]; then + echo -e "nvcv python $ver module is exposing too many symbols:\n$pubsyms" + exit 1 + fi + if ! echo "$pubsyms" | grep PyInit_nvcv > /dev/null; then + echo -e "nvcv python $ver module must expose symbol PyInit_nvcv, but instead exposes:\n$pubsyms" + exit 2 + fi + + # Run python tests + NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" done diff --git a/tests/nvcv_types/python/test_image.py b/tests/nvcv_types/python/test_image.py index 79207a01..4615ca0d 100644 --- a/tests/nvcv_types/python/test_image.py +++ b/tests/nvcv_types/python/test_image.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch +import numpy as np import pytest as t import nvcv -import numpy as np -import torch import nvcv_util as util diff --git a/tests/nvcv_types/python/test_imgbatchvarshape.py b/tests/nvcv_types/python/test_imgbatchvarshape.py index dfb5bd30..0caaedb3 100644 --- a/tests/nvcv_types/python/test_imgbatchvarshape.py +++ b/tests/nvcv_types/python/test_imgbatchvarshape.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,6 +14,9 @@ # limitations under the License. import nvcv +import pytest as t +import numpy as np +import nvcv_util as util def test_imgbatchvarshape_creation_works(): @@ -98,3 +101,53 @@ def test_imgbatchvarshape_several_images(): assert cnt == 0 assert batch.maxsize == (0, 0) + + +buffmt_common = [ + ([5, 7, 1], np.uint8, nvcv.Format.U8), + ([5, 7, 1], np.uint8, nvcv.Format.U8), + ([5, 7, 1], np.uint8, nvcv.Format.U8), + ([5, 7], np.uint8, nvcv.Format.U8), + ([5, 7, 1], np.int8, nvcv.Format.S8), + ([5, 7, 1], np.uint16, nvcv.Format.U16), + ([5, 7, 1], np.int16, nvcv.Format.S16), + ([5, 7, 2], np.int16, nvcv.Format._2S16), + ([5, 7, 1], np.float32, nvcv.Format.F32), + ([5, 7, 1], np.float64, nvcv.Format.F64), + ([5, 7, 2], np.float32, nvcv.Format._2F32), + ([5, 7, 3], np.uint8, nvcv.Format.RGB8), + ([5, 7, 4], np.uint8, nvcv.Format.RGBA8), + ([5, 7], np.csingle, nvcv.Format.C64), + ([5, 7], np.cdouble, nvcv.Format.C128), + ([5, 7], np.dtype("2f"), nvcv.Format._2F32), +] + + +@t.mark.parametrize("base_shape,dt,format", buffmt_common) +def test_wrap_buffer_list(base_shape, dt, format): + nimages = 3 + ndim = len(base_shape) + shapes = [] + for i in range(nimages): + ith_shape = [] + for d in range(ndim): + if d < 2: + ith_shape.append(base_shape[d] + i) + else: + ith_shape.append(base_shape[d]) + shapes.append(ith_shape) + max_height = base_shape[0] + nimages - 1 + max_width = base_shape[1] + nimages - 1 + host_buffers = [np.ndarray(shape, dt) for shape in shapes] + cuda_buffers = [util.to_cuda_buffer(buf) for buf in host_buffers] + batch = nvcv.as_images(cuda_buffers) + assert batch.capacity == 3 + assert batch.maxsize == (max_width, max_height) + assert batch.uniqueformat == format + + images = [image for image in batch] + for i in range(len(shapes)): + sh = shapes[i] + assert images[i].width == sh[1] + assert images[i].height == sh[0] + assert images[i].format == format diff --git a/tests/nvcv_types/python/test_import_order.py b/tests/nvcv_types/python/test_import_order.py new file mode 100644 index 00000000..7d82f0dc --- /dev/null +++ b/tests/nvcv_types/python/test_import_order.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
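
The `test_wrap_buffer_list` case added above exercises `nvcv.as_images`, which wraps a list of pre-allocated CUDA buffers as a variable-shape image batch and infers a common format from their shapes and dtypes. A minimal sketch of that usage pattern follows; it assumes `nvcv.as_images` accepts any object exposing `__cuda_array_interface__` (torch CUDA tensors do), whereas the test itself goes through the `nvcv_util.to_cuda_buffer` helper.

import numpy as np
import torch
import nvcv

# Three grayscale images of different sizes, laid out as H x W x C, uint8.
shapes = [(32, 48, 1), (33, 49, 1), (34, 50, 1)]
buffers = [torch.zeros(s, dtype=torch.uint8, device="cuda") for s in shapes]

batch = nvcv.as_images(buffers)          # wraps the existing device memory
assert batch.capacity == len(buffers)
assert batch.uniqueformat == nvcv.Format.U8
assert batch.maxsize == (50, 34)         # (max_width, max_height), as asserted above
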
+ +# Import order is important, +# torch must be loaded correctly even if nvcv was imported first +import nvcv +import torch +import numpy as np + + +def test_import_nvcv_first_works(): + torch.as_tensor(np.ndarray((4, 6), dtype=np.uint8), device="cuda") + nvcv.Tensor((4, 6), dtype=np.uint8) diff --git a/tests/nvcv_types/python/test_stream.py b/tests/nvcv_types/python/test_stream.py index 2948a48e..d7034305 100644 --- a/tests/nvcv_types/python/test_stream.py +++ b/tests/nvcv_types/python/test_stream.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import nvcv import torch +import nvcv import ctypes import pytest as t diff --git a/tests/nvcv_types/python/test_tensor.py b/tests/nvcv_types/python/test_tensor.py index aa215c7f..52ff631f 100644 --- a/tests/nvcv_types/python/test_tensor.py +++ b/tests/nvcv_types/python/test_tensor.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import nvcv import pytest as t import numpy as np -import torch @t.mark.parametrize( @@ -296,3 +296,135 @@ def test_tensor_create_packed(): def test_tensor_create_for_imgbatch_packed(): tensor = nvcv.Tensor(2, (37, 7), nvcv.Format.RGB8, rowalign=1) assert tensor.cuda().strides == (37 * 7 * 3, 37 * 3, 3, 1) + + +@t.mark.parametrize( + "orig_shape, orig_layout, dtype, shape_arg, layout_arg", + [ + ((1, 23, 65, 3), "NHWC", np.uint8, (23, 65, 3), "HWC"), + ((5, 23, 65, 3), None, np.int8, (5, 23 * 65, 3), None), + ((5, 23, 65, 3), None, np.int8, (5, 23 * 65, 3), "ABC"), + ((1,), "A", np.float32, (1, 1, 1, 1, 1, 1), "ABCDEF"), + ], +) +def test_tensor_reshape(orig_shape, orig_layout, dtype, shape_arg, layout_arg): + tensor = nvcv.Tensor(orig_shape, dtype, layout=orig_layout, rowalign=1) + + def strides(shape): + out = [0] * len(shape) + for d in range(len(shape)): + out[d] = 1 + for d2 in range(d + 1, len(shape)): + out[d] = out[d] * shape[d2] + return tuple(out) + + assert tensor.dtype == dtype + assert tensor.shape == orig_shape + assert tensor.cuda().strides == strides(orig_shape) + + new_tensors = [ + tensor.reshape(shape_arg, layout=layout_arg), + nvcv.reshape(tensor, shape_arg, layout=layout_arg), + ] + for new_tensor in new_tensors: + assert new_tensor.dtype == dtype + assert new_tensor.shape == shape_arg + assert new_tensor.cuda().strides == strides(shape_arg) + + +@t.mark.parametrize( + "orig_shape, orig_layout, dtype, shape_arg, layout_arg", + [ + # wrong number of dims in layout + ((1, 23, 65, 3), "NHWC", np.uint8, (23, 65, 3), "ABCD"), + # wrong number of dims in layout + ((1, 23, 65, 3), None, np.uint8, (23, 65, 3), "ABCD"), + # dims in current layout + ((5, 23, 65, 3), "NHWC", np.int8, (5, 23 * 65, 3), None), + # volume mismatch + ((5, 23, 65, 3), "NHWC", np.int8, (100, 100), "AB"), + # 0-dim tensors not supported + ((1,), "A", np.int8, tuple(), ""), + ], +) +def test_tensor_reshape_error(orig_shape, orig_layout, dtype, shape_arg, layout_arg): + tensor = nvcv.Tensor(orig_shape, dtype, layout=orig_layout, rowalign=1) + + with t.raises(RuntimeError): + tensor.reshape(shape_arg, layout=layout_arg), + + with t.raises(RuntimeError): + nvcv.reshape(tensor, shape_arg, layout=layout_arg) + + +def test_tensor_reshape_lifetime_ref_obj(): + tensor1 = nvcv.Tensor((20, 10, 3), np.uint8, layout="HWC", rowalign=1) + tensor2 = tensor1.reshape((200, 3), layout="WC") + + # tensor2 increased the reference count of the underlying handle, + # so it should be kept alive 
after tensor1 is deleted + del tensor1 + + assert tensor2.dtype == np.uint8 + assert tensor2.shape == (200, 3) + assert tensor2.cuda().strides == (3, 1) + + +@t.mark.parametrize( + "shape_arg, layout_arg, expected_strides", + [ + ((1, 10, 10, 3), "XHWC", (320, 32, 3, 1)), + ((10, 10, 3, 1), "HWCX", (32, 3, 1, 1)), + ((10, 1, 10, 3), "HXWC", (32, 32, 3, 1)), + ((10, 2, 5, 3), "HABC", (32, 15, 3, 1)), + ((2, 5, 10, 3), "ABWC", (160, 32, 3, 1)), + ], +) +def test_tensor_reshape_strided(shape_arg, layout_arg, expected_strides): + tensor = nvcv.Tensor((10, 10, 3), np.uint8, layout="HWC") + assert tensor.cuda().strides == (32, 3, 1) # strided rows + + new_tensors = [ + tensor.reshape(shape_arg, layout=layout_arg), + nvcv.reshape(tensor, shape_arg, layout=layout_arg), + ] + for new_tensor in new_tensors: + assert new_tensor.cuda().strides == expected_strides + + +@t.mark.parametrize( + "shape_arg, layout_arg", + [((300,), "A")], +) +def test_tensor_reshape_strided_error(shape_arg, layout_arg): + tensor = nvcv.Tensor((10, 10, 3), np.uint8, layout="HWC") + assert tensor.cuda().strides == (32, 3, 1) # strided rows + + with t.raises(RuntimeError): + tensor.reshape(shape_arg, layout=layout_arg) + + with t.raises(RuntimeError): + nvcv.reshape(tensor, shape_arg, layout=layout_arg) + + +@t.mark.parametrize( + "shape_arg, dtype_arg, layout_arg", + [ + ((3, 5, 7), np.dtype("2f4"), "NHW"), + ((3, 5, 3), np.dtype("4f8"), "NHW"), + ((3, 5, 2), np.dtype("2i1"), "NHW"), + ], +) +def test_tensor_wrap_cuda_array_interface(shape_arg, dtype_arg, layout_arg): + tensor = nvcv.Tensor(shape_arg, dtype_arg, layout_arg) + + tcuda = tensor.cuda() + cai = tcuda.__cuda_array_interface__ + assert cai["typestr"] == dtype_arg.str + assert cai["shape"] == shape_arg + + wrapped = nvcv.as_tensor(tcuda, layout_arg) + + assert wrapped.shape == shape_arg + assert wrapped.dtype == dtype_arg + assert wrapped.layout == layout_arg diff --git a/tests/nvcv_types/python/test_tensor_batch.py b/tests/nvcv_types/python/test_tensor_batch.py new file mode 100644 index 00000000..26012033 --- /dev/null +++ b/tests/nvcv_types/python/test_tensor_batch.py @@ -0,0 +1,227 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
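
The reshape tests added to test_tensor.py above can be condensed into a short usage sketch that mirrors their assertions: both the `Tensor.reshape` method and the free function `nvcv.reshape` are available, a packed tensor can be reshaped freely as long as the total volume is preserved, and collapsing dimensions across padded (strided) rows raises `RuntimeError`. The sketch below only restates behaviour asserted in those tests.

import numpy as np
import nvcv

# rowalign=1 keeps rows unpadded, as in the tests above.
packed = nvcv.Tensor((20, 10, 3), np.uint8, layout="HWC", rowalign=1)
flat = packed.reshape((200, 3), layout="WC")        # method form
same = nvcv.reshape(packed, (200, 3), layout="WC")  # free-function form
assert flat.shape == (200, 3) and same.shape == (200, 3)

# A default-aligned HWC tensor has padded rows (strides (32, 3, 1) above), so
# flattening H, W and C into a single dimension is rejected.
padded = nvcv.Tensor((10, 10, 3), np.uint8, layout="HWC")
try:
    padded.reshape((300,), layout="A")
except RuntimeError:
    pass  # expected, same as test_tensor_reshape_strided_error
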
+ +import nvcv +import pytest as t +import numpy as np +import nvcv_util as util +import torch +import re + + +def rand_shape(rank, low=1, high=10): + return np.random.randint(low=1, high=10, size=rank) + + +def rand_torch_tensor(dtype, rank): + return torch.as_tensor( + np.random.random(size=rand_shape(rank)).astype(dtype), device="cuda" + ) + + +def random_tensors(n, dtype, rank, layout): + return [ + nvcv.as_tensor(rand_torch_tensor(dtype, rank), layout=layout) for _ in range(n) + ] + + +def test_tensorbatch_creation_works(): + batch = nvcv.TensorBatch(15) + assert batch.capacity == 15 + assert len(batch) == 0 + assert batch.layout is None + assert batch.dtype is None + assert batch.ndim == -1 + + # range must be empty + cnt = 0 + for i in batch: + cnt += 1 + assert cnt == 0 + + +def test_tensorbatch_one_tensor(): + batch = nvcv.TensorBatch(15) + + tensor = nvcv.as_tensor(nvcv.Image((64, 32), nvcv.Format.RGBA8)) + batch.pushback(tensor) + assert len(batch) == 1 + assert batch.layout == "NHWC" + assert batch.dtype == np.uint8 + assert batch.ndim == 4 + assert list(batch) == [tensor] + + # range must contain one + cnt = 0 + for elem in batch: + assert elem is tensor + cnt += 1 + assert cnt == 1 + + # remove added tensor + batch.popback() + + # check if its indeed removed + assert len(batch) == 0 + assert list(batch) == [] + + +def test_tensorbatch_change_layout(): + batch = nvcv.TensorBatch(10) + tensorsA = random_tensors(5, np.float32, 3, "HWC") + batch.pushback(tensorsA) + assert list(batch) == tensorsA + assert batch.layout == "HWC" + assert batch.dtype == np.float32 + assert batch.ndim == 3 + + batch.popback(len(tensorsA)) + assert list(batch) == [] + assert batch.layout is None + assert batch.dtype is None + assert batch.ndim == -1 + + tensorsB = [ + nvcv.as_tensor(nvcv.Image(rand_shape(2), nvcv.Format.RGBA8)) for _ in range(7) + ] + batch.pushback(tensorsB) + assert list(batch) == tensorsB + assert batch.layout == "NHWC" + assert batch.dtype == np.uint8 + assert batch.ndim == 4 + + batch.clear() + assert list(batch) == [] + assert batch.layout is None + assert batch.dtype is None + assert batch.ndim == -1 + + +def test_tensorbatch_multiply_tensors(): + N = 10 + tensorsA = random_tensors(5, np.int16, 3, "HWC") + batch = nvcv.TensorBatch(len(tensorsA) * N) + for _ in range(N): + batch.pushback(tensorsA) + + assert list(batch) == tensorsA * N + assert batch.layout == "HWC" + assert batch.dtype == np.int16 + assert batch.ndim == 3 + + +def test_tensorbatch_subscript(): + tensorsA = random_tensors(10, np.float32, 3, "HWC") + batch = nvcv.TensorBatch(10) + batch.pushback(tensorsA) + + # test get item + for i in range(len(batch)): + assert batch[i] is tensorsA[i] + + # out of bounds subscript + with t.raises( + RuntimeError, + match=f"Cannot get tensor at index {len(tensorsA)}. 
Batch has only {len(tensorsA)} elements.", + ): + batch[len(tensorsA)] + + # test set item + tensorsB = random_tensors(5, np.float32, 3, "HWC") + for i in range(len(tensorsB)): + batch[i] = tensorsB[i] + + for i in range(len(batch)): + if i < len(tensorsB): + assert batch[i] is tensorsB[i] + else: + assert batch[i] is tensorsA[i] + + +def test_tensorbatch_wrap_buffers(): + # from cuda buffer, without layout + buffers = [ + util.to_cuda_buffer(np.ones(rand_shape(3), dtype=np.int32)) for _ in range(10) + ] + batch = nvcv.as_tensors(buffers) + assert batch.capacity == len(buffers) + assert len(batch) == len(buffers) + assert batch.dtype == np.int32 + assert batch.layout is None + assert batch.ndim == 3 + + # from torch tensor, with layout + buffers = [rand_torch_tensor(np.int16, 4) for i in range(5)] + batch = nvcv.as_tensors(buffers, layout="NHWC") + assert batch.capacity == len(buffers) + assert len(batch) == len(buffers) + assert batch.dtype == np.int16 + assert batch.layout == "NHWC" + assert batch.ndim == 4 + + # mismatching rank + with t.raises( + RuntimeError, + match="NVCV_ERROR_INVALID_ARGUMENT: " + "Trying to add a tensor to a tensor batch with an inconsistent rank.", + ): + buffers = [rand_torch_tensor(np.int16, 3), rand_torch_tensor(np.int16, 4)] + nvcv.as_tensors(buffers) + + # mismatching dtype + with t.raises( + RuntimeError, + match="NVCV_ERROR_INVALID_ARGUMENT: " + "Trying to add a tensor to a tensor batch with an inconsistent type.", + ): + buffers = [rand_torch_tensor(np.int16, 3), rand_torch_tensor(np.int32, 3)] + nvcv.as_tensors(buffers) + + # invalid types + with t.raises( + RuntimeError, + match="Input buffer doesn't provide cuda_array_interface or DLPack interfaces.", + ): + buffers = [[1, 2, 3]] + nvcv.as_tensors(buffers) + + +def test_tensorbatch_errors(): + with t.raises( + RuntimeError, + match=re.escape( + "NVCV_ERROR_OVERFLOW: Adding 2 tensors to a tensor batch would exceed its capacity (2) by 1" + ), + ): + batch = nvcv.TensorBatch(2) + batch.pushback(random_tensors(1, np.int16, 3, "")) + batch.pushback(random_tensors(2, np.int16, 3, "")) + + with t.raises( + RuntimeError, + match="NVCV_ERROR_UNDERFLOW: Trying to pop 3 tensors from a tensor batch with 2 tensors.", + ): + batch = nvcv.TensorBatch(5) + batch.pushback(random_tensors(2, np.int16, 3, "")) + batch.popback(3) + + with t.raises( + RuntimeError, + match="NVCV_ERROR_INVALID_ARGUMENT: " + "Trying to add a tensor to a tensor batch with an inconsistent layout.", + ): + batch = nvcv.TensorBatch(10) + batch.pushback(random_tensors(2, np.int16, 4, "NHWC")) + batch.pushback(random_tensors(3, np.int16, 4, "FHWC")) diff --git a/tests/nvcv_types/system/CMakeLists.txt b/tests/nvcv_types/system/CMakeLists.txt index 7ba4a26c..c4038188 100644 --- a/tests/nvcv_types/system/CMakeLists.txt +++ b/tests/nvcv_types/system/CMakeLists.txt @@ -39,6 +39,7 @@ add_executable(nvcv_test_types_system TestExceptions.cpp TestConfig.cpp TestArray.cpp + TestTensorBatch.cpp ) target_link_libraries(nvcv_test_types_system @@ -48,7 +49,7 @@ target_link_libraries(nvcv_test_types_system nvcv_types ) -nvcv_add_test(nvcv_test_types_system) +nvcv_add_test(nvcv_test_types_system nvcv) # header compatibility tests --------------------------------------------- @@ -89,4 +90,4 @@ target_link_libraries(nvcv_test_types_system_version_commit nvcv_types ) -nvcv_add_test(nvcv_test_types_system_version_commit) +nvcv_add_test(nvcv_test_types_system_version_commit nvcv) diff --git a/tests/nvcv_types/system/TestColorSpec.cpp 
b/tests/nvcv_types/system/TestColorSpec.cpp index bffedc9b..7d6d6a73 100644 --- a/tests/nvcv_types/system/TestColorSpec.cpp +++ b/tests/nvcv_types/system/TestColorSpec.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -82,7 +82,42 @@ TEST_P(ChromaSubsamplingTests, get_name) EXPECT_STREQ(gold, nvcvChromaSubsamplingGetName(css)); } +TEST(ChromaSubsamplingTests, invalidChromaSubsamplingGetNumSamples) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvChromaSubsamplingGetNumSamples(NVCV_CSS_444, nullptr, nullptr)); +} + +TEST(ChromaSubsamplingTests, validChromaSubsampling) +{ + NVCVChromaSubsampling test; + ASSERT_EQ(NVCV_SUCCESS, nvcvMakeChromaSubsampling(&test, 2, 1)); + EXPECT_EQ(NVCV_CSS_410R, test); + + ASSERT_EQ(NVCV_SUCCESS, nvcvMakeChromaSubsampling(&test, 1, 2)); + EXPECT_EQ(NVCV_CSS_410, test); + + ASSERT_EQ(NVCV_SUCCESS, nvcvMakeChromaSubsampling(&test, 1, 1)); + EXPECT_EQ(NVCV_CSS_444, test); +} + +TEST(ChromaSubsamplingTests, invalidChromaSubsampling) +{ + NVCVChromaSubsampling test; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvMakeChromaSubsampling(&test, 5, 5)); +} + +TEST(ChromaSubsamplingTests, invalidOut) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvMakeChromaSubsampling(nullptr, 2, 4)); +} + // Colorspec =================================================== +TEST(ColorSpecTests, invalidOutputMake) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvMakeColorSpec(nullptr, NVCV_COLOR_SPACE_DCIP3, NVCV_YCbCr_ENC_BT2020c, NVCV_COLOR_XFER_sYCC, + NVCV_COLOR_RANGE_LIMITED, NVCV_CHROMA_LOC_ODD, NVCV_CHROMA_LOC_CENTER)); +} TEST(ColorSpecTests, get_name_predefined) { @@ -100,6 +135,15 @@ TEST(ColorSpecTests, get_name_non_predefined) nvcvColorSpecGetName(fmt)); } +TEST(ColorSpecTests, get_name_invald) +{ + /// NVCVColorSpace: 0b111 NVCVYCbCrEncoding: 0b111 NVCVColorTransferFunction: 0b1111 ...... + EXPECT_STREQ( + "NVCVColorSpec(invalid)NVCVColorSpec(NVCVColorSpace(7),NVCVYCbCrEncoding(7),NVCVColorTransferFunction(15)," + "RANGE_LIMITED,LOC_ODD,LOC_ODD)", + nvcvColorSpecGetName(NVCV_COLOR_SPEC_FORCE32)); +} + TEST(ColorSpecTests, set_encoding_to_undefined) { NVCVColorSpec cspec = NVCV_COLOR_SPEC_BT601; @@ -202,6 +246,9 @@ NVCV_INSTANTIATE_TEST_SUITE_P(Positive, ColorModelNeedsColorSpecTests, {NVCV_COLOR_MODEL_UNDEFINED, false}, { NVCV_COLOR_MODEL_RAW, false}, { NVCV_COLOR_MODEL_XYZ, false}, + { NVCV_COLOR_MODEL_HSV, false}, + { NVCV_COLOR_MODEL_CMYK, false}, + { NVCV_COLOR_MODEL_YCCK, false}, } * NVCV_SUCCESS); #if !NVCV_SANITIZED @@ -230,6 +277,11 @@ TEST_P(ColorModelNeedsColorSpecTests, run) } } +TEST_P(ColorModelNeedsColorSpecTests, invalidColorModelNeedsColorspecOut) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorModelNeedsColorspec(NVCV_COLOR_MODEL_RGB, nullptr)); +} + // The tests below explicitly create invalid enums just to test if there's any // overflow in bitfield representation. This will trigger -fsanitize=enum. Let's // disable them now in sanitized builds. @@ -249,6 +301,11 @@ TEST(ColorSpecTests, set_color_space) } } +TEST(ColorSpecTests, invalid_set_color_space) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecSetColorSpace(nullptr, (NVCVColorSpace)0)); +} + TEST(ColorSpecTests, get_color_space) { for (int cspace = 0; cspace < 1 << 3; cspace ? 
cspace <<= 1 : ++cspace) @@ -263,6 +320,15 @@ TEST(ColorSpecTests, get_color_space) } } +TEST(ColorSpecTests, invalid_get_color_space) +{ + uint64_t mask = UINT64_MAX; + + NVCVColorSpec type = NVCV_MAKE_COLOR_SPEC(mask, mask, mask, mask, mask, mask); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecGetColorSpace(type, nullptr)); +} + TEST(ColorSpecTests, set_encodings) { for (int enc = 0; enc < 1 << 3; enc ? enc <<= 1 : ++enc) @@ -284,6 +350,11 @@ TEST(ColorSpecTests, set_encodings) } } +TEST(ColorSpecTests, invalid_set_encodings) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecSetYCbCrEncoding(nullptr, (NVCVYCbCrEncoding)1)); +} + TEST(ColorSpecTests, get_encodings) { for (int enc = 0; enc < 1 << 3; enc ? enc <<= 1 : ++enc) @@ -298,6 +369,15 @@ TEST(ColorSpecTests, get_encodings) } } +TEST(ColorSpecTests, invalid_get_encodings) +{ + uint64_t mask = UINT64_MAX; + + NVCVColorSpec type = NVCV_MAKE_COLOR_SPEC(mask, 0, mask, mask, mask, mask); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecGetYCbCrEncoding(type, nullptr)); +} + TEST(ColorSpecTests, set_xfer_func) { for (int xfer = 0; xfer < 1 << 3; xfer ? xfer <<= 1 : ++xfer) @@ -313,6 +393,12 @@ TEST(ColorSpecTests, set_xfer_func) } } +TEST(ColorSpecTests, invalid_set_xfer_func) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvColorSpecSetColorTransferFunction(nullptr, (NVCVColorTransferFunction)1)); +} + TEST(ColorSpecTests, get_xfer_func) { for (int xfer = 0; xfer < 1 << 3; xfer ? xfer <<= 1 : ++xfer) @@ -328,6 +414,15 @@ TEST(ColorSpecTests, get_xfer_func) } } +TEST(ColorSpecTests, invalid_get_xfer_func) +{ + uint64_t mask = UINT64_MAX; + + NVCVColorSpec type = NVCV_MAKE_COLOR_SPEC(mask, mask, 0, mask, mask, mask); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecGetColorTransferFunction(type, nullptr)); +} + TEST(ColorSpecTests, set_range) { for (int range = 0; range < 1 << 1; range ? range <<= 1 : ++range) @@ -343,6 +438,11 @@ TEST(ColorSpecTests, set_range) } } +TEST(ColorSpecTests, invalid_set_range) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecSetRange(nullptr, (NVCVColorRange)0)); +} + TEST(ColorSpecTests, get_range) { for (int range = 0; range < 1 << 1; range ? range <<= 1 : ++range) @@ -357,6 +457,15 @@ TEST(ColorSpecTests, get_range) } } +TEST(ColorSpecTests, invalid_get_range) +{ + uint64_t mask = UINT64_MAX; + + NVCVColorSpec type = NVCV_MAKE_COLOR_SPEC(mask, mask, mask, mask, mask, mask); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecGetRange(type, nullptr)); +} + TEST(ColorSpecTests, set_chroma_loc_horiz) { for (int loc = 0; loc < 1 << 2; loc ? loc <<= 1 : ++loc) @@ -372,6 +481,14 @@ TEST(ColorSpecTests, set_chroma_loc_horiz) } } +TEST(ColorSpecTests, invalid_set_chroma_loc) +{ + uint64_t mask = UINT64_MAX; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvColorSpecSetChromaLoc(nullptr, (NVCVChromaLocation)1, (NVCVChromaLocation)mask)); +} + TEST(ColorSpecTests, get_chroma_loc_horiz) { for (int loc = 0; loc < 1 << 2; loc ? 
loc <<= 1 : ++loc) @@ -445,6 +562,9 @@ TEST(ColorModelTests, undefined_color_model_is_zero) TEST(ColorModelTests, get_name) { EXPECT_STREQ("NVCV_COLOR_MODEL_YCbCr", nvcvColorModelGetName(NVCV_COLOR_MODEL_YCbCr)); + EXPECT_STREQ("NVCV_COLOR_MODEL_HSV", nvcvColorModelGetName(NVCV_COLOR_MODEL_HSV)); + EXPECT_STREQ("NVCV_COLOR_MODEL_CMYK", nvcvColorModelGetName(NVCV_COLOR_MODEL_CMYK)); + EXPECT_STREQ("NVCV_COLOR_MODEL_YCCK", nvcvColorModelGetName(NVCV_COLOR_MODEL_YCCK)); EXPECT_STREQ("NVCVColorModel(-1)", nvcvColorModelGetName(static_cast(-1))); } @@ -457,7 +577,12 @@ TEST(YCbCrEncodingTests, undefined_ycbcr_encoding_is_zero) TEST(YCbCrEncodingTests, get_name) { + EXPECT_STREQ("NVCV_YCbCr_ENC_UNDEFINED", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_UNDEFINED)); EXPECT_STREQ("NVCV_YCbCr_ENC_BT601", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_BT601)); + EXPECT_STREQ("NVCV_YCbCr_ENC_BT709", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_BT709)); + EXPECT_STREQ("NVCV_YCbCr_ENC_BT2020", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_BT2020)); + EXPECT_STREQ("NVCV_YCbCr_ENC_BT2020c", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_BT2020c)); + EXPECT_STREQ("NVCV_YCbCr_ENC_SMPTE240M", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_SMPTE240M)); EXPECT_STREQ("NVCVYCbCrEncoding(-1)", nvcvYCbCrEncodingGetName(static_cast(-1))); } @@ -466,6 +591,7 @@ TEST(YCbCrEncodingTests, get_name) TEST(ChromaLocationTests, get_name) { EXPECT_STREQ("NVCV_CHROMA_LOC_EVEN", nvcvChromaLocationGetName(NVCV_CHROMA_LOC_EVEN)); + EXPECT_STREQ("NVCV_CHROMA_LOC_BOTH", nvcvChromaLocationGetName(NVCV_CHROMA_LOC_BOTH)); EXPECT_STREQ("NVCVChromaLocation(-1)", nvcvChromaLocationGetName(static_cast(-1))); } @@ -474,6 +600,19 @@ TEST(ChromaLocationTests, get_name) TEST(RawPatternTests, get_name) { EXPECT_STREQ("NVCV_RAW_BAYER_RGGB", nvcvRawPatternGetName(NVCV_RAW_BAYER_RGGB)); + EXPECT_STREQ("NVCV_RAW_BAYER_BGGR", nvcvRawPatternGetName(NVCV_RAW_BAYER_BGGR)); + EXPECT_STREQ("NVCV_RAW_BAYER_GRBG", nvcvRawPatternGetName(NVCV_RAW_BAYER_GRBG)); + EXPECT_STREQ("NVCV_RAW_BAYER_GBRG", nvcvRawPatternGetName(NVCV_RAW_BAYER_GBRG)); + EXPECT_STREQ("NVCV_RAW_BAYER_RCCB", nvcvRawPatternGetName(NVCV_RAW_BAYER_RCCB)); + EXPECT_STREQ("NVCV_RAW_BAYER_BCCR", nvcvRawPatternGetName(NVCV_RAW_BAYER_BCCR)); + EXPECT_STREQ("NVCV_RAW_BAYER_CRBC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CRBC)); + EXPECT_STREQ("NVCV_RAW_BAYER_CBRC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CBRC)); + EXPECT_STREQ("NVCV_RAW_BAYER_RCCC", nvcvRawPatternGetName(NVCV_RAW_BAYER_RCCC)); + EXPECT_STREQ("NVCV_RAW_BAYER_CRCC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CRCC)); + EXPECT_STREQ("NVCV_RAW_BAYER_CCRC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CCRC)); + EXPECT_STREQ("NVCV_RAW_BAYER_CCCR", nvcvRawPatternGetName(NVCV_RAW_BAYER_CCCR)); + EXPECT_STREQ("NVCV_RAW_BAYER_CCCC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CCCC)); + EXPECT_STREQ("NVCVRawPattern(255)", nvcvRawPatternGetName(NVCV_RAW_FORCE8)); EXPECT_STREQ("NVCVRawPattern(-1)", nvcvRawPatternGetName(static_cast(-1))); } @@ -481,7 +620,10 @@ TEST(RawPatternTests, get_name) TEST(ColorSpaceTests, get_name) { + EXPECT_STREQ("NVCV_COLOR_SPACE_BT601", nvcvColorSpaceGetName(NVCV_COLOR_SPACE_BT601)); EXPECT_STREQ("NVCV_COLOR_SPACE_BT709", nvcvColorSpaceGetName(NVCV_COLOR_SPACE_BT709)); + EXPECT_STREQ("NVCV_COLOR_SPACE_BT2020", nvcvColorSpaceGetName(NVCV_COLOR_SPACE_BT2020)); + EXPECT_STREQ("NVCV_COLOR_SPACE_DCIP3", nvcvColorSpaceGetName(NVCV_COLOR_SPACE_DCIP3)); EXPECT_STREQ("NVCVColorSpace(-1)", nvcvColorSpaceGetName(static_cast(-1))); } @@ -490,6 +632,7 @@ TEST(ColorSpaceTests, get_name) 
TEST(WhitePointTests, get_name) { EXPECT_STREQ("NVCV_WHITE_POINT_D65", nvcvWhitePointGetName(NVCV_WHITE_POINT_D65)); + EXPECT_STREQ("NVCVWhitePoint(255)", nvcvWhitePointGetName(NVCV_WHITE_POINT_FORCE8)); EXPECT_STREQ("NVCVWhitePoint(-1)", nvcvWhitePointGetName(static_cast(-1))); } diff --git a/tests/nvcv_types/system/TestDataLayout.cpp b/tests/nvcv_types/system/TestDataLayout.cpp index 5f33df0e..437c2151 100644 --- a/tests/nvcv_types/system/TestDataLayout.cpp +++ b/tests/nvcv_types/system/TestDataLayout.cpp @@ -677,6 +677,47 @@ TEST(PackingTests, get_name) EXPECT_STREQ("NVCVPacking(2147483647)", nvcvPackingGetName(NVCV_PACKING_LIMIT32)); } +class PackingTests_Alignment : public t::TestWithParam> +{ +}; + +INSTANTIATE_TEST_SUITE_P( + _, PackingTests_Alignment, + t::Values(std::make_tuple(NVCV_PACKING_X24, 4), std::make_tuple(NVCV_PACKING_X4b4, 1), + std::make_tuple(NVCV_PACKING_X10b6, 2), std::make_tuple(NVCV_PACKING_b4X12, 2), + std::make_tuple(NVCV_PACKING_b4X12, 2), std::make_tuple(NVCV_PACKING_X8_Y8, 1), + std::make_tuple(NVCV_PACKING_X5Y5Z6, 2), std::make_tuple(NVCV_PACKING_b4X4Y4Z4, 2), + std::make_tuple(NVCV_PACKING_X1Y5Z5W5, 2), std::make_tuple(NVCV_PACKING_X8_Y8__X8_Z8, 1), + std::make_tuple(NVCV_PACKING_X8_Y8_Z8, 1), std::make_tuple(NVCV_PACKING_b2X10Y10Z10, 4), + std::make_tuple(NVCV_PACKING_X12b4_Y12b4, 2), std::make_tuple(NVCV_PACKING_X32_Y24b8, 4))); + +TEST_P(PackingTests_Alignment, get_alignment) +{ + auto param = GetParam(); + + NVCVPacking packing = std::get<0>(param); + const int32_t expectedAlignment = std::get<1>(param); + int32_t outAlignment = -1; + + ASSERT_EQ(NVCV_SUCCESS, nvcvPackingGetAlignment(packing, &outAlignment)); // 16 / 8 = 2 + EXPECT_EQ(expectedAlignment, outAlignment); +} + +TEST(PackingTests_Negative, Invalid_parameter) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetParams(NVCV_PACKING_X16, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetNumComponents(NVCV_PACKING_X16, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetBitsPerPixel(NVCV_PACKING_X16, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetBitsPerComponent(NVCV_PACKING_X16, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetAlignment(NVCV_PACKING_X16, nullptr)); + + NVCVPackingParams params; + NVCVPacking packing; + ASSERT_EQ(NVCV_SUCCESS, nvcvPackingGetParams(NVCV_PACKING_X16, ¶ms)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvMakePacking(&packing, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvMakePacking(nullptr, ¶ms)); +} + TEST(ByteOrderTests, get_name) { EXPECT_STREQ("LSB", nvcvByteOrderGetName(NVCV_ORDER_LSB)); @@ -690,15 +731,26 @@ TEST(SwizzleTests, get_name) // nvcvSwizzleGetName(NVCV_DETAIL_MAKE_SWIZZLE(NVCV_CHANNEL_1, NVCV_CHANNEL_1, NVCV_CHANNEL_0, NVCV_CHANNEL_X))); } +TEST(SwizzleTests_Negative, Invalid_parameter) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvSwizzleGetNumChannels(NVCV_SWIZZLE_0000, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvMakeSwizzle(nullptr, NVCV_CHANNEL_X, NVCV_CHANNEL_Y, NVCV_CHANNEL_Z, NVCV_CHANNEL_W)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvSwizzleGetChannels(NVCV_SWIZZLE_0000, nullptr)); +} + TEST(SwizzleChannelTests, get_name) { EXPECT_STREQ("Y", nvcvChannelGetName(NVCV_CHANNEL_Y)); + EXPECT_STREQ("NVCVChannel(255)", nvcvChannelGetName(NVCV_CHANNEL_FORCE8)); EXPECT_STREQ("NVCVChannel(7)", nvcvChannelGetName(static_cast(7))); } TEST(MemLayoutTests, get_name) { + EXPECT_STREQ("NVCV_MEM_LAYOUT_BLOCK1_LINEAR", 
nvcvMemLayoutGetName(NVCV_MEM_LAYOUT_BLOCK1_LINEAR)); EXPECT_STREQ("NVCV_MEM_LAYOUT_BLOCK2_LINEAR", nvcvMemLayoutGetName(NVCV_MEM_LAYOUT_BLOCK2_LINEAR)); + EXPECT_STREQ("NVCV_MEM_LAYOUT_BLOCK32_LINEAR", nvcvMemLayoutGetName(NVCV_MEM_LAYOUT_BLOCK32_LINEAR)); EXPECT_STREQ("NVCVMemLayout(-1)", nvcvMemLayoutGetName(static_cast(-1))); } @@ -706,4 +758,5 @@ TEST(DataKindTests, get_name) { EXPECT_STREQ("NVCV_DATA_KIND_FLOAT", nvcvDataKindGetName(NVCV_DATA_KIND_FLOAT)); EXPECT_STREQ("NVCV_DATA_KIND_UNSPECIFIED", nvcvDataKindGetName(static_cast(-1))); + EXPECT_STREQ("NVCVDataKind(-128)", nvcvDataKindGetName(static_cast(-128))); } diff --git a/tests/nvcv_types/system/TestImageBatch.cpp b/tests/nvcv_types/system/TestImageBatch.cpp index c201aa67..cc6b1250 100644 --- a/tests/nvcv_types/system/TestImageBatch.cpp +++ b/tests/nvcv_types/system/TestImageBatch.cpp @@ -356,6 +356,199 @@ TEST(ImageBatchVarShape, smoke_sync) ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } +TEST(ImageBatchVarShape, push_exceed_capacity) +{ + nvcv::ImageBatchVarShape batch(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + for (int i = 0; i < batch.capacity() + 1; ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + EXPECT_EQ(NVCV_ERROR_OVERFLOW, + nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); +} + +TEST(ImageBatchVarShape, push_null_images) +{ + nvcv::ImageBatchVarShape batch(32); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapePushImages(batch.handle(), nullptr, batch.capacity())); +} + +TEST(ImageBatchVarShape, push_callback_exceed_capacity) +{ + nvcv::ImageBatchVarShape batch(32); + std::vector vec1Handles; + + auto cb = [&]() -> nvcv::Image + { + int i = batch.numImages(); + if (i < batch.capacity() + 1) + { + nvcv::Image img(nvcv::Size2D{320 + i * 2, 122 - i * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(img.handle()); + return img; + } + else + { + return {}; + } + }; + auto *pcb = &cb; + auto ccb = [](void *ctx) -> NVCVImageHandle + { + return nvcv::detail::GetImageHandleForPushBack((*decltype(pcb)(ctx))()); + }; + + EXPECT_EQ(NVCV_ERROR_OVERFLOW, nvcvImageBatchVarShapePushImagesCallback(batch.handle(), ccb, pcb)); + + // clean + for (auto imgHandle : vec1Handles) + { + int newRef = -1; + nvcvImageDecRef(imgHandle, &newRef); + EXPECT_EQ(0, newRef); + } +} + +TEST(ImageBatchVarShape, push_callback_null_cbPushImage) +{ + nvcv::ImageBatchVarShape batch(32); + + auto cb = [&]() -> nvcv::Image + { + int i = batch.numImages(); + if (i < batch.capacity()) + { + nvcv::Image img(nvcv::Size2D{320 + i * 2, 122 - i * 2}, nvcv::FMT_NV12); + return img; + } + else + { + return {}; + } + }; + auto *pcb = &cb; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapePushImagesCallback(batch.handle(), nullptr, pcb)); + batch.clear(); +} + +TEST(ImageBatchVarShape, pop_negative_num_iamges) +{ + nvcv::ImageBatchVarShape batch(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, 
nvcvImageBatchVarShapePopImages(batch.handle(), -1)); +} + +TEST(ImageBatchVarShape, pop_exceed_max_images) +{ + nvcv::ImageBatchVarShape batch(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePopImages(batch.handle(), batch.capacity())); + EXPECT_EQ(NVCV_ERROR_UNDERFLOW, nvcvImageBatchVarShapePopImages(batch.handle(), 1)); +} + +TEST(ImageBatchVarShape, get_null_images) +{ + nvcv::ImageBatchVarShape batch(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvImageBatchVarShapeGetImages(batch.handle(), 0, nullptr, batch.capacity())); +} + +TEST(ImageBatchVarShape, get_negative_index) +{ + nvcv::ImageBatchVarShape batch(32); + std::vector outputHandles(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvImageBatchVarShapeGetImages(batch.handle(), -1, outputHandles.data(), batch.capacity())); +} + +TEST(ImageBatchVarShape, get_overflow_index_in_handle) +{ + nvcv::ImageBatchVarShape batch(32); + std::vector outputHandles(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + + EXPECT_EQ(NVCV_ERROR_OVERFLOW, + nvcvImageBatchVarShapeGetImages(batch.handle(), 0, outputHandles.data(), batch.capacity() + 1)); + EXPECT_EQ(NVCV_ERROR_OVERFLOW, + nvcvImageBatchVarShapeGetImages(batch.handle(), 1, outputHandles.data(), batch.capacity())); +} + TEST(ImageBatch, smoke_user_pointer) { nvcv::ImageBatchVarShape batch(3); @@ -393,3 +586,79 @@ TEST(ImageBatch, smoke_cast) ref = img.reset(); EXPECT_EQ(ref, 0); } + +class ImageBatchNullParamTest : public ::testing::Test +{ +protected: + ImageBatchNullParamTest() {} + + ~ImageBatchNullParamTest() {} + + void SetUp() override + { + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeCalcRequirements(5, &reqs)); + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeConstruct(&reqs, nullptr, &handle)); + } + + void TearDown() override + { + int newRef = 1; + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchDecRef(handle, &newRef)); + ASSERT_EQ(newRef, 0); + handle = nullptr; + } + + 
NVCVImageBatchHandle handle; + NVCVImageBatchVarShapeRequirements reqs; +}; + +TEST(ImageBatch, calc_req_invalid_parameters) +{ + NVCVImageBatchVarShapeRequirements reqs; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeCalcRequirements(5, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeCalcRequirements(-1, &reqs)); +} + +TEST(ImageBatch, construct_null_parameters) +{ + NVCVImageBatchHandle handle; + NVCVImageBatchVarShapeRequirements reqs; + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeCalcRequirements(5, &reqs)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeConstruct(nullptr, nullptr, &handle)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeConstruct(&reqs, nullptr, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_user_pointer_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetUserPointer(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_num_images_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetNumImages(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_batch_capacity_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetCapacity(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_unique_format_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeGetUniqueFormat(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, gbatch_get_type_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetType(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, export_data_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchExportData(handle, 0, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_max_size_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeGetMaxSize(handle, nullptr, nullptr)); +} diff --git a/tests/nvcv_types/system/TestSize.cpp b/tests/nvcv_types/system/TestSize.cpp index f9272ee5..73c30916 100644 --- a/tests/nvcv_types/system/TestSize.cpp +++ b/tests/nvcv_types/system/TestSize.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,6 +23,20 @@ namespace gt = ::testing; namespace test = nvcv::test; +TEST(Size2D, C_interop) +{ + NVCVSize2D c_size{5, 7}; + nvcv::Size2D cxx_size = c_size; // construction + EXPECT_EQ(cxx_size, c_size); // comparison + cxx_size.w++; + EXPECT_NE(cxx_size, c_size); + cxx_size = c_size; // assignment + EXPECT_EQ(cxx_size, c_size); + c_size = cxx_size; // reverse assignment + NVCVSize2D c_size2 = cxx_size; // reverse construction + EXPECT_EQ(c_size2, cxx_size); +} + // Size2D Equality -------------------------------------------- class Size2DEqualityTests : public gt::TestWithParam> { diff --git a/tests/nvcv_types/system/TestTensorBatch.cpp b/tests/nvcv_types/system/TestTensorBatch.cpp new file mode 100644 index 00000000..8cd1bf28 --- /dev/null +++ b/tests/nvcv_types/system/TestTensorBatch.cpp @@ -0,0 +1,467 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Definitions.hpp" + +#include +#include +#include +#include + +#include +#include + +#include + +namespace t = ::testing; +namespace test = nvcv::test; + +template +nvcv::Tensor GetRandomTensor(R &rg, const nvcv::ImageFormat &format) +{ + std::uniform_int_distribution shape_dist(100, 400); + std::uniform_int_distribution images_num_dist(1, 16); + return nvcv::Tensor(images_num_dist(rg), {shape_dist(rg), shape_dist(rg)}, format); +} + +template +void CheckTensorBatchData(const nvcv::TensorBatchData &tbdata, It tensors_begin, It tensors_end, CUstream stream) +{ + auto numTensors = tensors_end - tensors_begin; + ASSERT_EQ(numTensors, tbdata.numTensors()); + std::vector elements(numTensors); + ASSERT_TRUE(tbdata.cast().hasValue()); + auto buffer = tbdata.cast()->buffer(); + ASSERT_EQ(cudaSuccess, + cudaMemcpyAsync(elements.data(), buffer.tensors, sizeof(NVCVTensorBatchElementStrided) * numTensors, + cudaMemcpyDeviceToHost, stream)); + + int i = 0; + for (auto it = tensors_begin; it != tensors_end; ++it) + { + nvcv::Tensor &tensor = *it; + auto tdata = tensor.exportData().cast().value(); + auto &element = elements[i]; + EXPECT_EQ(tdata.layout(), tbdata.layout()); + EXPECT_EQ(tdata.dtype(), tbdata.dtype()); + EXPECT_EQ(tdata.basePtr(), reinterpret_cast(element.data)); + ASSERT_EQ(tdata.rank(), tbdata.rank()); + for (int d = 0; d < tbdata.rank(); ++d) + { + EXPECT_EQ(tdata.shape(d), element.shape[d]); + EXPECT_EQ(tdata.stride(d), element.stride[d]); + } + ++i; + } +} + +TEST(TensorBatch, create) +{ + auto reqs = nvcv::TensorBatch::CalcRequirements(1); + std::vector tensors; + tensors.emplace_back(nvcv::Tensor(1, {300, 300}, nvcv::FMT_RGB8)); + { + nvcv::TensorBatch tb(reqs); + EXPECT_EQ(tb.layout(), nvcv::TensorLayout("")); + EXPECT_EQ(tb.dtype(), nvcv::DataType()); + tb.pushBack(tensors[0]); + ASSERT_EQ(tb.numTensors(), 1); + ASSERT_EQ(tensors[0].refCount(), 2); + auto tbdata = tb.exportData(nullptr); + CheckTensorBatchData(tbdata, tensors.begin(), tensors.end(), nullptr); + } + ASSERT_EQ(tensors[0].refCount(), 1); +} + +TEST(TensorBatch, ref_counting) +{ + std::mt19937 rg{231}; + nvcv::Tensor tensor = GetRandomTensor(rg, nvcv::FMT_RGB8); + { + auto reqs = nvcv::TensorBatch::CalcRequirements(1); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensor); + ASSERT_EQ(tb.refCount(), 1); + ASSERT_EQ(tensor.refCount(), 2); + int numMul = 32; + std::vector tbs(numMul, tb); + ASSERT_EQ(tb.refCount(), numMul + 1); + ASSERT_EQ(tensor.refCount(), 2); + } + ASSERT_EQ(tensor.refCount(), 1); +} + +TEST(TensorBatch, properties) +{ + int32_t capacity = 32; + std::vector tensors(capacity / 2); + std::mt19937 rg{321}; + for (int i = 0; i < capacity / 2; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + EXPECT_EQ(tb.dtype(), nvcv::TYPE_U8); + EXPECT_EQ(tb.capacity(), capacity); + EXPECT_EQ(tb.numTensors(), capacity / 2); + EXPECT_EQ(tb.layout(), nvcv::TensorLayout("NHWC")); + EXPECT_EQ(tb.type(), NVCV_TENSOR_BUFFER_STRIDED_CUDA); +} 
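
The C++ `TensorBatch` cases above cover the same semantics that the new Python binding tests (test_tensor_batch.py, earlier in this patch) exercise through `nvcv.TensorBatch`: capacity is fixed at construction, while layout, dtype and rank are inferred from the first tensor pushed and reset once the batch becomes empty. A small cross-reference sketch of the Python side, based only on behaviour asserted in those tests:

import numpy as np
import torch
import nvcv

batch = nvcv.TensorBatch(8)    # capacity for 8 tensors
assert batch.capacity == 8 and len(batch) == 0
assert batch.layout is None and batch.dtype is None and batch.ndim == -1

t = nvcv.as_tensor(torch.zeros((16, 16, 3), dtype=torch.uint8, device="cuda"), layout="HWC")
batch.pushback(t)              # the first tensor fixes layout, dtype and rank
assert batch.layout == "HWC" and batch.dtype == np.uint8 and batch.ndim == 3

batch.popback()                # removing it empties the batch again
assert len(batch) == 0
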
+ +TEST(TensorBatch, user_pointer) +{ + auto reqs = nvcv::TensorBatch::CalcRequirements(1); + nvcv::TensorBatch tb(reqs); + int valueA = 0; + tb.setUserPointer(&valueA); + EXPECT_EQ(tb.getUserPointer(), &valueA); + auto tbCopy = tb; + std::cout << tb.refCount() << std::endl; + EXPECT_EQ(tbCopy.getUserPointer(), &valueA); + int valueB = 0; + tb.setUserPointer(&valueB); + EXPECT_EQ(tb.getUserPointer(), &valueB); + EXPECT_EQ(tbCopy.getUserPointer(), &valueB); +} + +TEST(TensorBatch, consistency_validation) +{ + std::mt19937 rg{321}; + auto base_tensor = GetRandomTensor(rg, nvcv::FMT_RGB8); + + auto test_inconsistency = [&](int32_t rank, nvcv::DataType dtype, nvcv::TensorLayout layout) + { + auto reqs = nvcv::TensorBatch::CalcRequirements(2); + nvcv::TensorBatch tb(reqs); + std::vector shape(rank, 1); + nvcv::Tensor tensor(nvcv::TensorShape(shape.data(), rank, layout), dtype); + tb.pushBack(tensor); + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_INVALID_ARGUMENT, tb.pushBack(base_tensor)); + }; + test_inconsistency(4, nvcv::TYPE_U8, nvcv::TensorLayout("FHWC")); + test_inconsistency(4, nvcv::TYPE_U32, nvcv::TensorLayout("NHWC")); + test_inconsistency(3, nvcv::TYPE_U8, nvcv::TensorLayout("HWC")); +} + +TEST(TensorBatch, push_in_parts) +{ + const int32_t iters = 20; + const int32_t capacity = iters * (iters + 1) / 2; + std::vector tensors(capacity); + std::mt19937 rg{123}; + for (int32_t i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + std::array streams{}; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); + { + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + auto tensors_begin = tensors.data(); + for (int32_t i = 1; i < 21; ++i) + { + tb.pushBack(tensors_begin, tensors_begin + i); + ASSERT_EQ(tb.numTensors(), tensors_begin + i - tensors.data()); + if (i % 2 == 0) + { + auto stream = streams[i / 2 % 2]; + auto tbdata = tb.exportData(stream); + CheckTensorBatchData(tbdata, tensors.data(), tensors_begin + i, stream); + } + tensors_begin += i; + } + for (auto &t : tensors) + { + ASSERT_EQ(t.refCount(), 2); + } + } + for (auto &t : tensors) + { + ASSERT_EQ(t.refCount(), 1); + } +} + +TEST(TensorBatch, push_the_same_tensor) +{ + const int numMul = 32; + std::mt19937 rg{123}; + auto tensor = GetRandomTensor(rg, nvcv::FMT_RGB8); + std::vector tensors; + for (int i = 0; i < numMul; ++i) + { + tensors.push_back(tensor); + } + ASSERT_EQ(tensor.refCount(), numMul + 1); + { + auto reqs = nvcv::TensorBatch::CalcRequirements(numMul); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + ASSERT_EQ(tensor.refCount(), numMul * 2 + 1); + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_OVERFLOW, tb.pushBack(tensor)); + } + ASSERT_EQ(tensor.refCount(), numMul + 1); +} + +TEST(TensorBatch, clear) +{ + const int32_t capacity = 32; + std::vector tensors(capacity); + std::mt19937 rg{123}; + for (int32_t i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + for (auto &t : tensors) + { + EXPECT_EQ(t.refCount(), 2); + } + tb.clear(); + for (auto &t : tensors) + { + EXPECT_EQ(t.refCount(), 1); + } + EXPECT_EQ(tb.layout(), nvcv::TensorLayout("")); + EXPECT_EQ(tb.dtype(), nvcv::DataType()); +} + +TEST(TensorBatch, pop_tensors) +{ + const int32_t capacity = 32; + std::vector tensors(capacity); + std::mt19937 
rg{123}; + for (int32_t i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + + tb.pushBack(tensors.data(), tensors.data() + capacity / 2); + tb.popTensors(capacity / 4); // remove dirty tensors + // tensor batch should contain the first quarter of the tensors + ASSERT_EQ(tb.numTensors(), capacity / 4); + auto data = tb.exportData(nullptr); + CheckTensorBatchData(data, tensors.data(), tensors.data() + capacity / 4, nullptr); + for (int i = 0; i < capacity / 4; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 2); + } + for (int i = capacity / 4; i < capacity / 2; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 1); + } + + tb.pushBack(tensors.data() + capacity / 2, tensors.data() + capacity); + // tensor batch should contain the first quarter and the last half of the tensors + EXPECT_EQ(tb.numTensors(), capacity * 3 / 4); + for (int i = capacity / 2; i < capacity; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 2); + } + std::vector result{}; + result.insert(result.end(), tensors.begin(), tensors.begin() + capacity / 4); + result.insert(result.end(), tensors.begin() + capacity / 2, tensors.begin() + capacity); + data = tb.exportData(nullptr); + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + + tb.popTensors(capacity / 4); // remove clean tensors; + // tensor batch should contain the first and the third quarter of the tensors + EXPECT_EQ(tb.numTensors(), capacity / 2); + for (int i = 0; i < capacity / 4; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 2); + EXPECT_EQ(tensors[i + capacity / 4].refCount(), 1); + EXPECT_EQ(tensors[i + capacity * 2 / 4].refCount(), 2); + EXPECT_EQ(tensors[i + capacity * 3 / 4].refCount(), 1); + } + data = tb.exportData(nullptr); + result.insert(result.end(), tensors.begin(), tensors.begin() + capacity / 4); + result.insert(result.end(), tensors.begin() + capacity / 2, tensors.begin() + capacity * 3 / 4); + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + + tb.pushBack(tensors.begin(), tensors.begin() + capacity / 4); + // tensor batch should contain the first, the third and the first (again) quarter + EXPECT_EQ(tb.numTensors(), capacity * 3 / 4); + for (int i = 0; i < capacity / 4; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 3); + EXPECT_EQ(tensors[i + capacity / 4].refCount(), 1); + EXPECT_EQ(tensors[i + capacity * 2 / 4].refCount(), 2); + EXPECT_EQ(tensors[i + capacity * 3 / 4].refCount(), 1); + } + tb.popTensors(capacity / 2); // remove clean and dirty tensors + // tensor batch should contain the first quarter of the tensors + EXPECT_EQ(tb.numTensors(), capacity / 4); + for (int i = 0; i < capacity / 4; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 2); + } + for (int i = capacity / 4; i < capacity; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 1); + } + data = tb.exportData(nullptr); + result.insert(result.end(), tensors.begin(), tensors.begin() + capacity / 4); + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + + tb.pushBack(tensors[0]); + EXPECT_EQ(tensors[0].refCount(), 3); + tb.popTensor(); // pop single tensor + EXPECT_EQ(tensors[0].refCount(), 2); + data = tb.exportData(nullptr); + result.insert(result.end(), tensors.begin(), tensors.begin() + capacity / 4); + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_UNDERFLOW, tb.popTensors(capacity / 4 + 1)); + 
NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_INVALID_ARGUMENT, tb.popTensors(-1)); +} + +TEST(TensorBatch, iterator_arithm) +{ + int32_t capacity = 4; + std::vector tensors(capacity); + std::mt19937 rg{321}; + for (int i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + + auto it = tb.begin(); + EXPECT_EQ(it, tb.end()); + + tb.pushBack(tensors.begin(), tensors.end()); + it = tb.begin(); + + EXPECT_EQ(it->handle(), tensors[0].handle()); + EXPECT_EQ((++it)->handle(), tensors[1].handle()); + EXPECT_EQ((it++)->handle(), tensors[1].handle()); + EXPECT_EQ((--it)->handle(), tensors[1].handle()); + EXPECT_EQ((it--)->handle(), tensors[1].handle()); + EXPECT_EQ((it + capacity - 1)->handle(), tensors[capacity - 1].handle()); + + EXPECT_EQ((tb.end() - capacity), tb.begin()); + EXPECT_GT(tb.end(), tb.begin()); + EXPECT_GE(it, tb.begin()); + EXPECT_LT(it, it + 2); + EXPECT_LE(it, it + 1); + + EXPECT_EQ(tb.end() - it, capacity); +} + +TEST(TensorBatch, indexing_and_iterating) +{ + int32_t capacity = 32; + std::vector tensors(capacity); + std::mt19937 rg{321}; + for (int i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + for (int i = 0; i < capacity; ++i) + { + EXPECT_EQ(tb[i].handle(), tensors[i].handle()); + } + + int i = 0; + for (auto t : tb) + { + EXPECT_EQ(t.handle(), tensors[i++].handle()); + } + + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_OVERFLOW, tb[capacity]); + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_INVALID_ARGUMENT, tb[-1]); +} + +TEST(TensorBatch, set_tensor) +{ + int32_t capacity = 32; + std::vector tensors(capacity); + std::mt19937 rg{321}; + for (int i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + auto tensorA = GetRandomTensor(rg, nvcv::FMT_RGB8); + auto tensorB = GetRandomTensor(rg, nvcv::FMT_RGB8); + + tb.setTensor(0, tensorA); // set at dirty position + auto data = tb.exportData(nullptr); + auto result = tensors; + result[0] = tensorA; + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + EXPECT_EQ(tensors[0].refCount(), 1); + EXPECT_EQ(tensorA.refCount(), 2); + + tb.setTensor(capacity / 4, tensorA); + tb.setTensor(capacity / 2, tensorB); // set at clean positions + data = tb.exportData(nullptr); + result = tensors; + result[0] = result[capacity / 4] = tensorA; + result[capacity / 2] = tensorB; + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + EXPECT_EQ(tensors[0].refCount(), 1); + EXPECT_EQ(tensors[capacity / 4].refCount(), 1); + EXPECT_EQ(tensors[capacity / 2].refCount(), 1); + EXPECT_EQ(tensorA.refCount(), 3); + EXPECT_EQ(tensorB.refCount(), 2); + + for (int i = capacity - 10; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + tb.popTensors(10); + tb.pushBack(tensors.begin() + capacity - 10, tensors.end()); + ASSERT_EQ(tb.numTensors(), capacity); + + tb.setTensor(capacity / 2 + 1, tensorA); // set at clean position + tb.setTensor(capacity - 2, tensorB); // set at dirty position + data = tb.exportData(nullptr); + result = tensors; + result[0] = result[capacity / 4] = tensorA; + result[capacity / 2] = tensorB; + result[capacity 
/ 2 + 1] = tensorA; + result[capacity - 2] = tensorB; + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + EXPECT_EQ(tensors[0].refCount(), 1); + EXPECT_EQ(tensors[capacity / 4].refCount(), 1); + EXPECT_EQ(tensors[capacity / 2].refCount(), 1); + EXPECT_EQ(tensors[capacity / 2 + 1].refCount(), 1); + EXPECT_EQ(tensors[capacity - 2].refCount(), 1); + EXPECT_EQ(tensorA.refCount(), 4); + EXPECT_EQ(tensorB.refCount(), 3); +} diff --git a/tests/nvcv_types/system/TestTensorDataUtils.cpp b/tests/nvcv_types/system/TestTensorDataUtils.cpp index c4a3d207..04b54440 100644 --- a/tests/nvcv_types/system/TestTensorDataUtils.cpp +++ b/tests/nvcv_types/system/TestTensorDataUtils.cpp @@ -35,7 +35,7 @@ namespace util = nvcv::util; NVCV_TEST_SUITE_P(TensorDataUtils, test::ValueList { //width, height, numImages, fill byte, format - { 2, 2, 2, 2, nvcv::FMT_RGB8}, + { 2, 2, 1, 2, nvcv::FMT_RGB8}, { 3, 3, 5, 2, nvcv::FMT_BGR8}, { 10, 11, 2, 2, nvcv::FMT_RGBA8}, { 5, 5, 1, 2, nvcv::FMT_BGRA8}, @@ -216,6 +216,41 @@ TEST_P(TensorDataUtils, SetGetTensorFromVector) EXPECT_NO_THROW(GetSetTensor(tensor)); } +TEST_P(TensorDataUtils, SetGetTensorToFromByteVector) +{ + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int number = GetParamValue<2>(); + nvcv::ImageFormat fmt = GetParamValue<4>(); + + // This will return the number of channels in the plane 0, so with planar + // this must be considered only for that plane. + int numChannels = fmt.numChannels(); + int bytesPerPixel = 0; + + for (int i = 0; i < numChannels; i++) + { + bytesPerPixel += fmt.bitsPerChannel()[i] / 8; + } + nvcv::Tensor tensor(number, {width, height}, fmt); + + std::default_random_engine randEng(0); + std::uniform_int_distribution rand(0u, 255u); + + // Test the CHW/HWC tensors + for (int i = 0; i < number; ++i) + { + std::vector imageVec((width * height) * bytesPerPixel); + std::generate(imageVec.begin(), imageVec.end(), [&]() { return (nvcv::Byte)rand(randEng); }); + std::vector outVec((width * height) * bytesPerPixel); + EXPECT_NO_THROW(util::SetImageTensorFromByteVector(tensor.exportData(), imageVec, i)); + EXPECT_NO_THROW(util::GetImageByteVectorFromTensor(tensor.exportData(), i, outVec)); + EXPECT_EQ(imageVec, outVec); + } + + return; +} + TEST_P(TensorDataUtils, SetGetTensorFromImageVector) { int width = GetParamValue<0>(); diff --git a/tests/nvcv_types/unit/CMakeLists.txt b/tests/nvcv_types/unit/CMakeLists.txt index e2be5753..d42a9ca2 100644 --- a/tests/nvcv_types/unit/CMakeLists.txt +++ b/tests/nvcv_types/unit/CMakeLists.txt @@ -35,6 +35,9 @@ add_executable(nvcv_test_types_unit TestHandleWrapper.cpp TestTypeTraits.cpp TestSharedCoreObj.cpp + TestStreamId.cpp + TestSimpleCache.cpp + TestPerStreamCache.cpp ) if(ENABLE_COMPAT_OLD_GLIBC) @@ -52,6 +55,7 @@ target_link_libraries(nvcv_test_types_unit nvcv_util nvcv_test_common nvcv_types_priv + cuda ) -nvcv_add_test(nvcv_test_types_unit) +nvcv_add_test(nvcv_test_types_unit nvcv) diff --git a/tests/nvcv_types/unit/TestPerStreamCache.cpp b/tests/nvcv_types/unit/TestPerStreamCache.cpp new file mode 100644 index 00000000..8f066426 --- /dev/null +++ b/tests/nvcv_types/unit/TestPerStreamCache.cpp @@ -0,0 +1,396 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Definitions.hpp" + +#include +#include +#include + +#include + +namespace { + +class Hog +{ +public: + Hog(size_t size = 64 << 20) + : m_size(size) + { + NVCV_CHECK_THROW(cudaMalloc(&m_buf, m_size)); + } + + ~Hog() + { + (void)cudaFree(m_buf); + } + + void Run(cudaStream_t stream, int iters) + { + for (int i = 0; i < iters; i++) + { + NVCV_CHECK_THROW(cudaMemsetAsync(m_buf, i, m_size, stream)); + } + } + +private: + void *m_buf = nullptr; + size_t m_size; +}; + +struct DummyPayload +{ + size_t size = 0, alignment = 1; + cudaEvent_t ready = nullptr; +}; + +using ItemAlloc = nvcv::util::detail::StreamCacheItemAllocator; + +} // namespace + +namespace nvcv::util { + +TEST(StreamCacheItemAllocator, BasicTest) +{ + ItemAlloc alloc; + std::vector items; + std::mt19937_64 rng; + std::bernoulli_distribution action; + for (int i = 0; i < 1000; i++) + { + if (action(rng) || items.empty()) + { + items.push_back(alloc.allocate()); + } + else + { + int n = items.size(); + std::uniform_int_distribution dist(0, n - 1); + int i = dist(rng); + std::swap(items[i], items.back()); + alloc.deallocate(items.back()); + items.pop_back(); + } + } + while (!items.empty()) + { + alloc.deallocate(items.back()); + items.pop_back(); + } +} + +namespace { + +struct EventAlloc +{ + void reserve(int count) + { + for (int i = 0; i < count; i++) get(); + clear(); + } + + void clear() + { + for (auto &event : events) cache.put(std::move(event)); + events.clear(); + } + + std::vector events; + EventCache cache; + + cudaEvent_t get() + { + events.push_back(cache.get()); + return events.back().get(); + }; +}; + +} // namespace + +TEST(StreamOrderedCacheTest, InsertGet) +{ + EventAlloc events; + ItemAlloc alloc; + detail::StreamOrderedCache cache(&alloc); + + EXPECT_FALSE(cache.get(1000, 1).has_value()) << "The cache should be empty"; + DummyPayload p{}; + p.size = 1000; + p.ready = events.get(); + cache.put(std::move(p)); + EXPECT_FALSE(cache.get(2000, 1).has_value()) << "The cache doesn't contain any element large enough"; + auto v = cache.get(500, 1); + ASSERT_TRUE(v.has_value()); + EXPECT_EQ(v->size, 1000) << "The cache contains a suitable element"; + v = cache.get(500, 1); + EXPECT_FALSE(cache.get(0, 0)) << "The element was already removed"; +} + +TEST(StreamOrderedCacheTest, FindNextReady) +{ + EventAlloc events; + ItemAlloc alloc; + detail::StreamOrderedCache cache(&alloc); + + int N = 801; + events.reserve(N); + CudaStream stream = CudaStream::Create(true); + + Hog hog; + + const int kMaxRetries = 10; + int retries = kMaxRetries; + + for (int split = 0; split < N; split += 20) + { + std::cout << split + 1 << "/" << N << std::endl; + events.clear(); + + int i; + for (i = 0; i < split; i++) + { + DummyPayload dp{}; + dp.ready = events.get(); + dp.size = i; + ASSERT_EQ(cudaSuccess, cudaEventRecord(dp.ready, stream.get())); + cache.put(std::move(dp)); + } + + hog.Run(stream.get(), 50); + + for (; i < N; i++) + { + DummyPayload dp{}; + dp.ready = events.get(); + dp.size = i; + ASSERT_EQ(cudaSuccess, cudaEventRecord(dp.ready, stream.get())); + cache.put(std::move(dp)); + } + + if (split > 0) 
+ { + ASSERT_EQ(cudaSuccess, cudaEventSynchronize(events.events[split - 1])); + } + auto *item = cache.findNewestReady(); + if (cudaEventQuery(events.events[split]) == cudaSuccess) + { + if (--retries < 0) + GTEST_SKIP() << "Unreliable test"; + split--; + cache.waitAndPurge(); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + continue; + } + retries = kMaxRetries; + + if (split == 0) + EXPECT_EQ(item, nullptr); + else + { + EXPECT_NE(item, nullptr); + if (item) + { + EXPECT_EQ(item->payload.size, split - 1); + } + } + + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + cache.waitAndPurge(); + } +} + +TEST(StreamOrderedCacheTest, RemoveAllReady) +{ + EventAlloc events; + ItemAlloc alloc; + detail::StreamOrderedCache cache(&alloc); + + CudaStream stream = CudaStream::Create(true); + int N = 801; + events.reserve(N); + + Hog hog; + + const int kMaxRetries = 10; + int retries = kMaxRetries; + + std::vector mask(N); + + for (int split = 0; split < N; split += 20) + { + std::cout << split + 1 << "/" << N << std::endl; + events.clear(); + for (int i = 0; i < N; i++) mask[i] = false; + + int i; + for (i = 0; i < split; i++) + { + DummyPayload dp{}; + dp.ready = events.get(); + dp.size = i; + ASSERT_EQ(cudaSuccess, cudaEventRecord(dp.ready, stream.get())); + cache.put(std::move(dp)); + } + + hog.Run(stream.get(), 50); + + for (; i < N; i++) + { + DummyPayload dp{}; + dp.ready = events.get(); + dp.size = i; + ASSERT_EQ(cudaSuccess, cudaEventRecord(dp.ready, stream.get())); + cache.put(std::move(dp)); + } + + if (split > 0) + { + ASSERT_EQ(cudaSuccess, cudaEventSynchronize(events.events[split - 1])); + } + cache.removeAllReady([&](const DummyPayload &p) { mask[p.size] = true; }); + if (cudaEventQuery(events.events[split]) != cudaErrorNotReady) + { + if (--retries < 0) + GTEST_SKIP() << "Unreliable test"; + split--; + cache.waitAndPurge(); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + continue; + } + retries = kMaxRetries; + for (int i = 0; i < N; i++) + { + EXPECT_EQ(mask[i], (i < split)) << "@ i = " << i << " split = " << split; + } + + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + cache.waitAndPurge(); + } +} + +TEST(PerStreamCacheTest, NoStream) +{ + PerStreamCache cache; + + EventAlloc events; + + DummyPayload p1{1000, 1, events.get()}; + DummyPayload p2{2000, 1, events.get()}; + DummyPayload p3{3000, 1, events.get()}; + cache.put(std::move(p1), std::nullopt); + cache.put(std::move(p2), std::nullopt); + cache.put(std::move(p3), std::nullopt); + auto v1 = cache.get(1001, 0, std::nullopt); + ASSERT_TRUE(v1.has_value()); + EXPECT_EQ(v1->size, 2000); + auto v2 = cache.get(900, 0, std::nullopt); + ASSERT_TRUE(v2.has_value()); + EXPECT_EQ(v2->size, 1000); + auto v3 = cache.get(900, 0, std::nullopt); + ASSERT_TRUE(v3.has_value()); + EXPECT_EQ(v3->size, 3000); +} + +TEST(PerStreamCacheTest, TwoStream) +{ + for (int attempt = 0; attempt < 10; attempt++) + { + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + EventAlloc events; + PerStreamCache cache; + + Hog hog; + + CudaStream s1 = CudaStream::Create(true); + CudaStream s2 = CudaStream::Create(true); + + DummyPayload p1{1000, 1, events.get()}; + DummyPayload p2{2000, 1, events.get()}; + DummyPayload p3{3000, 1, events.get()}; + DummyPayload p4{4000, 1, events.get()}; + + hog.Run(s1.get(), 100); + hog.Run(s2.get(), 100); + + ASSERT_EQ(cudaSuccess, cudaEventRecord(p1.ready, s1.get())); + ASSERT_EQ(cudaSuccess, cudaEventRecord(p2.ready, s2.get())); + ASSERT_EQ(cudaSuccess, cudaEventRecord(p3.ready, s1.get())); + ASSERT_EQ(cudaSuccess, 
cudaEventRecord(p4.ready, s2.get())); + + auto s = std::chrono::high_resolution_clock::now(); + cache.put(std::move(p1), s1.get()); + cache.put(std::move(p2), s2.get()); + cache.put(std::move(p3), s1.get()); + cache.put(std::move(p4), s2.get()); + auto e = std::chrono::high_resolution_clock::now(); + double insert_time = (e - s).count() / 4; + double stream_get_time = 0; + + s = std::chrono::high_resolution_clock::now(); + auto v0 = cache.get(1, 0, std::nullopt); + e = std::chrono::high_resolution_clock::now(); + double failed_get_time = (e - s).count(); + if (v0.has_value()) + { + if (cudaSuccess == cudaEventQuery(p1.ready) || cudaSuccess == cudaEventQuery(p1.ready)) + continue; + EXPECT_FALSE(v0.has_value()) << "The resources are not ready - none should be returned for null stream."; + } + + s = std::chrono::high_resolution_clock::now(); + auto v1s1 = cache.get(1001, 0, s1); + e = std::chrono::high_resolution_clock::now(); + stream_get_time = (e - s).count(); + ASSERT_TRUE(v1s1.has_value()); + EXPECT_EQ(v1s1->size, 3000); + + s = std::chrono::high_resolution_clock::now(); + auto v2s1 = cache.get(900, 0, s1); + e = std::chrono::high_resolution_clock::now(); + stream_get_time = (e - s).count(); + ASSERT_TRUE(v2s1.has_value()); + EXPECT_EQ(v2s1->size, 1000); + + s = std::chrono::high_resolution_clock::now(); + auto v1s2 = cache.get(900, 0, s2); + e = std::chrono::high_resolution_clock::now(); + stream_get_time = (e - s).count(); + stream_get_time /= 3; + ASSERT_TRUE(v1s2.has_value()); + EXPECT_EQ(v1s2->size, 2000); + + ASSERT_EQ(cudaSuccess, cudaEventSynchronize(events.events[3])); + s = std::chrono::high_resolution_clock::now(); + auto v0ready = cache.get(1, 0, std::nullopt); + e = std::chrono::high_resolution_clock::now(); + double get_time = (e - s).count(); + ASSERT_TRUE(v0ready.has_value()); + EXPECT_EQ(v0ready->size, 4000); + + std::cout << "Insert time = " << insert_time << "ns" << std::endl; + std::cout << "Get time (stream) = " << stream_get_time << "ns" << std::endl; + std::cout << "Get time (global, success) = " << get_time << "ns" << std::endl; + std::cout << "Get time (global, failed) = " << failed_get_time << "ns" << std::endl; + + return; + } + GTEST_SKIP() << "Test unreliable - cannot make the CPU wait for the GPU"; +} + +} // namespace nvcv::util diff --git a/tests/nvcv_types/unit/TestSimpleCache.cpp b/tests/nvcv_types/unit/TestSimpleCache.cpp new file mode 100644 index 00000000..4b970d25 --- /dev/null +++ b/tests/nvcv_types/unit/TestSimpleCache.cpp @@ -0,0 +1,92 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
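
Taken together, the cases from the StreamCacheItemAllocator stress test through TwoStream document the caching stack these unit tests target: payloads carry a size, an alignment and a CUDA event, the stream-ordered layer keeps them in event order so findNewestReady()/removeAllReady() can trim completed work, and PerStreamCache on top of it prefers entries recycled on the requesting stream (safe to reuse immediately thanks to stream ordering) and otherwise only hands out entries whose events have fired; with no stream given, only globally ready entries qualify. A rough consumer-side sketch, where the payload type, template parameter and include path are assumptions and the put/get shapes mirror the calls in the tests:

    // Sketch only: the include path and ScratchBuffer type are placeholders.
    #include <cstddef>
    #include <optional>

    #include <cuda_runtime.h>

    #include "nvcv/util/PerStreamCache.hpp" // assumed location

    struct ScratchBuffer
    {
        void       *mem       = nullptr;
        size_t      size      = 0;
        size_t      alignment = 1;
        cudaEvent_t ready     = nullptr; // recorded on the producing stream
    };

    nvcv::util::PerStreamCache<ScratchBuffer> g_cache;

    void Recycle(ScratchBuffer &&buf, cudaStream_t stream)
    {
        // Mark the point after which `buf` is no longer used on `stream`,
        // then hand it to the cache keyed by that stream.
        cudaEventRecord(buf.ready, stream);
        g_cache.put(std::move(buf), stream);
    }

    std::optional<ScratchBuffer> Acquire(size_t size, cudaStream_t stream)
    {
        // Same-stream entries can be returned right away; entries from other
        // streams are only returned once their ready event has completed.
        return g_cache.get(size, /*alignment*/ 256, stream);
    }
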
+ */ + +#include "Definitions.hpp" + +#include + +namespace { +struct Payload +{ + int data = 0; + bool destroyed = false; + bool movedOut = false; + + explicit Payload(int data) + : data(data) + { + } + + ~Payload() + { + data = -2; + destroyed = true; + } + + Payload(Payload &&p) + { + *this = std::move(p); + } + + Payload &operator=(Payload &&p) + { + data = p.data; + destroyed = p.destroyed; + movedOut = p.movedOut; + p.data = -1; + p.movedOut = true; + return *this; + } +}; + +} // namespace + +TEST(SimpleCacheTest, PutGet) +{ + nvcv::util::SimpleCache cache; + EXPECT_FALSE(cache.get().has_value()); + Payload p = cache.getOrCreate([]() { return Payload(42); }); + EXPECT_EQ(p.data, 42); + EXPECT_FALSE(p.destroyed); + EXPECT_FALSE(p.movedOut); + cache.put(std::move(p)); + cache.put(Payload(1234)); + cache.emplace(4321); + EXPECT_TRUE(p.movedOut); + EXPECT_FALSE(p.destroyed); + + std::optional o = cache.get(); + ASSERT_TRUE(o.has_value()); + EXPECT_EQ(o->data, 4321); + EXPECT_FALSE(o->destroyed); + EXPECT_FALSE(o->movedOut); + + o = cache.get(); + ASSERT_TRUE(o.has_value()); + EXPECT_EQ(o->data, 1234); + EXPECT_FALSE(o->destroyed); + EXPECT_FALSE(o->movedOut); + + o = cache.get(); + ASSERT_TRUE(o.has_value()); + EXPECT_EQ(o->data, 42); + EXPECT_FALSE(o->destroyed); + EXPECT_FALSE(o->movedOut); + + o = cache.get(); + EXPECT_FALSE(o.has_value()); +} diff --git a/tests/nvcv_types/unit/TestStreamId.cpp b/tests/nvcv_types/unit/TestStreamId.cpp new file mode 100644 index 00000000..b5108bbf --- /dev/null +++ b/tests/nvcv_types/unit/TestStreamId.cpp @@ -0,0 +1,128 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
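
The PutGet case fixes the SimpleCache contract: get() on an empty cache returns an empty optional, getOrCreate() falls back to the supplied factory, objects are handed back with put() or built in place with emplace(), and retrieval is last-in, first-out. A small sketch of that contract, with the cached type and include path as placeholders:

    // Sketch only: Scratch and the include path are assumptions.
    #include <optional>
    #include <string>
    #include <utility>

    #include "nvcv/util/SimpleCache.hpp" // assumed location

    struct Scratch
    {
        std::string tag;

        explicit Scratch(std::string t)
            : tag(std::move(t))
        {
        }
    };

    void SimpleCacheSketch()
    {
        nvcv::util::SimpleCache<Scratch> cache;

        // Empty cache: getOrCreate() runs the factory instead of reporting a miss.
        Scratch s = cache.getOrCreate([] { return Scratch("factory"); });

        cache.put(std::move(s));   // return the object to the cache
        cache.emplace("in-place"); // or construct a new one directly inside it

        // Entries come back in reverse insertion order (LIFO).
        std::optional<Scratch> a = cache.get(); // tag == "in-place"
        std::optional<Scratch> b = cache.get(); // tag == "factory"
        std::optional<Scratch> c = cache.get(); // empty
    }
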
+ */ + +#include "Definitions.hpp" + +#include +#include +#include + +#include + +TEST(StreamIdTest, RegularAndDefault) +{ + cudaStream_t stream1 = 0, stream2 = 0; + (void)cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + (void)cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); + if (!stream1 || !stream2) + { + if (stream1) + (void)cudaStreamDestroy(stream1); + if (stream2) + (void)cudaStreamDestroy(stream2); + FAIL() << "Could not create two CUDA streams"; + } + + uint64_t id1 = nvcv::util::GetCudaStreamIdHint(stream1); + uint64_t id2 = nvcv::util::GetCudaStreamIdHint(stream2); + uint64_t id3 = nvcv::util::GetCudaStreamIdHint(0); + EXPECT_NE(id1, id2); + EXPECT_NE(id1, id3); + EXPECT_NE(id2, id3); + (void)cudaStreamDestroy(stream1); + (void)cudaStreamDestroy(stream2); +} + +/** Tests that distinct streams with the same handle get different IDs + */ +TEST(StreamIdTest, HandleReuse) +{ + if (!nvcv::util::IsCudaStreamIdHintUnambiguous()) + GTEST_SKIP() << "This platform doesn't have an unambiguous CUDA stream id\n"; + + struct CudaDeleter + { + void operator()(void *p) + { + cudaFree(p); + } + }; + + auto CudaAlloc = [](size_t size) + { + void *ret = nullptr; + cudaMalloc(&ret, size); + return ret; + }; + + size_t bufSize = 256 << 20; // 256MiB + std::unique_ptr mem(CudaAlloc(bufSize)); + + cudaEvent_t e; + (void)cudaEventCreateWithFlags(&e, cudaEventDisableTiming); + + bool done = false; + int maxAttempts = 10; + for (int i = 0; i < maxAttempts; i++) + { + (void)cudaDeviceSynchronize(); + cudaStream_t stream1 = 0, stream2 = 0; + (void)cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + uint64_t id1 = nvcv::util::GetCudaStreamIdHint(stream1); + for (int i = 0; i < 10; i++) cudaMemsetAsync(mem.get(), i, bufSize, stream1); + cudaEventRecord(e, stream1); + if (stream1) + (void)cudaStreamDestroy(stream1); + (void)cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); + bool stillRunning = (cudaEventQuery(e) == cudaErrorNotReady); + uint64_t id2 = nvcv::util::GetCudaStreamIdHint(stream2); + if (stream2) + (void)cudaStreamDestroy(stream2); + if (stream1 != stream2) + continue; // no handle reuse - retry + if (!stillRunning) + continue; // the stream wasn't running - the ID may be the same without any harm + EXPECT_NE(id1, id2); + done = true; + break; + } + + (void)cudaEventDestroy(e); + + if (!done) + GTEST_SKIP() << "Could not trigger handle reuse - no way to conduct the test"; +} + +TEST(StreamIdTest, PerThreadDefault) +{ + const int N = 4; + std::vector threads(N); + std::vector ids(N); + for (int i = 0; i < N; i++) + { + threads[i] = std::thread( + [&, i]() + { + (void)cudaFree(0); // create/assign a context + ids[i] = nvcv::util::GetCudaStreamIdHint(cudaStreamPerThread); + }); + } + for (int i = 0; i < N; i++) threads[i].join(); + for (int i = 0; i < N - 1; i++) + for (int j = i + 1; j < N; j++) + EXPECT_NE(ids[i], ids[j]) << "Per-thread streams for threads " << i << " and " << j << " do not differ."; +} diff --git a/tests/run_tests.sh.in b/tests/run_tests.sh.in index 82cbefda..af178ca4 100755 --- a/tests/run_tests.sh.in +++ b/tests/run_tests.sh.in @@ -17,8 +17,16 @@ shopt -s extglob +# Defaults +test_set="all" curdir=$(dirname "$(readlink -f "$0")") +if [[ $# -ge 1 ]]; then + test_set=$1 +fi + +IFS="," read -r -a test_set <<< "$test_set" + function on_exit() { set +e @@ -43,8 +51,16 @@ trap 'on_exit' EXIT function run() { local testexec=$1 + local testgroup=$2 - echo "Running $testexec test suite..." 
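
The StreamIdTest cases spell out what the stream-id hint provides: distinct live streams, the legacy default stream and per-thread default streams on different threads all report different hints, and on platforms where IsCudaStreamIdHintUnambiguous() returns true the hint stays distinct even when a destroyed handle is recycled while earlier work is still in flight. A sketch of the intended use, keying per-stream state by the hint rather than by the raw handle; the state type, map and include path are assumptions:

    // Sketch only: PerStreamState, the map and the include path are placeholders.
    #include <cstdint>
    #include <unordered_map>

    #include <cuda_runtime.h>

    #include "nvcv/util/StreamId.hpp" // assumed location of GetCudaStreamIdHint

    struct PerStreamState
    {
        int reuseCount = 0;
    };

    std::unordered_map<uint64_t, PerStreamState> g_perStream;

    PerStreamState &StateFor(cudaStream_t stream)
    {
        // The hint survives handle recycling better than the raw cudaStream_t value,
        // but where it is ambiguous it must be treated as a best-effort key only.
        uint64_t id = nvcv::util::GetCudaStreamIdHint(stream);
        return g_perStream[id];
    }
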
+    for test in "${test_set[@]}"
+    do
+        if [ "$testgroup" == "$test" ] || [ "$test" == "all" ];then
+            echo "Running $testexec test suite..."
+            NVCV_LEAK_DETECTION=abort "$curdir/$testexec"
+            return
+        fi
+    done

-    NVCV_LEAK_DETECTION=abort "$curdir/$testexec"
+    echo "Skipping $testexec test suite..."
 }
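
With this change run_tests.sh takes an optional first argument: a comma-separated list of test groups. Omitting it, or passing "all", keeps the previous behaviour of running every suite, while passing specific group names (the groups are the second argument each run() call site supplies, which is outside this hunk) runs only the matching suites and prints a skip message for the others, e.g. ./run_tests.sh group_a,group_b with hypothetical group names.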