From 0765bc19481f771843e6340543b01fecd3f4967a Mon Sep 17 00:00:00 2001 From: Miles Price Date: Wed, 13 Dec 2023 17:39:45 -0800 Subject: [PATCH] feat: adding code for release v0.5.0 (beta-3) of CV-CUDA --- .github/workflows/codeql.yml | 89 + .gitmodules | 3 + 3rdparty/CMakeLists.txt | 8 + 3rdparty/nvbench | 1 + CMakeLists.txt | 9 +- CONTRIBUTING.md | 4 +- DEVELOPER_GUIDE.md | 13 +- LICENSE.md | 2 +- README.md | 238 ++- SECURITY.md | 17 + bench/BenchAdaptiveThreshold.cpp | 96 + bench/BenchAdvCvtColor.cpp | 74 + bench/BenchAverageBlur.cpp | 93 + bench/BenchBilateralFilter.cpp | 96 + bench/BenchBndBox.cpp | 97 + bench/BenchBoxBlur.cpp | 97 + bench/BenchBrightnessContrast.cpp | 91 + bench/BenchCenterCrop.cpp | 85 + bench/BenchChannelReorder.cpp | 73 + bench/BenchColorTwist.cpp | 85 + bench/BenchComposite.cpp | 91 + bench/BenchConv2D.cpp | 83 + bench/BenchConvertTo.cpp | 70 + bench/BenchCopyMakeBorder.cpp | 96 + bench/BenchCropFlipNormalizeReformat.cpp | 116 ++ bench/BenchCustomCrop.cpp | 70 + bench/BenchCvtColor.cpp | 83 + bench/BenchErase.cpp | 95 + bench/BenchFindContours.cpp | 126 ++ bench/BenchFlip.cpp | 99 + bench/BenchGammaContrast.cpp | 75 + bench/BenchGaussian.cpp | 96 + bench/BenchGaussianNoise.cpp | 87 + bench/BenchHistogram.cpp | 71 + bench/BenchHistogramEq.cpp | 77 + bench/BenchInpaint.cpp | 85 + bench/BenchJointBilateralFilter.cpp | 100 + bench/BenchLabel.cpp | 108 + bench/BenchLaplacian.cpp | 91 + bench/BenchMedianBlur.cpp | 86 + bench/BenchMinAreaRect.cpp | 70 + bench/BenchMinMaxLoc.cpp | 92 + bench/BenchMinMaxLoc.hpp | 121 ++ bench/BenchMorphology.cpp | 133 ++ bench/BenchNMS.cpp | 76 + bench/BenchNormalize.cpp | 99 + bench/BenchOSD.cpp | 94 + bench/BenchPadAndStack.cpp | 82 + bench/BenchPairwiseMatcher.cpp | 108 + bench/BenchPillowResize.cpp | 105 + bench/BenchRandomResizedCrop.cpp | 103 + bench/BenchReformat.cpp | 67 + bench/BenchRemap.cpp | 120 ++ bench/BenchResize.cpp | 97 + bench/BenchRotate.cpp | 91 + bench/BenchSIFT.cpp | 109 + bench/BenchStack.cpp | 68 + bench/BenchThreshold.cpp | 85 + bench/BenchUtils.hpp | 324 +++ bench/BenchWarpAffine.cpp | 95 + bench/BenchWarpPerspective.cpp | 95 + bench/CMakeLists.txt | 82 + bench/run_bench.py | 95 + ci/build.sh | 11 +- ci/check_formatting.sh | 42 - cmake/ConfigCompiler.cmake | 2 + cmake/PrintConfig.cmake | 6 + docker/build/Dockerfile | 6 +- docker/config | 3 +- docker/test/Dockerfile | 7 + docker/update_test_image.sh | 2 +- docs/sphinx/content/cvcuda_oplist.csv | 7 +- docs/sphinx/index.rst | 3 +- docs/sphinx/installation.rst | 2 +- docs/sphinx/relnotes/v0.5.0-beta.rst | 75 + .../samples/python_samples/classification.rst | 2 +- .../python_samples/object_detection.rst | 2 +- .../samples/python_samples/segmentation.rst | 2 +- python/mod_cvcuda/CMakeLists.txt | 18 + python/mod_cvcuda/ConnectivityType.cpp | 34 + python/mod_cvcuda/ConnectivityType.hpp | 30 + python/mod_cvcuda/InterpolationType.cpp | 4 +- python/mod_cvcuda/LabelType.cpp | 31 + python/mod_cvcuda/LabelType.hpp | 30 + python/mod_cvcuda/Main.cpp | 15 +- python/mod_cvcuda/NormType.cpp | 32 + python/mod_cvcuda/NormType.hpp | 30 + python/mod_cvcuda/OpAdvCvtColor.cpp | 2 - python/mod_cvcuda/OpCvtColor.cpp | 20 +- python/mod_cvcuda/OpFindContours.cpp | 18 +- python/mod_cvcuda/OpFindHomography.cpp | 330 ++++ python/mod_cvcuda/OpLabel.cpp | 210 ++ python/mod_cvcuda/OpPairwiseMatcher.cpp | 204 ++ python/mod_cvcuda/OpPillowResize.cpp | 99 +- python/mod_cvcuda/OpStack.cpp | 179 ++ python/mod_cvcuda/Operators.hpp | 8 + python/mod_cvcuda/OsdElement.cpp | 380 ++-- 
python/mod_cvcuda/OsdElement.hpp | 2 - python/mod_cvcuda/PairwiseMatcherType.cpp | 29 + python/mod_cvcuda/PairwiseMatcherType.hpp | 30 + python/mod_cvcuda/WorkspaceCache.cpp | 87 + python/mod_cvcuda/WorkspaceCache.hpp | 319 +++ python/mod_cvcuda/exports.ldscript | 22 + python/mod_nvcv/Array.cpp | 350 ++++ python/mod_nvcv/Array.hpp | 106 + python/mod_nvcv/CAPI.cpp | 48 + python/mod_nvcv/CMakeLists.txt | 9 + python/mod_nvcv/CastUtils.hpp | 47 + python/mod_nvcv/DLPackUtils.cpp | 56 +- python/mod_nvcv/DLPackUtils.hpp | 2 + python/mod_nvcv/ExternalBuffer.cpp | 6 +- python/mod_nvcv/Image.cpp | 16 +- python/mod_nvcv/ImageBatch.cpp | 23 + python/mod_nvcv/ImageBatch.hpp | 2 + python/mod_nvcv/Main.cpp | 2 + python/mod_nvcv/Resource.cpp | 2 - python/mod_nvcv/Tensor.cpp | 35 +- python/mod_nvcv/Tensor.hpp | 4 + python/mod_nvcv/TensorBatch.cpp | 261 +++ python/mod_nvcv/TensorBatch.hpp | 96 + python/mod_nvcv/exports.ldscript | 22 + python/mod_nvcv/include/nvcv/python/Array.hpp | 102 + python/mod_nvcv/include/nvcv/python/CAPI.hpp | 14 + python/mod_nvcv/include/nvcv/python/Fwd.hpp | 1 + python/mod_nvcv/include/nvcv/python/Shape.hpp | 25 + .../include/nvcv/python/TensorBatch.hpp | 113 ++ samples/CMakeLists.txt | 3 +- samples/README.md | 8 +- samples/common/python/interop_utils.py | 88 + samples/common/python/perf_utils.py | 10 +- samples/label/python/main.py | 316 +++ samples/object_detection/python/pipelines.py | 24 +- samples/scripts/benchmark.py | 15 +- samples/scripts/install_dependencies.sh | 2 +- samples/scripts/run_samples.sh | 6 + src/cvcuda/CMakeLists.txt | 6 +- src/cvcuda/OpFindHomography.cpp | 67 + src/cvcuda/OpLabel.cpp | 55 + src/cvcuda/OpPairwiseMatcher.cpp | 58 + src/cvcuda/OpPillowResize.cpp | 61 +- src/cvcuda/OpStack.cpp | 53 + src/cvcuda/include/cvcuda/OpFindHomography.h | 151 ++ .../include/cvcuda/OpFindHomography.hpp | 86 + src/cvcuda/include/cvcuda/OpLabel.h | 242 +++ src/cvcuda/include/cvcuda/OpLabel.hpp | 86 + src/cvcuda/include/cvcuda/OpPairwiseMatcher.h | 173 ++ .../include/cvcuda/OpPairwiseMatcher.hpp | 86 + src/cvcuda/include/cvcuda/OpPillowResize.h | 60 +- src/cvcuda/include/cvcuda/OpPillowResize.hpp | 49 +- src/cvcuda/include/cvcuda/OpSIFT.h | 4 +- src/cvcuda/include/cvcuda/OpStack.h | 121 ++ src/cvcuda/include/cvcuda/OpStack.hpp | 79 + src/cvcuda/include/cvcuda/Types.h | 136 +- src/cvcuda/include/cvcuda/Workspace.h | 104 + src/cvcuda/include/cvcuda/Workspace.hpp | 203 ++ src/cvcuda/priv/CMakeLists.txt | 9 +- src/cvcuda/priv/OpBndBox.cpp | 4 +- src/cvcuda/priv/OpBndBox.hpp | 2 +- .../priv/OpCropFlipNormalizeReformat.cu | 4 +- src/cvcuda/priv/OpFindHomography.cu | 1615 +++++++++++++++ src/cvcuda/priv/OpFindHomography.hpp | 78 + src/cvcuda/priv/OpLabel.cu | 1751 +++++++++++++++++ src/cvcuda/priv/OpLabel.hpp | 48 + src/cvcuda/priv/OpMinMaxLoc.cu | 24 +- src/cvcuda/priv/OpPairwiseMatcher.cu | 665 +++++++ src/cvcuda/priv/OpPairwiseMatcher.hpp | 49 + src/cvcuda/priv/OpPillowResize.cpp | 43 +- src/cvcuda/priv/OpPillowResize.hpp | 16 +- src/cvcuda/priv/OpSIFT.cu | 10 + src/cvcuda/priv/OpStack.cpp | 101 + src/cvcuda/priv/OpStack.hpp | 49 + src/cvcuda/priv/Types.hpp | 643 ++++++ src/cvcuda/priv/WorkspaceAllocator.hpp | 216 ++ src/cvcuda/priv/WorkspaceEstimator.hpp | 90 + src/cvcuda/priv/WorkspaceUtil.hpp | 24 + src/cvcuda/priv/legacy/CMakeLists.txt | 1 - src/cvcuda/priv/legacy/CvCudaLegacy.h | 178 +- src/cvcuda/priv/legacy/CvCudaOSD.hpp | 4 - src/cvcuda/priv/legacy/bnd_box.cu | 573 ------ src/cvcuda/priv/legacy/box_blur.cu | 25 +- src/cvcuda/priv/legacy/center_crop.cu | 5 - 
src/cvcuda/priv/legacy/convert_to.cu | 5 - src/cvcuda/priv/legacy/custom_crop.cu | 5 - src/cvcuda/priv/legacy/cvt_color.cu | 5 - src/cvcuda/priv/legacy/cvt_color_var_shape.cu | 14 +- src/cvcuda/priv/legacy/filter.cu | 5 - src/cvcuda/priv/legacy/find_contours.cu | 9 +- src/cvcuda/priv/legacy/flip.cu | 5 - .../priv/legacy/flip_or_copy_var_shape.cu | 5 - .../priv/legacy/histogram_eq_var_shape.cu | 4 +- src/cvcuda/priv/legacy/median_blur.cu | 5 - src/cvcuda/priv/legacy/min_area_rect.cu | 3 +- src/cvcuda/priv/legacy/normalize.cu | 5 - src/cvcuda/priv/legacy/osd.cu | 131 +- src/cvcuda/priv/legacy/pad_and_stack.cu | 5 - src/cvcuda/priv/legacy/pillow_resize.cu | 39 +- src/cvcuda/priv/legacy/pillow_resize.h | 5 +- .../priv/legacy/pillow_resize_var_shape.cu | 101 +- src/cvcuda/priv/legacy/reformat.cu | 5 - src/cvcuda/priv/legacy/resize.cu | 5 - src/cvcuda/priv/legacy/warp.cu | 7 +- src/cvcuda/priv/legacy/warp_var_shape.cu | 2 +- src/nvcv_types/Array.cpp | 17 + src/nvcv_types/CMakeLists.txt | 1 + src/nvcv_types/ImageBatch.cpp | 10 + src/nvcv_types/Tensor.cpp | 38 + src/nvcv_types/TensorBatch.cpp | 318 +++ src/nvcv_types/include/nvcv/Array.h | 14 + src/nvcv_types/include/nvcv/Array.hpp | 2 + src/nvcv_types/include/nvcv/Fwd.h | 9 +- src/nvcv_types/include/nvcv/ImageBatch.h | 6 +- src/nvcv_types/include/nvcv/ImageData.h | 2 +- src/nvcv_types/include/nvcv/Size.h | 42 + src/nvcv_types/include/nvcv/Size.hpp | 99 +- src/nvcv_types/include/nvcv/Tensor.h | 25 + src/nvcv_types/include/nvcv/Tensor.hpp | 6 + src/nvcv_types/include/nvcv/TensorBatch.h | 278 +++ src/nvcv_types/include/nvcv/TensorBatch.hpp | 244 +++ src/nvcv_types/include/nvcv/TensorBatchData.h | 65 + .../include/nvcv/TensorBatchData.hpp | 172 ++ .../include/nvcv/alloc/Allocator.hpp | 1 + .../include/nvcv/cuda/BorderVarShapeWrap.hpp | 16 +- .../nvcv/cuda/ImageBatchVarShapeWrap.hpp | 9 +- .../include/nvcv/cuda/math/LinAlg.hpp | 2 +- src/nvcv_types/include/nvcv/detail/Align.hpp | 106 + .../include/nvcv/detail/ArrayImpl.hpp | 7 + .../include/nvcv/detail/TensorBatchImpl.hpp | 266 +++ .../include/nvcv/detail/TensorImpl.hpp | 9 + src/nvcv_types/priv/Array.cpp | 9 + src/nvcv_types/priv/Array.hpp | 2 + src/nvcv_types/priv/ArrayWrapData.cpp | 8 + src/nvcv_types/priv/ArrayWrapData.hpp | 2 + src/nvcv_types/priv/CMakeLists.txt | 1 + src/nvcv_types/priv/Context.cpp | 5 +- src/nvcv_types/priv/Context.hpp | 14 +- src/nvcv_types/priv/HandleTraits.hpp | 6 + src/nvcv_types/priv/IArray.hpp | 2 + src/nvcv_types/priv/IContext.hpp | 15 +- src/nvcv_types/priv/ITensorBatch.hpp | 66 + src/nvcv_types/priv/TensorBatch.cpp | 339 ++++ src/nvcv_types/priv/TensorBatch.hpp | 107 + src/nvcv_types/priv/TensorBatchManager.hpp | 36 + src/nvcv_types/priv/TensorData.cpp | 164 +- src/nvcv_types/priv/TensorData.hpp | 5 +- src/util/CMakeLists.txt | 3 + src/util/Event.cpp | 56 + src/util/Event.hpp | 59 + src/util/PerStreamCache.hpp | 265 +++ src/util/PerStreamCacheImpl.hpp | 330 ++++ src/util/SimpleCache.hpp | 137 ++ src/util/Stream.cpp | 70 + src/util/Stream.hpp | 58 + src/util/StreamId.cpp | 150 ++ src/util/StreamId.hpp | 47 + src/util/TensorDataUtils.cpp | 150 +- src/util/TensorDataUtils.hpp | 105 +- src/util/UniqueHandle.hpp | 191 ++ tests/CMakeLists.txt | 4 +- tests/common/CheckStatus.hpp | 11 + tests/cvcuda/CMakeLists.txt | 5 +- tests/cvcuda/python/CMakeLists.txt | 2 +- tests/cvcuda/python/cvcuda_test_python.in | 22 +- tests/cvcuda/python/cvcuda_util.py | 24 +- .../python/test_adaptivethresholdtype.py | 1 + tests/cvcuda/python/test_bordertype.py | 3 +- 
tests/cvcuda/python/test_import_order.py | 25 + tests/cvcuda/python/test_interptype.py | 3 +- .../cvcuda/python/test_opadaptivethreshold.py | 1 + tests/cvcuda/python/test_opbndbox.py | 115 +- tests/cvcuda/python/test_opboxblur.py | 25 +- tests/cvcuda/python/test_opfindcontours.py | 12 +- tests/cvcuda/python/test_opfindhomography.py | 92 + tests/cvcuda/python/test_ophistogram.py | 2 +- tests/cvcuda/python/test_oplabel.py | 135 ++ tests/cvcuda/python/test_opmatch.py | 212 ++ tests/cvcuda/python/test_opmorphology.py | 2 +- tests/cvcuda/python/test_opnms.py | 2 +- tests/cvcuda/python/test_oposd.py | 193 +- tests/cvcuda/python/test_oppillowresize.py | 2 +- tests/cvcuda/python/test_opreformat.py | 2 +- tests/cvcuda/python/test_opremap.py | 2 +- tests/cvcuda/python/test_opstack.py | 103 + tests/cvcuda/python/test_opwarpperspective.py | 11 + tests/cvcuda/python/test_util.py | 2 +- tests/cvcuda/system/CMakeLists.txt | 8 +- tests/cvcuda/system/OsdUtils.cu | 16 +- tests/cvcuda/system/OsdUtils.cuh | 1 - tests/cvcuda/system/TestOpBndBox.cpp | 28 +- tests/cvcuda/system/TestOpBoxBlur.cpp | 30 +- tests/cvcuda/system/TestOpFindHomography.cpp | 394 ++++ tests/cvcuda/system/TestOpLabel.cpp | 835 ++++++++ tests/cvcuda/system/TestOpOSD.cpp | 365 ++-- tests/cvcuda/system/TestOpPairwiseMatcher.cpp | 442 +++++ tests/cvcuda/system/TestOpPillowResize.cpp | 16 +- tests/cvcuda/system/TestOpStack.cpp | 190 ++ tests/cvcuda/system/TestOpWarpPerspective.cpp | 2 +- tests/cvcuda/unit/CMakeLists.txt | 34 + tests/cvcuda/unit/Definitions.hpp | 26 + tests/cvcuda/unit/TestWorkspaceAllocator.cpp | 203 ++ tests/cvcuda/unit/TestWorkspaceEstimator.cpp | 69 + .../cudatools_system/CMakeLists.txt | 2 +- .../DeviceBorderVarShapeWrap.cu | 4 +- .../DeviceImageBatchVarShapeWrap.cu | 2 +- .../nvcv_types/cudatools_unit/CMakeLists.txt | 4 +- tests/nvcv_types/python/CMakeLists.txt | 2 +- .../python/nvcv_test_types_python.in | 20 +- tests/nvcv_types/python/test_image.py | 4 +- .../python/test_imgbatchvarshape.py | 55 +- tests/nvcv_types/python/test_import_order.py | 25 + tests/nvcv_types/python/test_stream.py | 2 +- tests/nvcv_types/python/test_tensor.py | 134 +- tests/nvcv_types/python/test_tensor_batch.py | 227 +++ tests/nvcv_types/system/CMakeLists.txt | 5 +- tests/nvcv_types/system/TestColorSpec.cpp | 145 +- tests/nvcv_types/system/TestDataLayout.cpp | 53 + tests/nvcv_types/system/TestImageBatch.cpp | 269 +++ tests/nvcv_types/system/TestSize.cpp | 16 +- tests/nvcv_types/system/TestTensorBatch.cpp | 467 +++++ .../nvcv_types/system/TestTensorDataUtils.cpp | 37 +- tests/nvcv_types/unit/CMakeLists.txt | 6 +- tests/nvcv_types/unit/TestPerStreamCache.cpp | 396 ++++ tests/nvcv_types/unit/TestSimpleCache.cpp | 92 + tests/nvcv_types/unit/TestStreamId.cpp | 128 ++ tests/run_tests.sh.in | 20 +- 324 files changed, 25820 insertions(+), 2246 deletions(-) create mode 100644 .github/workflows/codeql.yml create mode 160000 3rdparty/nvbench create mode 100644 bench/BenchAdaptiveThreshold.cpp create mode 100644 bench/BenchAdvCvtColor.cpp create mode 100644 bench/BenchAverageBlur.cpp create mode 100644 bench/BenchBilateralFilter.cpp create mode 100644 bench/BenchBndBox.cpp create mode 100644 bench/BenchBoxBlur.cpp create mode 100644 bench/BenchBrightnessContrast.cpp create mode 100644 bench/BenchCenterCrop.cpp create mode 100644 bench/BenchChannelReorder.cpp create mode 100644 bench/BenchColorTwist.cpp create mode 100644 bench/BenchComposite.cpp create mode 100644 bench/BenchConv2D.cpp create mode 100644 bench/BenchConvertTo.cpp create mode 100644 
bench/BenchCopyMakeBorder.cpp create mode 100644 bench/BenchCropFlipNormalizeReformat.cpp create mode 100644 bench/BenchCustomCrop.cpp create mode 100644 bench/BenchCvtColor.cpp create mode 100644 bench/BenchErase.cpp create mode 100644 bench/BenchFindContours.cpp create mode 100644 bench/BenchFlip.cpp create mode 100644 bench/BenchGammaContrast.cpp create mode 100644 bench/BenchGaussian.cpp create mode 100644 bench/BenchGaussianNoise.cpp create mode 100644 bench/BenchHistogram.cpp create mode 100644 bench/BenchHistogramEq.cpp create mode 100644 bench/BenchInpaint.cpp create mode 100644 bench/BenchJointBilateralFilter.cpp create mode 100644 bench/BenchLabel.cpp create mode 100644 bench/BenchLaplacian.cpp create mode 100644 bench/BenchMedianBlur.cpp create mode 100644 bench/BenchMinAreaRect.cpp create mode 100644 bench/BenchMinMaxLoc.cpp create mode 100644 bench/BenchMinMaxLoc.hpp create mode 100644 bench/BenchMorphology.cpp create mode 100644 bench/BenchNMS.cpp create mode 100644 bench/BenchNormalize.cpp create mode 100644 bench/BenchOSD.cpp create mode 100644 bench/BenchPadAndStack.cpp create mode 100644 bench/BenchPairwiseMatcher.cpp create mode 100644 bench/BenchPillowResize.cpp create mode 100644 bench/BenchRandomResizedCrop.cpp create mode 100644 bench/BenchReformat.cpp create mode 100644 bench/BenchRemap.cpp create mode 100644 bench/BenchResize.cpp create mode 100644 bench/BenchRotate.cpp create mode 100644 bench/BenchSIFT.cpp create mode 100644 bench/BenchStack.cpp create mode 100644 bench/BenchThreshold.cpp create mode 100644 bench/BenchUtils.hpp create mode 100644 bench/BenchWarpAffine.cpp create mode 100644 bench/BenchWarpPerspective.cpp create mode 100644 bench/CMakeLists.txt create mode 100644 bench/run_bench.py delete mode 100755 ci/check_formatting.sh create mode 100644 docs/sphinx/relnotes/v0.5.0-beta.rst create mode 100644 python/mod_cvcuda/ConnectivityType.cpp create mode 100644 python/mod_cvcuda/ConnectivityType.hpp create mode 100644 python/mod_cvcuda/LabelType.cpp create mode 100644 python/mod_cvcuda/LabelType.hpp create mode 100644 python/mod_cvcuda/NormType.cpp create mode 100644 python/mod_cvcuda/NormType.hpp create mode 100644 python/mod_cvcuda/OpFindHomography.cpp create mode 100644 python/mod_cvcuda/OpLabel.cpp create mode 100644 python/mod_cvcuda/OpPairwiseMatcher.cpp create mode 100644 python/mod_cvcuda/OpStack.cpp create mode 100644 python/mod_cvcuda/PairwiseMatcherType.cpp create mode 100644 python/mod_cvcuda/PairwiseMatcherType.hpp create mode 100644 python/mod_cvcuda/WorkspaceCache.cpp create mode 100644 python/mod_cvcuda/WorkspaceCache.hpp create mode 100644 python/mod_cvcuda/exports.ldscript create mode 100644 python/mod_nvcv/Array.cpp create mode 100644 python/mod_nvcv/Array.hpp create mode 100644 python/mod_nvcv/CastUtils.hpp create mode 100644 python/mod_nvcv/TensorBatch.cpp create mode 100644 python/mod_nvcv/TensorBatch.hpp create mode 100644 python/mod_nvcv/exports.ldscript create mode 100644 python/mod_nvcv/include/nvcv/python/Array.hpp create mode 100644 python/mod_nvcv/include/nvcv/python/TensorBatch.hpp create mode 100644 samples/common/python/interop_utils.py create mode 100644 samples/label/python/main.py create mode 100644 src/cvcuda/OpFindHomography.cpp create mode 100644 src/cvcuda/OpLabel.cpp create mode 100644 src/cvcuda/OpPairwiseMatcher.cpp create mode 100644 src/cvcuda/OpStack.cpp create mode 100644 src/cvcuda/include/cvcuda/OpFindHomography.h create mode 100644 src/cvcuda/include/cvcuda/OpFindHomography.hpp create mode 100644 
src/cvcuda/include/cvcuda/OpLabel.h create mode 100644 src/cvcuda/include/cvcuda/OpLabel.hpp create mode 100644 src/cvcuda/include/cvcuda/OpPairwiseMatcher.h create mode 100644 src/cvcuda/include/cvcuda/OpPairwiseMatcher.hpp create mode 100644 src/cvcuda/include/cvcuda/OpStack.h create mode 100644 src/cvcuda/include/cvcuda/OpStack.hpp create mode 100644 src/cvcuda/include/cvcuda/Workspace.h create mode 100644 src/cvcuda/include/cvcuda/Workspace.hpp create mode 100644 src/cvcuda/priv/OpFindHomography.cu create mode 100644 src/cvcuda/priv/OpFindHomography.hpp create mode 100644 src/cvcuda/priv/OpLabel.cu create mode 100644 src/cvcuda/priv/OpLabel.hpp create mode 100644 src/cvcuda/priv/OpPairwiseMatcher.cu create mode 100644 src/cvcuda/priv/OpPairwiseMatcher.hpp create mode 100644 src/cvcuda/priv/OpStack.cpp create mode 100644 src/cvcuda/priv/OpStack.hpp create mode 100644 src/cvcuda/priv/Types.hpp create mode 100644 src/cvcuda/priv/WorkspaceAllocator.hpp create mode 100644 src/cvcuda/priv/WorkspaceEstimator.hpp create mode 100644 src/cvcuda/priv/WorkspaceUtil.hpp delete mode 100644 src/cvcuda/priv/legacy/bnd_box.cu create mode 100644 src/nvcv_types/TensorBatch.cpp create mode 100644 src/nvcv_types/include/nvcv/Size.h create mode 100644 src/nvcv_types/include/nvcv/TensorBatch.h create mode 100644 src/nvcv_types/include/nvcv/TensorBatch.hpp create mode 100644 src/nvcv_types/include/nvcv/TensorBatchData.h create mode 100644 src/nvcv_types/include/nvcv/TensorBatchData.hpp create mode 100644 src/nvcv_types/include/nvcv/detail/Align.hpp create mode 100644 src/nvcv_types/include/nvcv/detail/TensorBatchImpl.hpp create mode 100644 src/nvcv_types/priv/ITensorBatch.hpp create mode 100644 src/nvcv_types/priv/TensorBatch.cpp create mode 100644 src/nvcv_types/priv/TensorBatch.hpp create mode 100644 src/nvcv_types/priv/TensorBatchManager.hpp create mode 100644 src/util/Event.cpp create mode 100644 src/util/Event.hpp create mode 100644 src/util/PerStreamCache.hpp create mode 100644 src/util/PerStreamCacheImpl.hpp create mode 100644 src/util/SimpleCache.hpp create mode 100644 src/util/Stream.cpp create mode 100644 src/util/Stream.hpp create mode 100644 src/util/StreamId.cpp create mode 100644 src/util/StreamId.hpp create mode 100644 src/util/UniqueHandle.hpp create mode 100644 tests/cvcuda/python/test_import_order.py create mode 100644 tests/cvcuda/python/test_opfindhomography.py create mode 100644 tests/cvcuda/python/test_oplabel.py create mode 100644 tests/cvcuda/python/test_opmatch.py create mode 100644 tests/cvcuda/python/test_opstack.py create mode 100644 tests/cvcuda/system/TestOpFindHomography.cpp create mode 100644 tests/cvcuda/system/TestOpLabel.cpp create mode 100644 tests/cvcuda/system/TestOpPairwiseMatcher.cpp create mode 100644 tests/cvcuda/system/TestOpStack.cpp create mode 100644 tests/cvcuda/unit/CMakeLists.txt create mode 100644 tests/cvcuda/unit/Definitions.hpp create mode 100644 tests/cvcuda/unit/TestWorkspaceAllocator.cpp create mode 100644 tests/cvcuda/unit/TestWorkspaceEstimator.cpp create mode 100644 tests/nvcv_types/python/test_import_order.py create mode 100644 tests/nvcv_types/python/test_tensor_batch.py create mode 100644 tests/nvcv_types/system/TestTensorBatch.cpp create mode 100644 tests/nvcv_types/unit/TestPerStreamCache.cpp create mode 100644 tests/nvcv_types/unit/TestSimpleCache.cpp create mode 100644 tests/nvcv_types/unit/TestStreamId.cpp diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000..2843346a --- /dev/null +++ 
b/.github/workflows/codeql.yml @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +name: "CodeQL" + +on: [push, pull_request] + +jobs: + analyze: + name: Analyze + runs-on: ${{ (matrix.language == 'c-cpp' && 'ubuntu-20.04') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'c-cpp', 'javascript-typescript', 'python' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + lfs: true + submodules: 'recursive' + + - if: matrix.language == 'c-cpp' + name: Setup environment + run: | + sudo apt update -y && sudo apt install -y --no-install-recommends \ + git git-lfs gcc-11 g++-11 ninja-build ccache libgtest-dev libgmock-dev \ + shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils + + - if: matrix.language == 'c-cpp' + name: Install Python Dependencies + run: | + python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 breathe exhale \ + recommonmark graphviz numpy==1.24.1 + + - if: matrix.language == 'c-cpp' + name: Install CUDA Toolkit + uses: Jimver/cuda-toolkit@v0.2.11 + id: cuda-toolkit + with: + cuda: '11.7.1' + linux-local-args: '["--toolkit"]' + + - if: matrix.language == 'c-cpp' + name: Verify CUDA installation + run: | + echo "Installed CUDA version is: ${{ steps.cuda-toolkit.outputs.cuda }}" + echo "CUDA install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}" + nvcc -V + + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - if: matrix.language != 'c-cpp' + name: Autobuild + uses: github/codeql-action/autobuild@v2 + + - if: matrix.language == 'c-cpp' + name: Build CMake project + run: | + echo "Running CMake project build script" + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_PYTHON=1" $* + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" diff --git a/.gitmodules b/.gitmodules index b09e10a1..9de0bf30 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,3 +22,6 @@ [submodule "3rdparty/dlpack"] path = 3rdparty/dlpack url = https://github.com/dmlc/dlpack.git +[submodule "3rdparty/nvbench"] + path = 3rdparty/nvbench + url = https://github.com/NVIDIA/nvbench.git diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 98933a1a..51e72f3c 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -38,3 +38,11 @@ set(DLPACK_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/dlpack" PARENT_SCOPE) # cuOSD ----------------------------- set(CUOSD_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cuOSD" PARENT_SCOPE) + +# NVBench -------------------------------- +set(NVBENCH_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/nvbench" 
PARENT_SCOPE) + +if(BUILD_BENCH) + set(NVBench_ENABLE_CUPTI off) + add_subdirectory(nvbench) +endif() diff --git a/3rdparty/nvbench b/3rdparty/nvbench new file mode 160000 index 00000000..75212298 --- /dev/null +++ b/3rdparty/nvbench @@ -0,0 +1 @@ +Subproject commit 75212298727e8f6e1df9215f2fcb47c8c721ffc9 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d2515cf..6256d837 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.20.1) # We need to check this variable before starting a CUDA project - otherwise it will appear # as set, with the default value pointing to the oldest supported architecture (52 as of CUDA 11.8) @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.4.0 + VERSION 0.5.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) @@ -48,6 +48,7 @@ endif() # Options to configure the build tree ======= option(BUILD_TESTS "Enable testsuite" OFF) option(BUILD_PYTHON "Build python bindings" OFF) +option(BUILD_BENCH "Build benchmark" OFF) option(ENABLE_SANITIZER "Enabled sanitized build" OFF) # Configure build tree ====================== @@ -85,6 +86,10 @@ if(BUILD_SAMPLES) add_subdirectory(samples) endif() +if(BUILD_BENCH) + add_subdirectory(bench) +endif() + # Must be done after build tree is defined include(ConfigCPack) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 31387c3c..89506b78 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ # Contributing to CV-CUDA -**As of release v0.4.0-beta, CV-CUDA is not accepting outside contribution.** +**As of release v0.5.0-beta, CV-CUDA is not accepting outside contribution.** Contributions to CV-CUDA fall into the following categories: @@ -12,7 +12,7 @@ Contributions to CV-CUDA fall into the following categories: 1. To propose a new feature, please file a new feature request [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). Describe the intended feature and discuss the design and implementation with the team and - community. NOTE: Currently, as of release v0.4.0-beta, CV-CUDA is not accepting + community. NOTE: Currently, as of release v0.5.0-beta, CV-CUDA is not accepting outside contribution. 1. To ask a general question, please sumbit a question [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). If you need diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 483b6e87..ae4f0923 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -24,7 +24,7 @@ CV-CUDA includes: | Advanced Color Format Conversions | Performs color conversion from interleaved RGB/BGR <-> YUV/YVU and semi planar. Supported standards: BT.601. BT.709. 
BT.2020 | | AverageBlur | Reduces image noise using an average filter | | BilateralFilter | Reduces image noise while preserving strong edges | -| Bounding Box | Draws an rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image | +| Bounding Box | Draws a rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image | | Box Blurring | Overlays a blurred rectangle using the X-Y coordinates and dimensions that define the location and size of an object in an image | | Brightness_Contrast | Adjusts brightness and contrast of an image | | CenterCrop | Crops an image at its center | @@ -38,6 +38,7 @@ CV-CUDA includes: | DataTypeConvert | Converts an image’s data type with optional scaling | | Erase | Erases image regions | | Find Contours | Extract closed contours from an input binary image | +| FindHomography | Calculates a perspective transform from four pairs of the corresponding points | | Flip | Flips a 2D image around its axis | | GammaContrast | Adjusts image contrast | | Gaussian | Applies a gaussian blur filter to the image | @@ -45,18 +46,20 @@ CV-CUDA includes: | Histogram | Provides a grayscale value distribution showing the frequency of occurrence of each gray value. | | Histogram Equalizer | Allows effective spreading out the intensity range of the image typically used to improve contrast | | Inpainting | Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood | -| Joint Bilateral Filter | Provides a edge-preserving denoising filter | +| Joint Bilateral Filter | Reduces image noise while preserving strong edges based on a guidance image | +| Label | Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels | | Laplacian | Applies a Laplace transform to an image | | MedianBlur | Reduces an image’s salt-and-pepper noise | | MinArea Rect | Finds the minimum area rotated rectangle typically used to draw bounding rectangle with minimum area | | MinMaxLoc | Finds the maximum and minimum values in a given array | | Morphology | Performs morphological erode and dilate transformations | -| Morphology (close) | Performs morphological operation that involves dilation followed by erosion on an image | -| Morphology (open) | Performs morphological operation that involves erosion followed by dilation on an image | +| Morphology (close) | Performs a morphological operation that involves dilation followed by erosion on an image | +| Morphology (open) | Performs a morphological operation that involves erosion followed by dilation on an image | | Non-max Suppression | Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection | | Normalize | Normalizes an image pixel’s range | -| OSD (Polyline Line Text Rotated Rect Segmented Mask) | Displays an overlay on the image of of different forms including polyline line text rotated rectangle segmented mask | +| OSD (Polyline Line Text Rotated Rect Segmented Mask) | Displays an overlay on the image of different forms including polyline line text rotated rectangle segmented mask | | PadStack | Stacks several images into a tensor with border extension | +| PairwiseMatcher | Matches features computed separately (e.g. 
via the SIFT operator) in two images using the brute force method | | PillowResize | Changes the size and scale of an image using python-pillow algorithm | | RandomResizedCrop | Crops a random portion of an image and resizes it to a specified size. | | Reformat | Converts a planar image into non-planar and vice versa | diff --git a/LICENSE.md b/LICENSE.md index 00ac932d..f0b0397a 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -74,7 +74,7 @@ END OF TERMS AND CONDITIONS To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index 155bcd8a..95dd9223 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,13 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.4.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.5.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2-gray) [![Cuda](https://img.shields.io/badge/CUDA-v11.7-%2376B900?logo=nvidia)](https://developer.nvidia.com/cuda-toolkit-archive) [![GCC](https://img.shields.io/badge/GCC-v11.0-yellow)](https://gcc.gnu.org/gcc-11/changes.html) -[![Python](https://img.shields.io/badge/python-v3.7_%7c_v3.8_%7c_v3.10-blue?logo=python)](https://www.python.org/) +[![Python](https://img.shields.io/badge/python-v3.7_%7c_v3.8_%7c_v3.9_%7c_v3.10-blue?logo=python)](https://www.python.org/) [![CMake](https://img.shields.io/badge/CMake-v3.22-%23008FBA?logo=cmake)](https://cmake.org/) CV-CUDA is an open-source project that enables building efficient cloud-scale @@ -18,7 +18,7 @@ efficient pre- and post-processing pipelines. CV-CUDA originated as a collaborative effort between [NVIDIA][NVIDIA Develop] and [ByteDance][ByteDance]. Refer to our [Developer Guide](DEVELOPER_GUIDE.md) for more information on the -operators available as of release v0.4.0-beta. +operators available as of release v0.5.0-beta. ## Getting Started @@ -45,170 +45,150 @@ packages. Choose the installation method that meets your environment needs. 
#### Tar File Installation ```shell -tar -xvf nvcv-lib-0.4.0-cuda11-x86_64-linux.tar.xz -tar -xvf nvcv-dev-0.4.0-cuda11-x86_64-linux.tar.xz +tar -xvf nvcv-lib-0.5.0-cuda11-x86_64-linux.tar.xz +tar -xvf nvcv-dev-0.5.0-cuda11-x86_64-linux.tar.xz ``` #### DEB File Installation ```shell -sudo apt-get install -y ./nvcv-lib-0.4.0-cuda11-x86_64-linux.deb ./nvcv-dev-0.4.0-cuda11-x86_64-linux.deb +sudo apt-get install -y ./nvcv-lib-0.5.0-cuda11-x86_64-linux.deb ./nvcv-dev-0.5.0-cuda11-x86_64-linux.deb ``` #### Python WHL File Installation ```shell -pip install nvcv_python-0.4.0-cp38-cp38-linux_x86_64.whl +pip install nvcv_python-0.5.0-cp38-cp38-linux_x86_64.whl ``` ### Build from Source -Follow these instruction to build CV-CUDA from source: +Building CV-CUDA from source allows for customization and is essential for contributing to the project. Here are detailed steps to guide you through the process: -1. Set up your local CV-CUDA repository +#### 1. Repository Setup - 1. Install prerequisites needed to setup up the repository. + Before you begin, ensure you have cloned the CV-CUDA repository to your local machine. Let's assume you've cloned it into `~/cvcuda`. - On Ubuntu 22.04, install the following packages: - - git-lfs: to retrieve binary files from remote repository + - **Initialize the Repository**: + After cloning, initialize the repository to configure it correctly. This setup is required only once. - ```shell - sudo apt-get install -y git git-lfs - ``` + ```shell + cd ~/cvcuda + ./init_repo.sh + ``` - 2. After cloning the repository (assuming it was cloned in `~/cvcuda`), - it needs to be properly configured by running the `init_repo.sh` script only once. +#### 2. Install Build Dependencies - ```shell - cd ~/cvcuda - ./init_repo.sh - ``` + CV-CUDA requires several dependencies to build from source. The following steps are based on Ubuntu 22.04, but similar packages can be found for other distributions. -1. Build CV-CUDA + - **Install Essential Packages**: + These include the compiler, build system, and necessary libraries. - 1. Install the dependencies required for building CV-CUDA + ```shell + sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev + ``` - On Ubuntu 22.04, install the following packages: - - g++-11: compiler to be used - - cmake, ninja-build (optional): manage build rules - - python3-dev: for python bindings - - libssl-dev: needed by the testsuite (MD5 hashing utilities) + - **CUDA Toolkit**: + The CUDA Toolkit is essential for GPU acceleration. Although any 11.x version is compatible, 11.7 is recommended. - ```shell - sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev - ``` + ```shell + sudo apt-get install -y cuda-minimal-build-11-7 + ``` - For CUDA Toolkit, any version of the 11.x series should work. - CV-CUDA was tested with 11.7, thus it should be preferred. +#### 3. Build Process - ```shell - sudo apt-get install -y cuda-minimal-build-11-7 - ``` + Once the dependencies are in place, you can proceed to build CV-CUDA. - 2. Build the project + - **Run Build Script**: + A build script is provided to simplify the compilation process. It creates a build tree and compiles the source code. - ```shell - ci/build.sh - ``` + ```shell + ci/build.sh + ``` - This will compile a x86 release build of CV-CUDA inside `build-rel` directory. - The library is in build-rel/lib, docs in build-rel/docs and executables - (tests, etc...) are in build-rel/bin. + This script creates a release build by default, placing output in `build-rel`. 
You can specify a debug build or a different output directory: - The script accepts some parameters to control the creation of the build tree: + ```shell + ci/build.sh [release|debug] [output build tree path] + ``` - ```shell - ci/build.sh [release|debug] [output build tree path] - ``` +#### 4. Build Documentation (Optional) - By default it builds for release. + If you need to build the documentation, additional dependencies are required: - If output build tree path isn't specified, it'll be `build-rel` for release - builds, and `build-deb` for debug. + - **Install Documentation Dependencies**: + These tools are used to generate and format the documentation. -1. Build Documentation + ```shell + sudo apt-get install -y doxygen graphviz python3 python3-pip + sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme + ``` - 1. Install the dependencies required for building the documentation + - **Generate Documentation**: + Use the provided script to build the documentation. - On Ubuntu 22.04, install the following packages: - - doxygen: parse header files for reference documentation - - python3, python3-pip: to install some python packages needed - - sphinx, breathe, exhale, recommonmark, graphiviz: to render the documentation - - sphinx-rtd-theme: documenation theme used + ```shell + ci/build_docs.sh [build folder] + ``` - ```shell - sudo apt-get install -y doxygen graphviz python3 python3-pip - sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme - ``` + For example: - 2. Build the documentation - ```shell - ci/build_docs.sh [build folder] - ``` + ```shell + ci/build_docs.sh build_docs + ``` - Example: - `ci/build_docs.sh build_docs` +#### 5. Build and Run Samples (Optional) -1. Build and run Samples + CV-CUDA comes with a variety of samples to demonstrate its capabilities. - 1. For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation. + - **See the Samples Documentation**: + Detailed instructions for building and running samples are available in the [Samples](samples/README.md) documentation. -1. Run Tests +#### 6. Running Tests - 1. Install the dependencies required for running the tests + To ensure everything is working as expected, you can run CV-CUDA's test suite. - On Ubuntu 22.04, install the following packages: - - python3, python3-pip: to run python bindings tests - - torch: dependencies needed by python bindings tests + - **Install Test Dependencies**: + These are necessary to run the Python binding tests. - ```shell - sudo apt-get install -y python3 python3-pip - sudo python3 -m pip install pytest torch - ``` + ```shell + sudo apt-get install -y python3 python3-pip + sudo python3 -m pip install pytest torch + ``` - 2. Run the tests + - **Execute Tests**: + Run the test scripts located in the build tree. - The tests are in `/bin`. You can run the script below to run all - tests at once. Here's an example when build tree is created in `build-rel` + ```shell + build-rel/bin/run_tests.sh + ``` - ```shell - build-rel/bin/run_tests.sh - ``` +#### 7. Packaging -1. Package installers + After a successful build, you can create installers using `cpack`. - Installers can be generated using the following cpack command once you have successfully built the project + - **Generate Installers**: + This step produces Debian packages and tarballs, suitable for distribution or installation on other systems. - ```shell - cd build-rel - cpack . 
- ``` + ```shell + cd build-rel + cpack . + ``` - This will generate in the build directory both Debian installers and tarballs - (\*.tar.xz), needed for integration in other distros. + For specific installer types: - For a fine-grained choice of what installers to generate, the full syntax is: + ```shell + cpack . -G [DEB|TXZ] + ``` - ```shell - cpack . -G [DEB|TXZ] - ``` - - - DEB for Debian packages - - TXZ for \*.tar.xz tarballs. - -## Tools - -1. CV-CUDA make operator tool - - This tool will create an noop operator; python bindings, and tests. - - This tool is located in 'tools/mkop'. To run it, navigate to the directory and execute the command './mkop.sh OperatorName', where 'OperatorName' is the desired name of the operator. + - `DEB` for Debian packages. + - `TXZ` for `.tar.xz` tarballs. ## Contributing CV-CUDA is an open source project. As part of the Open Source Community, we are committed to the cycle of learning, improving, and updating that makes this -community thrive. However, as of release v0.4.0-beta, CV-CUDA is not yet ready +community thrive. However, as of release v0.5.0-beta, CV-CUDA is not yet ready for external contributions. To understand the process for contributing the CV-CUDA, see our @@ -217,6 +197,48 @@ Source Community, and providing an environment that both supports and respects the efforts of all contributors, please read our [Code of Conduct](CODE_OF_CONDUCT.md). +### CV-CUDA Make Operator Tool + +The `mkop.sh` script is a powerful tool for creating a scaffold for new operators in the CV-CUDA library. It automates several tasks, ensuring consistency and saving time. + +#### Features of `mkop.sh`: + +1. **Operator Stub Creation**: Generates no-op (no-operation) operator templates, which serve as a starting point for implementing new functionalities. + +1. **File Customization**: Modifies template files to include the new operator's name, ensuring consistent naming conventions across the codebase. + +1. **CMake Integration**: Adds the new operator files to the appropriate CMakeLists, facilitating seamless compilation and integration into the build system. + +1. **Python Bindings**: Creates Python wrapper stubs for the new operator, allowing it to be used within Python environments. + +1. **Test Setup**: Generates test files for both C++ and Python, enabling immediate development of unit tests for the new operator. + +#### How to Use `mkop.sh`: + +Run the script with the desired operator name. The script assumes it's located in `/cvcuda/tools/mkop`. + + ```shell + ./mkop.sh [Operator Name] + ``` + +If the script is run from a different location, provide the path to the CV-CUDA root directory. + + ```shell + ./mkop.sh [Operator Name] [CV-CUDA root] + ``` + +**NOTE**: The first letter of the new operator name is captitalized where needed to match the rest of the file structures. + +#### Process Details: + +- **Initial Setup**: The script begins by validating the input and setting up necessary variables. It then capitalizes the first letter of the operator name to adhere to naming conventions. + +- **Template Modification**: It processes various template files (`Public.h`, `PrivateImpl.cpp`, etc.), replacing placeholders with the new operator name. This includes adjusting file headers, namespaces, and function signatures. + +- **CMake and Python Integration**: The script updates `CMakeLists.txt` files and Python module files to include the new operator, ensuring it's recognized by the build system and Python interface. 
+ +- **Testing Framework**: Finally, it sets up test files for both C++ and Python, allowing developers to immediately start writing tests for the new operator. + ## License CV-CUDA operates under the [Apache-2.0](LICENSE.md) license. diff --git a/SECURITY.md b/SECURITY.md index 695cf3fe..1bcc2896 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -18,3 +18,20 @@ To report a potential security vulnerability in any NVIDIA product: - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) - Please include the following information: - Product/Driver name and version/branch that contains the vulnerability + +## Code Static Analysis + +In our commitment to maintaining the highest standards of code quality and security, we have enabled GitHub's Code Static Analysis scanning on our repositories. Static Analysis is a powerful tool for analyzing the codebase for potential vulnerabilities. + +- Scope: CodeQL scanning is activated across all branches of this repository. +- Frequency: Scans are conducted regularly on new commits to ensure continuous integration and delivery are secure. +- Results Handling: Any identified vulnerabilities or code issues are reviewed and addressed promptly by our development team. +- Community Contribution: We welcome contributions to enhance our CodeQL queries. If you have suggestions or improvements, please submit a pull request or contact us via the outlined channels. + +## Secrets Scanning + +To further bolster our repository's security, we have implemented GitHub's secrets scanning feature. This feature helps detect and prevent accidental commits of sensitive information such as passwords, private keys, and API tokens. + +- Active Scanning: Secrets scanning is active on all branches of this repository. +- Alerts and Notifications: In the event that a potential secret is committed to the repository, an alert is generated. These alerts are reviewed and addressed swiftly by our security team. +- Prevention and Education: We continuously educate our contributors about best practices in handling secrets and sensitive data. We encourage the use of environment variables and secure vaults for managing secrets. diff --git a/bench/BenchAdaptiveThreshold.cpp b/bench/BenchAdaptiveThreshold.cpp new file mode 100644 index 00000000..658281fd --- /dev/null +++ b/bench/BenchAdaptiveThreshold.cpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void AdaptiveThreshold(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int blockSize = static_cast(state.get_int64("blockSize")); + + NVCVThresholdType threshType = NVCV_THRESH_BINARY; + NVCVAdaptiveThresholdType adaptType = NVCV_ADAPTIVE_THRESH_GAUSSIAN_C; + + double maxValue = 123.; + double c = -2.3; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::AdaptiveThreshold op(blockSize, shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &maxValue, &adaptType, &threshType, &blockSize, &c](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, maxValue, adaptType, threshType, blockSize, c); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor maxValueTensor({{shape.x}, "N"}, nvcv::TYPE_F64); + nvcv::Tensor blockSizeTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor cTensor({{shape.x}, "N"}, nvcv::TYPE_F64); + + benchutils::FillTensor(maxValueTensor, [&maxValue](const long4 &){ return maxValue; }); + benchutils::FillTensor(blockSizeTensor, [&blockSize](const long4 &){ return blockSize; }); + benchutils::FillTensor(cTensor, [&c](const long4 &){ return c; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &maxValueTensor, &adaptType, &threshType, &blockSizeTensor, &cTensor] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, maxValueTensor, adaptType, threshType, blockSizeTensor, cTensor); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using AdaptiveThresholdTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(AdaptiveThreshold, NVBENCH_TYPE_AXES(AdaptiveThresholdTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("blockSize", {7}); diff --git a/bench/BenchAdvCvtColor.cpp b/bench/BenchAdvCvtColor.cpp new file mode 100644 index 00000000..04459bdb --- /dev/null +++ b/bench/BenchAdvCvtColor.cpp @@ -0,0 +1,74 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void AdvCvtColor(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + NVCVColorConversionCode code = NVCV_COLOR_BGR2YUV; + nvcv::ColorSpec colorSpec = NVCV_COLOR_SPEC_BT2020; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::AdvCvtColor op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code, &colorSpec](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, code, colorSpec); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using AdvCvtColorTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(AdvCvtColor, NVBENCH_TYPE_AXES(AdvCvtColorTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchAverageBlur.cpp b/bench/BenchAverageBlur.cpp new file mode 100644 index 00000000..fbfc9c4c --- /dev/null +++ b/bench/BenchAverageBlur.cpp @@ -0,0 +1,93 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void AverageBlur(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int2 kernelSize = nvcv::cuda::StaticCast(benchutils::GetShape<2>(state.get_string("kernelSize"))); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + nvcv::Size2D kernelSize2d{kernelSize.x, kernelSize.y}; + int2 kernelAnchor{-1, -1}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::AverageBlur op(kernelSize2d, shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernelSize2d, &kernelAnchor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSize2d, kernelAnchor, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor kernelSizeTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor kernelAnchorTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + + benchutils::FillTensor(kernelSizeTensor, [&kernelSize](const long4 &){ return kernelSize; }); + benchutils::FillTensor(kernelAnchorTensor, [&kernelAnchor](const long4 &){ return kernelAnchor; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernelSizeTensor, &kernelAnchorTensor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSizeTensor, kernelAnchorTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using AverageBlurTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(AverageBlur, NVBENCH_TYPE_AXES(AverageBlurTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("kernelSize", {"7x7"}) + .add_string_axis("border", {"REPLICATE"}); diff --git a/bench/BenchBilateralFilter.cpp b/bench/BenchBilateralFilter.cpp new file mode 100644 index 00000000..73875d8e --- /dev/null +++ b/bench/BenchBilateralFilter.cpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void BilateralFilter(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int diameter = static_cast(state.get_int64("diameter")); + float sigmaSpace = static_cast(state.get_float64("sigmaSpace")); + float sigmaColor = -1.f; + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::BilateralFilter op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &diameter, &sigmaColor, &sigmaSpace, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, diameter, sigmaColor, sigmaSpace, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor diameterTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor sigmaSpaceTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor sigmaColorTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(diameterTensor, [&diameter](const long4 &){ return diameter; }); + benchutils::FillTensor(sigmaSpaceTensor, [&sigmaSpace](const long4 &){ return sigmaSpace; }); + benchutils::FillTensor(sigmaColorTensor, [&sigmaColor](const long4 &){ return sigmaColor; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &diameterTensor, &sigmaColorTensor, &sigmaSpaceTensor, &borderType] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, diameterTensor, sigmaColorTensor, sigmaSpaceTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using BilateralFilterTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(BilateralFilter, NVBENCH_TYPE_AXES(BilateralFilterTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("diameter", {-1}) + .add_float64_axis("sigmaSpace", {1.2}) + .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchBndBox.cpp b/bench/BenchBndBox.cpp new file mode 100644 index 00000000..9f714e0a --- /dev/null +++ b/bench/BenchBndBox.cpp @@ -0,0 +1,97 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
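The varShape axis follows one convention across these files: a negative value selects the uniform nvcv::Tensor path, while zero or a positive value selects ImageBatchVarShape with per-image sizes varied around the base width and height passed to FillImageBatch. The exact variation rule lives in BenchUtils.hpp (part of this patch, not shown here); the symmetric +/- window in the sketch below is an assumption for illustration only.

// Sketch of per-image size generation under an assumed +/- varShape window.
#include <cstdio>
#include <random>
#include <vector>

struct Size { long w, h; };

std::vector<Size> MakeVarShapeSizes(int numImages, Size base, long varShape, unsigned seed = 0)
{
    std::mt19937 rng(seed);
    std::uniform_int_distribution<long> jitter(-varShape, varShape);

    std::vector<Size> sizes;
    for (int i = 0; i < numImages; ++i)
    {
        sizes.push_back({base.w + jitter(rng), base.h + jitter(rng)});
    }
    return sizes;
}

int main()
{
    for (auto [w, h] : MakeVarShapeSizes(4, {1920, 1080}, 16))
    {
        std::printf("%ldx%ld\n", w, h);
    }
    return 0;
}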
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include <../priv/Types.hpp> +#include + +#include + +template +inline void BndBox(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numBoxes = static_cast(state.get_int64("numBoxes")); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + NVCVBndBoxI bndBox{ + {43, 21, 12, 34}, // box x, y position w, h size + 2, // box thickness + { 0, 0, 0, 255}, // box border color + { 0, 0, 0, 0} // box fill color + }; + + std::vector> bndBoxesVec; + + for (int i = 0; i < shape.x; i++) + { + std::vector curVec; + for (int j = 0; j < numBoxes; j++) + { + curVec.push_back(bndBox); + } + bndBoxesVec.push_back(curVec); + } + + std::shared_ptr bndBoxesImpl + = std::make_shared(bndBoxesVec); + NVCVBndBoxesI bndBoxes = (NVCVBndBoxesI)bndBoxesImpl.get(); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + shape.x * numBoxes * sizeof(NVCVBndBoxI)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::BndBox op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &bndBoxes](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, bndBoxes); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using BndBoxTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(BndBox, NVBENCH_TYPE_AXES(BndBoxTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("numBoxes", {10, 100}); diff --git a/bench/BenchBoxBlur.cpp b/bench/BenchBoxBlur.cpp new file mode 100644 index 00000000..031d74f9 --- /dev/null +++ b/bench/BenchBoxBlur.cpp @@ -0,0 +1,97 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include <../priv/Types.hpp> +#include + +#include + +template +inline void BoxBlur(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numBoxes = static_cast(state.get_int64("numBoxes")); + int kernelSize = static_cast(state.get_int64("kernelSize")); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + NVCVBlurBoxI blurBox{ + {43, 21, 12, 34}, // box x, y position w, h size + kernelSize // median filter kernel size + }; + + std::vector> blurBoxesVec; + + for (int i = 0; i < shape.x; i++) + { + std::vector curVec; + for (int j = 0; j < numBoxes; j++) + { + curVec.push_back(blurBox); + } + blurBoxesVec.push_back(curVec); + } + + std::shared_ptr blurBoxesImpl + = std::make_shared(blurBoxesVec); + NVCVBlurBoxesI blurBoxes = (NVCVBlurBoxesI)blurBoxesImpl.get(); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + shape.x * numBoxes * sizeof(NVCVBlurBoxI)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::BoxBlur op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &blurBoxes](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, blurBoxes); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using BoxBlurTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(BoxBlur, NVBENCH_TYPE_AXES(BoxBlurTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("numBoxes", {4}) + .add_int64_axis("kernelSize", {5}); diff --git a/bench/BenchBrightnessContrast.cpp b/bench/BenchBrightnessContrast.cpp new file mode 100644 index 00000000..8e741169 --- /dev/null +++ b/bench/BenchBrightnessContrast.cpp @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void BrightnessContrast(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + shape.x * sizeof(float) * 4); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::BrightnessContrast op; + + // clang-format off + + nvcv::Tensor brightness({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor contrast({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor brightnessShift({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor contrastCenter({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(brightness, benchutils::RandomValues(0.f, 1.f)); + benchutils::FillTensor(contrast, benchutils::RandomValues()); + benchutils::FillTensor(brightnessShift, benchutils::RandomValues()); + benchutils::FillTensor(contrastCenter, benchutils::RandomValues()); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &brightness, &contrast, &brightnessShift, &contrastCenter] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, brightness, contrast, brightnessShift, contrastCenter); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &brightness, &contrast, &brightnessShift, &contrastCenter] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, brightness, contrast, brightnessShift, contrastCenter); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using BrightnessContrastTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(BrightnessContrast, NVBENCH_TYPE_AXES(BrightnessContrastTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchCenterCrop.cpp b/bench/BenchCenterCrop.cpp new file mode 100644 index 00000000..3ebe7d13 --- /dev/null +++ b/bench/BenchCenterCrop.cpp @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CenterCrop(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + nvcv::Size2D cropSize; + + if (state.get_string("cropType") == "SAME") + { + cropSize = nvcv::Size2D{(int)srcShape.z, (int)srcShape.y}; + } + else if (state.get_string("cropType") == "QUARTER") + { + cropSize = nvcv::Size2D{(int)srcShape.z / 2, (int)srcShape.y / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + long3 dstShape{srcShape.x, cropSize.h, cropSize.w}; + + state.add_global_memory_reads(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::CenterCrop op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &cropSize](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, cropSize); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CenterCropTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CenterCrop, NVBENCH_TYPE_AXES(CenterCropTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("cropType", {"QUARTER"}); diff --git a/bench/BenchChannelReorder.cpp b/bench/BenchChannelReorder.cpp new file mode 100644 index 00000000..fed1a600 --- /dev/null +++ b/bench/BenchChannelReorder.cpp @@ -0,0 +1,73 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
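The CenterCrop benchmark only passes a crop size; the region it implies is centred in the source, so its origin is half the size difference. A small sketch of that arithmetic, using the "QUARTER" axis value (half width, half height of 1080p) as the example; the origin formula is the usual center-crop convention, written out for illustration.

// Sketch: ROI implied by a center crop of cropW x cropH from a srcW x srcH image.
#include <cstdio>

struct Rect { int x, y, w, h; };

constexpr Rect CenterCropRect(int srcW, int srcH, int cropW, int cropH)
{
    return {(srcW - cropW) / 2, (srcH - cropH) / 2, cropW, cropH};
}

int main()
{
    Rect r = CenterCropRect(1920, 1080, 1920 / 2, 1080 / 2);
    std::printf("x=%d y=%d w=%d h=%d\n", r.x, r.y, r.w, r.h); // x=480 y=270 w=960 h=540
    return 0;
}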
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void ChannelReorder(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::ChannelReorder op; + + // clang-format off + + nvcv::Tensor orders({{shape.x, 4}, "NC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(orders, benchutils::RandomValues(0, nvcv::cuda::NumElements)); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &orders](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, orders); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ChannelReorderTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(ChannelReorder, NVBENCH_TYPE_AXES(ChannelReorderTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}); diff --git a/bench/BenchColorTwist.cpp b/bench/BenchColorTwist.cpp new file mode 100644 index 00000000..67e90af8 --- /dev/null +++ b/bench/BenchColorTwist.cpp @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void ColorTwist(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::ColorTwist op; + + // clang-format off + + nvcv::Tensor twist({{shape.x, 3}, "NH"}, nvcv::TYPE_4F32); + + benchutils::FillTensor(twist, benchutils::RandomValues()); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &twist](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, twist); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &twist](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, twist); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ColorTwistTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(ColorTwist, NVBENCH_TYPE_AXES(ColorTwistTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchComposite.cpp b/bench/BenchComposite.cpp new file mode 100644 index 00000000..2293ecab --- /dev/null +++ b/bench/BenchComposite.cpp @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
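The ColorTwist benchmark fills an N x 3 tensor of float4 values, i.e. one 3x4 matrix per sample: a 3x3 linear part applied to the colour channels plus an additive offset in the fourth column. The exact channel ordering is the operator's convention; the sketch below shows the usual affine colour-twist arithmetic on a single pixel, for illustration only.

// Sketch: applying a 3x4 affine colour-twist matrix to one 3-channel pixel.
#include <array>
#include <cstdio>

using Mat3x4 = std::array<std::array<float, 4>, 3>;

std::array<float, 3> ApplyTwist(const Mat3x4 &m, const std::array<float, 3> &in)
{
    std::array<float, 3> out{};
    for (int r = 0; r < 3; ++r)
    {
        out[r] = m[r][0] * in[0] + m[r][1] * in[1] + m[r][2] * in[2] + m[r][3];
    }
    return out;
}

int main()
{
    // Identity linear part with a +10 offset on every channel.
    Mat3x4 m = {{{1.f, 0.f, 0.f, 10.f},
                 {0.f, 1.f, 0.f, 10.f},
                 {0.f, 0.f, 1.f, 10.f}}};

    auto out = ApplyTwist(m, {100.f, 50.f, 25.f});
    std::printf("%g %g %g\n", out[0], out[1], out[2]); // 110 60 35
    return 0;
}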
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Composite(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) * 2 + sizeof(M))); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Composite op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor fg({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor bg({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor mask({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(fg, benchutils::RandomValues()); + benchutils::FillTensor(bg, benchutils::RandomValues()); + benchutils::FillTensor(mask, [](const long4 &){ return 1; }); + + state.exec(nvbench::exec_tag::sync, [&op, &fg, &bg, &mask, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), fg, bg, mask, dst); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape fg(shape.x); + nvcv::ImageBatchVarShape bg(shape.x); + nvcv::ImageBatchVarShape mask(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(fg, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + bg.pushBack(fg.begin(), fg.end()); + dst.pushBack(fg.begin(), fg.end()); + + benchutils::FillImageBatch(mask, long2{shape.z, shape.y}, long2{varShape, varShape}, + [](const long4 &){ return 1; }); + + state.exec(nvbench::exec_tag::sync, [&op, &fg, &bg, &mask, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), fg, bg, mask, dst); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CompositeTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Composite, NVBENCH_TYPE_AXES(CompositeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchConv2D.cpp b/bench/BenchConv2D.cpp new file mode 100644 index 00000000..ba88b921 --- /dev/null +++ b/bench/BenchConv2D.cpp @@ -0,0 +1,83 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Conv2D(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int2 kernelSize = nvcv::cuda::StaticCast(benchutils::GetShape<2>(state.get_string("kernelSize"))); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Conv2D op; + + // clang-format off + + nvcv::Tensor kernelAnchor({{shape.x}, "N"}, nvcv::TYPE_2S32); + + benchutils::FillTensor(kernelAnchor, [](const long4 &){ return int2{-1, -1}; }); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + nvcv::ImageBatchVarShape kernel(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + benchutils::FillImageBatch(kernel, long2{kernelSize.x, kernelSize.y}, long2{0, 0}, + benchutils::RandomValues(0.f, 1.f)); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernel, &kernelAnchor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernel, kernelAnchor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using Conv2DTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Conv2D, NVBENCH_TYPE_AXES(Conv2DTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}) + .add_string_axis("kernelSize", {"7x7"}) + .add_string_axis("border", {"REPLICATE"}); diff --git a/bench/BenchConvertTo.cpp b/bench/BenchConvertTo.cpp new file mode 100644 index 00000000..2bbd74a7 --- /dev/null +++ b/bench/BenchConvertTo.cpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void ConvertTo(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + double alpha = 0.123; + double beta = 0.456; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::ConvertTo op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &alpha, &beta](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, alpha, beta); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ConvertToTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(ConvertTo, NVBENCH_TYPE_AXES(ConvertToTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchCopyMakeBorder.cpp b/bench/BenchCopyMakeBorder.cpp new file mode 100644 index 00000000..722c37d0 --- /dev/null +++ b/bench/BenchCopyMakeBorder.cpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
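ConvertTo is a per-element scale-and-shift: with the alpha and beta used above, each output value is the saturated result of alpha * src + beta. The saturating, rounding cast in the sketch below is the usual convention for this kind of operator and is shown here for a uint8_t destination as an assumption, not a statement of the exact device implementation.

// Sketch: dst = saturate(round(alpha * src + beta)) for a uint8_t destination.
#include <algorithm>
#include <cstdint>
#include <cstdio>

inline uint8_t ConvertToU8(float src, double alpha, double beta)
{
    double v = alpha * src + beta;
    return static_cast<uint8_t>(std::clamp(v, 0.0, 255.0) + 0.5); // round half up, then truncate
}

int main()
{
    std::printf("%d\n", ConvertToU8(200.f, 0.123, 0.456)); // 25
    return 0;
}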
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CopyMakeBorder(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + float4 borderValue{0.f, 0.f, 0.f, 0.f}; + + int top = srcShape.y / 2; + int left = srcShape.z / 2; + + long3 dstShape{srcShape.x, top + srcShape.y, left + srcShape.z}; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::CopyMakeBorder op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &top, &left, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, top, left, borderType, borderValue); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + nvcv::Tensor topTensor({{srcShape.x, 1, 1, 1}, "NHWC"}, nvcv::TYPE_S32); + nvcv::Tensor leftTensor({{srcShape.x, 1, 1, 1}, "NHWC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(topTensor, [&top](const long4 &){ return top; }); + benchutils::FillTensor(leftTensor, [&left](const long4 &){ return left; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &topTensor, &leftTensor, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, topTensor, leftTensor, borderType, borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CopyMakeBorderTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CopyMakeBorder, NVBENCH_TYPE_AXES(CopyMakeBorderTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchCropFlipNormalizeReformat.cpp b/bench/BenchCropFlipNormalizeReformat.cpp new file mode 100644 index 00000000..32f82737 --- /dev/null +++ b/bench/BenchCropFlipNormalizeReformat.cpp @@ -0,0 +1,116 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CropFlipNormalizeReformat(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long3 dstShape = srcShape; + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + float borderValue{0.f}; + + float globalScale = 1.234f; + float globalShift = 2.345f; + float epsilon = 12.34f; + uint32_t flags = 0; + + long3 baseShape{srcShape.x, 1, 1}; + long3 scaleShape{srcShape.x, 1, 1}; + long3 cropShape{srcShape.x, 1, 1}; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T) + + baseShape.x * baseShape.y * baseShape.z * sizeof(float) + + scaleShape.x * scaleShape.y * scaleShape.z * sizeof(float) + + cropShape.x * cropShape.y * cropShape.z * sizeof(int) * 4); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::CropFlipNormalizeReformat op; + + // clang-format off + + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + nvcv::Tensor flipCode({{srcShape.x}, "N"}, nvcv::TYPE_S32); + + nvcv::Tensor base({{baseShape.x, baseShape.y, baseShape.z, 1}, "NHWC"}, nvcv::TYPE_F32); + nvcv::Tensor scale({{scaleShape.x, scaleShape.y, scaleShape.z, 1}, "NHWC"}, nvcv::TYPE_F32); + + nvcv::Tensor crop({{cropShape.x, cropShape.y, cropShape.z, 4}, "NHWC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(flipCode, [](const long4 &){ return -1; }); + + benchutils::FillTensor(base, benchutils::RandomValues()); + benchutils::FillTensor(scale, benchutils::RandomValues(0.f, 1.f)); + + // Always crop entire source image for easy bandwidth calculations + benchutils::FillTensor(crop, [&srcShape](const long4 &c) + { + if (c.w == 2) + { + return (int)srcShape.z; + } + else if (c.w == 3) + { + return (int)srcShape.y; + } + return 0; + }); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &crop, &borderType, &borderValue, &flipCode, &base, &scale, &globalScale, + &globalShift, &epsilon, &flags](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, crop, borderType, borderValue, flipCode, base, scale, globalScale, + globalShift, epsilon, flags); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CropFlipNormalizeReformatTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CropFlipNormalizeReformat, NVBENCH_TYPE_AXES(CropFlipNormalizeReformatTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}) + .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchCustomCrop.cpp b/bench/BenchCustomCrop.cpp new file mode 100644 index 00000000..07478a39 --- /dev/null +++ b/bench/BenchCustomCrop.cpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CustomCrop(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + // Always crop entire source image for easy bandwidth calculations + NVCVRectI cropRect{0, 0, (int)shape.z, (int)shape.y}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::CustomCrop op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &cropRect](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, cropRect); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CustomCropTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CustomCrop, NVBENCH_TYPE_AXES(CustomCropTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchCvtColor.cpp b/bench/BenchCvtColor.cpp new file mode 100644 index 00000000..05469e0f --- /dev/null +++ b/bench/BenchCvtColor.cpp @@ -0,0 +1,83 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void CvtColor(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + NVCVColorConversionCode code = ch == 3 ? 
NVCV_COLOR_BGR2RGB : NVCV_COLOR_BGRA2RGBA; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::CvtColor op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, code); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, code); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using CvtColorTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(CvtColorTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchErase.cpp b/bench/BenchErase.cpp new file mode 100644 index 00000000..68419ad9 --- /dev/null +++ b/bench/BenchErase.cpp @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
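The BGR2RGB and BGRA2RGBA codes used by the CvtColor benchmark are pure channel permutations: the first and third channels swap and any alpha channel passes through unchanged, which is why the read and write byte counts are identical. A tiny standalone sketch of that permutation:

// Sketch: BGRA -> RGBA channel swap on a single pixel.
#include <cstdint>
#include <cstdio>

struct BGRA { uint8_t b, g, r, a; };
struct RGBA { uint8_t r, g, b, a; };

constexpr RGBA ToRGBA(BGRA p)
{
    return {p.r, p.g, p.b, p.a};
}

int main()
{
    RGBA q = ToRGBA({10, 20, 30, 255});
    std::printf("%d %d %d %d\n", q.r, q.g, q.b, q.a); // 30 20 10 255
    return 0;
}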
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Erase(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numErase = static_cast(state.get_int64("numErase")); + + bool random = true; + int seed = 0; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + + shape.x * (sizeof(int2) + sizeof(int3) + sizeof(float) + sizeof(int))); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Erase op(numErase); + + // clang-format off + + nvcv::Tensor anchor({{shape.x}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor erasing({{shape.x}, "N"}, nvcv::TYPE_3S32); + nvcv::Tensor values({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor imgIdx({{shape.x}, "N"}, nvcv::TYPE_S32); + + benchutils::FillTensor(anchor, [](const long4 &){ return int2{0, 0}; }); + benchutils::FillTensor(erasing, [](const long4 &){ return int3{10, 10, 1}; }); + benchutils::FillTensor(values, [](const long4 &){ return 1.f; }); + benchutils::FillTensor(imgIdx, [](const long4 &){ return 0; }); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &anchor, &erasing, &values, &imgIdx, &random, &seed](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, anchor, erasing, values, imgIdx, random, seed); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &anchor, &erasing, &values, &imgIdx, &random, &seed](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, anchor, erasing, values, imgIdx, random, seed); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using EraseTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Erase, NVBENCH_TYPE_AXES(EraseTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}) + .add_int64_axis("numErase", {3}); diff --git a/bench/BenchFindContours.cpp b/bench/BenchFindContours.cpp new file mode 100644 index 00000000..2beb2747 --- /dev/null +++ b/bench/BenchFindContours.cpp @@ -0,0 +1,126 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include +#include +#include + +#include + +using CPUImage = std::vector; + +static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor = {0, 0}, + nvcv::Size2D size = {5, 5}, double angle = 0.0, bool fill = true, uint8_t setValue = 1); + +static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor, nvcv::Size2D size, + double angle, bool fill, uint8_t setValue) +{ + auto rad = angle * (M_PI / 180.0); + auto cosAngle = std::cos(rad); + auto sinAngle = std::sin(rad); + + auto transformed = anchor; + for (auto y = 0; y < size.h; ++y) + { + for (auto x = 0; x < size.w; ++x) + { + transformed.w = anchor.w + (x * cosAngle - y * sinAngle); + transformed.h = anchor.h + (x * sinAngle + y * cosAngle); + + if (fill || y == 0 || y == size.h - 1 || x == 0 || x == size.w - 1) + { + if (transformed.w >= 0 && transformed.w < boundary.w && transformed.h >= 0 + && transformed.h < boundary.h) + { + image[transformed.h * boundary.w + transformed.w] = setValue; + } + } + } + } +} + +template +inline void FindContours(nvbench::state &state, nvbench::type_list) +try +{ + srand(0U); // Use a fixed random seed + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numPoints = static_cast(state.get_int64("numPoints")); + + // R/W bandwidth rationale: + // Read image + connected components (S32) + // Write points + contours (U32) + state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) + sizeof(int))); + state.add_global_memory_writes(shape.x * numPoints * sizeof(int) * 2 + shape.x * 4 * sizeof(int)); + + cvcuda::FindContours op(nvcv::Size2D{(int)shape.z, (int)shape.y}, shape.x); + + // clang-format off + + nvcv::Tensor points({{shape.x, numPoints, 2}, "NCW"}, nvcv::TYPE_S32); + nvcv::Tensor counts({{shape.x, 4}, "NW"}, nvcv::TYPE_S32); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + auto inData = src.exportData(); + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + + //Generate input + CPUImage srcVec(shape.y * shape.z, 0); + for (auto i = 0; i < 10; ++i) + { + auto anchorX = rand() % shape.z; + auto anchorY = rand() % shape.y; + auto sizeX = rand() % (shape.z - anchorX); + auto sizeY = rand() % (shape.y - anchorY); + generateRectangle(srcVec, {anchorX, anchorY}, {sizeX, sizeY}); + } + + for (auto i = 0; i < shape.x; ++i) + { + CUDA_CHECK_ERROR(cudaMemcpy2D(inAccess->sampleData(i), inAccess->rowStride(), srcVec.data(), shape.z, shape.z, + shape.y, cudaMemcpyHostToDevice)); + } + + state.exec(nvbench::exec_tag::sync, [&op, &src, &points, &counts](nvbench::launch &launch) + { + op(launch.get_stream(), src, points, counts); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using FindContoursTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(FindContours, NVBENCH_TYPE_AXES(FindContoursTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("numPoints", {1024}); diff --git a/bench/BenchFlip.cpp b/bench/BenchFlip.cpp new file mode 100644 index 00000000..620eac7f --- /dev/null +++ b/bench/BenchFlip.cpp @@ 
-0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Flip(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int flipCode; + + if (state.get_string("flipType") == "HORIZONTAL") + { + flipCode = 0; + } + else if (state.get_string("flipType") == "VERTICAL") + { + flipCode = 1; + } + else if (state.get_string("flipType") == "BOTH") + { + flipCode = -1; + } + else + { + throw std::invalid_argument("Invalid flipType = " + state.get_string("flipType")); + } + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Flip op; + + // clang-format off + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &flipCode](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, flipCode); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor flipCodeTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + + benchutils::FillTensor(flipCodeTensor, [&flipCode](const long4 &){ return flipCode; }); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &flipCodeTensor](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, flipCodeTensor); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using FlipTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Flip, NVBENCH_TYPE_AXES(FlipTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("flipType", {"BOTH"}); diff --git a/bench/BenchGammaContrast.cpp b/bench/BenchGammaContrast.cpp new file mode 100644 index 00000000..e1e16958 --- /dev/null +++ b/bench/BenchGammaContrast.cpp @@ -0,0 +1,75 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
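The Flip benchmark's flipType axis maps HORIZONTAL to 0, VERTICAL to 1 and BOTH to -1; which numeric code the operator interprets as which mirroring is the operator's convention. The sketch below only shows the two elementary index mirrorings such a flip composes, for illustration.

// Sketch: the two elementary mirrorings a flip composes.
#include <cstdio>

constexpr int MirrorX(int x, int width)  { return width - 1 - x; }
constexpr int MirrorY(int y, int height) { return height - 1 - y; }

int main()
{
    // Mirroring both axes sends the top-left pixel to the bottom-right.
    std::printf("%d %d\n", MirrorX(0, 1920), MirrorY(0, 1080)); // 1919 1079
    return 0;
}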
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void GammaContrast(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + int ch = nvcv::cuda::NumElements; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::GammaContrast op(shape.x, ch); + + // clang-format off + + nvcv::Tensor gamma({{shape.x * ch}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(gamma, benchutils::RandomValues(.5f, 1.f)); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &gamma](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, gamma); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using GammaContrastTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(GammaContrast, NVBENCH_TYPE_AXES(GammaContrastTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}); diff --git a/bench/BenchGaussian.cpp b/bench/BenchGaussian.cpp new file mode 100644 index 00000000..8b4fc30d --- /dev/null +++ b/bench/BenchGaussian.cpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Gaussian(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + double sigma = state.get_float64("sigma"); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + int kernelSize = (int)std::round(sigma * (std::is_same_v, uint8_t> ? 
3 : 4) * 2 + 1) | 1; + int2 ksize2{kernelSize, kernelSize}; + + nvcv::Size2D kernelSize2{kernelSize, kernelSize}; + double2 sigma2{sigma, sigma}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Gaussian op(kernelSize2, shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernelSize2, &sigma2, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSize2, sigma2, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor kernelSizeTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor sigmaTensor({{shape.x}, "N"}, nvcv::TYPE_2F64); + + benchutils::FillTensor(kernelSizeTensor, [&ksize2](const long4 &){ return ksize2; }); + benchutils::FillTensor(sigmaTensor, [&sigma2](const long4 &){ return sigma2; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &kernelSizeTensor, &sigmaTensor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSizeTensor, sigmaTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using GaussianTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Gaussian, NVBENCH_TYPE_AXES(GaussianTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_float64_axis("sigma", {1.2}) + .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchGaussianNoise.cpp b/bench/BenchGaussianNoise.cpp new file mode 100644 index 00000000..68633a90 --- /dev/null +++ b/bench/BenchGaussianNoise.cpp @@ -0,0 +1,87 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
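The Gaussian benchmark derives the kernel size from sigma using a 3-sigma radius for 8-bit data and a 4-sigma radius otherwise, with a final "| 1" to force an odd size. A standalone sketch of exactly that formula, with the default sigma = 1.2 axis value as the example:

// Sketch: kernel size from sigma, as computed in BenchGaussian.cpp.
#include <cmath>
#include <cstdio>

inline int GaussianKernelSize(double sigma, bool isU8)
{
    return static_cast<int>(std::round(sigma * (isU8 ? 3 : 4) * 2 + 1)) | 1;
}

int main()
{
    std::printf("%d\n", GaussianKernelSize(1.2, true));  // 9
    std::printf("%d\n", GaussianKernelSize(1.2, false)); // 11
    return 0;
}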
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void GaussianNoise(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + bool perCh = nvcv::cuda::NumElements > 1; + + unsigned long long int seed = 12345; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::GaussianNoise op(shape.x); + + // clang-format off + + nvcv::Tensor mu({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor sigma({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(mu, benchutils::RandomValues(.0f, 1.f)); + benchutils::FillTensor(sigma, benchutils::RandomValues(.05f, .1f)); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &mu, &sigma, &perCh, &seed](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, mu, sigma, perCh, seed); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &mu, &sigma, &perCh, &seed](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, mu, sigma, perCh, seed); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using GaussianNoiseTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(GaussianNoise, NVBENCH_TYPE_AXES(GaussianNoiseTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchHistogram.cpp b/bench/BenchHistogram.cpp new file mode 100644 index 00000000..73f00cda --- /dev/null +++ b/bench/BenchHistogram.cpp @@ -0,0 +1,71 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Histogram(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + int numBins = 256; + nvcv::Tensor mask{nullptr}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(numBins * sizeof(int)); + + cvcuda::Histogram op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor hist({{shape.x, numBins, 1}, "HWC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(src, benchutils::RandomValues()); + benchutils::FillTensor(hist, [](const long4 &){ return 0; }); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &mask, &hist](nvbench::launch &launch) + { + op(launch.get_stream(), src, mask, hist); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using HistogramTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Histogram, NVBENCH_TYPE_AXES(HistogramTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchHistogramEq.cpp b/bench/BenchHistogramEq.cpp new file mode 100644 index 00000000..54082d55 --- /dev/null +++ b/bench/BenchHistogramEq.cpp @@ -0,0 +1,77 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void HistogramEq(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::HistogramEq op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using HistogramEqTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(HistogramEq, NVBENCH_TYPE_AXES(HistogramEqTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchInpaint.cpp b/bench/BenchInpaint.cpp new file mode 100644 index 00000000..88a237b3 --- /dev/null +++ b/bench/BenchInpaint.cpp @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Inpaint(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + double inpaintRadius = 5.0; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) + sizeof(uint8_t))); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Inpaint op(shape.x, nvcv::Size2D{(int)shape.z, (int)shape.y}); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor mask({{shape.x, shape.y, shape.z, 1}, "NHWC"}, nvcv::TYPE_U8); + + benchutils::FillTensor(src, benchutils::RandomValues()); + benchutils::FillTensor(mask, benchutils::RandomValues(0, 1)); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &mask, &dst, &inpaintRadius](nvbench::launch &launch) + { + op(launch.get_stream(), src, mask, dst, inpaintRadius); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + nvcv::ImageBatchVarShape mask(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + benchutils::FillImageBatch(mask, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues(0, 1)); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &mask, &dst, &inpaintRadius](nvbench::launch &launch) + { + op(launch.get_stream(), src, mask, dst, inpaintRadius); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using InpaintTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Inpaint, NVBENCH_TYPE_AXES(InpaintTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchJointBilateralFilter.cpp b/bench/BenchJointBilateralFilter.cpp new file mode 100644 index 00000000..45c325bd --- /dev/null +++ b/bench/BenchJointBilateralFilter.cpp @@ -0,0 +1,100 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void JointBilateralFilter(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int diameter = static_cast(state.get_int64("diameter")); + float sigmaSpace = static_cast(state.get_float64("sigmaSpace")); + float sigmaColor = -1.f; + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::JointBilateralFilter op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor color({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + benchutils::FillTensor(color, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &color, &dst, &diameter, &sigmaColor, &sigmaSpace, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, color, dst, diameter, sigmaColor, sigmaSpace, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape color(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + color.pushBack(src.begin(), src.end()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor diameterTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor sigmaSpaceTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor sigmaColorTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(diameterTensor, [&diameter](const long4 &){ return diameter; }); + benchutils::FillTensor(sigmaSpaceTensor, [&sigmaSpace](const long4 &){ return sigmaSpace; }); + benchutils::FillTensor(sigmaColorTensor, [&sigmaColor](const long4 &){ return sigmaColor; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &color, &dst, &diameterTensor, &sigmaColorTensor, &sigmaSpaceTensor, &borderType] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, color, dst, diameterTensor, sigmaColorTensor, sigmaSpaceTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using JointBilateralFilterTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(JointBilateralFilter, NVBENCH_TYPE_AXES(JointBilateralFilterTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("diameter", {-1}) + .add_float64_axis("sigmaSpace", {1.2}) + .add_string_axis("border", {"REFLECT"}); diff --git a/bench/BenchLabel.cpp b/bench/BenchLabel.cpp new file mode 100644 index 00000000..41005379 --- /dev/null +++ b/bench/BenchLabel.cpp @@ -0,0 +1,108 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Label(nvbench::state &state, nvbench::type_list) +try +{ + using DT = uint32_t; + + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long3 dstShape = srcShape; + + std::string runChoice = state.get_string("runChoice"); + + // Use [BG][MIN][MAX][ISLAND][COUNT][STAT] in runChoice to run Label with: + // background; minThreshold; maxThreshold; island removal; count; statistics + + long3 staShape{srcShape.x, 10000, 6}; // using fixed 10K max. cap. and 2D problem + + NVCVConnectivityType conn = NVCV_CONNECTIVITY_4_2D; + NVCVLabelType alab = NVCV_LABEL_FAST; + + nvcv::Tensor bgT, minT, maxT, countT, statsT, mszT; + + cvcuda::Label op; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(ST)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(DT)); + + // clang-format off + + if (runChoice.find("BG") != std::string::npos) + { + bgT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType()); + + benchutils::FillTensor(bgT, benchutils::RandomValues()); + } + if (runChoice.find("MIN") != std::string::npos) + { + minT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType()); + + benchutils::FillTensor(minT, benchutils::RandomValues()); + } + if (runChoice.find("MAX") != std::string::npos) + { + maxT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType()); + + benchutils::FillTensor(maxT, benchutils::RandomValues()); + } + if (runChoice.find("ISLAND") != std::string::npos) + { + mszT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType
<DT>()); + + benchutils::FillTensor<DT>(mszT, benchutils::RandomValues<DT>()); + } + if (runChoice.find("COUNT") != std::string::npos) + { + countT = nvcv::Tensor({{srcShape.x}, "N"}, benchutils::GetDataType<DT>()); + } + if (runChoice.find("STAT") != std::string::npos) + { + statsT = nvcv::Tensor({{staShape.x, staShape.y, staShape.z}, "NMA"}, benchutils::GetDataType<DT>()); + } + + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType<ST>()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType<DT>
()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &bgT, &minT, &maxT, &mszT, &countT, &statsT, &conn, &alab](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, bgT, minT, maxT, mszT, countT, statsT, conn, alab); + }); +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using LabelTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Label, NVBENCH_TYPE_AXES(LabelTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_string_axis("runChoice", {""}); diff --git a/bench/BenchLaplacian.cpp b/bench/BenchLaplacian.cpp new file mode 100644 index 00000000..e685198e --- /dev/null +++ b/bench/BenchLaplacian.cpp @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Laplacian(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int ksize = static_cast(state.get_int64("ksize")); + float scale = static_cast(state.get_float64("scale")); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Laplacian op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &ksize, &scale, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, ksize, scale, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor ksizeTensor({{shape.x}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor scaleTensor({{shape.x}, "N"}, nvcv::TYPE_F32); + + benchutils::FillTensor(ksizeTensor, [&ksize](const long4 &){ return ksize; }); + benchutils::FillTensor(scaleTensor, [&scale](const long4 &){ return scale; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &ksizeTensor, &scaleTensor, &borderType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, ksizeTensor, scaleTensor, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + 
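+// A minimal usage sketch for the registration below, assuming the stock NVBench
+// command-line interface linked into these benchmarks; the binary name is a placeholder
+// for whatever the bench CMake target actually produces.
+//
+//   ./cvcuda_bench -b Laplacian -a "shape=[1x2160x3840]" -a "varShape=[0]" -a "ksize=[3]"
+//
+// Each add_*_axis call contributes one dimension to the benchmark sweep; -a/--axis
+// overrides the default values registered here, and -b/--benchmark selects one entry.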
+// clang-format on + +using LaplacianTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Laplacian, NVBENCH_TYPE_AXES(LaplacianTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("ksize", {1}) + .add_float64_axis("scale", {1.0}) + .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchMedianBlur.cpp b/bench/BenchMedianBlur.cpp new file mode 100644 index 00000000..45b2c1a6 --- /dev/null +++ b/bench/BenchMedianBlur.cpp @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void MedianBlur(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int2 kernelSize = nvcv::cuda::StaticCast(benchutils::GetShape<2>(state.get_string("kernelSize"))); + + nvcv::Size2D kernelSize2d{kernelSize.x, kernelSize.y}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::MedianBlur op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &kernelSize2d](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSize2d); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor kernelSizeTensor({{shape.x, 2}, "NW"}, nvcv::TYPE_S32); + + benchutils::FillTensor(kernelSizeTensor, + [&kernelSize](const long4 &c){ return nvcv::cuda::GetElement(kernelSize, c.y); }); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &kernelSizeTensor](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, kernelSizeTensor); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using MedianBlurTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(MedianBlur, NVBENCH_TYPE_AXES(MedianBlurTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("kernelSize", {"5x5"}); diff --git a/bench/BenchMinAreaRect.cpp b/bench/BenchMinAreaRect.cpp new file mode 100644 index 00000000..1eae7a3e --- /dev/null +++ 
b/bench/BenchMinAreaRect.cpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void MinAreaRect(nvbench::state &state, nvbench::type_list) +try +{ + long2 shape = benchutils::GetShape<2>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * sizeof(T)); + state.add_global_memory_writes(shape.x * 8 * sizeof(float) + shape.x * sizeof(int)); + + cvcuda::MinAreaRect op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, 2}, "NWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, 8}, "NW"}, nvcv::TYPE_F32); + nvcv::Tensor points({{1, shape.x}, "NW"}, nvcv::TYPE_S32); + + benchutils::FillTensor(src, benchutils::RandomValues()); + benchutils::FillTensor(dst, benchutils::RandomValues(0.f, 1.f)); + benchutils::FillTensor(points, benchutils::RandomValues(10, 100)); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &points, &shape](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, points, shape.x); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using MinAreaRectTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(MinAreaRect, NVBENCH_TYPE_AXES(MinAreaRectTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1024"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchMinMaxLoc.cpp b/bench/BenchMinMaxLoc.cpp new file mode 100644 index 00000000..582348fd --- /dev/null +++ b/bench/BenchMinMaxLoc.cpp @@ -0,0 +1,92 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchMinMaxLoc.hpp" + +#include + +#include + +template +inline void MinMaxLoc(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long maxLocs = state.get_int64("maxLocations"); + + // clang-format off + + nvcv::Tensor minVal({{shape.x}, "N"}, nvcv::TYPE_U32); + nvcv::Tensor minLoc({{shape.x, maxLocs}, "NM"}, nvcv::TYPE_2S32); + nvcv::Tensor numMin({{shape.x}, "N"}, nvcv::TYPE_S32); + + nvcv::Tensor maxVal({{shape.x}, "N"}, nvcv::TYPE_U32); + nvcv::Tensor maxLoc({{shape.x, maxLocs}, "NM"}, nvcv::TYPE_2S32); + nvcv::Tensor numMax({{shape.x}, "N"}, nvcv::TYPE_S32); + + // clang-format on + + // R/W bandwidth rationale: + // 1 read to find min/max + 1 read to collect their locations + // 2 writes of min/max values (U32), locations (2S32) and quantity (S32) + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) * 2); + state.add_global_memory_writes(shape.x * (sizeof(uint32_t) + maxLocs * sizeof(int2) + sizeof(int)) * 2); + + cvcuda::MinMaxLoc op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensorWithMinMax(src, maxLocs); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &minVal, &minLoc, &numMin, &maxVal, &maxLoc, &numMax](nvbench::launch &launch) + { + op(launch.get_stream(), src, minVal, minLoc, numMin, maxVal, maxLoc, numMax); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + + benchutils::FillImageBatchWithMinMax(src, long2{shape.z, shape.y}, long2{varShape, varShape}, maxLocs); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &minVal, &minLoc, &numMin, &maxVal, &maxLoc, &numMax](nvbench::launch &launch) + { + op(launch.get_stream(), src, minVal, minLoc, numMin, maxVal, maxLoc, numMax); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using MinMaxLocTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(MinMaxLoc, NVBENCH_TYPE_AXES(MinMaxLocTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("maxLocations", {100000}); diff --git a/bench/BenchMinMaxLoc.hpp b/bench/BenchMinMaxLoc.hpp new file mode 100644 index 00000000..cd7f6a27 --- /dev/null +++ b/bench/BenchMinMaxLoc.hpp @@ -0,0 +1,121 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CVCUDA_BENCH_MINMAXLOC_HPP +#define CVCUDA_BENCH_MINMAXLOC_HPP + +#include "BenchUtils.hpp" + +namespace benchutils { + +template, typename RE = typename R::RE, typename UD = typename R::UD, + class = nvcv::cuda::Require == 0 && std::is_integral_v>> +inline auto RandomValuesWithoutMinMax(VT min = nvcv::cuda::TypeTraits::min, + VT max = nvcv::cuda::TypeTraits::max, RE rng = DefaultGenerator()) +{ + return R{UD(min + 1, max - 1), rng}; +} + +template, typename RE = typename R::RE> +inline void RandomMinMax(std::vector &srcVec, const long3 &shape, const long3 &strides, long locs, + RE randomGenerator = DefaultGenerator()) +{ + long locsPerHeight = static_cast(std::ceil(locs / shape.y)); + if (locsPerHeight * 2 >= shape.z) + { + throw std::runtime_error("Locations is bigger than available pixels"); + } + + for (long x = 0; x < shape.x; ++x) + { + long countLocs = 0; + for (long y = 0; y < shape.y; ++y) + { + long numLocs = (countLocs + locsPerHeight > locs) ? (locs - countLocs) : locsPerHeight; + for (long z = 0; z < numLocs; ++z) + { + ValueAt(srcVec, strides, long3{x, y, z}) = nvcv::cuda::TypeTraits::min; + } + for (long z = numLocs; z < numLocs * 2; ++z) + { + ValueAt(srcVec, strides, long3{x, y, z}) = nvcv::cuda::TypeTraits::max; + } + std::shuffle(&ValueAt(srcVec, strides, long3{x, y, 0}), + &ValueAt(srcVec, strides, long3{x, y, shape.z}), randomGenerator); + + countLocs += locsPerHeight; + } + } +} + +template +inline void FillTensorWithMinMax(const nvcv::Tensor &tensor, long locations) +{ + auto tensorData = tensor.exportData(); + CVCUDA_CHECK_DATA(tensorData); + + if (tensor.rank() != 3 && tensor.rank() != 4) + { + throw std::invalid_argument("Tensor rank is not 3 or 4"); + } + + long3 strides{tensorData->stride(0), tensorData->stride(1), tensorData->stride(2)}; + long3 shape{tensorData->shape(0), tensorData->shape(1), tensorData->shape(2)}; + long bufSize{nvcv::cuda::GetElement(strides, 0) * nvcv::cuda::GetElement(shape, 0)}; + + std::vector tensorVec(bufSize); + + FillBuffer(tensorVec, shape, strides, RandomValuesWithoutMinMax()); + + RandomMinMax(tensorVec, shape, strides, locations); + + CUDA_CHECK_ERROR(cudaMemcpy(tensorData->basePtr(), tensorVec.data(), bufSize, cudaMemcpyHostToDevice)); +} + +template +inline void FillImageBatchWithMinMax(nvcv::ImageBatchVarShape &imageBatch, long2 size, long2 varSize, long locations) +{ + auto randomWidth = RandomValues(static_cast(size.x - varSize.x), static_cast(size.x)); + auto randomHeight = RandomValues(static_cast(size.y - varSize.y), static_cast(size.y)); + + for (int i = 0; i < imageBatch.capacity(); ++i) + { + nvcv::Image image(nvcv::Size2D{randomWidth(), randomHeight()}, GetFormat()); + + auto data = image.exportData(); + CVCUDA_CHECK_DATA(data); + + long2 strides{data->plane(0).rowStride, sizeof(VT)}; + long2 shape{data->plane(0).height, data->plane(0).width}; + long bufSize{strides.x * shape.x}; + + std::vector imageBuffer(bufSize); + + FillBuffer(imageBuffer, shape, strides, RandomValuesWithoutMinMax()); + + RandomMinMax(imageBuffer, long3{1, shape.x, shape.y}, long3{bufSize, strides.x, strides.y}, locations); + + CUDA_CHECK_ERROR(cudaMemcpy2D(data->plane(0).basePtr, strides.x, imageBuffer.data(), strides.x, strides.x, + data->plane(0).height, cudaMemcpyHostToDevice)); + + imageBatch.pushBack(image); + } +} + +} // namespace benchutils + +#endif // CVCUDA_BENCH_MINMAXLOC_HPP diff --git a/bench/BenchMorphology.cpp b/bench/BenchMorphology.cpp new file mode 100644 index 00000000..69ed2f97 --- /dev/null +++ 
b/bench/BenchMorphology.cpp @@ -0,0 +1,133 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Morphology(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int iteration = static_cast(state.get_int64("iteration")); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + NVCVMorphologyType morphType; + + if (state.get_string("morphType") == "ERODE") + { + morphType = NVCV_ERODE; + } + else if (state.get_string("morphType") == "DILATE") + { + morphType = NVCV_DILATE; + } + else if (state.get_string("morphType") == "OPEN") + { + morphType = NVCV_OPEN; + } + else if (state.get_string("morphType") == "CLOSE") + { + morphType = NVCV_CLOSE; + } + + nvcv::Size2D mask{3, 3}; + int2 anchor{-1, -1}; + + int bwIteration = (morphType == NVCV_OPEN || morphType == NVCV_CLOSE || iteration > 1) ? 2 * iteration : iteration; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) * bwIteration); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T) * bwIteration); + + cvcuda::Morphology op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + nvcv::Tensor workspace{nullptr}; + + if (morphType == NVCV_OPEN || morphType == NVCV_CLOSE || iteration > 1) + { + workspace = nvcv::Tensor({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + } + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &workspace, &morphType, &mask, &anchor, &iteration, &borderType] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, workspace, morphType, mask, anchor, iteration, borderType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor maskTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor anchorTensor({{shape.x}, "N"}, nvcv::TYPE_2S32); + + benchutils::FillTensor(maskTensor, [&mask](const long4 &){ return int2{mask.w, mask.h}; }); + benchutils::FillTensor(anchorTensor, [&anchor](const long4 &){ return anchor; }); + + nvcv::ImageBatchVarShape workspace{nullptr}; + + if (morphType == NVCV_OPEN || morphType == NVCV_CLOSE || iteration > 1) + { + workspace = nvcv::ImageBatchVarShape(shape.x); + + 
workspace.pushBack(dst.begin(), dst.end()); + } + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &workspace, &morphType, &maskTensor, &anchorTensor, &iteration, &borderType] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, workspace, morphType, maskTensor, anchorTensor, iteration, borderType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using MorphologyTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Morphology, NVBENCH_TYPE_AXES(MorphologyTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("iteration", {1}) + .add_string_axis("morphType", {"ERODE", "DILATE", "OPEN", "CLOSE"}) + .add_string_axis("border", {"REPLICATE"}); diff --git a/bench/BenchNMS.cpp b/bench/BenchNMS.cpp new file mode 100644 index 00000000..bad16a31 --- /dev/null +++ b/bench/BenchNMS.cpp @@ -0,0 +1,76 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void NMS(nvbench::state &state, nvbench::type_list) +try +{ + long2 shape = benchutils::GetShape<2>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + float scThr = static_cast(state.get_float64("scoreThreshold")); + float iouThr = static_cast(state.get_float64("iouThreshold")); + + // R/W bandwidth rationale: + // 1 read of scores (F32) to mask out lower scores boxes + 1 read of boxes (4S16) for IoU threshold + // 2 writes of masks (U8) by score and IoU thresholds + state.add_global_memory_reads(shape.x * shape.y * (sizeof(T) + sizeof(S))); + state.add_global_memory_writes(shape.x * shape.y * sizeof(M) * 2); + + cvcuda::NonMaximumSuppression op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor srcBB({{shape.x, shape.y}, "NB"}, benchutils::GetDataType()); + nvcv::Tensor srcSc({{shape.x, shape.y}, "NB"}, benchutils::GetDataType()); + nvcv::Tensor dstMk({{shape.x, shape.y}, "NB"}, benchutils::GetDataType()); + + benchutils::FillTensor(srcBB, benchutils::RandomValues(10, 50)); + benchutils::FillTensor(srcSc, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &srcBB, &dstMk, &srcSc, &scThr, &iouThr](nvbench::launch &launch) + { + op(launch.get_stream(), srcBB, dstMk, srcSc, scThr, iouThr); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using NMSTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(NMS, NVBENCH_TYPE_AXES(NMSTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1024", "32x1024"}) + 
.add_int64_axis("varShape", {-1}) + .add_float64_axis("scoreThreshold", {0.5}) + .add_float64_axis("iouThreshold", {0.75}); diff --git a/bench/BenchNormalize.cpp b/bench/BenchNormalize.cpp new file mode 100644 index 00000000..64eed3e3 --- /dev/null +++ b/bench/BenchNormalize.cpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Normalize(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long3 dstShape = srcShape; + + float globalScale = 1.234f; + float globalShift = 2.345f; + float epsilon = 12.34f; + uint32_t flags = CVCUDA_NORMALIZE_SCALE_IS_STDDEV; + + long3 baseShape{srcShape.x, 1, 1}; + long3 scaleShape{srcShape.x, 1, 1}; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T) + + baseShape.x * baseShape.y * baseShape.z * sizeof(float) + + scaleShape.x * scaleShape.y * scaleShape.z * sizeof(float)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::Normalize op; + + // clang-format off + + nvcv::Tensor base({{baseShape.x, baseShape.y, baseShape.z, 1}, "NHWC"}, nvcv::TYPE_F32); + nvcv::Tensor scale({{scaleShape.x, scaleShape.y, scaleShape.z, 1}, "NHWC"}, nvcv::TYPE_F32); + + benchutils::FillTensor(base, benchutils::RandomValues()); + benchutils::FillTensor(scale, benchutils::RandomValues(0.f, 1.f)); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &base, &scale, &dst, &globalScale, &globalShift, &epsilon, &flags] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, base, scale, dst, globalScale, globalShift, epsilon, flags); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &base, &scale, &dst, &globalScale, &globalShift, &epsilon, &flags] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, base, scale, dst, globalScale, globalShift, epsilon, flags); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using NormalizeTypes = nvbench::type_list; 
+ +NVBENCH_BENCH_TYPES(Normalize, NVBENCH_TYPE_AXES(NormalizeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchOSD.cpp b/bench/BenchOSD.cpp new file mode 100644 index 00000000..b5e9b0a6 --- /dev/null +++ b/bench/BenchOSD.cpp @@ -0,0 +1,94 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include <../priv/Types.hpp> +#include + +#include + +template +inline void OSD(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int numElem = static_cast(state.get_int64("numElem")); + + int ch = nvcv::cuda::NumElements; + + using BT = nvcv::cuda::BaseType; + + std::vector>> elementVec; + + for (int n = 0; n < (int)shape.x; n++) + { + std::vector> curVec; + for (int i = 0; i < numElem; i++) + { + NVCVPoint point; + point.centerPos.x = shape.z / 2; + point.centerPos.y = shape.y / 2; + point.radius = std::min(shape.z, shape.y) / 2; + point.color = {0, 0, 0, 255}; + auto element = std::make_shared(NVCVOSDType::NVCV_OSD_POINT, &point); + curVec.push_back(element); + } + elementVec.push_back(curVec); + } + + std::shared_ptr ctx = std::make_shared(elementVec); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + numElem * sizeof(int) * 16); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::OSD op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &ctx](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, (NVCVElements)ctx.get()); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using OSDTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(OSD, NVBENCH_TYPE_AXES(OSDTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("numElem", {100}); diff --git a/bench/BenchPadAndStack.cpp b/bench/BenchPadAndStack.cpp new file mode 100644 index 00000000..18a36c38 --- /dev/null +++ b/bench/BenchPadAndStack.cpp @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void PadAndStack(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long3 dstShape = srcShape; + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + + float borderValue{0.f}; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T) + srcShape.x * sizeof(int) * 2); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::PadAndStack op; + + // clang-format off + + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + nvcv::Tensor top({{srcShape.x, 1, 1, 1}, "NHWC"}, nvcv::TYPE_S32); + nvcv::Tensor left({{srcShape.x, 1, 1, 1}, "NHWC"}, nvcv::TYPE_S32); + + benchutils::FillTensor(top, [&srcShape](const long4 &){ return srcShape.y / 2; }); + benchutils::FillTensor(left, [&srcShape](const long4 &){ return srcShape.z / 2; }); + + if (varShape < 0) // negative var shape means use Tensor + { + throw std::invalid_argument("Tensor not implemented for this operator"); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &top, &left, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, top, left, borderType, borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using PadAndStackTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(PadAndStack, NVBENCH_TYPE_AXES(PadAndStackTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {0}) + .add_string_axis("border", {"REFLECT101"}); diff --git a/bench/BenchPairwiseMatcher.cpp b/bench/BenchPairwiseMatcher.cpp new file mode 100644 index 00000000..4d7ebaf2 --- /dev/null +++ b/bench/BenchPairwiseMatcher.cpp @@ -0,0 +1,108 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void PairwiseMatcher(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + + int matchesPerPoint = static_cast(state.get_int64("matchesPerPoint")); + + bool crossCheck = state.get_string("crossCheck") == "T"; + bool readNumSets = state.get_string("readNumSets") == "T"; + bool writeDistances = state.get_string("writeDistances") == "T"; + + NVCVNormType normType = benchutils::GetNormType(state.get_string("normType")); + + NVCVPairwiseMatcherType algoChoice; + + if (state.get_string("algoChoice") == "BRUTE_FORCE") + { + algoChoice = NVCV_BRUTE_FORCE; + } + else + { + throw std::invalid_argument("Unexpected algorithm choice = " + state.get_string("algoChoice")); + } + + int maxMatches = shape.y * matchesPerPoint; + + cvcuda::PairwiseMatcher op(algoChoice); + + state.add_global_memory_reads((crossCheck ? 3 : 2) * shape.x * shape.y * shape.z * sizeof(ST)); + state.add_global_memory_writes(shape.x * (sizeof(int) + maxMatches * (2 * sizeof(int) + sizeof(float)))); + + // clang-format off + + nvcv::Tensor set1({{shape.x, shape.y, shape.z}, "NMD"}, benchutils::GetDataType()); + nvcv::Tensor set2({{shape.x, shape.y, shape.z}, "NMD"}, benchutils::GetDataType()); + + nvcv::Tensor matches({{shape.x, maxMatches, 2}, "NMD"}, nvcv::TYPE_S32); + + nvcv::Tensor numMatches({{shape.x}, "N"}, nvcv::TYPE_S32); + + nvcv::Tensor numSet1, numSet2, distances; + + if (readNumSets) + { + numSet1 = nvcv::Tensor({{shape.x}, "N"}, nvcv::TYPE_S32); + numSet2 = nvcv::Tensor({{shape.x}, "N"}, nvcv::TYPE_S32); + + benchutils::FillTensor(numSet1, [&shape](const long4 &){ return shape.y; }); + benchutils::FillTensor(numSet2, [&shape](const long4 &){ return shape.y; }); + } + if (writeDistances) + { + distances = nvcv::Tensor({{shape.x, maxMatches}, "NM"}, nvcv::TYPE_F32); + } + + benchutils::FillTensor(set1, benchutils::RandomValues()); + benchutils::FillTensor(set2, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &set1, &set2, &numSet1, &numSet2, &matches, &numMatches, &distances, &crossCheck, + &matchesPerPoint, &normType](nvbench::launch &launch) + { + op(launch.get_stream(), set1, set2, numSet1, numSet2, matches, numMatches, distances, crossCheck, + matchesPerPoint, normType); + }); +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using PairwiseMatcherTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(PairwiseMatcher, NVBENCH_TYPE_AXES(PairwiseMatcherTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x10000x32"}) + .add_int64_axis("matchesPerPoint", {1}) + .add_string_axis("crossCheck", {"T"}) + .add_string_axis("readNumSets", {"F"}) + .add_string_axis("writeDistances", {"T"}) + .add_string_axis("normType", {"HAMMING"}) + .add_string_axis("algoChoice", {"BRUTE_FORCE"}); diff --git a/bench/BenchPillowResize.cpp b/bench/BenchPillowResize.cpp new file mode 100644 index 00000000..359480e2 --- /dev/null +++ b/bench/BenchPillowResize.cpp @@ -0,0 +1,105 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void PillowResize(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + long3 dstShape; + + if (state.get_string("resizeType") == "EXPAND") + { + dstShape = long3{srcShape.x, srcShape.y * 2, srcShape.z * 2}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + dstShape = long3{srcShape.x, srcShape.y / 2, srcShape.z / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + nvcv::Size2D srcSize{(int)srcShape.z, (int)srcShape.y}; + nvcv::Size2D dstSize{(int)dstShape.z, (int)dstShape.y}; + + nvcv::DataType dtype{benchutils::GetDataType()}; + nvcv::ImageFormat fmt(nvcv::MemLayout::PITCH_LINEAR, dtype.dataKind(), nvcv::Swizzle::S_X000, dtype.packing()); + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::PillowResize op; + cvcuda::UniqueWorkspace ws + = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements(srcShape.x, srcSize, dstSize, fmt)); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, dtype); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, dtype); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &ws, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), ws.get(), src, dst, interpType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &ws, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), ws.get(), src, dst, interpType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using PillowResizeTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(PillowResize, NVBENCH_TYPE_AXES(PillowResizeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("resizeType", {"CONTRACT"}) + .add_string_axis("interpolation", {"CUBIC"}); diff --git a/bench/BenchRandomResizedCrop.cpp b/bench/BenchRandomResizedCrop.cpp new file mode 100644 index 00000000..b7f58c57 --- /dev/null +++ b/bench/BenchRandomResizedCrop.cpp @@ -0,0 +1,103 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void RandomResizedCrop(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + long3 dstShape; + + if (state.get_string("resizeType") == "EXPAND") + { + dstShape = long3{srcShape.x, srcShape.y * 2, srcShape.z * 2}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + dstShape = long3{srcShape.x, srcShape.y / 2, srcShape.z / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + double minScale = 0.08; + double maxScale = 1.0; + double minRatio = 0.5; + double maxRatio = 2.0; + uint32_t seed = 1234; + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::RandomResizedCrop op(minScale, maxScale, minRatio, maxRatio, srcShape.x, seed); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, interpType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, interpType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using RandomResizedCropTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(RandomResizedCrop, NVBENCH_TYPE_AXES(RandomResizedCropTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("resizeType", {"EXPAND"}) + .add_string_axis("interpolation", {"LINEAR"}); diff --git a/bench/BenchReformat.cpp b/bench/BenchReformat.cpp new file mode 100644 index 00000000..26e8f28c --- /dev/null +++ b/bench/BenchReformat.cpp @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Reformat(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Reformat op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, 1, shape.y, shape.z}, "NCHW"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ReformatTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Reformat, NVBENCH_TYPE_AXES(ReformatTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchRemap.cpp b/bench/BenchRemap.cpp new file mode 100644 index 00000000..7fc20600 --- /dev/null +++ b/bench/BenchRemap.cpp @@ -0,0 +1,120 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Remap(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + long3 dstShape = srcShape; + long3 mapShape; + + NVCVInterpolationType srcInterp, mapInterp; + NVCVBorderType borderType; + NVCVRemapMapValueType mapValueType; + + bool alignCorners{true}; + float4 borderValue{0, 0, 0, 0}; + + if (state.get_string("mapType") == "DENSE") + { + srcInterp = NVCV_INTERP_NEAREST; + mapInterp = NVCV_INTERP_NEAREST; + borderType = NVCV_BORDER_CONSTANT; + mapValueType = NVCV_REMAP_ABSOLUTE_NORMALIZED; + mapShape = srcShape; + } + else if (state.get_string("mapType") == "RELATIVE") + { + srcInterp = NVCV_INTERP_CUBIC; + mapInterp = NVCV_INTERP_CUBIC; + borderType = NVCV_BORDER_REFLECT101; + mapValueType = NVCV_REMAP_RELATIVE_NORMALIZED; + mapShape = long3{srcShape.x, 4, 4}; + } + else + { + throw std::invalid_argument("Invalid mapType = " + state.get_string("mapType")); + } + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T) + + mapShape.x * mapShape.y * mapShape.z * sizeof(float2)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::Remap op; + + // clang-format off + + nvcv::Tensor map({{mapShape.x, mapShape.y, mapShape.z, 1}, "NHWC"}, nvcv::TYPE_2F32); + + benchutils::FillTensor(map, benchutils::RandomValues()); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &map, &srcInterp, &mapInterp, &mapValueType, &alignCorners, &borderType, + &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, map, srcInterp, mapInterp, mapValueType, alignCorners, borderType, + borderValue); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &map, &srcInterp, &mapInterp, &mapValueType, &alignCorners, &borderType, + &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, map, srcInterp, mapInterp, mapValueType, alignCorners, borderType, + borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using RemapTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Remap, NVBENCH_TYPE_AXES(RemapTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("mapType", {"DENSE"}); diff --git a/bench/BenchResize.cpp b/bench/BenchResize.cpp new file mode 100644 index 00000000..7446a6f8 --- /dev/null +++ b/bench/BenchResize.cpp @@ -0,0 +1,97 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Resize(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + long3 dstShape; + + if (state.get_string("resizeType") == "EXPAND") + { + dstShape = long3{srcShape.x, srcShape.y * 2, srcShape.z * 2}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + dstShape = long3{srcShape.x, srcShape.y / 2, srcShape.z / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::Resize op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, interpType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::ImageBatchVarShape dst(dstShape.x); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + benchutils::FillImageBatch(dst, long2{dstShape.z, dstShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, interpType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ResizeTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Resize, NVBENCH_TYPE_AXES(ResizeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("resizeType", {"EXPAND"}) + .add_string_axis("interpolation", {"LINEAR"}); diff --git a/bench/BenchRotate.cpp b/bench/BenchRotate.cpp new file mode 100644 index 00000000..4f4af05c --- /dev/null +++ b/bench/BenchRotate.cpp @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Rotate(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + double angleDeg = 123.456; + double2 shift{12.34, 12.34}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Rotate op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &angleDeg, &shift, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, angleDeg, shift, interpType); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor angleDegTensor({{shape.x}, "N"}, nvcv::TYPE_F64); + nvcv::Tensor shiftTensor({{shape.x, 2}, "NW"}, nvcv::TYPE_F64); + + benchutils::FillTensor(angleDegTensor, [&angleDeg](const long4 &){ return angleDeg; }); + benchutils::FillTensor(shiftTensor, + [&shift](const long4 &c){ return nvcv::cuda::GetElement(shift, c.y); }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &angleDegTensor, &shiftTensor, &interpType](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, angleDegTensor, shiftTensor, interpType); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using RotateTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Rotate, NVBENCH_TYPE_AXES(RotateTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("interpolation", {"CUBIC"}); diff --git a/bench/BenchSIFT.cpp b/bench/BenchSIFT.cpp new file mode 100644 index 00000000..804c3598 --- /dev/null +++ b/bench/BenchSIFT.cpp @@ -0,0 +1,109 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void SIFT(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int capacity = static_cast(state.get_int64("maxCapacity")); + int numOctLayers = static_cast(state.get_int64("numOctaveLayers")); + float contThr = static_cast(state.get_float64("contrastThreshold")); + float edgeThr = static_cast(state.get_float64("edgeThreshold")); + float initSigma = static_cast(state.get_float64("initSigma")); + + NVCVSIFTFlagType flags; + + int3 maxShape; + + if (state.get_string("expandInput") == "Y") + { + flags = NVCV_SIFT_USE_EXPANDED_INPUT; + maxShape = int3{(int)shape.z * 2, (int)shape.y * 2, (int)shape.x}; + } + else if (state.get_string("expandInput") == "N") + { + flags = NVCV_SIFT_USE_ORIGINAL_INPUT; + maxShape = int3{(int)shape.z, (int)shape.y, (int)shape.x}; + } + else + { + throw std::invalid_argument("Invalid expandInput = " + state.get_string("expandInput")); + } + + // Each pyramid has shape approximately (3 + L) * N * (2 HW size) * F32 + std::size_t pyrSize = (numOctLayers + 3) * shape.x * (maxShape.x * maxShape.y * 2) * sizeof(float); + + // R/W bandwidth rationale: + // 1 read of input (U8) to build (F32) pyramids, 1 read of Gauss and 1 read of DoG pyramids + // 1 write of Gauss and 1 write of DoG pyramids, 1 write of 4 output data + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + 2 * pyrSize); + state.add_global_memory_writes(2 * pyrSize + shape.x * sizeof(int) + + shape.x * capacity * (sizeof(float4) + sizeof(float3) + 128 * sizeof(T))); + + cvcuda::SIFT op(maxShape, numOctLayers); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, nvcv::TYPE_U8); + nvcv::Tensor dstC({{shape.x, capacity}, "NM"}, nvcv::TYPE_4F32); + nvcv::Tensor dstM({{shape.x, capacity}, "NM"}, nvcv::TYPE_3F32); + nvcv::Tensor dstD({{shape.x, capacity, 128}, "NMD"}, nvcv::TYPE_U8); + nvcv::Tensor dstN({{shape.x}, "N"}, nvcv::TYPE_S32); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dstC, &dstM, &dstD, &dstN, &numOctLayers, &contThr, &edgeThr, &initSigma, &flags] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dstC, dstM, dstD, dstN, numOctLayers, contThr, edgeThr, initSigma, flags); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + throw std::invalid_argument("ImageBatchVarShape not implemented for this operator"); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using SIFTTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(SIFT, NVBENCH_TYPE_AXES(SIFTTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_int64_axis("maxCapacity", {10000}) + .add_int64_axis("numOctaveLayers", {3}) + .add_float64_axis("contrastThreshold", {0.04}) + 
.add_float64_axis("edgeThreshold", {10.0}) + .add_float64_axis("initSigma", {1.6}) + .add_string_axis("expandInput", {"Y"}); diff --git a/bench/BenchStack.cpp b/bench/BenchStack.cpp new file mode 100644 index 00000000..ebd3c079 --- /dev/null +++ b/bench/BenchStack.cpp @@ -0,0 +1,68 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Stack(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + + using BT = typename nvcv::cuda::BaseType; + + int ch = nvcv::cuda::NumElements; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Stack op; + + // clang-format off + + nvcv::TensorBatch src(nvcv::TensorBatch::CalcRequirements(shape.x)); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + + for (int i = 0 ; i < shape.x; i++) + { + nvcv::Tensor srcIn({{shape.y, shape.z, ch}, "HWC"}, benchutils::GetDataType()); + benchutils::FillTensor(srcIn, benchutils::RandomValues()); + src.pushBack(srcIn); + } + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst); + }); + +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using StackTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Stack, NVBENCH_TYPE_AXES(StackTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"10x1080x1920"}); diff --git a/bench/BenchThreshold.cpp b/bench/BenchThreshold.cpp new file mode 100644 index 00000000..648a83ac --- /dev/null +++ b/bench/BenchThreshold.cpp @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void Threshold(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + uint32_t threshType = NVCV_THRESH_BINARY | (std::is_same_v ? 
NVCV_THRESH_OTSU : 0); + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::Threshold op(threshType, shape.x); + + // clang-format off + + nvcv::Tensor thresh({{shape.x}, "N"}, nvcv::TYPE_F64); + nvcv::Tensor maxval({{shape.x}, "N"}, nvcv::TYPE_F64); + + benchutils::FillTensor(thresh, benchutils::RandomValues()); + benchutils::FillTensor(maxval, benchutils::RandomValues()); + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &thresh, &maxval](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, thresh, maxval); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &thresh, &maxval](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, thresh, maxval); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ThresholdTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(Threshold, NVBENCH_TYPE_AXES(ThresholdTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}); diff --git a/bench/BenchUtils.hpp b/bench/BenchUtils.hpp new file mode 100644 index 00000000..3875928e --- /dev/null +++ b/bench/BenchUtils.hpp @@ -0,0 +1,324 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CVCUDA_BENCH_UTILS_HPP +#define CVCUDA_BENCH_UTILS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define CVCUDA_CHECK_DATA(data) \ + if (!data) \ + { \ + throw std::runtime_error("Invalid data"); \ + } + +#define CUDA_CHECK_ERROR(RC) \ + { \ + benchutils::cudaCheckError((RC), __FILE__, __LINE__); \ + } + +namespace benchutils { + +inline void cudaCheckError(cudaError_t code, const char *file, int line) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "\nE In CUDA: %s %s %d\n", cudaGetErrorString(code), file, line); + exit(code); + } +} + +template> +inline RT GetShape(const std::string &shapeStr, const std::string &delimiter = "x") +{ + std::string str = shapeStr; + RT shape; + for (int i = 0; i < N; ++i) + { + size_t pos = str.find(delimiter); + + if ((pos == std::string::npos && i != (N - 1)) || (pos != std::string::npos && i == (N - 1))) + { + throw std::invalid_argument("Expecting " + std::to_string(N) + "-rank shape in " + shapeStr + + " (pass shape separated by " + delimiter + ")"); + } + + nvcv::cuda::GetElement(shape, i) = std::stoi(str.substr(0, pos)); + + str.erase(0, pos + delimiter.length()); + } + + return shape; +} + +template +inline nvcv::DataType GetDataType() +{ +#define CVCUDA_BENCH_GET_DATA_TYPE(TYPE, DATA_TYPE) \ + if constexpr (std::is_same_v) \ + { \ + return DATA_TYPE; \ + } + + CVCUDA_BENCH_GET_DATA_TYPE(uint8_t, nvcv::TYPE_U8); + CVCUDA_BENCH_GET_DATA_TYPE(uint16_t, nvcv::TYPE_U16); + CVCUDA_BENCH_GET_DATA_TYPE(uint32_t, nvcv::TYPE_U32); + + CVCUDA_BENCH_GET_DATA_TYPE(uchar3, nvcv::TYPE_3U8); + CVCUDA_BENCH_GET_DATA_TYPE(uchar4, nvcv::TYPE_4U8); + CVCUDA_BENCH_GET_DATA_TYPE(float, nvcv::TYPE_F32); + + CVCUDA_BENCH_GET_DATA_TYPE(float3, nvcv::TYPE_3F32); + CVCUDA_BENCH_GET_DATA_TYPE(float4, nvcv::TYPE_4F32); + + CVCUDA_BENCH_GET_DATA_TYPE(int, nvcv::TYPE_S32); + + CVCUDA_BENCH_GET_DATA_TYPE(short, nvcv::TYPE_S16); + + CVCUDA_BENCH_GET_DATA_TYPE(ushort3, nvcv::TYPE_3U16); + CVCUDA_BENCH_GET_DATA_TYPE(ushort4, nvcv::TYPE_4U16); + CVCUDA_BENCH_GET_DATA_TYPE(short4, nvcv::TYPE_4S16); + +#undef CVCUDA_BENCH_GET_DATA_TYPE + + throw std::invalid_argument("Unexpected data type"); +} + +template +inline nvcv::ImageFormat GetFormat() +{ + return nvcv::ImageFormat{GetDataType()}; +} + +inline NVCVBorderType GetBorderType(const std::string &border) +{ +#define CVCUDA_BENCH_GET_BORDER_TYPE(BORDER) \ + if (border == #BORDER) \ + { \ + return NVCV_BORDER_##BORDER; \ + } + + CVCUDA_BENCH_GET_BORDER_TYPE(CONSTANT); + CVCUDA_BENCH_GET_BORDER_TYPE(REPLICATE); + CVCUDA_BENCH_GET_BORDER_TYPE(REFLECT); + CVCUDA_BENCH_GET_BORDER_TYPE(WRAP); + CVCUDA_BENCH_GET_BORDER_TYPE(REFLECT101); + +#undef CVCUDA_BENCH_GET_BORDER_TYPE + + throw std::invalid_argument("Unexpected border type = " + border); +} + +inline NVCVNormType GetNormType(const std::string &normType) +{ +#define CVCUDA_BENCH_GET_NORM_TYPE(NORM) \ + if (normType == #NORM) \ + { \ + return NVCV_NORM_##NORM; \ + } + + CVCUDA_BENCH_GET_NORM_TYPE(HAMMING); + CVCUDA_BENCH_GET_NORM_TYPE(L1); + CVCUDA_BENCH_GET_NORM_TYPE(L2); + +#undef CVCUDA_BENCH_GET_NORM_TYPE + + throw std::invalid_argument("Unexpected norm type = " + normType); +} + +inline NVCVInterpolationType GetInterpolationType(const std::string &interpolation) +{ +#define CVCUDA_BENCH_GET_INTERPOLATION_TYPE(INTERP) \ + if (interpolation == #INTERP) \ + { \ + return NVCV_INTERP_##INTERP; \ + } + + 
CVCUDA_BENCH_GET_INTERPOLATION_TYPE(NEAREST); + CVCUDA_BENCH_GET_INTERPOLATION_TYPE(LINEAR); + CVCUDA_BENCH_GET_INTERPOLATION_TYPE(CUBIC); + CVCUDA_BENCH_GET_INTERPOLATION_TYPE(AREA); + +#undef CVCUDA_BENCH_GET_INTERPOLATION_TYPE + + throw std::invalid_argument("Unexpected interpolation type = " + interpolation); +} + +template, const T, T>> +inline RT &ValueAt(VecType &vec, const ST &strides, const ST &coord) +{ + return *reinterpret_cast(&vec[nvcv::cuda::dot(coord, strides)]); +} + +static std::default_random_engine DefaultGenerator(unsigned long int seed = 0) +{ + static std::default_random_engine defaultRandomGenerator{std::random_device{}()}; + + defaultRandomGenerator.seed(seed); + + return defaultRandomGenerator; +} + +template +struct Randomizer +{ + using BT = nvcv::cuda::BaseType; + using RE = std::default_random_engine; + using UD = std::conditional_t, std::uniform_real_distribution, + std::uniform_int_distribution>; + + VT operator()() + { + VT ret; + for (int i = 0; i < nvcv::cuda::NumElements; ++i) + { + nvcv::cuda::GetElement(ret, i) = uniformDistribution(randomGenerator); + } + return ret; + } + + VT operator()(const long4 &) + { + return operator()(); + } + + UD uniformDistribution; + RE randomGenerator; +}; + +template, typename BT = typename R::BT, typename RE = typename R::RE, + typename UD = typename R::UD> +inline auto RandomValues(BT min = std::is_integral_v ? nvcv::cuda::TypeTraits::min : -1, + BT max = std::is_integral_v ? nvcv::cuda::TypeTraits::max : +1, + RE rng = DefaultGenerator()) +{ + return R{UD(min, max), rng}; +} + +template +inline void FillBuffer(std::vector &vec, const ST &shape, const ST &strides, VG valuesGenerator) +{ + for (long x = 0; x < (nvcv::cuda::NumElements >= 1 ? nvcv::cuda::GetElement(shape, 0) : 1); ++x) + { + for (long y = 0; y < (nvcv::cuda::NumElements >= 2 ? nvcv::cuda::GetElement(shape, 1) : 1); ++y) + { + for (long z = 0; z < (nvcv::cuda::NumElements >= 3 ? nvcv::cuda::GetElement(shape, 2) : 1); ++z) + { + for (long w = 0; w < (nvcv::cuda::NumElements == 4 ? 
nvcv::cuda::GetElement(shape, 3) : 1); ++w) + { + long4 coord{x, y, z, w}; + ST stCoord = nvcv::cuda::DropCast>(coord); + + ValueAt(vec, strides, stCoord) = valuesGenerator(coord); + } + } + } + } +} + +template +inline void FillTensor(const nvcv::Tensor &tensor, VG valuesGenerator) +{ + using longR = nvcv::cuda::MakeType; + + auto tensorData = tensor.exportData(); + CVCUDA_CHECK_DATA(tensorData); + + longR strides, shape; + + for (int i = 0; i < RANK; ++i) + { + nvcv::cuda::GetElement(strides, i) = tensorData->stride(i); + nvcv::cuda::GetElement(shape, i) = tensorData->shape(i); + } + + long bufSize{nvcv::cuda::GetElement(strides, 0) * nvcv::cuda::GetElement(shape, 0)}; + + std::vector tensorVec(bufSize); + + FillBuffer(tensorVec, shape, strides, valuesGenerator); + + CUDA_CHECK_ERROR(cudaMemcpy(tensorData->basePtr(), tensorVec.data(), bufSize, cudaMemcpyHostToDevice)); +} + +template +inline void FillTensor(const nvcv::Tensor &tensor, VG valuesGenerator) +{ + switch (tensor.rank()) + { +#define CVCUDA_BENCH_FILL_TENSOR_CASE(RANK) \ +case RANK: \ + FillTensor(tensor, valuesGenerator); \ + break + + CVCUDA_BENCH_FILL_TENSOR_CASE(1); + CVCUDA_BENCH_FILL_TENSOR_CASE(2); + CVCUDA_BENCH_FILL_TENSOR_CASE(3); + CVCUDA_BENCH_FILL_TENSOR_CASE(4); + +#undef CVCUDA_BENCH_FILL_TENSOR_CASE + default: + throw std::invalid_argument("Tensor has rank not in [1, 4]"); + } +} + +template +inline void FillImageBatch(nvcv::ImageBatchVarShape &imageBatch, long2 size, long2 varSize, VG valuesGenerator) +{ + auto randomWidth = RandomValues(static_cast(size.x - varSize.x), static_cast(size.x)); + auto randomHeight = RandomValues(static_cast(size.y - varSize.y), static_cast(size.y)); + + for (int i = 0; i < imageBatch.capacity(); ++i) + { + nvcv::Image image(nvcv::Size2D{randomWidth(), randomHeight()}, GetFormat()); + + auto data = image.exportData(); + CVCUDA_CHECK_DATA(data); + + long2 strides{data->plane(0).rowStride, sizeof(VT)}; + long2 shape{data->plane(0).height, data->plane(0).width}; + + std::vector imageBuffer(strides.x * shape.x); + + FillBuffer(imageBuffer, shape, strides, valuesGenerator); + + CUDA_CHECK_ERROR(cudaMemcpy2D(data->plane(0).basePtr, strides.x, imageBuffer.data(), strides.x, strides.x, + data->plane(0).height, cudaMemcpyHostToDevice)); + + imageBatch.pushBack(image); + } +} + +} // namespace benchutils + +#endif // CVCUDA_BENCH_UTILS_HPP diff --git a/bench/BenchWarpAffine.cpp b/bench/BenchWarpAffine.cpp new file mode 100644 index 00000000..459c3b32 --- /dev/null +++ b/bench/BenchWarpAffine.cpp @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void WarpAffine(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + int flags = interpType | ((state.get_string("inverseMap") == "Y") ? NVCV_WARP_INVERSE_MAP : 0); + + float4 borderValue{0, 0, 0, 0}; + + NVCVAffineTransform transMatrix{2.f, 2.f, 0.f, 3.f, 1.f, 0.f}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + 6 * sizeof(float)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::WarpAffine op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &transMatrix, &flags, &borderType, &borderValue] + (nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, transMatrix, flags, borderType, borderValue); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor transMatrixTensor({{shape.x, 6}, "NW"}, nvcv::TYPE_F32); + + benchutils::FillTensor(transMatrixTensor, [&transMatrix](const long4 &c){ return transMatrix[c.y]; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &transMatrixTensor, &flags, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, transMatrixTensor, flags, borderType, borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using WarpAffineTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(WarpAffine, NVBENCH_TYPE_AXES(WarpAffineTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("border", {"REFLECT"}) + .add_string_axis("interpolation", {"CUBIC"}) + .add_string_axis("inverseMap", {"Y"}); diff --git a/bench/BenchWarpPerspective.cpp b/bench/BenchWarpPerspective.cpp new file mode 100644 index 00000000..87498612 --- /dev/null +++ b/bench/BenchWarpPerspective.cpp @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void WarpPerspective(nvbench::state &state, nvbench::type_list) +try +{ + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + int flags = interpType | ((state.get_string("inverseMap") == "Y") ? NVCV_WARP_INVERSE_MAP : 0); + + float4 borderValue{0, 0, 0, 0}; + + NVCVPerspectiveTransform transMatrix{0.27, 0.16, 0.00, -0.11, 0.61, 0.65, -0.09, 0.06, 1.00}; + + state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T) + 9 * sizeof(float)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + + cvcuda::WarpPerspective op(shape.x); + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &transMatrix, &flags, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, transMatrix, flags, borderType, borderValue); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + + benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + dst.pushBack(src.begin(), src.end()); + + nvcv::Tensor transMatrixTensor({{shape.x, 9}, "NW"}, nvcv::TYPE_F32); + + benchutils::FillTensor(transMatrixTensor, [&transMatrix](const long4 &c){ return transMatrix[c.y]; }); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &transMatrixTensor, &flags, &borderType, &borderValue](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, transMatrixTensor, flags, borderType, borderValue); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using WarpPerspectiveTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(WarpPerspective, NVBENCH_TYPE_AXES(WarpPerspectiveTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1}) + .add_string_axis("border", {"REFLECT"}) + .add_string_axis("interpolation", {"CUBIC"}) + .add_string_axis("inverseMap", {"Y"}); diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt new file mode 100644 index 00000000..67fd8c5f --- /dev/null +++ b/bench/CMakeLists.txt @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
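+
+# Build notes (summary of the logic below): every Bench*.cpp listed in
+# bench_sources becomes its own nvbench executable, named cvcuda_bench_<name>
+# (lower-cased), linked against nvbench::main and cvcuda, and attached to the
+# bench_all metatarget. These benchmarks are intended to be configured with the
+# BUILD_BENCH CMake option enabled (ci/build.sh turns it on via the new
+# "profile" build type), and can then be run in one go with bench/run_bench.py.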
+ +project(cvcuda_bench) + +set(bench_sources + BenchMinMaxLoc.cpp + BenchNMS.cpp + BenchRemap.cpp + BenchGaussian.cpp + BenchLaplacian.cpp + BenchAverageBlur.cpp + BenchMedianBlur.cpp + BenchBilateralFilter.cpp + BenchSIFT.cpp + BenchReformat.cpp + BenchResize.cpp + BenchFlip.cpp + BenchRotate.cpp + BenchPillowResize.cpp + BenchCenterCrop.cpp + BenchWarpPerspective.cpp + BenchWarpAffine.cpp + BenchThreshold.cpp + BenchRandomResizedCrop.cpp + BenchNormalize.cpp + BenchMorphology.cpp + BenchAdaptiveThreshold.cpp + BenchCvtColor.cpp + BenchAdvCvtColor.cpp + BenchBndBox.cpp + BenchBoxBlur.cpp + BenchBrightnessContrast.cpp + BenchChannelReorder.cpp + BenchColorTwist.cpp + BenchComposite.cpp + BenchConv2D.cpp + BenchConvertTo.cpp + BenchCopyMakeBorder.cpp + BenchCropFlipNormalizeReformat.cpp + BenchCustomCrop.cpp + BenchErase.cpp + BenchFindContours.cpp + BenchGammaContrast.cpp + BenchGaussianNoise.cpp + BenchHistogramEq.cpp + BenchHistogram.cpp + BenchInpaint.cpp + BenchJointBilateralFilter.cpp + BenchMinAreaRect.cpp + BenchPadAndStack.cpp + BenchOSD.cpp + BenchLabel.cpp + BenchPairwiseMatcher.cpp + BenchStack.cpp +) + +# Metatarget for all benchmarks +add_custom_target(bench_all) + +foreach(bench_source IN LISTS bench_sources) + get_filename_component(bench_file_name "${bench_source}" NAME_WLE) + string(REPLACE "Bench" "cvcuda_bench_" algo_name ${bench_file_name}) + string(TOLOWER ${algo_name} bench_name) + add_executable(${bench_name} "${bench_source}") + target_include_directories(${bench_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}") + target_link_libraries(${bench_name} PRIVATE nvbench::main PUBLIC cvcuda) + set_target_properties(${bench_name} PROPERTIES COMPILE_FEATURES cuda_std_17) + add_dependencies(bench_all ${bench_name}) +endforeach() diff --git a/bench/run_bench.py b/bench/run_bench.py new file mode 100644 index 00000000..cc995ee8 --- /dev/null +++ b/bench/run_bench.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
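+
+# Helper that runs every cvcuda_bench_* executable found in the given folder
+# with "--csv", drops measurements marked as Skipped, concatenates the
+# per-benchmark tables into bench_output.csv, and prints the mean BWUtil
+# (achieved bandwidth utilization) per benchmark.
+# Usage: python3 run_bench.py <bench_folder> [extra args passed to each benchmark]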
+ +import os +import sys +import time +import subprocess +import pandas as pd + + +BENCH_PREFIX = "cvcuda_bench_" +BENCH_OUTPUT = "out.csv" +BENCH_COMMAND = "{} {} --csv {}" +BENCH_COLNAME = "Benchmark" +BENCH_RESULTS = "bench_output.csv" +BENCH_COLUMNS = {"Benchmark", "BWUtil", "Skipped"} +BANDWIDTH_COLNAME = "BWUtil" + + +if __name__ == "__main__": + if len(sys.argv) == 1: + print( + "E At least one argument must be provided: benchmark folder" + f"I Usage: {sys.argv[0]} bench_folder [extra args for benchmarks]" + ) + sys.exit(1) + + bench_args = " ".join(sys.argv[2:]) if len(sys.argv) > 2 else "" + bench_folder = sys.argv[1] + bench_files = [fn for fn in sorted(os.listdir(bench_folder)) if BENCH_PREFIX in fn] + + if len(bench_files) == 0: + print(f"E No benchmarks found in {bench_folder}") + sys.exit(1) + + print(f"I Found {len(bench_files)} benchmark(s) in {bench_folder} to run") + + l_df = [] + + for filename in bench_files: + filepath = os.path.join(bench_folder, filename) + + cmd = BENCH_COMMAND.format(filepath, bench_args, BENCH_OUTPUT) + + print(f'I Running "{cmd}"', end=" ") + + beg = time.time() + subprocess.run(cmd.split(), stdout=subprocess.PIPE) + end = time.time() + + print(f"took {end - beg:.03f} sec") + + if os.path.exists(BENCH_OUTPUT) is False or os.path.getsize(BENCH_OUTPUT) == 0: + print("W Skipping as benchmark output does not exist or is empty") + continue + + df = pd.read_csv(BENCH_OUTPUT) + + if not BENCH_COLUMNS.issubset(df.columns): + print(f"W Skipping as benchmark output does not have: {BENCH_COLUMNS}") + continue + + df = df[df["Skipped"] == "No"] + + os.remove(BENCH_OUTPUT) + + if len(df) > 0: + l_df.append(df) + + df = pd.concat(l_df, axis=0) + df = df.reset_index(drop=True) + + filepath = os.path.join(bench_folder, BENCH_RESULTS) + + df.to_csv(filepath) + + print(f"I Full results written to {filepath}") + + df = df.groupby("Benchmark")["BWUtil"].mean() + + pd.options.display.float_format = "{:.2%}".format + + print(f"I Summary results:\n{df}") diff --git a/ci/build.sh b/ci/build.sh index d13d719f..bcbbec23 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -e -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,7 +37,7 @@ source_dir="$SDIR/.." if [[ $# -ge 1 ]]; then case $1 in - debug|release) + debug|release|profile) build_type=$1 if [[ $# -ge 2 ]]; then build_dir=$2 @@ -77,10 +77,13 @@ else fi if [ "$PYTHON_VERSIONS" ]; then - cmake_args="-DPYTHON_VERSIONS=$PYTHON_VERSIONS" + cmake_args="$cmake_args -DPYTHON_VERSIONS=$PYTHON_VERSIONS" fi case $build_type in + profile) + cmake_args="$cmake_args -DCMAKE_BUILD_TYPE=Release -DBUILD_BENCH=1" + ;; release) cmake_args="$cmake_args -DCMAKE_BUILD_TYPE=Release" ;; @@ -132,7 +135,7 @@ cmake -B "$build_dir" "$source_dir" \ $user_args # Build CV-CUDA -cmake --build "$build_dir" -- $MAKE_OPTS +cmake --build "$build_dir" --parallel 8 -- $MAKE_OPTS # Show ccache status, if available! if [[ $has_ccache ]]; then diff --git a/ci/check_formatting.sh b/ci/check_formatting.sh deleted file mode 100755 index b91d518c..00000000 --- a/ci/check_formatting.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -e - -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ $# = 0 ]; then - # No arguments? Lint all code. - echo "Linting all code in the repository ==========================" - pre-commit run -a -else - from=$1 - if [ $# = 1 ]; then - to=HEAD - elif [ $# = 2 ]; then - to=$2 - else - echo "Invalid arguments" - echo "Usage: $(basename "$0") [ref_from [ref_to]]" - exit 1 - fi - - echo "Linting files touched from commit $from to $to ==============" - echo "Files to be linted:" - git diff --stat $from..$to - if ! pre-commit run --from-ref $from --to-ref $to ; then - echo "Formatting errors:" - git diff - false - fi -fi diff --git a/cmake/ConfigCompiler.cmake b/cmake/ConfigCompiler.cmake index 5c4f834c..898a7ee8 100644 --- a/cmake/ConfigCompiler.cmake +++ b/cmake/ConfigCompiler.cmake @@ -17,6 +17,8 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -ggdb") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 -ggdb") +# Use old behavior (before CMake 3.20) for Ninja DEPFILES generators +cmake_policy(SET CMP0116 OLD) if(WARNINGS_AS_ERRORS) set(C_WARNING_ERROR_FLAG "-Werror") diff --git a/cmake/PrintConfig.cmake b/cmake/PrintConfig.cmake index b43529cc..4c58c7e8 100644 --- a/cmake/PrintConfig.cmake +++ b/cmake/PrintConfig.cmake @@ -49,6 +49,12 @@ else() message(STATUS " ENABLE_SANITIZER : off") endif() +if(BUILD_BENCH) + message(STATUS " BUILD_BENCH : ON") +else() + message(STATUS " BUILD_BENCH : off") +endif() + if(ENABLE_TEGRA) message(STATUS " ENABLE_TEGRA : ON") else() diff --git a/docker/build/Dockerfile b/docker/build/Dockerfile index 306cd2ec..8fd5fb8e 100644 --- a/docker/build/Dockerfile +++ b/docker/build/Dockerfile @@ -36,9 +36,9 @@ RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ pre-commit shellcheck \ curl \ && rm -rf /var/lib/apt/lists/* \ - && curl -L https://cmake.org/files/v3.18/cmake-3.18.6-Linux-x86_64.tar.gz --output /tmp/cmake-3.18.6.tar.gz \ - && tar -xzf /tmp/cmake-3.18.6.tar.gz -C /tmp/ && cd /tmp/cmake-3.18.6-Linux-x86_64/ \ - && cp bin/ share/ doc/ /usr/local/ -r && rm -rf /tmp/cmake-3.18.6* + && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ + && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && cd /tmp/cmake-3.20.1-linux-x86_64/ \ + && cp bin/ share/ doc/ /usr/local/ -r && rm -rf /tmp/cmake-3.20.1* # Configure ccache RUN mkdir -p /cache diff --git a/docker/config b/docker/config index e7b31e24..56cc639d 100644 --- a/docker/config +++ b/docker/config @@ -22,8 +22,9 @@ IMAGE_URL_BASE='' # image versions must be upgraded whenever a breaking # change is done, such as removing some package, or updating # packaged versions that introduces incompatibilities. 
-TAG_IMAGE=5 +TAG_IMAGE=6 TAG_IMAGE_SAMPLES=5.1 +TAG_IMAGE_TEST=5 VER_CUDA=11.7.1 VER_UBUNTU=22.04 diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile index f6600b1e..0c55129e 100644 --- a/docker/test/Dockerfile +++ b/docker/test/Dockerfile @@ -47,3 +47,10 @@ RUN set -e \ python$ver -m pip install torch numpy torchvision; \ done \ && rm -rf /root/.cache/pip + +# Other dependencies of python tests +# binutils: for readelf +RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ + && apt-get install -y --no-install-recommends \ + binutils \ + && rm -rf /var/lib/apt/lists/* diff --git a/docker/update_test_image.sh b/docker/update_test_image.sh index adce0abf..d0cc5199 100755 --- a/docker/update_test_image.sh +++ b/docker/update_test_image.sh @@ -35,7 +35,7 @@ cd "$SDIR" cd test -image=$IMAGE_URL_BASE/test-linux-x64:$TAG_IMAGE +image=$IMAGE_URL_BASE/test-linux-x64:$TAG_IMAGE_TEST docker build --network=host \ --build-arg "VER_CUDA=$VER_CUDA" \ diff --git a/docs/sphinx/content/cvcuda_oplist.csv b/docs/sphinx/content/cvcuda_oplist.csv index cd564dbb..d6b48a6a 100644 --- a/docs/sphinx/content/cvcuda_oplist.csv +++ b/docs/sphinx/content/cvcuda_oplist.csv @@ -16,7 +16,8 @@ CustomCrop,Crops an image with a given region-of-interest CvtColor,Converts an image from one color space to another DataTypeConvert,Converts an image’s data type with optional scaling Erase,Erases image regions -Find Contours,Extract closed contours from an input binary image +FindContours,Extract closed contours from an input binary image +FindHomography,Calculates a perspective transform from four pairs of the corresponding points Flip,Flips a 2D image around its axis GammaContrast,Adjusts image contrast Gaussian,Applies a gaussian blur filter to the image @@ -24,7 +25,8 @@ Gaussian Noise,Generates a statistical noise with a normal (Gaussian) distributi Histogram,Provides a grayscale value distribution showing the frequency of occurrence of each gray value. Histogram Equalizer,Allows effective spreading out the intensity range of the image typically used to improve contrast Inpainting,Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood -Joint Bilateral Filter,Provides a edge-preserving denoising filter +Joint Bilateral Filter,Reduces image noise while preserving strong edges based on a guidance image +Label,Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels Laplacian,Applies a Laplace transform to an image MedianBlur,Reduces an image’s salt-and-pepper noise MinArea Rect,Finds the minimum area rotated rectangle typically used to draw bounding rectangle with minimum area @@ -36,6 +38,7 @@ Non-max Suppression,Enables selecting a single entity out of many overlapping on Normalize,Normalizes an image pixel’s range OSD (Polyline Line Text Rotated Rect Segmented Mask),Displays an overlay on the image of of different forms including polyline line text rotated rectangle segmented mask PadStack,Stacks several images into a tensor with border extension +PairwiseMatcher,Matches features computed separately (e.g. via the SIFT operator) in two images using the brute force method PillowResize,Changes the size and scale of an image using python-pillow algorithm RandomResizedCrop,Crops a random portion of an image and resizes it to a specified size. 
Reformat,Converts a planar image into non-planar and vice versa diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 69d72c9c..ec3667c6 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -50,7 +50,7 @@ CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find Where Are the Release Notes? ------------------ -An awesome product requires excellent support. CV-CUDA release notes can be found `here `_. +An awesome product requires excellent support. CV-CUDA release notes can be found `here `_. Where Can I Get Help? @@ -124,6 +124,7 @@ Copyright :maxdepth: 1 :hidden: + Beta.3 Beta.2 Beta.1 Beta diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index 2130f299..c37fd42f 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -32,7 +32,7 @@ Setup The following steps describe how to install cvcuda. Choose the installation method that meets your environment needs. -Download the cvcuda tar/deb package from `here `_ +Download the cvcuda tar/deb package from `here `_ * Tar File Installation diff --git a/docs/sphinx/relnotes/v0.5.0-beta.rst b/docs/sphinx/relnotes/v0.5.0-beta.rst new file mode 100644 index 00000000..bd363319 --- /dev/null +++ b/docs/sphinx/relnotes/v0.5.0-beta.rst @@ -0,0 +1,75 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.5.0-beta: + +Beta.3 +====== + +CV-CUDA 0.5.0 is a comprehensive update introducing new security, compliance, and performance enhancements, alongside bug fixes and new features. + +Release Highlights +------------------ + +CV-CUDA v0.5.0 includes significant improvements: + +* **New Operators**: + - FindHomography: Calculates a perspective transform from four pairs of the corresponding points + - Label: Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels + - PairwiseMatcher: Matches features computed separately (e.g. 
via the SIFT operator) in two images using the brute force method + +* **New Features**: + - Implemented Python class for ``TensorBatch``, a container type that can hold a list of non-uniformly shaped tensors + - Added support for RGBD image formats + - Enhanced documentation + +* **Bug Fixes**: + - Resolved memory leak in NvBlurBoxes + - Fixed segmentation fault issue in Python with certain imports + - Corrected typestr format issue in `__cuda_array_interface__` + - Addressed occasional hanging in OpBoxBlur on RGBA images + +Compatibility +------------- + +* Continues to support GPU Compute Capability: 7+.x +* Compatible with Ubuntu x86_64: 20.04, 22.04 +* CUDA Toolkit: 11.7+ (11.2+ for library build and run) +* GCC: 11.0+ (9.0 and 10.0 for APIs, with pre-built binary and run) +* Python: 3.7, 3.8, 3.10 + +Known Issues/Limitations +------------------------ + +* No new known issues or limitations are reported for this release. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/docs/sphinx/samples/python_samples/classification.rst b/docs/sphinx/samples/python_samples/classification.rst index 8356d46d..11961162 100644 --- a/docs/sphinx/samples/python_samples/classification.rst +++ b/docs/sphinx/samples/python_samples/classification.rst @@ -171,7 +171,7 @@ The top 5 classification results for the tabby_cat_tiger.jpg image is as follows user@machine:~/cvcuda/samples$ python3 classification/python/main.py -b 1 [perf_utils:85] 2023-07-27 22:27:17 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 22:27:17 INFO Using CV-CUDA version: 0.4.0-beta + [perf_utils:89] 2023-07-27 22:27:17 INFO Using CV-CUDA version: 0.5.0-beta [pipelines:35] 2023-07-27 22:27:17 INFO Using CVCUDA as preprocessor. [torch_utils:77] 2023-07-27 22:27:17 INFO Using torchnvjpeg as decoder. [pipelines:122] 2023-07-27 22:27:17 INFO Using CVCUDA as post-processor. diff --git a/docs/sphinx/samples/python_samples/object_detection.rst b/docs/sphinx/samples/python_samples/object_detection.rst index 3e2cad01..a2d05499 100644 --- a/docs/sphinx/samples/python_samples/object_detection.rst +++ b/docs/sphinx/samples/python_samples/object_detection.rst @@ -177,7 +177,7 @@ This sample takes as input one or more images or one video and generates the obj user@machine:~/cvcuda/samples$ python3 object_detection/python/main.py [perf_utils:85] 2023-07-27 23:15:34 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 23:15:34 INFO Using CV-CUDA version: 0.4.0-beta + [perf_utils:89] 2023-07-27 23:15:34 INFO Using CV-CUDA version: 0.5.0-beta [pipelines:30] 2023-07-27 23:15:36 INFO Using CVCUDA as preprocessor. [torch_utils:77] 2023-07-27 23:15:36 INFO Using torchnvjpeg as decoder. [torch_utils:151] 2023-07-27 23:15:36 INFO Using PyTorch/PIL as encoder.
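For reference, a minimal sketch of how the new Python bindings in this patch can be exercised, assuming PyTorch for device allocation and the cvcuda.as_tensor interop helper used by the samples (shapes and values are only illustrative):

    import cvcuda
    import torch

    # Wrap two identically shaped HWC CUDA buffers as CV-CUDA tensors.
    frame_a = cvcuda.as_tensor(torch.zeros(480, 640, 3, dtype=torch.uint8, device="cuda"), "HWC")
    frame_b = cvcuda.as_tensor(torch.zeros(480, 640, 3, dtype=torch.uint8, device="cuda"), "HWC")

    # cvcuda.stack combines HWC/CHW (or NHWC/NCHW) tensors into one N(HWC/CHW) tensor.
    batch = cvcuda.stack([frame_a, frame_b])

    # cvcuda.label returns (labels, count, stats); count and stats are None unless requested.
    mask = cvcuda.as_tensor(
        torch.randint(0, 2, (1, 480, 640, 1), dtype=torch.uint8, device="cuda"), "NHWC"
    )
    labels, count, stats = cvcuda.label(mask, connectivity=cvcuda.CONNECTIVITY_4_2D,
                                        count=True, stats=True)

Both calls follow the signatures defined in python/mod_cvcuda/OpStack.cpp and OpLabel.cpp below.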
diff --git a/docs/sphinx/samples/python_samples/segmentation.rst b/docs/sphinx/samples/python_samples/segmentation.rst index c062c03e..5dd4d194 100644 --- a/docs/sphinx/samples/python_samples/segmentation.rst +++ b/docs/sphinx/samples/python_samples/segmentation.rst @@ -182,7 +182,7 @@ This sample takes as input the one or more images or one video and generates the user@machine:~/cvcuda/samples$ python3 segmentation/python/main.py -b 5 -c __background__ -o /tmp -i assets/images/ [perf_utils:85] 2023-07-27 23:17:49 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 23:17:49 INFO Using CV-CUDA version: 0.4.0-beta + [perf_utils:89] 2023-07-27 23:17:49 INFO Using CV-CUDA version: 0.5.0-beta [pipelines:35] 2023-07-27 23:17:50 INFO Using CVCUDA as preprocessor. [torch_utils:60] 2023-07-27 23:17:50 INFO Found a total of 3 JPEG images. [torch_utils:77] 2023-07-27 23:17:50 INFO Using torchnvjpeg as decoder. diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt index 87ed892d..5db4089a 100644 --- a/python/mod_cvcuda/CMakeLists.txt +++ b/python/mod_cvcuda/CMakeLists.txt @@ -21,6 +21,14 @@ nvcv_python_add_module( OUTPUT_NAME cvcuda SOURCES Main.cpp + OpPairwiseMatcher.cpp + PairwiseMatcherType.cpp + NormType.cpp + OpStack.cpp + WorkspaceCache.cpp + OpLabel.cpp + LabelType.cpp + ConnectivityType.cpp OpFindContours.cpp OpHistogramEq.cpp OpOSD.cpp @@ -77,6 +85,7 @@ nvcv_python_add_module( OpGaussianNoise.cpp OpInpaint.cpp CvtColorUtil.cpp + OpFindHomography.cpp ) target_link_libraries(cvcuda_module_python @@ -86,7 +95,16 @@ target_link_libraries(cvcuda_module_python nvcv_util_compat cvcuda nvcv_python_common + nvcv_util + cuda -lrt ) +# use exports file to expose only the symbol dl-loaded by python, +# and nothing else. +target_link_options(cvcuda_module_python + PRIVATE + -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/exports.ldscript +) + set_target_properties(cvcuda_module_python PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/python) diff --git a/python/mod_cvcuda/ConnectivityType.cpp b/python/mod_cvcuda/ConnectivityType.cpp new file mode 100644 index 00000000..8cb5d41f --- /dev/null +++ b/python/mod_cvcuda/ConnectivityType.cpp @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConnectivityType.hpp" + +#include + +namespace cvcudapy { + +void ExportConnectivityType(py::module &m) +{ + py::enum_(m, "ConnectivityType", py::arithmetic()) + .value("CONNECTIVITY_4_2D", NVCV_CONNECTIVITY_4_2D) + .value("CONNECTIVITY_6_3D", NVCV_CONNECTIVITY_6_3D) + .value("CONNECTIVITY_8_2D", NVCV_CONNECTIVITY_8_2D) + .value("CONNECTIVITY_26_2D", NVCV_CONNECTIVITY_26_3D) + .export_values(); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/ConnectivityType.hpp b/python/mod_cvcuda/ConnectivityType.hpp new file mode 100644 index 00000000..a7cc8b0a --- /dev/null +++ b/python/mod_cvcuda/ConnectivityType.hpp @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_CONNECTIVITY_TYPE_HPP +#define NVCV_PYTHON_CONNECTIVITY_TYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportConnectivityType(py::module &m); + +} // namespace cvcudapy + +#endif // NVCV_PYTHON_CONNECTIVITY_TYPE_HPP diff --git a/python/mod_cvcuda/InterpolationType.cpp b/python/mod_cvcuda/InterpolationType.cpp index 65010bf3..7b6c0fa1 100644 --- a/python/mod_cvcuda/InterpolationType.cpp +++ b/python/mod_cvcuda/InterpolationType.cpp @@ -29,8 +29,10 @@ void ExportInterpolationType(py::module &m) .value("CUBIC", NVCV_INTERP_CUBIC, "Cubic interpolation") .value("AREA", NVCV_INTERP_AREA, "Area-based (resampling using pixels in area) interpolation") .value("LANCZOS", NVCV_INTERP_LANCZOS, "Lanczos interpolation") + .value("WARP_INVERSE_MAP", NVCV_WARP_INVERSE_MAP, "Inverse transformation") .value("HAMMING", NVCV_INTERP_HAMMING, "Hamming interpolation") - .value("BOX", NVCV_INTERP_BOX, "Box interpolation"); + .value("BOX", NVCV_INTERP_BOX, "Box interpolation") + .def("__or__", [](NVCVInterpolationType e1, NVCVInterpolationType e2) { return int(e1) | int(e2); }); } } // namespace cvcudapy diff --git a/python/mod_cvcuda/LabelType.cpp b/python/mod_cvcuda/LabelType.cpp new file mode 100644 index 00000000..04efc42d --- /dev/null +++ b/python/mod_cvcuda/LabelType.cpp @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "LabelType.hpp" + +#include + +namespace cvcudapy { + +void ExportLabelType(py::module &m) +{ + py::enum_(m, "LABEL", py::arithmetic()) + .value("FAST", NVCV_LABEL_FAST) + .value("SEQUENTIAL", NVCV_LABEL_SEQUENTIAL); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/LabelType.hpp b/python/mod_cvcuda/LabelType.hpp new file mode 100644 index 00000000..3260e2d2 --- /dev/null +++ b/python/mod_cvcuda/LabelType.hpp @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_LABEL_TYPE_HPP +#define NVCV_PYTHON_LABEL_TYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportLabelType(py::module &m); + +} // namespace cvcudapy + +#endif // NVCV_PYTHON_LABEL_TYPE_HPP diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp index 41034e5b..226336f2 100644 --- a/python/mod_cvcuda/Main.cpp +++ b/python/mod_cvcuda/Main.cpp @@ -18,10 +18,14 @@ #include "AdaptiveThresholdType.hpp" #include "BorderType.hpp" #include "ColorConversionCode.hpp" +#include "ConnectivityType.hpp" #include "InterpolationType.hpp" +#include "LabelType.hpp" #include "MorphologyType.hpp" +#include "NormType.hpp" #include "Operators.hpp" #include "OsdElement.hpp" +#include "PairwiseMatcherType.hpp" #include "RemapMapValueType.hpp" #include "SIFTFlagType.hpp" #include "ThresholdType.hpp" @@ -77,14 +81,19 @@ PYBIND11_MODULE(cvcuda, m) ExportMorphologyType(m); ExportColorConversionCode(m); ExportRemapMapValueType(m); - ExportBndBox(m); ExportBoxBlur(m); ExportOSD(m); ExportThresholdType(m); ExportAdaptiveThresholdType(m); ExportSIFTFlagType(m); + ExportConnectivityType(m); + ExportLabelType(m); + ExportNormType(m); + ExportPairwiseMatcherType(m); // CV-CUDA Operators + ExportOpPairwiseMatcher(m); + ExportOpLabel(m); ExportOpFindContours(m); ExportOpOSD(m); ExportOpHistogramEq(m); @@ -93,8 +102,8 @@ PYBIND11_MODULE(cvcuda, m) ExportOpMinMaxLoc(m); ExportOpHistogram(m); ExportOpMinAreaRect(m); - ExportOpBoxBlur(m); ExportOpBndBox(m); + ExportOpBoxBlur(m); ExportOpBrightnessContrast(m); ExportOpColorTwist(m); ExportOpRemap(m); @@ -131,4 +140,6 @@ PYBIND11_MODULE(cvcuda, m) ExportOpRandomResizedCrop(m); ExportOpGaussianNoise(m); ExportOpInpaint(m); + ExportOpStack(m); + ExportOpFindHomography(m); } diff --git a/python/mod_cvcuda/NormType.cpp b/python/mod_cvcuda/NormType.cpp new file mode 100644 index 00000000..0f53f820 --- /dev/null +++ b/python/mod_cvcuda/NormType.cpp @@ -0,0 +1,32 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "NormType.hpp" + +#include + +namespace cvcudapy { + +void ExportNormType(py::module &m) +{ + py::enum_(m, "Norm", py::arithmetic()) + .value("HAMMING", NVCV_NORM_HAMMING, "Hamming distance") + .value("L1", NVCV_NORM_L1, "Manhattan distance") + .value("L2", NVCV_NORM_L2, "Euclidean distance"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/NormType.hpp b/python/mod_cvcuda/NormType.hpp new file mode 100644 index 00000000..51d7b47e --- /dev/null +++ b/python/mod_cvcuda/NormType.hpp @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_NORM_TYPE_HPP +#define NVCV_PYTHON_NORM_TYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportNormType(py::module &m); + +} // namespace cvcudapy + +#endif // NVCV_PYTHON_NORM_TYPE_HPP diff --git a/python/mod_cvcuda/OpAdvCvtColor.cpp b/python/mod_cvcuda/OpAdvCvtColor.cpp index 0acbc637..f24337b7 100644 --- a/python/mod_cvcuda/OpAdvCvtColor.cpp +++ b/python/mod_cvcuda/OpAdvCvtColor.cpp @@ -88,14 +88,12 @@ Tensor AdvCvtColor(Tensor &input, NVCVColorConversionCode code, NVCVColorSpec sp { nvcv::TensorShape yuvCorrectedShape({outputShape[0], outputShape[1], outputShape[2], outputShape[3]}, "NHWC"); Tensor output = Tensor::Create(yuvCorrectedShape, input.dtype()); - std::cout << yuvCorrectedShape; return AdvCvtColorInto(output, input, code, spec, pstream); } else { nvcv::TensorShape yuvCorrectedShape({outputShape[0], outputShape[1], outputShape[2]}, "HWC"); Tensor output = Tensor::Create(yuvCorrectedShape, input.dtype()); - std::cout << yuvCorrectedShape; return AdvCvtColorInto(output, input, code, spec, pstream); } } diff --git a/python/mod_cvcuda/OpCvtColor.cpp b/python/mod_cvcuda/OpCvtColor.cpp index 051db2f9..3b8eb883 100644 --- a/python/mod_cvcuda/OpCvtColor.cpp +++ b/python/mod_cvcuda/OpCvtColor.cpp @@ -56,17 +56,25 @@ Tensor CvtColorInto(Tensor &output, Tensor &input, NVCVColorConversionCode code, Tensor CvtColor(Tensor &input, NVCVColorConversionCode code, std::optional pstream) { + int ndim = input.shape().size(); + auto layout = input.layout(); auto outFormat = GetOutputFormat(input.dtype(), code); - - if (input.shape().size() < 3) + auto out_dtype = outFormat.planeDataType(0).channelType(0); + if (ndim < 3) { throw std::runtime_error("Invalid input tensor shape"); } - int numImgs{static_cast(input.shape()[0])}; - nvcv::Size2D size{static_cast(input.shape()[2]), 
static_cast(input.shape()[1])}; - - Tensor output = Tensor::CreateForImageBatch(numImgs, size, outFormat); + std::array shape_data; + for (int d = 0; d < ndim; d++) + { + if (layout[d] == 'C') + shape_data[d] = outFormat.numChannels(); + else + shape_data[d] = input.shape()[d]; + } + nvcv::TensorShape out_shape(shape_data.data(), ndim, layout); + Tensor output = Tensor::Create(out_shape, out_dtype); return CvtColorInto(output, input, code, pstream); } diff --git a/python/mod_cvcuda/OpFindContours.cpp b/python/mod_cvcuda/OpFindContours.cpp index 8b50af16..5202905b 100644 --- a/python/mod_cvcuda/OpFindContours.cpp +++ b/python/mod_cvcuda/OpFindContours.cpp @@ -32,7 +32,10 @@ namespace cvcudapy { namespace { -Tensor FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::optional pstream) + +using TupleTensor2 = std::tuple; + +TupleTensor2 FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::optional pstream) { if (!pstream) { @@ -50,10 +53,10 @@ Tensor FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::o findContours->submit(pstream->cudaHandle(), input, points, numPoints); - return points; + return TupleTensor2(std::move(points), std::move(numPoints)); } -Tensor FindContours(Tensor &input, std::optional pstream) +TupleTensor2 FindContours(Tensor &input, std::optional pstream) { auto pointShape = nvcv::TensorShape{ {input.shape()[0], cvcuda::FindContours::MAX_TOTAL_POINTS, 2}, @@ -65,7 +68,7 @@ Tensor FindContours(Tensor &input, std::optional pstream) {input.shape()[0], cvcuda::FindContours::MAX_NUM_CONTOURS}, nvcv::TENSOR_NW }; - Tensor numPoints = Tensor::Create(countShape, nvcv::TYPE_U32); + Tensor numPoints = Tensor::Create(countShape, nvcv::TYPE_S32); return FindContoursInto(points, numPoints, input, pstream); } @@ -92,7 +95,12 @@ void ExportOpFindContours(py::module &m) stream (Stream, optional): CUDA Stream on which to perform the operation. Returns: - cvcuda.Tensor: The output tensor. + Tuple[Tensor, Tensor]: A tuple of two tensors. The first is the contour points tensor with dimensions NxMx2 - + where N is the batch size, M is the maximum number of points allowed. Each point of the contour is specified + in (x, y) coordinates. The second tensor specifies the number of valid contours per image and the number of + valid points in those contours. It has dimensions NxC where N is the batch size and C is the maximum number + of contours found. The actual number of contours can be calculated by counting the number of non-zero elements + in the C dimension and the actual number of points in each of those contours are the values stored in the C dimension. Caution: Restrictions to several arguments may apply. Check the C diff --git a/python/mod_cvcuda/OpFindHomography.cpp b/python/mod_cvcuda/OpFindHomography.cpp new file mode 100644 index 00000000..12553598 --- /dev/null +++ b/python/mod_cvcuda/OpFindHomography.cpp @@ -0,0 +1,330 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" +#include "WorkspaceCache.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cvcudapy { + +namespace { + +// Specialized class for cvcuda::FindHomography operator with a better cache Key. +// It allows for reusing an existing operator object from cache if its payload size is >= the required size. +// It also allows to fetch the biggest payload object to be reused while removing all others. +// This is more flexible than using the generic PyOperator class and its Key class. +class PyOpFindHomography : public nvcvpy::Container +{ +public: + // Define a Key class to be used by the cache to fetch similar items for potential reuse. + class Key : public nvcvpy::IKey + { + public: + // Arguments of the key constructor should match the corresponding cvcuda operator arguments. + Key(int batchSize, int maxNumPoints) {} + + size_t payloadSize() const + { + return 0; + } + + private: + size_t doGetHash() const override + { + return 0; + } + + // The comparison of keys is based on the payload size, the one in the cache is "that" key. + bool doIsCompatible(const nvcvpy::IKey &that_) const override + { + return dynamic_cast(&that_) != nullptr; + } + }; + + // Constructor instantiate the cache key and the operator object. + PyOpFindHomography(int batchSize, int maxNumPoints) + : m_key(batchSize, maxNumPoints) + , m_op(batchSize, maxNumPoints) + { + } + + inline void submit(cudaStream_t stream, const nvcv::Tensor &srcPts, const nvcv::Tensor &dstPts, + const nvcv::Tensor &models) + { + m_op(stream, srcPts, dstPts, models); + } + + inline void submit(cudaStream_t stream, const nvcv::TensorBatch &srcPts, const nvcv::TensorBatch &dstPts, + const nvcv::TensorBatch &models) + { + m_op(stream, srcPts, dstPts, models); + } + + // Required override to get the py object container. + py::object container() const override + { + return *this; + } + + // Required override to get the key as the base interface class. + const nvcvpy::IKey &key() const override + { + return m_key; + } + + // The static fetch function can be used to specialize the fetch of a specific object from the cache. + // It can be used to select the best object among a number of matched cache objects. + // It can also be used to remove other objects that are not needed in the cache anymore. + // Here, it fetches the biggest payload OP among cache items and remove all other OPs from the cache. + // It is ok to remove them since the biggest payload OP can be used to accomodate all of them, + // so they will never be reused and thus are no longer necessary. 
+ static std::shared_ptr fetch(std::vector> &cache) + { + assert(!cache.empty()); + + std::shared_ptr retItem = cache[0]; + size_t maxPayloadSize = 0; + + for (const auto &item : cache) + { + const Key &key = static_cast(item.get()->key()); + size_t keyPayloadSize = key.payloadSize(); + + if (keyPayloadSize > maxPayloadSize) + { + maxPayloadSize = keyPayloadSize; + retItem = item; + } + } + + cache.clear(); + + nvcvpy::Cache::removeAllNotInUseMatching(retItem.get()->key()); + + return retItem; + } + +private: + Key m_key; + cvcuda::FindHomography m_op; +}; + +Tensor FindHomographyInto(Tensor &models, Tensor &srcPts, Tensor &dstPts, std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + // Use CreateOperatorEx to use the extended create operator function passing the specialized PyOperator above + // as template type, instead of the regular cvcuda::OP class used in the CreateOperator function. + int32_t batchSize = srcPts.shape()[0]; + int32_t numPoints = srcPts.shape()[1]; + + auto findHomography = CreateOperatorEx(batchSize, numPoints); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {srcPts}); + guard.add(LockMode::LOCK_READ, {dstPts}); + guard.add(LockMode::LOCK_WRITE, {models}); + + findHomography->submit(pstream->cudaHandle(), srcPts, dstPts, models); + + return models; +} + +Tensor FindHomography(Tensor &srcPts, Tensor dstPts, std::optional pstream) +{ + Shape modelsShape(3); + modelsShape[0] = srcPts.shape()[0]; + modelsShape[1] = 3; + modelsShape[2] = 3; + + Tensor models = Tensor::Create(modelsShape, nvcv::TYPE_F32, nvcv::TENSOR_NHW); + + return FindHomographyInto(models, srcPts, dstPts, pstream); +} + +TensorBatch VarShapeFindHomographyInto(TensorBatch &models, TensorBatch &srcPts, TensorBatch &dstPts, + std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + // The same PyOpFindHomography class and CreateOperatorEx function can be used regardless of Tensors or VarShape. + int batchSize = srcPts.numTensors(); + int maxNumPoints = 0; + + for (int i = 0; i < batchSize; i++) + { + int numPoints = srcPts[i].shape()[1]; + if (numPoints > maxNumPoints) + maxNumPoints = numPoints; + } + + auto findHomography = CreateOperatorEx(batchSize, maxNumPoints); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {srcPts}); + guard.add(LockMode::LOCK_READ, {dstPts}); + guard.add(LockMode::LOCK_WRITE, {models}); + + findHomography->submit(pstream->cudaHandle(), srcPts, dstPts, models); + + return models; +} + +TensorBatch VarShapeFindHomography(TensorBatch &srcPts, TensorBatch &dstPts, std::optional pstream) +{ + TensorBatch models = TensorBatch::Create(srcPts.numTensors()); + + Shape modelsShape(3); + modelsShape[0] = 1; + modelsShape[1] = 3; + modelsShape[2] = 3; + + for (int i = 0; i < srcPts.numTensors(); i++) + { + Tensor outTensor = Tensor::Create(modelsShape, nvcv::TYPE_F32, nvcv::TENSOR_NHW); + models.pushBack(outTensor); + } + + return VarShapeFindHomographyInto(models, srcPts, dstPts, pstream); +} + +} // namespace + +void ExportOpFindHomography(py::module &m) +{ + using namespace pybind11::literals; + + py::options options; + options.disable_function_signatures(); + + m.def("findhomography", &FindHomography, "srcPts"_a, "dstPts"_a, "stream"_a = nullptr, R"pbdoc( + + cvcuda.findhomography(srcPts: nvcv.Tensor, dstPts: nvcv.Tensor, stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor + + Estimates the homography matrix between srcPts and dstPts coordinates on the given cuda stream. 
+ + See also: + Refer to the CV-CUDA C API reference for the Find Homography operator + for more details and usage examples. + + Args: + srcPts (Tensor): Input source coordinates tensor containing 2D coordinates in the source image. + dstPts (Tensor): Input destination coordinates tensor containing 2D coordinates in the target image. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.Tensor: The model homography matrix tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("findhomography_into", &FindHomographyInto, "models"_a, "srcPts"_a, "dstPts"_a, "stream"_a = nullptr, R"pbdoc( + + cvcuda.findhomography_into(models: nvcv.Tensor, srcPts: nvcv.Tensor, dstPts: nvcv.Tensor, stream: Optional[nvcv.cuda.Stream] = None) + + Executes the Find Homography operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Find Homography operator + for more details and usage examples. + + Args: + models (Tensor) : Output model tensor containing 3x3 homography matrices. + srcPts (Tensor): Input source coordinates tensor containing 2D coordinates in the source image. + dstPts (Tensor): Input destination coordinates tensor containing 2D coordinates in the target image. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.Tensor: The model homography matrix tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("findhomography", &VarShapeFindHomography, "srcPts"_a, "dstPts"_a, "stream"_a = nullptr, R"pbdoc( + + cvcuda.findhomography(srcPts: nvcv.TensorBatch, dstPts: nvcv.TensorBatch, stream: Optional[nvcv.cuda.Stream] = None) -> TensorBatch + + Executes the Find Homography operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Find Homography operator + for more details and usage examples. + + Args: + srcPts (TensorBatch): Input source coordinates tensor containing 2D coordinates in the source image. + dstPts (TensorBatch): Input destination coordinates tensor containing 2D coordinates in the target image. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.TensorBatch: The model homography matrix tensor batch. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("findhomography_into", &VarShapeFindHomographyInto, "models"_a, "srcPts"_a, "dstPts"_a, "stream"_a = nullptr, + R"pbdoc( + + cvcuda.findhomography(models: nvcv.TensorBatch, srcPts: nvcv.TensorBatch, dstPts: nvcv.TensorBatch, stream: Optional[nvcv.cuda.Stream] = None) + + Executes the Find Homography operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Find Homography operator + for more details and usage examples. + + Args: + models (TensorBatch) : Output model tensor containing 3x3 homography matrices. + srcPts (TensorBatch): Input source coordinates tensor containing 2D coordinates in the source image. + dstPts (TensorBatch): Input destination coordinates tensor containing 2D coordinates in the target image. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.TensorBatch: The model homography matrix tensor batch. + + + Caution: + Restrictions to several arguments may apply. 
Check the C + API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/OpLabel.cpp b/python/mod_cvcuda/OpLabel.cpp new file mode 100644 index 00000000..eb89d55b --- /dev/null +++ b/python/mod_cvcuda/OpLabel.cpp @@ -0,0 +1,210 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +namespace cvcudapy { + +using TupleTensor3 = std::tuple, std::optional>; + +namespace { + +TupleTensor3 LabelInto(Tensor &output, std::optional count, std::optional stats, Tensor &input, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels, std::optional bgLabel, + std::optional minThresh, std::optional maxThresh, std::optional minSize, + std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + auto op = CreateOperator(); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {input}); + guard.add(LockMode::LOCK_WRITE, {output}); + guard.add(LockMode::LOCK_NONE, {*op}); + + if (count) + { + guard.add(LockMode::LOCK_WRITE, {*count}); + } + if (stats) + { + guard.add(LockMode::LOCK_WRITE, {*stats}); + } + if (bgLabel) + { + guard.add(LockMode::LOCK_READ, {*bgLabel}); + } + if (minThresh) + { + guard.add(LockMode::LOCK_READ, {*minThresh}); + } + if (maxThresh) + { + guard.add(LockMode::LOCK_READ, {*maxThresh}); + } + if (minSize) + { + guard.add(LockMode::LOCK_READ, {*minSize}); + } + + op->submit(pstream->cudaHandle(), input, output, (bgLabel ? *bgLabel : nvcv::Tensor{nullptr}), + (minThresh ? *minThresh : nvcv::Tensor{nullptr}), (maxThresh ? *maxThresh : nvcv::Tensor{nullptr}), + (minSize ? *minSize : nvcv::Tensor{nullptr}), (count ? *count : nvcv::Tensor{nullptr}), + (stats ? 
*stats : nvcv::Tensor{nullptr}), connectivity, assignLabels); + + return TupleTensor3(std::move(output), count, stats); +} + +TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelType assignLabels, bool count, bool stats, + int maxLabels, std::optional bgLabel, std::optional minThresh, + std::optional maxThresh, std::optional minSize, std::optional pstream) +{ + constexpr nvcv::DataType outType = nvcv::TYPE_U32; + + auto inputData = input.exportData(); + if (!inputData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must be a valid CUDA strided tensor"); + } + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inputData); + if (!inAccess) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must be a valid image-based tensor"); + } + int numSamples = inAccess->numSamples(); + + Tensor output = Tensor::Create(input.shape(), outType); + std::optional countTensor, statsTensor; + + if (count) + { + countTensor = Tensor::Create({{numSamples}, "N"}, outType); + } + if (stats) + { + int numStats = 1; + if (connectivity == NVCV_CONNECTIVITY_4_2D || connectivity == NVCV_CONNECTIVITY_8_2D) + { + numStats = 6; + } + if (connectivity == NVCV_CONNECTIVITY_6_3D || connectivity == NVCV_CONNECTIVITY_26_3D) + { + numStats = 8; + } + + statsTensor = Tensor::Create( + { + {numSamples, maxLabels, numStats}, + "NMA" + }, + outType); + } + + return LabelInto(output, countTensor, statsTensor, input, connectivity, assignLabels, bgLabel, minThresh, maxThresh, + minSize, pstream); +} + +} // namespace + +void ExportOpLabel(py::module &m) +{ + using namespace pybind11::literals; + + m.def("label", &Label, "src"_a, "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, + py::kw_only(), "count"_a = false, "stats"_a = false, "max_labels"_a = 10000, "bg_label"_a = nullptr, + "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, "stream"_a = nullptr, R"pbdoc( + + Executes the Label operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Label operator for more details and usage examples. + + Args: + src (Tensor): Input tensor to label connected-component regions. + connectivity (cvcuda.ConnectivityType, optional): Choice to control connectivity of input elements, + default is cvcuda.CONNECTIVITY_4_2D. + assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned, + default is cvcuda.LABEL.FAST. + count (bool, optional): Use True to return the count of valid labeled regions. + stats (bool, optional): Use True to return the statistics of valid labeled regions. + max_labels (Number, optional): Maximum number of labels to compute statistics for, default is 10000. + bg_label (Tensor, optional): Background tensor to define input values to be considered background + labels and thus ignored. + min_thresh (Tensor, optional): Minimum threshold tensor to mask input values below it to be 0, and others 1. + max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1. + min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of + elements less than the minimum size. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + Tuple[Tensor, Tensor, Tensor]: A tuple wih output labels, count of regions and their statistics. + The count or stats tensors may be None if theirs arguments are False. 
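+
+        Example:
+            Assuming src is a U8 HWC or NHWC CUDA tensor (for instance wrapped from a torch
+            tensor with cvcuda.as_tensor), a call requesting the optional outputs looks like:
+
+                labels, count, stats = cvcuda.label(src, count=True, stats=True)
+
+            count and stats are returned as None when the corresponding flags are False.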
+ + Caution: + Restrictions to several arguments may apply. Check the C API references of the CV-CUDA operator. + )pbdoc"); + + m.def("label_into", &LabelInto, "dst"_a, "count"_a = nullptr, "stats"_a = nullptr, "src"_a, + "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, py::kw_only(), + "bg_label"_a = nullptr, "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, + "stream"_a = nullptr, R"pbdoc( + + Executes the Label operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the Label operator for more details and usage examples. + + Args: + dst (Tensor): Output tensor with labels. + count (Tensor, optional): Output tensor with count number of labeled regions. + stats (Tensor, optional): Output tensor with statistics for each labeled region. + src (Tensor): Input tensor to label connected-component regions. + connectivity (cvcuda.ConnectivityType, optional): Choice to control connectivity of input elements, + default is cvcuda.CONNECTIVITY_4_2D. + assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned, + default is cvcuda.LABEL.FAST. + bg_label (Tensor, optional): Background tensor to define input values to be considered background + labels and thus ignored. + min_thresh (Tensor, optional): Minimum threshold tensor to mask input values below it to be 0, and others 1. + max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1. + min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of + elements less than the minimum size. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + Tuple[Tensor, Tensor, Tensor]: A tuple wih output labels, count of regions and their statistics. + The count or stats tensors may be None if theirs arguments are None. + + Caution: + Restrictions to several arguments may apply. Check the C API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/OpPairwiseMatcher.cpp b/python/mod_cvcuda/OpPairwiseMatcher.cpp new file mode 100644 index 00000000..2b9248d7 --- /dev/null +++ b/python/mod_cvcuda/OpPairwiseMatcher.cpp @@ -0,0 +1,204 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Operators.hpp" + +#include +#include +#include +#include +#include + +#include + +namespace cvcudapy { + +using TupleTensor3 = std::tuple, std::optional>; + +namespace { + +TupleTensor3 PairwiseMatcherInto(Tensor &matches, std::optional numMatches, std::optional distances, + Tensor &set1, Tensor &set2, std::optional numSet1, + std::optional numSet2, bool crossCheck, int matchesPerPoint, + std::optional normType, NVCVPairwiseMatcherType algoChoice, + std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + if (!normType) + { + normType = set1.dtype() == nvcv::TYPE_F32 ? NVCV_NORM_L2 : NVCV_NORM_HAMMING; + } + + auto op = CreateOperator(algoChoice); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {set1, set2}); + guard.add(LockMode::LOCK_WRITE, {matches}); + guard.add(LockMode::LOCK_NONE, {*op}); + + if (numSet1) + { + guard.add(LockMode::LOCK_READ, {*numSet1}); + } + if (numSet2) + { + guard.add(LockMode::LOCK_READ, {*numSet2}); + } + if (numMatches) + { + guard.add(LockMode::LOCK_WRITE, {*numMatches}); + } + if (distances) + { + guard.add(LockMode::LOCK_WRITE, {*distances}); + } + + op->submit(pstream->cudaHandle(), set1, set2, (numSet1 ? *numSet1 : nvcv::Tensor{nullptr}), + (numSet2 ? *numSet2 : nvcv::Tensor{nullptr}), matches, + (numMatches ? *numMatches : nvcv::Tensor{nullptr}), (distances ? *distances : nvcv::Tensor{nullptr}), + crossCheck, matchesPerPoint, *normType); + + return TupleTensor3(std::move(matches), numMatches, distances); +} + +TupleTensor3 PairwiseMatcher(Tensor &set1, Tensor &set2, std::optional numSet1, std::optional numSet2, + std::optional numMatches, bool distances, bool crossCheck, int matchesPerPoint, + std::optional normType, NVCVPairwiseMatcherType algoChoice, + std::optional pstream) +{ + nvcv::TensorShape set1Shape = set1.shape(); + nvcv::TensorShape set2Shape = set2.shape(); + + if (set1Shape.rank() != 3 || set2Shape.rank() != 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input sets must be rank-3 tensors"); + } + + int64_t numSamples = set1Shape[0]; + int64_t maxMatches = std::max(set1Shape[1], set2Shape[1]) * matchesPerPoint; + + if (!numMatches) + { + numMatches = crossCheck; + } + + // clang-format off + + Tensor matches = Tensor::Create({{numSamples, maxMatches, 2}, "NMA"}, nvcv::TYPE_S32); + + std::optional numMatchesTensor, distancesTensor; + + if (*numMatches) + { + numMatchesTensor = Tensor::Create({{numSamples}, "N"}, nvcv::TYPE_S32); + } + if (distances) + { + distancesTensor = Tensor::Create({{numSamples, maxMatches}, "NM"}, nvcv::TYPE_F32); + } + + // clang-format on + + return PairwiseMatcherInto(matches, numMatchesTensor, distancesTensor, set1, set2, numSet1, numSet2, crossCheck, + matchesPerPoint, normType, algoChoice, pstream); +} + +} // namespace + +void ExportOpPairwiseMatcher(py::module &m) +{ + using namespace pybind11::literals; + + m.def("match", &PairwiseMatcher, "set1"_a, "set2"_a, "num_set1"_a = nullptr, "num_set2"_a = nullptr, + "num_matches"_a = nullptr, "distances"_a = false, "cross_check"_a = false, "matches_per_point"_a = 1, + "norm_type"_a = nullptr, "algo_choice"_a = NVCV_BRUTE_FORCE, py::kw_only(), "stream"_a = nullptr, R"pbdoc( + + Executes the Pairwise matcher operation on the given CUDA stream. + + See also: + Refer to the CV-CUDA C API reference for this operator for more details and usage examples. + + Args: + set1 (Tensor): Input tensor with 1st set of points. + set2 (Tensor): Input tensor with 2nd set of points. 
+ num_set1 (Tensor, optional): Input tensor with number of valid points in the 1st set. If not provided, + consider the entire set1 containing valid points. + num_set2 (Tensor, optional): Input tensor with number of valid points in the 2nd set. If not provided, + consider the entire set2 containing valid points. + num_matches (bool, optional): Use True to return the number of matches. If not provided, it is set + to True if crossCheck=True and False otherwise. + distances (bool, optional): Use True to return the match distances. + cross_check (bool, optional): Use True to cross check best matches, a best match is only returned if it is + the best match (minimum distance) from 1st set to 2nd set and vice versa. + matches_per_point (Number, optional): Number of best matches to return per point. + norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2 + for float input and cvcuda.Norm.HAMMING for other input data types. + algo_choice (cvcuda.Matcher, optional): Choice of the algorithm to perform the match. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + Tuple[Tensor, Tensor, Tensor]: A tuple wih output matches, number of matches and their distances. + The number of matches tensor may be None if its argument is False. + The distances tensor may be None if its argument is False. + + Caution: + Restrictions to several arguments may apply. Check the C API references of the CV-CUDA operator. + )pbdoc"); + + m.def("match_into", &PairwiseMatcherInto, "matches"_a, "num_matches"_a = nullptr, "distances"_a = nullptr, "set1"_a, + "set2"_a, "num_set1"_a = nullptr, "num_set2"_a = nullptr, "cross_check"_a = false, "matches_per_point"_a = 1, + "norm_type"_a = nullptr, "algo_choice"_a = NVCV_BRUTE_FORCE, py::kw_only(), "stream"_a = nullptr, + R"pbdoc( + + Executes the Pairwise matcher operation on the given CUDA stream. + + See also: + Refer to the CV-CUDA C API reference for this operator for more details and usage examples. + + Args: + matches (Tensor): Output tensor with matches. + num_matches (Tensor, optional): Output tensor with number of matches. + distances (Tensor, optional): Output tensor with match distances. + set1 (Tensor): Input tensor with 1st set of points. + set2 (Tensor): Input tensor with 2nd set of points. + num_set1 (Tensor, optional): Input tensor with number of valid points in the 1st set. If not provided, + consider the entire set1 containing valid points. + num_set2 (Tensor, optional): Input tensor with number of valid points in the 2nd set. If not provided, + consider the entire set2 containing valid points. + cross_check (bool, optional): Use True to cross check best matches, a best match is only returned if it is + the best match (minimum distance) from 1st set to 2nd set and vice versa. + matches_per_point (Number, optional): Number of best matches to return per point. + norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2 + for float input and cvcuda.Norm.HAMMING for other input data types. + algo_choice (cvcuda.Matcher, optional): Choice of the algorithm to perform the match. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + Tuple[Tensor, Tensor, Tensor]: A tuple wih output matches, number of matches and their distances. + The number of matches tensor may be None if its argument is None. + The distances tensor may be None if its argument is None. 
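+
+        Example:
+            Assuming matches, num_matches and distances were pre-allocated with the shapes
+            described above (cvcuda.match allocates its outputs the same way), the in-place
+            form can be called as:
+
+                cvcuda.match_into(matches, num_matches, distances, set1, set2, cross_check=True)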
+ + Caution: + Restrictions to several arguments may apply. Check the C API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/OpPillowResize.cpp b/python/mod_cvcuda/OpPillowResize.cpp index 9c393398..75a5b908 100644 --- a/python/mod_cvcuda/OpPillowResize.cpp +++ b/python/mod_cvcuda/OpPillowResize.cpp @@ -16,6 +16,7 @@ */ #include "Operators.hpp" +#include "WorkspaceCache.hpp" #include #include @@ -47,52 +48,82 @@ class PyOpPillowResize : public nvcvpy::Container { public: // Arguments of the key constructor should match the corresponding cvcuda operator arguments. - Key(const nvcv::Size2D &maxSize, int maxBatchSize, nvcv::ImageFormat fmt) - : m_maxSize{maxSize} - , m_maxBatchSize{maxBatchSize} - , m_format{fmt} - { - } + Key() {} - // The payload size is an approximate function of the actual size of the payload. - // There is no need to be an exact value, it is just provide ordering inside cache. size_t payloadSize() const { - return m_maxSize.w * m_maxSize.h * m_maxBatchSize; + return 0; } private: - // The hash is based only on the image format used by the operator. - // (In addition to the OP type as defined by IKey). size_t doGetHash() const override { - return ComputeHash(m_format); + return 0; } // The comparison of keys is based on the payload size, the one in the cache is "that" key. bool doIsCompatible(const nvcvpy::IKey &that_) const override { - const Key &that = static_cast(that_); - return this->payloadSize() <= that.payloadSize(); + return dynamic_cast(&that_) != nullptr; } - - nvcv::Size2D m_maxSize; - int m_maxBatchSize; - nvcv::ImageFormat m_format; }; // Constructor instantiate the cache key and the operator object. - PyOpPillowResize(const nvcv::Size2D &maxSize, int maxBatchSize, nvcv::ImageFormat fmt) - : m_key(maxSize, maxBatchSize, fmt) - , m_op(maxSize, maxBatchSize, fmt) + PyOpPillowResize() + : m_key() + , m_op() { } - // The submit forwards its args to the OP's call operator. 
- template - void submit(AA &&...args) + inline void submit(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, nvcv::ImageFormat format, + NVCVInterpolationType interpolation) { - m_op(std::forward(args)...); + int batch_size = getBatchSize(in); + nvcv::Size2D in_size = imageSize(in); + nvcv::Size2D out_size = imageSize(out); + + auto req = m_op.getWorkspaceRequirements(batch_size, out_size, in_size, format); + auto ws = WorkspaceCache::instance().get(req, stream); + m_op(stream, ws.get(), in, out, interpolation); + } + + inline int getBatchSize(const nvcv::Tensor &tensor) + { + auto access = nvcv::TensorDataAccessStridedImagePlanar::Create(tensor.exportData()); + if (!access) + throw std::runtime_error("Incompatible tensor layout"); + + return access->numSamples(); + } + + static nvcv::Size2D imageSize(const nvcv::Tensor &tensor) + { + auto access = nvcv::TensorDataAccessStridedImagePlanar::Create(tensor.exportData()); + if (!access) + throw std::runtime_error("Incompatible tensor layout"); + + return access->size(); + } + + inline void submit(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, + const NVCVInterpolationType interpolation) + { + assert(in.numImages() == out.numImages()); + auto in_sizes = imageSizes(in); + auto out_sizes = imageSizes(out); + int N = in_sizes.size(); + auto req = m_op.getWorkspaceRequirements(N, in_sizes.data(), out_sizes.data(), in.uniqueFormat()); + auto ws = WorkspaceCache::instance().get(req, stream); + m_op(stream, ws.get(), in, out, interpolation); + } + + static std::vector imageSizes(const nvcv::ImageBatchVarShape &batch) + { + std::vector sizes(batch.numImages()); + + for (size_t i = 0; i < sizes.size(); i++) sizes[i] = batch[i].size(); + + return sizes; } // Required override to get the py object container. @@ -158,21 +189,16 @@ Tensor PillowResizeInto(Tensor &output, Tensor &input, nvcv::ImageFormat format, throw std::runtime_error("Incompatible input/output tensor layout"); } - nvcv::Size2D maxSize{std::max(in_access->numCols(), out_access->numCols()), - std::max(in_access->numRows(), out_access->numRows())}; - - int maxBatchSize = static_cast(in_access->numSamples()); - // Use CreateOperatorEx to use the extended create operator function passing the specialized PyOperator above // as template type, instead of the regular cvcuda::OP class used in the CreateOperator function. - auto pillowResize = CreateOperatorEx(maxSize, maxBatchSize, format); + auto pillowResize = CreateOperatorEx(); ResourceGuard guard(*pstream); guard.add(LockMode::LOCK_READ, {input}); guard.add(LockMode::LOCK_WRITE, {output}); guard.add(LockMode::LOCK_WRITE, {*pillowResize}); - pillowResize->submit(pstream->cudaHandle(), input, output, interp); + pillowResize->submit(pstream->cudaHandle(), input, output, format, interp); return output; } @@ -193,15 +219,8 @@ ImageBatchVarShape VarShapePillowResizeInto(ImageBatchVarShape &output, ImageBat pstream = Stream::Current(); } - nvcv::Size2D maxSrcSize = input.maxSize(); - nvcv::Size2D maxDstSize = output.maxSize(); - - nvcv::Size2D maxSize{std::max(maxSrcSize.w, maxDstSize.w), std::max(maxSrcSize.h, maxDstSize.h)}; - - int maxBatchSize = static_cast(input.capacity()); - // The same PyOpPillowResize class and CreateOperatorEx function can be used regardless of Tensors or VarShape. 
- auto pillowResize = CreateOperatorEx(maxSize, maxBatchSize, input.uniqueFormat()); + auto pillowResize = CreateOperatorEx(); ResourceGuard guard(*pstream); guard.add(LockMode::LOCK_READ, {input}); diff --git a/python/mod_cvcuda/OpStack.cpp b/python/mod_cvcuda/OpStack.cpp new file mode 100644 index 00000000..41c7b891 --- /dev/null +++ b/python/mod_cvcuda/OpStack.cpp @@ -0,0 +1,179 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" + +#include +#include +#include +#include +#include +#include + +namespace cvcudapy { + +namespace { + +void checkTensorList(std::vector &tensorList, int64_t (&outputShape)[4], nvcv::TensorLayout &layout, + nvcv::DataType &dtype) +{ + int32_t totalTensors = 0; + + if (tensorList.size() == 0) + { + throw std::runtime_error("Invalid input tensor list"); + } + + for (auto &tensor : tensorList) + { + if (tensor.shape().rank() < 3 || tensor.shape().rank() > 4) + { + throw std::runtime_error("Invalid input tensor shape"); + } + if (tensor.shape().rank() == 4) + { + totalTensors += tensor.shape()[0]; + outputShape[1] = tensor.shape()[1]; + outputShape[2] = tensor.shape()[2]; + outputShape[3] = tensor.shape()[3]; + } + else + { + totalTensors++; + outputShape[1] = tensor.shape()[0]; + outputShape[2] = tensor.shape()[1]; + outputShape[3] = tensor.shape()[2]; + } + + if (tensor.shape().layout() == nvcv::TENSOR_CHW || tensor.shape().layout() == nvcv::TENSOR_NCHW) + layout = nvcv::TENSOR_NCHW; + else + layout = nvcv::TENSOR_NHWC; + } + outputShape[0] = totalTensors; // set N to total number of tensors + dtype = tensorList[0].dtype(); +} + +Tensor StackIntoInternal(Tensor &output, std::vector &tensorList, std::optional pstream, + int32_t numberOfTensors) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + nvcvpy::TensorBatch inTensorBatch = nvcvpy::TensorBatch::Create(numberOfTensors); + + for (auto &tensor : tensorList) + { + inTensorBatch.pushBack(tensor); + } + + auto op = CreateOperator(); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_READ, {inTensorBatch}); + guard.add(LockMode::LOCK_WRITE, {output}); + guard.add(LockMode::LOCK_NONE, {*op}); + op->submit(pstream->cudaHandle(), inTensorBatch, output); + return std::move(output); +} + +Tensor StackInto(Tensor &output, std::vector &tensorList, std::optional pstream) +{ + int64_t outputShape[4] = {}; // NCHW/NHWC + nvcv::TensorLayout layout = nvcv::TENSOR_CHW; + nvcv::DataType dtype; + + checkTensorList(tensorList, outputShape, layout, dtype); + + if (output.shape().layout() != nvcv::TENSOR_NCHW && output.shape().layout() != nvcv::TENSOR_NHWC) + throw std::runtime_error("Invalid output tensor shape"); + + if (output.shape()[0] != outputShape[0]) + throw std::runtime_error("Invalid output tensor shape"); + + StackIntoInternal(output, tensorList, pstream, outputShape[0]); + return std::move(output); +} + +Tensor 
Stack(std::vector &tensorList, std::optional pstream) +{ + int64_t outputShape[4] = {}; // NCHW/NHWC + nvcv::TensorLayout layout = nvcv::TENSOR_CHW; + nvcv::DataType dtype; + checkTensorList(tensorList, outputShape, layout, dtype); + + //create new output tensor + Tensor output = Tensor::Create( + { + {outputShape[0], outputShape[1], outputShape[2], outputShape[3]}, + layout + }, + dtype); + return StackIntoInternal(output, tensorList, pstream, outputShape[0]); +} + +} // namespace + +void ExportOpStack(py::module &m) +{ + using namespace pybind11::literals; + + m.def("stack", &Stack, "src"_a, py::kw_only(), "stream"_a = nullptr, R"pbdoc( + + Executes the Stack operation on the given cuda stream. This takes input tensors and combines them into an N(HWC/CHW) tensor. + + See also: + Refer to the CV-CUDA C API reference for the Stack operator + for more details and usage examples. + + Args: + src (Tensor List): Input tensors, each containing one or more samples. All tensors must be N(HWC/CHW) or HWC/CHW and have the same data type and shape. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.Tensor: The output tensor containing the stacked input tensors. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("stack_into", &StackInto, "dst"_a, "src"_a, py::kw_only(), "stream"_a = nullptr, R"pbdoc( + + Executes the Stack operation on the given cuda stream. This takes input tensors and combines them into an N(HWC/CHW) tensor. + + See also: + Refer to the CV-CUDA C API reference for the Stack operator + for more details and usage examples. + + Args: + dst (Tensor): Output N(CHW/HWC) tensor to store the result of the operation. + src (Tensor List): Input tensors, each containing one or more samples. All tensors must be N(HWC/CHW) or HWC/CHW and have the same data type and shape. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + None + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/Operators.hpp b/python/mod_cvcuda/Operators.hpp index bd28d41b..2b8886b6 100644 --- a/python/mod_cvcuda/Operators.hpp +++ b/python/mod_cvcuda/Operators.hpp @@ -17,12 +17,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include @@ -32,6 +34,7 @@ namespace nvcvpy::util { namespace cvcudapy { +using nvcvpy::Array; using nvcvpy::CreateNVCVTensorShape; using nvcvpy::CreateShape; using nvcvpy::Image; @@ -41,6 +44,7 @@ using nvcvpy::ResourceGuard; using nvcvpy::Shape; using nvcvpy::Stream; using nvcvpy::Tensor; +using nvcvpy::TensorBatch; namespace util = nvcvpy::util; namespace py = ::pybind11; @@ -91,6 +95,10 @@ void ExportOpInpaint(py::module &m); void ExportOpHistogramEq(py::module &m); void ExportOpMinAreaRect(py::module &m); void ExportOpAdvCvtColor(py::module &m); +void ExportOpLabel(py::module &m); +void ExportOpPairwiseMatcher(py::module &m); +void ExportOpStack(py::module &m); +void ExportOpFindHomography(py::module &m); // Helper class that serves as generic python-side operator class.
// OP: native operator class diff --git a/python/mod_cvcuda/OsdElement.cpp b/python/mod_cvcuda/OsdElement.cpp index bf787218..e47730fe 100644 --- a/python/mod_cvcuda/OsdElement.cpp +++ b/python/mod_cvcuda/OsdElement.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -91,57 +92,12 @@ static NVCVColorRGBA pytocolor(py::tuple color) } // namespace -void ExportBndBox(py::module &m) -{ - using namespace py::literals; - - py::class_(m, "BndBoxI") - .def(py::init([]() { return NVCVBndBoxI{}; })) - .def(py::init( - [](py::tuple box, int thickness, py::tuple borderColor, py::tuple fillColor) - { - NVCVBndBoxI bndbox; - bndbox.box = pytobox(box); - bndbox.thickness = thickness; - bndbox.borderColor = pytocolor(borderColor); - bndbox.fillColor = pytocolor(fillColor); - return bndbox; - }), - "box"_a, "thickness"_a, "borderColor"_a, "fillColor"_a) - .def_readonly("box", &NVCVBndBoxI::box, "Tuple describing a box: x-coordinate, y-coordinate, width, height.") - .def_readonly("thickness", &NVCVBndBoxI::thickness, "Border thickness of bounding box.") - .def_readonly("borderColor", &NVCVBndBoxI::borderColor, "Border color of bounding box.") - .def_readonly("fillColor", &NVCVBndBoxI::fillColor, "Filled color of bounding box."); - - py::class_(m, "BndBoxesI") - .def(py::init([]() { return NVCVBndBoxesI{}; })) - .def(py::init( - [](std::vector numBoxes_vec, std::vector bndboxes_vec) - { - NVCVBndBoxesI bndboxes; - - bndboxes.batch = numBoxes_vec.size(); - bndboxes.numBoxes = new int[bndboxes.batch]; - memcpy(bndboxes.numBoxes, numBoxes_vec.data(), numBoxes_vec.size() * sizeof(int)); - - int total_box_num = bndboxes_vec.size(); - bndboxes.boxes = new NVCVBndBoxI[total_box_num]; - memcpy(bndboxes.boxes, bndboxes_vec.data(), bndboxes_vec.size() * sizeof(NVCVBndBoxI)); - - return bndboxes; - }), - "numBoxes"_a, "boxes"_a) - .def_readonly("batch", &NVCVBndBoxesI::batch, "Number of images in the image batch.") - .def_readonly("numBoxes", &NVCVBndBoxesI::numBoxes, "Number array of bounding boxes for image batch.") - .def_readonly("boxes", &NVCVBndBoxesI::boxes, "Bounding box array for image batch, \ref NVCVBndBoxI."); -} - void ExportBoxBlur(py::module &m) { using namespace py::literals; + using namespace cvcuda::priv; py::class_(m, "BlurBoxI") - .def(py::init([]() { return NVCVBlurBoxI{}; })) .def(py::init( [](py::tuple box, int kernelSize) { @@ -154,98 +110,65 @@ void ExportBoxBlur(py::module &m) .def_readonly("box", &NVCVBlurBoxI::box, "Tuple describing a box: x-coordinate, y-coordinate, width, height.") .def_readonly("kernelSize", &NVCVBlurBoxI::kernelSize, "Kernel sizes of mean filter."); - py::class_(m, "BlurBoxesI") - .def(py::init([]() { return NVCVBlurBoxesI{}; })) - .def(py::init( - [](std::vector numBoxes_vec, std::vector blurboxes_vec) - { - NVCVBlurBoxesI blurboxes; - - blurboxes.batch = numBoxes_vec.size(); - blurboxes.numBoxes = new int[blurboxes.batch]; - memcpy(blurboxes.numBoxes, numBoxes_vec.data(), numBoxes_vec.size() * sizeof(int)); - - int total_box_num = blurboxes_vec.size(); - blurboxes.boxes = new NVCVBlurBoxI[total_box_num]; - memcpy(blurboxes.boxes, blurboxes_vec.data(), blurboxes_vec.size() * sizeof(NVCVBlurBoxI)); - - return blurboxes; - }), - "numBoxes"_a, "boxes"_a) - .def_readonly("batch", &NVCVBlurBoxesI::batch, "Number of images in the image batch.") - .def_readonly("numBoxes", &NVCVBlurBoxesI::numBoxes, "Number array of blurring boxes for image batch.") - .def_readonly("boxes", &NVCVBlurBoxesI::boxes, "Blurring box array for image batch, \ref 
NVCVBlurBoxI."); + py::class_>(m, "BlurBoxesI") + .def(py::init([](const std::vector> &blurboxes_vec) + { return std::make_shared(blurboxes_vec); }), + "boxes"_a); } void ExportOSD(py::module &m) { using namespace py::literals; + using namespace cvcuda::priv; - py::class_(m, "Label") - .def(py::init([]() { return NVCVText{}; })) + py::class_(m, "BndBoxI") .def(py::init( - [](const char *utf8Text, int32_t fontSize, const char *fontName, py::tuple tlPos, py::tuple fontColor, - py::tuple bgColor) + [](py::tuple box, int thickness, py::tuple borderColor, py::tuple fillColor) { - NVCVText label; - label.utf8Text = (const char *)malloc(strlen(utf8Text)); - memcpy(const_cast(label.utf8Text), utf8Text, strlen(utf8Text) + 1); - label.fontName = (const char *)malloc(strlen(fontName)); - memcpy(const_cast(label.fontName), fontName, strlen(fontName) + 1); - label.fontSize = fontSize; - label.tlPos = pytopoint(tlPos); - label.fontColor = pytocolor(fontColor); - label.bgColor = pytocolor(bgColor); - return label; + NVCVBndBoxI bndbox; + bndbox.box = pytobox(box); + bndbox.thickness = thickness; + bndbox.borderColor = pytocolor(borderColor); + bndbox.fillColor = pytocolor(fillColor); + return bndbox; }), - "utf8Text"_a, "fontSize"_a, py::arg("fontName") = "DejaVuSansMono", "tlPos"_a, "fontColor"_a, "bgColor"_a) - .def_readonly("utf8Text", &NVCVText::utf8Text, "Label text in utf8 format.") - .def_readonly("fontSize", &NVCVText::fontSize, "Font size of label text.") - .def_readonly("fontName", &NVCVText::fontName, "Font name of label text, default: DejaVuSansMono.") - .def_readonly("tlPos", &NVCVText::tlPos, "Top-left corner point for label text.") - .def_readonly("fontColor", &NVCVText::fontColor, "Font color of label text.") - .def_readonly("bgColor", &NVCVText::bgColor, "Back color of label text."); + "box"_a, "thickness"_a, "borderColor"_a, "fillColor"_a) + .def_readonly("box", &NVCVBndBoxI::box, "Tuple describing a box: x-coordinate, y-coordinate, width, height.") + .def_readonly("thickness", &NVCVBndBoxI::thickness, "Border thickness of bounding box.") + .def_readonly("borderColor", &NVCVBndBoxI::borderColor, "Border color of bounding box.") + .def_readonly("fillColor", &NVCVBndBoxI::fillColor, "Filled color of bounding box."); + + py::class_>(m, "BndBoxesI") + .def(py::init([](const std::vector> &bndboxes_vec) + { return std::make_shared(bndboxes_vec); }), + "boxes"_a); + + py::class_>(m, "Label") + .def(py::init( + [](const char *utf8Text, int32_t fontSize, const char *fontName, py::tuple tlPos, py::tuple fontColor, + py::tuple bgColor) { + return NVCVText(utf8Text, fontSize, fontName, pytopoint(tlPos), pytocolor(fontColor), + pytocolor(bgColor)); + }), + "utf8Text"_a, "fontSize"_a, py::arg("fontName") = "DejaVuSansMono", "tlPos"_a, "fontColor"_a, "bgColor"_a); py::class_(m, "Segment") - .def(py::init([]() { return NVCVSegment{}; })) .def(py::init( [](py::tuple box, int32_t thickness, py::array_t segArray, float segThreshold, py::tuple borderColor, py::tuple segColor) { - NVCVSegment segment; - segment.box = pytobox(box); - segment.thickness = thickness; - py::buffer_info hSeg = segArray.request(); if (hSeg.ndim != 2) { throw std::runtime_error("segArray dims must be 2!"); } - segment.segWidth = hSeg.shape[0]; - segment.segHeight = hSeg.shape[1]; - - checkRuntime(cudaMalloc(&segment.dSeg, segment.segWidth * segment.segHeight * sizeof(float))); - checkRuntime(cudaMemcpy(segment.dSeg, hSeg.ptr, - segment.segWidth * segment.segHeight * sizeof(float), - cudaMemcpyHostToDevice)); - - 
segment.segThreshold = segThreshold; - segment.borderColor = pytocolor(borderColor); - segment.segColor = pytocolor(segColor); - return segment; + + return NVCVSegment(pytobox(box), thickness, (float *)hSeg.ptr, hSeg.shape[0], hSeg.shape[1], + segThreshold, pytocolor(borderColor), pytocolor(segColor)); }), - "box"_a, "thickness"_a, "segArray"_a, "segThreshold"_a, "borderColor"_a, "segColor"_a) - .def_readonly("box", &NVCVSegment::box, "Bounding box of segment.") - .def_readonly("thickness", &NVCVSegment::thickness, "Line thickness of segment outter rect.") - .def_readonly("dSeg", &NVCVSegment::dSeg, "Device pointer for segment mask.") - .def_readonly("segWidth", &NVCVSegment::segWidth, "Segment mask width.") - .def_readonly("segHeight", &NVCVSegment::segHeight, "Segment mask height.") - .def_readonly("segThreshold", &NVCVSegment::segThreshold, "Segment threshold.") - .def_readonly("borderColor", &NVCVSegment::borderColor, "Line color of segment outter rect.") - .def_readonly("segColor", &NVCVSegment::segColor, "Segment mask color."); + "box"_a, "thickness"_a, "segArray"_a, "segThreshold"_a, "borderColor"_a, "segColor"_a); py::class_(m, "Point") - .def(py::init([]() { return NVCVPoint{}; })) .def(py::init( [](py::tuple centerPos, int32_t radius, py::tuple color) { @@ -261,7 +184,6 @@ void ExportOSD(py::module &m) .def_readonly("color", &NVCVPoint::color, "Point color."); py::class_(m, "Line") - .def(py::init([]() { return NVCVLine{}; })) .def(py::init( [](py::tuple pos0, py::tuple pos1, int32_t thickness, py::tuple color, bool interpolation) { @@ -281,46 +203,22 @@ void ExportOSD(py::module &m) .def_readonly("interpolation", &NVCVLine::interpolation, "Default: true."); py::class_(m, "PolyLine") - .def(py::init([]() { return NVCVPolyLine{}; })) .def(py::init( [](py::array_t points, int32_t thickness, bool isClosed, py::tuple borderColor, py::tuple fillColor, bool interpolation) { - NVCVPolyLine pl; - py::buffer_info points_info = points.request(); if (points_info.ndim != 2 || points_info.shape[1] != 2) { throw std::runtime_error("points dims and shape[1] must be 2!"); } - pl.numPoints = points_info.shape[0]; - pl.hPoints = new int[pl.numPoints * 2]; - checkRuntime(cudaMalloc(&pl.dPoints, 2 * pl.numPoints * sizeof(int))); - - memcpy(pl.hPoints, points_info.ptr, 2 * pl.numPoints * sizeof(int)); - checkRuntime(cudaMemcpy(pl.dPoints, points_info.ptr, 2 * pl.numPoints * sizeof(int), - cudaMemcpyHostToDevice)); - - pl.thickness = thickness; - pl.isClosed = isClosed; - pl.borderColor = pytocolor(borderColor); - pl.fillColor = pytocolor(fillColor); - pl.interpolation = interpolation; - return pl; + return NVCVPolyLine((int32_t *)points_info.ptr, points_info.shape[0], thickness, isClosed, + pytocolor(borderColor), pytocolor(fillColor), interpolation); }), - "points"_a, "thickness"_a, "isClosed"_a, "borderColor"_a, "fillColor"_a, py::arg("interpolation") = true) - .def_readonly("hPoints", &NVCVPolyLine::hPoints, "Host pointer for polyline points.") - .def_readonly("dPoints", &NVCVPolyLine::dPoints, "Device pointer for polyline points.") - .def_readonly("numPoints", &NVCVPolyLine::numPoints, "Number of polyline points.") - .def_readonly("thickness", &NVCVPolyLine::thickness, "Polyline thickness.") - .def_readonly("isClosed", &NVCVPolyLine::isClosed, "Connect p(0) to p(n-1) or not.") - .def_readonly("borderColor", &NVCVPolyLine::borderColor, "Line color of polyline border.") - .def_readonly("fillColor", &NVCVPolyLine::fillColor, "Fill color of poly fill area.") - .def_readonly("interpolation", 
&NVCVPolyLine::interpolation, "Default: true."); + "points"_a, "thickness"_a, "isClosed"_a, "borderColor"_a, "fillColor"_a, py::arg("interpolation") = true); py::class_(m, "RotatedBox") - .def(py::init([]() { return NVCVRotatedBox{}; })) .def(py::init( [](py::tuple centerPos, int32_t width, int32_t height, float yaw, int32_t thickness, py::tuple borderColor, py::tuple bgColor, bool interpolation) @@ -348,7 +246,6 @@ void ExportOSD(py::module &m) .def_readonly("interpolation", &NVCVRotatedBox::interpolation, "Default: false."); py::class_(m, "Circle") - .def(py::init([]() { return NVCVCircle{}; })) .def(py::init( [](py::tuple centerPos, int32_t radius, int32_t thickness, py::tuple borderColor, py::tuple bgColor) { @@ -368,7 +265,6 @@ void ExportOSD(py::module &m) .def_readonly("bgColor", &NVCVCircle::bgColor, "Circle filled color."); py::class_(m, "Arrow") - .def(py::init([]() { return NVCVArrow{}; })) .def(py::init( [](py::tuple pos0, py::tuple pos1, int32_t arrowSize, int32_t thickness, py::tuple color, bool interpolation) @@ -396,144 +292,88 @@ void ExportOSD(py::module &m) .value("HHMMSS", NVCVClockFormat::HHMMSS); py::class_(m, "Clock") - .def(py::init([]() { return NVCVClock{}; })) .def(py::init( [](NVCVClockFormat clockFormat, long time, int32_t fontSize, const char *font, py::tuple tlPos, - py::tuple fontColor, py::tuple bgColor) - { - NVCVClock clock; - clock.clockFormat = clockFormat; - clock.time = time; - clock.fontSize = fontSize; - clock.font = (const char *)malloc(strlen(font)); - memcpy(const_cast(clock.font), font, strlen(font) + 1); - clock.tlPos = pytopoint(tlPos); - clock.fontColor = pytocolor(fontColor); - clock.bgColor = pytocolor(bgColor); - return clock; + py::tuple fontColor, py::tuple bgColor) { + return NVCVClock(clockFormat, time, fontSize, font, pytopoint(tlPos), pytocolor(fontColor), + pytocolor(bgColor)); }), "clockFormat"_a, "time"_a, "fontSize"_a, py::arg("font") = "DejaVuSansMono", "tlPos"_a, "fontColor"_a, - "bgColor"_a) - .def_readonly("clockFormat", &NVCVClock::clockFormat, "Pre-defined clock format.") - .def_readonly("time", &NVCVClock::time, "Clock time.") - .def_readonly("fontSize", &NVCVClock::fontSize, "Font size.") - .def_readonly("font", &NVCVClock::font, "Font name, default: DejaVuSansMono.") - .def_readonly("tlPos", &NVCVClock::tlPos, "Top-left corner point of the text.") - .def_readonly("fontColor", &NVCVClock::fontColor, "Font color of the text.") - .def_readonly("bgColor", &NVCVClock::bgColor, "Background color of text box."); - - py::class_(m, "Element") - .def(py::init([]() { return NVCVElement{}; })) - .def(py::init( - [](NVCVOSDType type, void *data) - { - NVCVElement element; - element.type = type; - element.data = data; - return element; - }), - "type"_a, "data"_a) - .def_readonly("type", &NVCVElement::type, "Element type.") - .def_readonly("data", &NVCVElement::data, "Element data pointer."); + "bgColor"_a); - py::class_(m, "Elements") - .def(py::init([]() { return NVCVElements{}; })) + py::class_>(m, "Elements") .def(py::init( - [](std::vector numElements_vec, py::tuple elements_list) + [](const std::vector &elements_list_vec) { - NVCVElements ctx; - - ctx.batch = numElements_vec.size(); - ctx.numElements = new int[ctx.batch]; - memcpy(ctx.numElements, numElements_vec.data(), numElements_vec.size() * sizeof(int)); - - int total_element_num = elements_list.size(); - ctx.elements = new NVCVElement[total_element_num]; - - for (size_t i = 0; i < elements_list.size(); ++i) + std::vector>> elements_vec; + for (const auto &elements_list : 
elements_list_vec) { - if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_RECT; - ctx.elements[i].data = new NVCVBndBoxI(); - auto bndbox = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &bndbox, sizeof(NVCVBndBoxI)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_TEXT; - ctx.elements[i].data = new NVCVText(); - auto text = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &text, sizeof(NVCVText)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_SEGMENT; - ctx.elements[i].data = new NVCVSegment(); - auto segment = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &segment, sizeof(NVCVSegment)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_POINT; - ctx.elements[i].data = new NVCVPoint(); - auto point = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &point, sizeof(NVCVPoint)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_LINE; - ctx.elements[i].data = new NVCVLine(); - auto line = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &line, sizeof(NVCVLine)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_POLYLINE; - ctx.elements[i].data = new NVCVPolyLine(); - auto pl = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &pl, sizeof(NVCVPolyLine)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_ROTATED_RECT; - ctx.elements[i].data = new NVCVRotatedBox(); - auto pl = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &pl, sizeof(NVCVRotatedBox)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_CIRCLE; - ctx.elements[i].data = new NVCVCircle(); - auto circle = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &circle, sizeof(NVCVCircle)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_ARROW; - ctx.elements[i].data = new NVCVArrow(); - auto arrow = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &arrow, sizeof(NVCVArrow)); - } - else if (pybind11::isinstance(elements_list[i])) - { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_CLOCK; - ctx.elements[i].data = new NVCVClock(); - auto clock = elements_list[i].cast(); - memcpy(ctx.elements[i].data, &clock, sizeof(NVCVClock)); - } - else + std::vector> curVec; + for (size_t i = 0; i < elements_list.size(); ++i) { - ctx.elements[i].type = NVCVOSDType::NVCV_OSD_NONE; + std::shared_ptr element; + if (pybind11::isinstance(elements_list[i])) + { + auto rect = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_RECT, &rect); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto text = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_TEXT, &text); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto segment = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_SEGMENT, &segment); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto point = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_POINT, &point); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto line = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_LINE, 
&line); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto pl = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_POLYLINE, &pl); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto rb = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_ROTATED_RECT, &rb); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto circle = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_CIRCLE, &circle); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto arrow = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_ARROW, &arrow); + } + else if (pybind11::isinstance(elements_list[i])) + { + auto clock = elements_list[i].cast(); + element = std::make_shared(NVCVOSDType::NVCV_OSD_CLOCK, &clock); + } + else + { + element = std::make_shared(NVCVOSDType::NVCV_OSD_NONE, nullptr); + } + curVec.emplace_back(element); } + elements_vec.emplace_back(curVec); } - return ctx; + return std::make_shared(elements_vec); }), - "numElements"_a, "elements"_a) - .def_readonly("batch", &NVCVElements::batch, "Number of images in the image batch.") - .def_readonly("numElements", &NVCVElements::numElements, "Number array of OSD elements for image batch.") - .def_readonly("elements", &NVCVElements::elements, "OSD elements array for image batch, \ref NVCVElement."); + "elements"_a); } } // namespace cvcudapy diff --git a/python/mod_cvcuda/OsdElement.hpp b/python/mod_cvcuda/OsdElement.hpp index 320eac80..c18b664a 100644 --- a/python/mod_cvcuda/OsdElement.hpp +++ b/python/mod_cvcuda/OsdElement.hpp @@ -24,8 +24,6 @@ namespace cvcudapy { namespace py = ::pybind11; -void ExportBndBox(py::module &m); - void ExportBoxBlur(py::module &m); void ExportOSD(py::module &m); diff --git a/python/mod_cvcuda/PairwiseMatcherType.cpp b/python/mod_cvcuda/PairwiseMatcherType.cpp new file mode 100644 index 00000000..deb97578 --- /dev/null +++ b/python/mod_cvcuda/PairwiseMatcherType.cpp @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "PairwiseMatcherType.hpp" + +#include + +namespace cvcudapy { + +void ExportPairwiseMatcherType(py::module &m) +{ + py::enum_(m, "Matcher", py::arithmetic()).value("BRUTE_FORCE", NVCV_BRUTE_FORCE); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/PairwiseMatcherType.hpp b/python/mod_cvcuda/PairwiseMatcherType.hpp new file mode 100644 index 00000000..c49d0097 --- /dev/null +++ b/python/mod_cvcuda/PairwiseMatcherType.hpp @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
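The refactored OSD bindings replace the old flat element arrays plus per-image counts with per-image Python lists of element objects. A short sketch of the new construction pattern, assuming the RGBA color tuples and coordinates shown are merely illustrative values:

    import cvcuda

    # One bounding box, one blur box and one text label for a single image.
    box = cvcuda.BndBoxI(box=(10, 10, 100, 80), thickness=2,
                         borderColor=(255, 0, 0, 255), fillColor=(0, 0, 0, 0))
    blur = cvcuda.BlurBoxI(box=(120, 10, 64, 64), kernelSize=7)
    text = cvcuda.Label(utf8Text="person: 0.93", fontSize=16, tlPos=(10, 5),
                        fontColor=(255, 255, 255, 255), bgColor=(0, 0, 0, 128))

    # Each container now takes one list of elements per image in the batch.
    bndboxes  = cvcuda.BndBoxesI(boxes=[[box]])
    blurboxes = cvcuda.BlurBoxesI(boxes=[[blur]])
    elements  = cvcuda.Elements(elements=[[box, text]])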
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_PAIRWISE_MATCHER_TYPE_HPP +#define NVCV_PYTHON_PAIRWISE_MATCHER_TYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportPairwiseMatcherType(py::module &m); + +} // namespace cvcudapy + +#endif // NVCV_PYTHON_PAIRWISE_MATCHER_TYPE_HPP diff --git a/python/mod_cvcuda/WorkspaceCache.cpp b/python/mod_cvcuda/WorkspaceCache.cpp new file mode 100644 index 00000000..989b9d51 --- /dev/null +++ b/python/mod_cvcuda/WorkspaceCache.cpp @@ -0,0 +1,87 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "WorkspaceCache.hpp" + +namespace cvcudapy { + +WorkspaceLease::WorkspaceLease(WorkspaceCache *owner, CachedWorkspaceMem &&host, + CachedWorkspaceMem &&pinned, + CachedWorkspaceMem &&cuda, + std::optional hostReleaseStream, + std::optional pinnedReleaseStream, + std::optional cudaReleaseStream) + : m_owner(owner) + , m_host(std::move(host)) + , m_pinned(std::move(pinned)) + , m_cuda(std::move(cuda)) + , m_hostReleaseStream(std::move(hostReleaseStream)) + , m_pinnedReleaseStream(std::move(pinnedReleaseStream)) + , m_cudaReleaseStream(std::move(cudaReleaseStream)) +{ +} + +WorkspaceLease::~WorkspaceLease() +{ + if (m_host) + m_owner->m_host.put(std::move(m_host), m_hostReleaseStream); + if (m_pinned) + m_owner->m_pinned.put(std::move(m_pinned), m_pinnedReleaseStream); + if (m_cuda) + m_owner->m_cuda.put(std::move(m_cuda), m_hostReleaseStream); +} + +WorkspaceCache::WorkspaceCache(nvcv::Allocator allocator) + : m_eventCache(std::make_shared()) + , m_host(allocator, m_eventCache) + , m_pinned(allocator, m_eventCache) + , m_cuda(allocator, m_eventCache) +{ +} + +WorkspaceCache::WorkspaceCache() + : WorkspaceCache(nvcv::CustomAllocator<>{}) +{ +} + +WorkspaceLease WorkspaceCache::get(cvcuda::WorkspaceRequirements req, std::optional hostAcquireStream, + std::optional hostReleaseStream, + std::optional pinnedAcquireStream, + std::optional pinnedReleaseStream, + std::optional cudaAcquireStream, + std::optional cudaReleaseStream) +{ + return WorkspaceLease(this, m_host.get(req.hostMem, hostAcquireStream), + m_pinned.get(req.pinnedMem, pinnedAcquireStream), m_cuda.get(req.cudaMem, cudaAcquireStream), + hostReleaseStream, pinnedReleaseStream, cudaReleaseStream); +} + +WorkspaceCache &WorkspaceCache::instance() +{ + static WorkspaceCache instance; + return instance; +} + +void WorkspaceCache::clear() +{ + m_cuda.clear(); + m_pinned.clear(); + m_host.clear(); + 
m_eventCache->purge(); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/WorkspaceCache.hpp b/python/mod_cvcuda/WorkspaceCache.hpp new file mode 100644 index 00000000..ffd56a21 --- /dev/null +++ b/python/mod_cvcuda/WorkspaceCache.hpp @@ -0,0 +1,319 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CVCUDA_PYTHON_WORKSPACE_CACHE_HPP +#define CVCUDA_PYTHON_WORKSPACE_CACHE_HPP + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cvcudapy { + +using WorkspaceMemDestructor_t = std::function; + +enum class MemoryKind +{ + Host, + Pinned, + Cuda +}; + +template +class CachedWorkspaceMem : public cvcuda::WorkspaceMem +{ +public: + CachedWorkspaceMem() + : cvcuda::WorkspaceMem({}) + { + assert(data == nullptr); + assert(ready == nullptr); + } + + CachedWorkspaceMem(const cvcuda::WorkspaceMem &mem, WorkspaceMemDestructor_t destructor) + : cvcuda::WorkspaceMem(mem) + , m_destructor(std::move(destructor)) + { + } + + CachedWorkspaceMem(CachedWorkspaceMem &&mem) + { + *this = std::move(mem); + } + + CachedWorkspaceMem &operator=(CachedWorkspaceMem &&mem) + { + std::swap(wsMem(), mem.wsMem()); + std::swap(m_destructor, mem.m_destructor); + mem.reset(); + return *this; + } + + ~CachedWorkspaceMem() + { + reset(); + } + + void reset() + { + if (m_destructor) + { + m_destructor(*this); + m_destructor = {}; + } + wsMem() = {}; + } + + explicit operator bool() const noexcept + { + return data != nullptr; + } + +private: + cvcuda::WorkspaceMem &wsMem() & + { + return static_cast(*this); + } + + const cvcuda::WorkspaceMem &wsMem() const & + { + return static_cast(*this); + } + + WorkspaceMemDestructor_t m_destructor; +}; + +template +inline size_t StreamCachePayloadSize(const CachedWorkspaceMem &mem) +{ + return mem.req.size; +} + +template +inline size_t StreamCachePayloadAlignment(const CachedWorkspaceMem &mem) +{ + return mem.req.alignment; +} + +template +class WorkspaceMemCache +{ +public: + using Mem = CachedWorkspaceMem; + using Base = nvcv::util::PerStreamCache; + + WorkspaceMemCache(nvcv::Allocator alloc, std::shared_ptr eventCache) + : m_alloc(std::move(alloc)) + , m_eventCache(std::move(eventCache)) + { + } + + ~WorkspaceMemCache() + { + assert(m_outstandingAllocs == 0); + } + + Mem get(cvcuda::WorkspaceMemRequirements req, std::optional stream) + { + if (req.size == 0) + return {}; + + ++m_outstandingAllocs; + auto opt = m_memCache.get(req.size, req.alignment, stream); + if (opt) + return std::move(opt).value(); + + return create(req); + } + + void put(Mem &&mem, std::optional stream) + { + m_memCache.put(std::move(mem), stream); + --m_outstandingAllocs; + } + + void clear() + { + assert(m_outstandingAllocs == 0); + m_memCache.purge(); + } + +private: + void *allocateMem(size_t size, size_t alignment) const + { + if constexpr (kind == 
MemoryKind::Host) + return m_alloc.hostMem().alloc(size, alignment); + else if constexpr (kind == MemoryKind::Pinned) + return m_alloc.hostPinnedMem().alloc(size, alignment); + else if constexpr (kind == MemoryKind::Cuda) + return m_alloc.cudaMem().alloc(size, alignment); + else + return nullptr; // should never happen + } + + void freeMem(void *mem, size_t size, size_t alignment) const + { + if constexpr (kind == MemoryKind::Host) + return m_alloc.hostMem().free(mem, size, alignment); + else if constexpr (kind == MemoryKind::Pinned) + return m_alloc.hostPinnedMem().free(mem, size, alignment); + else if constexpr (kind == MemoryKind::Cuda) + return m_alloc.cudaMem().free(mem, size, alignment); + } + + auto getMemDeleter() const + { + return [this](cvcuda::WorkspaceMem &mem) + { + // free the memory + freeMem(mem.data, mem.req.size, mem.req.alignment); + // return the event to the event cache + if (mem.ready) + { + m_eventCache->put(nvcv::util::CudaEvent(mem.ready)); + mem.ready = nullptr; + } + }; + } + + Mem create(cvcuda::WorkspaceMemRequirements req) + { + WorkspaceMemDestructor_t del = getMemDeleter(); + + auto evt = nvcv::util::CudaEvent::Create(); + void *data = allocateMem(req.size, req.alignment); + + cvcuda::WorkspaceMem wsmem = {req, data, evt.get()}; + + Mem mem(wsmem, std::move(del)); + evt.release(); // from now on, the event handle is managed by `mem`. + return mem; + } + + nvcv::Allocator m_alloc; + + std::shared_ptr m_eventCache; + + nvcv::util::PerStreamCache> m_memCache; + + std::atomic_int m_outstandingAllocs; +}; + +class WorkspaceCache; + +class WorkspaceLease +{ +public: + cvcuda::Workspace get() const + { + return {m_host, m_pinned, m_cuda}; + } + + ~WorkspaceLease(); + +private: + friend class WorkspaceCache; + WorkspaceLease(WorkspaceCache *owner, CachedWorkspaceMem &&host, + CachedWorkspaceMem &&pinned, CachedWorkspaceMem &&cuda, + std::optional hostReleaseStream, std::optional pinnedReleaseStream, + std::optional cudaReleaseStream); + + WorkspaceCache *m_owner; + CachedWorkspaceMem m_host; + CachedWorkspaceMem m_pinned; + CachedWorkspaceMem m_cuda; + + std::optional m_hostReleaseStream, m_pinnedReleaseStream, m_cudaReleaseStream; +}; + +class WorkspaceCache +{ +public: + WorkspaceCache(); + + WorkspaceCache(nvcv::Allocator allocator); + + /** Gets a workspace with custom stream semantics + * + * @param req The workspace memory sizes and alignments + * @param hostAcquireStream The stream on which regular host memory will be initialky used; typically nullopt + * @param hostReleaseStream The stream on which regular host memory usage will be completed; typically nullopt + * @param pinnedAcquireStream The stream on which pinned memory will be initialky used; typically nullopt + * @param pinnedReleaseStream The stream on which pinned memory usage will be completed; typically the main stream + * on which the operator is executed + * @param cudaAcquireStream The stream on which device memory will be initialky used + * @param cudaReleaseStream The stream on which device memory usage will be completed + */ + WorkspaceLease get(cvcuda::WorkspaceRequirements req, std::optional hostAcquireStream, + std::optional hostReleaseStream, std::optional pinnedAcquireStream, + std::optional pinnedReleaseStream, std::optional cudaAcquireStream, + std::optional cudaReleaseStream); + + /** Gets a workspace with default stream semantics + * + * The default stream semantics are: + * - host memory doesn't use any streams + * - pinned memory is used for h2d copy (released in stream order) + * 
- device memory is acquired and released on the same stream + * + * NOTE: If these semantics are not honored by the user, the code should still be correct, just less efficient. + */ + WorkspaceLease get(cvcuda::WorkspaceRequirements req, cudaStream_t stream) + { + return get(req, std::nullopt, std::nullopt, std::nullopt, stream, stream, stream); + } + + auto &host() + { + return m_host; + } + + auto &pinned() + { + return m_pinned; + } + + auto &cuda() + { + return m_cuda; + } + + static WorkspaceCache &instance(); + + void clear(); + +private: + std::shared_ptr m_eventCache; + WorkspaceMemCache m_host; + WorkspaceMemCache m_pinned; + WorkspaceMemCache m_cuda; + + friend class WorkspaceLease; +}; + +} // namespace cvcudapy + +#endif // CVCUDA_PYTHON_WORKSPACE_CACHE_HPP diff --git a/python/mod_cvcuda/exports.ldscript b/python/mod_cvcuda/exports.ldscript new file mode 100644 index 00000000..fb32f5a6 --- /dev/null +++ b/python/mod_cvcuda/exports.ldscript @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{ +# Restricts global symbols to the only one +# that needs to be exported by a python module. +global: + PyInit_cvcuda; +local: *; +}; diff --git a/python/mod_nvcv/Array.cpp b/python/mod_nvcv/Array.cpp new file mode 100644 index 00000000..5a39e91e --- /dev/null +++ b/python/mod_nvcv/Array.cpp @@ -0,0 +1,350 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Array.hpp" + +#include "DataType.hpp" +#include "ExternalBuffer.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace nvcvpy::priv { + +std::shared_ptr Array::CreateFromReqs(const nvcv::Array::Requirements &reqs) +{ + std::vector> vcont = Cache::Instance().fetch(Key{reqs}); + + // None found? 
+ if (vcont.empty()) + { + std::shared_ptr array(new Array(reqs)); + array->impl().resize(reqs.capacity); + Cache::Instance().add(*array); + return array; + } + else + { + // Get the first one + auto array = std::static_pointer_cast(vcont[0]); + NVCV_ASSERT(array->dtype() == reqs.dtype); + return array; + } +} + +std::shared_ptr Array::Create(int64_t length, nvcv::DataType dtype) +{ + nvcv::Array::Requirements reqs = nvcv::Array::CalcRequirements(length, dtype); + return CreateFromReqs(reqs); +} + +std::shared_ptr Array::Create(const Shape &shape, nvcv::DataType dtype) +{ + return Create(LengthIf1D(shape), dtype); +} + +namespace { + +NVCVArrayData FillNVCVArrayData(const DLTensor &tensor, NVCVArrayBufferType bufType) +{ + NVCVArrayData arrayData = {}; + + // dtype ------------ + arrayData.dtype = py::cast(ToDType(ToNVCVDataType(tensor.dtype))); + + // rank ------------ + { + // TODO: Add 0D support + int rank = tensor.ndim == 0 ? 1 : tensor.ndim; + if (rank != 1) + { + throw std::invalid_argument(util::FormatString("The tensor rank must be 1 not %d", rank)); + } + } + + // shape ------------ + arrayData.capacity = arrayData.length = tensor.shape[0]; + + // buffer type ------------ + if (IsCudaAccessible(tensor.device.device_type)) + { + arrayData.bufferType = NVCV_ARRAY_BUFFER_HOST; + } + else + { + throw std::runtime_error("Only CUDA-accessible arrays are supported for now"); + } + + NVCVArrayBufferStrided &dataStrided = arrayData.buffer.strided; + + // stride ------------ + int elemStrideBytes = (tensor.dtype.bits * tensor.dtype.lanes + 7) / 8; + for (int d = 0; d < tensor.ndim; ++d) + { + dataStrided.stride = tensor.strides[d] * elemStrideBytes; + } + + // Memory buffer ------------ + dataStrided.basePtr = reinterpret_cast(tensor.data) + tensor.byte_offset; + + return arrayData; +} + +NVCVArrayData FillNVCVArrayDataCUDA(const DLTensor &tensor) +{ + return FillNVCVArrayData(tensor, NVCV_ARRAY_BUFFER_HOST); +} + +} // namespace + +std::shared_ptr Array::Wrap(ExternalBuffer &buffer) +{ + const DLTensor &dlTensor = buffer.dlTensor(); + + nvcv::ArrayDataCuda data{FillNVCVArrayDataCUDA(dlTensor)}; + + // This is the key of a tensor wrapper. + // All tensor wrappers have the same key. + Array::Key key; + // We take this opportunity to remove from cache all wrappers that aren't + // being used. They aren't reusable anyway. + Cache::Instance().removeAllNotInUseMatching(key); + + auto array = std::shared_ptr(new Array(data, py::cast(buffer.shared_from_this()))); + + // Need to add wrappers to cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. + Cache::Instance().add(*array); + return array; +} + +std::shared_ptr Array::ResizeArray(Array &array, int64_t length) +{ + Array::Key key; + Cache::Instance().removeAllNotInUseMatching(key); + + auto array_impl = array.impl(); + array_impl.resize(length); + + auto new_array = std::shared_ptr(new Array(std::move(array_impl))); + + // Need to add wrappers to cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. 
+ Cache::Instance().add(*new_array); + return new_array; +} + +std::shared_ptr Array::ResizeArray(Array &array, Shape shape) +{ + return ResizeArray(array, LengthIf1D(shape)); +} + +std::shared_ptr Array::Resize(int64_t length) +{ + return ResizeArray(*this, length); +} + +std::shared_ptr Array::Resize(Shape shape) +{ + return ResizeArray(*this, shape); +} + +Array::Array(const nvcv::Array::Requirements &reqs) + : m_impl{reqs} + , m_key{reqs} +{ +} + +Array::Array(const nvcv::ArrayData &data, py::object wrappedObject) + : m_impl{nvcv::ArrayWrapData(data)} + , m_key{} + , m_wrappedObject(wrappedObject) +{ +} + +Array::Array(nvcv::Array &&array) + : m_impl{std::move(array)} + , m_key{} +{ +} + +std::shared_ptr Array::shared_from_this() +{ + return std::static_pointer_cast(Container::shared_from_this()); +} + +std::shared_ptr Array::shared_from_this() const +{ + return std::static_pointer_cast(Container::shared_from_this()); +} + +nvcv::Array &Array::impl() +{ + return m_impl; +} + +const nvcv::Array &Array::impl() const +{ + return m_impl; +} + +Shape Array::shape() const +{ + return CreateShape(m_impl.length()); +} + +nvcv::DataType Array::dtype() const +{ + return m_impl.dtype(); +} + +int Array::rank() const +{ + return m_impl.rank(); +} + +int64_t Array::length() const +{ + return m_impl.length(); +} + +Array::Key::Key(const nvcv::Array::Requirements &reqs) + : Key(reqs.capacity, static_cast(reqs.dtype)) +{ +} + +Array::Key::Key(int64_t length, nvcv::DataType dtype) + : m_length(std::move(length)) + , m_dtype(dtype) + , m_wrapper(false) +{ +} + +size_t Array::Key::doGetHash() const +{ + if (m_wrapper) + { + return 0; // all wrappers are equal wrt. the cache + } + else + { + using util::ComputeHash; + return ComputeHash(m_length, m_dtype); + } +} + +bool Array::Key::doIsCompatible(const IKey &that_) const +{ + const Key &that = static_cast(that_); + + // Wrapper key's all compare equal, are they can't be used + // and whenever we query the cache for wrappers, we really + // want to get them all (as long as they aren't being used). + if (m_wrapper && that.m_wrapper) + { + return true; + } + else if (m_wrapper || that.m_wrapper) // xor + { + return false; + } + else + { + return std::tie(m_length, m_dtype) == std::tie(that.m_length, that.m_dtype); + } +} + +auto Array::key() const -> const Key & +{ + return m_key; +} + +static py::object ToPython(const nvcv::ArrayData &arrayData, py::object owner) +{ + py::object out; + + auto data = arrayData.cast(); + if (!data) + { + throw std::runtime_error("Only tensors with pitch-linear data can be exported"); + } + + DLPackTensor dlTensor(*data); + return ExternalBuffer::Create(std::move(dlTensor), owner); +} + +py::object Array::cuda() const +{ + nvcv::ArrayData arrayData = m_impl.exportData(); + + // Note: we can't cache the returned ExternalBuffer because it is holding + // a reference to us. Doing so would lead to mem leaks. + return ToPython(arrayData, py::cast(this->shared_from_this())); +} + +std::ostream &operator<<(std::ostream &out, const Array &array) +{ + return out << "(&Array::Create)), "length"_a, "dtype"_a, + "Create a Array object with the given length and data type.") + .def(py::init(static_cast(&Array::Create)), "shape"_a, "dtype"_a, + "Create a Array object with the given shape and data type.") + .def_property_readonly("shape", &Array::shape, "The shape of the Array.") + .def_property_readonly("dtype", &Array::dtype, "The data type of the Array.") + // numpy and others use ndim, let's be consistent with them in python. 
+ // It's not a requirement to be consistent between NVCV Python and C/C++. + // Each language use whatever is appropriate (and expected) in their environment. + .def_property_readonly("ndim", &Array::rank, "The number of dimensions of the Array.") + .def("cuda", &Array::cuda, "Reference to the Array on the CUDA device.") + .def("resize", static_cast(&Array::Resize), "length"_a, + "Produces an array pointing to the same data but with a new length.") + .def("resize", static_cast(&Array::Resize), "shape"_a, + "Produces an array pointing to the same data but with a new shape.") + .def("__repr__", &util::ToString, "Return the string representation of the Array object."); + + m.def("as_array", &Array::Wrap, "buffer"_a, "Wrap an existing buffer into a Array object with the given layout."); + m.def("resize", static_cast(&Array::ResizeArray), "array"_a, "length"_a, + "Produces an array pointing to the same data but with a new length."); + m.def("resize", static_cast(&Array::ResizeArray), "array"_a, "shape"_a, + "Produces an array pointing to the same data but with a new shape."); +} + +} // namespace nvcvpy::priv diff --git a/python/mod_nvcv/Array.hpp b/python/mod_nvcv/Array.hpp new file mode 100644 index 00000000..1cb32b94 --- /dev/null +++ b/python/mod_nvcv/Array.hpp @@ -0,0 +1,106 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
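A sketch of the Array API exported above from the Python side; nvcv.Type.F32 as the float32 data-type name and the PyTorch tensor used as the wrapped CUDA buffer are assumptions for illustration:

    import nvcv
    import torch

    # Create a one-dimensional device array of 1000 float32 elements.
    arr = nvcv.Array(1000, nvcv.Type.F32)
    print(arr.shape, arr.dtype, arr.ndim)

    # Export the device buffer so other CUDA libraries can consume it.
    buf = arr.cuda()

    # Views over the same data with a new length, via the method or the free function.
    shorter = arr.resize(512)
    shorter = nvcv.resize(arr, 512)

    # Wrap an existing 1-D CUDA buffer without copying.
    wrapped = nvcv.as_array(torch.zeros(256, device="cuda"))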
+ */ + +#ifndef NVCV_PYTHON_PRIV_ARRAY_HPP +#define NVCV_PYTHON_PRIV_ARRAY_HPP + +#include "Container.hpp" +#include "Size.hpp" + +#include +#include +#include +#include +#include +#include + +namespace nvcvpy::priv { +namespace py = pybind11; + +class ExternalBuffer; + +class Array : public Container +{ +public: + static void Export(py::module &m); + + static std::shared_ptr Create(int64_t length, nvcv::DataType dtype); + static std::shared_ptr Create(const Shape &shape, nvcv::DataType dtype); + + static std::shared_ptr CreateFromReqs(const nvcv::Array::Requirements &reqs); + + static std::shared_ptr Wrap(ExternalBuffer &buffer); + static std::shared_ptr ResizeArray(Array &array, Shape shape); + static std::shared_ptr ResizeArray(Array &array, int64_t length); + + std::shared_ptr Resize(Shape shape); + std::shared_ptr Resize(int64_t length); + + std::shared_ptr shared_from_this(); + std::shared_ptr shared_from_this() const; + + Shape shape() const; + nvcv::DataType dtype() const; + int rank() const; + int64_t length() const; + + nvcv::Array &impl(); + const nvcv::Array &impl() const; + + class Key final : public IKey + { + public: + explicit Key() + : m_wrapper(true) + { + } + + explicit Key(const nvcv::Array::Requirements &reqs); + explicit Key(int64_t length, nvcv::DataType dtype); + + private: + int64_t m_length; + nvcv::DataType m_dtype; + bool m_wrapper; + + virtual size_t doGetHash() const override; + virtual bool doIsCompatible(const IKey &that) const override; + }; + + virtual const Key &key() const override; + + py::object cuda() const; + +private: + Array(const nvcv::Array::Requirements &reqs); + Array(const nvcv::ArrayData &data, py::object wrappedObject); + Array(nvcv::Array &&array); + + // m_impl must come before m_key + nvcv::Array m_impl; + Key m_key; + + mutable py::object m_cacheExternalObject; + + py::object m_wrappedObject; // null if not wrapping +}; + +std::ostream &operator<<(std::ostream &out, const Array &array); + +} // namespace nvcvpy::priv + +#endif // NVCV_PYTHON_PRIV_ARRAY_HPP diff --git a/python/mod_nvcv/CAPI.cpp b/python/mod_nvcv/CAPI.cpp index c59cf8ca..6c5f9cd9 100644 --- a/python/mod_nvcv/CAPI.cpp +++ b/python/mod_nvcv/CAPI.cpp @@ -17,12 +17,14 @@ #include "CAPI.hpp" +#include "Array.hpp" #include "Cache.hpp" #include "DataType.hpp" #include "Image.hpp" #include "ImageBatch.hpp" #include "Stream.hpp" #include "Tensor.hpp" +#include "TensorBatch.hpp" #include #include @@ -73,6 +75,11 @@ extern "C" NVCVTensorHandle ImplTensor_GetHandle(PyObject *obj) return ToSharedObj(obj)->impl().handle(); } +extern "C" NVCVArrayHandle ImplArray_GetHandle(PyObject *obj) +{ + return ToSharedObj(obj)->impl().handle(); +} + LockMode ToLockMode(PyObject *_mode) { std::string s = ToObj(_mode); @@ -156,6 +163,13 @@ extern "C" PyObject *ImplTensor_Create(int32_t ndim, const int64_t *shape, NVCVD return py::cast(std::move(tensor)).release().ptr(); } +extern "C" PyObject *ImplArray_Create(int64_t length, NVCVDataType dtype) +{ + std::shared_ptr array = Array::Create(length, nvcv::DataType{dtype}); + + return py::cast(std::move(array)).release().ptr(); +} + extern "C" PyObject *ImplImageBatchVarShape_Create(int32_t capacity) { std::shared_ptr varshape = ImageBatchVarShape::Create(capacity); @@ -191,6 +205,33 @@ extern "C" void ImplImageBatchVarShape_Clear(PyObject *varshape) return ToSharedObj(varshape)->clear(); } +extern "C" PyObject *ImplTensorBatch_Create(int32_t capacity) +{ + std::shared_ptr tensorBatch = TensorBatch::Create(capacity); + return 
py::cast(std::move(tensorBatch)).release().ptr(); +} + +extern "C" NVCVTensorBatchHandle ImplTensorBatch_GetHandle(PyObject *tensorBatch) +{ + return ToSharedObj(tensorBatch)->impl().handle(); +} + +extern "C" void ImplTensorBatch_PushBack(PyObject *tensorBatch, PyObject *tensor) +{ + auto ptensor = ToSharedObj(tensor); + ToSharedObj(tensorBatch)->pushBack(*ptensor); +} + +extern "C" void ImplTensorBatch_PopBack(PyObject *tensorBatch, uint32_t cnt) +{ + ToSharedObj(tensorBatch)->popBack(cnt); +} + +extern "C" void ImplTensorBatch_Clear(PyObject *tensorBatch) +{ + ToSharedObj(tensorBatch)->clear(); +} + extern "C" void ImplCache_Add(ICacheItem *extItem) { auto item = std::make_shared(extItem->shared_from_this()); @@ -260,6 +301,8 @@ void ExportCAPI(py::module &m) .Tensor_GetHandle = &ImplTensor_GetHandle, .Tensor_Create = &ImplTensor_Create, .Tensor_CreateForImageBatch = &ImplTensor_CreateForImageBatch, + .Array_GetHandle = &ImplArray_GetHandle, + .Array_Create = &ImplArray_Create, .ImageBatchVarShape_Create = &ImplImageBatchVarShape_Create, .ImageBatchVarShape_GetHandle = &ImplImageBatchVarShape_GetHandle, .ImageBatchVarShape_PushBack = &ImplImageBatchVarShape_PushBack, @@ -271,6 +314,11 @@ void ExportCAPI(py::module &m) .Image_GetHandle = &ImplImage_GetHandle, .Container_Create = &ImplContainer_Create, .Cache_RemoveAllNotInUseMatching = &ImplCache_RemoveAllNotInUseMatching, + .TensorBatch_Create = &ImplTensorBatch_Create, + .TensorBatch_GetHandle = &ImplTensorBatch_GetHandle, + .TensorBatch_PushBack = &ImplTensorBatch_PushBack, + .TensorBatch_PopBack = &ImplTensorBatch_PopBack, + .TensorBatch_Clear = &ImplTensorBatch_Clear, }; m.add_object("_C_API", py::capsule(&capi, "nvcv._C_API")); diff --git a/python/mod_nvcv/CMakeLists.txt b/python/mod_nvcv/CMakeLists.txt index 76e65e3c..d1eb3428 100644 --- a/python/mod_nvcv/CMakeLists.txt +++ b/python/mod_nvcv/CMakeLists.txt @@ -56,12 +56,14 @@ nvcv_python_add_module( Tensor.cpp Image.cpp ImageBatch.cpp + TensorBatch.cpp ExternalBuffer.cpp Rect.cpp Object.cpp CAPI.cpp DLPackUtils.cpp ColorSpec.cpp + Array.cpp ) target_include_directories(nvcv_module_python @@ -83,6 +85,13 @@ target_link_libraries(nvcv_module_python -lrt ) +# use exports file to expose only the symbol dl-loaded by python, +# and nothing else. +target_link_options(nvcv_module_python + PRIVATE + -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/exports.ldscript +) + add_library(pynvcv INTERFACE) target_include_directories(pynvcv INTERFACE include diff --git a/python/mod_nvcv/CastUtils.hpp b/python/mod_nvcv/CastUtils.hpp new file mode 100644 index 00000000..9614d967 --- /dev/null +++ b/python/mod_nvcv/CastUtils.hpp @@ -0,0 +1,47 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_PYTHON_PRIV_CAST_UTILS_HPP +#define NVCV_PYTHON_PRIV_CAST_UTILS_HPP + +#include +#include + +#include + +namespace nvcvpy::priv { +namespace py = pybind11; + +// pybind11 2.10.3 can't convert an item from the input list into another type +// automatically. It won't be able to match the call to current method definition. +// We have to accept std::vector and try to cast them manually. + +template +std::shared_ptr cast_py_object_as(py::object &obj) +{ + py::detail::type_caster caster; + if (!caster.load(obj, true)) + { + return {}; + } + std::shared_ptr buf = caster; + return buf; +} + +} // namespace nvcvpy::priv + +#endif // NVCV_PYTHON_PRIV_CAST_UTILS_HPP diff --git a/python/mod_nvcv/DLPackUtils.cpp b/python/mod_nvcv/DLPackUtils.cpp index 3f476347..e07cf75e 100644 --- a/python/mod_nvcv/DLPackUtils.cpp +++ b/python/mod_nvcv/DLPackUtils.cpp @@ -135,10 +135,9 @@ DLPackTensor::DLPackTensor(const nvcv::TensorDataStrided &tensorData) for (int i = 0; i < tensor.ndim; ++i) { int64_t stride = tensorData.cdata().buffer.strided.strides[i]; - if (stride % tensorData.dtype().strideBytes() != 0) { - throw std::runtime_error("Stride must be multiple of the element size in bytes"); + throw std::runtime_error("Stride must be a multiple of the element size in bytes"); } tensor.strides[i] = tensorData.cdata().buffer.strided.strides[i] / tensorData.dtype().strideBytes(); @@ -151,6 +150,57 @@ DLPackTensor::DLPackTensor(const nvcv::TensorDataStrided &tensorData) } } +DLPackTensor::DLPackTensor(const nvcv::ArrayData &arrayData) +{ + m_tensor = {}; + m_tensor.deleter = [](DLManagedTensor *self) + { + delete[] self->dl_tensor.shape; + delete[] self->dl_tensor.strides; + }; + + try + { + DLTensor &tensor = m_tensor.dl_tensor; + + // Set up device + if (arrayData.IsCompatible()) + { + // TODO: detect correct device_type from memory buffer + tensor.device.device_type = kDLCUDA; + // TODO: detect correct device_id from memory buffer (if possible) + tensor.device.device_id = 0; + } + else + { + throw std::runtime_error("Array buffer type not supported, must be either CUDA"); + } + + // Set up ndim + tensor.ndim = arrayData.rank(); + + // Set up data + tensor.data = arrayData.basePtr(); + tensor.byte_offset = 0; + + // Set up shape + tensor.shape = new int64_t[tensor.ndim]; + tensor.shape[0] = arrayData.capacity(); + + // Set up dtype + tensor.dtype = ToDLDataType(arrayData.dtype()); + + // Set up strides + tensor.strides = new int64_t[tensor.ndim]; + tensor.strides[0] = arrayData.stride(); + } + catch (...) 
+ { + m_tensor.deleter(&m_tensor); + throw; + } +} + DLPackTensor::DLPackTensor(DLPackTensor &&that) noexcept : m_tensor{std::move(that.m_tensor)} { @@ -216,7 +266,7 @@ bool IsCudaAccessible(DLDeviceType devType) nvcv::DataType ToNVCVDataType(const DLDataType &dtype) { nvcv::PackingParams pp; - pp.byteOrder = nvcv::ByteOrder::LSB; + pp.byteOrder = nvcv::ByteOrder::MSB; int lanes = dtype.lanes; int bits = dtype.bits; diff --git a/python/mod_nvcv/DLPackUtils.hpp b/python/mod_nvcv/DLPackUtils.hpp index 1ad8adfb..f7d9069f 100644 --- a/python/mod_nvcv/DLPackUtils.hpp +++ b/python/mod_nvcv/DLPackUtils.hpp @@ -19,6 +19,7 @@ #define NVCV_PYTHON_PRIV_DLPACKUTILS_HPP #include +#include #include #include @@ -34,6 +35,7 @@ class DLPackTensor final explicit DLPackTensor(DLManagedTensor &&tensor); explicit DLPackTensor(const py::buffer_info &info, const DLDevice &dev); explicit DLPackTensor(const nvcv::TensorDataStrided &tensorData); + explicit DLPackTensor(const nvcv::ArrayData &arrayData); DLPackTensor(DLPackTensor &&that) noexcept; ~DLPackTensor(); diff --git a/python/mod_nvcv/ExternalBuffer.cpp b/python/mod_nvcv/ExternalBuffer.cpp index 822e4178..c78ea225 100644 --- a/python/mod_nvcv/ExternalBuffer.cpp +++ b/python/mod_nvcv/ExternalBuffer.cpp @@ -263,9 +263,9 @@ std::optional ExternalBuffer::cudaArrayInterface() const nvcv::DataType dataType = ToNVCVDataType(m_dlTensor->dtype); - NVCV_ASSERT(dataType.strideBytes() * 8 == m_dlTensor->dtype.bits); NVCV_ASSERT(m_dlTensor->dtype.bits % 8 == 0); - int elemStrideBytes = m_dlTensor->dtype.bits / 8; + NVCV_ASSERT(dataType.strideBytes() * 8 == m_dlTensor->dtype.bits * m_dlTensor->dtype.lanes); + int elemStrideBytes = dataType.strideBytes(); py::object strides; @@ -369,7 +369,7 @@ void ExternalBuffer::Export(py::module &m) .def("__dlpack_device__", &ExternalBuffer::dlpackDevice, "Get the device associated with the buffer"); } -} // namespace nv::vpi::python +} // namespace nvcvpy::priv namespace pybind11::detail { diff --git a/python/mod_nvcv/Image.cpp b/python/mod_nvcv/Image.cpp index 8ce28b8d..703611db 100644 --- a/python/mod_nvcv/Image.cpp +++ b/python/mod_nvcv/Image.cpp @@ -18,6 +18,7 @@ #include "Image.hpp" #include "Cache.hpp" +#include "CastUtils.hpp" #include "DataType.hpp" #include "ImageFormat.hpp" #include "Stream.hpp" @@ -573,19 +574,12 @@ std::shared_ptr Image::WrapExternalBuffer(ExternalBuffer &buffer, nvcv::I std::shared_ptr Image::WrapExternalBufferVector(std::vector buffers, nvcv::ImageFormat fmt) { std::vector> spBuffers; - for (size_t i = 0; i < buffers.size(); ++i) + for (auto &obj : buffers) { - // pybind11 2.10.3 can't convert an item from the input list into an ExternalBuffer - // automatically. It won't be able to match the call to current method definition. - // We have to accept py::objects and try to convert them here. 
- py::detail::type_caster caster; - if (!caster.load(buffers[i], true)) - { + std::shared_ptr buffer = cast_py_object_as(obj); + if (!buffer) throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces"); - } - - std::shared_ptr spbuf = caster; - spBuffers.push_back(spbuf); + spBuffers.push_back(std::move(buffer)); } std::vector bufinfos; diff --git a/python/mod_nvcv/ImageBatch.cpp b/python/mod_nvcv/ImageBatch.cpp index e97bbd4c..58831fe9 100644 --- a/python/mod_nvcv/ImageBatch.cpp +++ b/python/mod_nvcv/ImageBatch.cpp @@ -17,6 +17,8 @@ #include "ImageBatch.hpp" +#include "CastUtils.hpp" +#include "ExternalBuffer.hpp" #include "Image.hpp" #include @@ -55,6 +57,23 @@ std::shared_ptr ImageBatchVarShape::Create(int capacity) } } +std::shared_ptr ImageBatchVarShape::WrapExternalBufferVector(std::vector buffers, + nvcv::ImageFormat fmt) +{ + auto batch = Create(buffers.size()); + for (auto &obj : buffers) + { + std::shared_ptr buffer = cast_py_object_as(obj); + if (!buffer) + { + throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces"); + } + auto image = Image::WrapExternalBuffer(*buffer, fmt); + batch->pushBack(*image); + } + return batch; +} + ImageBatchVarShape::ImageBatchVarShape(int capacity) : m_key(capacity) , m_impl(capacity) @@ -174,6 +193,10 @@ void ImageBatchVarShape::Export(py::module &m) .def("popback", &ImageBatchVarShape::popBack, "count"_a = 1, "Remove one or more images from the end of the ImageBatchVarShape.") .def("clear", &ImageBatchVarShape::clear, "Remove all images from the ImageBatchVarShape."); + + m.def("as_images", &ImageBatchVarShape::WrapExternalBufferVector, py::arg_v("buffers", std::vector{}), + "format"_a = nvcv::FMT_NONE, py::keep_alive<0, 1>(), + "Wrap a vector of external buffers as a batch of images, and tie the buffers lifetime to it"); } } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/ImageBatch.hpp b/python/mod_nvcv/ImageBatch.hpp index 7e7ff1d1..7006d4cf 100644 --- a/python/mod_nvcv/ImageBatch.hpp +++ b/python/mod_nvcv/ImageBatch.hpp @@ -38,6 +38,8 @@ class ImageBatchVarShape : public Container static void Export(py::module &m); static std::shared_ptr Create(int capacity); + static std::shared_ptr WrapExternalBufferVector(std::vector buffer, + nvcv::ImageFormat fmt); std::shared_ptr shared_from_this(); std::shared_ptr shared_from_this() const; diff --git a/python/mod_nvcv/Main.cpp b/python/mod_nvcv/Main.cpp index 13e1b459..d02bf389 100644 --- a/python/mod_nvcv/Main.cpp +++ b/python/mod_nvcv/Main.cpp @@ -28,6 +28,7 @@ #include "Resource.hpp" #include "Stream.hpp" #include "Tensor.hpp" +#include "TensorBatch.hpp" #include #include @@ -62,6 +63,7 @@ PYBIND11_MODULE(nvcv, m) Resource::Export(m); Container::Export(m); Tensor::Export(m); + TensorBatch::Export(m); Image::Export(m); ImageBatchVarShape::Export(m); ExportCAPI(m); diff --git a/python/mod_nvcv/Resource.cpp b/python/mod_nvcv/Resource.cpp index b5d283bf..b6b49476 100644 --- a/python/mod_nvcv/Resource.cpp +++ b/python/mod_nvcv/Resource.cpp @@ -22,8 +22,6 @@ #include #include -#include - namespace nvcvpy::priv { Resource::Resource() diff --git a/python/mod_nvcv/Tensor.cpp b/python/mod_nvcv/Tensor.cpp index 4a314103..ea21d08a 100644 --- a/python/mod_nvcv/Tensor.cpp +++ b/python/mod_nvcv/Tensor.cpp @@ -104,6 +104,7 @@ NVCVTensorData FillNVCVTensorData(const DLTensor &tensor, std::optional NVCV_TENSOR_MAX_RANK) { @@ -181,6 +182,28 @@ std::shared_ptr Tensor::WrapImage(Image &img) return tensor; } +std::shared_ptr 
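The new as_images helper wraps a list of external CUDA buffers as a variable-shape image batch in one call. A brief sketch, where the frame resolutions and the nvcv.Format.RGB8 format choice are illustrative assumptions:

    import nvcv
    import torch

    # Three differently sized HWC frames already resident in CUDA memory.
    frames = [torch.empty((h, w, 3), dtype=torch.uint8, device="cuda")
              for h, w in ((480, 640), (720, 1280), (1080, 1920))]

    # Wrap them as one variable-shape image batch; the py::keep_alive in the
    # binding ties the buffers' lifetime to the returned batch.
    batch = nvcv.as_images(frames, nvcv.Format.RGB8)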
Tensor::ReshapeTensor(Tensor &tensor, Shape shape, std::optional layout) +{ + Tensor::Key key; + Cache::Instance().removeAllNotInUseMatching(key); + + nvcv::Tensor tensor_impl = tensor.impl(); + auto new_tensor_shape = CreateNVCVTensorShape(shape, layout ? *layout : tensor_impl.layout()); + nvcv::Tensor new_tensor_impl = tensor_impl.reshape(std::move(new_tensor_shape)); + auto new_tensor = std::shared_ptr(new Tensor(std::move(new_tensor_impl))); + + // Need to add wrappers to cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. + Cache::Instance().add(*new_tensor); + return new_tensor; +} + +std::shared_ptr Tensor::Reshape(Shape shape, std::optional layout) +{ + return ReshapeTensor(*this, std::move(shape), std::move(layout)); +} + Tensor::Tensor(const nvcv::Tensor::Requirements &reqs) : m_impl{reqs} , m_key{reqs} @@ -201,6 +224,12 @@ Tensor::Tensor(Image &img) { } +Tensor::Tensor(nvcv::Tensor &&tensor) + : m_impl{std::move(tensor)} + , m_key{} +{ +} + std::shared_ptr Tensor::shared_from_this() { return std::static_pointer_cast(Container::shared_from_this()); @@ -373,12 +402,16 @@ void Tensor::Export(py::module &m) // It's not a requirement to be consistent between NVCV Python and C/C++. // Each language use whatever is appropriate (and expected) in their environment. .def_property_readonly("ndim", &Tensor::rank, "The number of dimensions of the Tensor.") - .def("cuda", &Tensor::cuda, "Referance to the Tensor on the CUDA device.") + .def("cuda", &Tensor::cuda, "Reference to the Tensor on the CUDA device.") + .def("reshape", &Tensor::Reshape, "shape"_a, "layout"_a = std::nullopt, + "Produces a tensor pointing to the same data but with a new shape and layout.") .def("__repr__", &util::ToString, "Return the string representation of the Tensor object."); m.def("as_tensor", &Tensor::Wrap, "buffer"_a, "layout"_a = std::nullopt, "Wrap an existing buffer into a Tensor object with the given layout."); m.def("as_tensor", &Tensor::WrapImage, "image"_a, "Wrap an existing image into a Tensor object."); + m.def("reshape", &Tensor::ReshapeTensor, "tensor"_a, "shape"_a, "layout"_a = std::nullopt, + "Produces a tensor pointing to the same data but with a new shape and layout."); } } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/Tensor.hpp b/python/mod_nvcv/Tensor.hpp index 27a4055a..87f1b7ba 100644 --- a/python/mod_nvcv/Tensor.hpp +++ b/python/mod_nvcv/Tensor.hpp @@ -47,6 +47,9 @@ class Tensor : public Container static std::shared_ptr Wrap(ExternalBuffer &buffer, std::optional layout); static std::shared_ptr WrapImage(Image &img); + static std::shared_ptr ReshapeTensor(Tensor &tensor, Shape shape, std::optional layout); + + std::shared_ptr Reshape(Shape shape, std::optional layout); std::shared_ptr shared_from_this(); std::shared_ptr shared_from_this() const; @@ -87,6 +90,7 @@ class Tensor : public Container Tensor(const nvcv::Tensor::Requirements &reqs); Tensor(const nvcv::TensorData &data, py::object wrappedObject); Tensor(Image &img); + Tensor(nvcv::Tensor &&tensor); // m_impl must come before m_key nvcv::Tensor m_impl; diff --git a/python/mod_nvcv/TensorBatch.cpp b/python/mod_nvcv/TensorBatch.cpp new file mode 100644 index 00000000..99d51497 --- /dev/null +++ b/python/mod_nvcv/TensorBatch.cpp @@ -0,0 +1,261 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TensorBatch.hpp" + +#include "CastUtils.hpp" +#include "DataType.hpp" +#include "ExternalBuffer.hpp" +#include "Tensor.hpp" + +#include + +namespace nvcvpy::priv { + +size_t TensorBatch::Key::doGetHash() const +{ + using util::ComputeHash; + return ComputeHash(m_capacity); +} + +bool TensorBatch::Key::doIsCompatible(const IKey &ithat) const +{ + auto &that = static_cast(ithat); + return m_capacity == that.m_capacity; +} + +std::shared_ptr TensorBatch::Create(int capacity) +{ + std::vector> vcont = Cache::Instance().fetch(Key{capacity}); + + // None found? + if (vcont.empty()) + { + std::shared_ptr batch(new TensorBatch(capacity)); + Cache::Instance().add(*batch); + return batch; + } + else + { + // Get the first one + auto batch = std::static_pointer_cast(vcont[0]); + batch->clear(); // make sure it's in pristine state + return batch; + } +} + +std::shared_ptr TensorBatch::WrapExternalBufferVector(std::vector buffers, + std::optional layout) +{ + TensorList list; + list.reserve(buffers.size()); + for (auto &obj : buffers) + { + std::shared_ptr buffer = cast_py_object_as(obj); + if (!buffer) + { + throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces."); + } + auto tensor = Tensor::Wrap(*buffer, layout); + list.push_back(tensor); + } + auto batch = Create(buffers.size()); + batch->pushBackMany(list); + return batch; +} + +TensorBatch::TensorBatch(int capacity) + : m_key(capacity) + , m_impl(capacity) +{ + m_list.reserve(capacity); +} + +const nvcv::TensorBatch &TensorBatch::impl() const +{ + return m_impl; +} + +nvcv::TensorBatch &TensorBatch::impl() +{ + return m_impl; +} + +int32_t TensorBatch::rank() const +{ + return m_impl.rank(); +} + +int32_t TensorBatch::capacity() const +{ + return m_impl.capacity(); +} + +int32_t TensorBatch::numTensors() const +{ + NVCV_ASSERT(m_impl.numTensors() == static_cast(m_list.size())); + return m_impl.numTensors(); +} + +std::optional TensorBatch::dtype() const +{ + auto dtype = m_impl.dtype(); + if (dtype != nvcv::DataType()) + { + return {dtype}; + } + else + { + return std::nullopt; + } +} + +std::optional TensorBatch::layout() const +{ + auto layout = m_impl.layout(); + if (layout != nvcv::TENSOR_NONE) + { + return {layout}; + } + else + { + return std::nullopt; + } +} + +void TensorBatch::pushBack(Tensor &tensor) +{ + m_impl.pushBack(tensor.impl()); + m_list.push_back(tensor.shared_from_this()); +} + +void TensorBatch::pushBackMany(std::vector> &tensorList) +{ + std::vector nvcvTensors; + nvcvTensors.reserve(tensorList.size()); + for (auto &tensor : tensorList) + { + m_list.push_back(tensor); + if (tensor) + nvcvTensors.push_back(tensor->impl()); + else + nvcvTensors.push_back(nvcv::Tensor()); + } + m_impl.pushBack(nvcvTensors.begin(), nvcvTensors.end()); +} + +void TensorBatch::popBack(int tensorCount) +{ + m_impl.popTensors(tensorCount); + m_list.erase(m_list.end() - tensorCount, 
m_list.end()); +} + +void TensorBatch::clear() +{ + m_impl.clear(); + m_list.clear(); +} + +std::shared_ptr TensorBatch::at(int64_t idx) const +{ + if (idx < 0) + { + throw std::runtime_error("Invalid index: " + std::to_string(idx)); + } + else if (idx >= static_cast(m_list.size())) + { + throw std::runtime_error("Cannot get tensor at index " + std::to_string(idx) + ". Batch has only " + + std::to_string(m_list.size()) + " elements."); + } + return m_list[idx]; +} + +void TensorBatch::set_at(int64_t idx, std::shared_ptr tensor) +{ + if (idx < 0) + { + throw std::runtime_error("Invalid index: " + std::to_string(idx)); + } + else if (idx >= static_cast(m_list.size())) + { + throw std::runtime_error("Cannot set tensor at index " + std::to_string(idx) + ". Batch has only " + + std::to_string(m_list.size()) + " elements."); + } + m_impl.setTensor(static_cast(idx), tensor->impl()); + m_list[idx] = tensor; +} + +auto TensorBatch::begin() const -> TensorList::const_iterator +{ + return m_list.begin(); +} + +auto TensorBatch::end() const -> TensorList::const_iterator +{ + return m_list.end(); +} + +std::shared_ptr TensorBatch::shared_from_this() +{ + return std::static_pointer_cast(Container::shared_from_this()); +} + +std::shared_ptr TensorBatch::shared_from_this() const +{ + return std::static_pointer_cast(Container::shared_from_this()); +} + +void TensorBatch::Export(py::module &m) +{ + using namespace py::literals; + + py::class_, Container>( + m, "TensorBatch", + "Container for a batch of tensors.\n" + "The capacity of the container must be specified upfront in the batch initialization.\n" + "The tensors in the batch may differ in shapes but they must have " + "a uniform dimensionality, data type and layout.") + .def(py::init(&TensorBatch::Create), + "capacity"_a, + "Create a new TensorBatch object with the specified capacity.") + .def_property_readonly("layout", &TensorBatch::layout, + "Layout of the tensors in the tensor batch." + " None if the batch is empty.") + .def_property_readonly("dtype", &TensorBatch::dtype, + "Data type of tensors in the tensor batch." 
+ " None if the batch is empty.") + .def_property_readonly("capacity", &TensorBatch::capacity, "Capacity of the tensor batch.") + .def_property_readonly("ndim", &TensorBatch::rank, + "Return the number of dimensions of the tensors or -1 for an empty batch") + .def("__len__", &TensorBatch::numTensors, "Return the number of tensors.") + .def( + "__iter__", [](const TensorBatch &batch) { return py::make_iterator(batch); }, + "Return an iterator over the tensors in the TensorBatch.") + .def("__setitem__", &TensorBatch::set_at, "Set tensor at a given index.") + .def("__getitem__", &TensorBatch::at, "Get a tensor at a given index.") + .def("pushback", &TensorBatch::pushBack, "Add a new image to the end of the TensorBatch.") + .def("pushback", &TensorBatch::pushBackMany, "Add multiple images to the end of the TensorBatch.") + .def("popback", &TensorBatch::popBack, "count"_a = 1, + "Remove one or more images from the end of the TensorBatch.") + .def("clear", &TensorBatch::clear, "Remove all images from the TensorBatch."); + + m.def("as_tensors", &TensorBatch::WrapExternalBufferVector, "buffers"_a = std::vector{}, + "layout"_a = std::nullopt, py::keep_alive<0, 1>(), + "Wrap a list of external buffers as a batch of tensors, and tie the buffers lifetime to it"); +} + +} // namespace nvcvpy::priv diff --git a/python/mod_nvcv/TensorBatch.hpp b/python/mod_nvcv/TensorBatch.hpp new file mode 100644 index 00000000..0edaebba --- /dev/null +++ b/python/mod_nvcv/TensorBatch.hpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_PYTHON_PRIV_TENSORBATCH_HPP +#define NVCV_PYTHON_PRIV_TENSORBATCH_HPP + +#include "Container.hpp" + +#include + +#include + +namespace nvcvpy::priv { +namespace py = pybind11; + +class Tensor; + +class TensorBatch : public Container +{ + using TensorList = std::vector>; + +public: + static void Export(py::module &m); + + static std::shared_ptr Create(int capacity); + static std::shared_ptr WrapExternalBufferVector(std::vector buffers, + std::optional layout); + + std::shared_ptr shared_from_this(); + std::shared_ptr shared_from_this() const; + + const nvcv::TensorBatch &impl() const; + nvcv::TensorBatch &impl(); + + int32_t numTensors() const; + int32_t capacity() const; + + int32_t rank() const; + std::optional dtype() const; + std::optional layout() const; + + void pushBack(Tensor &tensor); + void pushBackMany(std::vector> &tensorList); + void popBack(int tensorCount); + void clear(); + + std::shared_ptr at(int64_t idx) const; + void set_at(int64_t idx, std::shared_ptr tensor); + + TensorList::const_iterator begin() const; + TensorList::const_iterator end() const; + + class Key final : public IKey + { + public: + Key(int capacity) + : m_capacity(capacity) + { + } + + private: + int m_capacity; + + virtual size_t doGetHash() const override; + virtual bool doIsCompatible(const IKey &that) const override; + }; + + virtual const Key &key() const override + { + return m_key; + } + +private: + TensorBatch(int capacity); + Key m_key; + nvcv::TensorBatch m_impl; + TensorList m_list; +}; + +} // namespace nvcvpy::priv + +#endif // NVCV_PYTHON_PRIV_TENSORBATCH_HPP diff --git a/python/mod_nvcv/exports.ldscript b/python/mod_nvcv/exports.ldscript new file mode 100644 index 00000000..c3d107ff --- /dev/null +++ b/python/mod_nvcv/exports.ldscript @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{ +# Restricts global symbols to the only one +# that needs to be exported by a python module. +global: + PyInit_nvcv; +local: *; +}; diff --git a/python/mod_nvcv/include/nvcv/python/Array.hpp b/python/mod_nvcv/include/nvcv/python/Array.hpp new file mode 100644 index 00000000..5d3d2bcf --- /dev/null +++ b/python/mod_nvcv/include/nvcv/python/Array.hpp @@ -0,0 +1,102 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_ARRAY_HPP +#define NVCV_PYTHON_ARRAY_HPP + +#include "CAPI.hpp" +#include "DataType.hpp" +#include "Resource.hpp" +#include "Shape.hpp" + +#include +#include +#include + +#include + +namespace nvcvpy { + +namespace py = pybind11; + +class Array + : public Resource + , public nvcv::Array +{ +public: + static Array Create(int64_t length, nvcv::DataType dtype) + { + PyObject *oarray = capi().Array_Create(length, dtype); + + py::object pyarray = py::reinterpret_steal(oarray); + + return Array(pyarray); + } + + static Array Create(const Shape &shape, nvcv::DataType dtype) + { + return Create(LengthIf1D(shape), dtype); + } + +private: + friend struct py::detail::type_caster; + + Array() = default; + + explicit Array(py::object obj) + : Resource(obj) + , nvcv::Array(FromHandle(capi().Array_GetHandle(this->ptr()), true)) + { + } +}; + +} // namespace nvcvpy + +namespace pybind11::detail { + +namespace cvpy = nvcvpy; + +template<> +struct type_caster : type_caster_base +{ + PYBIND11_TYPE_CASTER(cvpy::Array, const_name("nvcv.Array")); + + bool load(handle src, bool) + { + // Does it have the correct object type? + PyTypeObject *srctype = Py_TYPE(src.ptr()); + if (strcmp(name.text, srctype->tp_name) == 0) + { + value = cvpy::Array(reinterpret_borrow(src)); + return true; + } + else + { + return false; + } + } + + static handle cast(cvpy::Array array, return_value_policy /* policy */, handle /*parent */) + { + array.inc_ref(); // for some reason this is needed + return array; + } +}; + +} // namespace pybind11::detail + +#endif // NVCV_PYTHON_ARRAY_HPP diff --git a/python/mod_nvcv/include/nvcv/python/CAPI.hpp b/python/mod_nvcv/include/nvcv/python/CAPI.hpp index 1a158abc..db5f200a 100644 --- a/python/mod_nvcv/include/nvcv/python/CAPI.hpp +++ b/python/mod_nvcv/include/nvcv/python/CAPI.hpp @@ -19,6 +19,7 @@ #define NVCV_PYTHON_CAPI_HPP #include +#include #include #include #include @@ -56,6 +57,9 @@ struct CAPI PyObject *(*Tensor_CreateForImageBatch)(int32_t numImages, int32_t width, int32_t height, NVCVImageFormat fmt, int32_t rowAlign); + NVCVArrayHandle (*Array_GetHandle)(PyObject *array); + PyObject *(*Array_Create)(int64_t length, NVCVDataType dtype); + PyObject *(*ImageBatchVarShape_Create)(int32_t capacity); NVCVImageBatchHandle (*ImageBatchVarShape_GetHandle)(PyObject *varshape); void (*ImageBatchVarShape_PushBack)(PyObject *varshape, PyObject *image); @@ -72,6 +76,16 @@ struct CAPI void (*Cache_RemoveAllNotInUseMatching)(const IKey *key); + PyObject *(*TensorBatch_Create)(int32_t capacity); + + NVCVTensorBatchHandle (*TensorBatch_GetHandle)(PyObject *tensorBatch); + + void (*TensorBatch_PushBack)(PyObject *tensorBatch, PyObject *tensor); + + void (*TensorBatch_PopBack)(PyObject *tensorBatch, uint32_t cnt); + + void (*TensorBatch_Clear)(PyObject *tensorBatch); + // always add new functions at the end, and never change the function prototypes above. 
}; diff --git a/python/mod_nvcv/include/nvcv/python/Fwd.hpp b/python/mod_nvcv/include/nvcv/python/Fwd.hpp index d6543c56..de280d8f 100644 --- a/python/mod_nvcv/include/nvcv/python/Fwd.hpp +++ b/python/mod_nvcv/include/nvcv/python/Fwd.hpp @@ -30,6 +30,7 @@ class Resource; class Image; class ImageBatchVarShape; class Tensor; +class Array; class Stream; class ResourceGuard; enum LockMode : uint8_t; diff --git a/python/mod_nvcv/include/nvcv/python/Shape.hpp b/python/mod_nvcv/include/nvcv/python/Shape.hpp index c2f7ed51..a6139a7c 100644 --- a/python/mod_nvcv/include/nvcv/python/Shape.hpp +++ b/python/mod_nvcv/include/nvcv/python/Shape.hpp @@ -37,6 +37,13 @@ inline Shape CreateShape(const nvcv::TensorShape &tshape) return s; } +inline Shape CreateShape(int64_t length) +{ + Shape s(1); + s[0] = length; + return s; +} + inline nvcv::TensorShape CreateNVCVTensorShape(const Shape &shape, nvcv::TensorLayout layout = nvcv::TENSOR_NONE) { std::vector dims; @@ -49,6 +56,24 @@ inline nvcv::TensorShape CreateNVCVTensorShape(const Shape &shape, nvcv::TensorL return nvcv::TensorShape(dims.data(), dims.size(), layout); } +inline int64_t LengthIf1D(const Shape &shape) +{ + int64_t largest = 1; + for (size_t i = 0; i < shape.size(); ++i) + { + if (shape[i].cast() > 1) + { + if (largest > 1) + { + throw std::invalid_argument("Non-supported array shape"); + } + largest = shape[i].cast(); + } + } + + return largest; +} + } // namespace nvcvpy #endif // NVCV_PYTHON_SHAPE_HPP diff --git a/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp b/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp new file mode 100644 index 00000000..b13e184f --- /dev/null +++ b/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp @@ -0,0 +1,113 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_PYTHON_TENSORBATCH_HPP +#define NVCV_PYTHON_TENSORBATCH_HPP + +#include "CAPI.hpp" +#include "Resource.hpp" + +#include +#include + +#include + +namespace nvcvpy { + +namespace py = pybind11; + +class TensorBatch + : public Resource + , public nvcv::TensorBatch +{ +public: + static TensorBatch Create(int capacity) + { + PyObject *tensorBatch = capi().TensorBatch_Create(capacity); + + py::object pytensorBatch = py::reinterpret_steal(tensorBatch); + + return TensorBatch(pytensorBatch); + } + + void pushBack(Tensor tensor) + { + capi().TensorBatch_PushBack(this->ptr(), tensor.ptr()); + } + + void popBack(int cnt) + { + capi().TensorBatch_PopBack(this->ptr(), cnt); + } + + void clear() + { + capi().TensorBatch_Clear(this->ptr()); + } + + using nvcv::TensorBatch::operator[]; + using nvcv::TensorBatch::begin; + using nvcv::TensorBatch::end; + +private: + friend struct py::detail::type_caster; + + TensorBatch() = default; + + explicit TensorBatch(py::object obj) + : Resource(obj) + , nvcv::TensorBatch(FromHandle(capi().TensorBatch_GetHandle(this->ptr()), true)) + { + } +}; + +} // namespace nvcvpy + +namespace pybind11::detail { + +namespace cvpy = nvcvpy; + +template<> +struct type_caster : type_caster_base +{ + PYBIND11_TYPE_CASTER(cvpy::TensorBatch, const_name("nvcv.TensorBatch")); + + bool load(handle src, bool) + { + // Does it have the correct object type? + PyTypeObject *srctype = Py_TYPE(src.ptr()); + if (strcmp(name.text, srctype->tp_name) == 0) + { + value = cvpy::TensorBatch(reinterpret_borrow(src)); + return true; + } + else + { + return false; + } + } + + static handle cast(cvpy::TensorBatch tensor, return_value_policy /* policy */, handle /*parent */) + { + tensor.inc_ref(); // for some reason this is needed + return tensor; + } +}; + +} // namespace pybind11::detail + +#endif // NVCV_PYTHON_TENSORBATCH_HPP diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 2c7813bd..ca2ee0c2 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -32,7 +32,8 @@ set(CPPSAMPLES common set(PYSAMPLES classification segmentation - object_detection) + object_detection + label) foreach(sample ${CPPSAMPLES}) add_subdirectory(${sample}) diff --git a/samples/README.md b/samples/README.md index c3b58468..a0c6a150 100644 --- a/samples/README.md +++ b/samples/README.md @@ -57,10 +57,10 @@ Setting up the following is only required if you want to setup and run the sampl 4. Install the CV-CUDA packages. Please note that since the above container comes with Python 3.8.10, we will install nvcv-python3.8-0 package as mentioned below. If you have any other Python distributions, you would need to use the appropriate nvcv-python Debian package below. ```bash - dpkg -i nvcv-lib-0.4.0_beta-cuda11-x86_64-linux.deb - dpkg -i nvcv-dev-0.4.0_beta-cuda11-x86_64-linux.deb - dpkg -i cvcuda-samples-0.4.0_beta-cuda11-x86_64-linux.deb - dpkg -i nvcv-python3.8-0.4.0_beta-cuda11-x86_64-linux.deb + dpkg -i nvcv-lib-0.5.0_beta-cuda11-x86_64-linux.deb + dpkg -i nvcv-dev-0.5.0_beta-cuda11-x86_64-linux.deb + dpkg -i cvcuda-samples-0.5.0_beta-cuda11-x86_64-linux.deb + dpkg -i nvcv-python3.8-0.5.0_beta-cuda11-x86_64-linux.deb ``` 5. Copy the samples folder to the target directory. diff --git a/samples/common/python/interop_utils.py b/samples/common/python/interop_utils.py new file mode 100644 index 00000000..1083452c --- /dev/null +++ b/samples/common/python/interop_utils.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import torch +import copy + + +class CudaBuffer: + __cuda_array_interface__ = None + obj = None + + +def to_torch_dtype(data_type): + """Convert a data type into one supported by torch + + Args: + data_type (numpy dtype): Original data type + + Returns: + dtype: A data type supported by torch + """ + if data_type == np.uint16: + return np.dtype(np.int16) + elif data_type == np.uint32: + return np.dtype(np.int32) + elif data_type == np.uint64: + return np.dtype(np.int64) + else: + return data_type + + +def to_cpu_numpy_buffer(cuda_buffer): + """Convert a CUDA buffer to host (CPU) nympy array + + Args: + cuda_buffer: CUDA buffer with __cuda_array_interface__ + + Returns: + numpy array: The CUDA buffer copied to the CPU + """ + torch_dtype = copy.copy(cuda_buffer.dtype) + torch_dtype = to_torch_dtype(torch_dtype) + + buf = CudaBuffer + buf.obj = cuda_buffer + buf.__cuda_array_interface__ = cuda_buffer.__cuda_array_interface__ + buf.__cuda_array_interface__["typestr"] = torch_dtype.str + + return torch.as_tensor(buf).cpu().numpy() + + +def to_cuda_buffer(host_data): + """Convert host data to a CUDA buffer + + Args: + host_data (numpy array): Host data + + Returns: + CudaBuffer: The converted CUDA buffer + """ + orig_dtype = copy.copy(host_data.dtype) + + host_data.dtype = to_torch_dtype(host_data.dtype) + + dev = torch.as_tensor(host_data, device="cuda").cuda() + host_data.dtype = orig_dtype # restore it + + # The cuda buffer only needs the cuda array interface. + # We can then set its dtype to whatever we want. + buf = CudaBuffer() + buf.__cuda_array_interface__ = dev.__cuda_array_interface__ + buf.__cuda_array_interface__["typestr"] = orig_dtype.str + buf.obj = dev # make sure it holds a reference to the torch buffer + + return buf diff --git a/samples/common/python/perf_utils.py b/samples/common/python/perf_utils.py index f9b290ac..1563afd9 100644 --- a/samples/common/python/perf_utils.py +++ b/samples/common/python/perf_utils.py @@ -76,7 +76,7 @@ def __init__( self.timing_info = {} self.batch_info = {} self.inside_batch_info = [] - self.is_inside_batch = False + self.is_inside_batch = 0 self.total_batches_processed = {} # Check if the benchmark.py script was used to run this. We do so # by checking whether an environment variable only set by that script is @@ -108,7 +108,7 @@ def push_range( """ if batch_idx is not None: message += "_%d" % batch_idx - self.is_inside_batch = True + self.is_inside_batch += 1 nvtx.push_range(message, color, domain, category) @@ -133,12 +133,12 @@ def pop_range(self, domain=None, total_items=None): # Actual timing information will be recorded and pulled from NSYS by a # script like benchmark.py. - if self.is_inside_batch: + if self.is_inside_batch > 0: self.inside_batch_info.append(self.stack_path) # Record the batch information if it was present. 
if total_items is not None: - if not self.is_inside_batch: + if self.is_inside_batch <= 0: raise ValueError( "Non zero value for total_items in pop_range can only be " "passed once inside a batch. No known batch was pushed previously. Please " @@ -146,7 +146,7 @@ def pop_range(self, domain=None, total_items=None): ) self.batch_info[self.stack_path] = (batch_idx, total_items) - self.is_inside_batch = False + self.is_inside_batch -= 1 if total_items > 0: batch_level_prefix = os.path.dirname(self.stack_path) diff --git a/samples/label/python/main.py b/samples/label/python/main.py new file mode 100644 index 00000000..575a2a2a --- /dev/null +++ b/samples/label/python/main.py @@ -0,0 +1,316 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# docs_tag: begin_python_imports + +import pycuda.driver as cuda +import os +import sys +import logging +import cvcuda +import torch +import numpy as np + +# Bring the commons folder from the samples directory into our path so that +# we can import modules from it. +common_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "common", + "python", +) +sys.path.insert(0, common_dir) + +from perf_utils import ( # noqa: E402 + CvCudaPerf, + get_default_arg_parser, + parse_validate_default_args, +) + +from torch_utils import ImageBatchDecoderPyTorch, ImageBatchEncoderPyTorch # noqa: E402 +from interop_utils import to_cpu_numpy_buffer, to_cuda_buffer # noqa: E402 + +# docs_tag: end_python_imports + + +def save_batch(images, label, encoder, batch): + """Save a batch of images to disk + + Args: + images (nvcv Tensor): Batch of images to save + + label : label value for output file name, + used to differentiate between outputs + appended to the original file name. 
+ + encoder : Encoder object to save the images + + batch : Batch object to save the images + + Returns: + None + """ + # Function to modify filenames in the batch + def modify_filenames(suffix): + modified_filenames = [] + for filename in batch.fileinfo: + name, extension = filename.rsplit(".", 1) + modified_filename = f"{name}_{suffix}.{extension}" + modified_filenames.append(modified_filename) + return modified_filenames + + # convert to NCHW + imagesNCHW = cvcuda.reformat(images, "NCHW") + + # Modify filenames with the label suffix + oldFileNames = batch.fileinfo + batch.fileinfo = modify_filenames(label) + batch.data = torch.as_tensor(imagesNCHW.cuda()) + encoder(batch) + batch.fileinfo = oldFileNames + + +def simple_cmap(label): + """Convert a label value to a random RGB color + + Args: + label : label value + + Returns: + numpy array: random RGB color for the given label + """ + np.random.seed(label) # Ensure consistent color for each label + return np.random.randint(0, 256, 3) # Random RGB color + + +def color_labels_nhwc(labels): + """Convert a label map to an RGB image + + Args: + labels : Output of cvcuda.label operator + + Returns: + nvcv Tensor: RGB image, with each label having a unique color + """ + npLabels = to_cpu_numpy_buffer(labels.cuda()) + # Initialize the output array with the same batch size, height, width, and RGB channels + a_rgb = np.zeros( + [npLabels.shape[0], npLabels.shape[1], npLabels.shape[2], 3], dtype=np.uint8 + ) + + # Iterate over each image in the batch + for n in range(npLabels.shape[0]): + # Extract unique labels for the current image + a_labels = np.unique(npLabels[n, :, :, :]) + + # Process each label in the current image + for label in a_labels: + rgb_label_color = simple_cmap(label) + # Create a mask for the current label + mask = npLabels[n] == label + # Use the mask to assign color to the corresponding pixels + a_rgb[n][mask[:, :, 0]] = rgb_label_color.astype(np.uint8) + + return cvcuda.as_tensor(to_cuda_buffer(a_rgb), "NHWC") + + +def run_sample( + input_path, + output_dir, + batch_size, + target_img_height, + target_img_width, + device_id, + cvcuda_perf, +): + logger = logging.getLogger("Distance_Label_Sample") + + logger.debug("Using batch size of %d" % batch_size) + logger.debug("Using CUDA device: %d" % device_id) + + # docs_tag: begin_setup_gpu + cvcuda_perf.push_range("run_sample") + + # Define the objects that handle the pipeline stages --- + image_size = (target_img_width, target_img_height) + logger.debug("Image size: %d %d" % image_size) + + # Define the cuda device, context and streams. + cuda_device = cuda.Device(device_id) + cuda_ctx = cuda_device.retain_primary_context() + cuda_ctx.push() + cvcuda_stream = cvcuda.Stream() + torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) + # docs_tag: end_setup_gpu + + # docs_tag: encoder_decoder setup + # Now define the object that will handle pre-processing + if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): + # Treat this as data modality of images + decoder = ImageBatchDecoderPyTorch( + input_path, batch_size, device_id, cuda_ctx, cvcuda_perf + ) + encoder = ImageBatchEncoderPyTorch( + output_dir, + fps=0, + device_id=device_id, + cuda_ctx=cuda_ctx, + cvcuda_perf=cvcuda_perf, + ) + else: + raise ValueError("Unknown data modality: %s." 
% input_path) + # docs_tag: encoder_decoder setup + + # docs_tag: begin_pipeline + # Define and execute the processing pipeline + cvcuda_perf.push_range("pipeline") + + # Fire up encoder/decoder + decoder.start() + encoder.start() + + # Loop through all input frames + batch_idx = 0 + while True: + cvcuda_perf.push_range("batch", batch_idx=batch_idx) + + # Execute everything inside the streams. + with cvcuda_stream, torch.cuda.stream(torch_stream): + # Stage 1: decode + batch = decoder() + if batch is None: + cvcuda_perf.pop_range(total_items=0) # for batch + break # No more frames to decode + assert batch_idx == batch.batch_idx + + logger.info("Processing batch %d" % batch_idx) + + # docs_tag: process_batch + # Stage 2: process + + # docs_tag: begin_tensor_conversion + # Need to check what type of input we have received: + # 1) CVCUDA tensor --> Nothing needs to be done. + # 2) Numpy Array --> Convert to torch tensor first and then CVCUDA tensor + # 3) Torch Tensor --> Convert to CVCUDA tensor + if isinstance(batch.data, torch.Tensor): + cvcudaTensorNHWC = cvcuda.as_tensor(batch.data, "NHWC") + elif isinstance(batch.data, np.ndarray): + cvcudaTensorNHWC = cvcuda.as_tensor( + torch.as_tensor(batch.data).to( + device="cuda:%d" % device_id, non_blocking=True + ), + "NHWC", + ) + # docs_tag: end_tensor_conversion + + # Convert to grayscale + out = cvcuda.cvtcolor(cvcudaTensorNHWC, cvcuda.ColorConversion.RGB2GRAY) + + save_batch(out, "grayscale", encoder, batch) + + # Histogram eq the image + out = cvcuda.histogrameq(out, cvcuda.Type.U8) + + save_batch(out, "histogrameq", encoder, batch) + + # Threshold the image + # Use torch tensor for this to take advantage of easy data manipulation + thParam = torch.zeros(out.shape[0], dtype=torch.float64).cuda() + maxParam = torch.zeros(out.shape[0], dtype=torch.float64).cuda() + + # The parameters below can be set per image. For now, we are setting them to a constant value. + # Proper threshold values must be determined by the input images and requirement. + thParam.fill_( + 128 + ) # Configure the threshold value for each image anything below this will be 0 in the output. + maxParam.fill_(255) # Value to set the areas meeting the threshold. + + thParam = cvcuda.as_tensor(thParam, "N") + maxParam = cvcuda.as_tensor(maxParam, "N") + out = cvcuda.threshold(out, thParam, maxParam, cvcuda.ThresholdType.BINARY) + + save_batch(out, "threshold", encoder, batch) + + # Create label map + ccLabels, _, _ = cvcuda.label(out) + + # Create and ARGB image from the label map, this is for visualization purposes only. + argbImage = color_labels_nhwc(ccLabels) + + save_batch(argbImage, "label", encoder, batch) + + batch_idx += 1 + # docs_tag: end_process + + cvcuda_perf.pop_range(total_items=batch.data.shape[0]) # for batch + + # Make sure encoder finishes any outstanding work + encoder.join() + + cvcuda_perf.pop_range() # for pipeline + + cuda_ctx.pop() + # docs_tag: end_pipeline + + cvcuda_perf.pop_range() # for this example. + + # Once everything is over, we need to finalize the perf-numbers. + cvcuda_perf.finalize() + + +# docs_tag: begin_main_func +def main(): + # docs_tag: begin_parse_args + assets_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "assets", + ) + parser = get_default_arg_parser( + "Label sample using CV-CUDA. This sample will execute the label operator on a " + "single or a batch of images (must be same size). 
Each step of the pipeline will " + "produce an *_stage.jpg output showing the processing done at that stage.", + input_path=os.path.join(assets_dir, "images", "peoplenet.jpg"), + target_img_height=544, + target_img_width=960, + ) + + args = parse_validate_default_args(parser) + + logging.basicConfig( + format="[%(name)s:%(lineno)d] %(asctime)s %(levelname)-6s %(message)s", + level=getattr(logging, args.log_level.upper()), + datefmt="%Y-%m-%d %H:%M:%S", + ) + # docs_tag: end_parse_args + + # Run the sample. + # docs_tag: start_call_run_sample + cvcuda_perf = CvCudaPerf("Distance_Label_sample", default_args=args) + run_sample( + args.input_path, + args.output_dir, + args.batch_size, + args.target_img_height, + args.target_img_width, + args.device_id, + cvcuda_perf, + ) + # docs_tag: end_call_run_sample + + +# docs_tag: end_main_func + +if __name__ == "__main__": + main() diff --git a/samples/object_detection/python/pipelines.py b/samples/object_detection/python/pipelines.py index 0e8d431e..00ea4685 100644 --- a/samples/object_detection/python/pipelines.py +++ b/samples/object_detection/python/pipelines.py @@ -303,19 +303,19 @@ def __call__(self, batch_bboxes_pyt, nms_masks_pyt, frame_nhwc): # # Once this is done, we can convert these lists to two CV-CUDA # structures that can be given to the blur and bndbox operators: - # 1) cvcuda.BndBoxesI : To store the bounding boxes for the batch + # 1) cvcuda.Elements : To store the bounding boxes for the batch # 2) cvcuda.BlurBoxesI : To store the bounding boxes as blur boxes for the batch. # self.cvcuda_perf.push_range("forloop") - num_boxes = [] - bounding_boxes = [] - blur_boxes = [] + bounding_boxes_list = [] + blur_boxes_list = [] # Create an array of bounding boxes with render settings. for current_boxes, current_masks in zip(batch_bboxes_pyt, nms_masks_pyt): filtered_boxes = current_boxes[current_masks] # Save the count of non-zero bounding boxes of this image. - num_boxes.append(filtered_boxes.shape[0]) + bounding_boxes = [] + blur_boxes = [] for box in filtered_boxes: bounding_boxes.append( @@ -329,13 +329,11 @@ def __call__(self, batch_bboxes_pyt, nms_masks_pyt, frame_nhwc): blur_boxes.append( cvcuda.BlurBoxI(box=tuple(box), kernelSize=self.kernel_size) ) + bounding_boxes_list.append(bounding_boxes) + blur_boxes_list.append(blur_boxes) - batch_bounding_boxes = cvcuda.BndBoxesI( - numBoxes=num_boxes, boxes=tuple(bounding_boxes) - ) - batch_blur_boxes = cvcuda.BlurBoxesI( - numBoxes=num_boxes, boxes=tuple(blur_boxes) - ) + batch_bounding_boxes = cvcuda.Elements(elements=bounding_boxes_list) + batch_blur_boxes = cvcuda.BlurBoxesI(boxes=blur_boxes_list) self.cvcuda_perf.pop_range() # for loop # Apply blur first. @@ -344,8 +342,8 @@ def __call__(self, batch_bboxes_pyt, nms_masks_pyt, frame_nhwc): self.cvcuda_perf.pop_range() # Render the bounding boxes. - self.cvcuda_perf.push_range("bndbox_into") - cvcuda.bndbox_into(frame_nhwc, frame_nhwc, batch_bounding_boxes) + self.cvcuda_perf.push_range("osd_into") + cvcuda.osd_into(frame_nhwc, frame_nhwc, batch_bounding_boxes) self.cvcuda_perf.pop_range() # docs_tag: end_call_cuosd_bboxes diff --git a/samples/scripts/benchmark.py b/samples/scripts/benchmark.py index 30cc5611..3bcb62eb 100644 --- a/samples/scripts/benchmark.py +++ b/samples/scripts/benchmark.py @@ -108,6 +108,10 @@ def parse_nvtx_pushpop_trace_json(json_path): # range_info = {} + # Check if the file was empty or not. Empty file means no ops were recorded. + if os.stat(json_path).st_size == 0: + return range_info + # Read the JSON. 
with open(json_path, "r") as f: json_data = json.loads(f.read()) @@ -1093,9 +1097,14 @@ def main(): # behaves correctly. proc_device_id = str(args.gpu_offset_id + gpu_idx) proc_args = args.args.copy() - proc_args.extend(["--device_id", proc_device_id]) - proc_args.extend(["--output_dir", proc_output_dir]) - + # The following will make sure that it inserts the additional args + # only at the beginning of the list so that it doesn't interfere with a + # potentially argparse.REMAINDER style arg present at the end. + + # Need to set this to 0 because once CUDA_VISIBLE_DEVICES is used, + # the process won't be able to see other gpus + proc_args[:0] = ["--device_id", "0"] + proc_args[:0] = ["--output_dir", proc_output_dir] # Start the pool. result = pool.apply_async( benchmark_script, diff --git a/samples/scripts/install_dependencies.sh b/samples/scripts/install_dependencies.sh index 1eb4e9e4..bb3a4f24 100755 --- a/samples/scripts/install_dependencies.sh +++ b/samples/scripts/install_dependencies.sh @@ -75,7 +75,7 @@ pip3 install /tmp/VideoProcessingFramework pip3 install /tmp/VideoProcessingFramework/src/PytorchNvCodec # Install tao-converter which parses the .etlt model file, and generates an optimized TensorRT engine -wget --content-disposition 'https://api.ngc.nvidia.com/v2/resources/nvidia/tao/tao-converter/versions/v4.0.0_trt8.5.1.7_x86/files/tao-converter' --directory-prefix=/usr/local/bin +wget 'https://api.ngc.nvidia.com/v2/resources/nvidia/tao/tao-converter/versions/v4.0.0_trt8.5.1.7_x86/files/tao-converter' --directory-prefix=/usr/local/bin chmod a+x /usr/local/bin/tao-converter # Install NVIDIA NSIGHT 2023.2.1 diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index 0c723340..2679e546 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -80,3 +80,9 @@ python3 ./object_detection/python/main.py -i ./assets/videos/pexels-chiel-slotma python3 ./object_detection/python/main.py -i ./assets/images/ -b 3 # RUn it with the TensorFlow backend python3 ./object_detection/python/main.py -i ./assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow + +# Run the label Python sample with default settings, without any command-line args. +find /tmp/ -maxdepth 1 -type f -delete +python3 ./label/python/main.py +# Run it with batch size 1 on a single image +python3 ./label/python/main.py -i ./assets/images/peoplenet.jpg -b 1 diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt index 4b6afe45..9da86508 100644 --- a/src/cvcuda/CMakeLists.txt +++ b/src/cvcuda/CMakeLists.txt @@ -27,8 +27,8 @@ set(CV_CUDA_OP_FILES OpMinMaxLoc.cpp OpHistogram.cpp OpMinAreaRect.cpp - OpBoxBlur.cpp OpBndBox.cpp + OpBoxBlur.cpp OpBrightnessContrast.cpp OpRemap.cpp OpColorTwist.cpp @@ -65,6 +65,9 @@ set(CV_CUDA_OP_FILES OpRandomResizedCrop.cpp OpGaussianNoise.cpp OpInpaint.cpp + OpLabel.cpp + OpPairwiseMatcher.cpp + OpFindHomography.cpp ) # filter only one that matches the patern (case insensitive), should be set on the global level @@ -87,6 +90,7 @@ else() endif() add_library(cvcuda SHARED + OpStack.cpp ${CV_CUDA_LIB_FILES} ) diff --git a/src/cvcuda/OpFindHomography.cpp b/src/cvcuda/OpFindHomography.cpp new file mode 100644 index 00000000..bb2d5062 --- /dev/null +++ b/src/cvcuda/OpFindHomography.cpp @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "priv/OpFindHomography.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaFindHomographyCreate, + (NVCVOperatorHandle * handle, int batchSize, int numPoints)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new priv::FindHomography(batchSize, numPoints)); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaFindHomographySubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle srcPts, NVCVTensorHandle dstPts, + NVCVTensorHandle models)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::TensorWrapHandle _srcPts(srcPts), _dstPts(dstPts), _models(models); + priv::ToDynamicRef(handle)(stream, _srcPts, _dstPts, _models); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaFindHomographyVarShapeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorBatchHandle srcPts, + NVCVTensorBatchHandle dstPts, NVCVTensorBatchHandle models)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::TensorBatchWrapHandle _srcPts(srcPts), _dstPts(dstPts); + nvcv::TensorBatchWrapHandle _models(models); + priv::ToDynamicRef(handle)(stream, _srcPts, _dstPts, _models); + }); +} diff --git a/src/cvcuda/OpLabel.cpp b/src/cvcuda/OpLabel.cpp new file mode 100644 index 00000000..351cce2b --- /dev/null +++ b/src/cvcuda/OpLabel.cpp @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "priv/OpLabel.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelCreate, (NVCVOperatorHandle * handle)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new cvcuda::priv::Label()); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, + NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh, NVCVTensorHandle maxThresh, + NVCVTensorHandle minSize, NVCVTensorHandle count, NVCVTensorHandle stats, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels)) +{ + return nvcv::ProtectCall( + [&] + { + cvcuda::priv::ToDynamicRef(handle)( + stream, nvcv::TensorWrapHandle{in}, nvcv::TensorWrapHandle{out}, nvcv::TensorWrapHandle{bgLabel}, + nvcv::TensorWrapHandle{minThresh}, nvcv::TensorWrapHandle{maxThresh}, nvcv::TensorWrapHandle{minSize}, + nvcv::TensorWrapHandle{count}, nvcv::TensorWrapHandle{stats}, connectivity, assignLabels); + }); +} diff --git a/src/cvcuda/OpPairwiseMatcher.cpp b/src/cvcuda/OpPairwiseMatcher.cpp new file mode 100644 index 00000000..07b0db91 --- /dev/null +++ b/src/cvcuda/OpPairwiseMatcher.cpp @@ -0,0 +1,58 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "priv/OpPairwiseMatcher.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaPairwiseMatcherCreate, + (NVCVOperatorHandle * handle, NVCVPairwiseMatcherType algoChoice)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new cvcuda::priv::PairwiseMatcher(algoChoice)); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaPairwiseMatcherSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle set1, NVCVTensorHandle set2, + NVCVTensorHandle numSet1, NVCVTensorHandle numSet2, NVCVTensorHandle matches, + NVCVTensorHandle numMatches, NVCVTensorHandle distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType)) +{ + return nvcv::ProtectCall( + [&] + { + cvcuda::priv::ToDynamicRef(handle)( + stream, nvcv::TensorWrapHandle{set1}, nvcv::TensorWrapHandle{set2}, nvcv::TensorWrapHandle{numSet1}, + nvcv::TensorWrapHandle{numSet2}, nvcv::TensorWrapHandle{matches}, nvcv::TensorWrapHandle{numMatches}, + nvcv::TensorWrapHandle{distances}, crossCheck, matchesPerPoint, normType); + }); +} diff --git a/src/cvcuda/OpPillowResize.cpp b/src/cvcuda/OpPillowResize.cpp index d599f39f..7ba00b31 100644 --- a/src/cvcuda/OpPillowResize.cpp +++ b/src/cvcuda/OpPillowResize.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,8 +15,9 @@ * limitations under the License. 
*/ -#include "priv/OpPillowResize.hpp" +#include "cvcuda/OpPillowResize.h" +#include "priv/OpPillowResize.hpp" #include "priv/SymbolVersioning.hpp" #include @@ -26,9 +27,7 @@ namespace priv = cvcuda::priv; -CVCUDA_DEFINE_API(0, 2, NVCVStatus, cvcudaPillowResizeCreate, - (NVCVOperatorHandle * handle, int32_t maxWidth, int32_t maxHeight, int32_t maxBatchSize, - NVCVImageFormat fmt)) +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeCreate, (NVCVOperatorHandle * handle)) { return nvcv::ProtectCall( [&] @@ -38,31 +37,63 @@ CVCUDA_DEFINE_API(0, 2, NVCVStatus, cvcudaPillowResizeCreate, throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Pointer to NVCVOperator handle must not be NULL"); } - *handle = reinterpret_cast( - new priv::PillowResize(nvcv::Size2D{maxWidth, maxHeight}, maxBatchSize, fmt)); + *handle = reinterpret_cast(new priv::PillowResize()); + }); +} + +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeGetWorkspaceRequirements, + (NVCVOperatorHandle handle, int maxBatchSize, int32_t maxInWidth, int32_t maxInHeight, + int32_t maxOutWidth, int32_t maxOutHeight, NVCVImageFormat fmt, NVCVWorkspaceRequirements *reqOut)) +{ + if (!reqOut) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + NVCVSize2D maxInSize = {maxInWidth, maxInHeight}; + NVCVSize2D maxOutSize = {maxOutWidth, maxOutHeight}; + *reqOut = priv::ToDynamicRef(handle).getWorkspaceRequirements(maxBatchSize, maxInSize, + maxOutSize, fmt); + }); +} + +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeVarShapeGetWorkspaceRequirements, + (NVCVOperatorHandle handle, int batchSize, const NVCVSize2D *inputSizes, + const NVCVSize2D *outputSizes, NVCVImageFormat fmt, NVCVWorkspaceRequirements *reqOut)) +{ + if (!reqOut) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + *reqOut = priv::ToDynamicRef(handle).getWorkspaceRequirements( + batchSize, static_cast(inputSizes), + static_cast(outputSizes), fmt); }); } -CVCUDA_DEFINE_API(0, 2, NVCVStatus, cvcudaPillowResizeSubmit, - (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, - const NVCVInterpolationType interpolation)) +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, const NVCVWorkspace *ws, NVCVTensorHandle in, + NVCVTensorHandle out, const NVCVInterpolationType interpolation)) { return nvcv::ProtectCall( [&] { nvcv::TensorWrapHandle input(in), output(out); - priv::ToDynamicRef(handle)(stream, input, output, interpolation); + priv::ToDynamicRef(handle)(stream, *ws, input, output, interpolation); }); } -CVCUDA_DEFINE_API(0, 2, NVCVStatus, nvcvopPillowResizeVarShapeSubmit, - (NVCVOperatorHandle handle, cudaStream_t stream, NVCVImageBatchHandle in, NVCVImageBatchHandle out, - const NVCVInterpolationType interpolation)) +CVCUDA_DEFINE_API(0, 3, NVCVStatus, cvcudaPillowResizeVarShapeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, const NVCVWorkspace *ws, NVCVImageBatchHandle in, + NVCVImageBatchHandle out, const NVCVInterpolationType interpolation)) { return nvcv::ProtectCall( [&] { nvcv::ImageBatchVarShapeWrapHandle input(in), output(out); - priv::ToDynamicRef(handle)(stream, input, output, interpolation); + priv::ToDynamicRef(handle)(stream, *ws, input, output, interpolation); }); } diff --git a/src/cvcuda/OpStack.cpp b/src/cvcuda/OpStack.cpp new file mode 100644 index 00000000..b7a4a293 --- /dev/null +++ b/src/cvcuda/OpStack.cpp @@ -0,0 +1,53 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "priv/OpStack.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaStackCreate, (NVCVOperatorHandle * handle)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new priv::Stack()); + }); +} + +CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaStackSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorBatchHandle in, NVCVTensorHandle out)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::TensorWrapHandle output(out); + nvcv::TensorBatchWrapHandle input(in); + priv::ToDynamicRef(handle)(stream, input, output); + }); +} diff --git a/src/cvcuda/include/cvcuda/OpFindHomography.h b/src/cvcuda/include/cvcuda/OpFindHomography.h new file mode 100644 index 00000000..6d5c5dcc --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpFindHomography.h @@ -0,0 +1,151 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpFindHomography.h + * + * @brief Defines types and functions to handle the Find-Homography operation. + * @defgroup NVCV_C_ALGORITHM_FIND_HOMOGRAPHY Find-Homography + * @{ + */ + +#ifndef CVCUDA__FIND_HOMOGRAPHY_H +#define CVCUDA__FIND_HOMOGRAPHY_H + +#include "Operator.h" +#include "Types.h" +#include "detail/Export.h" + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Constructs an instance of the Find-Homography operator. + * + * @param [out] handle Where the image instance handle will be written to. + * + Must not be NULL. + * @param [in] batchSize number of samples in the batch + * @param [in] numPoints maximum number of coordinates that in the batch + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. 
+ */ +CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographyCreate(NVCVOperatorHandle *handle, int batchSize, int maxNumPoints); + +/** Executes the Find-Homography operation on the given cuda stream. + * + * Limitations: + * + * Input: + * Data Layout: [NW] + * Channel count: [1] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | No + * 8bit Signed | No + * 16bit Unsigned | No + * 16bit Signed | No + * 32bit Unsigned | No + * 32bit Signed | No + * 32bit Float | Yes + * 64bit Float | No + * + * Output: + * Data Layout: [NHW] + * Channel count: [1] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | No + * 8bit Signed | No + * 16bit Unsigned | No + * 16bit Signed | No + * 32bit Unsigned | No + * 32bit Signed | No + * 32bit Float | Yes + * 64bit Float | No + * + * Input/Output dependency + * + * Property | Input == Output + * -------------- | ------------- + * Data Layout | No + * Data Type | Yes + * Batches (N) | Yes + * Channels | No + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] srcPts Input tensor, where srcPts[i, j] is the set of coordinates for the source image, i ranges + * from 0 to batch-1, j ranges over the coordinates of each image (at least 4 per image), and the + * data type is float2 for (x, y) + * + Number of coordinates must be >= 4 + * + Must have data type 2F32 + * + Must have rank 2 + * + * @param [in] dstPts Input tensor, where dstPts[i, j] is the set of coordinates for the destination image, i ranges + * from 0 to batch-1, j ranges over the coordinates of each image (at least 4 per image), and the + * data type is float2 for (x, y) + * + Number of coordinates must be >= 4 + * + Must have data type 2F32 + * + Must have rank 2 + * + * @param [out] models Output tensor, where models[i, j, k] is the output model tensor which maps the src points to dst points + * in image i, i ranges from 0 to batch-1, j ranges from 0 to 2 and k ranges from 0 to 2, and + * the data type is F32. + * + Must have data type F32 + * + Must have rank 3 + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +/** @{ */ +CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographySubmit(NVCVOperatorHandle handle, cudaStream_t stream, + NVCVTensorHandle srcPts, NVCVTensorHandle dstPts, + NVCVTensorHandle models); + +/** + * Executes the FindHomography operation on a batch of point sets. + * + * Apart from the input and output tensor batches, all parameters are the same as \ref cvcudaFindHomographySubmit. + * + * @param [in] srcPts batch of coordinates in the source image. + * @param [in] dstPts batch of coordinates in the destination image. + * @param [out] models model tensor batch. + * + */ +CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographyVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + NVCVTensorBatchHandle srcPts, NVCVTensorBatchHandle dstPts, + NVCVTensorBatchHandle models); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA__FIND_HOMOGRAPHY_H */ diff --git a/src/cvcuda/include/cvcuda/OpFindHomography.hpp b/src/cvcuda/include/cvcuda/OpFindHomography.hpp new file mode 100644 index 00000000..7e7c807d --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpFindHomography.hpp @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file FindHomography.hpp + * + * @brief Defines the private C++ Class for the Find-Homography operation. + */ + +#ifndef CVCUDA__FIND_HOMOGRAPHY_HPP +#define CVCUDA__FIND_HOMOGRAPHY_HPP + +#include "IOperator.hpp" +#include "OpFindHomography.h" + +#include +#include +#include +#include + +namespace cvcuda { + +class FindHomography final : public IOperator +{ +public: + explicit FindHomography(int batchSize, int numPoints); + + ~FindHomography(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, const nvcv::Tensor &models); + + void operator()(cudaStream_t stream, const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst, + const nvcv::TensorBatch &models); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline FindHomography::FindHomography(int batchSize, int numPoints) +{ + nvcv::detail::CheckThrow(cvcudaFindHomographyCreate(&m_handle, batchSize, numPoints)); + assert(m_handle); +} + +inline FindHomography::~FindHomography() +{ + nvcvOperatorDestroy(m_handle); +} + +inline void FindHomography::operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const nvcv::Tensor &models) +{ + nvcv::detail::CheckThrow(cvcudaFindHomographySubmit(m_handle, stream, src.handle(), dst.handle(), models.handle())); +} + +inline void FindHomography::operator()(cudaStream_t stream, const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst, + const nvcv::TensorBatch &models) +{ + nvcv::detail::CheckThrow( + cvcudaFindHomographyVarShapeSubmit(m_handle, stream, src.handle(), dst.handle(), models.handle())); +} + +inline NVCVOperatorHandle FindHomography::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA__FIND_HOMOGRAPHY_HPP diff --git a/src/cvcuda/include/cvcuda/OpLabel.h b/src/cvcuda/include/cvcuda/OpLabel.h new file mode 100644 index 00000000..ad0b40aa --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpLabel.h @@ -0,0 +1,242 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpLabel.h + * + * @brief Defines types and functions to handle the Label operation. 
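To make the new interface above concrete, here is a minimal usage sketch of the FindHomography C++ wrapper (illustrative only, not part of the patch): the tensor shapes follow the OpFindHomography.h documentation, while the batch size, point count, and the omitted fill-in of the point data are assumptions.

    #include <cvcuda/OpFindHomography.hpp>
    #include <nvcv/DataType.hpp>
    #include <nvcv/Tensor.hpp>
    #include <cuda_runtime.h>

    void EstimateHomography()
    {
        constexpr int batchSize = 1; // illustrative sizes
        constexpr int numPoints = 4; // at least 4 correspondences are required

        // Rank-2 [N, numPoints] tensors of 2F32 coordinates and a rank-3 [N, 3, 3] F32
        // model tensor, matching the shapes documented in OpFindHomography.h.
        nvcv::Tensor srcPts({{batchSize, numPoints}, "NW"}, nvcv::TYPE_2F32);
        nvcv::Tensor dstPts({{batchSize, numPoints}, "NW"}, nvcv::TYPE_2F32);
        nvcv::Tensor models({{batchSize, 3, 3}, "NHW"}, nvcv::TYPE_F32);

        // ... fill srcPts/dstPts with point correspondences (omitted) ...

        cudaStream_t stream;
        cudaStreamCreate(&stream);

        cvcuda::FindHomography findHomography(batchSize, numPoints);
        findHomography(stream, srcPts, dstPts, models); // asynchronous submission

        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);
    }

The var-shape overload shown above takes nvcv::TensorBatch arguments instead, one point tensor per sample.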
+ * @defgroup NVCV_C_ALGORITHM_LABEL Label + * @{ + */ + +#ifndef CVCUDA_LABEL_H +#define CVCUDA_LABEL_H + +#include "Operator.h" +#include "Types.h" +#include "detail/Export.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * Constructs an instance of the Label operator. + * + * @param [out] handle Where the operator instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); + +/** + * Executes the Label operation on the given cuda stream. This operation does not wait for completion. + * + * This operation computes the connected-component labeling of one or more input images (in 2D) or volumes (in 3D) + * inside the input tensor, yielding labels in the output tensor with same rank and shape. Labels are numbers + * uniquely assigned to each connected region, for example: + * + * Input 0 0 0 0 Output 0 0 0 0 + * image: 1 1 0 1 labels: 4 4 0 7 + * 0 0 0 1 0 0 0 7 + * 0 1 1 1 0 7 7 7 + * + * In the above example, three distinct regions were identified and labeled as 0, 4 and 7. Note that the region + * labeled with 0 remained with the same value as the input, and label numbers 4 and 7 were assigned in + * non-consecutive ordering. Some values in the input may be ignored, i.e. not labeled, using the \ref bgLabel + * tensor to define those values as background, which usually is set to the value zero. For example: + * + * Input 0 0 1 0 Output 0 0 2 3 Zeros in 0 0 2 0 + * image: 0 1 0 1 labels: 0 5 6 7 bgLabel: 0 5 0 7 + * 0 0 1 1 0 0 7 7 0 0 7 7 + * 0 1 1 1 0 7 7 7 0 7 7 7 + * + * Limitations: + * + * Input: + * Data Layout: [HWC], [NHWC], [DHWC], [NDHWC] + * Channels: [1] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | Yes + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | Yes + * 32bit Signed | Yes + * 32bit Float | No + * 64bit Float | No + * + * Output: + * Data Layout: [HWC], [NHWC], [DHWC], [NDHWC] + * Channels: [1] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | No + * 8bit Signed | No + * 16bit Unsigned | No + * 16bit Signed | No + * 32bit Unsigned | Yes + * 32bit Signed | No + * 32bit Float | No + * 64bit Float | No + * + * Input/Output dependency + * + * Property | Input == Output + * -------------- | ------------- + * Data Layout | Yes + * Data Type | No + * Number | Yes + * Channels | Yes + * Width | Yes + * Height | Yes + * Depth | Yes + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] in Input tensor. The expected layout is [HWC] or [NHWC] for 2D labeling or [DHWC] or [NDHWC] for + * 3D labeling, with either explicit C dimension or missing C with channels embedded in the data type. + * The N dimension is the number of samples, i.e. either 2D images with height H and width W or + * 3D volumes with depth D and height H and width W, inside the tensor. This operator labels + * regions, i.e. connected components, of each input image or volume read from the \ref in tensor. + * + Check above limitations table to the input tensor data layout, number of channels and data type. + * + * @param [out] out Output tensor. 
The expected layout is [HWC] or [NHWC] for 2D labeling or [DHWC] or [NDHWC] for + * 3D labeling, with either explicit C dimension or missing C with channels embedded in the data type. + * The N dimension is the number of samples, i.e. either 2D images with height H and width W or + * 3D volumes with depth D and height H and width W, inside the tensor. This operator labels + * regions, i.e. connected components, on the input writing the labels to the \ref out tensor. + * + Check above limitations table to the output tensor data layout, number of channels and data type. + * + * @param [in] bgLabel Background label tensor. The expected layout is [N] or [NC], meaning rank-1 or rank-2 + * tensor with first dimension as the number of samples N, matching input and output tensors, + * and a potential last dimension C with number of channels. If present, this tensor is used + * by the operator to define background values in the input tensor to be ignored during + * labeling. If not present, all values in the input are labeled. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have data type the same as the input. + * + It may be NULL to consider all values in the input as valid values to be labeled. + * + * @param [in] minThresh Minimum-threshold value tensor. The expected layout is [N] or [NC], meaning rank-1 or + * rank-2 tensor with first dimension as the number of samples N, matching input and output + * tensors, and a potential last dimension C with number of channels. If present, this + * tensor is used by the operator as a pre-filter step to define minimum values in the input + * tensor to be thresholded into a binary image, i.e. values below it are set to 0 and above + * or equal it are set to 1. Labeling is done after this pre-filter step, where \ref + * bgLabel may be applied for instance to ignore zeroes as background. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have data type the same as the input. + * + It may be NULL to not apply minimum thresholding as a pre-filter. + * + * @param [in] maxThresh Maximum-threshold value tensor. The expected layout is [N] or [NC], meaning rank-1 or + * rank-2 tensor with first dimension as the number of samples N, matching input and output + * tensors, and a potential last dimension C with number of channels. If present, this + * tensor is used by the operator as a pre-filter step to define maximum values in the input + * tensor to be thresholded into a binary image, i.e. values above it are set to 0 and below + * or equal it are set to 1. Labeling is done after this pre-filter step, where \ref + * bgLabel may be applied for instance to ignore zeroes as background. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have data type the same as the input. + * + It may be NULL to not apply maximum thresholding as a pre-filter. + * + * @param [in] minSize Minimum-size value tensor. The expected layout is [N] or [NC], meaning rank-1 or rank-2 + * tensor with first dimension as the number of samples N, matching input and output tensors, + * and a potential last dimension C with number of channels. 
If present, this tensor is used + * by the operator as a post-filter step to define minimum-size regions in the output tensor to + * keep their labels, i.e. connected-component regions with less than this minimum number of + * elements are set to the background value defined in the \ref bgLabel value. Labeling is + * done before this post-filter step, also known as island-removal step. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have U32 data type. + * + It may be NULL to not apply minimum size regions removal as a post-filter. + * + If not NULL, the \ref bgLabel and \ref stats tensors must not be NULL as well. + * + * @param [out] count Count of labels tensor. The expected layout is [N] or [NC], meaning rank-1 or rank-2 tensor + * with first dimension as the number of samples N, matching input and output tensors, and a + * potential last dimension C with number of channels. If present, this tensor is used by the + * operator to store the number of connected regions, or components, labeled. The background + * label is ignored and thus not counted. It counts regions that may be beyond the maximum capacity + * of \ref stats tensor, and regions potentially removed by \ref minSize tensor. + * + It must have the same number of samples as input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have U32 data type. + * + It may be NULL to disregard counting the number of different labels found. + * + * @param [out] stats Statistics tensor. The expected layout is [NMA], meaning rank-3 tensor with first dimension + * as the number of samples N, matching input and output tensors, second dimension M as maximum + * number of different labels statistics to be computed, and a third dimension A as the amount + * of statistics to be computed per label (fixed as 6 for 2D or 8 for 3D). If present, this + * tensor is used by the operator to store information per connected-component label. The + * background label is ignored and thus its statistics is not computed. + * + It must have the same number of samples as input and output tensors. + * + It must have a number of statistics M per sample N equal to the maximum allowed number of + * label statistics that can be computed by the Label operator per sample image (or volume). + * The actual number of labels found is stored in \ref count (see above). + * + For 2D labeling, it must have in the last dimension A=6 elements to store at: (0) the + * original label number; (1) leftmost position; (2) topmost position; (3) width size; (4) + * height size; (5) count of pixels (i.e. size of the labeled region). And for 3D labeling, + * it must have in the last dimension A=8 elements to store at: (0) the original label number; + * (1) leftmost position; (2) topmost position; (3) shallowmost position; (4) width size; (5) + * height size; (6) depth size; (7) count of voxels (i.e. size of the labeled region). + * + It must have U32 data type. + * + It may be NULL to disregard computing statistics information on different labels found. + * + It must not be NULL if \ref assignLabel is NVCV_LABEL_SEQUENTIAL, the index of each label + * statistics is used as the new sequential label replacing the original label in the output, + * the sequential labels are up to the maximum capacity M + * + If not NULL, the \ref count tensor must not be NULL as well. 
+ * + * @param [in] connectivity Specify connectivity of elements for the operator, see \ref NVCVConnectivityType. + * + It must conform with \ref in and \ref out tensors, i.e. 3D labeling requires [DHWC] + * or [NDHWC] tensor layouts and 2D labeling requires [HWC] or [NHWC], where the C + * channel may be missing as embedded in data type. + * + * @param [in] assignLabels Specify how labels are assigned by the operator, see \ref NVCVLabelType. Use + * NVCV_LABEL_FAST to do fast labeling, i.e. assign non-consecutive label numbers fast. + * Use NCVC_LABEL_SEQUENTIAL to have consecutive label numbers instead. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaLabelSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, + NVCVTensorHandle out, NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh, + NVCVTensorHandle maxThresh, NVCVTensorHandle minSize, NVCVTensorHandle count, + NVCVTensorHandle stats, NVCVConnectivityType connectivity, + NVCVLabelType assignLabels); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA_LABEL_H */ diff --git a/src/cvcuda/include/cvcuda/OpLabel.hpp b/src/cvcuda/include/cvcuda/OpLabel.hpp new file mode 100644 index 00000000..54ebd54e --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpLabel.hpp @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpLabel.hpp + * + * @brief Defines the public C++ Class for the Label operation. 
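As a rough usage sketch of the C API documented above (illustrative, not part of the patch): the image size is an assumption, all optional tensors are passed as NULL as the parameter documentation permits, and checking of the returned NVCVStatus values is omitted for brevity.

    #include <cvcuda/OpLabel.h>
    #include <nvcv/DataType.hpp>
    #include <nvcv/Tensor.hpp>
    #include <cuda_runtime.h>

    void LabelRegions(cudaStream_t stream)
    {
        constexpr int64_t N = 1, H = 480, W = 640, C = 1; // illustrative sizes

        nvcv::Tensor in({{N, H, W, C}, "NHWC"}, nvcv::TYPE_U8);   // input image(s)
        nvcv::Tensor out({{N, H, W, C}, "NHWC"}, nvcv::TYPE_U32); // output labels
        nvcv::Tensor count({{N}, "N"}, nvcv::TYPE_U32);           // number of labels found

        NVCVOperatorHandle op = nullptr;
        cvcudaLabelCreate(&op);

        // bgLabel, minThresh, maxThresh, minSize and stats are optional and passed as NULL here.
        cvcudaLabelSubmit(op, stream, in.handle(), out.handle(),
                          /*bgLabel*/ nullptr, /*minThresh*/ nullptr, /*maxThresh*/ nullptr,
                          /*minSize*/ nullptr, count.handle(), /*stats*/ nullptr,
                          NVCV_CONNECTIVITY_4_2D, NVCV_LABEL_FAST);

        cudaStreamSynchronize(stream);
        nvcvOperatorDestroy(op);
    }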
+ * @defgroup NVCV_CPP_ALGORITHM_LABEL Label + * @{ + */ + +#ifndef CVCUDA__LABEL_HPP +#define CVCUDA__LABEL_HPP + +#include "IOperator.hpp" +#include "OpLabel.h" + +#include +#include +#include +#include + +namespace cvcuda { + +class Label final : public IOperator +{ +public: + explicit Label(); + + ~Label(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel, + const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize, + const nvcv::Tensor &count, const nvcv::Tensor &stats, NVCVConnectivityType connectivity, + NVCVLabelType assignLabels) const; + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline Label::Label() +{ + nvcv::detail::CheckThrow(cvcudaLabelCreate(&m_handle)); + assert(m_handle); +} + +inline Label::~Label() +{ + nvcvOperatorDestroy(m_handle); + m_handle = nullptr; +} + +inline void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, + const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels) const +{ + nvcv::detail::CheckThrow(cvcudaLabelSubmit(m_handle, stream, in.handle(), out.handle(), bgLabel.handle(), + minThresh.handle(), maxThresh.handle(), minSize.handle(), count.handle(), + stats.handle(), connectivity, assignLabels)); +} + +inline NVCVOperatorHandle Label::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA_LABEL_HPP diff --git a/src/cvcuda/include/cvcuda/OpPairwiseMatcher.h b/src/cvcuda/include/cvcuda/OpPairwiseMatcher.h new file mode 100644 index 00000000..02705857 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpPairwiseMatcher.h @@ -0,0 +1,173 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpPairwiseMatcher.h + * + * @brief Defines types and functions to handle the PairwiseMatcher operation. + * @defgroup NVCV_C_ALGORITHM_PAIRWISE_MATCHER Pairwise Matcher + * @{ + */ + +#ifndef CVCUDA_PAIRWISE_MATCHER_H +#define CVCUDA_PAIRWISE_MATCHER_H + +#include "Operator.h" +#include "Types.h" +#include "detail/Export.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Constructs an instance of the PairwiseMatcher operator. + * + * @param [out] handle Where the operator instance handle will be written to. + * + Must not be NULL. + * + * @param [in] algoChoice Choice of algorithm to find pair-wise matches. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully.
+ */ +CVCUDA_PUBLIC NVCVStatus cvcudaPairwiseMatcherCreate(NVCVOperatorHandle *handle, NVCVPairwiseMatcherType algoChoice); + +/** Executes the PairwiseMatcher operation on the given CUDA stream. This operation does not wait for completion. + * + * This operation computes the pair-wise matcher between two sets of n-dimensional points. For instance + * 128-dimensional descriptors as points. For each point $p1_i$, in the 1st set defined by \ref set1 with size + * \ref numSet1, the operator finds the best match (minimum distance) from $p1_i$ to a point in the 2nd set $p2_j$, + * defined by \ref set2 with size \ref numSet2. If \ref crossCheck is true, $p1_i$ must also be the best match + * from $p2_j$ considering all possible matches from the 2nd set to the 1st set, to return them as a match. + * + * @note This operation does not guarantee deterministic output. Each output tensor limits the number of matches + * found by the operator, that is the total number may be greater than this limitation and the order of + * matches returned might differ in different runs. + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * + * @param [in] stream Handle to a CUDA stream. + * + Must be a valid CUDA stream. + * + * @param [in] set1 Input 1st set of points tensor. The first set of points to calculate pair-wise matcher between + * this 1st set and the 2nd set. The expected layout is [NMD] meaning a rank-3 tensor with first + * dimension as number of samples N, second dimension M as maximum number of points, and a third + * dimension D as depth dimension of each point, e.g. the output of \ref NVCV_C_ALGORITHM_SIFT has + * 128-Byte descriptor or D=128 and U8 data type that can be used as a set of points. + * + It must have consistent number of samples N across input and output tensors. + * + The size of the depth dimension D and data type must be consistent across input + * set of points tensors. + * + It must have U8 or U32 or F32 data type. + * + * @param [in] set2 Input 2nd set of points tensor. The second set of points to calculate pair-wise matcher between + * this 2nd set and the 1st set. The expected layout is [NMD] meaning a rank-3 tensor with first + * dimension as number of samples N, second dimension M as maximum number of points, and a third + * dimension D as depth dimension of each point, e.g. the output of \ref NVCV_C_ALGORITHM_SIFT has + * 128-Byte descriptor or D=128 and U8 data type that can be used as a set of points. + * + It must have consistent number of samples N across input and output tensors. + * + The size of the depth dimension D and data type must be consistent across input + * set of points tensors. + * + It must have U8 or U32 or F32 data type. + * + * @param [in] numSet1 Input tensor storing the actual number of points in \ref set1 tensor. The expected layout + * is [N] or [NC], meaning rank-1 or rank-2 tensor with first dimension as number of samples N, + * and a potential last dimension C with number of channels. It expresses the total number of + * valid points in \ref set1 if less than its maximum capacity M, else uses all M points. + * + It must have consistent number of samples N across input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have S32 data type. + * + It may be NULL to use entire set1 maximum capacity M as valid points. + * + * @param [in] numSet2 Input tensor storing the actual number of points in \ref set2 tensor. 
The expected layout + * is [N] or [NC], meaning rank-1 or rank-2 tensor with first dimension as number of samples N, + * and a potential last dimension C with number of channels. It expresses the total number of + * valid points in \ref set2 if less than its maximum capacity M, else uses all M points. + * + It must have consistent number of samples N across input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have S32 data type. + * + It may be NULL to use entire set2 maximum capacity M as valid points. + * + * @param [out] matches Output tensor to store the matches of points between 1st set \ref set1 and 2nd set \ref + * set2. The expected layout is [NMA], meaning rank-3 tensor with first dimension as the + * number of samples N, same as other tensors, second dimension M as maximum number of + * matches, not necessarily the same as other tensors, and third dimension A as the attributes + * of each match, fixed to 2 attributes: set1 index and set2 index. + * + It must have consistent number of samples N across input and output tensors. + * + It must have a number of matches M per sample N equal to the maximum allowed number of + * matches to be found between \ref set1 and \ref set2. The actual number + * of matches found is stored in \ref numMatches. + * + It must have size of attributes dimension A equal 2. + * + It must have S32 data type. + * + * @param [out] numMatches Output tensor to store the number of matches found by the operator. The expected layout + * is [N] or [NC], meaning rank-1 or rank-2 tensor with first dimension as number of + * samples N, and a potential last dimension C with number of channels. It expresses the + * toal number of matches found, regardless of the maximum allowed number of matches M in + * output tensor \ref matches. Since matches are found randomly, they are discarded in a + * non-deterministic way when the number of matches found is bigger than M. + * + It must have consistent number of samples N across input and output tensors. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor. + * + It must have S32 data type. + * + It may be NULL if \ref crossCheck is false to disregard storing number of matches. + * + * @param [out] distances Output tensor to store distances of matches found by the operator. The expected layout + * is [NM] or [NMC], meaning rank-2 or rank-3 tensor with first dimension as number of + * samples N, same as other tensors, second dimension M as maximum number of distances, same + * as \ref matches output tensors, and a potential last dimension C with number of channels. + * For each match found in \ref matches, the distance between matched points is stored. + * + It must have consistent number of samples N across input and output tensors. + * + It must have the same dimension M of the \ref matches tensor, meaning the maximum + * allowed number of distances must be equal to the maximum allowed number of matches. + * + It must have one element per sample, i.e. number of channels must be 1 in a [NMC] tensor. + * + It must have F32 data type. + * + It may be NULL to disregard storing distances. + * + * @param [in] crossCheck Choice to do cross check. Use false to search only for matches from 1st set of points in + * \ref set1 to 2nd set of points in \ref set2. Use true to cross check best matches, a + * best match is only returned if it is the best match (minimum distance) from 1st set to + * 2nd set and vice versa. 
+ * + * @param [in] matchesPerPoint Number of best matches $k$ per point. The operator returns the top-$k$ best matches + * from 1st set to 2nd set. + * + It must be between 1 and 64. + * + It has to be 1 if \ref crossCheck is true. + * + * @param [in] normType Choice of norm type to normalize distances, used in points difference $|p1 - p2|$. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaPairwiseMatcherSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + NVCVTensorHandle set1, NVCVTensorHandle set2, + NVCVTensorHandle numSet1, NVCVTensorHandle numSet2, + NVCVTensorHandle matches, NVCVTensorHandle numMatches, + NVCVTensorHandle distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA_PAIRWISE_MATCHER_H */ diff --git a/src/cvcuda/include/cvcuda/OpPairwiseMatcher.hpp b/src/cvcuda/include/cvcuda/OpPairwiseMatcher.hpp new file mode 100644 index 00000000..13178ac7 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpPairwiseMatcher.hpp @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpPairwiseMatcher.hpp + * + * @brief Defines the public C++ Class for the PairwiseMatcher operation. 
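For illustration, a minimal sketch of calling the brute-force matcher through this C API (not part of the patch; the set sizes and the choice of 128-byte U8 descriptors are assumptions, and status checking is omitted):

    #include <cvcuda/OpPairwiseMatcher.h>
    #include <nvcv/DataType.hpp>
    #include <nvcv/Tensor.hpp>
    #include <cuda_runtime.h>

    void MatchDescriptors(cudaStream_t stream)
    {
        // Illustrative sizes: one sample with up to 1000 128-byte descriptors per set.
        constexpr int64_t N = 1, M = 1000, D = 128;

        nvcv::Tensor set1({{N, M, D}, "NMD"}, nvcv::TYPE_U8);
        nvcv::Tensor set2({{N, M, D}, "NMD"}, nvcv::TYPE_U8);
        nvcv::Tensor matches({{N, M, 2}, "NMA"}, nvcv::TYPE_S32);  // (set1 index, set2 index)
        nvcv::Tensor numMatches({{N}, "N"}, nvcv::TYPE_S32);
        nvcv::Tensor distances({{N, M}, "NM"}, nvcv::TYPE_F32);

        NVCVOperatorHandle op = nullptr;
        cvcudaPairwiseMatcherCreate(&op, NVCV_BRUTE_FORCE);

        // numSet1/numSet2 are NULL, so the full capacity M of each set is used;
        // Hamming norm suits binary descriptors.
        cvcudaPairwiseMatcherSubmit(op, stream, set1.handle(), set2.handle(),
                                    /*numSet1*/ nullptr, /*numSet2*/ nullptr,
                                    matches.handle(), numMatches.handle(), distances.handle(),
                                    /*crossCheck*/ true, /*matchesPerPoint*/ 1, NVCV_NORM_HAMMING);

        cudaStreamSynchronize(stream);
        nvcvOperatorDestroy(op);
    }

Note that with crossCheck set to true, matchesPerPoint must be 1 and numMatches must be provided, as required by the parameter documentation above.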
+ * @defgroup NVCV_CPP_ALGORITHM_PAIRWISE_MATCHER PairwiseMatcher + * @{ + */ + +#ifndef CVCUDA_PAIRWISE_MATCHER_HPP +#define CVCUDA_PAIRWISE_MATCHER_HPP + +#include "IOperator.hpp" +#include "OpPairwiseMatcher.h" + +#include +#include +#include + +namespace cvcuda { + +class PairwiseMatcher final : public IOperator +{ +public: + explicit PairwiseMatcher(NVCVPairwiseMatcherType algoChoice); + + ~PairwiseMatcher(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches, + const nvcv::Tensor &numMatches, const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline PairwiseMatcher::PairwiseMatcher(NVCVPairwiseMatcherType algoChoice) +{ + nvcv::detail::CheckThrow(cvcudaPairwiseMatcherCreate(&m_handle, algoChoice)); + assert(m_handle); +} + +inline PairwiseMatcher::~PairwiseMatcher() +{ + nvcvOperatorDestroy(m_handle); + m_handle = nullptr; +} + +inline void PairwiseMatcher::operator()(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, + const nvcv::Tensor &matches, const nvcv::Tensor &numMatches, + const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType) +{ + nvcv::detail::CheckThrow(cvcudaPairwiseMatcherSubmit( + m_handle, stream, set1.handle(), set2.handle(), numSet1.handle(), numSet2.handle(), matches.handle(), + numMatches.handle(), distances.handle(), crossCheck, matchesPerPoint, normType)); +} + +inline NVCVOperatorHandle PairwiseMatcher::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA_PAIRWISE_MATCHER_HPP diff --git a/src/cvcuda/include/cvcuda/OpPillowResize.h b/src/cvcuda/include/cvcuda/OpPillowResize.h index e3326f72..73bdb3a2 100644 --- a/src/cvcuda/include/cvcuda/OpPillowResize.h +++ b/src/cvcuda/include/cvcuda/OpPillowResize.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,10 +28,12 @@ #include "Operator.h" #include "Types.h" +#include "Workspace.h" #include "detail/Export.h" #include #include +#include #include #include @@ -44,17 +46,51 @@ extern "C" * * @param [out] handle Where the image instance handle will be written to. * + Must not be NULL. - * @param [in] maxWidth Maximum input and output image width. - * @param [in] maxHeight Maximum input and output image height. - * @param [in] maxBatchSize Maximum batchsize used in this operator. * @param [in] fmt Image format * * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. * @retval #NVCV_SUCCESS Operation executed successfully. */ -CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeCreate(NVCVOperatorHandle *handle, int32_t maxWidth, int32_t maxHeight, - int32_t maxBatchSize, NVCVImageFormat fmt); +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeCreate(NVCVOperatorHandle *handle); + +/** Calculates the upper bounds of buffer sizes required to run the operator + * + * @param [in] handle Where the image instance handle will be written to. 
+ * + Must not be NULL. + * @param [in] maxBatchSize Maximum batchsize used in this operator. + * @param [in] maxWidth Maximum input and output image width. + * @param [in] maxHeight Maximum input and output image height. + * @param [in] fmt Image format + * @param [out] reqOut Requirements for the operator's workspace + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null or one of the arguments is out of range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeGetWorkspaceRequirements(NVCVOperatorHandle handle, int maxBatchSize, + int32_t maxInWidth, int32_t maxInHeight, + int32_t maxOutWidth, int32_t maxOutHeight, + NVCVImageFormat fmt, + NVCVWorkspaceRequirements *reqOut); + +/** Calculates the buffer sizes required to run the operator + * + * @param [in] handle Where the image instance handle will be written to. + * + Must not be NULL. + * @param [in] batchSize The number of images + * @param [in] inputSizes The sizes of the input images + * @param [in] outputSizes The sizes of the output images + * @param [in] fmt Image format + * @param [out] reqOut Requirements for the operator's workspace + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null or one of the arguments is out of range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeVarShapeGetWorkspaceRequirements(NVCVOperatorHandle handle, int batchSize, + const NVCVSize2D *inputSizesWH, + const NVCVSize2D *outputSizesWH, + NVCVImageFormat fmt, + NVCVWorkspaceRequirements *reqOut); /** Executes the pillow resize operation on the given cuda stream. This operation does not * wait for completion. @@ -117,12 +153,14 @@ CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeCreate(NVCVOperatorHandle *handle, in * @retval #NVCV_SUCCESS Operation executed successfully. 
*/ /** @{ */ -CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, - NVCVTensorHandle out, const NVCVInterpolationType interpolation); +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + const NVCVWorkspace *workspace, NVCVTensorHandle in, + NVCVTensorHandle out, NVCVInterpolationType interpolation); -CVCUDA_PUBLIC NVCVStatus nvcvopPillowResizeVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, - NVCVImageBatchHandle in, NVCVImageBatchHandle out, - const NVCVInterpolationType interpolation); +CVCUDA_PUBLIC NVCVStatus cvcudaPillowResizeVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + const NVCVWorkspace *workspace, NVCVImageBatchHandle in, + NVCVImageBatchHandle out, + NVCVInterpolationType interpolation); /** @} */ #ifdef __cplusplus } diff --git a/src/cvcuda/include/cvcuda/OpPillowResize.hpp b/src/cvcuda/include/cvcuda/OpPillowResize.hpp index ff48c799..6d647503 100644 --- a/src/cvcuda/include/cvcuda/OpPillowResize.hpp +++ b/src/cvcuda/include/cvcuda/OpPillowResize.hpp @@ -28,6 +28,7 @@ #include "IOperator.hpp" #include "OpPillowResize.h" +#include "Workspace.hpp" #include #include @@ -41,26 +42,31 @@ namespace cvcuda { class PillowResize final : public IOperator { public: - explicit PillowResize(nvcv::Size2D maxSize, int32_t maxBatchSize, nvcv::ImageFormat fmt); + PillowResize(); ~PillowResize(); - void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, - const NVCVInterpolationType interpolation); + WorkspaceRequirements getWorkspaceRequirements(int batchSize, const nvcv::Size2D *in_sizes, + const nvcv::Size2D *out_sizes, nvcv::ImageFormat fmt); + + WorkspaceRequirements getWorkspaceRequirements(int maxBatchSize, nvcv::Size2D maxInSize, nvcv::Size2D maxOutSize, + nvcv::ImageFormat fmt); - void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVInterpolationType interpolation); + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &in, + const nvcv::ImageBatchVarShape &out, const NVCVInterpolationType interpolation); + virtual NVCVOperatorHandle handle() const noexcept override; private: NVCVOperatorHandle m_handle; }; -inline PillowResize::PillowResize(nvcv::Size2D maxSize, int32_t maxBatchSize, nvcv::ImageFormat fmt) +inline PillowResize::PillowResize() { - NVCVImageFormat cfmt = fmt.cvalue(); - nvcv::detail::CheckThrow(cvcudaPillowResizeCreate(&m_handle, maxSize.w, maxSize.h, maxBatchSize, cfmt)); + nvcv::detail::CheckThrow(cvcudaPillowResizeCreate(&m_handle)); assert(m_handle); } @@ -70,17 +76,36 @@ inline PillowResize::~PillowResize() m_handle = nullptr; } -inline void PillowResize::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, - const NVCVInterpolationType interpolation) +inline WorkspaceRequirements PillowResize::getWorkspaceRequirements(int batchSize, const nvcv::Size2D *in_sizes, + const nvcv::Size2D *out_sizes, + nvcv::ImageFormat fmt) +{ + WorkspaceRequirements req{}; + nvcv::detail::CheckThrow(cvcudaPillowResizeVarShapeGetWorkspaceRequirements(m_handle, batchSize, in_sizes, + out_sizes, fmt.cvalue(), &req)); + return req; +} + +inline WorkspaceRequirements PillowResize::getWorkspaceRequirements(int maxBatchSize, nvcv::Size2D maxInSize, + nvcv::Size2D maxOutSize, 
nvcv::ImageFormat fmt) +{ + WorkspaceRequirements req{}; + nvcv::detail::CheckThrow(cvcudaPillowResizeGetWorkspaceRequirements( + m_handle, maxBatchSize, maxInSize.w, maxInSize.h, maxOutSize.w, maxOutSize.h, fmt.cvalue(), &req)); + return req; +} + +inline void PillowResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &in, + const nvcv::Tensor &out, const NVCVInterpolationType interpolation) { - nvcv::detail::CheckThrow(cvcudaPillowResizeSubmit(m_handle, stream, in.handle(), out.handle(), interpolation)); + nvcv::detail::CheckThrow(cvcudaPillowResizeSubmit(m_handle, stream, &ws, in.handle(), out.handle(), interpolation)); } -inline void PillowResize::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, +inline void PillowResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, const NVCVInterpolationType interpolation) { nvcv::detail::CheckThrow( - nvcvopPillowResizeVarShapeSubmit(m_handle, stream, in.handle(), out.handle(), interpolation)); + cvcudaPillowResizeVarShapeSubmit(m_handle, stream, &ws, in.handle(), out.handle(), interpolation)); } inline NVCVOperatorHandle PillowResize::handle() const noexcept diff --git a/src/cvcuda/include/cvcuda/OpSIFT.h b/src/cvcuda/include/cvcuda/OpSIFT.h index e22e2f48..45fa7308 100644 --- a/src/cvcuda/include/cvcuda/OpSIFT.h +++ b/src/cvcuda/include/cvcuda/OpSIFT.h @@ -157,12 +157,12 @@ CVCUDA_PUBLIC NVCVStatus cvcudaSIFTCreate(NVCVOperatorHandle *handle, int3 maxSh * @param [in] contrastThreshold The contrast threshold used to remove features with low contrast. The larger this * threshold, the less features are extracted by the operator. One suggestion, given * by the original algorithm description, is to use \f$ 0.03 \f$. - * + It must be between 0 and 1. + * + It must be positive. * * @param [in] edgeThreshold The edge threshold used to remove features that are similar to edges. The larger this * threshold, the more features are extracted by the operator. One suggestion, given by * the original algorithm description, is to use \f$ 10.0 \f$. - * + It must be between 0 and 1. + * + It must be positive. * * @param [in] initSigma The initial sigma to be applied by the first Gaussian filter done at the first octave. * This sigma is progressively applied for each scale-space layer within each octave diff --git a/src/cvcuda/include/cvcuda/OpStack.h b/src/cvcuda/include/cvcuda/OpStack.h new file mode 100644 index 00000000..58e9bff1 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpStack.h @@ -0,0 +1,121 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpStack.h + * + * @brief Defines types and functions to handle the Stack operation. 
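To show how the reworked interface fits together, here is a hedged sketch of the new workspace-driven PillowResize flow (illustrative, not part of the patch): it combines the getWorkspaceRequirements and operator() members above with the AllocateWorkspace helper added in Workspace.hpp later in this patch; the image sizes and format are assumptions.

    #include <cvcuda/OpPillowResize.hpp>
    #include <cvcuda/Workspace.hpp>
    #include <nvcv/ImageFormat.hpp>
    #include <nvcv/Tensor.hpp>
    #include <cuda_runtime.h>

    void ResizeBatch(cudaStream_t stream)
    {
        constexpr int batch = 4; // illustrative sizes and format
        nvcv::ImageFormat fmt = nvcv::FMT_RGB8;

        nvcv::Tensor in(batch, {1024, 768}, fmt);
        nvcv::Tensor out(batch, {640, 480}, fmt);

        cvcuda::PillowResize pillowResize;

        // Query an upper bound for the scratch memory, then allocate it once and reuse it
        // for every submit that stays within these maximum sizes.
        cvcuda::WorkspaceRequirements req = pillowResize.getWorkspaceRequirements(
            batch, nvcv::Size2D{1024, 768}, nvcv::Size2D{640, 480}, fmt);
        cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace(req);

        pillowResize(stream, ws.get(), in, out, NVCV_INTERP_LINEAR);
        cudaStreamSynchronize(stream);
    }

For repeated calls with varying sizes, the MaxWorkspaceReq helper (also added in Workspace.hpp) can merge requirements so that a single workspace covers all of them.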
+ * @defgroup NVCV_C_ALGORITHM__STACK Stack + * @{ + */ + +#ifndef CVCUDA__STACK_H +#define CVCUDA__STACK_H + +#include "Operator.h" +#include "detail/Export.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Constructs an instance of the Stack operator. + * + * @param [out] handle Where the operator instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaStackCreate(NVCVOperatorHandle *handle); + +/** + * + * Executes the Stack operation on the given cuda stream. This operation does not + * wait for completion. The stack operation copies the source tensors from the input tensor batch into an output tensor. + * The output tensor is a concatenation of the source tensors, with each source tensor copied into + * the output tensor. All of the source tensors must have the same data type, number of channels, width and height. + * + * Limitations: + * + * Input: + * Data Layout: [NHWC, NCHW, CHW, HWC] + * Channels: [1,2,3,4] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | Yes + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | Yes + * 32bit Signed | Yes + * 32bit Float | Yes + * 64bit Float | Yes + * + * Output: + * Data Layout: [NHWC, NCHW] + * Channels: [1,2,3,4] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | Yes + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | Yes + * 32bit Signed | Yes + * 32bit Float | Yes + * 64bit Float | Yes + * + * Input/Output dependency + * + * Property | Input == Output + * -------------- | ------------- + * Data Layout | Yes + * Data Type | Yes + * Number | No + * Channels | Yes + * Width | Yes + * Height | Yes + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] in input tensor batch. + * + * @param [out] out output tensor NHWC/NCHW where N is equal to the number of all input tensors. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaStackSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorBatchHandle in, + NVCVTensorHandle out); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA__STACK_H */ diff --git a/src/cvcuda/include/cvcuda/OpStack.hpp b/src/cvcuda/include/cvcuda/OpStack.hpp new file mode 100644 index 00000000..8f85a736 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpStack.hpp @@ -0,0 +1,79 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpStack.hpp + * + * @brief Defines the public C++ Class for the Stack operation. + * @defgroup NVCV_CPP_ALGORITHM__STACK Stack + * @{ + */ + +#ifndef CVCUDA__STACK_HPP +#define CVCUDA__STACK_HPP + +#include "IOperator.hpp" +#include "OpStack.h" + +#include +#include +#include +#include +#include + +namespace cvcuda { + +class Stack final : public IOperator +{ +public: + explicit Stack(); + + ~Stack(); + + void operator()(cudaStream_t stream, const nvcv::TensorBatch &in, const nvcv::Tensor &out); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline Stack::Stack() +{ + nvcv::detail::CheckThrow(cvcudaStackCreate(&m_handle)); + assert(m_handle); +} + +inline Stack::~Stack() +{ + nvcvOperatorDestroy(m_handle); + m_handle = nullptr; +} + +inline void Stack::operator()(cudaStream_t stream, const nvcv::TensorBatch &in, const nvcv::Tensor &out) +{ + nvcv::detail::CheckThrow(cvcudaStackSubmit(m_handle, stream, in.handle(), out.handle())); +} + +inline NVCVOperatorHandle Stack::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA__STACK_HPP diff --git a/src/cvcuda/include/cvcuda/Types.h b/src/cvcuda/include/cvcuda/Types.h index 67b527cc..17bb8f62 100644 --- a/src/cvcuda/include/cvcuda/Types.h +++ b/src/cvcuda/include/cvcuda/Types.h @@ -385,6 +385,42 @@ typedef enum NVCV_SIFT_USE_EXPANDED_INPUT = 1 } NVCVSIFTFlagType; +// @brief Defines connectivity of elements +typedef enum +{ + NVCV_CONNECTIVITY_4_2D, //!< Immediate (cross) neighborhood of pixels. + NVCV_CONNECTIVITY_6_3D, //!< Immediate (cross) neighborhood of voxels. + NVCV_CONNECTIVITY_8_2D, //!< All direct (full) neighborhood of pixels. + NVCV_CONNECTIVITY_26_3D, //!< All direct (full) neighborhood of voxels. +} NVCVConnectivityType; + +// @brief Defines how labels are assigned in Label operator +typedef enum +{ + NVCV_LABEL_FAST, //!< Does not guarantee consecutive label numbers. + NVCV_LABEL_SEQUENTIAL, //!< Assigns consecutive numbers to labels. 
+} NVCVLabelType; + +// @brief Defines pair-wise matcher algorithms of choice +typedef enum +{ + NVCV_BRUTE_FORCE //!< Select brute-force algorithm as the matcher +} NVCVPairwiseMatcherType; + +// @brief Defines how a vector normalization should occur +typedef enum +{ + NVCV_NORM_HAMMING = 0, //!< Equivalent to the Hamming distance (or L_0 norm) + NVCV_NORM_L1 = 1, //!< Equivalent to the absolute distance = |x1-x2| + |y1-y2| (or L_1 norm) + NVCV_NORM_L2 = 2, //!< Equivalent to the Euclidean distance (or L_2 norm) + NVCV_NORM_C = 3, //!< distance = max(|x1-x2|,|y1-y2|) + NVCV_NORM_L12 = 4, //!< L1-L2 metric: distance = 2(sqrt(1+x*x/2) - 1)) + NVCV_NORM_FAIR = 5, //!< distance = c^2(|x|/c-log(1+|x|/c)), c = 1.3998 + NVCV_NORM_WELSCH = 6, //!< distance = c^2/2(1-exp(-(x/c)^2)), c = 2.9846 + NVCV_NORM_HUBER = 7, //!< distance = |x| +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Defines requirements for workspace memory + */ +typedef struct NVCVWorkspaceMemRequirementsRec +{ + /** Size, in bytes, of the required memory */ + size_t size; + /** Alignment, in bytes, of the required memory */ + size_t alignment; +} NVCVWorkspaceMemRequirements; + +/** Aggregates requirements for all resource kinds in a workspace + */ +typedef struct NVCVWorkspaceRequirementsRec +{ + /** Requirements for plain host memory */ + NVCVWorkspaceMemRequirements hostMem; + /** Requirements for GPU-accessible host memory (e.g. allocated with cudaHostAlloc) */ + NVCVWorkspaceMemRequirements pinnedMem; + /** Requirements for GPU memory */ + NVCVWorkspaceMemRequirements cudaMem; +} NVCVWorkspaceRequirements; + +/** Memory block for use in a workspace object. + * + * A workspace memory structure contains the requriements (these can be useful when obtaining memory from the workspace) + * a pointer to the memory object and an optional CUDA event object which notifies that the memory is ready to use. + * + */ +typedef struct NVCVWorkspaceMemRec +{ + /** The requirements that the memory pointed to by `data` must satisfy */ + NVCVWorkspaceMemRequirements req; + + /** The pointer to the workspace memory. + * + * @remark The accessibility of the memory may be restricted to the host or a specific device. + */ + void *data; + + /** The event which notifies that the memory is ready to use. + * + * The event object is used in two ways - the user (e.g. an operator) of the workspace memory should wait for the + * event in the context in which it will use the memory as well as record the event after it has scheduled all work + * that uses the memory object. + */ + cudaEvent_t ready; +} NVCVWorkspaceMem; + +/** Aggregates multiple resources into a single workspace objects */ +typedef struct NVCVWorkspaceRec +{ + /** Plain host memory. This should not be used in any GPU code. + * + * On systems with a discrete GPU, this kind of memory doesn't need a CUDA event. On systems with integrated GPU + * or HMM systems, there's no difference between plain and pinned host memory with respect to synchronization. + */ + NVCVWorkspaceMem hostMem; + + /** Pinned host memory. + * + * cudaXxxAsync operations on this kind of memory are performed truly asynchronously, which calls for + * synchronization. + * When used as a staging buffer for passing data to a CUDA kernel, a typical synchronization scheme would be to + * wait for the `ready` event on host (cudaEventSynchronize), issue H2D copy and record the `ready` event. 
+ */ + NVCVWorkspaceMem pinnedMem; + + /** GPU memory */ + NVCVWorkspaceMem cudaMem; +} NVCVWorkspace; + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDAERATORS_WORKSPACE_H */ diff --git a/src/cvcuda/include/cvcuda/Workspace.hpp b/src/cvcuda/include/cvcuda/Workspace.hpp new file mode 100644 index 00000000..65a9ddfd --- /dev/null +++ b/src/cvcuda/include/cvcuda/Workspace.hpp @@ -0,0 +1,203 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CVCUDAERATORS_WORKSPACE_HPP +#define CVCUDAERATORS_WORKSPACE_HPP + +#include "Workspace.h" + +#include +#include + +#include +#include +#include + +namespace cvcuda { + +using Workspace = NVCVWorkspace; +using WorkspaceMem = NVCVWorkspaceMem; +using WorkspaceRequirements = NVCVWorkspaceRequirements; +using WorkspaceMemRequirements = NVCVWorkspaceMemRequirements; + +/** Computes memory requirements that can cover both input requirements. + * + * The resulting memory requriements will have alignment and size that is not smaller than that of either + * of the arguments. + * + * alignment = max(a.alignment, b.alignment) + * size = align_up(max(a.size, b.size), alignment) + */ +inline WorkspaceMemRequirements MaxWorkspaceReq(WorkspaceMemRequirements a, WorkspaceMemRequirements b) +{ + WorkspaceMemRequirements ret; + assert(!a.size || a.alignment > 0); + assert(!b.size || b.alignment > 0); + ret.alignment = b.alignment > a.alignment ? b.alignment : a.alignment; + ret.size = b.size > a.size ? b.size : a.size; + assert((ret.alignment & (ret.alignment - 1)) == 0 && "Alignment must be a power of 2"); + ret.size = nvcv::detail::AlignUp(ret.size, ret.alignment); + return ret; +} + +/** Computes workspace requirements that can cover both input requirments. */ +inline NVCVWorkspaceRequirements MaxWorkspaceReq(const WorkspaceRequirements &a, const WorkspaceRequirements &b) +{ + WorkspaceRequirements ret; + ret.hostMem = MaxWorkspaceReq(a.hostMem, b.hostMem); + ret.pinnedMem = MaxWorkspaceReq(a.pinnedMem, b.pinnedMem); + ret.cudaMem = MaxWorkspaceReq(a.cudaMem, b.cudaMem); + return ret; +} + +/** A helper class that manages the lifetime of resources stored in a Workspace structure. + * + * This class works in a way similar to unique_ptr with a custom deleter. 
+ */ +class UniqueWorkspace +{ +public: + using DeleterFunc = void(NVCVWorkspace &); + using Deleter = std::function; + + UniqueWorkspace() = default; + + UniqueWorkspace(const UniqueWorkspace &) = delete; + + UniqueWorkspace(UniqueWorkspace &&ws) + { + swap(ws); + } + + UniqueWorkspace &operator=(const UniqueWorkspace &) = delete; + + UniqueWorkspace &operator=(UniqueWorkspace &&ws) noexcept + { + swap(ws); + ws.reset(); + return *this; + } + + UniqueWorkspace(Workspace workspace, Deleter del = {}) + : m_impl(workspace) + , m_del(std::move(del)) + { + } + + UniqueWorkspace(WorkspaceMem host, WorkspaceMem pinned, WorkspaceMem cuda, Deleter del = {}) + : m_impl{host, pinned, cuda} + , m_del(std::move(del)) + { + } + + ~UniqueWorkspace() + { + reset(); + } + + void reset() noexcept + { + if (m_del) + { + m_del(m_impl); + m_del = {}; + m_impl = {}; + } + } + + const Workspace &get() const + { + return m_impl; + } + +private: + void swap(UniqueWorkspace &ws) + { + std::swap(m_impl, ws.m_impl); + std::swap(m_del, ws.m_del); + } + + Workspace m_impl{}; + Deleter m_del{}; +}; + +/** Allocates a workspace with an allocator specified in `alloc` (or a default one). + * + * This function is meant as a simple helper to simplify the usage operators requiring a workspace, but its intense use + * may degrade performance due to excessive allocations and deallocations. + * For code used in tight loops, some workspace reuse scheme and/or resource pools are recommended. + */ +inline UniqueWorkspace AllocateWorkspace(WorkspaceRequirements req, nvcv::Allocator alloc = {}) +{ + if (!alloc) + { + nvcv::CustomAllocator<> cust{}; + alloc = std::move(cust); + } + auto del = [alloc](NVCVWorkspace &ws) + { + // TODO(michalz): Add proper CUDA error handling in public API + if (ws.hostMem.data) + { + if (ws.hostMem.ready) + if (cudaEventSynchronize(ws.hostMem.ready) != cudaSuccess) + throw std::runtime_error("cudaEventSynchronize failed"); + alloc.hostMem().free(ws.hostMem.data, ws.hostMem.req.size, ws.hostMem.req.alignment); + ws.hostMem.data = nullptr; + } + if (ws.pinnedMem.data) + { + if (ws.pinnedMem.ready) + if (cudaEventSynchronize(ws.pinnedMem.ready) != cudaSuccess) + throw std::runtime_error("cudaEventSynchronize failed"); + alloc.hostPinnedMem().free(ws.pinnedMem.data, ws.pinnedMem.req.size, ws.pinnedMem.req.alignment); + ws.pinnedMem.data = nullptr; + } + if (ws.cudaMem.data) + { + if (ws.cudaMem.ready) + if (cudaEventSynchronize(ws.cudaMem.ready) != cudaSuccess) + throw std::runtime_error("cudaEventSynchronize failed"); + alloc.cudaMem().free(ws.cudaMem.data, ws.cudaMem.req.size, ws.cudaMem.req.alignment); + ws.cudaMem.data = nullptr; + } + }; + NVCVWorkspace ws = {}; + try + { + ws.hostMem.req = req.hostMem; + ws.pinnedMem.req = req.pinnedMem; + ws.cudaMem.req = req.cudaMem; + + if (req.hostMem.size) + ws.hostMem.data = alloc.hostMem().alloc(req.hostMem.size, req.hostMem.alignment); + if (req.pinnedMem.size) + ws.pinnedMem.data = alloc.hostPinnedMem().alloc(req.pinnedMem.size, req.pinnedMem.alignment); + if (req.cudaMem.size) + ws.cudaMem.data = alloc.cudaMem().alloc(req.cudaMem.size, req.cudaMem.alignment); + return UniqueWorkspace(ws, std::move(del)); + } + catch (...) 
+ { + del(ws); + throw; + } +} + +} // namespace cvcuda + +#endif // CVCUDAERATORS_WORKSPACE_HPP diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index 419dc05a..cd3904c4 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -26,8 +26,8 @@ set(CV_CUDA_PRIV_OP_FILES OpMinMaxLoc.cu OpHistogram.cpp OpMinAreaRect.cpp - OpBoxBlur.cpp OpBndBox.cpp + OpBoxBlur.cpp OpBrightnessContrast.cu OpRemap.cu OpColorTwist.cu @@ -64,6 +64,10 @@ set(CV_CUDA_PRIV_OP_FILES OpRandomResizedCrop.cpp OpGaussianNoise.cpp OpInpaint.cpp + OpLabel.cu + OpPairwiseMatcher.cu + OpStack.cpp + OpFindHomography.cu ) # filter only one that matches the patern (case insensitive), should be set on the global level @@ -97,5 +101,8 @@ target_link_libraries(cvcuda_priv nvcv_util_sanitizer cvcuda_legacy CUDA::cudart_static + CUDA::cusolver + CUDA::cublas + CUDA::cublasLt -lrt ) diff --git a/src/cvcuda/priv/OpBndBox.cpp b/src/cvcuda/priv/OpBndBox.cpp index 05974888..36fa702f 100644 --- a/src/cvcuda/priv/OpBndBox.cpp +++ b/src/cvcuda/priv/OpBndBox.cpp @@ -30,7 +30,7 @@ namespace legacy = nvcv::legacy::cuda_op; BndBox::BndBox() { legacy::DataShape maxIn, maxOut; //maxIn/maxOut not used by op. - m_legacyOp = std::make_unique(maxIn, maxOut); + m_legacyOp = std::make_unique(maxIn, maxOut); } void BndBox::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, @@ -50,7 +50,7 @@ void BndBox::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv: "Output must be cuda-accessible, pitch-linear tensor"); } - NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *outData, bboxes, stream)); + NVCV_CHECK_THROW(m_legacyOp->inferBox(*inData, *outData, bboxes, stream)); } } // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpBndBox.hpp b/src/cvcuda/priv/OpBndBox.hpp index d2904917..00364f59 100644 --- a/src/cvcuda/priv/OpBndBox.hpp +++ b/src/cvcuda/priv/OpBndBox.hpp @@ -42,7 +42,7 @@ class BndBox final : public IOperator const NVCVBndBoxesI &bboxes) const; private: - std::unique_ptr m_legacyOp; + std::unique_ptr m_legacyOp; }; } // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu index 51ebf8d3..8c9426b7 100644 --- a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu +++ b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu @@ -100,7 +100,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( - (srcWrap[(int4){src_idx.x, src_idx.y, batchidx, c}] - base) * scale * global_scale + global_shift); + (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift); } } else @@ -110,7 +110,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( - (srcWrap[(int4){src_idx.x, src_idx.y, batchidx, c}] - base) * scale * global_scale + global_shift); + (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift); } } } diff --git a/src/cvcuda/priv/OpFindHomography.cu b/src/cvcuda/priv/OpFindHomography.cu new file mode 100644 index 00000000..d3e712cb --- /dev/null +++ 
b/src/cvcuda/priv/OpFindHomography.cu @@ -0,0 +1,1615 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OpFindHomography.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define BLOCK_SIZE 128 +#define PIPELINES 8 + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +typedef cuda::math::Vector vector8; +typedef cuda::math::Vector vector9; +typedef cuda::math::Vector intvector8; +typedef cuda::math::Vector vector32; +typedef cuda::math::Matrix matrix8x8; +typedef cuda::math::Matrix matrix8x32; +typedef cuda::math::Matrix dmatrix8x8; +typedef cuda::math::Vector dvector8; + +namespace { + +#define is_aligned(POINTER, BYTE_COUNT, msg) \ + do \ + { \ + if (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) != 0) \ + { \ + std::cerr << msg << " at line " << __LINE__ << " in " << __FILE__ << std::endl; \ + return; \ + } \ + } \ + while (0) + +#define CUDA_CHECK_ERROR(err, msg) \ + do \ + { \ + cudaError_t _err = (err); \ + if (_err != cudaSuccess) \ + { \ + std::cerr << "(" << cudaGetErrorString(_err) << ") at line " << __LINE__ << " in " << __FILE__ << " : " \ + << msg << std::endl; \ + return; \ + } \ + } \ + while (0) + +#define CUBLAS_CHECK_ERROR(err, msg) \ + do \ + { \ + cublasStatus_t _err = (err); \ + if (_err != CUBLAS_STATUS_SUCCESS) \ + { \ + std::cerr << "CUBLAS error (" << _err << ") at line " << __LINE__ << " in " << __FILE__ << " : " << msg \ + << std::endl; \ + return; \ + } \ + } \ + while (0) + +#define CUSOLVER_CHECK_ERROR(err, msg) \ + do \ + { \ + cusolverStatus_t _err = (err); \ + if (_err != CUSOLVER_STATUS_SUCCESS) \ + { \ + std::cerr << "CUSOLVER error (" << _err << ") at line " << __LINE__ << " in " << __FILE__ << " : " << msg \ + << std::endl; \ + return; \ + } \ + } \ + while (0) + +#ifdef DEBUG +template +__global__ void printKernel(T *data, int numPoints, int batchIdx) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < numPoints) + printf("Batch = %d, i = %d, val = %.9g,\n", batchIdx, i, (double)data[i]); +} + +__global__ void printKernelfloat2(float2 *data, int numPoints, int batchIdx) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < numPoints) + printf("Batch = %d, i = %d, val = %.9g,%.9g\n", batchIdx, i, (double)data[i].x, (double)data[i].y); +} +#endif + +#ifdef DEBUG_MODEL_KERNEL +template +__global__ void printMatrix(T *data, int M, int N) +{ + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + { + printf("%.9g, ", (double)data[i * N + j]); + } + printf("\n"); + } +} + +template +__global__ void printMatrixCols(T *data, int M, int N) +{ + for (int j = 0; j < N; j++) + { + printf("ROw %d\n", j); + for (int i = 0; i < M; i++) + { + printf("A[%d + %d * lda] = %g;\n", i, j, (double)data[i 
* N + j]); + } + printf("\n"); + } +} + +template +__device__ void printMatrixDevice(cuda::math::Matrix &A) +{ + for (int i = 0; i < N; i++) + { + printf("["); + for (int j = 0; j < N; j++) + { + printf("%.9g, ", A[i][j]); + } + printf("],\n"); + } +} + +template +__device__ void printMatrixDeviceParallel(cuda::math::Matrix &A) +{ + __threadfence(); + if (threadIdx.x < N) + { + for (int i = 0; i < N; i++) + { + printf("A[%d][%d] = %g\n", i, threadIdx.x, (double)A[i][threadIdx.x]); + } + } +} + +template +__device__ void printMatrixDeviceRaw(T *A, int M, int N, int batch) +{ + printf("Batch = %d\n", batch); + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + { + printf("%g, ", (double)A[i * N + j]); + } + printf("\n"); + } +} + +template +__device__ void printVectorDevice(cuda::math::Vector &x, int batch) +{ + printf("Batch = %d\n", batch); + for (int i = 0; i < N; i++) printf("%.9g, ", x[i]); +} + +template +__device__ void printVectorDeviceParallel(cuda::math::Vector &x, int batch) +{ + if (threadIdx.x < N) + printf("x[%d] = %g\n", threadIdx.x, (double)x[threadIdx.x]); +} + +template +__device__ void printVectorDeviceRaw(T *x, int N, int batch) +{ + printf("Batch = %d\n", batch); + for (int i = 0; i < N; i++) printf("%g, ", (double)x[i]); +} +#endif + +__device__ void calculate_residual_and_jacobian_device(float2 *src, float2 *dst, vector8 &h, int numPoints, float *Jptr, + float *errptr); +__device__ void calculate_residual_norm(float *r, float *r_norm_2, vector32 &warpSums, int numPoints); + +__device__ void calculate_Jtx_matvec(float *A, float *B, float *result, matrix8x32 &warpSums, int row, int numPoints); + +__device__ void calculate_JtJ(float *Jt, matrix8x8 &A, matrix8x32 &warpSums, float *reductionBuffer, int numPoints); + +__device__ void calculate_Jtr(float *Jt, float *r, vector8 &v, matrix8x32 &warpSums, float *reductionBuffer, + int numPoints); + +__device__ void fetch_diagonal(matrix8x8 &A, vector8 &D, int tid); + +__device__ void copy_A_to_Ap_App(matrix8x8 &A, matrix8x8 &Ap, matrix8x8 &App); + +__device__ void scale_diagonal8(vector8 &D, matrix8x8 &Ap, float lambda); + +__device__ void compute_qr8x8(matrix8x8 &sA, matrix8x8 &sQ); + +__device__ bool backsolve_inplace(matrix8x8 &A, vector8 &d); + +__device__ bool solve8x8(matrix8x8 &A, matrix8x8 &Q, vector8 &v, vector8 &d, int tid); + +__device__ bool invert8x8(matrix8x8 &A, matrix8x8 &Q, matrix8x8 &invA, int tid); + +__device__ void subtract8(vector8 &x, vector8 &d, vector8 &xd, int tid); + +__device__ void max_diag_val8(matrix8x8 &A, float *maxval); + +__device__ void max8(vector8 &v, float *maxval); + +__device__ static float atomicMax(float *address, float val); + +__device__ void calculate_temp_d(matrix8x8 &A, vector8 &x, vector8 &y, vector8 &z, float alpha, float beta, int tid); + +__device__ int compute_model_estimate(float2 cM, float2 cm, float2 sM, float2 sm, float *W, float *V, vector8 &x, + cuda::Tensor3DWrap model, int batch, int numPoints); + +__device__ void calculate_residual_and_jacobian_device(float2 *src, float2 *dst, vector8 &h, int numPoints, float *Jptr, + float *errptr) +{ + int idx = threadIdx.x; + + for (int tid = idx; tid < numPoints; tid += blockDim.x) + { + float2 M_i = src[tid]; + float2 m_i = dst[tid]; + float Mx = M_i.x, My = M_i.y; + float mx = m_i.x, my = m_i.y; + + float ww = h[6] * Mx + h[7] * My + 1.; + ww = fabs(ww) > FLT_EPSILON ? 1. 
/ ww : 0; + float xi = (h[0] * Mx + h[1] * My + h[2]) * ww; + float yi = (h[3] * Mx + h[4] * My + h[5]) * ww; + + errptr[tid * 2] = xi - mx; + errptr[tid * 2 + 1] = yi - my; + + if (Jptr) + { + // Column major format + Jptr[tid * 2 + numPoints * 0 + 0] = Mx * ww; + Jptr[tid * 2 + numPoints * 0 + 1] = 0; + Jptr[tid * 2 + numPoints * 2 + 0] = My * ww; + Jptr[tid * 2 + numPoints * 2 + 1] = 0; + Jptr[tid * 2 + numPoints * 4 + 0] = ww; + Jptr[tid * 2 + numPoints * 4 + 1] = 0; + Jptr[tid * 2 + numPoints * 6 + 0] = 0; + Jptr[tid * 2 + numPoints * 6 + 1] = Mx * ww; + Jptr[tid * 2 + numPoints * 8 + 0] = 0; + Jptr[tid * 2 + numPoints * 8 + 1] = My * ww; + Jptr[tid * 2 + numPoints * 10 + 0] = 0; + Jptr[tid * 2 + numPoints * 10 + 1] = ww; + Jptr[tid * 2 + numPoints * 12 + 0] = -Mx * ww * xi; + Jptr[tid * 2 + numPoints * 12 + 1] = -Mx * ww * yi; + Jptr[tid * 2 + numPoints * 14 + 0] = -My * ww * xi; + Jptr[tid * 2 + numPoints * 14 + 1] = -My * ww * yi; + } + } +} + +__host__ __device__ inline float myfabs(float val) +{ + return fabsf(val); +} + +inline __host__ __device__ float2 myfabs2(float2 val) +{ + float2 ret; + ret.x = fabsf(val.x); + ret.y = fabsf(val.y); + return ret; +} + +__host__ __device__ inline int getNumPoints(cuda::Tensor2DWrap src, int numPoints, int batch) +{ + return numPoints; +} + +struct MeanOp +{ + __host__ __device__ float2 eval(float2 val, int numPoints, int batch) + { + return val / numPoints; + } +}; + +struct SquareOp +{ + __host__ __device__ float eval(float val, int batch) + { + return val * val; + } +}; + +class AbsShiftOp +{ +private: + float2 *_data; + +public: + // Constructor that takes a float* pointer as a parameter + __host__ __device__ AbsShiftOp(float2 *data) + : _data(data){}; + + // Method to update the float value pointed to by the pointer + __host__ __device__ float2 eval(float2 newVal, int numPoints, int batch) + { + _data += batch; + return myfabs2(newVal - _data[0]); + } +}; + +class LtLOp +{ +private: + float2 *cm, *cM, *sm, *sM; + +public: + __host__ __device__ LtLOp(float2 *srcMean, float2 *dstMean, float2 *srcShiftSum, float2 *dstShiftSum) + { + cM = srcMean; + sM = srcShiftSum; + cm = dstMean; + sm = dstShiftSum; + } + + __host__ __device__ float eval(float2 *src, float2 *dst, int batch, int numPoints, int tid, int j, int k) + { + cm += batch; + cM += batch; + sm += batch; + sM += batch; + float X = (src[tid].x - cM[0].x) * (numPoints / sM[0].x); + float Y = (src[tid].y - cM[0].y) * (numPoints / sM[0].y); + float x = (dst[tid].x - cm[0].x) * (numPoints / sm[0].x); + float y = (dst[tid].y - cm[0].y) * (numPoints / sm[0].y); + float Lx[9] = {X, Y, 1, 0, 0, 0, -x * X, -x * Y, -x}; + float Ly[9] = {0, 0, 0, X, Y, 1, -y * X, -y * Y, -y}; + return Lx[j] * Lx[k] + Ly[j] * Ly[k]; + } +}; + +template +__device__ void reducef(float *data, cuda::math::Vector &warpSums, float *result, Func op, int numPoints, + int batch) +{ + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + float val = 0.0f; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + val += op.eval(data[idx], batch); + idx += gridDim.x * blockDim.x; + } + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val += __shfl_down_sync(mask, val, offset); + if (lane == 0) + warpSums[warpID] = val; + + __syncthreads(); + + if (warpID == 0) + { + val = (tid < blockDim.x / warpSize) ? 
warpSums[lane] : 0.0f; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val += __shfl_down_sync(mask, val, offset); + + if (tid == 0) + atomicAdd(result, val); + } +} + +template +__device__ void reducef2(float2 *data, cuda::math::Vector &warpSums, float2 *result, Func op, int numPoints, + int batch) +{ + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + float2 val = {0.0f, 0.0f}; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + val += op.eval(data[idx], numPoints, batch); + idx += gridDim.x * blockDim.x; + } + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) + { + val.x += __shfl_down_sync(mask, val.x, offset); + val.y += __shfl_down_sync(mask, val.y, offset); + } + if (lane == 0) + warpSums[warpID] = val; + + __syncthreads(); + + if (warpID == 0) + { + val = (tid < blockDim.x / warpSize) ? warpSums[lane] : float2{0.0f, 0.0f}; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) + { + val.x += __shfl_down_sync(mask, val.x, offset); + val.y += __shfl_down_sync(mask, val.y, offset); + } + + if (tid == 0) + { + atomicAdd(&result[0].x, val.x); + atomicAdd(&result[0].y, val.y); + } + } +} + +template +__device__ void reduceLtL(float2 *src, float2 *dst, cuda::math::Vector &warpSums, float *result, Func op, + int numPoints, int batch, int j, int k) +{ + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + float val = 0.0f; + ; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + // j < 9 and k < 9 are indices of the LtL matrix + val += op.eval(src, dst, batch, numPoints, idx, j, k); + idx += gridDim.x * blockDim.x; + } + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val += __shfl_down_sync(mask, val, offset); + if (lane == 0) + warpSums[warpID] = val; + __syncthreads(); + + if (warpID == 0) + { + val = (tid < blockDim.x / warpSize) ? warpSums[lane] : 0.0f; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val += __shfl_down_sync(mask, val, offset); + + if (tid == 0) + atomicAdd(result, val); + } +} + +__device__ void calculate_residual_norm(float *r, float *r_norm_2, vector32 &warpSums, int numPoints) +{ + SquareOp square_op; + reducef(r, warpSums, r_norm_2, square_op, numPoints, 0); + __syncthreads(); +} + +__device__ void calculate_Jtx_matvec(float *A, float *B, float *result, matrix8x32 &warpSums, int row, int numPoints) +{ + // NOTE : Jt has to be of dimension (8 x innerDim) where innerDim = numPoints x 2 + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (threadIdx.x < 8) + { + for (int i = 0; i < 8; i++) + { + warpSums[i][threadIdx.x] = 0; + } + } + __syncthreads(); + + float val[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + float src_data_val = A[row * numPoints + idx]; +#pragma unroll + for (int r = row; r < 8; r++) val[r] += src_data_val * B[r * numPoints + idx]; + idx += gridDim.x * blockDim.x; + } + + for (int r = row; r < 8; r++) + { + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val[r] += __shfl_down_sync(mask, val[r], offset); + if (lane == 0) + warpSums[r][warpID] = val[r]; + } + __syncthreads(); + + if (warpID == 0) + { +#pragma unroll + for (int r = row; r < 8; r++) + { + val[r] = (tid < blockDim.x / warpSize) ? 
warpSums[r][lane] : 0; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val[r] += __shfl_down_sync(mask, val[r], offset); + + if (tid == 0) + atomicAdd(&result[r], val[r]); + } + } +} + +__device__ void calculate_JtJ(float *Jt, matrix8x8 &A, matrix8x32 &warpSums, float *reductionBuffer, int numPoints) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int row = 0; row < 8; row++) + { + if (tid < 8) + reductionBuffer[tid] = 0; + calculate_Jtx_matvec(Jt, Jt, reductionBuffer, warpSums, row, numPoints); + __syncthreads(); + if (tid < 8) + { + A[row][tid] = reductionBuffer[tid]; + } + } + __syncwarp(); + + for (int row = 1; row < 8; row++) + { + if (tid < row) + A[row][tid] = A[tid][row]; + } + __syncwarp(); +} + +__device__ void calculate_Jtr(float *Jt, float *r, vector8 &v, matrix8x32 &warpSums, float *reductionBuffer, + int numPoints) +{ + if (threadIdx.x < 8) + reductionBuffer[threadIdx.x] = 0.0f; + calculate_Jtx_matvec(r, Jt, reductionBuffer, warpSums, 0, numPoints); + __syncthreads(); + if (threadIdx.x < 8) + v[threadIdx.x] = reductionBuffer[threadIdx.x]; + __syncwarp(); +} + +__device__ void fetch_diagonal(matrix8x8 &A, vector8 &D, int tid) +{ + if (tid < 8) + D[tid] = A[tid][tid]; + __syncwarp(); +} + +__device__ void copy_A_to_Ap_App(matrix8x8 &A, matrix8x8 &Ap, matrix8x8 &App) +{ + if (threadIdx.x < 8) + { + for (int i = 0; i < 8; i++) + { + Ap[i][threadIdx.x] = A[i][threadIdx.x]; + App[i][threadIdx.x] = A[i][threadIdx.x]; + } + } + __syncwarp(); +} + +__device__ void scale_diagonal8(vector8 &D, matrix8x8 &Ap, float lambda) +{ + if (threadIdx.x < 8) + Ap[threadIdx.x][threadIdx.x] += lambda * D[threadIdx.x]; + __syncwarp(); +} + +__device__ void compute_qr8x8(matrix8x8 &sA, matrix8x8 &sQ) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int N = 8; + if (tid < N) + { + for (int i = 0; i < N; i++) + { + sQ[i][tid] = 0; + if (i == tid) + sQ[i][tid] = 1; + } + } + __syncwarp(); + + float s[2]; + double temp[2]; + for (int j = 0; j < N; j++) + { + int pivot_row = j; + for (int i = j + 1; i < N; i++) + { + if (tid < N) + { + double theta = atan(-(double)sA[i][j] / (double)sA[pivot_row][j]); + double ctheta = cos(theta); + double stheta = sin(theta); + float sthetaf = (float)stheta; + float cthetaf = (float)ctheta; + + temp[0] = ctheta * sA[pivot_row][tid] - stheta * sA[i][tid]; + temp[1] = stheta * sA[pivot_row][tid] + ctheta * sA[i][tid]; + sA[pivot_row][tid] = temp[0]; + sA[i][tid] = temp[1]; + + s[0] = cthetaf * sQ[pivot_row][tid] - sthetaf * sQ[i][tid]; + s[1] = sthetaf * sQ[pivot_row][tid] + cthetaf * sQ[i][tid]; + sQ[pivot_row][tid] = s[0]; + sQ[i][tid] = s[1]; + } + __syncwarp(); + } + } + __syncwarp(); +} + +__device__ bool backsolve_inplace(matrix8x8 &A, vector8 &d) +{ + const int N = 8; + for (int j = N - 1; j >= 0; j--) + { + if (A[j][j] < FLT_EPSILON) + return false; + d[j] /= A[j][j]; + for (int i = j - 1; i >= 0; i--) + { + d[i] = d[i] - A[i][j] * d[j]; + } + } + return true; +} + +__device__ bool solve8x8(matrix8x8 &A, matrix8x8 &Q, vector8 &v, vector8 &d, int tid) +{ + // Do Q^T * d + if (tid < 8) + { + d[tid] = 0; + for (int i = 0; i < 8; i++) d[tid] += Q[tid][i] * v[i]; + } + + __syncwarp(); + + if (tid == 0) + { + if (!backsolve_inplace(A, d)) + return false; + } + + __syncwarp(); + + return true; +} + +__device__ bool invert8x8(matrix8x8 &A, matrix8x8 &Q, matrix8x8 &invA, int tid) +{ + if (tid < 8) + { + vector8 d = Q.col(tid); + if (!backsolve_inplace(A, d)) + return false; + invA.set_col(tid, d); + } + __syncwarp(); + return true; +} 
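+// Note on the 8x8 helpers above: compute_qr8x8 triangularizes the damped normal-equation matrix with Givens
+// rotations while accumulating the rotations in Q, backsolve_inplace performs the upper-triangular back
+// substitution, solve8x8 combines the two to solve A*d = v for the Levenberg-Marquardt step, and invert8x8
+// reuses the same factorization to recover A^-1 one column per thread.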
+ +__device__ void subtract8(vector8 &x, vector8 &d, vector8 &xd, int tid) +{ + if (tid < 8) + xd[tid] = x[tid] - d[tid]; + __syncwarp(); +} + +__device__ inline void dot8(vector8 &x, vector8 &y, float *r) +{ + *r = x[0] * y[0] + x[1] * y[1] + x[2] * y[2] + x[3] * y[3] + x[4] * y[4] + x[5] * y[5] + x[6] * y[6] + x[7] * y[7]; +} + +__device__ void max_diag_val8(matrix8x8 &A, float *maxval) +{ + *maxval = A[0][0]; + *maxval = fmaxf(A[1][1], *maxval); + *maxval = fmaxf(A[2][2], *maxval); + *maxval = fmaxf(A[3][3], *maxval); + *maxval = fmaxf(A[4][4], *maxval); + *maxval = fmaxf(A[5][5], *maxval); + *maxval = fmaxf(A[6][6], *maxval); + *maxval = fmaxf(A[7][7], *maxval); +} + +__device__ void max8(vector8 &v, float *maxval) +{ + *maxval = fabsf(v[0]); + *maxval = fmaxf(fabsf(v[1]), *maxval); + *maxval = fmaxf(fabsf(v[2]), *maxval); + *maxval = fmaxf(fabsf(v[3]), *maxval); + *maxval = fmaxf(fabsf(v[4]), *maxval); + *maxval = fmaxf(fabsf(v[5]), *maxval); + *maxval = fmaxf(fabsf(v[6]), *maxval); + *maxval = fmaxf(fabsf(v[7]), *maxval); +} + +__device__ static float atomicMax(float *address, float val) +{ + int *address_as_i = (int *)address; + int old = *address_as_i, assumed; + do + { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, __float_as_int(::fmaxf(val, __int_as_float(assumed)))); + } + while (assumed != old); + return __int_as_float(old); +} + +template +__device__ void max(float *data, vector32 &warpSums, float *result, int numPoints) +{ + int tid = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + float val = 0.0f; + unsigned mask = 0xFFFFFFFFU; + int lane = threadIdx.x % warpSize; + int warpID = threadIdx.x / warpSize; + while (idx < numPoints) + { + val = fmaxf(val, Func(data[idx])); + idx += gridDim.x * blockDim.x; + } + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val = fmaxf(val, __shfl_down_sync(mask, val, offset)); + if (lane == 0) + warpSums[warpID] = val; + __syncthreads(); + + if (warpID == 0) + { + val = (tid < blockDim.x / warpSize) ? 
warpSums[lane] : 0; + + for (int offset = warpSize / 2; offset > 0; offset >>= 1) val = fmaxf(val, __shfl_down_sync(mask, val, offset)); + + if (tid == 0) + atomicMax(result, val); + } +} + +__device__ void calculate_temp_d(matrix8x8 &A, vector8 &x, vector8 &y, vector8 &z, float alpha, float beta, int tid) +{ + if (tid < 8) + { + z[tid] = beta * y[tid]; +#pragma unroll + for (int i = 0; i < 8; i++) z[tid] += alpha * A[tid][i] * x[i]; + } + __syncwarp(); +} + +__device__ int compute_model_estimate(float2 cM, float2 cm, float2 sM, float2 sm, float *W, float *V, vector8 &x, + cuda::Tensor3DWrap model, int batch, int numPoints) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if (sm.x < FLT_EPSILON || sm.y < FLT_EPSILON || sM.x < FLT_EPSILON || sM.y < FLT_EPSILON) + { + if (tid < 8) + x[tid] = 0; + __syncwarp(); + return 1; + } + + // compute model estimate + float2 _sm{numPoints / sm.x, numPoints / sm.y}; + float2 _sM{numPoints / sM.x, numPoints / sM.y}; + + int minIdx = 0; + float minEig = fabs(W[0]); + + for (int i = 1; i < 9; i++) + { + if (fabs(W[i]) < minEig) + { + minIdx = i; + minEig = fabs(W[i]); + } + } + + float *H0 = V + 9 * minIdx; + +#ifdef DEBUG_MODEL_ESTIMATE + if (tid == 0) + { + for (int i = 0; i < 9; i++) printf("H0[%d] = %.9g\n", i, H0[i]); + } +#endif + + cuda::math::Matrix tH0; + cuda::math::Matrix tHtemp1; + cuda::math::Matrix tHtemp2; + + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + tH0[i][j] = H0[i * 3 + j]; + } + } + + // load inv_Hnorm + tHtemp2[0][0] = 1.0f / _sm.x; + tHtemp2[0][1] = 0.0f; + tHtemp2[0][2] = cm.x; + tHtemp2[1][0] = 0.0f; + tHtemp2[1][1] = 1.0f / _sm.y; + tHtemp2[1][2] = cm.y; + tHtemp2[2][0] = 0.0f; + tHtemp2[2][1] = 0.0f; + tHtemp2[2][2] = 1.0f; + tHtemp1 = tHtemp2 * tH0; + +#ifdef DEBUG_MODEL_ESTIMATE + if (tid == 0) + { + printf("\n========================_Htemp=========================\n"); + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + printf("_Htemp[%d][%d] = %.9g,", i, j, tHtemp1[i][j]); + } + printf("\n"); + } + } +#endif + + // load Hnorm2 + tHtemp2[0][0] = _sM.x; + tHtemp2[0][1] = 0.0f; + tHtemp2[0][2] = -cM.x * _sM.x; + tHtemp2[1][0] = 0.0f; + tHtemp2[1][1] = _sM.y; + tHtemp2[1][2] = -cM.y * _sM.y; + tHtemp2[2][0] = 0.0f; + tHtemp2[2][1] = 0.0f; + tHtemp2[2][2] = 1.0f; + tH0 = tHtemp1 * tHtemp2; + +#ifdef DEBUG_MODEL_ESTIMATE + if (tid == 0) + { + printf("\n===============_H0====================\n"); + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + printf("_H0[%d][%d] = %.9g,", i, j, tH0[i][j]); + } + printf("\n"); + } + } +#endif + +#pragma unroll + for (int i = 0; i < 3; i++) +#pragma unroll + for (int j = 0; j < 3; j++) tH0[i][j] = tH0[i][j] / tH0[2][2]; + +#ifdef DEBUG_MODEL_ESTIMATE + if (tid == 0) + { + printf("\n===============_H0====================\n"); + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + printf("_H0[%d][%d] = %.9g,", i, j, tH0[i][j]); + } + printf("\n"); + } + } +#endif + + if (tid == 0) + { + x[0] = tH0[0][0]; + x[1] = tH0[0][1]; + x[2] = tH0[0][2]; + x[3] = tH0[1][0]; + x[4] = tH0[1][1]; + x[5] = tH0[1][2]; + x[6] = tH0[2][0]; + x[7] = tH0[2][1]; + } + __syncwarp(); + __syncthreads(); + return 0; +} + +template +__global__ void computeModel(SrcDstWrapper src, SrcDstWrapper dst, float2 *srcMean, float2 *dstMean, + float2 *srcShiftSum, float2 *dstShiftSum, float *V_batch, float *W_batch, float *r_batch, + float *J_batch, float *calc_buffer_batch, ModelWrapper model, int maxNumPoints, + int batchSize) +{ + int tid = 
threadIdx.x + blockIdx.x * blockDim.x; + int batch = blockIdx.y; + + if (batch < batchSize) + { + int numPoints = getNumPoints(src, maxNumPoints, batch); + float2 *srcPtr = src.ptr(batch); + float2 *dstPtr = dst.ptr(batch); + float2 cM = srcMean[batch]; + float2 sM = srcShiftSum[batch]; + float2 cm = dstMean[batch]; + float2 sm = dstShiftSum[batch]; + float *W = W_batch + 9 * batch; + float *V = V_batch + 81 * batch; + float *r = r_batch + 2 * numPoints * batch; + float *J = J_batch + 2 * numPoints * 8 * batch; + float *calc_buffer = calc_buffer_batch + numPoints * batch; + float *modelPtr = model.ptr(batch); + bool status = true; + + __shared__ matrix8x32 shared_mem; + __shared__ vector8 v; + __shared__ vector8 d; + __shared__ vector8 D; + __shared__ vector8 xd; + __shared__ vector8 x; + __shared__ vector8 temp_d; + __shared__ matrix8x8 A; + __shared__ matrix8x8 Ap; + __shared__ matrix8x8 App; + __shared__ matrix8x8 Q; + + int ret = compute_model_estimate(cM, cm, sM, sm, W, V, x, model, batch, numPoints); + if (!(ret || numPoints == 4)) + { +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0 && blockIdx.y == 0) + { + printf("Model estimated Matrix\n"); + printVectorDevice(x, blockIdx.y); + printf("\n"); + } +#endif + + // Begin refinement + calculate_residual_and_jacobian_device(srcPtr, dstPtr, x, numPoints, J, r); + + calculate_residual_norm(r, calc_buffer, shared_mem[0], numPoints * 2); + float S = calc_buffer[0]; + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n\n============Residual================\n"); + printVectorDeviceRaw(r, 2 * numPoints, blockIdx.y); + printf("\n\n============Jacobian================\n"); + printMatrixDeviceRaw(J, 8, 2 * numPoints, blockIdx.y); + printf("\n\n============Residual L2 norm==================\n"); + printf("S = %f\n", S); + } +#endif + + int nfJ = 2; + + if (tid < 8) + calc_buffer[tid] = 0; + calculate_JtJ(J, A, shared_mem, calc_buffer, numPoints * 2); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n================ J^T * J = A ================\n"); + printMatrixDevice(A); + printf("\n\n"); + } +#endif + + if (tid < 8) + calc_buffer[tid] = 0; + calculate_Jtr(J, r, v, shared_mem, calc_buffer, numPoints * 2); + + // only blockIdx.x == 0 needs to do this right now. 
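+            // D caches diag(J^T J); the loop below is a Levenberg-Marquardt iteration that solves the damped
+            // system (J^T J + lambda * diag(D)) d = J^T r with the 8x8 Givens-QR helpers and accepts or rejects
+            // each step based on the gain ratio R.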
+ fetch_diagonal(A, D, tid); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n=============== J^T * r = v ===================\n"); + printVectorDevice(v, blockIdx.y); + printf("\n================ D ========================\n"); + printVectorDevice(D, blockIdx.y); + printf("\n"); + } +#endif + + const float Rlo = 0.25, Rhi = 0.75; + float lambda = 1, lc = 0.75; + int iter = 0, maxIters = 10; + float epsx = 1.19209290e-7f; + float epsf = 1.19209290e-7f; + bool status; + + while (true) + { +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n========================================\n"); + printf("================== ITER = %d =============\n", iter); + printf("==========================================\n"); + printf("\n=============== A before copying ===================\n"); + printMatrixDevice(A); + } + +#endif + copy_A_to_Ap_App(A, Ap, App); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n=============== Ap before scaling of diagonal ===================\n"); + printMatrixDevice(Ap); + } +#endif + // blockIdx.x == 0 + scale_diagonal8(D, Ap, lambda); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n================ D ========================\n"); + printVectorDevice(D, blockIdx.y); + printf("\n=============== Ap after scaling of diagonal ===================\n"); + printMatrixDevice(Ap); + } +#endif + + compute_qr8x8(Ap, Q); + status = solve8x8(Ap, Q, v, d, tid); + if (!status) + break; + + subtract8(x, d, xd, tid); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n=============== d ====================\n"); + printVectorDevice(d, blockIdx.y); + printf("\n=============== xd ===================\n"); + printVectorDevice(xd, blockIdx.y); + } +#endif + + // calculate residual but not Jacobian + __syncthreads(); + calculate_residual_and_jacobian_device(srcPtr, dstPtr, xd, numPoints, nullptr, r); + + nfJ++; + + float Sd; + if (tid < 8) + calc_buffer[tid] = 0; + calculate_residual_norm(r, calc_buffer, shared_mem[0], numPoints * 2); + Sd = calc_buffer[0]; + + calculate_temp_d(A, d, v, temp_d, -1.0f, 2.0f, tid); + + float dS; + __syncthreads(); + dot8(d, temp_d, &dS); + + float R = (S - Sd) / (fabsf(dS) > FLT_EPSILON ? dS : 1); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n=============== r ====================\n"); + printVectorDeviceRaw(r, 2 * numPoints, blockIdx.y); + printf("\n============== || r || ==================\n"); + printf("||r||^2 = %f\n", Sd); + printf("\ndS = %f\n", dS); + printf("\nR = %f\n", R); + } +#endif + + if (R > Rhi) + { + lambda *= 0.5; + if (lambda < lc) + lambda = 0; + } + else if (R < Rlo) + { + float t; + dot8(d, v, &t); + + float nu = (Sd - S) / (fabsf(t) > FLT_EPSILON ? t : 1.0f) + 2.0f; + nu = fminf(fmaxf(nu, 2.0f), 10.0f); + + if (lambda == 0) + { + compute_qr8x8(App, Q); + status = invert8x8(App, Q, Ap, tid); + if (!status) + break; + + float maxval; + max_diag_val8(Ap, &maxval); + + lambda = lc = 1. 
/ maxval; + nu *= 0.5; + } + lambda *= nu; + } + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\nlambda = %f\n", lambda); + } +#endif + + if (Sd < S) + { + nfJ++; + S = Sd; + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n================== Before swapping =======================\n"); + printf("\n =================== X =======================\n"); + printVectorDevice(x, blockIdx.y); + printf("\n =================== Xd =======================\n"); + printVectorDevice(xd, blockIdx.y); + } +#endif + + if (tid < 8) + cuda::math::detail::swap(x[tid], xd[tid]); + __syncwarp(); + __syncthreads(); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n================== After swapping =======================\n"); + printf("\n =================== X =======================\n"); + printVectorDevice(x, blockIdx.y); + printf("\n =================== Xd =======================\n"); + printVectorDevice(xd, blockIdx.y); + } +#endif + + calculate_residual_and_jacobian_device(srcPtr, dstPtr, x, numPoints, J, r); + calculate_JtJ(J, A, shared_mem, calc_buffer, numPoints * 2); + calculate_Jtr(J, r, v, shared_mem, calc_buffer, numPoints * 2); + +#ifdef DEBUG_MODEL_KERNEL + if (tid == 0) + { + printf("\n =================== J =======================\n"); + printMatrixDeviceRaw(J, 8, 2 * numPoints, blockIdx.y); + printf("\n\n =================== r =======================\n"); + printVectorDeviceRaw(r, 2 * numPoints, blockIdx.y); + printf("\n\n==================== A ========================\n"); + printMatrixDevice(A); + printf("\n\n===================== v ========================\n"); + printVectorDevice(v, blockIdx.y); + printf("\n"); + } +#endif + } + + iter++; + + if (tid == 0) + calc_buffer[tid] = 0; + max(r, shared_mem[0], calc_buffer, numPoints * 2); + __syncthreads(); + float maxResidualValue = calc_buffer[0]; + float maxDvecValue; + max8(d, &maxDvecValue); + + bool proceed = maxDvecValue >= epsx && maxResidualValue >= epsf && iter < maxIters; + if (!proceed) + break; + } + } + + // Copy back the estimate to output buffer + if (tid == 0) + { + if (status) + { + *(model.ptr(batch, 0, 0)) = x[0]; + *(model.ptr(batch, 0, 1)) = x[1]; + *(model.ptr(batch, 0, 2)) = x[2]; + *(model.ptr(batch, 1, 0)) = x[3]; + *(model.ptr(batch, 1, 1)) = x[4]; + *(model.ptr(batch, 1, 2)) = x[5]; + *(model.ptr(batch, 2, 0)) = x[6]; + *(model.ptr(batch, 2, 1)) = x[7]; + *(model.ptr(batch, 2, 2)) = 1; + } + else + { + *(model.ptr(batch, 0, 0)) = 0; + *(model.ptr(batch, 0, 1)) = 0; + *(model.ptr(batch, 0, 2)) = 0; + *(model.ptr(batch, 1, 0)) = 0; + *(model.ptr(batch, 1, 1)) = 0; + *(model.ptr(batch, 1, 2)) = 0; + *(model.ptr(batch, 2, 0)) = 0; + *(model.ptr(batch, 2, 1)) = 0; + *(model.ptr(batch, 2, 2)) = 0; + } + } + } +} + +template +__global__ void compute_src_dst_mean(SrcDstWrapper src, SrcDstWrapper dst, float2 *srcMean, float2 *dstMean, + Func src_op, Func dst_op, int maxNumPoints, int batchSize) +{ + int batch = blockIdx.y; + __shared__ cuda::math::Vector warpSums; + if (batch < batchSize) + { + int numPoints = getNumPoints(src, maxNumPoints, batch); + float2 *srcMeanBatch = srcMean + batch; + float2 *dstMeanBatch = dstMean + batch; + float2 *srcPtr = src.ptr(batch); + float2 *dstPtr = dst.ptr(batch); + reducef2(srcPtr, warpSums, srcMeanBatch, src_op, numPoints, batch); + __syncthreads(); + reducef2(dstPtr, warpSums, dstMeanBatch, dst_op, numPoints, batch); + } +} + +template +__global__ void compute_LtL(SrcDstWrapper src, SrcDstWrapper dst, float *LtL, Func ltl_op, int maxNumPoints, + int batchSize) 
+{ + int batch = blockIdx.z; + int j = blockIdx.y / 9; // LtL row index + int k = blockIdx.y % 9; // LtL col index + __shared__ cuda::math::Vector warpSums; + if (batch < batchSize) + { + int numPoints = getNumPoints(src, maxNumPoints, batch); + float *LtLBatch = LtL + 81 * batch; + float2 *srcPtr = src.ptr(batch); + float2 *dstPtr = dst.ptr(batch); + reduceLtL(srcPtr, dstPtr, warpSums, &LtLBatch[j * 9 + k], ltl_op, numPoints, batch, j, k); + } +} + +/* numPoints should be maxNumPoints in the case of varshape. */ +template +void FindHomographyWrapper(SrcDstWrapper srcWrap, SrcDstWrapper dstWrap, ModelType &models, + const BufferOffsets *bufferOffset, const cuSolver *cusolverData, int numPoints, + cudaStream_t stream) +{ + dim3 block(256, 1, 1); + cuda::Tensor3DWrap modelWrap = cuda::CreateTensorWrapNHW(models); + int batchSize = models.shape(0); + + float2 *srcMean = bufferOffset->srcMean; + float2 *dstMean = bufferOffset->dstMean; + float2 *srcShiftSum = bufferOffset->srcShiftSum; + float2 *dstShiftSum = bufferOffset->dstShiftSum; + float *J = bufferOffset->J; + float *r = bufferOffset->r; + float *LtL = bufferOffset->LtL; + float *W = bufferOffset->W; + float *calc_buffer = bufferOffset->calc_buffer; + float *cusolverBuffer = cusolverData->cusolverBuffer; + int *cusolverInfo = cusolverData->cusolverInfo; + int lwork = cusolverData->lwork; + cusolverDnHandle_t cusolverH = cusolverData->cusolverH; + syevjInfo_t syevj_params = cusolverData->syevj_params; + + cudaMemsetAsync(reinterpret_cast(srcMean), 0, batchSize * sizeof(float2), stream); + cudaMemsetAsync(reinterpret_cast(dstMean), 0, batchSize * sizeof(float2), stream); + cudaMemsetAsync(reinterpret_cast(srcShiftSum), 0, batchSize * sizeof(float2), stream); + cudaMemsetAsync(reinterpret_cast(dstShiftSum), 0, batchSize * sizeof(float2), stream); + cudaMemsetAsync(reinterpret_cast(J), 0, 2 * numPoints * 8 * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(r), 0, 2 * numPoints * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(LtL), 0, 81 * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(W), 0, 9 * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(calc_buffer), 0, numPoints * batchSize * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(cusolverBuffer), 0, lwork * sizeof(float), stream); + cudaMemsetAsync(reinterpret_cast(cusolverInfo), 0, batchSize * sizeof(int), stream); + + dim3 grid((numPoints + block.x - 1) / block.x, batchSize, 1); + + MeanOp meanop; + compute_src_dst_mean<<>>(srcWrap, dstWrap, srcMean, dstMean, meanop, meanop, numPoints, + batchSize); +#ifdef DEBUG + int check_batch = 0; + printKernelfloat2<<<1, 1, 0, stream>>>(srcMean + check_batch, 1, 0); + printKernelfloat2<<<1, 1, 0, stream>>>(dstMean + check_batch, 1, 0); +#endif + + AbsShiftOp src_abs_shift_op(srcMean); + AbsShiftOp dst_abs_shift_op(dstMean); + compute_src_dst_mean<<>>(srcWrap, dstWrap, srcShiftSum, dstShiftSum, src_abs_shift_op, + dst_abs_shift_op, numPoints, batchSize); +#ifdef DEBUG + printKernelfloat2<<<1, 1, 0, stream>>>(srcShiftSum + check_batch, 1, 0); + printKernelfloat2<<<1, 1, 0, stream>>>(dstShiftSum + check_batch, 1, 0); +#endif + + grid.y = 81; + grid.z = batchSize; + LtLOp ltl_op(srcMean, dstMean, srcShiftSum, dstShiftSum); + compute_LtL<<>>(srcWrap, dstWrap, LtL, ltl_op, numPoints, batchSize); +#ifdef DEBUG + for (int b = 0; b < batchSize; b++) + { + std::cout << "==================== Batch " << b << " =======================" << std::endl; + 
printMatrix<<<1, 1, 0, stream>>>(LtL + 81 * b, 9, 9); + cudaStreamSynchronize(stream); + } +#endif + + // compute Eigen values + CUSOLVER_CHECK_ERROR(cusolverDnSetStream(cusolverH, stream), "Failed to set cuda stream in cusolver"); + CUSOLVER_CHECK_ERROR(cusolverDnSsyevjBatched(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER, 9, LtL, 9, + W, cusolverBuffer, lwork, cusolverInfo, syevj_params, batchSize), + "Failed to calculate eigen values using syevj"); +#ifdef DEBUG + CUDA_CHECK_ERROR(cudaStreamSynchronize(stream), "synchronization failed after eigen value solver"); + std::vector info(batchSize); + cudaMemcpyAsync((void *)info.data(), (void *)cusolverInfo, batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream); + CUDA_CHECK_ERROR(cudaStreamSynchronize(stream), "synchronization failed after copying back cusolverInfo"); + + for (int b = 0; b < batchSize; b++) + { + if (info[b] == 0) + { + std::cout << "cusolver converged for matrix " << b << std::endl; + printKernel<<<1, 9, 0, stream>>>(W + 9 * b, 9, 0); + printf("\n"); + } + else if (info[b] < 0) + { + std::cout << info[b] << "th parameter is wrong for image " << b << std::endl; + } + else + { + std::cout << "cusolver did not converge for image " << b << std::endl; + } + CUDA_CHECK_ERROR(cudaStreamSynchronize(stream), "failed to synchronize"); + } +#endif + + block.x = 256; + grid.x = 1; + grid.y = batchSize; + grid.z = 1; + computeModel<<>>(srcWrap, dstWrap, srcMean, dstMean, srcShiftSum, dstShiftSum, LtL, W, r, J, + calc_buffer, modelWrap, numPoints, batchSize); +} + +template +void RunFindHomography(const SrcDstType &src, const SrcDstType &dst, const nvcv::TensorDataStridedCuda &models, + const BufferOffsets *bufferOffset, const cuSolver *cusolverData, cudaStream_t stream) +{ + using SrcDstWrapper = cuda::Tensor2DWrap; + SrcDstWrapper srcWrap(src); + SrcDstWrapper dstWrap(dst); + int numPoints = src.shape(1); + FindHomographyWrapper(srcWrap, dstWrap, models, bufferOffset, cusolverData, numPoints, stream); +} + +} // namespace + +namespace cvcuda::priv { + +// Constructor ----------------------------------------------------------------- + +FindHomography::FindHomography(int batchSize, int maxNumPoints) +{ + cudaMalloc(reinterpret_cast(&(bufferOffset.srcMean)), sizeof(float2) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.dstMean)), sizeof(float2) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.srcShiftSum)), sizeof(float2) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.dstShiftSum)), sizeof(float2) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.LtL)), 81 * sizeof(float) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.W)), 9 * sizeof(float) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.r)), 2 * maxNumPoints * sizeof(float) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.J)), 2 * maxNumPoints * 8 * sizeof(float) * batchSize); + cudaMalloc(reinterpret_cast(&(bufferOffset.calc_buffer)), maxNumPoints * sizeof(float) * batchSize); + CUSOLVER_CHECK_ERROR(cusolverDnCreate(&(cusolverData.cusolverH)), "Failed to create cusolver handle"); + CUSOLVER_CHECK_ERROR(cusolverDnCreateSyevjInfo(&(cusolverData.syevj_params)), "Failed to create syevj params"); + CUSOLVER_CHECK_ERROR(cusolverDnXsyevjSetTolerance(cusolverData.syevj_params, 1e-7), + "Failed to set tolerance for syevj"); + CUSOLVER_CHECK_ERROR(cusolverDnXsyevjSetMaxSweeps(cusolverData.syevj_params, 15), + "Failed to set max sweeps for syevj"); + 
CUSOLVER_CHECK_ERROR(cusolverDnXsyevjSetSortEig(cusolverData.syevj_params, 1), + "Failed to set sorting of eigen values in syevj"); + CUSOLVER_CHECK_ERROR( + cusolverDnSsyevjBatched_bufferSize(cusolverData.cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER, 9, + NULL, 9, NULL, &(cusolverData.lwork), cusolverData.syevj_params, batchSize), + "Failed to calculate buffer size for syevj"); + cudaMalloc(reinterpret_cast(&(cusolverData.cusolverBuffer)), cusolverData.lwork * sizeof(float)); + cudaMalloc(reinterpret_cast(&(cusolverData.cusolverInfo)), batchSize * sizeof(int)); +} + +FindHomography::~FindHomography() +{ + cudaFree(bufferOffset.srcMean); + cudaFree(bufferOffset.dstMean); + cudaFree(bufferOffset.srcShiftSum); + cudaFree(bufferOffset.dstShiftSum); + cudaFree(bufferOffset.LtL); + cudaFree(bufferOffset.W); + cudaFree(bufferOffset.r); + cudaFree(bufferOffset.J); + cudaFree(bufferOffset.calc_buffer); + cusolverDnDestroySyevjInfo(cusolverData.syevj_params); + cusolverDnDestroy(cusolverData.cusolverH); + cudaFree(cusolverData.cusolverBuffer); + cudaFree(cusolverData.cusolverInfo); +} + +// Operator -------------------------------------------------------------------- + +// Tensor input variant +void FindHomography::operator()(cudaStream_t stream, const nvcv::Tensor &srcPoints, const nvcv::Tensor &dstPoints, + const nvcv::Tensor &models) const +{ + auto srcData = srcPoints.exportData(); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must be cuda-accessible, pitch-linear tensor"); + } + + auto dstData = dstPoints.exportData(); + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be cuda-accessible, pitch-linear tensor"); + } + + auto modelData = models.exportData(); + if (!modelData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must be cuda-accessible, pitch-linear tensor"); + } + + // validation of input data + if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "source and destination points must have rank 2"); + } + + if (!(srcData->shape(0) == dstData->shape(0) && srcData->shape(0) == modelData->shape(0))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model must have same batch size"); + } + + if (srcData->shape(1) != dstData->shape(1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be same length to return a valid model"); + } + + if (srcData->shape(1) < 4 || dstData->shape(1) < 4) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be >=4 to return a valid model"); + } + + if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3"); + } + + if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32 + && modelData->dtype() == nvcv::TYPE_F32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model tensors must have data type F32"); + } + + RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream); +} + +void FindHomography::operator()(cudaStream_t stream, const nvcv::TensorBatch &srcPoints, + const nvcv::TensorBatch &dstPoints, const nvcv::TensorBatch &models) const +{ + if 
(!(srcPoints.numTensors() == dstPoints.numTensors() && srcPoints.numTensors() == models.numTensors())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model tensors must have same batch size"); + } + + for (int b = 0; b < srcPoints.numTensors(); b++) + { + auto srcData = srcPoints[b].exportData(); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input src points must be cuda-accessible, pitch-linear tensor"); + } + + auto dstData = dstPoints[b].exportData(); + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input dst points must be cuda-accessible, pitch-linear tensor"); + } + + auto modelData = models[b].exportData(); + if (!modelData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "model must be cuda-accessible, pitch-linear tensor"); + } + + // validation of input data + if (!((srcData->shape(0) == dstData->shape(0)) && (srcData->shape(0) == modelData->shape(0)) + && (srcData->shape(0) == 1))) + { + throw nvcv::Exception( + nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invdividual samples (src, dst and model) in the batch must be tensors with batch size 1"); + } + + if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2))) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination tensors must have rank 2"); + } + + if (srcData->shape(1) != dstData->shape(1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be same length to return a valid model"); + } + + if (srcData->shape(1) < 4 || dstData->shape(1) < 4) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source and destination array length must be >=4 to return a valid model"); + } + + if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3"); + } + + if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32 + && modelData->dtype() == nvcv::TYPE_F32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "source, destination and model tensors must have data type F32"); + } + + RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream); + } +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpFindHomography.hpp b/src/cvcuda/priv/OpFindHomography.hpp new file mode 100644 index 00000000..c18d3ef4 --- /dev/null +++ b/src/cvcuda/priv/OpFindHomography.hpp @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file FindHomography.hpp + * + * @brief Defines the private C++ Class for the FindHomography operation. 
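+ *
+ * The class pre-allocates its per-sample scratch buffers (means, shift sums, LtL, eigenvalues, Jacobian and
+ * residual storage) and the cuSOLVER syevj handles in the constructor, so the batch size and number of points
+ * given at construction are expected to bound the tensors later passed to operator().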
+ */ + +#ifndef CVCUDA_PRIV__FIND_HOMOGRAPHY_HPP +#define CVCUDA_PRIV__FIND_HOMOGRAPHY_HPP +#include "IOperator.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + float2 *srcMean; + float2 *dstMean; + float2 *srcShiftSum; + float2 *dstShiftSum; + float *LtL; + float *W; + float *r; + float *J; + float *calc_buffer; +} BufferOffsets; + +typedef struct +{ + int *cusolverInfo; + float *cusolverBuffer; + cusolverDnHandle_t cusolverH; + syevjInfo_t syevj_params; + int lwork; +} cuSolver; + +namespace cvcuda::priv { + +class FindHomography final : public IOperator +{ +public: + explicit FindHomography(int batchSize, int numPoints); + ~FindHomography(); + void operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const nvcv::Tensor &models) const; + void operator()(cudaStream_t stream, const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst, + const nvcv::TensorBatch &models) const; + +private: + BufferOffsets bufferOffset; + cuSolver cusolverData; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV__FIND_HOMOGRAPHY_HPP diff --git a/src/cvcuda/priv/OpLabel.cu b/src/cvcuda/priv/OpLabel.cu new file mode 100644 index 00000000..8a1c5118 --- /dev/null +++ b/src/cvcuda/priv/OpLabel.cu @@ -0,0 +1,1751 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// MIT License + +// Copyright (c) 2018 - Daniel Peter Playne + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +/** + * @note The CUDA kernels implemented below are based on the paper: + * D. P. Playne and K. Hawick, + * "A New Algorithm for Parallel Connected-Component Labelling on GPUs," + * in IEEE Transactions on Parallel and Distributed Systems, + * vol. 29, no. 6, pp. 1217-1230, 1 June 2018. 
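+ *
+ * Following that paper, BlockLabel2D first labels each tile in shared memory (FindRoot/Reduction on a
+ * per-block label array), and the X/Y label-reduction kernels then merge equivalent labels across tile
+ * boundaries using atomicMin-based reductions on the global label image.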
+ */ + +#include "Assert.h" +#include "OpLabel.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +namespace { + +// CUDA kernels ---------------------------------------------------------------- + +template +__device__ DT FindRoot(DT *labels, DT label) +{ + DT next = labels[label]; + + while (label != next) + { + label = next; + next = labels[label]; + } + + return label; +} + +template +__device__ DT Reduction(DT *labels, DT label1, DT label2) +{ + DT next1 = (label1 != label2) ? labels[label1] : 0; + DT next2 = (label1 != label2) ? labels[label2] : 0; + + while ((label1 != label2) && (label1 != next1)) + { + label1 = next1; + next1 = labels[label1]; + } + + while ((label1 != label2) && (label2 != next2)) + { + label2 = next2; + next2 = labels[label2]; + } + + DT label3; + + while (label1 != label2) + { + if (label1 < label2) + { + label3 = label1; + label1 = label2; + label2 = label3; + } + + label3 = atomicMin(&labels[label1], label2); + label1 = (label1 == label3) ? label2 : label3; + } + + return label1; +} + +// -- 2D kernels -- + +template +__global__ void BlockLabel2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, cuda::Tensor1DWrap minThresh, + cuda::Tensor1DWrap maxThresh, int2 size) +{ + __shared__ DT labels[BW * BH]; + + int2 tc = cuda::StaticCast(cuda::DropCast<2>(threadIdx)); + int3 gc{(int)(blockIdx.x * BW) + tc.x, (int)(blockIdx.y * BH) + tc.y, (int)blockIdx.z}; + + bool nym1x, nyxm1, nym1xm1; + DT label1; + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + ST minThreshold = hasMinThresh ? minThresh[gc.z] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.z] : 0; + + if (gc.x < size.x && gc.y < size.y) + { + ST pyx = src[gc]; + ST pym1x = (tc.y > 0) ? *src.ptr(gc.z, gc.y - 1, gc.x) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pyx = pyx < minThreshold || pyx > maxThreshold ? 0 : 1; + pym1x = (tc.y > 0) ? (pym1x < minThreshold || pym1x > maxThreshold ? 0 : 1) : 0; + } + else if (hasMinThresh) + { + pyx = pyx < minThreshold ? 0 : 1; + pym1x = (tc.y > 0) ? (pym1x < minThreshold ? 0 : 1) : 0; + } + else if (hasMaxThresh) + { + pyx = pyx > maxThreshold ? 0 : 1; + pym1x = (tc.y > 0) ? (pym1x > maxThreshold ? 0 : 1) : 0; + } + + ST pyxm1 = __shfl_up_sync(__activemask(), pyx, 1); + ST pym1xm1 = __shfl_up_sync(__activemask(), pym1x, 1); + + nym1x = (tc.y > 0) ? (pyx == pym1x) : false; + nyxm1 = (tc.x > 0) ? (pyx == pyxm1) : false; + nym1xm1 = (tc.y > 0 && tc.x > 0) ? (pyx == pym1xm1) : false; + + label1 = (nyxm1) ? tc.y * BW + (tc.x - 1) : tc.y * BW + tc.x; + label1 = (nym1x) ? (tc.y - 1) * BW + tc.x : label1; + + labels[tc.y * BW + tc.x] = label1; + } + + __syncthreads(); + + if (gc.x < size.x && gc.y < size.y) + { + labels[tc.y * BW + tc.x] = FindRoot(labels, label1); + } + + __syncthreads(); + + if (gc.x < size.x && gc.y < size.y) + { + if (nym1x && nyxm1 && !nym1xm1) + { + DT label2 = labels[tc.y * BW + tc.x - 1]; + + label1 = Reduction(labels, label1, label2); + } + } + + __syncthreads(); + + if (gc.x < size.x && gc.y < size.y) + { + label1 = FindRoot(labels, label1); + + DT lx = label1 % BW; + DT ly = (label1 / BW) % BH; + + DT dstStrideH = dst.strides()[1] / sizeof(DT); + + dst[gc] = (blockIdx.y * BH + ly) * dstStrideH + blockIdx.x * BW + lx; + } +} + +template +__global__ void YLabelReduction2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int2 size) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = (blockIdx.y * blockDim.y + threadIdx.y) * blockDim.y + blockDim.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + ST minThreshold = hasMinThresh ? minThresh[gc.z] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.z] : 0; + + ST pyx = src[gc]; + ST pym1x = *src.ptr(gc.z, gc.y - 1, gc.x); + + if (hasMinThresh && hasMaxThresh) + { + pyx = pyx < minThreshold || pyx > maxThreshold ? 0 : 1; + pym1x = pym1x < minThreshold || pym1x > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pyx = pyx < minThreshold ? 0 : 1; + pym1x = pym1x < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pyx = pyx > maxThreshold ? 0 : 1; + pym1x = pym1x > maxThreshold ? 0 : 1; + } + + ST pyxm1 = __shfl_up_sync(0xffffffff, pyx, 1); + ST pym1xm1 = __shfl_up_sync(0xffffffff, pym1x, 1); + + if ((pyx == pym1x) && ((threadIdx.x == 0) || (pyx != pyxm1) || (pyx != pym1xm1))) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.z, gc.y - 1, gc.x); + + Reduction(dst.ptr(gc.z), label1, label2); + } +} + +template +__global__ void XLabelReduction2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int2 size) +{ + int3 gc; + gc.x = (blockIdx.y * blockDim.y + threadIdx.y) * blockDim.x + blockDim.x; + gc.y = blockIdx.x * blockDim.x + threadIdx.x; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + ST minThreshold = hasMinThresh ? minThresh[gc.z] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.z] : 0; + + ST pyx = src[gc]; + ST pyxm1 = *src.ptr(gc.z, gc.y, gc.x - 1); + + if (hasMinThresh && hasMaxThresh) + { + pyx = pyx < minThreshold || pyx > maxThreshold ? 0 : 1; + pyxm1 = pyxm1 < minThreshold || pyxm1 > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pyx = pyx < minThreshold ? 0 : 1; + pyxm1 = pyxm1 < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pyx = pyx > maxThreshold ? 0 : 1; + pyxm1 = pyxm1 > maxThreshold ? 0 : 1; + } + + bool thread_y = (gc.y % blockDim.y) == 0; + + ST pym1x = __shfl_up_sync(0xffffffff, pyx, 1); + ST pym1xm1 = __shfl_up_sync(0xffffffff, pyxm1, 1); + + if ((pyx == pyxm1) && (thread_y || (pyx != pym1x) || (pyx != pym1xm1))) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.z, gc.y, gc.x - 1); + + Reduction(dst.ptr(gc.z), label1, label2); + } +} + +template +__global__ void ResolveLabels2D(cuda::Tensor3DWrap
dst, int2 size) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + dst[gc] = FindRoot(dst.ptr(gc.z), dst[gc]); +} + +template +__global__ void ReplaceBgLabels2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, + cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap minThresh, + cuda::Tensor1DWrap maxThresh, int2 size) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + ST minThreshold = hasMinThresh ? minThresh[gc.z] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.z] : 0; + + ST pyx = src[gc]; + + if (hasMinThresh && hasMaxThresh) + { + pyx = pyx < minThreshold || pyx > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pyx = pyx < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pyx = pyx > maxThreshold ? 0 : 1; + } + + ST backgroundLabel = bgLabel[gc.z]; + + // If src has bg label, put it in dst; if dst has bg label, it means a wrong label was assigned to a region, + // replace its label by a label never assigned, the stride zero meaning one-element-after-the-end stride + + if (pyx == backgroundLabel) + { + dst[gc] = backgroundLabel; + } + else if (dst[gc] == (DT)backgroundLabel) + { + dst[gc] = dst.strides()[0] / sizeof(DT); + } +} + +template +__global__ void CountLabels2D(cuda::Tensor1DWrap
<DT> count, cuda::Tensor3DWrap<DT>
 stats, cuda::Tensor3DWrap<DT>
dst, + cuda::Tensor1DWrap bgLabel, int2 size, int maxCapacity) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + ST backgroundLabel = hasBgLabel ? bgLabel[gc.z] : 0; + + DT label = dst[gc]; + + if (hasBgLabel && label == (DT)backgroundLabel) + { + return; // do not count background labels + } + + DT posLabel = gc.y * dst.strides()[1] / sizeof(DT) + gc.x; + DT endLabel = dst.strides()[0] / sizeof(DT); + + DT regionIdx; + bool counted = false; + + if (hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) + { + // This is a special region marked with one-element-after-the-end label, count it + regionIdx = atomicAdd(count.ptr(gc.z), 1); + counted = true; + } + else if (label == posLabel) + { + // This is the first element of a regular region, count it + regionIdx = atomicAdd(count.ptr(gc.z), 1); + counted = true; + } + + // If statistics should be computed and the region index is inside the allowed storage (the M maximum + // capacity in stats tensor), replace the output label by the region index and store initial statistics + + if (counted && stats.ptr(0) != nullptr && regionIdx < maxCapacity) + { + // TODO: improve the mark of output label as region index with 1 in the 1st bit + dst[gc] = regionIdx | (DT)(1 << 31); + + *stats.ptr(gc.z, (int)regionIdx, 0) = label; + *stats.ptr(gc.z, (int)regionIdx, 1) = (DT)gc.x; + *stats.ptr(gc.z, (int)regionIdx, 2) = (DT)gc.y; + *stats.ptr(gc.z, (int)regionIdx, 3) = 1; + *stats.ptr(gc.z, (int)regionIdx, 4) = 1; + *stats.ptr(gc.z, (int)regionIdx, 5) = 1; + } +} + +template +__global__ void ComputeStats2D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor3DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, + int2 size, bool relabel) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + ST backgroundLabel = hasBgLabel ? bgLabel[gc.z] : 0; + DT endLabel = dst.strides()[0] / sizeof(DT); + DT label = dst[gc]; + + if (hasBgLabel && label == (DT)backgroundLabel) + { + return; // do not compute statistics for background labels + } + if (label & (DT)(1 << 31)) + { + return; // label is marked as region index, its statistics is already computed + } + if (hasBgLabel && label == endLabel) + { + // This is a special region marked with one-element-after-the-end label, its label was the backgroundLabel + label = backgroundLabel; + } + + DT regionIdx = dst.ptr(gc.z)[label]; + + if (regionIdx & (DT)(1 << 31)) + { + regionIdx = regionIdx & (DT) ~(1 << 31); + + if (relabel) + { + if (hasBgLabel && regionIdx >= (DT)backgroundLabel) + { + dst[gc] = regionIdx + 1; // skip one region index equals to background label when relabeling + } + else + { + dst[gc] = regionIdx; + } + } + + int2 cornerPos{(int)*stats.ptr(gc.z, (int)regionIdx, 1), (int)*stats.ptr(gc.z, (int)regionIdx, 2)}; + + int2 bboxArea = cuda::abs(cornerPos - cuda::DropCast<2>(gc)) + 1; + + atomicMax(stats.ptr(gc.z, (int)regionIdx, 3), (DT)bboxArea.x); + atomicMax(stats.ptr(gc.z, (int)regionIdx, 4), (DT)bboxArea.y); + atomicAdd(stats.ptr(gc.z, (int)regionIdx, 5), 1); + } +} + +template +__global__ void RemoveIslands2D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor3DWrap<DT>
 dst, + cuda::Tensor1DWrap<ST> bgLabel, cuda::Tensor1DWrap<DT>
minSize, int2 size, bool relabel) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + DT endLabel = dst.strides()[0] / sizeof(DT); + + DT label = dst[gc]; + + ST backgroundLabel = bgLabel[gc.z]; + + if (label == (DT)backgroundLabel) + { + return; + } + if (label == endLabel) + { + // This is a special region marked with one-element-after-the-end label, its label was the backgroundLabel + label = backgroundLabel; + } + + DT regionIdx = 0; + + if (!(label & (DT)(1 << 31))) + { + if (relabel) + { + if (label >= (DT)backgroundLabel + 1) + { + regionIdx = label - 1; // go back one region index to account for background label + } + else + { + regionIdx = label; + } + } + else + { + regionIdx = dst.ptr(gc.z)[label]; + + if (regionIdx & (DT)(1 << 31)) + { + regionIdx = regionIdx & (DT) ~(1 << 31); + } + else + { + return; // invalid region index + } + } + } + else + { + regionIdx = label & (DT) ~(1 << 31); + } + + DT regionSize = *stats.ptr(gc.z, (int)regionIdx, 5); + + // If region size is less than minimum size, it is an island and should be removed, i.e. set to background label + if (regionSize < minSize[gc.z]) + { + dst[gc] = backgroundLabel; + } +} + +template +__global__ void Relabel2D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor3DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, + int2 size, bool relabel) +{ + int3 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z; + + if (gc.x >= size.x || gc.y >= size.y) + { + return; + } + + DT label = dst[gc]; + + if (label & (DT)(1 << 31)) + { + // Label is marked as region index, relabel it back to proper label + DT regionIdx = label & (DT) ~(1 << 31); + + if (relabel) + { + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + ST backgroundLabel = hasBgLabel ? bgLabel[gc.z] : 0; + + if (hasBgLabel && regionIdx >= (DT)backgroundLabel) + { + dst[gc] = regionIdx + 1; // skip one region index equals to background label when relabeling + } + else + { + dst[gc] = regionIdx; + } + } + else + { + dst[gc] = *stats.ptr(gc.z, (int)regionIdx, 0); + } + } +} + +// -- 3D kernels -- + +template +__global__ void BlockLabel3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, cuda::Tensor1DWrap minThresh, + cuda::Tensor1DWrap maxThresh, int4 shape) +{ + __shared__ DT labels[BW * BH * BD]; + + int3 tc = cuda::StaticCast(threadIdx); + int4 gc{(int)blockIdx.x * BW + tc.x, (int)blockIdx.y * BH + tc.y, (int)blockIdx.z * BD + tc.z, 0}; + + bool nzm1yx, nzym1x, nzyxm1, nzym1xm1, nzm1yxm1, nzm1ym1x; + DT label; + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + if (gc.x < shape.x && gc.y < shape.y && gc.z < shape.z) + { + ST pzyx = src[gc]; + ST pzym1x = (tc.y > 0) ? *src.ptr(gc.w, gc.z, gc.y - 1, gc.x) : 0; + ST pzm1yx = (tc.z > 0) ? *src.ptr(gc.w, gc.z - 1, gc.y, gc.x) : 0; + ST pzm1ym1x = (tc.z > 0 && tc.y > 0) ? *src.ptr(gc.w, gc.z - 1, gc.y - 1, gc.x) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + pzym1x = (tc.y > 0) ? (pzym1x < minThreshold || pzym1x > maxThreshold ? 0 : 1) : 0; + pzm1yx = (tc.z > 0) ? (pzm1yx < minThreshold || pzm1yx > maxThreshold ? 0 : 1) : 0; + pzm1ym1x = (tc.z > 0 && tc.y > 0) ? (pzm1ym1x < minThreshold || pzm1ym1x > maxThreshold ? 0 : 1) : 0; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + pzym1x = (tc.y > 0) ? (pzym1x < minThreshold ? 0 : 1) : 0; + pzm1yx = (tc.z > 0) ? (pzm1yx < minThreshold ? 0 : 1) : 0; + pzm1ym1x = (tc.z > 0 && tc.y > 0) ? (pzm1ym1x < minThreshold ? 0 : 1) : 0; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + pzym1x = (tc.y > 0) ? (pzym1x > maxThreshold ? 0 : 1) : 0; + pzm1yx = (tc.z > 0) ? (pzm1yx > maxThreshold ? 0 : 1) : 0; + pzm1ym1x = (tc.z > 0 && tc.y > 0) ? (pzm1ym1x > maxThreshold ? 0 : 1) : 0; + } + + ST pzyxm1 = __shfl_up_sync(__activemask(), pzyx, 1); + ST pzym1xm1 = __shfl_up_sync(__activemask(), pzym1x, 1); + ST pzm1yxm1 = __shfl_up_sync(__activemask(), pzm1yx, 1); + + nzm1yx = (tc.z > 0) && (pzyx == pzm1yx); + nzym1x = (tc.y > 0) && (pzyx == pzym1x); + nzyxm1 = (tc.x > 0) && (pzyx == pzyxm1); + + nzym1xm1 = ((tc.y > 0) && (tc.x > 0) && (pzyx == pzym1xm1)); + nzm1yxm1 = ((tc.z > 0) && (tc.x > 0) && (pzyx == pzm1yxm1)); + nzm1ym1x = ((tc.z > 0) && (tc.y > 0) && (pzyx == pzm1ym1x)); + + label = (nzyxm1) ? (tc.z * BW * BH + tc.y * BW + (tc.x - 1)) : (tc.z * BW * BH + tc.y * BW + tc.x); + label = (nzym1x) ? (tc.z * BW * BH + (tc.y - 1) * BW + tc.x) : label; + label = (nzm1yx) ? 
((tc.z - 1) * BW * BH + tc.y * BW + tc.x) : label; + + labels[tc.z * BW * BH + tc.y * BW + tc.x] = label; + } + + __syncthreads(); + + if (gc.x < shape.x && gc.y < shape.y && gc.z < shape.z) + { + labels[tc.z * BW * BH + tc.y * BW + tc.x] = FindRoot(labels, label); + } + + __syncthreads(); + + if (gc.x < shape.x && gc.y < shape.y && gc.z < shape.z) + { + if (nzym1x && nzm1yx && !nzm1ym1x) + { + Reduction(labels, label, labels[tc.z * BW * BH + (tc.y - 1) * BW + tc.x]); + } + + if (nzyxm1 && ((nzm1yx && !nzm1yxm1) || (nzym1x && !nzym1xm1))) + { + label = Reduction(labels, label, labels[tc.z * BW * BH + tc.y * BW + tc.x - 1]); + } + } + + __syncthreads(); + + if (gc.x < shape.x && gc.y < shape.y && gc.z < shape.z) + { + label = labels[tc.z * BW * BH + tc.y * BW + tc.x]; + + label = FindRoot(labels, label); + + DT lx = label % BW; + DT ly = (label / BW) % BH; + DT lz = (label / (BW * BH)) % BD; + + DT dstStrideD = dst.strides()[1] / sizeof(DT); + DT dstStrideH = dst.strides()[2] / sizeof(DT); + + dst[gc] = (blockIdx.z * BD + lz) * dstStrideD + (blockIdx.y * BH + ly) * dstStrideH + blockIdx.x * BW + lx; + } + } +} + +template +__global__ void ZLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +{ + int4 gc; + gc.x = ((blockIdx.x * blockDim.x) + threadIdx.x); + gc.y = ((blockIdx.y * blockDim.y) + threadIdx.y); + gc.z = ((blockIdx.z * blockDim.z) + threadIdx.z) * blockDim.z + blockDim.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + bool thread_x = (gc.x % blockDim.x) == 0; + bool thread_y = (gc.y % blockDim.y) == 0; + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + ST pzyx = src[gc]; + ST pzm1yx = *src.ptr(gc.w, gc.z - 1, gc.y, gc.x); + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + pzm1yx = pzm1yx < minThreshold || pzm1yx > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + pzm1yx = pzm1yx < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + pzm1yx = pzm1yx > maxThreshold ? 0 : 1; + } + + ST pzyxm1 = __shfl_up_sync(0xffffffff, pzyx, 1); + ST pzm1yxm1 = __shfl_up_sync(0xffffffff, pzm1yx, 1); + + if (pzyx == pzm1yx) + { + ST pzym1x = (!thread_y) ? *src.ptr(gc.w, gc.z, gc.y - 1, gc.x) : 0; + ST pzm1ym1x = (!thread_y) ? *src.ptr(gc.w, gc.z - 1, gc.y - 1, gc.x) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pzym1x = pzym1x < minThreshold || pzym1x > maxThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x < minThreshold || pzm1ym1x > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzym1x = pzym1x < minThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzym1x = pzym1x > maxThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x > maxThreshold ? 0 : 1; + } + + bool nzym1x = (!thread_y) ? (pzyx == pzym1x) : false; + bool nzm1ym1x = (!thread_y) ? (pzyx == pzm1ym1x) : false; + + if ((thread_x || (pzyx != pzyxm1) || (pzyx != pzm1yxm1)) && (!nzym1x || !nzm1ym1x)) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.w, gc.z - 1, gc.y, gc.x); + + Reduction(dst.ptr(gc.w), label1, label2); + } + } + } +} + +template +__global__ void YLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +{ + int4 gc; + gc.x = ((blockIdx.x * blockDim.x) + threadIdx.x); + gc.y = ((blockIdx.z * blockDim.z) + threadIdx.z) * blockDim.y + blockDim.y; + gc.z = ((blockIdx.y * blockDim.y) + threadIdx.y); + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + bool thread_x = (gc.x % blockDim.x) == 0; + bool thread_z = (gc.z % blockDim.z) == 0; + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + ST pzyx = src[gc]; + ST pzym1x = *src.ptr(gc.w, gc.z, gc.y - 1, gc.x); + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + pzym1x = pzym1x < minThreshold || pzym1x > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + pzym1x = pzym1x < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + pzym1x = pzym1x > maxThreshold ? 0 : 1; + } + + ST pzyxm1 = __shfl_up_sync(0xffffffff, pzyx, 1); + ST pzym1xm1 = __shfl_up_sync(0xffffffff, pzym1x, 1); + + if (pzyx == pzym1x) + { + ST pzm1yx = (!thread_z) ? *src.ptr(gc.w, gc.z - 1, gc.y, gc.x) : 0; + ST pzm1ym1x = (!thread_z) ? *src.ptr(gc.w, gc.z - 1, gc.y - 1, gc.x) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pzm1yx = pzm1yx < minThreshold || pzm1yx > maxThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x < minThreshold || pzm1ym1x > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzm1yx = pzm1yx < minThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzm1yx = pzm1yx > maxThreshold ? 0 : 1; + pzm1ym1x = pzm1ym1x > maxThreshold ? 0 : 1; + } + + bool nzm1yx = (!thread_z) ? (pzyx == pzm1yx) : false; + bool nzm1ym1x = (!thread_z) ? (pzyx == pzm1ym1x) : false; + + if ((!nzm1yx || !nzm1ym1x) && (thread_x || (pzyx != pzyxm1) || (pzyx != pzym1xm1))) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.w, gc.z, gc.y - 1, gc.x); + + Reduction(dst.ptr(gc.w), label1, label2); + } + } + } +} + +template +__global__ void XLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, + cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +{ + int4 gc; + gc.x = ((blockIdx.z * blockDim.z) + threadIdx.z) * blockDim.x + blockDim.x; + gc.y = ((blockIdx.y * blockDim.y) + threadIdx.y); + gc.z = ((blockIdx.x * blockDim.x) + threadIdx.x); + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + bool thread_y = (gc.y % blockDim.y) == 0; + bool thread_z = (gc.z % blockDim.z) == 0; + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + ST pzyx = src[gc]; + ST pzyxm1 = *src.ptr(gc.w, gc.z, gc.y, gc.x - 1); + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + pzyxm1 = pzyxm1 < minThreshold || pzyxm1 > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + pzyxm1 = pzyxm1 < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + pzyxm1 = pzyxm1 > maxThreshold ? 0 : 1; + } + + ST pzm1yx = __shfl_up_sync(0xffffffff, pzyx, 1); + ST pzm1yxm1 = __shfl_up_sync(0xffffffff, pzyxm1, 1); + + if (pzyx == pzyxm1) + { + ST pzym1x = (!thread_y) ? *src.ptr(gc.w, gc.z, gc.y - 1, gc.x) : 0; + ST pzym1xm1 = (!thread_y) ? *src.ptr(gc.w, gc.z, gc.y - 1, gc.x - 1) : 0; + + if (hasMinThresh && hasMaxThresh) + { + pzym1x = pzym1x < minThreshold || pzym1x > maxThreshold ? 0 : 1; + pzym1xm1 = pzym1xm1 < minThreshold || pzym1xm1 > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzym1x = pzym1x < minThreshold ? 0 : 1; + pzym1xm1 = pzym1xm1 < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzym1x = pzym1x > maxThreshold ? 0 : 1; + pzym1xm1 = pzym1xm1 > maxThreshold ? 0 : 1; + } + + bool nzym1x = (!thread_y) ? (pzyx == pzym1x) : false; + bool nzym1xm1 = (!thread_y) ? (pzyx == pzym1xm1) : false; + + if ((thread_z || (pzyx != pzm1yx) || (pzyx != pzm1yxm1)) && (!nzym1x || !nzym1xm1)) + { + DT label1 = dst[gc]; + DT label2 = *dst.ptr(gc.w, gc.z, gc.y, gc.x - 1); + + Reduction(dst.ptr(gc.w), label1, label2); + } + } + } +} + +template +__global__ void ResolveLabels3D(cuda::Tensor4DWrap
dst, int4 shape) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + dst[gc] = FindRoot(dst.ptr(gc.w), dst[gc]); + } +} + +template +__global__ void ReplaceBgLabels3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, + cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap minThresh, + cuda::Tensor1DWrap maxThresh, int4 shape) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasMinThresh = (minThresh.ptr(0) != nullptr); + bool hasMaxThresh = (maxThresh.ptr(0) != nullptr); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST minThreshold = hasMinThresh ? minThresh[gc.w] : 0; + ST maxThreshold = hasMaxThresh ? maxThresh[gc.w] : 0; + + ST pzyx = src[gc]; + + if (hasMinThresh && hasMaxThresh) + { + pzyx = pzyx < minThreshold || pzyx > maxThreshold ? 0 : 1; + } + else if (hasMinThresh) + { + pzyx = pzyx < minThreshold ? 0 : 1; + } + else if (hasMaxThresh) + { + pzyx = pzyx > maxThreshold ? 0 : 1; + } + + DT backgroundLabel = bgLabel[gc.w]; + + // If src has bg label, put it in dst; if dst has bg label, it means a wrong label was assigned to a + // region, replace its label by a label never assigned, i.e. one-element-after-the-end stride + + if (pzyx == backgroundLabel) + { + dst[gc] = backgroundLabel; + } + else if (dst[gc] == (DT)backgroundLabel) + { + dst[gc] = dst.strides()[0] / sizeof(DT); + } + } +} + +template +__global__ void CountLabels3D(cuda::Tensor1DWrap
<DT> count, cuda::Tensor3DWrap<DT>
 stats, cuda::Tensor4DWrap<DT>
dst, + cuda::Tensor1DWrap bgLabel, int4 shape, int maxCapacity) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + DT posLabel = gc.z * dst.strides()[1] / sizeof(DT) + gc.y * dst.strides()[2] / sizeof(DT) + gc.x; + DT endLabel = dst.strides()[0] / sizeof(DT); + + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST backgroundLabel = hasBgLabel ? bgLabel[gc.w] : 0; + + DT label = dst[gc]; + + if (hasBgLabel && label == (DT)backgroundLabel) + { + continue; // do not count background labels + } + + DT regionIdx; + bool counted = false; + + if (hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) + { + // This is a special region marked with one-element-after-the-end label, count it + regionIdx = atomicAdd(count.ptr(gc.w), 1); + counted = true; + } + else if (label == posLabel) + { + // This is the first element of a regular region, count it + regionIdx = atomicAdd(count.ptr(gc.w), 1); + counted = true; + } + + // If statistics should be computed and the region index is inside the allowed storage (the M maximum + // capacity in stats tensor), replace the output label by the region index and store initial statistics + + if (counted && stats.ptr(0) != nullptr && regionIdx < maxCapacity) + { + // TODO: improve the mark of output label as region index with 1 in the 1st bit + dst[gc] = regionIdx | (DT)(1 << 31); + + *stats.ptr(gc.w, (int)regionIdx, 0) = label; + *stats.ptr(gc.w, (int)regionIdx, 1) = (DT)gc.x; + *stats.ptr(gc.w, (int)regionIdx, 2) = (DT)gc.y; + *stats.ptr(gc.w, (int)regionIdx, 3) = (DT)gc.z; + *stats.ptr(gc.w, (int)regionIdx, 4) = 1; + *stats.ptr(gc.w, (int)regionIdx, 5) = 1; + *stats.ptr(gc.w, (int)regionIdx, 6) = 1; + *stats.ptr(gc.w, (int)regionIdx, 7) = 1; + } + } +} + +template +__global__ void ComputeStats3D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor4DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, + int4 shape, bool relabel) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + DT endLabel = dst.strides()[0] / sizeof(DT); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + ST backgroundLabel = hasBgLabel ? bgLabel[gc.w] : 0; + + DT label = dst[gc]; + + if (hasBgLabel && label == (DT)backgroundLabel) + { + continue; // do not compute statistics for background labels + } + if (label & (DT)(1 << 31)) + { + continue; // label is marked as region index, its statistics is already computed + } + if (hasBgLabel && label == endLabel) + { + // This is a special region marked with one-element-after-the-end label, its label was the bg label + label = backgroundLabel; + } + + DT regionIdx = dst.ptr(gc.w)[label]; + + if (regionIdx & (DT)(1 << 31)) + { + regionIdx = regionIdx & (DT) ~(1 << 31); + + if (relabel) + { + if (hasBgLabel && regionIdx >= (DT)backgroundLabel) + { + dst[gc] = regionIdx + 1; // skip one region index equals to background label when relabeling + } + else + { + dst[gc] = regionIdx; + } + } + + int3 cornerPos{(int)*stats.ptr(gc.w, (int)regionIdx, 1), (int)*stats.ptr(gc.w, (int)regionIdx, 2), + (int)*stats.ptr(gc.w, (int)regionIdx, 3)}; + + int3 bboxArea = cuda::abs(cornerPos - cuda::DropCast<3>(gc)) + 1; + + atomicMax(stats.ptr(gc.w, (int)regionIdx, 4), (DT)bboxArea.x); + atomicMax(stats.ptr(gc.w, (int)regionIdx, 5), (DT)bboxArea.y); + atomicMax(stats.ptr(gc.w, (int)regionIdx, 6), (DT)bboxArea.z); + atomicAdd(stats.ptr(gc.w, (int)regionIdx, 7), 1); + } + } +} + +template +__global__ void RemoveIslands3D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor4DWrap<DT>
 dst, + cuda::Tensor1DWrap<ST> bgLabel, cuda::Tensor1DWrap<DT>
minSize, int4 shape, + bool relabel) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + DT endLabel = dst.strides()[0] / sizeof(DT); + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + DT label = dst[gc]; + + ST backgroundLabel = bgLabel[gc.w]; + + if (label == (DT)backgroundLabel) + { + continue; + } + if (label == endLabel) + { + // This is a special region marked with one-element-after-the-end label, its label was the backgroundLabel + label = backgroundLabel; + } + + DT regionIdx = 0; + + if (!(label & (DT)(1 << 31))) + { + if (relabel) + { + if (label >= (DT)backgroundLabel + 1) + { + regionIdx = label - 1; // go back one region index to account for background label + } + else + { + regionIdx = label; + } + } + else + { + regionIdx = dst.ptr(gc.w)[label]; + + if (regionIdx & (DT)(1 << 31)) + { + regionIdx = regionIdx & (DT) ~(1 << 31); + } + else + { + return; // invalid region index + } + } + } + else + { + regionIdx = label & (DT) ~(1 << 31); + } + + DT regionSize = *stats.ptr(gc.w, (int)regionIdx, 7); + + // If region size is less than minimum size, it is an island and should be removed, i.e. set to background label + if (regionSize < minSize[gc.w]) + { + dst[gc] = backgroundLabel; + } + } +} + +template +__global__ void Relabel3D(cuda::Tensor3DWrap
<DT> stats, cuda::Tensor4DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, + int4 shape, bool relabel) +{ + int4 gc; + gc.x = blockIdx.x * blockDim.x + threadIdx.x; + gc.y = blockIdx.y * blockDim.y + threadIdx.y; + gc.z = blockIdx.z * blockDim.z + threadIdx.z; + + if (gc.x >= shape.x || gc.y >= shape.y || gc.z >= shape.z) + { + return; + } + + for (gc.w = 0; gc.w < shape.w; gc.w++) + { + DT label = dst[gc]; + + if (label & (DT)(1 << 31)) + { + // Label is marked as region index, relabel it back to proper label + DT regionIdx = label & (DT) ~(1 << 31); + + if (relabel) + { + bool hasBgLabel = (bgLabel.ptr(0) != nullptr); + ST backgroundLabel = hasBgLabel ? bgLabel[gc.w] : 0; + + if (hasBgLabel && regionIdx >= (DT)backgroundLabel) + { + dst[gc] = regionIdx + 1; // skip one region index equals to background label when relabeling + } + else + { + dst[gc] = regionIdx; + } + } + else + { + dst[gc] = *stats.ptr(gc.w, (int)regionIdx, 0); + } + } + } +} + +// Run functions --------------------------------------------------------------- + +template +inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, const int4 &shapeWHDN, + const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, + int numDim, bool relabel) +{ + constexpr int BW = 32, BH = 4, BD = 2; // block width, height and depth + + int4 idsNDHW{srcData.layout().find('N'), srcData.layout().find('D'), srcData.layout().find('H'), + srcData.layout().find('W')}; + + NVCV_ASSERT(srcData.stride(idsNDHW.w) == sizeof(SrcT)); + NVCV_ASSERT(dstData.stride(idsNDHW.w) == sizeof(DstT)); + + cuda::Tensor1DWrap bgLabelWrap, minThreshWrap, maxThreshWrap; + cuda::Tensor1DWrap minSizeWrap, countWrap; + cuda::Tensor3DWrap statsWrap; + + int maxCapacity = 0; + +#define CVCUDA_LABEL_WRAP(TENSOR, WRAPPER, TENSORWRAP) \ + if (TENSOR) \ + { \ + auto data = TENSOR.exportData(); \ + if (!data) \ + { \ + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, #TENSOR " tensor must be cuda-accessible"); \ + } \ + TENSORWRAP = WRAPPER(data->basePtr()); \ + } + + CVCUDA_LABEL_WRAP(bgLabel, cuda::Tensor1DWrap, bgLabelWrap); + CVCUDA_LABEL_WRAP(minThresh, cuda::Tensor1DWrap, minThreshWrap); + CVCUDA_LABEL_WRAP(maxThresh, cuda::Tensor1DWrap, maxThreshWrap); + CVCUDA_LABEL_WRAP(minSize, cuda::Tensor1DWrap, minSizeWrap); + +#undef CVCUDA_LABEL_WRAP + + if (count) + { + auto data = count.exportData(); + if (!data) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "count tensor must be cuda-accessible"); + } + + countWrap = cuda::Tensor1DWrap(data->basePtr()); + + NVCV_CHECK_THROW(cudaMemsetAsync(data->basePtr(), 0, sizeof(DstT) * shapeWHDN.w, stream)); + } + if (stats) + { + auto data = stats.exportData(); + if (!data) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "stats tensor must be cuda-accessible"); + } + + statsWrap = cuda::Tensor3DWrap(data->basePtr(), (int)data->stride(0), (int)data->stride(1)); + + maxCapacity = data->shape(1); + } + + if (numDim == 2) + { + int2 sizeWH{shapeWHDN.x, shapeWHDN.y}; + int2 srcStridesNH{0, (int)srcData.stride(idsNDHW.z)}; + int2 dstStridesNH{0, (int)dstData.stride(idsNDHW.z)}; + + srcStridesNH.x = idsNDHW.x == -1 ? srcStridesNH.y * shapeWHDN.y : (int)srcData.stride(idsNDHW.x); + dstStridesNH.x = idsNDHW.x == -1 ? 
dstStridesNH.y * shapeWHDN.y : (int)dstData.stride(idsNDHW.x); + + dim3 larThreads(BW, BH, 1); + dim3 labBlocks(util::DivUp(sizeWH.x, BW), util::DivUp(sizeWH.y, BH), shapeWHDN.w); + dim3 redBlocksX(util::DivUp(sizeWH.y, BW), util::DivUp((int)labBlocks.x, BH), shapeWHDN.w); + dim3 redBlocksY(util::DivUp(sizeWH.x, BW), util::DivUp((int)labBlocks.y, BH), shapeWHDN.w); + + cuda::Tensor3DWrap srcWrap(srcData.basePtr(), srcStridesNH.x, srcStridesNH.y); + cuda::Tensor3DWrap dstWrap(dstData.basePtr(), dstStridesNH.x, dstStridesNH.y); + + BlockLabel2D + <<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, sizeWH); + + YLabelReduction2D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + sizeWH); + + XLabelReduction2D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + sizeWH); + + ResolveLabels2D<<>>(dstWrap, sizeWH); + + if (bgLabel) + { + ReplaceBgLabels2D<<>>(dstWrap, srcWrap, bgLabelWrap, minThreshWrap, + maxThreshWrap, sizeWH); + } + if (count) + { + CountLabels2D<<>>(countWrap, statsWrap, dstWrap, bgLabelWrap, sizeWH, + maxCapacity); + + if (stats) + { + ComputeStats2D<<>>(statsWrap, dstWrap, bgLabelWrap, sizeWH, relabel); + + if (minSize) + { + RemoveIslands2D<<>>(statsWrap, dstWrap, bgLabelWrap, minSizeWrap, + sizeWH, relabel); + } + + Relabel2D<<>>(statsWrap, dstWrap, bgLabelWrap, sizeWH, relabel); + } + } + } + else + { + int3 srcStridesNDH{0, (int)srcData.stride(idsNDHW.y), (int)srcData.stride(idsNDHW.z)}; + int3 dstStridesNDH{0, (int)dstData.stride(idsNDHW.y), (int)dstData.stride(idsNDHW.z)}; + + srcStridesNDH.x = idsNDHW.x == -1 ? srcStridesNDH.y * shapeWHDN.z : (int)srcData.stride(idsNDHW.x); + dstStridesNDH.x = idsNDHW.x == -1 ? dstStridesNDH.y * shapeWHDN.z : (int)dstData.stride(idsNDHW.x); + + dim3 larThreads(BW, BH, BD); + dim3 labBlocks(util::DivUp(shapeWHDN.x, BW), util::DivUp(shapeWHDN.y, BH), util::DivUp(shapeWHDN.z, BD)); + dim3 redBlocksX(util::DivUp(shapeWHDN.z, BW), util::DivUp(shapeWHDN.y, BH), util::DivUp((int)labBlocks.x, BD)); + dim3 redBlocksY(util::DivUp(shapeWHDN.x, BW), util::DivUp(shapeWHDN.z, BH), util::DivUp((int)labBlocks.y, BD)); + dim3 redBlocksZ(util::DivUp(shapeWHDN.x, BW), util::DivUp(shapeWHDN.y, BH), util::DivUp((int)labBlocks.z, BD)); + + cuda::Tensor4DWrap srcWrap(srcData.basePtr(), srcStridesNDH.x, srcStridesNDH.y, srcStridesNDH.z); + cuda::Tensor4DWrap dstWrap(dstData.basePtr(), dstStridesNDH.x, dstStridesNDH.y, dstStridesNDH.z); + + BlockLabel3D + <<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, shapeWHDN); + + ZLabelReduction3D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + shapeWHDN); + + YLabelReduction3D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + shapeWHDN); + + XLabelReduction3D<<>>(dstWrap, srcWrap, minThreshWrap, maxThreshWrap, + shapeWHDN); + + ResolveLabels3D<<>>(dstWrap, shapeWHDN); + + if (bgLabel) + { + ReplaceBgLabels3D<<>>(dstWrap, srcWrap, bgLabelWrap, minThreshWrap, + maxThreshWrap, shapeWHDN); + } + if (count) + { + CountLabels3D<<>>(countWrap, statsWrap, dstWrap, bgLabelWrap, shapeWHDN, + maxCapacity); + + if (stats) + { + ComputeStats3D<<>>(statsWrap, dstWrap, bgLabelWrap, shapeWHDN, + relabel); + + if (minSize) + { + RemoveIslands3D<<>>(statsWrap, dstWrap, bgLabelWrap, minSizeWrap, + shapeWHDN, relabel); + } + + Relabel3D<<>>(statsWrap, dstWrap, bgLabelWrap, shapeWHDN, relabel); + } + } + } +} + +inline void RunLabel(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, const int4 &srcShape, nvcv::DataType srcDataType, + const nvcv::Tensor 
&bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, int numDim, + bool relabel) +{ + switch (srcDataType) + { +#define CVCUDA_LABEL_CASE(DT, T) \ + case nvcv::TYPE_##DT: \ + RunLabelForType(stream, srcData, dstData, srcShape, bgLabel, minThresh, maxThresh, minSize, count, stats, \ + numDim, relabel); \ + break + + CVCUDA_LABEL_CASE(U8, uint8_t); + CVCUDA_LABEL_CASE(U16, uint16_t); + CVCUDA_LABEL_CASE(U32, uint32_t); + CVCUDA_LABEL_CASE(S8, int8_t); + CVCUDA_LABEL_CASE(S16, int16_t); + CVCUDA_LABEL_CASE(S32, int32_t); + +#undef CVCUDA_LABEL_CASE + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid input data type"); + } +} + +} // anonymous namespace + +namespace cvcuda::priv { + +// Constructor ----------------------------------------------------------------- + +Label::Label() {} + +// Tensor operator ------------------------------------------------------------- + +void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, + const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, + const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats, + NVCVConnectivityType connectivity, NVCVLabelType assignLabels) const +{ + if (!(in.shape().layout() == nvcv::TENSOR_HW || in.shape().layout() == nvcv::TENSOR_HWC + || in.shape().layout() == nvcv::TENSOR_NHW || in.shape().layout() == nvcv::TENSOR_NHWC + || in.shape().layout() == nvcv::TENSOR_DHW || in.shape().layout() == nvcv::TENSOR_DHWC + || in.shape().layout() == nvcv::TENSOR_NDHW || in.shape().layout() == nvcv::TENSOR_NDHWC)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have [N][D]HW[C] layout"); + } + + // We expect input and output shape to be the same as TensorShape contains TensorLayout + + if (!(in.shape() == out.shape())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input and output tensors must have the same shape and layout"); + } + if (!(out.dtype() == nvcv::TYPE_U32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output tensor data type must be U32"); + } + + auto inData = in.exportData(); + if (!inData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must be cuda-accessible"); + } + + auto outData = out.exportData(); + if (!outData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output tensor must be cuda-accessible"); + } + + if (outData->stride(0) >= cuda::TypeTraits::max + || (uint32_t)outData->stride(0) / (uint32_t)sizeof(uint32_t) >= (uint32_t)(1 << 31)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big input and output tensors"); + } + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + if (!inAccess) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have strided access"); + } + if (!(inAccess->numChannels() == 1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have a single channel"); + } + if (!(inAccess->numPlanes() == 1)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have a single plane"); + } + if (inAccess->numSamples() > cuda::TypeTraits::max) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Too big number of samples %ld, must be smaller than or equal to %d", + inAccess->numSamples(), 
cuda::TypeTraits::max); + } + + int4 inShape{inAccess->numCols(), inAccess->numRows(), 1, (int)inAccess->numSamples()}; // WHDN shape + + int inDepthIdx = in.shape().layout().find('D'); + + int numDim = (connectivity == NVCV_CONNECTIVITY_4_2D || connectivity == NVCV_CONNECTIVITY_8_2D) ? 2 : 3; + + bool relabel = (assignLabels == NVCV_LABEL_SEQUENTIAL); + + if (inDepthIdx != -1) + { + if (numDim == 2) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Connectivity 2D not allowed in tensors with depth D dimension"); + } + + NVCV_ASSERT(inDepthIdx >= 0 && inDepthIdx < in.shape().rank()); + + int64_t inDepth = in.shape()[inDepthIdx]; + + if (inDepth > cuda::TypeTraits::max) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Too big depth %ld, must be smaller than or equal to %d", inDepth, + cuda::TypeTraits::max); + } + + inShape.z = static_cast(inDepth); + } + else + { + if (numDim == 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Connectivity 3D not allowed in tensors without depth D dimension"); + } + } + + if (bgLabel) + { + if (!((bgLabel.rank() == 1 && bgLabel.shape()[0] == inShape.w) + || (bgLabel.rank() == 2 && bgLabel.shape()[0] == inShape.w && bgLabel.shape()[1] == 1))) + { + std::ostringstream oss; + oss << bgLabel.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input bgLabel must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(bgLabel.dtype() == in.dtype())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input (%s) and bgLabel (%s) tensors must have the same data type", + nvcvDataTypeGetName(in.dtype()), nvcvDataTypeGetName(bgLabel.dtype())); + } + } + + if (minThresh) + { + if (!((minThresh.rank() == 1 && minThresh.shape()[0] == inShape.w) + || (minThresh.rank() == 2 && minThresh.shape()[0] == inShape.w && minThresh.shape()[1] == 1))) + { + std::ostringstream oss; + oss << minThresh.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input minThresh must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(minThresh.dtype() == in.dtype())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input (%s) and minThresh (%s) tensors must have the same data type", + nvcvDataTypeGetName(in.dtype()), nvcvDataTypeGetName(minThresh.dtype())); + } + } + + if (maxThresh) + { + if (!((maxThresh.rank() == 1 && maxThresh.shape()[0] == inShape.w) + || (maxThresh.rank() == 2 && maxThresh.shape()[0] == inShape.w && maxThresh.shape()[1] == 1))) + { + std::ostringstream oss; + oss << maxThresh.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input maxThresh must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(maxThresh.dtype() == in.dtype())) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input (%s) and maxThresh (%s) tensors must have the same data type", + nvcvDataTypeGetName(in.dtype()), nvcvDataTypeGetName(maxThresh.dtype())); + } + } + + if (count) + { + if (!((count.rank() == 1 && count.shape()[0] == inShape.w) + || (count.rank() == 2 && count.shape()[0] == inShape.w && count.shape()[1] == 1))) + { + std::ostringstream oss; + oss << count.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output count must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(count.dtype() == nvcv::TYPE_U32)) + { + 
throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output count (%s) must have U32 data type", + nvcvDataTypeGetName(count.dtype())); + } + } + + if (stats) + { + if (!count) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output stats requires count tensor"); + } + if (!((stats.rank() == 3 && stats.shape()[0] == inShape.w && stats.shape()[2] == 2 + 2 * numDim))) + { + std::ostringstream oss; + oss << stats.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output stats must be [NMA] tensor, with rank=3 N=%d A=%d, got %s", inShape.w, + 2 + 2 * numDim, oss.str().c_str()); + } + if (!(stats.dtype() == nvcv::TYPE_U32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output stats (%s) must have U32 data type", + nvcvDataTypeGetName(stats.dtype())); + } + } + else if (relabel) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output stats tensor must not be NULL to have sequential labels"); + } + + if (minSize) + { + if (!bgLabel || !stats) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input minSize requires bgLabel and stats tensors"); + } + + if (!((minSize.rank() == 1 && minSize.shape()[0] == inShape.w) + || (minSize.rank() == 2 && minSize.shape()[0] == inShape.w && minSize.shape()[1] == 1))) + { + std::ostringstream oss; + oss << minSize.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input minSize must be [N] or [NC] tensor, with N=%d and C=1, got %s", inShape.w, + oss.str().c_str()); + } + if (!(minSize.dtype() == nvcv::TYPE_U32)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input minSize (%s) must have U32 data type", + nvcvDataTypeGetName(minSize.dtype())); + } + } + + // TODO: Support full connectivity + if (connectivity == NVCV_CONNECTIVITY_8_2D || connectivity == NVCV_CONNECTIVITY_26_3D) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Full neighborhood labeling not supported yet"); + } + + RunLabel(stream, *inData, *outData, inShape, in.dtype(), bgLabel, minThresh, maxThresh, minSize, count, stats, + numDim, relabel); +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpLabel.hpp b/src/cvcuda/priv/OpLabel.hpp new file mode 100644 index 00000000..08d34f33 --- /dev/null +++ b/src/cvcuda/priv/OpLabel.hpp @@ -0,0 +1,48 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpLabel.hpp + * + * @brief Defines the private C++ Class for the Label operation. 
+ */ + +#ifndef CVCUDA_PRIV_LABEL_HPP +#define CVCUDA_PRIV_LABEL_HPP + +#include "IOperator.hpp" + +#include +#include +#include + +namespace cvcuda::priv { + +class Label final : public IOperator +{ +public: + explicit Label(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel, + const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize, + const nvcv::Tensor &count, const nvcv::Tensor &stats, NVCVConnectivityType connectivity, + NVCVLabelType assignLabels) const; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV_LABEL_HPP diff --git a/src/cvcuda/priv/OpMinMaxLoc.cu b/src/cvcuda/priv/OpMinMaxLoc.cu index d60f69c4..ba16fbe7 100644 --- a/src/cvcuda/priv/OpMinMaxLoc.cu +++ b/src/cvcuda/priv/OpMinMaxLoc.cu @@ -31,6 +31,8 @@ #include +#include + namespace { // Utilities for MinMaxLoc operator -------------------------------------------- @@ -764,11 +766,12 @@ inline void RunMinMaxLocDataIn(cudaStream_t stream, const DataStridedCuda &inDat if (!DataTypeMatches(inDataType, minValData->dtype())) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "Wrong output minVal data type %s for input tensor data type %s output minVal data " - "type must be S32/U32/F32/F64: for input data type S8/S16/S32 use S32; for " - "U8/U16/U32 use U32; for all other data types use same data type as input tensor", - nvcvDataTypeGetName(minValData->dtype()), nvcvDataTypeGetName(inDataType)); + std::ostringstream oss; + oss << "for minVal=" << nvcvDataTypeGetName(minValData->dtype()) + << " for input=" << nvcvDataTypeGetName(inDataType) + << "; output minVal data type must be S32/U32/F32/F64: for input " + << "data type S8/S16 use S32; for U8/U16 use U32; for all other data types use same as input tensor"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Wrong data types: %s", oss.str().c_str()); } if (!((minValData->rank() == 0 && inNumSamples == 1) || ((minValData->rank() == 1 || minValData->rank() == 2) && inNumSamples == minValData->shape(0)))) @@ -842,11 +845,12 @@ inline void RunMinMaxLocDataIn(cudaStream_t stream, const DataStridedCuda &inDat if (!DataTypeMatches(inDataType, maxValData->dtype())) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, - "Wrong output maxVal data type %s for input tensor data type %s output maxVal data " - "type must be S32/U32/F32/F64: for input data type S8/S16/S32 use S32; for " - "U8/U16/U32 use U32; for all other data types use same data type as input tensor", - nvcvDataTypeGetName(maxValData->dtype()), nvcvDataTypeGetName(inDataType)); + std::ostringstream oss; + oss << "for maxVal=" << nvcvDataTypeGetName(maxValData->dtype()) + << " for input=" << nvcvDataTypeGetName(inDataType) + << "; output maxVal data type must be S32/U32/F32/F64: for input " + << "data type S8/S16 use S32; for U8/U16 use U32; for all other data types use same as input tensor"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Wrong data types: %s", oss.str().c_str()); } if (!((maxValData->rank() == 0 && inNumSamples == 1) || ((maxValData->rank() == 1 || maxValData->rank() == 2) && inNumSamples == maxValData->shape(0)))) diff --git a/src/cvcuda/priv/OpPairwiseMatcher.cu b/src/cvcuda/priv/OpPairwiseMatcher.cu new file mode 100644 index 00000000..676ec865 --- /dev/null +++ b/src/cvcuda/priv/OpPairwiseMatcher.cu @@ -0,0 +1,665 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Assert.h" +#include "OpPairwiseMatcher.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace { + +// Utilities definitions ------------------------------------------------------- + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +constexpr int kIntMax = cuda::TypeTraits::max; // maximum number represented in an int + +constexpr int kNumThreads = 64; // number of threads per block + +// Key value pair type used in CUB (CUDA Unbound) sort and reduce-min operations +// The idea is to sort or get the minimum by distance (dist) and then by index (idx) +struct KeyValueT +{ + float dist; + int idx; +}; + +// Point class primary template is not intended to be used directly, instead only its partial specializations +// (below) are used, where: is the point type, i.e. the tensor type of the set storing the points; is the +// maximum number of bytes hold by the point class as a cache from global memory (GMEM) +template +class PointT; + +// Point class partial specialization for type T with NB = 0, a fall-through Point class meaning that n-dimensional +// points in a set are loaded directly from global memory (GMEM) without caching +template +class PointT +{ +public: + static constexpr int kMaxSize = 0; // maximum size in bytes of a single point stored by this class + + __device__ PointT() = default; + + inline __device__ void load(cuda::Tensor3DWrap set, int sampleIdx, int setIdx, int numDim) + { + data = set.ptr(sampleIdx, setIdx); + } + + inline __device__ T &operator[](int i) const + { + return data[i]; + } + +private: + T *data; +}; + +using RT = uint32_t; // resource type used in a cache inside the PointT class below + +// Point class partial specialization for type T with any NB, meaning that n-dimensional points in a set are loaded +// from global memory (GMEM) and stored in a cache (can be registers or local memory or shared memory), thus this +// class can only be used by points with up to NB size in bytes, i.e. 
n * sizeof(T) <= NB +template +class PointT +{ + static_assert(NB > 0, "Maximum number of bytes capacity in PointT class must be positive"); + +public: + static constexpr int kMaxSize = NB; // maximum size in bytes of a single point stored by this class + static constexpr int kNumElem = NB / sizeof(RT); // number of elements in array serving as a cache + static constexpr int kMaxDims = NB / sizeof(T); // maximum number of dimensions a single point may have + + __device__ PointT() = default; + + inline __device__ void load(cuda::Tensor3DWrap set, int sampleIdx, int setIdx, int numDim) + { +#pragma unroll + for (int i = 0; i < kNumElem && i < util::DivUp(numDim * (int)sizeof(T), (int)sizeof(RT)); ++i) + { + data[i] = *reinterpret_cast(set.ptr(sampleIdx, setIdx, i * (int)(sizeof(RT) / sizeof(T)))); + } + } + + inline __device__ T &operator[](int i) const + { + return reinterpret_cast(&data[0])[i]; + } + +private: + RT data[kNumElem]; +}; + +// Is compatible checks if a {numDim}-dimensional point fits in the corresponding Point T class (above) +template +inline __host__ bool isCompatible(int numDim) +{ + return (numDim * (int)sizeof(T)) <= NB; +} + +// Get minimum stride is used to check if a {numDim}-dimensional Point of type T smaller than RT (the cache +// resource type in PointT class) can be read in steps of RT, allowing overflow after the last T element +template +inline __host__ int getMinStride(int numDim) +{ + return util::DivUp(numDim * (int)sizeof(T), (int)sizeof(RT)) * (int)sizeof(RT); +} + +// CUDA functions -------------------------------------------------------------- + +// Reduce-min by key a key-value pair for CUB (CUDA Unbound) to do block-wide reduction to minimum in the first thread +inline __device__ KeyValueT minkey(const KeyValueT &a, const KeyValueT &b) +{ + return (a.dist < b.dist || (a.dist == b.dist && a.idx < b.idx)) ? a : b; +} + +// Absolute difference | a - b | for floating-point values +template>> +inline __device__ T absdiff(T a, T b) +{ + return cuda::abs(a - b); +} + +// Absolute difference for integral values, computing difference in unsigned types may lead to wrap around +template>> +inline __device__ std::make_unsigned_t absdiff(T a, T b) +{ + return a < b ? b - a : a - b; // wrapping around is fine! 
+} + +// Compute {distance} between elements {e1} and {e2} from n-dimensional points p1 and p2 +template +inline __device__ void ComputeDistance(float &distance, const T &e1, const T &e2) +{ + if constexpr (NORM == NVCV_NORM_HAMMING) + { + distance += __popc(e1 ^ e2); + } + else if constexpr (NORM == NVCV_NORM_L1) + { + distance += absdiff(e1, e2); + } + else + { + static_assert(NORM == NVCV_NORM_L2, "ComputeDistance accepts only HAMMING, L1 or L2 norms"); + + float d = absdiff(e1, e2); + + distance = fma(d, d, distance); // square-root is postponed as not needed to find best matches + } +} + +// Sort pairs of (distance, index) one per thread from a fixed point p1 to all points p2 in set2 with numDim +// dimensions, each point is an array with numDim elements of source type ST, each set is an array of points, and +// the tensor is an array of sets where the sampleIdx selects the current set within it with set2Size points +template +inline __device__ void SortKeyValue(float &sortedDist, int &sortedIdx, const Point &p1, const SetWrapper &set2, + int numDim, int matchesPerPoint, int sampleIdx, int set2Size) +{ + sortedDist = cuda::TypeTraits::max; + sortedIdx = -1; + + float curDist; + Point p2; + + for (int set2Idx = threadIdx.x; set2Idx < set2Size; set2Idx += kNumThreads) + { + p2.load(set2, sampleIdx, set2Idx, numDim); + + curDist = 0.f; + + if constexpr (Point::kMaxSize > 0) + { +#pragma unroll + for (int i = 0; i < Point::kMaxDims && i < numDim; ++i) + { + ComputeDistance(curDist, p1[i], p2[i]); + } + } + else + { + for (int i = 0; i < numDim; ++i) + { + ComputeDistance(curDist, p1[i], p2[i]); + } + } + + if (curDist < sortedDist) + { + sortedDist = curDist; + sortedIdx = set2Idx; + } + } + + __syncthreads(); // wait for all the threads to complete their local sorted (distance, index) pair + + if (matchesPerPoint == 1) // fast path for top-1 sort is reduce minimum + { + using BlockReduce = cub::BlockReduce; + + __shared__ typename BlockReduce::TempStorage cubTempStorage; + + KeyValueT keyValue{sortedDist, sortedIdx}; + + KeyValueT minKeyValue = BlockReduce(cubTempStorage).Reduce(keyValue, minkey); + + if (threadIdx.x == 0) + { + sortedDist = minKeyValue.dist; + sortedIdx = minKeyValue.idx; + } + } + else // normal path to get top-N where N > 1 requires block sort + { + using BlockSort = cub::BlockRadixSort; + + __shared__ typename BlockSort::TempStorage cubTempStorage; + + float keys[1] = {sortedDist}; + int values[1] = {sortedIdx}; + + BlockSort(cubTempStorage).Sort(keys, values); + + if (threadIdx.x < matchesPerPoint) + { + sortedDist = keys[0]; + sortedIdx = values[0]; + } + } +} + +// Write a match of (set1Idx, set2Idx) with (distance) found at matchIdx inside output matches and distances +template +inline __device__ void WriteMatch(int matchIdx, int set1Idx, int set2Idx, int sampleIdx, float &distance, + cuda::Tensor3DWrap matches, cuda::Tensor2DWrap distances) +{ + *matches.ptr(sampleIdx, matchIdx, 0) = set1Idx; + *matches.ptr(sampleIdx, matchIdx, 1) = set2Idx; + + if (distances.ptr(0) != nullptr) + { + if constexpr (NORM == NVCV_NORM_L2) + { + distance = cuda::sqrt(distance); // square-root was postpone for writing time, which is now + } + + *distances.ptr(sampleIdx, matchIdx) = distance; + } +} + +// Brute-force matcher finds closest pairs of n-dimensional points in set1 and set2, comparing all against all, it +// is instantiated by: an upper limit of each point size in bytes; type; and source type +template +__global__ void BruteForceMatcher(cuda::Tensor3DWrap set1, 
cuda::Tensor3DWrap set2, + cuda::Tensor1DWrap numSet1, cuda::Tensor1DWrap numSet2, + cuda::Tensor3DWrap matches, cuda::Tensor1DWrap numMatches, + cuda::Tensor2DWrap distances, int set1Capacity, int set2Capacity, + int outCapacity, int numDim, bool crossCheck, int matchesPerPoint) +{ + int sampleIdx = blockIdx.x; + int set1Idx = blockIdx.y; + int set1Size = set1Capacity; + + if (numSet1.ptr(0) != nullptr) + { + set1Size = numSet1[sampleIdx]; + set1Size = set1Size > set1Capacity ? set1Capacity : set1Size; + } + + if (set1Idx >= set1Size) + { + return; + } + + int set2Size = set2Capacity; + + if (numSet2.ptr(0) != nullptr) + { + set2Size = numSet2[sampleIdx]; + set2Size = set2Size > set2Capacity ? set2Capacity : set2Size; + } + + PointT p; + + p.load(set1, sampleIdx, set1Idx, numDim); + + float dist; + int set2Idx; + + SortKeyValue(dist, set2Idx, p, set2, numDim, matchesPerPoint, sampleIdx, set2Size); + + if (crossCheck) + { + __shared__ int set2Idx2; + + if (threadIdx.x == 0) + { + set2Idx2 = set2Idx; + } + + __syncthreads(); // wait the first thread to communicate the best match in set2 index + + p.load(set2, sampleIdx, set2Idx2, numDim); + + float dist2; + int set1Idx2; + + SortKeyValue(dist2, set1Idx2, p, set1, numDim, matchesPerPoint, sampleIdx, set1Size); + + if (threadIdx.x == 0 && set1Idx2 == set1Idx) + { + int matchIdx = atomicAdd(numMatches.ptr(sampleIdx), 1); + + if (matchIdx < outCapacity) + { + WriteMatch(matchIdx, set1Idx, set2Idx, sampleIdx, dist, matches, distances); + } + } + } + else + { + if (threadIdx.x < matchesPerPoint) + { + int matchIdx = set1Idx * matchesPerPoint + threadIdx.x; + + if (matchIdx < outCapacity) + { + WriteMatch(matchIdx, set1Idx, set2Idx, sampleIdx, dist, matches, distances); + } + } + } +} + +// Write number of matches in the case without cross check this number is set1 size times matches per point +__global__ void WriteNumMatches(cuda::Tensor1DWrap numSet1, cuda::Tensor1DWrap numMatches, + int set1Capacity, int matchesPerPoint) +{ + int sampleIdx = blockIdx.x; + int set1Size = (numSet1.ptr(0) == nullptr) ? 
set1Capacity : numSet1[sampleIdx]; + + numMatches[sampleIdx] = set1Size * matchesPerPoint; +} + +// Run functions --------------------------------------------------------------- + +// Run brute-force matcher, using NORM type for distance calculations and SrcT is the input source data type +template +inline void RunBruteForceMatcherForNorm(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, + const nvcv::Tensor &matches, const nvcv::Tensor &numMatches, + const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint) +{ + cuda::Tensor3DWrap w_set1, w_set2; // tensor wraps of set1 and set2 and other tensors + cuda::Tensor1DWrap w_numSet1, w_numSet2; + cuda::Tensor3DWrap w_matches; + cuda::Tensor1DWrap w_numMatches; + cuda::Tensor2DWrap w_distances; + +#define CVCUDA_BFM_WRAP(TENSOR) \ + if (TENSOR) \ + { \ + auto data = TENSOR.exportData(); \ + if (!data) \ + { \ + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, #TENSOR " tensor must be cuda-accessible"); \ + } \ + w_##TENSOR = decltype(w_##TENSOR)(*data); \ + } + + CVCUDA_BFM_WRAP(set1); + CVCUDA_BFM_WRAP(set2); + + CVCUDA_BFM_WRAP(numSet1); + CVCUDA_BFM_WRAP(numSet2); + + CVCUDA_BFM_WRAP(matches); + CVCUDA_BFM_WRAP(numMatches); + + CVCUDA_BFM_WRAP(distances); + +#undef CVCUDA_BFM_WRAP + + int numSamples = set1.shape()[0]; // number of samples, where each sample is a set of points + int set1Capacity = set1.shape()[1]; // set capacity is the maximum allowed number of points in set1 + int set2Capacity = set2.shape()[1]; // set capacity is the maximum allowed number of points in set2 + int numDim = set1.shape()[2]; // number of dimensions of each n-dimensional point in set1 and set2 + int outCapacity = matches.shape()[1]; // output capacity to store matches and distances + int minStride = getMinStride(numDim); // minimum stride in sets to allow the usage of PointT class + + dim3 threads(kNumThreads, 1, 1); + dim3 blocks1(numSamples, 1, 1); + dim3 blocks2(numSamples, set1Capacity, 1); + + if (crossCheck) + { + // Cross check returns a varying number of matches, as a match is only valid if it is the best (closest) + // match from set1 to set2 and back from set2 to set1, the numMatches output starts at zero and is + // atomically incremented in the BruteForceMatcher kernel + + NVCV_CHECK_THROW(cudaMemsetAsync(w_numMatches.ptr(0), 0, sizeof(int32_t) * numSamples, stream)); + } + else + { + // Without cross check has a fixed number of matches equal to the set1 size, meaning for every point in + // set1 there is (are) one (or more) matche(s) (up to matchesPerPoint) in set2 + + if (numMatches) + { + WriteNumMatches<<>>(w_numSet1, w_numMatches, set1Capacity, matchesPerPoint); + } + } + + // Cache-based kernel specialization: numDim and SrcT must fit a cache in PointT class; it works for 32B and + // 128B descriptors, such as ORB and SIFT. Even though it has 256 bytes spill loads/stores for NB = 128, it + // still gives almost 2x performance benefit. + + // TODO: The caveat of below kernel specializations is that it takes time to compile (~30sec) and it does not + // cover points bigger than 128B in size, incurring in low performance for big points. It may be better + // to use shared memory for those big points, given a certain maximum point dimension, and use threads to + // compute per element results instead of per point. 
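Before the kernel dispatch below, it is worth seeing how this host-side entry point is reached in practice. The following is a minimal sketch of a call into the private operator declared later in this patch (OpPairwiseMatcher.hpp); the tensors are assumed to be created by the caller with the shapes enforced by the checks further down: set1/set2 as [N, M, D], matches as [N, M', 2] S32, numSet1/numSet2/numMatches as [N] S32, and distances as [N, M'] F32.

    // Minimal sketch: cross-checked Hamming matching of binary (U8/U32) descriptors
    // using the private PairwiseMatcher. Tensor allocation is left to the caller.
    #include <cuda_runtime.h>
    #include <nvcv/Tensor.hpp>
    #include "OpPairwiseMatcher.hpp"

    void MatchBinaryDescriptors(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2,
                                const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches,
                                const nvcv::Tensor &numMatches, const nvcv::Tensor &distances)
    {
        cvcuda::priv::PairwiseMatcher matcher(NVCV_BRUTE_FORCE);

        // crossCheck keeps only mutual best matches, so matchesPerPoint must be 1 and
        // numMatches must be provided (it is filled atomically by the kernel).
        matcher(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances,
                /*crossCheck=*/true, /*matchesPerPoint=*/1, NVCV_NORM_HAMMING);
    }

Hamming distance is only meaningful for integral descriptor types, which is why the norm dispatch further down rejects float inputs for that norm.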
+ +#define CVCUDA_BFM_RUN(NB) \ + BruteForceMatcher<<>>( \ + w_set1, w_set2, w_numSet1, w_numSet2, w_matches, w_numMatches, w_distances, set1Capacity, set2Capacity, \ + outCapacity, numDim, crossCheck, matchesPerPoint); \ + return + + if (w_set1.strides()[1] >= minStride && w_set2.strides()[1] >= minStride) + { + if (isCompatible(numDim)) + { + CVCUDA_BFM_RUN(32); + } + else if (isCompatible(numDim)) + { + CVCUDA_BFM_RUN(128); + } + } + + CVCUDA_BFM_RUN(0); + +#undef CVCUDA_BFM_RUN +} + +template +inline void RunBruteForceMatcherForType(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, + const nvcv::Tensor &matches, const nvcv::Tensor &numMatches, + const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType) +{ + switch (normType) + { + case NVCV_NORM_HAMMING: + if constexpr (std::is_floating_point_v) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid norm Hamming with float input type"); + } + else + { + RunBruteForceMatcherForNorm(stream, set1, set2, numSet1, numSet2, matches, + numMatches, distances, crossCheck, matchesPerPoint); + } + break; + +#define CVCUDA_BFM_CASE(NORM) \ + case NORM: \ + RunBruteForceMatcherForNorm(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances, \ + crossCheck, matchesPerPoint); \ + break + + CVCUDA_BFM_CASE(NVCV_NORM_L1); + CVCUDA_BFM_CASE(NVCV_NORM_L2); + +#undef CVCUDA_BFM_CASE + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid norm type"); + } +} + +inline void RunBruteForceMatcher(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches, + const nvcv::Tensor &numMatches, const nvcv::Tensor &distances, bool crossCheck, + int matchesPerPoint, NVCVNormType normType) +{ + switch (set1.dtype()) + { +#define CVCUDA_BFM_CASE(DT, T) \ + case nvcv::TYPE_##DT: \ + RunBruteForceMatcherForType(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances, \ + crossCheck, matchesPerPoint, normType); \ + break + + CVCUDA_BFM_CASE(U8, uint8_t); + CVCUDA_BFM_CASE(U32, uint32_t); + CVCUDA_BFM_CASE(F32, float); + +#undef CVCUDA_BFM_CASE + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid input data type"); + } +} + +} // anonymous namespace + +namespace cvcuda::priv { + +// Constructor ----------------------------------------------------------------- + +PairwiseMatcher::PairwiseMatcher(NVCVPairwiseMatcherType algoChoice) + : m_algoChoice(algoChoice) +{ + // Support additional algorithms here (only brute force for now), they may require payload + if (algoChoice != NVCV_BRUTE_FORCE) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid algorithm choice"); + } +} + +// Tensor operator ------------------------------------------------------------- + +void PairwiseMatcher::operator()(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches, + const nvcv::Tensor &numMatches, const nvcv::Tensor &distances, bool crossCheck, + int matchesPerPoint, NVCVNormType normType) +{ + // Check each input and output tensor and their properties are conforming to what is expected + + if (!set1 || !set2 || !matches) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Required tensors: set1 set2 matches"); + } + if (set1.rank() != 
3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input set1 must be a rank-3 tensor"); + } + if (set2.rank() != 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input set2 must be a rank-3 tensor"); + } + if (set1.dtype() != set2.dtype()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input sets must have the same data type"); + } + + int64_t numSamples = set1.shape()[0]; + int64_t numDim = set1.shape()[2]; + + if (set2.shape()[0] != numSamples || set2.shape()[2] != numDim) + { + std::ostringstream oss; + oss << (set2 ? set2.shape() : nvcv::TensorShape()); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid set2 shape %s is not [NMD]: N=%ld D=%ld", + oss.str().c_str(), numSamples, numDim); + } + + if (numSamples > kIntMax || numDim > kIntMax || set1.shape()[1] > kIntMax || set2.shape()[1] > kIntMax) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big input tensors, shape > %d", kIntMax); + } + + if (numSet1 + && ((numSet1.rank() != 1 && numSet1.rank() != 2) || numSet1.shape()[0] != numSamples + || (numSet1.rank() == 2 && numSet1.shape()[1] != 1) || numSet1.dtype() != nvcv::TYPE_S32)) + { + std::ostringstream oss; + oss << numSet1.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid numSet1 shape %s dtype %s are not [N] or [NC]: N=%ld C=1 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(numSet1.dtype()), numSamples); + } + + if (numSet2 + && ((numSet2.rank() != 1 && numSet2.rank() != 2) || numSet2.shape()[0] != numSamples + || (numSet2.rank() == 2 && numSet2.shape()[1] != 1) || numSet2.dtype() != nvcv::TYPE_S32)) + { + std::ostringstream oss; + oss << numSet2.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid numSet2 shape %s dtype %s are not [N] or [NC]: N=%ld C=1 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(numSet2.dtype()), numSamples); + } + + if (matches.rank() != 3 || matches.shape()[0] != numSamples || matches.shape()[1] >= kIntMax + || matches.shape()[2] != 2 || matches.dtype() != nvcv::TYPE_S32) + { + std::ostringstream oss; + oss << matches.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid matches shape %s dtype %s are not [NMA]: N=%ld M<%d A=2 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(matches.dtype()), numSamples, kIntMax); + } + + if (numMatches + && ((numMatches.rank() != 1 && numMatches.rank() != 2) || numMatches.shape()[0] != numSamples + || (numMatches.rank() == 2 && numMatches.shape()[1] != 1) || numMatches.dtype() != nvcv::TYPE_S32)) + { + std::ostringstream oss; + oss << numMatches.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid numMatches shape %s dtype %s are not [N] or [NC]: N=%ld C=1 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(numMatches.dtype()), numSamples); + } + + int64_t outCapacity = matches.shape()[1]; + + if (distances + && ((distances.rank() != 2 && distances.rank() != 3) || distances.shape()[0] != numSamples + || distances.shape()[1] != outCapacity || (distances.rank() == 3 && distances.shape()[2] != 1) + || distances.dtype() != nvcv::TYPE_F32)) + { + std::ostringstream oss; + oss << distances.shape(); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid distances shape %s dtype %s are not [NM] or [NMC]: N=%ld M=%ld C=1 dtype=S32", + oss.str().c_str(), nvcvDataTypeGetName(distances.dtype()), numSamples, outCapacity); + } + + if (matchesPerPoint <= 0 || matchesPerPoint > 
kNumThreads) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid matchesPerPoint %d is not in [1, %d]", + matchesPerPoint, kNumThreads); + } + if (crossCheck && matchesPerPoint != 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid matchesPerPoint %d for crossCheck=true is not 1", matchesPerPoint); + } + if (crossCheck && !numMatches) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid numMatches=NULL for crossCheck=true"); + } + + if (m_algoChoice == NVCV_BRUTE_FORCE) + { + RunBruteForceMatcher(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances, crossCheck, + matchesPerPoint, normType); + } +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpPairwiseMatcher.hpp b/src/cvcuda/priv/OpPairwiseMatcher.hpp new file mode 100644 index 00000000..208a4cc9 --- /dev/null +++ b/src/cvcuda/priv/OpPairwiseMatcher.hpp @@ -0,0 +1,49 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpPairwiseMatcher.hpp + * + * @brief Defines the private C++ Class for the PairwiseMatcher operation. + */ + +#ifndef CVCUDA_PRIV_PAIRWISE_MATCHER_HPP +#define CVCUDA_PRIV_PAIRWISE_MATCHER_HPP + +#include "IOperator.hpp" + +#include + +namespace cvcuda::priv { + +class PairwiseMatcher final : public IOperator +{ +public: + explicit PairwiseMatcher(NVCVPairwiseMatcherType algoChoice); + + void operator()(cudaStream_t stream, const nvcv::Tensor &set1, const nvcv::Tensor &set2, + const nvcv::Tensor &numSet1, const nvcv::Tensor &numSet2, const nvcv::Tensor &matches, + const nvcv::Tensor &numMatches, const nvcv::Tensor &distances, bool crossCheck, int matchesPerPoint, + NVCVNormType normType); + +private: + NVCVPairwiseMatcherType m_algoChoice; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV_PAIRWISE_MATCHER_HPP diff --git a/src/cvcuda/priv/OpPillowResize.cpp b/src/cvcuda/priv/OpPillowResize.cpp index 73ce7ebc..a72fa7f9 100644 --- a/src/cvcuda/priv/OpPillowResize.cpp +++ b/src/cvcuda/priv/OpPillowResize.cpp @@ -28,7 +28,26 @@ namespace cvcuda::priv { namespace leg = nvcv::legacy; namespace legacy = nvcv::legacy::cuda_op; -PillowResize::PillowResize(nvcv::Size2D maxSize, int maxBatchSize, NVCVImageFormat fmt) +PillowResize::PillowResize() +{ + m_legacyOp = std::make_unique(); + m_legacyOpVarShape = std::make_unique(); +} + +WorkspaceRequirements PillowResize::getWorkspaceRequirements(int batchSize, const nvcv::Size2D *in_sizes, + const nvcv::Size2D *out_sizes, NVCVImageFormat fmt) +{ + nvcv::Size2D maxInSize{0, 0}, maxOutSize{0, 0}; + for (int i = 0; i < batchSize; i++) + { + maxInSize = nvcv::MaxSize(in_sizes[i], maxInSize); + maxOutSize = nvcv::MaxSize(out_sizes[i], maxOutSize); + } + return getWorkspaceRequirements(batchSize, maxInSize, maxOutSize, fmt); +} + +WorkspaceRequirements PillowResize::getWorkspaceRequirements(int maxBatchSize, 
nvcv::Size2D maxInSize, + nvcv::Size2D maxOutSize, NVCVImageFormat fmt) { int32_t bpc[4]; nvcvImageFormatGetBitsPerChannel(fmt, bpc); @@ -36,15 +55,17 @@ PillowResize::PillowResize(nvcv::Size2D maxSize, int maxBatchSize, NVCVImageForm nvcvImageFormatGetNumChannels(fmt, &maxChannel); NVCVDataKind dataKind; nvcvImageFormatGetDataKind(fmt, &dataKind); - nvcv::DataKind dkind = static_cast(dataKind); - leg::cuda_op::DataType data_type = leg::helpers::GetLegacyDataType(bpc[0], dkind); - leg::cuda_op::DataShape maxIn(maxBatchSize, maxChannel, maxSize.h, maxSize.w), - maxOut(maxBatchSize, maxChannel, maxSize.h, maxSize.w); - m_legacyOp = std::make_unique(maxIn, maxOut, data_type); - m_legacyOpVarShape = std::make_unique(maxIn, maxOut, data_type); + nvcv::DataKind dkind = static_cast(dataKind); + leg::cuda_op::DataType dataType = leg::helpers::GetLegacyDataType(bpc[0], dkind); + leg::cuda_op::DataShape maxIn(maxBatchSize, maxChannel, maxInSize.h, maxInSize.w); + leg::cuda_op::DataShape maxOut(maxBatchSize, maxChannel, maxOutSize.h, maxOutSize.w); + auto req = m_legacyOp->getWorkspaceRequirements(maxIn, maxOut, dataType); + auto reqVarShape = m_legacyOpVarShape->getWorkspaceRequirements(maxIn, maxOut, dataType); + + return MaxWorkspaceReq(req, reqVarShape); } -void PillowResize::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, +void PillowResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVInterpolationType interpolation) const { auto inData = in.exportData(); @@ -61,13 +82,13 @@ void PillowResize::operator()(cudaStream_t stream, const nvcv::Tensor &in, const "Output must be device-acessible, pitch-linear tensor"); } - NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *outData, interpolation, stream)); + NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *outData, interpolation, stream, ws)); } -void PillowResize::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, +void PillowResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, const NVCVInterpolationType interpolation) const { - NVCV_CHECK_THROW(m_legacyOpVarShape->infer(in, out, interpolation, stream)); + NVCV_CHECK_THROW(m_legacyOpVarShape->infer(in, out, interpolation, stream, ws)); } } // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpPillowResize.hpp b/src/cvcuda/priv/OpPillowResize.hpp index 7b2356fd..c6f743b1 100644 --- a/src/cvcuda/priv/OpPillowResize.hpp +++ b/src/cvcuda/priv/OpPillowResize.hpp @@ -25,6 +25,7 @@ #define CVCUDA_PRIV_PILLOW_RESIZE_HPP #include "IOperator.hpp" +#include "cvcuda/Workspace.hpp" #include "legacy/CvCudaLegacy.h" #include @@ -37,13 +38,20 @@ namespace cvcuda::priv { class PillowResize final : public IOperator { public: - explicit PillowResize(nvcv::Size2D maxSize, int maxBatchSize, NVCVImageFormat fmt); + PillowResize(); - void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, - const NVCVInterpolationType interpolation) const; - void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, + WorkspaceRequirements getWorkspaceRequirements(int batchSize, const nvcv::Size2D *in_sizes, + const nvcv::Size2D *out_sizes, NVCVImageFormat fmt); + + WorkspaceRequirements getWorkspaceRequirements(int batchSize, nvcv::Size2D maxInSize, nvcv::Size2D maxOutSize, + NVCVImageFormat fmt); + + void operator()(cudaStream_t stream, const Workspace &ws, const 
nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVInterpolationType interpolation) const; + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &in, + const nvcv::ImageBatchVarShape &out, const NVCVInterpolationType interpolation) const; + private: std::unique_ptr m_legacyOp; std::unique_ptr m_legacyOpVarShape; diff --git a/src/cvcuda/priv/OpSIFT.cu b/src/cvcuda/priv/OpSIFT.cu index 045b1cd7..cfe2499b 100644 --- a/src/cvcuda/priv/OpSIFT.cu +++ b/src/cvcuda/priv/OpSIFT.cu @@ -1237,6 +1237,16 @@ void SIFT::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::T throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Operator call arg. initSigma=%f must be positive", initSigma); } + if (contrastThreshold <= 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Argument contrastThreshold=%f must be positive", + contrastThreshold); + } + if (edgeThreshold <= 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Argument edgeThreshold=%f must be positive", + edgeThreshold); + } if (numOctaveLayers < 1 || numOctaveLayers > m_maxOctaveLayers) { diff --git a/src/cvcuda/priv/OpStack.cpp b/src/cvcuda/priv/OpStack.cpp new file mode 100644 index 00000000..3b1707a7 --- /dev/null +++ b/src/cvcuda/priv/OpStack.cpp @@ -0,0 +1,101 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OpStack.hpp" + +#include "nvcv/TensorDataAccess.hpp" + +#include +#include + +namespace cvcuda::priv { + +void Stack::operator()(cudaStream_t stream, const nvcv::TensorBatch &in, const nvcv::Tensor &out) const +{ + auto outData = out.exportData(); + if (outData == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be cuda-accessible, pitch-linear tensor"); + } + + // read out data N, H, W and C + if (out.rank() != 4) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output must be NCHW orNHWC tensor"); + } + + uint32_t outN = out.shape()[0]; + // this works for both NCHW and NHWC since we are just checking if H,W,C are the same + uint32_t outH = out.shape()[1]; + uint32_t outW = out.shape()[2]; + uint32_t outC = out.shape()[3]; + + uint32_t copyIndex = 0; + for (auto it = in.begin(); it != in.end(); ++it) + { + // check if output is large enough since we could have a combo of N and non N tensors on input. + if (copyIndex >= outN) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output tensor is not large enough to hold all input tensors"); + } + + //check if data layout and shape is are equal. + uint32_t isN = (it->rank() == 4) ? 
1 : 0; + if (outH != it->shape()[0 + isN] || outW != it->shape()[1 + isN] || outC != it->shape()[2 + isN]) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input tensors must have the same H, W, and C as output Tensor"); + } + + auto inData = it->exportData(); + if (inData == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be cuda-accessible, pitch-linear tensor"); + } + + copyIndex = copyTensorToNTensor(*outData, *inData, copyIndex, stream); + } +} + +// copies all samples from indata to out data, returns the next index in out data. +int Stack::copyTensorToNTensor(const nvcv::TensorDataStridedCuda &outData, const nvcv::TensorDataStridedCuda &inData, + uint32_t outIndex, cudaStream_t stream) const +{ + auto in = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(in); + auto out = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(out); + + for (uint32_t i = 0; i < in->numSamples(); ++i) + { + nvcv::Byte *inSampData = in->sampleData(i); + nvcv::Byte *outSampData = out->sampleData(outIndex); + for (int32_t p = 0; p < in->numPlanes(); ++p) + { + NVCV_CHECK_LOG(cudaMemcpy2DAsync( + out->planeData(p, outSampData), out->rowStride(), in->planeData(p, inSampData), in->rowStride(), + in->numCols() * in->colStride(), in->numRows(), cudaMemcpyDeviceToDevice, stream)); + } + outIndex++; + } + return outIndex; +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpStack.hpp b/src/cvcuda/priv/OpStack.hpp new file mode 100644 index 00000000..744af988 --- /dev/null +++ b/src/cvcuda/priv/OpStack.hpp @@ -0,0 +1,49 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpStack.hpp + * + * @brief Defines the private C++ Class for the Stack operation. + */ + +#ifndef CVCUDA_PRIV__STACK_HPP +#define CVCUDA_PRIV__STACK_HPP + +#include "IOperator.hpp" + +#include +#include +#include + +#include + +namespace cvcuda::priv { + +class Stack final : public IOperator +{ +public: + void operator()(cudaStream_t stream, const nvcv::TensorBatch &in, const nvcv::Tensor &out) const; + +private: + int copyTensorToNTensor(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + uint32_t outIndex, cudaStream_t stream) const; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV__STACK_HPP diff --git a/src/cvcuda/priv/Types.hpp b/src/cvcuda/priv/Types.hpp new file mode 100644 index 00000000..9580e6bc --- /dev/null +++ b/src/cvcuda/priv/Types.hpp @@ -0,0 +1,643 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CVCUDA_TYPES_HPP +#define CVCUDA_TYPES_HPP + +#include + +#include +#include + +namespace cvcuda::priv { + +#define checkERR(call) check_error(call, #call, __LINE__, __FILE__) + +inline static bool check_error(cudaError_t e, const char *call, int line, const char *file) +{ + if (e != cudaSuccess) + { + fprintf(stderr, "CUDA Runtime error %s # %s, code = %s [ %d ] in file %s:%d\n", call, cudaGetErrorString(e), + cudaGetErrorName(e), e, file, line); + return false; + } + return true; +} + +// Default font, user can install via below command: +// sudo apt-get update +// sudo apt-get install ttf-dejavu fonts-dejavu +#define DEFAULT_OSD_FONT "DejaVuSansMono" + +class NVCVText +{ +public: + const char *utf8Text = nullptr; // Text to draw in utf8 format. + int32_t fontSize; // Font size for the text. + const char *fontName = nullptr; // Font name for the text. + NVCVPointI tlPos; // Top-left corner point for label text, \ref NVCVPointI. + NVCVColorRGBA fontColor; // Font color of the text. + NVCVColorRGBA bgColor; // Background color of text box. + + NVCVText(const char *_utf8Text, int32_t _fontSize, const char *_fontName, NVCVPointI _tlPos, + NVCVColorRGBA _fontColor, NVCVColorRGBA _bgColor) + : fontSize(_fontSize) + , tlPos(_tlPos) + , fontColor(_fontColor) + , bgColor(_bgColor) + { + utf8Text = (const char *)malloc(strlen(_utf8Text) + 1); + memcpy(const_cast(utf8Text), _utf8Text, strlen(_utf8Text) + 1); + fontName = (const char *)malloc(strlen(_fontName) + 1); + memcpy(const_cast(fontName), _fontName, strlen(_fontName) + 1); + } + + NVCVText(const NVCVText &text) + : fontSize(text.fontSize) + , tlPos(text.tlPos) + , fontColor(text.fontColor) + , bgColor(text.bgColor) + { + utf8Text = (const char *)malloc(strlen(text.utf8Text) + 1); + memcpy(const_cast(utf8Text), text.utf8Text, strlen(text.utf8Text) + 1); + fontName = (const char *)malloc(strlen(text.fontName) + 1); + memcpy(const_cast(fontName), text.fontName, strlen(text.fontName) + 1); + } + + NVCVText &operator=(const NVCVText &text) + { + if (this != &text) + { + if (utf8Text != nullptr) + { + free((void *)utf8Text); + utf8Text = nullptr; + } + if (fontName != nullptr) + { + free((void *)fontName); + fontName = nullptr; + } + *this = NVCVText(text); + } + return *this; + }; + + ~NVCVText() + { + if (utf8Text != nullptr) + { + free((void *)utf8Text); + utf8Text = nullptr; + } + if (fontName != nullptr) + { + free((void *)fontName); + fontName = nullptr; + } + }; +}; + +class NVCVSegment +{ +public: + NVCVBoxI box; // Bounding box of segment, \ref NVCVBoxI. + int32_t thickness; // Line thickness of segment outter rect. + float *dSeg = nullptr; // Device pointer for segment mask, cannot be nullptr. + // Array length: segWidth * segHeight + // Format: + // Score_00, Score_01, ..., Score_0k, ... + // Score_10, Score_11, ..., Score_kk, ... + // ... , ... , ..., ... , ... + int32_t segWidth; // Segment mask width. + int32_t segHeight; // Segment mask height. + float segThreshold; // Segment threshold. + NVCVColorRGBA borderColor; // Line color of segment outter rect. 
+ NVCVColorRGBA segColor; // Segment mask color. + + NVCVSegment(NVCVBoxI _box, int32_t _thickness, float *_hSeg, int32_t _segWidth, int32_t _segHeight, + float _segThreshold, NVCVColorRGBA _borderColor, NVCVColorRGBA _segColor) + : box(_box) + , thickness(_thickness) + , segWidth(_segWidth) + , segHeight(_segHeight) + , segThreshold(_segThreshold) + , borderColor(_borderColor) + , segColor(_segColor) + { + checkERR(cudaMalloc(&dSeg, segWidth * segHeight * sizeof(float))); + checkERR(cudaMemcpy(dSeg, _hSeg, segWidth * segHeight * sizeof(float), cudaMemcpyHostToDevice)); + } + + NVCVSegment(const NVCVSegment &segment) + : box(segment.box) + , thickness(segment.thickness) + , segWidth(segment.segWidth) + , segHeight(segment.segHeight) + , segThreshold(segment.segThreshold) + , borderColor(segment.borderColor) + , segColor(segment.segColor) + { + checkERR(cudaMalloc(&dSeg, segWidth * segHeight * sizeof(float))); + checkERR(cudaMemcpy(dSeg, segment.dSeg, segWidth * segHeight * sizeof(float), cudaMemcpyDeviceToDevice)); + } + + NVCVSegment &operator=(const NVCVSegment &) = delete; + + ~NVCVSegment() + { + if (dSeg != nullptr) + { + checkERR(cudaFree(dSeg)); + dSeg = nullptr; + } + }; +}; + +class NVCVPolyLine +{ +public: + int32_t *hPoints = nullptr; // Host pointer for polyline points' xy, cannot be nullptr. + // Array length: 2 * numPoints. + // Format : X0, Y0, X1, Y1, ..., Xk, Yk, ... + int32_t *dPoints = nullptr; // Device pointer for polyline points' xy. + // Can be nullptr only if fillColor.a == 0. + // Array length: 2 * numPoints. + // Format: X0, Y0, X1, Y1, ..., Xk, Yk, ... + int32_t numPoints; // Number of polyline points. + int32_t thickness; // Polyline thickness. + bool isClosed; // Connect p(0) to p(n-1) or not. + NVCVColorRGBA borderColor; // Line color of polyline border. + NVCVColorRGBA fillColor; // Fill color of poly fill area. + bool interpolation; // Default: true + + NVCVPolyLine(int32_t *_hPoints, int32_t _numPoints, int32_t _thickness, bool _isClosed, NVCVColorRGBA _borderColor, + NVCVColorRGBA _fillColor, bool _interpolation) + : numPoints(_numPoints) + , thickness(_thickness) + , isClosed(_isClosed) + , borderColor(_borderColor) + , fillColor(_fillColor) + , interpolation(_interpolation) + { + hPoints = (int *)malloc(numPoints * 2 * sizeof(int)); + checkERR(cudaMalloc(&dPoints, 2 * numPoints * sizeof(int))); + + memcpy(hPoints, _hPoints, 2 * numPoints * sizeof(int)); + checkERR(cudaMemcpy(dPoints, _hPoints, 2 * numPoints * sizeof(int), cudaMemcpyHostToDevice)); + } + + NVCVPolyLine(const NVCVPolyLine &pl) + : numPoints(pl.numPoints) + , thickness(pl.thickness) + , isClosed(pl.isClosed) + , borderColor(pl.borderColor) + , fillColor(pl.fillColor) + , interpolation(pl.interpolation) + { + hPoints = (int *)malloc(numPoints * 2 * sizeof(int)); + checkERR(cudaMalloc(&dPoints, 2 * numPoints * sizeof(int))); + + memcpy(hPoints, pl.hPoints, 2 * numPoints * sizeof(int)); + checkERR(cudaMemcpy(dPoints, pl.dPoints, 2 * numPoints * sizeof(int), cudaMemcpyDeviceToDevice)); + } + + NVCVPolyLine &operator=(const NVCVPolyLine &) = delete; + + ~NVCVPolyLine() + { + if (hPoints != nullptr) + { + free(hPoints); + hPoints = nullptr; + } + if (dPoints != nullptr) + { + checkERR(cudaFree(dPoints)); + dPoints = nullptr; + } + }; +}; + +class NVCVClock +{ +public: + NVCVClockFormat clockFormat; // Pre-defined clock format. + long time; // Clock time. + int32_t fontSize; // Font size. + const char *font = nullptr; // Font name. + NVCVPointI tlPos; // Top-left corner point, \ref NVCVPointI. 
+ NVCVColorRGBA fontColor; // Font color of the text. + NVCVColorRGBA bgColor; // Background color of text box. + + NVCVClock(NVCVClockFormat _clockFormat, long _time, int32_t _fontSize, const char *_font, NVCVPointI _tlPos, + NVCVColorRGBA _fontColor, NVCVColorRGBA _bgColor) + : clockFormat(_clockFormat) + , time(_time) + , fontSize(_fontSize) + , tlPos(_tlPos) + , fontColor(_fontColor) + , bgColor(_bgColor) + { + font = (const char *)malloc(strlen(_font) + 1); + memcpy(const_cast(font), _font, strlen(_font) + 1); + } + + NVCVClock(const NVCVClock &clock) + : clockFormat(clock.clockFormat) + , time(clock.time) + , fontSize(clock.fontSize) + , tlPos(clock.tlPos) + , fontColor(clock.fontColor) + , bgColor(clock.bgColor) + { + font = (const char *)malloc(strlen(clock.font) + 1); + memcpy(const_cast(font), clock.font, strlen(clock.font) + 1); + } + + NVCVClock &operator=(const NVCVClock &clock) + { + if (this != &clock) + { + if (font != nullptr) + { + free((void *)font); + font = nullptr; + } + *this = NVCVClock(clock); + } + return *this; + }; + + ~NVCVClock() + { + if (font != nullptr) + { + free((void *)font); + font = nullptr; + } + }; +}; + +class NVCVElement +{ +public: + NVCVElement(NVCVOSDType osd_type, const void *src); + NVCVElement(const NVCVElement &) = delete; + NVCVElement &operator=(const NVCVElement &) = delete; + ~NVCVElement(); + + NVCVOSDType type(); + void *ptr(); + // void assign(const void* src); + +private: + /* + * type: + * NVCV_OSD_RECT - \ref NVCVBndBoxI. + * NVCV_OSD_TEXT - \ref NVCVText. + * NVCV_OSD_SEGMENT - \ref NVCVSegment. + * NVCV_OSD_POINT - \ref NVCVPoint. + * NVCV_OSD_LINE - \ref NVCVLine. + * NVCV_OSD_POLYLINE - \ref NVCVPolyLine. + * NVCV_OSD_ROTATED_RECT - \ref NVCVRotatedBox. + * NVCV_OSD_CIRCLE - \ref NVCVCircle. + * NVCV_OSD_ARROW - \ref NVCVArrow. + * NVCV_OSD_CLOCK - \ref NVCVClock. + */ + NVCVOSDType m_type; // OSD element type to draw. + void *m_data; // OSD element data pointer. 
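NVCVElement is a small hand-rolled tagged union: it deep-copies whichever OSD payload it is given and hands it back as a void pointer keyed by type(). A hedged sketch of the intended round trip with a bounding-box payload follows; the field values are purely illustrative and the NVCVBndBoxI/NVCVBoxI/NVCVColorRGBA member layout is assumed from the public cvcuda/Types.h.

    // Hedged sketch: building a type-tagged OSD element and reading it back through
    // the NVCVElement class defined in this header.
    #include <cvcuda/Types.h>

    void RoundTripRectElement()
    {
        NVCVBndBoxI box{};
        box.box         = NVCVBoxI{10, 20, 64, 48};     // x, y, width, height (illustrative)
        box.thickness   = 2;                             // outline thickness in pixels
        box.borderColor = NVCVColorRGBA{255, 0, 0, 255}; // opaque red outline
        box.fillColor   = NVCVColorRGBA{0, 0, 0, 0};     // transparent interior

        cvcuda::priv::NVCVElement element(NVCV_OSD_RECT, &box); // deep copy owned by the element

        if (element.type() == NVCV_OSD_RECT)
        {
            auto *rect = static_cast<NVCVBndBoxI *>(element.ptr());
            (void)rect; // same values as `box`, freed by the element's destructor
        }
    }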
+}; + +inline NVCVElement::NVCVElement(NVCVOSDType osd_type, const void *src) + : m_type(osd_type) +{ + switch (m_type) + { + case NVCVOSDType::NVCV_OSD_RECT: + { + auto rect = NVCVBndBoxI(*(NVCVBndBoxI *)src); + m_data = new NVCVBndBoxI(rect); + break; + } + case NVCVOSDType::NVCV_OSD_TEXT: + { + auto text = NVCVText(*(NVCVText *)src); + m_data = new NVCVText(text); + break; + } + case NVCVOSDType::NVCV_OSD_SEGMENT: + { + auto segment = NVCVSegment(*(NVCVSegment *)src); + m_data = new NVCVSegment(segment); + break; + } + case NVCVOSDType::NVCV_OSD_POINT: + { + auto point = NVCVPoint(*(NVCVPoint *)src); + m_data = new NVCVPoint(point); + break; + } + case NVCVOSDType::NVCV_OSD_LINE: + { + auto line = NVCVLine(*(NVCVLine *)src); + m_data = new NVCVLine(line); + break; + } + case NVCVOSDType::NVCV_OSD_POLYLINE: + { + auto pl = NVCVPolyLine(*(NVCVPolyLine *)src); + m_data = new NVCVPolyLine(pl); + break; + } + case NVCVOSDType::NVCV_OSD_ROTATED_RECT: + { + auto rb = NVCVRotatedBox(*(NVCVRotatedBox *)src); + m_data = new NVCVRotatedBox(rb); + break; + } + case NVCVOSDType::NVCV_OSD_CIRCLE: + { + auto circle = NVCVCircle(*(NVCVCircle *)src); + m_data = new NVCVCircle(circle); + break; + } + case NVCVOSDType::NVCV_OSD_ARROW: + { + auto arrow = NVCVArrow(*(NVCVArrow *)src); + m_data = new NVCVArrow(arrow); + break; + } + case NVCVOSDType::NVCV_OSD_CLOCK: + { + auto clock = NVCVClock(*(NVCVClock *)src); + m_data = new NVCVClock(clock); + break; + } + default: + break; + } +} + +inline NVCVElement::~NVCVElement() +{ + switch (m_type) + { + case NVCVOSDType::NVCV_OSD_RECT: + { + NVCVBndBoxI *bndBox = (NVCVBndBoxI *)m_data; + if (bndBox != nullptr) + { + delete (bndBox); + bndBox = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_TEXT: + { + NVCVText *label = (NVCVText *)m_data; + if (label != nullptr) + { + delete (label); + label = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_SEGMENT: + { + NVCVSegment *segment = (NVCVSegment *)m_data; + if (segment != nullptr) + { + delete (segment); + segment = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_POINT: + { + NVCVPoint *point = (NVCVPoint *)m_data; + if (point != nullptr) + { + delete (point); + point = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_LINE: + { + NVCVLine *line = (NVCVLine *)m_data; + if (line != nullptr) + { + delete (line); + line = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_POLYLINE: + { + NVCVPolyLine *pl = (NVCVPolyLine *)m_data; + if (pl != nullptr) + { + delete (pl); + pl = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_ROTATED_RECT: + { + NVCVRotatedBox *rb = (NVCVRotatedBox *)m_data; + if (rb != nullptr) + { + delete (rb); + rb = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_CIRCLE: + { + NVCVCircle *circle = (NVCVCircle *)m_data; + if (circle != nullptr) + { + delete (circle); + circle = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_ARROW: + { + NVCVArrow *arrow = (NVCVArrow *)m_data; + if (arrow != nullptr) + { + delete (arrow); + arrow = nullptr; + } + break; + } + case NVCVOSDType::NVCV_OSD_CLOCK: + { + NVCVClock *clock = (NVCVClock *)m_data; + if (clock != nullptr) + { + delete (clock); + clock = nullptr; + } + break; + } + default: + break; + } +} + +inline NVCVOSDType NVCVElement::type() +{ + return m_type; +} + +inline void *NVCVElement::ptr() +{ + return m_data; +} + +class NVCVBlurBoxesImpl +{ +public: + NVCVBlurBoxesImpl(const std::vector> &blurboxes_vec); + NVCVBlurBoxesImpl(const NVCVBlurBoxesImpl &) = delete; + NVCVBlurBoxesImpl &operator=(const 
NVCVBlurBoxesImpl &) = delete; + ~NVCVBlurBoxesImpl(); + + int32_t batch() const; + int32_t numBoxesAt(int32_t b) const; + NVCVBlurBoxI boxAt(int32_t b, int32_t i) const; + +private: + std::vector> m_blurboxes_vec; +}; + +inline NVCVBlurBoxesImpl::NVCVBlurBoxesImpl(const std::vector> &blurboxes_vec) +{ + m_blurboxes_vec = blurboxes_vec; +} + +inline NVCVBlurBoxesImpl::~NVCVBlurBoxesImpl() +{ + std::vector> tmp; + m_blurboxes_vec.swap(tmp); +} + +inline int32_t NVCVBlurBoxesImpl::batch() const +{ + return m_blurboxes_vec.size(); +} + +inline int32_t NVCVBlurBoxesImpl::numBoxesAt(int32_t b) const +{ + return m_blurboxes_vec[b].size(); +} + +inline NVCVBlurBoxI NVCVBlurBoxesImpl::boxAt(int32_t b, int32_t i) const +{ + return m_blurboxes_vec[b][i]; +} + +class NVCVBndBoxesImpl +{ +public: + NVCVBndBoxesImpl(const std::vector> &bndboxes_vec); + NVCVBndBoxesImpl(const NVCVBndBoxesImpl &) = delete; + NVCVBndBoxesImpl &operator=(const NVCVBndBoxesImpl &) = delete; + ~NVCVBndBoxesImpl(); + + int32_t batch() const; + int32_t numBoxesAt(int32_t b) const; + NVCVBndBoxI boxAt(int32_t b, int32_t i) const; + +private: + std::vector> m_bndboxes_vec; +}; + +inline NVCVBndBoxesImpl::NVCVBndBoxesImpl(const std::vector> &bndboxes_vec) +{ + m_bndboxes_vec = bndboxes_vec; +} + +inline NVCVBndBoxesImpl::~NVCVBndBoxesImpl() +{ + std::vector> tmp; + m_bndboxes_vec.swap(tmp); +} + +inline int32_t NVCVBndBoxesImpl::batch() const +{ + return m_bndboxes_vec.size(); +} + +inline int32_t NVCVBndBoxesImpl::numBoxesAt(int32_t b) const +{ + return m_bndboxes_vec[b].size(); +} + +inline NVCVBndBoxI NVCVBndBoxesImpl::boxAt(int32_t b, int32_t i) const +{ + return m_bndboxes_vec[b][i]; +} + +class NVCVElementsImpl +{ +public: + NVCVElementsImpl(const std::vector>> &elements_vec); + NVCVElementsImpl(const NVCVElementsImpl &) = delete; + NVCVElementsImpl &operator=(const NVCVElementsImpl &) = delete; + ~NVCVElementsImpl(); + + int32_t batch() const; + int32_t numElementsAt(int32_t b) const; + std::shared_ptr elementAt(int32_t b, int32_t i) const; + +private: + std::vector>> m_elements_vec; +}; + +inline NVCVElementsImpl::NVCVElementsImpl(const std::vector>> &elements_vec) +{ + m_elements_vec = elements_vec; +} + +inline NVCVElementsImpl::~NVCVElementsImpl() +{ + std::vector>> tmp; + m_elements_vec.swap(tmp); +} + +inline int32_t NVCVElementsImpl::batch() const +{ + return m_elements_vec.size(); +} + +inline int32_t NVCVElementsImpl::numElementsAt(int32_t b) const +{ + return m_elements_vec[b].size(); +} + +inline std::shared_ptr NVCVElementsImpl::elementAt(int32_t b, int32_t i) const +{ + return m_elements_vec[b][i]; +} + +} // namespace cvcuda::priv + +#endif // CVCUDA_TYPES_HPP diff --git a/src/cvcuda/priv/WorkspaceAllocator.hpp b/src/cvcuda/priv/WorkspaceAllocator.hpp new file mode 100644 index 00000000..bb5575fd --- /dev/null +++ b/src/cvcuda/priv/WorkspaceAllocator.hpp @@ -0,0 +1,216 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CVCUDA_PRIV_WORKSPACE_ALLOCATOR_HPP +#define CVCUDA_PRIV_WORKSPACE_ALLOCATOR_HPP + +#include + +#include + +namespace cvcuda { + +class WorkspaceMemAllocator +{ +public: + WorkspaceMemAllocator(const WorkspaceMemAllocator &) = delete; + WorkspaceMemAllocator &operator=(const WorkspaceMemAllocator &) = delete; + + /** + * @brief Construct a new workspace memory allocator + * + * The function constructs a new allocator. Subsequent calls to `get` will obtain memory pointers to + * workspace entries. + * + * This function sets the default acquire and release streams, but doesn't call acquire - this is deferred to the + * first call to `get`. + * The streams can be overriden in manual calls to `acquire` and `release`. + * + * @param mem Workspace memory + * @param acquireReleaseStream A stream on which the data will be used (or nullopt to denote host usage) + */ + WorkspaceMemAllocator(const WorkspaceMem &mem, std::optional acquireReleaseStream = std::nullopt) + : WorkspaceMemAllocator(mem, acquireReleaseStream, acquireReleaseStream) + { + } + + /** + * @brief Construct a new workspace memory allocator + * + * The function constructs a new allocator. Subsequent calls to `get` will obtain memory pointers to + * workspace entries. + * + * This function sets the default acquire and release streams, but doesn't call acquire - this is deferred to the + * first call to `get`. + * The streams can be overriden in manual calls to `acquire` and `release`. + * + * @param mem Workspace memory + * @param acquireStream A stream on which the data will be used first (or nullopt to denote host usage) + * @param acquireStream A stream on which the data will be used last (or nullopt to denote host usage) + */ + WorkspaceMemAllocator(const WorkspaceMem &mem, std::optional acquireStream, + std::optional releaseStream) + : m_mem(mem) + , m_acquireStream(acquireStream) + , m_releaseStream(releaseStream) + { + } + + ~WorkspaceMemAllocator() + { + if (!m_released) + release(m_releaseStream); + } + + /** + * @brief Allocates `count` elements of type `T` from the workspace memory. + * + * This function calls `acquire` if not called explicitly before. 
+ * + * @tparam T the type of the object to get + * @param count the number of objects to allocate + * @param alignment the extra alignment, must not be less than `alignof(T)` + * @return T* a pointer to the workspace buffer where the requested object is located + */ + template + T *get(size_t count = 1, size_t alignment = alignof(T)) + { + assert(alignment >= alignof(T)); + + if (m_released) + throw std::logic_error("This workspace memory has been released."); + + if (!m_acquired && count > 0) + acquire(m_acquireStream); + + if ((uintptr_t)m_mem.data & (alignment - 1)) + { + throw nvcv::Exception( + nvcv::Status::ERROR_INVALID_ARGUMENT, + "The workspace base pointer is not aligned to match the required alignment of a workspace entry."); + } + + size_t offset = nvcv::detail::AlignUp(m_offset, alignment); + T *ret = reinterpret_cast(static_cast(m_mem.data) + offset); + size_t real_size = nvcv::detail::AlignUp(count * sizeof(T), alignment); + offset += real_size; + if (offset > m_mem.req.size) + throw nvcv::Exception(nvcv::Status::ERROR_OUT_OF_MEMORY, "Operator workspace too small."); + m_offset = offset; + return ret; + } + + constexpr size_t capacity() const + { + return m_mem.req.size; + } + + constexpr size_t allocated() const + { + return m_offset; + } + + /** + * @brief Waits for the memory to become ready for use on the acquire stream, if specified, or on host. + */ + void acquire(std::optional stream) + { + if (m_acquired) + throw std::logic_error("Acquire called multiple times"); + + if (m_released) + throw std::logic_error("This workspace memory has been released."); + + if (m_mem.ready) + { + if (stream) + { + if (cudaStreamWaitEvent(*stream, m_mem.ready) != cudaSuccess) + throw nvcv::Exception(nvcv::Status::ERROR_INTERNAL, "cudaStreamWairEvent failed"); + } + else + { + if (cudaEventSynchronize(m_mem.ready) != cudaSuccess) + throw nvcv::Exception(nvcv::Status::ERROR_INTERNAL, "cudaEventSynchronize failed"); + } + } + m_acquired = true; + } + + /** + * @brief Declares that the memory is ready for reuse by the release stream (if specified) or any thread or stream. 
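Taken together, get(), acquire() and release() give an operator a simple linear-allocation pattern over caller-provided memory. Below is a minimal sketch of that pattern inside an operator's launch path; it assumes the caller sized the workspace from the operator's getWorkspaceRequirements(), and that Workspace exposes hostMem/pinnedMem/cudaMem entries as used by the WorkspaceAllocator helper further down in this header.

    // Minimal sketch: carving per-sample scratch buffers out of a caller-provided
    // workspace. `ws` is assumed to satisfy the operator's workspace requirements.
    #include <cuda_runtime.h>
    #include <cvcuda/Workspace.hpp>
    #include "WorkspaceAllocator.hpp"

    void LaunchWithWorkspace(cudaStream_t stream, const cvcuda::Workspace &ws, int numSamples)
    {
        // Acquire/release are tied to `stream`; acquire happens lazily on the first
        // get(), release is handled by the destructor at the end of this scope.
        cvcuda::WorkspaceMemAllocator cudaMem(ws.cudaMem, stream);

        int   *perSampleCount = cudaMem.get<int>(numSamples);       // first entry
        float *perSampleSum   = cudaMem.get<float>(numSamples, 16); // second entry, 16-byte aligned

        // ... enqueue kernels on `stream` that use perSampleCount / perSampleSum ...
        (void)perSampleCount;
        (void)perSampleSum;
    }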
+ */ + void release(std::optional stream) + { + if (m_released) + throw std::logic_error("Release called multiple times"); + + if (m_mem.ready && m_offset) + { + assert(m_acquired); + + if (stream) + if (cudaEventRecord(m_mem.ready, *stream) != cudaSuccess) + throw nvcv::Exception(nvcv::Status::ERROR_INTERNAL, "cudaEventRecord failed"); + } + m_released = true; + } + +private: + WorkspaceMem m_mem; + size_t m_offset = 0; + bool m_acquired = false, m_released = false; + + std::optional m_acquireStream, m_releaseStream; +}; + +struct WorkspaceAllocator +{ +public: + explicit WorkspaceAllocator(const Workspace &ws) + : hostMem(ws.hostMem) + , pinnedMem(ws.pinnedMem) + , cudaMem(ws.cudaMem) + { + } + + template + T *getHost(size_t count = 1, size_t alignment = alignof(T)) + { + return hostMem.get(count, alignment); + } + + template + T *getPinned(size_t count = 1, size_t alignment = alignof(T)) + { + return pinnedMem.get(count, alignment); + } + + template + T *getCuda(size_t count = 1, size_t alignment = alignof(T)) + { + return cudaMem.get(count, alignment); + } + + WorkspaceMemAllocator hostMem; + WorkspaceMemAllocator pinnedMem; + WorkspaceMemAllocator cudaMem; +}; + +} // namespace cvcuda + +#endif // CVCUDA_PRIV_WORKSPACE_ALLOCATOR_HPP diff --git a/src/cvcuda/priv/WorkspaceEstimator.hpp b/src/cvcuda/priv/WorkspaceEstimator.hpp new file mode 100644 index 00000000..08ec5936 --- /dev/null +++ b/src/cvcuda/priv/WorkspaceEstimator.hpp @@ -0,0 +1,90 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CVCUDA_PRIV_WORKSPACE_ESTIMATOR_HPP +#define CVCUDA_PRIV_WORKSPACE_ESTIMATOR_HPP + +#include + +namespace cvcuda { + +struct WorkspaceMemEstimator +{ + explicit WorkspaceMemEstimator(size_t initial_size = 0, size_t base_alignment = alignof(std::max_align_t)) + : req{initial_size, base_alignment} + { + } + + WorkspaceMemRequirements req; + + template + WorkspaceMemEstimator &add(size_t count = 1, size_t alignment = alignof(T)) + { + if (alignment > req.alignment) + req.alignment = alignment; + req.size = nvcv::detail::AlignUp(req.size, alignment); + req.size += nvcv::detail::AlignUp(count * sizeof(T), alignment); + return *this; + } +}; + +struct WorkspaceEstimator +{ + static constexpr size_t kDefaultPinnedAlignment = 256; + static constexpr size_t kDefaultDeviceAlignment = 256; + + WorkspaceMemEstimator hostMem; + WorkspaceMemEstimator pinnedMem{0, kDefaultPinnedAlignment}; + WorkspaceMemEstimator cudaMem{0, kDefaultDeviceAlignment}; + + template + WorkspaceEstimator &add(bool host, bool pinned, bool cuda, size_t count = 1, size_t alignment = alignof(T)) + { + if (host) + addHost(count, alignment); + if (pinned) + addPinned(count, alignment); + if (cuda) + addCuda(count, alignment); + return *this; + } + + template + WorkspaceEstimator &addHost(size_t count = 1, size_t alignment = alignof(T)) + { + hostMem.add(count, alignment); + return *this; + } + + template + WorkspaceEstimator &addPinned(size_t count = 1, size_t alignment = alignof(T)) + { + pinnedMem.add(count, alignment); + return *this; + } + + template + WorkspaceEstimator &addCuda(size_t count = 1, size_t alignment = alignof(T)) + { + cudaMem.add(count, alignment); + return *this; + } +}; + +} // namespace cvcuda + +#endif // CVCUDA_PRIV_WORKSPACE_ESTIMATOR_HPP diff --git a/src/cvcuda/priv/WorkspaceUtil.hpp b/src/cvcuda/priv/WorkspaceUtil.hpp new file mode 100644 index 00000000..6a174171 --- /dev/null +++ b/src/cvcuda/priv/WorkspaceUtil.hpp @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
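The estimator above is the requirements-side mirror of WorkspaceMemAllocator: an operator issues the same typed add<T>() calls at estimation time that it later issues as get<T>() calls at launch time, so sizes and alignments stay in sync by construction. A hedged sketch follows; it assumes WorkspaceRequirements exposes hostMem/pinnedMem/cudaMem fields matching the Workspace layout.

    // Hedged sketch: computing workspace requirements that pair with the
    // LaunchWithWorkspace sketch given alongside WorkspaceAllocator.hpp above.
    #include <cvcuda/Workspace.hpp>
    #include "WorkspaceEstimator.hpp"

    cvcuda::WorkspaceRequirements EstimateMyOpWorkspace(int numSamples)
    {
        cvcuda::WorkspaceEstimator est;
        est.addCuda<int>(numSamples);       // mirrors get<int>(numSamples) at launch time
        est.addCuda<float>(numSamples, 16); // mirrors get<float>(numSamples, 16)

        cvcuda::WorkspaceRequirements req{};
        req.hostMem   = est.hostMem.req;
        req.pinnedMem = est.pinnedMem.req;
        req.cudaMem   = est.cudaMem.req;
        return req;
    }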
+ */ + +#ifndef CVCUDA_PRIV_WORKSPACE_UTIL_HPP +#define CVCUDA_PRIV_WORKSPACE_UTIL_HPP + +#include "WorkspaceAllocator.hpp" +#include "WorkspaceEstimator.hpp" + +#endif // CVCUDA_PRIV_WORKSPACE_UTIL_HPP diff --git a/src/cvcuda/priv/legacy/CMakeLists.txt b/src/cvcuda/priv/legacy/CMakeLists.txt index 99c09da1..11a2a517 100644 --- a/src/cvcuda/priv/legacy/CMakeLists.txt +++ b/src/cvcuda/priv/legacy/CMakeLists.txt @@ -68,7 +68,6 @@ set(CV_CUDA_PRIV_LEGACY_OP_FILES adaptive_threshold_var_shape.cu threshold_var_shape.cu threshold_util.cu - bnd_box.cu box_blur.cu osd.cu textbackend/backend.cpp diff --git a/src/cvcuda/priv/legacy/CvCudaLegacy.h b/src/cvcuda/priv/legacy/CvCudaLegacy.h index 4b24a7f1..f2919dd9 100644 --- a/src/cvcuda/priv/legacy/CvCudaLegacy.h +++ b/src/cvcuda/priv/legacy/CvCudaLegacy.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,11 @@ namespace nvcv::legacy::cuda_op { +using cvcuda::Workspace; +using cvcuda::WorkspaceMem; +using cvcuda::WorkspaceMemRequirements; +using cvcuda::WorkspaceRequirements; + enum ErrorCode { SUCCESS = 0, @@ -294,13 +300,6 @@ class ConvertTo : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const double alpha, const double beta, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class CustomCrop : public CudaBaseOp @@ -374,13 +373,6 @@ class CustomCrop : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVRectI roi, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class MinAreaRect : public CudaBaseOp @@ -527,14 +519,6 @@ class Flip : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, const int32_t flipCode, cudaStream_t stream); - - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class FlipOrCopyVarShape : public CudaBaseOp @@ -600,12 +584,6 @@ class FlipOrCopyVarShape : public CudaBaseOp */ ErrorCode infer(const ImageBatchVarShapeDataStridedCuda &input, const ImageBatchVarShapeDataStridedCuda &output, const TensorDataStridedCuda &flipCode, cudaStream_t stream); - - /** - * @brief calculate the gpu buffer size needed by this operator - * @param maxBatchSize Maximum batch size that may be used - */ - size_t calBufferSize(int maxBatchSize); }; class Reformat : public CudaBaseOp @@ -677,14 +655,8 @@ class Reformat : public CudaBaseOp * @param stream for the 
asynchronous execution. */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - void checkDataFormat(DataFormat format); + + void checkDataFormat(DataFormat format); }; class Resize : public CudaBaseOp @@ -753,13 +725,6 @@ class Resize : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const NVCVInterpolationType interpolation, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class Morphology : public CudaBaseOp @@ -986,14 +951,8 @@ class Normalize : public CudaBaseOp const TensorDataStridedCuda &scaleData, const TensorDataStridedCuda &outData, const float global_scale, const float shift, const float epsilon, const uint32_t flags, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - void checkParamShape(DataShape input_shape, DataShape param_shape); + + void checkParamShape(DataShape input_shape, DataShape param_shape); }; class PadAndStack : public CudaBaseOp @@ -1059,8 +1018,6 @@ class PadAndStack : public CudaBaseOp ErrorCode infer(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, const NVCVBorderType borderMode, const float borderValue, cudaStream_t stream); - - size_t calBufferSize(int batch_size); }; class Rotate : public CudaBaseOp @@ -1125,13 +1082,6 @@ class MedianBlur : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const nvcv::Size2D ksize, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class NormalizeVarShape : public CudaBaseOp @@ -1386,13 +1336,6 @@ class CenterCrop : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int crop_rows, int crop_columns, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output 
DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class RotateVarShape : public CudaBaseOp @@ -1496,14 +1439,6 @@ class Laplacian : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int ksize, float scale, NVCVBorderType borderMode, cudaStream_t stream); - - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class Gaussian : public CudaBaseOp @@ -2227,43 +2162,22 @@ class OSD : public CudaBaseOp ~OSD(); /** - * @brief Converts an image from one color space to another. + * @brief Draw OSD elements onto input tensor, then return back output tensor. * @param inData Input tensor. * @param outData Output tensor. - * @param elements OSD elements, \ref NVCVElement. + * @param elements OSD elements, \ref NVCVElements. */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVElements elements, cudaStream_t stream); /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - -private: - nvcv::cuda::osd::cuOSDContext_t m_context; -}; - -class BndBox : public CudaBaseOp -{ -public: - BndBox() = delete; - - BndBox(DataShape max_input_shape, DataShape max_output_shape); - - ~BndBox(); - - /** - * @brief Converts an image from one color space to another. + * @brief Draw BndBox elements onto input tensor, then return back output tensor. * @param inData Input tensor. * @param outData Output tensor. * @param boxes Bounding box rectangle, \ref NVCVBndBoxesI. 
*/ - ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVBndBoxesI bboxes, - cudaStream_t stream); + ErrorCode inferBox(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVBndBoxesI bboxes, + cudaStream_t stream); /** * @brief calculate the cpu/gpu buffer size needed by this operator @@ -2326,14 +2240,6 @@ class CvtColor : public CudaBaseOp */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVColorConversionCode code, cudaStream_t stream); - - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class WarpAffine : public CudaBaseOp @@ -2394,13 +2300,6 @@ class WarpPerspective : public CudaBaseOp ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const float *transMatrix, const int32_t flags, const NVCVBorderType borderMode, const float4 borderValue, cudaStream_t stream); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); }; class WarpPerspectiveVarShape : public CudaBaseOp @@ -2502,12 +2401,6 @@ class CvtColorVarShape : public CudaBaseOp */ ErrorCode infer(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, NVCVColorConversionCode code, cudaStream_t stream); - - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param batch_size maximum input batch size - */ - size_t calBufferSize(int batch_size); }; class Composite : public CudaBaseOp @@ -2602,12 +2495,6 @@ class CompositeVarShape : public CudaBaseOp class PillowResize : public CudaBaseOp { public: - PillowResize() = delete; - - PillowResize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - - ~PillowResize(); - /** * @brief Resizes the input images. The function resize resizes the image down to or up to the specified size. * @param inputs gpu pointer, inputs[0] are batched input images, whose shape is input_shape and type is data_type. 
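// Illustrative sketch, not part of the patch: with the standalone BndBox operator folded into
// the legacy OSD operator, bounding boxes are now drawn through OSD::inferBox() using the
// signature shown above. The wrapper name drawBoxes and the pre-built tensor data / box list
// are assumptions; the error handling mirrors the ErrorCode/LOG_ERROR convention used in these files.
static nvcv::legacy::cuda_op::ErrorCode drawBoxes(nvcv::legacy::cuda_op::OSD &osd,
                                                  const nvcv::TensorDataStridedCuda &in,
                                                  const nvcv::TensorDataStridedCuda &out,
                                                  NVCVBndBoxesI boxes, cudaStream_t stream)
{
    // Same layout rules as infer(): kNHWC/kHWC tensors with 3 or 4 channels.
    auto err = osd.inferBox(in, out, boxes, stream);
    if (err != nvcv::legacy::cuda_op::ErrorCode::SUCCESS)
    {
        LOG_ERROR("OSD::inferBox failed, err - " << err);
    }
    return err;
}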
@@ -2623,29 +2510,15 @@ class PillowResize : public CudaBaseOp * */ ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream); + const NVCVInterpolationType interpolation, cudaStream_t stream, const Workspace &workspace); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - -private: - void *gpu_workspace; + NVCVWorkspaceRequirements getWorkspaceRequirements(DataShape max_input_shape, DataShape max_output_shape, + DataType max_data_type); }; class PillowResizeVarShape : public CudaBaseOp { public: - PillowResizeVarShape() = delete; - - PillowResizeVarShape(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - - ~PillowResizeVarShape(); - /** * @brief Resizes the input images. The function resize resizes the image down to or up to the specified size. * @param inputs gpu pointer, inputs[0] are batched input images, whose shape is input_shape and type is data_type. @@ -2664,19 +2537,10 @@ class PillowResizeVarShape : public CudaBaseOp * */ ErrorCode infer(const ImageBatchVarShape &inData, const ImageBatchVarShape &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream); + const NVCVInterpolationType interpolation, cudaStream_t stream, const Workspace &workspace); - /** - * @brief calculate the cpu/gpu buffer size needed by this operator - * @param max_input_shape maximum input DataShape that may be used - * @param max_output_shape maximum output DataShape that may be used - * @param max_data_type DataType with the maximum size that may be used - */ - size_t calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type); - -private: - void *gpu_workspace = nullptr; - void *cpu_workspace = nullptr; + NVCVWorkspaceRequirements getWorkspaceRequirements(DataShape max_input_shape, DataShape max_output_shape, + DataType max_data_type); }; class Threshold : public CudaBaseOp diff --git a/src/cvcuda/priv/legacy/CvCudaOSD.hpp b/src/cvcuda/priv/legacy/CvCudaOSD.hpp index 0a33a6dc..4b3a8303 100644 --- a/src/cvcuda/priv/legacy/CvCudaOSD.hpp +++ b/src/cvcuda/priv/legacy/CvCudaOSD.hpp @@ -320,10 +320,6 @@ struct cuOSDContext std::unique_ptr> gpu_commands; std::unique_ptr> gpu_commands_offset; - // For OpBndBox only, to be deprecated. - std::vector> rect_commands; - std::unique_ptr> gpu_rect_commands; - std::vector> blur_commands; std::unique_ptr> gpu_blur_commands; diff --git a/src/cvcuda/priv/legacy/bnd_box.cu b/src/cvcuda/priv/legacy/bnd_box.cu deleted file mode 100644 index 6d68507a..00000000 --- a/src/cvcuda/priv/legacy/bnd_box.cu +++ /dev/null @@ -1,573 +0,0 @@ -/* Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * SPDX-License-Identifier: Apache-2.0 - * - * Copyright (C) 2021-2022, Bytedance Inc. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
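// Illustrative sketch of the new caller-managed workspace flow for the Pillow resize operators
// declared above (scratch memory is no longer cudaMalloc'd in their constructors). Only
// getWorkspaceRequirements(), infer(..., workspace) and the cudaMem.size / .data / .ready fields
// are taken from this patch; the allocation and event handling below, and the assumption that
// Workspace aliases NVCVWorkspace, are one possible way for a caller to satisfy them.
static void resizeWithWorkspace(nvcv::legacy::cuda_op::PillowResize &op,
                                const nvcv::TensorDataStridedCuda   &inData,
                                const nvcv::TensorDataStridedCuda   &outData,
                                DataShape maxIn, DataShape maxOut, DataType maxType,
                                cudaStream_t stream)
{
    NVCVWorkspaceRequirements req = op.getWorkspaceRequirements(maxIn, maxOut, maxType);

    NVCVWorkspace ws{}; // this operator only needs device scratch memory
    NVCV_CHECK_LOG(cudaMalloc(&ws.cudaMem.data, req.cudaMem.size));
    NVCV_CHECK_LOG(cudaEventCreateWithFlags(&ws.cudaMem.ready, cudaEventDisableTiming));

    // infer() waits on cudaMem.ready before touching the buffer and records it again once its
    // kernels are enqueued, so the same workspace can be reused or released safely afterwards.
    op.infer(inData, outData, NVCV_INTERP_LINEAR, stream, ws);
}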
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -#include "CvCudaLegacy.h" -#include "CvCudaLegacyHelpers.hpp" - -#include "CvCudaUtils.cuh" - -#include -#include -#include - -#include - -using namespace nvcv::legacy::cuda_op; -using namespace nvcv::legacy::helpers; -using namespace nvcv::cuda::osd; - -namespace nvcv::legacy::cuda_op { - -template -static __host__ __device__ uint8_t u8cast(_T value) -{ - return value < 0 ? 0 : (value > 255 ? 255 : value); -} - -// inbox_single_pixel: -// check if given coordinate is in box -// a --- d -// | | -// b --- c -static __device__ __forceinline__ bool inbox_single_pixel(float ix, float iy, float ax, float ay, float bx, float by, - float cx, float cy, float dx, float dy) -{ - return ((bx - ax) * (iy - ay) - (by - ay) * (ix - ax)) < 0 && ((cx - bx) * (iy - by) - (cy - by) * (ix - bx)) < 0 - && ((dx - cx) * (iy - cy) - (dy - cy) * (ix - cx)) < 0 && ((ax - dx) * (iy - dy) - (ay - dy) * (ix - dx)) < 0; -} - -static __device__ void blend_single_color(uchar4 &color, uint8_t &c0, uint8_t &c1, uint8_t &c2, uint8_t a) -{ - int foreground_alpha = a; - int background_alpha = color.w; - int blend_alpha = ((background_alpha * (255 - foreground_alpha)) >> 8) + foreground_alpha; - color.x = u8cast((((color.x * background_alpha * (255 - foreground_alpha)) >> 8) + (c0 * foreground_alpha)) - / blend_alpha); - color.y = u8cast((((color.y * background_alpha * (255 - foreground_alpha)) >> 8) + (c1 * foreground_alpha)) - / blend_alpha); - color.z = u8cast((((color.z * background_alpha * (255 - foreground_alpha)) >> 8) + (c2 * foreground_alpha)) - / blend_alpha); - color.w = blend_alpha; -} - -// render_rectangle_fill: -// render filled rectangle with border msaa4x interpolation off -static __device__ void render_rectangle_fill(int ix, int iy, RectangleCommand *p, uchar4 color[4]) -{ - if (inbox_single_pixel(ix, iy, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[0], p->c0, p->c1, p->c2, p->c3); - } - if (inbox_single_pixel(ix + 1, iy, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[1], p->c0, p->c1, p->c2, p->c3); - } - if (inbox_single_pixel(ix, iy + 1, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[2], p->c0, p->c1, p->c2, p->c3); - } - if (inbox_single_pixel(ix + 1, iy + 1, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[3], p->c0, p->c1, p->c2, p->c3); - } -} - -// render_rectangle_border: -// render hollow rectangle with border msaa4x interpolation off -static __device__ void render_rectangle_border(int ix, int iy, RectangleCommand *p, uchar4 color[4]) -{ - if (!inbox_single_pixel(ix, iy, p->ax2, p->ay2, p->bx2, p->by2, p->cx2, p->cy2, p->dx2, p->dy2) - && inbox_single_pixel(ix, iy, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[0], p->c0, p->c1, p->c2, p->c3); - } - if (!inbox_single_pixel(ix + 1, iy, p->ax2, p->ay2, p->bx2, p->by2, p->cx2, p->cy2, p->dx2, p->dy2) - && inbox_single_pixel(ix + 1, iy, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, 
p->dx1, p->dy1)) - { - blend_single_color(color[1], p->c0, p->c1, p->c2, p->c3); - } - if (!inbox_single_pixel(ix, iy + 1, p->ax2, p->ay2, p->bx2, p->by2, p->cx2, p->cy2, p->dx2, p->dy2) - && inbox_single_pixel(ix, iy + 1, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[2], p->c0, p->c1, p->c2, p->c3); - } - if (!inbox_single_pixel(ix + 1, iy + 1, p->ax2, p->ay2, p->bx2, p->by2, p->cx2, p->cy2, p->dx2, p->dy2) - && inbox_single_pixel(ix + 1, iy + 1, p->ax1, p->ay1, p->bx1, p->by1, p->cx1, p->cy1, p->dx1, p->dy1)) - { - blend_single_color(color[3], p->c0, p->c1, p->c2, p->c3); - } -} - -static __device__ void do_rectangle_woMSAA(RectangleCommand *cmd, int ix, int iy, uchar4 context_color[4]) -{ - if (cmd->thickness == -1) - { - render_rectangle_fill(ix, iy, cmd, context_color); - } - else - { - render_rectangle_border(ix, iy, cmd, context_color); - } -} - -template -static __device__ void blending_rgb_pixel(SrcWrapper src, DstWrapper dst, int x, int y, uchar4 plot_colors[4]) -{ - const int batch_idx = get_batch_idx(); - - for (int i = 0; i < 2; ++i) - { - T *in = src.ptr(batch_idx, y + i, x, 0); - T *out = dst.ptr(batch_idx, y + i, x, 0); - for (int j = 0; j < 2; ++j, in += 3, out += 3) - { - uchar4 &rcolor = plot_colors[i * 2 + j]; - int foreground_alpha = rcolor.w; - int background_alpha = 255; - int blend_alpha = ((background_alpha * (255 - foreground_alpha)) >> 8) + foreground_alpha; - out[0] - = u8cast((((in[0] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.x * foreground_alpha)) - / blend_alpha); - out[1] - = u8cast((((in[1] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.y * foreground_alpha)) - / blend_alpha); - out[2] - = u8cast((((in[2] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.z * foreground_alpha)) - / blend_alpha); - } - } -} - -template -static __device__ void blending_rgba_pixel(SrcWrapper src, DstWrapper dst, int x, int y, uchar4 plot_colors[4]) -{ - const int batch_idx = get_batch_idx(); - - for (int i = 0; i < 2; ++i) - { - T *in = src.ptr(batch_idx, y + i, x, 0); - T *out = dst.ptr(batch_idx, y + i, x, 0); - for (int j = 0; j < 2; ++j, in += 4, out += 4) - { - uchar4 &rcolor = plot_colors[i * 2 + j]; - int foreground_alpha = rcolor.w; - int background_alpha = in[3]; - int blend_alpha = ((background_alpha * (255 - foreground_alpha)) >> 8) + foreground_alpha; - out[0] - = u8cast((((in[0] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.x * foreground_alpha)) - / blend_alpha); - out[1] - = u8cast((((in[1] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.y * foreground_alpha)) - / blend_alpha); - out[2] - = u8cast((((in[2] * background_alpha * (255 - foreground_alpha)) >> 8) + (rcolor.z * foreground_alpha)) - / blend_alpha); - out[3] = blend_alpha; - } - } -} - -template -static __global__ void render_bndbox_rgb_womsaa_kernel(SrcWrapper src, DstWrapper dst, int bx, int by, - const RectangleCommand *commands, int num_command, int width, - int height, bool inplace) -{ - int ix = ((blockDim.x * blockIdx.x + threadIdx.x) << 1) + bx; - int iy = ((blockDim.y * blockIdx.y + threadIdx.y) << 1) + by; - if (ix < 0 || iy < 0 || ix >= width - 1 || iy >= height - 1) - return; - - uchar4 context_color[4] = {0}; - const int batch_idx = get_batch_idx(); - - for (int i = 0; i < num_command; ++i) - { - RectangleCommand pcommand = commands[i]; - if (pcommand.batch_index != batch_idx) - continue; - do_rectangle_woMSAA(&pcommand, ix, iy, context_color); - } - - if 
(context_color[0].w == 0 && context_color[1].w == 0 && context_color[2].w == 0 && context_color[3].w == 0) - { - if (inplace) - return; - *(uchar3 *)(dst.ptr(batch_idx, iy, ix, 0)) = *(uchar3 *)(src.ptr(batch_idx, iy, ix, 0)); - *(uchar3 *)(dst.ptr(batch_idx, iy, ix + 1, 0)) = *(uchar3 *)(src.ptr(batch_idx, iy, ix + 1, 0)); - *(uchar3 *)(dst.ptr(batch_idx, iy + 1, ix, 0)) = *(uchar3 *)(src.ptr(batch_idx, iy + 1, ix, 0)); - *(uchar3 *)(dst.ptr(batch_idx, iy + 1, ix + 1, 0)) = *(uchar3 *)(src.ptr(batch_idx, iy + 1, ix + 1, 0)); - return; - } - - blending_rgb_pixel(src, dst, ix, iy, context_color); -} - -template -static __global__ void render_bndbox_rgba_womsaa_kernel(SrcWrapper src, DstWrapper dst, int bx, int by, - const RectangleCommand *commands, int num_command, int width, - int height, bool inplace) -{ - int ix = ((blockDim.x * blockIdx.x + threadIdx.x) << 1) + bx; - int iy = ((blockDim.y * blockIdx.y + threadIdx.y) << 1) + by; - if (ix < 0 || iy < 0 || ix >= width - 1 || iy >= height - 1) - return; - - uchar4 context_color[4] = {0}; - const int batch_idx = get_batch_idx(); - - for (int i = 0; i < num_command; ++i) - { - RectangleCommand pcommand = commands[i]; - if (pcommand.batch_index != batch_idx) - continue; - do_rectangle_woMSAA(&pcommand, ix, iy, context_color); - } - - if (context_color[0].w == 0 && context_color[1].w == 0 && context_color[2].w == 0 && context_color[3].w == 0) - { - if (inplace) - return; - *(uchar4 *)(dst.ptr(batch_idx, iy, ix, 0)) = *(uchar4 *)(src.ptr(batch_idx, iy, ix, 0)); - *(uchar4 *)(dst.ptr(batch_idx, iy, ix + 1, 0)) = *(uchar4 *)(src.ptr(batch_idx, iy, ix + 1, 0)); - *(uchar4 *)(dst.ptr(batch_idx, iy + 1, ix, 0)) = *(uchar4 *)(src.ptr(batch_idx, iy + 1, ix, 0)); - *(uchar4 *)(dst.ptr(batch_idx, iy + 1, ix + 1, 0)) = *(uchar4 *)(src.ptr(batch_idx, iy + 1, ix + 1, 0)); - return; - } - - blending_rgba_pixel(src, dst, ix, iy, context_color); -} - -static ErrorCode cuosd_draw_rectangle(cuOSDContext_t context, int width, int height, NVCVBndBoxesI bboxes) -{ - for (int n = 0; n < bboxes.batch; n++) - { - auto numBoxes = bboxes.numBoxes[n]; - - for (int i = 0; i < numBoxes; i++) - { - auto bbox = bboxes.boxes[i]; - int left = max(min(bbox.box.x, width - 1), 0); - int top = max(min(bbox.box.y, height - 1), 0); - int right = max(min(left + bbox.box.width - 1, width - 1), 0); - int bottom = max(min(top + bbox.box.height - 1, height - 1), 0); - - if (left == right || top == bottom || bbox.box.width <= 0 || bbox.box.height <= 0) - { - LOG_DEBUG("Skipped bnd_box(" << bbox.box.x << ", " << bbox.box.y << ", " << bbox.box.width << ", " - << bbox.box.height << ") in image(" << width << ", " << height << ")"); - continue; - } - - if (bbox.borderColor.a == 0) - continue; - if (bbox.fillColor.a || bbox.thickness == -1) - { - if (bbox.thickness == -1) - { - bbox.fillColor = bbox.borderColor; - } - - auto cmd = std::make_shared(); - cmd->batch_index = n; - cmd->thickness = -1; - cmd->interpolation = false; - cmd->c0 = bbox.fillColor.r; - cmd->c1 = bbox.fillColor.g; - cmd->c2 = bbox.fillColor.b; - cmd->c3 = bbox.fillColor.a; - - // a d - // b c - cmd->ax1 = left; - cmd->ay1 = top; - cmd->dx1 = right; - cmd->dy1 = top; - cmd->cx1 = right; - cmd->cy1 = bottom; - cmd->bx1 = left; - cmd->by1 = bottom; - cmd->bounding_left = left; - cmd->bounding_right = right; - cmd->bounding_top = top; - cmd->bounding_bottom = bottom; - context->rect_commands.emplace_back(cmd); - } - if (bbox.thickness == -1) - continue; - - auto cmd = std::make_shared(); - cmd->batch_index = n; - 
cmd->thickness = bbox.thickness; - cmd->interpolation = false; - cmd->c0 = bbox.borderColor.r; - cmd->c1 = bbox.borderColor.g; - cmd->c2 = bbox.borderColor.b; - cmd->c3 = bbox.borderColor.a; - - float half_thickness = bbox.thickness / 2.0f; - cmd->ax2 = left + half_thickness; - cmd->ay2 = top + half_thickness; - cmd->dx2 = right - half_thickness; - cmd->dy2 = top + half_thickness; - cmd->cx2 = right - half_thickness; - cmd->cy2 = bottom - half_thickness; - cmd->bx2 = left + half_thickness; - cmd->by2 = bottom - half_thickness; - - // a d - // b c - cmd->ax1 = left - half_thickness; - cmd->ay1 = top - half_thickness; - cmd->dx1 = right + half_thickness; - cmd->dy1 = top - half_thickness; - cmd->cx1 = right + half_thickness; - cmd->cy1 = bottom + half_thickness; - cmd->bx1 = left - half_thickness; - cmd->by1 = bottom + half_thickness; - - int int_half = ceil(half_thickness); - cmd->bounding_left = left - int_half; - cmd->bounding_right = right + int_half; - cmd->bounding_top = top - int_half; - cmd->bounding_bottom = bottom + int_half; - context->rect_commands.emplace_back(cmd); - } - - bboxes.boxes = (NVCVBndBoxI *)((uint8_t *)bboxes.boxes + numBoxes * sizeof(NVCVBndBoxI)); - } - return ErrorCode::SUCCESS; -} - -static void cuosd_apply(cuOSDContext_t context, int width, int height, cudaStream_t stream) -{ - context->bounding_left = width; - context->bounding_top = height; - context->bounding_right = 0; - context->bounding_bottom = 0; - - for (int i = 0; i < (int)context->rect_commands.size(); ++i) - { - auto &cmd = context->rect_commands[i]; - context->bounding_left = min(context->bounding_left, cmd->bounding_left); - context->bounding_top = min(context->bounding_top, cmd->bounding_top); - context->bounding_right = max(context->bounding_right, cmd->bounding_right); - context->bounding_bottom = max(context->bounding_bottom, cmd->bounding_bottom); - } - - if (context->gpu_rect_commands == nullptr) - { - context->gpu_rect_commands.reset(new Memory()); - } - - context->gpu_rect_commands->alloc_or_resize_to(context->rect_commands.size()); - - for (int i = 0; i < (int)context->rect_commands.size(); ++i) - { - auto &cmd = context->rect_commands[i]; - memcpy((void *)(context->gpu_rect_commands->host() + i), cmd.get(), sizeof(RectangleCommand)); - } - - context->gpu_rect_commands->copy_host_to_device(stream); -} - -inline ErrorCode ApplyBndBox_RGB(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - cuOSDContext_t context, cudaStream_t stream) -{ - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); - NVCV_ASSERT(inAccess); - - cuda_op::DataType inDataType = helpers::GetLegacyDataType(inData.dtype()); - cuda_op::DataShape inputShape = helpers::GetLegacyDataShape(inAccess->infoShape()); - - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); - NVCV_ASSERT(outAccess); - - cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); - cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - - if (outDataType != inDataType) - { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); - return ErrorCode::INVALID_DATA_TYPE; - } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != inputShape.C || outputShape.C != 3) - { - LOG_ERROR("Invalid output shape " << outputShape); - return ErrorCode::INVALID_DATA_SHAPE; - } - - cuosd_apply(context, inputShape.W, inputShape.H, stream); - - 
dim3 blockSize(16, 8); - dim3 gridSize(divUp(int((inputShape.W + 1) / 2), (int)blockSize.x), - divUp(int((inputShape.H + 1) / 2), (int)blockSize.y), inputShape.N); - - auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); - - render_bndbox_rgb_womsaa_kernel<<>>( - src, dst, 0, 0, context->gpu_rect_commands ? context->gpu_rect_commands->device() : nullptr, - context->rect_commands.size(), inputShape.W, inputShape.H, inData.basePtr() == outData.basePtr()); - checkKernelErrors(); - - return ErrorCode::SUCCESS; -} - -inline ErrorCode ApplyBndBox_RGBA(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - cuOSDContext_t context, cudaStream_t stream) -{ - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); - NVCV_ASSERT(inAccess); - - cuda_op::DataType inDataType = helpers::GetLegacyDataType(inData.dtype()); - cuda_op::DataShape inputShape = helpers::GetLegacyDataShape(inAccess->infoShape()); - - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); - NVCV_ASSERT(outAccess); - - cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); - cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - - if (outDataType != inDataType) - { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); - return ErrorCode::INVALID_DATA_TYPE; - } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != inputShape.C || outputShape.C != 4) - { - LOG_ERROR("Invalid output shape " << outputShape); - return ErrorCode::INVALID_DATA_SHAPE; - } - - cuosd_apply(context, inputShape.W, inputShape.H, stream); - - dim3 blockSize(16, 8); - dim3 gridSize(divUp(int((inputShape.W + 1) / 2), (int)blockSize.x), - divUp(int((inputShape.H + 1) / 2), (int)blockSize.y), inputShape.N); - - auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); - - render_bndbox_rgba_womsaa_kernel<<>>( - src, dst, 0, 0, context->gpu_rect_commands ? 
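// Side note on the launch geometry used by these render kernels (the surviving box_blur/osd
// kernels use the same scheme): each thread owns a 2x2 pixel quad, so the grid covers
// ceil(W/2) x ceil(H/2) positions per image and the thread index is shifted left by one bit to
// get the quad's top-left pixel. Minimal stand-in kernel, not the actual renderer:
__global__ void perQuadKernel(int width, int height)
{
    int ix = (blockDim.x * blockIdx.x + threadIdx.x) << 1; // top-left x of this thread's quad
    int iy = (blockDim.y * blockIdx.y + threadIdx.y) << 1; // top-left y
    if (ix >= width - 1 || iy >= height - 1)
        return;
    // ...process (ix, iy), (ix + 1, iy), (ix, iy + 1), (ix + 1, iy + 1)...
}
// Matching launch, with divUp as used in this file:
//   dim3 block(16, 8);
//   dim3 grid(divUp((width + 1) / 2, (int)block.x), divUp((height + 1) / 2, (int)block.y), batch);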
context->gpu_rect_commands->device() : nullptr, - context->rect_commands.size(), inputShape.W, inputShape.H, inData.basePtr() == outData.basePtr()); - checkKernelErrors(); - - return ErrorCode::SUCCESS; -} - -BndBox::BndBox(DataShape max_input_shape, DataShape max_output_shape) - : CudaBaseOp(max_input_shape, max_output_shape) -{ - m_context = new cuOSDContext(); - if (m_context->gpu_rect_commands == nullptr) - { - m_context->gpu_rect_commands.reset(new Memory()); - } - m_context->gpu_rect_commands->alloc_or_resize_to(PREALLOC_CMD_NUM * sizeof(RectangleCommand)); -} - -BndBox::~BndBox() -{ - if (m_context) - { - m_context->rect_commands.clear(); - cuOSDContext *p = (cuOSDContext *)m_context; - delete p; - } -} - -size_t BndBox::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - -ErrorCode BndBox::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - NVCVBndBoxesI bboxes, cudaStream_t stream) -{ - cuda_op::DataFormat input_format = GetLegacyDataFormat(inData.layout()); - cuda_op::DataFormat output_format = GetLegacyDataFormat(outData.layout()); - - if (!(input_format == kNHWC || input_format == kHWC) || !(output_format == kNHWC || output_format == kHWC)) - { - LOG_ERROR("Invliad DataFormat both Input and Output must be kNHWC or kHWC"); - return ErrorCode::INVALID_DATA_FORMAT; - } - - if (inData.dtype() != outData.dtype()) - { - LOG_ERROR("Input and Output formats must be same input format =" << inData.dtype() - << " output format = " << outData.dtype()); - return ErrorCode::INVALID_DATA_FORMAT; - } - - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); - if (!inAccess) - { - return ErrorCode::INVALID_DATA_FORMAT; - } - - int batch = inAccess->numSamples(); - int channels = inAccess->numChannels(); - int rows = inAccess->numRows(); - int cols = inAccess->numCols(); - - if (channels > 4 || channels < 1) - { - LOG_ERROR("Invalid channel number ch = " << channels); - return ErrorCode::INVALID_DATA_SHAPE; - } - - if (bboxes.batch != batch) - { - LOG_ERROR("Invalid bboxes batch = " << bboxes.batch); - return ErrorCode::INVALID_DATA_SHAPE; - } - - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); - if (!outAccess) - { - return ErrorCode::INVALID_DATA_FORMAT; - } - - auto ret = cuosd_draw_rectangle(m_context, cols, rows, bboxes); - if (ret != ErrorCode::SUCCESS) - { - return ret; - } - - typedef ErrorCode (*func_t)(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - cuOSDContext_t context, cudaStream_t stream); - - static const func_t funcs[] = { - ApplyBndBox_RGB, - ApplyBndBox_RGBA, - }; - - int type_idx = channels - 3; - funcs[type_idx](inData, outData, m_context, stream); - m_context->rect_commands.clear(); // Clear the command buffer so next render does not contain previous boxes. 
- m_context->blur_commands.clear(); - return ErrorCode::SUCCESS; -} - -} // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/box_blur.cu b/src/cvcuda/priv/legacy/box_blur.cu index 02c26103..aa3637d4 100644 --- a/src/cvcuda/priv/legacy/box_blur.cu +++ b/src/cvcuda/priv/legacy/box_blur.cu @@ -23,6 +23,7 @@ #include "CvCudaUtils.cuh" +#include #include #include #include @@ -32,6 +33,7 @@ using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; using namespace nvcv::cuda::osd; +using namespace cvcuda::priv; namespace nvcv::legacy::cuda_op { @@ -327,15 +329,15 @@ inline ErrorCode ApplyBoxBlur_RGBA(const nvcv::TensorDataStridedCuda &inData, return ErrorCode::SUCCESS; } -static ErrorCode cuosd_draw_boxblur(cuOSDContext_t context, int width, int height, NVCVBlurBoxesI bboxes) +static ErrorCode cuosd_draw_boxblur(cuOSDContext_t context, int width, int height, NVCVBlurBoxesImpl *bboxes) { - for (int n = 0; n < bboxes.batch; n++) + for (int n = 0; n < bboxes->batch(); n++) { - auto numBoxes = bboxes.numBoxes[n]; + auto numBoxes = bboxes->numBoxesAt(n); for (int i = 0; i < numBoxes; i++) { - auto bbox = bboxes.boxes[i]; + auto bbox = bboxes->boxAt(n, i); int left = max(min(bbox.box.x, width - 1), 0); int top = max(min(bbox.box.y, height - 1), 0); int right = max(min(left + bbox.box.width - 1, width - 1), 0); @@ -367,8 +369,6 @@ static ErrorCode cuosd_draw_boxblur(cuOSDContext_t context, int width, int heigh cmd->bounding_bottom = bottom; context->blur_commands.emplace_back(cmd); } - - bboxes.boxes = (NVCVBlurBoxI *)((uint8_t *)bboxes.boxes + numBoxes * sizeof(NVCVBlurBoxI)); } return ErrorCode::SUCCESS; } @@ -394,11 +394,6 @@ BoxBlur::~BoxBlur() } } -size_t BoxBlur::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode BoxBlur::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, NVCVBlurBoxesI bboxes, cudaStream_t stream) { @@ -435,9 +430,10 @@ ErrorCode BoxBlur::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv:: return ErrorCode::INVALID_DATA_SHAPE; } - if (bboxes.batch != batch) + NVCVBlurBoxesImpl *_bboxes = (NVCVBlurBoxesImpl *)bboxes; + if (_bboxes->batch() != batch) { - LOG_ERROR("Invalid bboxes batch = " << bboxes.batch); + LOG_ERROR("Invalid bboxes batch = " << _bboxes->batch()); return ErrorCode::INVALID_DATA_SHAPE; } @@ -447,7 +443,7 @@ ErrorCode BoxBlur::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv:: return ErrorCode::INVALID_DATA_FORMAT; } - auto ret = cuosd_draw_boxblur(m_context, cols, rows, bboxes); + auto ret = cuosd_draw_boxblur(m_context, cols, rows, _bboxes); if (ret != ErrorCode::SUCCESS) { return ret; @@ -464,7 +460,6 @@ ErrorCode BoxBlur::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv:: int type_idx = channels - 3; funcs[type_idx](inData, outData, m_context, stream); m_context->blur_commands.clear(); // Clear the command buffer so next render does not contain previous boxes. 
- m_context->rect_commands.clear(); return ErrorCode::SUCCESS; } diff --git a/src/cvcuda/priv/legacy/center_crop.cu b/src/cvcuda/priv/legacy/center_crop.cu index c7245dfe..aa8e6542 100644 --- a/src/cvcuda/priv/legacy/center_crop.cu +++ b/src/cvcuda/priv/legacy/center_crop.cu @@ -70,11 +70,6 @@ void center_crop(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDa namespace nvcv::legacy::cuda_op { -size_t CenterCrop::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode CenterCrop::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int crop_rows, int crop_columns, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/convert_to.cu b/src/cvcuda/priv/legacy/convert_to.cu index 9d68ba76..5e510d04 100644 --- a/src/cvcuda/priv/legacy/convert_to.cu +++ b/src/cvcuda/priv/legacy/convert_to.cu @@ -120,11 +120,6 @@ void convertToScale(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tenso namespace nvcv::legacy::cuda_op { -size_t ConvertTo::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode ConvertTo::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const double alpha, const double beta, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/custom_crop.cu b/src/cvcuda/priv/legacy/custom_crop.cu index fe725a68..eea4ebfe 100644 --- a/src/cvcuda/priv/legacy/custom_crop.cu +++ b/src/cvcuda/priv/legacy/custom_crop.cu @@ -64,11 +64,6 @@ void customCrop(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDat namespace nvcv::legacy::cuda_op { -size_t CustomCrop::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode CustomCrop::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVRectI roi, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/cvt_color.cu b/src/cvcuda/priv/legacy/cvt_color.cu index c3bd307e..8b794c63 100644 --- a/src/cvcuda/priv/legacy/cvt_color.cu +++ b/src/cvcuda/priv/legacy/cvt_color.cu @@ -1494,11 +1494,6 @@ inline ErrorCode BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const Tens return ErrorCode::SUCCESS; } -size_t CvtColor::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode CvtColor::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, NVCVColorConversionCode code, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu index 8dd369de..2dc01bbd 100644 --- a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu +++ b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu @@ -79,7 +79,7 @@ static constexpr int ITUR_BT_601_CBV = -74448; namespace nvcv::legacy::cuda_op { -__device__ inline bool checkShapeFromYUV420(int rows, int cols, NVCVColorConversionCode code) +inline __device__ bool checkShapeFromYUV420(int rows, int cols, NVCVColorConversionCode code) { int valid_row = 1, valid_col = 1; switch (code) @@ -210,7 +210,8 @@ __global__ void bgr_to_gray_float_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, T g = *src.ptr(batch_idx, dst_y, dst_x, 1); T r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - T gray = (T)(b * B2YF + g * G2YF + r * R2YF) * dst.ptr(batch_idx, dst_y, dst_x, 0) = gray; + T gray = (T)(b * B2YF + g * G2YF + r * R2YF); + *dst.ptr(batch_idx, dst_y, dst_x, 0) = gray; } template 
@@ -400,7 +401,7 @@ __global__ void bgr_to_hsv_float_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, c *dst.ptr(batch_idx, dst_y, dst_x, 2) = v; } -__device__ inline void HSV2RGB_native_var_shape(float h, float s, float v, float &b, float &g, float &r, +inline __device__ void HSV2RGB_native_var_shape(float h, float s, float v, float &b, float &g, float &r, const float hscale) { if (s == 0) @@ -980,7 +981,7 @@ inline ErrorCode BGR_to_GRAY(const ImageBatchVarShapeDataStridedCuda &inData, { cuda::ImageBatchVarShapeWrapNHWC src_ptr(inData, channels); cuda::ImageBatchVarShapeWrapNHWC dst_ptr(outData, dcn); - bgr_to_gray_char_nhwc<<>>(src_ptr, dst_ptr, bidx); + bgr_to_gray_float_nhwc<<>>(src_ptr, dst_ptr, bidx); checkKernelErrors(); } break; @@ -1596,11 +1597,6 @@ inline ErrorCode BGR_to_YUV420xp(const ImageBatchVarShapeDataStridedCuda &inData return ErrorCode::SUCCESS; } -size_t CvtColorVarShape::calBufferSize(int batch_size) -{ - return 0; -} - ErrorCode CvtColorVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, NVCVColorConversionCode code, cudaStream_t stream) diff --git a/src/cvcuda/priv/legacy/filter.cu b/src/cvcuda/priv/legacy/filter.cu index 2c84261a..105f9260 100644 --- a/src/cvcuda/priv/legacy/filter.cu +++ b/src/cvcuda/priv/legacy/filter.cu @@ -131,11 +131,6 @@ constexpr cuda::math::Vector kLaplacianKernel3{ // clang-format on -size_t Laplacian::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode Laplacian::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int ksize, float scale, NVCVBorderType borderMode, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/find_contours.cu b/src/cvcuda/priv/legacy/find_contours.cu index 64058378..abcb798c 100644 --- a/src/cvcuda/priv/legacy/find_contours.cu +++ b/src/cvcuda/priv/legacy/find_contours.cu @@ -923,10 +923,11 @@ __global__ void flattenContours(IndexType *dConnectList, CountType *dNodeCount, + block.group_index().y * grid.group_dim().x + block.group_index().x; // Calculate block tile dimensions and total number of iterations needed. - auto contourTile = util::DivUp(FindContours::MAX_NUM_CONTOURS, warp.meta_group_size()); - auto neededThreads = warp.size() * FindContours::MAX_NUM_CONTOURS * batchSize; - auto neededBlocks = (neededThreads + block.size() - 1) / block.size(); - auto numSteps = (neededBlocks + gridBlocks - 1) / gridBlocks; + auto contourTile = util::DivUp(FindContours::MAX_NUM_CONTOURS, warp.meta_group_size()); + auto neededThreads = warp.size() * FindContours::MAX_NUM_CONTOURS * batchSize; + auto neededBlocks = (neededThreads + block.size() - 1) / block.size(); + auto numStepsBatchSize = ((batchSize * contourTile - blockRank) + gridBlocks - 1) / gridBlocks; + auto numSteps = max((neededBlocks + gridBlocks - 1) / gridBlocks, numStepsBatchSize); // Calculate the thread's block dimensions and its position within the block. 
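// Worked example for the grayscale fixes above (the float branch now dispatches to the float
// kernel, and the garbled assignment is split into two statements): gray is a weighted sum of
// the channels using the standard BT.601 luma weights, which is what constants named
// B2YF/G2YF/R2YF conventionally encode. The helper name and constants here are illustrative.
__host__ __device__ inline float bgrToGray(float b, float g, float r)
{
    constexpr float kB2Y = 0.114f, kG2Y = 0.587f, kR2Y = 0.299f;
    return b * kB2Y + g * kG2Y + r * kR2Y; // e.g. (1,1,1) -> 1.0f, pure blue -> 0.114f
}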
CoordType blockDims{warp.size(), warp.meta_group_size(), 1}; diff --git a/src/cvcuda/priv/legacy/flip.cu b/src/cvcuda/priv/legacy/flip.cu index aa6e7a63..eaba1ccf 100644 --- a/src/cvcuda/priv/legacy/flip.cu +++ b/src/cvcuda/priv/legacy/flip.cu @@ -115,11 +115,6 @@ void flip(const TensorDataStridedCuda &input, const TensorDataStridedCuda &outpu #endif // CUDA_DEBUG_LOG } -size_t Flip::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode Flip::infer(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, const int32_t flipCode, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu b/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu index db4cb400..595e0672 100644 --- a/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu +++ b/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu @@ -86,11 +86,6 @@ void flip(const ImageBatchVarShapeDataStridedCuda &input, const ImageBatchVarSha #endif // CUDA_DEBUG_LOG } -size_t FlipOrCopyVarShape::calBufferSize(int maxBatchSize) -{ - return (sizeof(void *) * 2 + sizeof(int) * 3) * maxBatchSize; -} - ErrorCode FlipOrCopyVarShape::infer(const ImageBatchVarShapeDataStridedCuda &input, const ImageBatchVarShapeDataStridedCuda &output, const TensorDataStridedCuda &flipCode, cudaStream_t stream) diff --git a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu index 9df8d9e8..b9b4b4d3 100644 --- a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu +++ b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu @@ -51,7 +51,7 @@ __global__ void hist_kernel(const SrcWrapper src, DstWrapper histogram, int chan { for (int ch = 0; ch < channels; ch++) { - int4 coordImg{src_x, src_y, batch_idx, ch}; + int4 coordImg{batch_idx, src_y, src_x, ch}; uchar out = src[coordImg]; int idx = out + (256 * ch); atomicAdd(&shist[idx], 1); @@ -160,7 +160,7 @@ __global__ void lookup(const SrcWrapper src, DstWrapper dst, CdfWrapper cdf, int for (int ch = 0; ch < channels; ch++) { offset = 256 * ch; - int4 coordImg{src_x, src_y, batch_idx, ch}; + int4 coordImg{batch_idx, src_y, src_x, ch}; int2 coordHisto{src[coordImg] + offset, batch_idx}; dst[coordImg] = nvcv::cuda::SaturateCast((temp[src[coordImg] + offset])); } diff --git a/src/cvcuda/priv/legacy/median_blur.cu b/src/cvcuda/priv/legacy/median_blur.cu index 1db91fe7..3f05eeb1 100644 --- a/src/cvcuda/priv/legacy/median_blur.cu +++ b/src/cvcuda/priv/legacy/median_blur.cu @@ -363,11 +363,6 @@ void median(const nvcv::TensorDataAccessStridedImagePlanar &inData, namespace nvcv::legacy::cuda_op { -size_t MedianBlur::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode MedianBlur::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const nvcv::Size2D ksize, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/min_area_rect.cu b/src/cvcuda/priv/legacy/min_area_rect.cu index de01c690..384c011c 100644 --- a/src/cvcuda/priv/legacy/min_area_rect.cu +++ b/src/cvcuda/priv/legacy/min_area_rect.cu @@ -230,8 +230,7 @@ void minAreaRect(const TensorDataStridedCuda &inData, void *rotatedPointsDev, } MinAreaRect::MinAreaRect(DataShape max_input_shape, DataShape max_output_shape, int maxContourNum) - : CudaBaseOp(max_input_shape, max_output_shape) - , mMaxContourNum(maxContourNum) + : mMaxContourNum(maxContourNum) { NVCV_CHECK_THROW(cudaMalloc(&mRotateCoeffsBufDev, _MAX_ROTATE_DEGREES * 2 * sizeof(float))); 
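// Small sketch of the coordinate order the histogram-equalization fix above depends on: the 4D
// coordinate handed to the tensor wrappers in hist_kernel/lookup is consumed as
// (sample, row, column, channel), so it must be built in that order rather than
// (x, y, batch, channel). The helper name is illustrative.
__device__ inline int4 makeSampleRowColChannelCoord(int sample, int row, int col, int channel)
{
    return int4{sample, row, col, channel}; // matches the corrected {batch_idx, src_y, src_x, ch}
}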
NVCV_CHECK_THROW( diff --git a/src/cvcuda/priv/legacy/normalize.cu b/src/cvcuda/priv/legacy/normalize.cu index 3613f0fb..60eaf3cd 100644 --- a/src/cvcuda/priv/legacy/normalize.cu +++ b/src/cvcuda/priv/legacy/normalize.cu @@ -258,11 +258,6 @@ void Normalize::checkParamShape(DataShape input_shape, DataShape param_shape) NVCV_ASSERT(param_shape.W == input_shape.W || param_shape.W == 1); } -size_t Normalize::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode Normalize::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &baseData, const nvcv::TensorDataStridedCuda &scaleData, const nvcv::TensorDataStridedCuda &outData, const float global_scale, const float shift, const float epsilon, const uint32_t flags, diff --git a/src/cvcuda/priv/legacy/osd.cu b/src/cvcuda/priv/legacy/osd.cu index cf3221f0..f979e688 100644 --- a/src/cvcuda/priv/legacy/osd.cu +++ b/src/cvcuda/priv/legacy/osd.cu @@ -23,6 +23,7 @@ #include "CvCudaUtils.cuh" +#include #include #include #include @@ -35,6 +36,7 @@ using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; using namespace nvcv::cuda::osd; +using namespace cvcuda::priv; namespace nvcv::legacy::cuda_op { @@ -1358,7 +1360,7 @@ static ErrorCode cuosd_draw_rectangle(cuOSDContext_t context, int batch_idx, int } static ErrorCode cuosd_draw_segmentmask(cuOSDContext_t context, int batch_idx, int width, int height, - NVCVSegment segment) + const NVCVSegment &segment) { int left = segment.box.x; int top = segment.box.y; @@ -1508,7 +1510,7 @@ static ErrorCode cuosd_draw_line(cuOSDContext_t context, int batch_idx, NVCVLine return ErrorCode::SUCCESS; } -static ErrorCode cuosd_draw_polyline(cuOSDContext_t context, int batch_idx, NVCVPolyLine pl) +static ErrorCode cuosd_draw_polyline(cuOSDContext_t context, int batch_idx, const NVCVPolyLine &pl) { if (pl.numPoints < 2) return ErrorCode::INVALID_PARAMETER; @@ -1722,16 +1724,17 @@ static ErrorCode cuosd_draw_clock(cuOSDContext_t context, int batch_idx, NVCVClo return ErrorCode::SUCCESS; } -static ErrorCode cuosd_draw_elements(cuOSDContext_t context, int width, int height, NVCVElements ctx) +static ErrorCode cuosd_draw_elements(cuOSDContext_t context, int width, int height, NVCVElementsImpl *ctx) { - for (int n = 0; n < ctx.batch; n++) + for (int n = 0; n < ctx->batch(); n++) { - auto numElements = ctx.numElements[n]; + auto numElements = ctx->numElementsAt(n); for (int i = 0; i < numElements; i++) { - auto type = ctx.elements[i].type; - auto data = ctx.elements[i].data; + auto element = ctx->elementAt(n, i); + auto type = element->type(); + auto data = element->ptr(); switch (type) { case NVCVOSDType::NVCV_OSD_NONE: @@ -1792,7 +1795,21 @@ static ErrorCode cuosd_draw_elements(cuOSDContext_t context, int width, int heig break; } } - ctx.elements = (NVCVElement *)((unsigned char *)ctx.elements + numElements * sizeof(NVCVElement)); + } + return ErrorCode::SUCCESS; +} + +static ErrorCode cuosd_draw_bndbox(cuOSDContext_t context, int width, int height, NVCVBndBoxesImpl *bboxes) +{ + for (int n = 0; n < bboxes->batch(); n++) + { + auto numBoxes = bboxes->numBoxesAt(n); + + for (int i = 0; i < numBoxes; i++) + { + auto bbox = bboxes->boxAt(n, i); + cuosd_draw_rectangle(context, n, width, height, bbox); + } } return ErrorCode::SUCCESS; } @@ -1812,11 +1829,6 @@ OSD::~OSD() } } -size_t OSD::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode 
OSD::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, NVCVElements elements, cudaStream_t stream) { @@ -1853,9 +1865,10 @@ ErrorCode OSD::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tens return ErrorCode::INVALID_DATA_SHAPE; } - if (elements.batch != batch) + NVCVElementsImpl *_elements = (NVCVElementsImpl *)elements; + if (_elements->batch() != batch) { - LOG_ERROR("Invalid elements batch = " << elements.batch); + LOG_ERROR("Invalid elements batch = " << _elements->batch()); return ErrorCode::INVALID_DATA_SHAPE; } @@ -1884,7 +1897,7 @@ ErrorCode OSD::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tens return ErrorCode::INVALID_DATA_SHAPE; } - auto ret = cuosd_draw_elements(m_context, cols, rows, elements); + auto ret = cuosd_draw_elements(m_context, cols, rows, _elements); if (ret != ErrorCode::SUCCESS) { LOG_ERROR("cuosd_draw_elements failed, ret - " << ret); @@ -1911,4 +1924,90 @@ ErrorCode OSD::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tens return ErrorCode::SUCCESS; } +ErrorCode OSD::inferBox(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + NVCVBndBoxesI bboxes, cudaStream_t stream) +{ + cuda_op::DataFormat input_format = GetLegacyDataFormat(inData.layout()); + cuda_op::DataFormat output_format = GetLegacyDataFormat(outData.layout()); + + if (!(input_format == kNHWC || input_format == kHWC) || !(output_format == kNHWC || output_format == kHWC)) + { + LOG_ERROR("Invliad DataFormat both Input and Output must be kNHWC or kHWC"); + return ErrorCode::INVALID_DATA_FORMAT; + } + + if (inData.dtype() != outData.dtype()) + { + LOG_ERROR("Input and Output formats must be same input format =" << inData.dtype() + << " output format = " << outData.dtype()); + return ErrorCode::INVALID_DATA_FORMAT; + } + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); + if (!inAccess) + { + return ErrorCode::INVALID_DATA_FORMAT; + } + + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); + if (!outAccess) + { + return ErrorCode::INVALID_DATA_FORMAT; + } + + cuda_op::DataShape inputShape = helpers::GetLegacyDataShape(inAccess->infoShape()); + cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); + + if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N + || outputShape.C != inputShape.C) + { + LOG_ERROR("Invalid input/output shape " << inputShape << "/" << outputShape); + return ErrorCode::INVALID_DATA_SHAPE; + } + + int batch = inAccess->numSamples(); + int channels = inAccess->numChannels(); + int rows = inAccess->numRows(); + int cols = inAccess->numCols(); + + if (channels > 4 || channels < 1) + { + LOG_ERROR("Invalid channel number ch = " << channels); + return ErrorCode::INVALID_DATA_SHAPE; + } + + NVCVBndBoxesImpl *_bboxes = (NVCVBndBoxesImpl *)bboxes; + if (_bboxes->batch() != batch) + { + LOG_ERROR("Invalid bboxes batch = " << _bboxes->batch()); + return ErrorCode::INVALID_DATA_SHAPE; + } + + auto ret = cuosd_draw_bndbox(m_context, cols, rows, _bboxes); + if (ret != ErrorCode::SUCCESS) + { + LOG_ERROR("cuosd_draw_bndbox failed, ret - " << ret); + return ret; + } + + auto format = cuOSDImageFormat::RGBA; + if (inputShape.C == 3) + format = cuOSDImageFormat::RGB; + + cuosd_apply(m_context, inputShape.W, inputShape.H, format, stream); + + auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); + auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); + 
bool inplace = inData.basePtr() == outData.basePtr(); + + cuosd_launch(m_context, src, dst, inputShape.W, inputShape.C * inputShape.W, inputShape.H, format, inplace, + inputShape.N, stream); + + checkKernelErrors(); + + cuosd_clear(m_context); + + return ErrorCode::SUCCESS; +} + } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/pad_and_stack.cu b/src/cvcuda/priv/legacy/pad_and_stack.cu index f40bc0b2..623dd4a3 100644 --- a/src/cvcuda/priv/legacy/pad_and_stack.cu +++ b/src/cvcuda/priv/legacy/pad_and_stack.cu @@ -81,11 +81,6 @@ void padAndStack(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDa funcs[borderMode](inData, outData, top, left, borderValue, stream); } -size_t PadAndStack::calBufferSize(int batch_size) -{ - return 0; -} - ErrorCode PadAndStack::infer(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, const NVCVBorderType borderMode, const float borderValue, cudaStream_t stream) diff --git a/src/cvcuda/priv/legacy/pillow_resize.cu b/src/cvcuda/priv/legacy/pillow_resize.cu index 4a4cd9b3..fcf583e5 100644 --- a/src/cvcuda/priv/legacy/pillow_resize.cu +++ b/src/cvcuda/priv/legacy/pillow_resize.cu @@ -368,8 +368,8 @@ void pillow_resize_filter(const TensorDataAccessStridedImagePlanar &inData, } } -PillowResize::PillowResize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) - : CudaBaseOp(max_input_shape, max_output_shape) +WorkspaceRequirements PillowResize::getWorkspaceRequirements(DataShape max_input_shape, DataShape max_output_shape, + DataType max_data_type) { int max_support = 1; //3 size_t size @@ -381,35 +381,22 @@ PillowResize::PillowResize(DataShape max_input_shape, DataShape max_output_shape * (((1.0 * max_input_shape.W / max_output_shape.W + 1) * max_support * 2 + 1) * sizeof(work_type) + 2 * sizeof(int))) + max_input_shape.N * max_input_shape.C * max_input_shape.H * max_output_shape.W * DataSize(max_data_type); - NVCV_CHECK_LOG(cudaMalloc(&gpu_workspace, size)); -} - -PillowResize::~PillowResize() -{ - NVCV_CHECK_LOG(cudaFree(gpu_workspace)); -} - -size_t PillowResize::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - int max_support = 1; //3 - size_t size - = std::ceil( - max_output_shape.H - * (((1.0 * max_input_shape.H / max_output_shape.H + 1) * max_support * 2 + 1) * sizeof(work_type) - + 2 * sizeof(int)) - + max_output_shape.W - * (((1.0 * max_input_shape.W / max_output_shape.W + 1) * max_support * 2 + 1) * sizeof(work_type) - + 2 * sizeof(int))) - + max_input_shape.N * max_input_shape.C * max_input_shape.H * max_output_shape.W * DataSize(max_data_type); - return size; + WorkspaceRequirements req{}; + req.cudaMem = {size, 256}; + return req; } ErrorCode PillowResize::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream) + const NVCVInterpolationType interpolation, cudaStream_t stream, const Workspace &ws) { DataFormat format = GetLegacyDataFormat(inData.layout()); DataFormat output_format = GetLegacyDataFormat(outData.layout()); + if (ws.cudaMem.ready != nullptr) + checkCudaErrors(cudaStreamWaitEvent(stream, ws.cudaMem.ready)); + + void *gpu_workspace = ws.cudaMem.data; + if (format != output_format) { LOG_ERROR("Invalid DataFormat between input (" << format << ") and output (" << output_format << ")"); @@ -467,6 +454,10 @@ ErrorCode PillowResize::infer(const 
TensorDataStridedCuda &inData, const TensorD return ErrorCode::INVALID_PARAMETER; break; } + + if (ws.cudaMem.ready != nullptr) + checkCudaErrors(cudaEventRecord(ws.cudaMem.ready, stream)); + return ErrorCode::SUCCESS; } diff --git a/src/cvcuda/priv/legacy/pillow_resize.h b/src/cvcuda/priv/legacy/pillow_resize.h index 294018ed..37429a99 100644 --- a/src/cvcuda/priv/legacy/pillow_resize.h +++ b/src/cvcuda/priv/legacy/pillow_resize.h @@ -29,8 +29,9 @@ using namespace nvcv; using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; -#define work_type float -#define M_PI 3.14159265358979323846 /* pi */ +using work_type = float; + +#define M_PI 3.14159265358979323846 /* pi */ namespace nvcv::legacy::cuda_op { diff --git a/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu b/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu index f2ba65b8..e95cc20e 100644 --- a/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu +++ b/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu @@ -275,9 +275,18 @@ __global__ void vertical_pass_var_shape(const Ptr2dNHWC src, Ptr2dVarShapeNH template void pillow_resize_var_shape(const ImageBatchVarShape &inDataBase, const ImageBatchVarShape &outDataBase, - void *gpu_workspace, void *cpu_workspace, bool normalize_coeff, work_type init_buffer, - bool round_up, cudaStream_t stream) + const Workspace &ws, bool normalize_coeff, work_type init_buffer, bool round_up, + cudaStream_t stream) { + if (ws.hostMem.ready != nullptr) + checkCudaErrors(cudaEventSynchronize(ws.hostMem.ready)); + + if (ws.cudaMem.ready != nullptr) + checkCudaErrors(cudaStreamWaitEvent(stream, ws.cudaMem.ready)); + + void *cpu_workspace = ws.hostMem.data; + void *gpu_workspace = ws.cudaMem.data; + auto inDataPtr = inDataBase.exportData(stream); if (!inDataPtr) { @@ -419,6 +428,9 @@ void pillow_resize_var_shape(const ImageBatchVarShape &inDataBase, const ImageBa checkCudaErrors(cudaMemcpyAsync((void *)gpu_workspace, (void *)cpu_workspace, current_buffer_size, cudaMemcpyHostToDevice, stream)); + if (ws.hostMem.ready != nullptr) + checkCudaErrors(cudaEventRecord(ws.hostMem.ready, stream)); + Ptr2dVarShapeNHWC src_ptr(inData); Ptr2dVarShapeNHWC dst_ptr(outData); Ptr2dNHWC ptr_h_out(batch, max_input_height, max_width, channels, (work_type *)hori_gpu_data); @@ -479,39 +491,37 @@ void pillow_resize_var_shape(const ImageBatchVarShape &inDataBase, const ImageBa init_buffer, round_up, hv_use_share_mem); checkKernelErrors(); + + if (ws.cudaMem.ready != nullptr) + checkCudaErrors(cudaEventRecord(ws.cudaMem.ready, stream)); } } // namespace template void pillow_resize_filter_var_shape(const ImageBatchVarShape &inData, const ImageBatchVarShape &outData, - void *gpu_workspace, void *cpu_workspace, NVCVInterpolationType interpolation, - cudaStream_t stream) + const Workspace &ws, NVCVInterpolationType interpolation, cudaStream_t stream) { DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); switch (data_type) { case kCV_8U: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., false, - stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., false, stream); break; case kCV_8S: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., true, - stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., true, stream); break; case kCV_16U: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., false, - stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., false, stream); break; 
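// Minimal sketch of the workspace event protocol the Pillow resize hunks above follow
// (stageParams/launchKernels are stand-ins): wait on each buffer's `ready` event before reusing
// that buffer, and record the event again once the new work touching it has been enqueued.
static void useSharedWorkspace(const NVCVWorkspace &ws, size_t bytes, cudaStream_t stream)
{
    // Host buffer: the CPU is about to overwrite it, so block until the previous consumer
    // (an async H2D copy) is done with it.
    if (ws.hostMem.ready != nullptr)
        checkCudaErrors(cudaEventSynchronize(ws.hostMem.ready));
    // stageParams(ws.hostMem.data);

    // Device buffer: make this stream wait for the previous user's enqueued work.
    if (ws.cudaMem.ready != nullptr)
        checkCudaErrors(cudaStreamWaitEvent(stream, ws.cudaMem.ready));

    checkCudaErrors(cudaMemcpyAsync(ws.cudaMem.data, ws.hostMem.data, bytes,
                                    cudaMemcpyHostToDevice, stream));

    // The host buffer may be reused as soon as the copy has drained it.
    if (ws.hostMem.ready != nullptr)
        checkCudaErrors(cudaEventRecord(ws.hostMem.ready, stream));

    // launchKernels(ws.cudaMem.data, stream);

    // The device buffer becomes free for the next user once this point in the stream is reached.
    if (ws.cudaMem.ready != nullptr)
        checkCudaErrors(cudaEventRecord(ws.cudaMem.ready, stream));
}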
case kCV_16S: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., true, - stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., true, stream); break; case kCV_32S: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., true, stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., true, stream); break; case kCV_32F: - pillow_resize_var_shape(inData, outData, gpu_workspace, cpu_workspace, false, 0., false, stream); + pillow_resize_var_shape(inData, outData, ws, false, 0., false, stream); break; case kCV_64F: default: @@ -519,40 +529,13 @@ void pillow_resize_filter_var_shape(const ImageBatchVarShape &inData, const Imag } } -PillowResizeVarShape::PillowResizeVarShape(DataShape max_input_shape, DataShape max_output_shape, - DataType max_data_type) - : CudaBaseOp(max_input_shape, max_output_shape) +WorkspaceRequirements PillowResizeVarShape::getWorkspaceRequirements(DataShape max_input_shape, + DataShape max_output_shape, DataType max_data_type) { - int max_support = 1; //3 - size_t size = std::ceil( - max_output_shape.H - * (((1.0 * max_input_shape.H / max_output_shape.H + 1) * max_support * 2 + 1) * sizeof(work_type) - + 2 * sizeof(int)) - + max_output_shape.W - * (((1.0 * max_input_shape.W / max_output_shape.W + 1) * max_support * 2 + 1) * sizeof(work_type) - + 2 * sizeof(int))); - size_t buffer_size = (sizeof(void *) * 3 + sizeof(int) * 12 + sizeof(work_type) * 6 + size) * max_input_shape.N; - buffer_size += max_input_shape.N * max_input_shape.C * max_input_shape.H * max_output_shape.W * sizeof(float); + constexpr size_t kDefaultDeviceAlignment = 256; - NVCV_CHECK_LOG(cudaMalloc(&gpu_workspace, buffer_size)); + WorkspaceRequirements req{}; - cpu_workspace = malloc(buffer_size); - if (!cpu_workspace) - { - LOG_ERROR("Memory allocation error of size: " << buffer_size); - throw std::runtime_error("Memory allocation error!"); - } -} - -PillowResizeVarShape::~PillowResizeVarShape() -{ - NVCV_CHECK_LOG(cudaFree(gpu_workspace)); - free(cpu_workspace); -} - -size_t PillowResizeVarShape::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, - DataType max_data_type) -{ int max_support = 1; //3 size_t size = std::ceil( max_output_shape.H @@ -561,15 +544,24 @@ size_t PillowResizeVarShape::calBufferSize(DataShape max_input_shape, DataShape + max_output_shape.W * (((1.0 * max_input_shape.W / max_output_shape.W + 1) * max_support * 2 + 1) * sizeof(work_type) + 2 * sizeof(int))); + size_t buffer_size = (sizeof(void *) * 3 + sizeof(int) * 12 + sizeof(work_type) * 6 + size) * max_input_shape.N; + + req.hostMem.size = buffer_size; + req.hostMem.alignment = alignof(std::max_align_t); + buffer_size += max_input_shape.N * max_input_shape.C * max_input_shape.H * max_output_shape.W * sizeof(float); - return buffer_size; + req.cudaMem.size = buffer_size; + req.cudaMem.alignment = kDefaultDeviceAlignment; + + return req; } ErrorCode PillowResizeVarShape::infer(const nvcv::ImageBatchVarShape &inDataBase, const nvcv::ImageBatchVarShape &outDataBase, - const NVCVInterpolationType interpolation, cudaStream_t stream) + const NVCVInterpolationType interpolation, cudaStream_t stream, + const NVCVWorkspace &ws) { if (!inDataBase.uniqueFormat() || !outDataBase.uniqueFormat()) { @@ -610,24 +602,19 @@ ErrorCode PillowResizeVarShape::infer(const nvcv::ImageBatchVarShape &inDataBase switch (interpolation) { case NVCV_INTERP_LINEAR: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, - 
interpolation, stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; case NVCV_INTERP_BOX: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, interpolation, - stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; case NVCV_INTERP_HAMMING: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, - interpolation, stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; case NVCV_INTERP_CUBIC: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, - interpolation, stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; case NVCV_INTERP_LANCZOS: - pillow_resize_filter_var_shape(inDataBase, outDataBase, gpu_workspace, cpu_workspace, - interpolation, stream); + pillow_resize_filter_var_shape(inDataBase, outDataBase, ws, interpolation, stream); break; default: LOG_ERROR("Unsupported interpolation method " << interpolation); diff --git a/src/cvcuda/priv/legacy/reformat.cu b/src/cvcuda/priv/legacy/reformat.cu index 919f535f..826e0f0c 100644 --- a/src/cvcuda/priv/legacy/reformat.cu +++ b/src/cvcuda/priv/legacy/reformat.cu @@ -104,11 +104,6 @@ void Reformat::checkDataFormat(DataFormat format) NVCV_ASSERT(format == kNHWC || format == kHWC || format == kNCHW || format == kCHW); } -size_t Reformat::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} - ErrorCode Reformat::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/resize.cu b/src/cvcuda/priv/legacy/resize.cu index c262bc59..51721843 100644 --- a/src/cvcuda/priv/legacy/resize.cu +++ b/src/cvcuda/priv/legacy/resize.cu @@ -624,11 +624,6 @@ void resize(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &ou #endif } //resize -size_t Resize::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 0; -} //Resize::calBufferSize - ErrorCode Resize::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const NVCVInterpolationType interpolation, cudaStream_t stream) { diff --git a/src/cvcuda/priv/legacy/warp.cu b/src/cvcuda/priv/legacy/warp.cu index eb5f06fd..e1dbc8a1 100644 --- a/src/cvcuda/priv/legacy/warp.cu +++ b/src/cvcuda/priv/legacy/warp.cu @@ -216,11 +216,6 @@ ErrorCode WarpAffine::infer(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::SUCCESS; } -size_t WarpPerspective::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, DataType max_data_type) -{ - return 9 * sizeof(float); -} - ErrorCode WarpPerspective::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const float *transMatrix, const int32_t flags, const NVCVBorderType borderMode, const float4 borderValue, cudaStream_t stream) @@ -290,7 +285,7 @@ ErrorCode WarpPerspective::infer(const TensorDataStridedCuda &inData, const Tens PerspectiveTransform transform(transMatrix); - if (flags & NVCV_WARP_INVERSE_MAP) + if (!(flags & NVCV_WARP_INVERSE_MAP)) { cuda::math::Matrix tempMatrixForInverse; diff --git a/src/cvcuda/priv/legacy/warp_var_shape.cu b/src/cvcuda/priv/legacy/warp_var_shape.cu index 52023d82..e99dd18c 100644 --- a/src/cvcuda/priv/legacy/warp_var_shape.cu +++ 
b/src/cvcuda/priv/legacy/warp_var_shape.cu @@ -391,7 +391,7 @@ ErrorCode WarpPerspectiveVarShape::infer(const ImageBatchVarShapeDataStridedCuda cuda::Tensor2DWrap transMatrixInput(transMatrix); cuda::Tensor2DWrap transMatrixOutput(m_transformationMatrix, static_cast(sizeof(float) * 9)); - if (performInverse) + if (!performInverse) { inverseMatWarpPerspective<<<1, inData.numImages(), 0, stream>>>(inData.numImages(), transMatrixInput, transMatrixOutput); diff --git a/src/nvcv_types/Array.cpp b/src/nvcv_types/Array.cpp index d47af91a..be7d98a2 100644 --- a/src/nvcv_types/Array.cpp +++ b/src/nvcv_types/Array.cpp @@ -264,6 +264,23 @@ NVCV_DEFINE_API(0, 4, NVCVStatus, nvcvArrayGetCapacity, (NVCVArrayHandle handle, }); } +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvArrayResize, (NVCVArrayHandle handle, int64_t length)) +{ + return priv::ProtectCall( + [&] + { + auto &array = priv::ToStaticRef(handle); + + if (length > array.capacity()) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Cannot resize array to input length because greater than capacity"); + } + + array.resize(length); + }); +} + NVCV_DEFINE_API(0, 4, NVCVStatus, nvcvArrayGetTarget, (NVCVArrayHandle handle, NVCVResourceType *target)) { return priv::ProtectCall( diff --git a/src/nvcv_types/CMakeLists.txt b/src/nvcv_types/CMakeLists.txt index f7bbf2b5..2d561704 100644 --- a/src/nvcv_types/CMakeLists.txt +++ b/src/nvcv_types/CMakeLists.txt @@ -32,6 +32,7 @@ add_library(nvcv_types SHARED DataType.cpp ImageFormat.cpp Array.cpp + TensorBatch.cpp ) target_link_libraries(nvcv_types diff --git a/src/nvcv_types/ImageBatch.cpp b/src/nvcv_types/ImageBatch.cpp index fca335cc..0de96b23 100644 --- a/src/nvcv_types/ImageBatch.cpp +++ b/src/nvcv_types/ImageBatch.cpp @@ -39,6 +39,11 @@ NVCV_DEFINE_API(0, 0, NVCVStatus, nvcvImageBatchVarShapeCalcRequirements, throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output requirements must not be NULL"); } + if (capacity < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Capacity must >= 0"); + } + *reqs = priv::ImageBatchVarShape::CalcRequirements(capacity); }); } @@ -295,6 +300,11 @@ NVCV_DEFINE_API(0, 3, NVCVStatus, nvcvImageBatchVarShapeGetImages, return priv::ProtectCall( [&] { + if (outImages == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output handle cannot be NULL"); + } + auto &batch = priv::ToDynamicRef(handle); batch.getImages(begIndex, outImages, numImages); diff --git a/src/nvcv_types/Tensor.cpp b/src/nvcv_types/Tensor.cpp index 6db2ed95..89b35239 100644 --- a/src/nvcv_types/Tensor.cpp +++ b/src/nvcv_types/Tensor.cpp @@ -304,3 +304,41 @@ NVCV_DEFINE_API(0, 3, NVCVStatus, nvcvTensorGetUserPointer, (NVCVTensorHandle ha *outUserPtr = tensor.userPointer(); }); } + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorReshape, + (NVCVTensorHandle handle, int32_t rank, const int64_t *shape, NVCVTensorLayout layout, + NVCVTensorHandle *out_handle)) +{ + return priv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Tensor handle must not be NULL"); + } + + if (out_handle == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output handle must not be NULL"); + } + + auto tensor_ptr = priv::ToSharedObj(handle); // this will call incRef + + NVCVTensorData new_tensor_data; + tensor_ptr->exportData(new_tensor_data); + + // Modifies rank, shape, layout and strides + priv::ReshapeTensorData(new_tensor_data, rank, shape, layout); + + // The cleanup consists of dropping the 
reference to the handle we reference + auto cleanup = [](void *h, const NVCVTensorData *) + { + priv::CoreObjectDecRef(static_cast(h)); + }; + void *cleanup_ctx = handle; + + *out_handle = priv::CreateCoreObject(new_tensor_data, cleanup, cleanup_ctx); + + (void)tensor_ptr.release(); // we transferred ownership, we can release + }); +} diff --git a/src/nvcv_types/TensorBatch.cpp b/src/nvcv_types/TensorBatch.cpp new file mode 100644 index 00000000..a8f608d0 --- /dev/null +++ b/src/nvcv_types/TensorBatch.cpp @@ -0,0 +1,318 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "priv/TensorBatch.hpp" + +#include "priv/Status.hpp" +#include "priv/SymbolVersioning.hpp" +#include "priv/TensorBatchManager.hpp" + +#include + +namespace priv = nvcv::priv; + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchCalcRequirements, + (int32_t capacity, NVCVTensorBatchRequirements *reqs)) +{ + return priv::ProtectCall( + [&] + { + if (reqs == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output requirements must not be NULL"); + } + + *reqs = priv::TensorBatch::CalcRequirements(capacity); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchConstruct, + (const NVCVTensorBatchRequirements *reqs, NVCVAllocatorHandle halloc, NVCVTensorBatchHandle *outHandle)) +{ + return priv::ProtectCall( + [&] + { + if (reqs == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to requirements must not be NULL"); + } + if (outHandle == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output handle must not be NULL"); + } + + priv::IAllocator &alloc = priv::GetAllocator(halloc); + *outHandle = priv::CreateCoreObject(*reqs, alloc); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchClear, (NVCVTensorBatchHandle handle)) +{ + return priv::ProtectCall( + [&] + { + auto &tb = priv::ToStaticRef(handle); + tb.clear(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchPushTensors, + (NVCVTensorBatchHandle handle, const NVCVTensorHandle *tensors, int32_t numTensors)) +{ + return priv::ProtectCall( + [&] + { + if (tensors == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to tensors must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + tb.pushTensors(tensors, numTensors); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchDecRef, (NVCVTensorBatchHandle handle, int32_t *newRefCount)) +{ + return priv::ProtectCall( + [&] + { + int32_t newRef = priv::CoreObjectDecRef(handle); + if (newRefCount) + *newRefCount = newRef; + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchIncRef, (NVCVTensorBatchHandle handle, int32_t *newRefCount)) +{ + return priv::ProtectCall( + [&] + { + int32_t refCount = priv::CoreObjectIncRef(handle); + if (newRefCount) + *newRefCount = refCount; + }); +} + +NVCV_DEFINE_API(0, 5, 
NVCVStatus, nvcvTensorBatchRefCount, (NVCVTensorBatchHandle handle, int32_t *outRefCount)) +{ + return priv::ProtectCall( + [&] + { + if (outRefCount == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to reference count must not be NULL"); + } + *outRefCount = priv::CoreObjectRefCount(handle); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetCapacity, (NVCVTensorBatchHandle handle, int32_t *outCapacityPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outCapacityPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to capacity must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outCapacityPtr = tb.capacity(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetRank, (NVCVTensorBatchHandle handle, int32_t *outRankPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outRankPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to rank must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outRankPtr = tb.rank(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetDType, (NVCVTensorBatchHandle handle, NVCVDataType *outDTypePtr)) +{ + return priv::ProtectCall( + [&] + { + if (outDTypePtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to data type must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outDTypePtr = tb.dtype(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetLayout, + (NVCVTensorBatchHandle handle, NVCVTensorLayout *outLayoutPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outLayoutPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to layout must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outLayoutPtr = tb.layout(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetType, + (NVCVTensorBatchHandle handle, NVCVTensorBufferType *outTypePtr)) +{ + return priv::ProtectCall( + [&] + { + if (outTypePtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to buffer type must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outTypePtr = tb.type(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetNumTensors, + (NVCVTensorBatchHandle handle, int32_t *outNumTensorsPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outNumTensorsPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to tensors number must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outNumTensorsPtr = tb.numTensors(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetAllocator, + (NVCVTensorBatchHandle handle, NVCVAllocatorHandle *outAllocatorPtr)) +{ + return priv::ProtectCall( + [&] + { + if (outAllocatorPtr == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to allocator must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outAllocatorPtr = tb.alloc().release()->handle(); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchExportData, + (NVCVTensorBatchHandle handle, CUstream stream, NVCVTensorBatchData *data)) +{ + return priv::ProtectCall( + [&] + { + if (data == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to tensor batch data must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + tb.exportData(stream, *data); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchPopTensors, (NVCVTensorBatchHandle handle, int32_t numTensors)) +{ + return 
priv::ProtectCall( + [&] + { + auto &tb = priv::ToStaticRef(handle); + tb.popTensors(numTensors); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetTensors, + (NVCVTensorBatchHandle handle, int32_t index, NVCVTensorHandle *outTensors, int32_t numTensors)) +{ + return priv::ProtectCall( + [&] + { + if (outTensors == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output tensors must not be NULL"); + } + if (index < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Index cannot be negative"); + } + if (numTensors < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Number of tensors cannot be negative"); + } + auto &tb = priv::ToStaticRef(handle); + tb.getTensors(index, outTensors, numTensors); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchSetTensors, + (NVCVTensorBatchHandle handle, int32_t index, const NVCVTensorHandle *tensors, int32_t numTensors)) +{ + return priv::ProtectCall( + [&] + { + if (tensors == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to tensors must not be NULL"); + } + if (index < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Index cannot be negative"); + } + if (numTensors < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Number of tensors cannot be negative"); + } + auto &tb = priv::ToStaticRef(handle); + tb.setTensors(index, tensors, numTensors); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchSetUserPointer, (NVCVTensorBatchHandle handle, void *userPointer)) +{ + return priv::ProtectCall( + [&] + { + auto &tb = priv::ToStaticRef(handle); + tb.setUserPointer(userPointer); + }); +} + +NVCV_DEFINE_API(0, 5, NVCVStatus, nvcvTensorBatchGetUserPointer, (NVCVTensorBatchHandle handle, void **outUserPointer)) +{ + return priv::ProtectCall( + [&] + { + if (outUserPointer == nullptr) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to user pointer must not be NULL"); + } + auto &tb = priv::ToStaticRef(handle); + *outUserPointer = tb.userPointer(); + }); +} diff --git a/src/nvcv_types/include/nvcv/Array.h b/src/nvcv_types/include/nvcv/Array.h index cb4bdb83..077e6f09 100644 --- a/src/nvcv_types/include/nvcv/Array.h +++ b/src/nvcv_types/include/nvcv/Array.h @@ -301,6 +301,20 @@ NVCV_PUBLIC NVCVStatus nvcvArrayGetLength(NVCVArrayHandle handle, int64_t *lengt */ NVCV_PUBLIC NVCVStatus nvcvArrayGetCapacity(NVCVArrayHandle handle, int64_t *capacity); +/** + * Resizes the array length to the specified length up to the capacity. + * + * @param[in] handle Array to be queried. + * + Must not be NULL. + * + Must have been created by @ref nvcvArrayConstruct. + * + * @param[in] length The input length of the array. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvArrayResize(NVCVArrayHandle handle, int64_t length); + /** * Retrieve the array target.
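// Usage sketch for the nvcvArrayResize API declared above. The helper name GrowArrayIfPossible is
// hypothetical, not an NVCV API; it assumes an already-constructed NVCVArrayHandle. Resizing only
// adjusts the valid length within the preallocated capacity, so requests above the capacity
// reported by nvcvArrayGetCapacity are rejected.
#include <nvcv/Array.h>

static NVCVStatus GrowArrayIfPossible(NVCVArrayHandle array, int64_t newLength)
{
    int64_t capacity = 0;
    NVCVStatus status = nvcvArrayGetCapacity(array, &capacity);
    if (status != NVCV_SUCCESS)
        return status;

    // The resize call itself fails with NVCV_ERROR_INVALID_ARGUMENT in this case; checking first
    // just makes the contract explicit.
    if (newLength > capacity)
        return NVCV_ERROR_INVALID_ARGUMENT;

    return nvcvArrayResize(array, newLength);
}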
* diff --git a/src/nvcv_types/include/nvcv/Array.hpp b/src/nvcv_types/include/nvcv/Array.hpp index 5acd3387..81b99675 100644 --- a/src/nvcv_types/include/nvcv/Array.hpp +++ b/src/nvcv_types/include/nvcv/Array.hpp @@ -46,6 +46,8 @@ class Array : public CoreResource ArrayData exportData() const; + void resize(int64_t length); + template Optional exportData() const { diff --git a/src/nvcv_types/include/nvcv/Fwd.h b/src/nvcv_types/include/nvcv/Fwd.h index e3152414..c9702cb1 100644 --- a/src/nvcv_types/include/nvcv/Fwd.h +++ b/src/nvcv_types/include/nvcv/Fwd.h @@ -31,10 +31,11 @@ extern "C" { #endif -typedef struct NVCVImage *NVCVImageHandle; -typedef struct NVCVImageBatch *NVCVImageBatchHandle; -typedef struct NVCVTensor *NVCVTensorHandle; -typedef struct NVCVArray *NVCVArrayHandle; +typedef struct NVCVImage *NVCVImageHandle; +typedef struct NVCVImageBatch *NVCVImageBatchHandle; +typedef struct NVCVTensor *NVCVTensorHandle; +typedef struct NVCVTensorBatch *NVCVTensorBatchHandle; +typedef struct NVCVArray *NVCVArrayHandle; #ifdef __cplusplus } diff --git a/src/nvcv_types/include/nvcv/ImageBatch.h b/src/nvcv_types/include/nvcv/ImageBatch.h index 801d95ef..3673b25b 100644 --- a/src/nvcv_types/include/nvcv/ImageBatch.h +++ b/src/nvcv_types/include/nvcv/ImageBatch.h @@ -68,7 +68,7 @@ typedef struct NVCVImageBatchVarShapeRequirementsRec /** Calculates the resource requirements needed to create a varshape image batch. * * @param [in] capacity Maximum number of images that fits in the image batch. - * + Must be >= 1. + * + Must be >= 0. * * @param [out] reqs Where the image batch requirements will be written to. * + Must not be NULL. @@ -104,12 +104,10 @@ NVCV_PUBLIC NVCVStatus nvcvImageBatchVarShapeConstruct(const NVCVImageBatchVarSh /** Decrements the reference count of an existing image batch instance. * * The image batch is destroyed when its reference count reaches zero. - + * * If the image has type @ref NVCV_TYPE_IMAGEBATCH_TENSOR_WRAPDATA and has a cleanup function defined, * cleanup will be called. * - * @note The image batch object must not be in use in current and future operations. - * * @param [in] handle Image batch to be destroyed. * If NULL, no operation is performed, successfully. * + The handle must have been created with any of the nvcvImageBatchXXXConstruct functions. diff --git a/src/nvcv_types/include/nvcv/ImageData.h b/src/nvcv_types/include/nvcv/ImageData.h index f8a49889..c34fb226 100644 --- a/src/nvcv_types/include/nvcv/ImageData.h +++ b/src/nvcv_types/include/nvcv/ImageData.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/nvcv_types/include/nvcv/Size.h b/src/nvcv_types/include/nvcv/Size.h new file mode 100644 index 00000000..fe6db006 --- /dev/null +++ b/src/nvcv_types/include/nvcv/Size.h @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_SIZE_H +#define NVCV_SIZE_H + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @brief Struct representing a two-dimensional size. + * + * This structure is designed to represent a width and height in 2D space. + */ +typedef struct +{ + int32_t w, h; +} NVCVSize2D; + +#ifdef __cplusplus +} +#endif + +#endif // NVCV_SIZE_H diff --git a/src/nvcv_types/include/nvcv/Size.hpp b/src/nvcv_types/include/nvcv/Size.hpp index 9a3020be..cce55e04 100644 --- a/src/nvcv_types/include/nvcv/Size.hpp +++ b/src/nvcv_types/include/nvcv/Size.hpp @@ -24,6 +24,8 @@ #ifndef NVCV_SIZE_HPP #define NVCV_SIZE_HPP +#include "Size.h" + #include #include #include @@ -35,52 +37,75 @@ namespace nvcv { * @{ */ -/** - * @brief Struct representing a two-dimensional size. - * - * This structure is designed to represent a width and height in 2D space. - */ -struct Size2D +struct Size2D : NVCVSize2D { - int w, h; -}; + using NVCVSize2D::NVCVSize2D; -/** - * @brief Compares two Size2D structures for equality. - * - * @param a First size to compare. - * @param b Second size to compare. - * @return true if both width and height of `a` and `b` are equal, otherwise false. - */ -inline bool operator==(const Size2D &a, const Size2D &b) -{ - return std::tie(a.w, a.h) == std::tie(b.w, b.h); -} + constexpr Size2D(int32_t w, int32_t h) + : NVCVSize2D{w, h} + { + } -/** - * @brief Compares two Size2D structures for inequality. - * - * @param a First size to compare. - * @param b Second size to compare. - * @return true if width or height of `a` and `b` are not equal, otherwise false. - */ -inline bool operator!=(const Size2D &a, const Size2D &b) -{ - return !(a == b); -} + constexpr Size2D(const NVCVSize2D &s) + : NVCVSize2D{s.w, s.h} + { + } + + inline Size2D &operator=(const NVCVSize2D &s) + { + static_cast(*this) = s; + return *this; + } + + /** + * @brief Compares two Size2D structures for equality. + * + * @param a First size to compare. + * @param b Second size to compare. + * @return true if both width and height of `a` and `b` are equal, otherwise false. + */ + constexpr bool operator==(const Size2D &rhs) const + { + return w == rhs.w && h == rhs.h; + } + + /** + * @brief Compares two Size2D structures for inequality. + * + * @param a First size to compare. + * @param b Second size to compare. + * @return true if width or height of `a` and `b` are not equal, otherwise false. + */ + constexpr bool operator!=(const Size2D &rhs) const + { + return !(*this == rhs); + } + + /** + * @brief Compares two Size2D structures. + * + * The comparison is based on the width first, and then the height. + * + * @param a First size to compare. + * @param b Second size to compare. + * @return true if `a` is less than `b`, otherwise false. + */ + inline bool operator<(const nvcv::Size2D &rhs) const + { + return std::tie(w, h) < std::tie(rhs.w, rhs.h); + } +}; /** - * @brief Compares two Size2D structures. - * - * The comparison is based on the width first, and then the height. + * @brief Computes the maximum size in each dimension * * @param a First size to compare. 
* @param b Second size to compare. - * @return true if `a` is less than `b`, otherwise false. + * @return The size with `w` and `h` computed as a maximum of the respective fields in `a` and `b`. */ -inline bool operator<(const Size2D &a, const Size2D &b) +constexpr Size2D MaxSize(const Size2D &a, const Size2D &b) { - return std::tie(a.w, a.h) < std::tie(b.w, b.h); + return {b.w > a.w ? b.w : a.w, b.h > a.h ? b.h : a.h}; } /** @@ -92,7 +117,7 @@ inline bool operator<(const Size2D &a, const Size2D &b) * @param size Size2D structure to be output. * @return Reference to the modified output stream. */ -inline std::ostream &operator<<(std::ostream &out, const Size2D &size) +inline std::ostream &operator<<(std::ostream &out, const nvcv::Size2D &size) { return out << size.w << "x" << size.h; } diff --git a/src/nvcv_types/include/nvcv/Tensor.h b/src/nvcv_types/include/nvcv/Tensor.h index fc1f2385..fef89e67 100644 --- a/src/nvcv_types/include/nvcv/Tensor.h +++ b/src/nvcv_types/include/nvcv/Tensor.h @@ -371,6 +371,31 @@ NVCV_PUBLIC NVCVStatus nvcvTensorExportData(NVCVTensorHandle handle, NVCVTensorD */ NVCV_PUBLIC NVCVStatus nvcvTensorGetShape(NVCVTensorHandle handle, int32_t *rank, int64_t *shape); +/** + * Creates a view of a tensor with a different shape and layout. + * + * @param[in] handle Tensor to create a view from. + * + Must not be NULL. + * + * @param[in] rank Number of elements in the shape buffer argument. + * + Must be a number between 1 and NVCV_TENSOR_MAX_RANK + * + * @param[in] shape New shape. + * Must point to a buffer with @p rank elements. + * Elements above actual number of dimensions will be ignored. + * + * @param[in] layout New layout. + * Must have @p rank elements or be empty. + * + * @param [out] handle Where the tensor instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is invalid. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorReshape(NVCVTensorHandle handle, int32_t rank, const int64_t *shape, + NVCVTensorLayout layout, NVCVTensorHandle *out_handle); + #ifdef __cplusplus } #endif diff --git a/src/nvcv_types/include/nvcv/Tensor.hpp b/src/nvcv_types/include/nvcv/Tensor.hpp index 570ca8c1..acb7e5b9 100644 --- a/src/nvcv_types/include/nvcv/Tensor.hpp +++ b/src/nvcv_types/include/nvcv/Tensor.hpp @@ -106,6 +106,12 @@ class Tensor : public CoreResource */ void *userPointer() const; + /** + * @brief Creates a view of the tensor with a new shape and layout + * + */ + Tensor reshape(const TensorShape &new_shape); + /** * @brief Calculates the requirements for a tensor given its shape and data type. * diff --git a/src/nvcv_types/include/nvcv/TensorBatch.h b/src/nvcv_types/include/nvcv/TensorBatch.h new file mode 100644 index 00000000..597f8dc1 --- /dev/null +++ b/src/nvcv_types/include/nvcv/TensorBatch.h @@ -0,0 +1,278 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
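// Usage sketch for the reshape-as-view C API declared above (nvcvTensorReshape). The helper name
// FlattenTensor is hypothetical; it assumes the requested shape covers the same number of elements
// as the input, and that `layout` either has exactly `rank` entries or is empty, as the API
// requires. The returned handle is a new reference that views the same memory; release it with
// nvcvTensorDecRef when no longer needed.
#include <nvcv/Tensor.h>

static NVCVStatus FlattenTensor(NVCVTensorHandle input, int64_t numElements, NVCVTensorLayout layout,
                                NVCVTensorHandle *outView)
{
    // Rank-1 view over the tensor's elements; the source tensor is kept alive by the view.
    const int64_t shape[1] = {numElements};
    return nvcvTensorReshape(input, 1, shape, layout, outView);
}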
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file TensorBatch.h + * + * @brief Public C interface to NVCV representation of a batch of tensors. + */ + +#ifndef NVCV_TENSORBATCH_H +#define NVCV_TENSORBATCH_H + +#include "Export.h" +#include "Fwd.h" +#include "Image.h" +#include "Status.h" +#include "Tensor.h" +#include "TensorBatchData.h" +#include "TensorLayout.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +typedef struct NVCVTensorBatch *NVCVTensorBatchHandle; + +/** Stores the requirements of a tensor batch. */ +typedef struct NVCVTensorBatchRequirementsRec +{ + /*< Maximum number of tensors in the batch */ + int32_t capacity; + + /*< Alignment/block size in bytes */ + int32_t alignBytes; + + /*< Tensor resource requirements. */ + NVCVRequirements mem; +} NVCVTensorBatchRequirements; + +/** Calculates the resource requirements needed to create a tensor batch. + * + * @param [in] capacity Maximum number of tensors that fit in the tensor batch. + * + Must be >= 1. + * + * @param [out] reqs Where the tensor batch requirements will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchCalcRequirements(int32_t capacity, NVCVTensorBatchRequirements *reqs); + +NVCVStatus nvcvTensorBatchConstruct(const NVCVTensorBatchRequirements *req, NVCVAllocatorHandle alloc, + NVCVTensorBatchHandle *outHandle); + +NVCVStatus nvcvTensorBatchClear(NVCVTensorBatchHandle handle); + +NVCVStatus nvcvTensorBatchPushTensors(NVCVTensorBatchHandle handle, const NVCVTensorHandle *tensors, + int32_t numTensors); + +/** + * Pop tensors from the end of the tensor batch. + * + * @param[in] handle Tensor batch to be manipulated + * + Must not be NULL. + * + The handle must have been created with @ref nvcvTensorBatchConstruct. + * + * @param[in] numTensors Number of tensors to remove. + * + Must be >= 1. + * + Must be <= number of tensors in the batch. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. + * @retval #NVCV_ERROR_UNDERFLOW Tried to remove more tensors than there are in the batch. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCVStatus nvcvTensorBatchPopTensors(NVCVTensorBatchHandle handle, int32_t numTensors); + +/** Allocates multiple tensors and adds them to a TensorBatch + * + * This function allocates the storage for multiple tensors, creates the tensors and puts them in the batch. + * + * @param batch a handle to the batch object to which the new tensors will be added + * @param numTensors the number of tensors to add + * @param shapes the shapes of the tensors to be added + * @param strides the strides of the tensors to be added; if NULL, the tensors are densely packed + * @param tensorAlignment the alignment, in bytes, of the base pointer of each tensor in the batch + */ +NVCVStatus nvcvTensorBatchPopulate(NVCVTensorBatchHandle batch, int32_t numTensors, const int64_t **shapes, + const int64_t **strides /* optional, dense packing if NULL */, + int32_t tensorAlignment /* optional, use default if set to 0 */); + +/** Gets handles to a range of tensors in the batch + * + * This function creates new references to the Tensor handles. The caller must release them by calling + * nvcvTensorDecRef on all handles returned by this function.
+ * + * @param batch a handle to the batch object from which the tensors are extracted + * @param index the index of the first handle to get + * @param outTensors the array in which the handles are stored; it must have at least + * numTensors handles + * @param numTensors the number of tensors to get + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. + * @retval #NVCV_ERROR_OVERFLOW Tried to retrieve more tensors than there are in the batch. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCVStatus nvcvTensorBatchGetTensors(NVCVTensorBatchHandle batch, int32_t index, NVCVTensorHandle *outTensors, + int32_t numTensors); + +/** Sets a range of tensors in the batch + * + * TBD: Do we need/want it? + * Should it also extend the batch if index + numTensors > size (but within capacity)? + */ +NVCVStatus nvcvTensorBatchSetTensors(NVCVTensorBatchHandle batch, int32_t index, const NVCVTensorHandle *tensors, + int32_t numTensors); + +NVCVStatus nvcvTensorBatchGetAllocator(NVCVTensorBatchHandle batch, NVCVAllocatorHandle *alloc); + +NVCVStatus nvcvTensorBatchGetType(NVCVTensorBatchHandle batch, NVCVTensorBufferType *outType); + +/** + * Retrieve the tensor batch contents. + * + * @param[in] handle Tensor batch to be queried. + * + Must not be NULL. + * + * @param[in] stream CUDA stream where the export operation will execute. + * + * @param[out] data Where the tensor batch buffer information will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchExportData(NVCVTensorBatchHandle handle, CUstream stream, + NVCVTensorBatchData *data); + +NVCVStatus nvcvTensorBatchGetNumTensors(NVCVTensorBatchHandle batch, int32_t *outNumTensors); + +/** Decrements the reference count of an existing TensorBatch instance. + * + * The Tensor batch is destroyed when its reference count reaches zero. + * + * @param [in] handle Tensor batch to be destroyed. + * If NULL, no operation is performed, successfully. + * + The handle must have been created with any of the nvcvTensorBatchXXXConstruct functions. + * + * @param [out] newRefCount The decremented reference count. If the return value is 0, the object was destroyed. + * Can be NULL, if the caller isn't interested in the new reference count. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT The handle is invalid + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchDecRef(NVCVTensorBatchHandle handle, int32_t *newRefCount); + +/** Increments the reference count of a TensorBatch. + * + * @param [in] handle Tensor batch to be retained. + * + * @param [out] newRefCount The incremented reference count. + * Can be NULL, if the caller isn't interested in the new reference count. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT The handle is invalid + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchIncRef(NVCVTensorBatchHandle handle, int32_t *newRefCount); + +/** Returns the current reference count of a Tensor batch + * + * @param [in] handle The handle whose reference count is to be obtained. + * + * @param [out] outRefCount The reference count. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT The handle is invalid + * @retval #NVCV_SUCCESS Operation executed successfully.
+ */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchRefCount(NVCVTensorBatchHandle handle, int32_t *outRefCount); + +/** Associates a user pointer to the Tensor batch handle. + * + * This pointer can be used to associate any kind of data with the Tensor batch object. + * + * @param [in] handle Tensor batch to be associated with the user pointer. + * + * @param [in] userPtr User pointer. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchSetUserPointer(NVCVTensorBatchHandle handle, void *userPtr); + +/** Returns the user pointer associated with the Tensor batch handle. + * + * If no user pointer was associated, it'll return a pointer to NULL. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outUserPtr Pointer to where the user pointer will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetUserPointer(NVCVTensorBatchHandle handle, void **outUserPtr); + +/** Returns the capacity of the Tensor batch handle. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outCapacityPtr Pointer to where the capacity will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetCapacity(NVCVTensorBatchHandle handle, int32_t *outCapacityPtr); + +/** Returns the data type of the Tensor batch handle. + * + * Returns NVCV_DATA_TYPE_NONE for empty batches. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outDTypePtr Pointer to where the data type will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetDType(NVCVTensorBatchHandle handle, NVCVDataType *outDTypePtr); + +/** Returns the layout of the Tensor batch handle. + * + * Returns the empty layout for empty batches. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outLayoutPtr Pointer to where the layout will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetLayout(NVCVTensorBatchHandle handle, NVCVTensorLayout *outLayoutPtr); + +/** Returns the rank of tensors in the tensor batch or -1 for an empty batch. + * + * @param [in] handle Tensor batch to be queried. + * + * @param [in] outRankPtr Pointer to where the rank will be stored. + * + Cannot be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_SUCCESS Operation executed successfully.
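// Usage sketch of the tensor batch C API declared in this header: compute requirements, construct,
// push tensors, and hand ownership to the caller. `tensors` is assumed to hold valid NVCVTensorHandle
// values with a consistent rank, dtype and layout; passing NULL as the allocator is assumed to select
// the default allocator, as with the other NVCV containers. The helper name BuildTensorBatch is
// hypothetical.
#include <nvcv/TensorBatch.h>

#include <stddef.h>

static NVCVStatus BuildTensorBatch(const NVCVTensorHandle *tensors, int32_t numTensors,
                                   NVCVTensorBatchHandle *outBatch)
{
    NVCVTensorBatchRequirements reqs;
    NVCVStatus status = nvcvTensorBatchCalcRequirements(numTensors, &reqs);
    if (status != NVCV_SUCCESS)
        return status;

    status = nvcvTensorBatchConstruct(&reqs, NULL, outBatch);
    if (status != NVCV_SUCCESS)
        return status;

    status = nvcvTensorBatchPushTensors(*outBatch, tensors, numTensors);
    if (status != NVCV_SUCCESS)
    {
        // Roll back the construction on failure; the batch is destroyed when its refcount hits zero.
        nvcvTensorBatchDecRef(*outBatch, NULL);
        *outBatch = NULL;
    }
    return status;
}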
+ */ +NVCV_PUBLIC NVCVStatus nvcvTensorBatchGetRank(NVCVTensorBatchHandle handle, int32_t *outRankPtr); + +#ifdef __cplusplus +} +#endif + +#endif // NVCV_TENSORBATCH_H diff --git a/src/nvcv_types/include/nvcv/TensorBatch.hpp b/src/nvcv_types/include/nvcv/TensorBatch.hpp new file mode 100644 index 00000000..4aee9e14 --- /dev/null +++ b/src/nvcv_types/include/nvcv/TensorBatch.hpp @@ -0,0 +1,244 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_TENSORBATCH_HPP +#define NVCV_TENSORBATCH_HPP + +#include "CoreResource.hpp" +#include "TensorBatch.h" +#include "TensorBatchData.hpp" +#include "alloc/Allocator.hpp" + +#include + +#include + +namespace nvcv { + +NVCV_IMPL_SHARED_HANDLE(TensorBatch); + +/** + * @brief Handle to a tensor batch object. + * + * Tensor batch is a container type that can hold a list of non-uniformly shaped tensors. + * Rank, data type and layout must be consistent between the tensors. + */ +class TensorBatch : public CoreResource +{ +public: + using Base = CoreResource; + using Requirements = NVCVTensorBatchRequirements; + using HandleType = NVCVTensorBatchHandle; + + static Requirements CalcRequirements(int32_t capacity); + + NVCV_IMPLEMENT_SHARED_RESOURCE(TensorBatch, Base); + + TensorBatch(const Requirements &reqs, const Allocator &alloc = nullptr); + + TensorBatch(int32_t capacity, const Allocator &alloc = nullptr); + + /** + * @brief Return the maximal number of tensors the tensor batch can hold. + */ + int32_t capacity() const; + + /** + * @brief Return the rank of the tensors in the tensor batch or -1 for an empty batch. + */ + int32_t rank() const; + + /** + * @brief Return the number of tensors in the tensor batch. + */ + int32_t numTensors() const; + + /** + * @brief Return the data type of the tensors in the tensor batch. + */ + DataType dtype() const; + + /** + * @brief Return the layout of the tensors in the tensor batch. + */ + TensorLayout layout() const; + + /** + * @brief Return the buffer type of the tensors' data. + */ + NVCVTensorBufferType type() const; + + /** + * @brief Return the allocator used by the tensor batch. + */ + Allocator alloc() const; + + /** + * @brief Append tensors from the given range to the end of the batch. + * + * @param begin,end range of the tensors to append. + */ + template + void pushBack(It begin, It end); + + /** + * @brief Append the \a tensor to the end of the batch. + * + * @param tensor Appended tensor. + */ + void pushBack(const Tensor &tensor); + + /** + * @brief Truncate tensors from the end of the batch. + * + * @param numTensors Number of tensors to remove. + */ + void popTensors(int32_t numTensors); + + /** + * @brief Delete the last tensor from the batch. + */ + void popTensor(); + + /** + * @brief Generate the tensor batch data descriptor. + * + * The necessary copies to GPU are scheduled on the given stream. 
+ * The struct is valid after the scheduled work is finished. + * + * @param stream CUDA stream on which the buffers copy will be scheduled. + */ + TensorBatchData exportData(CUstream stream); + + void clear(); + + /** + * @brief Associates a user pointer to the tensor batch. + * + * @param ptr User pointer + */ + void setUserPointer(void *ptr); + + /** + * @brief Get the user pointer that was previously associated to the tensor batch + * with the setUserPointer(void*) method. Returns nullptr if no pointer was set. + */ + void *getUserPointer() const; + + /** + * @brief Return a handle to a tensor at a given position. + * + * @param idx Index of a tensor to return + */ + Tensor operator[](int32_t idx) const; + + /** + * @brief Replace the tensor on position \a index. + */ + void setTensor(int32_t index, const Tensor &tensor); + + class Iterator; + + Iterator begin() const; + + Iterator end() const; +}; + +class TensorBatch::Iterator +{ +public: + using value_type = Tensor; + using reference = const Tensor &; + using pointer = const Tensor *; + using iterator_category = std::random_access_iterator_tag; + using difference_type = int32_t; + + reference operator*() const; + pointer operator->() const; + + Iterator operator++(int); + Iterator &operator++(); + Iterator operator--(int); + Iterator &operator--(); + + Iterator operator+(difference_type diff) const; + Iterator operator-(difference_type diff) const; + + difference_type operator-(const Iterator &rhs) const; + + bool operator==(const Iterator &rhs) const; + bool operator!=(const Iterator &rhs) const; + bool operator<(const Iterator &rhs) const; + bool operator>(const Iterator &rhs) const; + bool operator<=(const Iterator &rhs) const; + bool operator>=(const Iterator &rhs) const; + + Iterator(Iterator &other) + : Iterator() + { + *this = other; + } + + Iterator(Iterator &&other) + : Iterator() + { + *this = std::move(other); + } + + Iterator &operator=(Iterator &other) + { + m_tensorBatch = other.m_tensorBatch; + m_idx = other.m_idx; + m_currentTensor = other.m_currentTensor; + return *this; + } + + Iterator &operator=(Iterator &&other) + { + m_tensorBatch = other.m_tensorBatch; + m_idx = other.m_idx; + m_currentTensor = std::move(other.m_currentTensor); + return *this; + } + +private: + friend class TensorBatch; + + Iterator() = default; + + Iterator(const TensorBatch *tensorBatch, int32_t idx) + : m_tensorBatch(tensorBatch) + , m_idx(idx) + , m_currentTensor{} + { + UpdateCurrentTensor(); + } + + void UpdateCurrentTensor(); + + const TensorBatch *m_tensorBatch = nullptr; + int32_t m_idx = 0; + mutable Tensor m_currentTensor = {}; +}; + +using TensorBatchWrapHandle = NonOwningResource; + +} // namespace nvcv + +#include "detail/TensorBatchImpl.hpp" + +#endif // NVCV_TENSORBATCH_HPP diff --git a/src/nvcv_types/include/nvcv/TensorBatchData.h b/src/nvcv_types/include/nvcv/TensorBatchData.h new file mode 100644 index 00000000..9b980e7c --- /dev/null +++ b/src/nvcv_types/include/nvcv/TensorBatchData.h @@ -0,0 +1,65 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
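// A small C++ sketch of the nvcv::TensorBatch container declared above: build a batch from existing
// tensors and return it by value (the handle is shared and reference counted). `tensors` is assumed
// to contain nvcv::Tensor objects with matching rank, dtype and layout; the helper name
// MakeTensorBatch is hypothetical. A TensorBatchData descriptor for device code can later be
// obtained with batch.exportData(stream).
#include <nvcv/Tensor.hpp>
#include <nvcv/TensorBatch.hpp>

#include <vector>

inline nvcv::TensorBatch MakeTensorBatch(const std::vector<nvcv::Tensor> &tensors)
{
    nvcv::TensorBatch batch(static_cast<int32_t>(tensors.size()));

    // pushBack(begin, end) appends a whole range; pushBack(tensor) appends a single element.
    batch.pushBack(tensors.begin(), tensors.end());
    return batch;
}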
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_TENSORBATCHDATA_H +#define NVCV_TENSORBATCHDATA_H + +#include "TensorData.h" +#include "TensorLayout.h" + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Describes a single tensor in a batch */ +typedef struct NVCVTensorBatchElementStridedRec +{ + alignas(128) NVCVByte *data; + int64_t shape[NVCV_TENSOR_MAX_RANK]; + int64_t stride[NVCV_TENSOR_MAX_RANK]; +} NVCVTensorBatchElementStrided; + +/** Describes a batch of tensors */ +typedef struct NVCVTensorBatchBufferStridedRec +{ + NVCVTensorBatchElementStrided *tensors; +} NVCVTensorBatchBufferStrided; + +typedef union NVCVTensorBatchBufferRec +{ + NVCVTensorBatchBufferStrided strided; +} NVCVTensorBatchBuffer; + +typedef struct NVCVTensorBatchDataRec +{ + NVCVDataType dtype; + NVCVTensorLayout layout; + int32_t rank; + int32_t numTensors; + + NVCVTensorBufferType type; + NVCVTensorBatchBuffer buffer; +} NVCVTensorBatchData; + +#ifdef __cplusplus +} +#endif + +#endif // NVCV_TENSORBATCHDATA_H diff --git a/src/nvcv_types/include/nvcv/TensorBatchData.hpp b/src/nvcv_types/include/nvcv/TensorBatchData.hpp new file mode 100644 index 00000000..b9c425d1 --- /dev/null +++ b/src/nvcv_types/include/nvcv/TensorBatchData.hpp @@ -0,0 +1,172 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_TENSORBATCHDATA_HPP +#define NVCV_TENSORBATCHDATA_HPP + +#include "Optional.hpp" +#include "TensorBatchData.h" +#include "TensorShape.hpp" + +#include +#include + +namespace nvcv { + +/** + * @brief General type representing data of any tensor batch. + */ +class TensorBatchData +{
+public: + TensorBatchData(const NVCVTensorBatchData &data) + : m_data(data) + { + } + + /** + * @brief Return rank of the tensors in the batch. + */ + int rank() const + { + return m_data.rank; + } + + /** + * @brief Return the layout of the tensors in the batch. + */ + TensorLayout layout() const + { + return m_data.layout; + } + + /** + * @brief Return the data type of the tensors in the batch. + */ + DataType dtype() const + { + return DataType(m_data.dtype); + } + + /** + * @brief Return the number of the tensors in the batch. + */ + int32_t numTensors() const + { + return m_data.numTensors; + } + + /** + * @brief Return underlying C struct representing the tensor batch data.
+ */ + NVCVTensorBatchData cdata() const + { + return m_data; + } + + static constexpr bool IsCompatibleKind(NVCVTensorBufferType kind) + { + return kind != NVCV_TENSOR_BUFFER_NONE; + } + + /** + * @brief Cast the tensor batch data to a derived type (e.g. TensorBatchDataStridedCuda) + * @tparam Derived target type + */ + template + Optional cast() const + { + static_assert(std::is_base_of::value, + "Cannot cast TensorBatchData to an unrelated type"); + static_assert(sizeof(Derived) == sizeof(TensorBatchData), "The derived type must not add new data members."); + + if (IsCompatible()) + { + return {Derived(m_data)}; + } + else + { + return {}; + } + } + + /** + * @brief Checks if data can be casted to a given derived type. + * @tparam Derived tested type + */ + template + bool IsCompatible() const + { + static_assert(std::is_base_of::value, + "TensorBatchData cannot be compatible with unrelated type"); + return Derived::IsCompatibleKind(m_data.type); + } + +protected: + TensorBatchData() = default; + + NVCVTensorBatchData &data() + { + return m_data; + } + +private: + NVCVTensorBatchData m_data{}; +}; + +/** + * @brief Data of batches of tensors with strides. + */ +class TensorBatchDataStrided : public TensorBatchData +{ +public: + using Buffer = NVCVTensorBatchBufferStrided; + + /** + * @brief Get the buffer with the tensors' descriptors. + */ + Buffer buffer() const + { + return cdata().buffer.strided; + } + + static constexpr bool IsCompatibleKind(NVCVTensorBufferType kind) + { + return kind == NVCV_TENSOR_BUFFER_STRIDED_CUDA; + } + +protected: + using TensorBatchData::TensorBatchData; +}; + +/** + * @brief Data of batches of CUDA tensors with strides. + */ +class TensorBatchDataStridedCuda : public TensorBatchDataStrided +{ +public: + using TensorBatchDataStrided::TensorBatchDataStrided; + + static constexpr bool IsCompatibleKind(NVCVTensorBufferType kind) + { + return kind == NVCV_TENSOR_BUFFER_STRIDED_CUDA; + } +}; + +} // namespace nvcv + +#endif // NVCV_TENSORBATCHDATA_HPP diff --git a/src/nvcv_types/include/nvcv/alloc/Allocator.hpp b/src/nvcv_types/include/nvcv/alloc/Allocator.hpp index f4a15a3c..8c725183 100644 --- a/src/nvcv_types/include/nvcv/alloc/Allocator.hpp +++ b/src/nvcv_types/include/nvcv/alloc/Allocator.hpp @@ -25,6 +25,7 @@ #include "Allocator.h" #include +#include #include #include diff --git a/src/nvcv_types/include/nvcv/cuda/BorderVarShapeWrap.hpp b/src/nvcv_types/include/nvcv/cuda/BorderVarShapeWrap.hpp index a20e975a..71f8e0cc 100644 --- a/src/nvcv_types/include/nvcv/cuda/BorderVarShapeWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/BorderVarShapeWrap.hpp @@ -412,25 +412,25 @@ class BorderVarShapeWrapNHWC : public detail::BorderVarShapeWrapNHWCImpl /** * Subscript operator for read-only or read-and-write access (depending on value type). * - * @param[in] c 4D coordinate (x column, y row, z sample, w channel) to be accessed. + * @param[in] c 4D coordinates (x sample, y row, z col, w channel) to be accessed. * * @return Accessed (const) reference. */ inline __host__ __device__ ValueType &operator[](int4 c) const { - return *doGetPtr(c.z, c.y, c.x, c.w); + return *doGetPtr(c.x, c.y, c.z, c.w); } /** * Subscript operator for read-only or read-and-write access (depending on value type, considering plane=0). * - * @param[in] c 3D coordinate (x column, y row, z sample) (first channel) to be accessed. + * @param[in] c 3D coordinates (x sample, y row, z col) (first channel) to be accessed. * * @return Accessed (const) reference. */ inline __host__ __device__ ValueType &operator[](int3 c) const { - return *doGetPtr(c.z, c.y, c.x, 0); + return *doGetPtr(c.x, c.y, c.z, 0); } /** @@ -523,13 +523,13 @@ class BorderVarShapeWrapNHWC /** * Subscript operator for read-only or read-and-write access (depending on value type). * - * @param[in] c 4D coordinate (x column, y row, z sample, w channel) to be accessed. + * @param[in] c 4D coordinate (x sample, y row, z column, w channel) to be accessed. * * @return Accessed (const) reference.
*/ inline __host__ __device__ ValueType &operator[](int3 c) const { - return *doGetPtr(c.z, c.y, c.x, 0); + return *doGetPtr(c.x, c.y, c.z, 0); } /** @@ -523,13 +523,13 @@ class BorderVarShapeWrapNHWC /** * Subscript operator for read-only or read-and-write access (depending on value type). * - * @param[in] c 4D coordinate (x column, y row, z sample, w channel) to be accessed. + * @param[in] c 4D coordinate (x sample, y row, z column, w channel) to be accessed. * * @return Accessed (const) reference. */ inline __host__ __device__ ValueType &operator[](int4 c) const { - ValueType *p = doGetPtr(c.z, c.y, c.x, c.w); + ValueType *p = doGetPtr(c.x, c.y, c.z, c.w); if (p == nullptr) { @@ -542,13 +542,13 @@ class BorderVarShapeWrapNHWC /** * Subscript operator for read-only or read-and-write access (depending on value type, considering plane=0). * - * @param[in] c 3D coordinate (x column, y row, z sample) (first channel) to be accessed. + * @param[in] c 3D coordinate (x sample, y row, z col) (first channel) to be accessed. * * @return Accessed (const) reference. */ inline __host__ __device__ ValueType &operator[](int3 c) const { - ValueType *p = doGetPtr(c.z, c.y, c.x, 0); + ValueType *p = doGetPtr(c.x, c.y, c.z, 0); if (p == nullptr) { diff --git a/src/nvcv_types/include/nvcv/cuda/ImageBatchVarShapeWrap.hpp b/src/nvcv_types/include/nvcv/cuda/ImageBatchVarShapeWrap.hpp index 7373b977..482f5b3a 100644 --- a/src/nvcv_types/include/nvcv/cuda/ImageBatchVarShapeWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/ImageBatchVarShapeWrap.hpp @@ -384,25 +384,26 @@ class ImageBatchVarShapeWrapNHWC : ImageBatchVarShapeWrap /** * Subscript operator for either read-only or read-and-write access. * - * @param[in] c 4D coordinates (x column, y row, z sample, w channel) to be accessed. + * @param[in] c 4D coordinates (x sample, y row, z col, w channel) to be accessed. + * * * @return Accessed reference. */ inline __host__ __device__ T &operator[](int4 c) const { - return *doGetPtr(c.z, c.y, c.x, c.w); + return *doGetPtr(c.x, c.y, c.z, c.w); } /** * Subscript operator for either read-only or read-and-write access. * - * @param[in] c 3D coordinates (x column, y row, z sample) (first channel) to be accessed. + * @param[in] c 3D coordinates (x sample, y row, z col) (first channel) to be accessed. * * @return Accessed reference. */ inline __host__ __device__ T &operator[](int3 c) const { - return *doGetPtr(c.z, c.y, c.x, 0); + return *doGetPtr(c.x, c.y, c.z, 0); } /** diff --git a/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp b/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp index b8a00432..d91e3f5f 100644 --- a/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp +++ b/src/nvcv_types/include/nvcv/cuda/math/LinAlg.hpp @@ -209,7 +209,7 @@ class Vector } // On-purpose public data to allow POD-class direct initialization. - T m_data[N] = {}; + T m_data[N]; }; /** diff --git a/src/nvcv_types/include/nvcv/detail/Align.hpp b/src/nvcv_types/include/nvcv/detail/Align.hpp new file mode 100644 index 00000000..11e4d814 --- /dev/null +++ b/src/nvcv_types/include/nvcv/detail/Align.hpp @@ -0,0 +1,106 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
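// Illustration of the new subscript coordinate packing documented above for the NHWC var-shape
// wrappers: an int4/int3 index is now (x = sample, y = row, z = column, w = channel) rather than
// (x = column, ..., z = sample). The kernel below is a hypothetical fill used only to make the
// packing explicit; the per-sample bounds are passed in by the caller.
#include <nvcv/cuda/ImageBatchVarShapeWrap.hpp>

template<typename T>
__global__ void FillSample(nvcv::cuda::ImageBatchVarShapeWrapNHWC<T> dst, int sample, int rows, int cols, T value)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < cols && row < rows)
    {
        // x = sample index, y = row, z = column, w = channel.
        dst[int4{sample, row, col, 0}] = value;
    }
}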
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_DETAIL_ALIGN_HPP +#define NVCV_DETAIL_ALIGN_HPP + +#include +#include + +namespace nvcv { namespace detail { + +/** + * @brief Aligns the @p value down to a multiple of @p alignment_pow2 + * + * The function operates by masking the least significant bits of the value. + * If the alignment is not a power of two, the behavior is undefined. + * + * @remark Negative values are aligned down, not towards zero. + * + * @tparam T an integral type + * @param value a value to align + * @param alignment_pow2 the alignment, must be a positive power of 2 + * @return constexpr T the value aligned down to a multiple of @p alignment_pow2 + */ +template +constexpr T AlignDown(T value, T alignment_pow2) +{ + static_assert(std::is_integral::value, "Cannot align a value of a non-integral type"); + // Explanation: + // When alignment_pow2 is a power of 2 (for example 16), it has the form: + // 00010000 + // Negating it in two's complement gives: + // 11110000 + // We can use this as a mask to align a number _down_. + + // NOTE: This is much more efficient than (value/alignment) * alignment for run-time alignment values, where + // the compiler cannot replace the division/multiplication with bit shifts. + return value & -alignment_pow2; +} + +/** + * @brief Aligns the @p value up to a multiple of @p alignment_pow2 + * + * The function operates by adding alignment-1 to the value and masking the least significant bits. + * If the alignment is not a power of two, the behavior is undefined. + * + * @remark Negative values are aligned up, that is, towards zero. + * + * @tparam T an integral type + * @param value a value to align + * @param alignment_pow2 the alignment, must be a positive power of 2 + * @return constexpr T the value aligned up to a multiple of @p alignment_pow2 + */ +template +constexpr T AlignUp(T value, T alignment_pow2) +{ + static_assert(std::is_integral::value, "Cannot align a value of a non-integral type"); + return AlignDown(value + (alignment_pow2 - 1), alignment_pow2); +} + +/** + * @brief Checks if the value is a multiple of alignment + * + * @tparam T an integral type + * @param value the value whose alignment is checked + * @param alignment_pow2 the alignment, must be a power of 2 + * @return true if value is a multiple of alignment_pow2 + * @return false otherwise + */ +template +constexpr bool IsAligned(T value, T alignment_pow2) +{ + static_assert(std::is_integral::value, "Cannot check alignment of a value of a non-integral type"); + return (value & (alignment_pow2 - 1)) == 0; +} + +/** + * @brief Checks if a pointer is aligned to a multiple of @p alignment_pow2 bytes.
+ * + * @param ptr the pointer whose alignment is checked + * @param alignment_pow2 the alignment, must be a power of 2 + * @return true if value is a multiple of alignment_pow2 + * @return false otherwise + */ +inline bool IsAligned(const void *ptr, uintptr_t alignment_pow2) +{ + return IsAligned((uintptr_t)ptr, alignment_pow2); +} + +}} // namespace nvcv::detail + +#endif // NVCV_DETAIL_ALIGN_HPP diff --git a/src/nvcv_types/include/nvcv/detail/ArrayImpl.hpp b/src/nvcv_types/include/nvcv/detail/ArrayImpl.hpp index 13b4d60b..65775a47 100644 --- a/src/nvcv_types/include/nvcv/detail/ArrayImpl.hpp +++ b/src/nvcv_types/include/nvcv/detail/ArrayImpl.hpp @@ -36,6 +36,13 @@ inline int64_t Array::length() const return length; } +inline void Array::resize(int64_t length) +{ + NVCVArrayHandle harray = this->handle(); + + detail::CheckThrow(nvcvArrayResize(harray, length)); +} + inline int64_t Array::capacity() const { NVCVArrayHandle harray = this->handle(); diff --git a/src/nvcv_types/include/nvcv/detail/TensorBatchImpl.hpp b/src/nvcv_types/include/nvcv/detail/TensorBatchImpl.hpp new file mode 100644 index 00000000..1d028a65 --- /dev/null +++ b/src/nvcv_types/include/nvcv/detail/TensorBatchImpl.hpp @@ -0,0 +1,266 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_TENSORBATCH_IMPL_HPP +#define NVCV_TENSORBATCH_IMPL_HPP + +namespace nvcv { + +// TensorBatch + +inline TensorBatch::Requirements TensorBatch::CalcRequirements(int32_t capacity) +{ + TensorBatch::Requirements reqs = {}; + detail::CheckThrow(nvcvTensorBatchCalcRequirements(capacity, &reqs)); + return reqs; +} + +inline TensorBatch::TensorBatch(const TensorBatch::Requirements &reqs, const Allocator &alloc) +{ + NVCVTensorBatchHandle handle = nullptr; + detail::CheckThrow(nvcvTensorBatchConstruct(&reqs, alloc.handle(), &handle)); + reset(std::move(handle)); +} + +inline TensorBatch::TensorBatch(int32_t capacity, const Allocator &alloc) +{ + auto reqs = TensorBatch::CalcRequirements(capacity); + NVCVTensorBatchHandle handle = nullptr; + detail::CheckThrow(nvcvTensorBatchConstruct(&reqs, alloc.handle(), &handle)); + reset(std::move(handle)); +} + +inline int32_t TensorBatch::capacity() const +{ + int32_t output; + detail::CheckThrow(nvcvTensorBatchGetCapacity(handle(), &output)); + return output; +} + +inline int32_t TensorBatch::rank() const +{ + int32_t output; + detail::CheckThrow(nvcvTensorBatchGetRank(handle(), &output)); + return output; +} + +inline int32_t TensorBatch::numTensors() const +{ + int32_t output; + detail::CheckThrow(nvcvTensorBatchGetNumTensors(handle(), &output)); + return output; +} + +inline DataType TensorBatch::dtype() const +{ + NVCVDataType dataType = {}; + detail::CheckThrow(nvcvTensorBatchGetDType(handle(), &dataType)); + return DataType(dataType); +} + +inline TensorLayout TensorBatch::layout() const +{ + NVCVTensorLayout tensorLayout; + detail::CheckThrow(nvcvTensorBatchGetLayout(handle(), &tensorLayout)); + return TensorLayout(tensorLayout); +} + +inline NVCVTensorBufferType TensorBatch::type() const +{ + NVCVTensorBufferType bufferType; + detail::CheckThrow(nvcvTensorBatchGetType(handle(), &bufferType)); + return bufferType; +} + +inline Allocator TensorBatch::alloc() const +{ + NVCVAllocatorHandle halloc; + detail::CheckThrow(nvcvTensorBatchGetAllocator(handle(), &halloc)); + return Allocator(std::move(halloc)); +} + +template +inline void TensorBatch::pushBack(It begin, It end) +{ + std::vector handles; + handles.reserve(capacity() - numTensors()); + for (auto it = begin; it != end; ++it) + { + handles.push_back(it->handle()); + } + detail::CheckThrow(nvcvTensorBatchPushTensors(handle(), handles.data(), handles.size())); +} + +inline void TensorBatch::pushBack(const Tensor &tensor) +{ + auto hTensor = tensor.handle(); + detail::CheckThrow(nvcvTensorBatchPushTensors(handle(), &hTensor, 1)); +} + +inline void TensorBatch::popTensors(int32_t numTensors) +{ + detail::CheckThrow(nvcvTensorBatchPopTensors(handle(), numTensors)); +} + +inline void TensorBatch::popTensor() +{ + detail::CheckThrow(nvcvTensorBatchPopTensors(handle(), 1)); +} + +inline TensorBatchData TensorBatch::exportData(CUstream stream) +{ + NVCVTensorBatchData output = {}; + detail::CheckThrow(nvcvTensorBatchExportData(handle(), stream, &output)); + return TensorBatchData(output); +} + +inline void TensorBatch::clear() +{ + detail::CheckThrow(nvcvTensorBatchClear(handle())); +} + +inline void TensorBatch::setUserPointer(void *ptr) +{ + detail::CheckThrow(nvcvTensorBatchSetUserPointer(handle(), ptr)); +} + +inline void *TensorBatch::getUserPointer() const +{ + void *outPtr = nullptr; + detail::CheckThrow(nvcvTensorBatchGetUserPointer(handle(), &outPtr)); + return outPtr; +} + +inline Tensor TensorBatch::operator[](int32_t idx) const +{ + NVCVTensorHandle hTensor = nullptr; + 
detail::CheckThrow(nvcvTensorBatchGetTensors(handle(), idx, &hTensor, 1)); + return Tensor(std::move(hTensor)); +} + +inline void TensorBatch::setTensor(int32_t idx, const Tensor &tensor) +{ + auto hTensor = tensor.handle(); + detail::CheckThrow(nvcvTensorBatchSetTensors(handle(), idx, &hTensor, 1)); +} + +inline TensorBatch::Iterator TensorBatch::begin() const +{ + return Iterator(this, 0); +} + +inline TensorBatch::Iterator TensorBatch::end() const +{ + return Iterator(this, numTensors()); +} + +// TensorBatch::Iterator + +inline TensorBatch::Iterator::reference TensorBatch::Iterator::operator*() const +{ + return m_currentTensor; +} + +inline TensorBatch::Iterator::pointer TensorBatch::Iterator::operator->() const +{ + return &m_currentTensor; +} + +inline TensorBatch::Iterator TensorBatch::Iterator::operator++(int) +{ + Iterator output(*this); + ++(*this); + return output; +} + +inline TensorBatch::Iterator &TensorBatch::Iterator::operator++() +{ + ++m_idx; + UpdateCurrentTensor(); + return *this; +} + +inline TensorBatch::Iterator TensorBatch::Iterator::operator--(int) +{ + Iterator output(*this); + --(*this); + return output; +} + +inline TensorBatch::Iterator &TensorBatch::Iterator::operator--() +{ + --m_idx; + UpdateCurrentTensor(); + return *this; +} + +inline TensorBatch::Iterator TensorBatch::Iterator::operator+(difference_type diff) const +{ + return Iterator(m_tensorBatch, m_idx + diff); +} + +inline TensorBatch::Iterator TensorBatch::Iterator::operator-(difference_type diff) const +{ + return Iterator(m_tensorBatch, m_idx - diff); +} + +inline void TensorBatch::Iterator::UpdateCurrentTensor() +{ + if (m_idx < m_tensorBatch->numTensors() && m_idx >= 0) + { + m_currentTensor = (*m_tensorBatch)[m_idx]; + } +} + +inline TensorBatch::Iterator::difference_type TensorBatch::Iterator::operator-(const Iterator &rhs) const +{ + return m_idx - rhs.m_idx; +} + +inline bool TensorBatch::Iterator::operator==(const Iterator &rhs) const +{ + return m_tensorBatch == rhs.m_tensorBatch && m_idx == rhs.m_idx; +} + +inline bool TensorBatch::Iterator::operator!=(const Iterator &rhs) const +{ + return !(rhs == *this); +} + +inline bool TensorBatch::Iterator::operator<(const Iterator &rhs) const +{ + return std::make_pair(m_tensorBatch, m_idx) < std::make_pair(rhs.m_tensorBatch, rhs.m_idx); +} + +inline bool TensorBatch::Iterator::operator>(const Iterator &rhs) const +{ + return std::make_pair(m_tensorBatch, m_idx) > std::make_pair(rhs.m_tensorBatch, rhs.m_idx); +} + +inline bool TensorBatch::Iterator::operator<=(const Iterator &rhs) const +{ + return !(rhs < *this); +} + +inline bool TensorBatch::Iterator::operator>=(const Iterator &rhs) const +{ + return !(rhs > *this); +} + +} // namespace nvcv + +#endif // NVCV_TENSORBATCH_IMPL_HPP diff --git a/src/nvcv_types/include/nvcv/detail/TensorImpl.hpp b/src/nvcv_types/include/nvcv/detail/TensorImpl.hpp index 2e7b7cc1..f4ceab2a 100644 --- a/src/nvcv_types/include/nvcv/detail/TensorImpl.hpp +++ b/src/nvcv_types/include/nvcv/detail/TensorImpl.hpp @@ -91,6 +91,15 @@ inline void *Tensor::userPointer() const return ptr; } +inline Tensor Tensor::reshape(const TensorShape &new_shape) +{ + NVCVTensorHandle out_handle; + detail::CheckThrow( + nvcvTensorReshape(this->handle(), new_shape.rank(), &new_shape.shape()[0], new_shape.layout(), &out_handle)); + Tensor out_tensor(std::move(out_handle)); + return out_tensor; +} + inline auto Tensor::CalcRequirements(const TensorShape &shape, DataType dtype, const MemAlignment &bufAlign) -> Requirements { diff --git 
a/src/nvcv_types/priv/Array.cpp b/src/nvcv_types/priv/Array.cpp index ea0ab0cc..2eb02a11 100644 --- a/src/nvcv_types/priv/Array.cpp +++ b/src/nvcv_types/priv/Array.cpp @@ -157,6 +157,7 @@ Array::Array(NVCVArrayRequirements reqs, IAllocator &alloc, NVCVResourceType tar NVCV_ASSERT(m_memBuffer != nullptr); this->exportData(m_data); + m_data.length = 0; } Array::~Array() @@ -223,4 +224,12 @@ void Array::exportData(NVCVArrayData &data) const } } +void Array::resize(int64_t length) +{ + if (length <= this->capacity()) + { + m_data.length = length; + } +} + } // namespace nvcv::priv diff --git a/src/nvcv_types/priv/Array.hpp b/src/nvcv_types/priv/Array.hpp index 3d522fce..19c76abd 100644 --- a/src/nvcv_types/priv/Array.hpp +++ b/src/nvcv_types/priv/Array.hpp @@ -47,6 +47,8 @@ class Array final : public CoreObjectBase void exportData(NVCVArrayData &data) const override; + void resize(int64_t length) override; + private: SharedCoreObj m_alloc; NVCVArrayRequirements m_reqs; diff --git a/src/nvcv_types/priv/ArrayWrapData.cpp b/src/nvcv_types/priv/ArrayWrapData.cpp index 20bdc431..23b8d1c7 100644 --- a/src/nvcv_types/priv/ArrayWrapData.cpp +++ b/src/nvcv_types/priv/ArrayWrapData.cpp @@ -119,4 +119,12 @@ void ArrayWrapData::exportData(NVCVArrayData &data) const data = m_data; } +void ArrayWrapData::resize(int64_t length) +{ + if (length <= this->capacity()) + { + m_data.length = length; + } +} + } // namespace nvcv::priv diff --git a/src/nvcv_types/priv/ArrayWrapData.hpp b/src/nvcv_types/priv/ArrayWrapData.hpp index eee9b303..2b15565f 100644 --- a/src/nvcv_types/priv/ArrayWrapData.hpp +++ b/src/nvcv_types/priv/ArrayWrapData.hpp @@ -42,6 +42,8 @@ class ArrayWrapData final : public CoreObjectBase void exportData(NVCVArrayData &data) const override; + void resize(int64_t length) override; + private: NVCVArrayData m_data; NVCVResourceType m_target; diff --git a/src/nvcv_types/priv/CMakeLists.txt b/src/nvcv_types/priv/CMakeLists.txt index 0617c8f9..049f173e 100644 --- a/src/nvcv_types/priv/CMakeLists.txt +++ b/src/nvcv_types/priv/CMakeLists.txt @@ -36,6 +36,7 @@ add_library(nvcv_types_priv STATIC ImageFormat.cpp Array.cpp ArrayWrapData.cpp + TensorBatch.cpp ) target_include_directories(nvcv_types_priv diff --git a/src/nvcv_types/priv/Context.cpp b/src/nvcv_types/priv/Context.cpp index b3bc3a2d..c1505716 100644 --- a/src/nvcv_types/priv/Context.cpp +++ b/src/nvcv_types/priv/Context.cpp @@ -34,8 +34,10 @@ Context::Context() , m_imageManager("Image") , m_imageBatchManager("ImageBatch") , m_tensorManager("Tensor") + , m_tensorBatchManager("TensorBatch") , m_arrayManager("Array") - , m_managerList{m_allocatorManager, m_imageManager, m_imageBatchManager, m_tensorManager, m_arrayManager} + , m_managerList{m_allocatorManager, m_imageManager, m_imageBatchManager, + m_tensorManager, m_tensorBatchManager, m_arrayManager} { } @@ -59,5 +61,6 @@ template class HandleManager; template class HandleManager; template class HandleManager; template class HandleManager; +template class HandleManager; } // namespace nvcv::priv diff --git a/src/nvcv_types/priv/Context.hpp b/src/nvcv_types/priv/Context.hpp index 651e565e..b963a0ef 100644 --- a/src/nvcv_types/priv/Context.hpp +++ b/src/nvcv_types/priv/Context.hpp @@ -24,6 +24,7 @@ #include "IContext.hpp" #include "ImageBatchManager.hpp" #include "ImageManager.hpp" +#include "TensorBatchManager.hpp" #include "TensorManager.hpp" namespace nvcv::priv { @@ -39,12 +40,13 @@ class Context final : public IContext private: // Order is important due to inter-dependencies - DefaultAllocator 
m_allocDefault; - AllocatorManager m_allocatorManager; - ImageManager m_imageManager; - ImageBatchManager m_imageBatchManager; - TensorManager m_tensorManager; - ArrayManager m_arrayManager; + DefaultAllocator m_allocDefault; + AllocatorManager m_allocatorManager; + ImageManager m_imageManager; + ImageBatchManager m_imageBatchManager; + TensorManager m_tensorManager; + TensorBatchManager m_tensorBatchManager; + ArrayManager m_arrayManager; Managers m_managerList; }; diff --git a/src/nvcv_types/priv/HandleTraits.hpp b/src/nvcv_types/priv/HandleTraits.hpp index 246ff3a2..cba0036a 100644 --- a/src/nvcv_types/priv/HandleTraits.hpp +++ b/src/nvcv_types/priv/HandleTraits.hpp @@ -56,6 +56,12 @@ struct HandleTraits constexpr static bool hasManager = true; }; +template<> +struct HandleTraits +{ + constexpr static bool hasManager = true; +}; + template constexpr bool HasObjManager = HandleTraits::hasManager; diff --git a/src/nvcv_types/priv/IArray.hpp b/src/nvcv_types/priv/IArray.hpp index caab7e95..0b9127bd 100644 --- a/src/nvcv_types/priv/IArray.hpp +++ b/src/nvcv_types/priv/IArray.hpp @@ -42,6 +42,8 @@ class IArray : public ICoreObjectHandle virtual NVCVResourceType target() const = 0; virtual void exportData(NVCVArrayData &data) const = 0; + + virtual void resize(int64_t length) = 0; }; template<> diff --git a/src/nvcv_types/priv/IContext.hpp b/src/nvcv_types/priv/IContext.hpp index e1a14ebe..309d77a6 100644 --- a/src/nvcv_types/priv/IContext.hpp +++ b/src/nvcv_types/priv/IContext.hpp @@ -28,19 +28,20 @@ namespace nvcv::priv { template class CoreObjManager; -using ImageManager = CoreObjManager; -using ImageBatchManager = CoreObjManager; -using TensorManager = CoreObjManager; -using ArrayManager = CoreObjManager; -using AllocatorManager = CoreObjManager; +using ImageManager = CoreObjManager; +using ImageBatchManager = CoreObjManager; +using TensorManager = CoreObjManager; +using TensorBatchManager = CoreObjManager; +using ArrayManager = CoreObjManager; +using AllocatorManager = CoreObjManager; class IAllocator; class IContext { public: - using Managers - = std::tuple; + using Managers = std::tuple; template CoreObjManager &manager() diff --git a/src/nvcv_types/priv/ITensorBatch.hpp b/src/nvcv_types/priv/ITensorBatch.hpp new file mode 100644 index 00000000..14be328b --- /dev/null +++ b/src/nvcv_types/priv/ITensorBatch.hpp @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_CORE_PRIV_ITENSORBATCH_HPP +#define NVCV_CORE_PRIV_ITENSORBATCH_HPP + +#include "ICoreObject.hpp" +#include "SharedCoreObj.hpp" + +#include + +namespace nvcv::priv { + +class IAllocator; + +class ITensorBatch : public ICoreObjectHandle +{ +public: + virtual int32_t capacity() const = 0; + virtual int32_t rank() const = 0; + virtual NVCVDataType dtype() const = 0; + virtual int32_t numTensors() const = 0; + virtual NVCVTensorLayout layout() const = 0; + virtual NVCVTensorBufferType type() const = 0; + + virtual SharedCoreObj alloc() const = 0; + + virtual void clear() = 0; + + virtual void pushTensors(const NVCVTensorHandle *tensors, int32_t numTensors) = 0; + + virtual void popTensors(int32_t numTensors) = 0; + + virtual void getTensors(int32_t index, NVCVTensorHandle *tensors, int32_t numTensors) const = 0; + + virtual void setTensors(int32_t index, const NVCVTensorHandle *tensors, int32_t numTensors) = 0; + + virtual void exportData(CUstream stream, NVCVTensorBatchData &data) = 0; +}; + +template<> +class CoreObjManager : public HandleManager +{ + using Base = HandleManager; + +public: + using Base::Base; +}; + +} // namespace nvcv::priv + +#endif // NVCV_CORE_PRIV_TENSORBATCH_HPP diff --git a/src/nvcv_types/priv/TensorBatch.cpp b/src/nvcv_types/priv/TensorBatch.cpp new file mode 100644 index 00000000..356bdbe6 --- /dev/null +++ b/src/nvcv_types/priv/TensorBatch.cpp @@ -0,0 +1,339 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TensorBatch.hpp" + +#include "Requirements.hpp" +#include "TensorBatchManager.hpp" + +#include +#include + +namespace nvcv::priv { + +TensorBatch::TensorBatch(const NVCVTensorBatchRequirements &reqs, IAllocator &alloc) + : m_alloc(alloc) + , m_reqs(reqs) + , m_dirtyBegin(0) + , m_dirtyEnd(0) + , m_dtype(NVCV_DATA_TYPE_NONE) + , m_layout(NVCV_TENSOR_LAYOUT_MAKE("")) + , m_rank(-1) + , m_userPointer(nullptr) +{ + m_evPostFence = nullptr; + m_devTensorsBuffer = nullptr; + m_pinnedTensorsBuffer = nullptr; + m_Tensors = nullptr; + + int64_t bufferSize = m_reqs.capacity * sizeof(BatchElement); + + try + { + m_devTensorsBuffer = static_cast(m_alloc->allocCudaMem(bufferSize, m_reqs.alignBytes)); + NVCV_ASSERT(m_devTensorsBuffer != nullptr); + + m_pinnedTensorsBuffer = static_cast(m_alloc->allocHostPinnedMem(bufferSize, m_reqs.alignBytes)); + NVCV_ASSERT(m_pinnedTensorsBuffer != nullptr); + + m_Tensors = static_cast(m_alloc->allocHostMem(bufferSize, m_reqs.alignBytes)); + NVCV_ASSERT(m_Tensors != nullptr); + + NVCV_CHECK_THROW(cudaEventCreateWithFlags(&m_evPostFence, cudaEventDisableTiming)); + } + catch (...) 
+ { + cleanUp(); + throw; + } +} + +NVCVTensorBatchRequirements TensorBatch::CalcRequirements(int32_t capacity) +{ + NVCVTensorBatchRequirements reqs; + reqs.capacity = capacity; + reqs.mem = {}; + + reqs.alignBytes = alignof(BatchElement); + reqs.alignBytes = util::RoundUpNextPowerOfTwo(reqs.alignBytes); + + if (reqs.alignBytes > NVCV_MAX_MEM_REQUIREMENTS_BLOCK_SIZE) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Alignment requirement of %d is larger than the maximum allowed %ld", reqs.alignBytes, + NVCV_MAX_MEM_REQUIREMENTS_BLOCK_SIZE); + } + + AddBuffer(reqs.mem.cudaMem, capacity * sizeof(BatchElement), reqs.alignBytes); + AddBuffer(reqs.mem.hostPinnedMem, capacity * sizeof(BatchElement), reqs.alignBytes); + AddBuffer(reqs.mem.hostMem, capacity * sizeof(BatchElement), reqs.alignBytes); + + return reqs; +} + +TensorBatch::~TensorBatch() +{ + cleanUp(); +} + +void TensorBatch::cleanUp() +{ + if (m_evPostFence) + { + NVCV_CHECK_LOG(cudaEventDestroy(m_evPostFence)); + } + + for (int i = 0; i < m_numTensors; ++i) + { + CoreObjectDecRef(m_Tensors[i]); + } + + int64_t bufferSize = m_reqs.capacity * sizeof(BatchElement); + + m_alloc->freeCudaMem(m_devTensorsBuffer, bufferSize, m_reqs.alignBytes); + m_alloc->freeHostPinnedMem(m_pinnedTensorsBuffer, bufferSize, m_reqs.alignBytes); + m_alloc->freeHostMem(m_Tensors, bufferSize, m_reqs.alignBytes); +} + +void TensorBatch::exportData(CUstream stream, NVCVTensorBatchData &data) +{ + if (m_dirtyBegin < m_dirtyEnd) + { + // Block until the previous call to exportData finishes the buffer copy. + NVCV_CHECK_THROW(cudaEventSynchronize(m_evPostFence)); + + for (auto i = m_dirtyBegin; i < m_dirtyEnd; ++i) + { + auto &t = ToStaticRef(m_Tensors[i]); + NVCVTensorData tdata; + t.exportData(tdata); + auto &element = m_pinnedTensorsBuffer[i]; + element.data = tdata.buffer.strided.basePtr; + for (int d = 0; d < tdata.rank; ++d) + { + element.shape[d] = tdata.shape[d]; + element.stride[d] = tdata.buffer.strided.strides[d]; + } + } + + int64_t copySize = (m_dirtyEnd - m_dirtyBegin) * sizeof(BatchElement); + NVCV_CHECK_THROW(cudaMemcpyAsync(m_devTensorsBuffer + m_dirtyBegin, m_pinnedTensorsBuffer + m_dirtyBegin, + copySize, cudaMemcpyHostToDevice, stream)); + + // Signal the buffer copy is finished. 
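+        // Recording the event after the copy lets the next exportData() call, which does
+        // cudaEventSynchronize(m_evPostFence) before touching the pinned staging buffer,
+        // know when that buffer can be safely rewritten.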
+ NVCV_CHECK_THROW(cudaEventRecord(m_evPostFence, stream)); + m_dirtyBegin = m_dirtyEnd; + } + NVCVTensorBatchBuffer buffer; + buffer.strided = NVCVTensorBatchBufferStrided{m_devTensorsBuffer}; + data.buffer = buffer; + data.type = NVCV_TENSOR_BUFFER_STRIDED_CUDA; + data.rank = m_rank; + data.dtype = m_dtype; + data.layout = m_layout; + data.numTensors = m_numTensors; +} + +void TensorBatch::validateTensors(const NVCVTensorHandle *tensors, int32_t numTensors) +{ + for (int32_t i = 0; i < numTensors; ++i) + { + auto &t = ToStaticRef(tensors[i]); + if (m_rank != -1 && t.rank() != m_rank) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Trying to add a tensor to a tensor batch with an inconsistent rank."); + } + if (t.dtype().value() != m_dtype) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Trying to add a tensor to a tensor batch with an inconsistent type."); + } + if (nvcvTensorLayoutCompare(t.layout(), m_layout) != 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT, + "Trying to add a tensor to a tensor batch with an inconsistent layout."); + } + } +} + +void TensorBatch::setLayoutAndDType(const NVCVTensorHandle *tensors, int32_t numTensors) +{ + if (numTensors > 0 && m_numTensors == 0) + { + auto &t = ToStaticRef(tensors[0]); + m_rank = t.rank(); + m_dtype = t.dtype().value(); + m_layout = t.layout(); + } +} + +void TensorBatch::pushTensors(const NVCVTensorHandle *tensors, int32_t numTensors) +{ + if (numTensors == 0) + { + return; + } + if (numTensors < 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) << "Number of tensors cannot be nagative: " << numTensors; + } + if (m_numTensors + numTensors > capacity()) + { + throw Exception(NVCV_ERROR_OVERFLOW) + << "Adding " << numTensors << " tensors to a tensor batch would exceed its capacity (" << capacity() + << ") by " << m_numTensors + numTensors - capacity(); + } + setLayoutAndDType(tensors, numTensors); + validateTensors(tensors, numTensors); + for (int32_t i = 0; i < numTensors; ++i) + { + CoreObjectIncRef(tensors[i]); + m_Tensors[m_numTensors + i] = tensors[i]; + } + if (m_dirtyEnd == m_dirtyBegin) + { + m_dirtyBegin = m_numTensors; + } + m_numTensors += numTensors; + m_dirtyEnd = m_numTensors; +} + +void TensorBatch::popTensors(int32_t numTensors) +{ + if (numTensors < 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) << "Trying to pop a negative number of tensors: " << numTensors; + } + if (numTensors > m_numTensors) + { + throw Exception(NVCV_ERROR_UNDERFLOW) + << "Trying to pop " << numTensors << " tensors from a tensor batch with " << m_numTensors << " tensors."; + } + for (int i = m_numTensors - numTensors; i < m_numTensors; ++i) + { + CoreObjectDecRef(m_Tensors[i]); + } + m_numTensors -= numTensors; + m_dirtyEnd = std::min(m_dirtyEnd, m_numTensors); + m_dirtyBegin = std::min(m_dirtyBegin, m_dirtyEnd); + if (m_numTensors == 0) + { + m_dtype = NVCV_DATA_TYPE_NONE; + m_layout = NVCV_TENSOR_LAYOUT_MAKE(""); + m_rank = -1; + } +} + +void TensorBatch::getTensors(int32_t index, NVCVTensorHandle *tensors, int32_t numTensors) const +{ + if (index + numTensors > m_numTensors) + { + throw Exception(NVCV_ERROR_OVERFLOW) << "Trying to get a tensor on index " << index + numTensors + << " while the tensor batch contains only " << m_numTensors << " tensors."; + } + if (index < 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) << "Trying to get a tensor with negative index: " << index; + } + std::copy(m_Tensors + index, m_Tensors + index + numTensors, tensors); + for (int i = 0; i < numTensors; ++i) + { + 
CoreObjectIncRef(tensors[i]); + } +} + +void TensorBatch::setTensors(int32_t index, const NVCVTensorHandle *tensors, int32_t numTensors) +{ + if (index + numTensors > m_numTensors) + { + throw Exception(NVCV_ERROR_OVERFLOW) << "Trying to set a tensor on index " << index + numTensors + << " while the tensor batch contains only " << m_numTensors << " tensors."; + } + if (index < 0) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) << "Trying to set a tensor with negative index: " << index; + } + validateTensors(tensors, numTensors); + for (int32_t idx = 0; idx < numTensors; ++idx) + { + CoreObjectDecRef(m_Tensors[idx + index]); + CoreObjectIncRef(tensors[idx]); + m_Tensors[idx + index] = tensors[idx]; + } + if (m_dirtyBegin != m_dirtyEnd) + { + m_dirtyBegin = std::min(m_dirtyBegin, index); + m_dirtyEnd = std::max(m_dirtyEnd, index + numTensors); + } + else + { + m_dirtyBegin = index; + m_dirtyEnd = m_dirtyBegin + numTensors; + } +} + +SharedCoreObj TensorBatch::alloc() const +{ + return m_alloc; +} + +int32_t TensorBatch::capacity() const +{ + return m_reqs.capacity; +} + +int32_t TensorBatch::rank() const +{ + return m_rank; +} + +int32_t TensorBatch::numTensors() const +{ + return m_numTensors; +} + +NVCVDataType TensorBatch::dtype() const +{ + return m_dtype; +} + +NVCVTensorLayout TensorBatch::layout() const +{ + return m_layout; +} + +NVCVTensorBufferType TensorBatch::type() const +{ + return BUFFER_TYPE; +} + +void TensorBatch::clear() +{ + for (int i = 0; i < m_numTensors; ++i) + { + CoreObjectDecRef(m_Tensors[i]); + } + m_numTensors = 0; + m_dirtyBegin = 0; + m_dirtyEnd = 0; + m_dtype = NVCV_DATA_TYPE_NONE; + m_layout = NVCV_TENSOR_LAYOUT_MAKE(""); + m_rank = -1; +} + +} // namespace nvcv::priv diff --git a/src/nvcv_types/priv/TensorBatch.hpp b/src/nvcv_types/priv/TensorBatch.hpp new file mode 100644 index 00000000..16a41f53 --- /dev/null +++ b/src/nvcv_types/priv/TensorBatch.hpp @@ -0,0 +1,107 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_CORE_PRIV_TENSORBATCH_HPP +#define NVCV_CORE_PRIV_TENSORBATCH_HPP + +#include "DataType.hpp" +#include "IAllocator.hpp" +#include "ITensorBatch.hpp" +#include "SharedCoreObj.hpp" +#include "Tensor.hpp" + +#include + +namespace nvcv::priv { + +class TensorBatch final : public CoreObjectBase +{ +public: + using BatchElement = NVCVTensorBatchElementStrided; + static const NVCVTensorBufferType BUFFER_TYPE = NVCV_TENSOR_BUFFER_STRIDED_CUDA; + + static NVCVTensorBatchRequirements CalcRequirements(int32_t capacity); + + TensorBatch(const NVCVTensorBatchRequirements &reqs, IAllocator &alloc); + + ~TensorBatch(); + + SharedCoreObj alloc() const override; + + int32_t capacity() const override; + + NVCVDataType dtype() const override; + + NVCVTensorLayout layout() const override; + + int32_t numTensors() const override; + + NVCVTensorBufferType type() const override; + + int32_t rank() const override; + + void clear() override; + + void exportData(CUstream stream, NVCVTensorBatchData &data) override; + + void pushTensors(const NVCVTensorHandle *tensors, int32_t numTensors) override; + + void popTensors(int32_t numTensors) override; + + void getTensors(int32_t index, NVCVTensorHandle *tensors, int32_t numTensors) const override; + + void setTensors(int32_t index, const NVCVTensorHandle *tensors, int32_t numTensors) override; + +private: + SharedCoreObj m_alloc; + NVCVTensorBatchRequirements m_reqs; + + // Dirty begin and end describe a range containing all the tensors that have been modified + // since the previous exportData call and thus should be updated in the exported buffer. + int32_t m_dirtyBegin; + int32_t m_dirtyEnd; + + int32_t m_numTensors = 0; + + NVCVTensorHandle *m_Tensors; // host buffer for tensor handles + // Pinned buffer for the tensor data descriptors + // It's updated every time the user updates the tensor batch. + // Changes are tracked with the m_dirty flags. + NVCVTensorBatchElementStrided *m_pinnedTensorsBuffer; + // Device buffer for the tensor data descriptors. + // It's updated and returned when the exportData method is called. + NVCVTensorBatchElementStrided *m_devTensorsBuffer; + + NVCVDataType m_dtype; + NVCVTensorLayout m_layout; + int32_t m_rank; + + // TODO: must be retrieved from the resource allocator; + cudaEvent_t m_evPostFence; + + void *m_userPointer; + + void cleanUp(); + + void validateTensors(const NVCVTensorHandle *tensors, int32_t numTensors); + + void setLayoutAndDType(const NVCVTensorHandle *tensors, int32_t numTensors); +}; + +} // namespace nvcv::priv + +#endif // NVCV_CORE_PRIV_TENSORBATCH_HPP diff --git a/src/nvcv_types/priv/TensorBatchManager.hpp b/src/nvcv_types/priv/TensorBatchManager.hpp new file mode 100644 index 00000000..d9082a31 --- /dev/null +++ b/src/nvcv_types/priv/TensorBatchManager.hpp @@ -0,0 +1,36 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_PRIV_CORE_TENSORBATCHMANAGER_HPP +#define NVCV_PRIV_CORE_TENSORBATCHMANAGER_HPP + +#include "IContext.hpp" +#include "TensorBatch.hpp" + +namespace nvcv::priv { + +using TensorBatchManager = CoreObjManager; + +template<> +struct ResourceStorage +{ + using type = CompatibleStorage; +}; + +} // namespace nvcv::priv + +#endif // NVCV_PRIV_CORE_TENSORBATCHMANAGER_HPP diff --git a/src/nvcv_types/priv/TensorData.cpp b/src/nvcv_types/priv/TensorData.cpp index 142ad714..68edc8ce 100644 --- a/src/nvcv_types/priv/TensorData.cpp +++ b/src/nvcv_types/priv/TensorData.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,6 +23,8 @@ #include +#include + namespace nvcv::priv { NVCVTensorLayout GetTensorLayoutFor(ImageFormat fmt, int nbatches) @@ -186,4 +188,164 @@ void FillTensorData(IImage &img, NVCVTensorData &tensorData) tensorStrided.basePtr = imgStrided.planes[0].basePtr; } +/** + * @brief Simplifies a shape by collapsing dimensions that are not strided + * + * @param[in] rank number of dimensions + * @param[in] shape + * @param[in] stride + * @param[out] out_shape + * @param[out] out_strides + * @return int out_rank + */ +static int Simplify(int rank, int64_t *shape, int64_t *stride, int64_t *out_shape, int64_t *out_strides) +{ + if (rank <= 1) // Nothing to simplify + { + if (rank == 1) + { + out_shape[0] = shape[0]; + out_strides[0] = stride[0]; + } + return rank; + } + + int out_rank = 0; + int64_t vol = shape[0]; + for (int d = 1; d < rank; d++) + { + if (stride[d - 1] != shape[d] * stride[d]) + { + out_strides[out_rank] = stride[d - 1]; + out_shape[out_rank] = vol; + vol = shape[d]; + out_rank++; + } + else + { + vol *= shape[d]; + } + } + out_strides[out_rank] = stride[rank - 1]; + out_shape[out_rank] = vol; + out_rank++; + return out_rank; +} + +/** + * @brief Reshapes a simplified shape (non-strided dimensions are collapsed) to a target shape if possible. + * Calculates the output strides. 
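+ *
+ * Worked example: an input simplified to shape (307200, 3) with strides (4, 1), e.g. a
+ * (480, 640, 3) tensor with strides (2560, 4, 1) collapsed by Simplify, can be reshaped
+ * to the target shape (480, 640, 3); the strides computed for the target are (2560, 4, 1).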
+ * + * @param[in] in_rank + * @param[in] in_shape + * @param[in] in_strides + * @param[in] target_rank + * @param[in] target_shape + * @param[out] out_strides + * + * @return true if reshape is possible, false otherwise + */ +static bool ReshapeSimplified(int in_rank, const int64_t *in_shape, const int64_t *in_strides, int target_rank, + const int64_t *target_shape, int64_t *out_strides) +{ + int i = 0, j = 0; + for (; i < in_rank && j < target_rank; i++) + { + int64_t in_e = in_shape[i]; + int64_t out_v = 1; + int group_start = j; + while (j < target_rank && (out_v * target_shape[j]) <= in_e) out_v *= target_shape[j++]; + + if (out_v != in_e) + return false; // reshape is not possible + + int64_t s = in_strides[i]; + for (int d = j - 1; d >= group_start; d--) + { + out_strides[d] = s; + s *= target_shape[d]; + } + } + return true; +} + +static std::string ShapeStr(int rank, const int64_t *sh) +{ + std::stringstream ss; + ss << "("; + for (int d = 0; d < rank; d++) + { + if (d > 0) + ss << ", "; + ss << sh[d]; + } + ss << ")"; + return ss.str(); +} + +void ReshapeTensorData(NVCVTensorData &tensor_data, int new_rank, const int64_t *new_shape, NVCVTensorLayout new_layout) +{ + int64_t old_volume = 1; + for (int d = 0; d < tensor_data.rank; d++) old_volume *= tensor_data.shape[d]; + + // TODO: Add 0D tensor support, once it's supported accross the board + if (new_rank < 1 || new_rank > NVCV_TENSOR_MAX_RANK) + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) + << "Number of dimensions must be between 1 and " << NVCV_TENSOR_MAX_RANK << ", not " << new_rank; + + int64_t new_volume = 1; + for (int d = 0; d < new_rank; d++) new_volume *= new_shape[d]; + + if (new_volume != old_volume) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) + << "The volume (" << new_volume << ") of the provided shape " << ShapeStr(new_rank, new_shape) + << " does not match the size of the array (" << old_volume << ")"; + } + + // layout ------------ + if (new_layout.rank > 0) + { + if (new_layout.rank != new_rank) + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) + << "The number of dimensions of the provided layout and shape do not match. Got a " + "shape with " + << new_rank << " dimensions and a layout with " << new_layout.rank << " dimensions"; + } + tensor_data.layout = new_layout; + + // Check strides ------------ + + // right now is the only option supported + assert(tensor_data.bufferType == NVCV_TENSOR_BUFFER_STRIDED_CUDA); + + // Collapses non-strided dimensions into groups + // Example 1: + // A tensor with shape (480, 640, 3) and strides (2560, 4, 1) + // will be collapsed into (307200, 3) with strides (4, 1). + // Example 2: + // A tensor with shape (480, 640, 3) and strides (2560, 3, 1) + // will be collapsed into (921600,) with strides (1,). + int64_t simplified_shape[NVCV_TENSOR_MAX_RANK]; + int64_t simplified_strides[NVCV_TENSOR_MAX_RANK]; + int simplified_rank = Simplify(tensor_data.rank, tensor_data.shape, tensor_data.buffer.strided.strides, + simplified_shape, simplified_strides); + + // Calculate output strides (if reshape is possible) or throw an error + bool ret = ReshapeSimplified(simplified_rank, simplified_shape, simplified_strides, new_rank, new_shape, + tensor_data.buffer.strided.strides); + if (!ret) + { + throw Exception(NVCV_ERROR_INVALID_ARGUMENT) + << "Cannot reshape" + << ". 
Original shape: " << ShapeStr(tensor_data.rank, tensor_data.shape) + << ", Strides: " << ShapeStr(tensor_data.rank, tensor_data.buffer.strided.strides) + << ", Target shape: " << ShapeStr(new_rank, new_shape); + } + + // Set the new shape to the tensor data + tensor_data.rank = new_rank; + for (int d = 0; d < tensor_data.rank; d++) tensor_data.shape[d] = new_shape[d]; +} + } // namespace nvcv::priv diff --git a/src/nvcv_types/priv/TensorData.hpp b/src/nvcv_types/priv/TensorData.hpp index c131c2e8..6ad85915 100644 --- a/src/nvcv_types/priv/TensorData.hpp +++ b/src/nvcv_types/priv/TensorData.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,6 +30,9 @@ NVCVTensorLayout GetTensorLayoutFor(ImageFormat fmt, int nbatches); void FillTensorData(IImage &img, NVCVTensorData &data); +void ReshapeTensorData(NVCVTensorData &tensor_data, int new_rank, const int64_t *new_shape, + NVCVTensorLayout new_layout); + } // namespace nvcv::priv #endif // NVCV_CORE_PRIV_TENSORDATA_HPP diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index ffe90225..a889a14a 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -184,6 +184,9 @@ add_library(nvcv_util String.cpp Version.cpp TensorDataUtils.cpp + Event.cpp + Stream.cpp + StreamId.cpp ) target_include_directories(nvcv_util diff --git a/src/util/Event.cpp b/src/util/Event.cpp new file mode 100644 index 00000000..65ca3123 --- /dev/null +++ b/src/util/Event.cpp @@ -0,0 +1,56 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Event.hpp" + +#include "CheckError.hpp" + +#include + +namespace nvcv::util { + +CudaEvent CudaEvent::Create(int deviceId) +{ + return CreateWithFlags(cudaEventDisableTiming, deviceId); +} + +CudaEvent CudaEvent::CreateWithFlags(unsigned flags, int deviceId) +{ + cudaEvent_t event; + int prevDev = -1; + if (deviceId >= 0) + { + NVCV_CHECK_THROW(cudaGetDevice(&prevDev)); + NVCV_CHECK_THROW(cudaSetDevice(deviceId)); + } + auto err = cudaEventCreateWithFlags(&event, flags); + if (prevDev >= 0) + NVCV_CHECK_THROW(cudaSetDevice(prevDev)); + NVCV_CHECK_THROW(err); + return CudaEvent(event); +} + +void CudaEvent::DestroyHandle(cudaEvent_t event) +{ + auto err = cudaEventDestroy(event); + if (err != cudaSuccess && err != cudaErrorCudartUnloading) + { + NVCV_CHECK_THROW(err); + } +} + +} // namespace nvcv::util diff --git a/src/util/Event.hpp b/src/util/Event.hpp new file mode 100644 index 00000000..64e8adc5 --- /dev/null +++ b/src/util/Event.hpp @@ -0,0 +1,59 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef NVCV_UTIL_CUDA_EVENT_H_
+#define NVCV_UTIL_CUDA_EVENT_H_
+
+#include "UniqueHandle.hpp"
+
+#include
+
+#include
+
+namespace nvcv::util {
+
+/**
+ * @brief A wrapper class for a CUDA event handle (cudaEvent_t).
+ *
+ * The purpose of this class is to provide safe ownership and lifecycle management
+ * for CUDA event handles.
+ * The event object may be created using the factory functions @ref Create and @ref CreateWithFlags.
+ *
+ * The object may also assume ownership of a pre-existing handle via constructor or
+ * @link UniqueHandle::reset(handle_type) reset @endlink function.
+ */
+class CudaEvent : public UniqueHandle<cudaEvent_t, CudaEvent>
+{
+public:
+    NVCV_INHERIT_UNIQUE_HANDLE(cudaEvent_t, CudaEvent)
+    constexpr CudaEvent() = default;
+
+    /** @brief Creates an event on the specified device (or current device, if deviceId < 0) */
+    static CudaEvent Create(int deviceId = -1);
+
+    /** @brief Creates an event with specific flags on the specified device
+     * (or the current device, if deviceId < 0)
+     */
+    static CudaEvent CreateWithFlags(unsigned flags, int deviceId = -1);
+
+    /** @brief Calls cudaEventDestroy on the handle. */
+    static void DestroyHandle(cudaEvent_t);
+};
+
+} // namespace nvcv::util
+
+#endif // NVCV_UTIL_CUDA_EVENT_H_
diff --git a/src/util/PerStreamCache.hpp b/src/util/PerStreamCache.hpp
new file mode 100644
index 00000000..ce7bf460
--- /dev/null
+++ b/src/util/PerStreamCache.hpp
@@ -0,0 +1,265 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef NVCV_UTIL_PER_STREAM_CACHE_HPP +#define NVCV_UTIL_PER_STREAM_CACHE_HPP + +#include "CheckError.hpp" +#include "Event.hpp" +#include "SimpleCache.hpp" +#include "StreamId.hpp" + +#include +#include +#include +#include +#include + +namespace nvcv::util { + +class EventCache : public nvcv::util::SimpleCache +{ +public: + CudaEvent get() + { + return getOrCreate([]() { return CudaEvent::Create(); }); + } +}; + +template +auto StreamCachePayloadReady(const Payload &payload) +{ + return payload.ready; +} + +template +auto StreamCachePayloadSize(const Payload &payload) +{ + return payload.size; +} + +template +auto StreamCachePayloadAlignment(const Payload &payload) +{ + return payload.alignment; +} + +namespace detail { + +/** Cache item - stores a payload in a bidirectional list item. + * + * @tparam Payload The payload of the cache item. It must have the followin fields: + * size_t size + * cudaEvent_t ready + */ +template +struct StreamCacheItem +{ + StreamCacheItem *next = nullptr, *prev = nullptr; + + mutable bool wasReady = false; + + Payload payload{}; + + /** Gets a CUDA event that signifies that the payload is ready. + */ + cudaEvent_t readyEvent() const + { + return StreamCachePayloadReady(payload); + } + + size_t payloadSize() const + { + return StreamCachePayloadSize(payload); + } + + bool isReady() const + { + if (wasReady) + return true; + if (auto ev = readyEvent()) + { + auto err = cudaEventQuery(ev); + if (err == cudaErrorNotReady) + return false; + NVCV_CHECK_THROW(err); + } + wasReady = true; + return true; + } +}; + +template> +class StreamCacheItemAllocator +{ +public: + using item_t = Item; + + ~StreamCacheItemAllocator() + { + assert(m_allocated == 0); + while (m_head) + { + auto *next = m_head->next; + delete m_head; + m_head = next; + } + } + + item_t *allocate() + { + if (auto *p = m_head) + { + m_head = p->next; + p->next = nullptr; + assert(!p->prev); + m_allocated++; + m_free--; + + *p = {}; // clear the object + return p; + } + + auto *p = new item_t(); + m_allocated++; + return p; + } + + void deallocate(item_t *item) + { + if (!item) + return; + + assert(!item->next && !item->prev && "The item is still linked"); + item->payload = {}; + + item->next = m_head; + m_head = item; + m_allocated--; + m_free++; + } + +private: + item_t *m_head = nullptr; + + size_t m_allocated = 0, m_free = 0; +}; + +template> +class StreamOrderedCache +{ +public: + using item_t = Item; + + explicit StreamOrderedCache(StreamCacheItemAllocator *itemAlloc) + : m_itemAlloc(itemAlloc) + { + } + + ~StreamOrderedCache() + { + waitAndPurge(); + } + + void waitAndPurge(); + + template + void removeAllReady(PayloadCallback callback); + + item_t *findNewestReady(); + + void put(Payload &&payload); + + bool empty() const + { + return m_bySize.empty(); + } + + template + std::optional getIf(size_t minSize, Predicate &&pred); + + std::optional get(size_t minSize, size_t minAlignment) + { + return getIf( + minSize, [=](const Payload &p) + { return StreamCachePayloadSize(p) >= minSize && StreamCachePayloadAlignment(p) >= minAlignment; }); + } + +private: + void insert(item_t *item); + + void remove(size_t payloadSize, item_t *item) noexcept; + + StreamCacheItemAllocator *m_itemAlloc; + + std::set> m_bySize; + + item_t *m_head = nullptr, *m_tail = nullptr; +}; + +} // namespace detail + +template> +class PerStreamCache +{ + using StreamOrderedCache = detail::StreamOrderedCache; + +public: + template + std::optional getIf(size_t minSize, Predicate &&pred, std::optional stream); + + auto 
get(size_t minSize, size_t minAlignment, std::optional stream) + { + return getIf( + minSize, + [=](const Payload &p) + { return StreamCachePayloadSize(p) >= minSize && StreamCachePayloadAlignment(p) >= minAlignment; }, + stream); + } + + void put(Payload &&payload, std::optional stream); + + void purge() + { + std::lock_guard g(m_lock); + for (auto &[k, v] : m_perStreamCache) v.waitAndPurge(); + m_globalCache.clear(); + } + +private: + template + std::optional tryGetPerStream(size_t minSize, Predicate &&pred, cudaStream_t stream); + + template + std::optional tryGetGlobal(size_t minSize, Predicate &&pred); + + int moveReadyToGlobal(); + + detail::StreamCacheItemAllocator m_cacheItemAlloc; + + std::unordered_map m_perStreamCache; + + std::multimap m_globalCache; + + std::mutex m_lock; +}; + +} // namespace nvcv::util + +#include "PerStreamCacheImpl.hpp" + +#endif // NVCV_UTIL_PER_STREAM_CACHE_HPP diff --git a/src/util/PerStreamCacheImpl.hpp b/src/util/PerStreamCacheImpl.hpp new file mode 100644 index 00000000..8a152562 --- /dev/null +++ b/src/util/PerStreamCacheImpl.hpp @@ -0,0 +1,330 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_PER_STREAM_CACHE_IMPL_HPP +#define NVCV_UTIL_PER_STREAM_CACHE_IMPL_HPP + +#ifndef NVCV_UTIL_PER_STREAM_CACHE_HPP +# error "This file must not be included directly. Include StreamOrderedCache.hpp." +#endif + +namespace nvcv::util { +namespace detail { + +template +void StreamOrderedCache::waitAndPurge() +{ + bool ready = false; + size_t erased = 0; + while (m_tail) + { + if (!ready && m_tail->readyEvent()) + { + auto err = cudaEventSynchronize(m_tail->readyEvent()); + if (err != cudaErrorCudartUnloading) + NVCV_CHECK_THROW(err); + ready = true; + } + auto *curr = m_tail; + m_tail = m_tail->prev; + curr->prev = nullptr; + if (m_tail) + m_tail->next = nullptr; + m_itemAlloc->deallocate(curr); + erased++; + } + assert(erased == m_bySize.size()); + m_head = nullptr; + m_bySize.clear(); +} + +template +template +void StreamOrderedCache::removeAllReady(PayloadCallback callback) +{ + if (nvcv::util::IsCudaStreamIdHintUnambiguous()) + { + // If the stream id hint is unambiguous, we can find the newest item + // and all older items will naturally be ready. + + item_t *item = findNewestReady(); + // This item and all older items are ready + while (item) + { + item_t *prev = item->prev; + size_t payloadSize = item->payloadSize(); + callback(std::move(item->payload)); + remove(payloadSize, item); + item = prev; + } + } + else + { + // The system's stream id hint is ambiguous, so we may have a mixture + // of items actually coming from different streams. We need to + // chek them one by one, since the readiness order may be lost. 
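+        // (An ambiguous hint means GetCudaStreamIdHint typically falls back to the raw stream
+        // handle value, which can be reused by a later, different stream; one per-stream list
+        // may therefore hold work from several streams, and completion need not follow list order.)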
+ + item_t *item = m_tail; + while (item) + { + item_t *prev = item->prev; + if (item->isReady()) + { + size_t payloadSize = item->payloadSize(); + callback(std::move(item->payload)); + remove(payloadSize, item); + } + item = prev; + } + } +} + +template +auto StreamOrderedCache::findNewestReady() -> item_t * +{ + constexpr int kMaxItemsOnStack = 256; + item_t *tmp[kMaxItemsOnStack]; + item_t *sectionStart = m_tail; + // Process the items in blocks of up to kMaxItemsOnStack. On each block, a binary search is performed. + while (sectionStart) + { + if (sectionStart->isReady()) + return sectionStart; // everything elese is newer, hence also ready + + item_t *it = sectionStart->prev; // no point in re-checking the section start + int hi = 0; + for (; it && hi < kMaxItemsOnStack; hi++, it = it->prev) tmp[hi] = it; + + if (hi == 0) + return nullptr; + + // There are no ready elements in this range - move on + if (!tmp[hi - 1]->isReady()) + { + sectionStart = it; + continue; + } + + int lo = 0, m = (lo + hi) >> 1; + // After this loop, `m` is going to contain the index of the newest ready element + while (lo < hi) // exclusive upper bound + { + // halfway element is ready - maybe there are newer ones that are ready, too? + if (tmp[m]->isReady()) + { + hi = m; // exclusive upper bound + m = (lo + hi) >> 1; + } + else // halfway element isn't ready - move to `m+1` as a potential lower bound + { + lo = m + 1; + m = (lo + hi) >> 1; + } + } + assert(0 <= m && m <= hi); + assert(tmp[m]->wasReady); + return tmp[m]; + } + return nullptr; +} + +template +void StreamOrderedCache::put(Payload &&payload) +{ + item_t *item = m_itemAlloc->allocate(); + item->payload = std::move(payload); + payload = {}; + try + { + insert(item); + } + catch (...) + { + m_itemAlloc->deallocate(item); + throw; + } +} + +template +template +std::optional StreamOrderedCache::getIf(size_t minSize, Predicate &&pred) +{ + auto it = m_bySize.lower_bound({minSize, nullptr}); + for (; it != m_bySize.end(); ++it) + { + auto *item = it->second; + if (pred(item->payload)) + { + size_t payloadSize = item->payloadSize(); + Payload ret = std::move(item->payload); + remove(payloadSize, item); + return ret; + } + } + return std::nullopt; +} + +template +void StreamOrderedCache::insert(item_t *item) +{ + auto inserted = m_bySize.insert({item->payloadSize(), item}); +#ifdef NDEBUG + (void)inserted; +#endif + assert(inserted.second); + + if (!m_tail) + { + assert(!m_head); + m_head = m_tail = item; + } + else + { + assert(!m_tail->next); + item->prev = m_tail; + m_tail->next = item; + m_tail = item; + } +} + +template +void StreamOrderedCache::remove(size_t payloadSize, item_t *item) noexcept +{ + if (item == m_head) + m_head = m_head->next; + if (item == m_tail) + m_tail = m_tail->prev; + + if (item->prev) + item->prev->next = item->next; + if (item->next) + item->next->prev = item->prev; + item->prev = item->next = nullptr; + + size_t erased = m_bySize.erase({payloadSize, item}); +#ifdef NDEBUG + (void)erased; +#endif + assert(erased == 1); + + m_itemAlloc->deallocate(item); +} + +} // namespace detail + +template +template +std::optional PerStreamCache::getIf(size_t minSize, Predicate &&pred, + std::optional stream) +{ + std::optional ret; + + std::lock_guard guard(m_lock); + + if (stream) + { + ret = tryGetPerStream(minSize, pred, *stream); + if (ret) + return ret; + } + + do + { + ret = tryGetGlobal(minSize, pred); + if (ret) + return ret; + } + while (moveReadyToGlobal()); + + return std::nullopt; +} + +template +template +std::optional 
PerStreamCache::tryGetPerStream(size_t size, Predicate &&pred, + cudaStream_t stream) +{ + uint64_t streamId = GetCudaStreamIdHint(stream); + auto it = m_perStreamCache.find(streamId); + if (it == m_perStreamCache.end()) + return std::nullopt; + return it->second.getIf(size, std::forward(pred)); +} + +template +template +std::optional PerStreamCache::tryGetGlobal(size_t size, Predicate &&pred) +{ + for (auto it = m_globalCache.lower_bound(size); it != m_globalCache.end(); ++it) + { + if (pred(it->second)) + { + Payload ret = std::move(it->second); + m_globalCache.erase(it); + return ret; + } + } + return std::nullopt; +} + +template +int PerStreamCache::moveReadyToGlobal() +{ + int moved = 0; + for (auto it = m_perStreamCache.begin(); it != m_perStreamCache.end();) + { + it->second.removeAllReady( + [&](Payload &&payload) + { + m_globalCache.emplace(StreamCachePayloadSize(payload), std::move(payload)); + moved++; + }); + if (it->second.empty()) + it = m_perStreamCache.erase(it); + else + ++it; + } + return moved; +} + +template +void PerStreamCache::put(Payload &&payload, std::optional stream) +{ + cudaEvent_t readyEvent = StreamCachePayloadReady(payload); + bool per_stream = readyEvent != nullptr && cudaEventQuery(readyEvent) == cudaErrorNotReady; + + std::lock_guard guard(m_lock); + + if (per_stream) + { + uint64_t id = stream ? GetCudaStreamIdHint(*stream) : (uint64_t)-1ll; + auto cacheIt = m_perStreamCache.find(id); + if (cacheIt == m_perStreamCache.end()) + cacheIt = m_perStreamCache.emplace(id, &m_cacheItemAlloc).first; + + cacheIt->second.put(std::move(payload)); + } + else + { + size_t size = StreamCachePayloadSize(payload); + m_globalCache.emplace(size, std::move(payload)); + } +} + +} // namespace nvcv::util + +#endif // NVCV_UTIL_PER_STREAM_CACHE_IMPL_HPP diff --git a/src/util/SimpleCache.hpp b/src/util/SimpleCache.hpp new file mode 100644 index 00000000..f9091c5a --- /dev/null +++ b/src/util/SimpleCache.hpp @@ -0,0 +1,137 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_SIMPLE_CACHE_HPP +#define NVCV_UTIL_SIMPLE_CACHE_HPP + +#include +#include +#include + +namespace nvcv::util { + +/** A simple cache that stores objects of type T + * + * The cache stores objects of type T. Upon a call to `Get`, an object + * is moved out from the cache and returned as `optional`. If the cache + * is empty, `nullopt` is returned. + * `GetOrCreate` alwas returns an object (unless it throws) - if the cache is empty, + * a user-provided factory function is invoked and a new object is returned. + * Objects can be placed in the cache with a call to `Put` or `Emplace`. + * + * The cache is guarded with a lockable object (by default std::mutex). + * + * The cache is implemented as a unidirectional list of entries. + * Each entry holds an optional instance of type T. 
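+ *
+ * A minimal usage sketch (illustrative only; the element type is arbitrary):
+ * @code
+ *   SimpleCache<std::vector<int>> cache;
+ *   cache.emplace(1024);                                                  // construct a vector of 1024 ints in the cache
+ *   auto v = cache.getOrCreate([] { return std::vector<int>(1024); });    // reuses the cached vector
+ *   cache.put(std::move(v));                                              // return it to the cache when done
+ * @endcode
+ *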
+ * Once an object is removed, the cache entry is stored for reuse in an auxiliary list, + * reducing the number of dynamic allocations. + * + * @tparam T The type of itmes held in the cache + * @tparam LockType A lockable object + */ +template +class SimpleCache +{ +public: + std::optional get() + { + if (m_items) + { + std::lock_guard lg(m_lock); + if (m_items) + { + auto tmp = std::move(m_items); + m_items = std::move(tmp->next); + auto obj = std::move(tmp->payload); + tmp->next = std::move(m_empty); + m_empty = std::move(tmp); + return obj; + } + } + return std::nullopt; + } + + template + T getOrCreate(CreateFunc &&create) + { + auto cached = get(); + if (cached.has_value()) + return std::move(cached).value(); + else + return create(); + } + + void put(T &&payload) + { + emplace(std::move(payload)); + } + + template + void emplace(Args &&...args) + { + std::lock_guard lg(m_lock); + + std::unique_ptr item; + if (m_empty) + { + item = std::move(m_empty); + m_empty = std::move(item->next); + } + else + { + item = std::make_unique(); + } + item->payload.emplace(std::forward(args)...); + + item->next = std::move(m_items); + m_items = std::move(item); + } + + void purge() + { + std::lock_guard lg(m_lock); + + m_items.reset(); + m_empty.reset(); + } + +private: + struct CacheItem + { + ~CacheItem() + { + // Iterate through all subsequent elements to avoid deep recursion + while (next) + { + // detach the chain from the `next` + auto tmp = std::move(next->next); + // this will delete the next + next = std::move(tmp); + } + } + + std::unique_ptr next; + std::optional payload; + }; + + std::unique_ptr m_items, m_empty; + LockType m_lock; +}; + +} // namespace nvcv::util + +#endif // NVCV_UTIL_SIMPLE_CACHE_HPP diff --git a/src/util/Stream.cpp b/src/util/Stream.cpp new file mode 100644 index 00000000..a2aaab12 --- /dev/null +++ b/src/util/Stream.cpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Stream.hpp" + +#include "CheckError.hpp" + +#include +#include + +namespace nvcv::util { + +CudaStream CudaStream::Create(bool nonBlocking, int deviceId) +{ + cudaStream_t stream; + int flags = nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault; + int prevDev = -1; + if (deviceId >= 0) + { + NVCV_CHECK_THROW(cudaGetDevice(&prevDev)); + NVCV_CHECK_THROW(cudaSetDevice(deviceId)); + } + auto err = cudaStreamCreateWithFlags(&stream, flags); + if (prevDev >= 0) + NVCV_CHECK_THROW(cudaSetDevice(prevDev)); + NVCV_CHECK_THROW(err); + return CudaStream(stream); +} + +CudaStream CudaStream::CreateWithPriority(bool nonBlocking, int priority, int deviceId) +{ + cudaStream_t stream; + int flags = nonBlocking ? 
cudaStreamNonBlocking : cudaStreamDefault; + int prevDev = -1; + if (deviceId >= 0) + { + NVCV_CHECK_THROW(cudaGetDevice(&prevDev)); + NVCV_CHECK_THROW(cudaSetDevice(deviceId)); + } + auto err = cudaStreamCreateWithPriority(&stream, flags, priority); + if (prevDev >= 0) + NVCV_CHECK_THROW(cudaSetDevice(prevDev)); + NVCV_CHECK_THROW(err); + return CudaStream(stream); +} + +void CudaStream::DestroyHandle(cudaStream_t stream) +{ + auto err = cudaStreamDestroy(stream); + if (err != cudaSuccess && err != cudaErrorCudartUnloading) + { + NVCV_CHECK_THROW(err); + } +} + +} // namespace nvcv::util diff --git a/src/util/Stream.hpp b/src/util/Stream.hpp new file mode 100644 index 00000000..44399a3b --- /dev/null +++ b/src/util/Stream.hpp @@ -0,0 +1,58 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_CUDA_STREAM_H_ +#define NVCV_UTIL_CUDA_STREAM_H_ + +#include "UniqueHandle.hpp" + +#include + +#include + +namespace nvcv::util { + +/** + * @brief A wrapper class for CUDA stream handle (cudaStream_t), + * + * The purpose of this class is to provide safe ownership and lifecycle management + * for CUDA stream handles. + * The stream object may be created using the factory functions @ref Create and + * @ref CreateWithPriority. + * + * The object may also assume ownership of a pre-existing handle via constructor or + * @link UniqueHandle::reset(handle_type) reset @endlink function. + */ +class CudaStream : public UniqueHandle +{ +public: + NVCV_INHERIT_UNIQUE_HANDLE(cudaStream_t, CudaStream) + + /// @brief Creates a stream on specified device (or current device, if device_id < 0) + static CudaStream Create(bool nonBlocking, int deviceId = -1); + + /// @brief Creates a stream with given priority on specified device + /// (or current device, if device_id < 0) + static CudaStream CreateWithPriority(bool nonBlocking, int priority, int deviceId = -1); + + /// @brief Calls cudaStreamDestroy on the handle. + static void DestroyHandle(cudaStream_t stream); +}; + +} // namespace nvcv::util + +#endif // NVCV_UTIL_CUDA_STREAM_H_ diff --git a/src/util/StreamId.cpp b/src/util/StreamId.cpp new file mode 100644 index 00000000..27fc0f7e --- /dev/null +++ b/src/util/StreamId.cpp @@ -0,0 +1,150 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
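
The `SimpleCache` and `CudaStream` utilities added above compose naturally: a cache of ready-made CUDA streams avoids repeated `cudaStreamCreate`/`cudaStreamDestroy` calls. The sketch below is illustrative only and not part of this patch — the include paths, the file-scope cache, and the `AcquireStream`/`RecycleStream` names are assumptions.

```cpp
#include "SimpleCache.hpp" // illustrative include paths; adjust to the actual build setup
#include "Stream.hpp"

#include <utility>

// Minimal sketch: reuse non-blocking CUDA streams through SimpleCache instead of
// creating and destroying a stream for every request.
static nvcv::util::SimpleCache<nvcv::util::CudaStream> g_streamCache;

nvcv::util::CudaStream AcquireStream()
{
    // Pops a cached stream if one is available, otherwise invokes the factory lambda.
    return g_streamCache.getOrCreate([] { return nvcv::util::CudaStream::Create(/*nonBlocking=*/true); });
}

void RecycleStream(nvcv::util::CudaStream &&stream)
{
    // Hands the stream back so a later AcquireStream() can reuse it.
    g_streamCache.put(std::move(stream));
}
```

Because `CudaStream` is a move-only handle wrapper, it round-trips through the cache without any extra reference counting.
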
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "StreamId.hpp" + +#include +#include +#include + +using cuStreamGetId_t = CUresult(CUstream, unsigned long long *); + +#if CUDA_VERSION >= 12000 + +namespace { + +cuStreamGetId_t *_cuStreamGetId = cuStreamGetId; + +bool _hasPreciseHint() +{ + return true; +} + +} // namespace + +#else + +# include +# include +# include + +namespace { + +inline int getTID() +{ + return syscall(SYS_gettid); +} + +constexpr uint64_t MakeLegacyStreamId(int dev, int tid) +{ + return (uint64_t)dev << 32 | tid; +} + +CUresult cuStreamGetIdFallback(CUstream stream, unsigned long long *id) +{ + // If the stream handle is a pseudohandle, use some special treatment.... + if (stream == 0 || stream == CU_STREAM_LEGACY || stream == CU_STREAM_PER_THREAD) + { + int dev = -1; + if (cudaGetDevice(&dev) != cudaSuccess) + return CUDA_ERROR_INVALID_CONTEXT; + // If we use a per-thread stream, get TID; otherwise use -1 as a pseudo-tid + *id = MakeLegacyStreamId(dev, stream == CU_STREAM_PER_THREAD ? getTID() : -1); + return CUDA_SUCCESS; + } + else + { + // Otherwise just use the handle - it's not perfactly safe, but should do. + *id = (uint64_t)stream; + return CUDA_SUCCESS; + } +} + +cuStreamGetId_t *getRealStreamIdFunc() +{ + static cuStreamGetId_t *fn = []() + { + void *sym = nullptr; + // If it fails, we'll just return nullptr. + (void)cuGetProcAddress("cuStreamGetId", &sym, 12000, CU_GET_PROC_ADDRESS_DEFAULT); + return (cuStreamGetId_t *)sym; + }(); + return fn; +} + +bool _hasPreciseHint() +{ + static bool ret = getRealStreamIdFunc() != nullptr; + return ret; +} + +CUresult cuStreamGetIdBootstrap(CUstream stream, unsigned long long *id); + +cuStreamGetId_t *_cuStreamGetId = cuStreamGetIdBootstrap; + +CUresult cuStreamGetIdBootstrap(CUstream stream, unsigned long long *id) +{ + cuStreamGetId_t *realFunc = getRealStreamIdFunc(); + if (realFunc) + _cuStreamGetId = realFunc; + else + _cuStreamGetId = cuStreamGetIdFallback; + + return _cuStreamGetId(stream, id); +} + +} // namespace + +#endif + +namespace nvcv::util { + +bool IsCudaStreamIdHintUnambiguous() +{ + return _hasPreciseHint(); +} + +uint64_t GetCudaStreamIdHint(CUstream stream) +{ + static auto initResult = cuInit(0); + (void)initResult; + unsigned long long id; + CUresult err = _cuStreamGetId(stream, &id); + if (err != CUDA_SUCCESS) + { + switch (err) + { + case CUDA_ERROR_DEINITIALIZED: + // This is most likely to happen during process teardown, so likely in a destructor + // - we don't want to throw there and the stream equality is immaterial anyway at this point. + return -1; + case CUDA_ERROR_INVALID_VALUE: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid stream handle"); + default: + { + const char *msg = ""; + const char *name = "Unknown error"; + (void)cuGetErrorString(err, &msg); + (void)cuGetErrorName(err, &name); + throw nvcv::Exception(nvcv::Status::ERROR_INTERNAL, "CUDA error %s %i %s", name, err, msg); + } + } + } + return id; +} + +} // namespace nvcv::util diff --git a/src/util/StreamId.hpp b/src/util/StreamId.hpp new file mode 100644 index 00000000..dbed99d9 --- /dev/null +++ b/src/util/StreamId.hpp @@ -0,0 +1,47 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
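
`GetCudaStreamIdHint` above resolves to `cuStreamGetId` on CUDA 12+ drivers and to a device/thread-based fallback otherwise; this is the mechanism that lets code such as `PerStreamCache` bucket resources per stream. Below is a hedged sketch of keying simple bookkeeping on the hint — the map, the `RecordLaunch` name, and the include path are illustrative, not part of this patch.

```cpp
#include "StreamId.hpp" // illustrative include path

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdint>
#include <unordered_map>

// Illustrative per-stream bookkeeping keyed by the stream id hint. On drivers
// without cuStreamGetId the hint may alias after a stream is destroyed, so it
// should be treated as an optimization hint only, as the header warns.
static std::unordered_map<uint64_t, std::size_t> g_launchesPerStream;

void RecordLaunch(cudaStream_t stream)
{
    // cudaStream_t and CUstream are the same handle type, so the stream can be passed directly.
    uint64_t id = nvcv::util::GetCudaStreamIdHint(stream);

    if (!nvcv::util::IsCudaStreamIdHintUnambiguous())
    {
        // Fallback path (no cuStreamGetId): ids may be reused; this sketch accepts the imprecision.
    }

    ++g_launchesPerStream[id];
}
```
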
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_STREAM_ID_HPP +#define NVCV_UTIL_STREAM_ID_HPP + +#include + +#include + +namespace nvcv::util { + +/** Retrieves a value that identifies a stream. + * + * @warning On older drivers ID aliasing is possible, when a still-running stream is deleted + * and a new one is created before the one just deleted completes its work. + * + * @param stream CUDA stream handle (note that CUstram and cudaStream_t are one type) + * @return int64_t The ID of the stream within the system. + */ +uint64_t GetCudaStreamIdHint(CUstream stream); + +/** Checks whether the stream id hint is unique + * + * If the system supports cuStreamGetId, then the value returned by GetCudaStreamIdHint + * uniquely identifies a stream. This creates some optimization opportunities when managing + * stream-bound resources. + */ +bool IsCudaStreamIdHintUnambiguous(); + +} // namespace nvcv::util + +#endif // NVCV_UTIL_STREAM_ID_HPP diff --git a/src/util/TensorDataUtils.cpp b/src/util/TensorDataUtils.cpp index e716fecc..862f62aa 100644 --- a/src/util/TensorDataUtils.cpp +++ b/src/util/TensorDataUtils.cpp @@ -35,7 +35,7 @@ static void printPlane(const uint8_t *data, int width, int height, int rowStride std::cout << "]"; endB = true; } - else if (i % bytesPC == 0) + else if (x % bytesPC == 0) { if (x % (bytesPC * numC) == 0 && !endB) { @@ -46,6 +46,7 @@ static void printPlane(const uint8_t *data, int width, int height, int rowStride std::cout << ","; } } + if (i % rowStride == 0) { std::cout << "\n["; @@ -236,4 +237,151 @@ nvcv::Tensor CreateTensor(int numImages, int imgWidth, int imgHeight, const nvcv return nvcv::Tensor(numImages, {imgWidth, imgHeight}, imgFormat); } +static void GetImageByteVectorFromTensorPlanar(const TensorData &tensorData, int sample, + std::vector &outData) +{ + Optional tDataAc = nvcv::TensorDataAccessStridedImagePlanar::Create(tensorData); + + if (!tDataAc) + throw std::runtime_error("Tensor Data not compatible with planar access."); + + if (tDataAc->numSamples() <= sample || sample < 0) + throw std::runtime_error("Number of samples smaller than requested sample."); + + // in a planar tensor the dtype represents each plane so the total bytes per pixel must be calculated + int bytesPerC = tDataAc->dtype().bitsPerPixel() / 8; + int outputSizeBytes = tDataAc->numRows() * tDataAc->numCols() * bytesPerC * tDataAc->numChannels(); + + // Make sure we have the right size. 
+ outData.resize(outputSizeBytes); + Byte *basePtr = tDataAc->sampleData(sample); + size_t dstWidth = tDataAc->numCols() * bytesPerC; + for (int i = 0; i < tDataAc->numChannels(); ++i) + { + if (cudaSuccess + != cudaMemcpy2D(outData.data() + (i * (tDataAc->numCols() * tDataAc->numRows()) * bytesPerC), dstWidth, + basePtr, tDataAc->rowStride(), dstWidth, tDataAc->numRows(), cudaMemcpyDeviceToHost)) + { + throw std::runtime_error("CudaMemcpy failed on copy of channel plane from device to host."); + } + basePtr += tDataAc->planeStride(); + } + return; +} + +void GetImageByteVectorFromTensor(const TensorData &tensorData, int sample, std::vector &outData) +{ + Optional tDataAc = nvcv::TensorDataAccessStridedImage::Create(tensorData); + + if (!tDataAc) + throw std::runtime_error("Tensor Data not compatible with pitch access."); + if (tDataAc->infoLayout().isChannelFirst()) + return GetImageByteVectorFromTensorPlanar(tensorData, sample, outData); + + if (tDataAc->numSamples() <= sample || sample < 0) + throw std::runtime_error("Number of samples smaller than requested sample."); + + int bytesPerPixel = (tDataAc->dtype().bitsPerPixel() / 8) * tDataAc->numChannels(); + int outputSizeBytes = tDataAc->numRows() * tDataAc->numCols() * bytesPerPixel; + + // Make sure we have the right size. + outData.resize(outputSizeBytes); + + if (cudaSuccess + != cudaMemcpy2D(outData.data(), tDataAc->numCols() * bytesPerPixel, tDataAc->sampleData(sample), + tDataAc->rowStride(), tDataAc->numCols() * bytesPerPixel, tDataAc->numRows(), + cudaMemcpyDeviceToHost)) + { + throw std::runtime_error("CudaMemcpy failed"); + } + return; +} + +static void SetImageTensorFromByteVectorPlanar(const TensorData &tensorData, std::vector &data, int sample) +{ + Optional tDataAc = nvcv::TensorDataAccessStridedImagePlanar::Create(tensorData); + + if (!tDataAc) + throw std::runtime_error("Tensor Data not compatible with planar image access."); + + if (tDataAc->numSamples() <= sample) + throw std::runtime_error("Number of samples smaller than requested sample."); + + if ((int64_t)data.size() + != tDataAc->numCols() * tDataAc->numRows() * (tDataAc->dtype().bitsPerPixel() / 8) * tDataAc->numChannels()) + throw std::runtime_error("Data vector is incorrect size, size must be W*H*bytesPerPixel."); + + int bytesPerC = (tDataAc->dtype().bitsPerPixel() / 8); + + auto copyToGpu = [&](int j) + { + Byte *basePtr = tDataAc->sampleData(j); + + for (int i = 0; i < tDataAc->numChannels(); ++i) + { + Byte *srcPtr = data.data() + (i * (tDataAc->numCols() * tDataAc->numRows() * bytesPerC)); + size_t srcPitch = tDataAc->numCols() * bytesPerC; + size_t srcWidthBytes = tDataAc->numCols() * bytesPerC; + if (cudaSuccess + != cudaMemcpy2D(basePtr, tDataAc->rowStride(), srcPtr, srcPitch, srcWidthBytes, tDataAc->numRows(), + cudaMemcpyHostToDevice)) + { + throw std::runtime_error("CudaMemcpy failed for channel plane copy from host to device."); + } + basePtr += tDataAc->planeStride(); + } + }; + + if (sample < 0) + for (auto i = 0; i < tDataAc->numSamples(); ++i) + { + copyToGpu(i); + } + else + copyToGpu(sample); +} + +void SetImageTensorFromByteVector(const TensorData &tensorData, std::vector &data, int sample) +{ + Optional tDataAc = nvcv::TensorDataAccessStridedImage::Create(tensorData); + + if (!tDataAc) + throw std::runtime_error("Tensor Data not compatible with pitch access."); + + if (tDataAc->infoLayout().isChannelFirst()) // planar case + return SetImageTensorFromByteVectorPlanar(tensorData, data, sample); + + if (tDataAc->numSamples() <= sample) + throw 
std::runtime_error("Number of samples smaller than requested sample."); + + if ((int64_t)data.size() + != tDataAc->numCols() * tDataAc->numRows() * (tDataAc->dtype().bitsPerPixel() / 8) * tDataAc->numChannels()) + throw std::runtime_error("Data vector is incorrect size, size must be N*W*sizeof(pixel)."); + + int bytesPerC = (tDataAc->dtype().bitsPerPixel() / 8); + + auto copyToGpu = [&](int i) + { + Byte *basePtr = tDataAc->sampleData(i); + Byte *srcPtr = data.data(); + size_t srcPitch = tDataAc->numCols() * bytesPerC * tDataAc->numChannels(); + size_t srcWidthBytes = tDataAc->numCols() * bytesPerC * tDataAc->numChannels(); + + if (cudaSuccess + != cudaMemcpy2D(basePtr, tDataAc->rowStride(), srcPtr, srcPitch, srcWidthBytes, tDataAc->numRows(), + cudaMemcpyHostToDevice)) + { + throw std::runtime_error("CudaMemcpy failed on copy of image from host to device."); + } + }; + + if (sample < 0) + for (auto i = 0; i < tDataAc->numSamples(); ++i) + { + copyToGpu(i); + } + else + copyToGpu(sample); +} + } // namespace nvcv::util diff --git a/src/util/TensorDataUtils.hpp b/src/util/TensorDataUtils.hpp index 3a0ec6f0..fb641a07 100644 --- a/src/util/TensorDataUtils.hpp +++ b/src/util/TensorDataUtils.hpp @@ -243,8 +243,7 @@ static void SetTensorToRandomValue(const TensorData &tensorData, DT minVal, DT m /** * Writes over the Tensor data with an array of type DT array must be - * the size of sampleStride(). All samples will be overriden. - * Function does not do data type checking + * the size of H*W*C with DT matching the Tensor datatype. Function does not do data type checking. * * @param[in,out] tensorData created tensor object. * @@ -272,7 +271,20 @@ template static void SetImageTensorFromVector(const TensorData &tensorData, std::vector
&data, int sample = -1); /** - * Returns a vector contains the values of the provided sample. + * Writes over the Tensor data with an byte array, array must be + * the size of H*W*C*bytesPerC. Function does not do data type checking. + * + * @param[in,out] tensorData created tensor object. + * + * @param[in] data vector of bytes with the data to set the tensor to. + * + * @param[in] sample optional the sample to write to if -1 all samples are written + */ +void SetImageTensorFromByteVector(const TensorData &tensorData, std::vector &data, int sample = -1); + +/** + * Returns a vector contains the values of the provided sample including any padding. This function assumes that the DT data type + * matches the datatype in the TensorData. * * @param[in] tensorData created tensor object. * @@ -286,7 +298,8 @@ static void GetVectorFromTensor(const TensorData &tensorData, int sample, std::v /** * Returns a vector contains the values of the provided sample. This vector will only contain - * the values of the image and not any padding/stride. + * the values of the image and not any padding/stride. This function assumes that the DT data type + * matches the datatype in the TensorData. * * @param[in] tensorData created tensor object. * @@ -298,6 +311,20 @@ static void GetVectorFromTensor(const TensorData &tensorData, int sample, std::v template static void GetImageVectorFromTensor(const TensorData &tensorData, int sample, std::vector
&outData); +/** + * Returns a byte vector which contains the values of the specified sample. This vector will only contain + * the values of the image and not any padding/stride. Also this will return a byte array regardless of + * the DataType of the tensor. The byte vector returned will be the size of H*W*C*bytesPerC. + * + * @param[in] tensorData created tensor object. + * + * @param[in] sample the sample to copy to vector 0 index. + * + * @param[out] outData the data to set the tensor to. + * + */ +void GetImageByteVectorFromTensor(const TensorData &tensorData, int sample, std::vector &outData); + /** * Sets the TensorImageData to the value set by the data parameter * region defines the amount of image to set starting at 0,0 @@ -534,9 +561,9 @@ static void SetImageTensorFromVectorPlanar(const TensorData &tensorData, std::ve if ((int64_t)data.size() != tDataAc->numCols() * tDataAc->numRows() * tDataAc->numChannels()) throw std::runtime_error("Data vector is incorrect size, size must be W*C*sizeof(DT)*channels."); - auto copyToGpu = [&](int i) + auto copyToGpu = [&](int j) { - Byte *basePtr = tDataAc->sampleData(i); + Byte *basePtr = tDataAc->sampleData(j); for (int i = 0; i < tDataAc->numChannels(); ++i) { if (cudaSuccess @@ -672,37 +699,49 @@ void SetCvDataTo(TensorImageData &cvImg, DT data, Size2D region, uint8_t chFlags // Useful for debugging template -inline void PrintBuffer(const std::vector &vec, const ST &strides, const ST &shape, const char *name = "") +inline void PrintBuffer(const std::vector &vec, const ST &strides, const ST &shape, const char *name = "", + uint32_t endls = 0b1111) { + using BT = nvcv::cuda::BaseType; + using BT4 = nvcv::cuda::MakeType; + using CVT = std::conditional_t; + std::cout << "I Printing buffer " << name << " with:\nI\tSize = " << vec.size() << " Bytes\nI\tShape = " << shape - << "\nI\tStrides = " << strides << "\nI\tValues = " << std::flush; + << "\nI\tStrides = " << strides << "\nI\tValues = " << std::endl; - for (long w = 0; w < (nvcv::cuda::NumElements == 4 ? nvcv::cuda::GetElement(shape, 3) : 1); ++w) + for (BT x = 0; x < (nvcv::cuda::NumElements >= 1 ? nvcv::cuda::GetElement(shape, 0) : 1); ++x) { - if (w > 0) - std::cout << std::endl; - std::cout << "{" << std::flush; - for (long z = 0; z < (nvcv::cuda::NumElements >= 3 ? nvcv::cuda::GetElement(shape, 2) : 1); ++z) + if (endls & 0b1000) + std::cout << "{" << std::endl; + else + std::cout << "{" << std::flush; + for (BT y = 0; y < (nvcv::cuda::NumElements >= 2 ? nvcv::cuda::GetElement(shape, 1) : 1); ++y) { - std::cout << "[" << std::flush; - for (long y = 0; y < (nvcv::cuda::NumElements >= 2 ? nvcv::cuda::GetElement(shape, 1) : 1); ++y) + if (endls & 0b0100) + std::cout << " [" << std::endl; + else + std::cout << " [" << std::flush; + for (BT z = 0; z < (nvcv::cuda::NumElements >= 3 ? nvcv::cuda::GetElement(shape, 2) : 1); ++z) { - std::cout << "(" << std::flush; - for (long x = 0; x < (nvcv::cuda::NumElements >= 1 ? nvcv::cuda::GetElement(shape, 0) : 1); ++x) + std::cout << " " << std::flush; + for (BT w = 0; w < (nvcv::cuda::NumElements >= 4 ? 
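
The new byte-vector helpers documented above complement the typed `SetImageTensorFromVector`/`GetImageVectorFromTensor` templates when the element type is only known at run time. A minimal round-trip sketch follows; it assumes the caller already has a strided `TensorData` for an interleaved U8 image tensor (how that is exported from a `Tensor` is outside this sketch), and the include path and `nvcv::Byte` spelling mirror the declarations above but are not verified by this patch.

```cpp
#include "TensorDataUtils.hpp" // illustrative include path

#include <vector>

// Hedged sketch: upload the same host image into every sample of a tensor,
// then read sample 0 back. The vector size must be H*W*C*bytesPerChannel,
// as the documentation above requires (bytesPerChannel == 1 for U8).
void RoundTripBytes(const nvcv::TensorData &tensorData, int width, int height, int channels)
{
    std::vector<nvcv::Byte> host(static_cast<size_t>(width) * height * channels, nvcv::Byte{0});

    // sample == -1 writes the same data into all samples.
    nvcv::util::SetImageTensorFromByteVector(tensorData, host, /*sample=*/-1);

    // Reads back sample 0 without row padding; the helper resizes the vector as needed.
    nvcv::util::GetImageByteVectorFromTensor(tensorData, /*sample=*/0, host);
}
```
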
nvcv::cuda::GetElement(shape, 3) : 1); ++w) { - ST coord = nvcv::cuda::DropCast>(long4{x, y, z, w}); + ST coord = nvcv::cuda::DropCast>(BT4{x, y, z, w}); - if (x > 0) - std::cout << ", " << std::flush; - std::cout << ValueAt(vec, strides, coord) << std::flush; + std::cout << " " << static_cast(ValueAt(vec, strides, coord)) << std::flush; } - std::cout << ")" << std::flush; + if (endls & 0b0010) + std::cout << std::endl; + else + std::cout << std::flush; } - std::cout << "]" << std::flush; + if (endls & 0b0001) + std::cout << " ]" << std::endl; + else + std::cout << " ]" << std::flush; } - std::cout << "}" << std::flush; + std::cout << "}" << std::endl; } - std::cout << std::endl; } // Write images in *HW tensor buffer vec to PGM files. @@ -738,6 +777,18 @@ inline void WriteImagesToPGM(const char *filename, const std::vector &v return ST{coord.z, coord.w}; }; + auto convertValue = [](VT val) + { + if constexpr (std::is_same_v) + return val; + else if constexpr (std::is_integral_v && !std::is_signed_v) + return std::min((VT)255, std::max((VT)0, val)); + else if constexpr (std::is_integral_v && std::is_signed_v) + return std::min((VT)255, std::max((VT)0, (VT)std::abs(val))); + else + return std::min((VT)255, std::max((VT)0, (VT)std::round(std::abs(val)))); + }; + char fn[256]; for (long c0 = 0; c0 < c0size; ++c0) @@ -758,9 +809,7 @@ inline void WriteImagesToPGM(const char *filename, const std::vector &v VT val = util::ValueAt(vec, strides, coord); - int iVal = std::min(255, std::max(0, (int)std::round(std::abs(val)))); - - ofs << iVal << ((j == width - 1) ? "\n" : " "); + ofs << convertValue(val) << ((j == width - 1) ? "\n" : " "); } } diff --git a/src/util/UniqueHandle.hpp b/src/util/UniqueHandle.hpp new file mode 100644 index 00000000..dcfb4b10 --- /dev/null +++ b/src/util/UniqueHandle.hpp @@ -0,0 +1,191 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_UTIL_UNIQUE_HANDLE_H_ +#define NVCV_UTIL_UNIQUE_HANDLE_H_ + +#include + +namespace nvcv::util { + +/** + * @brief This class is an analogue of `unique_ptr` for non-memory resource handles. + * + * UniqueHandle is a base class for implementing managed resources (files, OS handles, etc). + * This class provides construction, assigment and decay to underlying handle type as well + * as equality comparison operators. + * + * @tparam HandleType type of the handle, e.g. `int` for file descriptors or `FILE*` for buffers + * @tparam Actual derived class (if using CRTP) or a handle information class. + * + * The interface of the `Actual` type: + * ``` + * static void DestroyHandle(HandleType h); // free or un-reference the underlying resource + * + * static constexpr HandleType null_handle(); // return a null handle; when using CRTP it's + * // optional and default implementation returns + * // default-constructed handle value. 
+ * ``` + * + * The handle can be populated by either the explicit constructor or using + * @link reset(handle_type) reset @endlink + * function. The derived classes may provide other ways of constructing the handle or the entire + * handle wrapper object. + + */ +template +class UniqueHandle +{ +public: + using handle_type = HandleType; + + constexpr inline UniqueHandle() + : handle_(Actual::null_handle()) + { + } + + /// @brief Constructs a handle wrapper, assuming ownership of given handle. + constexpr explicit UniqueHandle(handle_type handle) + : handle_(handle) + { + } + + UniqueHandle(const UniqueHandle &) = delete; + + UniqueHandle &operator=(const UniqueHandle &) = delete; + + inline UniqueHandle(UniqueHandle &&other) + : handle_(other.handle_) + { + other.handle_ = Actual::null_handle(); + } + + inline UniqueHandle &operator=(UniqueHandle &&other) + { + std::swap(handle_, other.handle_); + other.reset(); + return *this; + } + + /** + * @brief Obtains the stored handle + * + * The value is valid as long as the owning unique handle object is not destroyed, reset + * or overwritten. + */ + constexpr handle_type get() const &noexcept + { + return handle_; + } + + /** + * @brief Cannot obtain a valid handle from a temporary UniqueHandle + * + * If this function was allowed, the returned handle would have been destroyed + * by the time it's available to the caller. + */ + constexpr handle_type get() && = delete; + + /// @brief Make the wrapper usable in most context in which the handle type can be used + constexpr operator handle_type() const &noexcept + { + return get(); + } + + /// @brief Cannot obtain a valid handle from a temporary UniqueHandle (see `get`) + constexpr operator handle_type() && = delete; + + /** + * @brief Destroys the underlying resource and resets the handle to null value. + * + * @remarks + * * If the handle is already null, this function is a no-op. + * * The null value to replace the handle with, is taken from `Actual::null_value()`. + */ + inline void reset() + { + if (!Actual::is_null_handle(handle_)) + { + Actual::DestroyHandle(handle_); + handle_ = Actual::null_handle(); + } + } + + /** + * @brief Replaces the managed handle by the new one and destroying the old handle. + * @remarks If `handle` is equal to the currently managed handle, this function is no-op + */ + inline void reset(handle_type handle) + { + if (handle != handle_) + { + reset(); + handle_ = handle; + } + } + + /** + * @brief Relinquishes the ownership of the handle. + * + * The function replaces the managed handle with a null value and returns the old value. + * + * @returns The old handle value, no longer managed by UniqueHandle + * @remarks The null value to replace the handle with, is taken from `Actual::null_value()`. + */ + inline handle_type release() noexcept + { + handle_type old = handle_; + handle_ = Actual::null_handle(); + return old; + } + + /// @brief Indicates whether the handle is non-null. 
+ constexpr explicit operator bool() const noexcept + { + return !Actual::is_null_handle(handle_); + } + + static constexpr handle_type null_handle() noexcept + { + return {}; + } + + static constexpr bool is_null_handle(const handle_type &handle) noexcept + { + return handle == Actual::null_handle(); + } + +protected: + inline ~UniqueHandle() + { + reset(); + } + + handle_type handle_; +}; + +/** + * A macro to inherit the common interface from UniqueHandle + * - useful when using UniqueHandle in CRTP + */ +#define NVCV_INHERIT_UNIQUE_HANDLE(HandleType, WrapperClass) \ + using nvcv::util::UniqueHandle::UniqueHandle; \ + using nvcv::util::UniqueHandle::operator=; + +} // namespace nvcv::util + +#endif // NVCV_UTIL_UNIQUE_HANDLE_H_ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index de95135c..88ceacc5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -42,7 +42,7 @@ if(UNIX) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run_tests.sh.in ${TESTS_DRIVER} @ONLY) - macro(nvcv_add_test TESTCMD) + macro(nvcv_add_test TESTCMD TESTGROUP) get_filename_component(TESTNAME "${TESTCMD}" NAME) add_test(NAME "${TESTNAME}" COMMAND "${TESTCMD}") @@ -57,7 +57,7 @@ if(UNIX) set_tests_properties(${TESTNAME} PROPERTIES TIMEOUT ${TIMEOUT_TERM}) - file(APPEND "${TESTS_DRIVER}" "run ${TESTNAME}\n") + file(APPEND "${TESTS_DRIVER}" "run ${TESTNAME} ${TESTGROUP}\n") if(TARGET ${TESTNAME}) install(TARGETS ${TESTNAME} diff --git a/tests/common/CheckStatus.hpp b/tests/common/CheckStatus.hpp index b409301a..2374e359 100644 --- a/tests/common/CheckStatus.hpp +++ b/tests/common/CheckStatus.hpp @@ -133,3 +133,14 @@ using nvcv::Exception; { \ ADD_FAILURE() << "Expected an exception of type " #E ", got an unknown exception"; \ } + +#define NVCV_EXPECT_THROW_STATUS(status, ...) \ + try \ + { \ + __VA_ARGS__; \ + FAIL() << "Expected an error with status " << status; \ + } \ + catch (nvcv::Exception & e) \ + { \ + EXPECT_EQ(e.code(), nvcv::Status(status)); \ + } diff --git a/tests/cvcuda/CMakeLists.txt b/tests/cvcuda/CMakeLists.txt index 5b2f54a6..6fcbd49b 100644 --- a/tests/cvcuda/CMakeLists.txt +++ b/tests/cvcuda/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
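
`UniqueHandle` in src/util/UniqueHandle.hpp above is the CRTP base that `CudaStream` builds on, and the same pattern extends to any handle with a C-style destroy call. Below is a hedged sketch of a wrapper for `cudaEvent_t`; the `CudaEvent` name and its `Create` factory are hypothetical and used only for illustration.

```cpp
#include "UniqueHandle.hpp" // illustrative include path

#include <cuda_runtime.h>

#include <stdexcept>

// Hedged sketch: a move-only cudaEvent_t wrapper built on the UniqueHandle CRTP base,
// mirroring how CudaStream is implemented. Only DestroyHandle must be provided;
// null_handle() defaults to a value-initialized (null) handle.
class CudaEvent : public nvcv::util::UniqueHandle<cudaEvent_t, CudaEvent>
{
public:
    NVCV_INHERIT_UNIQUE_HANDLE(cudaEvent_t, CudaEvent)

    static CudaEvent Create(unsigned int flags = cudaEventDisableTiming)
    {
        cudaEvent_t ev = nullptr;
        if (cudaEventCreateWithFlags(&ev, flags) != cudaSuccess)
            throw std::runtime_error("cudaEventCreateWithFlags failed");
        return CudaEvent(ev); // assumes ownership; released on reset() or destruction
    }

    // Called by UniqueHandle::reset() and the destructor to release the resource.
    static void DestroyHandle(cudaEvent_t ev)
    {
        (void)cudaEventDestroy(ev); // errors ignored here for brevity
    }
};
```
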
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,6 +18,9 @@ # System tests for cvcuda public API add_subdirectory(system) +# Unit tests for cvcuda utilities and operator internals +add_subdirectory(unit) + if(BUILD_PYTHON) # System tests for cvcuda python add_subdirectory(python) diff --git a/tests/cvcuda/python/CMakeLists.txt b/tests/cvcuda/python/CMakeLists.txt index 813cc21c..503f50f4 100644 --- a/tests/cvcuda/python/CMakeLists.txt +++ b/tests/cvcuda/python/CMakeLists.txt @@ -43,4 +43,4 @@ set(PYTHON_TEST_DIR ${CMAKE_INSTALL_PREFIX}/${PYTHON_TEST_INSTDIR}) set(PYTHON_MODULE_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) configure_file(cvcuda_test_python.in cvcuda_test_python @ONLY) -nvcv_add_test(${CMAKE_CURRENT_BINARY_DIR}/cvcuda_test_python) +nvcv_add_test(${CMAKE_CURRENT_BINARY_DIR}/cvcuda_test_python cvcuda) diff --git a/tests/cvcuda/python/cvcuda_test_python.in b/tests/cvcuda/python/cvcuda_test_python.in index c74bb94a..eb648d82 100755 --- a/tests/cvcuda/python/cvcuda_test_python.in +++ b/tests/cvcuda/python/cvcuda_test_python.in @@ -45,14 +45,30 @@ function on_exit() } trap 'on_exit' EXIT +export PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" + for ver in $python_versions; do if [[ "$NVCV_FORCE_PYTHON" != 1 && "$NVCV_FORCE_PYTHON" != yes ]]; then - if ! PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" python$ver -c 'import nvcv'; then - echo "Skipping python-$ver, NVCV python bindings not installed" + if ! python$ver -c 'import cvcuda'; then + echo "Skipping python-$ver, CV-CUDA python bindings not installed" continue fi fi - PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" + # Check if python module is exposing only PyInit_cvcuda. + # Also provide some helpful info is exposing too much. + modfile=$(python$ver -c "import cvcuda; print(cvcuda.__file__)") + pubsyms=$(readelf -sWD $modfile | grep -v ' UND ' | grep ' GLOBAL ') + if [[ $(echo "$pubsyms" | wc -l) != 1 ]]; then + echo -e "cvcuda python $ver module is exposing too many symbols:\n$pubsyms" + exit 1 + fi + if ! echo "$pubsyms" | grep PyInit_cvcuda > /dev/null; then + echo -e "cvcuda python $ver module must expose symbol PyInit_cvcuda, but instead exposes:\n$pubsyms" + exit 2 + fi + + # Run python tests + NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" done diff --git a/tests/cvcuda/python/cvcuda_util.py b/tests/cvcuda/python/cvcuda_util.py index 0761e47a..dcdf55f3 100644 --- a/tests/cvcuda/python/cvcuda_util.py +++ b/tests/cvcuda/python/cvcuda_util.py @@ -13,13 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math +import torch import numpy as np import numbers -import torch import nvcv import copy import colorsys -import math IMG_FORMAT_TO_TYPE = { @@ -96,7 +96,7 @@ def generate_data(shape, dtype, max_random=None, rng=None): class CudaBuffer: - __cuda_array_interface = None + __cuda_array_interface__ = None obj = None @@ -165,17 +165,27 @@ def to_cuda_buffer(host_data): return buf -def to_nvcv_tensor(host_data, layout): - """Convert a tensor in host data with layout to nvcv.Tensor +def to_nvcv_tensor(data, layout): + """Convert a tensor in host or CUDA data with layout to nvcv.Tensor Args: - host_data (numpy array): Tensor in host data + data (numpy array or CUDA array): Tensor in host data layout (string): Tensor layout (e.g. 
NC, HWC, NHWC) Returns: nvcv.Tensor: The converted tensor """ - return nvcv.as_tensor(to_cuda_buffer(host_data), layout=layout) + cuda_data = data + if "__cuda_array_interface__" not in dir(cuda_data): + cuda_data = to_cuda_buffer(data) + shape = cuda_data.__cuda_array_interface__["shape"] + if layout is not None: + if len(shape) < len(layout): + shape = (1,) * (len(layout) - len(shape)) + shape + elif len(shape) > len(layout): + raise ValueError("Layout smaller than shape of tensor data") + cuda_data.__cuda_array_interface__["shape"] = shape + return nvcv.as_tensor(cuda_data, layout=layout) def create_tensor(shape, dtype, layout, max_random=None, rng=None, transform_dist=None): diff --git a/tests/cvcuda/python/test_adaptivethresholdtype.py b/tests/cvcuda/python/test_adaptivethresholdtype.py index 8e0ff2db..cf51ea29 100644 --- a/tests/cvcuda/python/test_adaptivethresholdtype.py +++ b/tests/cvcuda/python/test_adaptivethresholdtype.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch # noqa(F401) import cvcuda diff --git a/tests/cvcuda/python/test_bordertype.py b/tests/cvcuda/python/test_bordertype.py index f2f230ee..2f650932 100644 --- a/tests/cvcuda/python/test_bordertype.py +++ b/tests/cvcuda/python/test_bordertype.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch # noqa(F401) import cvcuda diff --git a/tests/cvcuda/python/test_import_order.py b/tests/cvcuda/python/test_import_order.py new file mode 100644 index 00000000..7178768a --- /dev/null +++ b/tests/cvcuda/python/test_import_order.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Import order is important, +# torch must be loaded correctly even if cvcuda was imported first +import cvcuda +import torch +import numpy as np + + +def test_import_cvcuda_first_works(): + torch.as_tensor(np.ndarray((4, 6), dtype=np.uint8), device="cuda") + cvcuda.Tensor((4, 6), dtype=np.uint8) diff --git a/tests/cvcuda/python/test_interptype.py b/tests/cvcuda/python/test_interptype.py index bf8c7371..7abf1f32 100644 --- a/tests/cvcuda/python/test_interptype.py +++ b/tests/cvcuda/python/test_interptype.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch # noqa(F401) import cvcuda diff --git a/tests/cvcuda/python/test_opadaptivethreshold.py b/tests/cvcuda/python/test_opadaptivethreshold.py index 30fe7cb3..a21eabe5 100644 --- a/tests/cvcuda/python/test_opadaptivethreshold.py +++ b/tests/cvcuda/python/test_opadaptivethreshold.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch # noqa(F401) import cvcuda import pytest as t import numpy as np diff --git a/tests/cvcuda/python/test_opbndbox.py b/tests/cvcuda/python/test_opbndbox.py index 3165bfff..8efa3129 100644 --- a/tests/cvcuda/python/test_opbndbox.py +++ b/tests/cvcuda/python/test_opbndbox.py @@ -24,62 +24,67 @@ ( (((3, 224, 224, 4), np.uint8, "NHWC")), cvcuda.BndBoxesI( - numBoxes=[3, 3, 3], boxes=[ - cvcuda.BndBoxI( - box=(10, 10, 5, 5), - thickness=2, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(20, 10, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(30, 10, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(10, 20, 5, 5), - thickness=2, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(20, 20, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(30, 20, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(10, 20, 5, 5), - thickness=2, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(20, 20, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), - cvcuda.BndBoxI( - box=(30, 20, 5, 5), - thickness=3, - borderColor=(0, 255, 255), - fillColor=(0, 128, 255, 128), - ), + [ + cvcuda.BndBoxI( + box=(10, 10, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(20, 10, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(30, 10, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + ], + [ + cvcuda.BndBoxI( + box=(10, 20, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(20, 20, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(30, 20, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + ], + [ + cvcuda.BndBoxI( + box=(10, 20, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(20, 20, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + cvcuda.BndBoxI( + box=(30, 20, 5, 5), + thickness=3, + borderColor=(0, 255, 255), + fillColor=(0, 128, 255, 128), + ), + ], ], ), ), diff --git a/tests/cvcuda/python/test_opboxblur.py b/tests/cvcuda/python/test_opboxblur.py index b361893c..981b6c8a 100644 --- a/tests/cvcuda/python/test_opboxblur.py +++ b/tests/cvcuda/python/test_opboxblur.py @@ -24,17 +24,22 @@ ( (((3, 224, 224, 4), np.uint8, "NHWC")), cvcuda.BlurBoxesI( - 
numBoxes=[3, 3, 3], boxes=[ - cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), - cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), - cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), - cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), - cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), - cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), - cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), - cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), - cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), + [ + cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), + cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), + cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), + ], + [ + cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), + cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), + cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), + ], + [ + cvcuda.BlurBoxI(box=(10, 10, 5, 5), kernelSize=7), + cvcuda.BlurBoxI(box=(50, 50, 7, 7), kernelSize=11), + cvcuda.BlurBoxI(box=(90, 90, 9, 9), kernelSize=17), + ], ], ), ), diff --git a/tests/cvcuda/python/test_opfindcontours.py b/tests/cvcuda/python/test_opfindcontours.py index 9da5fafc..90e1e89e 100644 --- a/tests/cvcuda/python/test_opfindcontours.py +++ b/tests/cvcuda/python/test_opfindcontours.py @@ -29,9 +29,10 @@ def test_op_find_contours(shape, dtype, layout): print(shape, dtype, layout) image = util.create_tensor(shape, dtype, layout, 1, rng=RNG) - points = cvcuda.find_contours(image) + points, num_contours_and_points = cvcuda.find_contours(image) assert points.shape[0] == image.shape[0] assert points.shape[2] == 2 + assert points.shape[0] == num_contours_and_points.shape[0] stream = cvcuda.Stream() points = cvcuda.Tensor( @@ -40,12 +41,13 @@ def test_op_find_contours(shape, dtype, layout): num_points = cvcuda.Tensor( (image.shape[0], 32), nvcv.Type.U32, nvcv.TensorLayout.NW ) - tmp = cvcuda.find_contours_into( + points_into, num_contours_and_points_into = cvcuda.find_contours_into( src=image, points=points, num_points=num_points, stream=stream, ) - assert tmp is points - assert points.shape[0] == image.shape[0] - assert points.shape[2] == 2 + assert points_into is points + assert points_into.shape[0] == image.shape[0] + assert points_into.shape[2] == 2 + assert points_into.shape[0] == num_contours_and_points_into.shape[0] diff --git a/tests/cvcuda/python/test_opfindhomography.py b/tests/cvcuda/python/test_opfindhomography.py new file mode 100644 index 00000000..3f8d5faa --- /dev/null +++ b/tests/cvcuda/python/test_opfindhomography.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import nvcv +import cvcuda +import pytest as t +import numpy as np + +RNG = np.random.default_rng(0) + + +@t.mark.parametrize( + "num_samples, num_points", + [ + (16, 1024), + (32, 1024), + (64, 1024), + ], +) +def test_op_findhomography(num_samples, num_points): + tensor_args = ((num_samples, num_points * 2), nvcv.Type._2F32, "NW") + src = cvcuda.Tensor(*tensor_args) + dst = cvcuda.Tensor(*tensor_args) + out = cvcuda.findhomography(src, dst) + assert out.shape == (num_samples, 3, 3) + assert out.dtype == np.float32 + + stream = cvcuda.Stream() + out_tensor_args = ((num_samples, 3, 3), np.float32, "NHW") + out = cvcuda.Tensor(*out_tensor_args) + tmp = cvcuda.findhomography_into( + models=out, + srcPts=src, + dstPts=dst, + stream=stream, + ) + assert tmp is out + assert out.shape == (num_samples, 3, 3) + assert out.dtype == nvcv.Type.F32 + + +@t.mark.parametrize( + "num_samples, num_points", + [ + (16, 1024), + (32, 1024), + (64, 1024), + ], +) +def test_op_findhomographyvarshape(num_samples, num_points): + tensor_args = ((1, num_points * 2), nvcv.Type._2F32, "NW") + srcBatch = cvcuda.TensorBatch(num_samples) + dstBatch = cvcuda.TensorBatch(num_samples) + for i in range(num_samples): + src = cvcuda.Tensor(*tensor_args) + dst = cvcuda.Tensor(*tensor_args) + srcBatch.pushback(src) + dstBatch.pushback(dst) + + outBatch = cvcuda.findhomography(srcPts=srcBatch, dstPts=dstBatch) + assert outBatch.dtype == nvcv.Type.F32 + assert outBatch.layout == "NHW" + + stream = cvcuda.Stream() + out_tensor_args = ((1, 3, 3), np.float32, "NHW") + outBatch = cvcuda.TensorBatch(num_samples) + for i in range(num_samples): + out = cvcuda.Tensor(*(out_tensor_args)) + outBatch.pushback(out) + + tmpBatch = cvcuda.findhomography_into( + models=outBatch, + srcPts=srcBatch, + dstPts=dstBatch, + stream=stream, + ) + assert tmpBatch is outBatch + assert outBatch.ndim == 3 + assert outBatch.dtype == nvcv.Type.F32 + assert outBatch.capacity == srcBatch.capacity diff --git a/tests/cvcuda/python/test_ophistogram.py b/tests/cvcuda/python/test_ophistogram.py index de1c2122..3a9f93e4 100644 --- a/tests/cvcuda/python/test_ophistogram.py +++ b/tests/cvcuda/python/test_ophistogram.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cvcuda import torch +import cvcuda import pytest as t import numpy as np import cvcuda_util as util diff --git a/tests/cvcuda/python/test_oplabel.py b/tests/cvcuda/python/test_oplabel.py new file mode 100644 index 00000000..8a3eb92c --- /dev/null +++ b/tests/cvcuda/python/test_oplabel.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cvcuda +import pytest as t +import numpy as np + + +DEF_OUT_DTYPE = np.uint32 +DEF_MAX_CAPACITY = 10000 + + +def defaultNumStats(layout): + return 8 if "D" in layout else 6 + + +@t.mark.parametrize( + "src_args", + [ + (((2, 11, 26, 32, 1), np.uint8, "NDHWC")), + (((3, 12, 29, 31), np.uint8, "NDHW")), + (((10, 22, 33, 1), np.uint8, "DHWC")), + (((14, 23, 34), np.uint8, "DHW")), + (((2, 15, 25, 1), np.uint8, "NHWC")), + (((3, 17, 24), np.uint8, "NHW")), + (((28, 37, 1), np.uint8, "HWC")), + (((18, 16), np.uint8, "HW")), + ], +) +def test_op_label_api(src_args): + src = cvcuda.Tensor(*src_args) + + if "D" not in src_args[2]: + dst, count, stats = cvcuda.label(src) + assert count is None and stats is None + assert dst.layout == src.layout + assert dst.shape == src.shape + assert dst.dtype == DEF_OUT_DTYPE + connectivity = cvcuda.CONNECTIVITY_4_2D + else: + connectivity = cvcuda.CONNECTIVITY_6_3D + dst, count, stats = cvcuda.label(src, connectivity) + assert count is None and stats is None + assert dst.layout == src.layout + assert dst.shape == src.shape + assert dst.dtype == DEF_OUT_DTYPE + + out = cvcuda.Tensor(src.shape, DEF_OUT_DTYPE, src.layout) + tmp, count, stats = cvcuda.label_into(out, src=src, connectivity=connectivity) + assert tmp is out and count is None and stats is None + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + num_samples = src_args[0][0] if "N" in src_args[2] else 1 + bg_label = cvcuda.Tensor((num_samples,), src.dtype, "N") + min_thresh = cvcuda.Tensor((num_samples,), src.dtype, "N") + max_thresh = cvcuda.Tensor((num_samples,), src.dtype, "N") + + out, count, stats = cvcuda.label( + src, + connectivity, + bg_label=bg_label, + min_thresh=min_thresh, + max_thresh=max_thresh, + ) + assert count is None and stats is None + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + out, count, stats = cvcuda.label(src, connectivity, count=True, stats=False) + assert count is not None and stats is None + + out, count, stats = cvcuda.label(src, connectivity, count=True, stats=True) + assert count is not None and stats is not None + + min_size = cvcuda.Tensor((num_samples,), DEF_OUT_DTYPE, "N") + + out, count, stats = cvcuda.label( + src, + connectivity, + cvcuda.LABEL.SEQUENTIAL, + count=True, + stats=True, + bg_label=bg_label, + min_size=min_size, + ) + assert count is not None and stats is not None + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + t_out, t_count, t_stats = cvcuda.label_into(out, count, stats, src, connectivity) + assert t_out is out and t_count is count and t_stats is stats + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + assert count.layout == "N" + assert count.shape[0] == num_samples + assert count.dtype == DEF_OUT_DTYPE + assert stats.layout == "NMA" + assert stats.shape == (num_samples, DEF_MAX_CAPACITY, defaultNumStats(src_args[2])) + assert stats.dtype == DEF_OUT_DTYPE + + out, count, stats = cvcuda.label( + src, connectivity, count=True, stats=True, max_labels=12345 + ) + assert stats.shape == (num_samples, 12345, defaultNumStats(src_args[2])) + + stream = cvcuda.Stream() + out, _, _ = cvcuda.label(src=src, connectivity=connectivity, stream=stream) + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == DEF_OUT_DTYPE + + tmp, _, _ = cvcuda.label_into( + dst=out, src=src, connectivity=connectivity, 
stream=stream + ) + assert tmp is out + assert out.layout == src.layout + assert out.shape == src.shape + assert out.dtype == np.uint32 diff --git a/tests/cvcuda/python/test_opmatch.py b/tests/cvcuda/python/test_opmatch.py new file mode 100644 index 00000000..2d29fd81 --- /dev/null +++ b/tests/cvcuda/python/test_opmatch.py @@ -0,0 +1,212 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cvcuda +import pytest as t +import numpy as np +import cvcuda_util as util + + +RNG = np.random.default_rng(0) + + +class ref: + """Python reference class to store constants and test output content""" + + num_dtype = np.int32 + out_dtype = np.int32 + dist_dtype = np.float32 + + def absdiff(a, b): + if type(a) == float: + return abs(a - b) + else: + return b - a if a < b else a - b + + def distance(p1, p2, norm_type): + if norm_type == cvcuda.Norm.HAMMING: + return sum([bin(c1 ^ c2).count("1") for c1, c2 in zip(p1, p2)]) + elif norm_type == cvcuda.Norm.L1: + return sum([abs(ref.absdiff(c1, c2)) for c1, c2 in zip(p1, p2)]) + elif norm_type == cvcuda.Norm.L2: + return np.sqrt(sum([ref.absdiff(c1, c2) ** 2 for c1, c2 in zip(p1, p2)])) + + def brute_force_matcher(batch_set1, batch_set2, cross_check, norm_type): + batch_matches = [] + batch_num_matches = [] + batch_distances = [] + for set1, set2 in zip(batch_set1, batch_set2): + batch_matches.append([]) + batch_num_matches.append(0) + batch_distances.append([]) + for set1_idx, p1 in enumerate(set1): + dist1to2 = [] + for set2_idx, p2 in enumerate(set2): + dist1to2.append((ref.distance(p1, p2, norm_type), set2_idx)) + sorted_dist_ids = sorted(dist1to2) + if cross_check: + p2 = set2[sorted_dist_ids[0][1]] + dist2to1 = [] + for q1_idx, q1 in enumerate(set1): + dist2to1.append((ref.distance(q1, p2, norm_type), q1_idx)) + cc_sorted_dist_ids = sorted(dist2to1) + if cc_sorted_dist_ids[0][1] == set1_idx: + batch_matches[-1].append([set1_idx, sorted_dist_ids[0][1]]) + batch_distances[-1].append(sorted_dist_ids[0][0]) + batch_num_matches[-1] += 1 + else: + batch_matches[-1].append([set1_idx, sorted_dist_ids[0][1]]) + batch_distances[-1].append(sorted_dist_ids[0][0]) + batch_num_matches[-1] += 1 + return batch_matches, batch_num_matches, batch_distances + + def sort(matches, num_matches, distances): + output = [] + for sample_idx in range(len(matches)): + for match_idx in range(num_matches[sample_idx]): + set1_idx = matches[sample_idx][match_idx][0] + set2_idx = matches[sample_idx][match_idx][1] + distance = distances[sample_idx][match_idx] + output.append((sample_idx, set1_idx, set2_idx, distance)) + return sorted(output) + + +@t.mark.parametrize( + "set_shape, set_dtype", + [ + ((1, 11, 1), np.uint8), + ((2, 12, 2), np.uint32), + ((3, 22, 3), np.float32), + ((4, 123, 32), np.uint8), + ((3, 234, 26), np.uint32), + ((2, 345, 13), np.float32), + ], +) +def test_op_match_api(set_shape, set_dtype): + set1 = cvcuda.Tensor(set_shape, 
set_dtype, "NMD") + set2 = cvcuda.Tensor(set_shape, set_dtype, "NMD") + + matches, num_matches, distances = cvcuda.match(set1, set2) + assert num_matches is None and distances is None + assert matches.shape == (set_shape[0], set_shape[1], 2) + assert matches.layout == "NMA" + assert matches.dtype == ref.out_dtype + + _, num_matches, _ = cvcuda.match(set1, set2, num_matches=True) + assert num_matches.shape == (set_shape[0],) + assert num_matches.layout == "N" + assert num_matches.dtype == ref.out_dtype + + _, _, distances = cvcuda.match(set1, set2, distances=True) + assert distances.shape == (set_shape[0], set_shape[1]) + assert distances.layout == "NM" + assert distances.dtype == ref.dist_dtype + + _, num_matches, _ = cvcuda.match(set1, set2, cross_check=True) + assert num_matches is not None + + _, num_matches, distances = cvcuda.match( + set1, set2, num_matches=True, distances=True + ) + assert num_matches is not None and distances is not None + + num_set1 = cvcuda.Tensor(set_shape[:1], ref.num_dtype, "N") + num_set2 = cvcuda.Tensor(set_shape[:1], ref.num_dtype, "N") + + big_matches, _, _ = cvcuda.match( + set1, + set2, + num_set1, + num_set2, + cross_check=False, + norm_type=cvcuda.Norm.L2, + matches_per_point=64, + algo_choice=cvcuda.Matcher.BRUTE_FORCE, + ) + assert big_matches.shape == (set_shape[0], set_shape[1] * 64, 2) + + tmp = cvcuda.match_into( + matches, + num_matches, + distances, + set1, + set2, + num_set1, + num_set2, + ) + assert tmp[0] is matches and tmp[1] is num_matches and tmp[2] is distances + + stream = cvcuda.Stream() + matches, _, _ = cvcuda.match(set1, set2, num_set1, num_set2, stream=stream) + assert matches.shape == (set_shape[0], set_shape[1], 2) + assert matches.layout == "NMA" + assert matches.dtype == ref.out_dtype + + tmp = cvcuda.match_into( + matches, + None, + None, + set1, + set2, + None, + None, + False, + 1, + cvcuda.Norm.L1, + cvcuda.Matcher.BRUTE_FORCE, + stream=stream, + ) + assert tmp[0] is matches and tmp[1] is None and tmp[2] is None + + +@t.mark.parametrize( + "set_shape, set_dtype, cross_check, norm_type", + [ + ((1, 18, 32), np.uint8, False, cvcuda.Norm.HAMMING), + ((2, 28, 21), np.uint32, False, cvcuda.Norm.L1), + ((3, 36, 10), np.float32, False, cvcuda.Norm.L2), + ((2, 17, 33), np.uint8, True, cvcuda.Norm.L1), + ((3, 57, 13), np.float32, True, cvcuda.Norm.L2), + ], +) +def test_op_match_content(set_shape, set_dtype, cross_check, norm_type): + h_set1 = util.generate_data(set_shape, set_dtype, max_random=255, rng=RNG) + h_set2 = util.generate_data(set_shape, set_dtype, max_random=255, rng=RNG) + + set1 = util.to_nvcv_tensor(h_set1, "NMD") + set2 = util.to_nvcv_tensor(h_set2, "NMD") + + matches, num_matches, distances = cvcuda.match( + set1, + set2, + num_matches=True, + distances=True, + cross_check=cross_check, + norm_type=norm_type, + algo_choice=cvcuda.Matcher.BRUTE_FORCE, + ) + + h_test_matches = util.to_cpu_numpy_buffer(matches.cuda()) + h_test_num_matches = util.to_cpu_numpy_buffer(num_matches.cuda()) + h_test_distances = util.to_cpu_numpy_buffer(distances.cuda()) + + h_gold_matches, h_gold_num_matches, h_gold_distances = ref.brute_force_matcher( + h_set1, h_set2, cross_check, norm_type + ) + + h_test_output = ref.sort(h_test_matches, h_test_num_matches, h_test_distances) + h_gold_output = ref.sort(h_gold_matches, h_gold_num_matches, h_gold_distances) + + np.testing.assert_allclose(h_test_output, h_gold_output, rtol=1e-5, atol=1e-5) diff --git a/tests/cvcuda/python/test_opmorphology.py b/tests/cvcuda/python/test_opmorphology.py index 
94e17ee6..99f3543b 100644 --- a/tests/cvcuda/python/test_opmorphology.py +++ b/tests/cvcuda/python/test_opmorphology.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import cvcuda import pytest as t import numpy as np import cvcuda_util as util -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_opnms.py b/tests/cvcuda/python/test_opnms.py index 481c0b7c..708075d5 100644 --- a/tests/cvcuda/python/test_opnms.py +++ b/tests/cvcuda/python/test_opnms.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import cvcuda import cvcuda_util as util import pytest as t import numpy as np -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_oposd.py b/tests/cvcuda/python/test_oposd.py index 37be497c..d12ea49a 100644 --- a/tests/cvcuda/python/test_oposd.py +++ b/tests/cvcuda/python/test_oposd.py @@ -24,100 +24,109 @@ ( (((2, 224, 224, 4), np.uint8, "NHWC")), cvcuda.Elements( - numElements=[5, 5], - elements=( - cvcuda.BndBoxI( - box=(10, 10, 5, 5), - thickness=2, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.Label( - utf8Text="def", - fontSize=30, - tlPos=(50, 50), - fontColor=(255, 255, 0), - bgColor=(0, 128, 255, 128), - ), - cvcuda.Segment( - box=(20, 20, 30, 30), - thickness=1, - segArray=np.array( - [ - [0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0], - [0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0], - [0, 0, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0, 0], - [0, 0.2, 0.3, 0.4, 0.5, 0.5, 0.4, 0.3, 0.2, 0], - [0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2], - [0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2], - [0, 0.2, 0.3, 0.4, 0.5, 0.5, 0.4, 0.3, 0.2, 0], - [0, 0, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0, 0], - [0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0], - [0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0], - ] + elements=[ + [ + cvcuda.BndBoxI( + box=(10, 10, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ) + ], + [ + cvcuda.BndBoxI( + box=(10, 10, 5, 5), + thickness=2, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), ), - segThreshold=0.2, - borderColor=(255, 255, 0), - segColor=(0, 128, 255, 128), - ), - cvcuda.Point( - centerPos=(30, 30), - radius=5, - color=(255, 255, 0), - ), - cvcuda.Line( - pos0=(50, 50), - pos1=(150, 50), - thickness=1, - color=(255, 0, 0), - ), - cvcuda.PolyLine( - points=np.array( - [ - [100, 100], - [600, 100], - [350, 300], - [600, 500], - [300, 500], - ] + cvcuda.Label( + utf8Text="def", + fontSize=30, + tlPos=(50, 50), + fontColor=(255, 255, 0), + bgColor=(0, 128, 255, 128), ), - thickness=1, - isClosed=True, - borderColor=(255, 255, 0), - fillColor=(0, 128, 255, 128), - ), - cvcuda.RotatedBox( - centerPos=(30, 30), - width=5, - height=5, - yaw=0.3, - thickness=1, - borderColor=(255, 255, 0), - bgColor=(0, 128, 255, 128), - ), - cvcuda.Circle( - centerPos=(30, 30), - radius=5, - thickness=2, - borderColor=(255, 255, 0), - bgColor=(0, 128, 255, 128), - ), - cvcuda.Arrow( - pos0=(50, 50), - pos1=(150, 50), - arrowSize=3, - thickness=1, - color=(255, 0, 0), - ), - cvcuda.Clock( - clockFormat=cvcuda.ClockFormat.YYMMDD_HHMMSS, - time=0, - fontSize=10, - tlPos=(150, 50), - fontColor=(255, 255, 0), - bgColor=(0, 128, 255, 128), - ), - ), + cvcuda.Segment( + box=(20, 20, 30, 30), + thickness=1, + segArray=np.array( + [ + [0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0], + [0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0], + [0, 0, 0.2, 0.3, 
0.4, 0.4, 0.3, 0.2, 0, 0], + [0, 0.2, 0.3, 0.4, 0.5, 0.5, 0.4, 0.3, 0.2, 0], + [0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2], + [0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2], + [0, 0.2, 0.3, 0.4, 0.5, 0.5, 0.4, 0.3, 0.2, 0], + [0, 0, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0, 0], + [0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0], + [0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0], + ] + ), + segThreshold=0.2, + borderColor=(255, 255, 0), + segColor=(0, 128, 255, 128), + ), + cvcuda.Point( + centerPos=(30, 30), + radius=5, + color=(255, 255, 0), + ), + cvcuda.Line( + pos0=(50, 50), + pos1=(150, 50), + thickness=1, + color=(255, 0, 0), + ), + cvcuda.PolyLine( + points=np.array( + [ + [100, 100], + [600, 100], + [350, 300], + [600, 500], + [300, 500], + ] + ), + thickness=1, + isClosed=True, + borderColor=(255, 255, 0), + fillColor=(0, 128, 255, 128), + ), + cvcuda.RotatedBox( + centerPos=(30, 30), + width=5, + height=5, + yaw=0.3, + thickness=1, + borderColor=(255, 255, 0), + bgColor=(0, 128, 255, 128), + ), + cvcuda.Circle( + centerPos=(30, 30), + radius=5, + thickness=2, + borderColor=(255, 255, 0), + bgColor=(0, 128, 255, 128), + ), + cvcuda.Arrow( + pos0=(50, 50), + pos1=(150, 50), + arrowSize=3, + thickness=1, + color=(255, 0, 0), + ), + cvcuda.Clock( + clockFormat=cvcuda.ClockFormat.YYMMDD_HHMMSS, + time=0, + fontSize=10, + tlPos=(150, 50), + fontColor=(255, 255, 0), + bgColor=(0, 128, 255, 128), + ), + ], + ], ), ), ], diff --git a/tests/cvcuda/python/test_oppillowresize.py b/tests/cvcuda/python/test_oppillowresize.py index 303e4a1c..bba37e5a 100644 --- a/tests/cvcuda/python/test_oppillowresize.py +++ b/tests/cvcuda/python/test_oppillowresize.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import cvcuda import pytest as t import numpy as np import cvcuda_util as util import threading -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_opreformat.py b/tests/cvcuda/python/test_opreformat.py index 0d507db7..2f478bb0 100644 --- a/tests/cvcuda/python/test_opreformat.py +++ b/tests/cvcuda/python/test_opreformat.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import cvcuda import pytest as t import numpy as np import threading -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_opremap.py b/tests/cvcuda/python/test_opremap.py index 2cd7ce66..2ff5d552 100644 --- a/tests/cvcuda/python/test_opremap.py +++ b/tests/cvcuda/python/test_opremap.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import nvcv import cvcuda import pytest as t import numpy as np import cvcuda_util as util -import torch RNG = np.random.default_rng(0) diff --git a/tests/cvcuda/python/test_opstack.py b/tests/cvcuda/python/test_opstack.py new file mode 100644 index 00000000..43d86fbb --- /dev/null +++ b/tests/cvcuda/python/test_opstack.py @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cvcuda +import pytest as t +import numpy as np +import random + +random.seed(1) + + +@t.mark.parametrize( + "input, dtype, number", + [ + (((5, 16, 23, 4), np.uint8, "NHWC"), np.int8, 2), + (((1, 160, 221, 2), np.uint8, "NHWC"), np.int8, 3), + (((1, 60, 1, 1), np.uint8, "NHWC"), np.int8, 1), + (((6, 61, 12, 3), np.uint8, "NHWC"), np.int8, 5), + (((5, 161, 23, 4), np.uint8, "NCHW"), np.int8, 2), + (((1, 160, 221, 2), np.uint8, "NCHW"), np.int8, 3), + (((1, 1, 2, 1), np.uint8, "NCHW"), np.int8, 1), + (((6, 13, 1, 3), np.uint8, "NCHW"), np.int8, 5), + (((16, 23, 4), np.uint8, "HWC"), np.int8, 2), + (((160, 221, 2), np.uint8, "HWC"), np.int8, 3), + (((60, 1, 1), np.uint8, "HWC"), np.int8, 1), + (((61, 12, 3), np.uint8, "HWC"), np.int8, 5), + (((161, 23, 4), np.uint8, "CHW"), np.int8, 2), + (((160, 221, 2), np.uint8, "CHW"), np.int8, 3), + (((1, 2, 1), np.uint8, "CHW"), np.int8, 1), + (((13, 1, 3), np.uint8, "CHW"), np.int8, 5), + ], +) +def test_op_stack(input, dtype, number): + + input_tensors = [] + + numberOfTensors = 0 + + updated_input = list(input) + for _ in range(number): + if updated_input[2] == "NHWC" or updated_input[2] == "NCHW": + updated_input[0] = (random.randint(1, input[0][0]),) + input[0][ + 1: + ] # Update the first value + numberOfTensors += updated_input[0][0] + else: + numberOfTensors += 1 + input_tensor = cvcuda.Tensor(*updated_input) + input_tensors.append(input_tensor) + + out = cvcuda.stack(input_tensors) + + assert out.shape[0] == numberOfTensors + assert out.dtype == input_tensors[0].dtype + + if input_tensors[0].shape == 3: + assert out.shape[1] == input_tensors[0].shape[0] + assert out.shape[2] == input_tensors[0].shape[1] + assert out.shape[3] == input_tensors[0].shape[2] + if input_tensors[0].shape == 4: + assert out.layout == input_tensors[0].layout + assert out.shape[1] == input_tensors[0].shape[1] + assert out.shape[2] == input_tensors[0].shape[2] + assert out.shape[3] == input_tensors[0].shape[3] + + # check stack into + outputTensorDef = list(updated_input) + if updated_input[2] == "NHWC" or updated_input[2] == "NCHW": + outputTensorDef[0] = (numberOfTensors,) + input[0][1:] + else: + outputTensorDef[0] = (numberOfTensors,) + input[0][0:] + if updated_input[2] == "HWC": + outputTensorDef[2] = "NHWC" + else: + outputTensorDef[2] = "NCHW" + + output_tensor = cvcuda.Tensor(*outputTensorDef) + cvcuda.stack_into(output_tensor, input_tensors) + + assert output_tensor.shape[0] == numberOfTensors + assert output_tensor.dtype == input_tensors[0].dtype + + if input_tensors[0].shape == 3: + assert output_tensor.shape[1] == input_tensors[0].shape[0] + assert output_tensor.shape[2] == input_tensors[0].shape[1] + assert output_tensor.shape[3] == input_tensors[0].shape[2] + if input_tensors[0].shape == 4: + assert output_tensor.layout == input_tensors[0].layout + assert output_tensor.shape[1] == input_tensors[0].shape[1] + assert output_tensor.shape[2] == input_tensors[0].shape[2] + assert output_tensor.shape[3] == input_tensors[0].shape[3] diff --git a/tests/cvcuda/python/test_opwarpperspective.py b/tests/cvcuda/python/test_opwarpperspective.py index 
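The stacking test just added drives both entry points of the new stack operator. As a quick reference, here is a minimal sketch based solely on the calls used in that test (cvcuda.Tensor, cvcuda.stack, cvcuda.stack_into); shapes and dtypes are arbitrary placeholders. The test feeds either single images (HWC/CHW) or already-batched tensors (NHWC/NCHW), and the output batch size is the total number of samples across the inputs.

import numpy as np
import cvcuda

# Three single images with identical HWC shape and dtype.
images = [cvcuda.Tensor((64, 64, 3), np.uint8, "HWC") for _ in range(3)]

# Allocating form: the result is batched, with one slice per stacked sample.
out = cvcuda.stack(images)
assert out.shape[0] == len(images)
assert out.dtype == images[0].dtype

# Preallocated form: the destination is an NHWC tensor sized for all samples.
batched = cvcuda.Tensor((len(images), 64, 64, 3), np.uint8, "NHWC")
cvcuda.stack_into(batched, images)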
6410986d..2b111750 100644 --- a/tests/cvcuda/python/test_opwarpperspective.py +++ b/tests/cvcuda/python/test_opwarpperspective.py @@ -102,6 +102,17 @@ cvcuda.Border.REPLICATE, [1, 2, 3, 4], ), + ( + ((11, 21, 4), np.uint8, "HWC"), + [ + [1, 2, 0], + [2, 1, 1], + [0, 0, 1], + ], + cvcuda.Interp.LINEAR | cvcuda.Interp.WARP_INVERSE_MAP, + cvcuda.Border.REPLICATE, + [1, 2, 3, 4], + ), ], ) def test_op_warp_perspective(input_args, xform, flags, border_mode, border_value): diff --git a/tests/cvcuda/python/test_util.py b/tests/cvcuda/python/test_util.py index ff3a8726..bf9da883 100644 --- a/tests/cvcuda/python/test_util.py +++ b/tests/cvcuda/python/test_util.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import nvcv import numpy as np -import torch import cvcuda_util as util diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index 2da3ac5b..a9e1a648 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -31,6 +31,9 @@ endif() # system core ------------------------------------------------- add_executable(cvcuda_test_system + TestOpPairwiseMatcher.cpp + TestOpStack.cpp + TestOpLabel.cpp TestOpFindContours.cpp TestOpOSD.cpp TestOpHistogramEq.cpp @@ -39,8 +42,8 @@ add_executable(cvcuda_test_system TestOpMinMaxLoc.cpp TestOpHistogram.cpp TestOpMinAreaRect.cpp - TestOpBoxBlur.cpp TestOpBndBox.cpp + TestOpBoxBlur.cpp OsdUtils.cu TestOpSIFT.cpp TestOpMinMaxLoc.cpp @@ -84,6 +87,7 @@ add_executable(cvcuda_test_system TestOpGaussianNoise.cpp GaussianNoiseUtils.cu TestOpInpaint.cpp + TestOpFindHomography.cpp ) target_link_libraries(cvcuda_test_system @@ -93,7 +97,7 @@ target_link_libraries(cvcuda_test_system cuosd ) -nvcv_add_test(cvcuda_test_system) +nvcv_add_test(cvcuda_test_system cvcuda) # header compatibility tests --------------------------------------------- diff --git a/tests/cvcuda/system/OsdUtils.cu b/tests/cvcuda/system/OsdUtils.cu index 63e48103..1fbf4d00 100644 --- a/tests/cvcuda/system/OsdUtils.cu +++ b/tests/cvcuda/system/OsdUtils.cu @@ -105,7 +105,7 @@ Segment *create_segment() Segment *output = new Segment(); output->width = 10; output->height = 10; - checkRuntime(cudaMalloc(&output->data, output->width * output->height * sizeof(float))); + output->data = (float *)malloc(output->width * output->height * sizeof(float)); std::vector diamond; diamond.insert(diamond.end(), {0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0}); diamond.insert(diamond.end(), {0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0}); @@ -117,16 +117,16 @@ Segment *create_segment() diamond.insert(diamond.end(), {0, 0, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0, 0}); diamond.insert(diamond.end(), {0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0, 0, 0}); diamond.insert(diamond.end(), {0, 0, 0, 0, 0.2, 0.2, 0, 0, 0, 0}); - checkRuntime(cudaMemcpy(output->data, diamond.data(), output->width * output->height * sizeof(float), - cudaMemcpyHostToDevice)); + memcpy(output->data, diamond.data(), output->width * output->height * sizeof(float)); return output; } void free_segment(Segment *segment) { - if (segment->data) + if (segment->data != nullptr) { - checkRuntime(cudaFree(segment->data)); + free(segment->data); + segment->data = nullptr; } segment->width = 0; segment->height = 0; @@ -146,17 +146,11 @@ Polyline *create_polyline() output->n_pts = points.size(); output->h_pts = (int *)malloc(output->n_pts * 2 * sizeof(int)); memcpy(output->h_pts, points.data(), output->n_pts * 2 * sizeof(int)); - 
checkRuntime(cudaMalloc(&output->d_pts, output->n_pts * 2 * sizeof(int))); - checkRuntime(cudaMemcpy(output->d_pts, points.data(), output->n_pts * 2 * sizeof(int), cudaMemcpyHostToDevice)); return output; } void free_polyline(Polyline *polyline) { - if (polyline->d_pts) - { - checkRuntime(cudaFree(polyline->d_pts)); - } if (polyline->h_pts) { free(polyline->h_pts); diff --git a/tests/cvcuda/system/OsdUtils.cuh b/tests/cvcuda/system/OsdUtils.cuh index 69d1a7ce..1686d44c 100644 --- a/tests/cvcuda/system/OsdUtils.cuh +++ b/tests/cvcuda/system/OsdUtils.cuh @@ -59,7 +59,6 @@ struct Point struct Polyline { int *h_pts = nullptr; - int *d_pts = nullptr; int n_pts = 0; }; diff --git a/tests/cvcuda/system/TestOpBndBox.cpp b/tests/cvcuda/system/TestOpBndBox.cpp index 45a1c3a6..e895347b 100644 --- a/tests/cvcuda/system/TestOpBndBox.cpp +++ b/tests/cvcuda/system/TestOpBndBox.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ namespace gt = ::testing; namespace test = nvcv::test; +using namespace cvcuda::priv; static int randl(int l, int h) { @@ -41,12 +43,12 @@ static int randl(int l, int h) } static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, - const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, NVCVBndBoxesI bboxes, - cudaStream_t stream) + const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, + std::shared_ptr bboxes, cudaStream_t stream) { auto context = cuosd_context_create(); - for (int n = 0; n < bboxes.batch; n++) + for (int n = 0; n < bboxes->batch(); n++) { test::osd::Image *image = test::osd::create_image( data.numCols(), data.numRows(), @@ -54,11 +56,11 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, int bufSize = data.numCols() * data.numRows() * data.numChannels(); EXPECT_EQ(cudaSuccess, cudaMemcpy(image->data0, inBuf + n * bufSize, bufSize, cudaMemcpyDeviceToDevice)); - auto numBoxes = bboxes.numBoxes[n]; + auto numBoxes = bboxes->numBoxesAt(n); for (int i = 0; i < numBoxes; i++) { - auto bbox = bboxes.boxes[i]; + auto bbox = bboxes->boxAt(n, i); int left = std::max(std::min(bbox.box.x, data.numCols() - 1), 0); int top = std::max(std::min(bbox.box.y, data.numRows() - 1), 0); @@ -80,7 +82,6 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, test::osd::cuosd_apply(context, image, stream); - bboxes.boxes = (NVCVBndBoxI *)((unsigned char *)bboxes.boxes + numBoxes * sizeof(NVCVBndBoxI)); EXPECT_EQ(cudaSuccess, cudaMemcpy(vect.data() + n * bufSize, image->data0, bufSize, cudaMemcpyDeviceToHost)); test::osd::free_image(image); @@ -93,14 +94,12 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, static void runOp(cudaStream_t &stream, cvcuda::BndBox &op, int &inN, int &inW, int &inH, int &num, int &sed, nvcv::ImageFormat &format) { - NVCVBndBoxesI bndBoxes; - std::vector numBoxVec; - std::vector bndBoxVec; + std::vector> bndBoxVec; srand(sed); for (int n = 0; n < inN; n++) { - numBoxVec.push_back(num); + std::vector curVec; for (int i = 0; i < num; i++) { NVCVBndBoxI bndBox; @@ -113,13 +112,12 @@ static void runOp(cudaStream_t &stream, cvcuda::BndBox &op, int &inN, int &inW, (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; bndBox.borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - bndBoxVec.push_back(bndBox); + curVec.push_back(bndBox); } + bndBoxVec.push_back(curVec); } - bndBoxes.batch = inN; - bndBoxes.numBoxes = 
numBoxVec.data(); - bndBoxes.boxes = bndBoxVec.data(); + std::shared_ptr bndBoxes = std::make_shared(bndBoxVec); nvcv::Tensor imgIn = nvcv::util::CreateTensor(inN, inW, inH, format); nvcv::Tensor imgOut = nvcv::util::CreateTensor(inN, inW, inH, format); @@ -145,7 +143,7 @@ static void runOp(cudaStream_t &stream, cvcuda::BndBox &op, int &inN, int &inW, EXPECT_EQ(cudaSuccess, cudaMemset(input->basePtr(), 0xFF, inSampleStride * inAccess->numSamples())); EXPECT_EQ(cudaSuccess, cudaMemset(output->basePtr(), 0xFF, outSampleStride * outAccess->numSamples())); - EXPECT_NO_THROW(op(stream, imgIn, imgOut, bndBoxes)); + EXPECT_NO_THROW(op(stream, imgIn, imgOut, (NVCVBndBoxesI)bndBoxes.get())); // check cdata std::vector test(outBufSize); diff --git a/tests/cvcuda/system/TestOpBoxBlur.cpp b/tests/cvcuda/system/TestOpBoxBlur.cpp index e18e73d0..e3efd566 100644 --- a/tests/cvcuda/system/TestOpBoxBlur.cpp +++ b/tests/cvcuda/system/TestOpBoxBlur.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -32,14 +33,15 @@ namespace gt = ::testing; namespace test = nvcv::test; +using namespace cvcuda::priv; static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, - NVCVBlurBoxesI bboxes, cudaStream_t stream) + std::shared_ptr bboxes, cudaStream_t stream) { auto context = cuosd_context_create(); - for (int n = 0; n < bboxes.batch; n++) + for (int n = 0; n < bboxes->batch(); n++) { test::osd::Image *image = test::osd::create_image( data.numCols(), data.numRows(), @@ -47,11 +49,11 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, int bufSize = data.numCols() * data.numRows() * data.numChannels(); EXPECT_EQ(cudaSuccess, cudaMemcpy(image->data0, inBuf + n * bufSize, bufSize, cudaMemcpyDeviceToDevice)); - auto numBoxes = bboxes.numBoxes[n]; + auto numBoxes = bboxes->numBoxesAt(n); for (int i = 0; i < numBoxes; i++) { - auto bbox = bboxes.boxes[i]; + auto bbox = bboxes->boxAt(n, i); int left = std::max(std::min(bbox.box.x, data.numCols() - 1), 0); int top = std::max(std::min(bbox.box.y, data.numRows() - 1), 0); @@ -70,7 +72,6 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, test::osd::cuosd_apply(context, image, stream); - bboxes.boxes = (NVCVBlurBoxI *)((unsigned char *)bboxes.boxes + numBoxes * sizeof(NVCVBlurBoxI)); EXPECT_EQ(cudaSuccess, cudaMemcpy(vect.data() + n * bufSize, image->data0, bufSize, cudaMemcpyDeviceToHost)); test::osd::free_image(image); @@ -82,13 +83,11 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, static void runOp(cudaStream_t &stream, cvcuda::BoxBlur &op, int &inN, int &inW, int &inH, int &cols, int &rows, int &wBox, int &hBox, int &ks, nvcv::ImageFormat &format) { - NVCVBlurBoxesI blurBoxes; - std::vector numBoxVec; - std::vector blurBoxVec; + std::vector> blurBoxVec; for (int n = 0; n < inN; n++) { - numBoxVec.push_back(cols * rows); + std::vector curVec; for (int i = 0; i < cols; i++) { int x = (inW / cols) * i + wBox / 2; @@ -100,14 +99,13 @@ static void runOp(cudaStream_t &stream, cvcuda::BoxBlur &op, int &inN, int &inW, blurBox.box.width = wBox; blurBox.box.height = hBox; blurBox.kernelSize = ks; - blurBoxVec.push_back(blurBox); + curVec.push_back(blurBox); } } + blurBoxVec.push_back(curVec); } - blurBoxes.batch = inN; - blurBoxes.numBoxes = numBoxVec.data(); - blurBoxes.boxes = blurBoxVec.data(); + std::shared_ptr blurBoxes = std::make_shared(blurBoxVec); nvcv::Tensor imgIn = 
nvcv::util::CreateTensor(inN, inW, inH, format); nvcv::Tensor imgOut = nvcv::util::CreateTensor(inN, inW, inH, format); @@ -140,7 +138,7 @@ static void runOp(cudaStream_t &stream, cvcuda::BoxBlur &op, int &inN, int &inW, EXPECT_EQ(cudaSuccess, cudaMemcpy(output->basePtr(), inVec.data(), outBufSize, cudaMemcpyHostToDevice)); // run operator - EXPECT_NO_THROW(op(stream, imgIn, imgOut, blurBoxes)); + EXPECT_NO_THROW(op(stream, imgIn, imgOut, (NVCVBlurBoxesI)blurBoxes.get())); // check cdata std::vector test(outBufSize); @@ -179,7 +177,6 @@ TEST_P(OpBoxBlur, BoxBlur_sanity) { cudaStream_t stream; ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - /* int inN = GetParamValue<0>(); int inW = GetParamValue<1>(); int inH = GetParamValue<2>(); @@ -189,9 +186,8 @@ TEST_P(OpBoxBlur, BoxBlur_sanity) int hBox = GetParamValue<6>(); int ks = GetParamValue<7>(); nvcv::ImageFormat format = GetParamValue<8>(); - cvcuda::BoxBlur op; + cvcuda::BoxBlur op; runOp(stream, op, inN, inW, inH, cols, rows, wBox, hBox, ks, format); - */ EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } diff --git a/tests/cvcuda/system/TestOpFindHomography.cpp b/tests/cvcuda/system/TestOpFindHomography.cpp new file mode 100644 index 00000000..a0ef4fb8 --- /dev/null +++ b/tests/cvcuda/system/TestOpFindHomography.cpp @@ -0,0 +1,394 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef PERFORMANCE_RUN +# define WARMUP_ITERATIONS 5 +# define PERF_ITERATIONS 50 +#endif + +namespace test = nvcv::test; +namespace util = nvcv::util; +namespace cuda = nvcv::cuda; + +static std::default_random_engine g_rng(std::random_device{}()); + +static void calculateDst(float x, float y, float *X, float *Y, float *model) +{ + *X = model[0] * x + model[1] * y + model[2] * 1; + *Y = model[3] * x + model[4] * y + model[5] * 1; +} + +static void calculateGoldModelMatrix(float *m, std::mt19937 &rng, std::uniform_int_distribution &dis) +{ + // random rotation angle between 0 and pi + float theta = (M_PI / 2.0) * dis(rng) / 100; + float Tx = (float)dis(rng) / 100; + float Ty = (float)dis(rng) / 100; + float sx = (float)dis(rng) / 100; + float sy = (float)dis(rng) / 100; + float p1 = (float)dis(rng) / 100; + float p2 = (float)dis(rng) / 100 * 2; + cuda::math::Matrix He; + He[0] = {cos(theta), -sin(theta), Tx}; + He[1] = {sin(theta), cos(theta), Ty}; + He[2] = {0, 0, 1}; + cuda::math::Matrix Ha; + Ha[0] = {1, sy, 0}; + Ha[1] = {sx, 1, 0}; + Ha[2] = {0, 0, 1}; + cuda::math::Matrix Hp; + Hp[0] = {1, 0, 0}; + Hp[1] = {0, 1, 0}; + Hp[2] = {p1, p2, 1}; + cuda::math::Matrix result = He * (Ha * Hp); + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) m[i * 3 + j] = result[i][j]; +} + +// clang-format off +NVCV_TEST_SUITE_P(OpFindHomography, test::ValueList +{ + // numSamples, numPoints} + {8, 16}, + {16, 20}, + {25, 40} +}); + +// clang-format on + +TEST_P(OpFindHomography, correct_output) +{ + int numSamples = GetParamValue<0>(); + int numPoints = GetParamValue<1>(); + numPoints *= numPoints; + + // clang-format off + nvcv::Tensor srcPoints({{numSamples, numPoints}, "NW"}, nvcv::TYPE_2F32); + nvcv::Tensor dstPoints({{numSamples, numPoints}, "NW"}, nvcv::TYPE_2F32); + nvcv::Tensor models({{numSamples, 3, 3}, "NHW"}, nvcv::TYPE_F32); + + // clang-format on + + auto srcData = srcPoints.exportData(); + auto dstData = dstPoints.exportData(); + auto modelsData = models.exportData(); + + ASSERT_EQ(srcData->shape(0), srcData->shape(0)); + ASSERT_EQ(srcData->shape(1), srcData->shape(1)); + + std::vector srcVec(2 * numSamples * numPoints); + std::vector dstVec(2 * numSamples * numPoints); + std::vector modelsVec(numSamples * 9); + std::vector estimatedModelsVec(numSamples * 9); + std::vector computedDstVec(2 * numSamples * numPoints); + + std::random_device rd; + std::mt19937 gen(rd()); // Mersenne Twister engine + std::uniform_int_distribution<> dis(0, 100); + + int numXPoints = static_cast(std::sqrt(numPoints)); + int numYPoints = numXPoints; + +#ifdef WRITE_COORDINATES_TO_FILE + std::string src_filename + = "src_coordinates_" + std::to_string(numSamples) + "x" + std::to_string(numPoints) + ".bin"; + std::string dst_filename + = "dst_coordinates_" + std::to_string(numSamples) + "x" + std::to_string(numPoints) + ".bin"; + + std::ofstream outSrcFile(src_filename.c_str(), std::ios::binary); + if (!outSrcFile.is_open()) + { + std::cerr << "Failed to open the src file for writing." << std::endl; + return; + } + + std::ofstream outDstFile(dst_filename.c_str(), std::ios::binary); + if (!outDstFile.is_open()) + { + std::cerr << "Failed to open the dst file for writing." 
<< std::endl; + return; + } +#endif + + // Fill gold models and src and dst points + for (int i = 0; i < numSamples; i++) + { +#pragma unroll + calculateGoldModelMatrix(&modelsVec[i * 9], gen, dis); + // generate src and dst points + for (int j = 0; j < numYPoints; j++) + { + for (int k = 0; k < numXPoints; k++) + { + int idx = j * numYPoints + k; + srcVec[i * numPoints * 2 + 2 * idx] = dis(gen); + srcVec[i * numPoints * 2 + 2 * idx + 1] = dis(gen); + + float dstx, dsty; + calculateDst(srcVec[i * numPoints * 2 + 2 * idx], srcVec[i * numPoints * 2 + 2 * idx + 1], &dstx, &dsty, + modelsVec.data() + i * 9); + dstVec[i * numPoints * 2 + 2 * idx] = dstx; + dstVec[i * numPoints * 2 + 2 * idx + 1] = dsty; + } + } + } + +#ifdef WRITE_COORDINATES_TO_FILE + outSrcFile.write(reinterpret_cast(srcVec.data()), srcVec.size() * sizeof(float)); + outDstFile.write(reinterpret_cast(dstVec.data()), dstVec.size() * sizeof(float)); + + outSrcFile.close(); + outDstFile.close(); +#endif + + ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec.data(), sizeof(float) * 2 * numPoints * numSamples, + cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(dstData->basePtr(), dstVec.data(), sizeof(float) * 2 * numPoints * numSamples, + cudaMemcpyHostToDevice)); + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + cvcuda::FindHomography fh(numSamples, numPoints); + +#ifdef PERFORMANCE_RUN + for (int it = 0; it < WARMUP_ITERATIONS; it++) + { + EXPECT_NO_THROW(fh(stream, srcPoints, dstPoints, models)); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start, stream); + for (int it = 0; it < PERF_ITERATIONS; it++) + { + EXPECT_NO_THROW(fh(stream, srcPoints, dstPoints, models)); + } + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time taken for " << numSamples << "x" << numPoints << " = " << milliseconds / PERF_ITERATIONS + << "ms\n"; + // std::cout << "Time taken per image = " << milliseconds / PERF_ITERATIONS / numSamples << "ms\n"; + + cudaEventDestroy(start); + cudaEventDestroy(stop); +#else + EXPECT_NO_THROW(fh(stream, srcPoints, dstPoints, models)); +#endif + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // copy back the estimated models into modelsVec + for (int i = 0; i < numSamples; i++) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(estimatedModelsVec.data() + i * 9, sizeof(float) * 3, + modelsData->basePtr() + i * modelsData->stride(0), modelsData->stride(1), + sizeof(float) * 3, 3, cudaMemcpyDeviceToHost)); + } + + // Compute dst vec based on model estimated +#ifndef PERFORMANCE_RUN + for (int i = 0; i < numSamples; i++) + { + for (int j = 0; j < numYPoints; j++) + { + for (int k = 0; k < numXPoints; k++) + { + int idx = j * numYPoints + k; + float dstx, dsty; + calculateDst(srcVec[i * numPoints * 2 + 2 * idx], srcVec[i * numPoints * 2 + 2 * idx + 1], &dstx, &dsty, + estimatedModelsVec.data() + i * 9); + computedDstVec[i * numPoints * 2 + 2 * idx] = dstx; + computedDstVec[i * numPoints * 2 + 2 * idx + 1] = dsty; + float A = dstVec[i * numPoints * 2 + 2 * idx]; + float B = computedDstVec[i * numPoints * 2 + 2 * idx]; + EXPECT_NEAR(A, B, 1e-03); + A = dstVec[i * numPoints * 2 + 2 * idx + 1]; + B = computedDstVec[i * numPoints * 2 + 2 * idx + 1]; + EXPECT_NEAR(A, B, 1e-03); + } + } + } +#endif +} + 
+TEST_P(OpFindHomography, varshape_correct_output) +{ + int numSamples = GetParamValue<0>(); + int maxPoints = GetParamValue<1>(); + std::vector numPoints(numSamples); + std::vector numXPoints(numSamples); + + std::mt19937 rng(12345); + std::uniform_int_distribution dis(0, 100); + std::uniform_int_distribution dis_num_points(4, maxPoints); + + auto reqs = nvcv::TensorBatch::CalcRequirements(numSamples); + nvcv::TensorBatch srcTensorBatch(reqs); + nvcv::TensorBatch dstTensorBatch(reqs); + nvcv::TensorBatch modelsTensorBatch(reqs); + + std::vector> srcVec(numSamples); + std::vector> dstVec(numSamples); + std::vector modelsVec(numSamples * 9); + std::vector estimatedModelsVec(numSamples * 9); + std::vector> computedDstVec(numSamples); + + int maxNumPoints = 0; + for (int i = 0; i < numSamples; i++) + { + numXPoints[i] = dis_num_points(rng); + numPoints[i] = numXPoints[i] * numXPoints[i]; + if (numPoints[i] > maxNumPoints) + maxNumPoints = numPoints[i]; + + // Fill gold models and src and dst points + calculateGoldModelMatrix(&modelsVec[i * 9], rng, dis); + for (int j = 0; j < numPoints[i]; j++) + { + int sx = dis(rng); + int sy = dis(rng); + srcVec[i].push_back(sx); + srcVec[i].push_back(sy); + + float dstx, dsty; + calculateDst(sx, sy, &dstx, &dsty, modelsVec.data() + i * 9); + dstVec[i].push_back(dstx); + dstVec[i].push_back(dsty); + } + + nvcv::Tensor srcPoints( + { + {1, numPoints[i]}, + "NW" + }, + nvcv::TYPE_2F32); + nvcv::Tensor dstPoints( + { + {1, numPoints[i]}, + "NW" + }, + nvcv::TYPE_2F32); + nvcv::Tensor models( + { + {1, 3, 3}, + "NHW" + }, + nvcv::TYPE_F32); + + auto srcData = srcPoints.exportData(); + auto dstData = dstPoints.exportData(); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec[i].data(), sizeof(float) * srcVec[i].size(), + cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(dstData->basePtr(), dstVec[i].data(), sizeof(float) * dstVec[i].size(), + cudaMemcpyHostToDevice)); + + srcTensorBatch.pushBack(srcPoints); + dstTensorBatch.pushBack(dstPoints); + modelsTensorBatch.pushBack(models); + } + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + cvcuda::FindHomography fh(numSamples, maxNumPoints); + +#ifdef PERFORMANCE_RUN + for (int it = 0; it < WARMUP_ITERATIONS; it++) + { + EXPECT_NO_THROW(fh(stream, batchSrc, batchDst, models)); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start, stream); + for (int it = 0; it < PERF_ITERATIONS; it++) + { + EXPECT_NO_THROW(fh(stream, batchSrc, batchDst, models)); + } + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time taken for " << numSamples << "x" << maxPoints << " = " << milliseconds / PERF_ITERATIONS + << "ms\n"; + cudaEventDestroy(start); + cudaEventDestroy(stop); +#else + EXPECT_NO_THROW(fh(stream, srcTensorBatch, dstTensorBatch, modelsTensorBatch)); +#endif + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // copy back the estimated models into modelsVec + for (int i = 0; i < numSamples; i++) + { + auto modelsData = modelsTensorBatch[i].exportData(); + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(estimatedModelsVec.data() + i * 9, sizeof(float) * 3, modelsData->basePtr(), + modelsData->stride(1), sizeof(float) * 3, 3, cudaMemcpyDeviceToHost)); + } + + // Compute dst vec based on model estimated +#ifndef 
PERFORMANCE_RUN + for (int i = 0; i < numSamples; i++) + { + for (int j = 0; j < numPoints[i]; j++) + { + float dstx, dsty; + float sx, sy; + sx = srcVec[i][2 * j + 0]; + sy = srcVec[i][2 * j + 1]; + calculateDst(sx, sy, &dstx, &dsty, estimatedModelsVec.data() + i * 9); + computedDstVec[i].push_back(dstx); + computedDstVec[i].push_back(dsty); + float A = dstVec[i][2 * j + 0]; + float B = computedDstVec[i][2 * j + 0]; + EXPECT_NEAR(A, B, 1e-03); + A = dstVec[i][2 * j + 1]; + B = computedDstVec[i][2 * j + 1]; + EXPECT_NEAR(A, B, 1e-03); + } + } +#endif +} diff --git a/tests/cvcuda/system/TestOpLabel.cpp b/tests/cvcuda/system/TestOpLabel.cpp new file mode 100644 index 00000000..12516ab4 --- /dev/null +++ b/tests/cvcuda/system/TestOpLabel.cpp @@ -0,0 +1,835 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Definitions.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ----------------------- Basic utility definitions --------------------------- + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; +namespace test = nvcv::test; +namespace type = nvcv::test::type; + +using U8 = uint8_t; + +using RawBufferType = std::vector; + +// --------------------- Reference (gold) computations ------------------------- + +namespace ref { + +// Pre-filter step is to binarize srcVec using threshold range [min, max] -> 1, zero otherwise +template +inline void Binarize(RawBufferType &srcVec, const RawBufferType &minVec, const RawBufferType &maxVec, + const long4 &srcStrides, const long1 &minStrides, const long1 &maxStrides, const long4 &shape) +{ + bool hasMinThresh = minStrides.x > 0; + bool hasMaxThresh = maxStrides.x > 0; + + for (long x = 0; x < shape.x; ++x) + { + ST minThresh = hasMinThresh ? util::ValueAt(minVec, minStrides, long1{x}) : 0; + ST maxThresh = hasMaxThresh ? util::ValueAt(maxVec, maxStrides, long1{x}) : 0; + + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + long4 curCoord{x, y, z, w}; + + ST value = util::ValueAt(srcVec, srcStrides, curCoord); + + if (hasMinThresh && hasMaxThresh) + { + value = (value < minThresh || value > maxThresh) ? 0 : 1; + } + else if (hasMinThresh) + { + value = (value < minThresh) ? 0 : 1; + } + else if (hasMaxThresh) + { + value = (value > maxThresh) ? 
0 : 1; + } + + util::ValueAt(srcVec, srcStrides, curCoord) = value; + } + } + } + } +} + +// Label each component with label in dstVec matching value in srcVec, marking labeled elements as 1 in tmpVec +// (since this function is called recursively, using big input sizes may lead to stack overflow) +template +inline void LabelComponent(RawBufferType &tmpVec, RawBufferType &dstVec, const RawBufferType &srcVec, + const long4 &tmpStrides, const long4 &dstStrides, const long4 &srcStrides, + const long4 &shape, const long4 &curCoord, ST value, DT label) +{ + if (util::ValueAt(tmpVec, tmpStrides, curCoord) == 1) + { + return; // The element was already labeled, skip it + } + if (value != util::ValueAt(srcVec, srcStrides, curCoord)) + { + return; // The element is not in the same labeled region, skip it + } + + // Set element label in dstVec and mark it as labeled in tmpVec + util::ValueAt
(dstVec, dstStrides, curCoord) = label; + util::ValueAt(tmpVec, tmpStrides, curCoord) = 1; + + // For each neighbor, recursively call label component to label each neighbor + if (curCoord.y > 0) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y - 1, curCoord.z, curCoord.w}, value, label); + } + if (curCoord.y < shape.y - 1) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y + 1, curCoord.z, curCoord.w}, value, label); + } + if (curCoord.z > 0) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y, curCoord.z - 1, curCoord.w}, value, label); + } + if (curCoord.z < shape.z - 1) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y, curCoord.z + 1, curCoord.w}, value, label); + } + if (curCoord.w > 0) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y, curCoord.z, curCoord.w - 1}, value, label); + } + if (curCoord.w < shape.w - 1) + { + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, + long4{curCoord.x, curCoord.y, curCoord.z, curCoord.w + 1}, value, label); + } +} + +// Label N volumes in NDHW tensor stored in srcVec yielding dstVec, with corresponding srcStrides/dstStrides +// - ST is the source type, the data type of the input tensor in srcVec +// - DT is the destination type, the data type of the output tensor in dstVec +template +void Label(RawBufferType &dstVec, const RawBufferType &srcVec, const long4 &dstStrides, const long4 &srcStrides, + const long4 &shape) +{ + // Use a temporary NDHW tensor stored in tmpVec to set elements already labeled, initially zeroes (all unlabeled) + RawBufferType tmpVec(shape.x * shape.y * shape.z * shape.w, 0); + + // The temporary tensor is packed and each element is a single byte, thus: + long4 tmpStrides{shape.y * shape.z * shape.w, shape.z * shape.w, shape.w, 1}; + + // For all elements in input tensor + for (long x = 0; x < shape.x; ++x) + { + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + long4 curCoord{x, y, z, w}; + + if (util::ValueAt(tmpVec, tmpStrides, curCoord) == 1) + { + continue; // The element was already labeled, skip it + } + + // Get current value from input tensor and set label as a 1D flattened (global) position + ST value = util::ValueAt(srcVec, srcStrides, curCoord); + DT label = y * dstStrides.y / sizeof(DT) + z * dstStrides.z / sizeof(DT) + w; + + // Recursively call to label component + LabelComponent(tmpVec, dstVec, srcVec, tmpStrides, dstStrides, srcStrides, shape, curCoord, value, + label); + } + } + } + } +} + +// Replace labels assigned to regions marked as background in source, and fix a potential region labeled with +// background label in destination by another label (since background label is a reserved label) +template +void ReplaceBgLabels(RawBufferType &dstVec, const RawBufferType &srcVec, const RawBufferType &bglVec, + const long4 &dstStrides, const long4 &srcStrides, const long1 &bglStrides, const long4 &shape) +{ + for (long x = 0; x < shape.x; ++x) + { + ST backgroundLabel = util::ValueAt(bglVec, bglStrides, long1{x}); + + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + long4 curCoord{x, y, z, w}; + + ST 
value = util::ValueAt<ST>(srcVec, srcStrides, curCoord); + DT label = util::ValueAt<DT>
(dstVec, dstStrides, curCoord); + + if (value == backgroundLabel) + { + // The current value is a background label, write it to output + util::ValueAt<DT>
(dstVec, dstStrides, curCoord) = (DT)backgroundLabel; + } + else if (label == (DT)backgroundLabel) + { + // If the label assigned happens to be the same as the background label, replace it by + // another label that is never assigned outside the possible offsets + util::ValueAt
(dstVec, dstStrides, curCoord) = dstStrides.x / sizeof(DT); + } + } + } + } + } +} + +// Get the unique set of labels from output in dstVec, disregarding background labels +template +void GetLabels(std::vector> &labels, const RawBufferType &dstVec, const RawBufferType &bglVec, + const long4 &dstStrides, const long1 &bglStrides, const long4 &dstShape) +{ + bool hasBgLabel = bglStrides.x > 0; + + for (long x = 0; x < dstShape.x; ++x) + { + ST backgroundLabel = hasBgLabel ? util::ValueAt(bglVec, bglStrides, long1{x}) : 0; + + for (long y = 0; y < dstShape.y; ++y) + { + for (long z = 0; z < dstShape.z; ++z) + { + for (long w = 0; w < dstShape.w; ++w) + { + DT label = util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}); + + if (hasBgLabel && label == (DT)backgroundLabel) + { + continue; // ignore (do not get) background labels + } + + labels[x].insert(label); + } + } + } + } +} + +// Get the unique set of labels from statistics in staVec +template +void GetLabels(std::vector> &labels, const RawBufferType &cntVec, const RawBufferType &staVec, + const long1 &cntStrides, const long3 &staStrides, long numSamples) +{ + for (long x = 0; x < numSamples; ++x) + { + long numLabels = util::ValueAt
<DT>(cntVec, cntStrides, long1{x}); + + for (long y = 0; y < numLabels; ++y) + { + DT label = util::ValueAt<DT>
(staVec, staStrides, long3{x, y, 0}); + + labels[x].insert(label); + } + } +} + +// Count how many different labels were found +template +void CountLabels(RawBufferType &cntVec, const long1 &cntStrides, const std::vector> &labels, + long numSamples) +{ + for (long x = 0; x < numSamples; ++x) + { + util::ValueAt
(cntVec, cntStrides, long1{x}) = (DT)labels[x].size(); + } +} + +// Sort statistics according to region index as test stats have no imposed ordering, it allows comparing against gold +template +void SortStats(std::vector>> &stats, std::vector> &labels, + const RawBufferType &staVec, const long3 &staStrides, const long3 &staShape) +{ + for (long x = 0; x < staShape.x; ++x) + { + long numLabels = labels[x].size(); + + stats[x].resize(numLabels); + + for (long y = 0; y < numLabels; ++y) + { + DT label = util::ValueAt
(staVec, staStrides, long3{x, y, 0}); + auto fit = labels[x].find(label); + + long regionIdx = std::distance(labels[x].cbegin(), fit); + ASSERT_LE(regionIdx, numLabels) << "E idx " << regionIdx << " >= " << numLabels; + + stats[x][regionIdx].resize(staShape.z); + + for (long z = 0; z < staShape.z; ++z) + { + stats[x][regionIdx][z] = util::ValueAt
(staVec, staStrides, long3{x, y, z}); + } + } + } +} + +// Compute statistics of labeled regions +template +void ComputeStats(std::vector>> &stats, const RawBufferType &dstVec, + const RawBufferType &bglVec, const long4 &dstStrides, const long1 &bglStrides, + const std::vector> &labels, const long4 &shape, int numStats) +{ + // One-element-after-the-end label is a special label assigned to a region which got the background label + DT endLabel = dstStrides.x / sizeof(DT); + + bool hasBgLabel = bglStrides.x > 0; + + for (long x = 0; x < shape.x; ++x) + { + ST backgroundLabel = hasBgLabel ? util::ValueAt(bglVec, bglStrides, long1{x}) : 0; + + stats[x].resize(labels[x].size()); + + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + DT label = util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}); + auto fit = labels[x].find(label); // result of find iterator + if (fit == labels[x].end()) + { + continue; // this label is to be ignored + } + + DT posLabel = y * dstStrides.y / sizeof(DT) + z * dstStrides.z / sizeof(DT) + w; + + if ((hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) || label == posLabel) + { + long regionIdx = std::distance(labels[x].cbegin(), fit); + + stats[x][regionIdx].resize(numStats); + stats[x][regionIdx][0] = label; + stats[x][regionIdx][1] = w; + stats[x][regionIdx][2] = z; + + if (numStats == 6) + { + stats[x][regionIdx][3] = 1; + stats[x][regionIdx][4] = 1; + stats[x][regionIdx][5] = 1; + } + else + { + stats[x][regionIdx][3] = y; + stats[x][regionIdx][4] = 1; + stats[x][regionIdx][5] = 1; + stats[x][regionIdx][6] = 1; + stats[x][regionIdx][7] = 1; + } + } + } + } + } + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + DT label = util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}); + auto fit = labels[x].find(label); + if (fit == labels[x].end()) + { + continue; + } + + DT posLabel = y * dstStrides.y / sizeof(DT) + z * dstStrides.z / sizeof(DT) + w; + + if ((hasBgLabel && label == endLabel && posLabel == (DT)backgroundLabel) || label == posLabel) + { + continue; // statistics for this element was already computed + } + + long regionIdx = std::distance(labels[x].cbegin(), fit); + DT bboxAreaW = std::abs(stats[x][regionIdx][1] - w) + 1; + DT bboxAreaH = std::abs(stats[x][regionIdx][2] - z) + 1; + + if (numStats == 6) + { + stats[x][regionIdx][3] = std::max(stats[x][regionIdx][3], bboxAreaW); + stats[x][regionIdx][4] = std::max(stats[x][regionIdx][4], bboxAreaH); + stats[x][regionIdx][5] += 1; + } + else + { + DT bboxAreaD = std::abs(stats[x][regionIdx][3] - y) + 1; + + stats[x][regionIdx][4] = std::max(stats[x][regionIdx][4], bboxAreaW); + stats[x][regionIdx][5] = std::max(stats[x][regionIdx][5], bboxAreaH); + stats[x][regionIdx][6] = std::max(stats[x][regionIdx][6], bboxAreaD); + stats[x][regionIdx][7] += 1; + } + } + } + } + } +} + +// Remove islands (regions with less than minimum size in mszVec) from dstVec based on statistics +template +void RemoveIslands(std::vector> &labels, RawBufferType &dstVec, const RawBufferType &bglVec, + const RawBufferType &mszVec, const long4 &dstStrides, const long1 &bglStrides, + const long1 &mszStrides, const std::vector>> &stats, const long4 &shape, + int numStats) +{ + for (long x = 0; x < shape.x; ++x) + { + ST backgroundLabel = util::ValueAt(bglVec, bglStrides, long1{x}); + DT minSize = util::ValueAt
<DT>(mszVec, mszStrides, long1{x}); + + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + long4 curCoord{x, y, z, w}; + + DT label = util::ValueAt<DT>
(dstVec, dstStrides, curCoord); + auto fit = labels[x].find(label); // result of find iterator + if (fit == labels[x].end()) + { + continue; // this label is to be ignored + } + + long regionIdx = std::distance(labels[x].cbegin(), fit); + DT regionSize = stats[x][regionIdx][numStats - 1]; + + if (regionSize < minSize) + { + util::ValueAt
(dstVec, dstStrides, curCoord) = backgroundLabel; + } + } + } + } + } +} + +// Relabel replaces index-based labels by consecutive region indices +template +void Relabel(RawBufferType &dstVec, const RawBufferType &bglVec, const RawBufferType &staVec, + const RawBufferType &cntVec, const long4 &dstStrides, const long1 &bglStrides, const long3 &staStrides, + const long1 &cntStrides, const long4 &shape) +{ + for (long x = 0; x < shape.x; ++x) + { + ST backgroundLabel = util::ValueAt(bglVec, bglStrides, long1{x}); + + std::map origLabelToRegionIdx; + + DT numLabels = util::ValueAt
<DT>(cntVec, cntStrides, long1{x}); + + for (DT y = 0; y < numLabels; ++y) + { + DT origLabel = util::ValueAt<DT>
(staVec, staStrides, long3{x, y, 0}); + origLabelToRegionIdx.insert({origLabel, y}); + } + for (long y = 0; y < shape.y; ++y) + { + for (long z = 0; z < shape.z; ++z) + { + for (long w = 0; w < shape.w; ++w) + { + DT label = util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}); + + if (label == (DT)backgroundLabel) + { + continue; + } + + DT regionIdx = origLabelToRegionIdx[label]; + + if (regionIdx >= (DT)backgroundLabel) + { + regionIdx += 1; // increment region indices to skip background labels + } + + util::ValueAt
(dstVec, dstStrides, long4{x, y, z, w}) = regionIdx; + } + } + } + } +} + +} // namespace ref + +// ----------------------------- Start tests ----------------------------------- + +// clang-format off + +#define NVCV_SHAPE(w, h, d, n) (int4{w, h, d, n}) + +#define NVCV_TEST_ROW(InShape, DataType, Type, HasBgLabel, HasMinThresh, HasMaxThresh, DoPostFilters, DoRelabel) \ + type::Types, type::Value, Type, type::Value, type::Value, \ + type::Value, type::Value, type::Value> + +// DoPostFilters: (0) none; (1) count regions; (2) + compute statistics; (3) + island removal. + +NVCV_TYPED_TEST_SUITE(OpLabel, type::Types< + NVCV_TEST_ROW(NVCV_SHAPE(33, 16, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, false, false, 0, false), + NVCV_TEST_ROW(NVCV_SHAPE(23, 81, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, true, false, 1, false), + NVCV_TEST_ROW(NVCV_SHAPE(13, 14, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, true, true, 2, false), + NVCV_TEST_ROW(NVCV_SHAPE(32, 43, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, false, false, 3, false), + NVCV_TEST_ROW(NVCV_SHAPE(22, 12, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, false, false, true, 0, false), + NVCV_TEST_ROW(NVCV_SHAPE(15, 16, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, false, true, 1, false), + NVCV_TEST_ROW(NVCV_SHAPE(14, 26, 1, 1), NVCV_DATA_TYPE_U8, uint8_t, true, true, false, 2, true), + NVCV_TEST_ROW(NVCV_SHAPE(28, 73, 1, 3), NVCV_DATA_TYPE_U16, uint16_t, true, true, true, 3, true), + NVCV_TEST_ROW(NVCV_SHAPE(23, 21, 12, 1), NVCV_DATA_TYPE_U32, uint32_t, false, false, false, 0, false), + NVCV_TEST_ROW(NVCV_SHAPE(33, 41, 22, 1), NVCV_DATA_TYPE_U32, uint32_t, false, false, false, 1, false), + NVCV_TEST_ROW(NVCV_SHAPE(25, 38, 13, 2), NVCV_DATA_TYPE_S8, int8_t, true, false, false, 2, false), + NVCV_TEST_ROW(NVCV_SHAPE(25, 18, 13, 1), NVCV_DATA_TYPE_S8, int8_t, true, false, false, 3, false), + NVCV_TEST_ROW(NVCV_SHAPE(22, 37, 19, 2), NVCV_DATA_TYPE_S16, int16_t, true, true, false, 0, false), + NVCV_TEST_ROW(NVCV_SHAPE(18, 27, 3, 1), NVCV_DATA_TYPE_S32, int32_t, true, false, true, 1, false), + NVCV_TEST_ROW(NVCV_SHAPE(17, 29, 5, 2), NVCV_DATA_TYPE_U8, uint8_t, true, true, true, 2, false), + NVCV_TEST_ROW(NVCV_SHAPE(16, 28, 4, 3), NVCV_DATA_TYPE_U8, uint8_t, true, true, true, 3, true) +>); + +// clang-format on + +TYPED_TEST(OpLabel, correct_output) +{ + // First setup: get test parameters, create input and output tensors and get their data accesses + + int4 shape{type::GetValue}; + nvcv::DataType srcDT{type::GetValue}; + nvcv::DataType dstDT{nvcv::TYPE_U32}; + + using SrcT = type::GetType; + using DstT = uint32_t; + + bool hasBgLabel = type::GetValue; + bool hasMinThresh = type::GetValue; + bool hasMaxThresh = type::GetValue; + int doPostFilters = type::GetValue; + bool doRelabel = type::GetValue; + + // @note The tensors below are defined as: input or source (src), output or destination (dst), background + // labels (bgl), minimum threshold (min), maximum threshold (max), minimum size for islands removal (msz), + // count of labeled regions (count) and statistics computed per labeled region (sta) + + nvcv::Tensor srcTensor, dstTensor, bglTensor, minTensor, maxTensor, mszTensor, cntTensor, staTensor; + + nvcv::Optional srcData, dstData, bglData, minData, maxData, mszData, cntData, staData; + + NVCVConnectivityType connectivity = (shape.z == 1) ? NVCV_CONNECTIVITY_4_2D : NVCV_CONNECTIVITY_6_3D; + NVCVLabelType assignLabels = doRelabel ? NVCV_LABEL_SEQUENTIAL : NVCV_LABEL_FAST; + + long3 staShape{shape.w, 10000, (shape.z == 1) ? 
6 : 8}; + + // clang-format off + + if (shape.w == 1) // tensors without N in layout (single-sample problem) + { + if (shape.z == 1) // tensors without D in layout (2D problem) + { + srcTensor = nvcv::Tensor({{shape.y, shape.x}, "HW"}, srcDT); + } + else // tensors with D in layout (3D problem) + { + srcTensor = nvcv::Tensor({{shape.z, shape.y, shape.x}, "DHW"}, srcDT); + } + } + else // tensors with N in layout (batched problem) + { + if (shape.z == 1) // tensors without D in layout (2D problem) + { + srcTensor = nvcv::Tensor({{shape.w, shape.y, shape.x}, "NHW"}, srcDT); + } + else // tensors with D in layout (3D problem) + { + srcTensor = nvcv::Tensor({{shape.w, shape.z, shape.y, shape.x}, "NDHW"}, srcDT); + } + } + + if (hasBgLabel) + { + bglTensor = nvcv::Tensor({{shape.w}, "N"}, srcDT); + + bglData = bglTensor.exportData(); + ASSERT_TRUE(bglData); + } + if (hasMinThresh) + { + minTensor = nvcv::Tensor({{shape.w}, "N"}, srcDT); + + minData = minTensor.exportData(); + ASSERT_TRUE(minData); + } + if (hasMaxThresh) + { + maxTensor = nvcv::Tensor({{shape.w}, "N"}, srcDT); + + maxData = maxTensor.exportData(); + ASSERT_TRUE(maxData); + } + if (doPostFilters >= 1) + { + cntTensor = nvcv::Tensor({{shape.w}, "N"}, dstDT); + + cntData = cntTensor.exportData(); + ASSERT_TRUE(cntData); + } + if (doPostFilters >= 2) + { + staTensor = nvcv::Tensor({{staShape.x, staShape.y, staShape.z}, "NMA"}, dstDT); + + staData = staTensor.exportData(); + ASSERT_TRUE(staData); + } + if (doPostFilters == 3) + { + mszTensor = nvcv::Tensor({{shape.w}, "N"}, dstDT); + + mszData = mszTensor.exportData(); + ASSERT_TRUE(mszData); + } + + // clang-format on + + dstTensor = nvcv::Tensor(srcTensor.shape(), dstDT); + + srcData = srcTensor.exportData(); + ASSERT_TRUE(srcData); + + dstData = dstTensor.exportData(); + ASSERT_TRUE(dstData); + + // Second setup: get tensors shape, strides and buffer sizes + + int4 ids{srcTensor.layout().find('N'), srcTensor.layout().find('D'), srcTensor.layout().find('H'), + srcTensor.layout().find('W')}; + + long4 srcShape{shape.w, shape.z, shape.y, shape.x}; // srcShape is NDHW whereas shape is WHDN + + long4 srcStrides{0, 0, srcData->stride(ids.z), srcData->stride(ids.w)}; + long4 dstStrides{0, 0, dstData->stride(ids.z), dstData->stride(ids.w)}; + long1 bglStrides{(bglTensor) ? bglData->stride(0) : 0}; + long1 minStrides{(minTensor) ? minData->stride(0) : 0}; + long1 maxStrides{(maxTensor) ? maxData->stride(0) : 0}; + long1 mszStrides{(mszTensor) ? mszData->stride(0) : 0}; + long1 cntStrides{(cntTensor) ? cntData->stride(0) : 0}; + long3 staStrides = (staTensor) ? long3{staData->stride(0), staData->stride(1), staData->stride(2)} : long3{0, 0, 0}; + + srcStrides.y = (ids.y == -1) ? srcStrides.z * srcShape.z : srcData->stride(ids.y); + srcStrides.x = (ids.x == -1) ? srcStrides.y * srcShape.y : srcData->stride(ids.x); + dstStrides.y = (ids.y == -1) ? dstStrides.z * srcShape.z : dstData->stride(ids.y); + dstStrides.x = (ids.x == -1) ? 
dstStrides.y * srcShape.y : dstData->stride(ids.x); + + long srcBufSize = srcStrides.x * srcShape.x; + long dstBufSize = dstStrides.x * srcShape.x; + long bglBufSize = bglStrides.x * srcShape.x; + long minBufSize = minStrides.x * srcShape.x; + long maxBufSize = maxStrides.x * srcShape.x; + long mszBufSize = mszStrides.x * srcShape.x; + long cntBufSize = cntStrides.x * srcShape.x; + long staBufSize = staStrides.x * srcShape.x; + + // Third setup: generate raw buffer data and copy them into tensors + + RawBufferType srcVec(srcBufSize); + RawBufferType bglVec(bglBufSize); + RawBufferType minVec(minBufSize); + RawBufferType maxVec(maxBufSize); + RawBufferType mszVec(mszBufSize); + + std::default_random_engine rng(0); + + std::uniform_int_distribution srcRandom(0, 6); + std::uniform_int_distribution bglRandom(0, (minTensor || maxTensor) ? 1 : 6); + std::uniform_int_distribution minRandom(1, 3); + std::uniform_int_distribution maxRandom(3, 5); + + // clang-format off + + for (long x = 0; x < srcShape.x; ++x) + for (long y = 0; y < srcShape.y; ++y) + for (long z = 0; z < srcShape.z; ++z) + for (long w = 0; w < srcShape.w; ++w) + util::ValueAt(srcVec, srcStrides, long4{x, y, z, w}) = srcRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec.data(), srcBufSize, cudaMemcpyHostToDevice)); + + if (bglTensor) + { + for (long x = 0; x < srcShape.x; ++x) + util::ValueAt(bglVec, bglStrides, long1{x}) = bglRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(bglData->basePtr(), bglVec.data(), bglBufSize, cudaMemcpyHostToDevice)); + } + if (minTensor) + { + for (long x = 0; x < srcShape.x; ++x) + util::ValueAt(minVec, minStrides, long1{x}) = minRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(minData->basePtr(), minVec.data(), minBufSize, cudaMemcpyHostToDevice)); + } + if (maxTensor) + { + for (long x = 0; x < srcShape.x; ++x) + util::ValueAt(maxVec, maxStrides, long1{x}) = maxRandom(rng); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(maxData->basePtr(), maxVec.data(), maxBufSize, cudaMemcpyHostToDevice)); + } + if (mszTensor) + { + for (long x = 0; x < srcShape.x; ++x) + util::ValueAt(mszVec, mszStrides, long1{x}) = 2; + + ASSERT_EQ(cudaSuccess, cudaMemcpy(mszData->basePtr(), mszVec.data(), mszBufSize, cudaMemcpyHostToDevice)); + } + + // clang-format on + + // After all above setups are done, run the operator, synchronize the stream and copy its results back to host + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::Label op; + EXPECT_NO_THROW(op(stream, srcTensor, dstTensor, bglTensor, minTensor, maxTensor, mszTensor, cntTensor, staTensor, + connectivity, assignLabels)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // The operator's results are named as test that must be equal to gold, the three outputs are: labels (lab), + // count (cnt) and statistics (sta); gold statistics are not written as raw buffer, only in 3-vector form + + RawBufferType labTestVec(dstBufSize, 0); + RawBufferType labGoldVec(dstBufSize, 0); + RawBufferType cntTestVec(cntBufSize, 0); + RawBufferType cntGoldVec(cntBufSize, 0); + RawBufferType staTestVec(staBufSize, 0); + + std::vector> testLabels(srcShape.x); + std::vector> goldLabels(srcShape.x); + + std::vector>> testStats(srcShape.x); + std::vector>> goldStats(srcShape.x); + + ASSERT_EQ(cudaSuccess, cudaMemcpy(labTestVec.data(), dstData->basePtr(), dstBufSize, cudaMemcpyDeviceToHost)); + + // To generate the gold data, the reference code (in ref namespace) 
is used in a specific sequence of steps: + // (1) pre-filter binarization uses min/max thresholds (if present) to replace input mask to binary; (2) the + // label operation itself; (3) background labels are replaced (if present); (4) get all original gold labels; + // (5) count the labels got; (6) compute statistics of the labeled regions; (7) get all original test labels; + // (8) remove islands as post-filter step (if minSize tensor is present); (9) relabel to replace non-sequential + // labels to consecutive region indices; (10) sort test statistics to be able to compare against gold. + + // In-between the generation of gold data, EXPECT_EQ is used to compare test data against gold. + + if (minTensor || maxTensor) + { + ref::Binarize(srcVec, minVec, maxVec, srcStrides, minStrides, maxStrides, srcShape); + } + + ref::Label(labGoldVec, srcVec, dstStrides, srcStrides, srcShape); + + if (bglTensor) + { + ref::ReplaceBgLabels(labGoldVec, srcVec, bglVec, dstStrides, srcStrides, bglStrides, srcShape); + } + + ref::GetLabels(goldLabels, labGoldVec, bglVec, dstStrides, bglStrides, srcShape); + + if (cntTensor) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy(cntTestVec.data(), cntData->basePtr(), cntBufSize, cudaMemcpyDeviceToHost)); + + ref::CountLabels(cntGoldVec, cntStrides, goldLabels, srcShape.x); + } + + EXPECT_EQ(cntTestVec, cntGoldVec); + + if (staTensor) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy(staTestVec.data(), staData->basePtr(), staBufSize, cudaMemcpyDeviceToHost)); + + ref::ComputeStats(goldStats, labGoldVec, bglVec, dstStrides, bglStrides, goldLabels, srcShape, + staShape.z); + + ref::GetLabels(testLabels, cntTestVec, staTestVec, cntStrides, staStrides, srcShape.x); + } + else + { + ref::GetLabels(testLabels, labTestVec, bglVec, dstStrides, bglStrides, srcShape); + } + + EXPECT_EQ(testLabels, goldLabels); + + if (mszTensor) + { + ref::RemoveIslands(goldLabels, labGoldVec, bglVec, mszVec, dstStrides, bglStrides, mszStrides, + goldStats, srcShape, staShape.z); + } + + if (doRelabel) + { + ref::Relabel(labGoldVec, bglVec, staTestVec, cntTestVec, dstStrides, bglStrides, staStrides, + cntStrides, srcShape); + } + + if (staTensor) + { + ref::SortStats(testStats, testLabels, staTestVec, staStrides, staShape); + } + + EXPECT_EQ(testStats, goldStats); + + EXPECT_EQ(labTestVec, labGoldVec); +} diff --git a/tests/cvcuda/system/TestOpOSD.cpp b/tests/cvcuda/system/TestOpOSD.cpp index fcfd9323..5ef18eab 100644 --- a/tests/cvcuda/system/TestOpOSD.cpp +++ b/tests/cvcuda/system/TestOpOSD.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ namespace gt = ::testing; namespace test = nvcv::test; +using namespace cvcuda::priv; static int randl(int l, int h) { @@ -44,12 +46,12 @@ static int randl(int l, int h) #pragma GCC optimize("O1") static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, - const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, NVCVElements ctx, - cudaStream_t stream) + const nvcv::TensorDataAccessStridedImagePlanar &data, nvcv::Byte *inBuf, + std::shared_ptr ctx, cudaStream_t stream) { auto context = cuosd_context_create(); - for (int n = 0; n < ctx.batch; n++) + for (int n = 0; n < ctx->batch(); n++) { test::osd::Image *image = test::osd::create_image( data.numCols(), data.numRows(), @@ -57,16 +59,16 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, int bufSize = data.numCols() * data.numRows() * data.numChannels(); EXPECT_EQ(cudaSuccess, cudaMemcpy(image->data0, inBuf + n * bufSize, bufSize, 
cudaMemcpyDeviceToDevice)); - auto numElements = ctx.numElements[n]; + auto numElements = ctx->numElementsAt(n); for (int i = 0; i < numElements; i++) { - auto element = ctx.elements[i]; - switch (element.type) + auto element = ctx->elementAt(n, i); + switch (element->type()) { case NVCVOSDType::NVCV_OSD_RECT: { - auto bbox = *((NVCVBndBoxI *)element.data); + auto bbox = *((NVCVBndBoxI *)element->ptr()); int left = std::max(std::min(bbox.box.x, data.numCols() - 1), 0); int top = std::max(std::min(bbox.box.y, data.numRows() - 1), 0); @@ -86,7 +88,7 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_TEXT: { - auto text = *((NVCVText *)element.data); + auto text = *((NVCVText *)element->ptr()); cuOSDColor fontColor = *(cuOSDColor *)(&text.fontColor); cuOSDColor bgColor = *(cuOSDColor *)(&text.bgColor); cuosd_draw_text(context, text.utf8Text, text.fontSize, text.fontName, text.tlPos.x, text.tlPos.y, @@ -95,35 +97,34 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_SEGMENT: { - auto segment = *((NVCVSegment *)element.data); + auto segment = (NVCVSegment *)element->ptr(); - int left = segment.box.x; - int top = segment.box.y; - int right = left + segment.box.width - 1; - int bottom = top + segment.box.height - 1; + int left = segment->box.x; + int top = segment->box.y; + int right = left + segment->box.width - 1; + int bottom = top + segment->box.height - 1; - if (left == right || top == bottom || segment.box.width <= 0 || segment.box.height <= 0) + if (left == right || top == bottom || segment->box.width <= 0 || segment->box.height <= 0) { continue; } - - cuOSDColor borderColor = *(cuOSDColor *)(&segment.borderColor); - cuOSDColor segColor = *(cuOSDColor *)(&segment.segColor); - cuosd_draw_segmentmask(context, left, top, right, bottom, segment.thickness, segment.dSeg, - segment.segWidth, segment.segHeight, segment.segThreshold, borderColor, + cuOSDColor borderColor = *(cuOSDColor *)(&segment->borderColor); + cuOSDColor segColor = *(cuOSDColor *)(&segment->segColor); + cuosd_draw_segmentmask(context, left, top, right, bottom, segment->thickness, segment->dSeg, + segment->segWidth, segment->segHeight, segment->segThreshold, borderColor, segColor); break; } case NVCVOSDType::NVCV_OSD_POINT: { - auto point = *((NVCVPoint *)element.data); + auto point = *((NVCVPoint *)element->ptr()); cuOSDColor color = *(cuOSDColor *)(&point.color); cuosd_draw_point(context, point.centerPos.x, point.centerPos.y, point.radius, color); break; } case NVCVOSDType::NVCV_OSD_LINE: { - auto line = *((NVCVLine *)element.data); + auto line = *((NVCVLine *)element->ptr()); cuOSDColor color = *(cuOSDColor *)(&line.color); cuosd_draw_line(context, line.pos0.x, line.pos0.y, line.pos1.x, line.pos1.y, line.thickness, color, line.interpolation); @@ -131,16 +132,16 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_POLYLINE: { - auto pl = *((NVCVPolyLine *)element.data); - cuOSDColor borderColor = *(cuOSDColor *)(&pl.borderColor); - cuOSDColor fill_color = *(cuOSDColor *)(&pl.fillColor); - cuosd_draw_polyline(context, pl.hPoints, pl.dPoints, pl.numPoints, pl.thickness, pl.isClosed, - borderColor, pl.interpolation, fill_color); + auto pl = (NVCVPolyLine *)element->ptr(); + cuOSDColor borderColor = *(cuOSDColor *)(&pl->borderColor); + cuOSDColor fill_color = *(cuOSDColor *)(&pl->fillColor); + cuosd_draw_polyline(context, pl->hPoints, pl->dPoints, pl->numPoints, pl->thickness, 
pl->isClosed, + borderColor, pl->interpolation, fill_color); break; } case NVCVOSDType::NVCV_OSD_ROTATED_RECT: { - auto rb = *((NVCVRotatedBox *)element.data); + auto rb = *((NVCVRotatedBox *)element->ptr()); cuOSDColor borderColor = *(cuOSDColor *)(&rb.borderColor); cuOSDColor bgColor = *(cuOSDColor *)(&rb.bgColor); cuosd_draw_rotationbox(context, rb.centerPos.x, rb.centerPos.y, rb.width, rb.height, rb.yaw, @@ -149,7 +150,7 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_CIRCLE: { - auto circle = *((NVCVCircle *)element.data); + auto circle = *((NVCVCircle *)element->ptr()); cuOSDColor borderColor = *(cuOSDColor *)(&circle.borderColor); cuOSDColor bgColor = *(cuOSDColor *)(&circle.bgColor); cuosd_draw_circle(context, circle.centerPos.x, circle.centerPos.y, circle.radius, circle.thickness, @@ -158,7 +159,7 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_ARROW: { - auto arrow = *((NVCVArrow *)element.data); + auto arrow = *((NVCVArrow *)element->ptr()); cuOSDColor color = *(cuOSDColor *)(&arrow.color); cuosd_draw_arrow(context, arrow.pos0.x, arrow.pos0.y, arrow.pos1.x, arrow.pos1.y, arrow.arrowSize, arrow.thickness, color, arrow.interpolation); @@ -166,7 +167,7 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } case NVCVOSDType::NVCV_OSD_CLOCK: { - auto clock = *((NVCVClock *)element.data); + auto clock = *((NVCVClock *)element->ptr()); cuOSDClockFormat clockFormat = (cuOSDClockFormat)(int)(clock.clockFormat); cuOSDColor fontColor = *(cuOSDColor *)(&clock.fontColor); cuOSDColor bgColor = *(cuOSDColor *)(&clock.bgColor); @@ -180,7 +181,6 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, } test::osd::cuosd_apply(context, image, stream); - ctx.elements = (NVCVElement *)((unsigned char *)ctx.elements + numElements * sizeof(NVCVElement)); EXPECT_EQ(cudaSuccess, cudaMemcpy(vect.data() + n * bufSize, image->data0, bufSize, cudaMemcpyDeviceToHost)); test::osd::free_image(image); } @@ -191,85 +191,11 @@ static void setGoldBuffer(std::vector &vect, nvcv::ImageFormat format, #pragma GCC pop_options -static void free_elements(std::vector &elementVec) -{ - for (auto element : elementVec) - { - switch (element.type) - { - case NVCVOSDType::NVCV_OSD_RECT: - { - NVCVBndBoxI *bndBox = (NVCVBndBoxI *)element.data; - delete (bndBox); - break; - } - case NVCVOSDType::NVCV_OSD_TEXT: - { - NVCVText *label = (NVCVText *)element.data; - delete (label); - break; - } - case NVCVOSDType::NVCV_OSD_SEGMENT: - { - NVCVSegment *segment = (NVCVSegment *)element.data; - delete (segment); - break; - } - case NVCVOSDType::NVCV_OSD_POINT: - { - NVCVPoint *point = (NVCVPoint *)element.data; - delete (point); - break; - } - case NVCVOSDType::NVCV_OSD_LINE: - { - NVCVLine *line = (NVCVLine *)element.data; - delete (line); - break; - } - case NVCVOSDType::NVCV_OSD_POLYLINE: - { - NVCVPolyLine *pl = (NVCVPolyLine *)element.data; - delete (pl); - break; - } - case NVCVOSDType::NVCV_OSD_ROTATED_RECT: - { - NVCVRotatedBox *rb = (NVCVRotatedBox *)element.data; - delete (rb); - break; - } - case NVCVOSDType::NVCV_OSD_CIRCLE: - { - NVCVCircle *circle = (NVCVCircle *)element.data; - delete (circle); - break; - } - case NVCVOSDType::NVCV_OSD_ARROW: - { - NVCVArrow *arrow = (NVCVArrow *)element.data; - delete (arrow); - break; - } - case NVCVOSDType::NVCV_OSD_CLOCK: - { - NVCVClock *clock = (NVCVClock *)element.data; - delete (clock); - break; - } - default: - break; - } - } -} - // 
run operator static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int &inH, int &num, int &sed, nvcv::ImageFormat &format) { - NVCVElements ctx; - std::vector numElementVec; - std::vector elementVec; + std::vector>> elementVec; test::osd::Segment *test_segment = test::osd::create_segment(); test::osd::Polyline *test_polyline = test::osd::create_polyline(); @@ -277,176 +203,158 @@ static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int srand(sed); for (int n = 0; n < inN; n++) { - numElementVec.push_back(num); + std::vector> curVec; for (int i = 0; i < num; i++) { - NVCVElement element; - element.type = (NVCVOSDType)randl(int(NVCV_OSD_NONE) + 1, int(NVCV_OSD_MAX) - 1); - switch (element.type) + NVCVOSDType type = (NVCVOSDType)randl(int(NVCV_OSD_NONE) + 1, int(NVCV_OSD_MAX) - 1); + std::shared_ptr element; + switch (type) { case NVCVOSDType::NVCV_OSD_RECT: { - NVCVBndBoxI *bndBox = new NVCVBndBoxI(); - bndBox->box.x = randl(0, inW - 1); - bndBox->box.y = randl(0, inH - 1); - bndBox->box.width = randl(1, inW); - bndBox->box.height = randl(1, inH); - bndBox->thickness = randl(-1, 30); - bndBox->fillColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - bndBox->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)bndBox; + NVCVBndBoxI bndBox; + bndBox.box.x = randl(0, inW - 1); + bndBox.box.y = randl(0, inH - 1); + bndBox.box.width = randl(1, inW); + bndBox.box.height = randl(1, inH); + bndBox.thickness = randl(-1, 30); + bndBox.fillColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + bndBox.borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + element = std::make_shared(type, &bndBox); break; } case NVCVOSDType::NVCV_OSD_TEXT: { - NVCVText *label = new NVCVText(); - label->utf8Text = "abcdefghijklmnopqrstuvwxyz"; - label->fontSize = 5 * randl(1, 10); - label->fontName = DEFAULT_OSD_FONT; - label->tlPos.x = randl(0, inW - 1); - label->tlPos.y = randl(0, inH - 1); - label->fontColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - label->bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)label; + NVCVText text = NVCVText("abcdefghijklmnopqrstuvwxyz", 5 * randl(1, 10), DEFAULT_OSD_FONT, + NVCVPointI({randl(0, inW - 1), randl(0, inH - 1)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)})); + element = std::make_shared(type, &text); break; } case NVCVOSDType::NVCV_OSD_SEGMENT: { - NVCVSegment *segment = new NVCVSegment(); - segment->box.x = randl(0, inW - 1); - segment->box.y = randl(0, inH - 1); - segment->box.width = randl(1, inW); - segment->box.height = randl(1, inH); - segment->thickness = randl(-1, 5); - segment->dSeg = test_segment->data; - segment->segWidth = test_segment->width; - segment->segHeight = test_segment->height; - segment->segThreshold = 0.1 * randl(1, 5); - 
segment->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - segment->segColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)segment; + NVCVSegment segment = NVCVSegment( + NVCVBoxI({randl(0, inW - 1), randl(0, inH - 1), randl(1, inW), randl(1, inH)}), randl(-1, 5), + test_segment->data, test_segment->width, test_segment->height, 0.1 * randl(1, 5), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)})); + element = std::make_shared(type, &segment); break; } case NVCVOSDType::NVCV_OSD_POINT: { - NVCVPoint *point = new NVCVPoint(); - point->centerPos.x = randl(0, inW - 1); - point->centerPos.y = randl(0, inH - 1); - point->radius = randl(1, 50); - point->color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)point; + NVCVPoint point; + point.centerPos.x = randl(0, inW - 1); + point.centerPos.y = randl(0, inH - 1); + point.radius = randl(1, 50); + point.color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255)}; + element = std::make_shared(type, &point); break; } case NVCVOSDType::NVCV_OSD_LINE: { - NVCVLine *line = new NVCVLine(); - line->pos0.x = randl(0, inW - 1); - line->pos0.y = randl(0, inH - 1); - line->pos1.x = randl(0, inW - 1); - line->pos1.y = randl(0, inH - 1); - line->thickness = randl(1, 5); - line->color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255)}; - line->interpolation = true; - element.data = (void *)line; + NVCVLine line; + line.pos0.x = randl(0, inW - 1); + line.pos0.y = randl(0, inH - 1); + line.pos1.x = randl(0, inW - 1); + line.pos1.y = randl(0, inH - 1); + line.thickness = randl(1, 5); + line.color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255)}; + line.interpolation = true; + element = std::make_shared(type, &line); break; } case NVCVOSDType::NVCV_OSD_POLYLINE: { - NVCVPolyLine *pl = new NVCVPolyLine(); - pl->hPoints = test_polyline->h_pts; - pl->dPoints = test_polyline->d_pts; - pl->numPoints = test_polyline->n_pts; - pl->thickness = randl(1, 5); - pl->isClosed = randl(0, 1); - pl->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - pl->fillColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - pl->interpolation = true; - element.data = (void *)pl; + NVCVPolyLine pl + = NVCVPolyLine(test_polyline->h_pts, test_polyline->n_pts, randl(1, 5), randl(0, 1), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + true); + element = std::make_shared(type, &pl); break; } case NVCVOSDType::NVCV_OSD_ROTATED_RECT: { - NVCVRotatedBox *rb = new 
NVCVRotatedBox(); - rb->centerPos.x = randl(0, inW - 1); - rb->centerPos.y = randl(0, inH - 1); - rb->width = randl(1, inW); - rb->height = randl(1, inH); - rb->yaw = 0.02 * randl(1, 314); - rb->thickness = randl(1, 5); - rb->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - rb->bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255)}; - rb->interpolation = false; - element.data = (void *)rb; + NVCVRotatedBox rb; + rb.centerPos.x = randl(0, inW - 1); + rb.centerPos.y = randl(0, inH - 1); + rb.width = randl(1, inW); + rb.height = randl(1, inH); + rb.yaw = 0.02 * randl(1, 314); + rb.thickness = randl(1, 5); + rb.borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + rb.bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255)}; + rb.interpolation = false; + element = std::make_shared(type, &rb); break; } case NVCVOSDType::NVCV_OSD_CIRCLE: { - NVCVCircle *circle = new NVCVCircle(); - circle->centerPos.x = randl(0, inW - 1); - circle->centerPos.y = randl(0, inH - 1); - circle->radius = randl(1, 50); - circle->thickness = randl(1, 5); - circle->borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - circle->bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)circle; + NVCVCircle circle; + circle.centerPos.x = randl(0, inW - 1); + circle.centerPos.y = randl(0, inH - 1); + circle.radius = randl(1, 50); + circle.thickness = randl(1, 5); + circle.borderColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + circle.bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; + element = std::make_shared(type, &circle); break; } case NVCVOSDType::NVCV_OSD_ARROW: { - NVCVArrow *arrow = new NVCVArrow(); - arrow->pos0.x = randl(0, inW - 1); - arrow->pos0.y = randl(0, inH - 1); - arrow->pos1.x = randl(0, inW - 1); - arrow->pos1.y = randl(0, inH - 1); - arrow->arrowSize = randl(1, 5); - arrow->thickness = randl(1, 5); - arrow->color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - arrow->interpolation = false; - element.data = (void *)arrow; + NVCVArrow arrow; + arrow.pos0.x = randl(0, inW - 1); + arrow.pos0.y = randl(0, inH - 1); + arrow.pos1.x = randl(0, inW - 1); + arrow.pos1.y = randl(0, inH - 1); + arrow.arrowSize = randl(1, 5); + arrow.thickness = randl(1, 5); + arrow.color = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255)}; + arrow.interpolation = false; + element = std::make_shared(type, &arrow); break; } case NVCVOSDType::NVCV_OSD_CLOCK: { - NVCVClock *clock = new NVCVClock(); - clock->clockFormat = (NVCVClockFormat)(randl(1, 3)); - clock->time = time(0); - clock->fontSize = 5 * randl(1, 10); - clock->font = DEFAULT_OSD_FONT; - clock->tlPos.x = randl(0, inW - 1); - clock->tlPos.y = randl(0, inH - 1); - clock->fontColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned 
char)randl(0, 255), (unsigned char)randl(0, 255)}; - clock->bgColor = {(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), - (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}; - element.data = (void *)clock; + NVCVClock clock + = NVCVClock{(NVCVClockFormat)(randl(1, 3)), + time(0), + 5 * randl(1, 10), + DEFAULT_OSD_FONT, + NVCVPointI({randl(0, inW - 1), randl(0, inH - 1)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)})}; + element = std::make_shared(type, &clock); break; } default: break; } - - elementVec.push_back(element); + curVec.push_back(element); } + elementVec.push_back(curVec); } - ctx.batch = inN; - ctx.numElements = numElementVec.data(); - ctx.elements = elementVec.data(); + std::shared_ptr ctx = std::make_shared(elementVec); nvcv::Tensor imgIn = nvcv::util::CreateTensor(inN, inW, inH, format); nvcv::Tensor imgOut = nvcv::util::CreateTensor(inN, inW, inH, format); @@ -472,7 +380,7 @@ static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int EXPECT_EQ(cudaSuccess, cudaMemset(input->basePtr(), 0xFF, inSampleStride * inAccess->numSamples())); EXPECT_EQ(cudaSuccess, cudaMemset(output->basePtr(), 0xFF, outSampleStride * outAccess->numSamples())); - EXPECT_NO_THROW(op(stream, imgIn, imgOut, ctx)); + EXPECT_NO_THROW(op(stream, imgIn, imgOut, (NVCVElements)ctx.get())); // check cdata std::vector test(outBufSize); @@ -487,7 +395,6 @@ static void runOp(cudaStream_t &stream, cvcuda::OSD &op, int &inN, int &inW, int test::osd::free_segment(test_segment); test::osd::free_polyline(test_polyline); - free_elements(elementVec); EXPECT_EQ(gold, test); } diff --git a/tests/cvcuda/system/TestOpPairwiseMatcher.cpp b/tests/cvcuda/system/TestOpPairwiseMatcher.cpp new file mode 100644 index 00000000..c2742461 --- /dev/null +++ b/tests/cvcuda/system/TestOpPairwiseMatcher.cpp @@ -0,0 +1,442 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Definitions.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// ----------------------- Basic utility definitions --------------------------- + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; +namespace type = nvcv::test::type; + +using RawBufferType = std::vector; + +template +using uniform_distribution + = std::conditional_t, std::uniform_int_distribution, std::uniform_real_distribution>; + +template +constexpr nvcv::DataType ToDataType() +{ + if constexpr (std::is_same_v) + { + return nvcv::TYPE_U8; + } + else if constexpr (std::is_same_v) + { + return nvcv::TYPE_U32; + } + else if constexpr (std::is_same_v) + { + return nvcv::TYPE_F32; + } +} + +// --------------------- Reference (gold) computations ------------------------- + +namespace ref { + +template +T absdiff(T a, T b) +{ + if constexpr (std::is_floating_point_v) + { + return std::abs(a - b); + } + else + { + return a < b ? b - a : a - b; + } +} + +template +void ComputeDistance(DT &dist, ST p1, ST p2, NVCVNormType normType) +{ + if (normType == NVCV_NORM_HAMMING) + { + if constexpr (!std::is_floating_point_v) + { + dist += std::bitset(p1 ^ p2).count(); + } + } + else if (normType == NVCV_NORM_L1) + { + dist += absdiff(p1, p2); + } + else if (normType == NVCV_NORM_L2) + { + dist += std::pow(absdiff(p1, p2), 2); + } +} + +template +void BruteForceMatcher(RawBufferType &mchVec, RawBufferType &nmVec, RawBufferType &dVec, const RawBufferType &set1Vec, + const RawBufferType &set2Vec, const long3 &mchStrides, const long1 &nmStrides, + const long2 &dStrides, const long3 &set1Strides, const long3 &set2Strides, int numSamples, + int numDim, int set1Size, int set2Size, bool crossCheck, int matchesPerPoint, + NVCVNormType normType) +{ + std::vector> distIdx(set2Size); + std::vector> cckDistIdx(set1Size); + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + int mchIdx = 0; + + for (int set1Idx = 0; set1Idx < set1Size; set1Idx++) + { + for (int set2Idx = 0; set2Idx < set2Size; set2Idx++) + { + float dist = 0.f; + + for (int coordIdx = 0; coordIdx < numDim; coordIdx++) + { + ST p1 = util::ValueAt(set1Vec, set1Strides, long3{sampleIdx, set1Idx, coordIdx}); + ST p2 = util::ValueAt(set2Vec, set2Strides, long3{sampleIdx, set2Idx, coordIdx}); + + ComputeDistance(dist, p1, p2, normType); + } + if (normType == NVCV_NORM_L2) + { + dist = std::sqrt(dist); + } + + distIdx[set2Idx] = std::tie(dist, set2Idx); + } + + std::sort(distIdx.begin(), distIdx.end()); + + if (crossCheck) + { + int set2Idx = std::get<1>(distIdx[0]); + + for (int cck1Idx = 0; cck1Idx < set1Size; cck1Idx++) + { + float dist = 0.f; + + for (int coordIdx = 0; coordIdx < numDim; coordIdx++) + { + ST p1 = util::ValueAt(set1Vec, set1Strides, long3{sampleIdx, cck1Idx, coordIdx}); + ST p2 = util::ValueAt(set2Vec, set2Strides, long3{sampleIdx, set2Idx, coordIdx}); + + ComputeDistance(dist, p1, p2, normType); + } + if (normType == NVCV_NORM_L2) + { + dist = std::sqrt(dist); + } + + cckDistIdx[cck1Idx] = std::tie(dist, cck1Idx); + } + + std::sort(cckDistIdx.begin(), cckDistIdx.end()); + + if (std::get<1>(cckDistIdx[0]) == set1Idx) + { + util::ValueAt(mchVec, mchStrides, long3{sampleIdx, mchIdx, 0}) = set1Idx; + util::ValueAt(mchVec, mchStrides, long3{sampleIdx, mchIdx, 1}) = std::get<1>(distIdx[0]); + if (dStrides.x > 0) + { + util::ValueAt(dVec, dStrides, long2{sampleIdx, mchIdx}) = std::get<0>(distIdx[0]); + } + + mchIdx++; + if (nmStrides.x > 0) + { + 
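+ // cross-check accepted this match: store the running match count for this sample
+ // (mchIdx was just incremented, so it is the cumulative number of matches so far)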
util::ValueAt(nmVec, nmStrides, long1{sampleIdx}) = mchIdx; + } + } + } + else + { + for (int m = 0; m < matchesPerPoint; m++) + { + util::ValueAt(mchVec, mchStrides, long3{sampleIdx, mchIdx, 0}) = set1Idx; + util::ValueAt(mchVec, mchStrides, long3{sampleIdx, mchIdx, 1}) = std::get<1>(distIdx[m]); + if (dStrides.x > 0) + { + util::ValueAt(dVec, dStrides, long2{sampleIdx, mchIdx}) = std::get<0>(distIdx[m]); + } + + mchIdx++; + if (nmStrides.x > 0) + { + util::ValueAt(nmVec, nmStrides, long1{sampleIdx}) = mchIdx; + } + } + } + } + } +} + +template +void PairwiseMatcher(NVCVPairwiseMatcherType algoChoice, RawBufferType &mchVec, RawBufferType &nmVec, + RawBufferType &dVec, const RawBufferType &set1Vec, const RawBufferType &set2Vec, + const long3 &mchStrides, const long1 &nmStrides, const long2 &dStrides, const long3 &set1Strides, + const long3 &set2Strides, int numSamples, int numDim, int set1Size, int set2Size, bool crossCheck, + int matchesPerPoint, NVCVNormType normType) +{ + if (algoChoice == NVCV_BRUTE_FORCE) + { + BruteForceMatcher(mchVec, nmVec, dVec, set1Vec, set2Vec, mchStrides, nmStrides, dStrides, set1Strides, + set2Strides, numSamples, numDim, set1Size, set2Size, crossCheck, matchesPerPoint, + normType); + } +} + +inline void SortOutput(std::vector> &outIdsDist, const RawBufferType &mchVec, + const RawBufferType &nmVec, const RawBufferType &dVec, const long3 &mchStrides, + const long1 &nmStrides, const long2 &dStrides, int numSamples, int set1Size, int matchesPerPoint, + int maxMatches) +{ + int totalMatches = set1Size * matchesPerPoint; + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + if (nmStrides.x > 0) + { + totalMatches = util::ValueAt(nmVec, nmStrides, long1{sampleIdx}); + } + + for (int matchIdx = 0; matchIdx < totalMatches && matchIdx < maxMatches; matchIdx++) + { + int set1Idx = util::ValueAt(mchVec, mchStrides, long3{sampleIdx, matchIdx, 0}); + int set2Idx = util::ValueAt(mchVec, mchStrides, long3{sampleIdx, matchIdx, 1}); + float distance = (dStrides.x > 0) ? 
util::ValueAt(dVec, dStrides, long2{sampleIdx, matchIdx}) : 0.f; + + outIdsDist.emplace_back(sampleIdx, set1Idx, set2Idx, distance); + } + } + + std::sort(outIdsDist.begin(), outIdsDist.end()); +} + +} // namespace ref + +// ----------------------------- Start tests ----------------------------------- + +// clang-format off + +#define NVCV_TEST_ROW(NumSamples, Set1Size, Set2Size, NumDim, MatchesPerPoint, CrossCheck, StoreDistances, \ + AlgoChoice, NormType, Type) \ + type::Types, type::Value, type::Value, type::Value, \ + type::Value, type::Value, type::Value, \ + type::Value, type::Value, Type> + +NVCV_TYPED_TEST_SUITE(OpPairwiseMatcher, type::Types< + NVCV_TEST_ROW(1, 2, 2, 1, 1, false, false, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint8_t), + NVCV_TEST_ROW(2, 3, 4, 5, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint8_t), + NVCV_TEST_ROW(3, 4, 3, 32, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint32_t), + NVCV_TEST_ROW(4, 11, 12, 128, 2, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint8_t), + NVCV_TEST_ROW(3, 17, 16, 128, 3, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint8_t), + NVCV_TEST_ROW(2, 3, 4, 32, 1, true, false, NVCV_BRUTE_FORCE, NVCV_NORM_HAMMING, uint32_t), + NVCV_TEST_ROW(1, 5, 6, 7, 1, false, false, NVCV_BRUTE_FORCE, NVCV_NORM_L1, uint8_t), + NVCV_TEST_ROW(2, 18, 19, 17, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, uint32_t), + NVCV_TEST_ROW(3, 98, 17, 32, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, float), + NVCV_TEST_ROW(2, 54, 65, 32, 2, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, uint8_t), + NVCV_TEST_ROW(3, 68, 37, 1025, 1, true, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, float), + NVCV_TEST_ROW(2, 14, 24, 32, 3, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L1, uint8_t), + NVCV_TEST_ROW(3, 48, 37, 8, 1, true, false, NVCV_BRUTE_FORCE, NVCV_NORM_L1, float), + NVCV_TEST_ROW(4, 8, 9, 1025, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, uint8_t), + NVCV_TEST_ROW(3, 27, 16, 8, 1, false, false, NVCV_BRUTE_FORCE, NVCV_NORM_L2, uint32_t), + NVCV_TEST_ROW(2, 73, 132, 64, 1, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, float), + NVCV_TEST_ROW(3, 87, 98, 19, 2, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, uint8_t), + NVCV_TEST_ROW(4, 43, 32, 26, 1, true, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, float), + NVCV_TEST_ROW(3, 67, 58, 32, 3, false, true, NVCV_BRUTE_FORCE, NVCV_NORM_L2, uint8_t), + NVCV_TEST_ROW(2, 73, 62, 8, 1, true, false, NVCV_BRUTE_FORCE, NVCV_NORM_L2, float) +>); + +// clang-format on + +TYPED_TEST(OpPairwiseMatcher, CorrectOutput) +{ + int numSamples = type::GetValue; + int set1Size = type::GetValue; + int set2Size = type::GetValue; + int numDim = type::GetValue; + int matchesPerPoint = type::GetValue; + bool crossCheck = type::GetValue; + bool storeDistances = type::GetValue; + + NVCVPairwiseMatcherType algoChoice{type::GetValue}; + + NVCVNormType normType{type::GetValue}; + + using SrcT = type::GetType; + + constexpr nvcv::DataType srcDT{ToDataType()}; + + int maxSet1 = set1Size + 12; + int maxSet2 = set2Size + 23; // adding extra sizes to test different capacities on set 1 and 2 + int maxMatches = maxSet1 * matchesPerPoint; + + // clang-format off + + nvcv::Tensor set1({{numSamples, maxSet1, numDim}, "NMD"}, srcDT); + nvcv::Tensor set2({{numSamples, maxSet2, numDim}, "NMD"}, srcDT); + + nvcv::Tensor numSet1({{numSamples}, "N"}, nvcv::TYPE_S32); + nvcv::Tensor numSet2({{numSamples}, "N"}, nvcv::TYPE_S32); + + nvcv::Tensor matches({{numSamples, maxMatches, 2}, "NMD"}, nvcv::TYPE_S32); + + nvcv::Tensor numMatches; + nvcv::Optional nmData; 
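+ // numMatches and distances are optional outputs: numMatches is allocated only when
+ // crossCheck is enabled (the surviving match count becomes data dependent), and
+ // distances only when storeDistances is set; otherwise the tensors stay empty and
+ // their strides below are left at zero so the reference checks skip them.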
+ + nvcv::Tensor distances; + nvcv::Optional dData; + + if (crossCheck) + { + numMatches = nvcv::Tensor({{numSamples}, "N"}, nvcv::TYPE_S32); + + nmData = numMatches.exportData(); + ASSERT_TRUE(nmData); + } + if (storeDistances) + { + distances = nvcv::Tensor({{numSamples, maxMatches}, "NM"}, nvcv::TYPE_F32); + + dData = distances.exportData(); + ASSERT_TRUE(dData); + } + + // clang-format on + + auto set1Data = set1.exportData(); + ASSERT_TRUE(set1Data); + + auto set2Data = set2.exportData(); + ASSERT_TRUE(set2Data); + + auto ns1Data = numSet1.exportData(); + ASSERT_TRUE(ns1Data); + + auto ns2Data = numSet2.exportData(); + ASSERT_TRUE(ns2Data); + + auto mchData = matches.exportData(); + ASSERT_TRUE(mchData); + + long3 set1Strides{set1Data->stride(0), set1Data->stride(1), set1Data->stride(2)}; + long3 set2Strides{set2Data->stride(0), set2Data->stride(1), set2Data->stride(2)}; + long1 ns1Strides{ns1Data->stride(0)}; + long1 ns2Strides{ns2Data->stride(0)}; + long3 mchStrides{mchData->stride(0), mchData->stride(1), mchData->stride(2)}; + long1 nmStrides = (numMatches) ? long1{nmData->stride(0)} : long1{0}; + long2 dStrides = (distances) ? long2{dData->stride(0), dData->stride(1)} : long2{0, 0}; + + long set1BufSize = set1Strides.x * numSamples; + long set2BufSize = set2Strides.x * numSamples; + long ns1BufSize = ns1Strides.x * numSamples; + long ns2BufSize = ns2Strides.x * numSamples; + long mchBufSize = mchStrides.x * numSamples; + long nmBufSize = nmStrides.x * numSamples; + long dBufSize = dStrides.x * numSamples; + + RawBufferType set1Vec(set1BufSize); + RawBufferType set2Vec(set2BufSize); + RawBufferType ns1Vec(ns1BufSize); + RawBufferType ns2Vec(ns2BufSize); + + std::default_random_engine rng(12345u); + + SrcT minV = std::is_integral_v ? cuda::TypeTraits::min : -1; + SrcT maxV = std::is_integral_v ? 
cuda::TypeTraits::max : +1; + + uniform_distribution rand(minV, maxV); + + for (int x = 0; x < numSamples; ++x) + { + for (int z = 0; z < numDim; ++z) + { + for (int y = 0; y < set1Size; ++y) + { + util::ValueAt(set1Vec, set1Strides, long3{x, y, z}) = rand(rng); + } + for (int y = 0; y < set2Size; ++y) + { + util::ValueAt(set2Vec, set2Strides, long3{x, y, z}) = rand(rng); + } + } + + util::ValueAt(ns1Vec, ns1Strides, long1{x}) = set1Size; + util::ValueAt(ns2Vec, ns2Strides, long1{x}) = set2Size; + } + + ASSERT_EQ(cudaSuccess, cudaMemcpy(set1Data->basePtr(), set1Vec.data(), set1BufSize, cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(set2Data->basePtr(), set2Vec.data(), set2BufSize, cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(ns1Data->basePtr(), ns1Vec.data(), ns1BufSize, cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(ns2Data->basePtr(), ns2Vec.data(), ns2BufSize, cudaMemcpyHostToDevice)); + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::PairwiseMatcher op(algoChoice); + + op(stream, set1, set2, numSet1, numSet2, matches, numMatches, distances, crossCheck, matchesPerPoint, normType); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + RawBufferType nmTestVec(nmBufSize, 0); + RawBufferType nmGoldVec(nmBufSize, 0); + RawBufferType mchTestVec(mchBufSize, 0); + RawBufferType mchGoldVec(mchBufSize, 0); + RawBufferType dTestVec(dBufSize, 0); + RawBufferType dGoldVec(dBufSize, 0); + + // Treated output is a vector of (sampleIdx, set1Idx, set2Idx, distance) + std::vector> testIdsDist; + std::vector> goldIdsDist; + + ASSERT_EQ(cudaSuccess, cudaMemcpy(mchTestVec.data(), mchData->basePtr(), mchBufSize, cudaMemcpyDeviceToHost)); + + if (numMatches) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy(nmTestVec.data(), nmData->basePtr(), nmBufSize, cudaMemcpyDeviceToHost)); + } + if (distances) + { + ASSERT_EQ(cudaSuccess, cudaMemcpy(dTestVec.data(), dData->basePtr(), dBufSize, cudaMemcpyDeviceToHost)); + } + + ref::SortOutput(testIdsDist, mchTestVec, nmTestVec, dTestVec, mchStrides, nmStrides, dStrides, numSamples, set1Size, + matchesPerPoint, maxMatches); + + ref::PairwiseMatcher(algoChoice, mchGoldVec, nmGoldVec, dGoldVec, set1Vec, set2Vec, mchStrides, nmStrides, + dStrides, set1Strides, set2Strides, numSamples, numDim, set1Size, set2Size, crossCheck, + matchesPerPoint, normType); + + ref::SortOutput(goldIdsDist, mchGoldVec, nmGoldVec, dGoldVec, mchStrides, nmStrides, dStrides, numSamples, set1Size, + matchesPerPoint, maxMatches); + + EXPECT_EQ(testIdsDist, goldIdsDist); +} diff --git a/tests/cvcuda/system/TestOpPillowResize.cpp b/tests/cvcuda/system/TestOpPillowResize.cpp index 78316d5b..4f276dba 100644 --- a/tests/cvcuda/system/TestOpPillowResize.cpp +++ b/tests/cvcuda/system/TestOpPillowResize.cpp @@ -1010,9 +1010,11 @@ void StartTest(int srcWidth, int srcHeight, int dstWidth, int dstHeight, NVCVInt // Generate test result nvcv::Tensor imgDst(numberOfImages, {dstWidth, dstHeight}, fmt); - cvcuda::PillowResize pillowResizeOp(nvcv::Size2D{std::max(srcWidth, dstWidth), std::max(srcHeight, dstHeight)}, - numberOfImages, fmt); - EXPECT_NO_THROW(pillowResizeOp(stream, imgSrc, imgDst, interpolation)); + cvcuda::PillowResize pillowResizeOp; + + cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace( + pillowResizeOp.getWorkspaceRequirements(numberOfImages, {srcWidth, srcHeight}, {dstWidth, dstHeight}, fmt)); + EXPECT_NO_THROW(pillowResizeOp(stream, ws.get(), imgSrc, 
imgDst, interpolation)); EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); @@ -1144,9 +1146,11 @@ void StartVarShapeTest(int srcWidthBase, int srcHeightBase, int dstWidthBase, in nvcv::Size2D maxDstSize = batchDst.maxSize(); // Generate test result - cvcuda::PillowResize pillowResizeOp( - nvcv::Size2D{std::max(maxSrcSize.w, maxDstSize.w), std::max(maxSrcSize.h, maxDstSize.h)}, numberOfImages, fmt); - EXPECT_NO_THROW(pillowResizeOp(stream, batchSrc, batchDst, interpolation)); + cvcuda::PillowResize pillowResizeOp; + + cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace( + pillowResizeOp.getWorkspaceRequirements(numberOfImages, maxSrcSize, maxDstSize, fmt)); + EXPECT_NO_THROW(pillowResizeOp(stream, ws.get(), batchSrc, batchDst, interpolation)); // Get test data back EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); diff --git a/tests/cvcuda/system/TestOpStack.cpp b/tests/cvcuda/system/TestOpStack.cpp new file mode 100644 index 00000000..b8f2bff2 --- /dev/null +++ b/tests/cvcuda/system/TestOpStack.cpp @@ -0,0 +1,190 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Definitions.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace gt = ::testing; +namespace test = nvcv::test; +namespace util = nvcv::util; + +// clang-format off +NVCV_TEST_SUITE_P(OpStack, test::ValueList +{ + //inWidth, inHeight, format, numberOfTensors, maxNumberInBatch + { 320, 240, nvcv::FMT_U8, 5, 2}, + { 40, 81, nvcv::FMT_RGB8, 1, 3}, + { 800, 600, nvcv::FMT_BGR8, 1, 4}, + { 1024, 768, nvcv::FMT_RGBA8, 2, 1}, + { 12, 720, nvcv::FMT_BGRA8, 3, 5}, + { 160, 121, nvcv::FMT_BGR8p, 2, 2}, + { 920, 80, nvcv::FMT_RGB8p, 1, 3}, + { 41, 536, nvcv::FMT_RGBA8p, 1, 4}, + { 592, 944, nvcv::FMT_BGRA8p, 2, 5}, + { 1, 2, nvcv::FMT_U32, 1, 1}, + { 48, 36, nvcv::FMT_RGBf32, 1, 2}, + { 192, 1944, nvcv::FMT_BGRf32, 1, 3}, + { 1920, 1080, nvcv::FMT_RGBAf32, 4, 4}, + { 2048, 1536, nvcv::FMT_BGRAf32, 1, 5}, + { 1024, 768, nvcv::FMT_RGBA8p, 3, 1}, + { 1280, 720, nvcv::FMT_RGBf32p, 1, 2}, + { 192, 80, nvcv::FMT_BGRf32p, 1, 3}, + { 2048, 536, nvcv::FMT_RGBAf32p, 1, 4}, + { 259, 194, nvcv::FMT_BGRAf32p, 1, 5}, + { 1921, 1080, nvcv::FMT_F64, 1, 1}, + { 1920, 1080, nvcv::FMT_F16, 2, 2}, + { 48, 36, nvcv::FMT_BGRAf32, 1, 3}, +}); + +// clang-format on +TEST_P(OpStack, test_NCHW_tensors) +{ + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + nvcv::ImageFormat format = GetParamValue<2>(); + int numberOfTensors = GetParamValue<3>(); + int maxNumberInBatch = GetParamValue<4>(); + + int numChannels = format.numChannels(); + int bytesPerPixel = 0; + int totalNumberOfTensors = 0; + + for (int32_t i = 0; i < numChannels; i++) + { + bytesPerPixel += format.bitsPerChannel()[i] / 8; + } + + // generate the output tensor to contain all of the input tensors + + auto reqs = nvcv::TensorBatch::CalcRequirements(numberOfTensors); + nvcv::TensorBatch inTensorBatch(reqs); + std::vector> inputVecs; + + // generate random input images + std::default_random_engine randEng(0); + std::uniform_int_distribution rand(0u, 255u); + std::uniform_int_distribution distribution(1, maxNumberInBatch); + int numberInBatch = distribution(randEng); + + for (int i = 0; i < numberOfTensors; ++i) + { + nvcv::Tensor inTensor(numberInBatch, {width, height}, format); + totalNumberOfTensors += numberInBatch; // include individual tensors and tensors in N > 1 tensor(s) + + for (int j = 0; j < numberInBatch; j++) + { + // generate random input image in bytes + std::vector imageVec((width * height) * bytesPerPixel); + std::generate(imageVec.begin(), imageVec.end(), [&]() { return (nvcv::Byte)rand(randEng); }); + // copy random input to device tensor + EXPECT_NO_THROW(util::SetImageTensorFromByteVector(inTensor.exportData(), imageVec, j)); + // add tensor to batch and input vector + inputVecs.push_back(imageVec); + } + inTensorBatch.pushBack(inTensor); + } + + nvcv::Tensor outTensor(totalNumberOfTensors, {width, height}, format); + // run operator + cvcuda::Stack op; + EXPECT_NO_THROW(op(stream, inTensorBatch, outTensor)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // go through each sample of the output tensor and compare vals. 
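+ // cvcuda::Stack is expected to concatenate the batch samples along N in push-back
+ // order, so output sample i should be byte-identical to the i-th generated input image.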
+ for (int i = 0; i < totalNumberOfTensors; ++i) + { + // generate random input image in bytes + std::vector outSample; + EXPECT_NO_THROW(util::GetImageByteVectorFromTensor(outTensor.exportData(), i, outSample)); + // Compare the computed histogram with the output histogram + ASSERT_EQ(inputVecs[i], outSample); + } +} + +TEST_P(OpStack, test_CHW_tensors) +{ + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + nvcv::ImageFormat format = GetParamValue<2>(); + int numberOfTensors = GetParamValue<3>(); + + int numChannels = format.numChannels(); + int bytesPerPixel = 0; + + for (int32_t i = 0; i < numChannels; i++) + { + bytesPerPixel += format.bitsPerChannel()[i] / 8; + } + + // generate the output tensor to contain all of the input tensors + + auto reqs = nvcv::TensorBatch::CalcRequirements(numberOfTensors); + nvcv::TensorBatch inTensorBatch(reqs); + + // generate random input images + std::default_random_engine randEng(0); + std::uniform_int_distribution rand(0u, 255u); + std::vector> inputVecs; + + for (int i = 0; i < numberOfTensors; ++i) + { + nvcv::Tensor inTensor = nvcv::util::CreateTensor(1, width, height, format); //this will create a CHW/HWC tensor + // generate random input image in bytes + std::vector imageVec((width * height) * bytesPerPixel); + std::generate(imageVec.begin(), imageVec.end(), [&]() { return (nvcv::Byte)rand(randEng); }); + // copy random input to device tensor + EXPECT_NO_THROW(util::SetImageTensorFromByteVector(inTensor.exportData(), imageVec)); + // add tensor to batch and input vector + inputVecs.push_back(imageVec); + inTensorBatch.pushBack(inTensor); + } + + nvcv::Tensor outTensor(numberOfTensors, {width, height}, format); + // run operator + cvcuda::Stack op; + EXPECT_NO_THROW(op(stream, inTensorBatch, outTensor)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // go through each sample of the output tensor and compare vals. + for (int i = 0; i < numberOfTensors; ++i) + { + // generate random input image in bytes + std::vector outSample; + EXPECT_NO_THROW(util::GetImageByteVectorFromTensor(outTensor.exportData(), i, outSample)); + // Compare the computed histogram with the output histogram + ASSERT_EQ(inputVecs[i], outSample); + } +} diff --git a/tests/cvcuda/system/TestOpWarpPerspective.cpp b/tests/cvcuda/system/TestOpWarpPerspective.cpp index 59b45e9f..04c6d647 100644 --- a/tests/cvcuda/system/TestOpWarpPerspective.cpp +++ b/tests/cvcuda/system/TestOpWarpPerspective.cpp @@ -129,7 +129,7 @@ static void WarpPerspectiveGold(std::vector &hDst, const int dstRowStri NVCVPerspectiveTransform finalTransformMatrix; - if (flags & NVCV_WARP_INVERSE_MAP) + if (!(flags & NVCV_WARP_INVERSE_MAP)) { cuda::math::Matrix tempMatrixForInverse; diff --git a/tests/cvcuda/unit/CMakeLists.txt b/tests/cvcuda/unit/CMakeLists.txt new file mode 100644 index 00000000..53e5aba1 --- /dev/null +++ b/tests/cvcuda/unit/CMakeLists.txt @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(cvcuda_test_unit + TestWorkspaceAllocator.cpp + TestWorkspaceEstimator.cpp +) + +target_compile_definitions(cvcuda_test_unit + PRIVATE + -DNVCV_UNIT_TEST=1 +) + +target_link_libraries(cvcuda_test_unit + PRIVATE + nvcv_test_main + nvcv_util + nvcv_test_common + cvcuda_priv +) + +nvcv_add_test(cvcuda_test_unit cvcuda) diff --git a/tests/cvcuda/unit/Definitions.hpp b/tests/cvcuda/unit/Definitions.hpp new file mode 100644 index 00000000..76fd4cd3 --- /dev/null +++ b/tests/cvcuda/unit/Definitions.hpp @@ -0,0 +1,26 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_TEST_SYSTEM_CORE_OP_DEFINITIONS_HPP +#define NVCV_TEST_SYSTEM_CORE_OP_DEFINITIONS_HPP + +#include +#include +#include +#include + +#endif // NVCV_TEST_SYSTEM_CORE_OP_DEFINITIONS_HPP diff --git a/tests/cvcuda/unit/TestWorkspaceAllocator.cpp b/tests/cvcuda/unit/TestWorkspaceAllocator.cpp new file mode 100644 index 00000000..805752f0 --- /dev/null +++ b/tests/cvcuda/unit/TestWorkspaceAllocator.cpp @@ -0,0 +1,203 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Definitions.hpp" + +#include + +#define EXPECT_PTR_EQ(a, b) EXPECT_EQ((const void *)(a), (const void *)(b)) + +TEST(WorkspaceMemAllocatorTest, Get) +{ + alignas(64) char base[64]; + cvcuda::WorkspaceMem wm{}; + wm.req = {64, 64}; + wm.data = base; + + cvcuda::WorkspaceMemAllocator wa(wm); + EXPECT_PTR_EQ(wa.get(3), base + 0); + EXPECT_PTR_EQ(wa.get(3), base + 4); + EXPECT_PTR_EQ(wa.get(), base + 16); + EXPECT_PTR_EQ(wa.get(1, 16), base + 32); + EXPECT_PTR_EQ(wa.get(4), base + 48); +} + +TEST(WorkspaceMemAllocatorTest, ExceedWorkspaceSize) +{ + alignas(64) char base[64]; + cvcuda::WorkspaceMem wm{}; + wm.req = {64, 64}; + wm.data = base; + + cvcuda::WorkspaceMemAllocator wa(wm); + EXPECT_PTR_EQ(wa.get(4), base + 0); + EXPECT_PTR_EQ(wa.get(7), base + 32); + EXPECT_PTR_EQ(wa.allocated(), 60); + EXPECT_THROW(wa.get(2), nvcv::Exception); + EXPECT_PTR_EQ(wa.get(1), base + 60); + EXPECT_THROW(wa.get(1), nvcv::Exception); +} + +TEST(WorkspaceAllocatorTest, Get) +{ + alignas(64) char base[64]; + alignas(64) char pinnedBase[64]; + cvcuda::Workspace ws{}; + ws.hostMem.req = {64, 64}; + ws.hostMem.data = base; + ws.pinnedMem.req = {64, 64}; + ws.pinnedMem.data = pinnedBase; + + cvcuda::WorkspaceAllocator wa(ws); + EXPECT_PTR_EQ(wa.getHost(4), base + 0); + EXPECT_PTR_EQ(wa.getHost(7), base + 32); + EXPECT_PTR_EQ(wa.getPinned(4), pinnedBase + 0); + EXPECT_EQ(wa.hostMem.allocated(), 60); + EXPECT_EQ(wa.pinnedMem.allocated(), 32); + EXPECT_THROW(wa.getHost(2), nvcv::Exception); + EXPECT_PTR_EQ(wa.getHost(1), base + 60); + EXPECT_THROW(wa.getHost(1), nvcv::Exception); +} + +TEST(WorkspaceMemAllocatorTest, AcquireRelease) +{ + alignas(64) char base[64]; + cvcuda::WorkspaceMem wm{}; + wm.req = {64, 64}; + wm.data = base; + ASSERT_EQ(cudaEventCreateWithFlags(&wm.ready, cudaEventDisableTiming), cudaSuccess); + + EXPECT_NO_THROW({ + cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); + EXPECT_PTR_EQ(wa.get(32), base); + }); + + EXPECT_NO_THROW({ cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); }); + + EXPECT_NO_THROW({ + cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); + wa.acquire(std::nullopt); + EXPECT_PTR_EQ(wa.get(32), base); + }); + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + EXPECT_PTR_EQ(wa.get(32), base); + wa.acquire(std::nullopt); + }, + std::logic_error) + << "acquire after get should be an error"; + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + wa.acquire(std::nullopt); + wa.acquire(std::nullopt); + }, + std::logic_error) + << "double acquire should be an error"; + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + wa.release(std::nullopt); + EXPECT_PTR_EQ(wa.get(32), base); + }, + std::logic_error) + << "get after release should be an error"; + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + wa.release(std::nullopt); + wa.acquire(std::nullopt); + }, + std::logic_error) + << "acquire after release should be an error"; + + EXPECT_THROW( + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + wa.release(std::nullopt); + wa.release(std::nullopt); + }, + std::logic_error) + << "double release should be an error"; + + ASSERT_EQ(cudaEventDestroy(wm.ready), cudaSuccess); +} + +TEST(WorkspaceMemAllocatorTest, Sync) +{ + void *_junk; + size_t junk_size = 100 << 20; + ASSERT_EQ(cudaMalloc(&_junk, junk_size), cudaSuccess); + std::unique_ptr junk(_junk, [](void *p) { EXPECT_EQ(cudaFree(p), 
cudaSuccess); }); + + alignas(64) char base[64]; + cvcuda::WorkspaceMem wm{}; + wm.req = {64, 64}; + wm.data = base; + ASSERT_EQ(cudaEventCreateWithFlags(&wm.ready, cudaEventDisableTiming), cudaSuccess); + + // this is supposed to last long enough to be reliably "not ready" + auto hog = [&]() + { + for (int i = 0; i < 256; i++) + { + ASSERT_EQ(cudaMemset(junk.get(), i, junk_size), cudaSuccess); + } + }; + + EXPECT_NO_THROW({ + hog(); + ASSERT_EQ(cudaEventRecord(wm.ready, 0), cudaSuccess); + { + cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); + EXPECT_EQ(cudaEventQuery(wm.ready), cudaErrorNotReady); // no sync yet + } + EXPECT_EQ(cudaEventQuery(wm.ready), cudaErrorNotReady); // no sync necessary + }) << "No memory was requested, no sync is necessary, no error expected"; + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + EXPECT_NO_THROW({ + ASSERT_EQ(cudaEventRecord(wm.ready, 0), cudaSuccess); + { + cvcuda::WorkspaceMemAllocator wa(wm, cudaStream_t(0)); + EXPECT_PTR_EQ(wa.get(32), base); + hog(); + } + EXPECT_EQ(cudaEventQuery(wm.ready), cudaErrorNotReady); // device sync only + }) << "Acquire and release properly called, no exception should be raised"; + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + EXPECT_NO_THROW({ + hog(); + ASSERT_EQ(cudaEventRecord(wm.ready, 0), cudaSuccess); + { + cvcuda::WorkspaceMemAllocator wa(wm, std::nullopt, std::nullopt); + EXPECT_EQ(cudaEventQuery(wm.ready), cudaErrorNotReady); // no sync yet + EXPECT_PTR_EQ(wa.get(32), base); + EXPECT_EQ(cudaEventQuery(wm.ready), cudaSuccess); // sync in get + } + }) << "Acquire and release properly called, no exception should be raised"; + + ASSERT_EQ(cudaEventDestroy(wm.ready), cudaSuccess); +} diff --git a/tests/cvcuda/unit/TestWorkspaceEstimator.cpp b/tests/cvcuda/unit/TestWorkspaceEstimator.cpp new file mode 100644 index 00000000..a950e073 --- /dev/null +++ b/tests/cvcuda/unit/TestWorkspaceEstimator.cpp @@ -0,0 +1,69 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Definitions.hpp" + +#include + +TEST(WorkspaceMemEstimatorTest, Add) +{ + // set the alignment to 1 to see if element alignment gets propagated to the base alignment + cvcuda::WorkspaceMemEstimator est(0, 1); + est.add(3); + EXPECT_EQ(est.req.alignment, 1); + EXPECT_EQ(est.req.size, 3); + est.add(3); + EXPECT_EQ(est.req.size, 16); + EXPECT_EQ(est.req.alignment, 4); + est.add(); + EXPECT_EQ(est.req.size, 20); + est.add(1, 16); + EXPECT_EQ(est.req.size, 48); + EXPECT_EQ(est.req.alignment, 16); +} + +TEST(WorkspaceEstimatorTest, Add) +{ + cvcuda::WorkspaceEstimator est; + EXPECT_EQ(est.hostMem.req.alignment, 16); + EXPECT_EQ(est.pinnedMem.req.alignment, 256); + EXPECT_EQ(est.cudaMem.req.alignment, 256); + + // set the alignment to 1 to see if element alignment gets propagated to the base alignment for each memory type + est.hostMem.req.alignment = 1; + est.pinnedMem.req.alignment = 1; + est.cudaMem.req.alignment = 1; + + est.add(true, false, true, 3); + EXPECT_EQ(est.hostMem.req.size, 3); + EXPECT_EQ(est.pinnedMem.req.size, 0); + EXPECT_EQ(est.cudaMem.req.size, 3); + + // clang-format off + est.add(true, false, false, 4) + .add(false, true, true, 2); + // clang-format on + + EXPECT_EQ(est.hostMem.req.size, 7); // 7 chars + EXPECT_EQ(est.hostMem.req.alignment, 1); // no change + + EXPECT_EQ(est.pinnedMem.req.size, 8); // just the 2 integers + EXPECT_EQ(est.pinnedMem.req.alignment, 4); // alignment for int32 + + EXPECT_EQ(est.cudaMem.req.size, 12); // 3 chars, padding, 2 ints + EXPECT_EQ(est.cudaMem.req.alignment, 4); // alignment for int32 +} diff --git a/tests/nvcv_types/cudatools_system/CMakeLists.txt b/tests/nvcv_types/cudatools_system/CMakeLists.txt index b0a0bb06..fd97891b 100644 --- a/tests/nvcv_types/cudatools_system/CMakeLists.txt +++ b/tests/nvcv_types/cudatools_system/CMakeLists.txt @@ -59,4 +59,4 @@ target_include_directories(nvcv_test_cudatools_system ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ) -nvcv_add_test(nvcv_test_cudatools_system) +nvcv_add_test(nvcv_test_cudatools_system nvcv) diff --git a/tests/nvcv_types/cudatools_system/DeviceBorderVarShapeWrap.cu b/tests/nvcv_types/cudatools_system/DeviceBorderVarShapeWrap.cu index 22af91da..add11639 100644 --- a/tests/nvcv_types/cudatools_system/DeviceBorderVarShapeWrap.cu +++ b/tests/nvcv_types/cudatools_system/DeviceBorderVarShapeWrap.cu @@ -66,8 +66,8 @@ __global__ void FillBorderNHWC(DstWrapper dst, SrcWrapper src, int numSamples, i } for (int c = 0; c < numChannels; ++c) { - int4 srcCoord = {dstCoord.x - borderSize.x, dstCoord.y - borderSize.y, dstCoord.z, c}; - dst[{dstCoord.x, dstCoord.y, dstCoord.z, c}] = src[srcCoord]; + int4 srcCoord = {dstCoord.z, dstCoord.y - borderSize.y, dstCoord.x - borderSize.x, c}; + dst[{dstCoord.z, dstCoord.y, dstCoord.x, c}] = src[srcCoord]; } } diff --git a/tests/nvcv_types/cudatools_system/DeviceImageBatchVarShapeWrap.cu b/tests/nvcv_types/cudatools_system/DeviceImageBatchVarShapeWrap.cu index 1cc25603..a749c1be 100644 --- a/tests/nvcv_types/cudatools_system/DeviceImageBatchVarShapeWrap.cu +++ b/tests/nvcv_types/cudatools_system/DeviceImageBatchVarShapeWrap.cu @@ -71,7 +71,7 @@ __global__ void SetTwos(cuda::ImageBatchVarShapeWrapNHWC dst, int n { *dst.ptr(coord.z, coord.y, coord.x, ch) = cuda::SetAll(1); - int4 dstCoord{coord.x, coord.y, coord.z, ch}; + int4 dstCoord{coord.z, coord.y, coord.x, ch}; dst[dstCoord] += cuda::SetAll(1); } } diff --git a/tests/nvcv_types/cudatools_unit/CMakeLists.txt b/tests/nvcv_types/cudatools_unit/CMakeLists.txt index 80adee9f..14477b51 100644 --- 
a/tests/nvcv_types/cudatools_unit/CMakeLists.txt +++ b/tests/nvcv_types/cudatools_unit/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -32,4 +32,4 @@ target_include_directories(nvcv_test_cudatools_unit ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ) -nvcv_add_test(nvcv_test_cudatools_unit) +nvcv_add_test(nvcv_test_cudatools_unit nvcv) diff --git a/tests/nvcv_types/python/CMakeLists.txt b/tests/nvcv_types/python/CMakeLists.txt index a08a98ca..abd39cb5 100644 --- a/tests/nvcv_types/python/CMakeLists.txt +++ b/tests/nvcv_types/python/CMakeLists.txt @@ -43,4 +43,4 @@ set(PYTHON_TEST_DIR ${CMAKE_INSTALL_PREFIX}/${PYTHON_TEST_INSTDIR}) set(PYTHON_MODULE_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) configure_file(nvcv_test_types_python.in nvcv_test_types_python @ONLY) -nvcv_add_test(${CMAKE_CURRENT_BINARY_DIR}/nvcv_test_types_python) +nvcv_add_test(${CMAKE_CURRENT_BINARY_DIR}/nvcv_test_types_python nvcv) diff --git a/tests/nvcv_types/python/nvcv_test_types_python.in b/tests/nvcv_types/python/nvcv_test_types_python.in index c74bb94a..2abb0fc1 100755 --- a/tests/nvcv_types/python/nvcv_test_types_python.in +++ b/tests/nvcv_types/python/nvcv_test_types_python.in @@ -45,14 +45,30 @@ function on_exit() } trap 'on_exit' EXIT +export PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" + for ver in $python_versions; do if [[ "$NVCV_FORCE_PYTHON" != 1 && "$NVCV_FORCE_PYTHON" != yes ]]; then - if ! PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" python$ver -c 'import nvcv'; then + if ! python$ver -c 'import nvcv'; then echo "Skipping python-$ver, NVCV python bindings not installed" continue fi fi - PYTHONPATH="$PYTHONPATH:@PYTHON_MODULE_DIR@" NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" + # Check if python module is exposing only PyInit_cvcuda. + # Also provide some helpful info is exposing too much. + modfile=$(python$ver -c "import nvcv; print(nvcv.__file__)") + pubsyms=$(readelf -sWD $modfile | grep -v ' UND ' | grep ' GLOBAL ') + if [[ $(echo "$pubsyms" | wc -l) != 1 ]]; then + echo -e "nvcv python $ver module is exposing too many symbols:\n$pubsyms" + exit 1 + fi + if ! echo "$pubsyms" | grep PyInit_nvcv > /dev/null; then + echo -e "nvcv python $ver module must expose symbol PyInit_nvcv, but instead exposes:\n$pubsyms" + exit 2 + fi + + # Run python tests + NVCV_VERSION="@NVCV_VERSION_FULL@" python$ver -m pytest -o cache_dir="$tmpdir" "$@" "$tests_dir" done diff --git a/tests/nvcv_types/python/test_image.py b/tests/nvcv_types/python/test_image.py index 79207a01..4615ca0d 100644 --- a/tests/nvcv_types/python/test_image.py +++ b/tests/nvcv_types/python/test_image.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch +import numpy as np import pytest as t import nvcv -import numpy as np -import torch import nvcv_util as util diff --git a/tests/nvcv_types/python/test_imgbatchvarshape.py b/tests/nvcv_types/python/test_imgbatchvarshape.py index dfb5bd30..0caaedb3 100644 --- a/tests/nvcv_types/python/test_imgbatchvarshape.py +++ b/tests/nvcv_types/python/test_imgbatchvarshape.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,6 +14,9 @@ # limitations under the License. import nvcv +import pytest as t +import numpy as np +import nvcv_util as util def test_imgbatchvarshape_creation_works(): @@ -98,3 +101,53 @@ def test_imgbatchvarshape_several_images(): assert cnt == 0 assert batch.maxsize == (0, 0) + + +buffmt_common = [ + ([5, 7, 1], np.uint8, nvcv.Format.U8), + ([5, 7, 1], np.uint8, nvcv.Format.U8), + ([5, 7, 1], np.uint8, nvcv.Format.U8), + ([5, 7], np.uint8, nvcv.Format.U8), + ([5, 7, 1], np.int8, nvcv.Format.S8), + ([5, 7, 1], np.uint16, nvcv.Format.U16), + ([5, 7, 1], np.int16, nvcv.Format.S16), + ([5, 7, 2], np.int16, nvcv.Format._2S16), + ([5, 7, 1], np.float32, nvcv.Format.F32), + ([5, 7, 1], np.float64, nvcv.Format.F64), + ([5, 7, 2], np.float32, nvcv.Format._2F32), + ([5, 7, 3], np.uint8, nvcv.Format.RGB8), + ([5, 7, 4], np.uint8, nvcv.Format.RGBA8), + ([5, 7], np.csingle, nvcv.Format.C64), + ([5, 7], np.cdouble, nvcv.Format.C128), + ([5, 7], np.dtype("2f"), nvcv.Format._2F32), +] + + +@t.mark.parametrize("base_shape,dt,format", buffmt_common) +def test_wrap_buffer_list(base_shape, dt, format): + nimages = 3 + ndim = len(base_shape) + shapes = [] + for i in range(nimages): + ith_shape = [] + for d in range(ndim): + if d < 2: + ith_shape.append(base_shape[d] + i) + else: + ith_shape.append(base_shape[d]) + shapes.append(ith_shape) + max_height = base_shape[0] + nimages - 1 + max_width = base_shape[1] + nimages - 1 + host_buffers = [np.ndarray(shape, dt) for shape in shapes] + cuda_buffers = [util.to_cuda_buffer(buf) for buf in host_buffers] + batch = nvcv.as_images(cuda_buffers) + assert batch.capacity == 3 + assert batch.maxsize == (max_width, max_height) + assert batch.uniqueformat == format + + images = [image for image in batch] + for i in range(len(shapes)): + sh = shapes[i] + assert images[i].width == sh[1] + assert images[i].height == sh[0] + assert images[i].format == format diff --git a/tests/nvcv_types/python/test_import_order.py b/tests/nvcv_types/python/test_import_order.py new file mode 100644 index 00000000..7d82f0dc --- /dev/null +++ b/tests/nvcv_types/python/test_import_order.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
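
The `test_wrap_buffer_list` case added above exercises `nvcv.as_images`, which wraps a list of pre-allocated CUDA buffers as a variable-shape image batch and infers a common format from their shapes and dtypes. A minimal sketch of that usage pattern follows; it assumes `nvcv.as_images` accepts any object exposing `__cuda_array_interface__` (torch CUDA tensors do), whereas the test itself goes through the `nvcv_util.to_cuda_buffer` helper.

import numpy as np
import torch
import nvcv

# Three grayscale images of different sizes, laid out as H x W x C, uint8.
shapes = [(32, 48, 1), (33, 49, 1), (34, 50, 1)]
buffers = [torch.zeros(s, dtype=torch.uint8, device="cuda") for s in shapes]

batch = nvcv.as_images(buffers)          # wraps the existing device memory
assert batch.capacity == len(buffers)
assert batch.uniqueformat == nvcv.Format.U8
assert batch.maxsize == (50, 34)         # (max_width, max_height), as asserted above
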
+ +# Import order is important, +# torch must be loaded correctly even if nvcv was imported first +import nvcv +import torch +import numpy as np + + +def test_import_nvcv_first_works(): + torch.as_tensor(np.ndarray((4, 6), dtype=np.uint8), device="cuda") + nvcv.Tensor((4, 6), dtype=np.uint8) diff --git a/tests/nvcv_types/python/test_stream.py b/tests/nvcv_types/python/test_stream.py index 2948a48e..d7034305 100644 --- a/tests/nvcv_types/python/test_stream.py +++ b/tests/nvcv_types/python/test_stream.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import nvcv import torch +import nvcv import ctypes import pytest as t diff --git a/tests/nvcv_types/python/test_tensor.py b/tests/nvcv_types/python/test_tensor.py index aa215c7f..52ff631f 100644 --- a/tests/nvcv_types/python/test_tensor.py +++ b/tests/nvcv_types/python/test_tensor.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch import nvcv import pytest as t import numpy as np -import torch @t.mark.parametrize( @@ -296,3 +296,135 @@ def test_tensor_create_packed(): def test_tensor_create_for_imgbatch_packed(): tensor = nvcv.Tensor(2, (37, 7), nvcv.Format.RGB8, rowalign=1) assert tensor.cuda().strides == (37 * 7 * 3, 37 * 3, 3, 1) + + +@t.mark.parametrize( + "orig_shape, orig_layout, dtype, shape_arg, layout_arg", + [ + ((1, 23, 65, 3), "NHWC", np.uint8, (23, 65, 3), "HWC"), + ((5, 23, 65, 3), None, np.int8, (5, 23 * 65, 3), None), + ((5, 23, 65, 3), None, np.int8, (5, 23 * 65, 3), "ABC"), + ((1,), "A", np.float32, (1, 1, 1, 1, 1, 1), "ABCDEF"), + ], +) +def test_tensor_reshape(orig_shape, orig_layout, dtype, shape_arg, layout_arg): + tensor = nvcv.Tensor(orig_shape, dtype, layout=orig_layout, rowalign=1) + + def strides(shape): + out = [0] * len(shape) + for d in range(len(shape)): + out[d] = 1 + for d2 in range(d + 1, len(shape)): + out[d] = out[d] * shape[d2] + return tuple(out) + + assert tensor.dtype == dtype + assert tensor.shape == orig_shape + assert tensor.cuda().strides == strides(orig_shape) + + new_tensors = [ + tensor.reshape(shape_arg, layout=layout_arg), + nvcv.reshape(tensor, shape_arg, layout=layout_arg), + ] + for new_tensor in new_tensors: + assert new_tensor.dtype == dtype + assert new_tensor.shape == shape_arg + assert new_tensor.cuda().strides == strides(shape_arg) + + +@t.mark.parametrize( + "orig_shape, orig_layout, dtype, shape_arg, layout_arg", + [ + # wrong number of dims in layout + ((1, 23, 65, 3), "NHWC", np.uint8, (23, 65, 3), "ABCD"), + # wrong number of dims in layout + ((1, 23, 65, 3), None, np.uint8, (23, 65, 3), "ABCD"), + # dims in current layout + ((5, 23, 65, 3), "NHWC", np.int8, (5, 23 * 65, 3), None), + # volume mismatch + ((5, 23, 65, 3), "NHWC", np.int8, (100, 100), "AB"), + # 0-dim tensors not supported + ((1,), "A", np.int8, tuple(), ""), + ], +) +def test_tensor_reshape_error(orig_shape, orig_layout, dtype, shape_arg, layout_arg): + tensor = nvcv.Tensor(orig_shape, dtype, layout=orig_layout, rowalign=1) + + with t.raises(RuntimeError): + tensor.reshape(shape_arg, layout=layout_arg), + + with t.raises(RuntimeError): + nvcv.reshape(tensor, shape_arg, layout=layout_arg) + + +def test_tensor_reshape_lifetime_ref_obj(): + tensor1 = nvcv.Tensor((20, 10, 3), np.uint8, layout="HWC", rowalign=1) + tensor2 = tensor1.reshape((200, 3), layout="WC") + + # tensor2 increased the reference count of the underlying handle, + # so it should be kept alive 
after tensor1 is deleted + del tensor1 + + assert tensor2.dtype == np.uint8 + assert tensor2.shape == (200, 3) + assert tensor2.cuda().strides == (3, 1) + + +@t.mark.parametrize( + "shape_arg, layout_arg, expected_strides", + [ + ((1, 10, 10, 3), "XHWC", (320, 32, 3, 1)), + ((10, 10, 3, 1), "HWCX", (32, 3, 1, 1)), + ((10, 1, 10, 3), "HXWC", (32, 32, 3, 1)), + ((10, 2, 5, 3), "HABC", (32, 15, 3, 1)), + ((2, 5, 10, 3), "ABWC", (160, 32, 3, 1)), + ], +) +def test_tensor_reshape_strided(shape_arg, layout_arg, expected_strides): + tensor = nvcv.Tensor((10, 10, 3), np.uint8, layout="HWC") + assert tensor.cuda().strides == (32, 3, 1) # strided rows + + new_tensors = [ + tensor.reshape(shape_arg, layout=layout_arg), + nvcv.reshape(tensor, shape_arg, layout=layout_arg), + ] + for new_tensor in new_tensors: + assert new_tensor.cuda().strides == expected_strides + + +@t.mark.parametrize( + "shape_arg, layout_arg", + [((300,), "A")], +) +def test_tensor_reshape_strided_error(shape_arg, layout_arg): + tensor = nvcv.Tensor((10, 10, 3), np.uint8, layout="HWC") + assert tensor.cuda().strides == (32, 3, 1) # strided rows + + with t.raises(RuntimeError): + tensor.reshape(shape_arg, layout=layout_arg) + + with t.raises(RuntimeError): + nvcv.reshape(tensor, shape_arg, layout=layout_arg) + + +@t.mark.parametrize( + "shape_arg, dtype_arg, layout_arg", + [ + ((3, 5, 7), np.dtype("2f4"), "NHW"), + ((3, 5, 3), np.dtype("4f8"), "NHW"), + ((3, 5, 2), np.dtype("2i1"), "NHW"), + ], +) +def test_tensor_wrap_cuda_array_interface(shape_arg, dtype_arg, layout_arg): + tensor = nvcv.Tensor(shape_arg, dtype_arg, layout_arg) + + tcuda = tensor.cuda() + cai = tcuda.__cuda_array_interface__ + assert cai["typestr"] == dtype_arg.str + assert cai["shape"] == shape_arg + + wrapped = nvcv.as_tensor(tcuda, layout_arg) + + assert wrapped.shape == shape_arg + assert wrapped.dtype == dtype_arg + assert wrapped.layout == layout_arg diff --git a/tests/nvcv_types/python/test_tensor_batch.py b/tests/nvcv_types/python/test_tensor_batch.py new file mode 100644 index 00000000..26012033 --- /dev/null +++ b/tests/nvcv_types/python/test_tensor_batch.py @@ -0,0 +1,227 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
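
The reshape tests added to test_tensor.py above can be condensed into a short usage sketch that mirrors their assertions: both the `Tensor.reshape` method and the free function `nvcv.reshape` are available, a packed tensor can be reshaped freely as long as the total volume is preserved, and collapsing dimensions across padded (strided) rows raises `RuntimeError`. The sketch below only restates behaviour asserted in those tests.

import numpy as np
import nvcv

# rowalign=1 keeps rows unpadded, as in the tests above.
packed = nvcv.Tensor((20, 10, 3), np.uint8, layout="HWC", rowalign=1)
flat = packed.reshape((200, 3), layout="WC")        # method form
same = nvcv.reshape(packed, (200, 3), layout="WC")  # free-function form
assert flat.shape == (200, 3) and same.shape == (200, 3)

# A default-aligned HWC tensor has padded rows (strides (32, 3, 1) above), so
# flattening H, W and C into a single dimension is rejected.
padded = nvcv.Tensor((10, 10, 3), np.uint8, layout="HWC")
try:
    padded.reshape((300,), layout="A")
except RuntimeError:
    pass  # expected, same as test_tensor_reshape_strided_error
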
+ +import nvcv +import pytest as t +import numpy as np +import nvcv_util as util +import torch +import re + + +def rand_shape(rank, low=1, high=10): + return np.random.randint(low=1, high=10, size=rank) + + +def rand_torch_tensor(dtype, rank): + return torch.as_tensor( + np.random.random(size=rand_shape(rank)).astype(dtype), device="cuda" + ) + + +def random_tensors(n, dtype, rank, layout): + return [ + nvcv.as_tensor(rand_torch_tensor(dtype, rank), layout=layout) for _ in range(n) + ] + + +def test_tensorbatch_creation_works(): + batch = nvcv.TensorBatch(15) + assert batch.capacity == 15 + assert len(batch) == 0 + assert batch.layout is None + assert batch.dtype is None + assert batch.ndim == -1 + + # range must be empty + cnt = 0 + for i in batch: + cnt += 1 + assert cnt == 0 + + +def test_tensorbatch_one_tensor(): + batch = nvcv.TensorBatch(15) + + tensor = nvcv.as_tensor(nvcv.Image((64, 32), nvcv.Format.RGBA8)) + batch.pushback(tensor) + assert len(batch) == 1 + assert batch.layout == "NHWC" + assert batch.dtype == np.uint8 + assert batch.ndim == 4 + assert list(batch) == [tensor] + + # range must contain one + cnt = 0 + for elem in batch: + assert elem is tensor + cnt += 1 + assert cnt == 1 + + # remove added tensor + batch.popback() + + # check if its indeed removed + assert len(batch) == 0 + assert list(batch) == [] + + +def test_tensorbatch_change_layout(): + batch = nvcv.TensorBatch(10) + tensorsA = random_tensors(5, np.float32, 3, "HWC") + batch.pushback(tensorsA) + assert list(batch) == tensorsA + assert batch.layout == "HWC" + assert batch.dtype == np.float32 + assert batch.ndim == 3 + + batch.popback(len(tensorsA)) + assert list(batch) == [] + assert batch.layout is None + assert batch.dtype is None + assert batch.ndim == -1 + + tensorsB = [ + nvcv.as_tensor(nvcv.Image(rand_shape(2), nvcv.Format.RGBA8)) for _ in range(7) + ] + batch.pushback(tensorsB) + assert list(batch) == tensorsB + assert batch.layout == "NHWC" + assert batch.dtype == np.uint8 + assert batch.ndim == 4 + + batch.clear() + assert list(batch) == [] + assert batch.layout is None + assert batch.dtype is None + assert batch.ndim == -1 + + +def test_tensorbatch_multiply_tensors(): + N = 10 + tensorsA = random_tensors(5, np.int16, 3, "HWC") + batch = nvcv.TensorBatch(len(tensorsA) * N) + for _ in range(N): + batch.pushback(tensorsA) + + assert list(batch) == tensorsA * N + assert batch.layout == "HWC" + assert batch.dtype == np.int16 + assert batch.ndim == 3 + + +def test_tensorbatch_subscript(): + tensorsA = random_tensors(10, np.float32, 3, "HWC") + batch = nvcv.TensorBatch(10) + batch.pushback(tensorsA) + + # test get item + for i in range(len(batch)): + assert batch[i] is tensorsA[i] + + # out of bounds subscript + with t.raises( + RuntimeError, + match=f"Cannot get tensor at index {len(tensorsA)}. 
Batch has only {len(tensorsA)} elements.", + ): + batch[len(tensorsA)] + + # test set item + tensorsB = random_tensors(5, np.float32, 3, "HWC") + for i in range(len(tensorsB)): + batch[i] = tensorsB[i] + + for i in range(len(batch)): + if i < len(tensorsB): + assert batch[i] is tensorsB[i] + else: + assert batch[i] is tensorsA[i] + + +def test_tensorbatch_wrap_buffers(): + # from cuda buffer, without layout + buffers = [ + util.to_cuda_buffer(np.ones(rand_shape(3), dtype=np.int32)) for _ in range(10) + ] + batch = nvcv.as_tensors(buffers) + assert batch.capacity == len(buffers) + assert len(batch) == len(buffers) + assert batch.dtype == np.int32 + assert batch.layout is None + assert batch.ndim == 3 + + # from torch tensor, with layout + buffers = [rand_torch_tensor(np.int16, 4) for i in range(5)] + batch = nvcv.as_tensors(buffers, layout="NHWC") + assert batch.capacity == len(buffers) + assert len(batch) == len(buffers) + assert batch.dtype == np.int16 + assert batch.layout == "NHWC" + assert batch.ndim == 4 + + # mismatching rank + with t.raises( + RuntimeError, + match="NVCV_ERROR_INVALID_ARGUMENT: " + "Trying to add a tensor to a tensor batch with an inconsistent rank.", + ): + buffers = [rand_torch_tensor(np.int16, 3), rand_torch_tensor(np.int16, 4)] + nvcv.as_tensors(buffers) + + # mismatching dtype + with t.raises( + RuntimeError, + match="NVCV_ERROR_INVALID_ARGUMENT: " + "Trying to add a tensor to a tensor batch with an inconsistent type.", + ): + buffers = [rand_torch_tensor(np.int16, 3), rand_torch_tensor(np.int32, 3)] + nvcv.as_tensors(buffers) + + # invalid types + with t.raises( + RuntimeError, + match="Input buffer doesn't provide cuda_array_interface or DLPack interfaces.", + ): + buffers = [[1, 2, 3]] + nvcv.as_tensors(buffers) + + +def test_tensorbatch_errors(): + with t.raises( + RuntimeError, + match=re.escape( + "NVCV_ERROR_OVERFLOW: Adding 2 tensors to a tensor batch would exceed its capacity (2) by 1" + ), + ): + batch = nvcv.TensorBatch(2) + batch.pushback(random_tensors(1, np.int16, 3, "")) + batch.pushback(random_tensors(2, np.int16, 3, "")) + + with t.raises( + RuntimeError, + match="NVCV_ERROR_UNDERFLOW: Trying to pop 3 tensors from a tensor batch with 2 tensors.", + ): + batch = nvcv.TensorBatch(5) + batch.pushback(random_tensors(2, np.int16, 3, "")) + batch.popback(3) + + with t.raises( + RuntimeError, + match="NVCV_ERROR_INVALID_ARGUMENT: " + "Trying to add a tensor to a tensor batch with an inconsistent layout.", + ): + batch = nvcv.TensorBatch(10) + batch.pushback(random_tensors(2, np.int16, 4, "NHWC")) + batch.pushback(random_tensors(3, np.int16, 4, "FHWC")) diff --git a/tests/nvcv_types/system/CMakeLists.txt b/tests/nvcv_types/system/CMakeLists.txt index 7ba4a26c..c4038188 100644 --- a/tests/nvcv_types/system/CMakeLists.txt +++ b/tests/nvcv_types/system/CMakeLists.txt @@ -39,6 +39,7 @@ add_executable(nvcv_test_types_system TestExceptions.cpp TestConfig.cpp TestArray.cpp + TestTensorBatch.cpp ) target_link_libraries(nvcv_test_types_system @@ -48,7 +49,7 @@ target_link_libraries(nvcv_test_types_system nvcv_types ) -nvcv_add_test(nvcv_test_types_system) +nvcv_add_test(nvcv_test_types_system nvcv) # header compatibility tests --------------------------------------------- @@ -89,4 +90,4 @@ target_link_libraries(nvcv_test_types_system_version_commit nvcv_types ) -nvcv_add_test(nvcv_test_types_system_version_commit) +nvcv_add_test(nvcv_test_types_system_version_commit nvcv) diff --git a/tests/nvcv_types/system/TestColorSpec.cpp 
b/tests/nvcv_types/system/TestColorSpec.cpp index bffedc9b..7d6d6a73 100644 --- a/tests/nvcv_types/system/TestColorSpec.cpp +++ b/tests/nvcv_types/system/TestColorSpec.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -82,7 +82,42 @@ TEST_P(ChromaSubsamplingTests, get_name) EXPECT_STREQ(gold, nvcvChromaSubsamplingGetName(css)); } +TEST(ChromaSubsamplingTests, invalidChromaSubsamplingGetNumSamples) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvChromaSubsamplingGetNumSamples(NVCV_CSS_444, nullptr, nullptr)); +} + +TEST(ChromaSubsamplingTests, validChromaSubsampling) +{ + NVCVChromaSubsampling test; + ASSERT_EQ(NVCV_SUCCESS, nvcvMakeChromaSubsampling(&test, 2, 1)); + EXPECT_EQ(NVCV_CSS_410R, test); + + ASSERT_EQ(NVCV_SUCCESS, nvcvMakeChromaSubsampling(&test, 1, 2)); + EXPECT_EQ(NVCV_CSS_410, test); + + ASSERT_EQ(NVCV_SUCCESS, nvcvMakeChromaSubsampling(&test, 1, 1)); + EXPECT_EQ(NVCV_CSS_444, test); +} + +TEST(ChromaSubsamplingTests, invalidChromaSubsampling) +{ + NVCVChromaSubsampling test; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvMakeChromaSubsampling(&test, 5, 5)); +} + +TEST(ChromaSubsamplingTests, invalidOut) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvMakeChromaSubsampling(nullptr, 2, 4)); +} + // Colorspec =================================================== +TEST(ColorSpecTests, invalidOutputMake) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvMakeColorSpec(nullptr, NVCV_COLOR_SPACE_DCIP3, NVCV_YCbCr_ENC_BT2020c, NVCV_COLOR_XFER_sYCC, + NVCV_COLOR_RANGE_LIMITED, NVCV_CHROMA_LOC_ODD, NVCV_CHROMA_LOC_CENTER)); +} TEST(ColorSpecTests, get_name_predefined) { @@ -100,6 +135,15 @@ TEST(ColorSpecTests, get_name_non_predefined) nvcvColorSpecGetName(fmt)); } +TEST(ColorSpecTests, get_name_invald) +{ + /// NVCVColorSpace: 0b111 NVCVYCbCrEncoding: 0b111 NVCVColorTransferFunction: 0b1111 ...... + EXPECT_STREQ( + "NVCVColorSpec(invalid)NVCVColorSpec(NVCVColorSpace(7),NVCVYCbCrEncoding(7),NVCVColorTransferFunction(15)," + "RANGE_LIMITED,LOC_ODD,LOC_ODD)", + nvcvColorSpecGetName(NVCV_COLOR_SPEC_FORCE32)); +} + TEST(ColorSpecTests, set_encoding_to_undefined) { NVCVColorSpec cspec = NVCV_COLOR_SPEC_BT601; @@ -202,6 +246,9 @@ NVCV_INSTANTIATE_TEST_SUITE_P(Positive, ColorModelNeedsColorSpecTests, {NVCV_COLOR_MODEL_UNDEFINED, false}, { NVCV_COLOR_MODEL_RAW, false}, { NVCV_COLOR_MODEL_XYZ, false}, + { NVCV_COLOR_MODEL_HSV, false}, + { NVCV_COLOR_MODEL_CMYK, false}, + { NVCV_COLOR_MODEL_YCCK, false}, } * NVCV_SUCCESS); #if !NVCV_SANITIZED @@ -230,6 +277,11 @@ TEST_P(ColorModelNeedsColorSpecTests, run) } } +TEST_P(ColorModelNeedsColorSpecTests, invalidColorModelNeedsColorspecOut) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorModelNeedsColorspec(NVCV_COLOR_MODEL_RGB, nullptr)); +} + // The tests below explicitly create invalid enums just to test if there's any // overflow in bitfield representation. This will trigger -fsanitize=enum. Let's // disable them now in sanitized builds. @@ -249,6 +301,11 @@ TEST(ColorSpecTests, set_color_space) } } +TEST(ColorSpecTests, invalid_set_color_space) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecSetColorSpace(nullptr, (NVCVColorSpace)0)); +} + TEST(ColorSpecTests, get_color_space) { for (int cspace = 0; cspace < 1 << 3; cspace ? 
cspace <<= 1 : ++cspace) @@ -263,6 +320,15 @@ TEST(ColorSpecTests, get_color_space) } } +TEST(ColorSpecTests, invalid_get_color_space) +{ + uint64_t mask = UINT64_MAX; + + NVCVColorSpec type = NVCV_MAKE_COLOR_SPEC(mask, mask, mask, mask, mask, mask); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecGetColorSpace(type, nullptr)); +} + TEST(ColorSpecTests, set_encodings) { for (int enc = 0; enc < 1 << 3; enc ? enc <<= 1 : ++enc) @@ -284,6 +350,11 @@ TEST(ColorSpecTests, set_encodings) } } +TEST(ColorSpecTests, invalid_set_encodings) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecSetYCbCrEncoding(nullptr, (NVCVYCbCrEncoding)1)); +} + TEST(ColorSpecTests, get_encodings) { for (int enc = 0; enc < 1 << 3; enc ? enc <<= 1 : ++enc) @@ -298,6 +369,15 @@ TEST(ColorSpecTests, get_encodings) } } +TEST(ColorSpecTests, invalid_get_encodings) +{ + uint64_t mask = UINT64_MAX; + + NVCVColorSpec type = NVCV_MAKE_COLOR_SPEC(mask, 0, mask, mask, mask, mask); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecGetYCbCrEncoding(type, nullptr)); +} + TEST(ColorSpecTests, set_xfer_func) { for (int xfer = 0; xfer < 1 << 3; xfer ? xfer <<= 1 : ++xfer) @@ -313,6 +393,12 @@ TEST(ColorSpecTests, set_xfer_func) } } +TEST(ColorSpecTests, invalid_set_xfer_func) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvColorSpecSetColorTransferFunction(nullptr, (NVCVColorTransferFunction)1)); +} + TEST(ColorSpecTests, get_xfer_func) { for (int xfer = 0; xfer < 1 << 3; xfer ? xfer <<= 1 : ++xfer) @@ -328,6 +414,15 @@ TEST(ColorSpecTests, get_xfer_func) } } +TEST(ColorSpecTests, invalid_get_xfer_func) +{ + uint64_t mask = UINT64_MAX; + + NVCVColorSpec type = NVCV_MAKE_COLOR_SPEC(mask, mask, 0, mask, mask, mask); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecGetColorTransferFunction(type, nullptr)); +} + TEST(ColorSpecTests, set_range) { for (int range = 0; range < 1 << 1; range ? range <<= 1 : ++range) @@ -343,6 +438,11 @@ TEST(ColorSpecTests, set_range) } } +TEST(ColorSpecTests, invalid_set_range) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecSetRange(nullptr, (NVCVColorRange)0)); +} + TEST(ColorSpecTests, get_range) { for (int range = 0; range < 1 << 1; range ? range <<= 1 : ++range) @@ -357,6 +457,15 @@ TEST(ColorSpecTests, get_range) } } +TEST(ColorSpecTests, invalid_get_range) +{ + uint64_t mask = UINT64_MAX; + + NVCVColorSpec type = NVCV_MAKE_COLOR_SPEC(mask, mask, mask, mask, mask, mask); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvColorSpecGetRange(type, nullptr)); +} + TEST(ColorSpecTests, set_chroma_loc_horiz) { for (int loc = 0; loc < 1 << 2; loc ? loc <<= 1 : ++loc) @@ -372,6 +481,14 @@ TEST(ColorSpecTests, set_chroma_loc_horiz) } } +TEST(ColorSpecTests, invalid_set_chroma_loc) +{ + uint64_t mask = UINT64_MAX; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvColorSpecSetChromaLoc(nullptr, (NVCVChromaLocation)1, (NVCVChromaLocation)mask)); +} + TEST(ColorSpecTests, get_chroma_loc_horiz) { for (int loc = 0; loc < 1 << 2; loc ? 
loc <<= 1 : ++loc) @@ -445,6 +562,9 @@ TEST(ColorModelTests, undefined_color_model_is_zero) TEST(ColorModelTests, get_name) { EXPECT_STREQ("NVCV_COLOR_MODEL_YCbCr", nvcvColorModelGetName(NVCV_COLOR_MODEL_YCbCr)); + EXPECT_STREQ("NVCV_COLOR_MODEL_HSV", nvcvColorModelGetName(NVCV_COLOR_MODEL_HSV)); + EXPECT_STREQ("NVCV_COLOR_MODEL_CMYK", nvcvColorModelGetName(NVCV_COLOR_MODEL_CMYK)); + EXPECT_STREQ("NVCV_COLOR_MODEL_YCCK", nvcvColorModelGetName(NVCV_COLOR_MODEL_YCCK)); EXPECT_STREQ("NVCVColorModel(-1)", nvcvColorModelGetName(static_cast(-1))); } @@ -457,7 +577,12 @@ TEST(YCbCrEncodingTests, undefined_ycbcr_encoding_is_zero) TEST(YCbCrEncodingTests, get_name) { + EXPECT_STREQ("NVCV_YCbCr_ENC_UNDEFINED", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_UNDEFINED)); EXPECT_STREQ("NVCV_YCbCr_ENC_BT601", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_BT601)); + EXPECT_STREQ("NVCV_YCbCr_ENC_BT709", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_BT709)); + EXPECT_STREQ("NVCV_YCbCr_ENC_BT2020", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_BT2020)); + EXPECT_STREQ("NVCV_YCbCr_ENC_BT2020c", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_BT2020c)); + EXPECT_STREQ("NVCV_YCbCr_ENC_SMPTE240M", nvcvYCbCrEncodingGetName(NVCV_YCbCr_ENC_SMPTE240M)); EXPECT_STREQ("NVCVYCbCrEncoding(-1)", nvcvYCbCrEncodingGetName(static_cast(-1))); } @@ -466,6 +591,7 @@ TEST(YCbCrEncodingTests, get_name) TEST(ChromaLocationTests, get_name) { EXPECT_STREQ("NVCV_CHROMA_LOC_EVEN", nvcvChromaLocationGetName(NVCV_CHROMA_LOC_EVEN)); + EXPECT_STREQ("NVCV_CHROMA_LOC_BOTH", nvcvChromaLocationGetName(NVCV_CHROMA_LOC_BOTH)); EXPECT_STREQ("NVCVChromaLocation(-1)", nvcvChromaLocationGetName(static_cast(-1))); } @@ -474,6 +600,19 @@ TEST(ChromaLocationTests, get_name) TEST(RawPatternTests, get_name) { EXPECT_STREQ("NVCV_RAW_BAYER_RGGB", nvcvRawPatternGetName(NVCV_RAW_BAYER_RGGB)); + EXPECT_STREQ("NVCV_RAW_BAYER_BGGR", nvcvRawPatternGetName(NVCV_RAW_BAYER_BGGR)); + EXPECT_STREQ("NVCV_RAW_BAYER_GRBG", nvcvRawPatternGetName(NVCV_RAW_BAYER_GRBG)); + EXPECT_STREQ("NVCV_RAW_BAYER_GBRG", nvcvRawPatternGetName(NVCV_RAW_BAYER_GBRG)); + EXPECT_STREQ("NVCV_RAW_BAYER_RCCB", nvcvRawPatternGetName(NVCV_RAW_BAYER_RCCB)); + EXPECT_STREQ("NVCV_RAW_BAYER_BCCR", nvcvRawPatternGetName(NVCV_RAW_BAYER_BCCR)); + EXPECT_STREQ("NVCV_RAW_BAYER_CRBC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CRBC)); + EXPECT_STREQ("NVCV_RAW_BAYER_CBRC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CBRC)); + EXPECT_STREQ("NVCV_RAW_BAYER_RCCC", nvcvRawPatternGetName(NVCV_RAW_BAYER_RCCC)); + EXPECT_STREQ("NVCV_RAW_BAYER_CRCC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CRCC)); + EXPECT_STREQ("NVCV_RAW_BAYER_CCRC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CCRC)); + EXPECT_STREQ("NVCV_RAW_BAYER_CCCR", nvcvRawPatternGetName(NVCV_RAW_BAYER_CCCR)); + EXPECT_STREQ("NVCV_RAW_BAYER_CCCC", nvcvRawPatternGetName(NVCV_RAW_BAYER_CCCC)); + EXPECT_STREQ("NVCVRawPattern(255)", nvcvRawPatternGetName(NVCV_RAW_FORCE8)); EXPECT_STREQ("NVCVRawPattern(-1)", nvcvRawPatternGetName(static_cast(-1))); } @@ -481,7 +620,10 @@ TEST(RawPatternTests, get_name) TEST(ColorSpaceTests, get_name) { + EXPECT_STREQ("NVCV_COLOR_SPACE_BT601", nvcvColorSpaceGetName(NVCV_COLOR_SPACE_BT601)); EXPECT_STREQ("NVCV_COLOR_SPACE_BT709", nvcvColorSpaceGetName(NVCV_COLOR_SPACE_BT709)); + EXPECT_STREQ("NVCV_COLOR_SPACE_BT2020", nvcvColorSpaceGetName(NVCV_COLOR_SPACE_BT2020)); + EXPECT_STREQ("NVCV_COLOR_SPACE_DCIP3", nvcvColorSpaceGetName(NVCV_COLOR_SPACE_DCIP3)); EXPECT_STREQ("NVCVColorSpace(-1)", nvcvColorSpaceGetName(static_cast(-1))); } @@ -490,6 +632,7 @@ TEST(ColorSpaceTests, get_name) 
TEST(WhitePointTests, get_name) { EXPECT_STREQ("NVCV_WHITE_POINT_D65", nvcvWhitePointGetName(NVCV_WHITE_POINT_D65)); + EXPECT_STREQ("NVCVWhitePoint(255)", nvcvWhitePointGetName(NVCV_WHITE_POINT_FORCE8)); EXPECT_STREQ("NVCVWhitePoint(-1)", nvcvWhitePointGetName(static_cast(-1))); } diff --git a/tests/nvcv_types/system/TestDataLayout.cpp b/tests/nvcv_types/system/TestDataLayout.cpp index 5f33df0e..437c2151 100644 --- a/tests/nvcv_types/system/TestDataLayout.cpp +++ b/tests/nvcv_types/system/TestDataLayout.cpp @@ -677,6 +677,47 @@ TEST(PackingTests, get_name) EXPECT_STREQ("NVCVPacking(2147483647)", nvcvPackingGetName(NVCV_PACKING_LIMIT32)); } +class PackingTests_Alignment : public t::TestWithParam> +{ +}; + +INSTANTIATE_TEST_SUITE_P( + _, PackingTests_Alignment, + t::Values(std::make_tuple(NVCV_PACKING_X24, 4), std::make_tuple(NVCV_PACKING_X4b4, 1), + std::make_tuple(NVCV_PACKING_X10b6, 2), std::make_tuple(NVCV_PACKING_b4X12, 2), + std::make_tuple(NVCV_PACKING_b4X12, 2), std::make_tuple(NVCV_PACKING_X8_Y8, 1), + std::make_tuple(NVCV_PACKING_X5Y5Z6, 2), std::make_tuple(NVCV_PACKING_b4X4Y4Z4, 2), + std::make_tuple(NVCV_PACKING_X1Y5Z5W5, 2), std::make_tuple(NVCV_PACKING_X8_Y8__X8_Z8, 1), + std::make_tuple(NVCV_PACKING_X8_Y8_Z8, 1), std::make_tuple(NVCV_PACKING_b2X10Y10Z10, 4), + std::make_tuple(NVCV_PACKING_X12b4_Y12b4, 2), std::make_tuple(NVCV_PACKING_X32_Y24b8, 4))); + +TEST_P(PackingTests_Alignment, get_alignment) +{ + auto param = GetParam(); + + NVCVPacking packing = std::get<0>(param); + const int32_t expectedAlignment = std::get<1>(param); + int32_t outAlignment = -1; + + ASSERT_EQ(NVCV_SUCCESS, nvcvPackingGetAlignment(packing, &outAlignment)); // 16 / 8 = 2 + EXPECT_EQ(expectedAlignment, outAlignment); +} + +TEST(PackingTests_Negative, Invalid_parameter) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetParams(NVCV_PACKING_X16, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetNumComponents(NVCV_PACKING_X16, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetBitsPerPixel(NVCV_PACKING_X16, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetBitsPerComponent(NVCV_PACKING_X16, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvPackingGetAlignment(NVCV_PACKING_X16, nullptr)); + + NVCVPackingParams params; + NVCVPacking packing; + ASSERT_EQ(NVCV_SUCCESS, nvcvPackingGetParams(NVCV_PACKING_X16, ¶ms)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvMakePacking(&packing, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvMakePacking(nullptr, ¶ms)); +} + TEST(ByteOrderTests, get_name) { EXPECT_STREQ("LSB", nvcvByteOrderGetName(NVCV_ORDER_LSB)); @@ -690,15 +731,26 @@ TEST(SwizzleTests, get_name) // nvcvSwizzleGetName(NVCV_DETAIL_MAKE_SWIZZLE(NVCV_CHANNEL_1, NVCV_CHANNEL_1, NVCV_CHANNEL_0, NVCV_CHANNEL_X))); } +TEST(SwizzleTests_Negative, Invalid_parameter) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvSwizzleGetNumChannels(NVCV_SWIZZLE_0000, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvMakeSwizzle(nullptr, NVCV_CHANNEL_X, NVCV_CHANNEL_Y, NVCV_CHANNEL_Z, NVCV_CHANNEL_W)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvSwizzleGetChannels(NVCV_SWIZZLE_0000, nullptr)); +} + TEST(SwizzleChannelTests, get_name) { EXPECT_STREQ("Y", nvcvChannelGetName(NVCV_CHANNEL_Y)); + EXPECT_STREQ("NVCVChannel(255)", nvcvChannelGetName(NVCV_CHANNEL_FORCE8)); EXPECT_STREQ("NVCVChannel(7)", nvcvChannelGetName(static_cast(7))); } TEST(MemLayoutTests, get_name) { + EXPECT_STREQ("NVCV_MEM_LAYOUT_BLOCK1_LINEAR", 
nvcvMemLayoutGetName(NVCV_MEM_LAYOUT_BLOCK1_LINEAR)); EXPECT_STREQ("NVCV_MEM_LAYOUT_BLOCK2_LINEAR", nvcvMemLayoutGetName(NVCV_MEM_LAYOUT_BLOCK2_LINEAR)); + EXPECT_STREQ("NVCV_MEM_LAYOUT_BLOCK32_LINEAR", nvcvMemLayoutGetName(NVCV_MEM_LAYOUT_BLOCK32_LINEAR)); EXPECT_STREQ("NVCVMemLayout(-1)", nvcvMemLayoutGetName(static_cast(-1))); } @@ -706,4 +758,5 @@ TEST(DataKindTests, get_name) { EXPECT_STREQ("NVCV_DATA_KIND_FLOAT", nvcvDataKindGetName(NVCV_DATA_KIND_FLOAT)); EXPECT_STREQ("NVCV_DATA_KIND_UNSPECIFIED", nvcvDataKindGetName(static_cast(-1))); + EXPECT_STREQ("NVCVDataKind(-128)", nvcvDataKindGetName(static_cast(-128))); } diff --git a/tests/nvcv_types/system/TestImageBatch.cpp b/tests/nvcv_types/system/TestImageBatch.cpp index c201aa67..cc6b1250 100644 --- a/tests/nvcv_types/system/TestImageBatch.cpp +++ b/tests/nvcv_types/system/TestImageBatch.cpp @@ -356,6 +356,199 @@ TEST(ImageBatchVarShape, smoke_sync) ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } +TEST(ImageBatchVarShape, push_exceed_capacity) +{ + nvcv::ImageBatchVarShape batch(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + for (int i = 0; i < batch.capacity() + 1; ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + EXPECT_EQ(NVCV_ERROR_OVERFLOW, + nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); +} + +TEST(ImageBatchVarShape, push_null_images) +{ + nvcv::ImageBatchVarShape batch(32); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapePushImages(batch.handle(), nullptr, batch.capacity())); +} + +TEST(ImageBatchVarShape, push_callback_exceed_capacity) +{ + nvcv::ImageBatchVarShape batch(32); + std::vector vec1Handles; + + auto cb = [&]() -> nvcv::Image + { + int i = batch.numImages(); + if (i < batch.capacity() + 1) + { + nvcv::Image img(nvcv::Size2D{320 + i * 2, 122 - i * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(img.handle()); + return img; + } + else + { + return {}; + } + }; + auto *pcb = &cb; + auto ccb = [](void *ctx) -> NVCVImageHandle + { + return nvcv::detail::GetImageHandleForPushBack((*decltype(pcb)(ctx))()); + }; + + EXPECT_EQ(NVCV_ERROR_OVERFLOW, nvcvImageBatchVarShapePushImagesCallback(batch.handle(), ccb, pcb)); + + // clean + for (auto imgHandle : vec1Handles) + { + int newRef = -1; + nvcvImageDecRef(imgHandle, &newRef); + EXPECT_EQ(0, newRef); + } +} + +TEST(ImageBatchVarShape, push_callback_null_cbPushImage) +{ + nvcv::ImageBatchVarShape batch(32); + + auto cb = [&]() -> nvcv::Image + { + int i = batch.numImages(); + if (i < batch.capacity()) + { + nvcv::Image img(nvcv::Size2D{320 + i * 2, 122 - i * 2}, nvcv::FMT_NV12); + return img; + } + else + { + return {}; + } + }; + auto *pcb = &cb; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapePushImagesCallback(batch.handle(), nullptr, pcb)); + batch.clear(); +} + +TEST(ImageBatchVarShape, pop_negative_num_iamges) +{ + nvcv::ImageBatchVarShape batch(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, 
nvcvImageBatchVarShapePopImages(batch.handle(), -1)); +} + +TEST(ImageBatchVarShape, pop_exceed_max_images) +{ + nvcv::ImageBatchVarShape batch(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + EXPECT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePopImages(batch.handle(), batch.capacity())); + EXPECT_EQ(NVCV_ERROR_UNDERFLOW, nvcvImageBatchVarShapePopImages(batch.handle(), 1)); +} + +TEST(ImageBatchVarShape, get_null_images) +{ + nvcv::ImageBatchVarShape batch(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvImageBatchVarShapeGetImages(batch.handle(), 0, nullptr, batch.capacity())); +} + +TEST(ImageBatchVarShape, get_negative_index) +{ + nvcv::ImageBatchVarShape batch(32); + std::vector outputHandles(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvImageBatchVarShapeGetImages(batch.handle(), -1, outputHandles.data(), batch.capacity())); +} + +TEST(ImageBatchVarShape, get_overflow_index_in_handle) +{ + nvcv::ImageBatchVarShape batch(32); + std::vector outputHandles(32); + + std::mt19937 rng(123); + std::uniform_int_distribution rnd(1, 4); + + std::list vec1; + std::vector vec1Handles; + + for (int i = 0; i < batch.capacity(); ++i) + { + vec1.emplace_back(nvcv::Size2D{rnd(rng) * 2, rnd(rng) * 2}, nvcv::FMT_NV12); + vec1Handles.push_back(vec1.back().handle()); + } + + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapePushImages(batch.handle(), vec1Handles.data(), vec1Handles.size())); + + EXPECT_EQ(NVCV_ERROR_OVERFLOW, + nvcvImageBatchVarShapeGetImages(batch.handle(), 0, outputHandles.data(), batch.capacity() + 1)); + EXPECT_EQ(NVCV_ERROR_OVERFLOW, + nvcvImageBatchVarShapeGetImages(batch.handle(), 1, outputHandles.data(), batch.capacity())); +} + TEST(ImageBatch, smoke_user_pointer) { nvcv::ImageBatchVarShape batch(3); @@ -393,3 +586,79 @@ TEST(ImageBatch, smoke_cast) ref = img.reset(); EXPECT_EQ(ref, 0); } + +class ImageBatchNullParamTest : public ::testing::Test +{ +protected: + ImageBatchNullParamTest() {} + + ~ImageBatchNullParamTest() {} + + void SetUp() override + { + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeCalcRequirements(5, &reqs)); + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeConstruct(&reqs, nullptr, &handle)); + } + + void TearDown() override + { + int newRef = 1; + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchDecRef(handle, &newRef)); + ASSERT_EQ(newRef, 0); + handle = nullptr; + } + + 
NVCVImageBatchHandle handle; + NVCVImageBatchVarShapeRequirements reqs; +}; + +TEST(ImageBatch, calc_req_invalid_parameters) +{ + NVCVImageBatchVarShapeRequirements reqs; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeCalcRequirements(5, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeCalcRequirements(-1, &reqs)); +} + +TEST(ImageBatch, construct_null_parameters) +{ + NVCVImageBatchHandle handle; + NVCVImageBatchVarShapeRequirements reqs; + ASSERT_EQ(NVCV_SUCCESS, nvcvImageBatchVarShapeCalcRequirements(5, &reqs)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeConstruct(nullptr, nullptr, &handle)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeConstruct(&reqs, nullptr, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_user_pointer_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetUserPointer(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_num_images_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetNumImages(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_batch_capacity_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetCapacity(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_unique_format_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeGetUniqueFormat(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, gbatch_get_type_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchGetType(handle, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, export_data_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchExportData(handle, 0, nullptr)); +} + +TEST_F(ImageBatchNullParamTest, get_max_size_null_output) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageBatchVarShapeGetMaxSize(handle, nullptr, nullptr)); +} diff --git a/tests/nvcv_types/system/TestSize.cpp b/tests/nvcv_types/system/TestSize.cpp index f9272ee5..73c30916 100644 --- a/tests/nvcv_types/system/TestSize.cpp +++ b/tests/nvcv_types/system/TestSize.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,6 +23,20 @@ namespace gt = ::testing; namespace test = nvcv::test; +TEST(Size2D, C_interop) +{ + NVCVSize2D c_size{5, 7}; + nvcv::Size2D cxx_size = c_size; // construction + EXPECT_EQ(cxx_size, c_size); // comparison + cxx_size.w++; + EXPECT_NE(cxx_size, c_size); + cxx_size = c_size; // assignment + EXPECT_EQ(cxx_size, c_size); + c_size = cxx_size; // reverse assignment + NVCVSize2D c_size2 = cxx_size; // reverse construction + EXPECT_EQ(c_size2, cxx_size); +} + // Size2D Equality -------------------------------------------- class Size2DEqualityTests : public gt::TestWithParam> { diff --git a/tests/nvcv_types/system/TestTensorBatch.cpp b/tests/nvcv_types/system/TestTensorBatch.cpp new file mode 100644 index 00000000..8cd1bf28 --- /dev/null +++ b/tests/nvcv_types/system/TestTensorBatch.cpp @@ -0,0 +1,467 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Definitions.hpp" + +#include +#include +#include +#include + +#include +#include + +#include + +namespace t = ::testing; +namespace test = nvcv::test; + +template +nvcv::Tensor GetRandomTensor(R &rg, const nvcv::ImageFormat &format) +{ + std::uniform_int_distribution shape_dist(100, 400); + std::uniform_int_distribution images_num_dist(1, 16); + return nvcv::Tensor(images_num_dist(rg), {shape_dist(rg), shape_dist(rg)}, format); +} + +template +void CheckTensorBatchData(const nvcv::TensorBatchData &tbdata, It tensors_begin, It tensors_end, CUstream stream) +{ + auto numTensors = tensors_end - tensors_begin; + ASSERT_EQ(numTensors, tbdata.numTensors()); + std::vector elements(numTensors); + ASSERT_TRUE(tbdata.cast().hasValue()); + auto buffer = tbdata.cast()->buffer(); + ASSERT_EQ(cudaSuccess, + cudaMemcpyAsync(elements.data(), buffer.tensors, sizeof(NVCVTensorBatchElementStrided) * numTensors, + cudaMemcpyDeviceToHost, stream)); + + int i = 0; + for (auto it = tensors_begin; it != tensors_end; ++it) + { + nvcv::Tensor &tensor = *it; + auto tdata = tensor.exportData().cast().value(); + auto &element = elements[i]; + EXPECT_EQ(tdata.layout(), tbdata.layout()); + EXPECT_EQ(tdata.dtype(), tbdata.dtype()); + EXPECT_EQ(tdata.basePtr(), reinterpret_cast(element.data)); + ASSERT_EQ(tdata.rank(), tbdata.rank()); + for (int d = 0; d < tbdata.rank(); ++d) + { + EXPECT_EQ(tdata.shape(d), element.shape[d]); + EXPECT_EQ(tdata.stride(d), element.stride[d]); + } + ++i; + } +} + +TEST(TensorBatch, create) +{ + auto reqs = nvcv::TensorBatch::CalcRequirements(1); + std::vector tensors; + tensors.emplace_back(nvcv::Tensor(1, {300, 300}, nvcv::FMT_RGB8)); + { + nvcv::TensorBatch tb(reqs); + EXPECT_EQ(tb.layout(), nvcv::TensorLayout("")); + EXPECT_EQ(tb.dtype(), nvcv::DataType()); + tb.pushBack(tensors[0]); + ASSERT_EQ(tb.numTensors(), 1); + ASSERT_EQ(tensors[0].refCount(), 2); + auto tbdata = tb.exportData(nullptr); + CheckTensorBatchData(tbdata, tensors.begin(), tensors.end(), nullptr); + } + ASSERT_EQ(tensors[0].refCount(), 1); +} + +TEST(TensorBatch, ref_counting) +{ + std::mt19937 rg{231}; + nvcv::Tensor tensor = GetRandomTensor(rg, nvcv::FMT_RGB8); + { + auto reqs = nvcv::TensorBatch::CalcRequirements(1); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensor); + ASSERT_EQ(tb.refCount(), 1); + ASSERT_EQ(tensor.refCount(), 2); + int numMul = 32; + std::vector tbs(numMul, tb); + ASSERT_EQ(tb.refCount(), numMul + 1); + ASSERT_EQ(tensor.refCount(), 2); + } + ASSERT_EQ(tensor.refCount(), 1); +} + +TEST(TensorBatch, properties) +{ + int32_t capacity = 32; + std::vector tensors(capacity / 2); + std::mt19937 rg{321}; + for (int i = 0; i < capacity / 2; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + EXPECT_EQ(tb.dtype(), nvcv::TYPE_U8); + EXPECT_EQ(tb.capacity(), capacity); + EXPECT_EQ(tb.numTensors(), capacity / 2); + EXPECT_EQ(tb.layout(), nvcv::TensorLayout("NHWC")); + EXPECT_EQ(tb.type(), NVCV_TENSOR_BUFFER_STRIDED_CUDA); +} 
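
The C++ `TensorBatch` cases above cover the same semantics that the new Python binding tests (test_tensor_batch.py, earlier in this patch) exercise through `nvcv.TensorBatch`: capacity is fixed at construction, while layout, dtype and rank are inferred from the first tensor pushed and reset once the batch becomes empty. A small cross-reference sketch of the Python side, based only on behaviour asserted in those tests:

import numpy as np
import torch
import nvcv

batch = nvcv.TensorBatch(8)    # capacity for 8 tensors
assert batch.capacity == 8 and len(batch) == 0
assert batch.layout is None and batch.dtype is None and batch.ndim == -1

t = nvcv.as_tensor(torch.zeros((16, 16, 3), dtype=torch.uint8, device="cuda"), layout="HWC")
batch.pushback(t)              # the first tensor fixes layout, dtype and rank
assert batch.layout == "HWC" and batch.dtype == np.uint8 and batch.ndim == 3

batch.popback()                # removing it empties the batch again
assert len(batch) == 0
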
+ +TEST(TensorBatch, user_pointer) +{ + auto reqs = nvcv::TensorBatch::CalcRequirements(1); + nvcv::TensorBatch tb(reqs); + int valueA = 0; + tb.setUserPointer(&valueA); + EXPECT_EQ(tb.getUserPointer(), &valueA); + auto tbCopy = tb; + std::cout << tb.refCount() << std::endl; + EXPECT_EQ(tbCopy.getUserPointer(), &valueA); + int valueB = 0; + tb.setUserPointer(&valueB); + EXPECT_EQ(tb.getUserPointer(), &valueB); + EXPECT_EQ(tbCopy.getUserPointer(), &valueB); +} + +TEST(TensorBatch, consistency_validation) +{ + std::mt19937 rg{321}; + auto base_tensor = GetRandomTensor(rg, nvcv::FMT_RGB8); + + auto test_inconsistency = [&](int32_t rank, nvcv::DataType dtype, nvcv::TensorLayout layout) + { + auto reqs = nvcv::TensorBatch::CalcRequirements(2); + nvcv::TensorBatch tb(reqs); + std::vector shape(rank, 1); + nvcv::Tensor tensor(nvcv::TensorShape(shape.data(), rank, layout), dtype); + tb.pushBack(tensor); + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_INVALID_ARGUMENT, tb.pushBack(base_tensor)); + }; + test_inconsistency(4, nvcv::TYPE_U8, nvcv::TensorLayout("FHWC")); + test_inconsistency(4, nvcv::TYPE_U32, nvcv::TensorLayout("NHWC")); + test_inconsistency(3, nvcv::TYPE_U8, nvcv::TensorLayout("HWC")); +} + +TEST(TensorBatch, push_in_parts) +{ + const int32_t iters = 20; + const int32_t capacity = iters * (iters + 1) / 2; + std::vector tensors(capacity); + std::mt19937 rg{123}; + for (int32_t i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + std::array streams{}; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); + { + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + auto tensors_begin = tensors.data(); + for (int32_t i = 1; i < 21; ++i) + { + tb.pushBack(tensors_begin, tensors_begin + i); + ASSERT_EQ(tb.numTensors(), tensors_begin + i - tensors.data()); + if (i % 2 == 0) + { + auto stream = streams[i / 2 % 2]; + auto tbdata = tb.exportData(stream); + CheckTensorBatchData(tbdata, tensors.data(), tensors_begin + i, stream); + } + tensors_begin += i; + } + for (auto &t : tensors) + { + ASSERT_EQ(t.refCount(), 2); + } + } + for (auto &t : tensors) + { + ASSERT_EQ(t.refCount(), 1); + } +} + +TEST(TensorBatch, push_the_same_tensor) +{ + const int numMul = 32; + std::mt19937 rg{123}; + auto tensor = GetRandomTensor(rg, nvcv::FMT_RGB8); + std::vector tensors; + for (int i = 0; i < numMul; ++i) + { + tensors.push_back(tensor); + } + ASSERT_EQ(tensor.refCount(), numMul + 1); + { + auto reqs = nvcv::TensorBatch::CalcRequirements(numMul); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + ASSERT_EQ(tensor.refCount(), numMul * 2 + 1); + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_OVERFLOW, tb.pushBack(tensor)); + } + ASSERT_EQ(tensor.refCount(), numMul + 1); +} + +TEST(TensorBatch, clear) +{ + const int32_t capacity = 32; + std::vector tensors(capacity); + std::mt19937 rg{123}; + for (int32_t i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + for (auto &t : tensors) + { + EXPECT_EQ(t.refCount(), 2); + } + tb.clear(); + for (auto &t : tensors) + { + EXPECT_EQ(t.refCount(), 1); + } + EXPECT_EQ(tb.layout(), nvcv::TensorLayout("")); + EXPECT_EQ(tb.dtype(), nvcv::DataType()); +} + +TEST(TensorBatch, pop_tensors) +{ + const int32_t capacity = 32; + std::vector tensors(capacity); + std::mt19937 
rg{123}; + for (int32_t i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + + tb.pushBack(tensors.data(), tensors.data() + capacity / 2); + tb.popTensors(capacity / 4); // remove dirty tensors + // tensor batch should contain the first quarter of the tensors + ASSERT_EQ(tb.numTensors(), capacity / 4); + auto data = tb.exportData(nullptr); + CheckTensorBatchData(data, tensors.data(), tensors.data() + capacity / 4, nullptr); + for (int i = 0; i < capacity / 4; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 2); + } + for (int i = capacity / 4; i < capacity / 2; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 1); + } + + tb.pushBack(tensors.data() + capacity / 2, tensors.data() + capacity); + // tensor batch should contain the first quarter and the last half of the tensors + EXPECT_EQ(tb.numTensors(), capacity * 3 / 4); + for (int i = capacity / 2; i < capacity; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 2); + } + std::vector result{}; + result.insert(result.end(), tensors.begin(), tensors.begin() + capacity / 4); + result.insert(result.end(), tensors.begin() + capacity / 2, tensors.begin() + capacity); + data = tb.exportData(nullptr); + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + + tb.popTensors(capacity / 4); // remove clean tensors; + // tensor batch should contain the first and the third quarter of the tensors + EXPECT_EQ(tb.numTensors(), capacity / 2); + for (int i = 0; i < capacity / 4; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 2); + EXPECT_EQ(tensors[i + capacity / 4].refCount(), 1); + EXPECT_EQ(tensors[i + capacity * 2 / 4].refCount(), 2); + EXPECT_EQ(tensors[i + capacity * 3 / 4].refCount(), 1); + } + data = tb.exportData(nullptr); + result.insert(result.end(), tensors.begin(), tensors.begin() + capacity / 4); + result.insert(result.end(), tensors.begin() + capacity / 2, tensors.begin() + capacity * 3 / 4); + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + + tb.pushBack(tensors.begin(), tensors.begin() + capacity / 4); + // tensor batch should contain the first, the third and the first (again) quarter + EXPECT_EQ(tb.numTensors(), capacity * 3 / 4); + for (int i = 0; i < capacity / 4; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 3); + EXPECT_EQ(tensors[i + capacity / 4].refCount(), 1); + EXPECT_EQ(tensors[i + capacity * 2 / 4].refCount(), 2); + EXPECT_EQ(tensors[i + capacity * 3 / 4].refCount(), 1); + } + tb.popTensors(capacity / 2); // remove clean and dirty tensors + // tensor batch should contain the first quarter of the tensors + EXPECT_EQ(tb.numTensors(), capacity / 4); + for (int i = 0; i < capacity / 4; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 2); + } + for (int i = capacity / 4; i < capacity; ++i) + { + EXPECT_EQ(tensors[i].refCount(), 1); + } + data = tb.exportData(nullptr); + result.insert(result.end(), tensors.begin(), tensors.begin() + capacity / 4); + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + + tb.pushBack(tensors[0]); + EXPECT_EQ(tensors[0].refCount(), 3); + tb.popTensor(); // pop single tensor + EXPECT_EQ(tensors[0].refCount(), 2); + data = tb.exportData(nullptr); + result.insert(result.end(), tensors.begin(), tensors.begin() + capacity / 4); + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_UNDERFLOW, tb.popTensors(capacity / 4 + 1)); + 
NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_INVALID_ARGUMENT, tb.popTensors(-1)); +} + +TEST(TensorBatch, iterator_arithm) +{ + int32_t capacity = 4; + std::vector tensors(capacity); + std::mt19937 rg{321}; + for (int i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + + auto it = tb.begin(); + EXPECT_EQ(it, tb.end()); + + tb.pushBack(tensors.begin(), tensors.end()); + it = tb.begin(); + + EXPECT_EQ(it->handle(), tensors[0].handle()); + EXPECT_EQ((++it)->handle(), tensors[1].handle()); + EXPECT_EQ((it++)->handle(), tensors[1].handle()); + EXPECT_EQ((--it)->handle(), tensors[1].handle()); + EXPECT_EQ((it--)->handle(), tensors[1].handle()); + EXPECT_EQ((it + capacity - 1)->handle(), tensors[capacity - 1].handle()); + + EXPECT_EQ((tb.end() - capacity), tb.begin()); + EXPECT_GT(tb.end(), tb.begin()); + EXPECT_GE(it, tb.begin()); + EXPECT_LT(it, it + 2); + EXPECT_LE(it, it + 1); + + EXPECT_EQ(tb.end() - it, capacity); +} + +TEST(TensorBatch, indexing_and_iterating) +{ + int32_t capacity = 32; + std::vector tensors(capacity); + std::mt19937 rg{321}; + for (int i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + for (int i = 0; i < capacity; ++i) + { + EXPECT_EQ(tb[i].handle(), tensors[i].handle()); + } + + int i = 0; + for (auto t : tb) + { + EXPECT_EQ(t.handle(), tensors[i++].handle()); + } + + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_OVERFLOW, tb[capacity]); + NVCV_EXPECT_THROW_STATUS(NVCV_ERROR_INVALID_ARGUMENT, tb[-1]); +} + +TEST(TensorBatch, set_tensor) +{ + int32_t capacity = 32; + std::vector tensors(capacity); + std::mt19937 rg{321}; + for (int i = 0; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + auto reqs = nvcv::TensorBatch::CalcRequirements(capacity); + nvcv::TensorBatch tb(reqs); + tb.pushBack(tensors.begin(), tensors.end()); + auto tensorA = GetRandomTensor(rg, nvcv::FMT_RGB8); + auto tensorB = GetRandomTensor(rg, nvcv::FMT_RGB8); + + tb.setTensor(0, tensorA); // set at dirty position + auto data = tb.exportData(nullptr); + auto result = tensors; + result[0] = tensorA; + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + EXPECT_EQ(tensors[0].refCount(), 1); + EXPECT_EQ(tensorA.refCount(), 2); + + tb.setTensor(capacity / 4, tensorA); + tb.setTensor(capacity / 2, tensorB); // set at clean positions + data = tb.exportData(nullptr); + result = tensors; + result[0] = result[capacity / 4] = tensorA; + result[capacity / 2] = tensorB; + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + EXPECT_EQ(tensors[0].refCount(), 1); + EXPECT_EQ(tensors[capacity / 4].refCount(), 1); + EXPECT_EQ(tensors[capacity / 2].refCount(), 1); + EXPECT_EQ(tensorA.refCount(), 3); + EXPECT_EQ(tensorB.refCount(), 2); + + for (int i = capacity - 10; i < capacity; ++i) + { + tensors[i] = GetRandomTensor(rg, nvcv::FMT_RGB8); + } + tb.popTensors(10); + tb.pushBack(tensors.begin() + capacity - 10, tensors.end()); + ASSERT_EQ(tb.numTensors(), capacity); + + tb.setTensor(capacity / 2 + 1, tensorA); // set at clean position + tb.setTensor(capacity - 2, tensorB); // set at dirty position + data = tb.exportData(nullptr); + result = tensors; + result[0] = result[capacity / 4] = tensorA; + result[capacity / 2] = tensorB; + result[capacity 
/ 2 + 1] = tensorA; + result[capacity - 2] = tensorB; + CheckTensorBatchData(data, result.begin(), result.end(), nullptr); + result.clear(); + EXPECT_EQ(tensors[0].refCount(), 1); + EXPECT_EQ(tensors[capacity / 4].refCount(), 1); + EXPECT_EQ(tensors[capacity / 2].refCount(), 1); + EXPECT_EQ(tensors[capacity / 2 + 1].refCount(), 1); + EXPECT_EQ(tensors[capacity - 2].refCount(), 1); + EXPECT_EQ(tensorA.refCount(), 4); + EXPECT_EQ(tensorB.refCount(), 3); +} diff --git a/tests/nvcv_types/system/TestTensorDataUtils.cpp b/tests/nvcv_types/system/TestTensorDataUtils.cpp index c4a3d207..04b54440 100644 --- a/tests/nvcv_types/system/TestTensorDataUtils.cpp +++ b/tests/nvcv_types/system/TestTensorDataUtils.cpp @@ -35,7 +35,7 @@ namespace util = nvcv::util; NVCV_TEST_SUITE_P(TensorDataUtils, test::ValueList { //width, height, numImages, fill byte, format - { 2, 2, 2, 2, nvcv::FMT_RGB8}, + { 2, 2, 1, 2, nvcv::FMT_RGB8}, { 3, 3, 5, 2, nvcv::FMT_BGR8}, { 10, 11, 2, 2, nvcv::FMT_RGBA8}, { 5, 5, 1, 2, nvcv::FMT_BGRA8}, @@ -216,6 +216,41 @@ TEST_P(TensorDataUtils, SetGetTensorFromVector) EXPECT_NO_THROW(GetSetTensor(tensor)); } +TEST_P(TensorDataUtils, SetGetTensorToFromByteVector) +{ + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int number = GetParamValue<2>(); + nvcv::ImageFormat fmt = GetParamValue<4>(); + + // This will return the number of channels in the plane 0, so with planar + // this must be considered only for that plane. + int numChannels = fmt.numChannels(); + int bytesPerPixel = 0; + + for (int i = 0; i < numChannels; i++) + { + bytesPerPixel += fmt.bitsPerChannel()[i] / 8; + } + nvcv::Tensor tensor(number, {width, height}, fmt); + + std::default_random_engine randEng(0); + std::uniform_int_distribution rand(0u, 255u); + + // Test the CHW/HWC tensors + for (int i = 0; i < number; ++i) + { + std::vector imageVec((width * height) * bytesPerPixel); + std::generate(imageVec.begin(), imageVec.end(), [&]() { return (nvcv::Byte)rand(randEng); }); + std::vector outVec((width * height) * bytesPerPixel); + EXPECT_NO_THROW(util::SetImageTensorFromByteVector(tensor.exportData(), imageVec, i)); + EXPECT_NO_THROW(util::GetImageByteVectorFromTensor(tensor.exportData(), i, outVec)); + EXPECT_EQ(imageVec, outVec); + } + + return; +} + TEST_P(TensorDataUtils, SetGetTensorFromImageVector) { int width = GetParamValue<0>(); diff --git a/tests/nvcv_types/unit/CMakeLists.txt b/tests/nvcv_types/unit/CMakeLists.txt index e2be5753..d42a9ca2 100644 --- a/tests/nvcv_types/unit/CMakeLists.txt +++ b/tests/nvcv_types/unit/CMakeLists.txt @@ -35,6 +35,9 @@ add_executable(nvcv_test_types_unit TestHandleWrapper.cpp TestTypeTraits.cpp TestSharedCoreObj.cpp + TestStreamId.cpp + TestSimpleCache.cpp + TestPerStreamCache.cpp ) if(ENABLE_COMPAT_OLD_GLIBC) @@ -52,6 +55,7 @@ target_link_libraries(nvcv_test_types_unit nvcv_util nvcv_test_common nvcv_types_priv + cuda ) -nvcv_add_test(nvcv_test_types_unit) +nvcv_add_test(nvcv_test_types_unit nvcv) diff --git a/tests/nvcv_types/unit/TestPerStreamCache.cpp b/tests/nvcv_types/unit/TestPerStreamCache.cpp new file mode 100644 index 00000000..8f066426 --- /dev/null +++ b/tests/nvcv_types/unit/TestPerStreamCache.cpp @@ -0,0 +1,396 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Definitions.hpp" + +#include +#include +#include + +#include + +namespace { + +class Hog +{ +public: + Hog(size_t size = 64 << 20) + : m_size(size) + { + NVCV_CHECK_THROW(cudaMalloc(&m_buf, m_size)); + } + + ~Hog() + { + (void)cudaFree(m_buf); + } + + void Run(cudaStream_t stream, int iters) + { + for (int i = 0; i < iters; i++) + { + NVCV_CHECK_THROW(cudaMemsetAsync(m_buf, i, m_size, stream)); + } + } + +private: + void *m_buf = nullptr; + size_t m_size; +}; + +struct DummyPayload +{ + size_t size = 0, alignment = 1; + cudaEvent_t ready = nullptr; +}; + +using ItemAlloc = nvcv::util::detail::StreamCacheItemAllocator; + +} // namespace + +namespace nvcv::util { + +TEST(StreamCacheItemAllocator, BasicTest) +{ + ItemAlloc alloc; + std::vector items; + std::mt19937_64 rng; + std::bernoulli_distribution action; + for (int i = 0; i < 1000; i++) + { + if (action(rng) || items.empty()) + { + items.push_back(alloc.allocate()); + } + else + { + int n = items.size(); + std::uniform_int_distribution dist(0, n - 1); + int i = dist(rng); + std::swap(items[i], items.back()); + alloc.deallocate(items.back()); + items.pop_back(); + } + } + while (!items.empty()) + { + alloc.deallocate(items.back()); + items.pop_back(); + } +} + +namespace { + +struct EventAlloc +{ + void reserve(int count) + { + for (int i = 0; i < count; i++) get(); + clear(); + } + + void clear() + { + for (auto &event : events) cache.put(std::move(event)); + events.clear(); + } + + std::vector events; + EventCache cache; + + cudaEvent_t get() + { + events.push_back(cache.get()); + return events.back().get(); + }; +}; + +} // namespace + +TEST(StreamOrderedCacheTest, InsertGet) +{ + EventAlloc events; + ItemAlloc alloc; + detail::StreamOrderedCache cache(&alloc); + + EXPECT_FALSE(cache.get(1000, 1).has_value()) << "The cache should be empty"; + DummyPayload p{}; + p.size = 1000; + p.ready = events.get(); + cache.put(std::move(p)); + EXPECT_FALSE(cache.get(2000, 1).has_value()) << "The cache doesn't contain any element large enough"; + auto v = cache.get(500, 1); + ASSERT_TRUE(v.has_value()); + EXPECT_EQ(v->size, 1000) << "The cache contains a suitable element"; + v = cache.get(500, 1); + EXPECT_FALSE(cache.get(0, 0)) << "The element was already removed"; +} + +TEST(StreamOrderedCacheTest, FindNextReady) +{ + EventAlloc events; + ItemAlloc alloc; + detail::StreamOrderedCache cache(&alloc); + + int N = 801; + events.reserve(N); + CudaStream stream = CudaStream::Create(true); + + Hog hog; + + const int kMaxRetries = 10; + int retries = kMaxRetries; + + for (int split = 0; split < N; split += 20) + { + std::cout << split + 1 << "/" << N << std::endl; + events.clear(); + + int i; + for (i = 0; i < split; i++) + { + DummyPayload dp{}; + dp.ready = events.get(); + dp.size = i; + ASSERT_EQ(cudaSuccess, cudaEventRecord(dp.ready, stream.get())); + cache.put(std::move(dp)); + } + + hog.Run(stream.get(), 50); + + for (; i < N; i++) + { + DummyPayload dp{}; + dp.ready = events.get(); + dp.size = i; + ASSERT_EQ(cudaSuccess, cudaEventRecord(dp.ready, stream.get())); + cache.put(std::move(dp)); + } + + if (split > 0) 
+ { + ASSERT_EQ(cudaSuccess, cudaEventSynchronize(events.events[split - 1])); + } + auto *item = cache.findNewestReady(); + if (cudaEventQuery(events.events[split]) == cudaSuccess) + { + if (--retries < 0) + GTEST_SKIP() << "Unreliable test"; + split--; + cache.waitAndPurge(); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + continue; + } + retries = kMaxRetries; + + if (split == 0) + EXPECT_EQ(item, nullptr); + else + { + EXPECT_NE(item, nullptr); + if (item) + { + EXPECT_EQ(item->payload.size, split - 1); + } + } + + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + cache.waitAndPurge(); + } +} + +TEST(StreamOrderedCacheTest, RemoveAllReady) +{ + EventAlloc events; + ItemAlloc alloc; + detail::StreamOrderedCache cache(&alloc); + + CudaStream stream = CudaStream::Create(true); + int N = 801; + events.reserve(N); + + Hog hog; + + const int kMaxRetries = 10; + int retries = kMaxRetries; + + std::vector mask(N); + + for (int split = 0; split < N; split += 20) + { + std::cout << split + 1 << "/" << N << std::endl; + events.clear(); + for (int i = 0; i < N; i++) mask[i] = false; + + int i; + for (i = 0; i < split; i++) + { + DummyPayload dp{}; + dp.ready = events.get(); + dp.size = i; + ASSERT_EQ(cudaSuccess, cudaEventRecord(dp.ready, stream.get())); + cache.put(std::move(dp)); + } + + hog.Run(stream.get(), 50); + + for (; i < N; i++) + { + DummyPayload dp{}; + dp.ready = events.get(); + dp.size = i; + ASSERT_EQ(cudaSuccess, cudaEventRecord(dp.ready, stream.get())); + cache.put(std::move(dp)); + } + + if (split > 0) + { + ASSERT_EQ(cudaSuccess, cudaEventSynchronize(events.events[split - 1])); + } + cache.removeAllReady([&](const DummyPayload &p) { mask[p.size] = true; }); + if (cudaEventQuery(events.events[split]) != cudaErrorNotReady) + { + if (--retries < 0) + GTEST_SKIP() << "Unreliable test"; + split--; + cache.waitAndPurge(); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + continue; + } + retries = kMaxRetries; + for (int i = 0; i < N; i++) + { + EXPECT_EQ(mask[i], (i < split)) << "@ i = " << i << " split = " << split; + } + + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + cache.waitAndPurge(); + } +} + +TEST(PerStreamCacheTest, NoStream) +{ + PerStreamCache cache; + + EventAlloc events; + + DummyPayload p1{1000, 1, events.get()}; + DummyPayload p2{2000, 1, events.get()}; + DummyPayload p3{3000, 1, events.get()}; + cache.put(std::move(p1), std::nullopt); + cache.put(std::move(p2), std::nullopt); + cache.put(std::move(p3), std::nullopt); + auto v1 = cache.get(1001, 0, std::nullopt); + ASSERT_TRUE(v1.has_value()); + EXPECT_EQ(v1->size, 2000); + auto v2 = cache.get(900, 0, std::nullopt); + ASSERT_TRUE(v2.has_value()); + EXPECT_EQ(v2->size, 1000); + auto v3 = cache.get(900, 0, std::nullopt); + ASSERT_TRUE(v3.has_value()); + EXPECT_EQ(v3->size, 3000); +} + +TEST(PerStreamCacheTest, TwoStream) +{ + for (int attempt = 0; attempt < 10; attempt++) + { + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + EventAlloc events; + PerStreamCache cache; + + Hog hog; + + CudaStream s1 = CudaStream::Create(true); + CudaStream s2 = CudaStream::Create(true); + + DummyPayload p1{1000, 1, events.get()}; + DummyPayload p2{2000, 1, events.get()}; + DummyPayload p3{3000, 1, events.get()}; + DummyPayload p4{4000, 1, events.get()}; + + hog.Run(s1.get(), 100); + hog.Run(s2.get(), 100); + + ASSERT_EQ(cudaSuccess, cudaEventRecord(p1.ready, s1.get())); + ASSERT_EQ(cudaSuccess, cudaEventRecord(p2.ready, s2.get())); + ASSERT_EQ(cudaSuccess, cudaEventRecord(p3.ready, s1.get())); + ASSERT_EQ(cudaSuccess, 
cudaEventRecord(p4.ready, s2.get())); + + auto s = std::chrono::high_resolution_clock::now(); + cache.put(std::move(p1), s1.get()); + cache.put(std::move(p2), s2.get()); + cache.put(std::move(p3), s1.get()); + cache.put(std::move(p4), s2.get()); + auto e = std::chrono::high_resolution_clock::now(); + double insert_time = (e - s).count() / 4; + double stream_get_time = 0; + + s = std::chrono::high_resolution_clock::now(); + auto v0 = cache.get(1, 0, std::nullopt); + e = std::chrono::high_resolution_clock::now(); + double failed_get_time = (e - s).count(); + if (v0.has_value()) + { + if (cudaSuccess == cudaEventQuery(p1.ready) || cudaSuccess == cudaEventQuery(p1.ready)) + continue; + EXPECT_FALSE(v0.has_value()) << "The resources are not ready - none should be returned for null stream."; + } + + s = std::chrono::high_resolution_clock::now(); + auto v1s1 = cache.get(1001, 0, s1); + e = std::chrono::high_resolution_clock::now(); + stream_get_time = (e - s).count(); + ASSERT_TRUE(v1s1.has_value()); + EXPECT_EQ(v1s1->size, 3000); + + s = std::chrono::high_resolution_clock::now(); + auto v2s1 = cache.get(900, 0, s1); + e = std::chrono::high_resolution_clock::now(); + stream_get_time = (e - s).count(); + ASSERT_TRUE(v2s1.has_value()); + EXPECT_EQ(v2s1->size, 1000); + + s = std::chrono::high_resolution_clock::now(); + auto v1s2 = cache.get(900, 0, s2); + e = std::chrono::high_resolution_clock::now(); + stream_get_time = (e - s).count(); + stream_get_time /= 3; + ASSERT_TRUE(v1s2.has_value()); + EXPECT_EQ(v1s2->size, 2000); + + ASSERT_EQ(cudaSuccess, cudaEventSynchronize(events.events[3])); + s = std::chrono::high_resolution_clock::now(); + auto v0ready = cache.get(1, 0, std::nullopt); + e = std::chrono::high_resolution_clock::now(); + double get_time = (e - s).count(); + ASSERT_TRUE(v0ready.has_value()); + EXPECT_EQ(v0ready->size, 4000); + + std::cout << "Insert time = " << insert_time << "ns" << std::endl; + std::cout << "Get time (stream) = " << stream_get_time << "ns" << std::endl; + std::cout << "Get time (global, success) = " << get_time << "ns" << std::endl; + std::cout << "Get time (global, failed) = " << failed_get_time << "ns" << std::endl; + + return; + } + GTEST_SKIP() << "Test unreliable - cannot make the CPU wait for the GPU"; +} + +} // namespace nvcv::util diff --git a/tests/nvcv_types/unit/TestSimpleCache.cpp b/tests/nvcv_types/unit/TestSimpleCache.cpp new file mode 100644 index 00000000..4b970d25 --- /dev/null +++ b/tests/nvcv_types/unit/TestSimpleCache.cpp @@ -0,0 +1,92 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
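
Taken together, the cases from the StreamCacheItemAllocator stress test through TwoStream document the caching stack these unit tests target: payloads carry a size, an alignment and a CUDA event, the stream-ordered layer keeps them in event order so findNewestReady()/removeAllReady() can trim completed work, and PerStreamCache on top of it prefers entries recycled on the requesting stream (safe to reuse immediately thanks to stream ordering) and otherwise only hands out entries whose events have fired; with no stream given, only globally ready entries qualify. A rough consumer-side sketch, where the payload type, template parameter and include path are assumptions and the put/get shapes mirror the calls in the tests:

    // Sketch only: the include path and ScratchBuffer type are placeholders.
    #include <cstddef>
    #include <optional>

    #include <cuda_runtime.h>

    #include "nvcv/util/PerStreamCache.hpp" // assumed location

    struct ScratchBuffer
    {
        void       *mem       = nullptr;
        size_t      size      = 0;
        size_t      alignment = 1;
        cudaEvent_t ready     = nullptr; // recorded on the producing stream
    };

    nvcv::util::PerStreamCache<ScratchBuffer> g_cache;

    void Recycle(ScratchBuffer &&buf, cudaStream_t stream)
    {
        // Mark the point after which `buf` is no longer used on `stream`,
        // then hand it to the cache keyed by that stream.
        cudaEventRecord(buf.ready, stream);
        g_cache.put(std::move(buf), stream);
    }

    std::optional<ScratchBuffer> Acquire(size_t size, cudaStream_t stream)
    {
        // Same-stream entries can be returned right away; entries from other
        // streams are only returned once their ready event has completed.
        return g_cache.get(size, /*alignment*/ 256, stream);
    }
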
+ */ + +#include "Definitions.hpp" + +#include + +namespace { +struct Payload +{ + int data = 0; + bool destroyed = false; + bool movedOut = false; + + explicit Payload(int data) + : data(data) + { + } + + ~Payload() + { + data = -2; + destroyed = true; + } + + Payload(Payload &&p) + { + *this = std::move(p); + } + + Payload &operator=(Payload &&p) + { + data = p.data; + destroyed = p.destroyed; + movedOut = p.movedOut; + p.data = -1; + p.movedOut = true; + return *this; + } +}; + +} // namespace + +TEST(SimpleCacheTest, PutGet) +{ + nvcv::util::SimpleCache cache; + EXPECT_FALSE(cache.get().has_value()); + Payload p = cache.getOrCreate([]() { return Payload(42); }); + EXPECT_EQ(p.data, 42); + EXPECT_FALSE(p.destroyed); + EXPECT_FALSE(p.movedOut); + cache.put(std::move(p)); + cache.put(Payload(1234)); + cache.emplace(4321); + EXPECT_TRUE(p.movedOut); + EXPECT_FALSE(p.destroyed); + + std::optional o = cache.get(); + ASSERT_TRUE(o.has_value()); + EXPECT_EQ(o->data, 4321); + EXPECT_FALSE(o->destroyed); + EXPECT_FALSE(o->movedOut); + + o = cache.get(); + ASSERT_TRUE(o.has_value()); + EXPECT_EQ(o->data, 1234); + EXPECT_FALSE(o->destroyed); + EXPECT_FALSE(o->movedOut); + + o = cache.get(); + ASSERT_TRUE(o.has_value()); + EXPECT_EQ(o->data, 42); + EXPECT_FALSE(o->destroyed); + EXPECT_FALSE(o->movedOut); + + o = cache.get(); + EXPECT_FALSE(o.has_value()); +} diff --git a/tests/nvcv_types/unit/TestStreamId.cpp b/tests/nvcv_types/unit/TestStreamId.cpp new file mode 100644 index 00000000..b5108bbf --- /dev/null +++ b/tests/nvcv_types/unit/TestStreamId.cpp @@ -0,0 +1,128 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
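
The PutGet case fixes the SimpleCache contract: get() on an empty cache returns an empty optional, getOrCreate() falls back to the supplied factory, objects are handed back with put() or built in place with emplace(), and retrieval is last-in, first-out. A small sketch of that contract, with the cached type and include path as placeholders:

    // Sketch only: Scratch and the include path are assumptions.
    #include <optional>
    #include <string>
    #include <utility>

    #include "nvcv/util/SimpleCache.hpp" // assumed location

    struct Scratch
    {
        std::string tag;

        explicit Scratch(std::string t)
            : tag(std::move(t))
        {
        }
    };

    void SimpleCacheSketch()
    {
        nvcv::util::SimpleCache<Scratch> cache;

        // Empty cache: getOrCreate() runs the factory instead of reporting a miss.
        Scratch s = cache.getOrCreate([] { return Scratch("factory"); });

        cache.put(std::move(s));   // return the object to the cache
        cache.emplace("in-place"); // or construct a new one directly inside it

        // Entries come back in reverse insertion order (LIFO).
        std::optional<Scratch> a = cache.get(); // tag == "in-place"
        std::optional<Scratch> b = cache.get(); // tag == "factory"
        std::optional<Scratch> c = cache.get(); // empty
    }
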
+ */ + +#include "Definitions.hpp" + +#include +#include +#include + +#include + +TEST(StreamIdTest, RegularAndDefault) +{ + cudaStream_t stream1 = 0, stream2 = 0; + (void)cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + (void)cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); + if (!stream1 || !stream2) + { + if (stream1) + (void)cudaStreamDestroy(stream1); + if (stream2) + (void)cudaStreamDestroy(stream2); + FAIL() << "Could not create two CUDA streams"; + } + + uint64_t id1 = nvcv::util::GetCudaStreamIdHint(stream1); + uint64_t id2 = nvcv::util::GetCudaStreamIdHint(stream2); + uint64_t id3 = nvcv::util::GetCudaStreamIdHint(0); + EXPECT_NE(id1, id2); + EXPECT_NE(id1, id3); + EXPECT_NE(id2, id3); + (void)cudaStreamDestroy(stream1); + (void)cudaStreamDestroy(stream2); +} + +/** Tests that distinct streams with the same handle get different IDs + */ +TEST(StreamIdTest, HandleReuse) +{ + if (!nvcv::util::IsCudaStreamIdHintUnambiguous()) + GTEST_SKIP() << "This platform doesn't have an unambiguous CUDA stream id\n"; + + struct CudaDeleter + { + void operator()(void *p) + { + cudaFree(p); + } + }; + + auto CudaAlloc = [](size_t size) + { + void *ret = nullptr; + cudaMalloc(&ret, size); + return ret; + }; + + size_t bufSize = 256 << 20; // 256MiB + std::unique_ptr mem(CudaAlloc(bufSize)); + + cudaEvent_t e; + (void)cudaEventCreateWithFlags(&e, cudaEventDisableTiming); + + bool done = false; + int maxAttempts = 10; + for (int i = 0; i < maxAttempts; i++) + { + (void)cudaDeviceSynchronize(); + cudaStream_t stream1 = 0, stream2 = 0; + (void)cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + uint64_t id1 = nvcv::util::GetCudaStreamIdHint(stream1); + for (int i = 0; i < 10; i++) cudaMemsetAsync(mem.get(), i, bufSize, stream1); + cudaEventRecord(e, stream1); + if (stream1) + (void)cudaStreamDestroy(stream1); + (void)cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); + bool stillRunning = (cudaEventQuery(e) == cudaErrorNotReady); + uint64_t id2 = nvcv::util::GetCudaStreamIdHint(stream2); + if (stream2) + (void)cudaStreamDestroy(stream2); + if (stream1 != stream2) + continue; // no handle reuse - retry + if (!stillRunning) + continue; // the stream wasn't running - the ID may be the same without any harm + EXPECT_NE(id1, id2); + done = true; + break; + } + + (void)cudaEventDestroy(e); + + if (!done) + GTEST_SKIP() << "Could not trigger handle reuse - no way to conduct the test"; +} + +TEST(StreamIdTest, PerThreadDefault) +{ + const int N = 4; + std::vector threads(N); + std::vector ids(N); + for (int i = 0; i < N; i++) + { + threads[i] = std::thread( + [&, i]() + { + (void)cudaFree(0); // create/assign a context + ids[i] = nvcv::util::GetCudaStreamIdHint(cudaStreamPerThread); + }); + } + for (int i = 0; i < N; i++) threads[i].join(); + for (int i = 0; i < N - 1; i++) + for (int j = i + 1; j < N; j++) + EXPECT_NE(ids[i], ids[j]) << "Per-thread streams for threads " << i << " and " << j << " do not differ."; +} diff --git a/tests/run_tests.sh.in b/tests/run_tests.sh.in index 82cbefda..af178ca4 100755 --- a/tests/run_tests.sh.in +++ b/tests/run_tests.sh.in @@ -17,8 +17,16 @@ shopt -s extglob +# Defaults +test_set="all" curdir=$(dirname "$(readlink -f "$0")") +if [[ $# -ge 1 ]]; then + test_set=$1 +fi + +IFS="," read -r -a test_set <<< "$test_set" + function on_exit() { set +e @@ -43,8 +51,16 @@ trap 'on_exit' EXIT function run() { local testexec=$1 + local testgroup=$2 - echo "Running $testexec test suite..." 
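
The StreamIdTest cases spell out what the stream-id hint provides: distinct live streams, the legacy default stream and per-thread default streams on different threads all report different hints, and on platforms where IsCudaStreamIdHintUnambiguous() returns true the hint stays distinct even when a destroyed handle is recycled while earlier work is still in flight. A sketch of the intended use, keying per-stream state by the hint rather than by the raw handle; the state type, map and include path are assumptions:

    // Sketch only: PerStreamState, the map and the include path are placeholders.
    #include <cstdint>
    #include <unordered_map>

    #include <cuda_runtime.h>

    #include "nvcv/util/StreamId.hpp" // assumed location of GetCudaStreamIdHint

    struct PerStreamState
    {
        int reuseCount = 0;
    };

    std::unordered_map<uint64_t, PerStreamState> g_perStream;

    PerStreamState &StateFor(cudaStream_t stream)
    {
        // The hint survives handle recycling better than the raw cudaStream_t value,
        // but where it is ambiguous it must be treated as a best-effort key only.
        uint64_t id = nvcv::util::GetCudaStreamIdHint(stream);
        return g_perStream[id];
    }
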
+    for test in "${test_set[@]}"
+    do
+        if [ "$testgroup" == "$test" ] || [ "$test" == "all" ];then
+            echo "Running $testexec test suite..."
+            NVCV_LEAK_DETECTION=abort "$curdir/$testexec"
+            return
+        fi
+    done

-    NVCV_LEAK_DETECTION=abort "$curdir/$testexec"
+    echo "Skipping $testexec test suite..."
 }
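
With this change run_tests.sh takes an optional first argument: a comma-separated list of test groups. Omitting it, or passing "all", keeps the previous behaviour of running every suite, while passing specific group names (the groups are the second argument each run() call site supplies, which is outside this hunk) runs only the matching suites and prints a skip message for the others, e.g. ./run_tests.sh group_a,group_b with hypothetical group names.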