diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index adf28df7..c41e59e5 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,12 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. # You may obtain a copy of the License at +# # http://www.apache.org/licenses/LICENSE-2.0 # -# See the License for the specific language governing permissions and limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-name: CodeQL Analysis +name: "CodeQL" on: push: @@ -17,12 +24,9 @@ on: - cron: '28 22 * * 1' jobs: - codeql-analysis: - name: CodeQL Analysis + analyze: + name: Analyze runs-on: ubuntu-22.04-64core - container: - image: nvidia/cuda:12.2.0-devel-ubuntu22.04 - options: --user root timeout-minutes: 360 permissions: actions: write @@ -35,73 +39,89 @@ jobs: language: [ 'c-cpp', 'javascript-typescript', 'python' ] steps: - - name: Set up Environment - run: | - apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nvidia-open \ - git git-lfs gcc-11 g++-11 ninja-build build-essential ccache libgtest-dev libgmock-dev \ - shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils \ - texlive-latex-extra ghostscript graphviz rsync \ - && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ - && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && cp -r /tmp/cmake-3.20.1-linux-x86_64/bin/ /usr/local/ \ - && cp -r /tmp/cmake-3.20.1-linux-x86_64/share/ /usr/local/ && cp -r /tmp/cmake-3.20.1-linux-x86_64/doc/ /usr/local/ \ - && rm -rf /tmp/cmake-3.20.1* - - - name: Checkout Repository - uses: actions/checkout@v4 - with: - lfs: true - submodules: recursive - - - name: Install Python Dependencies (C/C++) - if: matrix.language == 'c-cpp' - run: | - apt-get update -y && apt-get install -y --no-install-recommends \ - python3 python3-pip python3-dev python3-distutils doxygen \ - && rm -rf /var/lib/apt/lists/* \ - && python3 -m pip install sphinx-rtd-theme sphinx breathe recommonmark graphviz \ - && python3 -m pip install numpy==2.0.1 patchelf==0.17.2.1 \ - && python3 -m pip install cuda-python==12.2.0 \ - && python3 -m pip install -U sphinx - - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: ${{ matrix.language }} - queries: +security-and-quality - - - name: Autobuild (Non C/C++) - if: matrix.language != 'c-cpp' - uses: 
github/codeql-action/autobuild@v3 - - - name: Build CMake Project (C/C++) - if: matrix.language == 'c-cpp' - run: | - echo "Running CMake project build script" - ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON" - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 - with: - category: "/language:${{ matrix.language }}" - - - name: Build and Clean Documentation (C/C++, Push Event) - if: matrix.language == 'c-cpp' && github.event_name == 'push' - run: | - ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_DOCS=ON -DBUILD_PYTHON=ON -DPYTHON_VERSIONS=3.10" - find build/docs/sphinx -name '*.doctree' -delete - find build/docs/sphinx -name '*.map' -delete - find build/docs/sphinx -name '*.pickle' -delete - find build/docs/sphinx -name '*.inv' -delete - find build/docs/sphinx -name '*.gz' -delete - - - name: Create .nojekyll File (C/C++, Push Event) - if: matrix.language == 'c-cpp' && github.event_name == 'push' - run: touch build/docs/sphinx/.nojekyll - - - name: Deploy to GitHub Pages (C/C++, Push Event) - if: matrix.language == 'c-cpp' && github.event_name == 'push' - uses: JamesIves/github-pages-deploy-action@v4 - with: - folder: build/docs/sphinx - branch: gh-pages - clean: true + - name: Checkout repository + uses: actions/checkout@v4 + with: + lfs: true + submodules: 'recursive' + + - if: matrix.language == 'c-cpp' + name: Setup environment + run: | + sudo apt update -y && sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && \ + sudo apt update -y && sudo apt install -y --no-install-recommends \ + git git-lfs gcc-11 g++-11 ninja-build ccache libgtest-dev libgmock-dev \ + shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils \ + texlive-latex-extra ghostscript graphviz \ + && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ + && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && sudo cp -r 
/tmp/cmake-3.20.1-linux-x86_64/bin/ /usr/local/ \ + && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/share/ /usr/local/ && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/doc/ /usr/local/ \ + && rm -rf /tmp/cmake-3.20.1* + + - if: matrix.language == 'c-cpp' + name: Install Python Dependencies + run: | + sudo apt update -y && sudo apt install -y --no-install-recommends \ + python3 python3-pip python3-dev python3-distutils doxygen && sudo rm -rf /var/lib/apt/lists/* \ + && python3 -m pip install sphinx-rtd-theme sphinx breathe recommonmark graphviz \ + && python3 -m pip install numpy==2.0.1 patchelf==0.17.2.1 + + - if: matrix.language == 'c-cpp' + name: Install CUDA Toolkit + uses: Jimver/cuda-toolkit@v0.2.16 + id: cuda-toolkit + with: + cuda: '12.2.0' + linux-local-args: '["--toolkit"]' + + - if: matrix.language == 'c-cpp' + name: Verify CUDA installation + run: | + echo "Installed CUDA version is: ${{ steps.cuda-toolkit.outputs.cuda }}" + echo "CUDA install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}" + sudo ln -s ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/lib64/libcudart.so \ + /usr/lib/x86_64-linux-gnu/libcuda.so + nvcc -V + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - if: matrix.language != 'c-cpp' + name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - if: matrix.language == 'c-cpp' + name: Build CMake project + run: | + echo "Running CMake project build script" + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON" $* + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Build Docs and Clean up Sphinx Build Directory + run: | + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_DOCS=ON -DBUILD_PYTHON=ON -DPYTHON_VERSIONS=3.10" $* + find 
build/docs/sphinx -name '*.doctree' -delete + find build/docs/sphinx -name '*.map' -delete + find build/docs/sphinx -name '*.pickle' -delete + find build/docs/sphinx -name '*.inv' -delete + find build/docs/sphinx -name '*.gz' -delete + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Create .nojekyll file + run: touch build/docs/sphinx/.nojekyll + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Deploy to GitHub Pages + uses: JamesIves/github-pages-deploy-action@v4 + with: + folder: build/docs/sphinx + branch: gh-pages + clean: true diff --git a/CMakeLists.txt b/CMakeLists.txt index 13378bf1..4c2f48cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.11.0 + VERSION 0.12.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) diff --git a/README.md b/README.md index 07f84fec..ddb93741 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.11.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.12.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) @@ -61,27 +61,6 @@ To get a local copy up and running follow these steps. - Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version. - Documentation built on Ubuntu 20.04 needs an up-to-date version of sphinx (`pip install --upgrade sphinx`) as well as explicitly parsing the system's default python version ` ./ci/build_docs path/to/build -DPYTHON_VERSIONS=""`. 
- The Resize and RandomResizedCrop operators incorrectly interpolate pixel values near the boundary of an image or tensor when using cubic interpolation. This will be fixed in an upcoming release. -- The CvtColor operator incorrectly computes the data location of the second chromaticity channel for conversions that involve YUV(420) semi-planar formats. This issue persists through the current release and we intend to address this bug in CV-CUDA v0.12. We do not recommend using these formats.​ - - Known affected formats:​ - - NVCV_COLOR_YUV2RGB_I420​ - - NVCV_COLOR_RGB2YUV_I420​ - - NVCV_COLOR_YUV2BGR_I420​ - - NVCV_COLOR_BGR2YUV_I420​ - - NVCV_COLOR_YUV2RGBA_I420​ - - NVCV_COLOR_RGBA2YUV_I420​ - - NVCV_COLOR_YUV2BGRA_I420​ - - NVCV_COLOR_BGRA2YUV_I420​ - - NVCV_COLOR_RGB2YUV_I420​ - - NVCV_COLOR_YUV2RGB_YV12​ - - NVCV_COLOR_RGB2YUV_YV12​ - - NVCV_COLOR_YUV2BGR_YV12​ - - NVCV_COLOR_BGR2YUV_YV12​ - - NVCV_COLOR_YUV2RGBA_YV12​ - - NVCV_COLOR_RGBA2YUV_YV12​ - - NVCV_COLOR_YUV2BGRA_YV12​ - - NVCV_COLOR_BGRA2YUV_YV12​ - - NVCV_COLOR_RGB2YUV_YV12​ - - NVCV_COLOR_YUV2GRAY_420​ ### Installation diff --git a/bench/BenchCvtColor.cpp b/bench/BenchCvtColor.cpp index 7d6c5c39..de110a61 100644 --- a/bench/BenchCvtColor.cpp +++ b/bench/BenchCvtColor.cpp @@ -21,63 +21,163 @@ #include -template -inline void CvtColor(nvbench::state &state, nvbench::type_list) +#include +#include +#include + +using ConvCodeToFormat = std::tuple; +using CodeMap = std::map; + +inline static ConvCodeToFormat str2Frmt(const std::string &str) +{ + // clang-format off + static const CodeMap codeMap { + { "RGB2BGR", {NVCV_COLOR_RGB2BGR, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8 }}, + { "RGB2RGBA", {NVCV_COLOR_RGB2RGBA, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBA8}}, + { "RGBA2RGB", {NVCV_COLOR_RGBA2RGB, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_RGB8 }}, + { "RGB2GRAY", {NVCV_COLOR_RGB2GRAY, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_Y8 }}, + { "GRAY2RGB", {NVCV_COLOR_GRAY2RGB, NVCV_IMAGE_FORMAT_Y8, 
NVCV_IMAGE_FORMAT_RGB8 }}, + { "RGB2HSV", {NVCV_COLOR_RGB2HSV, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8 }}, + { "HSV2RGB", {NVCV_COLOR_HSV2RGB, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8 }}, + { "RGB2YUV", {NVCV_COLOR_RGB2YUV, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_YUV8 }}, + { "YUV2RGB", {NVCV_COLOR_YUV2RGB, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_RGB8 }}, + {"RGB2YUV_NV12", {NVCV_COLOR_RGB2YUV_NV12, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12 }}, + {"YUV2RGB_NV12", {NVCV_COLOR_YUV2RGB_NV12, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGB8 }}, + }; + // clang-format on + + if (auto it = codeMap.find(str); it != codeMap.end()) + { + return it->second; + } + else + { + throw std::invalid_argument("Unrecognized color code"); + } +} + +template +inline float bytesPerPixel(NVCVImageFormat imgFormat) +{ +#define BPP_CASE(frmt, bytes) \ + case frmt: \ + return bytes * sizeof(BT) + + switch (imgFormat) + { + BPP_CASE(NVCV_IMAGE_FORMAT_RGB8, 3); + BPP_CASE(NVCV_IMAGE_FORMAT_BGR8, 3); + BPP_CASE(NVCV_IMAGE_FORMAT_HSV8, 3); + BPP_CASE(NVCV_IMAGE_FORMAT_RGBA8, 4); + BPP_CASE(NVCV_IMAGE_FORMAT_YUV8, 3); + BPP_CASE(NVCV_IMAGE_FORMAT_NV12, 1.5f); + BPP_CASE(NVCV_IMAGE_FORMAT_Y8, 1); + default: + throw std::invalid_argument("Unrecognized format"); + } +#undef BPP_CASE +} + +// Adapted from src/util/TensorDataUtils.hpp +inline static nvcv::Tensor CreateTensor(int numImages, int imgWidth, int imgHeight, const nvcv::ImageFormat &imgFormat) +{ + if (imgFormat == NVCV_IMAGE_FORMAT_NV12 || imgFormat == NVCV_IMAGE_FORMAT_NV12_ER + || imgFormat == NVCV_IMAGE_FORMAT_NV21 || imgFormat == NVCV_IMAGE_FORMAT_NV21_ER) + { + if (imgHeight % 2 != 0 || imgWidth % 2 != 0) + { + throw std::invalid_argument("Invalid height or width: 420 formats require even dimensions"); + } + + int height420 = (imgHeight * 3) / 2; + + return nvcv::Tensor(numImages, {imgWidth, height420}, nvcv::ImageFormat(NVCV_IMAGE_FORMAT_Y8)); + } + else + { + return nvcv::Tensor(numImages, {imgWidth, imgHeight}, imgFormat); + } +} + +template +inline void 
CvtColor(nvbench::state &state, nvbench::type_list) try { long3 shape = benchutils::GetShape<3>(state.get_string("shape")); long varShape = state.get_int64("varShape"); - using BT = typename nvcv::cuda::BaseType; + ConvCodeToFormat formats = str2Frmt(state.get_string("code")); - int ch = nvcv::cuda::NumElements; + NVCVColorConversionCode code = std::get<0>(formats); + nvcv::ImageFormat inFormat{std::get<1>(formats)}; + nvcv::ImageFormat outFormat{std::get<2>(formats)}; - NVCVColorConversionCode code = ch == 3 ? NVCV_COLOR_BGR2RGB : NVCV_COLOR_BGRA2RGBA; - - state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); - state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_reads(shape.x * shape.y * shape.z * bytesPerPixel(inFormat)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * bytesPerPixel(outFormat)); cvcuda::CvtColor op; - // clang-format off - if (varShape < 0) // negative var shape means use Tensor { - nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); - nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor src = CreateTensor(shape.x, shape.z, shape.y, inFormat); + nvcv::Tensor dst = CreateTensor(shape.x, shape.z, shape.y, outFormat); benchutils::FillTensor(src, benchutils::RandomValues()); - state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code](nvbench::launch &launch) - { - op(launch.get_stream(), src, dst, code); - }); + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &code](nvbench::launch &launch) { op(launch.get_stream(), src, dst, code); }); } else // zero and positive var shape means use ImageBatchVarShape { - nvcv::ImageBatchVarShape src(shape.x); - nvcv::ImageBatchVarShape dst(shape.x); + if (inFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444 + || outFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444) + { + state.skip("Skipping formats that have 
subsampled planes for the varshape benchmark"); + } + + std::vector imgSrc; + std::vector imgDst; + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + std::vector> srcVec(shape.x); - benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, - benchutils::RandomValues()); - dst.pushBack(src.begin(), src.end()); + auto randomValuesU8 = benchutils::RandomValues(); - state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code](nvbench::launch &launch) + for (int i = 0; i < shape.x; i++) { - op(launch.get_stream(), src, dst, code); - }); + imgSrc.emplace_back(nvcv::Size2D{(int)shape.z, (int)shape.y}, inFormat); + imgDst.emplace_back(nvcv::Size2D{(int)shape.z, (int)shape.y}, outFormat); + + int srcRowStride = imgSrc[i].size().w * inFormat.planePixelStrideBytes(0); + int srcBufSize = imgSrc[i].size().h * srcRowStride; + srcVec[i].resize(srcBufSize); + for (int idx = 0; idx < srcBufSize; idx++) + { + srcVec[i][idx] = randomValuesU8(); + } + + auto imgData = imgSrc[i].exportData(); + CUDA_CHECK_ERROR(cudaMemcpy2D(imgData->plane(0).basePtr, imgData->plane(0).rowStride, srcVec[i].data(), + srcRowStride, srcRowStride, imgSrc[i].size().h, cudaMemcpyHostToDevice)); + } + src.pushBack(imgSrc.begin(), imgSrc.end()); + dst.pushBack(imgDst.begin(), imgDst.end()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &code](nvbench::launch &launch) { op(launch.get_stream(), src, dst, code); }); } } + catch (const std::exception &err) { state.skip(err.what()); } -// clang-format on - -using CvtColorTypes = nvbench::type_list; +using BaseTypes = nvbench::type_list; -NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(CvtColorTypes)) - .set_type_axes_names({"InOutDataType"}) - .add_string_axis("shape", {"1x1080x1920"}) +NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(BaseTypes)) + .set_type_axes_names({"BaseType"}) + .add_string_axis("shape", {"1x1080x1920", "64x720x1280"}) + .add_string_axis("code", {"RGB2BGR", "RGB2RGBA", 
"RGBA2RGB", "RGB2GRAY", "GRAY2RGB", "RGB2HSV", "HSV2RGB", + "RGB2YUV", "YUV2RGB", "RGB2YUV_NV12", "YUV2RGB_NV12"}) .add_int64_axis("varShape", {-1, 0}); diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake index 56bf632f..319a157e 100644 --- a/cmake/ConfigCUDA.cmake +++ b/cmake/ConfigCUDA.cmake @@ -32,6 +32,9 @@ set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) # Compress kernels to generate smaller executables set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=--compress-all") +# Enable device lambdas +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda") + if(NOT USE_CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}") diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index 15672135..9ab82fd6 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -43,10 +43,6 @@ lib_path = os.getenv("SPHINX_PYTHON_SRC", default=".") sys.path.insert(0, os.path.abspath(lib_path)) -# -- Module mocking ---------------------------------------------------------- - -autodoc_mock_imports = ['nvcv', 'cvcuda'] - # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 66f03f3f..69d555b1 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -123,7 +123,9 @@ Copyright :maxdepth: 1 :hidden: + v0.12.0-beta v0.11.0-beta + v0.10.1-beta v0.10.0-beta v0.9.0-beta v0.8.0-beta diff --git a/docs/sphinx/relnotes/v0.10.1-beta.rst b/docs/sphinx/relnotes/v0.10.1-beta.rst index 2e03b5b6..a03c4166 100644 --- a/docs/sphinx/relnotes/v0.10.1-beta.rst +++ b/docs/sphinx/relnotes/v0.10.1-beta.rst @@ -1,44 +1,44 @@ -.. - # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - # SPDX-License-Identifier: Apache-2.0 - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. 
- # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - -.. _v0.10.1-beta: - -v0.10.1-beta -============ - -Release Highlights ------------------- - -CV-CUDA v0.10.1 reverts the OpCvtColor performance improvements introduced in v0.10.0 due to discovered bugs. -These optimizations will be reintroduced, with consolidated testing, in a future release. - -License -------- - -CV-CUDA is licensed under the `Apache 2.0 `_ license. - -Resources ---------- - -1. `CV-CUDA GitHub `_ -2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ -3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ -4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ - -Acknowledgements ----------------- - -CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ +.. _v0.10.1-beta: + +v0.10.1-beta +============ + +Release Highlights +------------------ + +CV-CUDA v0.10.1 reverts the OpCvtColor performance improvements introduced in v0.10.0 due to discovered bugs. +These optimizations will be reintroduced, with consolidated testing, in a future release. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/docs/sphinx/relnotes/v0.11.0-beta.rst b/docs/sphinx/relnotes/v0.11.0-beta.rst index 9f50e975..2957e40e 100644 --- a/docs/sphinx/relnotes/v0.11.0-beta.rst +++ b/docs/sphinx/relnotes/v0.11.0-beta.rst @@ -38,28 +38,9 @@ Compatibility and Known Limitations * **Pre-existing limitations**: - * The CvtColor operator incorrectly computes the data location of the second chromaticity channel for conversions that involve YUV(420) semi-planar formats. This issue persists through the current release and we intend to address this bug in CV-CUDA v0.12. 
We do not recommend using these formats.​ - - * Known affected formats:​ - * NVCV_COLOR_YUV2RGB_I420​ - * NVCV_COLOR_RGB2YUV_I420​ - * NVCV_COLOR_YUV2BGR_I420​ - * NVCV_COLOR_BGR2YUV_I420​ - * NVCV_COLOR_YUV2RGBA_I420​ - * NVCV_COLOR_RGBA2YUV_I420​ - * NVCV_COLOR_YUV2BGRA_I420​ - * NVCV_COLOR_BGRA2YUV_I420​ - * NVCV_COLOR_RGB2YUV_I420​ - * NVCV_COLOR_YUV2RGB_YV12​ - * NVCV_COLOR_RGB2YUV_YV12​ - * NVCV_COLOR_YUV2BGR_YV12​ - * NVCV_COLOR_BGR2YUV_YV12​ - * NVCV_COLOR_YUV2RGBA_YV12​ - * NVCV_COLOR_RGBA2YUV_YV12​ - * NVCV_COLOR_YUV2BGRA_YV12​ - * NVCV_COLOR_BGRA2YUV_YV12​ - * NVCV_COLOR_RGB2YUV_YV12​ - * NVCV_COLOR_YUV2GRAY_420​ + * We note a bug in the YUV(420) color conversion API (NVCV_COLOR_RGB2YUV_I420) which incorrectly computes the U and V plane index​ + + * This persists through this release and we intend to address this bug in CV-CUDA v0.12.0​ For the full list, see main README on `CV-CUDA GitHub `_. diff --git a/docs/sphinx/relnotes/v0.12.0-beta.rst b/docs/sphinx/relnotes/v0.12.0-beta.rst new file mode 100644 index 00000000..2fe84ca5 --- /dev/null +++ b/docs/sphinx/relnotes/v0.12.0-beta.rst @@ -0,0 +1,60 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. 
_v0.12.0-beta: + +v0.12.0-beta +============ + +Release Highlights +------------------ + +CV-CUDA v0.12.0 includes critical bug fixes alongside the following changes:​ + +* **New Features**:​ + + * Increased functional test coverage of color conversions. ​ + * Reintroduced from 24.07: Improved performance of color conversion operators (e.g., 2x faster RGB2YUV). + +* **Bug Fixes**:​ + + * Fixed bug in YUV(420) conversions: The CvtColor operator incorrectly computed the data location of the second chromaticity channel for conversions.​ + * Fixed bug in YUV(422) conversions: The CvtColor operator incorrectly interpreted the interleaved YUV(422) data layout as a three-channel tensor.​ + * Prevent CV_16F alpha addition: some color conversions in the CvtColor operator allowed for the addition of an alpha channel to the destination tensor, which is undefined for the CV_16F data type. + + +Compatibility and Known Limitations +----------------------------------- + +For the full list, see main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. 
diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index e207fd30..f7f17b5b 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -136,4 +136,3 @@ python3 $SAMPLES_DIR/label/python/main.py -o "$LABEL_RUN_DIR" # Run it with batch size 1 on a single image LABEL_RUN_DIR=$(create_output_dir "$LABEL_OUT_DIR") python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$LABEL_RUN_DIR" - diff --git a/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp b/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp index 2a7d8bd3..c413a468 100644 --- a/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp +++ b/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp @@ -220,7 +220,11 @@ class Vector } // On-purpose public data to allow POD-class direct initialization. +#ifdef __CUDA_ARCH__ T m_data[N]; +#else + T m_data[N] = {}; +#endif }; /** diff --git a/src/cvcuda/priv/legacy/cvt_color.cu b/src/cvcuda/priv/legacy/cvt_color.cu index 332ff865..d202ac54 100644 --- a/src/cvcuda/priv/legacy/cvt_color.cu +++ b/src/cvcuda/priv/legacy/cvt_color.cu @@ -27,9 +27,13 @@ #include -static constexpr float B2YF = 0.114f; -static constexpr float G2YF = 0.587f; -static constexpr float R2YF = 0.299f; +// NOTE: Below are the "standard" (NTSC and ITU Rec.601) RGB to luma conversion +// coefficients. More accurate coefficients, given as comments on the right, are +// found at http://www.brucelindbloom.com/index.html?WorkingSpaceInfo.html and +// https://www.imagemagick.org/include/api/pixel.php. 
+static constexpr float R2YF = 0.299f; // 0.298839 +static constexpr float G2YF = 0.587f; // 0.586811 +static constexpr float B2YF = 0.114f; // 0.114350 static constexpr int gray_shift = 15; static constexpr int yuv_shift = 14; @@ -43,8 +47,8 @@ static constexpr int B2Y = 1868; // == B2YF*16384 static constexpr int R2VI = 14369; // == R2VF*16384 static constexpr int B2UI = 8061; // == B2UF*16384 -static constexpr float B2UF = 0.492f; -static constexpr float R2VF = 0.877f; +static constexpr float B2UF = 0.492f; // 0.492111: U = (B - Y) * B2UF + 0.5 +static constexpr float R2VF = 0.877f; // 0.877283: V = (R - Y) * R2VF + 0.5 static constexpr int U2BI = 33292; static constexpr int U2GI = -6472; @@ -77,354 +81,541 @@ static constexpr int ITUR_BT_601_CBV = -74448; #define BLOCK 32 +#define DEVICE_INLINE __device__ __forceinline__ +#define GLOBAL_BOUNDS __global__ __launch_bounds__(Policy::BlockSize) + +template> +constexpr BT Alpha = std::is_floating_point_v ? 1 : nvcv::cuda::TypeTraits::max; + namespace nvcv::legacy::cuda_op { -template -__global__ void rgb_to_bgr_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int sch, int dch, int bidx) +template +using TensorWrap3D = nvcv::cuda::Tensor3DWrap; + +template +using TensorWrap4D = nvcv::cuda::Tensor4DWrap; + +template +struct CvtKernelPolicy { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; + static_assert(BlockWidth_ % 32 == 0); + static constexpr int BlockWidth = BlockWidth_; + static constexpr int BlockHeight = BlockHeight_; + static constexpr int BlockSize = BlockWidth * BlockHeight; + static constexpr int RowsPerThread = RowsPerThread_; + static constexpr int TileWidth = BlockWidth; + static constexpr int TileHeight = BlockHeight * RowsPerThread; + static constexpr int ThreadRowStride = BlockHeight; +}; + +template +DEVICE_INLINE void color_conversion_common(LoadOpT load_op, ConvOpT conv_op, 
StoreOpT store_op, int2 size) +{ + const int x = blockIdx.x * Policy::TileWidth + threadIdx.x; + const int y0 = blockIdx.y * Policy::TileHeight + threadIdx.y; const int batch_idx = get_batch_idx(); + if (x >= size.x) + { + return; + } - T b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - T g = *src.ptr(batch_idx, dst_y, dst_x, 1); - T r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - - *dst.ptr(batch_idx, dst_y, dst_x, 0) = b; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = r; + // Branchless efficient path for inner blocks. + if (y0 + Policy::TileHeight <= size.y) + { + EltT r_in[Policy::RowsPerThread][N_IN]; + EltT r_out[Policy::RowsPerThread][N_OUT]; - if (dch == 4) +#pragma unroll + for (int i = 0; i < Policy::RowsPerThread; i++) + { + const int y = y0 + Policy::ThreadRowStride * i; + load_op(r_in[i], batch_idx, x, y); + } +#pragma unroll + for (int i = 0; i < Policy::RowsPerThread; i++) conv_op(r_in[i], r_out[i]); +#pragma unroll + for (int i = 0; i < Policy::RowsPerThread; i++) + { + const int y = y0 + Policy::ThreadRowStride * i; + store_op(r_out[i], batch_idx, x, y); + } + } + else { - T al = sch == 4 ? 
*src.ptr(batch_idx, dst_y, dst_x, 3) : cuda::TypeTraits::max; - *dst.ptr(batch_idx, dst_y, dst_x, 3) = al; + int y = y0; + for (int i = 0; i < Policy::RowsPerThread && y < size.y; i++) + { + EltT r_in[N_IN]; + EltT r_out[N_OUT]; + + load_op(r_in, batch_idx, x, y); + conv_op(r_in, r_out); + store_op(r_out, batch_idx, x, y); + + y += Policy::ThreadRowStride; + } } } -template -__global__ void gray_to_bgr_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dch) +template +DEVICE_INLINE void load3_nhwc(const TensorWrap3D &src, EltT &C0, EltT &C1, EltT &C2, int batch_idx, + int x, int y) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); + SrcT vec = *src.ptr(batch_idx, y, x); + C0 = vec.x; + C1 = vec.y; + C2 = vec.z; +} - T g = *src.ptr(batch_idx, dst_y, dst_x, 0); +template +DEVICE_INLINE void store3_nhwc(const TensorWrap3D &dst, EltT C0, EltT C1, EltT C2, int batch_idx, int x, + int y) +{ + DstT vec; + vec.x = C0; + vec.y = C1; + vec.z = C2; + *dst.ptr(batch_idx, y, x) = vec; +} - *dst.ptr(batch_idx, dst_y, dst_x, 0) = g; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = g; - if (dch == 4) +template +DEVICE_INLINE void load_bgra_nhwc(const TensorWrap3D &src, EltT &B, EltT &G, EltT &R, EltT &A, + int batch_idx, int x, int y, int bidx) +{ + SrcT vec = *src.ptr(batch_idx, y, x); + B = bidx == 0 ? vec.x : vec.z; + G = vec.y; + R = bidx == 0 ? 
vec.z : vec.x; + if constexpr (nvcv::cuda::NumComponents == 4) { - *dst.ptr(batch_idx, dst_y, dst_x, 3) = g; + A = vec.w; + } + else + { + A = Alpha; } } -template -__global__ void bgr_to_gray_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +DEVICE_INLINE void store_bgra_nhwc(const TensorWrap3D &dst, EltT B, EltT G, EltT R, EltT A, + int batch_idx, int x, int y, int bidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - int b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - int g = *src.ptr(batch_idx, dst_y, dst_x, 1); - int r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); + DstT vec; + vec.x = bidx == 0 ? B : R; + vec.y = G; + vec.z = bidx == 0 ? R : B; + if constexpr (nvcv::cuda::NumComponents == 4) + { + vec.w = A; + } + *dst.ptr(batch_idx, y, x) = vec; +} - T gray = (T)CV_DESCALE(b * BY15 + g * GY15 + r * RY15, gray_shift); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = gray; +template +GLOBAL_BOUNDS void rgb_to_bgr_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx) +{ + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src, bidx] __device__(EltT(&r_in)[4], int batch_idx, int x, int y) + { load_bgra_nhwc(src, r_in[0], r_in[1], r_in[2], r_in[3], batch_idx, x, y, bidx); }, + [] __device__(const EltT(&r_in)[4], EltT(&r_out)[4]) + { +#pragma unroll + for (int i = 0; i < 4; i++) r_out[i] = r_in[i]; + }, + [&dst] __device__(const EltT(&r_out)[4], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_out[0], r_out[1], r_out[2], r_out[3], batch_idx, x, y, 0); }, + dstSize); } -template -__global__ void bgr_to_gray_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +GLOBAL_BOUNDS void gray_to_bgr_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int 
dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - T g = *src.ptr(batch_idx, dst_y, dst_x, 1); - T r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src] __device__(EltT(&r_gray)[1], int batch_idx, int x, int y) { r_gray[0] = *src.ptr(batch_idx, y, x); }, + [] __device__(const EltT(&r_gray)[1], EltT(&r_BGRA)[4]) + { +#pragma unroll + for (int i = 0; i < 4; i++) r_BGRA[i] = r_gray[0]; + }, + [&dst] __device__(const EltT(&r_BGRA)[4], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_BGRA[0], r_BGRA[1], r_BGRA[2], r_BGRA[3], batch_idx, x, y, 0); }, + dstSize); +} - T gray = (T)(b * B2YF + g * G2YF + r * R2YF); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = gray; +template +GLOBAL_BOUNDS void bgr_to_gray_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx) +{ + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src, bidx] __device__(EltT(&r_BGR)[3], int batch_idx, int x, int y) + { + EltT A; + load_bgra_nhwc(src, r_BGR[0], r_BGR[1], r_BGR[2], A, batch_idx, x, y, bidx); + }, + [] __device__(const EltT(&r_BGR)[3], EltT(&r_gray)[1]) + { + if constexpr (std::is_integral_v) + r_gray[0] + = (EltT)CV_DESCALE((int)r_BGR[0] * BY15 + (int)r_BGR[1] * GY15 + (int)r_BGR[2] * RY15, gray_shift); + else + r_gray[0] = (EltT)(r_BGR[0] * B2YF + r_BGR[1] * G2YF + r_BGR[2] * R2YF); + }, + [&dst] __device__(const EltT(&r_gray)[1], int batch_idx, int x, int y) + { *dst.ptr(batch_idx, y, x) = r_gray[0]; }, + dstSize); } -template -__global__ void bgr_to_yuv_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +DEVICE_INLINE void bgr_to_yuv_int(T B_, T G_, T R_, T &Y_, T &Cb_, T &Cr_) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || 
dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - int B = *src.ptr(batch_idx, dst_y, dst_x, bidx); - int G = *src.ptr(batch_idx, dst_y, dst_x, 1); - int R = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - - int C0 = R2Y, C1 = G2Y, C2 = B2Y, C3 = R2VI, C4 = B2UI; - int delta = ((T)(cuda::TypeTraits::max / 2 + 1)) * (1 << yuv_shift); - int Y = CV_DESCALE(R * C0 + G * C1 + B * C2, yuv_shift); - int Cr = CV_DESCALE((R - Y) * C3 + delta, yuv_shift); - int Cb = CV_DESCALE((B - Y) * C4 + delta, yuv_shift); - - *dst.ptr(batch_idx, dst_y, dst_x, 0) = cuda::SaturateCast(Y); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = cuda::SaturateCast(Cb); - *dst.ptr(batch_idx, dst_y, dst_x, 2) = cuda::SaturateCast(Cr); + constexpr int C0 = R2Y, C1 = G2Y, C2 = B2Y, C3 = R2VI, C4 = B2UI; + constexpr int delta = ((T)(cuda::TypeTraits::max / 2 + 1)) << yuv_shift; + + const int B = B_, G = G_, R = R_; + + const int Y = CV_DESCALE(R * C0 + G * C1 + B * C2, yuv_shift); + const int Cr = CV_DESCALE((R - Y) * C3 + delta, yuv_shift); + const int Cb = CV_DESCALE((B - Y) * C4 + delta, yuv_shift); + + Y_ = cuda::SaturateCast(Y); + Cb_ = cuda::SaturateCast(Cb); + Cr_ = cuda::SaturateCast(Cr); } -template -__global__ void bgr_to_yuv_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +DEVICE_INLINE void bgr_to_yuv_float(float B, float G, float R, float &Y, float &Cb, float &Cr) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T B = *src.ptr(batch_idx, dst_y, dst_x, bidx); - T G = *src.ptr(batch_idx, dst_y, dst_x, 1); - T R = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - - T C0 = R2YF, C1 = G2YF, C2 = B2YF, C3 = R2VF, C4 = B2UF; - T delta = 0.5f; - T Y = R * C0 + G * C1 + B * C2; - T Cr = (R - Y) * C3 + delta; - T Cb = (B - Y) * C4 + delta; - *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; - *dst.ptr(batch_idx, dst_y, 
dst_x, 1) = Cb; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = Cr; + constexpr float C0 = R2YF, C1 = G2YF, C2 = B2YF, C3 = R2VF, C4 = B2UF; + constexpr float delta = 0.5f; + + Y = R * C0 + G * C1 + B * C2; + Cr = (R - Y) * C3 + delta; + Cb = (B - Y) * C4 + delta; } -template -__global__ void yuv_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +GLOBAL_BOUNDS void bgr_to_yuv_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - T Cb = *src.ptr(batch_idx, dst_y, dst_x, 1); - T Cr = *src.ptr(batch_idx, dst_y, dst_x, 2); - - int C0 = V2RI, C1 = V2GI, C2 = U2GI, C3 = U2BI; - int delta = ((T)(cuda::TypeTraits::max / 2 + 1)); - int b = Y + CV_DESCALE((Cb - delta) * C3, yuv_shift); - int g = Y + CV_DESCALE((Cb - delta) * C2 + (Cr - delta) * C1, yuv_shift); - int r = Y + CV_DESCALE((Cr - delta) * C0, yuv_shift); - - *dst.ptr(batch_idx, dst_y, dst_x, bidx) = cuda::SaturateCast(b); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = cuda::SaturateCast(g); - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = cuda::SaturateCast(r); + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src, bidx] __device__(EltT(&r_BGR)[3], int batch_idx, int x, int y) + { + EltT A; + load_bgra_nhwc(src, r_BGR[0], r_BGR[1], r_BGR[2], A, batch_idx, x, y, bidx); + }, + [] __device__(const EltT(&r_BGR)[3], EltT(&r_YCbCr)[3]) + { + if constexpr (std::is_integral_v) + bgr_to_yuv_int(r_BGR[0], r_BGR[1], r_BGR[2], r_YCbCr[0], r_YCbCr[1], r_YCbCr[2]); + else + bgr_to_yuv_float(r_BGR[0], r_BGR[1], r_BGR[2], r_YCbCr[0], r_YCbCr[1], r_YCbCr[2]); + }, + [&dst] __device__(const EltT(&r_YCbCr)[3], int batch_idx, int x, int y) + { store3_nhwc(dst, r_YCbCr[0], r_YCbCr[1], r_YCbCr[2], batch_idx, x, y); }, + 
dstSize); } -template -__global__ void yuv_to_bgr_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +DEVICE_INLINE void yuv_to_bgr_int(T Y_, T Cb_, T Cr_, T &B_, T &G_, T &R_) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - T Cb = *src.ptr(batch_idx, dst_y, dst_x, 1); - T Cr = *src.ptr(batch_idx, dst_y, dst_x, 2); + constexpr int C0 = V2RI, C1 = V2GI, C2 = U2GI, C3 = U2BI; + constexpr int delta = ((T)(cuda::TypeTraits::max / 2 + 1)); - T C0 = V2RF, C1 = V2GF, C2 = U2GF, C3 = U2BF; - T delta = 0.5f; - T b = Y + (Cb - delta) * C3; - T g = Y + (Cb - delta) * C2 + (Cr - delta) * C1; - T r = Y + (Cr - delta) * C0; + const int Y = Y_, Cb = Cb_, Cr = Cr_; + const int B = Y + CV_DESCALE((Cb - delta) * C3, yuv_shift); + const int G = Y + CV_DESCALE((Cb - delta) * C2 + (Cr - delta) * C1, yuv_shift); + const int R = Y + CV_DESCALE((Cr - delta) * C0, yuv_shift); - *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; + B_ = cuda::SaturateCast(B); + G_ = cuda::SaturateCast(G); + R_ = cuda::SaturateCast(R); } -template -__global__ void bgr_to_hsv_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx, bool isFullRange) +DEVICE_INLINE void yuv_to_bgr_flt(float Y, float Cb, float Cr, float &B, float &G, float &R) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); + constexpr float C0 = V2RF, C1 = V2GF, C2 = U2GF, C3 = U2BF; + constexpr float delta = 0.5f; - int b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - int g = *src.ptr(batch_idx, dst_y, dst_x, 1); - int r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 
2); - int hrange = isFullRange ? 256 : 180; - int hr = hrange; - const int hsv_shift = 12; - int h, s, v = b; - int vmin = b; - int vr, vg; - - v = cuda::max(v, g); - v = cuda::max(v, r); - vmin = cuda::min(vmin, g); - vmin = cuda::min(vmin, r); - - unsigned char diff = cuda::SaturateCast(v - vmin); - vr = v == r ? -1 : 0; - vg = v == g ? -1 : 0; - - int hdiv_table = diff == 0 ? 0 : cuda::SaturateCast((hrange << hsv_shift) / (6. * diff)); - int sdiv_table = v == 0 ? 0 : cuda::SaturateCast((255 << hsv_shift) / (1. * v)); - s = (diff * sdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; - h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); - h = (h * hdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; - h += h < 0 ? hr : 0; - - *dst.ptr(batch_idx, dst_y, dst_x, 0) = cuda::SaturateCast(h); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = (unsigned char)s; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = (unsigned char)v; + B = Y + (Cb - delta) * C3; + G = Y + (Cb - delta) * C2 + (Cr - delta) * C1; + R = Y + (Cr - delta) * C0; } -template -__global__ void bgr_to_hsv_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +GLOBAL_BOUNDS void yuv_to_bgr_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src] __device__(EltT(&r_YCbCr)[3], int batch_idx, int x, int y) + { load3_nhwc(src, r_YCbCr[0], r_YCbCr[1], r_YCbCr[2], batch_idx, x, y); }, + [] __device__(const EltT(&r_YCbCr)[3], EltT(&r_BGR)[3]) + { + if constexpr (std::is_integral_v) + yuv_to_bgr_int(r_YCbCr[0], r_YCbCr[1], r_YCbCr[2], r_BGR[0], r_BGR[1], r_BGR[2]); + else + yuv_to_bgr_flt(r_YCbCr[0], r_YCbCr[1], r_YCbCr[2], r_BGR[0], r_BGR[1], r_BGR[2]); + }, + [&dst, bidx] 
__device__(const EltT(&r_BGR)[3], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_BGR[0], r_BGR[1], r_BGR[2], Alpha, batch_idx, x, y, bidx); }, + dstSize); +} - float b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - float g = *src.ptr(batch_idx, dst_y, dst_x, 1); - float r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - float h, s, v; - float hrange = 360.0; - float hscale = hrange * (1.f / 360.f); - - float vmin, diff; - - v = vmin = r; - if (v < g) - v = g; - if (v < b) - v = b; - if (vmin > g) - vmin = g; - if (vmin > b) - vmin = b; - - diff = v - vmin; - s = diff / (float)(fabs(v) + FLT_EPSILON); - diff = (float)(60. / (diff + FLT_EPSILON)); - if (v == r) - h = (g - b) * diff; - else if (v == g) - h = (b - r) * diff + 120.f; - else - h = (r - g) * diff + 240.f; +DEVICE_INLINE void bgr_to_hsv_uchar(uchar b8, uchar g8, uchar r8, uchar &h8, uchar &s8, uchar &v8, bool isFullRange) +{ + const int hrange = isFullRange ? 256 : 180; + const int hsv_shift = 12; - if (h < 0) - h += 360.f; + const int b = (int)b8; + const int g = (int)g8; + const int r = (int)r8; - *dst.ptr(batch_idx, dst_y, dst_x, 0) = h * hscale; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = s; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = v; + const int vmin = cuda::min(b, cuda::min(g, r)); + const int v = cuda::max(b, cuda::max(g, r)); + + const int diff = v - vmin; + const int vr = v == r ? -1 : 0; + const int vg = v == g ? -1 : 0; + + const int hdiv_table = diff == 0 ? 0 : cuda::SaturateCast((hrange << hsv_shift) / (6.f * diff)); + const int sdiv_table = v == 0 ? 0 : cuda::SaturateCast((255 << hsv_shift) / (float)v); + + const int s = (diff * sdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; + int h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); + + h = (h * hdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; + h += h < 0 ? 
hrange : 0; + + h8 = cuda::SaturateCast(h); + s8 = (uint8_t)s; + v8 = (uint8_t)v; +} + +DEVICE_INLINE void bgr_to_hsv_float(float b, float g, float r, float &h, float &s, float &v) +{ + float vmin = cuda::min(r, cuda::min(g, b)); + v = cuda::max(r, cuda::max(g, b)); + float diff = v - vmin; + s = diff / (fabs(v) + FLT_EPSILON); + diff = 60.f / (diff + FLT_EPSILON); + + // clang-format off + if (v == r) h = (g - b) * diff; + else if (v == g) h = (b - r) * diff + 120.f; + else h = (r - g) * diff + 240.f; + + if (h < 0.f) h += 360.f; + // clang-format on +} + +template +GLOBAL_BOUNDS void bgr_to_hsv_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx, bool isFullRange) +{ + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src, bidx] __device__(EltT(&r_BGR)[3], int batch_idx, int x, int y) + { + EltT A; + load_bgra_nhwc(src, r_BGR[0], r_BGR[1], r_BGR[2], A, batch_idx, x, y, bidx); + }, + [isFullRange] __device__(const EltT(&r_BGR)[3], EltT(&r_HSV)[3]) + { + if constexpr (std::is_integral_v) + bgr_to_hsv_uchar(r_BGR[0], r_BGR[1], r_BGR[2], r_HSV[0], r_HSV[1], r_HSV[2], isFullRange); + else + bgr_to_hsv_float(r_BGR[0], r_BGR[1], r_BGR[2], r_HSV[0], r_HSV[1], r_HSV[2]); + }, + [&dst] __device__(const EltT(&r_HSV)[3], int batch_idx, int x, int y) + { store3_nhwc(dst, r_HSV[0], r_HSV[1], r_HSV[2], batch_idx, x, y); }, + dstSize); } -__device__ inline void HSV2RGB_native(float h, float s, float v, float &b, float &g, float &r, const float hscale) +template +DEVICE_INLINE T select4_reg(const T (&tab)[4], int idx) +{ + // Random access in a register array of size 4, with 6 instructions. + // The compiler was generating 10 instructions for tab[idx]. + T out; + out = idx == 1 ? tab[1] : tab[0]; + out = idx == 2 ? tab[2] : out; + out = idx == 3 ? 
tab[3] : out; + return out; +} + +DEVICE_INLINE void hsv_to_bgr_float(float h, float s, float v, float &b, float &g, float &r) { if (s == 0) b = g = r = v; else { - static const int sector_data[][3] = { - {1, 3, 0}, - {1, 0, 2}, - {3, 0, 1}, - {0, 2, 1}, - {0, 1, 3}, - {2, 1, 0} - }; - float tab[4]; - int sector; - h *= hscale; - h = fmod(h, 6.f); - sector = (int)floor(h); - h -= sector; - if ((unsigned)sector >= 6u) - { - sector = 0; - h = 0.f; - } + h += 6 * (h < 0); + int idx = static_cast(h); // Sector index. + h -= idx; // Fractional part of h. + idx = (idx % 6) << 2; // Shift index for sector LUT. + + // clang-format off + const float tab[4] {v, + v * (1 - s), + v * (1 - s * h), + v * (1 - s * (1 - h))}; + // clang-format on - tab[0] = v; - tab[1] = v * (1.f - s); - tab[2] = v * (1.f - s * h); - tab[3] = v * (1.f - s * (1.f - h)); + constexpr int32_t idx_lutb = 0x00200311; + constexpr int32_t idx_lutg = 0x00112003; + constexpr int32_t idx_lutr = 0x00031120; - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; + b = select4_reg(tab, (idx_lutb >> idx) & 0xf); + g = select4_reg(tab, (idx_lutg >> idx) & 0xf); + r = select4_reg(tab, (idx_lutr >> idx) & 0xf); } } -template -__global__ void hsv_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx, int dcn, bool isFullRange) +template +GLOBAL_BOUNDS void hsv_to_bgr_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx, bool isFullRange) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src] __device__(EltT(&r_HSV)[3], int batch_idx, int x, int y) + { load3_nhwc(src, r_HSV[0], r_HSV[1], r_HSV[2], batch_idx, x, y); }, + [isFullRange] __device__(const EltT(&r_HSV)[3], EltT(&r_BGR)[3]) + { + if constexpr 
(std::is_same_v) + { + const float scaleH = isFullRange ? (6.0f / 256.0f) : (6.0f / 180.0f); + constexpr float scaleSV = 1.0f / 255.0f; + + float Bf, Gf, Rf; + + hsv_to_bgr_float((float)r_HSV[0] * scaleH, r_HSV[1] * scaleSV, r_HSV[2] * scaleSV, Bf, Gf, Rf); + + r_BGR[0] = cuda::SaturateCast(Bf * 255.0f); + r_BGR[1] = cuda::SaturateCast(Gf * 255.0f); + r_BGR[2] = cuda::SaturateCast(Rf * 255.0f); + } + else + { + constexpr float scaleH = 6.0f / 360.0f; + + hsv_to_bgr_float(r_HSV[0] * scaleH, r_HSV[1], r_HSV[2], r_BGR[0], r_BGR[1], r_BGR[2]); + } + }, + [&dst, bidx] __device__(const EltT(&r_BGR)[3], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_BGR[0], r_BGR[1], r_BGR[2], Alpha, batch_idx, x, y, bidx); }, + dstSize); +} + +template +DEVICE_INLINE void load_yuv420(const nvcv::cuda::Tensor4DWrap &src, EltT &Y, EltT &U, EltT &V, + int2 size, int batch_idx, int x, int y, int uidx) +{ + if constexpr (IsSemiPlanar) + { + // U and V are subsampled at half the full resolution (in both x and y), combined (i.e., interleaved), and + // arranged as full rows after the full resolution Y data. Example memory layout for 4 x 4 image (NV12): + // Y_00 Y_01 Y_02 Y_03 + // Y_10 Y_11 Y_12 Y_13 + // Y_20 Y_21 Y_22 Y_23 + // Y_30 Y_31 Y_32 Y_33 + // U_00 V_00 U_02 V_02 + // U_20 V_20 U_22 V_22 + // Each U and V value corresponds to a 2x2 block of Y values--e.g. U_00 and V_00 correspond to Y_00, Y_01, Y_10, + // and Y_11. Each full U-V row represents 2 rows of Y values. Some layouts (e.g., NV21) swap the location + // of the U and V values in each U-V pair (indicated by the uidx parameter). + + const int uv_y = size.y + y / 2; // The interleaved U-V semi-plane is 1/2 the height of the Y data. + const int uv_x = (x & ~1); // Convert x to even # (set lowest bit to 0). + + Y = *src.ptr(batch_idx, y, x); // Y (luma) is at full resolution. 
+ U = *src.ptr(batch_idx, uv_y, uv_x + uidx); // Some formats swap the U and V elements (as indicated + V = *src.ptr(batch_idx, uv_y, uv_x + (uidx ^ 1)); // by the uidx parameter). + } + else + { + // U and V are subsampled at half the full resolution (in both x and y) and arranged as non-interleaved planes + // (i.e., planar format). Each subsampled U and V "plane" is arranged as full rows after the full resolution Y + // data--so two consecutive subsampled U or V rows are combined into one row spanning the same width as the Y + // plane. Example memory layout for 4 x 4 image (e.g. I420): + // Y_00 Y_01 Y_02 Y_03 + // Y_10 Y_11 Y_12 Y_13 + // Y_20 Y_21 Y_22 Y_23 + // Y_30 Y_31 Y_32 Y_33 + // U_00 U_02 U_20 U_22 + // V_00 V_02 V_20 V_22 + // Each U and V value corresponds to a 2x2 block of Y values--e.g. U_00 and V_00 correspond to Y_00, Y_01, Y_10, + // and Y_11. Each full U and V row represents 4 rows of Y values. Some layouts (e.g., YV12) swap the location + // of the U and V planes (indicated by the uidx parameter). + + const int by = size.y + y / 4; // Base row coordinate for U and V: subsampled plane is 1/4 the height. + const int h4 = size.y / 4; // Height (# of rows) of each subsampled U and V plane. + + // Compute x position that combines two subsampled rows into one. + const int uv_x = (x / 2) + ((size.x / 2) & -((y / 2) & 1)); // Second half of row for odd y coordinates. + + Y = *src.ptr(batch_idx, y, x); // Y (luma) is at full resolution. + U = *src.ptr(batch_idx, by + h4 * uidx, uv_x); // Some formats swap the U and V "planes" (as indicated + V = *src.ptr(batch_idx, by + h4 * (uidx ^ 1), uv_x); // by the uidx parameter). 
+ } +} - float h = *src.ptr(batch_idx, dst_y, dst_x, 0); - float s = *src.ptr(batch_idx, dst_y, dst_x, 1) * (1.0f / 255.0f); - float v = *src.ptr(batch_idx, dst_y, dst_x, 2) * (1.0f / 255.0f); +template +DEVICE_INLINE void store_yuv420(const TensorWrap4D &dst, EltT Y, EltT U, EltT V, int2 size, + int batch_idx, int x, int y, int uidx) +{ + if constexpr (IsSemiPlanar) + { + // See YUV420 semi-planar layout commments in load_yuv420 above. + *dst.ptr(batch_idx, y, x) = Y; // Y (luma) is at full resolution. + if (y % 2 == 0 && x % 2 == 0) + { + const int uv_y = size.y + y / 2; // The interleaved U-V semi-plane is 1/2 the height of the Y data. + const int uv_x = (x & ~1); // Convert x to even # (set lowest bit to 0). - float hrange = isFullRange ? 255 : 180; - unsigned char alpha = cuda::TypeTraits::max; - float hs = 6.f / hrange; + *dst.ptr(batch_idx, uv_y, uv_x + uidx) = U; // Some formats swap the U and V elements (as indicated + *dst.ptr(batch_idx, uv_y, uv_x + (uidx ^ 1)) = V; // by the uidx parameter). + } + } + else + { + // See YUV420 planar layout commments in load_yuv420 above. + *dst.ptr(batch_idx, y, x, 0) = Y; // Y (luma) is at full resolution. + if (y % 2 == 0 && x % 2 == 0) + { + const int by = size.y + y / 4; // Base row coordinate for U and V: subsampled plane is 1/4 the height. + const int h4 = size.y / 4; // Height (# of rows) of each subsampled U and V plane. - float b, g, r; - HSV2RGB_native(h, s, v, b, g, r, hs); + // Compute x position that combines two subsampled rows into one. + const int uv_x = (x / 2) + ((size.x / 2) & -((y / 2) & 1)); // Second half of row for odd y coordinates. 
- *dst.ptr(batch_idx, dst_y, dst_x, bidx) = cuda::SaturateCast(b * 255.0f); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = cuda::SaturateCast(g * 255.0f); - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = cuda::SaturateCast(r * 255.0f); - if (dcn == 4) - *dst.ptr(batch_idx, dst_y, dst_x, 3) = alpha; + *dst.ptr(batch_idx, by + h4 * uidx, uv_x) = U; // Some formats swap the U and V "planes" (as indicated + *dst.ptr(batch_idx, by + h4 * (uidx ^ 1), uv_x) = V; // by the uidx parameter). + } + } } -template -__global__ void hsv_to_bgr_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx, int dcn) +DEVICE_INLINE void bgr_to_yuv42xxp(const uchar &b, const uchar &g, const uchar &r, uchar &Y, uchar &U, uchar &V) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - - float h = *src.ptr(batch_idx, dst_y, dst_x, 0); - float s = *src.ptr(batch_idx, dst_y, dst_x, 1); - float v = *src.ptr(batch_idx, dst_y, dst_x, 2); + const int shifted16 = (16 << ITUR_BT_601_SHIFT); + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16; - float hrange = 360.0; - float alpha = 1.f; - float hs = 6.f / hrange; + Y = cuda::SaturateCast(yy >> ITUR_BT_601_SHIFT); - float b, g, r; - HSV2RGB_native(h, s, v, b, g, r, hs); + const int shifted128 = (128 << ITUR_BT_601_SHIFT); + int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128; + int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128; - *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; - if (dcn == 4) - *dst.ptr(batch_idx, dst_y, dst_x, 3) = alpha; + U = cuda::SaturateCast(uu >> ITUR_BT_601_SHIFT); + V = 
cuda::SaturateCast(vv >> ITUR_BT_601_SHIFT); } -__device__ __forceinline__ void yuv42xxp_to_bgr_kernel(const int &Y, const int &U, const int &V, uchar &r, uchar &g, - uchar &b) +DEVICE_INLINE void yuv42xxp_to_bgr(const int &Y, const int &U, const int &V, uchar &b, uchar &g, uchar &r) { //R = 1.164(Y - 16) + 1.596(V - 128) //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) @@ -446,181 +637,139 @@ __device__ __forceinline__ void yuv42xxp_to_bgr_kernel(const int &Y, const int & b = cuda::SaturateCast(CV_DESCALE((yy + C4 * uu), yuv4xx_shift)); } -__device__ __forceinline__ void bgr_to_yuv42xxp_kernel(const uchar &r, const uchar &g, const uchar &b, uchar &Y, - uchar &U, uchar &V) +template +GLOBAL_BOUNDS void bgr_to_yuv420_char_nhwc(const TensorWrap3D src, + const TensorWrap4D dst, int2 size, int bidx, int uidx) { - const int shifted16 = (16 << ITUR_BT_601_SHIFT); - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16; - - Y = cuda::SaturateCast(yy >> ITUR_BT_601_SHIFT); - - const int shifted128 = (128 << ITUR_BT_601_SHIFT); - int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128; - int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128; - - U = cuda::SaturateCast(uu >> ITUR_BT_601_SHIFT); - V = cuda::SaturateCast(vv >> ITUR_BT_601_SHIFT); + static_assert(std::is_same_v, EltT>); + color_conversion_common( + [&src, bidx] __device__(EltT(&r_BGR)[3], int batch_idx, int x, int y) + { + EltT A; + load_bgra_nhwc(src, r_BGR[0], r_BGR[1], r_BGR[2], A, batch_idx, x, y, bidx); + }, + [] __device__(const EltT(&r_BGR)[3], EltT(&r_YUV)[3]) + { bgr_to_yuv42xxp(r_BGR[0], r_BGR[1], r_BGR[2], r_YUV[0], r_YUV[1], r_YUV[2]); }, + [&dst, uidx, size] __device__(const EltT(&r_YUV)[3], int batch_idx, int x, int y) + { store_yuv420(dst, r_YUV[0], r_YUV[1], r_YUV[2], size, batch_idx, x, y, uidx); }, + size); 
} -template -__global__ void bgr_to_yuv420p_char_nhwc(SrcWrapper src, DstWrapper dst, int2 srcSize, int scn, int bidx, int uidx) +template +GLOBAL_BOUNDS void yuv420_to_bgr_char_nhwc(const TensorWrap4D src, + const TensorWrap3D dst, int2 size, int bidx, int uidx) { - int src_x = blockIdx.x * blockDim.x + threadIdx.x; - int src_y = blockIdx.y * blockDim.y + threadIdx.y; - if (src_x >= srcSize.x || src_y >= srcSize.y) - return; - const int batch_idx = get_batch_idx(); - int plane_y_step = srcSize.y * srcSize.x; - int plane_uv_step = plane_y_step / 4; - int uv_x = (src_y % 4 < 2) ? src_x / 2 : (src_x / 2 + srcSize.x / 2); - - uchar b = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx)); - uchar g = static_cast(*src.ptr(batch_idx, src_y, src_x, 1)); - uchar r = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx ^ 2)); - // Ignore gray channel if input is RGBA - - uchar Y{0}, U{0}, V{0}; - bgr_to_yuv42xxp_kernel(r, g, b, Y, U, V); - - *dst.ptr(batch_idx, src_y, src_x, 0) = Y; - if (src_y % 2 == 0 && src_x % 2 == 0) - { - *dst.ptr(batch_idx, srcSize.y + src_y / 4, uv_x + plane_uv_step * uidx) = U; - *dst.ptr(batch_idx, srcSize.y + src_y / 4, uv_x + plane_uv_step * (1 - uidx)) = V; - } + static_assert(std::is_same_v, EltT>); + color_conversion_common( + [&src, uidx, size] __device__(EltT(&r_YUV)[3], int batch_idx, int x, int y) + { load_yuv420(src, r_YUV[0], r_YUV[1], r_YUV[2], size, batch_idx, x, y, uidx); }, + [] __device__(const EltT(&r_YUV)[3], EltT(&r_BGR)[3]) + { + yuv42xxp_to_bgr(static_cast(r_YUV[0]), static_cast(r_YUV[1]), static_cast(r_YUV[2]), + r_BGR[0], r_BGR[1], r_BGR[2]); + }, + [&dst, bidx] __device__(const EltT(&r_BGR)[3], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_BGR[0], r_BGR[1], r_BGR[2], Alpha, batch_idx, x, y, bidx); }, + size); } +// YUV 422 interleaved formats (e.g., YUYV, YVYU, and UYVY) group 2 pixels into groups of 4 elements. Each group of two +// pixels has two distinct luma (Y) values, one for each pixel. 
The chromaticity values (U and V) are subsampled by a +// factor of two so that there is only one U and one V value for each group of 2 pixels. Example memory layout for +// 4 x 4 image (UYVY format): +// U_00 Y_00 V_00 Y_01 U_02 Y_02 V_02 Y_03 +// U_10 Y_10 V_10 Y_11 U_12 Y_12 V_12 Y_13 +// U_20 Y_20 V_20 Y_21 U_22 Y_22 V_22 Y_23 +// U_30 Y_30 V_30 Y_31 U_32 Y_32 V_32 Y_33 +// Each U and V value corresponds to two Y values--e.g. U_00 and V_00 correspond to Y_00 and Y_10 while U_12 and V_12 +// correspond to Y_12 and Y_13. Thus, a given Y value, Y_rc = Y(r,c) (where r is the row, or y coordinate, and c is the +// column, or x coordinate), corresponds to U(r,c') and V(r,c') where c' is the even column coordinate <= c -- that is, +// c' = 2 * floor(c/2) = (c & ~1). Some layouts swap the positions of the chromaticity and luma values (e.g., YUYV) +// (indicated by the yidx parameter) and / or swap the the positions of the U and V chromaticity valus (e.g., YVYU) +// (indicated by the uidx parameter). +// The data layout is treated as a single channel tensor, so each group of 4 values corresponds to two pixels. As such, +// the tensor width is twice the actual pixel width. Thus, it's easiest to process 4 consecutive values (2 pixels) per +// thread. template -__global__ void bgr_to_yuv420sp_char_nhwc(SrcWrapper src, DstWrapper dst, int2 srcSize, int scn, int bidx, int uidx) +__global__ void yuv422_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dcn, int bidx, int yidx, + int uidx) { - int src_x = blockIdx.x * blockDim.x + threadIdx.x; - int src_y = blockIdx.y * blockDim.y + threadIdx.y; - if (src_x >= srcSize.x || src_y >= srcSize.y) + using T = typename DstWrapper::ValueType; + + int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + if (dst_y >= dstSize.y) return; - const int batch_idx = get_batch_idx(); - int uv_x = (src_x % 2 == 0) ? 
src_x : (src_x - 1); - uchar b = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx)); - uchar g = static_cast(*src.ptr(batch_idx, src_y, src_x, 1)); - uchar r = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx ^ 2)); - // Ignore gray channel if input is RGBA + int dst_x = 2 * (blockIdx.x * blockDim.x + threadIdx.x); // Process 2 destination pixels/thread. + if (dst_x >= dstSize.x) + return; - uchar Y{0}, U{0}, V{0}; - bgr_to_yuv42xxp_kernel(r, g, b, Y, U, V); + const int batch_idx = get_batch_idx(); - *dst.ptr(batch_idx, src_y, src_x, 0) = Y; - if (src_y % 2 == 0 && src_x % 2 == 0) - { - *dst.ptr(batch_idx, srcSize.y + src_y / 2, uv_x + uidx) = U; - *dst.ptr(batch_idx, srcSize.y + src_y / 2, uv_x + (1 - uidx)) = V; - } -} + const int src_x = 2 * dst_x; // Process 4 source elements/thread (i.e., 2 destination pixels). + const int uv_x = (src_x & ~3); // Compute "even" x coordinate for U and V (set lowest two bits to 0). -template -__global__ void yuv420sp_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dcn, int bidx, int uidx) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - int uv_x = (dst_x % 2 == 0) ? 
dst_x : (dst_x - 1); + const T Y0 = *src.ptr(batch_idx, dst_y, src_x + yidx); + const T Y1 = *src.ptr(batch_idx, dst_y, src_x + yidx + 2); + const T U = *src.ptr(batch_idx, dst_y, uv_x + (yidx ^ 1) + uidx); + const T V = *src.ptr(batch_idx, dst_y, uv_x + (yidx ^ 1) + (uidx ^ 2)); - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - T U = *src.ptr(batch_idx, dstSize.y + dst_y / 2, uv_x + uidx); - T V = *src.ptr(batch_idx, dstSize.y + dst_y / 2, uv_x + 1 - uidx); + T r{0}, g{0}, b{0}; - uchar r{0}, g{0}, b{0}, a{0xff}; - yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); + yuv42xxp_to_bgr(int(Y0), int(U), int(V), b, g, r); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; if (dcn == 4) - { - *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; - } -} + *dst.ptr(batch_idx, dst_y, dst_x, 3) = Alpha; -template -__global__ void yuv420p_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dcn, int bidx, int uidx) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - - const int batch_idx = get_batch_idx(); - int plane_y_step = dstSize.y * dstSize.x; - int plane_uv_step = plane_y_step / 4; - int uv_x = (dst_y % 4 < 2) ? dst_x / 2 : (dst_x / 2 + dstSize.x / 2); - - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - T U = *src.ptr(batch_idx, dstSize.y + dst_y / 4, uv_x + plane_uv_step * uidx); - T V = *src.ptr(batch_idx, dstSize.y + dst_y / 4, uv_x + plane_uv_step * (1 - uidx)); - - uchar r{0}, g{0}, b{0}, a{0xff}; - yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); + dst_x++; // Move to next output pixel. 
+ yuv42xxp_to_bgr(int(Y1), int(U), int(V), b, g, r); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; if (dcn == 4) - { - *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; - } + *dst.ptr(batch_idx, dst_y, dst_x, 3) = Alpha; } template -__global__ void yuv422_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dcn, int bidx, int yidx, - int uidx) +__global__ void yuv422_to_gray_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int yidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) + if (dst_y >= dstSize.y) return; - const int batch_idx = get_batch_idx(); - int uv_x = (dst_x % 2 == 0) ? dst_x : dst_x - 1; - T Y = *src.ptr(batch_idx, dst_y, dst_x, yidx); - T U = *src.ptr(batch_idx, dst_y, uv_x, (1 - yidx) + uidx); - T V = *src.ptr(batch_idx, dst_y, uv_x, (1 - yidx) + uidx ^ 2); + int dst_x = 2 * (blockIdx.x * blockDim.x + threadIdx.x); // Process 2 destination pixels/thread. + if (dst_x >= dstSize.x) + return; - uchar r{0}, g{0}, b{0}, a{0xff}; - yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); + const int batch_idx = get_batch_idx(); - *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; - if (dcn == 4) - { - *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; - } -} + const int src_x = 2 * dst_x; // Process 4 source elements/thread. 
-template -__global__ void yuv420_to_gray_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; + *dst.ptr(batch_idx, dst_y, dst_x++) = *src.ptr(batch_idx, dst_y, src_x + yidx); + *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, dst_y, src_x + yidx + 2); } -template -__global__ void yuv422_to_gray_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int yidx) +template +inline ErrorCode Launch_BGR_to_RGB(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVColorConversionCode code, cuda_op::DataShape shape, int bidx, + cudaStream_t stream) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T Y = *src.ptr(batch_idx, dst_y, dst_x, yidx); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + int2 dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); + checkKernelErrors(); + + return ErrorCode::SUCCESS; } inline ErrorCode BGR_to_RGB(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, @@ -628,10 +777,7 @@ inline ErrorCode BGR_to_RGB(const TensorDataStridedCuda &inData, const TensorDat { int sch = (code == NVCV_COLOR_BGRA2BGR || code == NVCV_COLOR_RGBA2BGR || code == NVCV_COLOR_BGRA2RGBA) ? 
4 : 3; int dch = (code == NVCV_COLOR_BGR2BGRA || code == NVCV_COLOR_BGR2RGBA || code == NVCV_COLOR_BGRA2RGBA) ? 4 : 3; - int bidx = (code == NVCV_COLOR_BGR2RGB || code == NVCV_COLOR_RGBA2BGR || code == NVCV_COLOR_BGRA2RGBA - || code == NVCV_COLOR_BGR2RGBA) - ? 2 - : 0; + int bidx = (code != NVCV_COLOR_BGRA2BGR && code != NVCV_COLOR_BGR2BGRA) ? 2 : 0; auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -647,72 +793,82 @@ inline ErrorCode BGR_to_RGB(const TensorDataStridedCuda &inData, const TensorDat if (inputShape.C != sch) { - LOG_ERROR("Invalid input channel number " << inputShape.C << " expecting: " << sch); + LOG_ERROR("Invalid input channel number " << inputShape.C << " -- expecting " << sch); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outputShape.C != dch) + { + LOG_ERROR("Invalid output channel number " << outputShape.C << " -- expecting " << dch); return ErrorCode::INVALID_DATA_SHAPE; } if (outDataType != inDataType) { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); + LOG_ERROR("Mismatched input / output DataTypes " << inDataType << " / " << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != dch) + if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N) { - LOG_ERROR("Invalid output shape " << outputShape); + LOG_ERROR("Shape mismatch -- output tensor shape " << outputShape << " doesn't match input tensor shape " + << inputShape); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outDataType == kCV_16F && sch < 4 && dch == 4) + { + LOG_ERROR("Adding alpha to the output is not supported for " << outDataType); return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); +#define 
CVCUDA_BGR2RGB_IF(SCH, DCH, SRC_T, DST_T) \ + if (sch == SCH && dch == DCH) \ + return Launch_BGR_to_RGB(inData, outData, code, inputShape, bidx, stream) - int2 dstSize{outputShape.W, outputShape.H}; +#define CVCUDA_BGR2RGB_CASE(T3, T4) \ + CVCUDA_BGR2RGB_IF(3, 3, T3, T3); \ + else CVCUDA_BGR2RGB_IF(3, 4, T3, T4); \ + else CVCUDA_BGR2RGB_IF(4, 3, T4, T3); \ + else CVCUDA_BGR2RGB_IF(4, 4, T4, T4); \ + else return ErrorCode::INVALID_DATA_SHAPE switch (inDataType) { case kCV_8U: case kCV_8S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(uchar3, uchar4); + case kCV_16F: // Not properly handled when adding alpha to the destination. case kCV_16U: - case kCV_16F: case kCV_16S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(ushort3, ushort4); case kCV_32S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(int3, int4); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(float3, float4); case kCV_64F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(double3, double4); + default: + LOG_ERROR("Unsupported DataType " << inDataType); + return 
ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_BGR2RGB_CASE +#undef CVCUDA_BGR2RGB_IF + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_GRAY_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 8>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize); + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -735,72 +891,80 @@ inline ErrorCode GRAY_to_BGR(const TensorDataStridedCuda &inData, const TensorDa if (inputShape.C != 1) { - LOG_ERROR("Invalid input channel number " << inputShape.C); + LOG_ERROR("Invalid input channel number " << inputShape.C << " -- expecting 1"); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outputShape.C != dch) + { + LOG_ERROR("Invalid output channel number " << outputShape.C << " -- expecting " << dch); return ErrorCode::INVALID_DATA_SHAPE; } if (outDataType != inDataType) { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); + LOG_ERROR("Mismatched input / output DataTypes " << inDataType << " / " << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != dch) + if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N) { - LOG_ERROR("Invalid output shape " << outputShape); + LOG_ERROR("Shape mismatch -- output tensor shape " << outputShape << " doesn't match input tensor shape " + << inputShape); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outDataType == kCV_16F && dch == 4) + { + LOG_ERROR("Adding 
alpha to the output is not supported for " << outDataType); return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); +#define CVCUDA_GRAY2BGR_IF(DCH, SRC_T, DST_T) \ + if (dch == DCH) \ + return Launch_GRAY_to_BGR(inData, outData, inputShape, stream) - int2 dstSize{outputShape.W, outputShape.H}; +#define CVCUDA_GRAY2BGR_CASE(T, T3, T4) \ + CVCUDA_GRAY2BGR_IF(3, T, T3); \ + else CVCUDA_GRAY2BGR_IF(4, T, T4); \ + else return ErrorCode::INVALID_DATA_SHAPE switch (inDataType) { case kCV_8U: case kCV_8S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + CVCUDA_GRAY2BGR_CASE(uchar, uchar3, uchar4); + case kCV_16F: // Not properly handled when adding alpha to the destination. case kCV_16U: - case kCV_16F: case kCV_16S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + CVCUDA_GRAY2BGR_CASE(ushort, ushort3, ushort4); case kCV_32S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + CVCUDA_GRAY2BGR_CASE(int, int3, int4); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + CVCUDA_GRAY2BGR_CASE(float, float3, float4); case kCV_64F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + 
CVCUDA_GRAY2BGR_CASE(double, double3, double4); + default: + LOG_ERROR("Unsupported DataType " << inDataType); + return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_GRAY2BGR_CASE +#undef CVCUDA_GRAY2BGR_IF + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_BGR_to_GRAY(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + bgr_to_gray_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -824,56 +988,68 @@ inline ErrorCode BGR_to_GRAY(const TensorDataStridedCuda &inData, const TensorDa if (inputShape.C != sch) { - LOG_ERROR("Invalid input channel number " << inputShape.C << " expecting: " << sch); + LOG_ERROR("Invalid input channel number " << inputShape.C << " -- expecting " << sch); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outputShape.C != 1) + { + LOG_ERROR("Invalid output channel number " << outputShape.C << " -- expecting 1"); return ErrorCode::INVALID_DATA_SHAPE; } if (outDataType != inDataType) { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); + LOG_ERROR("Mismatched input / output DataTypes " << inDataType << " / " << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != 1) + if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N) { - LOG_ERROR("Invalid output shape " << outputShape); + LOG_ERROR("Shape mismatch -- output tensor shape " << outputShape << " 
doesn't match input tensor shape " + << inputShape); return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); +#define CVCUDA_BGR2GRAY_IF(SCH, SRC_T, DST_T) \ + if (sch == SCH) \ + return Launch_BGR_to_GRAY(inData, outData, inputShape, bidx, stream) - int2 dstSize{outputShape.W, outputShape.H}; +#define CVCUDA_BGR2GRAY_CASE(T, T3, T4) \ + CVCUDA_BGR2GRAY_IF(3, T3, T); \ + else CVCUDA_BGR2GRAY_IF(4, T4, T); \ + else return ErrorCode::INVALID_DATA_SHAPE switch (inDataType) { case kCV_8U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2GRAY_CASE(uchar, uchar3, uchar4); case kCV_16U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2GRAY_CASE(ushort, ushort3, ushort4); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_gray_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2GRAY_CASE(float, float3, float4); default: LOG_ERROR("Unsupported DataType " << inDataType); return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_BGR2GRAY_CASE +#undef CVCUDA_BGR2GRAY_IF + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_BGR_to_YUV(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 
dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + bgr_to_yuv_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -910,41 +1086,39 @@ inline ErrorCode BGR_to_YUV(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - - int2 dstSize{outputShape.W, outputShape.H}; - +#define CVCUDA_BGR2YUV_CASE(T3) return Launch_BGR_to_YUV(inData, outData, inputShape, bidx, stream) switch (inDataType) { case kCV_8U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_yuv_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2YUV_CASE(uchar3); case kCV_16U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_yuv_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2YUV_CASE(ushort3); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_yuv_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2YUV_CASE(float3); default: LOG_ERROR("Unsupported DataType " << inDataType); return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_BGR2YUV_CASE + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_YUV_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), 
shape.N); + + int2 dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + yuv_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -981,41 +1155,49 @@ inline ErrorCode YUV_to_BGR(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - - int2 dstSize{outputShape.W, outputShape.H}; - +#define CVCUDA_YUV2BGR_CASE(T3) return Launch_YUV_to_BGR(inData, outData, inputShape, bidx, stream) switch (inDataType) { case kCV_8U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - yuv_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_YUV2BGR_CASE(uchar3); case kCV_16U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - yuv_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_YUV2BGR_CASE(ushort3); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - yuv_to_bgr_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_YUV2BGR_CASE(float3); default: LOG_ERROR("Unsupported DataType " << inDataType); return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_YUV2BGR_CASE + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_BGR_to_HSV(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, bool isFullRange, bool strides_64b, + cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, 
Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 dstSize{shape.W, shape.H}; + + if (strides_64b) + { + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + bgr_to_hsv_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); + } + else + { + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + bgr_to_hsv_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); + } + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -1053,33 +1235,53 @@ inline ErrorCode BGR_to_HSV(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); + const bool strides_64b = std::max(inAccess->sampleStride() * inAccess->numSamples(), + outAccess->sampleStride() * outAccess->numSamples()) + > nvcv::cuda::TypeTraits::max; + +#define CVCUDA_BGR2HSV_CASE(T3) \ + return Launch_BGR_to_HSV(inData, outData, inputShape, bidx, isFullRange, strides_64b, stream) + + switch (inDataType) + { + case kCV_8U: + CVCUDA_BGR2HSV_CASE(uchar3); + case kCV_32F: + CVCUDA_BGR2HSV_CASE(float3); + default: + LOG_ERROR("Unsupported DataType " << inDataType); + return ErrorCode::INVALID_DATA_TYPE; + } +#undef CVCUDA_BGR2HSV_CASE + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_HSV_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, bool isFullRange, bool strides_64b, + cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); - int2 dstSize{outputShape.W, outputShape.H}; + int2 dstSize{shape.W, shape.H}; - switch (inDataType) - { - case kCV_8U: + if (strides_64b) { - auto 
srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_hsv_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); - checkKernelErrors(); + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + hsv_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); } - break; - case kCV_32F: + else { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_hsv_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; - default: - LOG_ERROR("Unsupported DataType " << inDataType); - return ErrorCode::INVALID_DATA_TYPE; + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + hsv_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); } + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -1122,34 +1324,63 @@ inline ErrorCode HSV_to_BGR(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); + const int dcn = outputShape.C; + const bool strides_64b = std::max(inAccess->sampleStride() * inAccess->numSamples(), + outAccess->sampleStride() * outAccess->numSamples()) + > nvcv::cuda::TypeTraits::max; - int2 dstSize{outputShape.W, outputShape.H}; - int dcn = outputShape.C; +#define CVCUDA_HSV2BGR_CASE(T3, T4) \ + if (dcn == 3) \ + return Launch_HSV_to_BGR(inData, outData, inputShape, bidx, isFullRange, strides_64b, stream); \ + else \ + return Launch_HSV_to_BGR(inData, outData, inputShape, bidx, isFullRange, strides_64b, stream) switch (inDataType) { case kCV_8U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - hsv_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, dcn, isFullRange); - 
checkKernelErrors(); - } - break; + CVCUDA_HSV2BGR_CASE(uchar3, uchar4); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - hsv_to_bgr_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, dcn); - checkKernelErrors(); - } - break; + CVCUDA_HSV2BGR_CASE(float3, float4); default: LOG_ERROR("Unsupported DataType " << inDataType); return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_HSV2BGR_CASE + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, int uidx, bool strides_64b, + cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 dstSize{shape.W, shape.H}; + + if (strides_64b) + { + // YUV420 input: 4D tensor with scalar type. + auto srcWrap = cuda::CreateTensorWrapNHWC(inData); + // BGR output: 3D tensor with vector type. + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + yuv420_to_bgr_char_nhwc + <<>>(srcWrap, dstWrap, dstSize, bidx, uidx); + } + else + { + // YUV420 input: 4D tensor with scalar type. + auto srcWrap = cuda::CreateTensorWrapNHWC(inData); + // BGR output: 3D tensor with vector type. + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + yuv420_to_bgr_char_nhwc + <<>>(srcWrap, dstWrap, dstSize, bidx, uidx); + } + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -1170,6 +1401,13 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens ? 
0 : 1; + // clang-format off + bool p420 = (code == NVCV_COLOR_YUV2BGR_YV12 || code == NVCV_COLOR_YUV2BGRA_YV12 || + code == NVCV_COLOR_YUV2RGB_YV12 || code == NVCV_COLOR_YUV2RGBA_YV12 || + code == NVCV_COLOR_YUV2BGR_IYUV || code == NVCV_COLOR_YUV2BGRA_IYUV || + code == NVCV_COLOR_YUV2RGB_IYUV || code == NVCV_COLOR_YUV2RGBA_IYUV); + // clang-format on + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -1182,7 +1420,7 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - if (outputShape.C != 3 && outputShape.C != 4) + if ((code != NVCV_COLOR_YUV2GRAY_420 || outputShape.C != 1) && outputShape.C != 3 && outputShape.C != 4) { LOG_ERROR("Invalid output channel number " << outputShape.C); return ErrorCode::INVALID_DATA_SHAPE; @@ -1211,32 +1449,36 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens LOG_ERROR("Invalid output shape " << outputShape); return ErrorCode::INVALID_DATA_SHAPE; } + if (p420 && rgb_height % 4 != 0) // YUV 420 planar formats need 4 rows of Y for every full row of U or V. 
+ { + LOG_ERROR( + "Invalid input shape: to convert from YUV 420 planar formats, the output " + "tensor height must be a multiple of 4; height = " + << rgb_height); + return ErrorCode::INVALID_DATA_SHAPE; + } - dim3 blockSize(BLOCK, BLOCK / 1, 1); - dim3 gridSize(divUp(rgb_width, blockSize.x), divUp(rgb_height, blockSize.y), inputShape.N); - - int2 dstSize{outputShape.W, outputShape.H}; - int dcn = outputShape.C; - - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); + const int dcn = outputShape.C; + const bool strides_64b = std::max(inAccess->sampleStride() * inAccess->numSamples(), + outAccess->sampleStride() * outAccess->numSamples()) + > nvcv::cuda::TypeTraits::max; switch (code) { case NVCV_COLOR_YUV2GRAY_420: { - /* Method 1 */ - // yuv420_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize); - // checkKernelErrors(); + int dpitch = static_cast(outAccess->rowStride()); + int spitch = static_cast(inAccess->rowStride()); - /* Method 2 (Better performance, but only works with fixed input shapes) */ - int dpitch = static_cast(outAccess->sampleStride()); - int spitch = static_cast(inAccess->sampleStride()); - int cpy_width = static_cast(outAccess->sampleStride()); - int cpy_height = inputShape.N; + for (int i = 0; i < inputShape.N; i++) + { + const void *srcPtr = inData.basePtr() + (size_t)i * inAccess->sampleStride(); + + void *dstPtr = outData.basePtr() + (size_t)i * outAccess->sampleStride(); - checkCudaErrors(cudaMemcpy2DAsync(outData.basePtr(), dpitch, inData.basePtr(), spitch, cpy_width, cpy_height, - cudaMemcpyDeviceToDevice, stream)); + checkCudaErrors(cudaMemcpy2DAsync(dstPtr, dpitch, srcPtr, spitch, rgb_width, rgb_height, + cudaMemcpyDeviceToDevice, stream)); + } } break; case NVCV_COLOR_YUV2BGR_NV12: @@ -1247,11 +1489,12 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens case NVCV_COLOR_YUV2RGB_NV21: case NVCV_COLOR_YUV2RGBA_NV12: case NVCV_COLOR_YUV2RGBA_NV21: - { - 
yuv420sp_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, dcn, bidx, uidx); - checkKernelErrors(); - } - break; + if (dcn == 3) + return Launch_YUV420xp_to_BGR(inData, outData, outputShape, bidx, uidx, strides_64b, + stream); + else + return Launch_YUV420xp_to_BGR(inData, outData, outputShape, bidx, uidx, strides_64b, + stream); case NVCV_COLOR_YUV2BGR_YV12: case NVCV_COLOR_YUV2BGR_IYUV: case NVCV_COLOR_YUV2BGRA_YV12: @@ -1260,11 +1503,12 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens case NVCV_COLOR_YUV2RGB_IYUV: case NVCV_COLOR_YUV2RGBA_YV12: case NVCV_COLOR_YUV2RGBA_IYUV: - { - yuv420p_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, dcn, bidx, uidx); - checkKernelErrors(); - } - break; + if (dcn == 3) + return Launch_YUV420xp_to_BGR(inData, outData, outputShape, bidx, uidx, strides_64b, + stream); + else + return Launch_YUV420xp_to_BGR(inData, outData, outputShape, bidx, uidx, strides_64b, + stream); default: LOG_ERROR("Unsupported conversion code " << code); return ErrorCode::INVALID_PARAMETER; @@ -1272,28 +1516,61 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens return ErrorCode::SUCCESS; } -inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - NVCVColorConversionCode code, cudaStream_t stream) +template +inline ErrorCode Launch_BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + DataShape inputShape, int bidx, int uidx, bool strides_64b, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + int2 srcSize{inputShape.W, inputShape.H}; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(inputShape.W, Policy::TileWidth), divUp(inputShape.H, Policy::TileHeight), inputShape.N); + + if (strides_64b) + { + auto srcWrap = cuda::CreateTensorWrapNHW(inData); // RGB input: 3D tensor with vector type. 
+ auto dstWrap = cuda::CreateTensorWrapNHWC(outData); // YUV420 output: 4D scalar tensor. + + bgr_to_yuv420_char_nhwc + <<>>(srcWrap, dstWrap, srcSize, bidx, uidx); + } + else + { + auto srcWrap = cuda::CreateTensorWrapNHW(inData); // RGB input: 3D tensor with vector type. + auto dstWrap = cuda::CreateTensorWrapNHWC(outData); // YUV420 output: 4D scalar tensor. + + bgr_to_yuv420_char_nhwc + <<>>(srcWrap, dstWrap, srcSize, bidx, uidx); + } + checkKernelErrors(); + + return ErrorCode::SUCCESS; +} + +inline ErrorCode BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVColorConversionCode code, cudaStream_t stream) { int bidx - = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2BGR_YVYU - || code == NVCV_COLOR_YUV2BGRA_YVYU || code == NVCV_COLOR_YUV2BGR_UYVY || code == NVCV_COLOR_YUV2BGRA_UYVY) + = (code == NVCV_COLOR_BGR2YUV_NV12 || code == NVCV_COLOR_BGRA2YUV_NV12 || code == NVCV_COLOR_BGR2YUV_NV21 + || code == NVCV_COLOR_BGRA2YUV_NV21 || code == NVCV_COLOR_BGR2YUV_YV12 || code == NVCV_COLOR_BGRA2YUV_YV12 + || code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV) ? 0 : 2; - int yidx - = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2RGB_YUY2 - || code == NVCV_COLOR_YUV2RGBA_YUY2 || code == NVCV_COLOR_YUV2BGR_YVYU || code == NVCV_COLOR_YUV2BGRA_YVYU - || code == NVCV_COLOR_YUV2RGB_YVYU || code == NVCV_COLOR_YUV2RGBA_YVYU || code == NVCV_COLOR_YUV2GRAY_YUY2) + int uidx + = (code == NVCV_COLOR_BGR2YUV_NV12 || code == NVCV_COLOR_BGRA2YUV_NV12 || code == NVCV_COLOR_RGB2YUV_NV12 + || code == NVCV_COLOR_RGBA2YUV_NV12 || code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV + || code == NVCV_COLOR_RGB2YUV_IYUV || code == NVCV_COLOR_RGBA2YUV_IYUV) ? 
0 : 1; - int uidx - = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2RGB_YUY2 - || code == NVCV_COLOR_YUV2RGBA_YUY2 || code == NVCV_COLOR_YUV2BGR_UYVY || code == NVCV_COLOR_YUV2BGRA_UYVY - || code == NVCV_COLOR_YUV2RGB_UYVY || code == NVCV_COLOR_YUV2RGBA_UYVY) - ? 0 - : 2; + // clang-format off + bool p420 = (code == NVCV_COLOR_BGR2YUV_YV12 || code == NVCV_COLOR_BGRA2YUV_YV12 || + code == NVCV_COLOR_RGB2YUV_YV12 || code == NVCV_COLOR_RGBA2YUV_YV12 || + code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV || + code == NVCV_COLOR_RGB2YUV_IYUV || code == NVCV_COLOR_RGBA2YUV_IYUV); + // clang-format on auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -1307,14 +1584,22 @@ inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const Tensor cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - if (outputShape.C != 3 && outputShape.C != 4) + if (inputShape.C != 3 && inputShape.C != 4) { - LOG_ERROR("Invalid output channel number " << outputShape.C); + LOG_ERROR("Invalid input channel number " << inputShape.C); return ErrorCode::INVALID_DATA_SHAPE; } - if (inputShape.C != 2) + if (inputShape.H % 2 != 0 || inputShape.W % 2 != 0) { - LOG_ERROR("Invalid input channel number " << inputShape.C); + LOG_ERROR("Invalid input shape " << inputShape); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (p420 && inputShape.H % 4 != 0) // YUV 420 planar formats need 4 rows of Y for every full row of U or V. 
+ { + LOG_ERROR( + "Invalid input shape: to convert to YUV 420 planar formats, the input " + "tensor height must be a multiple of 4; height = " + << inputShape.H); return ErrorCode::INVALID_DATA_SHAPE; } if (inDataType != kCV_8U || outDataType != kCV_8U) @@ -1322,47 +1607,50 @@ inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const Tensor LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N) + + int yuv420_width = inputShape.W; + int yuv420_height = inputShape.H / 2 * 3; + + if (outputShape.H != yuv420_height || outputShape.W != yuv420_width || outputShape.N != inputShape.N) { LOG_ERROR("Invalid output shape " << outputShape); return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - - int2 dstSize{outputShape.W, outputShape.H}; - int dcn = outputShape.C; - - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); + const bool strides_64b = std::max(inAccess->sampleStride() * inAccess->numSamples(), + outAccess->sampleStride() * outAccess->numSamples()) + > nvcv::cuda::TypeTraits::max; switch (code) { - case NVCV_COLOR_YUV2GRAY_YUY2: - case NVCV_COLOR_YUV2GRAY_UYVY: - { - yuv422_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize, yidx); - checkKernelErrors(); - } - break; - case NVCV_COLOR_YUV2BGR_YUY2: - case NVCV_COLOR_YUV2BGR_YVYU: - case NVCV_COLOR_YUV2BGRA_YUY2: - case NVCV_COLOR_YUV2BGRA_YVYU: - case NVCV_COLOR_YUV2RGB_YUY2: - case NVCV_COLOR_YUV2RGB_YVYU: - case NVCV_COLOR_YUV2RGBA_YUY2: - case NVCV_COLOR_YUV2RGBA_YVYU: - case NVCV_COLOR_YUV2RGB_UYVY: - case NVCV_COLOR_YUV2BGR_UYVY: - case NVCV_COLOR_YUV2RGBA_UYVY: - case NVCV_COLOR_YUV2BGRA_UYVY: - { - yuv422_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, 
dstSize, dcn, bidx, yidx, uidx); - checkKernelErrors(); - } - break; + case NVCV_COLOR_BGR2YUV_NV12: + case NVCV_COLOR_BGR2YUV_NV21: + case NVCV_COLOR_BGRA2YUV_NV12: + case NVCV_COLOR_BGRA2YUV_NV21: + case NVCV_COLOR_RGB2YUV_NV12: + case NVCV_COLOR_RGB2YUV_NV21: + case NVCV_COLOR_RGBA2YUV_NV12: + case NVCV_COLOR_RGBA2YUV_NV21: + if (inputShape.C == 3) + return Launch_BGR_to_YUV420xp(inData, outData, inputShape, bidx, uidx, strides_64b, + stream); + else + return Launch_BGR_to_YUV420xp(inData, outData, inputShape, bidx, uidx, strides_64b, + stream); + case NVCV_COLOR_BGR2YUV_YV12: + case NVCV_COLOR_BGR2YUV_IYUV: + case NVCV_COLOR_BGRA2YUV_YV12: + case NVCV_COLOR_BGRA2YUV_IYUV: + case NVCV_COLOR_RGB2YUV_YV12: + case NVCV_COLOR_RGB2YUV_IYUV: + case NVCV_COLOR_RGBA2YUV_YV12: + case NVCV_COLOR_RGBA2YUV_IYUV: + if (inputShape.C == 3) + return Launch_BGR_to_YUV420xp(inData, outData, inputShape, bidx, uidx, strides_64b, + stream); + else + return Launch_BGR_to_YUV420xp(inData, outData, inputShape, bidx, uidx, strides_64b, + stream); default: LOG_ERROR("Unsupported conversion code " << code); return ErrorCode::INVALID_PARAMETER; @@ -1370,53 +1658,29 @@ inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const Tensor return ErrorCode::SUCCESS; } -template -inline static void bgr_to_yuv420p_launcher(SrcWrapper srcWrap, DstWrapper dstWrap, DataShape inputShape, int bidx, - int uidx, cudaStream_t stream) -{ - int2 srcSize{inputShape.W, inputShape.H}; - // method 1 - dim3 blockSize(BLOCK, BLOCK / 1, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - bgr_to_yuv420p_char_nhwc<<>>(srcWrap, dstWrap, srcSize, inputShape.C, bidx, uidx); - checkKernelErrors(); - - // method 2 (TODO) - // NPP -} - -template -inline static void bgr_to_yuv420sp_launcher(SrcWrapper srcWrap, DstWrapper dstWrap, DataShape inputShape, int bidx, - int uidx, cudaStream_t stream) -{ - int2 srcSize{inputShape.W, inputShape.H}; - // method 1 - 
dim3 blockSize(BLOCK, BLOCK / 1, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - bgr_to_yuv420sp_char_nhwc<<>>(srcWrap, dstWrap, srcSize, inputShape.C, bidx, uidx); - checkKernelErrors(); - - // method 2 (TODO) - // NPP -} - -inline ErrorCode BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - NVCVColorConversionCode code, cudaStream_t stream) +inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVColorConversionCode code, cudaStream_t stream) { int bidx - = (code == NVCV_COLOR_BGR2YUV_NV12 || code == NVCV_COLOR_BGRA2YUV_NV12 || code == NVCV_COLOR_BGR2YUV_NV21 - || code == NVCV_COLOR_BGRA2YUV_NV21 || code == NVCV_COLOR_BGR2YUV_YV12 || code == NVCV_COLOR_BGRA2YUV_YV12 - || code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV) + = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2BGR_YVYU + || code == NVCV_COLOR_YUV2BGRA_YVYU || code == NVCV_COLOR_YUV2BGR_UYVY || code == NVCV_COLOR_YUV2BGRA_UYVY) ? 0 : 2; - int uidx - = (code == NVCV_COLOR_BGR2YUV_NV12 || code == NVCV_COLOR_BGRA2YUV_NV12 || code == NVCV_COLOR_RGB2YUV_NV12 - || code == NVCV_COLOR_RGBA2YUV_NV12 || code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV - || code == NVCV_COLOR_RGB2YUV_IYUV || code == NVCV_COLOR_RGBA2YUV_IYUV) + int yidx + = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2RGB_YUY2 + || code == NVCV_COLOR_YUV2RGBA_YUY2 || code == NVCV_COLOR_YUV2BGR_YVYU || code == NVCV_COLOR_YUV2BGRA_YVYU + || code == NVCV_COLOR_YUV2RGB_YVYU || code == NVCV_COLOR_YUV2RGBA_YVYU || code == NVCV_COLOR_YUV2GRAY_YUY2) ? 
0 : 1; + int uidx + = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2RGB_YUY2 + || code == NVCV_COLOR_YUV2RGBA_YUY2 || code == NVCV_COLOR_YUV2BGR_UYVY || code == NVCV_COLOR_YUV2BGRA_UYVY + || code == NVCV_COLOR_YUV2RGB_UYVY || code == NVCV_COLOR_YUV2RGBA_UYVY) + ? 0 + : 2; + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -1429,61 +1693,68 @@ inline ErrorCode BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const Tens cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - if (inputShape.C != 3 && inputShape.C != 4) + if (inputShape.W % 4 != 0) { - LOG_ERROR("Invalid input channel number " << inputShape.C); + LOG_ERROR("Invalid input shape " << inputShape << " -- width must be a multiple of 4"); return ErrorCode::INVALID_DATA_SHAPE; } - if (inputShape.H % 2 != 0 || inputShape.W % 2 != 0) + + if ((code != NVCV_COLOR_YUV2GRAY_UYVY && code != NVCV_COLOR_YUV2GRAY_YUY2 || outputShape.C != 1) + && outputShape.C != 3 && outputShape.C != 4) { - LOG_ERROR("Invalid input shape " << inputShape); + LOG_ERROR("Invalid output channel number " + << outputShape.C + << " -- RGB output must have 3 or 4 channels and grayscale output must have 1 channel"); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (inputShape.C != 1) + { + LOG_ERROR("Invalid input channel number " << inputShape.C << " -- input must have 1 channel"); return ErrorCode::INVALID_DATA_SHAPE; } if (inDataType != kCV_8U || outDataType != kCV_8U) { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); + LOG_ERROR("Unsupported input / output DataType " << inDataType << " / " << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - - int yuv420_width = inputShape.W; - int yuv420_height = inputShape.H / 2 * 3; - - if (outputShape.H != yuv420_height || outputShape.W != yuv420_width || 
outputShape.N != inputShape.N) + if (outputShape.H != inputShape.H || 2 * outputShape.W != inputShape.W || outputShape.N != inputShape.N) { LOG_ERROR("Invalid output shape " << outputShape); return ErrorCode::INVALID_DATA_SHAPE; } - // BGR input + dim3 blockSize(BLOCK, BLOCK / 4, 1); + dim3 gridSize(divUp(inputShape.W / 4, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); + + int2 dstSize{outputShape.W, outputShape.H}; + int dcn = outputShape.C; + auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - // YUV420 output auto dstWrap = cuda::CreateTensorWrapNHWC(outData); switch (code) { - case NVCV_COLOR_BGR2YUV_NV12: - case NVCV_COLOR_BGR2YUV_NV21: - case NVCV_COLOR_BGRA2YUV_NV12: - case NVCV_COLOR_BGRA2YUV_NV21: - case NVCV_COLOR_RGB2YUV_NV12: - case NVCV_COLOR_RGB2YUV_NV21: - case NVCV_COLOR_RGBA2YUV_NV12: - case NVCV_COLOR_RGBA2YUV_NV21: + case NVCV_COLOR_YUV2GRAY_YUY2: + case NVCV_COLOR_YUV2GRAY_UYVY: { - bgr_to_yuv420sp_launcher(srcWrap, dstWrap, inputShape, bidx, uidx, stream); + yuv422_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize, yidx); checkKernelErrors(); } break; - case NVCV_COLOR_BGR2YUV_YV12: - case NVCV_COLOR_BGR2YUV_IYUV: - case NVCV_COLOR_BGRA2YUV_YV12: - case NVCV_COLOR_BGRA2YUV_IYUV: - case NVCV_COLOR_RGB2YUV_YV12: - case NVCV_COLOR_RGB2YUV_IYUV: - case NVCV_COLOR_RGBA2YUV_YV12: - case NVCV_COLOR_RGBA2YUV_IYUV: + case NVCV_COLOR_YUV2BGR_YUY2: + case NVCV_COLOR_YUV2BGR_YVYU: + case NVCV_COLOR_YUV2BGRA_YUY2: + case NVCV_COLOR_YUV2BGRA_YVYU: + case NVCV_COLOR_YUV2RGB_YUY2: + case NVCV_COLOR_YUV2RGB_YVYU: + case NVCV_COLOR_YUV2RGBA_YUY2: + case NVCV_COLOR_YUV2RGBA_YVYU: + case NVCV_COLOR_YUV2RGB_UYVY: + case NVCV_COLOR_YUV2BGR_UYVY: + case NVCV_COLOR_YUV2RGBA_UYVY: + case NVCV_COLOR_YUV2BGRA_UYVY: { - bgr_to_yuv420p_launcher(srcWrap, dstWrap, inputShape, bidx, uidx, stream); + yuv422_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, dcn, bidx, yidx, uidx); checkKernelErrors(); } break; diff --git 
a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu index d40fa847..a12c50d3 100644 --- a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu +++ b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu @@ -338,9 +338,9 @@ __global__ void bgr_to_hsv_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, cu vmin = min(vmin, g); vmin = min(vmin, r); - unsigned char diff = cuda::SaturateCast(v - vmin); - vr = v == r ? -1 : 0; - vg = v == g ? -1 : 0; + uint8_t diff = cuda::SaturateCast(v - vmin); + vr = v == r ? -1 : 0; + vg = v == g ? -1 : 0; int hdiv_table = diff == 0 ? 0 : cuda::SaturateCast((hrange << hsv_shift) / (6. * diff)); int sdiv_table = v == 0 ? 0 : cuda::SaturateCast((255 << hsv_shift) / (1. * v)); @@ -349,9 +349,9 @@ __global__ void bgr_to_hsv_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, cu h = (h * hdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; h += h < 0 ? hr : 0; - *dst.ptr(batch_idx, dst_y, dst_x, 0) = cuda::SaturateCast(h); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = (unsigned char)s; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = (unsigned char)v; + *dst.ptr(batch_idx, dst_y, dst_x, 0) = cuda::SaturateCast(h); + *dst.ptr(batch_idx, dst_y, dst_x, 1) = (uint8_t)s; + *dst.ptr(batch_idx, dst_y, dst_x, 2) = (uint8_t)v; } template @@ -401,8 +401,7 @@ __global__ void bgr_to_hsv_float_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, c *dst.ptr(batch_idx, dst_y, dst_x, 2) = v; } -inline __device__ void HSV2RGB_native_var_shape(float h, float s, float v, float &b, float &g, float &r, - const float hscale) +inline __device__ void HSV2RGB_native_var_shape(float h, float s, float v, float &b, float &g, float &r) { if (s == 0) b = g = r = v; @@ -416,26 +415,22 @@ inline __device__ void HSV2RGB_native_var_shape(float h, float s, float v, float {0, 1, 3}, {2, 1, 0} }; - float tab[4]; - int sector; - h *= hscale; - h = fmod(h, 6.f); - sector = (int)floor(h); - h -= sector; - if ((unsigned)sector >= 6u) - { - sector = 0; - h = 0.f; - } - tab[0] = v; - 
tab[1] = v * (1.f - s); - tab[2] = v * (1.f - s * h); - tab[3] = v * (1.f - s * (1.f - h)); + h += 6 * (h < 0); // Add 6 if h < 0. + int idx = static_cast(h); // Sector index. + h -= idx; // Fractional part of h. + idx %= 6; // Make sure index is in valid range. + + // clang-format off + const float tab[4] {v, + v * (1 - s), + v * (1 - s * h), + v * (1 - s * (1 - h))}; + // clang-format on - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; + b = tab[sector_data[idx][0]]; + g = tab[sector_data[idx][1]]; + r = tab[sector_data[idx][2]]; } } @@ -449,16 +444,16 @@ __global__ void hsv_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, cu if (dst_x >= dst.width(batch_idx) || dst_y >= dst.height(batch_idx)) return; - float h = *src.ptr(batch_idx, dst_y, dst_x, 0); - float s = *src.ptr(batch_idx, dst_y, dst_x, 1) * (1.0f / 255.0f); - float v = *src.ptr(batch_idx, dst_y, dst_x, 2) * (1.0f / 255.0f); + const float scaleH = 6.f / (isFullRange ? 256 : 180); + constexpr float scaleSV = 1.0f / 255.0f; + constexpr T alpha = cuda::TypeTraits::max; - float hrange = isFullRange ? 
255 : 180; - unsigned char alpha = cuda::TypeTraits::max; - float hs = 6.f / hrange; + float h = *src.ptr(batch_idx, dst_y, dst_x, 0) * scaleH; + float s = *src.ptr(batch_idx, dst_y, dst_x, 1) * scaleSV; + float v = *src.ptr(batch_idx, dst_y, dst_x, 2) * scaleSV; float b, g, r; - HSV2RGB_native_var_shape(h, s, v, b, g, r, hs); + HSV2RGB_native_var_shape(h, s, v, b, g, r); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = cuda::SaturateCast(b * 255.0f); *dst.ptr(batch_idx, dst_y, dst_x, 1) = cuda::SaturateCast(g * 255.0f); @@ -477,16 +472,15 @@ __global__ void hsv_to_bgr_float_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, c if (dst_x >= dst.width(batch_idx) || dst_y >= dst.height(batch_idx)) return; - float h = *src.ptr(batch_idx, dst_y, dst_x, 0); + constexpr float scaleH = 6.0f / 360.0f; + constexpr float alpha = 1.0f; + + float h = *src.ptr(batch_idx, dst_y, dst_x, 0) * scaleH; float s = *src.ptr(batch_idx, dst_y, dst_x, 1); float v = *src.ptr(batch_idx, dst_y, dst_x, 2); - float hrange = 360.0; - float alpha = 1.f; - float hs = 6.f / hrange; - float b, g, r; - HSV2RGB_native_var_shape(h, s, v, b, g, r, hs); + HSV2RGB_native_var_shape(h, s, v, b, g, r); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; @@ -550,21 +544,34 @@ __global__ void bgr_to_yuv420sp_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC sr assert(checkShapeFromYUV420(dst.height(batch_idx), dst.width(batch_idx), code)); - int uv_x = (src_x % 2 == 0) ? src_x : (src_x - 1); - uchar b = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx)); uchar g = static_cast(*src.ptr(batch_idx, src_y, src_x, 1)); uchar r = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx ^ 2)); - // Ignore gray channel if input is RGBA + // Ignore alpha channel if input is RGBA. 
uchar Y{0}, U{0}, V{0}; bgr_to_yuv42xxp_kernel(r, g, b, Y, U, V); - *dst.ptr(batch_idx, src_y, src_x, 0) = Y; + // U and V are subsampled at half the full resolution (both in x and y), combined (i.e., interleaved), and arranged + // as full rows after the full resolution Y data. Example memory layout for 4 x 4 image (NV12): + // Y_00 Y_01 Y_02 Y_03 + // Y_10 Y_11 Y_12 Y_13 + // Y_20 Y_21 Y_22 Y_23 + // Y_30 Y_31 Y_32 Y_33 + // U_00 V_00 U_02 V_02 + // U_20 V_20 U_22 V_22 + // Each U and V value corresponds to a 2x2 block of Y values--e.g. U_00 and V_00 correspond to Y_00, Y_01, Y_10, + // and Y_11. Each full U-V row represents 2 rows of Y values. Some layouts (e.g., NV21) swap the location + // of the U and V values in each U-V pair. + + *dst.ptr(batch_idx, src_y, src_x) = Y; if (src_y % 2 == 0 && src_x % 2 == 0) { - *dst.ptr(batch_idx, src_rows + src_y / 2, uv_x + uidx) = U; - *dst.ptr(batch_idx, src_rows + src_y / 2, uv_x + (1 - uidx)) = V; + const int uv_y = src_rows + src_y / 2; // The interleaved U-V semi-plane is 1/2 the height of the Y data. + const int uv_x = (src_x & ~1); // Convert x to even # (set lowest bit to 0). + + *dst.ptr(batch_idx, uv_y, uv_x + uidx) = U; // Some formats swap the U and V elements (as indicated + *dst.ptr(batch_idx, uv_y, uv_x + (uidx ^ 1)) = V; // by the uidx parameter). } } @@ -583,23 +590,39 @@ __global__ void bgr_to_yuv420p_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src assert(checkShapeFromYUV420(dst.height(batch_idx), dst.width(batch_idx), code)); - int plane_y_step = src_rows * src_cols; - int plane_uv_step = plane_y_step / 4; - int uv_x = (src_y % 4 < 2) ? src_x / 2 : (src_x / 2 + src_cols / 2); - uchar b = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx)); uchar g = static_cast(*src.ptr(batch_idx, src_y, src_x, 1)); uchar r = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx ^ 2)); - // Ignore gray channel if input is RGBA + // Ignore alpha channel if input is RGBA. 
uchar Y{0}, U{0}, V{0}; bgr_to_yuv42xxp_kernel(r, g, b, Y, U, V); - *dst.ptr(batch_idx, src_y, src_x, 0) = Y; + // U and V are sampled at half the full resolution (in both x and y) and arranged as non-interleaved planes + // (i.e., planar format). Each subsampled U and V "plane" is arranged as full rows after the full resolution Y + // data--so two consecutive subsampled U or V rows are combined into one row spanning the same width as the Y + // plane. Example memory layout for 4 x 4 image (e.g. I420): + // Y_00 Y_01 Y_02 Y_03 + // Y_10 Y_11 Y_12 Y_13 + // Y_20 Y_21 Y_22 Y_23 + // Y_30 Y_31 Y_32 Y_33 + // U_00 U_02 U_20 U_22 + // V_00 V_02 V_20 V_22 + // Each U and V value corresponds to a 2x2 block of Y values--e.g. U_00 and V_00 correspond to Y_00, Y_01, Y_10, + // and Y_11. Each full U and V row represents 4 rows of Y values. Some layouts (e.g., YV12) swap the location + // of the U and V planes. + + *dst.ptr(batch_idx, src_y, src_x) = Y; if (src_y % 2 == 0 && src_x % 2 == 0) { - *dst.ptr(batch_idx, src_rows + src_y / 4, uv_x + plane_uv_step * uidx) = U; - *dst.ptr(batch_idx, src_rows + src_y / 4, uv_x + plane_uv_step * (1 - uidx)) = V; + const int by = src_rows + src_y / 4; // Base row index for U and V: subsampled plane is 1/4 the height. + const int h4 = src_rows / 4; // Height (# of rows) of each subsampled U and V plane. + + // Compute x position that combines two subsampled rows into one. + const int uv_x = (src_x / 2) + ((src_rows / 2) & -((src_y / 2) & 1)); + + *dst.ptr(batch_idx, by + h4 * uidx, uv_x) = U; // Some formats swap the U and V "planes" (as indicated + *dst.ptr(batch_idx, by + h4 * (uidx ^ 1), uv_x) = V; // by the uidx parameter). } } @@ -618,11 +641,13 @@ __global__ void yuv420sp_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC sr assert(checkShapeFromYUV420(src.height(batch_idx), src.width(batch_idx), code)); - int uv_x = (dst_x % 2 == 0) ? dst_x : (dst_x - 1); + // See layout commments in bgr_to_yuv420sp_char_nhwc. 
+ const int uv_y = dst_rows + dst_y / 2; // The interleaved U-V semi-plane is 1/2 the height of the Y data. + const int uv_x = (dst_x & ~1); // Convert x to even # (set lowest bit to 0). T Y = *src.ptr(batch_idx, dst_y, dst_x); - T U = *src.ptr(batch_idx, dst_rows + dst_y / 2, uv_x + uidx); - T V = *src.ptr(batch_idx, dst_rows + dst_y / 2, uv_x + 1 - uidx); + T U = *src.ptr(batch_idx, uv_y, uv_x + uidx); // Some formats swap the U and V elements (as indicated + T V = *src.ptr(batch_idx, uv_y, uv_x + (uidx ^ 1)); // by the uidx parameter). uchar r{0}, g{0}, b{0}, a{0xff}; yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); @@ -651,13 +676,16 @@ __global__ void yuv420p_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src assert(checkShapeFromYUV420(src.height(batch_idx), src.width(batch_idx), code)); - int plane_y_step = dst_rows * dst_cols; - int plane_uv_step = plane_y_step / 4; - int uv_x = (dst_y % 4 < 2) ? dst_x / 2 : (dst_x / 2 + dst_cols / 2); + // See layout commments in bgr_to_yuv420p_char_nhwc. + const int by = dst_rows + dst_y / 4; // Base row index for U and V: subsampled plane is 1/4 the height. + const int h4 = dst_rows / 4; // Height (# of rows) of each subsampled U and V plane. + + // Compute x position that combines two subsampled rows into one. + const int uv_x = (dst_x / 2) + ((dst_cols / 2) & -((dst_y / 2) & 1)); T Y = *src.ptr(batch_idx, dst_y, dst_x); - T U = *src.ptr(batch_idx, dst_rows + dst_y / 4, uv_x + plane_uv_step * uidx); - T V = *src.ptr(batch_idx, dst_rows + dst_y / 4, uv_x + plane_uv_step * (1 - uidx)); + T U = *src.ptr(batch_idx, by + h4 * uidx, uv_x); // Some formats swap the U and V "planes" (as indicated + T V = *src.ptr(batch_idx, by + h4 * (uidx ^ 1), uv_x); // by the uidx parameter). 
uchar r{0}, g{0}, b{0}, a{0xff}; yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); @@ -671,33 +699,63 @@ __global__ void yuv420p_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src } } +// YUV 422 interleaved formats (e.g., YUYV, YVYU, and UYVY) group 2 pixels into groups of 4 elements. Each group of two +// pixels has two distinct luma (Y) values, one for each pixel. The chromaticity values (U and V) are subsampled by a +// factor of two so that there is only one U and one V value for each group of 2 pixels. Example memory layout for +// 4 x 4 image (UYVY format): +// U_00 Y_00 V_00 Y_01 U_02 Y_02 V_02 Y_03 +// U_10 Y_10 V_10 Y_11 U_12 Y_12 V_12 Y_13 +// U_20 Y_20 V_20 Y_21 U_22 Y_22 V_22 Y_23 +// U_30 Y_30 V_30 Y_31 U_32 Y_32 V_32 Y_33 +// Each U and V value corresponds to two Y values--e.g. U_00 and V_00 correspond to Y_00 and Y_10 while U_12 and V_12 +// correspond to Y_12 and Y_13. Thus, a given Y value, Y_rc = Y(r,c) (where r is the row, or y coordinate, and c is the +// column, or x coordinate), corresponds to U(r,c') and V(r,c') where c' is the even column coordinate <= c -- that is, +// c' = 2 * floor(c/2) = (c & ~1). Some layouts swap the positions of the chromaticity and luma values (e.g., YUYV) +// (indicated by the yidx parameter) and / or swap the the positions of the U and V chromaticity valus (e.g., YVYU) +// (indicated by the uidx parameter). +// The data layout is treated as a single channel tensor, so each group of 4 values corresponds to two pixels. As such, +// the tensor width is twice the actual pixel width. Thus, it's easiest to process 4 consecutive values (2 pixels) per +// thread. 
template -__global__ void yuv422_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, - cuda::ImageBatchVarShapeWrapNHWC dst, int bidx, int yidx, int uidx) +__global__ void yuv422_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrap src, cuda::ImageBatchVarShapeWrapNHWC dst, + int dcn, int bidx, int yidx, int uidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; const int batch_idx = get_batch_idx(); - int dst_cols = dst.width(batch_idx); - int dst_rows = dst.height(batch_idx); - if (dst_x >= dst_cols || dst_y >= dst_rows) + + int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + if (dst_y >= dst.height(batch_idx)) + return; + + int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + if (dst_x >= dst.width(batch_idx)) return; - int uv_x = (dst_x % 2 == 0) ? dst_x : dst_x - 1; - T Y = *src.ptr(batch_idx, dst_y, dst_x, yidx); - T U = *src.ptr(batch_idx, dst_y, uv_x, (1 - yidx) + uidx); - T V = *src.ptr(batch_idx, dst_y, uv_x, (1 - yidx) + uidx ^ 2); + int src_x = 2 * dst_x; // Process 4 source elements/thread (i.e., 2 destination pixels). + int uv_x = (src_x & ~3); // Compute "even" x coordinate for U and V (set lowest two bits to 0). + + T Y0 = *src.ptr(batch_idx, dst_y, src_x + yidx); + T Y1 = *src.ptr(batch_idx, dst_y, src_x + yidx + 2); + T U = *src.ptr(batch_idx, dst_y, uv_x + (yidx ^ 1) + uidx); + T V = *src.ptr(batch_idx, dst_y, uv_x + (yidx ^ 1) + (uidx ^ 2)); uchar r{0}, g{0}, b{0}, a{0xff}; - yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); + + yuv42xxp_to_bgr_kernel(int(Y0), int(U), int(V), r, g, b); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; - if (dst.numChannels() == 4) - { + if (dcn == 4) + *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; + + dst_x++; // Move to next output pixel. 
+ yuv42xxp_to_bgr_kernel(int(Y1), int(U), int(V), r, g, b); + + *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; + *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; + *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; + if (dcn == 4) *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; - } } template @@ -716,17 +774,25 @@ __global__ void yuv420_to_gray_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; } +// See layout comment before yuv422_to_bgr_char_nhwc. template -__global__ void yuv422_to_gray_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, - cuda::ImageBatchVarShapeWrapNHWC dst, int yidx) +__global__ void yuv422_to_gray_char_nhwc(cuda::ImageBatchVarShapeWrap src, cuda::ImageBatchVarShapeWrapNHWC dst, + int yidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; const int batch_idx = get_batch_idx(); - if (dst_x >= dst.width(batch_idx) || dst_y >= dst.height(batch_idx)) + + int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + if (dst_y >= dst.height(batch_idx)) return; - T Y = *src.ptr(batch_idx, dst_y, dst_x, yidx); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; + + int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + if (dst_x >= dst.width(batch_idx)) + return; + + int src_x = 2 * dst_x; // Process 4 source elements/thread (i.e., 2 destination pixels). 
+ + *dst.ptr(batch_idx, dst_y, dst_x++) = *src.ptr(batch_idx, dst_y, src_x + yidx); + *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, dst_y, src_x + yidx + 2); } inline ErrorCode BGR_to_RGB(const ImageBatchVarShapeDataStridedCuda &inData, @@ -776,6 +842,12 @@ inline ErrorCode BGR_to_RGB(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (out_data_type == kCV_16F && sch < 4 && dch == 4) + { + LOG_ERROR("Adding alpha to the output is not supported for " << out_data_type); + return ErrorCode::INVALID_DATA_SHAPE; + } + int max_width = inData.maxSize().w; int max_height = inData.maxSize().h; int batch_size = inData.numImages(); @@ -794,8 +866,8 @@ inline ErrorCode BGR_to_RGB(const ImageBatchVarShapeDataStridedCuda &inData, checkKernelErrors(); } break; + case kCV_16F: // Not properly handled when adding alpha to the destination. case kCV_16U: - case kCV_16F: case kCV_16S: { cuda::ImageBatchVarShapeWrapNHWC src_ptr(inData, sch); @@ -874,6 +946,12 @@ inline ErrorCode GRAY_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (out_data_type == kCV_16F && dch == 4) + { + LOG_ERROR("Adding alpha to the output is not supported for " << out_data_type); + return ErrorCode::INVALID_DATA_SHAPE; + } + int max_width = inData.maxSize().w; int max_height = inData.maxSize().h; int batch_size = inData.numImages(); @@ -892,8 +970,8 @@ inline ErrorCode GRAY_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, checkKernelErrors(); } break; + case kCV_16F: // Not properly handled when adding alpha to the destination. 
case kCV_16U: - case kCV_16F: case kCV_16S: { cuda::ImageBatchVarShapeWrapNHWC src_ptr(inData, channels); @@ -1464,7 +1542,7 @@ inline ErrorCode YUV422_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, int channels = inData.uniqueFormat().numChannels(); DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); - if (channels != 2) + if (channels != 3) { LOG_ERROR("Invalid input channel number " << channels); return ErrorCode::INVALID_DATA_SHAPE; @@ -1483,28 +1561,52 @@ inline ErrorCode YUV422_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, int dcn = outData.uniqueFormat().numChannels(); - if (dcn != 3 && dcn != 4) + if ((code != NVCV_COLOR_YUV2GRAY_UYVY && code != NVCV_COLOR_YUV2GRAY_YUY2 || dcn != 1) && dcn != 3 && dcn != 4) { LOG_ERROR("Invalid output channel number " << dcn); return ErrorCode::INVALID_DATA_SHAPE; } + auto inList = inData.imageList(); + + for (int i = 0; i < inData.numImages(); i++) + { + if (inList[i].numPlanes != 1) + { + LOG_ERROR("Input batch images must all be a single plane of data"); + return ErrorCode::INVALID_DATA_SHAPE; + } + + NVCVImagePlaneStrided plane = inList[i].planes[0]; + + if (plane.width % 2 != 0) + { + LOG_ERROR("Input batch images must all have a width that is a multiple of 2"); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (plane.rowStride < plane.width * 2) + { + LOG_ERROR("Insufficient input batch image stride"); + return ErrorCode::INVALID_DATA_SHAPE; + } + } + int max_width = inData.maxSize().w; int max_height = inData.maxSize().h; int batch_size = inData.numImages(); dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(max_width, blockSize.x), divUp(max_height, blockSize.y), batch_size); + dim3 gridSize(divUp(max_width / 2, blockSize.x), divUp(max_height, blockSize.y), batch_size); - cuda::ImageBatchVarShapeWrapNHWC src_ptr(inData, channels); - cuda::ImageBatchVarShapeWrapNHWC dst_ptr(outData, dcn); + cuda::ImageBatchVarShapeWrap src_ptr(inData); + cuda::ImageBatchVarShapeWrapNHWC 
dst_ptr(outData, dcn); switch (code) { case NVCV_COLOR_YUV2GRAY_YUY2: case NVCV_COLOR_YUV2GRAY_UYVY: { - yuv422_to_gray_char_nhwc<<>>(src_ptr, dst_ptr, yidx); + yuv422_to_gray_char_nhwc<<>>(src_ptr, dst_ptr, yidx); checkKernelErrors(); } break; @@ -1521,7 +1623,7 @@ inline ErrorCode YUV422_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, case NVCV_COLOR_YUV2RGBA_UYVY: case NVCV_COLOR_YUV2BGRA_UYVY: { - yuv422_to_bgr_char_nhwc<<>>(src_ptr, dst_ptr, bidx, yidx, uidx); + yuv422_to_bgr_char_nhwc<<>>(src_ptr, dst_ptr, dcn, bidx, yidx, uidx); checkKernelErrors(); } break; diff --git a/src/cvcuda/priv/legacy/morphology_var_shape.cu b/src/cvcuda/priv/legacy/morphology_var_shape.cu index 9edfa907..9f751f39 100644 --- a/src/cvcuda/priv/legacy/morphology_var_shape.cu +++ b/src/cvcuda/priv/legacy/morphology_var_shape.cu @@ -209,12 +209,6 @@ ErrorCode MorphologyVarShape::infer(const nvcv::ImageBatchVarShape &inBatch, con return ErrorCode::INVALID_DATA_FORMAT; } - if (input_format != output_format) - { - LOG_ERROR("Invalid DataFormat between input (" << input_format << ") and output (" << output_format << ")"); - return ErrorCode::INVALID_DATA_FORMAT; - } - DataFormat format = input_format; if (!(format == kNHWC || format == kHWC)) { diff --git a/src/nvcv/CMakeLists.txt b/src/nvcv/CMakeLists.txt index f5f544ff..2f2ae1ea 100644 --- a/src/nvcv/CMakeLists.txt +++ b/src/nvcv/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.20.1) project(nvcv LANGUAGES C CXX - VERSION 0.11.0 + VERSION 0.12.0 DESCRIPTION "NVCV is NVIDIA Computer Vision library" ) diff --git a/tests/common/TensorDataUtils.cpp b/tests/common/TensorDataUtils.cpp index a5486a13..243fb69e 100644 --- a/tests/common/TensorDataUtils.cpp +++ b/tests/common/TensorDataUtils.cpp @@ -184,14 +184,17 @@ nvcv::Tensor CreateTensor(int numImages, int imgWidth, int imgHeight, const nvcv if (imgFormat == NVCV_IMAGE_FORMAT_NV12 || imgFormat == NVCV_IMAGE_FORMAT_NV12_ER || imgFormat == NVCV_IMAGE_FORMAT_NV21 || 
imgFormat == NVCV_IMAGE_FORMAT_NV21_ER) { - int height420 - = (imgHeight * 3) / 2; // tensor size is 3/2 times the image size to accommodate the 1/2 chroma planes - // width must be even and height must be multiple of 3 (original height must be even and multiple of 2) - if (height420 % 3 != 0 || imgWidth % 2 != 0) + // Width and height must be a multiple of 2 (i.e., even). + if (imgHeight % 2 != 0 || imgWidth % 2 != 0) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid height"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid height or width: height and width need to be a " + "multiple of 2 for planar and semi-planar YUV420 formats."); } + // Tensor height is 3/2 times the image height to accommodate the half-height chroma planes. + int height420 = (imgHeight * 3) / 2; + if (numImages == 1) { return nvcv::Tensor( @@ -209,6 +212,25 @@ nvcv::Tensor CreateTensor(int numImages, int imgWidth, int imgHeight, const nvcv return nvcv::Tensor(numImages, {imgWidth, height420}, nvcv::ImageFormat(NVCV_IMAGE_FORMAT_Y8)); } } + else if (imgFormat == NVCV_IMAGE_FORMAT_UYVY || imgFormat == NVCV_IMAGE_FORMAT_UYVY_ER + || imgFormat == NVCV_IMAGE_FORMAT_YUYV || imgFormat == NVCV_IMAGE_FORMAT_YUYV_ER) + { + if (imgWidth % 2 != 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid width: width needs to be a multiple of 2 for interleaved YUV422 formats."); + } + + int wdth422 = 2 * imgWidth; // Tensor width is 2x the image width to accomodate two chromaticity values for + // each group of two luma values (UYVY or YUYV). + // clang-format off + nvcv::DataType type = imgFormat.planeDataType(0).channelType(0); + nvcv::TensorShape shape = numImages > 1 ? nvcv::TensorShape({numImages, imgHeight, wdth422, 1}, "NHWC") + : nvcv::TensorShape( {imgHeight, wdth422, 1}, "HWC"); + + return nvcv::Tensor(shape, type); + // clang-format on + } if (numImages == 1) { int numChannels = imgFormat.numPlanes() == 1 ? 
imgFormat.planeNumChannels(0) : imgFormat.numPlanes(); diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index fcf9a455..c89260a3 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -53,7 +53,9 @@ add_executable(cvcuda_test_system TestOpCropFlipNormalizeReformat.cpp FlipUtils.cpp ConvUtils.cpp + CvtColorUtils.cpp ResizeUtils.cpp + TestUtils.cpp TestOpNonMaximumSuppression.cpp TestOpReformat.cpp TestOpResize.cpp diff --git a/tests/cvcuda/system/CvtColorUtils.cpp b/tests/cvcuda/system/CvtColorUtils.cpp new file mode 100644 index 00000000..b3fb91a2 --- /dev/null +++ b/tests/cvcuda/system/CvtColorUtils.cpp @@ -0,0 +1,1074 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CvtColorUtils.hpp" + +#include +#include + +#include // For std::floor +#include // For std::memcpy + +namespace cuda = nvcv::cuda; + +template +using Vector3 = cuda::math::Vector; + +template +using Matrix3x3 = cuda::math::Matrix; + +using Vec3f = Vector3; +using Vec3d = Vector3; + +using Mat3f = Matrix3x3; +using Mat3d = Matrix3x3; + +using std::vector; + +// Accurate coefficients for converting RGB to ITU Rec.601 luma. 
+// Found at +// http://www.brucelindbloom.com/index.html?WorkingSpaceInfo.html +// and +// https://www.imagemagick.org/include/api/pixel.php. +// NOTE: These coefficients are more accurate than the standard [0.299, 0.587, 0.144] values used elsewhere and may +// results in slightly different floating point or large integer (e.g., uint16 or uint32) pixel values. +// static constexpr double Red2Y = 0.298839; // Y = Red2Y * R +// static constexpr double Grn2Y = 0.586811; // + Grn2Y * G +// static constexpr double Blu2Y = 0.114350; // + Blu2Y * B +static constexpr double Red2Y = 0.299; // Y = Red2Y * R +static constexpr double Grn2Y = 0.587; // + Grn2Y * G +static constexpr double Blu2Y = 0.114; // + Blu2Y * B + +// Coefficients to convert non-linear RGB to PAL (analog color TV standard) chromaticity (U and V) components. +// NOTE: Both PAL and NTSC use the ITU Rec.601 RGB coefficients to compute Y. +// static constexpr double Blu2U_PAL = 0.492111; // U = Blu2U_PAL * (B - Y) + 0.5 +// static constexpr double Red2V_PAL = 0.877283; // V = Red2V_PAL * (R - Y) + 0.5 +static constexpr double Blu2U_PAL = 0.492; // U = Blu2U_PAL * (B - Y) + 0.5 +static constexpr double Red2V_PAL = 0.877; // V = Red2V_PAL * (R - Y) + 0.5 + +// Coefficients to convert non-linear RGB to ITU Rec.601 chromaticity (Cb and Cr) components. +static constexpr double Blu2Cb_601 = 0.56455710; // 1.0 / 1.7713 Cb/U +static constexpr double Red2Cr_601 = 0.71310298; // 1.0 / 1.402322 Cr/V + +// clang-format off + +// Coefficients to convert chromaticity (U and V) components to RGB . +// static constexpr double U2Blu = 2.03211; +// static constexpr double U2Grn = -0.39465; +// static constexpr double V2Grn = -0.58060; +// static constexpr double V2Red = 1.13983; +static constexpr double U2Blu = 2.032; +static constexpr double U2Grn = -0.395; +static constexpr double V2Grn = -0.581; +static constexpr double V2Red = 1.140; + +// Coefficients to convert RGB to ITU Rec.601 YCbCr. 
+static constexpr double R2Y_NV12 = 0.255785; +static constexpr double G2Y_NV12 = 0.502160; +static constexpr double B2Y_NV12 = 0.097523; + +static constexpr double R2U_NV12 = -0.147644; +static constexpr double G2U_NV12 = -0.289856; +static constexpr double B2U_NV12 = 0.4375; + +static constexpr double R2V_NV12 = 0.4375; +static constexpr double G2V_NV12 = -0.366352; +static constexpr double B2V_NV12 = -0.071148; + +// Coefficients to convert RGB to ITU Rec.601 YCbCr. +static constexpr double Y2R_NV12 = 1.16895; +static constexpr double U2R_NV12 = 0.0; +static constexpr double V2R_NV12 = 1.60229; + +static constexpr double Y2G_NV12 = 1.16895; +static constexpr double U2G_NV12 = -0.3933; +static constexpr double V2G_NV12 = -0.81616; + +static constexpr double Y2B_NV12 = 1.16895; +static constexpr double U2B_NV12 = 2.02514; +static constexpr double V2B_NV12 = 0.0; + +// Coefficients to add or subtract from YCbCr (abbreviated YUV)components to convert between RGB and ITU Rec.601 YCbCr. +static constexpr double Add2Y_NV12 = 16.0; +static constexpr double Add2U_NV12 = 128.0; +static constexpr double Add2V_NV12 = 128.0; + +// clang-format on + +template> +constexpr BT Alpha = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + +//-==================================================================================================================-// +// Set AlphaOnly to true to add/remove alpha channel to RGB/BGR image (without switching between RGB and BGR). +template +static void convertRGBtoBGR(T *dst, const T *src, size_t numPixels, bool srcRGBA, bool dstRGBA) +{ + const uint incr = 3 + srcRGBA; + + for (size_t i = 0; i < numPixels; i++, src += incr) + { + // clang-format off + if constexpr (AlphaOnly) { *dst++ = src[0]; *dst++ = src[1]; *dst++ = src[2]; } + else { *dst++ = src[2]; *dst++ = src[1]; *dst++ = src[0]; } + if (dstRGBA) *dst++ = srcRGBA ? 
src[3] : Alpha; + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoBGR(vector &dst, const vector &src, size_t numPixels, bool srcRGBA, bool dstRGBA) +{ + convertRGBtoBGR(dst.data(), src.data(), numPixels, srcRGBA, dstRGBA); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoBGR(T) template void convertRGBtoBGR(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoBGR(uint8_t); +MAKE_RGBtoBGR(uint16_t); +MAKE_RGBtoBGR(int32_t); +MAKE_RGBtoBGR(float); +MAKE_RGBtoBGR(double); + +#undef MAKE_RGBtoBGR + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void changeAlpha(vector &dst, const vector &src, size_t numPixels, bool srcRGBA, bool dstRGBA) +{ + convertRGBtoBGR(dst.data(), src.data(), numPixels, srcRGBA, dstRGBA); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_CHANGE_ALPHA(T) template void changeAlpha(vector &, const vector &, size_t, bool, bool) + +MAKE_CHANGE_ALPHA(uint8_t); +MAKE_CHANGE_ALPHA(uint16_t); +MAKE_CHANGE_ALPHA(int32_t); +MAKE_CHANGE_ALPHA(float); +MAKE_CHANGE_ALPHA(double); + +#undef MAKE_CHANGE_ALPHA + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertRGBtoGray(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + const int incr = 3 + rgba; + + for (size_t i = 0; i < numPixels; i++, dst++, src += incr) 
+ { + // clang-format off + if (bgr) *dst = static_cast(Blu2Y * src[0] + Grn2Y * src[1] + Red2Y * src[2]); + else *dst = static_cast(Red2Y * src[0] + Grn2Y * src[1] + Blu2Y * src[2]); + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoGray(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertRGBtoGray(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoGray(T) template void convertRGBtoGray(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoGray(uint8_t); +MAKE_RGBtoGray(uint16_t); +MAKE_RGBtoGray(int32_t); +MAKE_RGBtoGray(float); +MAKE_RGBtoGray(double); + +#undef MAKE_RGBtoGray + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertGrayToRGB(T *dst, const T *src, size_t numPixels, bool rgba) +{ + for (size_t i = 0; i < numPixels; i++) + { + T val = *src++; + + // clang-format off + *dst++ = val; *dst++ = val; *dst++ = val; + if (rgba) *dst++ = Alpha; + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertGrayToRGB(vector &dst, const vector &src, size_t numPixels, bool rgba) +{ + convertGrayToRGB(dst.data(), src.data(), numPixels, rgba); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_GrayToRGB(T) template void convertGrayToRGB(vector &, const vector &, size_t, bool) + +MAKE_GrayToRGB(uint8_t); +MAKE_GrayToRGB(uint16_t); 
+MAKE_GrayToRGB(int32_t); +MAKE_GrayToRGB(float); +MAKE_GrayToRGB(double); + +#undef MAKE_GrayToRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertRGBtoHSV(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + // Set the hue range (e.g., 0-360 for float types) and scale factor (to convert the final value to output hue value). + constexpr double range = (sizeof(T) > 1) ? 360.0 : (FullRange ? 256.0 : 180.0); + constexpr double scale = range / 360.0; + constexpr double norm = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr double round = std::is_floating_point_v ? 0 : 0.5; + + for (size_t i = 0; i < numPixels; i++) + { + double R = static_cast(*src++) / norm; + double G = static_cast(*src++) / norm; + double B = static_cast(*src++) / norm; + + // clang-format off + if (bgr) std::swap(R, B); + if (rgba) src++; + // clang-format on + + double Vmin = std::min(R, std::min(G, B)); + double V = std::max(R, std::max(G, B)); + + double diff = static_cast(V - Vmin); + + double S = static_cast(V) > DBL_EPSILON ? diff / V : 0.0; + double H = 0.0; + + if (diff > DBL_EPSILON) + { + // clang-format off + diff = 60.0 / diff; + if (V == R) H = (G - B) * diff; + else if (V == G) H = (B - R) * diff + 120.0; + else H = (R - G) * diff + 240.0; + // clang-format on + } + H *= scale; + S *= norm; + V *= norm; + + // Make sure hue falls within the proper range: the value 'range' (e.g., 360) should not appear since it's equivalent to 0. + H += round; + // clang-format off + if (H >= range) H -= range; // For the case when T is uint8_t and FullRange is false, H can be > 180. 
+ else if (H < 0.0) H += range; + // clang-format on + H -= round; + + *dst++ = static_cast(H + round); + *dst++ = static_cast(S + round); + *dst++ = static_cast(V + round); + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoHSV(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertRGBtoHSV(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Restricted range hue (FullRange = false): values between [0-180). Applies only to uint8_t, but still need to +// instantiate all the types. +#define MAKE_RGBtoHSV(T) template void convertRGBtoHSV(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoHSV(uint8_t); +MAKE_RGBtoHSV(uint16_t); +MAKE_RGBtoHSV(int32_t); +MAKE_RGBtoHSV(float); +MAKE_RGBtoHSV(double); + +#undef MAKE_RGBtoHSV + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Full range hue (FullRange = false): values between [0-256). Applies only to uint8_t, but still need to +// instantiate all the types. 
+#define MAKE_RGBtoHSV(T) template void convertRGBtoHSV(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoHSV(uint8_t); +MAKE_RGBtoHSV(uint16_t); +MAKE_RGBtoHSV(int32_t); +MAKE_RGBtoHSV(float); +MAKE_RGBtoHSV(double); + +#undef MAKE_RGBtoHSV + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +/* To convert HSV to RGB: + 1) Ensure that (H,S,V) range is (360.0, 1.0, 1.0) + 2) H' = H / 60 + 3) C = V * S + 4) I = (int)H + 5) h = H' - I // Fractional part of H' + 6) X = C * (1 - fabs(fmod(H', 2.0) - 1.0)) + = C * (1 - fabs(H' - (I & ~1) - 1.0)) + = C * ((I & 1) ? 1 - h : h) + 5) m = V - C + = V - V * S + = V * (1 - S) + 7) p = X + m // When I is even: (I & 1) == 0 (I = 0, 2, or 4) + = C * h + V - C + = V * S * h + V * (1 - S) + = V * (S * h + 1 - S) + = V * (1 - S + S * h) + = V * (1 - S * (1 - h)) + 8) q = X + m // When I is odd: (I & 1) == 1 (I = 1, 3, or 5) + = C * (1 - h) + V - C + = V * S * (1 - h) + V * (1 - S) + = V * (S - S * h + 1 - S) + = V * (1 - S * h) + 9) Cases: // Note: C + m = C + V - C = V + I == 0: R = C + m = V + G = X + m = p // Even case + B = m + + I == 1: R = X + m = q // Odd case + G = C + m = V + B = m + + I == 2: R = m + G = C + m = V + B = X + m = p // Even case + + I == 3: R = m + G = X + m = q // Odd case + B = C + m = V + + I == 4: R = X + m = p // Even case + G = m + B = C + m = V + + I == 5: R = C + m = V + G = m + B = X + m = q // Odd case +*/ +template +void convertHSVtoRGB(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + constexpr double range = (sizeof(T) > 1) ? 360.0 : (FullRange ? 256.0 : 180.0); + constexpr double scale = 6.0 / range; + constexpr double norm = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr double round = std::is_floating_point_v ? 
0 : 0.5; + + constexpr uint mapR[6] = {0, 2, 1, 1, 3, 0}; + constexpr uint mapG[6] = {3, 0, 0, 2, 1, 1}; + constexpr uint mapB[6] = {1, 1, 3, 0, 0, 2}; + + for (size_t i = 0; i < numPixels; i++) + { + double H = *src++ * scale; // 0 <= H < 6 + double S = *src++ / norm; // 0 <= S <= 1 + double V = *src++ / norm; // 0 <= V <= 1 + + int idx = static_cast(std::floor(H)); + + H -= idx; + + // clang-format off + idx %= 6; + if (idx < 0) idx += 6; + + double val[] = {V, + V * (1 - S), + V * (1 - S * H), + V * (1 - S * (1 - H))}; + + uint r = mapR[idx]; + uint g = mapG[idx]; + uint b = mapB[idx]; + + if (bgr) std::swap(r, b); + *dst++ = static_cast(val[r] * norm + round); + *dst++ = static_cast(val[g] * norm + round); + *dst++ = static_cast(val[b] * norm + round); + if (rgba) *dst++ = Alpha; + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertHSVtoRGB(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertHSVtoRGB(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Restricted range hue (FullRange = false): values between [0-180). Applies only to uint8_t, but still need to +// instantiate all the types. +#define MAKE_HSVtoRGB(T) template void convertHSVtoRGB(vector &, const vector &, size_t, bool, bool) + +MAKE_HSVtoRGB(uint8_t); +MAKE_HSVtoRGB(uint16_t); +MAKE_HSVtoRGB(int32_t); +MAKE_HSVtoRGB(float); +MAKE_HSVtoRGB(double); + +#undef MAKE_HSVtoRGB + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Full range hue (FullRange = false): values between [0-256). Applies only to uint8_t, but still need to +// instantiate all the types. 
+#define MAKE_HSVtoRGB(T) template void convertHSVtoRGB(vector &, const vector &, size_t, bool, bool) + +MAKE_HSVtoRGB(uint8_t); +MAKE_HSVtoRGB(uint16_t); +MAKE_HSVtoRGB(int32_t); +MAKE_HSVtoRGB(float); +MAKE_HSVtoRGB(double); + +#undef MAKE_HSVtoRGB + +//-==================================================================================================================-// +template +void convertRGBtoYUV_PAL(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + constexpr T max = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr T delta = max / 2 + (std::is_floating_point_v ? 0 : 1); + + for (size_t i = 0; i < numPixels; i++) + { + T red = *src++; + T grn = *src++; + T blu = *src++; + + // clang-format off + if (bgr) std::swap(red, blu); + if (rgba) src++; + // clang-format on + + double Y = Red2Y * red + Grn2Y * grn + Blu2Y * blu; + + *dst++ = cuda::SaturateCast(Y); + *dst++ = cuda::SaturateCast(Blu2U_PAL * (blu - Y) + delta); + *dst++ = cuda::SaturateCast(Red2V_PAL * (red - Y) + delta); + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoYUV_PAL(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertRGBtoYUV_PAL(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoYUV(T) template void convertRGBtoYUV_PAL(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoYUV(uint8_t); +MAKE_RGBtoYUV(uint16_t); +MAKE_RGBtoYUV(int32_t); +MAKE_RGBtoYUV(float); +MAKE_RGBtoYUV(double); + +#undef MAKE_RGBtoYUV + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void 
convertYUVtoRGB_PAL(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + constexpr T max = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr T delta = max / 2 + (std::is_floating_point_v ? 0 : 1); + + for (size_t i = 0; i < numPixels; i++) + { + double Y = *src++; + double U = *src++; + double V = *src++; + + U -= delta; + V -= delta; + + double red = Y + V * V2Red; + double grn = Y + U * U2Grn + V * V2Grn; + double blu = Y + U * U2Blu; + + // clang-format off + if (bgr) std::swap(red, blu); + *dst++ = cuda::SaturateCast(red); + *dst++ = cuda::SaturateCast(grn); + *dst++ = cuda::SaturateCast(blu); + if (rgba) *dst++ = Alpha; + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoRGB_PAL(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertYUVtoRGB_PAL(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_YUVtoRGB(T) template void convertYUVtoRGB_PAL(vector &, const vector &, size_t, bool, bool) + +MAKE_YUVtoRGB(uint8_t); +MAKE_YUVtoRGB(uint16_t); +MAKE_YUVtoRGB(int32_t); +MAKE_YUVtoRGB(float); +MAKE_YUVtoRGB(double); + +#undef MAKE_YUVtoRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertRGBtoYUV_420(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure both width and height are multiples of 2 since we're processing 2x2 blocks. 
+ assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrPix = 3 + rgba; + const size_t incrSrc = imgPixels * incrPix; + const size_t incrDst = imgPixels * 3 / 2; + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *y = dst; + T *u = y + imgPixels; + T *v = u + imgPixels / 4; + + const T *rgb = src; + + // clang-format off + if (yvu) std::swap(u, v); + // clang-format on + + for (uint h = 0; h < hght; h++) + { + for (uint w = 0; w < wdth; w++, rgb += incrPix) + { + T R = rgb[0]; + T G = rgb[1]; + T B = rgb[2]; + + // Convert all RGB values to Y values and store them. + // clang-format off + if (bgr) std::swap(R, B); + *y++ = cuda::SaturateCast(R2Y_NV12 * R + G2Y_NV12 * G + B2Y_NV12 * B + Add2Y_NV12); + // clang-format on + + // Convert only even pixels (in width and height) to U and V values and store them. + if ((w & 1) == 0 && (h & 1) == 0) + { + double U = R2U_NV12 * R + G2U_NV12 * G + B2U_NV12 * B + Add2U_NV12; + double V = R2V_NV12 * R + G2V_NV12 * G + B2V_NV12 * B + Add2V_NV12; + + *u++ = cuda::SaturateCast(U); + *v++ = cuda::SaturateCast(V); + } + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoYUV_420(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure input data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + + // YUV 420 needs 3 elements for each two RGB pixels. 
+ assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertRGBtoYUV_420(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoYUV(T) \ + template void convertRGBtoYUV_420(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_RGBtoYUV(uint8_t); +MAKE_RGBtoYUV(uint16_t); +MAKE_RGBtoYUV(int32_t); +MAKE_RGBtoYUV(float); +MAKE_RGBtoYUV(double); + +#undef MAKE_RGBtoYUV + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertYUVtoRGB_420(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure both width and height are multiples of 2 since we're processing 2x2 blocks. + assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrSrc = imgPixels * 3 / 2; + const size_t incrDst = imgPixels * (3 + rgba); + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *rgb = dst; + + const T *y = src; + + for (uint h = 0; h < hght; h++) + { + // clang-format off + // NOTE: when computing subsampled row index, h needs to be integer divided by 4 before multiplying by width. + const T *u = src + imgPixels + (h / 4) * wdth + ((h / 2) & 1) * (wdth / 2); + const T *v = u + imgPixels / 4; + + if (yvu) std::swap(u, v); + // clang-format on + + for (uint w = 0; w < wdth; w++) + { + double Y = *y++; + double U = *u; + double V = *v; + + // Convert all YUV (ITU Rec.601) values to RGB values and store them. 
+ Y -= Add2Y_NV12; + U -= Add2U_NV12; + V -= Add2V_NV12; + + // clang-format off + if (Y < 0.0) Y = 0.0; + T R = cuda::SaturateCast(Y2R_NV12 * Y + U2R_NV12 * U + V2R_NV12 * V); + T G = cuda::SaturateCast(Y2G_NV12 * Y + U2G_NV12 * U + V2G_NV12 * V); + T B = cuda::SaturateCast(Y2B_NV12 * Y + U2B_NV12 * U + V2B_NV12 * V); + if (bgr) std::swap(R, B); + *rgb++ = R; + *rgb++ = G; + *rgb++ = B; + if (rgba) *rgb++ = Alpha; + // clang-format on + + u += (w & 1); + v += (w & 1); + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoRGB_420(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure output data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + + // YUV 420 needs 3 elements for each two RGB pixels. 
+ assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertYUVtoRGB_420(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_NV12toRGB(T) \ + template void convertYUVtoRGB_420(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_NV12toRGB(uint8_t); +MAKE_NV12toRGB(uint16_t); +MAKE_NV12toRGB(int32_t); +MAKE_NV12toRGB(float); +MAKE_NV12toRGB(double); + +#undef MAKE_NV12toRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertYUVtoGray_420(T *dst, const T *src, uint wdth, uint hght, uint numImgs) +{ + // Ensure both width and height are multiples of 2. + assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrSrc = imgPixels * 3 / 2; + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += imgPixels) + { + std::memcpy(dst, src, imgPixels * sizeof(T)); // Copy Y plane of each image to destination tensor. + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoGray_420(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs) +{ + // Ensure output data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth); + + // YUV 420 needs 3 elements for each two RGB pixels. 
+ assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertYUVtoGray_420(dst.data(), src.data(), wdth, hght, numImgs); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_YUVtoGray(T) template void convertYUVtoGray_420(vector &, const vector &, uint, uint, uint) + +MAKE_YUVtoGray(uint8_t); +MAKE_YUVtoGray(uint16_t); +MAKE_YUVtoGray(int32_t); +MAKE_YUVtoGray(float); +MAKE_YUVtoGray(double); + +#undef MAKE_YUVtoGray + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertRGBtoNV12(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure both width and height are multiples of 2 since we're processing 2x2 blocks. + assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrPix = 3 + rgba; + const size_t incrSrc = imgPixels * incrPix; + const size_t incrDst = imgPixels * 3 / 2; + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *y = dst; + T *uv = dst + imgPixels; + + const T *rgb = src; + + for (uint h = 0; h < hght; h++) + { + for (uint w = 0; w < wdth; w++, rgb += incrPix) + { + T R = rgb[0]; + T G = rgb[1]; + T B = rgb[2]; + + // Convert all RGB values to Y values and store them. + // clang-format off + if (bgr) std::swap(R, B); + *y++ = cuda::SaturateCast(R2Y_NV12 * R + G2Y_NV12 * G + B2Y_NV12 * B + Add2Y_NV12); + // clang-format on + + // Convert only even pixels (in width and height) to U and V values and store them. 
+ if ((w & 1) == 0 && (h & 1) == 0) + { + double U = R2U_NV12 * R + G2U_NV12 * G + B2U_NV12 * B + Add2U_NV12; + double V = R2V_NV12 * R + G2V_NV12 * G + B2V_NV12 * B + Add2V_NV12; + + // clang-format off + if (yvu) std::swap(U, V); + // clang-format on + *uv++ = cuda::SaturateCast(U); + *uv++ = cuda::SaturateCast(V); + } + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoNV12(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure input data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + + // YUV NV12 needs 3 elements for each two RGB pixels. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertRGBtoNV12(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoNV12(T) \ + template void convertRGBtoNV12(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_RGBtoNV12(uint8_t); +MAKE_RGBtoNV12(uint16_t); +MAKE_RGBtoNV12(int32_t); +MAKE_RGBtoNV12(float); +MAKE_RGBtoNV12(double); + +#undef MAKE_RGBtoNV12 + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertNV12toRGB(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure both width and height are multiples of 2 since we're processing 2x2 blocks. 
+ assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrSrc = imgPixels * 3 / 2; + const size_t incrDst = imgPixels * (3 + rgba); + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *rgb = dst; + + const T *y = src; + + for (uint h = 0; h < hght; h++) + { + // NOTE: when computing uv row index, h needs to be integer divided by 2 before multiplying by width. + const T *uv = src + imgPixels + (h >> 1) * wdth; + + for (uint w = 0; w < wdth; w++) + { + double Y = *y++; + double U = uv[0]; + double V = uv[1]; + + // clang-format off + if (yvu) std::swap(U, V); + + // Convert all YUV (ITU Rec.601) values to RGB values and store them. + Y -= Add2Y_NV12; + U -= Add2U_NV12; + V -= Add2V_NV12; + if (Y < 0.0) Y = 0.0; + + T R = cuda::SaturateCast(Y2R_NV12 * Y + U2R_NV12 * U + V2R_NV12 * V); + T G = cuda::SaturateCast(Y2G_NV12 * Y + U2G_NV12 * U + V2G_NV12 * V); + T B = cuda::SaturateCast(Y2B_NV12 * Y + U2B_NV12 * U + V2B_NV12 * V); + + if (bgr) std::swap(R, B); + *rgb++ = R; + *rgb++ = G; + *rgb++ = B; + if (rgba) *rgb++ = Alpha; + + if (w & 1) uv += 2; + // clang-format on + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertNV12toRGB(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure output data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + + // YUV NV12 needs 3 elements for each two RGB pixels. 
+ assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertNV12toRGB(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_NV12toRGB(T) \ + template void convertNV12toRGB(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_NV12toRGB(uint8_t); +MAKE_NV12toRGB(uint16_t); +MAKE_NV12toRGB(int32_t); +MAKE_NV12toRGB(float); +MAKE_NV12toRGB(double); + +#undef MAKE_NV12toRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertYUVtoRGB_422(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure width is a multiple of 2. + assert(wdth % 2 == 0); + + constexpr uint idx0 = (LumaFirst ? 0 : 1); // First luma value index. + constexpr uint idx1 = idx0 + 2; // Second luma value index. + constexpr uint idxU = (LumaFirst ? 1 : 0); // U chroma value index. + constexpr uint idxV = idxU + 2; // V chroma value index. + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrSrc = imgPixels * 2; + const size_t incrDst = imgPixels * (3 + rgba); + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *rgb = dst; + + const T *img = src; + + for (uint h = 0; h < hght; h++) + { + for (uint w = 0; w < wdth; w += 2, img += 4) + { + T R, G, B; + + // clang-format off + double U = img[idxU], + V = img[idxV], + Y0 = img[idx0], + Y1 = img[idx1]; + + if (yvu) std::swap(U, V); + + // Convert all YUV (ITU Rec.601) values to RGB values and store them. 
+ Y0 -= Add2Y_NV12; + Y1 -= Add2Y_NV12; + U -= Add2U_NV12; + V -= Add2V_NV12; + + if (Y0 < 0.0) Y0 = 0.0; + if (Y1 < 0.0) Y1 = 0.0; + // clang-format on + + double Y_0 = Y2R_NV12 * Y0; // NOTE: Y2R_NV12 == Y2G_NV12 == Y2B_NV12. + double Y_1 = Y2R_NV12 * Y1; + double UV_r = U2R_NV12 * U + V2R_NV12 * V; + double UV_g = U2G_NV12 * U + V2G_NV12 * V; + double UV_b = U2B_NV12 * U + V2B_NV12 * V; + + R = cuda::SaturateCast(Y_0 + UV_r); + G = cuda::SaturateCast(Y_0 + UV_g); + B = cuda::SaturateCast(Y_0 + UV_b); + + // clang-format off + if (bgr) std::swap(R, B); + *rgb++ = R; *rgb++ = G; *rgb++ = B; + if (rgba) *rgb++ = Alpha; + // clang-format on + + R = cuda::SaturateCast(Y_1 + UV_r); + G = cuda::SaturateCast(Y_1 + UV_g); + B = cuda::SaturateCast(Y_1 + UV_b); + + // clang-format off + if (bgr) std::swap(R, B); + *rgb++ = R; *rgb++ = G; *rgb++ = B; + if (rgba) *rgb++ = Alpha; + // clang-format on + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoRGB_422(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure output data has sets of 3 or 4 (RGB/BGA w/ or w/o alpha) values for the given width, height, & batch size. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 2); // 4 values for each two RGB pixels. 
+ + convertYUVtoRGB_422(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_422toRGB(T) \ + template void convertYUVtoRGB_422(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_422toRGB(uint8_t); +MAKE_422toRGB(uint16_t); +MAKE_422toRGB(int32_t); +MAKE_422toRGB(float); +MAKE_422toRGB(double); + +#undef MAKE_422toRGB +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_422toRGB(T) \ + template void convertYUVtoRGB_422(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_422toRGB(uint8_t); +MAKE_422toRGB(uint16_t); +MAKE_422toRGB(int32_t); +MAKE_422toRGB(float); +MAKE_422toRGB(double); + +#undef MAKE_422toRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertYUVtoGray_422(T *dst, const T *src, size_t numPixels) +{ + src += (1 - LumaFirst); // Increment to first Y value if luma not first. + + for (size_t i = 0; i < numPixels; i++, src += 2) *dst++ = *src; +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoGray_422(vector &dst, const vector &src, size_t numPixels) +{ + assert(dst.size() == numPixels); + assert(src.size() == numPixels * 2); // YUV 422 needs 4 values for each two RGB pixels. 
+ + convertYUVtoGray_422(dst.data(), src.data(), numPixels); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_422toGray(T) template void convertYUVtoGray_422(vector &, const vector &, size_t) + +MAKE_422toGray(uint8_t); +MAKE_422toGray(uint16_t); +MAKE_422toGray(int32_t); +MAKE_422toGray(float); +MAKE_422toGray(double); + +#undef MAKE_422toGray +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_422toGray(T) template void convertYUVtoGray_422(vector &, const vector &, size_t) + +MAKE_422toGray(uint8_t); +MAKE_422toGray(uint16_t); +MAKE_422toGray(int32_t); +MAKE_422toGray(float); +MAKE_422toGray(double); + +#undef MAKE_422toGray +//--------------------------------------------------------------------------------------------------------------------// diff --git a/tests/cvcuda/system/CvtColorUtils.hpp b/tests/cvcuda/system/CvtColorUtils.hpp new file mode 100644 index 00000000..197adf86 --- /dev/null +++ b/tests/cvcuda/system/CvtColorUtils.hpp @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_TEST_COMMON_CVT_COLOR_UTILS_HPP +#define NVCV_TEST_COMMON_CVT_COLOR_UTILS_HPP + +#include + +#include + +// clang-format off + + +template +void changeAlpha(std::vector &dst, const std::vector &src, size_t numPixels, bool srcRGBA, bool dstRGBA); + +template +void convertRGBtoBGR(std::vector &dst, const std::vector &src, size_t numPixels, bool srcRGBA, bool dstRGBA); + +template +void convertRGBtoGray(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertGrayToRGB(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba); + +template +void convertRGBtoHSV(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertHSVtoRGB(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertRGBtoYUV_PAL(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertYUVtoRGB_PAL(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertRGBtoYUV_420(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint numImgs, + bool rgba, bool bgr, bool yvu); + +template +void convertYUVtoRGB_420(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint numImgs, + bool rgba, bool bgr, bool yvu); + +template +void convertYUVtoGray_420(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint numImgs); + +template +void convertRGBtoNV12(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint num, + bool rgba, bool bgr, bool yvu); + +template +void convertNV12toRGB(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint num, + bool rgba, bool bgr, bool yvu); + +template +void convertYUVtoRGB_422(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint numImgs, + bool rgba, bool bgr, bool yvu); + +template +void convertYUVtoGray_422(std::vector &dst, 
const std::vector &src, size_t numPixels); + +#endif // NVCV_TEST_COMMON_CVT_COLOR_UTILS_HPP diff --git a/tests/cvcuda/system/TestOpCvtColor.cpp b/tests/cvcuda/system/TestOpCvtColor.cpp index 31e5c682..67055b89 100644 --- a/tests/cvcuda/system/TestOpCvtColor.cpp +++ b/tests/cvcuda/system/TestOpCvtColor.cpp @@ -15,50 +15,23 @@ * limitations under the License. */ -#include "ConvUtils.hpp" +#include "CvtColorUtils.hpp" #include "Definitions.hpp" +#include "TestUtils.hpp" #include #include #include -#include #include #include #include #include -#include - namespace test = nvcv::test; +namespace util = nvcv::util; namespace cuda = nvcv::cuda; -#define VEC_EXPECT_NEAR(vec1, vec2, delta, dtype) \ - ASSERT_EQ(vec1.size(), vec2.size()); \ - for (std::size_t idx = 0; idx < vec1.size() / sizeof(dtype); ++idx) \ - { \ - EXPECT_NEAR(reinterpret_cast(vec1.data())[idx], reinterpret_cast(vec2.data())[idx], delta) \ - << "At index " << idx; \ - } - -template -void myGenerate(T *src, std::size_t size, std::default_random_engine &randEng) -{ - std::uniform_int_distribution rand(0u, 255u); - for (std::size_t idx = 0; idx < size; ++idx) - { - src[idx] = rand(randEng); - } -} - -template<> -void myGenerate(float *src, std::size_t size, std::default_random_engine &randEng) -{ - std::uniform_real_distribution rand(0.f, 1.f); - for (std::size_t idx = 0; idx < size; ++idx) - { - src[idx] = rand(randEng); - } -} +using std::vector; #define NVCV_IMAGE_FORMAT_RGBS8 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, XYZ1, ASSOCIATED, X8_Y8_Z8) #define NVCV_IMAGE_FORMAT_BGRS8 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, ZYX1, ASSOCIATED, X8_Y8_Z8) @@ -101,71 +74,739 @@ void myGenerate(float *src, std::size_t size, std::default_random_engine &randEn // clang-format off +//--------------------------------------------------------------------------------------------------------------------// +template +static void verifyOutput(nvcv::Tensor srcTensor, nvcv::ImageFormat srcFrmt, + 
nvcv::Tensor dstTensor, nvcv::ImageFormat dstFrmt, + NVCVColorConversionCode code, int wdth, int hght, int imgs, double maxDiff) +{ + auto srcData = srcTensor.exportData(); + auto dstData = dstTensor.exportData(); + ASSERT_TRUE(srcData); + ASSERT_TRUE(dstData); + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*srcData); + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); + ASSERT_TRUE(srcAccess); + ASSERT_TRUE(dstAccess); + + int srcChannels = srcAccess->numChannels(); + int dstChannels = dstAccess->numChannels(); + + ASSERT_LE(srcChannels, 4); + ASSERT_LE(dstChannels, 4); + + int srcWdth = wdth, + srcHght = hght; + int dstWdth = wdth, + dstHght = hght; + + if (srcFrmt == NVCV_IMAGE_FORMAT_UYVY || srcFrmt == NVCV_IMAGE_FORMAT_UYVY_ER || + srcFrmt == NVCV_IMAGE_FORMAT_YUYV || srcFrmt == NVCV_IMAGE_FORMAT_YUYV_ER) + srcWdth = srcWdth << 1; + if (srcFrmt == NVCV_IMAGE_FORMAT_NV12 || srcFrmt == NVCV_IMAGE_FORMAT_NV12_ER || + srcFrmt == NVCV_IMAGE_FORMAT_NV21 || srcFrmt == NVCV_IMAGE_FORMAT_NV21_ER) + srcHght = (srcHght * 3) >> 1; + ASSERT_EQ(srcWdth, srcAccess->numCols()); + ASSERT_EQ(srcHght, srcAccess->numRows()); + + if (dstFrmt == NVCV_IMAGE_FORMAT_UYVY || dstFrmt == NVCV_IMAGE_FORMAT_UYVY_ER || + dstFrmt == NVCV_IMAGE_FORMAT_YUYV || dstFrmt == NVCV_IMAGE_FORMAT_YUYV_ER) + dstWdth = dstWdth << 1; + if (dstFrmt == NVCV_IMAGE_FORMAT_NV12 || dstFrmt == NVCV_IMAGE_FORMAT_NV12_ER || + dstFrmt == NVCV_IMAGE_FORMAT_NV21 || dstFrmt == NVCV_IMAGE_FORMAT_NV21_ER) + dstHght = (dstHght * 3) >> 1; + ASSERT_EQ(dstWdth, dstAccess->numCols()); + ASSERT_EQ(dstHght, dstAccess->numRows()); + + int srcRowElems = srcChannels * srcWdth; + int dstRowElems = dstChannels * dstWdth; + + size_t numPixels = (size_t)imgs * (size_t)wdth * (size_t)hght; + size_t srcElems = (size_t)imgs * (size_t)srcWdth * (size_t)srcHght * (size_t)srcChannels; + size_t dstElems = (size_t)imgs * (size_t)dstWdth * (size_t)dstHght * (size_t)dstChannels; + + size_t 
srcPitchCPU = srcRowElems * sizeof(T); + size_t dstPitchCPU = dstRowElems * sizeof(T); + + nvcv::Swizzle srcSwizzle = srcFrmt.swizzle(); + nvcv::Swizzle dstSwizzle = dstFrmt.swizzle(); + + vector srcVec(srcElems); + vector refVec(dstElems); + + bool srcBGR = (srcSwizzle == nvcv::Swizzle::S_ZYXW || + srcSwizzle == nvcv::Swizzle::S_ZYX1 || + srcSwizzle == nvcv::Swizzle::S_ZYX0); + bool dstBGR = (dstSwizzle == nvcv::Swizzle::S_ZYXW || + dstSwizzle == nvcv::Swizzle::S_ZYX1 || + dstSwizzle == nvcv::Swizzle::S_ZYX0); + bool srcRGBA = (srcChannels == 4), + dstRGBA = (dstChannels == 4); + bool success = true; + + RandEng randEng(0); + + constexpr size_t minCntAllRGB = 128 * 256 * 256; // Minimum # of pixels to call generateAllRGB. + constexpr size_t minCntAllHSV = 90 * 256 * 256; // Minimum # of pixels to call generateAllHSV. + constexpr double minMultHSV = -0.5; // Set hue range multiplier to be outside normal range + constexpr double maxMultHSV = 1.5; // to test robustness to wrapped hue values. + + // Populate source tensor. + if (srcChannels > 2) + { + if (code == NVCV_COLOR_HSV2BGR || code == NVCV_COLOR_HSV2BGR_FULL || + code == NVCV_COLOR_HSV2RGB || code == NVCV_COLOR_HSV2RGB_FULL) + { + bool full = (code == NVCV_COLOR_HSV2BGR_FULL || code == NVCV_COLOR_HSV2RGB_FULL); + + if (numPixels >= minCntAllHSV) + { + if (full) generateAllHSV(srcVec, srcWdth, srcHght, imgs); + else generateAllHSV(srcVec, srcWdth, srcHght, imgs); + } + else + { + if (full) generateRandHSV(srcVec, randEng, minMultHSV, maxMultHSV); + else generateRandHSV(srcVec, randEng, minMultHSV, maxMultHSV); + } + } + else + { + if (numPixels >= minCntAllRGB) + generateAllRGB(srcVec, srcWdth, srcHght, imgs, srcRGBA, srcBGR); + else + generateRandTestRGB(srcVec, randEng, srcRGBA, srcBGR); + } + } + else + generateRandVec(srcVec, randEng); + + // Copy source from image vector to device tensor. 
+ ASSERT_EQ(cudaSuccess, cudaMemcpy2D(srcData->basePtr(), srcAccess->rowStride(), srcVec.data(), srcPitchCPU, + srcPitchCPU, (size_t)imgs * (size_t)srcHght, cudaMemcpyHostToDevice)); + + switch (code) + { + // Add/remove alpha channel to RGB/BGR image. + case NVCV_COLOR_BGR2BGRA : // NVCV_COLOR_BGR2BGRA = 0 (NVCV_COLOR_RGB2RGBA) + case NVCV_COLOR_BGRA2BGR : // NVCV_COLOR_BGRA2BGR = 1 (NVCV_COLOR_RGBA2RGB) + changeAlpha(refVec, srcVec, numPixels, srcRGBA, dstRGBA); + break; + + // Convert between RGB and BGR (with or without alpha channel). + case NVCV_COLOR_BGR2RGBA : // NVCV_COLOR_BGR2RGBA = 2 (NVCV_COLOR_RGB2BGRA) + case NVCV_COLOR_RGBA2BGR : // NVCV_COLOR_RGBA2BGR = 3 (NVCV_COLOR_BGRA2RGB) + case NVCV_COLOR_BGR2RGB : // NVCV_COLOR_BGR2RGB = 4 (NVCV_COLOR_BGR2RGB) + case NVCV_COLOR_BGRA2RGBA : // NVCV_COLOR_BGRA2RGBA = 5 (NVCV_COLOR_RGBA2BGRA) + convertRGBtoBGR(refVec, srcVec, numPixels, srcRGBA, dstRGBA); + break; + + // Convert from RGB/BGR to grayscale. + case NVCV_COLOR_BGR2GRAY : // NVCV_COLOR_BGR2GRAY = 6 + case NVCV_COLOR_RGB2GRAY : // NVCV_COLOR_RGB2GRAY = 7 + case NVCV_COLOR_BGRA2GRAY : // NVCV_COLOR_BGRA2GRAY = 10 + case NVCV_COLOR_RGBA2GRAY : // NVCV_COLOR_RGBA2GRAY = 11 + convertRGBtoGray(refVec, srcVec, numPixels, srcRGBA, srcBGR); + break; + + // Convert from grayscale to RGB/BGR. + case NVCV_COLOR_GRAY2BGR : // NVCV_COLOR_GRAY2BGR = 8 (NVCV_COLOR_GRAY2RGB) + case NVCV_COLOR_GRAY2BGRA : // NVCV_COLOR_GRAY2BGRA = 9 (NVCV_COLOR_GRAY2RGBA) + convertGrayToRGB(refVec, srcVec, numPixels, dstRGBA); + break; + + // Convert between RGB/BGR and BGR565 (16-bit images) --> Conversion codes 12-19 not implemented. + // Convert between grayscale and BGR565 (16-bit images) --> Conversion codes 20-21 not implemented. + // Convert between RGB/BGR and BGR555 (16-bit images) --> Conversion codes 22-29 not implemented. + // Convert between grayscale and BGR555 (16-bit images) --> Conversion codes 30-31 not implemented. 
+ // Convert between RGB/BGR and CIE XYZ --> Conversion codes 32-35 not implemented. + // Convert between RGB/BGR and YCrCb (aka YCC) --> Conversion codes 36-39 not implemented. + + // Convert from RGB/BGR to HSV (hue, saturation, value). + case NVCV_COLOR_BGR2HSV : // NVCV_COLOR_BGR2HSV = 40 + case NVCV_COLOR_RGB2HSV : // NVCV_COLOR_RGB2HSV = 41 + convertRGBtoHSV(refVec, srcVec, numPixels, srcRGBA, srcBGR); + break; + + // Conversion codes 42 and 43 not specified. + // Convert from RGB/BGR to CIE Lab --> Conversion codes 44-45 not implemented. + // Bayer demosaicing to RGB/BGR --> Conversion codes 46-49 not implemented. + // Convert from RGB/BGR to CIE Luv --> Conversion codes 50-51 not implemented. + // Convert from RGB/BGR to HLS (hue, lightness, saturation) --> Conversion codes 52-53 not implemented. + + // Convert from HSV (hue, saturation, value) to RGB/BGR. + case NVCV_COLOR_HSV2BGR : // NVCV_COLOR_HSV2BGR = 54 + case NVCV_COLOR_HSV2RGB : // NVCV_COLOR_HSV2RGB = 55 + convertHSVtoRGB(refVec, srcVec, numPixels, dstRGBA, dstBGR); + break; + + // Convert to RGB/BGR from CIE Lab --> Conversion codes 56-57 not implemented. + // Convert to RGB/BGR from CIE Luv --> Conversion codes 58-59 not implemented. + // Convert to RGB/BGR from HLS (hue, lightness, saturation) --> Conversion codes 60-61 not implemented. + // VNG (Variable Number of Gradients) demosaicing to RGB/BGR --> Conversion codes 62-65 not implemented. + + // Convert from RGB/BGR to full-range HSV (hue, saturation, value). + case NVCV_COLOR_BGR2HSV_FULL : // NVCV_COLOR_BGR2HSV_FULL = 66 + case NVCV_COLOR_RGB2HSV_FULL : // NVCV_COLOR_RGB2HSV_FULL = 67 + convertRGBtoHSV(refVec, srcVec, numPixels, srcRGBA, srcBGR); + break; + + // Convert from RGB/BGR to full-range HLS (hue, lightness, saturation) --> Conversion codes 68-69 not implemented. + + // Convert from full-range HSV (hue, saturation, value) to RGB/BGR. 
+ case NVCV_COLOR_HSV2BGR_FULL : // NVCV_COLOR_HSV2BGR_FULL = 70 + case NVCV_COLOR_HSV2RGB_FULL : // NVCV_COLOR_HSV2RGB_FULL = 71 + convertHSVtoRGB(refVec, srcVec, numPixels, dstRGBA, dstBGR); + break; + + // Convert from full-range HLS (hue, lightness, saturation) to RGB/BGR --> Conversion codes 72-73 not implemented. + // Convert from LRGB/LBGR (luminance, red, green, blue) to CIE Lab --> Conversion codes 74-75 not implemented. + // Convert from LRGB/LBGR (luminance, red, green, blue) to CIE Luv --> Conversion codes 76-77 not implemented. + // Convert to LRGB/LBGR (luminance, red, green, blue) from CIE Lab --> Conversion codes 78-79 not implemented. + // Convert to LRGB/LBGR (luminance, red, green, blue) from CIE Luv --> Conversion codes 80-81 not implemented. + + // Convert from RGB/BGR to YUV. + case NVCV_COLOR_BGR2YUV : // NVCV_COLOR_BGR2YUV = 82 + case NVCV_COLOR_RGB2YUV : // NVCV_COLOR_RGB2YUV = 83 + convertRGBtoYUV_PAL(refVec, srcVec, numPixels, srcRGBA, srcBGR); + break; + + // Convert from YUV to RGB/BGR. + case NVCV_COLOR_YUV2BGR : // NVCV_COLOR_YUV2BGR = 84 + case NVCV_COLOR_YUV2RGB : // NVCV_COLOR_YUV2RGB = 85 + convertYUVtoRGB_PAL(refVec, srcVec, numPixels, dstRGBA, dstBGR); + break; + + // Bayer demosaicing to grayscale --> Conversion codes 86-89 not implemented. + + // Convert from YUV 4:2:0 family to RGB/BGR. 
+ case NVCV_COLOR_YUV2RGB_NV12 : // NVCV_COLOR_YUV2RGB_NV12 = 90 + case NVCV_COLOR_YUV2BGR_NV12 : // NVCV_COLOR_YUV2BGR_NV12 = 91 + case NVCV_COLOR_YUV2RGBA_NV12: // NVCV_COLOR_YUV2RGBA_NV12 = 94 + case NVCV_COLOR_YUV2BGRA_NV12: // NVCV_COLOR_YUV2BGRA_NV12 = 95 + convertNV12toRGB(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, false); + break; + + case NVCV_COLOR_YUV2RGB_NV21 : // NVCV_COLOR_YUV2RGB_NV21 = 92 (NVCV_COLOR_YUV420sp2RGB) + case NVCV_COLOR_YUV2BGR_NV21 : // NVCV_COLOR_YUV2BGR_NV21 = 93 (NVCV_COLOR_YUV420sp2BGR) + case NVCV_COLOR_YUV2RGBA_NV21: // NVCV_COLOR_YUV2RGBA_NV21 = 96 (NVCV_COLOR_YUV420sp2RGBA) + case NVCV_COLOR_YUV2BGRA_NV21: // NVCV_COLOR_YUV2BGRA_NV21 = 97 (NVCV_COLOR_YUV420sp2BGRA) + convertNV12toRGB(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, true); + break; + + case NVCV_COLOR_YUV2RGB_YV12 : // NVCV_COLOR_YUV2RGB_YV12 = 98 (NVCV_COLOR_YUV420p2RGB) + case NVCV_COLOR_YUV2BGR_YV12 : // NVCV_COLOR_YUV2BGR_YV12 = 99 (NVCV_COLOR_YUV420p2BGR) + case NVCV_COLOR_YUV2RGBA_YV12: // NVCV_COLOR_YUV2RGBA_YV12 = 102 (NVCV_COLOR_YUV420p2RGBA) + case NVCV_COLOR_YUV2BGRA_YV12: // NVCV_COLOR_YUV2BGRA_YV12 = 103 (NVCV_COLOR_YUV420p2BGRA) + convertYUVtoRGB_420(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, true); + break; + + case NVCV_COLOR_YUV2RGB_IYUV : // NVCV_COLOR_YUV2RGB_IYUV = 100 (NVCV_COLOR_YUV2RGB_I420) + case NVCV_COLOR_YUV2BGR_IYUV : // NVCV_COLOR_YUV2BGR_IYUV = 101 (NVCV_COLOR_YUV2BGR_I420) + case NVCV_COLOR_YUV2RGBA_IYUV: // NVCV_COLOR_YUV2RGBA_IYUV = 104 (NVCV_COLOR_YUV2RGBA_I420) + case NVCV_COLOR_YUV2BGRA_IYUV: // NVCV_COLOR_YUV2BGRA_IYUV = 105 (NVCV_COLOR_YUV2BGRA_I420) + convertYUVtoRGB_420(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, false); + break; + + // Convert from YUV 4:2:0 family to grayscale. 
+ case NVCV_COLOR_YUV2GRAY_420 : // NVCV_COLOR_YUV2GRAY_420 = 106 (NVCV_COLOR_YUV2GRAY_NV21, NVCV_COLOR_YUV2GRAY_NV12, + // NVCV_COLOR_YUV2GRAY_YV12, NVCV_COLOR_YUV2GRAY_IYUV, + // NVCV_COLOR_YUV2GRAY_I420, NVCV_COLOR_YUV420sp2GRAY, + // NVCV_COLOR_YUV420p2GRAY) + convertYUVtoGray_420(refVec, srcVec, wdth, hght, imgs); + break; + + // Convert from YUV 4:2:2 family to RGB/BGR. + case NVCV_COLOR_YUV2RGB_UYVY : // NVCV_COLOR_YUV2RGB_UYVY = 107 ( NVCV_COLOR_YUV2RGB_Y422, NVCV_COLOR_YUV2RGB_UYNV) + case NVCV_COLOR_YUV2BGR_UYVY : // NVCV_COLOR_YUV2BGR_UYVY = 108 ( NVCV_COLOR_YUV2RGB_Y422, NVCV_COLOR_YUV2RGB_UYNV) + // Conversion codes 109 (NVCV_COLOR_YUV2RGB_VYUY) and 110 (NVCV_COLOR_YUV2BGR_VYUY) not available. + case NVCV_COLOR_YUV2RGBA_UYVY: // NVCV_COLOR_YUV2RGBA_UYVY = 111 ( NVCV_COLOR_YUV2RGBA_Y422, NVCV_COLOR_YUV2RGBA_UYNV) + case NVCV_COLOR_YUV2BGRA_UYVY: // NVCV_COLOR_YUV2BGRA_UYVY = 112 ( NVCV_COLOR_YUV2BGRA_Y422, NVCV_COLOR_YUV2BGRA_UYNV) + convertYUVtoRGB_422(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, false); + break; + + // Conversion codes 113 (NVCV_COLOR_YUV2RGBA_VYUY) and 114 (NVCV_COLOR_YUV2BGRA_VYUY) not available. 
+ case NVCV_COLOR_YUV2RGB_YUY2 : // NVCV_COLOR_YUV2RGB_YUY2 = 115 (NVCV_COLOR_YUV2RGB_YUYV, NVCV_COLOR_YUV2RGB_YUNV) + case NVCV_COLOR_YUV2BGR_YUY2 : // NVCV_COLOR_YUV2BGR_YUY2 = 116 (NVCV_COLOR_YUV2BGR_YUYV, NVCV_COLOR_YUV2BGR_YUNV) + case NVCV_COLOR_YUV2RGBA_YUY2: // NVCV_COLOR_YUV2RGBA_YUY2 = 119 (NVCV_COLOR_YUV2RGBA_YUYV, NVCV_COLOR_YUV2RGBA_YUNV) + case NVCV_COLOR_YUV2BGRA_YUY2: // NVCV_COLOR_YUV2BGRA_YUY2 = 120 (NVCV_COLOR_YUV2BGRA_YUYV, NVCV_COLOR_YUV2BGRA_YUNV) + convertYUVtoRGB_422(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, false); + break; + + case NVCV_COLOR_YUV2RGB_YVYU : // NVCV_COLOR_YUV2RGB_YVYU = 117 + case NVCV_COLOR_YUV2BGR_YVYU : // NVCV_COLOR_YUV2BGR_YVYU = 118 + case NVCV_COLOR_YUV2RGBA_YVYU: // NVCV_COLOR_YUV2RGBA_YVYU = 121 + case NVCV_COLOR_YUV2BGRA_YVYU: // NVCV_COLOR_YUV2BGRA_YVYU = 122 + convertYUVtoRGB_422(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, true); + break; + + // Convert from YUV 4:2:2 family to grayscale. + case NVCV_COLOR_YUV2GRAY_UYVY: // NVCV_COLOR_YUV2GRAY_UYVY = 123 (NVCV_COLOR_YUV2GRAY_Y422, NVCV_COLOR_YUV2GRAY_UYNV) + convertYUVtoGray_422(refVec, srcVec, numPixels); + break; + + case NVCV_COLOR_YUV2GRAY_YUY2: // NVCV_COLOR_YUV2GRAY_YUY2 = 124 (NVCV_COLOR_YUV2GRAY_YVYU, NVCV_COLOR_YUV2GRAY_YUYV, + // NVCV_COLOR_YUV2GRAY_YUNV) + convertYUVtoGray_422(refVec, srcVec, numPixels); + break; + + // RGB/BGA alpha premultiplication --> Conversion codes 125-126 not implemented. + + // Convert from RGB/BGR to YUV 4:2:0 family. 
+ case NVCV_COLOR_RGB2YUV_I420 : // NVCV_COLOR_RGB2YUV_I420 = 127 (NVCV_COLOR_RGB2YUV_IYUV) + case NVCV_COLOR_BGR2YUV_I420 : // NVCV_COLOR_BGR2YUV_I420 = 128 (NVCV_COLOR_BGR2YUV_IYUV) + case NVCV_COLOR_RGBA2YUV_I420: // NVCV_COLOR_RGBA2YUV_I420 = 129 (NVCV_COLOR_RGBA2YUV_IYUV) + case NVCV_COLOR_BGRA2YUV_I420: // NVCV_COLOR_BGRA2YUV_I420 = 130 (NVCV_COLOR_BGRA2YUV_IYUV) + convertRGBtoYUV_420(refVec, srcVec, wdth, hght, imgs, srcRGBA, srcBGR, false); + break; + + case NVCV_COLOR_RGB2YUV_YV12 : // NVCV_COLOR_RGB2YUV_YV12 = 131 + case NVCV_COLOR_BGR2YUV_YV12 : // NVCV_COLOR_BGR2YUV_YV12 = 132 + case NVCV_COLOR_RGBA2YUV_YV12: // NVCV_COLOR_RGBA2YUV_YV12 = 133 + case NVCV_COLOR_BGRA2YUV_YV12: // NVCV_COLOR_BGRA2YUV_YV12 = 134 + convertRGBtoYUV_420(refVec, srcVec, wdth, hght, imgs, srcRGBA, srcBGR, true); + break; + + // Edge-aware demosaicing to RGB/BGR --> Conversion codes 135-138 not implemented. + // OpenCV COLORCVT_MAX --> Conversion code 139 not implemented. + + // Convert RGB/BGR to YUV 4:2:0 family (two plane YUV; not in OpenCV). + case NVCV_COLOR_RGB2YUV_NV12 : // NVCV_COLOR_RGB2YUV_NV12 = 140 + case NVCV_COLOR_BGR2YUV_NV12 : // NVCV_COLOR_BGR2YUV_NV12 = 141 + case NVCV_COLOR_RGBA2YUV_NV12: // NVCV_COLOR_RGBA2YUV_NV12 = 144 + case NVCV_COLOR_BGRA2YUV_NV12: // NVCV_COLOR_BGRA2YUV_NV12 = 145 + convertRGBtoNV12(refVec, srcVec, wdth, hght, imgs, srcRGBA, srcBGR, false); + break; + + case NVCV_COLOR_RGB2YUV_NV21 : // NVCV_COLOR_RGB2YUV_NV21 = 142 (NVCV_COLOR_RGB2YUV420sp) + case NVCV_COLOR_BGR2YUV_NV21 : // NVCV_COLOR_BGR2YUV_NV21 = 143 (NVCV_COLOR_BGR2YUV420sp) + case NVCV_COLOR_RGBA2YUV_NV21: // NVCV_COLOR_RGBA2YUV_NV21 = 146 (NVCV_COLOR_RGBA2YUV420sp) + case NVCV_COLOR_BGRA2YUV_NV21: // NVCV_COLOR_BGRA2YUV_NV21 = 147 (NVCV_COLOR_BGRA2YUV420sp) + convertRGBtoNV12(refVec, srcVec, wdth, hght, imgs, srcRGBA, srcBGR, true); + break; + + default: + std::cerr << "**** ERROR: Color conversion not implemented for conversion code " << code << ". 
****\n\n"; + success = false; + } + + if (success) + { + // Run color conversion operator. + cudaStream_t stream; + + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::CvtColor convertColor; + + EXPECT_NO_THROW(convertColor(stream, srcTensor, dstTensor, code)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // Copy destination tensor back to host. + vector dstVec(dstElems); + + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(dstVec.data(), dstPitchCPU, dstData->basePtr(), dstAccess->rowStride(), + dstPitchCPU, (size_t)imgs * (size_t)dstHght, cudaMemcpyDeviceToHost)); + + constexpr uint maxErrCnt = 16; + + // Compare "gold" reference to computed output. + if (dstFrmt == NVCV_IMAGE_FORMAT_HSV8 || dstFrmt == NVCV_IMAGE_FORMAT_HSVf32) + { + const bool full = (code == NVCV_COLOR_BGR2HSV_FULL || code == NVCV_COLOR_RGB2HSV_FULL); + const double range = (sizeof(T) > 1) ? 360.0 : (full ? 256.0 : 180.0); + + EXPECT_NEAR_HSV_VEC_CNT(refVec, dstVec, range, maxDiff, maxErrCnt, success); + } + else + EXPECT_NEAR_VEC_CNT(refVec, dstVec, maxDiff, maxErrCnt, success); + } + else + { + GTEST_SKIP() << "Waived: this test hasn't been implemented."; + } +} + +//--------------------------------------------------------------------------------------------------------------------// + +#define ERR2_3 (2.0 / 1024.0) // 0.0009765625 --> approximates 2e-3 but can be exactly represented in floating point. +#define ERR1_3 (1.0 / 1024.0) // 0.0009765625 --> approximates 1e-3 but can be exactly represented in floating point. +#define ERR1_4 (1.0 / 8192.0) // 0.0001220703125 --> approximates 1e-4 but can be exactly represented in floating point. 
+ NVCV_TEST_SUITE_P(OpCvtColor, +test::ValueList +{ + // W, H, N, Input Format, Output Format, Convert Code, maxDiff + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_IMAGE_FORMAT_BGRS8, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_IMAGE_FORMAT_RGBS8, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_IMAGE_FORMAT_BGRS8, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_IMAGE_FORMAT_RGBS8, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_BGRA2RGBA, 0.0}, + + 
{ 177, 113, 1, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRA16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRA16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRA16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_IMAGE_FORMAT_BGRS16, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_RGBS16, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_BGRS16, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_IMAGE_FORMAT_RGBS16, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_BGRAS32, 
NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_IMAGE_FORMAT_BGRS32, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_IMAGE_FORMAT_RGBS32, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_IMAGE_FORMAT_BGRS32, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_IMAGE_FORMAT_RGBS32, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_BGRA2RGBA, 0.0}, + + // Conversions that add alpha to output tensor are not allowed for f16 type. + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_IMAGE_FORMAT_BGRf16, NVCV_COLOR_BGRA2BGR, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_RGBf16, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_BGRf16, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_IMAGE_FORMAT_RGBf16, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 129, 61, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA, 0.0}, + { 129, 61, 4, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_BGRA2BGR, 0.0}, + { 63, 31, 3, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_RGB2RGBA, 0.0}, + { 63, 31, 3, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_RGBA2RGB, 0.0}, + { 42, 111, 2, NVCV_IMAGE_FORMAT_BGRf32, 
NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_BGR2RGBA, 0.0}, + { 42, 111, 2, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_RGBA2BGR, 0.0}, + { 21, 72, 2, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGB2BGRA, 0.0}, + { 21, 72, 2, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_BGRA2RGB, 0.0}, + { 23, 31, 3, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 23, 31, 3, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_IMAGE_FORMAT_BGRf64, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_RGBf64, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_BGRf64, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_IMAGE_FORMAT_RGBf64, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 23, 21, 63, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR, 0.0}, + { 21, 22, 63, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_COLOR_BGR2GRAY, 1.0}, + { 401, 202, 5, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_GRAY2RGB, 0.0}, + { 201, 402, 5, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_COLOR_RGB2GRAY, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_COLOR_RGB2GRAY, 1.0}, + + { 32, 21, 4, 
NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_GRAY2BGR, 0.0}, + { 32, 21, 4, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_Y16, NVCV_COLOR_BGR2GRAY, 2.0}, + { 54, 66, 5, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_GRAY2RGB, 0.0}, + { 54, 66, 5, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_Y16, NVCV_COLOR_RGB2GRAY, 2.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_Y16, NVCV_COLOR_RGB2GRAY, 2.0}, + + { 64, 21, 3, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR, ERR1_4}, + { 64, 21, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_Yf32, NVCV_COLOR_BGR2GRAY, ERR1_4}, + { 121, 66, 5, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_GRAY2RGB, ERR1_4}, + { 121, 66, 5, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_Yf32, NVCV_COLOR_RGB2GRAY, ERR1_4}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_Yf32, NVCV_COLOR_RGB2GRAY, ERR1_4}, + + // Codes 9 to 39 are not implemented + { 55, 257, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV, 1.0}, + { 55, 257, 4, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR, 1.0}, + { 366, 14, 5, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV, 1.0}, + { 366, 14, 5, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_HSV2RGB, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV, 1.0}, + {2880,4096, 1, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_HSV2RGB, 1.0}, + + // Hue computation differs slightly because CUDA kernel adds FLT_EPSILON to denominator for 'diff' division. 
+ { 55, 257, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_BGR2HSV, ERR2_3}, + { 33, 525, 3, NVCV_IMAGE_FORMAT_HSVf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_HSV2BGR, ERR1_4}, + { 365, 14, 5, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_RGB2HSV, ERR2_3}, + { 367, 223, 2, NVCV_IMAGE_FORMAT_HSVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_HSV2RGB, ERR1_4}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_RGB2HSV, ERR2_3}, + {5760,4096, 1, NVCV_IMAGE_FORMAT_HSVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_RGB2HSV, ERR2_3}, + + // // Codes 42 to 53 and 56 to 65 and 68 to 69 are not implemented + { 112, 157, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV_FULL, 1.0}, + { 112, 157, 4, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR_FULL, 1.0}, + { 333, 13, 3, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV_FULL, 1.0}, + { 333, 13, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_HSV2RGB_FULL, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV_FULL, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_RGB2HSV_FULL, 1.0}, + + // Codes 72 to 81 are not implemented + { 133, 22, 2, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR, 1.0}, + { 133, 22, 2, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV, 1.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB, 1.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_RGB2YUV, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_RGB2YUV, 1.0}, + + { 133, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_YUV2BGR, 1.0}, + { 133, 21, 3, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_YUV16, NVCV_COLOR_BGR2YUV, 2.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_YUV2RGB, 
1.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_YUV16, NVCV_COLOR_RGB2YUV, 2.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_YUV16, NVCV_COLOR_RGB2YUV, 2.0}, + + { 133, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2BGR, ERR1_4}, + { 133, 21, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_YUVf32, NVCV_COLOR_BGR2YUV, ERR1_4}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_YUV2RGB, ERR1_4}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_YUVf32, NVCV_COLOR_RGB2YUV, ERR1_4}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_YUVf32, NVCV_COLOR_RGB2YUV, ERR1_4}, + // Codes 86 to 89 are not implemented + // Codes 90 to 147 dealing with subsampled planes (NV12, etc. formats) are postponed (see comment below) + // Codes 109, 110, 113, 114 dealing with VYUY format are not implemented + // Codes 125, 126 dealing alpha premultiplication are not implemented + // Codes 135 to 139 dealing edge-aware demosaicing are not implemented + + { 120, 20, 2, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_I420, 2.0}, + { 120, 20, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGB2YUV_I420, 1.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_I420, 2.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_BGR2YUV_I420, 1.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_I420, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGBA2YUV_I420, 1.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_I420, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_BGRA2YUV_I420, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGB2YUV_I420, 1.0}, + + { 140, 80, 6, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_RGB8, 
NVCV_COLOR_YUV2RGB_YV12, 2.0}, + { 140, 80, 6, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGB2YUV_YV12, 1.0}, + { 160, 60, 5, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_YV12, 2.0}, + { 160, 60, 5, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_BGR2YUV_YV12, 1.0}, + { 60, 100, 4, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_YV12, 2.0}, + { 60, 100, 4, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGBA2YUV_YV12, 1.0}, + { 80, 80, 3, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_YV12, 2.0}, + { 80, 80, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_BGRA2YUV_YV12, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGB2YUV_YV12, 1.0}, + + // NV12, ... makes varShape raise an error: + // "NVCV_ERROR_NOT_IMPLEMENTED: Batch image format must not have subsampled planes, but it is: X" + { 120, 20, 2, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_NV12, 2.0}, + { 120, 20, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGB2YUV_NV12, 1.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_NV12, 2.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_BGR2YUV_NV12, 1.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_NV12, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGBA2YUV_NV12, 1.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_NV12, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_BGRA2YUV_NV12, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGB2YUV_NV12, 1.0}, + + { 140, 80, 6, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_NV21, 2.0}, + { 140, 80, 6, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGB2YUV_NV21, 1.0}, 
+ { 160, 60, 5, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_NV21, 2.0}, + { 160, 60, 5, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_BGR2YUV_NV21, 1.0}, + { 60, 100, 4, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_NV21, 2.0}, + { 60, 100, 4, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGBA2YUV_NV21, 1.0}, + { 80, 80, 3, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_NV21, 2.0}, + { 80, 80, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_BGRA2YUV_NV21, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGB2YUV_NV21, 1.0}, + + { 80, 120, 2, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_YUV2GRAY_420, 0.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_YUV2GRAY_420, 0.0}, + + { 120, 20, 2, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_UYVY, 2.0}, + { 120, 20, 2, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_UYVY, 2.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_UYVY, 2.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_UYVY, 2.0}, + + { 80, 120, 4, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_YUY2, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_YUY2, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_YVYU, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_YVYU, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_YUY2, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_YUY2, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_YVYU, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_BGRA8, 
NVCV_COLOR_YUV2BGRA_YVYU, 2.0}, + + { 80, 120, 2, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_YUV2GRAY_UYVY, 0.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_YUV2GRAY_YUY2, 0.0}, + + // Code 148 is not implemented +}); + +// clang-format on + +//--------------------------------------------------------------------------------------------------------------------// +TEST_P(OpCvtColor, correct_output) +{ + int wdth = GetParamValue<0>(); + int hght = GetParamValue<1>(); + int imgs = GetParamValue<2>(); + + nvcv::ImageFormat srcFrmt{GetParamValue<3>()}; + nvcv::ImageFormat dstFrmt{GetParamValue<4>()}; + + NVCVColorConversionCode code{GetParamValue<5>()}; + + double maxDiff{GetParamValue<6>()}; + + // Create input and output tensors. + nvcv::Tensor srcTensor = util::CreateTensor(imgs, wdth, hght, srcFrmt); + nvcv::Tensor dstTensor = util::CreateTensor(imgs, wdth, hght, dstFrmt); + + NVCVDataType dataType; + ASSERT_EQ(nvcvImageFormatGetPlaneDataType(srcFrmt, 0, &dataType), NVCV_SUCCESS); + + switch (dataType) + { + case NVCV_DATA_TYPE_U8: + case NVCV_DATA_TYPE_2U8: + case NVCV_DATA_TYPE_3U8: + case NVCV_DATA_TYPE_4U8: + case NVCV_DATA_TYPE_S8: + case NVCV_DATA_TYPE_2S8: + case NVCV_DATA_TYPE_3S8: + case NVCV_DATA_TYPE_4S8: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + + case NVCV_DATA_TYPE_U16: + case NVCV_DATA_TYPE_2U16: + case NVCV_DATA_TYPE_3U16: + case NVCV_DATA_TYPE_4U16: + case NVCV_DATA_TYPE_S16: + case NVCV_DATA_TYPE_2S16: + case NVCV_DATA_TYPE_3S16: + case NVCV_DATA_TYPE_4S16: + case NVCV_DATA_TYPE_F16: // Data type float16 is only allowed in conversions that treat it as 16-bit integer + case NVCV_DATA_TYPE_2F16: // (e.g., RGB2BGR or Gray2RGB). 
+ case NVCV_DATA_TYPE_3F16: + case NVCV_DATA_TYPE_4F16: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + + case NVCV_DATA_TYPE_S32: + case NVCV_DATA_TYPE_2S32: + case NVCV_DATA_TYPE_3S32: + case NVCV_DATA_TYPE_4S32: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + + case NVCV_DATA_TYPE_F32: + case NVCV_DATA_TYPE_2F32: + case NVCV_DATA_TYPE_3F32: + case NVCV_DATA_TYPE_4F32: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + + case NVCV_DATA_TYPE_F64: + case NVCV_DATA_TYPE_2F64: + case NVCV_DATA_TYPE_3F64: + case NVCV_DATA_TYPE_4F64: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + default: + FAIL() << "Unsupported tensor data type."; + break; + } +} + +//--------------------------------------------------------------------------------------------------------------------// + +#define VEC_EXPECT_NEAR(vec1, vec2, delta, dtype) \ + ASSERT_EQ(vec1.size(), vec2.size()); \ + for (std::size_t idx = 0; idx < vec1.size() / sizeof(dtype); ++idx) \ + { \ + EXPECT_NEAR(reinterpret_cast(vec1.data())[idx], reinterpret_cast(vec2.data())[idx], delta) \ + << "At index " << idx; \ + } + +// clang-format off + +NVCV_TEST_SUITE_P(OpCvtColor_circular, test::ValueList { - // W, H, N, inputFormat, outputFormat, in2outCode, out2inCode, maxDiff - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGBA2BGRA, 
NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBf16, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_COLOR_RGB2RGBA, 
NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf16, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 23, 21, 63, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 0.0}, - { 402, 202, 5, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 0.0}, - { 32, 21, 4, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 0.0}, - { 54, 66, 5, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 
0.0}, - { 64, 21, 3, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 1E-4}, - { 121, 66, 5, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 1E-4}, - { 129, 61, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 63, 31, 3, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 42, 111, 2, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 21, 72, 2, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 23, 31, 3, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + // W, H, N, Input Format, Output Format, Convert Code (-->), Convert Code (<--), maxDiff + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS8, 
NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf16, NVCV_IMAGE_FORMAT_RGBf16, NVCV_COLOR_BGR2RGB, NVCV_COLOR_RGB2BGR, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS32, 
NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 23, 21, 63, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 0.0}, + { 402, 202, 5, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 0.0}, + { 32, 21, 4, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 0.0}, + { 54, 66, 5, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 0.0}, + { 64, 21, 3, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 1E-4}, + { 121, 66, 5, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 1E-4}, + { 129, 61, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 63, 31, 3, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 42, 111, 2, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 21, 72, 2, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 23, 31, 3, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, // Codes 9 to 39 are not implemented - { 55, 257, 4, 
NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV, NVCV_COLOR_HSV2BGR, 5.0}, - { 366, 14, 5, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV, NVCV_COLOR_HSV2RGB, 5.0}, - { 55, 257, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_BGR2HSV, NVCV_COLOR_HSV2BGR, 1E-2}, - { 366, 14, 5, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_RGB2HSV, NVCV_COLOR_HSV2RGB, 1E-2}, + { 55, 257, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV, NVCV_COLOR_HSV2BGR, 5.0}, + { 366, 14, 5, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV, NVCV_COLOR_HSV2RGB, 5.0}, + { 55, 257, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_BGR2HSV, NVCV_COLOR_HSV2BGR, 1E-2}, + { 366, 14, 5, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_RGB2HSV, NVCV_COLOR_HSV2RGB, 1E-2}, // Codes 42 to 53 and 56 to 65 and 68 to 69 are not implemented - { 112, 157, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV_FULL, NVCV_COLOR_HSV2BGR_FULL, 8.0}, - { 333, 13, 3, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV_FULL, NVCV_COLOR_HSV2RGB_FULL, 8.0}, + { 112, 157, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV_FULL, NVCV_COLOR_HSV2BGR_FULL, 8.0}, + { 333, 13, 3, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV_FULL, NVCV_COLOR_HSV2RGB_FULL, 8.0}, // Codes 72 to 81 are not implemented - { 133, 22, 2, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR, NVCV_COLOR_BGR2YUV, 128.0}, - { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 128.0}, - { 133, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 32768.0}, - { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 32768.0}, - { 133, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_BGRf32, 
NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 1E-2}, - { 123, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 1E-2}, + { 133, 22, 2, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR, NVCV_COLOR_BGR2YUV, 128.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 128.0}, + { 133, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 32768.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 32768.0}, + { 133, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 1E-2}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 1E-2}, // Codes 86 to 89 are not implemented // Codes 90 to 147 dealing with subsampled planes (NV12, etc. formats) are postponed (see comment below) // Codes 109, 110, 113, 114 dealing with VYUY format are not implemented @@ -174,22 +815,24 @@ test::ValueList(); - int height = GetParamValue<1>(); - int batches = GetParamValue<2>(); - - nvcv::ImageFormat srcFormat{GetParamValue<3>()}; - nvcv::ImageFormat dstFormat{GetParamValue<4>()}; - - NVCVDataType nvcvDataType; - ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(srcFormat, 0, &nvcvDataType)); - - NVCVColorConversionCode src2dstCode{GetParamValue<5>()}; - NVCVColorConversionCode dst2srcCode{GetParamValue<6>()}; - - double maxDiff{GetParamValue<7>()}; - - nvcv::Tensor srcTensor = nvcv::util::CreateTensor(batches, width, height, srcFormat); - nvcv::Tensor dstTensor = nvcv::util::CreateTensor(batches, width, height, dstFormat); - - auto srcData = srcTensor.exportData(); - auto dstData = dstTensor.exportData(); - - ASSERT_NE(srcData, nullptr); - ASSERT_NE(dstData, nullptr); - - auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*srcData); - ASSERT_TRUE(srcAccess); - - 
auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); - ASSERT_TRUE(dstAccess); - - long srcSampleStride = srcAccess->sampleStride(); - - if (srcData->rank() == 3) - { - srcSampleStride = srcAccess->numRows() * srcAccess->rowStride(); - } - - long srcBufSize = srcSampleStride * srcAccess->numSamples(); - - std::vector srcVec(srcBufSize); - std::default_random_engine randEng(0); - switch (nvcvDataType) - { - case NVCV_DATA_TYPE_F32: - case NVCV_DATA_TYPE_2F32: - case NVCV_DATA_TYPE_3F32: - case NVCV_DATA_TYPE_4F32: - myGenerate(reinterpret_cast(srcVec.data()), srcVec.size() / sizeof(float), randEng); - break; - default: - myGenerate(reinterpret_cast(srcVec.data()), srcVec.size(), randEng); - break; - } - - // copy random input to device - ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec.data(), srcBufSize, cudaMemcpyHostToDevice)); - - cudaStream_t stream; - ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - - // run operator - cvcuda::CvtColor cvtColorOp; - - EXPECT_NO_THROW(cvtColorOp(stream, srcTensor, dstTensor, src2dstCode)); - - EXPECT_NO_THROW(cvtColorOp(stream, dstTensor, srcTensor, dst2srcCode)); - - ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - - std::vector testVec(srcBufSize); - - // copy output back to host - ASSERT_EQ(cudaSuccess, cudaMemcpy(testVec.data(), srcData->basePtr(), srcBufSize, cudaMemcpyDeviceToHost)); - - switch (nvcvDataType) - { - case NVCV_DATA_TYPE_F32: - case NVCV_DATA_TYPE_2F32: - case NVCV_DATA_TYPE_3F32: - case NVCV_DATA_TYPE_4F32: - VEC_EXPECT_NEAR(testVec, srcVec, maxDiff, float); - break; - default: - VEC_EXPECT_NEAR(testVec, srcVec, maxDiff, uint8_t); - break; - } -} - -TEST_P(OpCvtColor, varshape_correct_output) +TEST_P(OpCvtColor_circular, varshape_correct_output) { cudaStream_t stream; ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); @@ -325,12 +875,14 @@ TEST_P(OpCvtColor, varshape_correct_output) nvcv::ImageFormat 
srcFormat{GetParamValue<3>()}; nvcv::ImageFormat dstFormat{GetParamValue<4>()}; - // Waive the formats that have subsampled planes - if (srcFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444 - || dstFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444) + // clang-format off + // Waive the formats that have subsampled planes. + if (srcFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444 || + dstFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444) { GTEST_SKIP() << "Waived the formats that have subsampled planes for OpCvtColor varshape test"; } + // clang-format on NVCVDataType nvcvDataType; ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(srcFormat, 0, &nvcvDataType)); @@ -366,10 +918,10 @@ TEST_P(OpCvtColor, varshape_correct_output) case NVCV_DATA_TYPE_2F32: case NVCV_DATA_TYPE_3F32: case NVCV_DATA_TYPE_4F32: - myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size() / sizeof(float), rng); + generateRandVec(reinterpret_cast(srcVec[i].data()), srcVec[i].size() / sizeof(float), rng); break; default: - myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size(), rng); + generateRandVec(reinterpret_cast(srcVec[i].data()), srcVec[i].size(), rng); break; } @@ -447,28 +999,29 @@ TEST(OpCvtColor_negative, create_with_null_handle) NVCV_TEST_SUITE_P(OpCvtColor_negative, test::ValueList { - // W, H, N, inputFormat, outputFormat, in2outCode - { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2BGRA}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, 
NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_GRAY2BGR}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2GRAY}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV,}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2YUV}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR,}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2BGR}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGR}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2HSV}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_HSV2BGR}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_HSV2BGR}, // invalid output channel + // W, H, N, Input Format, Output Format, Conversion Code + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, 
NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2BGRA}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_GRAY2BGR}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_BGR2BGRA}, // f16 type not allowed to add alpha + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2GRAY}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2YUV}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2BGR}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGR}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2HSV}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_HSV2BGR}, 
// mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_HSV2BGR}, // invalid output channel }); // clang-format on @@ -484,8 +1037,8 @@ TEST_P(OpCvtColor_negative, invalid_input) NVCVColorConversionCode src2dstCode{GetParamValue<5>()}; - nvcv::Tensor srcTensor = nvcv::util::CreateTensor(batches, width, height, srcFormat); - nvcv::Tensor dstTensor = nvcv::util::CreateTensor(batches, width, height, dstFormat); + nvcv::Tensor srcTensor = util::CreateTensor(batches, width, height, srcFormat); + nvcv::Tensor dstTensor = util::CreateTensor(batches, width, height, dstFormat); // run operator cvcuda::CvtColor cvtColorOp; diff --git a/tests/cvcuda/system/TestOpMorphology.cpp b/tests/cvcuda/system/TestOpMorphology.cpp index 8219b87b..d1aecd8e 100644 --- a/tests/cvcuda/system/TestOpMorphology.cpp +++ b/tests/cvcuda/system/TestOpMorphology.cpp @@ -428,6 +428,7 @@ NVCV_TEST_SUITE_P(OpMorphology, test::ValueListbasePtr(), outBufSize, cudaMemcpyDeviceToHost)); // generate gold result + if (maskSize.w == -1 || maskSize.h == -1) + { + maskSize.w = 3; + maskSize.h = 3; + } int2 kernelAnchor{maskSize.w / 2, maskSize.h / 2}; hostMorph(goldVec, outStrides, inVec, inStrides, shape, format, maskSize, kernelAnchor, iteration, borderMode, morphType); @@ -595,8 +616,23 @@ NVCV_TEST_SUITE_P(OpMorphologyVarShape, test::ValueList 1 && null workspace + std::vector testSet1{NVCV_DILATE, NVCV_ERODE}; + for (auto morphType : testSet1) + { + EXPECT_THROW(morphOp(nullptr, inTensor, outTensor, nvcv::NullOpt, morphType, maskSize, anchor, 2, borderMode), + nvcv::Exception); + } + + // testSet2: NVCV_CLOSE and NVCV_OPEN && null workspace + std::vector testSet2{NVCV_CLOSE, NVCV_OPEN}; + for (auto morphType : testSet2) + { + EXPECT_THROW(morphOp(nullptr, inTensor, outTensor, nvcv::NullOpt, morphType, maskSize, anchor, 1, borderMode), + nvcv::Exception); + } + + // testSet3: invalid data type + { + nvcv::Tensor inTensorInvalid + = nvcv::util::CreateTensor(1, 24, 24, 
nvcv::ImageFormat{NVCV_IMAGE_FORMAT_RGBAf16}); + nvcv::Tensor outTensorInvalid + = nvcv::util::CreateTensor(1, 24, 24, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_RGBAf16}); + EXPECT_THROW(morphOp(nullptr, inTensorInvalid, outTensorInvalid, nvcv::NullOpt, NVCV_ERODE, maskSize, anchor, 0, + borderMode), + nvcv::Exception); + } + + // testSet4: input format is not equal to output format + { + nvcv::Tensor outTensorInvalid = nvcv::util::CreateTensor(2, 24, 24, format); + EXPECT_THROW( + morphOp(nullptr, inTensor, outTensorInvalid, nvcv::NullOpt, NVCV_ERODE, maskSize, anchor, 0, borderMode), + nvcv::Exception); + } +} + +TEST(OpMorphology_Negative, operator_varshape) +{ + NVCVBorderType borderMode = NVCV_BORDER_CONSTANT; + nvcv::ImageFormat format{NVCV_IMAGE_FORMAT_U8}; + const int batches = 2; + + std::vector imgSrc; + nvcv::ImageBatchVarShape batchSrc(batches); + for (int i = 0; i < batches; ++i) + { + imgSrc.emplace_back(nvcv::Size2D{24, 24}, format); + } + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + + std::vector imgDst; + std::vector imgWorkspace; + nvcv::ImageBatchVarShape batchDst(batches); + nvcv::ImageBatchVarShape batchWorkspace(batches); + for (int i = 0; i < batches; ++i) + { + imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); + imgWorkspace.emplace_back(imgSrc[i].size(), imgSrc[i].format()); + } + batchDst.pushBack(imgDst.begin(), imgDst.end()); + batchWorkspace.pushBack(imgWorkspace.begin(), imgWorkspace.end()); + + // Create kernel mask size tensor + nvcv::Tensor maskTensor({{batches}, "N"}, nvcv::TYPE_2S32); + { + auto dev = maskTensor.exportData(); + ASSERT_NE(dev, nullptr); + + std::vector vec(batches, int2{1, 1}); + + ASSERT_EQ(cudaSuccess, + cudaMemcpy(dev->basePtr(), vec.data(), vec.size() * sizeof(int2), cudaMemcpyHostToDevice)); + } + + // Create Anchor tensor + nvcv::Tensor anchorTensor({{batches}, "N"}, nvcv::TYPE_2S32); + { + auto dev = anchorTensor.exportData(); + ASSERT_NE(dev, nullptr); + + std::vector vec(batches, int2{0, 0}); + + 
ASSERT_EQ(cudaSuccess, + cudaMemcpy(dev->basePtr(), vec.data(), vec.size() * sizeof(int2), cudaMemcpyHostToDevice)); + } + + cvcuda::Morphology morphOp; + + // testSet0: iteration < 0 + EXPECT_THROW( + morphOp(nullptr, batchSrc, batchDst, batchWorkspace, NVCV_ERODE, maskTensor, anchorTensor, -1, borderMode), + nvcv::Exception); + + // testSet1: NVCV_DILATE and NVCV_ERODE && iteration > 1 && null workspace + std::vector testSet1{NVCV_DILATE, NVCV_ERODE}; + for (auto morphType : testSet1) + { + EXPECT_THROW( + morphOp(nullptr, batchSrc, batchDst, nvcv::NullOpt, morphType, maskTensor, anchorTensor, 2, borderMode), + nvcv::Exception); + } + + // testSet2: NVCV_CLOSE and NVCV_OPEN && null workspace + std::vector testSet2{NVCV_CLOSE, NVCV_OPEN}; + for (auto morphType : testSet2) + { + EXPECT_THROW( + morphOp(nullptr, batchSrc, batchDst, nvcv::NullOpt, morphType, maskTensor, anchorTensor, 1, borderMode), + nvcv::Exception); + } + + // testSet3: invalid data type + { + nvcv::ImageFormat formatInvalid{NVCV_IMAGE_FORMAT_RGBAf16}; + std::vector imgSrcInvalid; + nvcv::ImageBatchVarShape batchSrcInvalid(batches); + for (int i = 0; i < batches; ++i) + { + imgSrcInvalid.emplace_back(nvcv::Size2D{24, 24}, formatInvalid); + } + batchSrcInvalid.pushBack(imgSrcInvalid.begin(), imgSrcInvalid.end()); + + std::vector imgDstInvalid; + std::vector imgWorkspaceInvalid; + nvcv::ImageBatchVarShape batchDstInvalid(batches); + nvcv::ImageBatchVarShape batchWorkspaceInvalid(batches); + for (int i = 0; i < batches; ++i) + { + imgDstInvalid.emplace_back(imgSrcInvalid[i].size(), imgSrcInvalid[i].format()); + imgWorkspaceInvalid.emplace_back(imgSrcInvalid[i].size(), imgSrcInvalid[i].format()); + } + batchDstInvalid.pushBack(imgDstInvalid.begin(), imgDstInvalid.end()); + batchWorkspaceInvalid.pushBack(imgWorkspaceInvalid.begin(), imgWorkspaceInvalid.end()); + + EXPECT_THROW(morphOp(nullptr, batchSrcInvalid, batchDstInvalid, batchWorkspaceInvalid, NVCV_ERODE, maskTensor, + anchorTensor, 1, 
borderMode), + nvcv::Exception); + } + + // testSet4: input format is not equal to output format + { + std::vector imgDstInvalid; + std::vector imgWorkspaceInvalid; + nvcv::ImageBatchVarShape batchDstInvalid(1); + nvcv::ImageBatchVarShape batchWorkspaceInvalid(1); + imgDstInvalid.emplace_back(imgSrc[0].size(), imgSrc[0].format()); + imgWorkspaceInvalid.emplace_back(imgSrc[0].size(), imgSrc[0].format()); + batchDstInvalid.pushBack(imgDstInvalid.begin(), imgDstInvalid.end()); + batchWorkspaceInvalid.pushBack(imgWorkspaceInvalid.begin(), imgWorkspaceInvalid.end()); + + EXPECT_THROW(morphOp(nullptr, batchSrc, batchDstInvalid, batchWorkspaceInvalid, NVCV_ERODE, maskTensor, + anchorTensor, 1, borderMode), + nvcv::Exception); + } +} diff --git a/tests/cvcuda/system/TestUtils.cpp b/tests/cvcuda/system/TestUtils.cpp new file mode 100644 index 00000000..07199806 --- /dev/null +++ b/tests/cvcuda/system/TestUtils.cpp @@ -0,0 +1,332 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TestUtils.hpp" + +#include "Definitions.hpp" + +#include + +namespace cuda = nvcv::cuda; + +using std::vector; + +//-==================================================================================================================-// +// Generate an random image image vector. 
+template +void generateRandVec(T *dst, size_t size, RandEng &eng) +{ + RandInt rand(0, cuda::TypeTraits::max); + + // clang-format off + for (size_t i = 0; i < size; i++) dst[i] = rand(eng); + // clang-format on +} + +template<> +void generateRandVec(float *dst, size_t size, RandEng &eng) +{ + RandFlt rand(0.0f, 1.0f); + + // clang-format off + for (size_t i = 0; i < size; i++) dst[i] = rand(eng); + // clang-format on +} + +template<> +void generateRandVec(double *dst, size_t size, RandEng &eng) +{ + RandFlt rand(0.0, 1.0); + + // clang-format off + for (size_t i = 0; i < size; i++) dst[i] = rand(eng); + // clang-format on +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RAND_VEC(T) template void generateRandVec(T *, size_t, RandEng &) + +MAKE_RAND_VEC(uint8_t); +MAKE_RAND_VEC(uint16_t); +MAKE_RAND_VEC(int32_t); +MAKE_RAND_VEC(float); +MAKE_RAND_VEC(double); + +#undef MAKE_RAND_VEC + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void generateRandTestRGB(T *dst, size_t size, RandEng &eng, bool rgba, bool bga) +{ + constexpr T max = std::is_floating_point_v ? 
1 : cuda::TypeTraits::max; + constexpr T val[3] = {0, max / 2, max}; + + const size_t minSize = 3 * 3 * 3 * (3 + rgba); + + generateRandVec(dst, size, eng); + + if (size > minSize) + { + size_t idx = 0; + + for (uint r = 0; r < 3; r++) + { + const T red = val[r]; + + for (uint g = 0; g < 3; g++) + { + const T grn = val[g]; + + for (uint b = 0; b < 3; b++) + { + const T blu = val[b]; + + // clang-format off + if (bga) { dst[idx++] = blu; dst[idx++] = grn; dst[idx++] = red; } + else { dst[idx++] = red; dst[idx++] = grn; dst[idx++] = blu; } + if (rgba) dst[idx++] = max; + // clang-format on + } + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RAND_RGB_TEST(T) template void generateRandTestRGB(T *, size_t, RandEng &, bool, bool) + +MAKE_RAND_RGB_TEST(uint8_t); +MAKE_RAND_RGB_TEST(uint16_t); +MAKE_RAND_RGB_TEST(int32_t); +MAKE_RAND_RGB_TEST(float); +MAKE_RAND_RGB_TEST(double); + +#undef MAKE_RAND_RGB_TEST + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +// Generate an image potentially containing all 16,777,216 RGB8 colors, assuming the image / tensor dimensions are +// sufficiently large (http://www.brucelindbloom.com/downloads/RGB16Million.png); otherwise the image is cropped to the +// provided sizes. Generates consecutive 256 x 256 image blocks where, within each block, red varies from 0 to 255 +// horizontally and green varies from 0 to 255 vertically. Blue increments from 0 to 255 in consecutive blocks; partial +// blocks (i.e., those that may be cropped to specified dimensions) still increment the blue value. All values are then +// rescaled to fit the data type--e.g., floating point types are rescaled to be between 0 and 1. 
+// To get all 16,777,216 8-bit RGB colors, generate a single image (i.e., tensor batch = 1) of size 1 x 4096 x 4096, +// or generate a tensor of 4 x 2048 x 2048, 16 x 1024 x 1024, 64 x 512 x 512, or 256 x 256 x 256. +// Note: generates interleaved (non-planar) data. +template +void generateAllRGB(T *dst, uint wdth, uint hght, uint num, bool rgba, bool bga) +{ + constexpr T max = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr double round = std::is_floating_point_v ? 0 : 0.5; + constexpr double scale = (double)max / 255.0; + + const size_t incrH = wdth * (3 + rgba); + const size_t incrN = hght * incrH; + + uint addB = 0; + + for (uint i = 0; i < num; i++) + { + T *img = dst + i * incrN; + + for (uint y = 0; y < hght; y++) + { + T *row = img + y * incrH; + + uint8_t grn = static_cast(y & 255); + + for (uint x = 0; x < wdth; x++) + { + uint8_t red = static_cast(x & 255); + uint8_t blu = static_cast(((x >> 8) + addB) & 255); + + // clang-format off + if (bga) std::swap(red, blu); + *row++ = static_cast(red * scale + round); + *row++ = static_cast(grn * scale + round); + *row++ = static_cast(blu * scale + round); + if (rgba) *row++ = max; + // clang-format on + } + // clang-format off + if (grn == 255) addB += ((wdth + 255) >> 8); + // clang-format on + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_ALL_RGB_TEST(T) template void generateAllRGB(T *, uint, uint, uint, bool rgba, bool bga) + +MAKE_ALL_RGB_TEST(uint8_t); +MAKE_ALL_RGB_TEST(uint16_t); +MAKE_ALL_RGB_TEST(int32_t); +MAKE_ALL_RGB_TEST(float); +MAKE_ALL_RGB_TEST(double); + +#undef MAKE_ALL_RGB_TEST + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +// Generate a random HSV (Hue-Saturation-Value) 
image where the Hue range can be specified and the Saturation and Value +// ranges are scaled according to the data type. Since Hue is circular, it can be useful to generate Hue values outside +// the standard range (e.g., min to test if a function that processes HSV images properly accounts for wrap-around Hue values. +// Note: generates interleaved (non-planar) data. +template +void generateRandHSV(T *dst, size_t size, RandEng &eng, double minHueMult, double maxHueMult) +{ + ASSERT_EQ(size % 3, 0); + + constexpr T max = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr uint range = (sizeof(T) > 1) ? 360 : (FullRange ? 256 : 180); + constexpr double scale = (double)range / 360.0; + constexpr double round = std::is_floating_point_v ? 0 : 0.5; + + // clang-format off + if (minHueMult > 1.0) minHueMult = 0.0; + if (maxHueMult < 0.0) maxHueMult = 1.0; + // clang-format on + + double minHue = minHueMult * range; + double maxHue = maxHueMult * range; + + RandFlt randHue(minHue, maxHue); + RandFlt randSV(0.0, 1.0); + + for (size_t i = 0; i < size; i += 3) + { + // clang-format off + *dst++ = static_cast(randHue(eng) * scale + round); + *dst++ = static_cast(randSV (eng) * max + round); + *dst++ = static_cast(randSV (eng) * max + round); + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Restricted range hue (FullRange = false): values between [0-180). Applies only to uint8_t, but still need to +// instantiate all the types. 
+#define MAKE_RAND_HSV_TEST(T) template void generateRandHSV(T *, size_t, RandEng &, double, double) + +MAKE_RAND_HSV_TEST(uint8_t); +MAKE_RAND_HSV_TEST(uint16_t); +MAKE_RAND_HSV_TEST(int32_t); +MAKE_RAND_HSV_TEST(float); +MAKE_RAND_HSV_TEST(double); + +#undef MAKE_RAND_HSV_TEST + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Full range hue (FullRange = false): values between [0-256). Applies only to uint8_t, but still need to +// instantiate all the types. +#define MAKE_RAND_HSV_TEST(T) template void generateRandHSV(T *, size_t, RandEng &, double, double) + +MAKE_RAND_HSV_TEST(uint8_t); +MAKE_RAND_HSV_TEST(uint16_t); +MAKE_RAND_HSV_TEST(int32_t); +MAKE_RAND_HSV_TEST(float); +MAKE_RAND_HSV_TEST(double); + +#undef MAKE_RAND_HSV_TEST + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +// Generate an HSV (Hue-Saturation-Value) image containing blocks of size H_range x 256 where H_range is either: +// * 360 (for size(T) > 1), +// * 255 (for size(T) == 1 and FullRange == true), or +// * 180 (for size(T) == 1 and FullRange == false). +// Within each block, H (Hue) varies from 0 to H_range-1 horizontally and S (Saturation) varies from 0 to 255 vertically. +// V (Value) increments from 0 to 255 in consecutive blocks. The values for S and V are normalized (i.e., rescaled) +// according to the data type. +// To get all available HSV values, generate a single image (i.e., tensor batch = 1) of size 1 x (16*H_range) x 4096, +// or a tensor of 4 x (8*H_range) x 2048, 16 x (4*H_range) x 1024, 64 x (2*H_range) x 512, or 256 x H_range x 256. +// Note: generates interleaved (non-planar) data. +template +void generateAllHSV(T *dst, uint wdth, uint hght, uint num) +{ + constexpr T max = std::is_floating_point_v ? 
1 : cuda::TypeTraits::max; + constexpr uint range = (sizeof(T) > 1) ? 360 : (FullRange ? 256 : 180); + constexpr double scale = (double)range / 360.0; + constexpr double norm = (double)max / 255.0; + constexpr double round = std::is_floating_point_v ? 0 : 0.5; + + constexpr uint stepV = 1; // Step size for V (value) from one block to the next. 17 is prime, so 256 % (17 * m) will + // always be unique for 0 <= m < 256. + const size_t incrH = wdth * 3; + const size_t incrN = hght * incrH; + + uint addV = 0; + + for (uint i = 0; i < num; i++) + { + T *img = dst + i * incrN; + + for (uint y = 0; y < hght; y++) + { + T *row = img + y * incrH; + + uint8_t S = static_cast(y & 255); + + // clang-format off + for (uint x = 0; x < wdth; x++) + { + uint8_t H = static_cast(x % range); + uint8_t V = static_cast((((uint)(x / range) + addV) * stepV) & 255); + + *row++ = static_cast(H * scale + round); + *row++ = static_cast(S * norm + round); + *row++ = static_cast(V * norm + round); + } + if (S == 255) addV += ((wdth + range - 1) / range); + // clang-format on + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Restricted range hue (FullRange = false): values between [0-180). Applies only to uint8_t, but still need to +// instantiate all the types. +#define MAKE_ALL_HSV_TEST(T) template void generateAllHSV(T *, uint, uint, uint) + +MAKE_ALL_HSV_TEST(uint8_t); +MAKE_ALL_HSV_TEST(uint16_t); +MAKE_ALL_HSV_TEST(int32_t); +MAKE_ALL_HSV_TEST(float); +MAKE_ALL_HSV_TEST(double); + +#undef MAKE_ALL_HSV_TEST + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Full range hue (FullRange = false): values between [0-256). Applies only to uint8_t, but still need to +// instantiate all the types. 
+#define MAKE_ALL_HSV_TEST(T) template void generateAllHSV(T *, uint, uint, uint) + +MAKE_ALL_HSV_TEST(uint8_t); +MAKE_ALL_HSV_TEST(uint16_t); +MAKE_ALL_HSV_TEST(int32_t); +MAKE_ALL_HSV_TEST(float); +MAKE_ALL_HSV_TEST(double); + +#undef MAKE_ALL_HSV_TEST + +//--------------------------------------------------------------------------------------------------------------------// diff --git a/tests/cvcuda/system/TestUtils.hpp b/tests/cvcuda/system/TestUtils.hpp new file mode 100644 index 00000000..6915e1db --- /dev/null +++ b/tests/cvcuda/system/TestUtils.hpp @@ -0,0 +1,187 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_TEST_COMMON_UTILS_HPP +#define NVCV_TEST_COMMON_UTILS_HPP + +#include +#include + +using RandEng = std::default_random_engine; + +template +using RandInt = std::uniform_int_distribution; + +template +using RandFlt = std::uniform_real_distribution; + +//--------------------------------------------------------------------------------------------------------------------// +template +void generateRandVec(T *dst, size_t size, RandEng &eng); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateRandVec(std::vector &dst, RandEng &eng) +{ + generateRandVec(dst.data(), dst.size(), eng); +} + +//--------------------------------------------------------------------------------------------------------------------// +template +void generateRandTestRGB(T *dst, size_t size, RandEng &eng, bool rgba = false, bool bga = false); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateRandTestRGB(std::vector &dst, RandEng &eng, bool rgba = false, bool bga = false) +{ + generateRandTestRGB(dst.data(), dst.size(), eng, rgba, bga); +} + +//--------------------------------------------------------------------------------------------------------------------// +template +void generateAllRGB(T *dst, uint wdth, uint hght, uint num, bool rgba = false, bool bga = false); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateAllRGB(std::vector &dst, uint wdth, uint hght, uint num, bool rgba = false, bool bga = false) +{ + ASSERT_GE(dst.size(), (size_t)num * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + generateAllRGB(dst.data(), wdth, hght, num, rgba, bga); +} + 
+//--------------------------------------------------------------------------------------------------------------------// +template +void generateRandHSV(T *dst, size_t size, RandEng &eng, double minHueMult = 0.0, double maxHueMult = 1.0); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateRandHSV(std::vector &dst, RandEng &eng, double minHueMult = 0.0, double maxHueMult = 1.0) +{ + generateRandHSV(dst.data(), dst.size(), eng, minHueMult, maxHueMult); +} + +//--------------------------------------------------------------------------------------------------------------------// +template +void generateAllHSV(T *dst, uint wdth, uint hght, uint num); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateAllHSV(std::vector &dst, uint wdth, uint hght, uint num) +{ + ASSERT_EQ(dst.size() % 3, 0); + ASSERT_GE(dst.size(), (size_t)num * (size_t)hght * (size_t)wdth * (size_t)3); + generateAllHSV(dst.data(), wdth, hght, num); +} + +//--------------------------------------------------------------------------------------------------------------------// + +// NOTE: the "do {" ... "} while (false)" statements in the macros below add scope context to multi-statement macro +// expansions so they can be nested inside non-scoped statements (e.g., "if", "for", etc. statements that don't +// have braces) and still be treated like a single statement that can be terminated with a semicolon (";"). +// For example, the "do-while" construct allows for: +// +// if () +// EXPECT_NEAR_VEC_CNT(vec1, vec2, maxDiff, maxCnt, passes); +// else +// std::cout << "Test condition not satisfied.\n"; +// +// without the problems that would otherwise occur from multi-statement macro expansion. 
+//--------------------------------------------------------------------------------------------------------------------// +#define EXPECT_NEAR_ARR_CNT(data1, data2, size, maxDiff, maxCnt, passes) \ + do \ + { \ + uint cnt = 0; \ + for (size_t i = 0; i < size && cnt < maxCnt; i++) \ + { \ + EXPECT_NEAR(data1[i], data2[i], maxDiff) << "At index " << i << " (error count = " << ++cnt << ")"; \ + } \ + passes = (cnt == 0); \ + } \ + while (false) + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define EXPECT_NEAR_VEC_CNT(vec1, vec2, maxDiff, maxCnt, passes) \ + do \ + { \ + ASSERT_EQ(vec1.size(), vec2.size()); \ + EXPECT_NEAR_ARR_CNT(vec1.data(), vec2.data(), vec1.size(), maxDiff, maxCnt, passes); \ + } \ + while (false) + +//--------------------------------------------------------------------------------------------------------------------// + +//--------------------------------------------------------------------------------------------------------------------// +// clang-format off +#define EXPECT_NEAR_HSV_ARR_CNT(data1, data2, size, range, maxDiff, maxCnt, passes) \ + do \ + { \ + ASSERT_EQ(size % 3, 0); \ + uint cnt = 0; \ + double half = range * 0.5; \ + for (size_t i = 0; i < size && cnt < maxCnt; i += 3) \ + { \ + double val1 = static_cast(data1[i]); \ + double val2 = static_cast(data2[i]); \ + if (val2 >= val1 && val2 - val1 > half) \ + EXPECT_NEAR(data1[i] + range, data2[i], maxDiff) << "At index " << i \ + << " (error count = " << ++cnt << ")"; \ + else if (val1 - val2 > half) \ + EXPECT_NEAR(data1[i], data2[i] + range, maxDiff) << "At index " << i \ + << " (error count = " << ++cnt << ")"; \ + else \ + EXPECT_NEAR(data1[i], data2[i], maxDiff) << "At index " << i << " (error count = " << ++cnt << ")"; \ + EXPECT_NEAR(data1[i+1], data2[i+1], maxDiff) << "At index " << i+1 << " (error count = " << ++cnt << ")"; \ + EXPECT_NEAR(data1[i+2], data2[i+2], maxDiff) << "At index " << i+2 << " 
(error count = " << ++cnt << ")"; \ + } \ + passes = (cnt == 0); \ + } \ + while (false) + +// clang-format on +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define EXPECT_NEAR_HSV_VEC_CNT(vec1, vec2, range, maxDiff, maxCnt, passes) \ + do \ + { \ + ASSERT_EQ(vec1.size(), vec2.size()); \ + EXPECT_NEAR_HSV_ARR_CNT(vec1.data(), vec2.data(), vec1.size(), range, maxDiff, maxCnt, passes); \ + } \ + while (false) + +//--------------------------------------------------------------------------------------------------------------------// + +/* +FYI: gtest expands the following macro statement: + + EXPECT_NEAR(data1[i], data2[i], maxDiff) << "At index " << i << " (error count = " << ++cnt << ")"; + +to: + + switch (0) + case 0: + default: + if (const ::testing::AssertionResult gtest_ar + = ::testing::internal::DoubleNearPredFormat("refVec.data()[i]", "dstVec.data()[i]", "maxDiff", + refVec.data()[i] , dstVec.data()[i] , maxDiff)) ; + else + ::testing::internal::AssertHelper(::testing::TestPartResult::kNonFatalFailure, + __FILE__, __LINE__, gtest_ar.failure_message()) + = ::testing::Message() << "At index " << i << " (error count = " << ++cnt << ")"; + +The switch statement is to disambiguate the else clause if the macro is expanded in a nested if without braces. +*/ + +#endif // NVCV_TEST_COMMON_UTILS_HPP